encoding/simplifiedchinese: make HZGB2312 replace with FFFD on error
It is unclear whether the right number of replacement characters is
returned in all instances, but the current behavior seems reasonable.
Updates golang/go#18898
Change-Id: Ibf6efdb079191aa6db4eb05b41b7dae593947bd0
Reviewed-on: https://go-review.googlesource.com/37324
Run-TryBot: Marcel van Lohuizen <mpvl@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Nigel Tao <nigeltao@golang.org>
diff --git a/encoding/simplifiedchinese/all_test.go b/encoding/simplifiedchinese/all_test.go
index 96a0585..b369da2 100644
--- a/encoding/simplifiedchinese/all_test.go
+++ b/encoding/simplifiedchinese/all_test.go
@@ -51,6 +51,12 @@
{dec, GB18030, "\xfe\x30\x81\x21", "\ufffd0\ufffd!"},
{dec, GB18030, strings.Repeat("\xfe\x30", n), strings.Repeat("\ufffd0", n)},
+
+ {dec, HZGB2312, "~/", "\ufffd"},
+ {dec, HZGB2312, "~{a\x80", "\ufffd"},
+ {dec, HZGB2312, "~{a\x80", "\ufffd"},
+ {dec, HZGB2312, "~{" + strings.Repeat("z~", n), strings.Repeat("\ufffd", n)},
+ {dec, HZGB2312, "~{" + strings.Repeat("\xfe\x30", n), strings.Repeat("\ufffd", n*2)},
}
for _, tc := range testCases {
dir, tr, wantErr := tc.init(tc.e)
diff --git a/encoding/simplifiedchinese/hzgb2312.go b/encoding/simplifiedchinese/hzgb2312.go
index 85de6b1..eb3157f 100644
--- a/encoding/simplifiedchinese/hzgb2312.go
+++ b/encoding/simplifiedchinese/hzgb2312.go
@@ -5,7 +5,6 @@
package simplifiedchinese
import (
- "errors"
"unicode/utf8"
"golang.org/x/text/encoding"
@@ -31,8 +30,6 @@
return new(hzGB2312Encoder)
}
-var errInvalidHZGB2312 = errors.New("simplifiedchinese: invalid HZ-GB2312 encoding")
-
const (
asciiState = iota
gbState
@@ -50,14 +47,18 @@
for ; nSrc < len(src); nSrc += size {
c0 := src[nSrc]
if c0 >= utf8.RuneSelf {
- err = errInvalidHZGB2312
- break loop
+ r, size = utf8.RuneError, 1
+ goto write
}
if c0 == '~' {
if nSrc+1 >= len(src) {
- err = transform.ErrShortSrc
- break loop
+ if !atEOF {
+ err = transform.ErrShortSrc
+ break loop
+ }
+ r = utf8.RuneError
+ goto write
}
size = 2
switch src[nSrc+1] {
@@ -78,8 +79,8 @@
case '\n':
continue
default:
- err = errInvalidHZGB2312
- break loop
+ r = utf8.RuneError
+ goto write
}
}
@@ -87,33 +88,37 @@
r, size = rune(c0), 1
} else {
if nSrc+1 >= len(src) {
- err = transform.ErrShortSrc
- break loop
+ if !atEOF {
+ err = transform.ErrShortSrc
+ break loop
+ }
+ r, size = utf8.RuneError, 1
+ goto write
}
+ size = 2
c1 := src[nSrc+1]
if c0 < 0x21 || 0x7e <= c0 || c1 < 0x21 || 0x7f <= c1 {
- err = errInvalidHZGB2312
- break loop
- }
-
- r, size = '\ufffd', 2
- if i := int(c0-0x01)*190 + int(c1+0x3f); i < len(decode) {
+ // error
+ } else if i := int(c0-0x01)*190 + int(c1+0x3f); i < len(decode) {
r = rune(decode[i])
- if r == 0 {
- r = '\ufffd'
+ if r != 0 {
+ goto write
}
}
+ if c1 > utf8.RuneSelf {
+ // Be consistent and always treat non-ASCII as a single error.
+ size = 1
+ }
+ r = utf8.RuneError
}
+ write:
if nDst+utf8.RuneLen(r) > len(dst) {
err = transform.ErrShortDst
break loop
}
nDst += utf8.EncodeRune(dst[nDst:], r)
}
- if atEOF && err == transform.ErrShortSrc {
- err = errInvalidHZGB2312
- }
return nDst, nSrc, err
}