encoding/simplifiedchinese: fix € encoding in GB18030
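The euro sign is an exception: it is given the single-byte code 0x80 in
Microsoft's later versions of CP936/GBK and the two-byte code 0xA2 0xE3
in GB18030. Previously the encoder wrote 0x80 for U+20AC in both GBK and
GB18030; it now does so only for GBK.
See https://en.wikipedia.org/wiki/GB_18030#cite_note-4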
Fixes golang/go#48691
Change-Id: I6a4460274d4313ad1d03bcd8070373af674691eb
GitHub-Last-Rev: acbbc50f20d663452f8da77cf2a66d8d893bec1d
GitHub-Pull-Request: golang/text#26
Reviewed-on: https://go-review.googlesource.com/c/text/+/353712
Reviewed-by: Nigel Tao <nigeltao@golang.org>
Trust: Nigel Tao <nigeltao@golang.org>
Trust: Alberto Donizetti <alb.donizetti@gmail.com>
Run-TryBot: Nigel Tao <nigeltao@golang.org>
TryBot-Result: Go Bot <gobot@golang.org>
diff --git a/encoding/simplifiedchinese/all_test.go b/encoding/simplifiedchinese/all_test.go
index a556c94..fbb623c 100644
--- a/encoding/simplifiedchinese/all_test.go
+++ b/encoding/simplifiedchinese/all_test.go
@@ -40,7 +40,9 @@
{enc, HZGB2312, "a갂", "a"},
{enc, HZGB2312, "\u6cf5갂", "~{1C~}"},
+ {dec, GBK, "\xa2\xe3", "€"},
{dec, GB18030, "\x80", "€"},
+
{dec, GB18030, "\x81", "\ufffd"},
{dec, GB18030, "\x81\x20", "\ufffd "},
{dec, GB18030, "\xfe\xfe", "\ufffd"},
@@ -125,6 +127,14 @@
encPrefix: "~{",
encoded: ";(<dR;:x>F#,6@WCN^O`GW!#",
utf8: "花间一壶酒,独酌无相亲。",
+ }, {
+ e: GBK,
+ encoded: "\x80",
+ utf8: "€",
+ }, {
+ e: GB18030,
+ encoded: "\xa2\xe3",
+ utf8: "€",
}}
for _, tc := range testCases {
diff --git a/encoding/simplifiedchinese/gbk.go b/encoding/simplifiedchinese/gbk.go
index b89c45b..0e0fabf 100644
--- a/encoding/simplifiedchinese/gbk.go
+++ b/encoding/simplifiedchinese/gbk.go
@@ -55,6 +55,8 @@
// Microsoft's Code Page 936 extends GBK 1.0 to encode the euro sign U+20AC
// as 0x80. The HTML5 specification at http://encoding.spec.whatwg.org/#gbk
// says to treat "gbk" as Code Page 936.
+ // GBK's decoder is gb18030's decoder: https://encoding.spec.whatwg.org/#gbk-decoder
+ // If the byte is 0x80, return code point U+20AC: https://encoding.spec.whatwg.org/#gb18030-decoder
case c0 == 0x80:
r, size = '€', 1
@@ -180,7 +182,9 @@
// Microsoft's Code Page 936 extends GBK 1.0 to encode the euro sign U+20AC
// as 0x80. The HTML5 specification at http://encoding.spec.whatwg.org/#gbk
// says to treat "gbk" as Code Page 936.
- if r == '€' {
+ // GBK's encoder is gb18030's encoder with its "is GBK" flag set to true: https://encoding.spec.whatwg.org/#gbk-encoder
+ // If "is GBK" is true and the code point is U+20AC, return byte 0x80: https://encoding.spec.whatwg.org/#gb18030-encoder
+ if !e.gb18030 && r == '€' {
r = 0x80
goto write1
}