encoding/simplifiedchinese: Fixes € encoding in GB18030 The euro sign is an exception which is given a single-byte code of 0x80 in Microsoft's later versions of CP936/GBK and a two-byte code of 0xA2 0xE3 in GB18030. https://en.wikipedia.org/wiki/GB_18030#cite_note-4 Fixes golang/go#48691 Change-Id: I6a4460274d4313ad1d03bcd8070373af674691eb GitHub-Last-Rev: acbbc50f20d663452f8da77cf2a66d8d893bec1d GitHub-Pull-Request: golang/text#26 Reviewed-on: https://go-review.googlesource.com/c/text/+/353712 Reviewed-by: Nigel Tao <nigeltao@golang.org> Trust: Nigel Tao <nigeltao@golang.org> Trust: Alberto Donizetti <alb.donizetti@gmail.com> Run-TryBot: Nigel Tao <nigeltao@golang.org> TryBot-Result: Go Bot <gobot@golang.org>
diff --git a/encoding/simplifiedchinese/all_test.go b/encoding/simplifiedchinese/all_test.go index a556c94..fbb623c 100644 --- a/encoding/simplifiedchinese/all_test.go +++ b/encoding/simplifiedchinese/all_test.go
@@ -40,7 +40,9 @@ {enc, HZGB2312, "a갂", "a"}, {enc, HZGB2312, "\u6cf5갂", "~{1C~}"}, + {dec, GBK, "\xa2\xe3", "€"}, {dec, GB18030, "\x80", "€"}, + {dec, GB18030, "\x81", "\ufffd"}, {dec, GB18030, "\x81\x20", "\ufffd "}, {dec, GB18030, "\xfe\xfe", "\ufffd"}, @@ -125,6 +127,14 @@ encPrefix: "~{", encoded: ";(<dR;:x>F#,6@WCN^O`GW!#", utf8: "花间一壶酒,独酌无相亲。", + }, { + e: GBK, + encoded: "\x80", + utf8: "€", + }, { + e: GB18030, + encoded: "\xa2\xe3", + utf8: "€", }} for _, tc := range testCases {
diff --git a/encoding/simplifiedchinese/gbk.go b/encoding/simplifiedchinese/gbk.go index b89c45b..0e0fabf 100644 --- a/encoding/simplifiedchinese/gbk.go +++ b/encoding/simplifiedchinese/gbk.go
@@ -55,6 +55,8 @@ // Microsoft's Code Page 936 extends GBK 1.0 to encode the euro sign U+20AC // as 0x80. The HTML5 specification at http://encoding.spec.whatwg.org/#gbk // says to treat "gbk" as Code Page 936. + // GBK’s decoder is gb18030’s decoder. https://encoding.spec.whatwg.org/#gbk-decoder + // If byte is 0x80, return code point U+20AC. https://encoding.spec.whatwg.org/#gb18030-decoder case c0 == 0x80: r, size = '€', 1 @@ -180,7 +182,9 @@ // Microsoft's Code Page 936 extends GBK 1.0 to encode the euro sign U+20AC // as 0x80. The HTML5 specification at http://encoding.spec.whatwg.org/#gbk // says to treat "gbk" as Code Page 936. - if r == '€' { + // GBK’s encoder is gb18030’s encoder with its _is GBK_ set to true. https://encoding.spec.whatwg.org/#gbk-encoder + // If _is GBK_ is true and code point is U+20AC, return byte 0x80. https://encoding.spec.whatwg.org/#gb18030-encoder + if !e.gb18030 && r == '€' { r = 0x80 goto write1 }