encoding/unicode: add UTF8BOM encoding
Some editors always add a BOM to UTF-8 files. Traditionally the BOM
has been used to detect byte order encoding, which is irrelevant
for UTF-8. These editors, however, use the UTF-8 BOM as
a signature, allowing them to override a charmap encoding if such
a BOM is present. This is possible as the occurrence of a BOM
in such encodings is highly unlikely.
This UTF8BOM encoding implements a simple encoding for this.
It is intended for applications that require a UTF-8 encoding, but
want to handle files written by such editors without explicit BOM
handling. It can also be used to create such files.
NOTE: there is currently no encoding that implements the fallback
encoding of such editors. The BOMOverride functionality in this
package allows implementing such an encoder with relative ease,
though.
Change-Id: I430851a1d93351bf6055eebe88005984dde451d9
Reviewed-on: https://go-review.googlesource.com/c/text/+/234277
Reviewed-by: Russ Cox <rsc@golang.org>
diff --git a/encoding/unicode/unicode.go b/encoding/unicode/unicode.go
index 4850ff3..f2e576d 100644
--- a/encoding/unicode/unicode.go
+++ b/encoding/unicode/unicode.go
@@ -6,6 +6,7 @@
package unicode // import "golang.org/x/text/encoding/unicode"
import (
+ "bytes"
"errors"
"unicode/utf16"
"unicode/utf8"
@@ -25,15 +26,95 @@
// the introduction of some kind of error type for conveying the erroneous code
// point.
-// UTF8 is the UTF-8 encoding.
+// UTF8 is the UTF-8 encoding. It neither removes nor adds byte order marks.
var UTF8 encoding.Encoding = utf8enc
+// UTF8BOM is a UTF-8 encoding where the decoder strips a leading byte order
+// mark while the encoder adds one.
+//
+// Some editors add a byte order mark as a signature to UTF-8 files. Although
+// the byte order mark is not useful for detecting byte order in UTF-8, it is
+// sometimes used as a convention to mark UTF-8-encoded files. This relies on
+// the observation that the UTF-8 byte order mark is either an illegal or at
+// least very unlikely sequence in any other character encoding.
+var UTF8BOM encoding.Encoding = utf8bomEncoding{}
+
+type utf8bomEncoding struct{} // utf8bomEncoding is the field-less type whose value is assigned to UTF8BOM.
+
+func (utf8bomEncoding) String() string { // String returns the display name of this encoding.
+ return "UTF-8-BOM"
+}
+
+func (utf8bomEncoding) ID() (identifier.MIB, string) { // ID returns identifier.Unofficial together with the name "x-utf8bom".
+ return identifier.Unofficial, "x-utf8bom"
+}
+
+func (utf8bomEncoding) NewEncoder() *encoding.Encoder { // NewEncoder returns an Encoder wrapping a fresh utf8bomEncoder.
+ return &encoding.Encoder{
+ Transformer: &utf8bomEncoder{t: runes.ReplaceIllFormed()}, // written starts false, so the first Transform call emits the BOM
+ }
+}
+
+func (utf8bomEncoding) NewDecoder() *encoding.Decoder { // NewDecoder returns a Decoder wrapping a fresh utf8bomDecoder.
+ return &encoding.Decoder{Transformer: &utf8bomDecoder{}} // checked starts false, so the BOM check is still pending
+}
+
var utf8enc = &internal.Encoding{
&internal.SimpleEncoding{utf8Decoder{}, runes.ReplaceIllFormed()},
"UTF-8",
identifier.UTF8,
}
+type utf8bomDecoder struct { // utf8bomDecoder decodes UTF-8 and skips a BOM found at the start of the input.
+ checked bool // set once the start of the input has been examined for a BOM
+}
+
+func (t *utf8bomDecoder) Reset() { // Reset re-arms the BOM check so the next Transform call looks for a BOM again.
+ t.checked = false
+}
+
+func (t *utf8bomDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { // Transform skips one leading BOM, if present, then delegates to utf8Decoder.
+ if !t.checked {
+ if !atEOF && len(src) < len(utf8BOM) { // too few bytes to decide and more input may follow
+ if len(src) == 0 {
+ return 0, 0, nil // nothing to examine yet; checked stays false
+ }
+ return 0, 0, transform.ErrShortSrc // consumes nothing; returned for any input shorter than the BOM, whatever its bytes
+ }
+ if bytes.HasPrefix(src, []byte(utf8BOM)) {
+ nSrc += len(utf8BOM) // the BOM counts as consumed source but produces no output
+ src = src[len(utf8BOM):]
+ }
+ t.checked = true
+ }
+ nDst, n, err := utf8Decoder.Transform(utf8Decoder{}, dst[nDst:], src, atEOF) // only n is newly declared; nDst and err are the named results
+ nSrc += n
+ return nDst, nSrc, err
+}
+
+type utf8bomEncoder struct { // utf8bomEncoder writes a BOM ahead of the first output it produces.
+ written bool // set once the BOM has been copied into dst
+ t transform.Transformer // NOTE(review): in the visible code t is only Reset; Transform calls utf8Decoder directly — confirm t is still needed
+}
+
+func (t *utf8bomEncoder) Reset() { // Reset makes the next Transform call emit the BOM again and resets the wrapped transformer.
+ t.written = false
+ t.t.Reset()
+}
+
+func (t *utf8bomEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { // Transform writes the BOM once, then passes src through utf8Decoder.
+ if !t.written {
+ if len(dst) < len(utf8BOM) {
+ return nDst, 0, transform.ErrShortDst // nDst is still 0 here, so a partial BOM is never written
+ }
+ nDst = copy(dst, utf8BOM) // happens even when src is empty
+ t.written = true
+ }
+ n, nSrc, err := utf8Decoder.Transform(utf8Decoder{}, dst[nDst:], src, atEOF) // output goes after any BOM just written; only n is newly declared
+ nDst += n
+ return nDst, nSrc, err
+}
+
type utf8Decoder struct{ transform.NopResetter }
func (utf8Decoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
diff --git a/encoding/unicode/unicode_test.go b/encoding/unicode/unicode_test.go
index 51b4712..02520c9 100644
--- a/encoding/unicode/unicode_test.go
+++ b/encoding/unicode/unicode_test.go
@@ -454,6 +454,212 @@
}
}
+func TestUTF8BOMDecoder(t *testing.T) { // TestUTF8BOMDecoder checks one Transform call per case, then a whole-string decode via String.
+ testCases := []struct {
+ desc string
+ src string
+ notEOF bool // the inverse of atEOF
+ sizeDst int // length of the destination buffer handed to Transform
+ want string // expected dst contents after the single Transform call
+ nSrc int // expected number of source bytes consumed by that call
+ err error // expected error from that call
+ wantAll string // expected result of tr.String(tc.src)
+ }{{
+ desc: "empty string, empty dest buffer",
+ }, {
+ desc: "empty string",
+ sizeDst: 8,
+ }, {
+ desc: "empty string, streaming",
+ notEOF: true,
+ sizeDst: 8,
+ }, {
+ desc: "ascii",
+ src: "abcde",
+ sizeDst: 8,
+ want: "abcde",
+ nSrc: 5,
+ wantAll: "abcde",
+ }, {
+ desc: "ascii with bom",
+ src: utf8BOM + "abcde",
+ sizeDst: 11,
+ want: "abcde",
+ nSrc: 8, // the skipped BOM bytes are included in the consumed count
+ wantAll: "abcde",
+ }, {
+ desc: "error with bom",
+ src: utf8BOM + "ab\x80de",
+ sizeDst: 11,
+ want: "ab\ufffdde",
+ nSrc: 8,
+ wantAll: "ab\ufffdde",
+ }, {
+ desc: "short bom",
+ src: utf8BOM[:2],
+ notEOF: true,
+ sizeDst: 7,
+ want: "",
+ nSrc: 0,
+ wantAll: "\ufffd", // needs to be 1 replacement
+ err: transform.ErrShortSrc,
+ }, {
+ desc: "short bom at end",
+ src: utf8BOM[:2],
+ sizeDst: 7,
+ want: "\ufffd", // needs to be 1 replacement
+ nSrc: 2,
+ wantAll: "\ufffd", // needs to be 1 replacement
+ err: nil,
+ }, {
+ desc: "short source buffer",
+ src: "abc\xf0\x90",
+ notEOF: true,
+ sizeDst: 10,
+ want: "abc",
+ nSrc: 3,
+ wantAll: "abc\ufffd",
+ err: transform.ErrShortSrc,
+ }, {
+ desc: "short source buffer with bom",
+ src: utf8BOM + "abc\xf0\x90",
+ notEOF: true,
+ sizeDst: 15,
+ want: "abc",
+ nSrc: 6,
+ wantAll: "abc\ufffd",
+ err: transform.ErrShortSrc,
+ }, {
+ desc: "short dst for error", // "abc" leaves 2 of the 5 dst bytes, too few for the 3-byte U+FFFD
+ src: utf8BOM + "abc\x80",
+ notEOF: true,
+ sizeDst: 5,
+ want: "abc",
+ nSrc: 6,
+ wantAll: "abc\ufffd",
+ err: transform.ErrShortDst,
+ }}
+ tr := UTF8BOM.NewDecoder()
+ for i, tc := range testCases {
+ tr.Reset() // one decoder is shared by all cases, so its state is reset before each
+ b := make([]byte, tc.sizeDst)
+ nDst, nSrc, err := tr.Transform(b, []byte(tc.src), !tc.notEOF)
+ if err != tc.err {
+ t.Errorf("%d:%s: error was %v; want %v", i, tc.desc, err, tc.err)
+ }
+ if got := string(b[:nDst]); got != tc.want {
+ t.Errorf("%d:%s: result was %q: want %q", i, tc.desc, got, tc.want)
+ }
+ if nSrc != tc.nSrc {
+ t.Errorf("%d:%s: nSrc was %d; want %d", i, tc.desc, nSrc, tc.nSrc)
+ }
+ if got, _ := tr.String(tc.src); got != tc.wantAll { // the error returned by String is discarded
+ t.Errorf("%d:%s: String was %s; want %s", i, tc.desc, got, tc.wantAll)
+ }
+ }
+}
+
+func TestUTF8SigEncoder(t *testing.T) { // TestUTF8SigEncoder exercises UTF8BOM.NewEncoder. NOTE(review): name says "Sig" while the encoding is called UTF8BOM.
+ testCases := []struct {
+ desc string
+ src string
+ notEOF bool // the inverse of atEOF
+ sizeDst int // length of the destination buffer handed to Transform
+ want string // expected dst contents after the single Transform call
+ wantAll string // converting all bytes
+ nSrc int // expected number of source bytes consumed by that call
+ err error // expected error from that call
+ }{{
+ desc: "empty string, empty dest buffer", // a zero-length dst cannot hold the BOM
+ err: transform.ErrShortDst,
+ wantAll: utf8BOM,
+ }, {
+ desc: "empty string", // the BOM is expected even though src is empty
+ sizeDst: 8,
+ want: utf8BOM,
+ wantAll: utf8BOM,
+ }, {
+ desc: "empty string, streaming",
+ notEOF: true,
+ sizeDst: 8,
+ want: utf8BOM,
+ wantAll: utf8BOM,
+ }, {
+ desc: "ascii",
+ src: "abcde",
+ sizeDst: 8,
+ want: utf8BOM + "abcde",
+ nSrc: 5,
+ wantAll: utf8BOM + "abcde",
+ }, {
+ desc: "short bom at end",
+ src: utf8BOM[:2],
+ sizeDst: 11,
+ want: utf8BOM + "\ufffd",
+ nSrc: 2,
+ wantAll: utf8BOM + "\ufffd",
+ }, {
+ desc: "short bom",
+ src: utf8BOM[:2],
+ notEOF: true,
+ sizeDst: 7,
+ want: utf8BOM,
+ nSrc: 0,
+ err: transform.ErrShortSrc,
+ wantAll: utf8BOM + "\ufffd",
+ }, {
+ desc: "short bom at end", // NOTE(review): same desc as an earlier case (sizeDst 11); failures are told apart only by index i
+ src: utf8BOM[:2],
+ sizeDst: 7,
+ want: utf8BOM + "\ufffd", // needs to be 1 replacement
+ nSrc: 2,
+ err: nil,
+ wantAll: utf8BOM + "\ufffd",
+ }, {
+ desc: "short dst buffer 2", // dst is smaller than the BOM, so nothing is written
+ src: "ab",
+ sizeDst: 2,
+ want: "",
+ nSrc: 0,
+ err: transform.ErrShortDst,
+ wantAll: utf8BOM + "ab",
+ }, {
+ desc: "short dst buffer 3", // dst holds exactly the BOM and none of src
+ src: "ab",
+ sizeDst: 3,
+ want: utf8BOM,
+ nSrc: 0,
+ err: transform.ErrShortDst,
+ wantAll: utf8BOM + "ab",
+ }, {
+ desc: "short dst buffer 4", // dst holds the BOM plus one byte of src
+ src: "ab",
+ sizeDst: 4,
+ want: utf8BOM + "a",
+ nSrc: 1,
+ err: transform.ErrShortDst,
+ wantAll: utf8BOM + "ab",
+ }}
+ tr := UTF8BOM.NewEncoder()
+ for i, tc := range testCases {
+ tr.Reset() // one encoder is shared by all cases, so its state is reset before each
+ b := make([]byte, tc.sizeDst)
+ nDst, nSrc, err := tr.Transform(b, []byte(tc.src), !tc.notEOF)
+ if err != tc.err {
+ t.Errorf("%d:%s: error was %v; want %v", i, tc.desc, err, tc.err)
+ }
+ if got := string(b[:nDst]); got != tc.want {
+ t.Errorf("%d:%s: result was %q: want %q", i, tc.desc, got, tc.want)
+ }
+ if nSrc != tc.nSrc {
+ t.Errorf("%d:%s: nSrc was %d; want %d", i, tc.desc, nSrc, tc.nSrc)
+ }
+ if got, _ := tr.String(tc.src); got != tc.wantAll { // the error returned by String is discarded
+ t.Errorf("%d:%s: String was %s; want %s", i, tc.desc, got, tc.wantAll)
+ }
+ }
+}
+
func TestBOMOverride(t *testing.T) {
dec := BOMOverride(charmap.CodePage437.NewDecoder())
dst := make([]byte, 100)