encoding/unicode: add UTF8BOM encoding Some editors always add a BOM to UTF-8 files. Traditionally the BOM has been used to detect byte order encoding, which is irrelevant for UTF-8. These editors, however, use the UTF-8 BOM as a signature, allowing them to override a charmap encoding if such a BOM is present. This is possible as the occurrence of a BOM in such encodings is highly unlikely. This UTF8BOM encoding implements a simple encoding for this. It is intended for applications that require a UTF-8 encoding, but want to handle files written by such editors without explicit BOM handling. It can also be used to create such files. NOTE: there is currently no encoding that implements the fallback encoding of such editors. The BOMOverride functionality in this package allows implementing such an encoder with relative ease, though. Change-Id: I430851a1d93351bf6055eebe88005984dde451d9 Reviewed-on: https://go-review.googlesource.com/c/text/+/234277 Reviewed-by: Russ Cox <rsc@golang.org>

commit: 3a82255431918bb7c2e1c09c964a18991756910b [log] [tgz]
author: Marcel van Lohuizen <mpvl@golang.org> Sun May 17 12:22:50 2020 +0200
committer: Marcel van Lohuizen <mpvl@golang.org> Fri Jun 12 05:17:30 2020 +0000
tree: 6317f8c4f3b7b5b0bc89f02b2ad1a5d259dc3bf9
parent: 81608d7e9c6863c922f599e4ff1329a685218c0d [diff]
diff --git a/encoding/unicode/unicode.go b/encoding/unicode/unicode.go
index 4850ff3..f2e576d 100644
--- a/encoding/unicode/unicode.go
+++ b/encoding/unicode/unicode.go

@@ -6,6 +6,7 @@
 package unicode // import "golang.org/x/text/encoding/unicode"
 
 import (
+	"bytes"
 	"errors"
 	"unicode/utf16"
 	"unicode/utf8"
@@ -25,15 +26,95 @@
 // the introduction of some kind of error type for conveying the erroneous code
 // point.
 
-// UTF8 is the UTF-8 encoding.
+// UTF8 is the UTF-8 encoding. It neither removes nor adds byte order marks.
 var UTF8 encoding.Encoding = utf8enc
 
+// UTF8BOM is a UTF-8 encoding where the decoder strips a leading byte order
+// mark while the encoder adds one.
+//
+// Some editors add a byte order mark as a signature to UTF-8 files. Although
+// the byte order mark is not useful for detecting byte order in UTF-8, it is
+// sometimes used as a convention to mark UTF-8-encoded files. This relies on
+// the observation that the UTF-8 byte order mark is either an illegal or at
+// least very unlikely sequence in any other character encoding.
+var UTF8BOM encoding.Encoding = utf8bomEncoding{}
+
+type utf8bomEncoding struct{}
+
+func (utf8bomEncoding) String() string {
+	return "UTF-8-BOM"
+}
+
+func (utf8bomEncoding) ID() (identifier.MIB, string) {
+	return identifier.Unofficial, "x-utf8bom"
+}
+
+func (utf8bomEncoding) NewEncoder() *encoding.Encoder {
+	return &encoding.Encoder{
+		Transformer: &utf8bomEncoder{t: runes.ReplaceIllFormed()},
+	}
+}
+
+func (utf8bomEncoding) NewDecoder() *encoding.Decoder {
+	return &encoding.Decoder{Transformer: &utf8bomDecoder{}}
+}
+
 var utf8enc = &internal.Encoding{
 	&internal.SimpleEncoding{utf8Decoder{}, runes.ReplaceIllFormed()},
 	"UTF-8",
 	identifier.UTF8,
 }
 
+type utf8bomDecoder struct {
+	checked bool
+}
+
+func (t *utf8bomDecoder) Reset() {
+	t.checked = false
+}
+
+func (t *utf8bomDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
+	if !t.checked {
+		if !atEOF && len(src) < len(utf8BOM) {
+			if len(src) == 0 {
+				return 0, 0, nil
+			}
+			return 0, 0, transform.ErrShortSrc
+		}
+		if bytes.HasPrefix(src, []byte(utf8BOM)) {
+			nSrc += len(utf8BOM)
+			src = src[len(utf8BOM):]
+		}
+		t.checked = true
+	}
+	nDst, n, err := utf8Decoder.Transform(utf8Decoder{}, dst[nDst:], src, atEOF)
+	nSrc += n
+	return nDst, nSrc, err
+}
+
+type utf8bomEncoder struct {
+	written bool
+	t       transform.Transformer
+}
+
+func (t *utf8bomEncoder) Reset() {
+	t.written = false
+	t.t.Reset()
+}
+
+func (t *utf8bomEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
+	if !t.written {
+		if len(dst) < len(utf8BOM) {
+			return nDst, 0, transform.ErrShortDst
+		}
+		nDst = copy(dst, utf8BOM)
+		t.written = true
+	}
+	n, nSrc, err := utf8Decoder.Transform(utf8Decoder{}, dst[nDst:], src, atEOF)
+	nDst += n
+	return nDst, nSrc, err
+}
+
 type utf8Decoder struct{ transform.NopResetter }
 
 func (utf8Decoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {

diff --git a/encoding/unicode/unicode_test.go b/encoding/unicode/unicode_test.go
index 51b4712..02520c9 100644
--- a/encoding/unicode/unicode_test.go
+++ b/encoding/unicode/unicode_test.go

@@ -454,6 +454,212 @@
 	}
 }
 
+func TestUTF8BOMDecoder(t *testing.T) {
+	testCases := []struct {
+		desc    string
+		src     string
+		notEOF  bool // the inverse of atEOF
+		sizeDst int
+		want    string
+		nSrc    int
+		err     error
+		wantAll string
+	}{{
+		desc: "empty string, empty dest buffer",
+	}, {
+		desc:    "empty string",
+		sizeDst: 8,
+	}, {
+		desc:    "empty string, streaming",
+		notEOF:  true,
+		sizeDst: 8,
+	}, {
+		desc:    "ascii",
+		src:     "abcde",
+		sizeDst: 8,
+		want:    "abcde",
+		nSrc:    5,
+		wantAll: "abcde",
+	}, {
+		desc:    "ascii with bom",
+		src:     utf8BOM + "abcde",
+		sizeDst: 11,
+		want:    "abcde",
+		nSrc:    8,
+		wantAll: "abcde",
+	}, {
+		desc:    "error with bom",
+		src:     utf8BOM + "ab\x80de",
+		sizeDst: 11,
+		want:    "ab\ufffdde",
+		nSrc:    8,
+		wantAll: "ab\ufffdde",
+	}, {
+		desc:    "short bom",
+		src:     utf8BOM[:2],
+		notEOF:  true,
+		sizeDst: 7,
+		want:    "",
+		nSrc:    0,
+		wantAll: "\ufffd", // needs to be 1 replacement
+		err:     transform.ErrShortSrc,
+	}, {
+		desc:    "short bom at end",
+		src:     utf8BOM[:2],
+		sizeDst: 7,
+		want:    "\ufffd", // needs to be 1 replacement
+		nSrc:    2,
+		wantAll: "\ufffd", // needs to be 1 replacement
+		err:     nil,
+	}, {
+		desc:    "short source buffer",
+		src:     "abc\xf0\x90",
+		notEOF:  true,
+		sizeDst: 10,
+		want:    "abc",
+		nSrc:    3,
+		wantAll: "abc\ufffd",
+		err:     transform.ErrShortSrc,
+	}, {
+		desc:    "short source buffer with bom",
+		src:     utf8BOM + "abc\xf0\x90",
+		notEOF:  true,
+		sizeDst: 15,
+		want:    "abc",
+		nSrc:    6,
+		wantAll: "abc\ufffd",
+		err:     transform.ErrShortSrc,
+	}, {
+		desc:    "short dst for error",
+		src:     utf8BOM + "abc\x80",
+		notEOF:  true,
+		sizeDst: 5,
+		want:    "abc",
+		nSrc:    6,
+		wantAll: "abc\ufffd",
+		err:     transform.ErrShortDst,
+	}}
+	tr := UTF8BOM.NewDecoder()
+	for i, tc := range testCases {
+		tr.Reset()
+		b := make([]byte, tc.sizeDst)
+		nDst, nSrc, err := tr.Transform(b, []byte(tc.src), !tc.notEOF)
+		if err != tc.err {
+			t.Errorf("%d:%s: error was %v; want %v", i, tc.desc, err, tc.err)
+		}
+		if got := string(b[:nDst]); got != tc.want {
+			t.Errorf("%d:%s: result was %q: want %q", i, tc.desc, got, tc.want)
+		}
+		if nSrc != tc.nSrc {
+			t.Errorf("%d:%s: nSrc was %d; want %d", i, tc.desc, nSrc, tc.nSrc)
+		}
+		if got, _ := tr.String(tc.src); got != tc.wantAll {
+			t.Errorf("%d:%s: String was %s; want %s", i, tc.desc, got, tc.wantAll)
+		}
+	}
+}
+
+func TestUTF8SigEncoder(t *testing.T) {
+	testCases := []struct {
+		desc    string
+		src     string
+		notEOF  bool // the inverse of atEOF
+		sizeDst int
+		want    string
+		wantAll string // converting all bytes
+		nSrc    int
+		err     error
+	}{{
+		desc:    "empty string, empty dest buffer",
+		err:     transform.ErrShortDst,
+		wantAll: utf8BOM,
+	}, {
+		desc:    "empty string",
+		sizeDst: 8,
+		want:    utf8BOM,
+		wantAll: utf8BOM,
+	}, {
+		desc:    "empty string, streaming",
+		notEOF:  true,
+		sizeDst: 8,
+		want:    utf8BOM,
+		wantAll: utf8BOM,
+	}, {
+		desc:    "ascii",
+		src:     "abcde",
+		sizeDst: 8,
+		want:    utf8BOM + "abcde",
+		nSrc:    5,
+		wantAll: utf8BOM + "abcde",
+	}, {
+		desc:    "short bom at end",
+		src:     utf8BOM[:2],
+		sizeDst: 11,
+		want:    utf8BOM + "\ufffd",
+		nSrc:    2,
+		wantAll: utf8BOM + "\ufffd",
+	}, {
+		desc:    "short bom",
+		src:     utf8BOM[:2],
+		notEOF:  true,
+		sizeDst: 7,
+		want:    utf8BOM,
+		nSrc:    0,
+		err:     transform.ErrShortSrc,
+		wantAll: utf8BOM + "\ufffd",
+	}, {
+		desc:    "short bom at end",
+		src:     utf8BOM[:2],
+		sizeDst: 7,
+		want:    utf8BOM + "\ufffd", // needs to be 1 replacement
+		nSrc:    2,
+		err:     nil,
+		wantAll: utf8BOM + "\ufffd",
+	}, {
+		desc:    "short dst buffer 2",
+		src:     "ab",
+		sizeDst: 2,
+		want:    "",
+		nSrc:    0,
+		err:     transform.ErrShortDst,
+		wantAll: utf8BOM + "ab",
+	}, {
+		desc:    "short dst buffer 3",
+		src:     "ab",
+		sizeDst: 3,
+		want:    utf8BOM,
+		nSrc:    0,
+		err:     transform.ErrShortDst,
+		wantAll: utf8BOM + "ab",
+	}, {
+		desc:    "short dst buffer 4",
+		src:     "ab",
+		sizeDst: 4,
+		want:    utf8BOM + "a",
+		nSrc:    1,
+		err:     transform.ErrShortDst,
+		wantAll: utf8BOM + "ab",
+	}}
+	tr := UTF8BOM.NewEncoder()
+	for i, tc := range testCases {
+		tr.Reset()
+		b := make([]byte, tc.sizeDst)
+		nDst, nSrc, err := tr.Transform(b, []byte(tc.src), !tc.notEOF)
+		if err != tc.err {
+			t.Errorf("%d:%s: error was %v; want %v", i, tc.desc, err, tc.err)
+		}
+		if got := string(b[:nDst]); got != tc.want {
+			t.Errorf("%d:%s: result was %q: want %q", i, tc.desc, got, tc.want)
+		}
+		if nSrc != tc.nSrc {
+			t.Errorf("%d:%s: nSrc was %d; want %d", i, tc.desc, nSrc, tc.nSrc)
+		}
+		if got, _ := tr.String(tc.src); got != tc.wantAll {
+			t.Errorf("%d:%s: String was %s; want %s", i, tc.desc, got, tc.wantAll)
+		}
+	}
+}
+
 func TestBOMOverride(t *testing.T) {
 	dec := BOMOverride(charmap.CodePage437.NewDecoder())
 	dst := make([]byte, 100)
commit	3a82255431918bb7c2e1c09c964a18991756910b	[log] [tgz]
author	Marcel van Lohuizen <mpvl@golang.org>	Sun May 17 12:22:50 2020 +0200
committer	Marcel van Lohuizen <mpvl@golang.org>	Fri Jun 12 05:17:30 2020 +0000
tree	6317f8c4f3b7b5b0bc89f02b2ad1a5d259dc3bf9
parent	81608d7e9c6863c922f599e4ff1329a685218c0d [diff]