encoding/unicode: add WhatWG-compliant UTF8 Decoder

For background see CL https://go-review.googlesource.com/#/c/11494.
I think this implementation is better as it:
1) is faster, and
2) doesn't expose the alternative handling of invalid UTF-8 bytes
outside this transform, even if only as an internal function.
This Transform is likely the only place where invalid bytes should ever
be handled differently from standard Go (illustrated below).
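
For illustration only (not part of this change; the sketch assumes the
usual golang.org/x/text/encoding/unicode and golang.org/x/text/transform
imports): a maximal ill-formed subsequence is replaced by a single
U+FFFD, whereas a plain Go for-range loop replaces each byte of a
truncated sequence individually.

	// Rough sketch; error handling omitted.
	dec := unicode.UTF8.NewDecoder()
	s, _, _ := transform.String(dec, "abc\xf0\x90\x80") // truncated 4-byte rune
	// s == "abc\ufffd": one replacement for the maximal subpart.
	// A for-range loop over the same bytes yields "abc" followed by
	// three U+FFFDs, one per remaining byte.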

The algorithm to handle UTF-8 is similar to the one in unicode/utf8.
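In sketch form, using the new utf8internal tables (simplified; the real
Transform also tracks destination space and short-source conditions),
each non-ASCII starter byte is classified through a table and its
continuation bytes are checked against an acceptance range:

	// Sketch of the per-byte classification step.
	first := utf8internal.First[src[nSrc]]
	if first == utf8internal.FirstInvalid {
		// Ill-formed starter byte: emit U+FFFD and resume after it.
	}
	size := int(first & utf8internal.SizeMask)
	accept := utf8internal.AcceptRanges[first>>utf8internal.AcceptShift]
	// The second byte must lie in [accept.Lo, accept.Hi]; any further
	// continuation bytes must lie in [LoCB, HiCB].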

The table is put in a separate package as it can be used to optimize
other parts, such as the trie, which doesn't use the utf8 package,
and possibly other transforms (see the sketch below).
The utf8internal package can also be a place for other often-needed,
low-level utf8 functionality (Hangul conversion, surrogate info, etc.).
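
For example (an illustrative sketch only, with hypothetical s and i), a
consumer that only needs the encoded length of a sequence could use the
table directly instead of importing unicode/utf8:

	var size int
	if b := s[i]; b < 0x80 { // ASCII
		size = 1
	} else {
		// Multi-byte starters carry their length in the low bits;
		// invalid starters yield 1.
		size = int(utf8internal.First[b] & utf8internal.SizeMask)
	}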

Change-Id: Ib211033b79b438c6a016c1fb56f54e2be049ef1e
Reviewed-on: https://go-review.googlesource.com/17650
Reviewed-by: Hyang-Ah Hana Kim <hyangah@gmail.com>
diff --git a/encoding/encoding_test.go b/encoding/encoding_test.go
index d6e21bf..faee1bf 100644
--- a/encoding/encoding_test.go
+++ b/encoding/encoding_test.go
@@ -938,6 +938,7 @@
 	{simplifiedchinese.HZGB2312, "sunzi-bingfa-gb-levels-1-and-2", "hz-gb2312"},
 	{traditionalchinese.Big5, "sunzi-bingfa-traditional", "big5"},
 	{utf16LEIB, "candide", "utf-16le"},
+	{unicode.UTF8, "candide", "utf-8"},
 
 	// GB18030 is a superset of GBK and is nominally a Simplified Chinese
 	// encoding, but it can also represent the entire Basic Multilingual
@@ -1032,5 +1033,7 @@
 func BenchmarkISO2022JPEncoder(b *testing.B) { benchmark(b, "Encode", japanese.ISO2022JP) }
 func BenchmarkShiftJISDecoder(b *testing.B)  { benchmark(b, "Decode", japanese.ShiftJIS) }
 func BenchmarkShiftJISEncoder(b *testing.B)  { benchmark(b, "Encode", japanese.ShiftJIS) }
+func BenchmarkUTF8Decoder(b *testing.B)      { benchmark(b, "Decode", unicode.UTF8) }
+func BenchmarkUTF8Encoder(b *testing.B)      { benchmark(b, "Encode", unicode.UTF8) }
 func BenchmarkUTF16Decoder(b *testing.B)     { benchmark(b, "Decode", utf16LEIB) }
 func BenchmarkUTF16Encoder(b *testing.B)     { benchmark(b, "Encode", utf16LEIB) }
diff --git a/encoding/unicode/unicode.go b/encoding/unicode/unicode.go
index 17468f2..11cdbb1 100644
--- a/encoding/unicode/unicode.go
+++ b/encoding/unicode/unicode.go
@@ -11,7 +11,9 @@
 	"unicode/utf8"
 
 	"golang.org/x/text/encoding"
+	"golang.org/x/text/encoding/internal"
 	"golang.org/x/text/encoding/internal/identifier"
+	"golang.org/x/text/internal/utf8internal"
 	"golang.org/x/text/transform"
 )
 
@@ -23,9 +25,107 @@
 // point.
 
 // TODO:
-// - Define UTF-8 (mostly for BOM handling.)
 // - Define UTF-32?
 
+// UTF8 is the UTF-8 encoding.
+var UTF8 encoding.Encoding = utf8enc
+
+var utf8enc = &internal.Encoding{
+	// TODO: transform.Nop needs to be replaced with encoding.UTF8Validator
+	// once we don't allow encoding errors to be silently ignored.
+	&internal.SimpleEncoding{utf8Decoder{}, transform.Nop},
+	"UTF-8",
+	identifier.UTF8,
+}
+
+type utf8Decoder struct{ transform.NopResetter }
+
+func (utf8Decoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
+	var pSrc int // point from which to start copy in src
+	var accept utf8internal.AcceptRange
+
+	// The decoder can only make the input larger, not smaller.
+	n := len(src)
+	if len(dst) < n {
+		err = transform.ErrShortDst
+		n = len(dst)
+		atEOF = false
+	}
+	for nSrc < n {
+		c := src[nSrc]
+		if c < utf8.RuneSelf {
+			nSrc++
+			continue
+		}
+		first := utf8internal.First[c]
+		size := int(first & utf8internal.SizeMask)
+		if first == utf8internal.FirstInvalid {
+			goto handleInvalid // invalid starter byte
+		}
+		accept = utf8internal.AcceptRanges[first>>utf8internal.AcceptShift]
+		if nSrc+size > n {
+			if !atEOF {
+				// We may stop earlier than necessary here if the short sequence
+				// has invalid bytes. Not checking for this simplifies the code
+				// and may avoid duplicate computations in certain conditions.
+				if err == nil {
+					err = transform.ErrShortSrc
+				}
+				break
+			}
+			// Determine the maximal subpart of an ill-formed subsequence.
+			switch {
+			case nSrc+1 >= n || src[nSrc+1] < accept.Lo || accept.Hi < src[nSrc+1]:
+				size = 1
+			case nSrc+2 >= n || src[nSrc+2] < utf8internal.LoCB || utf8internal.HiCB < src[nSrc+2]:
+				size = 2
+			default:
+				size = 3 // As we are short, the maximum is 3.
+			}
+			goto handleInvalid
+		}
+		if c = src[nSrc+1]; c < accept.Lo || accept.Hi < c {
+			size = 1
+			goto handleInvalid // invalid continuation byte
+		} else if size == 2 {
+		} else if c = src[nSrc+2]; c < utf8internal.LoCB || utf8internal.HiCB < c {
+			size = 2
+			goto handleInvalid // invalid continuation byte
+		} else if size == 3 {
+		} else if c = src[nSrc+3]; c < utf8internal.LoCB || utf8internal.HiCB < c {
+			size = 3
+			goto handleInvalid // invalid continuation byte
+		}
+		nSrc += size
+		continue
+
+	handleInvalid:
+		// Copy the scanned input so far.
+		nDst += copy(dst[nDst:], src[pSrc:nSrc])
+
+		// Append RuneError to the destination.
+		const runeError = "\ufffd"
+		if nDst+len(runeError) > len(dst) {
+			return nDst, nSrc, transform.ErrShortDst
+		}
+		nDst += copy(dst[nDst:], runeError)
+
+		// Skip the maximal subpart of an ill-formed subsequence according to
+		// the W3C standard way instead of the Go way. This Transform is
+		// probably the only place in the text repo where it is warranted.
+		nSrc += size
+		pSrc = nSrc
+
+		// Recompute the maximum source length.
+		if sz := len(dst) - nDst; sz < len(src)-nSrc {
+			err = transform.ErrShortDst
+			n = nSrc + sz
+			atEOF = false
+		}
+	}
+	return nDst + copy(dst[nDst:], src[pSrc:nSrc]), nSrc, err
+}
+
 // UTF16 returns a UTF-16 Encoding for the given default endianness and byte
 // order mark (BOM) policy.
 //
@@ -77,13 +177,12 @@
 
 // All lists a configuration for each IANA-defined UTF-16 variant.
 var All = []encoding.Encoding{
+	UTF8,
 	UTF16(BigEndian, UseBOM),
 	UTF16(BigEndian, IgnoreBOM),
 	UTF16(LittleEndian, IgnoreBOM),
 }
 
-// TODO: also include UTF-8
-
 // BOMPolicy is a UTF-16 encoding's byte order mark policy.
 type BOMPolicy uint8
 
diff --git a/encoding/unicode/unicode_test.go b/encoding/unicode/unicode_test.go
new file mode 100644
index 0000000..2bc9615
--- /dev/null
+++ b/encoding/unicode/unicode_test.go
@@ -0,0 +1,178 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package unicode
+
+import (
+	"testing"
+
+	"golang.org/x/text/transform"
+)
+
+func TestUTF8Decoder(t *testing.T) {
+	testCases := []struct {
+		desc    string
+		src     string
+		notEOF  bool // the inverse of atEOF
+		sizeDst int
+		want    string
+		nSrc    int
+		err     error
+	}{{
+		desc: "empty string, empty dest buffer",
+	}, {
+		desc:    "empty string",
+		sizeDst: 8,
+	}, {
+		desc:    "empty string, streaming",
+		notEOF:  true,
+		sizeDst: 8,
+	}, {
+		desc:    "ascii",
+		src:     "abcde",
+		sizeDst: 8,
+		want:    "abcde",
+		nSrc:    5,
+	}, {
+		desc:    "ascii and error",
+		src:     "ab\x80de",
+		sizeDst: 7,
+		want:    "ab\ufffdde",
+		nSrc:    5,
+	}, {
+		desc:    "valid two-byte sequence",
+		src:     "a\u0300bc",
+		sizeDst: 7,
+		want:    "a\u0300bc",
+		nSrc:    5,
+	}, {
+		desc:    "valid three-byte sequence",
+		src:     "a\u0300中",
+		sizeDst: 7,
+		want:    "a\u0300中",
+		nSrc:    6,
+	}, {
+		desc:    "valid four-byte sequence",
+		src:     "a中\U00016F50",
+		sizeDst: 8,
+		want:    "a中\U00016F50",
+		nSrc:    8,
+	}, {
+		desc:    "short source buffer",
+		src:     "abc\xf0\x90",
+		notEOF:  true,
+		sizeDst: 10,
+		want:    "abc",
+		nSrc:    3,
+		err:     transform.ErrShortSrc,
+	}, {
+		// We don't check for the maximal subpart of an ill-formed subsequence
+		// at the end of an open segment.
+		desc:    "complete invalid that looks like short at end",
+		src:     "abc\xf0\x80",
+		notEOF:  true,
+		sizeDst: 10,
+		want:    "abc", // instead of "abc\ufffd\ufffd",
+		nSrc:    3,
+		err:     transform.ErrShortSrc,
+	}, {
+		desc:    "incomplete sequence at end",
+		src:     "a\x80bc\xf0\x90",
+		sizeDst: 9,
+		want:    "a\ufffdbc\ufffd",
+		nSrc:    6,
+	}, {
+		desc:    "invalid second byte",
+		src:     "abc\xf0dddd",
+		sizeDst: 10,
+		want:    "abc\ufffddddd",
+		nSrc:    8,
+	}, {
+		desc:    "invalid second byte at end",
+		src:     "abc\xf0d",
+		sizeDst: 10,
+		want:    "abc\ufffdd",
+		nSrc:    5,
+	}, {
+		desc:    "invalid third byte",
+		src:     "a\u0300bc\xf0\x90dddd",
+		sizeDst: 12,
+		want:    "a\u0300bc\ufffddddd",
+		nSrc:    11,
+	}, {
+		desc:    "invalid third byte at end",
+		src:     "a\u0300bc\xf0\x90d",
+		sizeDst: 12,
+		want:    "a\u0300bc\ufffdd",
+		nSrc:    8,
+	}, {
+		desc:    "invalid fourth byte, tight buffer",
+		src:     "a\u0300bc\xf0\x90\x80d",
+		sizeDst: 9,
+		want:    "a\u0300bc\ufffdd",
+		nSrc:    9,
+	}, {
+		desc:    "invalid fourth byte at end",
+		src:     "a\u0300bc\xf0\x90\x80",
+		sizeDst: 8,
+		want:    "a\u0300bc\ufffd",
+		nSrc:    8,
+	}, {
+		desc:    "invalid fourth byte and short four byte sequence",
+		src:     "a\u0300bc\xf0\x90\x80\xf0\x90\x80",
+		notEOF:  true,
+		sizeDst: 20,
+		want:    "a\u0300bc\ufffd",
+		nSrc:    8,
+		err:     transform.ErrShortSrc,
+	}, {
+		desc:    "valid four-byte sequence overflowing short buffer",
+		src:     "a\u0300bc\xf0\x90\x80\x80",
+		notEOF:  true,
+		sizeDst: 8,
+		want:    "a\u0300bc",
+		nSrc:    5,
+		err:     transform.ErrShortDst,
+	}, {
+		desc:    "invalid fourth byte at end short, but short dst",
+		src:     "a\u0300bc\xf0\x90\x80\xf0\x90\x80",
+		notEOF:  true,
+		sizeDst: 8,
+		// More bytes would fit in the buffer, but this seems to require a more
+		// complicated and slower algorithm.
+		want: "a\u0300bc", // instead of "a\u0300bc"
+		nSrc: 5,
+		err:  transform.ErrShortDst,
+	}, {
+		desc:    "short dst for error",
+		src:     "abc\x80",
+		notEOF:  true,
+		sizeDst: 5,
+		want:    "abc",
+		nSrc:    3,
+		err:     transform.ErrShortDst,
+	}, {
+		desc:    "adjusting short dst buffer",
+		src:     "abc\x80ef",
+		notEOF:  true,
+		sizeDst: 6,
+		want:    "abc\ufffd",
+		nSrc:    4,
+		err:     transform.ErrShortDst,
+	}}
+	tr := UTF8.NewDecoder()
+	for i, tc := range testCases {
+		b := make([]byte, tc.sizeDst)
+		nDst, nSrc, err := tr.Transform(b, []byte(tc.src), !tc.notEOF)
+		if err != tc.err {
+			t.Errorf("%d:%s: error was %v; want %v", i, tc.desc, err, tc.err)
+		}
+		if got := string(b[:nDst]); got != tc.want {
+			t.Errorf("%d:%s: result was %q: want %q", i, tc.desc, got, tc.want)
+		}
+		if nSrc != tc.nSrc {
+			t.Errorf("%d:%s: nSrc was %d; want %d", i, tc.desc, nSrc, tc.nSrc)
+		}
+	}
+}
diff --git a/internal/utf8internal/utf8internal.go b/internal/utf8internal/utf8internal.go
new file mode 100644
index 0000000..575cea8
--- /dev/null
+++ b/internal/utf8internal/utf8internal.go
@@ -0,0 +1,87 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package utf8internal contains low-level utf8-related constants, tables, etc.
+// that are used internally by the text package.
+package utf8internal
+
+// The default lowest and highest continuation byte.
+const (
+	LoCB = 0x80 // 1000 0000
+	HiCB = 0xBF // 1011 1111
+)
+
+// Constants related to classifying the first byte of a UTF-8 sequence.
+const (
+	// ASCII identifies a UTF-8 byte as ASCII.
+	ASCII = as
+
+	// FirstInvalid indicates a byte is invalid as a first byte of a UTF-8
+	// sequence.
+	FirstInvalid = xx
+
+	// SizeMask is a mask for the size bits. Use x&SizeMask to get the size.
+	SizeMask = 7
+
+	// AcceptShift is the right-shift count applied to a first-byte info value
+	// to get the index into the AcceptRanges table. See AcceptRanges.
+	AcceptShift = 4
+
+	// The names of these constants are chosen to give nice alignment in the
+	// table below. The first nibble is an index into AcceptRanges or F for
+	// special one-byte cases. The second nibble is the rune length or the
+	// status for the special one-byte case.
+	xx = 0xF1 // invalid: size 1
+	as = 0xF0 // ASCII: size 1
+	s1 = 0x02 // accept 0, size 2
+	s2 = 0x13 // accept 1, size 3
+	s3 = 0x03 // accept 0, size 3
+	s4 = 0x23 // accept 2, size 3
+	s5 = 0x34 // accept 3, size 4
+	s6 = 0x04 // accept 0, size 4
+	s7 = 0x44 // accept 4, size 4
+)
+
+// First is information about the first byte in a UTF-8 sequence.
+var First = [256]uint8{
+	//   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
+	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x00-0x0F
+	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x10-0x1F
+	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x20-0x2F
+	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x30-0x3F
+	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x40-0x4F
+	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x50-0x5F
+	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x60-0x6F
+	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x70-0x7F
+	//   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
+	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x80-0x8F
+	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x90-0x9F
+	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xA0-0xAF
+	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xB0-0xBF
+	xx, xx, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xC0-0xCF
+	s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xD0-0xDF
+	s2, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s4, s3, s3, // 0xE0-0xEF
+	s5, s6, s6, s6, s7, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xF0-0xFF
+}
+
+// AcceptRange gives the range of valid values for the second byte in a UTF-8
+// sequence for any First value that is not ASCII or FirstInvalid.
+type AcceptRange struct {
+	Lo uint8 // lowest value for second byte.
+	Hi uint8 // highest value for second byte.
+}
+
+// AcceptRanges is a list of AcceptRange values. For a given byte sequence b
+//
+//		AcceptRanges[First[b[0]]>>AcceptShift]
+//
+// will give the value of AcceptRange for the multi-byte UTF-8 sequence starting
+// at b[0].
+var AcceptRanges = [...]AcceptRange{
+	0: {LoCB, HiCB},
+	1: {0xA0, HiCB},
+	2: {LoCB, 0x9F},
+	3: {0x90, HiCB},
+	4: {LoCB, 0x8F},
+}