internal/encoding/text/decode: stop using regexp

This eliminates the last user of the regexp package, which should save
about 130K from the resulting stripped binary importing this package
(unless, of course, regexp is brought in directly or via another
dependency).

Added some new cases to TestDecoder to test the new function.

A benchmark (not included) shows the following results, compared to
the old implementation using regexp.Find:

name     old time/op    new time/op    delta
ErrId-4    1.93µs ± 1%    0.21µs ± 1%   -89.20%  (p=0.002 n=6+6)

name     old alloc/op   new alloc/op   delta
ErrId-4      128B ± 0%        0B       -100.00%  (p=0.002 n=6+6)

name     old allocs/op  new allocs/op  delta
ErrId-4      13.0 ± 0%       0.0       -100.00%  (p=0.002 n=6+6)

Change-Id: I5569a47580f41cc60f92c444e8d43bb3f26faa4e
Reviewed-on: https://go-review.googlesource.com/c/protobuf/+/402774
Reviewed-by: Cassondra Foesch <cfoesch@gmail.com>
Reviewed-by: Damien Neil <dneil@google.com>
Reviewed-by: Lasse Folger <lassefolger@google.com>
diff --git a/internal/encoding/text/decode.go b/internal/encoding/text/decode.go
index 3780377..636e34b 100644
--- a/internal/encoding/text/decode.go
+++ b/internal/encoding/text/decode.go
@@ -8,7 +8,6 @@
 	"bytes"
 	"fmt"
 	"io"
-	"regexp"
 	"strconv"
 	"unicode/utf8"
 
@@ -421,7 +420,7 @@
 		return Token{}, d.newSyntaxError("invalid field number: %s", d.in[:num.size])
 	}
 
-	return Token{}, d.newSyntaxError("invalid field name: %s", errRegexp.Find(d.in))
+	return Token{}, d.newSyntaxError("invalid field name: %s", errId(d.in))
 }
 
 // parseTypeName parses Any type URL or extension field name. The name is
@@ -571,7 +570,7 @@
 		return tok, nil
 	}
 
-	return Token{}, d.newSyntaxError("invalid scalar value: %s", errRegexp.Find(d.in))
+	return Token{}, d.newSyntaxError("invalid scalar value: %s", errId(d.in))
 }
 
 // parseLiteralValue parses a literal value. A literal value is used for
@@ -653,8 +652,25 @@
 	return b
 }
 
-// Any sequence that looks like a non-delimiter (for error reporting).
-var errRegexp = regexp.MustCompile(`^([-+._a-zA-Z0-9\/]+|.)`)
+// errId returns the prefix of seq to quote in a syntax error: the bytes before
+// the first non-ASCII rune or non-'/' delimiter, or that rune itself if at 0.
+func errId(seq []byte) []byte {
+	for i := 0; i < len(seq); {
+		r, size := utf8.DecodeRune(seq[i:]) // invalid UTF-8 yields (utf8.RuneError, 1)
+		if r > utf8.RuneSelf || (r != '/' && isDelim(byte(r))) { // wide runes short-circuit before byte(r) truncates
+			if i == 0 {
+				// Either the first byte is invalid UTF-8 or a
+				// delimiter, or the first rune is non-ASCII.
+				// Return it as-is, so the result is not empty.
+				i = size
+			}
+			return seq[:i:i] // capacity is capped so an append cannot write into the rest of seq
+		}
+		i += size
+	}
+	// No delimiter found (or seq is empty): return the input unchanged.
+	return seq
+}
 
 // isDelim returns true if given byte is a delimiter character.
 func isDelim(c byte) bool {
diff --git a/internal/encoding/text/decode_test.go b/internal/encoding/text/decode_test.go
index 9e38cb3..65fd2f9 100644
--- a/internal/encoding/text/decode_test.go
+++ b/internal/encoding/text/decode_test.go
@@ -381,6 +381,30 @@
 			want: []R{{E: "invalid field name: 123name"}},
 		},
 		{
+			in:   `/`,
+			want: []R{{E: `invalid field name: /`}},
+		},
+		{
+			in:   `世界`,
+			want: []R{{E: `invalid field name: 世`}},
+		},
+		{
+			in:   `1a/b`,
+			want: []R{{E: `invalid field name: 1a`}},
+		},
+		{
+			in:   `1c\d`,
+			want: []R{{E: `invalid field name: 1c`}},
+		},
+		{
+			in:   "\x84f",
+			want: []R{{E: "invalid field name: \x84"}},
+		},
+		{
+			in:   "\uFFFDxxx",
+			want: []R{{E: "invalid field name: \uFFFD"}},
+		},
+		{
 			in: "[type]",
 			want: []R{
 				{K: text.Name, T: NT{K: text.TypeName, S: "type"}, RS: "[type]"},