internal/encoding/text/decode: stop using regexp This eliminates the last user of the regexp package, which should save about 130K from the resulting stripped binary importing this package (unless, of course, regexp is brought in directly or via another dependency). Added some new cases to TestDecoder to test the new function. Benchmark (not included) shows the following results, comparing to old implementation using regexp.Find: name old time/op new time/op delta ErrId-4 1.93µs ± 1% 0.21µs ± 1% -89.20% (p=0.002 n=6+6) name old alloc/op new alloc/op delta ErrId-4 128B ± 0% 0B -100.00% (p=0.002 n=6+6) name old allocs/op new allocs/op delta ErrId-4 13.0 ± 0% 0.0 -100.00% (p=0.002 n=6+6) Change-Id: I5569a47580f41cc60f92c444e8d43bb3f26faa4e Reviewed-on: https://go-review.googlesource.com/c/protobuf/+/402774 Reviewed-by: Cassondra Foesch <cfoesch@gmail.com> Reviewed-by: Damien Neil <dneil@google.com> Reviewed-by: Lasse Folger <lassefolger@google.com>

commit: a0482351bab7c7f4f0e59acc8e5f3956ce7b2de6 [log] [tgz]
author: Kir Kolyshkin <kolyshkin@gmail.com> Wed Apr 27 17:22:38 2022 -0700
committer: Damien Neil <dneil@google.com> Mon May 16 20:59:57 2022 +0000
tree: f5da4833e87aaf03c87e817c716d04da5203aec0
parent: 8a7ba0762cb3b39fc0536379eac2f7fa5796f187 [diff]
diff --git a/internal/encoding/text/decode.go b/internal/encoding/text/decode.go
index 3780377..636e34b 100644
--- a/internal/encoding/text/decode.go
+++ b/internal/encoding/text/decode.go

@@ -8,7 +8,6 @@
 	"bytes"
 	"fmt"
 	"io"
-	"regexp"
 	"strconv"
 	"unicode/utf8"
 
@@ -421,7 +420,7 @@
 		return Token{}, d.newSyntaxError("invalid field number: %s", d.in[:num.size])
 	}
 
-	return Token{}, d.newSyntaxError("invalid field name: %s", errRegexp.Find(d.in))
+	return Token{}, d.newSyntaxError("invalid field name: %s", errId(d.in))
 }
 
 // parseTypeName parses Any type URL or extension field name. The name is
@@ -571,7 +570,7 @@
 		return tok, nil
 	}
 
-	return Token{}, d.newSyntaxError("invalid scalar value: %s", errRegexp.Find(d.in))
+	return Token{}, d.newSyntaxError("invalid scalar value: %s", errId(d.in))
 }
 
 // parseLiteralValue parses a literal value. A literal value is used for
@@ -653,8 +652,25 @@
 	return b
 }
 
-// Any sequence that looks like a non-delimiter (for error reporting).
-var errRegexp = regexp.MustCompile(`^([-+._a-zA-Z0-9\/]+|.)`)
+// errId extracts a byte sequence that looks like an invalid ID
+// (for the purposes of error reporting).
+func errId(seq []byte) []byte {
+	for i := 0; i < len(seq); {
+		r, size := utf8.DecodeRune(seq[i:])
+		if r > utf8.RuneSelf || (r != '/' && isDelim(byte(r))) {
+			if i == 0 {
+				// Either the first byte is invalid UTF-8 or a
+				// delimiter, or the first rune is non-ASCII.
+				// Return it as-is.
+				i = size
+			}
+			return seq[:i:i]
+		}
+		i += size
+	}
+	// No delimiter found.
+	return seq
+}
 
 // isDelim returns true if given byte is a delimiter character.
 func isDelim(c byte) bool {

diff --git a/internal/encoding/text/decode_test.go b/internal/encoding/text/decode_test.go
index 9e38cb3..65fd2f9 100644
--- a/internal/encoding/text/decode_test.go
+++ b/internal/encoding/text/decode_test.go

@@ -381,6 +381,30 @@
 			want: []R{{E: "invalid field name: 123name"}},
 		},
 		{
+			in:   `/`,
+			want: []R{{E: `invalid field name: /`}},
+		},
+		{
+			in:   `世界`,
+			want: []R{{E: `invalid field name: 世`}},
+		},
+		{
+			in:   `1a/b`,
+			want: []R{{E: `invalid field name: 1a`}},
+		},
+		{
+			in:   `1c\d`,
+			want: []R{{E: `invalid field name: 1c`}},
+		},
+		{
+			in:   "\x84f",
+			want: []R{{E: "invalid field name: \x84"}},
+		},
+		{
+			in:   "\uFFFDxxx",
+			want: []R{{E: "invalid field name: \uFFFD"}},
+		},
+		{
 			in: "[type]",
 			want: []R{
 				{K: text.Name, T: NT{K: text.TypeName, S: "type"}, RS: "[type]"},
commit	a0482351bab7c7f4f0e59acc8e5f3956ce7b2de6	[log] [tgz]
author	Kir Kolyshkin <kolyshkin@gmail.com>	Wed Apr 27 17:22:38 2022 -0700
committer	Damien Neil <dneil@google.com>	Mon May 16 20:59:57 2022 +0000
tree	f5da4833e87aaf03c87e817c716d04da5203aec0
parent	8a7ba0762cb3b39fc0536379eac2f7fa5796f187 [diff]