encoding/json: avoid work when unquoting strings We can work out how many bytes can be unquoted trivially in rescanLiteral, which already iterates over a string's bytes. Removing the extra loop in unquoteBytes simplifies the function and speeds it up, especially when decoding simple strings, which are common. While at it, we can remove unnecessary checks like len(s)<2 and s[0]=='"'. Add a comment explaining why. name old time/op new time/op delta CodeDecoder-8 11.2ms ± 0% 11.1ms ± 1% -1.63% (p=0.000 n=9+10) name old speed new speed delta CodeDecoder-8 173MB/s ± 0% 175MB/s ± 1% +1.66% (p=0.000 n=9+10) Updates #28923. Change-Id: I2436a3a7f8148a2f7a6a4cdbd7dec6b32ef5e20c Reviewed-on: https://go-review.googlesource.com/c/go/+/151157 Run-TryBot: Daniel Martí <mvdan@mvdan.cc> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>

commit: fe1afe8d4ca06c56f583a9296282697f39d27d97 [log] [tgz]
author: Daniel Martí <mvdan@mvdan.cc> Sun Nov 25 00:24:20 2018 +0000
committer: Daniel Martí <mvdan@mvdan.cc> Mon Apr 22 15:07:38 2019 +0000
tree: 4e526becf7c12061dfbbfdb0eee31fee1cc9f645
parent: d17d41e58d2f69d284398f1d86d93c0f31648b16 [diff]
diff --git a/src/encoding/json/decode.go b/src/encoding/json/decode.go
index 59b6fd1..3c40eb9 100644
--- a/src/encoding/json/decode.go
+++ b/src/encoding/json/decode.go

@@ -273,6 +273,9 @@
 	savedError            error
 	useNumber             bool
 	disallowUnknownFields bool
+	// safeUnquote is the number of current string literal bytes that don't
+	// need to be unquoted. When negative, no bytes need unquoting.
+	safeUnquote int
 }
 
 // readIndex returns the position of the last byte read.
@@ -374,13 +377,27 @@
 Switch:
 	switch data[i-1] {
 	case '"': // string
+		// safeUnquote is initialized at -1, which means that all bytes
+		// checked so far can be unquoted at a later time with no work
+		// at all. When reaching the closing '"', if safeUnquote is
+		// still -1, all bytes can be unquoted with no work. Otherwise,
+		// only those bytes up until the first '\\' or non-ascii rune
+		// can be safely unquoted.
+		safeUnquote := -1
 		for ; i < len(data); i++ {
-			switch data[i] {
-			case '\\':
+			if c := data[i]; c == '\\' {
+				if safeUnquote < 0 { // first unsafe byte
+					safeUnquote = int(i - d.off)
+				}
 				i++ // escaped char
-			case '"':
+			} else if c == '"' {
+				d.safeUnquote = safeUnquote
 				i++ // tokenize the closing quote too
 				break Switch
+			} else if c >= utf8.RuneSelf {
+				if safeUnquote < 0 { // first unsafe byte
+					safeUnquote = int(i - d.off)
+				}
 			}
 		}
 	case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-': // number
@@ -725,7 +742,7 @@
 		start := d.readIndex()
 		d.rescanLiteral()
 		item := d.data[start:d.readIndex()]
-		key, ok := unquoteBytes(item)
+		key, ok := d.unquoteBytes(item)
 		if !ok {
 			panic(phasePanicMsg)
 		}
@@ -922,7 +939,7 @@
 			d.saveError(&UnmarshalTypeError{Value: val, Type: v.Type(), Offset: int64(d.readIndex())})
 			return nil
 		}
-		s, ok := unquoteBytes(item)
+		s, ok := d.unquoteBytes(item)
 		if !ok {
 			if fromQuoted {
 				return fmt.Errorf("json: invalid use of ,string struct tag, trying to unmarshal %q into %v", item, v.Type())
@@ -973,7 +990,7 @@
 		}
 
 	case '"': // string
-		s, ok := unquoteBytes(item)
+		s, ok := d.unquoteBytes(item)
 		if !ok {
 			if fromQuoted {
 				return fmt.Errorf("json: invalid use of ,string struct tag, trying to unmarshal %q into %v", item, v.Type())
@@ -1131,7 +1148,7 @@
 		start := d.readIndex()
 		d.rescanLiteral()
 		item := d.data[start:d.readIndex()]
-		key, ok := unquote(item)
+		key, ok := d.unquote(item)
 		if !ok {
 			panic(phasePanicMsg)
 		}
@@ -1180,7 +1197,7 @@
 		return c == 't'
 
 	case '"': // string
-		s, ok := unquote(item)
+		s, ok := d.unquote(item)
 		if !ok {
 			panic(phasePanicMsg)
 		}
@@ -1223,38 +1240,21 @@
 
 // unquote converts a quoted JSON string literal s into an actual string t.
 // The rules are different than for Go, so cannot use strconv.Unquote.
-func unquote(s []byte) (t string, ok bool) {
-	s, ok = unquoteBytes(s)
+func (d *decodeState) unquote(s []byte) (t string, ok bool) {
+	s, ok = d.unquoteBytes(s)
 	t = string(s)
 	return
 }
 
-func unquoteBytes(s []byte) (t []byte, ok bool) {
-	if len(s) < 2 || s[0] != '"' || s[len(s)-1] != '"' {
-		return
-	}
+func (d *decodeState) unquoteBytes(s []byte) (t []byte, ok bool) {
+	r := d.safeUnquote
+	// The bytes have been scanned, so we know that the first and last bytes
+	// are double quotes.
 	s = s[1 : len(s)-1]
 
-	// Check for unusual characters. If there are none,
-	// then no unquoting is needed, so return a slice of the
-	// original bytes.
-	r := 0
-	for r < len(s) {
-		c := s[r]
-		if c == '\\' || c == '"' || c < ' ' {
-			break
-		}
-		if c < utf8.RuneSelf {
-			r++
-			continue
-		}
-		rr, size := utf8.DecodeRune(s[r:])
-		if rr == utf8.RuneError && size == 1 {
-			break
-		}
-		r += size
-	}
-	if r == len(s) {
+	// If there are no unusual characters, no unquoting is needed, so return
+	// a slice of the original bytes.
+	if r == -1 {
 		return s, true
 	}
commit	fe1afe8d4ca06c56f583a9296282697f39d27d97	[log] [tgz]
author	Daniel Martí <mvdan@mvdan.cc>	Sun Nov 25 00:24:20 2018 +0000
committer	Daniel Martí <mvdan@mvdan.cc>	Mon Apr 22 15:07:38 2019 +0000
tree	4e526becf7c12061dfbbfdb0eee31fee1cc9f645
parent	d17d41e58d2f69d284398f1d86d93c0f31648b16 [diff]