encoding/json: speed up tokenization of literals Decoder.Decode and Unmarshal actually scan the input bytes twice - the first time to check for syntax errors and the length of the value, and the second to perform the decoding. It's in the second scan that we actually tokenize the bytes. Since syntax errors aren't a possibility, we can take shortcuts. In particular, literals such as quoted strings are very common in JSON, so we can avoid a lot of work by special casing them. name old time/op new time/op delta CodeDecoder-8 10.3ms ± 1% 9.1ms ± 0% -11.89% (p=0.002 n=6+6) UnicodeDecoder-8 342ns ± 0% 283ns ± 0% -17.25% (p=0.000 n=6+5) DecoderStream-8 239ns ± 0% 230ns ± 0% -3.90% (p=0.000 n=6+5) CodeUnmarshal-8 11.0ms ± 0% 9.8ms ± 0% -11.45% (p=0.002 n=6+6) CodeUnmarshalReuse-8 10.3ms ± 0% 9.0ms ± 0% -12.72% (p=0.004 n=5+6) UnmarshalString-8 104ns ± 0% 92ns ± 0% -11.35% (p=0.002 n=6+6) UnmarshalFloat64-8 93.2ns ± 0% 87.6ns ± 0% -6.01% (p=0.010 n=6+4) UnmarshalInt64-8 74.5ns ± 0% 71.5ns ± 0% -3.91% (p=0.000 n=5+6) name old speed new speed delta CodeDecoder-8 189MB/s ± 1% 214MB/s ± 0% +13.50% (p=0.002 n=6+6) UnicodeDecoder-8 40.9MB/s ± 0% 49.5MB/s ± 0% +20.96% (p=0.002 n=6+6) CodeUnmarshal-8 176MB/s ± 0% 199MB/s ± 0% +12.93% (p=0.002 n=6+6) Updates #28923. Change-Id: I7a5e2aef51bd4ddf2004aad24210f6f50e01eaeb Reviewed-on: https://go-review.googlesource.com/c/go/+/151042 Run-TryBot: Daniel Martí <mvdan@mvdan.cc> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>

commit: 2cc17bc5f205345ae215806266343d0bb013a861 [log] [tgz]
author: Daniel Martí <mvdan@mvdan.cc> Fri Nov 23 16:56:23 2018 +0000
committer: Daniel Martí <mvdan@mvdan.cc> Wed Apr 03 17:01:44 2019 +0000
tree: c6ffef11849d2cf1b5018f9a1808842b391b2208
parent: 556b3f5bdf935d80d83b26520838852fd840ef70 [diff]
diff --git a/src/encoding/json/decode.go b/src/encoding/json/decode.go
index 3f9fe1f..59b6fd1 100644
--- a/src/encoding/json/decode.go
+++ b/src/encoding/json/decode.go

@@ -361,6 +361,52 @@
 	d.opcode = d.scan.eof()
 }
 
+// rescanLiteral is similar to scanWhile(scanContinue), but it specialises the
+// common case where we're decoding a literal. The decoder scans the input
+// twice, once for syntax errors and to check the length of the value, and the
+// second to perform the decoding.
+//
+// Only in the second step do we use decodeState to tokenize literals, so we
+// know there aren't any syntax errors. We can take advantage of that knowledge,
+// and scan a literal's bytes much more quickly.
+func (d *decodeState) rescanLiteral() {
+	data, i := d.data, d.off
+Switch:
+	switch data[i-1] {
+	case '"': // string
+		for ; i < len(data); i++ {
+			switch data[i] {
+			case '\\':
+				i++ // escaped char
+			case '"':
+				i++ // tokenize the closing quote too
+				break Switch
+			}
+		}
+	case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-': // number
+		for ; i < len(data); i++ {
+			switch data[i] {
+			case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
+				'.', 'e', 'E', '+', '-':
+			default:
+				break Switch
+			}
+		}
+	case 't': // true
+		i += len("rue")
+	case 'f': // false
+		i += len("alse")
+	case 'n': // null
+		i += len("ull")
+	}
+	if i < len(data) {
+		d.opcode = stateEndValue(&d.scan, data[i])
+	} else {
+		d.opcode = scanEnd
+	}
+	d.off = i + 1
+}
+
 // value consumes a JSON value from d.data[d.off-1:], decoding into v, and
 // reads the following byte ahead. If v is invalid, the value is discarded.
 // The first byte of the value has been read already.
@@ -392,7 +438,7 @@
 	case scanBeginLiteral:
 		// All bytes inside literal return scanContinue op code.
 		start := d.readIndex()
-		d.scanWhile(scanContinue)
+		d.rescanLiteral()
 
 		if v.IsValid() {
 			if err := d.literalStore(d.data[start:d.readIndex()], v, false); err != nil {
@@ -677,7 +723,7 @@
 
 		// Read key.
 		start := d.readIndex()
-		d.scanWhile(scanContinue)
+		d.rescanLiteral()
 		item := d.data[start:d.readIndex()]
 		key, ok := unquoteBytes(item)
 		if !ok {
@@ -1083,7 +1129,7 @@
 
 		// Read string key.
 		start := d.readIndex()
-		d.scanWhile(scanContinue)
+		d.rescanLiteral()
 		item := d.data[start:d.readIndex()]
 		key, ok := unquote(item)
 		if !ok {
@@ -1122,7 +1168,7 @@
 func (d *decodeState) literalInterface() interface{} {
 	// All bytes inside literal return scanContinue op code.
 	start := d.readIndex()
-	d.scanWhile(scanContinue)
+	d.rescanLiteral()
 
 	item := d.data[start:d.readIndex()]
commit	2cc17bc5f205345ae215806266343d0bb013a861	[log] [tgz]
author	Daniel Martí <mvdan@mvdan.cc>	Fri Nov 23 16:56:23 2018 +0000
committer	Daniel Martí <mvdan@mvdan.cc>	Wed Apr 03 17:01:44 2019 +0000
tree	c6ffef11849d2cf1b5018f9a1808842b391b2208
parent	556b3f5bdf935d80d83b26520838852fd840ef70 [diff]