bytes: make IndexRune faster

re-implement IndexRune by IndexByte and Index which are well optimized
to get performance gain.

name                  old time/op   new time/op     delta
IndexRune/10-4         53.2ns ± 1%     29.1ns ± 1%    -45.32%  (p=0.008 n=5+5)
IndexRune/32-4          191ns ± 1%       27ns ± 1%    -85.75%  (p=0.008 n=5+5)
IndexRune/4K-4         23.5µs ± 1%      1.0µs ± 1%    -95.77%  (p=0.008 n=5+5)
IndexRune/4M-4         23.8ms ± 0%      1.0ms ± 2%    -95.90%  (p=0.008 n=5+5)
IndexRune/64M-4         384ms ± 1%       15ms ± 1%    -95.98%  (p=0.008 n=5+5)
IndexRuneASCII/10-4    61.5ns ± 0%     10.3ns ± 4%    -83.17%  (p=0.008 n=5+5)
IndexRuneASCII/32-4     203ns ± 0%       11ns ± 5%    -94.68%  (p=0.008 n=5+5)
IndexRuneASCII/4K-4    23.4µs ± 0%      0.3µs ± 2%    -98.60%  (p=0.008 n=5+5)
IndexRuneASCII/4M-4    24.0ms ± 1%      0.3ms ± 1%    -98.60%  (p=0.008 n=5+5)
IndexRuneASCII/64M-4    386ms ± 2%        6ms ± 1%    -98.57%  (p=0.008 n=5+5)

name                  old speed     new speed       delta
IndexRune/10-4        188MB/s ± 1%    344MB/s ± 1%    +82.91%  (p=0.008 n=5+5)
IndexRune/32-4        167MB/s ± 0%   1175MB/s ± 1%   +603.52%  (p=0.008 n=5+5)
IndexRune/4K-4        174MB/s ± 1%   4117MB/s ± 1%  +2262.71%  (p=0.008 n=5+5)
IndexRune/4M-4        176MB/s ± 0%   4299MB/s ± 2%  +2340.46%  (p=0.008 n=5+5)
IndexRune/64M-4       175MB/s ± 1%   4354MB/s ± 1%  +2388.57%  (p=0.008 n=5+5)
IndexRuneASCII/10-4   163MB/s ± 0%    968MB/s ± 4%   +494.66%  (p=0.008 n=5+5)
IndexRuneASCII/32-4   157MB/s ± 0%   2974MB/s ± 4%  +1788.59%  (p=0.008 n=5+5)
IndexRuneASCII/4K-4   175MB/s ± 0%  12481MB/s ± 2%  +7027.71%  (p=0.008 n=5+5)
IndexRuneASCII/4M-4   175MB/s ± 1%  12510MB/s ± 1%  +7061.15%  (p=0.008 n=5+5)
IndexRuneASCII/64M-4  174MB/s ± 2%  12143MB/s ± 1%  +6881.70%  (p=0.008 n=5+5)

Change-Id: I0632eadb83937c2a9daa7f0ce79df1dee64f992e
Reviewed-on: https://go-review.googlesource.com/28537
Run-TryBot: Brad Fitzpatrick <bradfitz@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
diff --git a/src/bytes/bytes.go b/src/bytes/bytes.go
index c35a1c0..3286ca3 100644
--- a/src/bytes/bytes.go
+++ b/src/bytes/bytes.go
@@ -131,14 +131,12 @@
 // It returns the byte index of the first occurrence in s of the given rune.
 // It returns -1 if rune is not present in s.
 func IndexRune(s []byte, r rune) int {
-	for i := 0; i < len(s); {
-		r1, size := utf8.DecodeRune(s[i:])
-		if r == r1 {
-			return i
-		}
-		i += size
+	if r < utf8.RuneSelf {
+		return IndexByte(s, byte(r))
 	}
-	return -1
+	var b [utf8.UTFMax]byte
+	n := utf8.EncodeRune(b[:], r)
+	return Index(s, b[:n])
 }
 
 // IndexAny interprets s as a sequence of UTF-8-encoded Unicode code points.
diff --git a/src/bytes/bytes_amd64.go b/src/bytes/bytes_amd64.go
index e8be28b..b683e67 100644
--- a/src/bytes/bytes_amd64.go
+++ b/src/bytes/bytes_amd64.go
@@ -4,6 +4,8 @@
 
 package bytes
 
+//go:noescape
+
 // indexShortStr returns the index of the first instance of c in s, or -1 if c is not present in s.
 // indexShortStr requires 2 <= len(c) <= shortStringLen
 func indexShortStr(s, c []byte) int // ../runtime/asm_$GOARCH.s
diff --git a/src/bytes/bytes_test.go b/src/bytes/bytes_test.go
index c48f662..e9a022b 100644
--- a/src/bytes/bytes_test.go
+++ b/src/bytes/bytes_test.go
@@ -354,6 +354,20 @@
 			t.Errorf(`IndexRune(%q, '%c') = %v`, tt.a, r, pos)
 		}
 	}
+
+	haystack := []byte("test世界")
+
+	allocs := testing.AllocsPerRun(1000, func() {
+		if i := IndexRune(haystack, 's'); i != 2 {
+			t.Fatalf("'s' at %d; want 2", i)
+		}
+		if i := IndexRune(haystack, '世'); i != 4 {
+			t.Fatalf("'世' at %d; want 4", i)
+		}
+	})
+	if allocs != 0 {
+		t.Errorf(`expected no allocations, got %f`, allocs)
+	}
 }
 
 var bmbuf []byte
@@ -404,6 +418,44 @@
 	}
 }
 
+func BenchmarkIndexRune(b *testing.B) {
+	benchBytes(b, indexSizes, bmIndexRune(IndexRune))
+}
+
+func BenchmarkIndexRuneASCII(b *testing.B) {
+	benchBytes(b, indexSizes, bmIndexRuneASCII(IndexRune))
+}
+
+func bmIndexRuneASCII(index func([]byte, rune) int) func(b *testing.B, n int) {
+	return func(b *testing.B, n int) {
+		buf := bmbuf[0:n]
+		buf[n-1] = 'x'
+		for i := 0; i < b.N; i++ {
+			j := index(buf, 'x')
+			if j != n-1 {
+				b.Fatal("bad index", j)
+			}
+		}
+		buf[n-1] = '\x00'
+	}
+}
+
+func bmIndexRune(index func([]byte, rune) int) func(b *testing.B, n int) {
+	return func(b *testing.B, n int) {
+		buf := bmbuf[0:n]
+		utf8.EncodeRune(buf[n-3:], '世')
+		for i := 0; i < b.N; i++ {
+			j := index(buf, '世')
+			if j != n-3 {
+				b.Fatal("bad index", j)
+			}
+		}
+		buf[n-3] = '\x00'
+		buf[n-2] = '\x00'
+		buf[n-1] = '\x00'
+	}
+}
+
 func BenchmarkEqual(b *testing.B) {
 	b.Run("0", func(b *testing.B) {
 		var buf [4]byte