bytes: Use the same algorithm as strings for Index
name old time/op new time/op delta
IndexByte32-48 9.05ns ± 7% 9.59ns ±11% +5.93% (p=0.001 n=19+20)
IndexByte4K-48 118ns ± 4% 122ns ± 8% +3.52% (p=0.002 n=19+19)
IndexByte4M-48 172µs ±13% 188µs ±12% +9.49% (p=0.000 n=20+20)
IndexByte64M-48 8.00ms ±14% 8.05ms ±23% ~ (p=0.799 n=20+20)
IndexBytePortable32-48 41.7ns ±15% 42.5ns ±12% ~ (p=0.372 n=20+20)
IndexBytePortable4K-48 3.08µs ±16% 3.26µs ±10% +5.77% (p=0.018 n=20+20)
IndexBytePortable4M-48 3.12ms ±17% 3.20ms ±10% ~ (p=0.157 n=20+20)
IndexBytePortable64M-48 54.0ms ±14% 55.3ms ±14% ~ (p=0.640 n=20+20)
Index32-48 230ns ±12% 46ns ± 6% -79.87% (p=0.000 n=20+19)
Index4K-48 43.2µs ± 9% 3.2µs ±12% -92.58% (p=0.000 n=19+20)
Index4M-48 44.4ms ± 7% 3.3ms ±13% -92.59% (p=0.000 n=19+20)
Index64M-48 714ms ±10% 56ms ± 8% -92.22% (p=0.000 n=19+19)
IndexEasy32-48 52.7ns ±10% 31.0ns ±11% -41.21% (p=0.000 n=20+20)
IndexEasy4K-48 139ns ± 5% 1598ns ± 6% +1046.37% (p=0.000 n=19+19)
IndexEasy4M-48 179µs ± 8% 1674µs ±10% +834.31% (p=0.000 n=19+20)
IndexEasy64M-48 8.56ms ±10% 27.82ms ±16% +225.14% (p=0.000 n=19+20)
name old speed new speed delta
IndexByte32-48 3.52GB/s ± 7% 3.35GB/s ±11% -4.99% (p=0.001 n=20+20)
IndexByte4K-48 34.5GB/s ± 7% 33.2GB/s ±10% -3.67% (p=0.002 n=20+20)
IndexByte4M-48 24.6GB/s ±14% 22.4GB/s ±14% -8.73% (p=0.000 n=20+20)
IndexByte64M-48 8.42GB/s ±16% 8.42GB/s ±19% ~ (p=0.799 n=20+20)
IndexBytePortable32-48 770MB/s ±13% 756MB/s ±11% ~ (p=0.383 n=20+20)
IndexBytePortable4K-48 1.34GB/s ±14% 1.26GB/s ±10% -5.76% (p=0.018 n=20+20)
IndexBytePortable4M-48 1.35GB/s ±15% 1.31GB/s ±11% ~ (p=0.157 n=20+20)
IndexBytePortable64M-48 1.25GB/s ±16% 1.22GB/s ±13% ~ (p=0.640 n=20+20)
Index32-48 138MB/s ± 8% 687MB/s ± 8% +398.57% (p=0.000 n=19+20)
Index4K-48 94.9MB/s ± 9% 1280.5MB/s ±11% +1249.11% (p=0.000 n=19+20)
Index4M-48 94.6MB/s ± 7% 1278.5MB/s ±12% +1250.99% (p=0.000 n=19+20)
Index64M-48 94.2MB/s ±10% 1210.9MB/s ± 8% +1185.04% (p=0.000 n=19+19)
IndexEasy32-48 608MB/s ±10% 1035MB/s ±10% +70.15% (p=0.000 n=20+20)
IndexEasy4K-48 29.3GB/s ± 6% 2.6GB/s ± 6% -91.24% (p=0.000 n=19+19)
IndexEasy4M-48 23.3GB/s ±10% 2.5GB/s ± 9% -89.23% (p=0.000 n=20+20)
IndexEasy64M-48 7.86GB/s ±11% 2.42GB/s ±14% -69.18% (p=0.000 n=19+20)
Change-Id: Ia191f0a6ca80e113397d9ed98d25f195768b65bc
Reviewed-on: https://go-review.googlesource.com/22550
Run-TryBot: Ilya Tocar <ilya.tocar@intel.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
diff --git a/src/bytes/bytes.go b/src/bytes/bytes.go
index 305c85d..c35a1c0 100644
--- a/src/bytes/bytes.go
+++ b/src/bytes/bytes.go
@@ -93,37 +93,6 @@
return IndexRune(b, r) >= 0
}
-// Index returns the index of the first instance of sep in s, or -1 if sep is not present in s.
-func Index(s, sep []byte) int {
- n := len(sep)
- if n == 0 {
- return 0
- }
- if n > len(s) {
- return -1
- }
- c := sep[0]
- if n == 1 {
- return IndexByte(s, c)
- }
- i := 0
- t := s[:len(s)-n+1]
- for i < len(t) {
- if t[i] != c {
- o := IndexByte(t[i:], c)
- if o < 0 {
- break
- }
- i += o
- }
- if Equal(s[i:i+n], sep) {
- return i
- }
- i++
- }
- return -1
-}
-
func indexBytePortable(s []byte, c byte) int {
for i, b := range s {
if b == c {
diff --git a/src/bytes/bytes_amd64.go b/src/bytes/bytes_amd64.go
new file mode 100644
index 0000000..e8be28b
--- /dev/null
+++ b/src/bytes/bytes_amd64.go
@@ -0,0 +1,69 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package bytes
+
+// indexShortStr returns the index of the first instance of c in s, or -1 if c is not present in s.
+// indexShortStr requires 2 <= len(c) <= shortStringLen
+func indexShortStr(s, c []byte) int // ../runtime/asm_$GOARCH.s
+const shortStringLen = 31
+
+// Index returns the index of the first instance of sep in s, or -1 if sep is not present in s.
+func Index(s, sep []byte) int {
+ n := len(sep)
+ switch {
+ case n == 0:
+ return 0
+ case n == 1:
+ return IndexByte(s, sep[0])
+ case n <= shortStringLen:
+ return indexShortStr(s, sep)
+ case n == len(s):
+ if Equal(sep, s) {
+ return 0
+ }
+ return -1
+ case n > len(s):
+ return -1
+ }
+ // Rabin-Karp search
+ hashsep, pow := hashStr(sep)
+ var h uint32
+ for i := 0; i < n; i++ {
+ h = h*primeRK + uint32(s[i])
+ }
+ if h == hashsep && Equal(s[:n], sep) {
+ return 0
+ }
+ for i := n; i < len(s); {
+ h *= primeRK
+ h += uint32(s[i])
+ h -= pow * uint32(s[i-n])
+ i++
+ if h == hashsep && Equal(s[i-n:i], sep) {
+ return i - n
+ }
+ }
+ return -1
+}
+
+// primeRK is the prime base used in Rabin-Karp algorithm.
+const primeRK = 16777619
+
+// hashStr returns the hash and the appropriate multiplicative
+// factor for use in Rabin-Karp algorithm.
+func hashStr(sep []byte) (uint32, uint32) {
+ hash := uint32(0)
+ for i := 0; i < len(sep); i++ {
+ hash = hash*primeRK + uint32(sep[i])
+ }
+ var pow, sq uint32 = 1, primeRK
+ for i := len(sep); i > 0; i >>= 1 {
+ if i&1 != 0 {
+ pow *= sq
+ }
+ sq *= sq
+ }
+ return hash, pow
+}
diff --git a/src/bytes/bytes_generic.go b/src/bytes/bytes_generic.go
new file mode 100644
index 0000000..88e232e
--- /dev/null
+++ b/src/bytes/bytes_generic.go
@@ -0,0 +1,41 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !amd64
+
+package bytes
+
+// TODO: implements short string optimization on non amd64 platforms
+// and get rid of bytes_amd64.go
+
+// Index returns the index of the first instance of sep in s, or -1 if sep is not present in s.
+func Index(s, sep []byte) int {
+ n := len(sep)
+ if n == 0 {
+ return 0
+ }
+ if n > len(s) {
+ return -1
+ }
+ c := sep[0]
+ if n == 1 {
+ return IndexByte(s, c)
+ }
+ i := 0
+ t := s[:len(s)-n+1]
+ for i < len(t) {
+ if t[i] != c {
+ o := IndexByte(t[i:], c)
+ if o < 0 {
+ break
+ }
+ i += o
+ }
+ if Equal(s[i:i+n], sep) {
+ return i
+ }
+ i++
+ }
+ return -1
+}
diff --git a/src/runtime/asm_amd64.s b/src/runtime/asm_amd64.s
index f44fc11..c9d6b90 100644
--- a/src/runtime/asm_amd64.s
+++ b/src/runtime/asm_amd64.s
@@ -1695,13 +1695,31 @@
JMP loop
-// TODO: Also use this in bytes.Index
TEXT strings·indexShortStr(SB),NOSPLIT,$0-40
MOVQ s+0(FP), DI
// We want len in DX and AX, because PCMPESTRI implicitly consumes them
MOVQ s_len+8(FP), DX
MOVQ c+16(FP), BP
MOVQ c_len+24(FP), AX
+ MOVQ DI, R10
+ LEAQ ret+32(FP), R11
+ JMP runtime·indexShortStr(SB)
+
+TEXT bytes·indexShortStr(SB),NOSPLIT,$0-56
+ MOVQ s+0(FP), DI
+ MOVQ s_len+8(FP), DX
+ MOVQ c+24(FP), BP
+ MOVQ c_len+32(FP), AX
+ MOVQ DI, R10
+ LEAQ ret+48(FP), R11
+ JMP runtime·indexShortStr(SB)
+
+// AX: length of string, that we are searching for
+// DX: length of string, in which we are searching
+// DI: pointer to string, in which we are searching
+// BP: pointer to string, that we are searching for
+// R11: address, where to put return value
+TEXT runtime·indexShortStr(SB),NOSPLIT,$0
CMPQ AX, DX
JA fail
CMPQ DX, $16
@@ -1853,7 +1871,7 @@
CMPQ DI,DX
JB loop17to31
fail:
- MOVQ $-1, ret+32(FP)
+ MOVQ $-1, (R11)
RET
sse42:
MOVL runtime·cpuid_ecx(SB), CX
@@ -1893,8 +1911,8 @@
sse42_success:
ADDQ CX, DI
success:
- SUBQ s+0(FP), DI
- MOVQ DI, ret+32(FP)
+ SUBQ R10, DI
+ MOVQ DI, (R11)
RET