bytes: faster Count, Index, Equal
Benchmarks are from GOARCH=amd64 on a MacPro5,1.
benchmark old MB/s new MB/s speedup
bytes_test.BenchmarkEqual32 452.89 891.07 1.97x
bytes_test.BenchmarkEqual4K 852.71 1700.44 1.99x
bytes_test.BenchmarkEqual4M 841.53 1587.93 1.89x
bytes_test.BenchmarkEqual64M 838.22 1578.14 1.88x
bytes_test.BenchmarkIndex32 58.02 48.99 0.84x
bytes_test.BenchmarkIndex4K 48.26 41.32 0.86x
bytes_test.BenchmarkIndex4M 48.20 41.24 0.86x
bytes_test.BenchmarkIndex64M 48.08 41.21 0.86x
bytes_test.BenchmarkIndexEasy32 410.04 546.82 1.33x
bytes_test.BenchmarkIndexEasy4K 849.26 14257.37 16.79x
bytes_test.BenchmarkIndexEasy4M 854.54 17222.15 20.15x
bytes_test.BenchmarkIndexEasy64M 843.57 11060.40 13.11x
bytes_test.BenchmarkCount32 57.24 50.68 0.89x
bytes_test.BenchmarkCount4K 48.19 41.82 0.87x
bytes_test.BenchmarkCount4M 48.18 41.74 0.87x
bytes_test.BenchmarkCount64M 48.17 41.71 0.87x
bytes_test.BenchmarkCountEasy32 433.11 547.44 1.26x
bytes_test.BenchmarkCountEasy4K 1130.59 14194.06 12.55x
bytes_test.BenchmarkCountEasy4M 1131.23 17231.18 15.23x
bytes_test.BenchmarkCountEasy64M 1111.40 11068.88 9.96x
The non-easy Count/Index benchmarks are a worst case input.
regexp.BenchmarkMatchEasy0_32 237.46 221.47 0.93x
regexp.BenchmarkMatchEasy0_1K 553.53 1019.72 1.84x
regexp.BenchmarkMatchEasy0_32K 693.99 1672.06 2.41x
regexp.BenchmarkMatchEasy0_1M 688.72 1611.68 2.34x
regexp.BenchmarkMatchEasy0_32M 680.70 1565.05 2.30x
regexp.BenchmarkMatchEasy1_32 165.56 243.08 1.47x
regexp.BenchmarkMatchEasy1_1K 336.45 496.32 1.48x
regexp.BenchmarkMatchEasy1_32K 302.80 425.63 1.41x
regexp.BenchmarkMatchEasy1_1M 300.42 414.20 1.38x
regexp.BenchmarkMatchEasy1_32M 299.64 413.47 1.38x
R=golang-dev, r, iant
CC=golang-dev
https://golang.org/cl/5451116
diff --git a/src/pkg/bytes/asm_386.s b/src/pkg/bytes/asm_386.s
index f339174..e7833de 100644
--- a/src/pkg/bytes/asm_386.s
+++ b/src/pkg/bytes/asm_386.s
@@ -15,3 +15,19 @@
SUBL $1, DI
MOVL DI, ret+16(FP)
RET
+
+TEXT ·Equal(SB),7,$0
+ MOVL len+4(FP), BX
+ MOVL len1+16(FP), CX
+ MOVL $0, AX
+ CMPL BX, CX
+ JNE eqret
+ MOVL p+0(FP), SI
+ MOVL q+12(FP), DI
+ CLD
+ REP; CMPSB
+ JNE eqret
+ MOVL $1, AX
+eqret:
+ MOVB AX, ret+24(FP)
+ RET
diff --git a/src/pkg/bytes/asm_amd64.s b/src/pkg/bytes/asm_amd64.s
index c6793cb..bc6e886 100644
--- a/src/pkg/bytes/asm_amd64.s
+++ b/src/pkg/bytes/asm_amd64.s
@@ -90,3 +90,19 @@
MOVL DI, ret+24(FP)
RET
+TEXT ·Equal(SB),7,$0
+ MOVL len+8(FP), BX
+ MOVL len1+24(FP), CX
+ MOVL $0, AX
+ MOVL $1, DX
+ CMPL BX, CX
+ JNE eqret
+ MOVQ p+0(FP), SI
+ MOVQ q+16(FP), DI
+ CLD
+ REP; CMPSB
+ CMOVLEQ DX, AX
+eqret:
+ MOVB AX, ret+32(FP)
+ RET
+
diff --git a/src/pkg/bytes/asm_arm.s b/src/pkg/bytes/asm_arm.s
index f32fca1..4ed0c15 100644
--- a/src/pkg/bytes/asm_arm.s
+++ b/src/pkg/bytes/asm_arm.s
@@ -6,3 +6,6 @@
TEXT ·IndexByte(SB),7,$0
B ·indexBytePortable(SB)
+// no memcmp implementation on arm yet
+TEXT ·Equal(SB),7,$0
+ B ·equalPortable(SB)
diff --git a/src/pkg/bytes/bytes.go b/src/pkg/bytes/bytes.go
index 9bfd88f..307c89a 100644
--- a/src/pkg/bytes/bytes.go
+++ b/src/pkg/bytes/bytes.go
@@ -37,7 +37,9 @@
}
// Equal returns a boolean reporting whether a == b.
-func Equal(a, b []byte) bool {
+func Equal(a, b []byte) bool
+
+func equalPortable(a, b []byte) bool {
if len(a) != len(b) {
return false
}
@@ -74,18 +76,33 @@
// Count counts the number of non-overlapping instances of sep in s.
func Count(s, sep []byte) int {
- if len(sep) == 0 {
+ n := len(sep)
+ if n == 0 {
return utf8.RuneCount(s) + 1
}
- c := sep[0]
- n := 0
- for i := 0; i+len(sep) <= len(s); i++ {
- if s[i] == c && (len(sep) == 1 || Equal(s[i:i+len(sep)], sep)) {
- n++
- i += len(sep) - 1
- }
+ if n > len(s) {
+ return 0
}
- return n
+ count := 0
+ c := sep[0]
+ i := 0
+ t := s[:len(s)-n+1]
+ for i < len(t) {
+ if t[i] != c {
+ o := IndexByte(t[i:], c)
+ if o < 0 {
+ break
+ }
+ i += o
+ }
+ if n == 1 || Equal(s[i:i+n], sep) {
+ count++
+ i += n
+ continue
+ }
+ i++
+ }
+ return count
}
// Contains returns whether subslice is within b.
@@ -99,11 +116,27 @@
if n == 0 {
return 0
}
+ if n > len(s) {
+ return -1
+ }
c := sep[0]
- for i := 0; i+n <= len(s); i++ {
- if s[i] == c && (n == 1 || Equal(s[i:i+n], sep)) {
+ if n == 1 {
+ return IndexByte(s, c)
+ }
+ i := 0
+ t := s[:len(s)-n+1]
+ for i < len(t) {
+ if t[i] != c {
+ o := IndexByte(t[i:], c)
+ if o < 0 {
+ break
+ }
+ i += o
+ }
+ if Equal(s[i:i+n], sep) {
return i
}
+ i++
}
return -1
}
diff --git a/src/pkg/bytes/bytes_test.go b/src/pkg/bytes/bytes_test.go
index 829ef05..a2a08c2 100644
--- a/src/pkg/bytes/bytes_test.go
+++ b/src/pkg/bytes/bytes_test.go
@@ -64,13 +64,17 @@
a := []byte(tt.a)
b := []byte(tt.b)
cmp := Compare(a, b)
- eql := Equal(a, b)
if cmp != tt.i {
t.Errorf(`Compare(%q, %q) = %v`, tt.a, tt.b, cmp)
}
+ eql := Equal(a, b)
if eql != (tt.i == 0) {
t.Errorf(`Equal(%q, %q) = %v`, tt.a, tt.b, eql)
}
+ eql = EqualPortable(a, b)
+ if eql != (tt.i == 0) {
+ t.Errorf(`EqualPortable(%q, %q) = %v`, tt.a, tt.b, eql)
+ }
}
}
@@ -264,27 +268,18 @@
}
}
-func BenchmarkIndexByte4K(b *testing.B) { bmIndex(b, IndexByte, 4<<10) }
-
-func BenchmarkIndexByte4M(b *testing.B) { bmIndex(b, IndexByte, 4<<20) }
-
-func BenchmarkIndexByte64M(b *testing.B) { bmIndex(b, IndexByte, 64<<20) }
-
-func BenchmarkIndexBytePortable4K(b *testing.B) {
- bmIndex(b, IndexBytePortable, 4<<10)
-}
-
-func BenchmarkIndexBytePortable4M(b *testing.B) {
- bmIndex(b, IndexBytePortable, 4<<20)
-}
-
-func BenchmarkIndexBytePortable64M(b *testing.B) {
- bmIndex(b, IndexBytePortable, 64<<20)
-}
-
var bmbuf []byte
-func bmIndex(b *testing.B, index func([]byte, byte) int, n int) {
+func BenchmarkIndexByte32(b *testing.B) { bmIndexByte(b, IndexByte, 32) }
+func BenchmarkIndexByte4K(b *testing.B) { bmIndexByte(b, IndexByte, 4<<10) }
+func BenchmarkIndexByte4M(b *testing.B) { bmIndexByte(b, IndexByte, 4<<20) }
+func BenchmarkIndexByte64M(b *testing.B) { bmIndexByte(b, IndexByte, 64<<20) }
+func BenchmarkIndexBytePortable32(b *testing.B) { bmIndexByte(b, IndexBytePortable, 32) }
+func BenchmarkIndexBytePortable4K(b *testing.B) { bmIndexByte(b, IndexBytePortable, 4<<10) }
+func BenchmarkIndexBytePortable4M(b *testing.B) { bmIndexByte(b, IndexBytePortable, 4<<20) }
+func BenchmarkIndexBytePortable64M(b *testing.B) { bmIndexByte(b, IndexBytePortable, 64<<20) }
+
+func bmIndexByte(b *testing.B, index func([]byte, byte) int, n int) {
if len(bmbuf) < n {
bmbuf = make([]byte, n)
}
@@ -298,7 +293,127 @@
panic("bad index")
}
}
- buf[n-1] = '0'
+ buf[n-1] = '\x00'
+}
+
+func BenchmarkEqual32(b *testing.B) { bmEqual(b, Equal, 32) }
+func BenchmarkEqual4K(b *testing.B) { bmEqual(b, Equal, 4<<10) }
+func BenchmarkEqual4M(b *testing.B) { bmEqual(b, Equal, 4<<20) }
+func BenchmarkEqual64M(b *testing.B) { bmEqual(b, Equal, 64<<20) }
+func BenchmarkEqualPort32(b *testing.B) { bmEqual(b, EqualPortable, 32) }
+func BenchmarkEqualPort4K(b *testing.B) { bmEqual(b, EqualPortable, 4<<10) }
+func BenchmarkEqualPortable4M(b *testing.B) { bmEqual(b, EqualPortable, 4<<20) }
+func BenchmarkEqualPortable64M(b *testing.B) { bmEqual(b, EqualPortable, 64<<20) }
+
+func bmEqual(b *testing.B, equal func([]byte, []byte) bool, n int) {
+ if len(bmbuf) < 2*n {
+ bmbuf = make([]byte, 2*n)
+ }
+ b.SetBytes(int64(n))
+ buf1 := bmbuf[0:n]
+ buf2 := bmbuf[n : 2*n]
+ buf1[n-1] = 'x'
+ buf2[n-1] = 'x'
+ for i := 0; i < b.N; i++ {
+ eq := equal(buf1, buf2)
+ if !eq {
+ panic("bad equal")
+ }
+ }
+ buf1[n-1] = '\x00'
+ buf2[n-1] = '\x00'
+}
+
+func BenchmarkIndex32(b *testing.B) { bmIndex(b, Index, 32) }
+func BenchmarkIndex4K(b *testing.B) { bmIndex(b, Index, 4<<10) }
+func BenchmarkIndex4M(b *testing.B) { bmIndex(b, Index, 4<<20) }
+func BenchmarkIndex64M(b *testing.B) { bmIndex(b, Index, 64<<20) }
+
+func bmIndex(b *testing.B, index func([]byte, []byte) int, n int) {
+ if len(bmbuf) < n {
+ bmbuf = make([]byte, n)
+ }
+ b.SetBytes(int64(n))
+ buf := bmbuf[0:n]
+ buf[n-1] = 'x'
+ for i := 0; i < b.N; i++ {
+ j := index(buf, buf[n-7:])
+ if j != n-7 {
+ println("bad index", j)
+ panic("bad index")
+ }
+ }
+ buf[n-1] = '\x00'
+}
+
+func BenchmarkIndexEasy32(b *testing.B) { bmIndexEasy(b, Index, 32) }
+func BenchmarkIndexEasy4K(b *testing.B) { bmIndexEasy(b, Index, 4<<10) }
+func BenchmarkIndexEasy4M(b *testing.B) { bmIndexEasy(b, Index, 4<<20) }
+func BenchmarkIndexEasy64M(b *testing.B) { bmIndexEasy(b, Index, 64<<20) }
+
+func bmIndexEasy(b *testing.B, index func([]byte, []byte) int, n int) {
+ if len(bmbuf) < n {
+ bmbuf = make([]byte, n)
+ }
+ b.SetBytes(int64(n))
+ buf := bmbuf[0:n]
+ buf[n-1] = 'x'
+ buf[n-7] = 'x'
+ for i := 0; i < b.N; i++ {
+ j := index(buf, buf[n-7:])
+ if j != n-7 {
+ println("bad index", j)
+ panic("bad index")
+ }
+ }
+ buf[n-1] = '\x00'
+ buf[n-7] = '\x00'
+}
+
+func BenchmarkCount32(b *testing.B) { bmCount(b, Count, 32) }
+func BenchmarkCount4K(b *testing.B) { bmCount(b, Count, 4<<10) }
+func BenchmarkCount4M(b *testing.B) { bmCount(b, Count, 4<<20) }
+func BenchmarkCount64M(b *testing.B) { bmCount(b, Count, 64<<20) }
+
+func bmCount(b *testing.B, count func([]byte, []byte) int, n int) {
+ if len(bmbuf) < n {
+ bmbuf = make([]byte, n)
+ }
+ b.SetBytes(int64(n))
+ buf := bmbuf[0:n]
+ buf[n-1] = 'x'
+ for i := 0; i < b.N; i++ {
+ j := count(buf, buf[n-7:])
+ if j != 1 {
+ println("bad count", j)
+ panic("bad count")
+ }
+ }
+ buf[n-1] = '\x00'
+}
+
+func BenchmarkCountEasy32(b *testing.B) { bmCountEasy(b, Count, 32) }
+func BenchmarkCountEasy4K(b *testing.B) { bmCountEasy(b, Count, 4<<10) }
+func BenchmarkCountEasy4M(b *testing.B) { bmCountEasy(b, Count, 4<<20) }
+func BenchmarkCountEasy64M(b *testing.B) { bmCountEasy(b, Count, 64<<20) }
+
+func bmCountEasy(b *testing.B, count func([]byte, []byte) int, n int) {
+ if len(bmbuf) < n {
+ bmbuf = make([]byte, n)
+ }
+ b.SetBytes(int64(n))
+ buf := bmbuf[0:n]
+ buf[n-1] = 'x'
+ buf[n-7] = 'x'
+ for i := 0; i < b.N; i++ {
+ j := count(buf, buf[n-7:])
+ if j != 1 {
+ println("bad count", j)
+ panic("bad count")
+ }
+ }
+ buf[n-1] = '\x00'
+ buf[n-7] = '\x00'
}
type ExplodeTest struct {
diff --git a/src/pkg/bytes/export_test.go b/src/pkg/bytes/export_test.go
index b65428d..f61523e 100644
--- a/src/pkg/bytes/export_test.go
+++ b/src/pkg/bytes/export_test.go
@@ -6,3 +6,4 @@
// Export func for testing
var IndexBytePortable = indexBytePortable
+var EqualPortable = equalPortable