runtime: Implement faster equals for strings and bytes.
(amd64)
benchmark old ns/op new ns/op delta
BenchmarkEqual0 16 6 -63.15%
BenchmarkEqual9 22 7 -65.37%
BenchmarkEqual32 36 9 -74.91%
BenchmarkEqual4K 2187 120 -94.51%
benchmark old MB/s new MB/s speedup
BenchmarkEqual9 392.22 1134.38 2.89x
BenchmarkEqual32 866.72 3457.39 3.99x
BenchmarkEqual4K 1872.73 33998.87 18.15x
(386)
benchmark old ns/op new ns/op delta
BenchmarkEqual0 16 5 -63.85%
BenchmarkEqual9 22 7 -67.84%
BenchmarkEqual32 34 12 -64.94%
BenchmarkEqual4K 2196 113 -94.85%
benchmark old MB/s new MB/s speedup
BenchmarkEqual9 405.81 1260.18 3.11x
BenchmarkEqual32 919.55 2631.21 2.86x
BenchmarkEqual4K 1864.85 36072.54 19.34x
Update #3751
R=bradfitz, r, khr, dave, remyoudompheng, fullung, minux.ma, ality
CC=golang-dev
https://golang.org/cl/8056043
diff --git a/src/pkg/bytes/asm_386.s b/src/pkg/bytes/asm_386.s
index 997738f..27cd4e7 100644
--- a/src/pkg/bytes/asm_386.s
+++ b/src/pkg/bytes/asm_386.s
@@ -15,19 +15,3 @@
SUBL $1, DI
MOVL DI, ret+16(FP)
RET
-
-TEXT ·Equal(SB),7,$0
- MOVL a_len+4(FP), BX
- MOVL b_len+16(FP), CX
- MOVL $0, AX
- CMPL BX, CX
- JNE eqret
- MOVL a+0(FP), SI
- MOVL b+12(FP), DI
- CLD
- REP; CMPSB
- JNE eqret
- MOVL $1, AX
-eqret:
- MOVB AX, ret+24(FP)
- RET
diff --git a/src/pkg/bytes/asm_amd64.s b/src/pkg/bytes/asm_amd64.s
index b8f9f1b..b84957b 100644
--- a/src/pkg/bytes/asm_amd64.s
+++ b/src/pkg/bytes/asm_amd64.s
@@ -89,20 +89,3 @@
SUBL $1, DI
MOVQ DI, ret+32(FP)
RET
-
-TEXT ·Equal(SB),7,$0
- MOVQ a_len+8(FP), BX
- MOVQ b_len+32(FP), CX
- MOVL $0, AX
- CMPQ BX, CX
- JNE eqret
- MOVQ a+0(FP), SI
- MOVQ b+24(FP), DI
- CLD
- REP; CMPSB
- MOVL $1, DX
- CMOVLEQ DX, AX
-eqret:
- MOVB AX, ret+48(FP)
- RET
-
diff --git a/src/pkg/bytes/bytes_decl.go b/src/pkg/bytes/bytes_decl.go
index ce78be4..fbf9282 100644
--- a/src/pkg/bytes/bytes_decl.go
+++ b/src/pkg/bytes/bytes_decl.go
@@ -13,4 +13,4 @@
// Equal returns a boolean reporting whether a == b.
// A nil argument is equivalent to an empty slice.
-func Equal(a, b []byte) bool // asm_$GOARCH.s
+func Equal(a, b []byte) bool // asm_arm.s or ../runtime/asm_{386,amd64}.s
diff --git a/src/pkg/bytes/bytes_test.go b/src/pkg/bytes/bytes_test.go
index 1d6274c..d296224 100644
--- a/src/pkg/bytes/bytes_test.go
+++ b/src/pkg/bytes/bytes_test.go
@@ -61,6 +61,10 @@
{[]byte("ab"), []byte("x"), -1},
{[]byte("x"), []byte("a"), 1},
{[]byte("b"), []byte("x"), -1},
+ // test runtime·memeq's chunked implementation
+ {[]byte("abcdefgh"), []byte("abcdefgh"), 0},
+ {[]byte("abcdefghi"), []byte("abcdefghi"), 0},
+ {[]byte("abcdefghi"), []byte("abcdefghj"), -1},
// nil tests
{nil, nil, 0},
{[]byte(""), nil, 0},
@@ -86,6 +90,58 @@
}
}
+func TestEqual(t *testing.T) {
+ var size = 128
+ if testing.Short() {
+ size = 32
+ }
+ a := make([]byte, size)
+ b := make([]byte, size)
+ b_init := make([]byte, size)
+ // randomish but deterministic data
+ for i := 0; i < size; i++ {
+ a[i] = byte(17 * i)
+ b_init[i] = byte(23*i + 100)
+ }
+
+ for len := 0; len <= size; len++ {
+ for x := 0; x <= size-len; x++ {
+ for y := 0; y <= size-len; y++ {
+ copy(b, b_init)
+ copy(b[y:y+len], a[x:x+len])
+ if !Equal(a[x:x+len], b[y:y+len]) || !Equal(b[y:y+len], a[x:x+len]) {
+ t.Errorf("Equal(%d, %d, %d) = false", len, x, y)
+ }
+ }
+ }
+ }
+}
+
+// make sure Equal returns false for minimally different strings. The data
+// is all zeros except for a single one in one location.
+func TestNotEqual(t *testing.T) {
+ var size = 128
+ if testing.Short() {
+ size = 32
+ }
+ a := make([]byte, size)
+ b := make([]byte, size)
+
+ for len := 0; len <= size; len++ {
+ for x := 0; x <= size-len; x++ {
+ for y := 0; y <= size-len; y++ {
+ for diffpos := x; diffpos < x+len; diffpos++ {
+ a[diffpos] = 1
+ if Equal(a[x:x+len], b[y:y+len]) || Equal(b[y:y+len], a[x:x+len]) {
+ t.Errorf("NotEqual(%d, %d, %d, %d) = true", len, x, y, diffpos)
+ }
+ a[diffpos] = 0
+ }
+ }
+ }
+ }
+}
+
var indexTests = []BinOpTest{
{"", "", 0},
{"", "a", -1},
@@ -303,10 +359,30 @@
buf[n-1] = '\x00'
}
+func BenchmarkEqual0(b *testing.B) {
+ var buf [4]byte
+ buf1 := buf[0:0]
+ buf2 := buf[1:1]
+ for i := 0; i < b.N; i++ {
+ eq := Equal(buf1, buf2)
+ if !eq {
+ b.Fatal("bad equal")
+ }
+ }
+}
+
+func BenchmarkEqual1(b *testing.B) { bmEqual(b, Equal, 1) }
+func BenchmarkEqual6(b *testing.B) { bmEqual(b, Equal, 6) }
+func BenchmarkEqual9(b *testing.B) { bmEqual(b, Equal, 9) }
+func BenchmarkEqual15(b *testing.B) { bmEqual(b, Equal, 15) }
+func BenchmarkEqual16(b *testing.B) { bmEqual(b, Equal, 16) }
+func BenchmarkEqual20(b *testing.B) { bmEqual(b, Equal, 20) }
func BenchmarkEqual32(b *testing.B) { bmEqual(b, Equal, 32) }
func BenchmarkEqual4K(b *testing.B) { bmEqual(b, Equal, 4<<10) }
func BenchmarkEqual4M(b *testing.B) { bmEqual(b, Equal, 4<<20) }
func BenchmarkEqual64M(b *testing.B) { bmEqual(b, Equal, 64<<20) }
+func BenchmarkEqualPort1(b *testing.B) { bmEqual(b, EqualPortable, 1) }
+func BenchmarkEqualPort6(b *testing.B) { bmEqual(b, EqualPortable, 6) }
func BenchmarkEqualPort32(b *testing.B) { bmEqual(b, EqualPortable, 32) }
func BenchmarkEqualPort4K(b *testing.B) { bmEqual(b, EqualPortable, 4<<10) }
func BenchmarkEqualPortable4M(b *testing.B) { bmEqual(b, EqualPortable, 4<<20) }
diff --git a/src/pkg/bytes/equal_test.go b/src/pkg/bytes/equal_test.go
new file mode 100644
index 0000000..a393d5e
--- /dev/null
+++ b/src/pkg/bytes/equal_test.go
@@ -0,0 +1,45 @@
+// Copyright 2013 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+//
+// +build linux
+
+package bytes_test
+
+import (
+ . "bytes"
+ "syscall"
+ "testing"
+ "unsafe"
+)
+
+// This file tests the situation where memeq is checking
+// data very near to a page boundary. We want to make sure
+// equal does not read across the boundary and cause a page
+// fault where it shouldn't.
+
+// This test runs only on linux. The code being tested is
+// not OS-specific, so it does not need to be tested on all
+// operating systems.
+
+func TestEqualNearPageBoundary(t *testing.T) {
+ pagesize := syscall.Getpagesize()
+ b := make([]byte, 4*pagesize)
+ i := pagesize
+ for ; uintptr(unsafe.Pointer(&b[i]))%uintptr(pagesize) != 0; i++ {
+ }
+ syscall.Mprotect(b[i-pagesize:i], 0)
+ syscall.Mprotect(b[i+pagesize:i+2*pagesize], 0)
+
+ // both of these should fault
+ //pagesize += int(b[i-1])
+ //pagesize += int(b[i+pagesize])
+
+ for j := 0; j < pagesize; j++ {
+ b[i+j] = 'A'
+ }
+ for j := 0; j <= pagesize; j++ {
+ Equal(b[i:i+j], b[i+pagesize-j:i+pagesize])
+ Equal(b[i+pagesize-j:i+pagesize], b[i:i+j])
+ }
+}
diff --git a/src/pkg/runtime/alg.c b/src/pkg/runtime/alg.c
index 2dc8212..a78d978 100644
--- a/src/pkg/runtime/alg.c
+++ b/src/pkg/runtime/alg.c
@@ -37,25 +37,11 @@
void
runtime·memequal(bool *eq, uintptr s, void *a, void *b)
{
- byte *ba, *bb, *aend;
-
if(a == b) {
*eq = 1;
return;
}
- ba = a;
- bb = b;
- aend = ba+s;
- while(ba != aend) {
- if(*ba != *bb) {
- *eq = 0;
- return;
- }
- ba++;
- bb++;
- }
- *eq = 1;
- return;
+ *eq = runtime·memeq(a, b, s);
}
void
@@ -323,6 +309,7 @@
runtime·strequal(bool *eq, uintptr s, void *a, void *b)
{
intgo alen;
+ byte *s1, *s2;
USED(s);
alen = ((String*)a)->len;
@@ -330,11 +317,13 @@
*eq = false;
return;
}
- if(((String*)a)->str == ((String*)b)->str) {
+ s1 = ((String*)a)->str;
+ s2 = ((String*)b)->str;
+ if(s1 == s2) {
*eq = true;
return;
}
- runtime·memequal(eq, alen, ((String*)a)->str, ((String*)b)->str);
+ *eq = runtime·memeq(s1, s2, alen);
}
void
diff --git a/src/pkg/runtime/asm_386.s b/src/pkg/runtime/asm_386.s
index 57de87b..531057f 100644
--- a/src/pkg/runtime/asm_386.s
+++ b/src/pkg/runtime/asm_386.s
@@ -986,3 +986,118 @@
LONG $0x0c0b0a09
LONG $0xff0f0e0d
+TEXT runtime·memeq(SB),7,$0
+ MOVL a+0(FP), SI
+ MOVL b+4(FP), DI
+ MOVL count+8(FP), BX
+ JMP runtime·memeqbody(SB)
+
+
+TEXT bytes·Equal(SB),7,$0
+ MOVL a_len+4(FP), BX
+ MOVL b_len+16(FP), CX
+ XORL AX, AX
+ CMPL BX, CX
+ JNE eqret
+ MOVL a+0(FP), SI
+ MOVL b+12(FP), DI
+ CALL runtime·memeqbody(SB)
+eqret:
+ MOVB AX, ret+24(FP)
+ RET
+
+// a in SI
+// b in DI
+// count in BX
+TEXT runtime·memeqbody(SB),7,$0
+ XORL AX, AX
+
+ CMPL BX, $4
+ JB small
+
+ // 64 bytes at a time using xmm registers
+hugeloop:
+ CMPL BX, $64
+ JB bigloop
+ TESTL $0x4000000, runtime·cpuid_edx(SB) // check for sse2
+ JE bigloop
+ MOVOU (SI), X0
+ MOVOU (DI), X1
+ MOVOU 16(SI), X2
+ MOVOU 16(DI), X3
+ MOVOU 32(SI), X4
+ MOVOU 32(DI), X5
+ MOVOU 48(SI), X6
+ MOVOU 48(DI), X7
+ PCMPEQB X1, X0
+ PCMPEQB X3, X2
+ PCMPEQB X5, X4
+ PCMPEQB X7, X6
+ PAND X2, X0
+ PAND X6, X4
+ PAND X4, X0
+ PMOVMSKB X0, DX
+ ADDL $64, SI
+ ADDL $64, DI
+ SUBL $64, BX
+ CMPL DX, $0xffff
+ JEQ hugeloop
+ RET
+
+ // 4 bytes at a time using 32-bit register
+bigloop:
+ CMPL BX, $4
+ JBE leftover
+ MOVL (SI), CX
+ MOVL (DI), DX
+ ADDL $4, SI
+ ADDL $4, DI
+ SUBL $4, BX
+ CMPL CX, DX
+ JEQ bigloop
+ RET
+
+ // remaining 0-4 bytes
+leftover:
+ MOVL -4(SI)(BX*1), CX
+ MOVL -4(DI)(BX*1), DX
+ CMPL CX, DX
+ SETEQ AX
+ RET
+
+small:
+ CMPL BX, $0
+ JEQ equal
+
+ LEAL 0(BX*8), CX
+ NEGL CX
+
+ MOVL SI, DX
+ CMPB DX, $0xfc
+ JA si_high
+
+ // load at SI won't cross a page boundary.
+ MOVL (SI), SI
+ JMP si_finish
+si_high:
+ // address ends in 111111xx. Load up to bytes we want, move to correct position.
+ MOVL -4(SI)(BX*1), SI
+ SHRL CX, SI
+si_finish:
+
+ // same for DI.
+ MOVL DI, DX
+ CMPB DX, $0xfc
+ JA di_high
+ MOVL (DI), DI
+ JMP di_finish
+di_high:
+ MOVL -4(DI)(BX*1), DI
+ SHRL CX, DI
+di_finish:
+
+ SUBL SI, DI
+ SHLL CX, DI
+equal:
+ SETEQ AX
+ RET
diff --git a/src/pkg/runtime/asm_amd64.s b/src/pkg/runtime/asm_amd64.s
index af2064ff..0dee155 100644
--- a/src/pkg/runtime/asm_amd64.s
+++ b/src/pkg/runtime/asm_amd64.s
@@ -907,3 +907,115 @@
QUAD $0xffff0f0e0d0c0b0a
QUAD $0x0807060504030201
QUAD $0xff0f0e0d0c0b0a09
+
+TEXT runtime·memeq(SB),7,$0
+ MOVQ a+0(FP), SI
+ MOVQ b+8(FP), DI
+ MOVQ count+16(FP), BX
+ JMP runtime·memeqbody(SB)
+
+
+TEXT bytes·Equal(SB),7,$0
+ MOVQ a_len+8(FP), BX
+ MOVQ b_len+32(FP), CX
+ XORQ AX, AX
+ CMPQ BX, CX
+ JNE eqret
+ MOVQ a+0(FP), SI
+ MOVQ b+24(FP), DI
+ CALL runtime·memeqbody(SB)
+eqret:
+ MOVB AX, ret+48(FP)
+ RET
+
+// a in SI
+// b in DI
+// count in BX
+TEXT runtime·memeqbody(SB),7,$0
+ XORQ AX, AX
+
+ CMPQ BX, $8
+ JB small
+
+ // 64 bytes at a time using xmm registers
+hugeloop:
+ CMPQ BX, $64
+ JB bigloop
+ MOVOU (SI), X0
+ MOVOU (DI), X1
+ MOVOU 16(SI), X2
+ MOVOU 16(DI), X3
+ MOVOU 32(SI), X4
+ MOVOU 32(DI), X5
+ MOVOU 48(SI), X6
+ MOVOU 48(DI), X7
+ PCMPEQB X1, X0
+ PCMPEQB X3, X2
+ PCMPEQB X5, X4
+ PCMPEQB X7, X6
+ PAND X2, X0
+ PAND X6, X4
+ PAND X4, X0
+ PMOVMSKB X0, DX
+ ADDQ $64, SI
+ ADDQ $64, DI
+ SUBQ $64, BX
+ CMPL DX, $0xffff
+ JEQ hugeloop
+ RET
+
+ // 8 bytes at a time using 64-bit register
+bigloop:
+ CMPQ BX, $8
+ JBE leftover
+ MOVQ (SI), CX
+ MOVQ (DI), DX
+ ADDQ $8, SI
+ ADDQ $8, DI
+ SUBQ $8, BX
+ CMPQ CX, DX
+ JEQ bigloop
+ RET
+
+ // remaining 0-8 bytes
+leftover:
+ MOVQ -8(SI)(BX*1), CX
+ MOVQ -8(DI)(BX*1), DX
+ CMPQ CX, DX
+ SETEQ AX
+ RET
+
+small:
+ CMPQ BX, $0
+ JEQ equal
+
+ LEAQ 0(BX*8), CX
+ NEGQ CX
+
+ CMPB SI, $0xf8
+ JA si_high
+
+ // load at SI won't cross a page boundary.
+ MOVQ (SI), SI
+ JMP si_finish
+si_high:
+ // address ends in 11111xxx. Load up to bytes we want, move to correct position.
+ MOVQ -8(SI)(BX*1), SI
+ SHRQ CX, SI
+si_finish:
+
+ // same for DI.
+ CMPB DI, $0xf8
+ JA di_high
+ MOVQ (DI), DI
+ JMP di_finish
+di_high:
+ MOVQ -8(DI)(BX*1), DI
+ SHRQ CX, DI
+di_finish:
+
+ SUBQ SI, DI
+ SHLQ CX, DI
+equal:
+ SETEQ AX
+ RET
diff --git a/src/pkg/runtime/asm_arm.s b/src/pkg/runtime/asm_arm.s
index e544933..ee9acb7 100644
--- a/src/pkg/runtime/asm_arm.s
+++ b/src/pkg/runtime/asm_arm.s
@@ -487,7 +487,7 @@
MOVW R2, limit+4(FP)
RET
-// not implemented for ARM
+// AES hashing not implemented for ARM
TEXT runtime·aeshash(SB),7,$-4
MOVW $0, R0
MOVW (R0), R1
@@ -500,3 +500,20 @@
TEXT runtime·aeshashstr(SB),7,$-4
MOVW $0, R0
MOVW (R0), R1
+
+TEXT runtime·memeq(SB),7,$-4
+ MOVW a+0(FP), R1
+ MOVW b+4(FP), R2
+ MOVW n+8(FP), R3
+ ADD R1, R3, R6
+ MOVW $1, R0
+_next:
+ CMP R1, R6
+ RET.EQ
+ MOVBU.P 1(R1), R4
+ MOVBU.P 1(R2), R5
+ CMP R4, R5
+ BEQ _next
+
+ MOVW $0, R0
+ RET
diff --git a/src/pkg/runtime/hashmap.c b/src/pkg/runtime/hashmap.c
index 3f26a15..d639be3 100644
--- a/src/pkg/runtime/hashmap.c
+++ b/src/pkg/runtime/hashmap.c
@@ -562,7 +562,7 @@
#define HASH_LOOKUP2 runtime·mapaccess2_faststr
#define KEYTYPE String
#define HASHFUNC runtime·algarray[ASTRING].hash
-#define EQFUNC(x,y) ((x).len == (y).len && ((x).str == (y).str || runtime·mcmp((x).str, (y).str, (x).len) == 0))
+#define EQFUNC(x,y) ((x).len == (y).len && ((x).str == (y).str || runtime·memeq((x).str, (y).str, (x).len)))
#define QUICKEQ(x) ((x).len < 32)
#include "hashmap_fast.c"
diff --git a/src/pkg/runtime/mapspeed_test.go b/src/pkg/runtime/mapspeed_test.go
index 4d77347..73be434 100644
--- a/src/pkg/runtime/mapspeed_test.go
+++ b/src/pkg/runtime/mapspeed_test.go
@@ -118,6 +118,17 @@
}
}
+func BenchmarkMegEqMap(b *testing.B) {
+ m := make(map[string]bool)
+ key1 := strings.Repeat("X", 1<<20)
+ key2 := strings.Repeat("X", 1<<20) // equal but different instance
+ m[key1] = true
+ b.ResetTimer()
+ for i := 0; i < b.N; i++ {
+ _, _ = m[key2]
+ }
+}
+
func BenchmarkMegEmptyMap(b *testing.B) {
m := make(map[string]bool)
key := strings.Repeat("X", 1<<20)
diff --git a/src/pkg/runtime/runtime.h b/src/pkg/runtime/runtime.h
index 11e713a..864b2aa 100644
--- a/src/pkg/runtime/runtime.h
+++ b/src/pkg/runtime/runtime.h
@@ -593,6 +593,8 @@
void runtime·interequal(bool*, uintptr, void*, void*);
void runtime·nilinterequal(bool*, uintptr, void*, void*);
+bool runtime·memeq(void*, void*, uintptr);
+
void runtime·memprint(uintptr, void*);
void runtime·strprint(uintptr, void*);
void runtime·interprint(uintptr, void*);
diff --git a/src/pkg/runtime/string.goc b/src/pkg/runtime/string.goc
index c0d3f2b..49bf114 100644
--- a/src/pkg/runtime/string.goc
+++ b/src/pkg/runtime/string.goc
@@ -206,8 +206,6 @@
}
func eqstring(s1 String, s2 String) (v bool) {
- uintgo i, l;
-
if(s1.len != s2.len) {
v = false;
return;
@@ -216,13 +214,7 @@
v = true;
return;
}
- l = s1.len;
- for(i=0; i<l; i++)
- if(s1.str[i] != s2.str[i]) {
- v = false;
- return;
- }
- v = true;
+ v = runtime·memeq(s1.str, s2.str, s1.len);
}
int32
diff --git a/src/pkg/runtime/string_test.go b/src/pkg/runtime/string_test.go
index 6ba3c1d..df3ff06 100644
--- a/src/pkg/runtime/string_test.go
+++ b/src/pkg/runtime/string_test.go
@@ -47,3 +47,31 @@
}
}
}
+
+func BenchmarkCompareStringBigUnaligned(b *testing.B) {
+ bytes := make([]byte, 0, 1<<20)
+ for len(bytes) < 1<<20 {
+ bytes = append(bytes, "Hello Gophers!"...)
+ }
+ s1, s2 := string(bytes), "hello"+string(bytes)
+ for i := 0; i < b.N; i++ {
+ if s1 != s2[len("hello"):] {
+ b.Fatal("s1 != s2")
+ }
+ }
+ b.SetBytes(int64(len(s1)))
+}
+
+func BenchmarkCompareStringBig(b *testing.B) {
+ bytes := make([]byte, 0, 1<<20)
+ for len(bytes) < 1<<20 {
+ bytes = append(bytes, "Hello Gophers!"...)
+ }
+ s1, s2 := string(bytes), string(bytes)
+ for i := 0; i < b.N; i++ {
+ if s1 != s2 {
+ b.Fatal("s1 != s2")
+ }
+ }
+ b.SetBytes(int64(len(s1)))
+}