runtime: Implement faster equals for strings and bytes.
(amd64)
benchmark           old ns/op    new ns/op    delta
BenchmarkEqual0            16            6  -63.15%
BenchmarkEqual9            22            7  -65.37%
BenchmarkEqual32           36            9  -74.91%
BenchmarkEqual4K         2187          120  -94.51%

benchmark            old MB/s     new MB/s  speedup
BenchmarkEqual9        392.22      1134.38    2.89x
BenchmarkEqual32       866.72      3457.39    3.99x
BenchmarkEqual4K      1872.73     33998.87   18.15x
(386)
benchmark           old ns/op    new ns/op    delta
BenchmarkEqual0            16            5  -63.85%
BenchmarkEqual9            22            7  -67.84%
BenchmarkEqual32           34           12  -64.94%
BenchmarkEqual4K         2196          113  -94.85%

benchmark            old MB/s     new MB/s  speedup
BenchmarkEqual9        405.81      1260.18    3.11x
BenchmarkEqual32       919.55      2631.21    2.86x
BenchmarkEqual4K      1864.85     36072.54   19.34x
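
The numbers above are benchcmp output for benchmarks of roughly this
shape (a minimal sketch, not the exact bytes_test.go code; benchEqual
is a hypothetical helper):

	package bench

	import (
		"bytes"
		"testing"
	)

	// benchEqual measures bytes.Equal on two equal size-byte slices.
	func benchEqual(b *testing.B, size int) {
		x := make([]byte, size)
		y := make([]byte, size)
		b.SetBytes(int64(size)) // produces the MB/s column
		for i := 0; i < b.N; i++ {
			if !bytes.Equal(x, y) {
				b.Fatal("not equal")
			}
		}
	}

	func BenchmarkEqual0(b *testing.B)  { benchEqual(b, 0) }
	func BenchmarkEqual9(b *testing.B)  { benchEqual(b, 9) }
	func BenchmarkEqual32(b *testing.B) { benchEqual(b, 32) }
	func BenchmarkEqual4K(b *testing.B) { benchEqual(b, 4096) }
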
Update #3751
R=bradfitz, r, khr, dave, remyoudompheng, fullung, minux.ma, ality
CC=golang-dev
https://golang.org/cl/8056043
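
For reading the assembly below: runtime·memeqbody computes what this
plain-Go sketch does (memeqRef is illustrative only, not part of the
CL), just split into three regimes: 64 bytes per step via SSE2, 4
bytes per step via 32-bit loads, and a shifted partial-word load for
counts of 1-3 bytes.

	// memeqRef reports whether the first count bytes of a and b match.
	func memeqRef(a, b []byte, count int) bool {
		for i := 0; i < count; i++ {
			if a[i] != b[i] {
				return false
			}
		}
		return true
	}
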
diff --git a/src/pkg/runtime/asm_386.s b/src/pkg/runtime/asm_386.s
index 57de87b..531057f 100644
--- a/src/pkg/runtime/asm_386.s
+++ b/src/pkg/runtime/asm_386.s
@@ -986,3 +986,118 @@
LONG $0x0c0b0a09
LONG $0xff0f0e0d
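+// memeq: report whether the count bytes at a and b are equal.
+// memeqbody leaves the result in AX; the runtime's internal (C)
+// callers read it from there.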
+TEXT runtime·memeq(SB),7,$0
+ MOVL a+0(FP), SI
+ MOVL b+4(FP), DI
+ MOVL count+8(FP), BX
+ JMP runtime·memeqbody(SB)
+
+
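+// Equal(a, b []byte) bool
+// 386 slice headers: a = base+0/len+4/cap+8(FP), b = base+12/len+16/cap+20(FP);
+// the result byte goes to ret+24(FP).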
+TEXT bytes·Equal(SB),7,$0
+ MOVL a_len+4(FP), BX
+ MOVL b_len+16(FP), CX
+ XORL AX, AX
+ CMPL BX, CX
+ JNE eqret
+ MOVL a+0(FP), SI
+ MOVL b+12(FP), DI
+ CALL runtime·memeqbody(SB)
+eqret:
+ MOVB AX, ret+24(FP)
+ RET
+
+// a in SI
+// b in DI
+// count in BX
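+// result in AX: 1 if equal, 0 if not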
+TEXT runtime·memeqbody(SB),7,$0
+ XORL AX, AX
+
+ CMPL BX, $4
+ JB small
+
+ // 64 bytes at a time using xmm registers
+hugeloop:
+ CMPL BX, $64
+ JB bigloop
+ TESTL $0x4000000, runtime·cpuid_edx(SB) // check for sse2
+ JE bigloop
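+ // runtime·cpuid_edx is saved at startup; bit 26 ($0x4000000) is the
+ // CPUID SSE2 feature bit. Without SSE2, fall back to the word loop.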
+ MOVOU (SI), X0
+ MOVOU (DI), X1
+ MOVOU 16(SI), X2
+ MOVOU 16(DI), X3
+ MOVOU 32(SI), X4
+ MOVOU 32(DI), X5
+ MOVOU 48(SI), X6
+ MOVOU 48(DI), X7
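+ // Compare the four 16-byte pairs, AND the results together, then
+ // collapse to one bit per byte: DX == 0xffff means all 64 matched.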
+ PCMPEQB X1, X0
+ PCMPEQB X3, X2
+ PCMPEQB X5, X4
+ PCMPEQB X7, X6
+ PAND X2, X0
+ PAND X6, X4
+ PAND X4, X0
+ PMOVMSKB X0, DX
+ ADDL $64, SI
+ ADDL $64, DI
+ SUBL $64, BX
+ CMPL DX, $0xffff
+ JEQ hugeloop
+ RET
+
+ // 4 bytes at a time using 32-bit loads
+bigloop:
+ CMPL BX, $4
+ JBE leftover
+ MOVL (SI), CX
+ MOVL (DI), DX
+ ADDL $4, SI
+ ADDL $4, DI
+ SUBL $4, BX
+ CMPL CX, DX
+ JEQ bigloop
+ RET
+
+ // remaining 1-4 bytes
+leftover:
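+ // One overlapping 4-byte load ending at the last byte; it may re-read
+ // up to 3 bytes already compared, which is safe because the original
+ // count was at least 4.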
+ MOVL -4(SI)(BX*1), CX
+ MOVL -4(DI)(BX*1), DX
+ CMPL CX, DX
+ SETEQ AX
+ RET
+
+small:
+ CMPL BX, $0
+ JEQ equal
+
+ LEAL 0(BX*8), CX
+ NEGL CX
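+ // CX = -8*count. SHRL/SHLL use only the low 5 bits of CL, so a
+ // shift by CX is a shift by 32-8*count.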
+
+ MOVL SI, DX
+ CMPB DX, $0xfc
+ JA si_high
+
+ // load at SI won't cross a page boundary.
+ MOVL (SI), SI
+ JMP si_finish
+si_high:
+ // address ends in 0xfd, 0xfe, or 0xff, so a 4-byte load at SI could
+ // cross a page boundary. Load a word ending at the last wanted byte
+ // instead, then shift the wanted bytes down into position.
+ MOVL -4(SI)(BX*1), SI
+ SHRL CX, SI
+si_finish:
+
+ // same for DI.
+ MOVL DI, DX
+ CMPB DX, $0xfc
+ JA di_high
+ MOVL (DI), DI
+ JMP di_finish
+di_high:
+ MOVL -4(DI)(BX*1), DI
+ SHRL CX, DI
+di_finish:
+
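+ // The low 8*count bits of DI-SI are zero iff the loaded bytes match;
+ // shifting left by 32-8*count throws away the garbage high bytes and
+ // sets ZF from what remains.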
+ SUBL SI, DI
+ SHLL CX, DI
+equal:
+ SETEQ AX
+ RET