runtime/bytes: fast Compare for byte arrays and strings.

Uses SSE instructions to process 16 bytes at a time.

fixes #5354

R=bradfitz, google
CC=golang-dev
https://golang.org/cl/8853048
diff --git a/src/pkg/runtime/asm_386.s b/src/pkg/runtime/asm_386.s
index 531057f..2a854a8 100644
--- a/src/pkg/runtime/asm_386.s
+++ b/src/pkg/runtime/asm_386.s
@@ -1101,3 +1101,138 @@
 equal:
 	SETEQ	AX
 	RET
+
+TEXT runtime·cmpstring(SB),7,$0
+	MOVL	s1+0(FP), SI
+	MOVL	s1+4(FP), BX
+	MOVL	s2+8(FP), DI
+	MOVL	s2+12(FP), DX
+	CALL	runtime·cmpbody(SB)
+	MOVL	AX, res+16(FP)
+	RET
+
+TEXT bytes·Compare(SB),7,$0
+	MOVL	s1+0(FP), SI
+	MOVL	s1+4(FP), BX
+	MOVL	s2+12(FP), DI
+	MOVL	s2+16(FP), DX
+	CALL	runtime·cmpbody(SB)
+	MOVL	AX, res+24(FP)
+	RET
+
+// input:
+//   SI = a
+//   DI = b
+//   BX = alen
+//   DX = blen
+// output:
+//   AX = 1/0/-1
+TEXT runtime·cmpbody(SB),7,$0
+	CMPL	SI, DI
+	JEQ	cmp_allsame
+	CMPL	BX, DX
+	MOVL	DX, BP
+	CMOVLLT	BX, BP // BP = min(alen, blen)
+	CMPL	BP, $4
+	JB	cmp_small
+	TESTL	$0x4000000, runtime·cpuid_edx(SB) // check for sse2
+	JE	cmp_mediumloop
+cmp_largeloop:
+	CMPL	BP, $16
+	JB	cmp_mediumloop
+	MOVOU	(SI), X0
+	MOVOU	(DI), X1
+	PCMPEQB X0, X1
+	PMOVMSKB X1, AX
+	XORL	$0xffff, AX	// convert EQ to NE
+	JNE	cmp_diff16	// branch if at least one byte is not equal
+	ADDL	$16, SI
+	ADDL	$16, DI
+	SUBL	$16, BP
+	JMP	cmp_largeloop
+
+cmp_diff16:
+	BSFL	AX, BX	// index of first byte that differs
+	XORL	AX, AX
+	MOVB	(SI)(BX*1), CX
+	CMPB	CX, (DI)(BX*1)
+	SETHI	AX
+	LEAL	-1(AX*2), AX	// convert 1/0 to +1/-1
+	RET
+
+cmp_mediumloop:
+	CMPL	BP, $4
+	JBE	cmp_0through4
+	MOVL	(SI), AX
+	MOVL	(DI), CX
+	CMPL	AX, CX
+	JNE	cmp_diff4
+	ADDL	$4, SI
+	ADDL	$4, DI
+	SUBL	$4, BP
+	JMP	cmp_mediumloop
+
+cmp_0through4:
+	MOVL	-4(SI)(BP*1), AX
+	MOVL	-4(DI)(BP*1), CX
+	CMPL	AX, CX
+	JEQ	cmp_allsame
+
+cmp_diff4:
+	BSWAPL	AX	// reverse order of bytes
+	BSWAPL	CX
+	XORL	AX, CX	// find bit differences
+	BSRL	CX, CX	// index of highest bit difference
+	SHRL	CX, AX	// move a's bit to bottom
+	ANDL	$1, AX	// mask bit
+	LEAL	-1(AX*2), AX // 1/0 => +1/-1
+	RET
+
+	// 0-3 bytes in common
+cmp_small:
+	LEAL	(BP*8), CX
+	NEGL	CX
+	JEQ	cmp_allsame
+
+	// load si
+	CMPB	SI, $0xfc
+	JA	cmp_si_high
+	MOVL	(SI), SI
+	JMP	cmp_si_finish
+cmp_si_high:
+	MOVL	-4(SI)(BP*1), SI
+	SHRL	CX, SI
+cmp_si_finish:
+	SHLL	CX, SI
+
+	// same for di
+	CMPB	DI, $0xfc
+	JA	cmp_di_high
+	MOVL	(DI), DI
+	JMP	cmp_di_finish
+cmp_di_high:
+	MOVL	-4(DI)(BP*1), DI
+	SHRL	CX, DI
+cmp_di_finish:
+	SHLL	CX, DI
+
+	BSWAPL	SI	// reverse order of bytes
+	BSWAPL	DI
+	XORL	SI, DI	// find bit differences
+	JEQ	cmp_allsame
+	BSRL	DI, CX	// index of highest bit difference
+	SHRL	CX, SI	// move a's bit to bottom
+	ANDL	$1, SI	// mask bit
+	LEAL	-1(SI*2), AX // 1/0 => +1/-1
+	RET
+
+	// all the bytes in common are the same, so we just need
+	// to compare the lengths.
+cmp_allsame:
+	XORL	AX, AX
+	XORL	CX, CX
+	CMPL	BX, DX
+	SETGT	AX	// 1 if alen > blen
+	SETEQ	CX	// 1 if alen == blen
+	LEAL	-1(CX)(AX*2), AX	// 1,0,-1 result
+	RET