// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "go_asm.h"
#include "textflag.h"

TEXT ·Compare(SB),NOSPLIT,$0-56
	MOVQ	a_base+0(FP), SI
	MOVQ	a_len+8(FP), BX
	MOVQ	b_base+24(FP), DI
	MOVQ	b_len+32(FP), DX
	LEAQ	ret+48(FP), R9
	JMP	cmpbody<>(SB)
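// Note: a slice argument is a (base, len, cap) triple, so b's base lives
// at +24(FP) above. The string version below has no cap word, so there
// b's base is at +16(FP).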

TEXT runtime·cmpstring(SB),NOSPLIT,$0-40
	MOVQ	a_base+0(FP), SI
	MOVQ	a_len+8(FP), BX
	MOVQ	b_base+16(FP), DI
	MOVQ	b_len+24(FP), DX
	LEAQ	ret+32(FP), R9
	JMP	cmpbody<>(SB)

// input:
//   SI = a
//   DI = b
//   BX = alen
//   DX = blen
//   R9 = address of output word (stores -1/0/1 here)
TEXT cmpbody<>(SB),NOSPLIT,$0-0
	CMPQ	SI, DI
	JEQ	allsame
	CMPQ	BX, DX
	MOVQ	DX, R8
	CMOVQLT	BX, R8 // R8 = min(alen, blen) = # of bytes to compare
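	// (MOVQ does not modify flags, so the CMOVQLT above still consumes
	// the result of CMPQ BX, DX.)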
	CMPQ	R8, $8
	JB	small

	CMPQ	R8, $63
	JBE	loop
	CMPB	internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
	JEQ	big_loop_avx2
	JMP	big_loop
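
// The dispatch above, for n = min(alen, blen), as a sketch (illustrative
// only):
//	n < 8:   "small" (register-width loads)
//	n <= 63: "loop" (16-byte SSE chunks)
//	n >= 64: "big_loop_avx2" if AVX2 is available, else "big_loop"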
loop:
	CMPQ	R8, $16
	JBE	_0through16
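	// PCMPEQB sets each byte of X1 to 0xFF where it equals the
	// corresponding byte of X0 and to 0x00 where it differs; PMOVMSKB
	// packs the top bit of each byte into the low 16 bits of AX. After
	// XORing with 0xffff, AX is zero iff all 16 bytes matched.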
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	PCMPEQB X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX	// convert EQ to NE
	JNE	diff16	// branch if at least one byte is not equal
	ADDQ	$16, SI
	ADDQ	$16, DI
	SUBQ	$16, R8
	JMP	loop

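// diff64/diff48/diff32 are entered from the 64-byte loops below. Each
// advances SI/DI to the 16-byte chunk whose mismatch mask is already in
// AX, so diff16 can locate the differing byte with BSFQ.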
diff64:
	ADDQ	$48, SI
	ADDQ	$48, DI
	JMP	diff16
diff48:
	ADDQ	$32, SI
	ADDQ	$32, DI
	JMP	diff16
diff32:
	ADDQ	$16, SI
	ADDQ	$16, DI
	// AX = bit mask of differences
diff16:
	BSFQ	AX, BX	// index of first byte that differs
	XORQ	AX, AX
	MOVB	(SI)(BX*1), CX
	CMPB	CX, (DI)(BX*1)
	SETHI	AX
	LEAQ	-1(AX*2), AX	// convert 1/0 to +1/-1
	MOVQ	AX, (R9)
	RET
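	// diff16 as a Go sketch (illustrative only; a and b are the windows
	// at SI/DI and mask is the XORed PMOVMSKB result):
	//	i := bits.TrailingZeros16(mask)
	//	if a[i] > b[i] {
	//		return 1
	//	}
	//	return -1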

// 0 through 16 bytes left, alen>=8, blen>=8
_0through16:
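	// Compare the first 8 bytes if more than 8 remain, then the 8 bytes
	// ending at offset R8. The two loads may overlap; together they
	// cover all R8 bytes. The alen>=8/blen>=8 precondition keeps the
	// trailing load inside the buffers even when R8 has dropped below 8.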
	CMPQ	R8, $8
	JBE	_0through8
	MOVQ	(SI), AX
	MOVQ	(DI), CX
	CMPQ	AX, CX
	JNE	diff8
_0through8:
	MOVQ	-8(SI)(R8*1), AX
	MOVQ	-8(DI)(R8*1), CX
	CMPQ	AX, CX
	JEQ	allsame

	// AX and CX contain parts of a and b that differ.
diff8:
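	// Why this works: BSWAPQ makes memory order match bit significance,
	// so the highest differing bit (BSRQ of the XOR) lies in the first
	// byte that differs. At that bit one value has a 1 and the other a
	// 0, and the value with the 1 is the larger.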
	BSWAPQ	AX	// reverse order of bytes
	BSWAPQ	CX
	XORQ	AX, CX
	BSRQ	CX, CX	// index of highest bit difference
	SHRQ	CX, AX	// move a's bit to bottom
	ANDQ	$1, AX	// mask bit
	LEAQ	-1(AX*2), AX // 1/0 => +1/-1
	MOVQ	AX, (R9)
	RET

// 0-7 bytes in common
small:
	LEAQ	(R8*8), CX	// bytes left -> bits left
	NEGQ	CX		// - bits left (== 64 - bits left mod 64)
	JEQ	allsame

	// load bytes of a into high bytes of SI
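	// If the low byte of SI is above 0xf8, an 8-byte load at SI could
	// cross into the next, possibly unmapped, page. In that case load
	// the 8 bytes ending at SI+R8 instead, which stays on SI's page,
	// and shift the stray leading bytes away.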
	CMPB	SI, $0xf8
	JA	si_high
	MOVQ	(SI), SI
	JMP	si_finish
si_high:
	MOVQ	-8(SI)(R8*1), SI
	SHRQ	CX, SI
si_finish:
	SHLQ	CX, SI

	// load bytes of b into high bytes of DI (same page trick as above)
	CMPB	DI, $0xf8
	JA	di_high
	MOVQ	(DI), DI
	JMP	di_finish
di_high:
	MOVQ	-8(DI)(R8*1), DI
	SHRQ	CX, DI
di_finish:
	SHLQ	CX, DI

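	// From here on this mirrors diff8 above, with the operands held in
	// SI/DI instead of AX/CX, plus an equality check in case all
	// min(alen, blen) bytes match.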
	BSWAPQ	SI	// reverse order of bytes
	BSWAPQ	DI
	XORQ	SI, DI	// find bit differences
	JEQ	allsame
	BSRQ	DI, CX	// index of highest bit difference
	SHRQ	CX, SI	// move a's bit to bottom
	ANDQ	$1, SI	// mask bit
	LEAQ	-1(SI*2), AX // 1/0 => +1/-1
	MOVQ	AX, (R9)
	RET

allsame:
	XORQ	AX, AX
	XORQ	CX, CX
	CMPQ	BX, DX
	SETGT	AX	// 1 if alen > blen
	SETEQ	CX	// 1 if alen == blen
	LEAQ	-1(CX)(AX*2), AX	// 1,0,-1 result
	MOVQ	AX, (R9)
	RET
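	// allsame as a Go sketch (illustrative only):
	//	switch {
	//	case alen > blen:
	//		return 1
	//	case alen == blen:
	//		return 0
	//	default:
	//		return -1
	//	}
	// The LEAQ computes 2*(alen>blen) + (alen==blen) - 1 branch-free.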

// This works for >= 64 bytes of data.
big_loop:
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	PCMPEQB X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff16

	MOVOU	16(SI), X0
	MOVOU	16(DI), X1
	PCMPEQB X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff32

	MOVOU	32(SI), X0
	MOVOU	32(DI), X1
	PCMPEQB X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff48

	MOVOU	48(SI), X0
	MOVOU	48(DI), X1
	PCMPEQB X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff64

	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, R8
	CMPQ	R8, $64
	JBE	loop
	JMP	big_loop

// Compare 64 bytes per loop iteration.
// Loop is unrolled and uses AVX2.
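// VPMOVMSKB extracts one bit per byte of a YMM register, producing a
// 32-bit mask, hence the $0xffffffff complement below.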
big_loop_avx2:
	VMOVDQU	(SI), Y2
	VMOVDQU	(DI), Y3
	VMOVDQU	32(SI), Y4
	VMOVDQU	32(DI), Y5
	VPCMPEQB Y2, Y3, Y0
	VPMOVMSKB Y0, AX
	XORL	$0xffffffff, AX
	JNE	diff32_avx2
	VPCMPEQB Y4, Y5, Y6
	VPMOVMSKB Y6, AX
	XORL	$0xffffffff, AX
	JNE	diff64_avx2

	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, R8
	CMPQ	R8, $64
	JB	big_loop_avx2_exit
	JMP	big_loop_avx2

// Avoid the AVX->SSE transition penalty and search the first 32 bytes of the
// 64-byte chunk.
diff32_avx2:
	VZEROUPPER
	JMP diff16

// Same as diff32_avx2, but for the last 32 bytes.
diff64_avx2:
	VZEROUPPER
	JMP diff48

// For a <64-byte remainder, jump to the normal SSE loop.
big_loop_avx2_exit:
	VZEROUPPER
	JMP loop