| // Copyright 2018 The Go Authors. All rights reserved. | 
 | // Use of this source code is governed by a BSD-style | 
 | // license that can be found in the LICENSE file. | 
 |  | 
 | #include "go_asm.h" | 
 | #include "textflag.h" | 
 |  | 
 | TEXT ·Compare<ABIInternal>(SB),NOSPLIT,$0-56 | 
 | #ifdef GOEXPERIMENT_regabiargs | 
 | 	// AX = a_base (want in SI) | 
 | 	// BX = a_len  (want in BX) | 
 | 	// CX = a_cap  (unused) | 
 | 	// DI = b_base (want in DI) | 
 | 	// SI = b_len  (want in DX) | 
 | 	// R8 = b_cap  (unused) | 
 | 	MOVQ	SI, DX | 
 | 	MOVQ	AX, SI | 
 | #else | 
 | 	MOVQ	a_base+0(FP), SI | 
 | 	MOVQ	a_len+8(FP), BX | 
 | 	MOVQ	b_base+24(FP), DI | 
 | 	MOVQ	b_len+32(FP), DX | 
 | 	LEAQ	ret+48(FP), R9 | 
 | #endif | 
 | 	JMP	cmpbody<>(SB) | 
 |  | 
 | TEXT runtime·cmpstring<ABIInternal>(SB),NOSPLIT,$0-40 | 
 | #ifdef GOEXPERIMENT_regabiargs | 
 | 	// AX = a_base (want in SI) | 
 | 	// BX = a_len  (want in BX) | 
 | 	// CX = b_base (want in DI) | 
 | 	// DI = b_len  (want in DX) | 
 | 	MOVQ	AX, SI | 
 | 	MOVQ	DI, DX | 
 | 	MOVQ	CX, DI | 
 | #else | 
 | 	MOVQ	a_base+0(FP), SI | 
 | 	MOVQ	a_len+8(FP), BX | 
 | 	MOVQ	b_base+16(FP), DI | 
 | 	MOVQ	b_len+24(FP), DX | 
 | 	LEAQ	ret+32(FP), R9 | 
 | #endif | 
 | 	JMP	cmpbody<>(SB) | 
 |  | 
 | // input: | 
 | //   SI = a | 
 | //   DI = b | 
 | //   BX = alen | 
 | //   DX = blen | 
 | #ifndef GOEXPERIMENT_regabiargs | 
 | //   R9 = address of output word (stores -1/0/1 here) | 
 | #else | 
 | // output: | 
 | //   AX = output (-1/0/1) | 
 | #endif | 
 | TEXT cmpbody<>(SB),NOSPLIT,$0-0 | 
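	// Equal pointers: the bytes in the common prefix are identical, so only the lengths matter.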
 | 	CMPQ	SI, DI | 
 | 	JEQ	allsame | 
 | 	CMPQ	BX, DX | 
 | 	MOVQ	DX, R8 | 
 | 	CMOVQLT	BX, R8 // R8 = min(alen, blen) = # of bytes to compare | 
 | 	CMPQ	R8, $8 | 
 | 	JB	small | 
 |  | 
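	// At least 8 bytes to compare: pick a compare loop based on length and CPU features.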
 | 	CMPQ	R8, $63 | 
 | 	JBE	loop | 
 | 	CMPB	internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1 | 
	JEQ	big_loop_avx2
 | 	JMP	big_loop | 
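	// Compare 16 bytes per iteration with SSE until at most 16 bytes remain, then finish in _0through16.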
 | loop: | 
 | 	CMPQ	R8, $16 | 
 | 	JBE	_0through16 | 
 | 	MOVOU	(SI), X0 | 
 | 	MOVOU	(DI), X1 | 
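	// PCMPEQB marks equal bytes with 0xFF; PMOVMSKB packs those marks into a 16-bit mask in AX.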
 | 	PCMPEQB X0, X1 | 
 | 	PMOVMSKB X1, AX | 
 | 	XORQ	$0xffff, AX	// convert EQ to NE | 
 | 	JNE	diff16	// branch if at least one byte is not equal | 
 | 	ADDQ	$16, SI | 
 | 	ADDQ	$16, DI | 
 | 	SUBQ	$16, R8 | 
 | 	JMP	loop | 
 |  | 
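	// The diffNN entry points advance SI and DI so the chunk containing the first
	// difference starts at offset 0, then fall through to diff16, which locates the
	// exact byte from the mask in AX.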
 | diff64: | 
 | 	ADDQ	$48, SI | 
 | 	ADDQ	$48, DI | 
 | 	JMP	diff16 | 
 | diff48: | 
 | 	ADDQ	$32, SI | 
 | 	ADDQ	$32, DI | 
 | 	JMP	diff16 | 
 | diff32: | 
 | 	ADDQ	$16, SI | 
 | 	ADDQ	$16, DI | 
 | 	// AX = bit mask of differences | 
 | diff16: | 
 | 	BSFQ	AX, BX	// index of first byte that differs | 
 | 	XORQ	AX, AX | 
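	// Compare the single differing byte: SETHI sets AX to 1 if a's byte is above b's (unsigned).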
 | 	MOVB	(SI)(BX*1), CX | 
 | 	CMPB	CX, (DI)(BX*1) | 
 | 	SETHI	AX | 
 | 	LEAQ	-1(AX*2), AX	// convert 1/0 to +1/-1 | 
 | #ifndef GOEXPERIMENT_regabiargs | 
 | 	MOVQ	AX, (R9) | 
 | #endif | 
 | 	RET | 
 |  | 
 | 	// 0 through 16 bytes left, alen>=8, blen>=8 | 
 | _0through16: | 
 | 	CMPQ	R8, $8 | 
 | 	JBE	_0through8 | 
 | 	MOVQ	(SI), AX | 
 | 	MOVQ	(DI), CX | 
 | 	CMPQ	AX, CX | 
 | 	JNE	diff8 | 
 | _0through8: | 
 | 	MOVQ	-8(SI)(R8*1), AX | 
 | 	MOVQ	-8(DI)(R8*1), CX | 
 | 	CMPQ	AX, CX | 
 | 	JEQ	allsame | 
 |  | 
 | 	// AX and CX contain parts of a and b that differ. | 
 | diff8: | 
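	// The loads are little-endian, so earlier bytes sit in lower bits; BSWAPQ reverses
	// the bytes so the first differing byte is the most significant one found by BSR below.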
 | 	BSWAPQ	AX	// reverse order of bytes | 
 | 	BSWAPQ	CX | 
 | 	XORQ	AX, CX | 
 | 	BSRQ	CX, CX	// index of highest bit difference | 
 | 	SHRQ	CX, AX	// move a's bit to bottom | 
 | 	ANDQ	$1, AX	// mask bit | 
 | 	LEAQ	-1(AX*2), AX // 1/0 => +1/-1 | 
 | #ifndef GOEXPERIMENT_regabiargs | 
 | 	MOVQ	AX, (R9) | 
 | #endif | 
 | 	RET | 
 |  | 
 | 	// 0-7 bytes in common | 
 | small: | 
 | 	LEAQ	(R8*8), CX	// bytes left -> bits left | 
	NEGQ	CX		// - bits left (== 64 - bits left mod 64)
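	// ZF from NEGQ is set only when R8 == 0, i.e. there is nothing to compare.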
 | 	JEQ	allsame | 
 |  | 
	// load bytes of a into high bytes of SI
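	// If the low byte of SI is above 0xf8, an 8-byte load at SI could cross into the
	// next page; in that case load the 8 bytes ending at SI+R8 instead and let the
	// shifts below discard the extra leading bytes.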
 | 	CMPB	SI, $0xf8 | 
 | 	JA	si_high | 
 | 	MOVQ	(SI), SI | 
 | 	JMP	si_finish | 
 | si_high: | 
 | 	MOVQ	-8(SI)(R8*1), SI | 
 | 	SHRQ	CX, SI | 
 | si_finish: | 
 | 	SHLQ	CX, SI | 
 |  | 
	// load bytes of b into high bytes of DI (same page-boundary handling as for a above)
 | 	CMPB	DI, $0xf8 | 
 | 	JA	di_high | 
 | 	MOVQ	(DI), DI | 
 | 	JMP	di_finish | 
 | di_high: | 
 | 	MOVQ	-8(DI)(R8*1), DI | 
 | 	SHRQ	CX, DI | 
 | di_finish: | 
 | 	SHLQ	CX, DI | 
 |  | 
 | 	BSWAPQ	SI	// reverse order of bytes | 
 | 	BSWAPQ	DI | 
 | 	XORQ	SI, DI	// find bit differences | 
 | 	JEQ	allsame | 
 | 	BSRQ	DI, CX	// index of highest bit difference | 
 | 	SHRQ	CX, SI	// move a's bit to bottom | 
 | 	ANDQ	$1, SI	// mask bit | 
 | 	LEAQ	-1(SI*2), AX // 1/0 => +1/-1 | 
 | #ifndef GOEXPERIMENT_regabiargs | 
 | 	MOVQ	AX, (R9) | 
 | #endif | 
 | 	RET | 
 |  | 
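	// All bytes in the common prefix are equal; the result depends only on the lengths.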
 | allsame: | 
 | 	XORQ	AX, AX | 
 | 	XORQ	CX, CX | 
 | 	CMPQ	BX, DX | 
 | 	SETGT	AX	// 1 if alen > blen | 
 | 	SETEQ	CX	// 1 if alen == blen | 
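	// result = 2*(alen > blen) + (alen == blen) - 1, i.e. +1, 0, or -1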
 | 	LEAQ	-1(CX)(AX*2), AX	// 1,0,-1 result | 
 | #ifndef GOEXPERIMENT_regabiargs | 
 | 	MOVQ	AX, (R9) | 
 | #endif | 
 | 	RET | 
 |  | 
	// Compare 64 bytes per iteration using four 16-byte SSE compares; used only while at least 64 bytes remain.
 | big_loop: | 
 | 	MOVOU	(SI), X0 | 
 | 	MOVOU	(DI), X1 | 
 | 	PCMPEQB X0, X1 | 
 | 	PMOVMSKB X1, AX | 
 | 	XORQ	$0xffff, AX | 
 | 	JNE	diff16 | 
 |  | 
 | 	MOVOU	16(SI), X0 | 
 | 	MOVOU	16(DI), X1 | 
 | 	PCMPEQB X0, X1 | 
 | 	PMOVMSKB X1, AX | 
 | 	XORQ	$0xffff, AX | 
 | 	JNE	diff32 | 
 |  | 
 | 	MOVOU	32(SI), X0 | 
 | 	MOVOU	32(DI), X1 | 
 | 	PCMPEQB X0, X1 | 
 | 	PMOVMSKB X1, AX | 
 | 	XORQ	$0xffff, AX | 
 | 	JNE	diff48 | 
 |  | 
 | 	MOVOU	48(SI), X0 | 
 | 	MOVOU	48(DI), X1 | 
 | 	PCMPEQB X0, X1 | 
 | 	PMOVMSKB X1, AX | 
 | 	XORQ	$0xffff, AX | 
 | 	JNE	diff64 | 
 |  | 
 | 	ADDQ	$64, SI | 
 | 	ADDQ	$64, DI | 
 | 	SUBQ	$64, R8 | 
 | 	CMPQ	R8, $64 | 
 | 	JBE	loop | 
 | 	JMP	big_loop | 
 |  | 
	// Compare 64 bytes per loop iteration.
 | 	// Loop is unrolled and uses AVX2. | 
 | big_loop_avx2: | 
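	// Each iteration compares two 32-byte chunks per input; VPMOVMSKB yields a
	// 32-bit mask, so XORL with 0xffffffff converts "all equal" into zero.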
 | 	VMOVDQU	(SI), Y2 | 
 | 	VMOVDQU	(DI), Y3 | 
 | 	VMOVDQU	32(SI), Y4 | 
 | 	VMOVDQU	32(DI), Y5 | 
 | 	VPCMPEQB Y2, Y3, Y0 | 
 | 	VPMOVMSKB Y0, AX | 
 | 	XORL	$0xffffffff, AX | 
 | 	JNE	diff32_avx2 | 
 | 	VPCMPEQB Y4, Y5, Y6 | 
 | 	VPMOVMSKB Y6, AX | 
 | 	XORL	$0xffffffff, AX | 
 | 	JNE	diff64_avx2 | 
 |  | 
 | 	ADDQ	$64, SI | 
 | 	ADDQ	$64, DI | 
 | 	SUBQ	$64, R8 | 
 | 	CMPQ	R8, $64 | 
 | 	JB	big_loop_avx2_exit | 
 | 	JMP	big_loop_avx2 | 
 |  | 
	// Avoid the AVX->SSE transition penalty and search the first 32 bytes of the 64-byte chunk.
 | diff32_avx2: | 
 | 	VZEROUPPER | 
 | 	JMP diff16 | 
 |  | 
	// Same as diff32_avx2, but for the last 32 bytes.
 | diff64_avx2: | 
 | 	VZEROUPPER | 
 | 	JMP diff48 | 
 |  | 
	// For a remainder of <64 bytes, jump to the normal loop.
 | big_loop_avx2_exit: | 
 | 	VZEROUPPER | 
 | 	JMP loop |