// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "go_asm.h"
#include "textflag.h"

TEXT ·Compare(SB),NOSPLIT,$0-56
	MOVQ	a_base+0(FP), SI
	MOVQ	a_len+8(FP), BX
	MOVQ	b_base+24(FP), DI
	MOVQ	b_len+32(FP), DX
	LEAQ	ret+48(FP), R9
	JMP	cmpbody<>(SB)
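// Note: a slice argument is a (base, len, cap) triple, so b's base lives
// at +24(FP) above. The string version below has no cap word, so there
// b's base is at +16(FP).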

TEXT runtime·cmpstring(SB),NOSPLIT,$0-40
	MOVQ	a_base+0(FP), SI
	MOVQ	a_len+8(FP), BX
	MOVQ	b_base+16(FP), DI
	MOVQ	b_len+24(FP), DX
	LEAQ	ret+32(FP), R9
	JMP	cmpbody<>(SB)

// input:
//   SI = a
//   DI = b
//   BX = alen
//   DX = blen
//   R9 = address of output word (stores -1/0/1 here)
TEXT cmpbody<>(SB),NOSPLIT,$0-0
	CMPQ	SI, DI
	JEQ	allsame
	CMPQ	BX, DX
	MOVQ	DX, R8
	CMOVQLT	BX, R8 // R8 = min(alen, blen) = # of bytes to compare
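	// (MOVQ does not modify flags, so the CMOVQLT above still consumes
	// the result of CMPQ BX, DX.)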
	CMPQ	R8, $8
	JB	small

	CMPQ	R8, $63
	JBE	loop
	CMPB	internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
	JEQ	big_loop_avx2
	JMP	big_loop
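
// The dispatch above, for n = min(alen, blen), as a sketch (illustrative
// only):
//	n < 8:   "small" (register-width loads)
//	n <= 63: "loop" (16-byte SSE chunks)
//	n >= 64: "big_loop_avx2" if AVX2 is available, else "big_loop"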
loop:
	CMPQ	R8, $16
	JBE	_0through16
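	// PCMPEQB sets each byte of X1 to 0xFF where it equals the
	// corresponding byte of X0 and to 0x00 where it differs; PMOVMSKB
	// packs the top bit of each byte into the low 16 bits of AX. After
	// XORing with 0xffff, AX is zero iff all 16 bytes matched.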
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	PCMPEQB X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX	// convert EQ to NE
	JNE	diff16	// branch if at least one byte is not equal
	ADDQ	$16, SI
	ADDQ	$16, DI
	SUBQ	$16, R8
	JMP	loop

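// diff64/diff48/diff32 are entered from the 64-byte loops below. Each
// advances SI/DI to the 16-byte chunk whose mismatch mask is already in
// AX, so diff16 can locate the differing byte with BSFQ.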
diff64:
	ADDQ	$48, SI
	ADDQ	$48, DI
	JMP	diff16
diff48:
	ADDQ	$32, SI
	ADDQ	$32, DI
	JMP	diff16
diff32:
	ADDQ	$16, SI
	ADDQ	$16, DI
	// AX = bit mask of differences
diff16:
	BSFQ	AX, BX	// index of first byte that differs
	XORQ	AX, AX
	MOVB	(SI)(BX*1), CX
	CMPB	CX, (DI)(BX*1)
	SETHI	AX
	LEAQ	-1(AX*2), AX	// convert 1/0 to +1/-1
	MOVQ	AX, (R9)
	RET
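	// diff16 as a Go sketch (illustrative only; a and b are the windows
	// at SI/DI and mask is the XORed PMOVMSKB result):
	//	i := bits.TrailingZeros16(mask)
	//	if a[i] > b[i] {
	//		return 1
	//	}
	//	return -1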

// 0 through 16 bytes left, alen>=8, blen>=8
_0through16:
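	// Compare the first 8 bytes if more than 8 remain, then the 8 bytes
	// ending at offset R8. The two loads may overlap; together they
	// cover all R8 bytes. The alen>=8/blen>=8 precondition keeps the
	// trailing load inside the buffers even when R8 has dropped below 8.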
	CMPQ	R8, $8
	JBE	_0through8
	MOVQ	(SI), AX
	MOVQ	(DI), CX
	CMPQ	AX, CX
	JNE	diff8
_0through8:
	MOVQ	-8(SI)(R8*1), AX
	MOVQ	-8(DI)(R8*1), CX
	CMPQ	AX, CX
	JEQ	allsame

	// AX and CX contain parts of a and b that differ.
diff8:
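	// Why this works: BSWAPQ makes memory order match bit significance,
	// so the highest differing bit (BSRQ of the XOR) lies in the first
	// byte that differs. At that bit one value has a 1 and the other a
	// 0, and the value with the 1 is the larger.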
	BSWAPQ	AX	// reverse order of bytes
	BSWAPQ	CX
	XORQ	AX, CX
	BSRQ	CX, CX	// index of highest bit difference
	SHRQ	CX, AX	// move a's bit to bottom
	ANDQ	$1, AX	// mask bit
	LEAQ	-1(AX*2), AX // 1/0 => +1/-1
	MOVQ	AX, (R9)
	RET

// 0-7 bytes in common
small:
	LEAQ	(R8*8), CX	// bytes left -> bits left
	NEGQ	CX		// - bits left (== 64 - bits left mod 64)
	JEQ	allsame

	// load bytes of a into high bytes of SI
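	// If the low byte of SI is above 0xf8, an 8-byte load at SI could
	// cross into the next, possibly unmapped, page. In that case load
	// the 8 bytes ending at SI+R8 instead, which stays on SI's page,
	// and shift the stray leading bytes away.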
	CMPB	SI, $0xf8
	JA	si_high
	MOVQ	(SI), SI
	JMP	si_finish
si_high:
	MOVQ	-8(SI)(R8*1), SI
	SHRQ	CX, SI
si_finish:
	SHLQ	CX, SI

	// load bytes of b into high bytes of DI (same page trick as above)
	CMPB	DI, $0xf8
	JA	di_high
	MOVQ	(DI), DI
	JMP	di_finish
di_high:
	MOVQ	-8(DI)(R8*1), DI
	SHRQ	CX, DI
di_finish:
	SHLQ	CX, DI

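	// From here on this mirrors diff8 above, with the operands held in
	// SI/DI instead of AX/CX, plus an equality check in case all
	// min(alen, blen) bytes match.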
	BSWAPQ	SI	// reverse order of bytes
	BSWAPQ	DI
	XORQ	SI, DI	// find bit differences
	JEQ	allsame
	BSRQ	DI, CX	// index of highest bit difference
	SHRQ	CX, SI	// move a's bit to bottom
	ANDQ	$1, SI	// mask bit
	LEAQ	-1(SI*2), AX // 1/0 => +1/-1
	MOVQ	AX, (R9)
	RET

allsame:
	XORQ	AX, AX
	XORQ	CX, CX
	CMPQ	BX, DX
	SETGT	AX	// 1 if alen > blen
	SETEQ	CX	// 1 if alen == blen
	LEAQ	-1(CX)(AX*2), AX	// 1,0,-1 result
	MOVQ	AX, (R9)
	RET
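	// allsame as a Go sketch (illustrative only):
	//	switch {
	//	case alen > blen:
	//		return 1
	//	case alen == blen:
	//		return 0
	//	default:
	//		return -1
	//	}
	// The LEAQ computes 2*(alen>blen) + (alen==blen) - 1 branch-free.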

// This works for >= 64 bytes of data.
big_loop:
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	PCMPEQB X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff16

	MOVOU	16(SI), X0
	MOVOU	16(DI), X1
	PCMPEQB X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff32

	MOVOU	32(SI), X0
	MOVOU	32(DI), X1
	PCMPEQB X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff48

	MOVOU	48(SI), X0
	MOVOU	48(DI), X1
	PCMPEQB X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff64

	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, R8
	CMPQ	R8, $64
	JBE	loop
	JMP	big_loop

// Compare 64 bytes per loop iteration.
// Loop is unrolled and uses AVX2.
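// VPMOVMSKB extracts one bit per byte of a YMM register, producing a
// 32-bit mask, hence the $0xffffffff complement below.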
big_loop_avx2:
	VMOVDQU	(SI), Y2
	VMOVDQU	(DI), Y3
	VMOVDQU	32(SI), Y4
	VMOVDQU	32(DI), Y5
	VPCMPEQB Y2, Y3, Y0
	VPMOVMSKB Y0, AX
	XORL	$0xffffffff, AX
	JNE	diff32_avx2
	VPCMPEQB Y4, Y5, Y6
	VPMOVMSKB Y6, AX
	XORL	$0xffffffff, AX
	JNE	diff64_avx2

	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, R8
	CMPQ	R8, $64
	JB	big_loop_avx2_exit
	JMP	big_loop_avx2

// Avoid the AVX->SSE transition penalty and search the first 32 bytes of the
// 64-byte chunk.
diff32_avx2:
	VZEROUPPER
	JMP diff16

// Same as diff32_avx2, but for the last 32 bytes.
diff64_avx2:
	VZEROUPPER
	JMP diff48

// For a <64-byte remainder, jump to the normal SSE loop.
big_loop_avx2_exit:
	VZEROUPPER
	JMP loop