| // Copyright 2018 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| #include "go_asm.h" |
| #include "textflag.h" |
| |
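// ·Compare (the package-local entry point backing bytes.Compare via
// internal/bytealg) and runtime·cmpstring (string comparison) both
// tail-jump into the shared kernel cmpbody below.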
| TEXT ·Compare(SB),NOSPLIT,$0-56 |
| MOVQ a_base+0(FP), SI |
| MOVQ a_len+8(FP), BX |
| MOVQ b_base+24(FP), DI |
| MOVQ b_len+32(FP), DX |
| LEAQ ret+48(FP), R9 |
| JMP cmpbody<>(SB) |
| |
| TEXT runtime·cmpstring(SB),NOSPLIT,$0-40 |
| MOVQ a_base+0(FP), SI |
| MOVQ a_len+8(FP), BX |
| MOVQ b_base+16(FP), DI |
| MOVQ b_len+24(FP), DX |
| LEAQ ret+32(FP), R9 |
| JMP cmpbody<>(SB) |
| |
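// In Go terms, cmpbody computes the result sketched below; this is a
// rough equivalent for readability, not the actual implementation:
//
//	func cmpbody(a, b []byte) int {
//		for i := 0; i < len(a) && i < len(b); i++ {
//			switch {
//			case a[i] < b[i]:
//				return -1
//			case a[i] > b[i]:
//				return +1
//			}
//		}
//		switch {
//		case len(a) < len(b):
//			return -1
//		case len(a) > len(b):
//			return +1
//		}
//		return 0
//	}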
| // input: |
| // SI = a |
| // DI = b |
| // BX = alen |
| // DX = blen |
| // R9 = address of output word (stores -1/0/1 here) |
| TEXT cmpbody<>(SB),NOSPLIT,$0-0 |
CMPQ SI, DI
JEQ allsame // same pointer: bytes are identical, compare lengths only
| CMPQ BX, DX |
| MOVQ DX, R8 |
| CMOVQLT BX, R8 // R8 = min(alen, blen) = # of bytes to compare |
| CMPQ R8, $8 |
| JB small |
| |
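// Dispatch on length: fewer than 8 bytes are handled with plain
// register loads (small), 8..63 bytes with the 16-byte SSE loop, and
// 64+ bytes with the unrolled 64-byte loops, preferring AVX2 when the
// CPU supports it.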
| CMPQ R8, $63 |
| JBE loop |
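// internal∕cpu·X86 is the CPU feature record filled in at startup;
// const_offsetX86HasAVX2 (generated into go_asm.h) is the offset of
// its HasAVX2 byte.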
| CMPB internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1 |
| JEQ big_loop_avx2 |
| JMP big_loop |
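// Compare 16 bytes per iteration: PCMPEQB sets each byte of X1 to 0xff
// where the bytes are equal, PMOVMSKB packs those into a 16-bit mask,
// and XORing with 0xffff leaves a set bit for each byte that differs.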
| loop: |
| CMPQ R8, $16 |
| JBE _0through16 |
| MOVOU (SI), X0 |
| MOVOU (DI), X1 |
| PCMPEQB X0, X1 |
| PMOVMSKB X1, AX |
| XORQ $0xffff, AX // convert EQ to NE |
| JNE diff16 // branch if at least one byte is not equal |
| ADDQ $16, SI |
| ADDQ $16, DI |
| SUBQ $16, R8 |
| JMP loop |
| |
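// The diff64/diff48/diff32 stubs advance SI and DI so that the
// differing 16-byte chunk found below sits at (SI)/(DI), letting all
// paths share diff16.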
| diff64: |
| ADDQ $48, SI |
| ADDQ $48, DI |
| JMP diff16 |
| diff48: |
| ADDQ $32, SI |
| ADDQ $32, DI |
| JMP diff16 |
| diff32: |
| ADDQ $16, SI |
| ADDQ $16, DI |
| // AX = bit mask of differences |
| diff16: |
| BSFQ AX, BX // index of first byte that differs |
XORQ AX, AX // clear AX; SETHI below writes only its low byte
MOVB (SI)(BX*1), CX
CMPB CX, (DI)(BX*1)
SETHI AX // AX = 1 if a's byte is above b's (unsigned), else 0
| LEAQ -1(AX*2), AX // convert 1/0 to +1/-1 |
| MOVQ AX, (R9) |
| RET |
| |
| // 0 through 16 bytes left, alen>=8, blen>=8 |
| _0through16: |
| CMPQ R8, $8 |
| JBE _0through8 |
| MOVQ (SI), AX |
| MOVQ (DI), CX |
| CMPQ AX, CX |
| JNE diff8 |
| _0through8: |
| MOVQ -8(SI)(R8*1), AX |
| MOVQ -8(DI)(R8*1), CX |
| CMPQ AX, CX |
| JEQ allsame |
| |
| // AX and CX contain parts of a and b that differ. |
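// BSWAPQ converts the little-endian loads to big-endian order, so the
// highest differing bit corresponds to the first differing byte in
// memory; shifting a's copy of that bit to the bottom yields 1 iff
// a's byte is larger.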
| diff8: |
| BSWAPQ AX // reverse order of bytes |
| BSWAPQ CX |
| XORQ AX, CX |
| BSRQ CX, CX // index of highest bit difference |
| SHRQ CX, AX // move a's bit to bottom |
| ANDQ $1, AX // mask bit |
| LEAQ -1(AX*2), AX // 1/0 => +1/-1 |
| MOVQ AX, (R9) |
| RET |
| |
// 0 through 7 bytes left to compare.
| small: |
| LEAQ (R8*8), CX // bytes left -> bits left |
NEGQ CX // CX = -(bits left) == 64 - (bits left) mod 64
JEQ allsame // R8 == 0: nothing to compare
| |
// load bytes of a into high bytes of SI
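// R8 < 8, so a full 8-byte load may read past the end of the buffer.
// If the low address byte is <= 0xf8 the load stays inside the same
// 256-byte block (hence the same page) and the overread is harmless;
// otherwise load the 8 bytes ending at the last valid byte and shift
// the leading garbage out.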
| CMPB SI, $0xf8 |
| JA si_high |
| MOVQ (SI), SI |
| JMP si_finish |
| si_high: |
| MOVQ -8(SI)(R8*1), SI |
| SHRQ CX, SI |
| si_finish: |
| SHLQ CX, SI |
| |
// load bytes of b into high bytes of DI (same page-crossing trick as above)
| CMPB DI, $0xf8 |
| JA di_high |
| MOVQ (DI), DI |
| JMP di_finish |
| di_high: |
| MOVQ -8(DI)(R8*1), DI |
| SHRQ CX, DI |
| di_finish: |
| SHLQ CX, DI |
| |
| BSWAPQ SI // reverse order of bytes |
| BSWAPQ DI |
| XORQ SI, DI // find bit differences |
| JEQ allsame |
| BSRQ DI, CX // index of highest bit difference |
| SHRQ CX, SI // move a's bit to bottom |
| ANDQ $1, SI // mask bit |
| LEAQ -1(SI*2), AX // 1/0 => +1/-1 |
| MOVQ AX, (R9) |
| RET |
| |
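// All compared bytes are equal, so the result depends only on the
// lengths: 2*(alen > blen) + (alen == blen) - 1 yields +1, 0, or -1.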
| allsame: |
XORQ AX, AX // clear AX/CX; SETcc below writes only the low byte
XORQ CX, CX
| CMPQ BX, DX |
| SETGT AX // 1 if alen > blen |
| SETEQ CX // 1 if alen == blen |
| LEAQ -1(CX)(AX*2), AX // 1,0,-1 result |
| MOVQ AX, (R9) |
| RET |
| |
// This works for >= 64 bytes of data: compare 64 bytes per iteration
// as four 16-byte SSE chunks.
| big_loop: |
| MOVOU (SI), X0 |
| MOVOU (DI), X1 |
| PCMPEQB X0, X1 |
| PMOVMSKB X1, AX |
| XORQ $0xffff, AX |
| JNE diff16 |
| |
| MOVOU 16(SI), X0 |
| MOVOU 16(DI), X1 |
| PCMPEQB X0, X1 |
| PMOVMSKB X1, AX |
| XORQ $0xffff, AX |
| JNE diff32 |
| |
| MOVOU 32(SI), X0 |
| MOVOU 32(DI), X1 |
| PCMPEQB X0, X1 |
| PMOVMSKB X1, AX |
| XORQ $0xffff, AX |
| JNE diff48 |
| |
| MOVOU 48(SI), X0 |
| MOVOU 48(DI), X1 |
| PCMPEQB X0, X1 |
| PMOVMSKB X1, AX |
| XORQ $0xffff, AX |
| JNE diff64 |
| |
| ADDQ $64, SI |
| ADDQ $64, DI |
| SUBQ $64, R8 |
CMPQ R8, $64
JBE loop // <= 64 bytes remain; finish with the 16-byte loop
| JMP big_loop |
| |
// Compare 64 bytes per loop iteration.
// The loop is unrolled and uses AVX2.
| big_loop_avx2: |
| VMOVDQU (SI), Y2 |
| VMOVDQU (DI), Y3 |
| VMOVDQU 32(SI), Y4 |
| VMOVDQU 32(DI), Y5 |
VPCMPEQB Y2, Y3, Y0
VPMOVMSKB Y0, AX
XORL $0xffffffff, AX // convert EQ to NE
JNE diff32_avx2
VPCMPEQB Y4, Y5, Y6
VPMOVMSKB Y6, AX
XORL $0xffffffff, AX // convert EQ to NE
JNE diff64_avx2
| |
| ADDQ $64, SI |
| ADDQ $64, DI |
| SUBQ $64, R8 |
CMPQ R8, $64
JB big_loop_avx2_exit // < 64 bytes remain; fall back to the SSE loop
| JMP big_loop_avx2 |
| |
// Avoid the AVX->SSE transition penalty, then search the first 32 bytes of the 64-byte chunk.
| diff32_avx2: |
| VZEROUPPER |
| JMP diff16 |
| |
// Same as diff32_avx2, but for the last 32 bytes.
| diff64_avx2: |
| VZEROUPPER |
| JMP diff48 |
| |
// For a remainder of fewer than 64 bytes, jump to the normal 16-byte loop.
| big_loop_avx2_exit: |
| VZEROUPPER |
| JMP loop |