|  | // Copyright 2018 The Go Authors. All rights reserved. | 
|  | // Use of this source code is governed by a BSD-style | 
|  | // license that can be found in the LICENSE file. | 
|  |  | 
|  | #include "go_asm.h" | 
|  | #include "textflag.h" | 
|  |  | 
// ·Index is an argument-marshalling stub in front of indexbody<>.
// Frame slots read here: a_base+0, a_len+8, b_base+24, b_len+32, ret+48
// (presumably func Index(a, b []byte) int — confirm against the Go declaration).
|  | TEXT ·Index(SB),NOSPLIT,$0-56 | 
// Needle (b): pointer in R8, length in AX.
|  | MOVQ b_base+24(FP), R8 | 
|  | MOVQ b_len+32(FP), AX | 
// Haystack (a): pointer in DI, length in DX.
|  | MOVQ a_base+0(FP), DI | 
|  | MOVQ a_len+8(FP), DX | 
// R11 = address of the result slot that indexbody<> stores through.
// R10 = untouched copy of the haystack start; indexbody<> subtracts it from
// the match pointer to produce an offset.
|  | LEAQ ret+48(FP), R11 | 
|  | MOVQ DI, R10 | 
// Tail-jump: indexbody<> writes the result and returns straight to our caller.
|  | JMP  indexbody<>(SB) | 
|  |  | 
// ·IndexString is the same stub as ·Index, but for 16-byte argument headers.
// Frame slots read here: a_base+0, a_len+8, b_base+16, b_len+24, ret+32
// (presumably func IndexString(a, b string) int — confirm against the Go declaration).
|  | TEXT ·IndexString(SB),NOSPLIT,$0-40 | 
// Needle (b): pointer in R8, length in AX.
|  | MOVQ b_base+16(FP), R8 | 
|  | MOVQ b_len+24(FP), AX | 
// Haystack (a): pointer in DI, length in DX.
|  | MOVQ a_base+0(FP), DI | 
|  | MOVQ a_len+8(FP), DX | 
// R11 = address of the result slot that indexbody<> stores through.
// R10 = untouched copy of the haystack start; indexbody<> subtracts it from
// the match pointer to produce an offset.
|  | LEAQ ret+32(FP), R11 | 
|  | MOVQ DI, R10 | 
// Tail-jump: indexbody<> writes the result and returns straight to our caller.
|  | JMP  indexbody<>(SB) | 
|  |  | 
// indexbody<> is the shared search routine that ·Index and ·IndexString
// tail-jump into. It stores the byte offset of the first occurrence of the
// needle in the haystack through R11, or -1 when there is no occurrence.
//
// Register contract on entry:
|  | // AX: length of string, that we are searching for | 
|  | // DX: length of string, in which we are searching | 
|  | // DI: pointer to string, in which we are searching | 
|  | // R8: pointer to string, that we are searching for | 
|  | // R11: address, where to put return value | 
|  | // Note: We want len in DX and AX, because PCMPESTRI implicitly consumes them | 
// R10: copy of the original DI; subtracted from the match pointer at `success`
//      to turn it into an offset.
// Overwritten along the way: DI, DX, R8, plus scratch BX, CX, SI, R9,
// X0-X3 and (AVX2 paths only) Y0-Y4.
|  | TEXT indexbody<>(SB),NOSPLIT,$0 | 
// A needle longer than the haystack can never match (unsigned compare).
|  | CMPQ AX, DX | 
|  | JA fail | 
// Haystacks of 16+ bytes are candidates for the PCMPESTRI path at `sse42`.
|  | CMPQ DX, $16 | 
|  | JAE sse42 | 
|  | no_sse42: | 
// Dispatch on needle length. Every size class below loads the needle into
// registers once, then slides DI over the haystack one byte at a time.
// DX is rewritten to the exclusive upper bound for DI, i.e. the first start
// position whose compare window would run past the end of the haystack.
//
// Needle length <= 2: compare one 16-bit word per position.
// NOTE(review): lengths 0 and 1 also land here and are compared as 2 bytes;
// presumably callers never pass them — confirm against the Go callers.
|  | CMPQ AX, $2 | 
|  | JA   _3_or_more | 
|  | MOVW (R8), R8 | 
|  | LEAQ -1(DI)(DX*1), DX | 
// Pad so that the loop head below starts on a 16-byte boundary.
|  | PCALIGN $16 | 
|  | loop2: | 
|  | MOVW (DI), SI | 
|  | CMPW SI,R8 | 
|  | JZ success | 
|  | ADDQ $1,DI | 
|  | CMPQ DI,DX | 
|  | JB loop2 | 
|  | JMP fail | 
|  | _3_or_more: | 
// Needle length 3: two overlapping 16-bit words.
// R8 = needle bytes 0-1, BX = needle bytes 1-2.
|  | CMPQ AX, $3 | 
|  | JA   _4_or_more | 
|  | MOVW 1(R8), BX | 
|  | MOVW (R8), R8 | 
|  | LEAQ -2(DI)(DX*1), DX | 
|  | loop3: | 
|  | MOVW (DI), SI | 
|  | CMPW SI,R8 | 
|  | JZ   partial_success3 | 
|  | ADDQ $1,DI | 
|  | CMPQ DI,DX | 
|  | JB loop3 | 
|  | JMP fail | 
|  | partial_success3: | 
// First word matched; confirm with the word at offset 1. On a mismatch,
// step to the next position and resume the main loop if still in bounds.
|  | MOVW 1(DI), SI | 
|  | CMPW SI,BX | 
|  | JZ success | 
|  | ADDQ $1,DI | 
|  | CMPQ DI,DX | 
|  | JB loop3 | 
|  | JMP fail | 
|  | _4_or_more: | 
// Needle length 4: a single 32-bit compare per position.
|  | CMPQ AX, $4 | 
|  | JA   _5_or_more | 
|  | MOVL (R8), R8 | 
|  | LEAQ -3(DI)(DX*1), DX | 
|  | loop4: | 
|  | MOVL (DI), SI | 
|  | CMPL SI,R8 | 
|  | JZ   success | 
|  | ADDQ $1,DI | 
|  | CMPQ DI,DX | 
|  | JB loop4 | 
|  | JMP fail | 
|  | _5_or_more: | 
// Needle length 5-7: two overlapping 32-bit windows.
// R8 = first 4 needle bytes, BX = last 4 needle bytes.
// DX = DI + DX + 1 - AX is the exclusive bound on start positions.
|  | CMPQ AX, $7 | 
|  | JA   _8_or_more | 
|  | LEAQ 1(DI)(DX*1), DX | 
|  | SUBQ AX, DX | 
|  | MOVL -4(R8)(AX*1), BX | 
|  | MOVL (R8), R8 | 
|  | loop5to7: | 
|  | MOVL (DI), SI | 
|  | CMPL SI,R8 | 
|  | JZ   partial_success5to7 | 
|  | ADDQ $1,DI | 
|  | CMPQ DI,DX | 
|  | JB loop5to7 | 
|  | JMP fail | 
|  | partial_success5to7: | 
// Head matched; compare the 4 bytes ending at DI+AX against the needle tail.
|  | MOVL -4(AX)(DI*1), SI | 
|  | CMPL SI,BX | 
|  | JZ success | 
|  | ADDQ $1,DI | 
|  | CMPQ DI,DX | 
|  | JB loop5to7 | 
|  | JMP fail | 
|  | _8_or_more: | 
// Needle length 8: a single 64-bit compare per position.
|  | CMPQ AX, $8 | 
|  | JA   _9_or_more | 
|  | MOVQ (R8), R8 | 
|  | LEAQ -7(DI)(DX*1), DX | 
|  | loop8: | 
|  | MOVQ (DI), SI | 
|  | CMPQ SI,R8 | 
|  | JZ   success | 
|  | ADDQ $1,DI | 
|  | CMPQ DI,DX | 
|  | JB loop8 | 
|  | JMP fail | 
|  | _9_or_more: | 
// Needle length 9-15: two overlapping 64-bit windows
// (R8 = first 8 needle bytes, BX = last 8 needle bytes).
// Also the entry point used by the `sse42` section for needles of 12+ bytes.
|  | CMPQ AX, $15 | 
|  | JA   _16_or_more | 
|  | LEAQ 1(DI)(DX*1), DX | 
|  | SUBQ AX, DX | 
|  | MOVQ -8(R8)(AX*1), BX | 
|  | MOVQ (R8), R8 | 
|  | loop9to15: | 
|  | MOVQ (DI), SI | 
|  | CMPQ SI,R8 | 
|  | JZ   partial_success9to15 | 
|  | ADDQ $1,DI | 
|  | CMPQ DI,DX | 
|  | JB loop9to15 | 
|  | JMP fail | 
|  | partial_success9to15: | 
// Head matched; compare the 8 bytes ending at DI+AX against the needle tail.
|  | MOVQ -8(AX)(DI*1), SI | 
|  | CMPQ SI,BX | 
|  | JZ success | 
|  | ADDQ $1,DI | 
|  | CMPQ DI,DX | 
|  | JB loop9to15 | 
|  | JMP fail | 
|  | _16_or_more: | 
// Needle length 16: one 128-bit compare per position.
// PCMPEQB sets each equal byte lane to 0xff and PMOVMSKB collects the lane
// sign bits, so a mask of 0xffff means all 16 bytes are equal.
|  | CMPQ AX, $16 | 
|  | JA   _17_or_more | 
|  | MOVOU (R8), X1 | 
|  | LEAQ -15(DI)(DX*1), DX | 
|  | loop16: | 
|  | MOVOU (DI), X2 | 
|  | PCMPEQB X1, X2 | 
|  | PMOVMSKB X2, SI | 
|  | CMPQ  SI, $0xffff | 
|  | JE   success | 
|  | ADDQ $1,DI | 
|  | CMPQ DI,DX | 
|  | JB loop16 | 
|  | JMP fail | 
|  | _17_or_more: | 
// Needle length 17-31: two overlapping 128-bit windows
// (X1 = first 16 needle bytes, X0 = last 16 needle bytes).
|  | CMPQ AX, $31 | 
|  | JA   _32_or_more | 
|  | LEAQ 1(DI)(DX*1), DX | 
|  | SUBQ AX, DX | 
|  | MOVOU -16(R8)(AX*1), X0 | 
|  | MOVOU (R8), X1 | 
|  | loop17to31: | 
|  | MOVOU (DI), X2 | 
|  | PCMPEQB X1,X2 | 
|  | PMOVMSKB X2, SI | 
|  | CMPQ  SI, $0xffff | 
|  | JE   partial_success17to31 | 
|  | ADDQ $1,DI | 
|  | CMPQ DI,DX | 
|  | JB loop17to31 | 
|  | JMP fail | 
|  | partial_success17to31: | 
// Head matched; compare the 16 bytes ending at DI+AX against the needle tail.
|  | MOVOU -16(AX)(DI*1), X3 | 
|  | PCMPEQB X0, X3 | 
|  | PMOVMSKB X3, SI | 
|  | CMPQ  SI, $0xffff | 
|  | JE success | 
|  | ADDQ $1,DI | 
|  | CMPQ DI,DX | 
|  | JB loop17to31 | 
|  | JMP fail | 
|  | // We can get here only when AVX2 is enabled and cutoff for indexShortStr is set to 63 | 
|  | // So no need to check cpuid | 
|  | _32_or_more: | 
// Needle length 32: one 256-bit compare per position; a mask of 0xffffffff
// means all 32 bytes are equal. The AVX2 paths leave through
// success_avx2 / fail_avx2 so that VZEROUPPER runs before returning.
|  | CMPQ AX, $32 | 
|  | JA   _33_to_63 | 
|  | VMOVDQU (R8), Y1 | 
|  | LEAQ -31(DI)(DX*1), DX | 
|  | loop32: | 
|  | VMOVDQU (DI), Y2 | 
|  | VPCMPEQB Y1, Y2, Y3 | 
|  | VPMOVMSKB Y3, SI | 
|  | CMPL  SI, $0xffffffff | 
|  | JE   success_avx2 | 
|  | ADDQ $1,DI | 
|  | CMPQ DI,DX | 
|  | JB loop32 | 
|  | JMP fail_avx2 | 
|  | _33_to_63: | 
// Needle length above 32: two overlapping 256-bit windows
// (Y1 = first 32 needle bytes, Y0 = last 32 needle bytes).
|  | LEAQ 1(DI)(DX*1), DX | 
|  | SUBQ AX, DX | 
|  | VMOVDQU -32(R8)(AX*1), Y0 | 
|  | VMOVDQU (R8), Y1 | 
|  | loop33to63: | 
|  | VMOVDQU (DI), Y2 | 
|  | VPCMPEQB Y1, Y2, Y3 | 
|  | VPMOVMSKB Y3, SI | 
|  | CMPL  SI, $0xffffffff | 
|  | JE   partial_success33to63 | 
|  | ADDQ $1,DI | 
|  | CMPQ DI,DX | 
|  | JB loop33to63 | 
|  | JMP fail_avx2 | 
|  | partial_success33to63: | 
// Head matched; compare the 32 bytes ending at DI+AX against the needle tail.
// When the haystack is exhausted this falls through into fail_avx2.
|  | VMOVDQU -32(AX)(DI*1), Y3 | 
|  | VPCMPEQB Y0, Y3, Y4 | 
|  | VPMOVMSKB Y4, SI | 
|  | CMPL  SI, $0xffffffff | 
|  | JE success_avx2 | 
|  | ADDQ $1,DI | 
|  | CMPQ DI,DX | 
|  | JB loop33to63 | 
|  | fail_avx2: | 
// Clear the upper YMM halves before returning, then share the plain exit.
|  | VZEROUPPER | 
|  | fail: | 
// No match: store -1 in the result slot.
|  | MOVQ $-1, (R11) | 
|  | RET | 
|  | success_avx2: | 
|  | VZEROUPPER | 
|  | JMP success | 
|  | sse42: | 
// PCMPESTRI path, entered when the haystack is at least 16 bytes long.
// Unless hasSSE42 is defined at assembly time, the CPU feature flag is checked
// at run time and the scalar/SSE2 dispatch at no_sse42 is used when it is not set.
|  | #ifndef hasSSE42 | 
|  | CMPB internal∕cpu·X86+const_offsetX86HasSSE42(SB), $1 | 
|  | JNE no_sse42 | 
|  | #endif | 
|  | CMPQ AX, $12 | 
|  | // PCMPESTRI is slower than normal compare, | 
|  | // so using it makes sense only if we advance 4+ bytes per compare | 
|  | // This value was determined experimentally and is the ~same | 
|  | // on Nehalem (first with SSE42) and Haswell. | 
|  | JAE _9_or_more | 
// Guard for the 16-byte needle load below, which reads past the needle
// (the needle is shorter than 12 bytes here). If bits 4-11 of needle+16 are all
// zero, the needle starts in the last 16 bytes of a 4096-byte-aligned region and
// the load could reach into the next one, so take the no_sse42 path instead.
|  | LEAQ 16(R8), SI | 
|  | TESTW $0xff0, SI | 
|  | JEQ no_sse42 | 
|  | MOVOU (R8), X1 | 
// SI = DI + DX - 15: exclusive bound on positions from which a full 16-byte
// haystack window can be read (the last such position is SI-1).
|  | LEAQ -15(DI)(DX*1), SI | 
|  | MOVQ $16, R9 | 
|  | SUBQ AX, R9 // We advance by 16-len(sep) each iteration, so precalculate it into R9 | 
|  | PCALIGN $16 | 
|  | loop_sse42: | 
|  | // 0x0c means: unsigned byte compare (bits 0,1 are 00) | 
|  | // for equality (bits 2,3 are 11) | 
|  | // result is not masked or inverted (bits 4,5 are 00) | 
|  | // and corresponds to first matching byte (bit 6 is 0) | 
|  | PCMPESTRI $0x0c, (DI), X1 | 
|  | // CX == 16 means no match, | 
|  | // CX > R9 means partial match at the end of the string, | 
|  | // otherwise sep is at offset CX from X1 start | 
|  | CMPQ CX, R9 | 
|  | JBE sse42_success | 
|  | ADDQ R9, DI | 
|  | CMPQ DI, SI | 
|  | JB loop_sse42 | 
// DI has moved past the last full-window position: run one final compare on
// the last 16 bytes of the haystack (starting at SI-1).
|  | PCMPESTRI $0x0c, -1(SI), X1 | 
|  | CMPQ CX, R9 | 
|  | JA fail | 
|  | LEAQ -1(SI), DI | 
|  | sse42_success: | 
// CX is the match offset inside the 16-byte window that starts at DI.
|  | ADDQ CX, DI | 
|  | success: | 
// DI points at the match; subtract the haystack start saved in R10 to get the
// offset, and store it in the result slot.
|  | SUBQ R10, DI | 
|  | MOVQ DI, (R11) | 
|  | RET | 