// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "go_asm.h" | |

#include "textflag.h" | |

// Index: entry point for a haystack `a` and a needle `b` that are both laid
// out as (base, len, cap) triples; the result is an 8-byte value at ret+48(FP).
//
// It only marshals arguments into the registers indexbody<> expects and
// tail-jumps there. The needle pointer is carried in R8 rather than BP:
// this routine has a $0 frame, so BP is never saved or restored for us, and
// overwriting it would hand a corrupted frame pointer back to the caller.
TEXT ·Index(SB),NOSPLIT,$0-56
	MOVQ a_base+0(FP), DI   // DI  = haystack cursor
	MOVQ a_len+8(FP), DX    // DX  = len(haystack)
	MOVQ b_base+24(FP), R8  // R8  = needle pointer
	MOVQ b_len+32(FP), AX   // AX  = len(needle)
	MOVQ DI, R10            // R10 = haystack start, kept to compute the result offset
	LEAQ ret+48(FP), R11    // R11 = address of the result slot
	JMP  indexbody<>(SB)

// IndexString: same as Index, but `a` and `b` are laid out as (base, len)
// pairs, so the needle starts at +16 and the result slot is at ret+32(FP).
TEXT ·IndexString(SB),NOSPLIT,$0-40
	MOVQ a_base+0(FP), DI   // DI  = haystack cursor
	MOVQ a_len+8(FP), DX    // DX  = len(haystack)
	MOVQ b_base+16(FP), R8  // R8  = needle pointer
	MOVQ b_len+24(FP), AX   // AX  = len(needle)
	MOVQ DI, R10            // R10 = haystack start, kept to compute the result offset
	LEAQ ret+32(FP), R11    // R11 = address of the result slot
	JMP  indexbody<>(SB)

// indexbody<>: shared search body. Stores the offset of the first occurrence
// of the needle in the haystack through R11, or -1 if there is none.
//
// Inputs:
//   AX:  length of the string we are searching for (the needle)
//   DX:  length of the string we are searching in (the haystack)
//   DI:  pointer to the haystack
//   R8:  pointer to the needle
//   R10: pointer to the start of the haystack (used to turn DI into an offset)
//   R11: address where the return value is stored
// Scratch: BX, CX, SI, R9, X0-X3, Y0-Y4, flags. BP is deliberately untouched.
//
// Note: the lengths must live in AX and DX because PCMPESTRI consumes them
// implicitly.
//
// The code dispatches on len(needle). For exact power-of-two sizes
// (2, 4, 8, 16, 32) the needle is compared with a single load. For the ranges
// in between, the first and the last N bytes of the needle are compared with
// two (overlapping) loads. A needle of length 2 or less is handled by the
// 2-byte path, which always loads two bytes.
// NOTE(review): this implies callers must pass len(needle) >= 2 - confirm
// against the Go-side wrapper.
//
// Every scan loop is a do/while: the first candidate position is tested before
// the bound check, which is safe because AX <= DX has already been verified.
TEXT indexbody<>(SB),NOSPLIT,$0
	CMPQ AX, DX
	JA   fail               // needle longer than haystack: no match possible
	CMPQ DX, $16
	JAE  sse42              // haystack has at least 16 bytes: consider PCMPESTRI
no_sse42:
	CMPQ AX, $2
	JA   _3_or_more
	MOVW (R8), R8           // R8 = the 2 needle bytes
	LEAQ -1(DI)(DX*1), DX   // DX = one past the last valid start position
loop2:
	MOVW (DI), SI
	CMPW SI, R8
	JZ   success
	ADDQ $1, DI
	CMPQ DI, DX
	JB   loop2
	JMP  fail
_3_or_more:
	CMPQ AX, $3
	JA   _4_or_more
	MOVW 1(R8), BX          // BX = needle bytes 1..2
	MOVW (R8), R8           // R8 = needle bytes 0..1
	LEAQ -2(DI)(DX*1), DX   // DX = one past the last valid start position
loop3:
	MOVW (DI), SI
	CMPW SI, R8
	JZ   partial_success3
	ADDQ $1, DI
	CMPQ DI, DX
	JB   loop3
	JMP  fail
partial_success3:
	MOVW 1(DI), SI          // first two bytes matched; check bytes 1..2
	CMPW SI, BX
	JZ   success
	ADDQ $1, DI
	CMPQ DI, DX
	JB   loop3
	JMP  fail
_4_or_more:
	CMPQ AX, $4
	JA   _5_or_more
	MOVL (R8), R8           // R8 = the 4 needle bytes
	LEAQ -3(DI)(DX*1), DX   // DX = one past the last valid start position
loop4:
	MOVL (DI), SI
	CMPL SI, R8
	JZ   success
	ADDQ $1, DI
	CMPQ DI, DX
	JB   loop4
	JMP  fail
_5_or_more:
	CMPQ AX, $7
	JA   _8_or_more
	LEAQ 1(DI)(DX*1), DX
	SUBQ AX, DX             // DX = haystack + len(haystack) - len(needle) + 1
	MOVL -4(R8)(AX*1), BX   // BX = last 4 needle bytes
	MOVL (R8), R8           // R8 = first 4 needle bytes
loop5to7:
	MOVL (DI), SI
	CMPL SI, R8
	JZ   partial_success5to7
	ADDQ $1, DI
	CMPQ DI, DX
	JB   loop5to7
	JMP  fail
partial_success5to7:
	MOVL -4(AX)(DI*1), SI   // head matched; compare the 4 bytes ending at DI+len(needle)
	CMPL SI, BX
	JZ   success
	ADDQ $1, DI
	CMPQ DI, DX
	JB   loop5to7
	JMP  fail
_8_or_more:
	CMPQ AX, $8
	JA   _9_or_more
	MOVQ (R8), R8           // R8 = the 8 needle bytes
	LEAQ -7(DI)(DX*1), DX   // DX = one past the last valid start position
loop8:
	MOVQ (DI), SI
	CMPQ SI, R8
	JZ   success
	ADDQ $1, DI
	CMPQ DI, DX
	JB   loop8
	JMP  fail
_9_or_more:
	CMPQ AX, $15
	JA   _16_or_more
	LEAQ 1(DI)(DX*1), DX
	SUBQ AX, DX             // DX = haystack + len(haystack) - len(needle) + 1
	MOVQ -8(R8)(AX*1), BX   // BX = last 8 needle bytes
	MOVQ (R8), R8           // R8 = first 8 needle bytes
loop9to15:
	MOVQ (DI), SI
	CMPQ SI, R8
	JZ   partial_success9to15
	ADDQ $1, DI
	CMPQ DI, DX
	JB   loop9to15
	JMP  fail
partial_success9to15:
	MOVQ -8(AX)(DI*1), SI   // head matched; compare the 8 bytes ending at DI+len(needle)
	CMPQ SI, BX
	JZ   success
	ADDQ $1, DI
	CMPQ DI, DX
	JB   loop9to15
	JMP  fail
_16_or_more:
	CMPQ AX, $16
	JA   _17_or_more
	MOVOU (R8), X1          // X1 = the 16 needle bytes
	LEAQ -15(DI)(DX*1), DX  // DX = one past the last valid start position
loop16:
	MOVOU (DI), X2
	PCMPEQB X1, X2
	PMOVMSKB X2, SI
	CMPQ SI, $0xffff        // all 16 byte lanes equal?
	JE   success
	ADDQ $1, DI
	CMPQ DI, DX
	JB   loop16
	JMP  fail
_17_or_more:
	CMPQ AX, $31
	JA   _32_or_more
	LEAQ 1(DI)(DX*1), DX
	SUBQ AX, DX             // DX = haystack + len(haystack) - len(needle) + 1
	MOVOU -16(R8)(AX*1), X0 // X0 = last 16 needle bytes
	MOVOU (R8), X1          // X1 = first 16 needle bytes
loop17to31:
	MOVOU (DI), X2
	PCMPEQB X1, X2
	PMOVMSKB X2, SI
	CMPQ SI, $0xffff
	JE   partial_success17to31
	ADDQ $1, DI
	CMPQ DI, DX
	JB   loop17to31
	JMP  fail
partial_success17to31:
	MOVOU -16(AX)(DI*1), X3 // head matched; compare the 16 bytes ending at DI+len(needle)
	PCMPEQB X0, X3
	PMOVMSKB X3, SI
	CMPQ SI, $0xffff
	JE   success
	ADDQ $1, DI
	CMPQ DI, DX
	JB   loop17to31
	JMP  fail
// We can get here only when AVX2 is enabled and the cutoff for indexShortStr
// is set to 63, so there is no need to check cpuid.
_32_or_more:
	CMPQ AX, $32
	JA   _33_to_63
	VMOVDQU (R8), Y1        // Y1 = the 32 needle bytes
	LEAQ -31(DI)(DX*1), DX  // DX = one past the last valid start position
loop32:
	VMOVDQU (DI), Y2
	VPCMPEQB Y1, Y2, Y3
	VPMOVMSKB Y3, SI
	CMPL SI, $0xffffffff    // all 32 byte lanes equal?
	JE   success_avx2
	ADDQ $1, DI
	CMPQ DI, DX
	JB   loop32
	JMP  fail_avx2
_33_to_63:
	LEAQ 1(DI)(DX*1), DX
	SUBQ AX, DX             // DX = haystack + len(haystack) - len(needle) + 1
	VMOVDQU -32(R8)(AX*1), Y0 // Y0 = last 32 needle bytes
	VMOVDQU (R8), Y1        // Y1 = first 32 needle bytes
loop33to63:
	VMOVDQU (DI), Y2
	VPCMPEQB Y1, Y2, Y3
	VPMOVMSKB Y3, SI
	CMPL SI, $0xffffffff
	JE   partial_success33to63
	ADDQ $1, DI
	CMPQ DI, DX
	JB   loop33to63
	JMP  fail_avx2
partial_success33to63:
	VMOVDQU -32(AX)(DI*1), Y3 // head matched; compare the 32 bytes ending at DI+len(needle)
	VPCMPEQB Y0, Y3, Y4
	VPMOVMSKB Y4, SI
	CMPL SI, $0xffffffff
	JE   success_avx2
	ADDQ $1, DI
	CMPQ DI, DX
	JB   loop33to63
	// Haystack exhausted: fall through to fail_avx2.
fail_avx2:
	VZEROUPPER              // clear upper YMM state before returning from the AVX2 paths
fail:
	MOVQ $-1, (R11)         // not found
	RET
success_avx2:
	VZEROUPPER
	JMP  success
sse42:
	CMPB internal∕cpu·X86+const_offsetX86HasSSE42(SB), $1
	JNE  no_sse42
	CMPQ AX, $12
	// PCMPESTRI is slower than normal compare,
	// so using it makes sense only if we advance 4+ bytes per compare
	// This value was determined experimentally and is the ~same
	// on Nehalem (first with SSE42) and Haswell.
	JAE  _9_or_more
	// The needle is shorter than 16 bytes but is loaded with a 16-byte MOVOU
	// below. Skip this path when (needle+16) & 0xff0 == 0.
	// NOTE(review): presumably this guards against the load running past the
	// end of the needle's 4 KiB page - confirm.
	LEAQ 16(R8), SI
	TESTW $0xff0, SI
	JEQ  no_sse42
	MOVOU (R8), X1          // X1 = needle (only the first AX bytes are significant)
	LEAQ -15(DI)(DX*1), SI  // SI = one past the last position where a 16-byte load fits
	MOVQ $16, R9
	SUBQ AX, R9 // We advance by 16-len(sep) each iteration, so precalculate it into R9
loop_sse42:
	// 0x0c means: unsigned byte compare (bits 0,1 are 00)
	// for equality (bits 2,3 are 11)
	// result is not masked or inverted (bits 4,5 are 00)
	// and corresponds to first matching byte (bit 6 is 0)
	PCMPESTRI $0x0c, (DI), X1
	// CX == 16 means no match,
	// CX > R9 means partial match at the end of the string,
	// otherwise sep is at offset CX from X1 start
	CMPQ CX, R9
	JBE  sse42_success
	ADDQ R9, DI
	CMPQ DI, SI
	JB   loop_sse42
	// Final probe over the last 16 bytes of the haystack.
	PCMPESTRI $0x0c, -1(SI), X1
	CMPQ CX, R9
	JA   fail
	LEAQ -1(SI), DI
sse42_success:
	ADDQ CX, DI             // DI = address of the match
success:
	SUBQ R10, DI            // convert the match address into an offset from the haystack start
	MOVQ DI, (R11)
	RET