blob: 31730e539415b6ee1aaa40f4ce98b203c9653dbe [file] [log] [blame]
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "go_asm.h"
#include "textflag.h"
TEXT ·Index(SB),NOSPLIT,$0-56
MOVQ a_base+0(FP), DI
MOVQ a_len+8(FP), DX
MOVQ b_base+24(FP), R8
MOVQ b_len+32(FP), AX
MOVQ DI, R10
LEAQ ret+48(FP), R11
JMP indexbody<>(SB)
TEXT ·IndexString(SB),NOSPLIT,$0-40
MOVQ a_base+0(FP), DI
MOVQ a_len+8(FP), DX
MOVQ b_base+16(FP), R8
MOVQ b_len+24(FP), AX
MOVQ DI, R10
LEAQ ret+32(FP), R11
JMP indexbody<>(SB)
// AX: length of string, that we are searching for
// DX: length of string, in which we are searching
// DI: pointer to string, in which we are searching
// R8: pointer to string, that we are searching for
// R11: address, where to put return value
// Note: We want len in DX and AX, because PCMPESTRI implicitly consumes them
TEXT indexbody<>(SB),NOSPLIT,$0
CMPQ AX, DX
JA fail
CMPQ DX, $16
JAE sse42
no_sse42:
CMPQ AX, $2
JA _3_or_more
MOVW (R8), R8
LEAQ -1(DI)(DX*1), DX
PCALIGN $16
loop2:
MOVW (DI), SI
CMPW SI,R8
JZ success
ADDQ $1,DI
CMPQ DI,DX
JB loop2
JMP fail
_3_or_more:
CMPQ AX, $3
JA _4_or_more
MOVW 1(R8), BX
MOVW (R8), R8
LEAQ -2(DI)(DX*1), DX
loop3:
MOVW (DI), SI
CMPW SI,R8
JZ partial_success3
ADDQ $1,DI
CMPQ DI,DX
JB loop3
JMP fail
partial_success3:
MOVW 1(DI), SI
CMPW SI,BX
JZ success
ADDQ $1,DI
CMPQ DI,DX
JB loop3
JMP fail
_4_or_more:
CMPQ AX, $4
JA _5_or_more
MOVL (R8), R8
LEAQ -3(DI)(DX*1), DX
loop4:
MOVL (DI), SI
CMPL SI,R8
JZ success
ADDQ $1,DI
CMPQ DI,DX
JB loop4
JMP fail
_5_or_more:
CMPQ AX, $7
JA _8_or_more
LEAQ 1(DI)(DX*1), DX
SUBQ AX, DX
MOVL -4(R8)(AX*1), BX
MOVL (R8), R8
loop5to7:
MOVL (DI), SI
CMPL SI,R8
JZ partial_success5to7
ADDQ $1,DI
CMPQ DI,DX
JB loop5to7
JMP fail
partial_success5to7:
MOVL -4(AX)(DI*1), SI
CMPL SI,BX
JZ success
ADDQ $1,DI
CMPQ DI,DX
JB loop5to7
JMP fail
_8_or_more:
CMPQ AX, $8
JA _9_or_more
MOVQ (R8), R8
LEAQ -7(DI)(DX*1), DX
loop8:
MOVQ (DI), SI
CMPQ SI,R8
JZ success
ADDQ $1,DI
CMPQ DI,DX
JB loop8
JMP fail
_9_or_more:
CMPQ AX, $15
JA _16_or_more
LEAQ 1(DI)(DX*1), DX
SUBQ AX, DX
MOVQ -8(R8)(AX*1), BX
MOVQ (R8), R8
loop9to15:
MOVQ (DI), SI
CMPQ SI,R8
JZ partial_success9to15
ADDQ $1,DI
CMPQ DI,DX
JB loop9to15
JMP fail
partial_success9to15:
MOVQ -8(AX)(DI*1), SI
CMPQ SI,BX
JZ success
ADDQ $1,DI
CMPQ DI,DX
JB loop9to15
JMP fail
_16_or_more:
CMPQ AX, $16
JA _17_or_more
MOVOU (R8), X1
LEAQ -15(DI)(DX*1), DX
loop16:
MOVOU (DI), X2
PCMPEQB X1, X2
PMOVMSKB X2, SI
CMPQ SI, $0xffff
JE success
ADDQ $1,DI
CMPQ DI,DX
JB loop16
JMP fail
_17_or_more:
CMPQ AX, $31
JA _32_or_more
LEAQ 1(DI)(DX*1), DX
SUBQ AX, DX
MOVOU -16(R8)(AX*1), X0
MOVOU (R8), X1
loop17to31:
MOVOU (DI), X2
PCMPEQB X1,X2
PMOVMSKB X2, SI
CMPQ SI, $0xffff
JE partial_success17to31
ADDQ $1,DI
CMPQ DI,DX
JB loop17to31
JMP fail
partial_success17to31:
MOVOU -16(AX)(DI*1), X3
PCMPEQB X0, X3
PMOVMSKB X3, SI
CMPQ SI, $0xffff
JE success
ADDQ $1,DI
CMPQ DI,DX
JB loop17to31
JMP fail
// We can get here only when AVX2 is enabled and cutoff for indexShortStr is set to 63
// So no need to check cpuid
_32_or_more:
CMPQ AX, $32
JA _33_to_63
VMOVDQU (R8), Y1
LEAQ -31(DI)(DX*1), DX
loop32:
VMOVDQU (DI), Y2
VPCMPEQB Y1, Y2, Y3
VPMOVMSKB Y3, SI
CMPL SI, $0xffffffff
JE success_avx2
ADDQ $1,DI
CMPQ DI,DX
JB loop32
JMP fail_avx2
_33_to_63:
LEAQ 1(DI)(DX*1), DX
SUBQ AX, DX
VMOVDQU -32(R8)(AX*1), Y0
VMOVDQU (R8), Y1
loop33to63:
VMOVDQU (DI), Y2
VPCMPEQB Y1, Y2, Y3
VPMOVMSKB Y3, SI
CMPL SI, $0xffffffff
JE partial_success33to63
ADDQ $1,DI
CMPQ DI,DX
JB loop33to63
JMP fail_avx2
partial_success33to63:
VMOVDQU -32(AX)(DI*1), Y3
VPCMPEQB Y0, Y3, Y4
VPMOVMSKB Y4, SI
CMPL SI, $0xffffffff
JE success_avx2
ADDQ $1,DI
CMPQ DI,DX
JB loop33to63
fail_avx2:
VZEROUPPER
fail:
MOVQ $-1, (R11)
RET
success_avx2:
VZEROUPPER
JMP success
sse42:
#ifndef hasSSE42
CMPB internal∕cpu·X86+const_offsetX86HasSSE42(SB), $1
JNE no_sse42
#endif
CMPQ AX, $12
// PCMPESTRI is slower than normal compare,
// so using it makes sense only if we advance 4+ bytes per compare
// This value was determined experimentally and is the ~same
// on Nehalem (first with SSE42) and Haswell.
JAE _9_or_more
LEAQ 16(R8), SI
TESTW $0xff0, SI
JEQ no_sse42
MOVOU (R8), X1
LEAQ -15(DI)(DX*1), SI
MOVQ $16, R9
SUBQ AX, R9 // We advance by 16-len(sep) each iteration, so precalculate it into R9
PCALIGN $16
loop_sse42:
// 0x0c means: unsigned byte compare (bits 0,1 are 00)
// for equality (bits 2,3 are 11)
// result is not masked or inverted (bits 4,5 are 00)
// and corresponds to first matching byte (bit 6 is 0)
PCMPESTRI $0x0c, (DI), X1
// CX == 16 means no match,
// CX > R9 means partial match at the end of the string,
// otherwise sep is at offset CX from X1 start
CMPQ CX, R9
JBE sse42_success
ADDQ R9, DI
CMPQ DI, SI
JB loop_sse42
PCMPESTRI $0x0c, -1(SI), X1
CMPQ CX, R9
JA fail
LEAQ -1(SI), DI
sse42_success:
ADDQ CX, DI
success:
SUBQ R10, DI
MOVQ DI, (R11)
RET