blob: 491d5bcfd25a009bcd5149b4ce68bebbf9dcc1a0 [file] [log] [blame]
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "go_asm.h"
#include "textflag.h"
// Caller must confirm availability of vx facility before calling.
TEXT ·Index(SB),NOSPLIT|NOFRAME,$0-56
LMG a_base+0(FP), R1, R2 // R1=&s[0], R2=len(s)
LMG b_base+24(FP), R3, R4 // R3=&sep[0], R4=len(sep)
MOVD $ret+48(FP), R5
BR indexbody<>(SB)
// Caller must confirm availability of vx facility before calling.
TEXT ·IndexString(SB),NOSPLIT|NOFRAME,$0-40
LMG a_base+0(FP), R1, R2 // R1=&s[0], R2=len(s)
LMG b_base+16(FP), R3, R4 // R3=&sep[0], R4=len(sep)
MOVD $ret+32(FP), R5
BR indexbody<>(SB)
// s: string we are searching
// sep: string to search for
// R1=&s[0], R2=len(s)
// R3=&sep[0], R4=len(sep)
// R5=&ret (int)
// Caller must confirm availability of vx facility before calling.
TEXT indexbody<>(SB),NOSPLIT|NOFRAME,$0
CMPBGT R4, R2, notfound
ADD R1, R2
SUB R4, R2 // R2=&s[len(s)-len(sep)] (last valid index)
CMPBEQ R4, $0, notfound
SUB $1, R4 // R4=len(sep)-1 for use as VLL index
VLL R4, (R3), V0 // contains first 16 bytes of sep
MOVD R1, R7
index2plus:
CMPBNE R4, $1, index3plus
MOVD $15(R7), R9
CMPBGE R9, R2, index2to16
VGBM $0xaaaa, V31 // 0xff00ff00ff00ff00...
VONE V16
VREPH $0, V0, V1
CMPBGE R9, R2, index2to16
index2loop:
VL 0(R7), V2 // 16 bytes, even indices
VL 1(R7), V4 // 16 bytes, odd indices
VCEQH V1, V2, V5 // compare even indices
VCEQH V1, V4, V6 // compare odd indices
VSEL V5, V6, V31, V7 // merge even and odd indices
VFEEBS V16, V7, V17 // find leftmost index, set condition to 1 if found
BLT foundV17
MOVD $16(R7), R7 // R7+=16
ADD $15, R7, R9
CMPBLE R9, R2, index2loop // continue if (R7+15) <= R2 (last index to search)
CMPBLE R7, R2, index2to16
BR notfound
index3plus:
CMPBNE R4, $2, index4plus
ADD $15, R7, R9
CMPBGE R9, R2, index2to16
MOVD $1, R0
VGBM $0xaaaa, V31 // 0xff00ff00ff00ff00...
VONE V16
VREPH $0, V0, V1
VREPB $2, V0, V8
index3loop:
VL (R7), V2 // load 16-bytes into V2
VLL R0, 16(R7), V3 // load 2-bytes into V3
VSLDB $1, V2, V3, V4 // V4=(V2:V3)<<1
VSLDB $2, V2, V3, V9 // V9=(V2:V3)<<2
VCEQH V1, V2, V5 // compare 2-byte even indices
VCEQH V1, V4, V6 // compare 2-byte odd indices
VCEQB V8, V9, V10 // compare last bytes
VSEL V5, V6, V31, V7 // merge even and odd indices
VN V7, V10, V7 // AND indices with last byte
VFEEBS V16, V7, V17 // find leftmost index, set condition to 1 if found
BLT foundV17
MOVD $16(R7), R7 // R7+=16
ADD $15, R7, R9
CMPBLE R9, R2, index3loop // continue if (R7+15) <= R2 (last index to search)
CMPBLE R7, R2, index2to16
BR notfound
index4plus:
CMPBNE R4, $3, index5plus
ADD $15, R7, R9
CMPBGE R9, R2, index2to16
MOVD $2, R0
VGBM $0x8888, V29 // 0xff000000ff000000...
VGBM $0x2222, V30 // 0x0000ff000000ff00...
VGBM $0xcccc, V31 // 0xffff0000ffff0000...
VONE V16
VREPF $0, V0, V1
index4loop:
VL (R7), V2 // load 16-bytes into V2
VLL R0, 16(R7), V3 // load 3-bytes into V3
VSLDB $1, V2, V3, V4 // V4=(V2:V3)<<1
VSLDB $2, V2, V3, V9 // V9=(V2:V3)<<1
VSLDB $3, V2, V3, V10 // V10=(V2:V3)<<1
VCEQF V1, V2, V5 // compare index 0, 4, ...
VCEQF V1, V4, V6 // compare index 1, 5, ...
VCEQF V1, V9, V11 // compare index 2, 6, ...
VCEQF V1, V10, V12 // compare index 3, 7, ...
VSEL V5, V6, V29, V13 // merge index 0, 1, 4, 5, ...
VSEL V11, V12, V30, V14 // merge index 2, 3, 6, 7, ...
VSEL V13, V14, V31, V7 // final merge
VFEEBS V16, V7, V17 // find leftmost index, set condition to 1 if found
BLT foundV17
MOVD $16(R7), R7 // R7+=16
ADD $15, R7, R9
CMPBLE R9, R2, index4loop // continue if (R7+15) <= R2 (last index to search)
CMPBLE R7, R2, index2to16
BR notfound
index5plus:
CMPBGT R4, $15, index17plus
index2to16:
CMPBGT R7, R2, notfound
MOVD $1(R7), R8
CMPBGT R8, R2, index2to16tail
index2to16loop:
// unrolled 2x
VLL R4, (R7), V1
VLL R4, 1(R7), V2
VCEQGS V0, V1, V3
BEQ found
MOVD $1(R7), R7
VCEQGS V0, V2, V4
BEQ found
MOVD $1(R7), R7
CMPBLT R7, R2, index2to16loop
CMPBGT R7, R2, notfound
index2to16tail:
VLL R4, (R7), V1
VCEQGS V0, V1, V2
BEQ found
BR notfound
index17plus:
CMPBGT R4, $31, index33plus
SUB $16, R4, R0
VLL R0, 16(R3), V1
VONE V7
index17to32loop:
VL (R7), V2
VLL R0, 16(R7), V3
VCEQG V0, V2, V4
VCEQG V1, V3, V5
VN V4, V5, V6
VCEQGS V6, V7, V8
BEQ found
MOVD $1(R7), R7
CMPBLE R7, R2, index17to32loop
BR notfound
index33plus:
CMPBGT R4, $47, index49plus
SUB $32, R4, R0
VL 16(R3), V1
VLL R0, 32(R3), V2
VONE V11
index33to48loop:
VL (R7), V3
VL 16(R7), V4
VLL R0, 32(R7), V5
VCEQG V0, V3, V6
VCEQG V1, V4, V7
VCEQG V2, V5, V8
VN V6, V7, V9
VN V8, V9, V10
VCEQGS V10, V11, V12
BEQ found
MOVD $1(R7), R7
CMPBLE R7, R2, index33to48loop
BR notfound
index49plus:
CMPBGT R4, $63, index65plus
SUB $48, R4, R0
VL 16(R3), V1
VL 32(R3), V2
VLL R0, 48(R3), V3
VONE V15
index49to64loop:
VL (R7), V4
VL 16(R7), V5
VL 32(R7), V6
VLL R0, 48(R7), V7
VCEQG V0, V4, V8
VCEQG V1, V5, V9
VCEQG V2, V6, V10
VCEQG V3, V7, V11
VN V8, V9, V12
VN V10, V11, V13
VN V12, V13, V14
VCEQGS V14, V15, V16
BEQ found
MOVD $1(R7), R7
CMPBLE R7, R2, index49to64loop
notfound:
MOVD $-1, (R5)
RET
index65plus:
// not implemented
MOVD $0, (R0)
RET
foundV17: // index is in doubleword V17[0]
VLGVG $0, V17, R8
ADD R8, R7
found:
SUB R1, R7
MOVD R7, (R5)
RET