blob: 1016db738dee1be2032fad7c59527b5b277a1f03 [file] [log] [blame]
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "go_asm.h"
#include "textflag.h"
TEXT ·Index<ABIInternal>(SB),NOSPLIT,$0-56
MOVV R7, R6 // R6 = separator pointer
MOVV R8, R7 // R7 = separator length
JMP indexbody<>(SB)
TEXT ·IndexString<ABIInternal>(SB),NOSPLIT,$0-40
JMP indexbody<>(SB)
// input:
// R4 = string
// R5 = length
// R6 = separator pointer
// R7 = separator length (2 <= len <= 64)
TEXT indexbody<>(SB),NOSPLIT,$0
// main idea is to load 'sep' into separate register(s)
// to avoid repeatedly re-load it again and again
// for sebsequent substring comparisons
SUBV R7, R5, R8
ADDV R4, R8 // R8 contains the start of last substring for comparison
ADDV $1, R4, R9 // store base for later
MOVV $8, R5
BGE R7, R5, len_gt_or_eq_8
len_2_7:
AND $0x4, R7, R5
BNE R5, len_4_7
len_2_3:
AND $0x1, R7, R5
BNE R5, len_3
len_2:
MOVHU (R6), R10
loop_2:
BLT R8, R4, not_found
MOVHU (R4), R11
ADDV $1, R4
BNE R10, R11, loop_2
JMP found
len_3:
MOVHU (R6), R10
MOVBU 2(R6), R11
loop_3:
BLT R8, R4, not_found
MOVHU (R4), R12
ADDV $1, R4
BNE R10, R12, loop_3
MOVBU 1(R4), R13
BNE R11, R13, loop_3
JMP found
len_4_7:
AND $0x2, R7, R5
BNE R5, len_6_7
AND $0x1, R7, R5
BNE R5, len_5
len_4:
MOVWU (R6), R10
loop_4:
BLT R8, R4, not_found
MOVWU (R4), R11
ADDV $1, R4
BNE R10, R11, loop_4
JMP found
len_5:
MOVWU (R6), R10
MOVBU 4(R6), R11
loop_5:
BLT R8, R4, not_found
MOVWU (R4), R12
ADDV $1, R4
BNE R10, R12, loop_5
MOVBU 3(R4), R13
BNE R11, R13, loop_5
JMP found
len_6_7:
AND $0x1, R7, R5
BNE R5, len_7
len_6:
MOVWU (R6), R10
MOVHU 4(R6), R11
loop_6:
BLT R8, R4, not_found
MOVWU (R4), R12
ADDV $1, R4
BNE R10, R12, loop_6
MOVHU 3(R4), R13
BNE R11, R13, loop_6
JMP found
len_7:
MOVWU (R6), R10
MOVWU 3(R6), R11
loop_7:
BLT R8, R4, not_found
MOVWU (R4), R12
ADDV $1, R4
BNE R10, R12, loop_7
MOVWU 2(R4), R13
BNE R11, R13, loop_7
JMP found
len_gt_or_eq_8:
BEQ R5, R7, len_8
MOVV $17, R5
BGE R7, R5, len_gt_or_eq_17
JMP len_9_16
len_8:
MOVV (R6), R10
loop_8:
BLT R8, R4, not_found
MOVV (R4), R11
ADDV $1, R4
BNE R10, R11, loop_8
JMP found
len_9_16:
MOVV (R6), R10
SUBV $8, R7
MOVV (R6)(R7), R11
SUBV $1, R7
loop_9_16:
BLT R8, R4, not_found
MOVV (R4), R12
ADDV $1, R4
BNE R10, R12, loop_9_16
MOVV (R4)(R7), R13
BNE R11, R13, loop_9_16
JMP found
len_gt_or_eq_17:
MOVV $25, R5
BGE R7, R5, len_gt_or_eq_25
len_17_24:
MOVV 0(R6), R10
MOVV 8(R6), R11
SUBV $8, R7
MOVV (R6)(R7), R12
SUBV $1, R7
loop_17_24:
BLT R8, R4, not_found
MOVV (R4), R13
ADDV $1, R4
BNE R10, R13, loop_17_24
MOVV 7(R4), R14
BNE R11, R14, loop_17_24
MOVV (R4)(R7), R15
BNE R12, R15, loop_17_24
JMP found
len_gt_or_eq_25:
MOVV $33, R5
BGE R7, R5, len_gt_or_eq_33
MOVBU internal∕cpu·Loong64+const_offsetLOONG64HasLSX(SB), R10
BNE R10, lsx_len_25_32
len_25_32:
MOVV 0(R6), R10
MOVV 8(R6), R11
MOVV 16(R6), R12
SUBV $8, R7
MOVV (R6)(R7), R13
SUBV $1, R7
loop_25_32:
BLT R8, R4, not_found
MOVV (R4), R14
ADDV $1, R4
BNE R10, R14, loop_25_32
MOVV 7(R4), R15
BNE R11, R15, loop_25_32
MOVV 15(R4), R16
BNE R12, R16, loop_25_32
MOVV (R4)(R7), R17
BNE R13, R17, loop_25_32
JMP found
// On loong64, LSX is included if LASX is supported.
lasx_len_25_32:
lsx_len_25_32:
VMOVQ 0(R6), V0
SUBV $16, R7
VMOVQ (R6)(R7), V1
SUBV $1, R7
lsx_loop_25_32:
BLT R8, R4, not_found
VMOVQ (R4), V2
ADDV $1, R4
VSEQV V0, V2, V2
VSETANYEQV V2, FCC0
BFPT FCC0, lsx_loop_25_32
VMOVQ (R4)(R7), V3
VSEQV V1, V3, V3
VSETANYEQV V3, FCC1
BFPT FCC1, lsx_loop_25_32
JMP found
len_gt_or_eq_33:
MOVBU internal∕cpu·Loong64+const_offsetLOONG64HasLASX(SB), R10
MOVV $49, R5
BGE R7, R5, len_gt_or_eq_49
len_33_48:
BNE R10, lasx_len_33_48
JMP lsx_len_33_48
len_gt_or_eq_49:
len_49_64:
BNE R10, lasx_len_49_64
JMP lsx_len_49_64
lsx_len_33_48:
VMOVQ 0(R6), V0
VMOVQ 16(R6), V1
SUBV $16, R7
VMOVQ (R6)(R7), V2
SUBV $1, R7
lsx_loop_33_48:
BLT R8, R4, not_found
VMOVQ 0(R4), V3
ADDV $1, R4
VSEQV V0, V3, V3
VSETANYEQV V3, FCC0
BFPT FCC0, lsx_loop_33_48
VMOVQ 15(R4), V4
VSEQV V1, V4, V4
VSETANYEQV V4, FCC1
BFPT FCC1, lsx_loop_33_48
VMOVQ (R4)(R7), V5
VSEQV V2, V5, V5
VSETANYEQV V5, FCC2
BFPT FCC2, lsx_loop_33_48
JMP found
lsx_len_49_64:
VMOVQ 0(R6), V0
VMOVQ 16(R6), V1
VMOVQ 32(R6), V2
SUBV $16, R7
VMOVQ (R6)(R7), V3
SUBV $1, R7
lsx_loop_49_64:
BLT R8, R4, not_found
VMOVQ 0(R4), V4
ADDV $1, R4
VSEQV V0, V4, V4
VSETANYEQV V4, FCC0
BFPT FCC0, lsx_loop_49_64
VMOVQ 15(R4), V5
VSEQV V1, V5, V5
VSETANYEQV V5, FCC1
BFPT FCC1, lsx_loop_49_64
VMOVQ 31(R4), V6
VSEQV V2, V6, V6
VSETANYEQV V6, FCC2
BFPT FCC2, lsx_loop_49_64
VMOVQ (R4)(R7), V7
VSEQV V3, V7, V7
VSETANYEQV V7, FCC3
BFPT FCC3, lsx_loop_49_64
JMP found
lasx_len_33_48:
lasx_len_49_64:
lasx_len_33_64:
XVMOVQ (R6), X0
SUBV $32, R7
XVMOVQ (R6)(R7), X1
SUBV $1, R7
lasx_loop_33_64:
BLT R8, R4, not_found
XVMOVQ (R4), X2
ADDV $1, R4
XVSEQV X0, X2, X3
XVSETANYEQV X3, FCC0
BFPT FCC0, lasx_loop_33_64
XVMOVQ (R4)(R7), X4
XVSEQV X1, X4, X5
XVSETANYEQV X5, FCC1
BFPT FCC1, lasx_loop_33_64
JMP found
found:
SUBV R9, R4
RET
not_found:
MOVV $-1, R4
RET