blob: 74c4c2472ae46dea5d88251d9a36d6cf1893a2af [file] [log] [blame]
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "go_asm.h"
#include "textflag.h"
TEXT ·Count<ABIInternal>(SB),NOSPLIT,$0-40
// R4 = b_base
// R5 = b_len
// R6 = b_cap (unused)
// R7 = byte to count
AND $0xff, R7, R6
JMP countbody<>(SB)
TEXT ·CountString<ABIInternal>(SB),NOSPLIT,$0-32
// R4 = s_base
// R5 = s_len
// R6 = byte to count
AND $0xff, R6
JMP countbody<>(SB)
// input:
// R4 = s_base
// R5 = s_len
// R6 = byte to count
TEXT countbody<>(SB),NOSPLIT,$0
MOVV R0, R7 // count
// short path to handle 0-byte case
BEQ R5, done
// jump directly to tail length < 8
MOVV $8, R8
BLT R5, R8, tail
// Implemented using 256-bit SMID instructions
lasxCountBody:
MOVBU internal∕cpu·Loong64+const_offsetLOONG64HasLASX(SB), R8
BEQ R8, lsxCountBody
XVMOVQ R6, X0.B32
// jump directly to lasx32 if length < 128
MOVV $128, R8
BLT R5, R8, lasx32
lasx128:
lasx128Loop:
XVMOVQ 0(R4), X1
XVMOVQ 32(R4), X2
XVMOVQ 64(R4), X3
XVMOVQ 96(R4), X4
XVSEQB X0, X1, X5
XVSEQB X0, X2, X6
XVSEQB X0, X3, X7
XVSEQB X0, X4, X8
XVANDB $1, X5, X5
XVANDB $1, X6, X6
XVANDB $1, X7, X7
XVANDB $1, X8, X8
XVPCNTV X5, X1
XVPCNTV X6, X2
XVPCNTV X7, X3
XVPCNTV X8, X4
XVADDV X2, X1
XVADDV X4, X3
XVADDV X3, X1
XVMOVQ X1.V[0], R9
XVMOVQ X1.V[1], R10
XVMOVQ X1.V[2], R11
XVMOVQ X1.V[3], R12
ADDV R9, R10
ADDV R11, R12
ADDV R10, R7
ADDV R12, R7
ADDV $-128, R5
ADDV $128, R4
BGE R5, R8, lasx128Loop
lasx32:
// jump directly to lasx8 if length < 32
MOVV $32, R8
BLT R5, R8, lasx8
lasx32Loop:
XVMOVQ 0(R4), X1
XVSEQB X0, X1, X2
XVANDB $1, X2, X2
XVPCNTV X2, X1
XVMOVQ X1.V[0], R9
XVMOVQ X1.V[1], R10
XVMOVQ X1.V[2], R11
XVMOVQ X1.V[3], R12
ADDV R9, R10
ADDV R11, R12
ADDV R10, R7
ADDV R12, R7
ADDV $-32, R5
ADDV $32, R4
BGE R5, R8, lasx32Loop
lasx8:
// jump directly to tail if length < 8
MOVV $8, R8
BLT R5, R8, tail
lasx8Loop:
MOVV 0(R4), R9
VMOVQ R9, V1.V[0]
VSEQB V0, V1, V2
VANDB $1, V2, V2
VPCNTV V2, V1
VMOVQ V1.V[0], R9
ADDV R9, R7
ADDV $-8, R5
ADDV $8, R4
BGE R5, R8, lasx8Loop
JMP tail
// Implemented using 128-bit SMID instructions
lsxCountBody:
MOVBU internal∕cpu·Loong64+const_offsetLOONG64HasLSX(SB), R8
BEQ R8, genericCountBody
VMOVQ R6, V0.B16
// jump directly to lsx16 if length < 64
MOVV $64, R8
BLT R5, R8, lsx16
lsx64:
lsx64Loop:
VMOVQ 0(R4), V1
VMOVQ 16(R4), V2
VMOVQ 32(R4), V3
VMOVQ 48(R4), V4
VSEQB V0, V1, V5
VSEQB V0, V2, V6
VSEQB V0, V3, V7
VSEQB V0, V4, V8
VANDB $1, V5, V5
VANDB $1, V6, V6
VANDB $1, V7, V7
VANDB $1, V8, V8
VPCNTV V5, V1
VPCNTV V6, V2
VPCNTV V7, V3
VPCNTV V8, V4
VADDV V2, V1
VADDV V4, V3
VADDV V3, V1
VMOVQ V1.V[0], R9
VMOVQ V1.V[1], R10
ADDV R9, R7
ADDV R10, R7
ADDV $-64, R5
ADDV $64, R4
BGE R5, R8, lsx64Loop
lsx16:
// jump directly to lsx8 if length < 16
MOVV $16, R8
BLT R5, R8, lsx8
lsx16Loop:
VMOVQ 0(R4), V1
VSEQB V0, V1, V2
VANDB $1, V2, V2
VPCNTV V2, V1
VMOVQ V1.V[0], R9
VMOVQ V1.V[1], R10
ADDV R9, R7
ADDV R10, R7
ADDV $-16, R5
ADDV $16, R4
BGE R5, R8, lsx16Loop
lsx8:
// jump directly to tail if length < 8
MOVV $8, R8
BLT R5, R8, tail
lsx8Loop:
MOVV 0(R4), R9
VMOVQ R9, V1.V[0]
VSEQB V0, V1, V2
VANDB $1, V2, V2
VPCNTV V2, V1
VMOVQ V1.V[0], R9
ADDV R9, R7
ADDV $-8, R5
ADDV $8, R4
BGE R5, R8, lsx8Loop
JMP tail
// Implemented using general instructions
genericCountBody:
MOVV $4, R8
MOVV $1, R9
genericLoop:
BLT R5, R8, tail
ADDV $-4, R5
MOVWU (R4)(R5), R10
BSTRPICKW $7, R10, $0, R11
BSTRPICKW $15, R10, $8, R12
XOR R6, R11
XOR R6, R12
MASKNEZ R11, R9, R13
MASKNEZ R12, R9, R14
ADDV R13, R7
ADDV R14, R7
BSTRPICKW $23, R10, $16, R11
BSTRPICKW $31, R10, $24, R12
XOR R6, R11
XOR R6, R12
MASKNEZ R11, R9, R13
MASKNEZ R12, R9, R14
ADDV R13, R7
ADDV R14, R7
JMP genericLoop
// Work with tail shorter than 8 bytes
tail:
BEQ R5, done
ADDV $-1, R5
MOVBU (R4)(R5), R8
BNE R6, R8, tail
ADDV $1, R7
JMP tail
done:
MOVV R7, R4
RET