// Copyright 2022 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "go_asm.h"
#include "textflag.h"
// Register map
//
// R4: ptr
// R5: n
// R6: ptrend
// R7: tmp
// Algorithm:
//
// 1. if lasx is enabled:
//        THRESHOLD = 256, ALIGNMENTS = 32, LOOPBLOCKS = 256,
//    else if lsx is enabled:
//        THRESHOLD = 128, ALIGNMENTS = 16, LOOPBLOCKS = 128,
//    else
//        THRESHOLD = 64, ALIGNMENTS = 8, LOOPBLOCKS = 64,
//
// 2. when count <= THRESHOLD bytes, the memory alignment check is omitted.
//    The handling is divided into distinct cases based on the size of count:
//    a. clr_0, clr_1, clr_2, clr_3, clr_4, clr_5through7, clr_8,
//       clr_9through16, clr_17through32, clr_33through64,
//    b. lsx_clr_17through32, lsx_clr_33through64, lsx_clr_65through128,
//    c. lasx_clr_17through32, lasx_clr_33through64,
//       lasx_clr_65through128, lasx_clr_129through256
//
// 3. when count > THRESHOLD bytes, a memory alignment check is performed. The unaligned
//    head bytes (that is, ALIGNMENTS - (ptr & (ALIGNMENTS-1))) are processed first, and then
//    a LOOPBLOCKS-byte loop zeroes out the memory.
//    When the number of remaining uncleared bytes is n < LOOPBLOCKS, tail processing
//    is performed, invoking the corresponding case based on the size of n.
//
// example:
// THRESHOLD = 64, ALIGNMENTS = 8, LOOPBLOCKS = 64
//
// ptr           newptr                           ptrend
//  |               |<----count after correction---->|
//  |<-------------count before correction---------->|
//  |<--8-(ptr&7)-->|               |<---64 bytes--->|
//  +------------------------------------------------+
//  |     Head      |     Body      |      Tail      |
//  +---------------+---------------+----------------+
// newptr = ptr - (ptr & 7) + 8
// count = count - 8 + (ptr & 7)
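//
// Illustrative only (not part of the implementation): assuming ptr is not
// already 8-byte aligned, the correction above is equivalent to the
// Go-style pseudocode
//
//     head := 8 - (ptr & 7)   // bytes covered by the initial unaligned store
//     newptr := ptr + head    // first 8-byte aligned address
//     count := count - head   // bytes left for the body and tail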
// func memclrNoHeapPointers(ptr unsafe.Pointer, n uintptr)
TEXT runtime·memclrNoHeapPointers<ABIInternal>(SB),NOSPLIT,$0-16
BEQ R5, clr_0
ADDV R4, R5, R6
tail:
// count <= THRESHOLD bytes: clear directly, no alignment check
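// The SGTU/BNE pairs below dispatch on size: SGTU $k, R5, R7 sets R7 = 1
// when n < k, so each BNE takes the branch for the matching size class.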
SGTU $2, R5, R7
BNE R7, clr_1
SGTU $3, R5, R7
BNE R7, clr_2
SGTU $4, R5, R7
BNE R7, clr_3
SGTU $5, R5, R7
BNE R7, clr_4
SGTU $8, R5, R7
BNE R7, clr_5through7
SGTU $9, R5, R7
BNE R7, clr_8
SGTU $17, R5, R7
BNE R7, clr_9through16
MOVBU internal∕cpu·Loong64+const_offsetLOONG64HasLASX(SB), R7
BNE R7, lasx_tail
MOVBU internal∕cpu·Loong64+const_offsetLOONG64HasLSX(SB), R7
BNE R7, lsx_tail
SGTU $33, R5, R7
BNE R7, clr_17through32
SGTU $65, R5, R7
BNE R7, clr_33through64
JMP clr_large
lasx_tail:
// X0 = 0
XVXORV X0, X0, X0
SGTU $33, R5, R7
BNE R7, lasx_clr_17through32
SGTU $65, R5, R7
BNE R7, lasx_clr_33through64
SGTU $129, R5, R7
BNE R7, lasx_clr_65through128
SGTU $257, R5, R7
BNE R7, lasx_clr_129through256
JMP lasx_clr_large
lsx_tail:
// V0 = 0
VXORV V0, V0, V0
SGTU $33, R5, R7
BNE R7, lsx_clr_17through32
SGTU $65, R5, R7
BNE R7, lsx_clr_33through64
SGTU $129, R5, R7
BNE R7, lsx_clr_65through128
JMP lsx_clr_large
// use 256-bit SIMD (LASX) instructions to implement memclr
// n > 256 bytes, check 32-byte alignment
lasx_clr_large:
AND $31, R4, R7
BEQ R7, lasx_clr_256loop
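// head: clear 32 bytes starting at the unaligned ptr (overlapping the body,
// which is harmless), then round ptr up to a 32-byte boundary and shrink n.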
XVMOVQ X0, (R4)
SUBV R7, R4
ADDV R7, R5
SUBV $32, R5 // newn = n - (32 - (ptr & 31))
ADDV $32, R4 // newptr = ptr + (32 - (ptr & 31))
SGTU $257, R5, R7
BNE R7, lasx_clr_129through256
lasx_clr_256loop:
SUBV $256, R5
SGTU $256, R5, R7
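// R5 now holds the bytes left after this iteration; R7 = 1 when fewer than
// 256 remain, which ends the loop after the stores below.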
XVMOVQ X0, 0(R4)
XVMOVQ X0, 32(R4)
XVMOVQ X0, 64(R4)
XVMOVQ X0, 96(R4)
XVMOVQ X0, 128(R4)
XVMOVQ X0, 160(R4)
XVMOVQ X0, 192(R4)
XVMOVQ X0, 224(R4)
ADDV $256, R4
BEQ R7, lasx_clr_256loop
// remaining_length is 0
BEQ R5, clr_0
// 128 < remaining_length < 256
SGTU $129, R5, R7
BEQ R7, lasx_clr_129through256
// 64 < remaining_length <= 128
SGTU $65, R5, R7
BEQ R7, lasx_clr_65through128
// 32 < remaining_length <= 64
SGTU $33, R5, R7
BEQ R7, lasx_clr_33through64
// 16 < remaining_length <= 32
SGTU $17, R5, R7
BEQ R7, lasx_clr_17through32
// 0 < remaining_length <= 16
JMP tail
// use 128-bit SIMD (LSX) instructions to implement memclr
// n > 128 bytes, check 16-byte alignment
lsx_clr_large:
// check 16-byte alignment
AND $15, R4, R7
BEQ R7, lsx_clr_128loop
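// head: same approach as the LASX path, but with a 16-byte store and
// 16-byte alignment.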
VMOVQ V0, (R4)
SUBV R7, R4
ADDV R7, R5
SUBV $16, R5 // newn = n - (16 - (ptr & 15))
ADDV $16, R4 // newptr = ptr + (16 - (ptr & 15))
SGTU $129, R5, R7
BNE R7, lsx_clr_65through128
lsx_clr_128loop:
SUBV $128, R5
SGTU $128, R5, R7
VMOVQ V0, 0(R4)
VMOVQ V0, 16(R4)
VMOVQ V0, 32(R4)
VMOVQ V0, 48(R4)
VMOVQ V0, 64(R4)
VMOVQ V0, 80(R4)
VMOVQ V0, 96(R4)
VMOVQ V0, 112(R4)
ADDV $128, R4
BEQ R7, lsx_clr_128loop
// remaining_length is 0
BEQ R5, clr_0
// 64 < remaining_length <= 128
SGTU $65, R5, R7
BEQ R7, lsx_clr_65through128
// 32 < remaining_length <= 64
SGTU $33, R5, R7
BEQ R7, lsx_clr_33through64
// 16 < remaining_length <= 32
SGTU $17, R5, R7
BEQ R7, lsx_clr_17through32
// 0 < remaining_length <= 16
JMP tail
// use general-purpose instructions to implement memclr
// n > 64 bytes, check 8-byte alignment
clr_large:
AND $7, R4, R7
BEQ R7, clr_64loop
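// head: clear 8 bytes at the unaligned ptr, then round ptr up to an
// 8-byte boundary and shrink n accordingly.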
MOVV R0, (R4)
SUBV R7, R4
ADDV R7, R5
ADDV $8, R4 // newptr = ptr + (8 - (ptr & 7))
SUBV $8, R5 // newn = n - (8 - (ptr & 7))
MOVV $64, R7
BLT R5, R7, clr_33through64
clr_64loop:
SUBV $64, R5
SGTU $64, R5, R7
MOVV R0, (R4)
MOVV R0, 8(R4)
MOVV R0, 16(R4)
MOVV R0, 24(R4)
MOVV R0, 32(R4)
MOVV R0, 40(R4)
MOVV R0, 48(R4)
MOVV R0, 56(R4)
ADDV $64, R4
BEQ R7, clr_64loop
// remaining_length is 0
BEQ R5, clr_0
// 32 < remaining_length < 64
SGTU $33, R5, R7
BEQ R7, clr_33through64
// 16 < remaining_length <= 32
SGTU $17, R5, R7
BEQ R7, clr_17through32
// 0 < remaining_length <= 16
JMP tail
clr_0:
RET
clr_1:
MOVB R0, (R4)
RET
clr_2:
MOVH R0, (R4)
RET
clr_3:
MOVH R0, (R4)
MOVB R0, 2(R4)
RET
clr_4:
MOVW R0, (R4)
RET
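// The cases below clear from both ends of the buffer (R4 points at the start,
// R6 at the end); the stores may overlap in the middle, which is harmless.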
clr_5through7:
MOVW R0, (R4)
MOVW R0, -4(R6)
RET
clr_8:
MOVV R0, (R4)
RET
clr_9through16:
MOVV R0, (R4)
MOVV R0, -8(R6)
RET
clr_17through32:
MOVV R0, (R4)
MOVV R0, 8(R4)
MOVV R0, -16(R6)
MOVV R0, -8(R6)
RET
clr_33through64:
MOVV R0, (R4)
MOVV R0, 8(R4)
MOVV R0, 16(R4)
MOVV R0, 24(R4)
MOVV R0, -32(R6)
MOVV R0, -24(R6)
MOVV R0, -16(R6)
MOVV R0, -8(R6)
RET
lasx_clr_17through32:
VMOVQ V0, 0(R4)
VMOVQ V0, -16(R6)
RET
lasx_clr_33through64:
XVMOVQ X0, 0(R4)
XVMOVQ X0, -32(R6)
RET
lasx_clr_65through128:
XVMOVQ X0, 0(R4)
XVMOVQ X0, 32(R4)
XVMOVQ X0, -64(R6)
XVMOVQ X0, -32(R6)
RET
lasx_clr_129through256:
XVMOVQ X0, 0(R4)
XVMOVQ X0, 32(R4)
XVMOVQ X0, 64(R4)
XVMOVQ X0, 96(R4)
XVMOVQ X0, -128(R6)
XVMOVQ X0, -96(R6)
XVMOVQ X0, -64(R6)
XVMOVQ X0, -32(R6)
RET
lsx_clr_17through32:
VMOVQ V0, 0(R4)
VMOVQ V0, -16(R6)
RET
lsx_clr_33through64:
VMOVQ V0, 0(R4)
VMOVQ V0, 16(R4)
VMOVQ V0, -32(R6)
VMOVQ V0, -16(R6)
RET
lsx_clr_65through128:
VMOVQ V0, 0(R4)
VMOVQ V0, 16(R4)
VMOVQ V0, 32(R4)
VMOVQ V0, 48(R4)
VMOVQ V0, -64(R6)
VMOVQ V0, -48(R6)
VMOVQ V0, -32(R6)
VMOVQ V0, -16(R6)
RET