// Copyright 2022 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "textflag.h"
// See memmove Go doc for important implementation constraints.
// Register map
//
// to R4
// from R5
// n(aka count) R6
// to-end R7
// from-end R8
// data R11-R18
// tmp R9
// Algorithm:
//
// Memory alignment check is only performed for copy size greater
// than 64 bytes to minimize overhead.
//
// when copy size <= 64 bytes, jump to label tail, according to the
// copy size to select the appropriate case and copy directly.
// Based on the common memory access instructions of loong64, the
// currently implemented cases are:
// move_0, move_1, move_2, move_3, move_4, move_5through7, move_8,
// move_9through16, move_17through32, move_33through64
//
// when copy size > 64 bytes, use the destination-aligned copying,
// adopt the following strategy to copy in 3 parts:
// 1. Head: do the memory alignment
// 2. Body: a 64-byte loop structure
// 3. Tail: processing of the remaining part (<= 64 bytes)
//
// forward:
//
// Dst NewDst Dstend
// | |<----count after correction---->|
// |<-------------count before correction---------->|
// |<--8-(Dst&7)-->| |<---64 bytes--->|
// +------------------------------------------------+
// | Head | Body | Tail |
// +---------------+---------------+----------------+
// NewDst = Dst - (Dst & 7) + 8
// count = count - 8 + (Dst & 7)
// Src = Src - (Dst & 7) + 8
//
// backward:
//
// Dst NewDstend Dstend
// |<-----count after correction------>| |
// |<------------count before correction--------------->|
// |<---64 bytes--->| |<---Dstend&7--->|
// +----------------------------------------------------+
// | Tail | Body | Head |
// +----------------+------------------+----------------+
// NewDstend = Dstend - (Dstend & 7)
// count = count - (Dstend & 7)
// Srcend = Srcend - (Dstend & 7)
// func memmove(to, from unsafe.Pointer, n uintptr)
TEXT runtime·memmove<ABIInternal>(SB), NOSPLIT|NOFRAME, $0-24
// Entry (ABIInternal register args): R4 = to, R5 = from, R6 = n.
// dst == src: nothing to move
BEQ R4, R5, move_0
// n == 0: nothing to move
BEQ R6, move_0
ADDV R4, R6, R7 // to-end pointer
ADDV R5, R6, R8 // from-end pointer
tail:
// copy size <= 64 bytes: select the size-specific case below and copy
// directly, without any alignment check.
// Note: SGTU $k, R6, R9 sets R9 = 1 iff k > n (unsigned), i.e. n < k.
// < 2 bytes
SGTU $2, R6, R9
BNE R9, move_1
// < 3 bytes
SGTU $3, R6, R9
BNE R9, move_2
// < 4 bytes
SGTU $4, R6, R9
BNE R9, move_3
// < 5 bytes
SGTU $5, R6, R9
BNE R9, move_4
// >= 5 bytes and < 8 bytes
SGTU $8, R6, R9
BNE R9, move_5through7
// < 9 bytes
SGTU $9, R6, R9
BNE R9, move_8
// >= 9 bytes and < 17 bytes
SGTU $17, R6, R9
BNE R9, move_9through16
// >= 17 bytes and < 33 bytes
SGTU $33, R6, R9
BNE R9, move_17through32
// >= 33 bytes and < 65 bytes
SGTU $65, R6, R9
BNE R9, move_33through64
// if (dst > src) && (dst < src + count), regarded as memory
// overlap, jump to backward
// else, jump to forward
BGEU R5, R4, forward
ADDV R5, R6, R10 // R10 = src + count
BLTU R4, R10, backward
forward:
AND $7, R4, R9 // dst & 7
BEQ R9, body // dst already 8-byte aligned: skip the head
head:
// Copy 8-(dst&7) single bytes so that dst becomes 8-byte aligned.
MOVV $8, R10
SUBV R9, R10 // head = 8 - (dst & 7)
MOVB (R5), R11
SUBV $1, R10
ADDV $1, R5
MOVB R11, (R4)
ADDV $1, R4
BNE R10, -5(PC) // loop back to the MOVB load while head != 0
ADDV R9, R6
ADDV $-8, R6 // newcount = count + (dst & 7) - 8
// if newcount < 65 bytes, use move_33through64 to copy is enough
SGTU $65, R6, R9
BNE R9, move_33through64
body:
// Main forward loop: move 64 bytes per iteration, dst 8-byte aligned.
// All eight loads are issued before any store.
MOVV (R5), R11
MOVV 8(R5), R12
MOVV 16(R5), R13
MOVV 24(R5), R14
MOVV 32(R5), R15
MOVV 40(R5), R16
MOVV 48(R5), R17
MOVV 56(R5), R18
MOVV R11, (R4)
MOVV R12, 8(R4)
MOVV R13, 16(R4)
MOVV R14, 24(R4)
MOVV R15, 32(R4)
MOVV R16, 40(R4)
MOVV R17, 48(R4)
MOVV R18, 56(R4)
ADDV $-64, R6 // count -= 64
ADDV $64, R4 // dst += 64
ADDV $64, R5 // src += 64
SGTU $64, R6, R9
// if the remaining part >= 64 bytes, jmp to body
BEQ R9, body
// if the remaining part == 0 bytes, use move_0 to return
BEQ R6, move_0
// if the remaining part in (0, 63] bytes, jmp to tail
JMP tail
// The backward copy algorithm is the same as the forward copy,
// except for the direction.
backward:
AND $7, R7, R9 // dstend & 7
BEQ R9, b_body // dstend already 8-byte aligned: skip the head
b_head:
// Align dstend down to 8 bytes using two overlapping 8-byte stores:
// the first covers the (dstend&7) unaligned tail bytes, the second
// writes the word ending at the aligned boundary. The head therefore
// consumes (dstend&7)+8 bytes in total (SUBV R9 plus SUBV $8 below).
MOVV -8(R8), R11 // last 8 source bytes (covers the unaligned tail)
SUBV R9, R6 // newcount = count - (dstend & 7)
SUBV R9, R8 // newsrcend = srcend - (dstend & 7)
MOVV -8(R8), R12 // 8 source bytes ending at the aligned boundary
MOVV R11, -8(R7)
SUBV R9, R7 // newdstend = dstend - (dstend & 7)
MOVV R12, -8(R7)
SUBV $8, R6 // account for the extra aligned word stored above
SUBV $8, R7
SUBV $8, R8
SGTU $65, R6, R9
BNE R9, move_33through64 // remaining < 65 bytes: finish directly
b_body:
// Main backward loop: move 64 bytes per iteration, dstend 8-byte
// aligned. All eight loads are issued before any store.
MOVV -8(R8), R11
MOVV -16(R8), R12
MOVV -24(R8), R13
MOVV -32(R8), R14
MOVV -40(R8), R15
MOVV -48(R8), R16
MOVV -56(R8), R17
MOVV -64(R8), R18
MOVV R11, -8(R7)
MOVV R12, -16(R7)
MOVV R13, -24(R7)
MOVV R14, -32(R7)
MOVV R15, -40(R7)
MOVV R16, -48(R7)
MOVV R17, -56(R7)
MOVV R18, -64(R7)
ADDV $-64, R6 // count -= 64
ADDV $-64, R7 // dstend -= 64
ADDV $-64, R8 // srcend -= 64
SGTU $64, R6, R9
BEQ R9, b_body // remaining >= 64 bytes: continue looping
BEQ R6, move_0 // remaining == 0 bytes: done
JMP tail // remaining in (0, 63] bytes: finish via the tail cases
// Size-specific cases. Cases covering a size range copy from both
// ends with possibly-overlapping accesses; every load is issued
// before any store, so overlapping src/dst ranges are handled
// correctly in either direction.
move_0:
RET
move_1:
MOVB (R5), R11
MOVB R11, (R4)
RET
move_2:
MOVH (R5), R11
MOVH R11, (R4)
RET
move_3:
// 2 bytes from the front + 1 byte from the end (overlap at byte 1)
MOVH (R5), R11
MOVB -1(R8), R12
MOVH R11, (R4)
MOVB R12, -1(R7)
RET
move_4:
MOVW (R5), R11
MOVW R11, (R4)
RET
move_5through7:
// first 4 bytes + last 4 bytes (may overlap in the middle)
MOVW (R5), R11
MOVW -4(R8), R12
MOVW R11, (R4)
MOVW R12, -4(R7)
RET
move_8:
MOVV (R5), R11
MOVV R11, (R4)
RET
move_9through16:
// first 8 bytes + last 8 bytes (may overlap in the middle)
MOVV (R5), R11
MOVV -8(R8), R12
MOVV R11, (R4)
MOVV R12, -8(R7)
RET
move_17through32:
// first 16 bytes + last 16 bytes (may overlap in the middle)
MOVV (R5), R11
MOVV 8(R5), R12
MOVV -16(R8), R13
MOVV -8(R8), R14
MOVV R11, (R4)
MOVV R12, 8(R4)
MOVV R13, -16(R7)
MOVV R14, -8(R7)
RET
move_33through64:
// first 32 bytes + last 32 bytes (may overlap in the middle)
MOVV (R5), R11
MOVV 8(R5), R12
MOVV 16(R5), R13
MOVV 24(R5), R14
MOVV -32(R8), R15
MOVV -24(R8), R16
MOVV -16(R8), R17
MOVV -8(R8), R18
MOVV R11, (R4)
MOVV R12, 8(R4)
MOVV R13, 16(R4)
MOVV R14, 24(R4)
MOVV R15, -32(R7)
MOVV R16, -24(R7)
MOVV R17, -16(R7)
MOVV R18, -8(R7)
RET