blob: e099a64100273883c8cd8728ba7778d9f8d646ec [file] [log] [blame]
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "textflag.h"
// See memmove Go doc for important implementation constraints.
// void runtime·memmove(void*, void*, uintptr)
TEXT runtime·memmove<ABIInternal>(SB),NOSPLIT,$-0-24
// X10 = to
// X11 = from
// X12 = n
BEQ X10, X11, done
BEQZ X12, done
// If the destination is ahead of the source, start at the end of the
// buffer and go backward.
BGTU X10, X11, backward
// If less than 8 bytes, do single byte copies.
MOV $8, X9
BLT X12, X9, f_loop4_check
// Check alignment - if alignment differs we have to do one byte at a time.
AND $7, X10, X5
AND $7, X11, X6
BNE X5, X6, f_loop8_unaligned_check
BEQZ X5, f_loop_check
// Move one byte at a time until we reach 8 byte alignment.
SUB X5, X9, X5
SUB X5, X12, X12
f_align:
SUB $1, X5
MOVB 0(X11), X14
MOVB X14, 0(X10)
ADD $1, X10
ADD $1, X11
BNEZ X5, f_align
f_loop_check:
MOV $16, X9
BLT X12, X9, f_loop8_check
MOV $32, X9
BLT X12, X9, f_loop16_check
MOV $64, X9
BLT X12, X9, f_loop32_check
f_loop64:
MOV 0(X11), X14
MOV 8(X11), X15
MOV 16(X11), X16
MOV 24(X11), X17
MOV 32(X11), X18
MOV 40(X11), X19
MOV 48(X11), X20
MOV 56(X11), X21
MOV X14, 0(X10)
MOV X15, 8(X10)
MOV X16, 16(X10)
MOV X17, 24(X10)
MOV X18, 32(X10)
MOV X19, 40(X10)
MOV X20, 48(X10)
MOV X21, 56(X10)
ADD $64, X10
ADD $64, X11
SUB $64, X12
BGE X12, X9, f_loop64
BEQZ X12, done
f_loop32_check:
MOV $32, X9
BLT X12, X9, f_loop16_check
f_loop32:
MOV 0(X11), X14
MOV 8(X11), X15
MOV 16(X11), X16
MOV 24(X11), X17
MOV X14, 0(X10)
MOV X15, 8(X10)
MOV X16, 16(X10)
MOV X17, 24(X10)
ADD $32, X10
ADD $32, X11
SUB $32, X12
BGE X12, X9, f_loop32
BEQZ X12, done
f_loop16_check:
MOV $16, X9
BLT X12, X9, f_loop8_check
f_loop16:
MOV 0(X11), X14
MOV 8(X11), X15
MOV X14, 0(X10)
MOV X15, 8(X10)
ADD $16, X10
ADD $16, X11
SUB $16, X12
BGE X12, X9, f_loop16
BEQZ X12, done
f_loop8_check:
MOV $8, X9
BLT X12, X9, f_loop4_check
f_loop8:
MOV 0(X11), X14
MOV X14, 0(X10)
ADD $8, X10
ADD $8, X11
SUB $8, X12
BGE X12, X9, f_loop8
BEQZ X12, done
JMP f_loop4_check
f_loop8_unaligned_check:
MOV $8, X9
BLT X12, X9, f_loop4_check
f_loop8_unaligned:
MOVB 0(X11), X14
MOVB 1(X11), X15
MOVB 2(X11), X16
MOVB 3(X11), X17
MOVB 4(X11), X18
MOVB 5(X11), X19
MOVB 6(X11), X20
MOVB 7(X11), X21
MOVB X14, 0(X10)
MOVB X15, 1(X10)
MOVB X16, 2(X10)
MOVB X17, 3(X10)
MOVB X18, 4(X10)
MOVB X19, 5(X10)
MOVB X20, 6(X10)
MOVB X21, 7(X10)
ADD $8, X10
ADD $8, X11
SUB $8, X12
BGE X12, X9, f_loop8_unaligned
f_loop4_check:
MOV $4, X9
BLT X12, X9, f_loop1
f_loop4:
MOVB 0(X11), X14
MOVB 1(X11), X15
MOVB 2(X11), X16
MOVB 3(X11), X17
MOVB X14, 0(X10)
MOVB X15, 1(X10)
MOVB X16, 2(X10)
MOVB X17, 3(X10)
ADD $4, X10
ADD $4, X11
SUB $4, X12
BGE X12, X9, f_loop4
f_loop1:
BEQZ X12, done
MOVB 0(X11), X14
MOVB X14, 0(X10)
ADD $1, X10
ADD $1, X11
SUB $1, X12
JMP f_loop1
backward:
ADD X10, X12, X10
ADD X11, X12, X11
// If less than 8 bytes, do single byte copies.
MOV $8, X9
BLT X12, X9, b_loop4_check
// Check alignment - if alignment differs we have to do one byte at a time.
AND $7, X10, X5
AND $7, X11, X6
BNE X5, X6, b_loop8_unaligned_check
BEQZ X5, b_loop_check
// Move one byte at a time until we reach 8 byte alignment.
SUB X5, X12, X12
b_align:
SUB $1, X5
SUB $1, X10
SUB $1, X11
MOVB 0(X11), X14
MOVB X14, 0(X10)
BNEZ X5, b_align
b_loop_check:
MOV $16, X9
BLT X12, X9, b_loop8_check
MOV $32, X9
BLT X12, X9, b_loop16_check
MOV $64, X9
BLT X12, X9, b_loop32_check
b_loop64:
SUB $64, X10
SUB $64, X11
MOV 0(X11), X14
MOV 8(X11), X15
MOV 16(X11), X16
MOV 24(X11), X17
MOV 32(X11), X18
MOV 40(X11), X19
MOV 48(X11), X20
MOV 56(X11), X21
MOV X14, 0(X10)
MOV X15, 8(X10)
MOV X16, 16(X10)
MOV X17, 24(X10)
MOV X18, 32(X10)
MOV X19, 40(X10)
MOV X20, 48(X10)
MOV X21, 56(X10)
SUB $64, X12
BGE X12, X9, b_loop64
BEQZ X12, done
b_loop32_check:
MOV $32, X9
BLT X12, X9, b_loop16_check
b_loop32:
SUB $32, X10
SUB $32, X11
MOV 0(X11), X14
MOV 8(X11), X15
MOV 16(X11), X16
MOV 24(X11), X17
MOV X14, 0(X10)
MOV X15, 8(X10)
MOV X16, 16(X10)
MOV X17, 24(X10)
SUB $32, X12
BGE X12, X9, b_loop32
BEQZ X12, done
b_loop16_check:
MOV $16, X9
BLT X12, X9, b_loop8_check
b_loop16:
SUB $16, X10
SUB $16, X11
MOV 0(X11), X14
MOV 8(X11), X15
MOV X14, 0(X10)
MOV X15, 8(X10)
SUB $16, X12
BGE X12, X9, b_loop16
BEQZ X12, done
b_loop8_check:
MOV $8, X9
BLT X12, X9, b_loop4_check
b_loop8:
SUB $8, X10
SUB $8, X11
MOV 0(X11), X14
MOV X14, 0(X10)
SUB $8, X12
BGE X12, X9, b_loop8
BEQZ X12, done
JMP b_loop4_check
b_loop8_unaligned_check:
MOV $8, X9
BLT X12, X9, b_loop4_check
b_loop8_unaligned:
SUB $8, X10
SUB $8, X11
MOVB 0(X11), X14
MOVB 1(X11), X15
MOVB 2(X11), X16
MOVB 3(X11), X17
MOVB 4(X11), X18
MOVB 5(X11), X19
MOVB 6(X11), X20
MOVB 7(X11), X21
MOVB X14, 0(X10)
MOVB X15, 1(X10)
MOVB X16, 2(X10)
MOVB X17, 3(X10)
MOVB X18, 4(X10)
MOVB X19, 5(X10)
MOVB X20, 6(X10)
MOVB X21, 7(X10)
SUB $8, X12
BGE X12, X9, b_loop8_unaligned
b_loop4_check:
MOV $4, X9
BLT X12, X9, b_loop1
b_loop4:
SUB $4, X10
SUB $4, X11
MOVB 0(X11), X14
MOVB 1(X11), X15
MOVB 2(X11), X16
MOVB 3(X11), X17
MOVB X14, 0(X10)
MOVB X15, 1(X10)
MOVB X16, 2(X10)
MOVB X17, 3(X10)
SUB $4, X12
BGE X12, X9, b_loop4
b_loop1:
BEQZ X12, done
SUB $1, X10
SUB $1, X11
MOVB 0(X11), X14
MOVB X14, 0(X10)
SUB $1, X12
JMP b_loop1
done:
RET