// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "textflag.h"

// See memmove Go doc for important implementation constraints.

// Register map
//
// dstin  R0
// src    R1
// count  R2
// dst    R3 (same as R0, but gets modified in unaligned cases)
// srcend R4
// dstend R5
// data   R6-R17
// tmp1   R14
// Copies are split into 3 main cases: small copies of up to 32 bytes, medium
// copies of up to 128 bytes, and large copies. The overhead of the overlap
// check is negligible since it is only required for large copies.
//
// Large copies use a software pipelined loop processing 64 bytes per iteration.
// The destination pointer is 16-byte aligned to minimize unaligned accesses.
// The loop tail is handled by always copying 64 bytes from the end.
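//
// As a rough sketch (not the generated code), the size dispatch below
// behaves like:
//
//	switch {
//	case n == 0:
//		return
//	case n <= 16:
//		// copy16
//	case n > 128:
//		// copy_long
//	case n > 32:
//		// copy32_128
//	default:
//		// 17..32 bytes, handled inline
//	}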

// func memmove(to, from unsafe.Pointer, n uintptr)
TEXT runtime·memmove(SB), NOSPLIT|NOFRAME, $0-24
	MOVD	to+0(FP), R0
	MOVD	from+8(FP), R1
	MOVD	n+16(FP), R2
	CBZ	R2, copy0

	// Small copies: 1..16 bytes
	CMP	$16, R2
	BLE	copy16
	// Medium and large copies: 33 bytes and up.
	CMP	$128, R2
	BHI	copy_long
	CMP	$32, R2
	BHI	copy32_128

	// Small copies: 17..32 bytes.
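	// Both 16-byte halves are loaded before anything is stored, so
	// overlapping source and destination ranges are safe; for 17..31
	// bytes the middle bytes are simply written twice.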
	LDP	(R1), (R6, R7)
	ADD	R1, R2, R4          // R4 points just past the last source byte
	LDP	-16(R4), (R12, R13)
	STP	(R6, R7), (R0)
	ADD	R0, R2, R5          // R5 points just past the last destination byte
	STP	(R12, R13), -16(R5)
	RET

	// Small copies: 1..16 bytes.
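	// 8..16 bytes are handled as two possibly overlapping 8-byte
	// moves, one from each end; anything shorter falls to copy7.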
copy16:
	ADD	R1, R2, R4 // R4 points just past the last source byte
	ADD	R0, R2, R5 // R5 points just past the last destination byte
	CMP	$8, R2
	BLT	copy7
	MOVD	(R1), R6
	MOVD	-8(R4), R7
	MOVD	R6, (R0)
	MOVD	R7, -8(R5)
	RET

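	// 1..7 bytes: branch on the bits of the count. Bit 2 set means
	// 4..7 bytes, covered by two overlapping 4-byte moves; otherwise
	// bit 1 distinguishes 2..3 bytes from a single byte.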
copy7:
	TBZ	$2, R2, copy3
	MOVWU	(R1), R6
	MOVWU	-4(R4), R7
	MOVW	R6, (R0)
	MOVW	R7, -4(R5)
	RET

copy3:
	TBZ	$1, R2, copy1
	MOVHU	(R1), R6
	MOVHU	-2(R4), R7
	MOVH	R6, (R0)
	MOVH	R7, -2(R5)
	RET

copy1:
	MOVBU	(R1), R6
	MOVB	R6, (R0)

copy0:
	RET

	// Medium copies: 33..128 bytes.
copy32_128:
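	// The first 32 and the last 32 source bytes are all loaded
	// before any store, so overlap is handled naturally; for n <= 64
	// these eight registers already cover the entire copy.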
	ADD	R1, R2, R4          // R4 points just past the last source byte
	ADD	R0, R2, R5          // R5 points just past the last destination byte
	LDP	(R1), (R6, R7)
	LDP	16(R1), (R8, R9)
	LDP	-32(R4), (R10, R11)
	LDP	-16(R4), (R12, R13)
	CMP	$64, R2
	BHI	copy128
	STP	(R6, R7), (R0)
	STP	(R8, R9), 16(R0)
	STP	(R10, R11), -32(R5)
	STP	(R12, R13), -16(R5)
	RET

	// Copy 65..128 bytes.
copy128:
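	// The middle bytes at offsets 32..63 are loaded here; for
	// 97..128 bytes another 32 bytes at srcend-64 are copied via
	// R1-R4, which are no longer needed at that point.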
	LDP	32(R1), (R14, R15)
	LDP	48(R1), (R16, R17)
	CMP	$96, R2
	BLS	copy96
	LDP	-64(R4), (R2, R3)
	LDP	-48(R4), (R1, R4)
	STP	(R2, R3), -64(R5)
	STP	(R1, R4), -48(R5)

copy96:
	STP	(R6, R7), (R0)
	STP	(R8, R9), 16(R0)
	STP	(R14, R15), 32(R0)
	STP	(R16, R17), 48(R0)
	STP	(R10, R11), -32(R5)
	STP	(R12, R13), -16(R5)
	RET

	// Copy more than 128 bytes.
copy_long:
	ADD	R1, R2, R4 // R4 points just past the last source byte
	ADD	R0, R2, R5 // R5 points just past the last destination byte
	MOVD	ZR, R7
	MOVD	ZR, R8
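	// For copies below 1024 bytes R7/R8 stay zero, so the
	// realignment offset computed below is zero and no alignment
	// adjustment is made.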

	CMP	$1024, R2
	BLT	backward_check
	// feature detect to decide how to align
	MOVBU	runtime·arm64UseAlignedLoads(SB), R6
	CBNZ	R6, use_aligned_loads
	MOVD	R0, R7
	MOVD	R5, R8
	B	backward_check
use_aligned_loads:
	MOVD	R1, R7
	MOVD	R4, R8
	// R7 and R8 are used here for the realignment calculation. In
	// the use_aligned_loads case, R7 is the src pointer and R8 is
	// the srcend pointer, which is used in the backward copy case.
	// When doing aligned stores, R7 is the dst pointer and R8 is
	// the dstend pointer.

backward_check:
	// Use backward copy if there is an overlap.
	SUB	R1, R0, R14
	CBZ	R14, copy0
	CMP	R2, R14
	BCC	copy_long_backward
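	// R14 = dst - src as an unsigned value: if it is smaller than
	// the count, dst lies inside the source range and the copy must
	// run backward. When dst is below src the subtraction wraps to
	// a huge value, so the forward path is correctly chosen.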

	// Copy 16 bytes and then align src (R1) or dst (R0) to 16-byte alignment.
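	// The 16 bytes of A are stored unaligned at dstin; src and dst
	// are then moved back by R7&15 so the chosen pointer is 16-byte
	// aligned in the loop, the count grows by the same amount, and
	// the re-copied bytes harmlessly overlap A.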
	LDP	(R1), (R12, R13)     // Load  A
	AND	$15, R7, R14         // Calculate the realignment offset
	SUB	R14, R1, R1
	SUB	R14, R0, R3          // move dst back same amount as src
	ADD	R14, R2, R2
	LDP	16(R1), (R6, R7)     // Load   B
	STP	(R12, R13), (R0)     // Store A
	LDP	32(R1), (R8, R9)     // Load    C
	LDP	48(R1), (R10, R11)   // Load     D
	LDP.W	64(R1), (R12, R13)   // Load      E
	// 80 bytes have been loaded; if less than 80+64 bytes remain, copy 64 bytes from the end
	SUBS	$144, R2, R2
	BLS	copy64_from_end

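	// Software pipelined: each iteration stores the four pairs
	// loaded by the previous one while issuing the next loads,
	// hiding load latency.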
loop64:
	STP	(R6, R7), 16(R3)     // Store  B
	LDP	16(R1), (R6, R7)     // Load   B (next iteration)
	STP	(R8, R9), 32(R3)     // Store   C
	LDP	32(R1), (R8, R9)     // Load    C
	STP	(R10, R11), 48(R3)   // Store    D
	LDP	48(R1), (R10, R11)   // Load     D
	STP.W	(R12, R13), 64(R3)   // Store     E
	LDP.W	64(R1), (R12, R13)   // Load      E
	SUBS	$64, R2, R2
	BHI	loop64

	// Write the last iteration and copy 64 bytes from the end.
copy64_from_end:
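	// The final 64 bytes (F..I) are copied from fixed offsets off
	// srcend, overlapping the loop's last stores instead of running
	// a short tail loop.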
	LDP	-64(R4), (R14, R15)  // Load       F
	STP	(R6, R7), 16(R3)     // Store  B
	LDP	-48(R4), (R6, R7)    // Load        G
	STP	(R8, R9), 32(R3)     // Store   C
	LDP	-32(R4), (R8, R9)    // Load         H
	STP	(R10, R11), 48(R3)   // Store    D
	LDP	-16(R4), (R10, R11)  // Load          I
	STP	(R12, R13), 64(R3)   // Store     E
	STP	(R14, R15), -64(R5)  // Store      F
	STP	(R6, R7), -48(R5)    // Store       G
	STP	(R8, R9), -32(R5)    // Store        H
	STP	(R10, R11), -16(R5)  // Store         I
	RET

	// Large backward copy for overlapping copies.
	// Copy 16 bytes and then align srcend (R4) or dstend (R5) to 16-byte alignment.
copy_long_backward:
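	// Mirror of the forward setup: the last 16 bytes are stored
	// unaligned at dstend-16, then srcend/dstend are rounded down by
	// R8&15; the count shrinks by the same amount since the end
	// moved down.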
	LDP	-16(R4), (R12, R13)
	AND	$15, R8, R14
	SUB	R14, R4, R4
	SUB	R14, R2, R2
	LDP	-16(R4), (R6, R7)
	STP	(R12, R13), -16(R5)
	LDP	-32(R4), (R8, R9)
	LDP	-48(R4), (R10, R11)
	LDP.W	-64(R4), (R12, R13)
	SUB	R14, R5, R5
	SUBS	$128, R2, R2
	BLS	copy64_from_start

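	// Same software pipelining as loop64, walking downward through
	// both buffers.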
loop64_backward:
	STP	(R6, R7), -16(R5)
	LDP	-16(R4), (R6, R7)
	STP	(R8, R9), -32(R5)
	LDP	-32(R4), (R8, R9)
	STP	(R10, R11), -48(R5)
	LDP	-48(R4), (R10, R11)
	STP.W	(R12, R13), -64(R5)
	LDP.W	-64(R4), (R12, R13)
	SUBS	$64, R2, R2
	BHI	loop64_backward

	// Write the last iteration and copy 64 bytes from the start.
copy64_from_start:
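	// The first 64 source bytes are loaded before the final stores
	// at dstin; R2 and R3 are reused as data registers since the
	// count and the scratch dst are no longer needed here.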
	LDP	48(R1), (R2, R3)
	STP	(R6, R7), -16(R5)
	LDP	32(R1), (R6, R7)
	STP	(R8, R9), -32(R5)
	LDP	16(R1), (R8, R9)
	STP	(R10, R11), -48(R5)
	LDP	(R1), (R10, R11)
	STP	(R12, R13), -64(R5)
	STP	(R2, R3), 48(R0)
	STP	(R6, R7), 32(R0)
	STP	(R8, R9), 16(R0)
	STP	(R10, R11), (R0)
	RET