// src/runtime/memmove_arm.s - go - Git at Google

 // Inferno's libkern/memmove-arm.s
 // https://bitbucket.org/inferno-os/inferno-os/src/default/libkern/memmove-arm.s
 //
 //         Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved.
 //         Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com).  All rights reserved.
 //         Portions Copyright 2009 The Go Authors. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
 // in the Software without restriction, including without limitation the rights
 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 // copies of the Software, and to permit persons to whom the Software is
 // furnished to do so, subject to the following conditions:
 //
 // The above copyright notice and this permission notice shall be included in
 // all copies or substantial portions of the Software.
 //
 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 // THE SOFTWARE.

 #include "textflag.h"

 // Register aliases used by memmove below.
 //
 // TS is the destination start pointer and TE the destination end pointer
 // (TS + n). The multi-register moves cover R0 (backward copy) or R8 (forward
 // copy), so TE or TS are spilled to the stack during bulk register moves.
 #define TS	R0
 #define TE	R8

 // FROM is the source pointer, N the byte count, TMP/TMP1 scratch registers.
 // Warning: the linker will use R11 to synthesize certain instructions. Please
 // take care and double check with objdump.
 #define FROM	R11
 #define N	R12
 #define TMP	R12				/* same register as N: N is last read before TMP is first written */
 #define TMP1	R5

 // Shift counts and source-pointer fix-up used only by the unaligned-source
 // paths. RSHIFT shares R5 with TMP1, which is used only by the aligned
 // word-copy loops.
 #define RSHIFT	R5
 #define LSHIFT	R6
 #define OFFSET	R7

 // Backward unaligned path: BR0-BR3 receive the source words that were read,
 // BW0-BW3 hold the merged words to be written. BWn occupies the same
 // register as BR(n+1), so the merge has to proceed from BW3 down to BW0.
 #define BR0	R0					/* shared with TS */
 #define BW0	R1
 #define BR1	R1
 #define BW1	R2
 #define BR2	R2
 #define BW2	R3
 #define BR3	R3
 #define BW3	R4

 // Forward unaligned path: FR0-FR3 receive the source words that were read,
 // FW0-FW3 hold the merged words to be written. FW(n+1) occupies the same
 // register as FRn, so the merge has to proceed from FW0 up to FW3.
 #define FW0	R1
 #define FR0	R2
 #define FW1	R2
 #define FR1	R3
 #define FW2	R3
 #define FR2	R4
 #define FW3	R4
 #define FR3	R8					/* shared with TE */

 // memmove copies n bytes from from to to.
 //
 // Stack arguments: to+0(FP), from+4(FP), n+8(FP). The 4-byte frame is a
 // single spill slot at -4(SP), named savedts or savedte depending on which
 // destination pointer has to survive a multi-register move.
 //
 // When to <= from (unsigned) the copy walks forward from the start of the
 // buffers; otherwise it walks backward from their ends. For copies of at
 // least 4 bytes the destination is first brought to a 4-byte boundary one
 // byte at a time. If the source is then word aligned too, data moves in
 // 32-byte and then 4-byte units; if not, it moves in 16-byte units built by
 // shifting and merging adjacent aligned source words. Whatever is left is
 // copied one byte at a time.
 //
 // Uses R0-R8, R11 and R12. R0 is reloaded with the to argument before RET.
 TEXT runtime·memmove(SB), NOSPLIT, $4-12
 _memmove:
 	MOVW	to+0(FP), TS
 	MOVW	from+4(FP), FROM
 	MOVW	n+8(FP), N

 	ADD	N, TS, TE	/* to end pointer */

 	// to <= from (unsigned): copy forward; otherwise copy backward.
 	CMP	FROM, TS
 	BLS	_forward

 // Backward copy. TE and FROM point just past the last byte of their buffers
 // and are pre-decremented as data is moved.
 _back:
 	ADD	N, FROM		/* from end pointer */
 	CMP	$4, N		/* need at least 4 bytes to copy */
 	BLT	_b1tail

 	// From here on R12 is used as TMP; N is not read again on this path.
 _b4align:				/* align destination on 4 */
 	AND.S	$3, TE, TMP
 	BEQ	_b4aligned

 	MOVBU.W	-1(FROM), TMP	/* pre-indexed */
 	MOVBU.W	TMP, -1(TE)	/* pre-indexed */
 	B	_b4align

 _b4aligned:				/* is source now aligned? */
 	// TMP = FROM & 3; _bunaligned relies on this value still being in TMP.
 	AND.S	$3, FROM, TMP
 	BNE	_bunaligned

 	// Loop limit TMP = TS+31: keep going while more than 31 bytes remain.
 	// TS lives in R0, which the MOVM pair below overwrites, so spill it.
 	ADD	$31, TS, TMP	/* do 32-byte chunks if possible */
 	MOVW	TS, savedts-4(SP)
 _b32loop:
 	CMP	TMP, TE
 	BLS	_b4tail

 	// Decrement-before with writeback: move 8 words and step both
 	// pointers down by 32 bytes.
 	MOVM.DB.W (FROM), [R0-R7]
 	MOVM.DB.W [R0-R7], (TE)
 	B	_b32loop

 _b4tail:				/* do remaining words if possible */
 	MOVW	savedts-4(SP), TS	/* R0 was clobbered by the 32-byte loop */
 	ADD	$3, TS, TMP		/* loop while more than 3 bytes remain */
 _b4loop:
 	CMP	TMP, TE
 	BLS	_b1tail

 	MOVW.W	-4(FROM), TMP1	/* pre-indexed */
 	MOVW.W	TMP1, -4(TE)	/* pre-indexed */
 	B	_b4loop

 _b1tail:				/* remaining bytes */
 	// Done once the end pointer has walked down to the start pointer.
 	CMP	TE, TS
 	BEQ	_return

 	MOVBU.W	-1(FROM), TMP	/* pre-indexed */
 	MOVBU.W	TMP, -1(TE)	/* pre-indexed */
 	B	_b1tail

 // Forward copy. TS and FROM point at the next byte to move and are
 // post-incremented as data is moved.
 _forward:
 	CMP	$4, N		/* need at least 4 bytes to copy */
 	BLT	_f1tail

 	// From here on R12 is used as TMP; N is not read again on this path.
 _f4align:				/* align destination on 4 */
 	AND.S	$3, TS, TMP
 	BEQ	_f4aligned

 	MOVBU.P	1(FROM), TMP	/* implicit write back */
 	MOVBU.P	TMP, 1(TS)	/* implicit write back */
 	B	_f4align

 _f4aligned:				/* is source now aligned? */
 	// TMP = FROM & 3; _funaligned relies on this value still being in TMP.
 	AND.S	$3, FROM, TMP
 	BNE	_funaligned

 	// Loop limit TMP = TE-31: keep going while TS is below it.
 	// TE lives in R8, which the MOVM pair below overwrites, so spill it.
 	SUB	$31, TE, TMP	/* do 32-byte chunks if possible */
 	MOVW	TE, savedte-4(SP)
 _f32loop:
 	CMP	TMP, TS
 	BHS	_f4tail

 	// Increment-after with writeback: move 8 words and step both
 	// pointers up by 32 bytes.
 	MOVM.IA.W (FROM), [R1-R8]
 	MOVM.IA.W [R1-R8], (TS)
 	B	_f32loop

 _f4tail:
 	MOVW	savedte-4(SP), TE	/* R8 was clobbered by the 32-byte loop */
 	SUB	$3, TE, TMP	/* do remaining words if possible */
 _f4loop:
 	CMP	TMP, TS
 	BHS	_f1tail

 	MOVW.P	4(FROM), TMP1	/* implicit write back */
 	MOVW.P	TMP1, 4(TS)	/* implicit write back */
 	B	_f4loop

 _f1tail:
 	// Remaining bytes: done once the start pointer reaches the end pointer.
 	CMP	TS, TE
 	BEQ	_return

 	MOVBU.P	1(FROM), TMP	/* implicit write back */
 	MOVBU.P	TMP, 1(TS)	/* implicit write back */
 	B	_f1tail

 _return:
 	// R0 was used as TS/BR0 above; reload the original to argument into it.
 	MOVW	to+0(FP), R0
 	RET

 // Backward copy with a word-aligned destination and a misaligned source.
 // On entry TMP = FROM & 3 (1, 2 or 3). Each output word is assembled from two
 // neighbouring aligned source words: the higher one shifted left by LSHIFT
 // ORed with the lower one shifted right by RSHIFT. The shift directions
 // correspond to little-endian byte order. OFFSET is the number of bytes by
 // which FROM is rounded down below, and is added back in _bu1tail.
 _bunaligned:
 	CMP	$2, TMP		/* is TMP < 2 ? */

 	// The three predicated groups below all use the flags from this one CMP.
 	MOVW.LT	$8, RSHIFT		/* (R(n)<<24)|(R(n-1)>>8) */
 	MOVW.LT	$24, LSHIFT
 	MOVW.LT	$1, OFFSET

 	MOVW.EQ	$16, RSHIFT		/* (R(n)<<16)|(R(n-1)>>16) */
 	MOVW.EQ	$16, LSHIFT
 	MOVW.EQ	$2, OFFSET

 	MOVW.GT	$24, RSHIFT		/* (R(n)<<8)|(R(n-1)>>24) */
 	MOVW.GT	$8, LSHIFT
 	MOVW.GT	$3, OFFSET

 	// With 16 bytes or fewer left, fall back to the byte loop; FROM has
 	// not been modified yet, so no fix-up is needed.
 	ADD	$16, TS, TMP	/* do 16-byte chunks if possible */
 	CMP	TMP, TE
 	BLS	_b1tail

 	BIC	$3, FROM		/* align source */
 	MOVW	TS, savedts-4(SP)	/* BR0 is R0, the same register as TS */
 	MOVW	(FROM), BR0	/* prime first block register */

 _bu16loop:
 	CMP	TMP, TE
 	BLS	_bu1tail

 	// Take the high part of BW3 from the word carried in BR0 before the
 	// MOVM overwrites BR0 with a new source word.
 	MOVW	BR0<<LSHIFT, BW3
 	MOVM.DB.W (FROM), [BR0-BR3]
 	ORR	BR3>>RSHIFT, BW3

 	// Each BWn overwrites the register of BR(n+1), so every source word
 	// is consumed before the register holding it is reused.
 	MOVW	BR3<<LSHIFT, BW2
 	ORR	BR2>>RSHIFT, BW2

 	MOVW	BR2<<LSHIFT, BW1
 	ORR	BR1>>RSHIFT, BW1

 	MOVW	BR1<<LSHIFT, BW0
 	ORR	BR0>>RSHIFT, BW0

 	// BR0 is not among the written registers and carries over to the
 	// next iteration.
 	MOVM.DB.W [BW0-BW3], (TE)
 	B	_bu16loop

 _bu1tail:
 	MOVW	savedts-4(SP), TS	/* restore TS, R0 was used as BR0 */
 	ADD	OFFSET, FROM		/* undo the BIC rounding: byte-exact source pointer */
 	B	_b1tail

 // Forward copy with a word-aligned destination and a misaligned source.
 // On entry TMP = FROM & 3 (1, 2 or 3). Each output word is assembled from two
 // neighbouring aligned source words: the lower one shifted right by RSHIFT
 // ORed with the higher one shifted left by LSHIFT. The shift directions
 // correspond to little-endian byte order. OFFSET is 4 minus the misalignment:
 // FROM is rounded down and then advanced by one word while priming FR3, and
 // _fu1tail subtracts OFFSET to recover the byte-exact source pointer.
 _funaligned:
 	CMP	$2, TMP

 	// The three predicated groups below all use the flags from this one CMP.
 	MOVW.LT	$8, RSHIFT		/* (R(n+1)<<24)|(R(n)>>8) */
 	MOVW.LT	$24, LSHIFT
 	MOVW.LT	$3, OFFSET

 	MOVW.EQ	$16, RSHIFT		/* (R(n+1)<<16)|(R(n)>>16) */
 	MOVW.EQ	$16, LSHIFT
 	MOVW.EQ	$2, OFFSET

 	MOVW.GT	$24, RSHIFT		/* (R(n+1)<<8)|(R(n)>>24) */
 	MOVW.GT	$8, LSHIFT
 	MOVW.GT	$1, OFFSET

 	// With 16 bytes or fewer left, fall back to the byte loop; FROM has
 	// not been modified yet, so no fix-up is needed.
 	SUB	$16, TE, TMP	/* do 16-byte chunks if possible */
 	CMP	TMP, TS
 	BHS	_f1tail

 	BIC	$3, FROM		/* align source */
 	MOVW	TE, savedte-4(SP)	/* FR3 is R8, the same register as TE */
 	MOVW.P	4(FROM), FR3	/* prime last block register, implicit write back */

 _fu16loop:
 	CMP	TMP, TS
 	BHS	_fu1tail

 	// Take the low part of FW0 from the word carried in FR3 before the
 	// MOVM overwrites FR3 with a new source word.
 	MOVW	FR3>>RSHIFT, FW0
 	MOVM.IA.W (FROM), [FR0,FR1,FR2,FR3]
 	ORR	FR0<<LSHIFT, FW0

 	// Each FW(n+1) overwrites the register of FRn, so every source word
 	// is consumed before the register holding it is reused.
 	MOVW	FR0>>RSHIFT, FW1
 	ORR	FR1<<LSHIFT, FW1

 	MOVW	FR1>>RSHIFT, FW2
 	ORR	FR2<<LSHIFT, FW2

 	MOVW	FR2>>RSHIFT, FW3
 	ORR	FR3<<LSHIFT, FW3

 	// FR3 is not among the written registers and carries over to the
 	// next iteration.
 	MOVM.IA.W [FW0,FW1,FW2,FW3], (TS)
 	B	_fu16loop

 _fu1tail:
 	MOVW	savedte-4(SP), TE	/* restore TE, R8 was used as FR3 */
 	SUB	OFFSET, FROM		/* back up to the byte-exact source pointer */
 	B	_f1tail
	// Inferno's libkern/memmove-arm.s
	// https://bitbucket.org/inferno-os/inferno-os/src/default/libkern/memmove-arm.s
	//
	// Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved.
	// Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com). All rights reserved.
	// Portions Copyright 2009 The Go Authors. All rights reserved.
	//
	// Permission is hereby granted, free of charge, to any person obtaining a copy
	// of this software and associated documentation files (the "Software"), to deal
	// in the Software without restriction, including without limitation the rights
	// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
	// copies of the Software, and to permit persons to whom the Software is
	// furnished to do so, subject to the following conditions:
	//
	// The above copyright notice and this permission notice shall be included in
	// all copies or substantial portions of the Software.
	//
	// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
	// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
	// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
	// THE SOFTWARE.

	#include "textflag.h"

	// Register aliases used by memmove below.
	//
	// TS is the destination start pointer and TE the destination end pointer
	// (TS + n). The multi-register moves cover R0 (backward copy) or R8 (forward
	// copy), so TE or TS are spilled to the stack during bulk register moves.
	#define TS R0
	#define TE R8

	// FROM is the source pointer, N the byte count, TMP/TMP1 scratch registers.
	// Warning: the linker will use R11 to synthesize certain instructions. Please
	// take care and double check with objdump.
	#define FROM R11
	#define N R12
	#define TMP R12 /* same register as N: N is last read before TMP is first written */
	#define TMP1 R5

	// Shift counts and source-pointer fix-up used only by the unaligned-source
	// paths. RSHIFT shares R5 with TMP1, which is used only by the aligned
	// word-copy loops.
	#define RSHIFT R5
	#define LSHIFT R6
	#define OFFSET R7

	// Backward unaligned path: BR0-BR3 receive the source words that were read,
	// BW0-BW3 hold the merged words to be written. BWn occupies the same
	// register as BR(n+1), so the merge has to proceed from BW3 down to BW0.
	#define BR0 R0 /* shared with TS */
	#define BW0 R1
	#define BR1 R1
	#define BW1 R2
	#define BR2 R2
	#define BW2 R3
	#define BR3 R3
	#define BW3 R4

	// Forward unaligned path: FR0-FR3 receive the source words that were read,
	// FW0-FW3 hold the merged words to be written. FW(n+1) occupies the same
	// register as FRn, so the merge has to proceed from FW0 up to FW3.
	#define FW0 R1
	#define FR0 R2
	#define FW1 R2
	#define FR1 R3
	#define FW2 R3
	#define FR2 R4
	#define FW3 R4
	#define FR3 R8 /* shared with TE */

	// memmove copies n bytes from from to to.
	//
	// NOTE(review): an identical TEXT runtime·memmove definition appears earlier
	// in this file; confirm that only one copy is meant to be assembled.
	//
	// Stack arguments: to+0(FP), from+4(FP), n+8(FP). The 4-byte frame is a
	// single spill slot at -4(SP), named savedts or savedte depending on which
	// destination pointer has to survive a multi-register move.
	//
	// When to <= from (unsigned) the copy walks forward from the start of the
	// buffers; otherwise it walks backward from their ends. For copies of at
	// least 4 bytes the destination is first brought to a 4-byte boundary one
	// byte at a time. If the source is then word aligned too, data moves in
	// 32-byte and then 4-byte units; if not, it moves in 16-byte units built by
	// shifting and merging adjacent aligned source words. Whatever is left is
	// copied one byte at a time.
	//
	// Uses R0-R8, R11 and R12. R0 is reloaded with the to argument before RET.
	TEXT runtime·memmove(SB), NOSPLIT, $4-12
	_memmove:
	MOVW to+0(FP), TS
	MOVW from+4(FP), FROM
	MOVW n+8(FP), N

	ADD N, TS, TE /* to end pointer */

	// to <= from (unsigned): copy forward; otherwise copy backward.
	CMP FROM, TS
	BLS _forward

	// Backward copy. TE and FROM point just past the last byte of their buffers
	// and are pre-decremented as data is moved.
	_back:
	ADD N, FROM /* from end pointer */
	CMP $4, N /* need at least 4 bytes to copy */
	BLT _b1tail

	// From here on R12 is used as TMP; N is not read again on this path.
	_b4align: /* align destination on 4 */
	AND.S $3, TE, TMP
	BEQ _b4aligned

	MOVBU.W -1(FROM), TMP /* pre-indexed */
	MOVBU.W TMP, -1(TE) /* pre-indexed */
	B _b4align

	_b4aligned: /* is source now aligned? */
	// TMP = FROM & 3; _bunaligned relies on this value still being in TMP.
	AND.S $3, FROM, TMP
	BNE _bunaligned

	// Loop limit TMP = TS+31: keep going while more than 31 bytes remain.
	// TS lives in R0, which the MOVM pair below overwrites, so spill it.
	ADD $31, TS, TMP /* do 32-byte chunks if possible */
	MOVW TS, savedts-4(SP)
	_b32loop:
	CMP TMP, TE
	BLS _b4tail

	// Decrement-before with writeback: move 8 words and step both
	// pointers down by 32 bytes.
	MOVM.DB.W (FROM), [R0-R7]
	MOVM.DB.W [R0-R7], (TE)
	B _b32loop

	_b4tail: /* do remaining words if possible */
	MOVW savedts-4(SP), TS /* R0 was clobbered by the 32-byte loop */
	ADD $3, TS, TMP /* loop while more than 3 bytes remain */
	_b4loop:
	CMP TMP, TE
	BLS _b1tail

	MOVW.W -4(FROM), TMP1 /* pre-indexed */
	MOVW.W TMP1, -4(TE) /* pre-indexed */
	B _b4loop

	_b1tail: /* remaining bytes */
	// Done once the end pointer has walked down to the start pointer.
	CMP TE, TS
	BEQ _return

	MOVBU.W -1(FROM), TMP /* pre-indexed */
	MOVBU.W TMP, -1(TE) /* pre-indexed */
	B _b1tail

	// Forward copy. TS and FROM point at the next byte to move and are
	// post-incremented as data is moved.
	_forward:
	CMP $4, N /* need at least 4 bytes to copy */
	BLT _f1tail

	// From here on R12 is used as TMP; N is not read again on this path.
	_f4align: /* align destination on 4 */
	AND.S $3, TS, TMP
	BEQ _f4aligned

	MOVBU.P 1(FROM), TMP /* implicit write back */
	MOVBU.P TMP, 1(TS) /* implicit write back */
	B _f4align

	_f4aligned: /* is source now aligned? */
	// TMP = FROM & 3; _funaligned relies on this value still being in TMP.
	AND.S $3, FROM, TMP
	BNE _funaligned

	// Loop limit TMP = TE-31: keep going while TS is below it.
	// TE lives in R8, which the MOVM pair below overwrites, so spill it.
	SUB $31, TE, TMP /* do 32-byte chunks if possible */
	MOVW TE, savedte-4(SP)
	_f32loop:
	CMP TMP, TS
	BHS _f4tail

	// Increment-after with writeback: move 8 words and step both
	// pointers up by 32 bytes.
	MOVM.IA.W (FROM), [R1-R8]
	MOVM.IA.W [R1-R8], (TS)
	B _f32loop

	_f4tail:
	MOVW savedte-4(SP), TE /* R8 was clobbered by the 32-byte loop */
	SUB $3, TE, TMP /* do remaining words if possible */
	_f4loop:
	CMP TMP, TS
	BHS _f1tail

	MOVW.P 4(FROM), TMP1 /* implicit write back */
	MOVW.P TMP1, 4(TS) /* implicit write back */
	B _f4loop

	_f1tail:
	// Remaining bytes: done once the start pointer reaches the end pointer.
	CMP TS, TE
	BEQ _return

	MOVBU.P 1(FROM), TMP /* implicit write back */
	MOVBU.P TMP, 1(TS) /* implicit write back */
	B _f1tail

	_return:
	// R0 was used as TS/BR0 above; reload the original to argument into it.
	MOVW to+0(FP), R0
	RET

	// Backward copy with a word-aligned destination and a misaligned source.
	// On entry TMP = FROM & 3 (1, 2 or 3). Each output word is assembled from two
	// neighbouring aligned source words: the higher one shifted left by LSHIFT
	// ORed with the lower one shifted right by RSHIFT. The shift directions
	// correspond to little-endian byte order. OFFSET is the number of bytes by
	// which FROM is rounded down below, and is added back in _bu1tail.
	_bunaligned:
	CMP $2, TMP /* is TMP < 2 ? */

	// The three predicated groups below all use the flags from this one CMP.
	MOVW.LT $8, RSHIFT /* (R(n)<<24)|(R(n-1)>>8) */
	MOVW.LT $24, LSHIFT
	MOVW.LT $1, OFFSET

	MOVW.EQ $16, RSHIFT /* (R(n)<<16)|(R(n-1)>>16) */
	MOVW.EQ $16, LSHIFT
	MOVW.EQ $2, OFFSET

	MOVW.GT $24, RSHIFT /* (R(n)<<8)|(R(n-1)>>24) */
	MOVW.GT $8, LSHIFT
	MOVW.GT $3, OFFSET

	// With 16 bytes or fewer left, fall back to the byte loop; FROM has
	// not been modified yet, so no fix-up is needed.
	ADD $16, TS, TMP /* do 16-byte chunks if possible */
	CMP TMP, TE
	BLS _b1tail

	BIC $3, FROM /* align source */
	MOVW TS, savedts-4(SP) /* BR0 is R0, the same register as TS */
	MOVW (FROM), BR0 /* prime first block register */

	_bu16loop:
	CMP TMP, TE
	BLS _bu1tail

	// Take the high part of BW3 from the word carried in BR0 before the
	// MOVM overwrites BR0 with a new source word.
	MOVW BR0<<LSHIFT, BW3
	MOVM.DB.W (FROM), [BR0-BR3]
	ORR BR3>>RSHIFT, BW3

	// Each BWn overwrites the register of BR(n+1), so every source word
	// is consumed before the register holding it is reused.
	MOVW BR3<<LSHIFT, BW2
	ORR BR2>>RSHIFT, BW2

	MOVW BR2<<LSHIFT, BW1
	ORR BR1>>RSHIFT, BW1

	MOVW BR1<<LSHIFT, BW0
	ORR BR0>>RSHIFT, BW0

	// BR0 is not among the written registers and carries over to the
	// next iteration.
	MOVM.DB.W [BW0-BW3], (TE)
	B _bu16loop

	_bu1tail:
	MOVW savedts-4(SP), TS /* restore TS, R0 was used as BR0 */
	ADD OFFSET, FROM /* undo the BIC rounding: byte-exact source pointer */
	B _b1tail

	// Forward copy with a word-aligned destination and a misaligned source.
	// On entry TMP = FROM & 3 (1, 2 or 3). Each output word is assembled from two
	// neighbouring aligned source words: the lower one shifted right by RSHIFT
	// ORed with the higher one shifted left by LSHIFT. The shift directions
	// correspond to little-endian byte order. OFFSET is 4 minus the misalignment:
	// FROM is rounded down and then advanced by one word while priming FR3, and
	// _fu1tail subtracts OFFSET to recover the byte-exact source pointer.
	_funaligned:
	CMP $2, TMP

	// The three predicated groups below all use the flags from this one CMP.
	MOVW.LT $8, RSHIFT /* (R(n+1)<<24)|(R(n)>>8) */
	MOVW.LT $24, LSHIFT
	MOVW.LT $3, OFFSET

	MOVW.EQ $16, RSHIFT /* (R(n+1)<<16)|(R(n)>>16) */
	MOVW.EQ $16, LSHIFT
	MOVW.EQ $2, OFFSET

	MOVW.GT $24, RSHIFT /* (R(n+1)<<8)|(R(n)>>24) */
	MOVW.GT $8, LSHIFT
	MOVW.GT $1, OFFSET

	// With 16 bytes or fewer left, fall back to the byte loop; FROM has
	// not been modified yet, so no fix-up is needed.
	SUB $16, TE, TMP /* do 16-byte chunks if possible */
	CMP TMP, TS
	BHS _f1tail

	BIC $3, FROM /* align source */
	MOVW TE, savedte-4(SP) /* FR3 is R8, the same register as TE */
	MOVW.P 4(FROM), FR3 /* prime last block register, implicit write back */

	_fu16loop:
	CMP TMP, TS
	BHS _fu1tail

	// Take the low part of FW0 from the word carried in FR3 before the
	// MOVM overwrites FR3 with a new source word.
	MOVW FR3>>RSHIFT, FW0
	MOVM.IA.W (FROM), [FR0,FR1,FR2,FR3]
	ORR FR0<<LSHIFT, FW0

	// Each FW(n+1) overwrites the register of FRn, so every source word
	// is consumed before the register holding it is reused.
	MOVW FR0>>RSHIFT, FW1
	ORR FR1<<LSHIFT, FW1

	MOVW FR1>>RSHIFT, FW2
	ORR FR2<<LSHIFT, FW2

	MOVW FR2>>RSHIFT, FW3
	ORR FR3<<LSHIFT, FW3

	// FR3 is not among the written registers and carries over to the
	// next iteration.
	MOVM.IA.W [FW0,FW1,FW2,FW3], (TS)
	B _fu16loop

	_fu1tail:
	MOVW savedte-4(SP), TE /* restore TE, R8 was used as FR3 */
	SUB OFFSET, FROM /* back up to the byte-exact source pointer */
	B _f1tail