[dev.cc] all: edit assembly source for ARM to be more regular
Several .s files for ARM had properties that the new assembler will not support.
These include:
- mentioning SP or PC as a hardware register
These are always pseudo-registers except that in some contexts
they're not, and it's confusing because the context should not affect
which register you mean. Change the references to the hardware
registers to be explicit: R13 for SP, R15 for PC (before/after examples of all three rewrites follow this list).
- constant creation using assignment
The files say a=b when they could instead say #define a b.
There is no reason to have both mechanisms.
- using R(0) to refer to R0
Some macros use this to a great extent. Again, it's easy just to
use a #define to rename a register.
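For illustration, here is how each of the three rewrites reads before and
after; the lines are taken from the files changed below (asm_arm.s,
sys_linux_arm.s, memclr_arm.s):

    // SP and PC written as explicit hardware registers
    MOVW  SP, (g_sched+gobuf_sp)(g)   ->   MOVW  R13, (g_sched+gobuf_sp)(g)
    MOVW  $0xffff0fc0, PC             ->   MOVW  $0xffff0fc0, R15

    // constants created by assignment become #defines
    TO = 8                            ->   #define TO R8

    // with the #define in place, the R(name) indirection disappears
    MOVW  ptr+0(FP), R(TO)            ->   MOVW  ptr+0(FP), TO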
Change-Id: I002335ace8e876c5b63c71c2560533eb835346d2
Reviewed-on: https://go-review.googlesource.com/4822
Reviewed-by: Dave Cheney <dave@cheney.net>
diff --git a/src/runtime/asm_arm.s b/src/runtime/asm_arm.s
index 2efeaaa..cd81c25 100644
--- a/src/runtime/asm_arm.s
+++ b/src/runtime/asm_arm.s
@@ -107,7 +107,7 @@
// save state in Gobuf; setjmp
TEXT runtime·gosave(SB),NOSPLIT,$-4-4
MOVW 0(FP), R0 // gobuf
- MOVW SP, gobuf_sp(R0)
+ MOVW R13, gobuf_sp(R0)
MOVW LR, gobuf_pc(R0)
MOVW g, gobuf_g(R0)
MOVW $0, R11
@@ -133,7 +133,7 @@
// after this point: it must be straight-line code until the
// final B instruction.
// See large comment in sigprof for more details.
- MOVW gobuf_sp(R1), SP // restore SP
+ MOVW gobuf_sp(R1), R13 // restore SP==R13
MOVW gobuf_lr(R1), LR
MOVW gobuf_ret(R1), R0
MOVW gobuf_ctxt(R1), R7
@@ -152,7 +152,7 @@
// to keep running g.
TEXT runtime·mcall(SB),NOSPLIT,$-4-4
// Save caller state in g->sched.
- MOVW SP, (g_sched+gobuf_sp)(g)
+ MOVW R13, (g_sched+gobuf_sp)(g)
MOVW LR, (g_sched+gobuf_pc)(g)
MOVW $0, R11
MOVW R11, (g_sched+gobuf_lr)(g)
@@ -170,8 +170,8 @@
CMP $0, R11
BL.NE runtime·save_g(SB)
MOVW fn+0(FP), R0
- MOVW (g_sched+gobuf_sp)(g), SP
- SUB $8, SP
+ MOVW (g_sched+gobuf_sp)(g), R13
+ SUB $8, R13
MOVW R1, 4(SP)
MOVW R0, R7
MOVW 0(R0), R0
@@ -217,7 +217,7 @@
MOVW $runtime·systemstack_switch(SB), R3
ADD $4, R3, R3 // get past push {lr}
MOVW R3, (g_sched+gobuf_pc)(g)
- MOVW SP, (g_sched+gobuf_sp)(g)
+ MOVW R13, (g_sched+gobuf_sp)(g)
MOVW LR, (g_sched+gobuf_lr)(g)
MOVW g, (g_sched+gobuf_g)(g)
@@ -231,7 +231,7 @@
SUB $4, R3, R3
MOVW $runtime·mstart(SB), R4
MOVW R4, 0(R3)
- MOVW R3, SP
+ MOVW R3, R13
// call target function
MOVW R0, R7
@@ -242,7 +242,7 @@
MOVW g_m(g), R1
MOVW m_curg(R1), R0
BL setg<>(SB)
- MOVW (g_sched+gobuf_sp)(g), SP
+ MOVW (g_sched+gobuf_sp)(g), R13
MOVW $0, R3
MOVW R3, (g_sched+gobuf_sp)(g)
RET
@@ -284,21 +284,21 @@
// Called from f.
// Set g->sched to context in f.
MOVW R7, (g_sched+gobuf_ctxt)(g)
- MOVW SP, (g_sched+gobuf_sp)(g)
+ MOVW R13, (g_sched+gobuf_sp)(g)
MOVW LR, (g_sched+gobuf_pc)(g)
MOVW R3, (g_sched+gobuf_lr)(g)
// Called from f.
// Set m->morebuf to f's caller.
MOVW R3, (m_morebuf+gobuf_pc)(R8) // f's caller's PC
- MOVW SP, (m_morebuf+gobuf_sp)(R8) // f's caller's SP
+ MOVW R13, (m_morebuf+gobuf_sp)(R8) // f's caller's SP
MOVW $4(SP), R3 // f's argument pointer
MOVW g, (m_morebuf+gobuf_g)(R8)
// Call newstack on m->g0's stack.
MOVW m_g0(R8), R0
BL setg<>(SB)
- MOVW (g_sched+gobuf_sp)(g), SP
+ MOVW (g_sched+gobuf_sp)(g), R13
BL runtime·newstack(SB)
// Not reached, but make sure the return PC from the call to newstack
@@ -362,7 +362,7 @@
/* copy arguments to stack */ \
MOVW argptr+8(FP), R0; \
MOVW argsize+12(FP), R2; \
- ADD $4, SP, R1; \
+ ADD $4, R13, R1; \
CMP $0, R2; \
B.EQ 5(PC); \
MOVBU.P 1(R0), R5; \
@@ -378,7 +378,7 @@
MOVW argptr+8(FP), R0; \
MOVW argsize+12(FP), R2; \
MOVW retoffset+16(FP), R3; \
- ADD $4, SP, R1; \
+ ADD $4, R13, R1; \
ADD R3, R1; \
ADD R3, R0; \
SUB R3, R2; \
@@ -443,8 +443,8 @@
MOVW 0(SP), LR
MOVW $-4(LR), LR // BL deferreturn
MOVW fv+0(FP), R7
- MOVW argp+4(FP), SP
- MOVW $-4(SP), SP // SP is 4 below argp, due to saved LR
+ MOVW argp+4(FP), R13
+ MOVW $-4(R13), R13 // SP is 4 below argp, due to saved LR
MOVW 0(R7), R1
B (R1)
diff --git a/src/runtime/memclr_arm.s b/src/runtime/memclr_arm.s
index 1824d33..8b5fe31 100644
--- a/src/runtime/memclr_arm.s
+++ b/src/runtime/memclr_arm.s
@@ -25,31 +25,31 @@
#include "textflag.h"
-TO = 8
-TOE = 11
-N = 12
-TMP = 12 /* N and TMP don't overlap */
+#define TO R8
+#define TOE R11
+#define N R12
+#define TMP R12 /* N and TMP don't overlap */
TEXT runtime·memclr(SB),NOSPLIT,$0-8
- MOVW ptr+0(FP), R(TO)
- MOVW n+4(FP), R(N)
- MOVW $0, R(0)
+ MOVW ptr+0(FP), TO
+ MOVW n+4(FP), N
+ MOVW $0, R0
- ADD R(N), R(TO), R(TOE) /* to end pointer */
+ ADD N, TO, TOE /* to end pointer */
- CMP $4, R(N) /* need at least 4 bytes to copy */
+ CMP $4, N /* need at least 4 bytes to copy */
BLT _1tail
_4align: /* align on 4 */
- AND.S $3, R(TO), R(TMP)
+ AND.S $3, TO, TMP
BEQ _4aligned
- MOVBU.P R(0), 1(R(TO)) /* implicit write back */
+ MOVBU.P R0, 1(TO) /* implicit write back */
B _4align
_4aligned:
- SUB $31, R(TOE), R(TMP) /* do 32-byte chunks if possible */
- CMP R(TMP), R(TO)
+ SUB $31, TOE, TMP /* do 32-byte chunks if possible */
+ CMP TMP, TO
BHS _4tail
MOVW R0, R1 /* replicate */
@@ -61,26 +61,26 @@
MOVW R0, R7
_f32loop:
- CMP R(TMP), R(TO)
+ CMP TMP, TO
BHS _4tail
- MOVM.IA.W [R0-R7], (R(TO))
+ MOVM.IA.W [R0-R7], (TO)
B _f32loop
_4tail:
- SUB $3, R(TOE), R(TMP) /* do remaining words if possible */
+ SUB $3, TOE, TMP /* do remaining words if possible */
_4loop:
- CMP R(TMP), R(TO)
+ CMP TMP, TO
BHS _1tail
- MOVW.P R(0), 4(R(TO)) /* implicit write back */
+ MOVW.P R0, 4(TO) /* implicit write back */
B _4loop
_1tail:
- CMP R(TO), R(TOE)
+ CMP TO, TOE
BEQ _return
- MOVBU.P R(0), 1(R(TO)) /* implicit write back */
+ MOVBU.P R0, 1(TO) /* implicit write back */
B _1tail
_return:
diff --git a/src/runtime/memmove_arm.s b/src/runtime/memmove_arm.s
index f187d42..35f04a8 100644
--- a/src/runtime/memmove_arm.s
+++ b/src/runtime/memmove_arm.s
@@ -26,138 +26,138 @@
#include "textflag.h"
// TE or TS are spilled to the stack during bulk register moves.
-TS = 0
-TE = 8
+#define TS R0
+#define TE R8
// Warning: the linker will use R11 to synthesize certain instructions. Please
// take care and double check with objdump.
-FROM = 11
-N = 12
-TMP = 12 /* N and TMP don't overlap */
-TMP1 = 5
+#define FROM R11
+#define N R12
+#define TMP R12 /* N and TMP don't overlap */
+#define TMP1 R5
-RSHIFT = 5
-LSHIFT = 6
-OFFSET = 7
+#define RSHIFT R5
+#define LSHIFT R6
+#define OFFSET R7
-BR0 = 0 /* shared with TS */
-BW0 = 1
-BR1 = 1
-BW1 = 2
-BR2 = 2
-BW2 = 3
-BR3 = 3
-BW3 = 4
+#define BR0 R0 /* shared with TS */
+#define BW0 R1
+#define BR1 R1
+#define BW1 R2
+#define BR2 R2
+#define BW2 R3
+#define BR3 R3
+#define BW3 R4
-FW0 = 1
-FR0 = 2
-FW1 = 2
-FR1 = 3
-FW2 = 3
-FR2 = 4
-FW3 = 4
-FR3 = 8 /* shared with TE */
+#define FW0 R1
+#define FR0 R2
+#define FW1 R2
+#define FR1 R3
+#define FW2 R3
+#define FR2 R4
+#define FW3 R4
+#define FR3 R8 /* shared with TE */
TEXT runtime·memmove(SB), NOSPLIT, $4-12
_memmove:
- MOVW to+0(FP), R(TS)
- MOVW from+4(FP), R(FROM)
- MOVW n+8(FP), R(N)
+ MOVW to+0(FP), TS
+ MOVW from+4(FP), FROM
+ MOVW n+8(FP), N
- ADD R(N), R(TS), R(TE) /* to end pointer */
+ ADD N, TS, TE /* to end pointer */
- CMP R(FROM), R(TS)
+ CMP FROM, TS
BLS _forward
_back:
- ADD R(N), R(FROM) /* from end pointer */
- CMP $4, R(N) /* need at least 4 bytes to copy */
+ ADD N, FROM /* from end pointer */
+ CMP $4, N /* need at least 4 bytes to copy */
BLT _b1tail
_b4align: /* align destination on 4 */
- AND.S $3, R(TE), R(TMP)
+ AND.S $3, TE, TMP
BEQ _b4aligned
- MOVBU.W -1(R(FROM)), R(TMP) /* pre-indexed */
- MOVBU.W R(TMP), -1(R(TE)) /* pre-indexed */
+ MOVBU.W -1(FROM), TMP /* pre-indexed */
+ MOVBU.W TMP, -1(TE) /* pre-indexed */
B _b4align
_b4aligned: /* is source now aligned? */
- AND.S $3, R(FROM), R(TMP)
+ AND.S $3, FROM, TMP
BNE _bunaligned
- ADD $31, R(TS), R(TMP) /* do 32-byte chunks if possible */
- MOVW R(TS), savedts-4(SP)
+ ADD $31, TS, TMP /* do 32-byte chunks if possible */
+ MOVW TS, savedts-4(SP)
_b32loop:
- CMP R(TMP), R(TE)
+ CMP TMP, TE
BLS _b4tail
- MOVM.DB.W (R(FROM)), [R0-R7]
- MOVM.DB.W [R0-R7], (R(TE))
+ MOVM.DB.W (FROM), [R0-R7]
+ MOVM.DB.W [R0-R7], (TE)
B _b32loop
_b4tail: /* do remaining words if possible */
- MOVW savedts-4(SP), R(TS)
- ADD $3, R(TS), R(TMP)
+ MOVW savedts-4(SP), TS
+ ADD $3, TS, TMP
_b4loop:
- CMP R(TMP), R(TE)
+ CMP TMP, TE
BLS _b1tail
- MOVW.W -4(R(FROM)), R(TMP1) /* pre-indexed */
- MOVW.W R(TMP1), -4(R(TE)) /* pre-indexed */
+ MOVW.W -4(FROM), TMP1 /* pre-indexed */
+ MOVW.W TMP1, -4(TE) /* pre-indexed */
B _b4loop
_b1tail: /* remaining bytes */
- CMP R(TE), R(TS)
+ CMP TE, TS
BEQ _return
- MOVBU.W -1(R(FROM)), R(TMP) /* pre-indexed */
- MOVBU.W R(TMP), -1(R(TE)) /* pre-indexed */
+ MOVBU.W -1(FROM), TMP /* pre-indexed */
+ MOVBU.W TMP, -1(TE) /* pre-indexed */
B _b1tail
_forward:
- CMP $4, R(N) /* need at least 4 bytes to copy */
+ CMP $4, N /* need at least 4 bytes to copy */
BLT _f1tail
_f4align: /* align destination on 4 */
- AND.S $3, R(TS), R(TMP)
+ AND.S $3, TS, TMP
BEQ _f4aligned
- MOVBU.P 1(R(FROM)), R(TMP) /* implicit write back */
- MOVBU.P R(TMP), 1(R(TS)) /* implicit write back */
+ MOVBU.P 1(FROM), TMP /* implicit write back */
+ MOVBU.P TMP, 1(TS) /* implicit write back */
B _f4align
_f4aligned: /* is source now aligned? */
- AND.S $3, R(FROM), R(TMP)
+ AND.S $3, FROM, TMP
BNE _funaligned
- SUB $31, R(TE), R(TMP) /* do 32-byte chunks if possible */
- MOVW R(TE), savedte-4(SP)
+ SUB $31, TE, TMP /* do 32-byte chunks if possible */
+ MOVW TE, savedte-4(SP)
_f32loop:
- CMP R(TMP), R(TS)
+ CMP TMP, TS
BHS _f4tail
- MOVM.IA.W (R(FROM)), [R1-R8]
- MOVM.IA.W [R1-R8], (R(TS))
+ MOVM.IA.W (FROM), [R1-R8]
+ MOVM.IA.W [R1-R8], (TS)
B _f32loop
_f4tail:
- MOVW savedte-4(SP), R(TE)
- SUB $3, R(TE), R(TMP) /* do remaining words if possible */
+ MOVW savedte-4(SP), TE
+ SUB $3, TE, TMP /* do remaining words if possible */
_f4loop:
- CMP R(TMP), R(TS)
+ CMP TMP, TS
BHS _f1tail
- MOVW.P 4(R(FROM)), R(TMP1) /* implicit write back */
- MOVW.P R(TMP1), 4(R(TS)) /* implicit write back */
+ MOVW.P 4(FROM), TMP1 /* implicit write back */
+ MOVW.P TMP1, 4(TS) /* implicit write back */
B _f4loop
_f1tail:
- CMP R(TS), R(TE)
+ CMP TS, TE
BEQ _return
- MOVBU.P 1(R(FROM)), R(TMP) /* implicit write back */
- MOVBU.P R(TMP), 1(R(TS)) /* implicit write back */
+ MOVBU.P 1(FROM), TMP /* implicit write back */
+ MOVBU.P TMP, 1(TS) /* implicit write back */
B _f1tail
_return:
@@ -165,97 +165,97 @@
RET
_bunaligned:
- CMP $2, R(TMP) /* is R(TMP) < 2 ? */
+ CMP $2, TMP /* is TMP < 2 ? */
- MOVW.LT $8, R(RSHIFT) /* (R(n)<<24)|(R(n-1)>>8) */
- MOVW.LT $24, R(LSHIFT)
- MOVW.LT $1, R(OFFSET)
+ MOVW.LT $8, RSHIFT /* (R(n)<<24)|(R(n-1)>>8) */
+ MOVW.LT $24, LSHIFT
+ MOVW.LT $1, OFFSET
- MOVW.EQ $16, R(RSHIFT) /* (R(n)<<16)|(R(n-1)>>16) */
- MOVW.EQ $16, R(LSHIFT)
- MOVW.EQ $2, R(OFFSET)
+ MOVW.EQ $16, RSHIFT /* (R(n)<<16)|(R(n-1)>>16) */
+ MOVW.EQ $16, LSHIFT
+ MOVW.EQ $2, OFFSET
- MOVW.GT $24, R(RSHIFT) /* (R(n)<<8)|(R(n-1)>>24) */
- MOVW.GT $8, R(LSHIFT)
- MOVW.GT $3, R(OFFSET)
+ MOVW.GT $24, RSHIFT /* (R(n)<<8)|(R(n-1)>>24) */
+ MOVW.GT $8, LSHIFT
+ MOVW.GT $3, OFFSET
- ADD $16, R(TS), R(TMP) /* do 16-byte chunks if possible */
- CMP R(TMP), R(TE)
+ ADD $16, TS, TMP /* do 16-byte chunks if possible */
+ CMP TMP, TE
BLS _b1tail
- BIC $3, R(FROM) /* align source */
- MOVW R(TS), savedts-4(SP)
- MOVW (R(FROM)), R(BR0) /* prime first block register */
+ BIC $3, FROM /* align source */
+ MOVW TS, savedts-4(SP)
+ MOVW (FROM), BR0 /* prime first block register */
_bu16loop:
- CMP R(TMP), R(TE)
+ CMP TMP, TE
BLS _bu1tail
- MOVW R(BR0)<<R(LSHIFT), R(BW3)
- MOVM.DB.W (R(FROM)), [R(BR0)-R(BR3)]
- ORR R(BR3)>>R(RSHIFT), R(BW3)
+ MOVW BR0<<LSHIFT, BW3
+ MOVM.DB.W (FROM), [BR0-BR3]
+ ORR BR3>>RSHIFT, BW3
- MOVW R(BR3)<<R(LSHIFT), R(BW2)
- ORR R(BR2)>>R(RSHIFT), R(BW2)
+ MOVW BR3<<LSHIFT, BW2
+ ORR BR2>>RSHIFT, BW2
- MOVW R(BR2)<<R(LSHIFT), R(BW1)
- ORR R(BR1)>>R(RSHIFT), R(BW1)
+ MOVW BR2<<LSHIFT, BW1
+ ORR BR1>>RSHIFT, BW1
- MOVW R(BR1)<<R(LSHIFT), R(BW0)
- ORR R(BR0)>>R(RSHIFT), R(BW0)
+ MOVW BR1<<LSHIFT, BW0
+ ORR BR0>>RSHIFT, BW0
- MOVM.DB.W [R(BW0)-R(BW3)], (R(TE))
+ MOVM.DB.W [BW0-BW3], (TE)
B _bu16loop
_bu1tail:
- MOVW savedts-4(SP), R(TS)
- ADD R(OFFSET), R(FROM)
+ MOVW savedts-4(SP), TS
+ ADD OFFSET, FROM
B _b1tail
_funaligned:
- CMP $2, R(TMP)
+ CMP $2, TMP
- MOVW.LT $8, R(RSHIFT) /* (R(n+1)<<24)|(R(n)>>8) */
- MOVW.LT $24, R(LSHIFT)
- MOVW.LT $3, R(OFFSET)
+ MOVW.LT $8, RSHIFT /* (R(n+1)<<24)|(R(n)>>8) */
+ MOVW.LT $24, LSHIFT
+ MOVW.LT $3, OFFSET
- MOVW.EQ $16, R(RSHIFT) /* (R(n+1)<<16)|(R(n)>>16) */
- MOVW.EQ $16, R(LSHIFT)
- MOVW.EQ $2, R(OFFSET)
+ MOVW.EQ $16, RSHIFT /* (R(n+1)<<16)|(R(n)>>16) */
+ MOVW.EQ $16, LSHIFT
+ MOVW.EQ $2, OFFSET
- MOVW.GT $24, R(RSHIFT) /* (R(n+1)<<8)|(R(n)>>24) */
- MOVW.GT $8, R(LSHIFT)
- MOVW.GT $1, R(OFFSET)
+ MOVW.GT $24, RSHIFT /* (R(n+1)<<8)|(R(n)>>24) */
+ MOVW.GT $8, LSHIFT
+ MOVW.GT $1, OFFSET
- SUB $16, R(TE), R(TMP) /* do 16-byte chunks if possible */
- CMP R(TMP), R(TS)
+ SUB $16, TE, TMP /* do 16-byte chunks if possible */
+ CMP TMP, TS
BHS _f1tail
- BIC $3, R(FROM) /* align source */
- MOVW R(TE), savedte-4(SP)
- MOVW.P 4(R(FROM)), R(FR3) /* prime last block register, implicit write back */
+ BIC $3, FROM /* align source */
+ MOVW TE, savedte-4(SP)
+ MOVW.P 4(FROM), FR3 /* prime last block register, implicit write back */
_fu16loop:
- CMP R(TMP), R(TS)
+ CMP TMP, TS
BHS _fu1tail
- MOVW R(FR3)>>R(RSHIFT), R(FW0)
- MOVM.IA.W (R(FROM)), [R(FR0),R(FR1),R(FR2),R(FR3)]
- ORR R(FR0)<<R(LSHIFT), R(FW0)
+ MOVW FR3>>RSHIFT, FW0
+ MOVM.IA.W (FROM), [FR0,FR1,FR2,FR3]
+ ORR FR0<<LSHIFT, FW0
- MOVW R(FR0)>>R(RSHIFT), R(FW1)
- ORR R(FR1)<<R(LSHIFT), R(FW1)
+ MOVW FR0>>RSHIFT, FW1
+ ORR FR1<<LSHIFT, FW1
- MOVW R(FR1)>>R(RSHIFT), R(FW2)
- ORR R(FR2)<<R(LSHIFT), R(FW2)
+ MOVW FR1>>RSHIFT, FW2
+ ORR FR2<<LSHIFT, FW2
- MOVW R(FR2)>>R(RSHIFT), R(FW3)
- ORR R(FR3)<<R(LSHIFT), R(FW3)
+ MOVW FR2>>RSHIFT, FW3
+ ORR FR3<<LSHIFT, FW3
- MOVM.IA.W [R(FW0),R(FW1),R(FW2),R(FW3)], (R(TS))
+ MOVM.IA.W [FW0,FW1,FW2,FW3], (TS)
B _fu16loop
_fu1tail:
- MOVW savedte-4(SP), R(TE)
- SUB R(OFFSET), R(FROM)
+ MOVW savedte-4(SP), TE
+ SUB OFFSET, FROM
B _f1tail
diff --git a/src/runtime/rt0_linux_arm.s b/src/runtime/rt0_linux_arm.s
index 15a57cb..15c1092 100644
--- a/src/runtime/rt0_linux_arm.s
+++ b/src/runtime/rt0_linux_arm.s
@@ -77,7 +77,7 @@
GLOBL bad_abi_msg(SB), RODATA, $45
TEXT oabi_syscall<>(SB),NOSPLIT,$-4
- ADD $1, PC, R4
+ ADD $1, R15, R4 // R15 is hardware PC
WORD $0xe12fff14 //BX (R4) // enter thumb mode
// TODO(minux): only supports little-endian CPUs
WORD $0x4770df01 // swi $1; bx lr
diff --git a/src/runtime/sys_linux_arm.s b/src/runtime/sys_linux_arm.s
index bf0c810..b0a9b4f 100644
--- a/src/runtime/sys_linux_arm.s
+++ b/src/runtime/sys_linux_arm.s
@@ -383,7 +383,7 @@
// Use kernel version instead of native armcas in asm_arm.s.
// See ../sync/atomic/asm_linux_arm.s for details.
TEXT cas<>(SB),NOSPLIT,$0
- MOVW $0xffff0fc0, PC
+ MOVW $0xffff0fc0, R15 // R15 is hardware PC.
TEXT runtime·cas(SB),NOSPLIT,$0
MOVW ptr+0(FP), R2
diff --git a/src/runtime/vlop_arm.s b/src/runtime/vlop_arm.s
index 5354bf9..28f7519 100644
--- a/src/runtime/vlop_arm.s
+++ b/src/runtime/vlop_arm.s
@@ -27,8 +27,6 @@
#include "go_tls.h"
#include "textflag.h"
-arg=0
-
/* replaced use of R10 by R11 because the former can be the data segment base register */
TEXT _mulv(SB), NOSPLIT, $0
@@ -111,70 +109,71 @@
// Reference:
// Sloss, Andrew et. al; ARM System Developer's Guide: Designing and Optimizing System Software
// Morgan Kaufmann; 1 edition (April 8, 2004), ISBN 978-1558608740
-q = 0 // input d, output q
-r = 1 // input n, output r
-s = 2 // three temporary variables
-M = 3
-a = 11
-// Be careful: R(a) == R11 will be used by the linker for synthesized instructions.
-TEXT udiv<>(SB),NOSPLIT,$-4
- CLZ R(q), R(s) // find normalizing shift
- MOVW.S R(q)<<R(s), R(a)
- MOVW $fast_udiv_tab<>-64(SB), R(M)
- ADD.NE R(a)>>25, R(M), R(a) // index by most significant 7 bits of divisor
- MOVBU.NE (R(a)), R(a)
+#define Rq R0 // input d, output q
+#define Rr R1 // input n, output r
+#define Rs R2 // three temporary variables
+#define RM R3
+#define Ra R11
- SUB.S $7, R(s)
- RSB $0, R(q), R(M) // M = -q
- MOVW.PL R(a)<<R(s), R(q)
+// Be careful: Ra == R11 will be used by the linker for synthesized instructions.
+TEXT udiv<>(SB),NOSPLIT,$-4
+ CLZ Rq, Rs // find normalizing shift
+ MOVW.S Rq<<Rs, Ra
+ MOVW $fast_udiv_tab<>-64(SB), RM
+ ADD.NE Ra>>25, RM, Ra // index by most significant 7 bits of divisor
+ MOVBU.NE (Ra), Ra
+
+ SUB.S $7, Rs
+ RSB $0, Rq, RM // M = -q
+ MOVW.PL Ra<<Rs, Rq
// 1st Newton iteration
- MUL.PL R(M), R(q), R(a) // a = -q*d
+ MUL.PL RM, Rq, Ra // a = -q*d
BMI udiv_by_large_d
- MULAWT R(a), R(q), R(q), R(q) // q approx q-(q*q*d>>32)
- TEQ R(M)->1, R(M) // check for d=0 or d=1
+ MULAWT Ra, Rq, Rq, Rq // q approx q-(q*q*d>>32)
+ TEQ RM->1, RM // check for d=0 or d=1
// 2nd Newton iteration
- MUL.NE R(M), R(q), R(a)
- MOVW.NE $0, R(s)
- MULAL.NE R(q), R(a), (R(q),R(s))
+ MUL.NE RM, Rq, Ra
+ MOVW.NE $0, Rs
+ MULAL.NE Rq, Ra, (Rq,Rs)
BEQ udiv_by_0_or_1
// q now accurate enough for a remainder r, 0<=r<3*d
- MULLU R(q), R(r), (R(q),R(s)) // q = (r * q) >> 32
- ADD R(M), R(r), R(r) // r = n - d
- MULA R(M), R(q), R(r), R(r) // r = n - (q+1)*d
+ MULLU Rq, Rr, (Rq,Rs) // q = (r * q) >> 32
+ ADD RM, Rr, Rr // r = n - d
+ MULA RM, Rq, Rr, Rr // r = n - (q+1)*d
// since 0 <= n-q*d < 3*d; thus -d <= r < 2*d
- CMN R(M), R(r) // t = r-d
- SUB.CS R(M), R(r), R(r) // if (t<-d || t>=0) r=r+d
- ADD.CC $1, R(q)
- ADD.PL R(M)<<1, R(r)
- ADD.PL $2, R(q)
+ CMN RM, Rr // t = r-d
+ SUB.CS RM, Rr, Rr // if (t<-d || t>=0) r=r+d
+ ADD.CC $1, Rq
+ ADD.PL RM<<1, Rr
+ ADD.PL $2, Rq
RET
udiv_by_large_d:
// at this point we know d>=2^(31-6)=2^25
- SUB $4, R(a), R(a)
- RSB $0, R(s), R(s)
- MOVW R(a)>>R(s), R(q)
- MULLU R(q), R(r), (R(q),R(s))
- MULA R(M), R(q), R(r), R(r)
+ SUB $4, Ra, Ra
+ RSB $0, Rs, Rs
+ MOVW Ra>>Rs, Rq
+ MULLU Rq, Rr, (Rq,Rs)
+ MULA RM, Rq, Rr, Rr
// q now accurate enough for a remainder r, 0<=r<4*d
- CMN R(r)>>1, R(M) // if(r/2 >= d)
- ADD.CS R(M)<<1, R(r)
- ADD.CS $2, R(q)
- CMN R(r), R(M)
- ADD.CS R(M), R(r)
- ADD.CS $1, R(q)
+ CMN Rr>>1, RM // if(r/2 >= d)
+ ADD.CS RM<<1, Rr
+ ADD.CS $2, Rq
+ CMN Rr, RM
+ ADD.CS RM, Rr
+ ADD.CS $1, Rq
RET
udiv_by_0_or_1:
// carry set if d==1, carry clear if d==0
BCC udiv_by_0
- MOVW R(r), R(q)
- MOVW $0, R(r)
+ MOVW Rr, Rq
+ MOVW $0, Rr
RET
udiv_by_0:
@@ -216,96 +215,96 @@
DATA fast_udiv_tab<>+0x3c(SB)/4, $0x81828384
GLOBL fast_udiv_tab<>(SB), RODATA, $64
-// The linker will pass numerator in R(TMP), and it also
-// expects the result in R(TMP)
-TMP = 11
+// The linker will pass numerator in RTMP, and it also
+// expects the result in RTMP
+#define RTMP R11
TEXT _divu(SB), NOSPLIT, $16
- MOVW R(q), 4(R13)
- MOVW R(r), 8(R13)
- MOVW R(s), 12(R13)
- MOVW R(M), 16(R13)
+ MOVW Rq, 4(R13)
+ MOVW Rr, 8(R13)
+ MOVW Rs, 12(R13)
+ MOVW RM, 16(R13)
- MOVW R(TMP), R(r) /* numerator */
- MOVW 0(FP), R(q) /* denominator */
+ MOVW RTMP, Rr /* numerator */
+ MOVW 0(FP), Rq /* denominator */
BL udiv<>(SB)
- MOVW R(q), R(TMP)
- MOVW 4(R13), R(q)
- MOVW 8(R13), R(r)
- MOVW 12(R13), R(s)
- MOVW 16(R13), R(M)
+ MOVW Rq, RTMP
+ MOVW 4(R13), Rq
+ MOVW 8(R13), Rr
+ MOVW 12(R13), Rs
+ MOVW 16(R13), RM
RET
TEXT _modu(SB), NOSPLIT, $16
- MOVW R(q), 4(R13)
- MOVW R(r), 8(R13)
- MOVW R(s), 12(R13)
- MOVW R(M), 16(R13)
+ MOVW Rq, 4(R13)
+ MOVW Rr, 8(R13)
+ MOVW Rs, 12(R13)
+ MOVW RM, 16(R13)
- MOVW R(TMP), R(r) /* numerator */
- MOVW 0(FP), R(q) /* denominator */
+ MOVW RTMP, Rr /* numerator */
+ MOVW 0(FP), Rq /* denominator */
BL udiv<>(SB)
- MOVW R(r), R(TMP)
- MOVW 4(R13), R(q)
- MOVW 8(R13), R(r)
- MOVW 12(R13), R(s)
- MOVW 16(R13), R(M)
+ MOVW Rr, RTMP
+ MOVW 4(R13), Rq
+ MOVW 8(R13), Rr
+ MOVW 12(R13), Rs
+ MOVW 16(R13), RM
RET
TEXT _div(SB),NOSPLIT,$16
- MOVW R(q), 4(R13)
- MOVW R(r), 8(R13)
- MOVW R(s), 12(R13)
- MOVW R(M), 16(R13)
- MOVW R(TMP), R(r) /* numerator */
- MOVW 0(FP), R(q) /* denominator */
- CMP $0, R(r)
+ MOVW Rq, 4(R13)
+ MOVW Rr, 8(R13)
+ MOVW Rs, 12(R13)
+ MOVW RM, 16(R13)
+ MOVW RTMP, Rr /* numerator */
+ MOVW 0(FP), Rq /* denominator */
+ CMP $0, Rr
BGE d1
- RSB $0, R(r), R(r)
- CMP $0, R(q)
+ RSB $0, Rr, Rr
+ CMP $0, Rq
BGE d2
- RSB $0, R(q), R(q)
+ RSB $0, Rq, Rq
d0:
BL udiv<>(SB) /* none/both neg */
- MOVW R(q), R(TMP)
+ MOVW Rq, RTMP
B out1
d1:
- CMP $0, R(q)
+ CMP $0, Rq
BGE d0
- RSB $0, R(q), R(q)
+ RSB $0, Rq, Rq
d2:
BL udiv<>(SB) /* one neg */
- RSB $0, R(q), R(TMP)
+ RSB $0, Rq, RTMP
out1:
- MOVW 4(R13), R(q)
- MOVW 8(R13), R(r)
- MOVW 12(R13), R(s)
- MOVW 16(R13), R(M)
+ MOVW 4(R13), Rq
+ MOVW 8(R13), Rr
+ MOVW 12(R13), Rs
+ MOVW 16(R13), RM
RET
TEXT _mod(SB),NOSPLIT,$16
- MOVW R(q), 4(R13)
- MOVW R(r), 8(R13)
- MOVW R(s), 12(R13)
- MOVW R(M), 16(R13)
- MOVW R(TMP), R(r) /* numerator */
- MOVW 0(FP), R(q) /* denominator */
- CMP $0, R(q)
- RSB.LT $0, R(q), R(q)
- CMP $0, R(r)
+ MOVW Rq, 4(R13)
+ MOVW Rr, 8(R13)
+ MOVW Rs, 12(R13)
+ MOVW RM, 16(R13)
+ MOVW RTMP, Rr /* numerator */
+ MOVW 0(FP), Rq /* denominator */
+ CMP $0, Rq
+ RSB.LT $0, Rq, Rq
+ CMP $0, Rr
BGE m1
- RSB $0, R(r), R(r)
+ RSB $0, Rr, Rr
BL udiv<>(SB) /* neg numerator */
- RSB $0, R(r), R(TMP)
+ RSB $0, Rr, RTMP
B out
m1:
BL udiv<>(SB) /* pos numerator */
- MOVW R(r), R(TMP)
+ MOVW Rr, RTMP
out:
- MOVW 4(R13), R(q)
- MOVW 8(R13), R(r)
- MOVW 12(R13), R(s)
- MOVW 16(R13), R(M)
+ MOVW 4(R13), Rq
+ MOVW 8(R13), Rr
+ MOVW 12(R13), Rs
+ MOVW 16(R13), RM
RET
// _mul64by32 and _div64by32 not implemented on arm