[dev.cc] all: edit assembly source for ARM to be more regular

Several .s files for ARM used constructs that the new assembler will not
support. These include (a condensed example follows the list):

- mentioning SP or PC as a hardware register
	These names denote pseudo-registers, except that in some contexts
	they denote the hardware registers, and that is confusing: the
	context should not affect which register you mean. Change the
	references to the hardware registers to be explicit: R13 for SP,
	R15 for PC.
- constant creation using assignment
	The files say a=b when they could instead say #define a b.
	There is no reason to have both mechanisms.
- R(0) to refer to R0
	Some macros use this form extensively. Again, it's easy just to
	use a #define to rename a register.
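
To make the rewrite concrete, here is a condensed sketch combining all
three changes. The fragments are modeled on the memclr_arm.s and
asm_arm.s hunks below rather than taken verbatim from any one of them:

	// Old assembler (no longer supported):
	N = 12				/* constant created by assignment */
	MOVW	n+4(FP), R(N)		/* R(n) indirection for R12 */
	MOVW	SP, gobuf_sp(R0)	/* SP naming the hardware register */

	// New assembler:
	#define N	R12		/* #define instead of assignment */
	MOVW	n+4(FP), N		/* plain register name */
	MOVW	R13, gobuf_sp(R0)	/* R13 is the hardware SP */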

Change-Id: I002335ace8e876c5b63c71c2560533eb835346d2
Reviewed-on: https://go-review.googlesource.com/4822
Reviewed-by: Dave Cheney <dave@cheney.net>
diff --git a/src/runtime/asm_arm.s b/src/runtime/asm_arm.s
index 2efeaaa..cd81c25 100644
--- a/src/runtime/asm_arm.s
+++ b/src/runtime/asm_arm.s
@@ -107,7 +107,7 @@
 // save state in Gobuf; setjmp
 TEXT runtime·gosave(SB),NOSPLIT,$-4-4
 	MOVW	0(FP), R0		// gobuf
-	MOVW	SP, gobuf_sp(R0)
+	MOVW	R13, gobuf_sp(R0)
 	MOVW	LR, gobuf_pc(R0)
 	MOVW	g, gobuf_g(R0)
 	MOVW	$0, R11
@@ -133,7 +133,7 @@
 	// after this point: it must be straight-line code until the
 	// final B instruction.
 	// See large comment in sigprof for more details.
-	MOVW	gobuf_sp(R1), SP	// restore SP
+	MOVW	gobuf_sp(R1), R13	// restore SP==R13
 	MOVW	gobuf_lr(R1), LR
 	MOVW	gobuf_ret(R1), R0
 	MOVW	gobuf_ctxt(R1), R7
@@ -152,7 +152,7 @@
 // to keep running g.
 TEXT runtime·mcall(SB),NOSPLIT,$-4-4
 	// Save caller state in g->sched.
-	MOVW	SP, (g_sched+gobuf_sp)(g)
+	MOVW	R13, (g_sched+gobuf_sp)(g)
 	MOVW	LR, (g_sched+gobuf_pc)(g)
 	MOVW	$0, R11
 	MOVW	R11, (g_sched+gobuf_lr)(g)
@@ -170,8 +170,8 @@
 	CMP	$0, R11
 	BL.NE	runtime·save_g(SB)
 	MOVW	fn+0(FP), R0
-	MOVW	(g_sched+gobuf_sp)(g), SP
-	SUB	$8, SP
+	MOVW	(g_sched+gobuf_sp)(g), R13
+	SUB	$8, R13
 	MOVW	R1, 4(SP)
 	MOVW	R0, R7
 	MOVW	0(R0), R0
@@ -217,7 +217,7 @@
 	MOVW	$runtime·systemstack_switch(SB), R3
 	ADD	$4, R3, R3 // get past push {lr}
 	MOVW	R3, (g_sched+gobuf_pc)(g)
-	MOVW	SP, (g_sched+gobuf_sp)(g)
+	MOVW	R13, (g_sched+gobuf_sp)(g)
 	MOVW	LR, (g_sched+gobuf_lr)(g)
 	MOVW	g, (g_sched+gobuf_g)(g)
 
@@ -231,7 +231,7 @@
 	SUB	$4, R3, R3
 	MOVW	$runtime·mstart(SB), R4
 	MOVW	R4, 0(R3)
-	MOVW	R3, SP
+	MOVW	R3, R13
 
 	// call target function
 	MOVW	R0, R7
@@ -242,7 +242,7 @@
 	MOVW	g_m(g), R1
 	MOVW	m_curg(R1), R0
 	BL	setg<>(SB)
-	MOVW	(g_sched+gobuf_sp)(g), SP
+	MOVW	(g_sched+gobuf_sp)(g), R13
 	MOVW	$0, R3
 	MOVW	R3, (g_sched+gobuf_sp)(g)
 	RET
@@ -284,21 +284,21 @@
 	// Called from f.
 	// Set g->sched to context in f.
 	MOVW	R7, (g_sched+gobuf_ctxt)(g)
-	MOVW	SP, (g_sched+gobuf_sp)(g)
+	MOVW	R13, (g_sched+gobuf_sp)(g)
 	MOVW	LR, (g_sched+gobuf_pc)(g)
 	MOVW	R3, (g_sched+gobuf_lr)(g)
 
 	// Called from f.
 	// Set m->morebuf to f's caller.
 	MOVW	R3, (m_morebuf+gobuf_pc)(R8)	// f's caller's PC
-	MOVW	SP, (m_morebuf+gobuf_sp)(R8)	// f's caller's SP
+	MOVW	R13, (m_morebuf+gobuf_sp)(R8)	// f's caller's SP
 	MOVW	$4(SP), R3			// f's argument pointer
 	MOVW	g, (m_morebuf+gobuf_g)(R8)
 
 	// Call newstack on m->g0's stack.
 	MOVW	m_g0(R8), R0
 	BL	setg<>(SB)
-	MOVW	(g_sched+gobuf_sp)(g), SP
+	MOVW	(g_sched+gobuf_sp)(g), R13
 	BL	runtime·newstack(SB)
 
 	// Not reached, but make sure the return PC from the call to newstack
@@ -362,7 +362,7 @@
 	/* copy arguments to stack */		\
 	MOVW	argptr+8(FP), R0;		\
 	MOVW	argsize+12(FP), R2;		\
-	ADD	$4, SP, R1;			\
+	ADD	$4, R13, R1;			\
 	CMP	$0, R2;				\
 	B.EQ	5(PC);				\
 	MOVBU.P	1(R0), R5;			\
@@ -378,7 +378,7 @@
 	MOVW	argptr+8(FP), R0;		\
 	MOVW	argsize+12(FP), R2;		\
 	MOVW	retoffset+16(FP), R3;		\
-	ADD	$4, SP, R1;			\
+	ADD	$4, R13, R1;			\
 	ADD	R3, R1;				\
 	ADD	R3, R0;				\
 	SUB	R3, R2;				\
@@ -443,8 +443,8 @@
 	MOVW	0(SP), LR
 	MOVW	$-4(LR), LR	// BL deferreturn
 	MOVW	fv+0(FP), R7
-	MOVW	argp+4(FP), SP
-	MOVW	$-4(SP), SP	// SP is 4 below argp, due to saved LR
+	MOVW	argp+4(FP), R13
+	MOVW	$-4(R13), R13	// SP is 4 below argp, due to saved LR
 	MOVW	0(R7), R1
 	B	(R1)
 
diff --git a/src/runtime/memclr_arm.s b/src/runtime/memclr_arm.s
index 1824d33..8b5fe31 100644
--- a/src/runtime/memclr_arm.s
+++ b/src/runtime/memclr_arm.s
@@ -25,31 +25,31 @@
 
 #include "textflag.h"
 
-TO = 8
-TOE = 11
-N = 12
-TMP = 12				/* N and TMP don't overlap */
+#define TO	R8
+#define TOE	R11
+#define N	R12
+#define TMP	R12				/* N and TMP don't overlap */
 
 TEXT runtime·memclr(SB),NOSPLIT,$0-8
-	MOVW	ptr+0(FP), R(TO)
-	MOVW	n+4(FP), R(N)
-	MOVW	$0, R(0)
+	MOVW	ptr+0(FP), TO
+	MOVW	n+4(FP), N
+	MOVW	$0, R0
 
-	ADD	R(N), R(TO), R(TOE)	/* to end pointer */
+	ADD	N, TO, TOE	/* to end pointer */
 
-	CMP	$4, R(N)		/* need at least 4 bytes to copy */
+	CMP	$4, N		/* need at least 4 bytes to copy */
 	BLT	_1tail
 
 _4align:				/* align on 4 */
-	AND.S	$3, R(TO), R(TMP)
+	AND.S	$3, TO, TMP
 	BEQ	_4aligned
 
-	MOVBU.P	R(0), 1(R(TO))		/* implicit write back */
+	MOVBU.P	R0, 1(TO)		/* implicit write back */
 	B	_4align
 
 _4aligned:
-	SUB	$31, R(TOE), R(TMP)	/* do 32-byte chunks if possible */
-	CMP	R(TMP), R(TO)
+	SUB	$31, TOE, TMP	/* do 32-byte chunks if possible */
+	CMP	TMP, TO
 	BHS	_4tail
 
 	MOVW	R0, R1			/* replicate */
@@ -61,26 +61,26 @@
 	MOVW	R0, R7
 
 _f32loop:
-	CMP	R(TMP), R(TO)
+	CMP	TMP, TO
 	BHS	_4tail
 
-	MOVM.IA.W [R0-R7], (R(TO))
+	MOVM.IA.W [R0-R7], (TO)
 	B	_f32loop
 
 _4tail:
-	SUB	$3, R(TOE), R(TMP)	/* do remaining words if possible */
+	SUB	$3, TOE, TMP	/* do remaining words if possible */
 _4loop:
-	CMP	R(TMP), R(TO)
+	CMP	TMP, TO
 	BHS	_1tail
 
-	MOVW.P	R(0), 4(R(TO))		/* implicit write back */
+	MOVW.P	R0, 4(TO)		/* implicit write back */
 	B	_4loop
 
 _1tail:
-	CMP	R(TO), R(TOE)
+	CMP	TO, TOE
 	BEQ	_return
 
-	MOVBU.P	R(0), 1(R(TO))		/* implicit write back */
+	MOVBU.P	R0, 1(TO)		/* implicit write back */
 	B	_1tail
 
 _return:
diff --git a/src/runtime/memmove_arm.s b/src/runtime/memmove_arm.s
index f187d42..35f04a8 100644
--- a/src/runtime/memmove_arm.s
+++ b/src/runtime/memmove_arm.s
@@ -26,138 +26,138 @@
 #include "textflag.h"
 
 // TE or TS are spilled to the stack during bulk register moves.
-TS = 0
-TE = 8
+#define TS	R0
+#define TE	R8
 
 // Warning: the linker will use R11 to synthesize certain instructions. Please
 // take care and double check with objdump.
-FROM = 11
-N = 12
-TMP = 12				/* N and TMP don't overlap */
-TMP1 = 5
+#define FROM	R11
+#define N	R12
+#define TMP	R12				/* N and TMP don't overlap */
+#define TMP1	R5
 
-RSHIFT = 5
-LSHIFT = 6
-OFFSET = 7
+#define RSHIFT	R5
+#define LSHIFT	R6
+#define OFFSET	R7
 
-BR0 = 0					/* shared with TS */
-BW0 = 1
-BR1 = 1
-BW1 = 2
-BR2 = 2
-BW2 = 3
-BR3 = 3
-BW3 = 4
+#define BR0	R0					/* shared with TS */
+#define BW0	R1
+#define BR1	R1
+#define BW1	R2
+#define BR2	R2
+#define BW2	R3
+#define BR3	R3
+#define BW3	R4
 
-FW0 = 1
-FR0 = 2
-FW1 = 2
-FR1 = 3
-FW2 = 3
-FR2 = 4
-FW3 = 4
-FR3 = 8					/* shared with TE */
+#define FW0	R1
+#define FR0	R2
+#define FW1	R2
+#define FR1	R3
+#define FW2	R3
+#define FR2	R4
+#define FW3	R4
+#define FR3	R8					/* shared with TE */
 
 TEXT runtime·memmove(SB), NOSPLIT, $4-12
 _memmove:
-	MOVW	to+0(FP), R(TS)
-	MOVW	from+4(FP), R(FROM)
-	MOVW	n+8(FP), R(N)
+	MOVW	to+0(FP), TS
+	MOVW	from+4(FP), FROM
+	MOVW	n+8(FP), N
 
-	ADD	R(N), R(TS), R(TE)	/* to end pointer */
+	ADD	N, TS, TE	/* to end pointer */
 
-	CMP	R(FROM), R(TS)
+	CMP	FROM, TS
 	BLS	_forward
 
 _back:
-	ADD	R(N), R(FROM)		/* from end pointer */
-	CMP	$4, R(N)		/* need at least 4 bytes to copy */
+	ADD	N, FROM		/* from end pointer */
+	CMP	$4, N		/* need at least 4 bytes to copy */
 	BLT	_b1tail
 
 _b4align:				/* align destination on 4 */
-	AND.S	$3, R(TE), R(TMP)
+	AND.S	$3, TE, TMP
 	BEQ	_b4aligned
 
-	MOVBU.W	-1(R(FROM)), R(TMP)	/* pre-indexed */
-	MOVBU.W	R(TMP), -1(R(TE))	/* pre-indexed */
+	MOVBU.W	-1(FROM), TMP	/* pre-indexed */
+	MOVBU.W	TMP, -1(TE)	/* pre-indexed */
 	B	_b4align
 
 _b4aligned:				/* is source now aligned? */
-	AND.S	$3, R(FROM), R(TMP)
+	AND.S	$3, FROM, TMP
 	BNE	_bunaligned
 
-	ADD	$31, R(TS), R(TMP)	/* do 32-byte chunks if possible */
-	MOVW	R(TS), savedts-4(SP)
+	ADD	$31, TS, TMP	/* do 32-byte chunks if possible */
+	MOVW	TS, savedts-4(SP)
 _b32loop:
-	CMP	R(TMP), R(TE)
+	CMP	TMP, TE
 	BLS	_b4tail
 
-	MOVM.DB.W (R(FROM)), [R0-R7]
-	MOVM.DB.W [R0-R7], (R(TE))
+	MOVM.DB.W (FROM), [R0-R7]
+	MOVM.DB.W [R0-R7], (TE)
 	B	_b32loop
 
 _b4tail:				/* do remaining words if possible */
-	MOVW	savedts-4(SP), R(TS)
-	ADD	$3, R(TS), R(TMP)
+	MOVW	savedts-4(SP), TS
+	ADD	$3, TS, TMP
 _b4loop:
-	CMP	R(TMP), R(TE)
+	CMP	TMP, TE
 	BLS	_b1tail
 
-	MOVW.W	-4(R(FROM)), R(TMP1)	/* pre-indexed */
-	MOVW.W	R(TMP1), -4(R(TE))	/* pre-indexed */
+	MOVW.W	-4(FROM), TMP1	/* pre-indexed */
+	MOVW.W	TMP1, -4(TE)	/* pre-indexed */
 	B	_b4loop
 
 _b1tail:				/* remaining bytes */
-	CMP	R(TE), R(TS)
+	CMP	TE, TS
 	BEQ	_return
 
-	MOVBU.W	-1(R(FROM)), R(TMP)	/* pre-indexed */
-	MOVBU.W	R(TMP), -1(R(TE))	/* pre-indexed */
+	MOVBU.W	-1(FROM), TMP	/* pre-indexed */
+	MOVBU.W	TMP, -1(TE)	/* pre-indexed */
 	B	_b1tail
 
 _forward:
-	CMP	$4, R(N)		/* need at least 4 bytes to copy */
+	CMP	$4, N		/* need at least 4 bytes to copy */
 	BLT	_f1tail
 
 _f4align:				/* align destination on 4 */
-	AND.S	$3, R(TS), R(TMP)
+	AND.S	$3, TS, TMP
 	BEQ	_f4aligned
 
-	MOVBU.P	1(R(FROM)), R(TMP)	/* implicit write back */
-	MOVBU.P	R(TMP), 1(R(TS))	/* implicit write back */
+	MOVBU.P	1(FROM), TMP	/* implicit write back */
+	MOVBU.P	TMP, 1(TS)	/* implicit write back */
 	B	_f4align
 
 _f4aligned:				/* is source now aligned? */
-	AND.S	$3, R(FROM), R(TMP)
+	AND.S	$3, FROM, TMP
 	BNE	_funaligned
 
-	SUB	$31, R(TE), R(TMP)	/* do 32-byte chunks if possible */
-	MOVW	R(TE), savedte-4(SP)
+	SUB	$31, TE, TMP	/* do 32-byte chunks if possible */
+	MOVW	TE, savedte-4(SP)
 _f32loop:
-	CMP	R(TMP), R(TS)
+	CMP	TMP, TS
 	BHS	_f4tail
 
-	MOVM.IA.W (R(FROM)), [R1-R8] 
-	MOVM.IA.W [R1-R8], (R(TS))
+	MOVM.IA.W (FROM), [R1-R8] 
+	MOVM.IA.W [R1-R8], (TS)
 	B	_f32loop
 
 _f4tail:
-	MOVW	savedte-4(SP), R(TE)
-	SUB	$3, R(TE), R(TMP)	/* do remaining words if possible */
+	MOVW	savedte-4(SP), TE
+	SUB	$3, TE, TMP	/* do remaining words if possible */
 _f4loop:
-	CMP	R(TMP), R(TS)
+	CMP	TMP, TS
 	BHS	_f1tail
 
-	MOVW.P	4(R(FROM)), R(TMP1)	/* implicit write back */
-	MOVW.P	R(TMP1), 4(R(TS))	/* implicit write back */
+	MOVW.P	4(FROM), TMP1	/* implicit write back */
+	MOVW.P	TMP1, 4(TS)	/* implicit write back */
 	B	_f4loop
 
 _f1tail:
-	CMP	R(TS), R(TE)
+	CMP	TS, TE
 	BEQ	_return
 
-	MOVBU.P	1(R(FROM)), R(TMP)	/* implicit write back */
-	MOVBU.P	R(TMP), 1(R(TS))	/* implicit write back */
+	MOVBU.P	1(FROM), TMP	/* implicit write back */
+	MOVBU.P	TMP, 1(TS)	/* implicit write back */
 	B	_f1tail
 
 _return:
@@ -165,97 +165,97 @@
 	RET
 
 _bunaligned:
-	CMP	$2, R(TMP)		/* is R(TMP) < 2 ? */
+	CMP	$2, TMP		/* is TMP < 2 ? */
 
-	MOVW.LT	$8, R(RSHIFT)		/* (R(n)<<24)|(R(n-1)>>8) */
-	MOVW.LT	$24, R(LSHIFT)
-	MOVW.LT	$1, R(OFFSET)
+	MOVW.LT	$8, RSHIFT		/* (R(n)<<24)|(R(n-1)>>8) */
+	MOVW.LT	$24, LSHIFT
+	MOVW.LT	$1, OFFSET
 
-	MOVW.EQ	$16, R(RSHIFT)		/* (R(n)<<16)|(R(n-1)>>16) */
-	MOVW.EQ	$16, R(LSHIFT)
-	MOVW.EQ	$2, R(OFFSET)
+	MOVW.EQ	$16, RSHIFT		/* (R(n)<<16)|(R(n-1)>>16) */
+	MOVW.EQ	$16, LSHIFT
+	MOVW.EQ	$2, OFFSET
 
-	MOVW.GT	$24, R(RSHIFT)		/* (R(n)<<8)|(R(n-1)>>24) */
-	MOVW.GT	$8, R(LSHIFT)
-	MOVW.GT	$3, R(OFFSET)
+	MOVW.GT	$24, RSHIFT		/* (R(n)<<8)|(R(n-1)>>24) */
+	MOVW.GT	$8, LSHIFT
+	MOVW.GT	$3, OFFSET
 
-	ADD	$16, R(TS), R(TMP)	/* do 16-byte chunks if possible */
-	CMP	R(TMP), R(TE)
+	ADD	$16, TS, TMP	/* do 16-byte chunks if possible */
+	CMP	TMP, TE
 	BLS	_b1tail
 
-	BIC	$3, R(FROM)		/* align source */
-	MOVW	R(TS), savedts-4(SP)
-	MOVW	(R(FROM)), R(BR0)	/* prime first block register */
+	BIC	$3, FROM		/* align source */
+	MOVW	TS, savedts-4(SP)
+	MOVW	(FROM), BR0	/* prime first block register */
 
 _bu16loop:
-	CMP	R(TMP), R(TE)
+	CMP	TMP, TE
 	BLS	_bu1tail
 
-	MOVW	R(BR0)<<R(LSHIFT), R(BW3)
-	MOVM.DB.W (R(FROM)), [R(BR0)-R(BR3)]
-	ORR	R(BR3)>>R(RSHIFT), R(BW3)
+	MOVW	BR0<<LSHIFT, BW3
+	MOVM.DB.W (FROM), [BR0-BR3]
+	ORR	BR3>>RSHIFT, BW3
 
-	MOVW	R(BR3)<<R(LSHIFT), R(BW2)
-	ORR	R(BR2)>>R(RSHIFT), R(BW2)
+	MOVW	BR3<<LSHIFT, BW2
+	ORR	BR2>>RSHIFT, BW2
 
-	MOVW	R(BR2)<<R(LSHIFT), R(BW1)
-	ORR	R(BR1)>>R(RSHIFT), R(BW1)
+	MOVW	BR2<<LSHIFT, BW1
+	ORR	BR1>>RSHIFT, BW1
 
-	MOVW	R(BR1)<<R(LSHIFT), R(BW0)
-	ORR	R(BR0)>>R(RSHIFT), R(BW0)
+	MOVW	BR1<<LSHIFT, BW0
+	ORR	BR0>>RSHIFT, BW0
 
-	MOVM.DB.W [R(BW0)-R(BW3)], (R(TE))
+	MOVM.DB.W [BW0-BW3], (TE)
 	B	_bu16loop
 
 _bu1tail:
-	MOVW	savedts-4(SP), R(TS)
-	ADD	R(OFFSET), R(FROM)
+	MOVW	savedts-4(SP), TS
+	ADD	OFFSET, FROM
 	B	_b1tail
 
 _funaligned:
-	CMP	$2, R(TMP)
+	CMP	$2, TMP
 
-	MOVW.LT	$8, R(RSHIFT)		/* (R(n+1)<<24)|(R(n)>>8) */
-	MOVW.LT	$24, R(LSHIFT)
-	MOVW.LT	$3, R(OFFSET)
+	MOVW.LT	$8, RSHIFT		/* (R(n+1)<<24)|(R(n)>>8) */
+	MOVW.LT	$24, LSHIFT
+	MOVW.LT	$3, OFFSET
 
-	MOVW.EQ	$16, R(RSHIFT)		/* (R(n+1)<<16)|(R(n)>>16) */
-	MOVW.EQ	$16, R(LSHIFT)
-	MOVW.EQ	$2, R(OFFSET)
+	MOVW.EQ	$16, RSHIFT		/* (R(n+1)<<16)|(R(n)>>16) */
+	MOVW.EQ	$16, LSHIFT
+	MOVW.EQ	$2, OFFSET
 
-	MOVW.GT	$24, R(RSHIFT)		/* (R(n+1)<<8)|(R(n)>>24) */
-	MOVW.GT	$8, R(LSHIFT)
-	MOVW.GT	$1, R(OFFSET)
+	MOVW.GT	$24, RSHIFT		/* (R(n+1)<<8)|(R(n)>>24) */
+	MOVW.GT	$8, LSHIFT
+	MOVW.GT	$1, OFFSET
 
-	SUB	$16, R(TE), R(TMP)	/* do 16-byte chunks if possible */
-	CMP	R(TMP), R(TS)
+	SUB	$16, TE, TMP	/* do 16-byte chunks if possible */
+	CMP	TMP, TS
 	BHS	_f1tail
 
-	BIC	$3, R(FROM)		/* align source */
-	MOVW	R(TE), savedte-4(SP)
-	MOVW.P	4(R(FROM)), R(FR3)	/* prime last block register, implicit write back */
+	BIC	$3, FROM		/* align source */
+	MOVW	TE, savedte-4(SP)
+	MOVW.P	4(FROM), FR3	/* prime last block register, implicit write back */
 
 _fu16loop:
-	CMP	R(TMP), R(TS)
+	CMP	TMP, TS
 	BHS	_fu1tail
 
-	MOVW	R(FR3)>>R(RSHIFT), R(FW0)
-	MOVM.IA.W (R(FROM)), [R(FR0),R(FR1),R(FR2),R(FR3)]
-	ORR	R(FR0)<<R(LSHIFT), R(FW0)
+	MOVW	FR3>>RSHIFT, FW0
+	MOVM.IA.W (FROM), [FR0,FR1,FR2,FR3]
+	ORR	FR0<<LSHIFT, FW0
 
-	MOVW	R(FR0)>>R(RSHIFT), R(FW1)
-	ORR	R(FR1)<<R(LSHIFT), R(FW1)
+	MOVW	FR0>>RSHIFT, FW1
+	ORR	FR1<<LSHIFT, FW1
 
-	MOVW	R(FR1)>>R(RSHIFT), R(FW2)
-	ORR	R(FR2)<<R(LSHIFT), R(FW2)
+	MOVW	FR1>>RSHIFT, FW2
+	ORR	FR2<<LSHIFT, FW2
 
-	MOVW	R(FR2)>>R(RSHIFT), R(FW3)
-	ORR	R(FR3)<<R(LSHIFT), R(FW3)
+	MOVW	FR2>>RSHIFT, FW3
+	ORR	FR3<<LSHIFT, FW3
 
-	MOVM.IA.W [R(FW0),R(FW1),R(FW2),R(FW3)], (R(TS))
+	MOVM.IA.W [FW0,FW1,FW2,FW3], (TS)
 	B	_fu16loop
 
 _fu1tail:
-	MOVW	savedte-4(SP), R(TE)
-	SUB	R(OFFSET), R(FROM)
+	MOVW	savedte-4(SP), TE
+	SUB	OFFSET, FROM
 	B	_f1tail
diff --git a/src/runtime/rt0_linux_arm.s b/src/runtime/rt0_linux_arm.s
index 15a57cb..15c1092 100644
--- a/src/runtime/rt0_linux_arm.s
+++ b/src/runtime/rt0_linux_arm.s
@@ -77,7 +77,7 @@
 GLOBL bad_abi_msg(SB), RODATA, $45
 
 TEXT oabi_syscall<>(SB),NOSPLIT,$-4
-	ADD $1, PC, R4
+	ADD $1, R15, R4 // R15 is hardware PC
 	WORD $0xe12fff14 //BX	(R4) // enter thumb mode
 	// TODO(minux): only supports little-endian CPUs
 	WORD $0x4770df01 // swi $1; bx lr
diff --git a/src/runtime/sys_linux_arm.s b/src/runtime/sys_linux_arm.s
index bf0c810..b0a9b4f 100644
--- a/src/runtime/sys_linux_arm.s
+++ b/src/runtime/sys_linux_arm.s
@@ -383,7 +383,7 @@
 // Use kernel version instead of native armcas in asm_arm.s.
 // See ../sync/atomic/asm_linux_arm.s for details.
 TEXT cas<>(SB),NOSPLIT,$0
-	MOVW	$0xffff0fc0, PC
+	MOVW	$0xffff0fc0, R15 // R15 is hardware PC.
 
 TEXT runtime·cas(SB),NOSPLIT,$0
 	MOVW	ptr+0(FP), R2
diff --git a/src/runtime/vlop_arm.s b/src/runtime/vlop_arm.s
index 5354bf9..28f7519 100644
--- a/src/runtime/vlop_arm.s
+++ b/src/runtime/vlop_arm.s
@@ -27,8 +27,6 @@
 #include "go_tls.h"
 #include "textflag.h"
 
-arg=0
-
 /* replaced use of R10 by R11 because the former can be the data segment base register */
 
 TEXT _mulv(SB), NOSPLIT, $0
@@ -111,70 +109,71 @@
 // Reference: 
 // Sloss, Andrew et. al; ARM System Developer's Guide: Designing and Optimizing System Software
 // Morgan Kaufmann; 1 edition (April 8, 2004), ISBN 978-1558608740
-q = 0 // input d, output q
-r = 1 // input n, output r
-s = 2 // three temporary variables
-M = 3
-a = 11
-// Be careful: R(a) == R11 will be used by the linker for synthesized instructions.
-TEXT udiv<>(SB),NOSPLIT,$-4
-	CLZ 	R(q), R(s) // find normalizing shift
-	MOVW.S	R(q)<<R(s), R(a)
-	MOVW	$fast_udiv_tab<>-64(SB), R(M)
-	ADD.NE	R(a)>>25, R(M), R(a) // index by most significant 7 bits of divisor
-	MOVBU.NE	(R(a)), R(a)
+#define Rq	R0 // input d, output q
+#define Rr	R1 // input n, output r
+#define Rs	R2 // three temporary variables
+#define RM	R3
+#define Ra	R11
 
-	SUB.S	$7, R(s)
-	RSB 	$0, R(q), R(M) // M = -q
-	MOVW.PL	R(a)<<R(s), R(q)
+// Be careful: Ra == R11 will be used by the linker for synthesized instructions.
+TEXT udiv<>(SB),NOSPLIT,$-4
+	CLZ 	Rq, Rs // find normalizing shift
+	MOVW.S	Rq<<Rs, Ra
+	MOVW	$fast_udiv_tab<>-64(SB), RM
+	ADD.NE	Ra>>25, RM, Ra // index by most significant 7 bits of divisor
+	MOVBU.NE	(Ra), Ra
+
+	SUB.S	$7, Rs
+	RSB 	$0, Rq, RM // M = -q
+	MOVW.PL	Ra<<Rs, Rq
 
 	// 1st Newton iteration
-	MUL.PL	R(M), R(q), R(a) // a = -q*d
+	MUL.PL	RM, Rq, Ra // a = -q*d
 	BMI 	udiv_by_large_d
-	MULAWT	R(a), R(q), R(q), R(q) // q approx q-(q*q*d>>32)
-	TEQ 	R(M)->1, R(M) // check for d=0 or d=1
+	MULAWT	Ra, Rq, Rq, Rq // q approx q-(q*q*d>>32)
+	TEQ 	RM->1, RM // check for d=0 or d=1
 
 	// 2nd Newton iteration
-	MUL.NE	R(M), R(q), R(a)
-	MOVW.NE	$0, R(s)
-	MULAL.NE R(q), R(a), (R(q),R(s))
+	MUL.NE	RM, Rq, Ra
+	MOVW.NE	$0, Rs
+	MULAL.NE Rq, Ra, (Rq,Rs)
 	BEQ 	udiv_by_0_or_1
 
 	// q now accurate enough for a remainder r, 0<=r<3*d
-	MULLU	R(q), R(r), (R(q),R(s)) // q = (r * q) >> 32	
-	ADD 	R(M), R(r), R(r) // r = n - d
-	MULA	R(M), R(q), R(r), R(r) // r = n - (q+1)*d
+	MULLU	Rq, Rr, (Rq,Rs) // q = (r * q) >> 32
+	ADD 	RM, Rr, Rr // r = n - d
+	MULA	RM, Rq, Rr, Rr // r = n - (q+1)*d
 
 	// since 0 <= n-q*d < 3*d; thus -d <= r < 2*d
-	CMN 	R(M), R(r) // t = r-d
-	SUB.CS	R(M), R(r), R(r) // if (t<-d || t>=0) r=r+d
-	ADD.CC	$1, R(q)
-	ADD.PL	R(M)<<1, R(r)
-	ADD.PL	$2, R(q)
+	CMN 	RM, Rr // t = r-d
+	SUB.CS	RM, Rr, Rr // if (t<-d || t>=0) r=r+d
+	ADD.CC	$1, Rq
+	ADD.PL	RM<<1, Rr
+	ADD.PL	$2, Rq
 	RET
 
 udiv_by_large_d:
 	// at this point we know d>=2^(31-6)=2^25
-	SUB 	$4, R(a), R(a)
-	RSB 	$0, R(s), R(s)
-	MOVW	R(a)>>R(s), R(q)
-	MULLU	R(q), R(r), (R(q),R(s))
-	MULA	R(M), R(q), R(r), R(r)
+	SUB 	$4, Ra, Ra
+	RSB 	$0, Rs, Rs
+	MOVW	Ra>>Rs, Rq
+	MULLU	Rq, Rr, (Rq,Rs)
+	MULA	RM, Rq, Rr, Rr
 
 	// q now accurate enough for a remainder r, 0<=r<4*d
-	CMN 	R(r)>>1, R(M) // if(r/2 >= d)
-	ADD.CS	R(M)<<1, R(r)
-	ADD.CS	$2, R(q)
-	CMN 	R(r), R(M)
-	ADD.CS	R(M), R(r)
-	ADD.CS	$1, R(q)
+	CMN 	Rr>>1, RM // if(r/2 >= d)
+	ADD.CS	RM<<1, Rr
+	ADD.CS	$2, Rq
+	CMN 	Rr, RM
+	ADD.CS	RM, Rr
+	ADD.CS	$1, Rq
 	RET
 
 udiv_by_0_or_1:
 	// carry set if d==1, carry clear if d==0
 	BCC udiv_by_0
-	MOVW	R(r), R(q)
-	MOVW	$0, R(r)
+	MOVW	Rr, Rq
+	MOVW	$0, Rr
 	RET
 
 udiv_by_0:
@@ -216,96 +215,96 @@
 DATA fast_udiv_tab<>+0x3c(SB)/4, $0x81828384
 GLOBL fast_udiv_tab<>(SB), RODATA, $64
 
-// The linker will pass numerator in R(TMP), and it also
-// expects the result in R(TMP)
-TMP = 11
+// The linker will pass numerator in RTMP, and it also
+// expects the result in RTMP
+#define RTMP R11
 
 TEXT _divu(SB), NOSPLIT, $16
-	MOVW	R(q), 4(R13)
-	MOVW	R(r), 8(R13)
-	MOVW	R(s), 12(R13)
-	MOVW	R(M), 16(R13)
+	MOVW	Rq, 4(R13)
+	MOVW	Rr, 8(R13)
+	MOVW	Rs, 12(R13)
+	MOVW	RM, 16(R13)
 
-	MOVW	R(TMP), R(r)		/* numerator */
-	MOVW	0(FP), R(q) 		/* denominator */
+	MOVW	RTMP, Rr		/* numerator */
+	MOVW	0(FP), Rq 		/* denominator */
 	BL  	udiv<>(SB)
-	MOVW	R(q), R(TMP)
-	MOVW	4(R13), R(q)
-	MOVW	8(R13), R(r)
-	MOVW	12(R13), R(s)
-	MOVW	16(R13), R(M)
+	MOVW	Rq, RTMP
+	MOVW	4(R13), Rq
+	MOVW	8(R13), Rr
+	MOVW	12(R13), Rs
+	MOVW	16(R13), RM
 	RET
 
 TEXT _modu(SB), NOSPLIT, $16
-	MOVW	R(q), 4(R13)
-	MOVW	R(r), 8(R13)
-	MOVW	R(s), 12(R13)
-	MOVW	R(M), 16(R13)
+	MOVW	Rq, 4(R13)
+	MOVW	Rr, 8(R13)
+	MOVW	Rs, 12(R13)
+	MOVW	RM, 16(R13)
 
-	MOVW	R(TMP), R(r)		/* numerator */
-	MOVW	0(FP), R(q) 		/* denominator */
+	MOVW	RTMP, Rr		/* numerator */
+	MOVW	0(FP), Rq 		/* denominator */
 	BL  	udiv<>(SB)
-	MOVW	R(r), R(TMP)
-	MOVW	4(R13), R(q)
-	MOVW	8(R13), R(r)
-	MOVW	12(R13), R(s)
-	MOVW	16(R13), R(M)
+	MOVW	Rr, RTMP
+	MOVW	4(R13), Rq
+	MOVW	8(R13), Rr
+	MOVW	12(R13), Rs
+	MOVW	16(R13), RM
 	RET
 
 TEXT _div(SB),NOSPLIT,$16
-	MOVW	R(q), 4(R13)
-	MOVW	R(r), 8(R13)
-	MOVW	R(s), 12(R13)
-	MOVW	R(M), 16(R13)
-	MOVW	R(TMP), R(r)		/* numerator */
-	MOVW	0(FP), R(q) 		/* denominator */
-	CMP 	$0, R(r)
+	MOVW	Rq, 4(R13)
+	MOVW	Rr, 8(R13)
+	MOVW	Rs, 12(R13)
+	MOVW	RM, 16(R13)
+	MOVW	RTMP, Rr		/* numerator */
+	MOVW	0(FP), Rq 		/* denominator */
+	CMP 	$0, Rr
 	BGE 	d1
-	RSB 	$0, R(r), R(r)
-	CMP 	$0, R(q)
+	RSB 	$0, Rr, Rr
+	CMP 	$0, Rq
 	BGE 	d2
-	RSB 	$0, R(q), R(q)
+	RSB 	$0, Rq, Rq
 d0:
 	BL  	udiv<>(SB)  		/* none/both neg */
-	MOVW	R(q), R(TMP)
+	MOVW	Rq, RTMP
 	B		out1
 d1:
-	CMP 	$0, R(q)
+	CMP 	$0, Rq
 	BGE 	d0
-	RSB 	$0, R(q), R(q)
+	RSB 	$0, Rq, Rq
 d2:
 	BL  	udiv<>(SB)  		/* one neg */
-	RSB		$0, R(q), R(TMP)
+	RSB		$0, Rq, RTMP
 out1:
-	MOVW	4(R13), R(q)
-	MOVW	8(R13), R(r)
-	MOVW	12(R13), R(s)
-	MOVW	16(R13), R(M)
+	MOVW	4(R13), Rq
+	MOVW	8(R13), Rr
+	MOVW	12(R13), Rs
+	MOVW	16(R13), RM
 	RET
 
 TEXT _mod(SB),NOSPLIT,$16
-	MOVW	R(q), 4(R13)
-	MOVW	R(r), 8(R13)
-	MOVW	R(s), 12(R13)
-	MOVW	R(M), 16(R13)
-	MOVW	R(TMP), R(r)		/* numerator */
-	MOVW	0(FP), R(q) 		/* denominator */
-	CMP 	$0, R(q)
-	RSB.LT	$0, R(q), R(q)
-	CMP 	$0, R(r)
+	MOVW	Rq, 4(R13)
+	MOVW	Rr, 8(R13)
+	MOVW	Rs, 12(R13)
+	MOVW	RM, 16(R13)
+	MOVW	RTMP, Rr		/* numerator */
+	MOVW	0(FP), Rq 		/* denominator */
+	CMP 	$0, Rq
+	RSB.LT	$0, Rq, Rq
+	CMP 	$0, Rr
 	BGE 	m1
-	RSB 	$0, R(r), R(r)
+	RSB 	$0, Rr, Rr
 	BL  	udiv<>(SB)  		/* neg numerator */
-	RSB 	$0, R(r), R(TMP)
+	RSB 	$0, Rr, RTMP
 	B   	out
 m1:
 	BL  	udiv<>(SB)  		/* pos numerator */
-	MOVW	R(r), R(TMP)
+	MOVW	Rr, RTMP
 out:
-	MOVW	4(R13), R(q)
-	MOVW	8(R13), R(r)
-	MOVW	12(R13), R(s)
-	MOVW	16(R13), R(M)
+	MOVW	4(R13), Rq
+	MOVW	8(R13), Rr
+	MOVW	12(R13), Rs
+	MOVW	16(R13), RM
 	RET
 
 // _mul64by32 and _div64by32 not implemented on arm