poly1305: don't move R13 in sum_arm.s.

Rather than change the value of R13 during the execution, keep R13 fixed
(after the initial prelude) and always use offsets from it.

This should help the runtime figure out what's going on if, say, a
signal should occur while running this code.

I've also trimmed the set of saved registers since Go doesn't require
the callee to maintain anything except R10 and R13.

Change-Id: Ifbeca73c1d964cc43bb7f8c20c61066f22fd562d
Reviewed-on: https://go-review.googlesource.com/31717
Run-TryBot: Cherry Zhang <cherryyz@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
diff --git a/poly1305/sum_arm.s b/poly1305/sum_arm.s
index 9c3d60f..93167b2 100644
--- a/poly1305/sum_arm.s
+++ b/poly1305/sum_arm.s
@@ -20,10 +20,12 @@
 // take care and verify that no synthetic instructions use it.
 
 TEXT poly1305_init_ext_armv6<>(SB), NOSPLIT, $0
-	// Needs 32 bytes of stack and 64 bytes of space pointed to by R0.
-	// (It might look like it's only 60 bytes of space but the final
-	// four bytes will be written by another function.)
-	MOVM.DB.W [R4-R11], (R13)
+	// Needs 16 bytes of stack and 64 bytes of space pointed to by R0.  (It
+	// might look like it's only 60 bytes of space but the final four bytes
+	// will be written by another function.) We need to skip over four
+	// bytes of stack because that's saving the value of 'g'.
+	ADD       $4, R13, R8
+	MOVM.IB   [R4-R7], (R8)
 	MOVM.IA.W (R1), [R2-R5]
 	MOVW      $poly1305_init_constants_armv6<>(SB), R7
 	MOVW      R2, R8
@@ -49,7 +51,8 @@
 	MOVM.IA.W [R2-R6], (R0)
 	MOVM.IA.W (R1), [R2-R5]
 	MOVM.IA   [R2-R6], (R0)
-	MOVM.IA.W (R13), [R4-R11]
+	ADD       $20, R13, R0
+	MOVM.DA   (R0), [R4-R7]
 	RET
 
 #define MOVW_UNALIGNED(Rsrc, Rdst, Rtmp, offset) \
@@ -63,29 +66,34 @@
 	MOVBU Rtmp, (offset+3)(Rdst)
 
 TEXT poly1305_blocks_armv6<>(SB), NOSPLIT, $0
-	// Needs 36 + 128 bytes of stack.
-	MOVM.DB.W [R4, R5, R6, R7, R8, R9, g, R11, R14], (R13)
-	SUB       $128, R13
-	MOVW      R0, 36(R13)
-	MOVW      R1, 40(R13)
-	MOVW      R2, 44(R13)
-	MOVW      R1, R14
-	MOVW      R2, R12
-	MOVW      56(R0), R8
-	WORD      $0xe1180008                                  // TST R8, R8 not working see issue 5921
-	EOR       R6, R6, R6
-	MOVW.EQ   $(1<<24), R6
-	MOVW      R6, 32(R13)
-	ADD       $64, R13, g
-	MOVM.IA   (R0), [R0-R9]
-	MOVM.IA   [R0-R4], (g)
-	CMP       $16, R12
-	BLO       poly1305_blocks_armv6_done
+	// Needs 24 bytes of stack for saved registers and then 88 bytes of
+	// scratch space after that. We assume that 24 bytes at (R13) have
+	// already been used: four bytes for the link register saved in the
+	// prelude of poly1305_auth_armv6, four bytes for saving the value of g
+	// in that function and 16 bytes of scratch space used around
+	// poly1305_finish_ext_armv6_skip1.
+	ADD     $24, R13, R12
+	MOVM.IB [R4-R8, R14], (R12)
+	MOVW    R0, 88(R13)
+	MOVW    R1, 92(R13)
+	MOVW    R2, 96(R13)
+	MOVW    R1, R14
+	MOVW    R2, R12
+	MOVW    56(R0), R8
+	WORD    $0xe1180008                // TST R8, R8 not working see issue 5921
+	EOR     R6, R6, R6
+	MOVW.EQ $(1<<24), R6
+	MOVW    R6, 84(R13)
+	ADD     $116, R13, g
+	MOVM.IA (R0), [R0-R9]
+	MOVM.IA [R0-R4], (g)
+	CMP     $16, R12
+	BLO     poly1305_blocks_armv6_done
 
 poly1305_blocks_armv6_mainloop:
 	WORD    $0xe31e0003                            // TST R14, #3 not working see issue 5921
 	BEQ     poly1305_blocks_armv6_mainloop_aligned
-	ADD     $48, R13, g
+	ADD     $100, R13, g
 	MOVW_UNALIGNED(R14, g, R0, 0)
 	MOVW_UNALIGNED(R14, g, R0, 4)
 	MOVW_UNALIGNED(R14, g, R0, 8)
@@ -101,21 +109,21 @@
 	MOVW    R0>>26, g
 	MOVW    R1>>20, R11
 	MOVW    R2>>14, R12
-	MOVW    R14, 40(R13)
+	MOVW    R14, 92(R13)
 	MOVW    R3>>8, R4
 	ORR     R1<<6, g, g
 	ORR     R2<<12, R11, R11
 	ORR     R3<<18, R12, R12
 	BIC     $0xfc000000, R0, R0
 	BIC     $0xfc000000, g, g
-	MOVW    32(R13), R3
+	MOVW    84(R13), R3
 	BIC     $0xfc000000, R11, R11
 	BIC     $0xfc000000, R12, R12
 	ADD     R0, R5, R5
 	ADD     g, R6, R6
 	ORR     R3, R4, R4
 	ADD     R11, R7, R7
-	ADD     $64, R13, R14
+	ADD     $116, R13, R14
 	ADD     R12, R8, R8
 	ADD     R4, R9, R9
 	MOVM.IA (R14), [R0-R4]
@@ -131,10 +139,10 @@
 	MULALU  R0, R8, (R14, R12)
 	MULALU  R0, R9, (R11, g)
 	MULALU  R4, R9, (R14, R12)
-	MOVW    g, 24(R13)
-	MOVW    R11, 28(R13)
-	MOVW    R12, 16(R13)
-	MOVW    R14, 20(R13)
+	MOVW    g, 76(R13)
+	MOVW    R11, 80(R13)
+	MOVW    R12, 68(R13)
+	MOVW    R14, 72(R13)
 	MULLU   R2, R5, (R11, g)
 	MULLU   R1, R5, (R14, R12)
 	MULALU  R1, R6, (R11, g)
@@ -147,16 +155,17 @@
 	MULALU  R3, R8, (R14, R12)
 	MULALU  R3, R9, (R11, g)
 	MULALU  R2, R9, (R14, R12)
-	MOVW    g, 8(R13)
-	MOVW    R11, 12(R13)
-	MOVW    R12, 0(R13)
-	MOVW    R14, w+4(SP)
+	MOVW    g, 60(R13)
+	MOVW    R11, 64(R13)
+	MOVW    R12, 52(R13)
+	MOVW    R14, 56(R13)
 	MULLU   R0, R5, (R11, g)
 	MULALU  R4, R6, (R11, g)
 	MULALU  R3, R7, (R11, g)
 	MULALU  R2, R8, (R11, g)
 	MULALU  R1, R9, (R11, g)
-	MOVM.IA (R13), [R0-R7]
+	ADD     $52, R13, R0
+	MOVM.IA (R0), [R0-R7]
 	MOVW    g>>26, R12
 	MOVW    R4>>26, R14
 	ORR     R11<<6, R12, R12
@@ -187,23 +196,23 @@
 	MOVW    R4>>26, R12
 	BIC     $0xfc000000, R4, R8
 	ADD     R12, R6, R9
-	MOVW    w+44(SP), R12
-	MOVW    w+40(SP), R14
+	MOVW    96(R13), R12
+	MOVW    92(R13), R14
 	MOVW    R0, R6
 	CMP     $32, R12
 	SUB     $16, R12, R12
-	MOVW    R12, 44(R13)
+	MOVW    R12, 96(R13)
 	BHS     poly1305_blocks_armv6_mainloop
 
 poly1305_blocks_armv6_done:
-	MOVW      36(R13), R12
-	MOVW      R5, 20(R12)
-	MOVW      R6, 24(R12)
-	MOVW      R7, 28(R12)
-	MOVW      R8, 32(R12)
-	MOVW      R9, 36(R12)
-	ADD       $128, R13, R13
-	MOVM.IA.W (R13), [R4, R5, R6, R7, R8, R9, g, R11, R14]
+	MOVW    88(R13), R12
+	MOVW    R5, 20(R12)
+	MOVW    R6, 24(R12)
+	MOVW    R7, 28(R12)
+	MOVW    R8, 32(R12)
+	MOVW    R9, 36(R12)
+	ADD     $48, R13, R0
+	MOVM.DA (R0), [R4-R8, R14]
 	RET
 
 #define MOVHUP_UNALIGNED(Rsrc, Rdst, Rtmp) \
@@ -216,26 +225,76 @@
 	MOVHUP_UNALIGNED(Rsrc, Rdst, Rtmp); \
 	MOVHUP_UNALIGNED(Rsrc, Rdst, Rtmp)
 
-TEXT poly1305_finish_ext_armv6<>(SB), NOSPLIT, $0
-	// Needs 36 + 16 bytes of stack.
-	MOVM.DB.W [R4, R5, R6, R7, R8, R9, g, R11, R14], (R13)
-	SUB       $16, R13, R13
-	MOVW      R0, R5
-	MOVW      R1, R6
-	MOVW      R2, R7
-	MOVW      R3, R8
-	AND.S     R2, R2, R2
-	BEQ       poly1305_finish_ext_armv6_noremaining
-	EOR       R0, R0
-	MOVW      R13, R9
-	MOVW      R0, 0(R13)
-	MOVW      R0, 4(R13)
-	MOVW      R0, 8(R13)
-	MOVW      R0, 12(R13)
-	WORD      $0xe3110003                                  // TST R1, #3 not working see issue 5921
-	BEQ       poly1305_finish_ext_armv6_aligned
-	WORD      $0xe3120008                                  // TST R2, #8 not working see issue 5921
-	BEQ       poly1305_finish_ext_armv6_skip8
+// func poly1305_auth_armv6(out *[16]byte, m *byte, mlen uint32, key *[32]key)
+TEXT ·poly1305_auth_armv6(SB), $196-16
+	// The value 196, just above, is the sum of 64 (the size of the context
+	// structure) and 132 (the amount of stack needed).
+	//
+	// At this point, the stack pointer (R13) has been moved down. It
+	// points to the saved link register and there's 196 bytes of free
+	// space above it.
+	//
+	// The stack for this function looks like:
+	//
+	// +---------------------
+	// |
+	// | 64 bytes of context structure
+	// |
+	// +---------------------
+	// |
+	// | 112 bytes for poly1305_blocks_armv6
+	// |
+	// +---------------------
+	// | 16 bytes of final block, constructed at
+	// | poly1305_finish_ext_armv6_skip8
+	// +---------------------
+	// | four bytes of saved 'g'
+	// +---------------------
+	// | lr, saved by prelude    <- R13 points here
+	// +---------------------
+	MOVW g, 4(R13)
+
+	MOVW out+0(FP), R4
+	MOVW m+4(FP), R5
+	MOVW mlen+8(FP), R6
+	MOVW key+12(FP), R7
+
+	ADD  $136, R13, R0 // 136 = 4 + 4 + 16 + 112
+	MOVW R7, R1
+
+	// poly1305_init_ext_armv6 will write to the stack from R13+4, but
+	// that's ok because none of the other values have been written yet.
+	BL    poly1305_init_ext_armv6<>(SB)
+	BIC.S $15, R6, R2
+	BEQ   poly1305_auth_armv6_noblocks
+	ADD   $136, R13, R0
+	MOVW  R5, R1
+	ADD   R2, R5, R5
+	SUB   R2, R6, R6
+	BL    poly1305_blocks_armv6<>(SB)
+
+poly1305_auth_armv6_noblocks:
+	ADD  $136, R13, R0
+	MOVW R5, R1
+	MOVW R6, R2
+	MOVW R4, R3
+
+	MOVW  R0, R5
+	MOVW  R1, R6
+	MOVW  R2, R7
+	MOVW  R3, R8
+	AND.S R2, R2, R2
+	BEQ   poly1305_finish_ext_armv6_noremaining
+	EOR   R0, R0
+	ADD   $8, R13, R9                           // 8 = offset to 16 byte scratch space
+	MOVW  R0, (R9)
+	MOVW  R0, 4(R9)
+	MOVW  R0, 8(R9)
+	MOVW  R0, 12(R9)
+	WORD  $0xe3110003                           // TST R1, #3 not working see issue 5921
+	BEQ   poly1305_finish_ext_armv6_aligned
+	WORD  $0xe3120008                           // TST R2, #8 not working see issue 5921
+	BEQ   poly1305_finish_ext_armv6_skip8
 	MOVWP_UNALIGNED(R1, R9, g)
 	MOVWP_UNALIGNED(R1, R9, g)
 
@@ -279,7 +338,7 @@
 	MOVBU R11, 0(R9)
 	MOVW  R11, 56(R5)
 	MOVW  R5, R0
-	MOVW  R13, R1
+	ADD   $8, R13, R1
 	MOVW  $16, R2
 	BL    poly1305_blocks_armv6<>(SB)
 
@@ -318,14 +377,14 @@
 	MOVW      $-(1<<26), R12
 	ADD       R11>>26, R12, R12
 	BIC       $0xfc000000, R11, R11
-	ADD       R12, R4, R14
-	MOVW      R14>>31, R12
+	ADD       R12, R4, R9
+	MOVW      R9>>31, R12
 	SUB       $1, R12
 	AND       R12, R6, R6
 	AND       R12, R7, R7
 	AND       R12, g, g
 	AND       R12, R11, R11
-	AND       R12, R14, R14
+	AND       R12, R9, R9
 	MVN       R12, R12
 	AND       R12, R0, R0
 	AND       R12, R1, R1
@@ -336,7 +395,7 @@
 	ORR       R7, R1, R1
 	ORR       g, R2, R2
 	ORR       R11, R3, R3
-	ORR       R14, R4, R4
+	ORR       R9, R4, R4
 	ORR       R1<<26, R0, R0
 	MOVW      R1>>6, R1
 	ORR       R2<<20, R1, R1
@@ -364,53 +423,5 @@
 	EOR       R7, R7, R7
 	MOVM.IA.W [R0-R7], (R12)
 	MOVM.IA   [R0-R7], (R12)
-	ADD       $16, R13, R13
-	MOVM.IA.W (R13), [R4, R5, R6, R7, R8, R9, g, R11, R14]
-	RET
-
-// func poly1305_auth_armv6(out *[16]byte, m *byte, mlen uint32, key *[32]key)
-TEXT ·poly1305_auth_armv6(SB), $228-16
-	// The value 228, just above, is the sum of 64 (the size of the context
-	// structure) and 164 (the amount of stack that |poly1305_blocks_armv6|
-	// needs).
-	//
-	// At this point, the stack pointer (R13) has been moved down. It
-	// points to the saved link register and there's 228 bytes of free
-	// space above it.
-	MOVW out+0(FP), R4
-	MOVW m+4(FP), R5
-	MOVW mlen+8(FP), R6
-	MOVW key+12(FP), R7
-
-	// We need to keep a 64-byte structure on the stack and have enough
-	// space for |poly1305_blocks_armv6| (which needs 164 bytes of stack
-	// space). This assembly code was written for a C-based world where
-	// code just assumes that sufficient stack is available below the
-	// current stack pointer. So the structure is kept at the highest
-	// addresses of the frame and the stack for other functions exists just
-	// below it.
-	//
-	// (In ARM, R13 points at the value currently at the top of the stack,
-	// so the structure address and stack pointer are the same value.)
-	//
-	// We add 168, not 164, because the link register is saved at *R13.
-	ADD   $168, R13, R13
-	MOVW  R13, R0
-	MOVW  R7, R1
-	BL    poly1305_init_ext_armv6<>(SB)
-	BIC.S $15, R6, R2
-	BEQ   poly1305_auth_armv6_noblocks
-	MOVW  R13, R0
-	MOVW  R5, R1
-	ADD   R2, R5, R5
-	SUB   R2, R6, R6
-	BL    poly1305_blocks_armv6<>(SB)
-
-poly1305_auth_armv6_noblocks:
-	MOVW R13, R0
-	MOVW R5, R1
-	MOVW R6, R2
-	MOVW R4, R3
-	BL   poly1305_finish_ext_armv6<>(SB)
-	SUB  $168, R13, R13
+	MOVW      4(R13), g
 	RET