salsa20/salsa: fix amd64 assembly not to smash SP

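The routine used to round SP up to a 32-byte boundary, write the
result back into SP, and restore the saved value from R12 just before
RET. Leave SP untouched instead: compute the aligned address in R12
and address all of the scratch storage relative to R12.

For golang/go#44269.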

Change-Id: Ica352261d696317addbdd422d4cde5bf07fef839
Reviewed-on: https://go-review.googlesource.com/c/crypto/+/292050
Trust: Russ Cox <rsc@golang.org>
Trust: Jason A. Donenfeld <Jason@zx2c4.com>
Reviewed-by: Jason A. Donenfeld <Jason@zx2c4.com>
diff --git a/salsa20/salsa/salsa20_amd64.s b/salsa20/salsa/salsa20_amd64.s
index 9c84012..f97efc6 100644
--- a/salsa20/salsa/salsa20_amd64.s
+++ b/salsa20/salsa/salsa20_amd64.s
@@ -8,7 +8,7 @@
 // domain sources in SUPERCOP: https://bench.cr.yp.to/supercop.html
 
 // func salsa2020XORKeyStream(out, in *byte, n uint64, nonce, key *byte)
-// This needs up to 64 bytes at 360(SP); hence the non-obvious frame size.
+// This needs up to 64 bytes at 360(R12); hence the non-obvious frame size.
 TEXT ·salsa2020XORKeyStream(SB),0,$456-40 // frame = 424 + 32 byte alignment
 	MOVQ out+0(FP),DI
 	MOVQ in+8(FP),SI
@@ -17,10 +17,8 @@
 	MOVQ key+32(FP),R8
 
 	MOVQ SP,R12
-	MOVQ SP,R9
-	ADDQ $31, R9
-	ANDQ $~31, R9
-	MOVQ R9, SP
+	ADDQ $31, R12
+	ANDQ $~31, R12
 
 	MOVQ DX,R9
 	MOVQ CX,DX
@@ -32,116 +30,116 @@
 	MOVL 0(R10),R8
 	MOVL 0(DX),AX
 	MOVL 16(R10),R11
-	MOVL CX,0(SP)
-	MOVL R8, 4 (SP)
-	MOVL AX, 8 (SP)
-	MOVL R11, 12 (SP)
+	MOVL CX,0(R12)
+	MOVL R8, 4 (R12)
+	MOVL AX, 8 (R12)
+	MOVL R11, 12 (R12)
 	MOVL 8(DX),CX
 	MOVL 24(R10),R8
 	MOVL 4(R10),AX
 	MOVL 4(DX),R11
-	MOVL CX,16(SP)
-	MOVL R8, 20 (SP)
-	MOVL AX, 24 (SP)
-	MOVL R11, 28 (SP)
+	MOVL CX,16(R12)
+	MOVL R8, 20 (R12)
+	MOVL AX, 24 (R12)
+	MOVL R11, 28 (R12)
 	MOVL 12(DX),CX
 	MOVL 12(R10),DX
 	MOVL 28(R10),R8
 	MOVL 8(R10),AX
-	MOVL DX,32(SP)
-	MOVL CX, 36 (SP)
-	MOVL R8, 40 (SP)
-	MOVL AX, 44 (SP)
+	MOVL DX,32(R12)
+	MOVL CX, 36 (R12)
+	MOVL R8, 40 (R12)
+	MOVL AX, 44 (R12)
 	MOVQ $1634760805,DX
 	MOVQ $857760878,CX
 	MOVQ $2036477234,R8
 	MOVQ $1797285236,AX
-	MOVL DX,48(SP)
-	MOVL CX, 52 (SP)
-	MOVL R8, 56 (SP)
-	MOVL AX, 60 (SP)
+	MOVL DX,48(R12)
+	MOVL CX, 52 (R12)
+	MOVL R8, 56 (R12)
+	MOVL AX, 60 (R12)
 	CMPQ R9,$256
 	JB BYTESBETWEEN1AND255
-	MOVOA 48(SP),X0
+	MOVOA 48(R12),X0
 	PSHUFL $0X55,X0,X1
 	PSHUFL $0XAA,X0,X2
 	PSHUFL $0XFF,X0,X3
 	PSHUFL $0X00,X0,X0
-	MOVOA X1,64(SP)
-	MOVOA X2,80(SP)
-	MOVOA X3,96(SP)
-	MOVOA X0,112(SP)
-	MOVOA 0(SP),X0
+	MOVOA X1,64(R12)
+	MOVOA X2,80(R12)
+	MOVOA X3,96(R12)
+	MOVOA X0,112(R12)
+	MOVOA 0(R12),X0
 	PSHUFL $0XAA,X0,X1
 	PSHUFL $0XFF,X0,X2
 	PSHUFL $0X00,X0,X3
 	PSHUFL $0X55,X0,X0
-	MOVOA X1,128(SP)
-	MOVOA X2,144(SP)
-	MOVOA X3,160(SP)
-	MOVOA X0,176(SP)
-	MOVOA 16(SP),X0
+	MOVOA X1,128(R12)
+	MOVOA X2,144(R12)
+	MOVOA X3,160(R12)
+	MOVOA X0,176(R12)
+	MOVOA 16(R12),X0
 	PSHUFL $0XFF,X0,X1
 	PSHUFL $0X55,X0,X2
 	PSHUFL $0XAA,X0,X0
-	MOVOA X1,192(SP)
-	MOVOA X2,208(SP)
-	MOVOA X0,224(SP)
-	MOVOA 32(SP),X0
+	MOVOA X1,192(R12)
+	MOVOA X2,208(R12)
+	MOVOA X0,224(R12)
+	MOVOA 32(R12),X0
 	PSHUFL $0X00,X0,X1
 	PSHUFL $0XAA,X0,X2
 	PSHUFL $0XFF,X0,X0
-	MOVOA X1,240(SP)
-	MOVOA X2,256(SP)
-	MOVOA X0,272(SP)
+	MOVOA X1,240(R12)
+	MOVOA X2,256(R12)
+	MOVOA X0,272(R12)
 	BYTESATLEAST256:
-	MOVL 16(SP),DX
-	MOVL  36 (SP),CX
-	MOVL DX,288(SP)
-	MOVL CX,304(SP)
+	MOVL 16(R12),DX
+	MOVL  36 (R12),CX
+	MOVL DX,288(R12)
+	MOVL CX,304(R12)
 	SHLQ $32,CX
 	ADDQ CX,DX
 	ADDQ $1,DX
 	MOVQ DX,CX
 	SHRQ $32,CX
-	MOVL DX, 292 (SP)
-	MOVL CX, 308 (SP)
+	MOVL DX, 292 (R12)
+	MOVL CX, 308 (R12)
 	ADDQ $1,DX
 	MOVQ DX,CX
 	SHRQ $32,CX
-	MOVL DX, 296 (SP)
-	MOVL CX, 312 (SP)
+	MOVL DX, 296 (R12)
+	MOVL CX, 312 (R12)
 	ADDQ $1,DX
 	MOVQ DX,CX
 	SHRQ $32,CX
-	MOVL DX, 300 (SP)
-	MOVL CX, 316 (SP)
+	MOVL DX, 300 (R12)
+	MOVL CX, 316 (R12)
 	ADDQ $1,DX
 	MOVQ DX,CX
 	SHRQ $32,CX
-	MOVL DX,16(SP)
-	MOVL CX, 36 (SP)
-	MOVQ R9,352(SP)
+	MOVL DX,16(R12)
+	MOVL CX, 36 (R12)
+	MOVQ R9,352(R12)
 	MOVQ $20,DX
-	MOVOA 64(SP),X0
-	MOVOA 80(SP),X1
-	MOVOA 96(SP),X2
-	MOVOA 256(SP),X3
-	MOVOA 272(SP),X4
-	MOVOA 128(SP),X5
-	MOVOA 144(SP),X6
-	MOVOA 176(SP),X7
-	MOVOA 192(SP),X8
-	MOVOA 208(SP),X9
-	MOVOA 224(SP),X10
-	MOVOA 304(SP),X11
-	MOVOA 112(SP),X12
-	MOVOA 160(SP),X13
-	MOVOA 240(SP),X14
-	MOVOA 288(SP),X15
+	MOVOA 64(R12),X0
+	MOVOA 80(R12),X1
+	MOVOA 96(R12),X2
+	MOVOA 256(R12),X3
+	MOVOA 272(R12),X4
+	MOVOA 128(R12),X5
+	MOVOA 144(R12),X6
+	MOVOA 176(R12),X7
+	MOVOA 192(R12),X8
+	MOVOA 208(R12),X9
+	MOVOA 224(R12),X10
+	MOVOA 304(R12),X11
+	MOVOA 112(R12),X12
+	MOVOA 160(R12),X13
+	MOVOA 240(R12),X14
+	MOVOA 288(R12),X15
 	MAINLOOP1:
-	MOVOA X1,320(SP)
-	MOVOA X2,336(SP)
+	MOVOA X1,320(R12)
+	MOVOA X2,336(R12)
 	MOVOA X13,X1
 	PADDL X12,X1
 	MOVOA X1,X2
@@ -191,8 +189,8 @@
 	PXOR X1,X12
 	PSRLL $14,X2
 	PXOR X2,X12
-	MOVOA 320(SP),X1
-	MOVOA X12,320(SP)
+	MOVOA 320(R12),X1
+	MOVOA X12,320(R12)
 	MOVOA X9,X2
 	PADDL X7,X2
 	MOVOA X2,X12
@@ -207,8 +205,8 @@
 	PXOR X2,X3
 	PSRLL $25,X12
 	PXOR X12,X3
-	MOVOA 336(SP),X2
-	MOVOA X0,336(SP)
+	MOVOA 336(R12),X2
+	MOVOA X0,336(R12)
 	MOVOA X6,X0
 	PADDL X2,X0
 	MOVOA X0,X12
@@ -251,8 +249,8 @@
 	PXOR X0,X1
 	PSRLL $14,X12
 	PXOR X12,X1
-	MOVOA 320(SP),X0
-	MOVOA X1,320(SP)
+	MOVOA 320(R12),X0
+	MOVOA X1,320(R12)
 	MOVOA X4,X1
 	PADDL X0,X1
 	MOVOA X1,X12
@@ -267,8 +265,8 @@
 	PXOR X1,X2
 	PSRLL $14,X12
 	PXOR X12,X2
-	MOVOA 336(SP),X12
-	MOVOA X2,336(SP)
+	MOVOA 336(R12),X12
+	MOVOA X2,336(R12)
 	MOVOA X14,X1
 	PADDL X12,X1
 	MOVOA X1,X2
@@ -311,8 +309,8 @@
 	PXOR X1,X0
 	PSRLL $14,X2
 	PXOR X2,X0
-	MOVOA 320(SP),X1
-	MOVOA X0,320(SP)
+	MOVOA 320(R12),X1
+	MOVOA X0,320(R12)
 	MOVOA X8,X0
 	PADDL X14,X0
 	MOVOA X0,X2
@@ -327,8 +325,8 @@
 	PXOR X0,X6
 	PSRLL $25,X2
 	PXOR X2,X6
-	MOVOA 336(SP),X2
-	MOVOA X12,336(SP)
+	MOVOA 336(R12),X2
+	MOVOA X12,336(R12)
 	MOVOA X3,X0
 	PADDL X2,X0
 	MOVOA X0,X12
@@ -378,14 +376,14 @@
 	PXOR X0,X2
 	PSRLL $14,X12
 	PXOR X12,X2
-	MOVOA 320(SP),X12
-	MOVOA 336(SP),X0
+	MOVOA 320(R12),X12
+	MOVOA 336(R12),X0
 	SUBQ $2,DX
 	JA MAINLOOP1
-	PADDL 112(SP),X12
-	PADDL 176(SP),X7
-	PADDL 224(SP),X10
-	PADDL 272(SP),X4
+	PADDL 112(R12),X12
+	PADDL 176(R12),X7
+	PADDL 224(R12),X10
+	PADDL 272(R12),X4
 	MOVD X12,DX
 	MOVD X7,CX
 	MOVD X10,R8
@@ -446,10 +444,10 @@
 	MOVL CX,196(DI)
 	MOVL R8,200(DI)
 	MOVL R9,204(DI)
-	PADDL 240(SP),X14
-	PADDL 64(SP),X0
-	PADDL 128(SP),X5
-	PADDL 192(SP),X8
+	PADDL 240(R12),X14
+	PADDL 64(R12),X0
+	PADDL 128(R12),X5
+	PADDL 192(R12),X8
 	MOVD X14,DX
 	MOVD X0,CX
 	MOVD X5,R8
@@ -510,10 +508,10 @@
 	MOVL CX,212(DI)
 	MOVL R8,216(DI)
 	MOVL R9,220(DI)
-	PADDL 288(SP),X15
-	PADDL 304(SP),X11
-	PADDL 80(SP),X1
-	PADDL 144(SP),X6
+	PADDL 288(R12),X15
+	PADDL 304(R12),X11
+	PADDL 80(R12),X1
+	PADDL 144(R12),X6
 	MOVD X15,DX
 	MOVD X11,CX
 	MOVD X1,R8
@@ -574,10 +572,10 @@
 	MOVL CX,228(DI)
 	MOVL R8,232(DI)
 	MOVL R9,236(DI)
-	PADDL 160(SP),X13
-	PADDL 208(SP),X9
-	PADDL 256(SP),X3
-	PADDL 96(SP),X2
+	PADDL 160(R12),X13
+	PADDL 208(R12),X9
+	PADDL 256(R12),X3
+	PADDL 96(R12),X2
 	MOVD X13,DX
 	MOVD X9,CX
 	MOVD X3,R8
@@ -638,7 +636,7 @@
 	MOVL CX,244(DI)
 	MOVL R8,248(DI)
 	MOVL R9,252(DI)
-	MOVQ 352(SP),R9
+	MOVQ 352(R12),R9
 	SUBQ $256,R9
 	ADDQ $256,SI
 	ADDQ $256,DI
@@ -650,17 +648,17 @@
 	CMPQ R9,$64
 	JAE NOCOPY
 	MOVQ DI,DX
-	LEAQ 360(SP),DI
+	LEAQ 360(R12),DI
 	MOVQ R9,CX
 	REP; MOVSB
-	LEAQ 360(SP),DI
-	LEAQ 360(SP),SI
+	LEAQ 360(R12),DI
+	LEAQ 360(R12),SI
 	NOCOPY:
-	MOVQ R9,352(SP)
-	MOVOA 48(SP),X0
-	MOVOA 0(SP),X1
-	MOVOA 16(SP),X2
-	MOVOA 32(SP),X3
+	MOVQ R9,352(R12)
+	MOVOA 48(R12),X0
+	MOVOA 0(R12),X1
+	MOVOA 16(R12),X2
+	MOVOA 32(R12),X3
 	MOVOA X1,X4
 	MOVQ $20,CX
 	MAINLOOP2:
@@ -791,10 +789,10 @@
 	PSHUFL $0X39,X3,X3
 	PXOR X6,X0
 	JA MAINLOOP2
-	PADDL 48(SP),X0
-	PADDL 0(SP),X1
-	PADDL 16(SP),X2
-	PADDL 32(SP),X3
+	PADDL 48(R12),X0
+	PADDL 0(R12),X1
+	PADDL 16(R12),X2
+	PADDL 32(R12),X3
 	MOVD X0,CX
 	MOVD X1,R8
 	MOVD X2,R9
@@ -855,16 +853,16 @@
 	MOVL R8,44(DI)
 	MOVL R9,28(DI)
 	MOVL AX,12(DI)
-	MOVQ 352(SP),R9
-	MOVL 16(SP),CX
-	MOVL  36 (SP),R8
+	MOVQ 352(R12),R9
+	MOVL 16(R12),CX
+	MOVL  36 (R12),R8
 	ADDQ $1,CX
 	SHLQ $32,R8
 	ADDQ R8,CX
 	MOVQ CX,R8
 	SHRQ $32,R8
-	MOVL CX,16(SP)
-	MOVL R8, 36 (SP)
+	MOVL CX,16(R12)
+	MOVL R8, 36 (R12)
 	CMPQ R9,$64
 	JA BYTESATLEAST65
 	JAE BYTESATLEAST64
@@ -874,7 +872,6 @@
 	REP; MOVSB
 	BYTESATLEAST64:
 	DONE:
-	MOVQ R12,SP
 	RET
 	BYTESATLEAST65:
 	SUBQ $64,R9