blake2b: fix amd64 assembly not to smash SP

The assembly in all three routines used to save SP, round it up for
16- or 32-byte alignment, and restore it just before RET. Keep SP
fixed instead and hold the aligned address of the on-stack scratch
area in a spare register (DX in the AVX2 routine, R10 in the AVX and
SSE routines), addressing the temporaries relative to that register
rather than to SP.

For golang/go#44269.
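
Schematically, using the AVX2 routine's registers and 32-byte
alignment (the AVX and SSE routines are analogous with R10 and
16-byte alignment; the elided rounds are only sketched in comments):

	// Old pattern: align SP in place, restore it before returning.
	MOVQ SP, DX          // save the original SP
	MOVQ SP, R9
	ADDQ $31, R9
	ANDQ $~31, R9        // round up to a 32-byte boundary
	MOVQ R9, SP          // SP now moved within the frame
	// ... rounds spill to 0(SP), 32(SP), ...
	MOVQ DX, SP          // restore SP
	RET

	// New pattern: leave SP alone, keep the aligned pointer in DX.
	MOVQ SP, DX
	ADDQ $31, DX
	ANDQ $~31, DX        // DX = 32-byte-aligned scratch base in the frame
	// ... rounds spill to 0(DX), 32(DX), ...
	RET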

Change-Id: I7e405afd0b55c96ce0a4c6058ba01e8be1173a8c
Reviewed-on: https://go-review.googlesource.com/c/crypto/+/292051
Trust: Russ Cox <rsc@golang.org>
Trust: Jason A. Donenfeld <Jason@zx2c4.com>
Reviewed-by: Jason A. Donenfeld <Jason@zx2c4.com>
diff --git a/blake2b/blake2bAVX2_amd64.s b/blake2b/blake2bAVX2_amd64.s
index 8608a7f..a78ab3b 100644
--- a/blake2b/blake2bAVX2_amd64.s
+++ b/blake2b/blake2bAVX2_amd64.s
@@ -282,14 +282,12 @@
 	MOVQ blocks_len+32(FP), DI
 
 	MOVQ SP, DX
-	MOVQ SP, R9
-	ADDQ $31, R9
-	ANDQ $~31, R9
-	MOVQ R9, SP
+	ADDQ $31, DX
+	ANDQ $~31, DX
 
-	MOVQ CX, 16(SP)
+	MOVQ CX, 16(DX)
 	XORQ CX, CX
-	MOVQ CX, 24(SP)
+	MOVQ CX, 24(DX)
 
 	VMOVDQU ·AVX2_c40<>(SB), Y4
 	VMOVDQU ·AVX2_c48<>(SB), Y5
@@ -301,33 +299,33 @@
 
 	MOVQ 0(BX), R8
 	MOVQ 8(BX), R9
-	MOVQ R9, 8(SP)
+	MOVQ R9, 8(DX)
 
 loop:
 	ADDQ $128, R8
-	MOVQ R8, 0(SP)
+	MOVQ R8, 0(DX)
 	CMPQ R8, $128
 	JGE  noinc
 	INCQ R9
-	MOVQ R9, 8(SP)
+	MOVQ R9, 8(DX)
 
 noinc:
 	VMOVDQA Y8, Y0
 	VMOVDQA Y9, Y1
 	VMOVDQA Y6, Y2
-	VPXOR   0(SP), Y7, Y3
+	VPXOR   0(DX), Y7, Y3
 
 	LOAD_MSG_AVX2_0_2_4_6_1_3_5_7_8_10_12_14_9_11_13_15()
-	VMOVDQA Y12, 32(SP)
-	VMOVDQA Y13, 64(SP)
-	VMOVDQA Y14, 96(SP)
-	VMOVDQA Y15, 128(SP)
+	VMOVDQA Y12, 32(DX)
+	VMOVDQA Y13, 64(DX)
+	VMOVDQA Y14, 96(DX)
+	VMOVDQA Y15, 128(DX)
 	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
 	LOAD_MSG_AVX2_14_4_9_13_10_8_15_6_1_0_11_5_12_2_7_3()
-	VMOVDQA Y12, 160(SP)
-	VMOVDQA Y13, 192(SP)
-	VMOVDQA Y14, 224(SP)
-	VMOVDQA Y15, 256(SP)
+	VMOVDQA Y12, 160(DX)
+	VMOVDQA Y13, 192(DX)
+	VMOVDQA Y14, 224(DX)
+	VMOVDQA Y15, 256(DX)
 
 	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
 	LOAD_MSG_AVX2_11_12_5_15_8_0_2_13_10_3_7_9_14_6_1_4()
@@ -347,8 +345,8 @@
 	LOAD_MSG_AVX2_10_8_7_1_2_4_6_5_15_9_3_13_11_14_12_0()
 	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
 
-	ROUND_AVX2(32(SP), 64(SP), 96(SP), 128(SP), Y10, Y4, Y5)
-	ROUND_AVX2(160(SP), 192(SP), 224(SP), 256(SP), Y10, Y4, Y5)
+	ROUND_AVX2(32(DX), 64(DX), 96(DX), 128(DX), Y10, Y4, Y5)
+	ROUND_AVX2(160(DX), 192(DX), 224(DX), 256(DX), Y10, Y4, Y5)
 
 	VPXOR Y0, Y8, Y8
 	VPXOR Y1, Y9, Y9
@@ -366,7 +364,6 @@
 	VMOVDQU Y9, 32(AX)
 	VZEROUPPER
 
-	MOVQ DX, SP
 	RET
 
 #define VPUNPCKLQDQ_X2_X2_X15 BYTE $0xC5; BYTE $0x69; BYTE $0x6C; BYTE $0xFA
@@ -584,11 +581,9 @@
 	MOVQ blocks_base+24(FP), SI
 	MOVQ blocks_len+32(FP), DI
 
-	MOVQ SP, BP
-	MOVQ SP, R9
-	ADDQ $15, R9
-	ANDQ $~15, R9
-	MOVQ R9, SP
+	MOVQ SP, R10
+	ADDQ $15, R10
+	ANDQ $~15, R10
 
 	VMOVDQU ·AVX_c40<>(SB), X0
 	VMOVDQU ·AVX_c48<>(SB), X1
@@ -596,8 +591,8 @@
 	VMOVDQA X1, X9
 
 	VMOVDQU ·AVX_iv3<>(SB), X0
-	VMOVDQA X0, 0(SP)
-	XORQ    CX, 0(SP)          // 0(SP) = ·AVX_iv3 ^ (CX || 0)
+	VMOVDQA X0, 0(R10)
+	XORQ    CX, 0(R10)          // 0(R10) = ·AVX_iv3 ^ (CX || 0)
 
 	VMOVDQU 0(AX), X10
 	VMOVDQU 16(AX), X11
@@ -624,35 +619,35 @@
 	VMOVDQU ·AVX_iv2<>(SB), X6
 
 	VPXOR   X15, X6, X6
-	VMOVDQA 0(SP), X7
+	VMOVDQA 0(R10), X7
 
 	LOAD_MSG_AVX_0_2_4_6_1_3_5_7()
-	VMOVDQA X12, 16(SP)
-	VMOVDQA X13, 32(SP)
-	VMOVDQA X14, 48(SP)
-	VMOVDQA X15, 64(SP)
+	VMOVDQA X12, 16(R10)
+	VMOVDQA X13, 32(R10)
+	VMOVDQA X14, 48(R10)
+	VMOVDQA X15, 64(R10)
 	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
 	SHUFFLE_AVX()
 	LOAD_MSG_AVX(8, 10, 12, 14, 9, 11, 13, 15)
-	VMOVDQA X12, 80(SP)
-	VMOVDQA X13, 96(SP)
-	VMOVDQA X14, 112(SP)
-	VMOVDQA X15, 128(SP)
+	VMOVDQA X12, 80(R10)
+	VMOVDQA X13, 96(R10)
+	VMOVDQA X14, 112(R10)
+	VMOVDQA X15, 128(R10)
 	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
 	SHUFFLE_AVX_INV()
 
 	LOAD_MSG_AVX(14, 4, 9, 13, 10, 8, 15, 6)
-	VMOVDQA X12, 144(SP)
-	VMOVDQA X13, 160(SP)
-	VMOVDQA X14, 176(SP)
-	VMOVDQA X15, 192(SP)
+	VMOVDQA X12, 144(R10)
+	VMOVDQA X13, 160(R10)
+	VMOVDQA X14, 176(R10)
+	VMOVDQA X15, 192(R10)
 	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
 	SHUFFLE_AVX()
 	LOAD_MSG_AVX_1_0_11_5_12_2_7_3()
-	VMOVDQA X12, 208(SP)
-	VMOVDQA X13, 224(SP)
-	VMOVDQA X14, 240(SP)
-	VMOVDQA X15, 256(SP)
+	VMOVDQA X12, 208(R10)
+	VMOVDQA X13, 224(R10)
+	VMOVDQA X14, 240(R10)
+	VMOVDQA X15, 256(R10)
 	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
 	SHUFFLE_AVX_INV()
 
@@ -712,14 +707,14 @@
 	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
 	SHUFFLE_AVX_INV()
 
-	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 16(SP), 32(SP), 48(SP), 64(SP), X15, X8, X9)
+	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 16(R10), 32(R10), 48(R10), 64(R10), X15, X8, X9)
 	SHUFFLE_AVX()
-	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 80(SP), 96(SP), 112(SP), 128(SP), X15, X8, X9)
+	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 80(R10), 96(R10), 112(R10), 128(R10), X15, X8, X9)
 	SHUFFLE_AVX_INV()
 
-	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 144(SP), 160(SP), 176(SP), 192(SP), X15, X8, X9)
+	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 144(R10), 160(R10), 176(R10), 192(R10), X15, X8, X9)
 	SHUFFLE_AVX()
-	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 208(SP), 224(SP), 240(SP), 256(SP), X15, X8, X9)
+	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 208(R10), 224(R10), 240(R10), 256(R10), X15, X8, X9)
 	SHUFFLE_AVX_INV()
 
 	VMOVDQU 32(AX), X14
@@ -746,5 +741,4 @@
 	MOVQ R9, 8(BX)
 	VZEROUPPER
 
-	MOVQ BP, SP
 	RET
diff --git a/blake2b/blake2b_amd64.s b/blake2b/blake2b_amd64.s
index 1f4c6a9..bb72a03 100644
--- a/blake2b/blake2b_amd64.s
+++ b/blake2b/blake2b_amd64.s
@@ -118,15 +118,13 @@
 	MOVQ blocks_base+24(FP), SI
 	MOVQ blocks_len+32(FP), DI
 
-	MOVQ SP, BP
-	MOVQ SP, R9
-	ADDQ $15, R9
-	ANDQ $~15, R9
-	MOVQ R9, SP
+	MOVQ SP, R10
+	ADDQ $15, R10
+	ANDQ $~15, R10
 
 	MOVOU ·iv3<>(SB), X0
-	MOVO  X0, 0(SP)
-	XORQ  CX, 0(SP)     // 0(SP) = ·iv3 ^ (CX || 0)
+	MOVO  X0, 0(R10)
+	XORQ  CX, 0(R10)     // 0(R10) = ·iv3 ^ (CX || 0)
 
 	MOVOU ·c40<>(SB), X13
 	MOVOU ·c48<>(SB), X14
@@ -156,35 +154,35 @@
 	MOVOU ·iv2<>(SB), X6
 
 	PXOR X8, X6
-	MOVO 0(SP), X7
+	MOVO 0(R10), X7
 
 	LOAD_MSG(X8, X9, X10, X11, SI, 0, 2, 4, 6, 1, 3, 5, 7)
-	MOVO X8, 16(SP)
-	MOVO X9, 32(SP)
-	MOVO X10, 48(SP)
-	MOVO X11, 64(SP)
+	MOVO X8, 16(R10)
+	MOVO X9, 32(R10)
+	MOVO X10, 48(R10)
+	MOVO X11, 64(R10)
 	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
 	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
 	LOAD_MSG(X8, X9, X10, X11, SI, 8, 10, 12, 14, 9, 11, 13, 15)
-	MOVO X8, 80(SP)
-	MOVO X9, 96(SP)
-	MOVO X10, 112(SP)
-	MOVO X11, 128(SP)
+	MOVO X8, 80(R10)
+	MOVO X9, 96(R10)
+	MOVO X10, 112(R10)
+	MOVO X11, 128(R10)
 	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
 	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
 
 	LOAD_MSG(X8, X9, X10, X11, SI, 14, 4, 9, 13, 10, 8, 15, 6)
-	MOVO X8, 144(SP)
-	MOVO X9, 160(SP)
-	MOVO X10, 176(SP)
-	MOVO X11, 192(SP)
+	MOVO X8, 144(R10)
+	MOVO X9, 160(R10)
+	MOVO X10, 176(R10)
+	MOVO X11, 192(R10)
 	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
 	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
 	LOAD_MSG(X8, X9, X10, X11, SI, 1, 0, 11, 5, 12, 2, 7, 3)
-	MOVO X8, 208(SP)
-	MOVO X9, 224(SP)
-	MOVO X10, 240(SP)
-	MOVO X11, 256(SP)
+	MOVO X8, 208(R10)
+	MOVO X9, 224(R10)
+	MOVO X10, 240(R10)
+	MOVO X11, 256(R10)
 	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
 	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
 
@@ -244,14 +242,14 @@
 	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
 	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
 
-	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 16(SP), 32(SP), 48(SP), 64(SP), X11, X13, X14)
+	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 16(R10), 32(R10), 48(R10), 64(R10), X11, X13, X14)
 	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
-	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 80(SP), 96(SP), 112(SP), 128(SP), X11, X13, X14)
+	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 80(R10), 96(R10), 112(R10), 128(R10), X11, X13, X14)
 	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
 
-	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 144(SP), 160(SP), 176(SP), 192(SP), X11, X13, X14)
+	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 144(R10), 160(R10), 176(R10), 192(R10), X11, X13, X14)
 	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
-	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 208(SP), 224(SP), 240(SP), 256(SP), X11, X13, X14)
+	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 208(R10), 224(R10), 240(R10), 256(R10), X11, X13, X14)
 	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
 
 	MOVOU 32(AX), X10
@@ -277,5 +275,4 @@
 	MOVQ R8, 0(BX)
 	MOVQ R9, 8(BX)
 
-	MOVQ BP, SP
 	RET