blake2s: fix 386 assembly not to smash SP

The 32-bit SSE2 and SSSE3 cores saved SP in BP, rounded SP itself up
to a 16-byte boundary to address their scratch space, and restored SP
from BP before RET. An assembly function that moves SP like this
cannot be traced back reliably by the runtime. Leave SP alone
instead: compute the aligned scratch pointer in DI and rewrite every
scratch reference from off(SP) to off(DI).

For golang/go#44269.
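
The alignment trick itself is unchanged: ADDL $15, DI / ANDL $~15, DI
rounds DI up to the next 16-byte boundary. The same computation in Go,
as an illustrative sketch (alignUp is a made-up name, not part of the
package):

	// alignUp rounds p up to the next multiple of align, which
	// must be a power of two. The assembly uses align = 16 so
	// that the MOVO/MOVOU scratch slots stay 16-byte aligned.
	func alignUp(p, align uintptr) uintptr {
		return (p + align - 1) &^ (align - 1)
	}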

Change-Id: I92e168674612af390bcb80a0579df5c777c26970
Reviewed-on: https://go-review.googlesource.com/c/crypto/+/292052
Trust: Russ Cox <rsc@golang.org>
Trust: Jason A. Donenfeld <Jason@zx2c4.com>
Reviewed-by: Jason A. Donenfeld <Jason@zx2c4.com>
diff --git a/blake2s/blake2s_386.s b/blake2s/blake2s_386.s
index 023532b..82894e5 100644
--- a/blake2s/blake2s_386.s
+++ b/blake2s/blake2s_386.s
@@ -297,19 +297,17 @@
 	MOVL blocks_base+12(FP), SI
 	MOVL blocks_len+16(FP), DX
 
-	MOVL SP, BP
 	MOVL SP, DI
 	ADDL $15, DI
 	ANDL $~15, DI
-	MOVL DI, SP
 
-	MOVL CX, 8(SP)
+	MOVL CX, 8(DI)
 	MOVL 0(BX), CX
-	MOVL CX, 0(SP)
+	MOVL CX, 0(DI)
 	MOVL 4(BX), CX
-	MOVL CX, 4(SP)
+	MOVL CX, 4(DI)
 	XORL CX, CX
-	MOVL CX, 12(SP)
+	MOVL CX, 12(DI)
 
 	MOVOU 0(AX), X0
 	MOVOU 16(AX), X1
@@ -321,22 +319,22 @@
 	MOVOU iv0<>(SB), X6
 	MOVOU iv1<>(SB), X7
 
-	MOVO  0(SP), X3
+	MOVO  0(DI), X3
 	PADDQ X2, X3
 	PXOR  X3, X7
-	MOVO  X3, 0(SP)
+	MOVO  X3, 0(DI)
 
-	PRECOMPUTE(SP, 16, SI, CX)
-	ROUND_SSE2(X4, X5, X6, X7, 16(SP), 32(SP), 48(SP), 64(SP), X3)
-	ROUND_SSE2(X4, X5, X6, X7, 16+64(SP), 32+64(SP), 48+64(SP), 64+64(SP), X3)
-	ROUND_SSE2(X4, X5, X6, X7, 16+128(SP), 32+128(SP), 48+128(SP), 64+128(SP), X3)
-	ROUND_SSE2(X4, X5, X6, X7, 16+192(SP), 32+192(SP), 48+192(SP), 64+192(SP), X3)
-	ROUND_SSE2(X4, X5, X6, X7, 16+256(SP), 32+256(SP), 48+256(SP), 64+256(SP), X3)
-	ROUND_SSE2(X4, X5, X6, X7, 16+320(SP), 32+320(SP), 48+320(SP), 64+320(SP), X3)
-	ROUND_SSE2(X4, X5, X6, X7, 16+384(SP), 32+384(SP), 48+384(SP), 64+384(SP), X3)
-	ROUND_SSE2(X4, X5, X6, X7, 16+448(SP), 32+448(SP), 48+448(SP), 64+448(SP), X3)
-	ROUND_SSE2(X4, X5, X6, X7, 16+512(SP), 32+512(SP), 48+512(SP), 64+512(SP), X3)
-	ROUND_SSE2(X4, X5, X6, X7, 16+576(SP), 32+576(SP), 48+576(SP), 64+576(SP), X3)
+	PRECOMPUTE(DI, 16, SI, CX)
+	ROUND_SSE2(X4, X5, X6, X7, 16(DI), 32(DI), 48(DI), 64(DI), X3)
+	ROUND_SSE2(X4, X5, X6, X7, 16+64(DI), 32+64(DI), 48+64(DI), 64+64(DI), X3)
+	ROUND_SSE2(X4, X5, X6, X7, 16+128(DI), 32+128(DI), 48+128(DI), 64+128(DI), X3)
+	ROUND_SSE2(X4, X5, X6, X7, 16+192(DI), 32+192(DI), 48+192(DI), 64+192(DI), X3)
+	ROUND_SSE2(X4, X5, X6, X7, 16+256(DI), 32+256(DI), 48+256(DI), 64+256(DI), X3)
+	ROUND_SSE2(X4, X5, X6, X7, 16+320(DI), 32+320(DI), 48+320(DI), 64+320(DI), X3)
+	ROUND_SSE2(X4, X5, X6, X7, 16+384(DI), 32+384(DI), 48+384(DI), 64+384(DI), X3)
+	ROUND_SSE2(X4, X5, X6, X7, 16+448(DI), 32+448(DI), 48+448(DI), 64+448(DI), X3)
+	ROUND_SSE2(X4, X5, X6, X7, 16+512(DI), 32+512(DI), 48+512(DI), 64+512(DI), X3)
+	ROUND_SSE2(X4, X5, X6, X7, 16+576(DI), 32+576(DI), 48+576(DI), 64+576(DI), X3)
 
 	PXOR X4, X0
 	PXOR X5, X1
@@ -347,15 +345,14 @@
 	SUBL $64, DX
 	JNE  loop
 
-	MOVL 0(SP), CX
+	MOVL 0(DI), CX
 	MOVL CX, 0(BX)
-	MOVL 4(SP), CX
+	MOVL 4(DI), CX
 	MOVL CX, 4(BX)
 
 	MOVOU X0, 0(AX)
 	MOVOU X1, 16(AX)
 
-	MOVL BP, SP
 	RET
 
 // func hashBlocksSSSE3(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte)
@@ -366,54 +363,52 @@
 	MOVL blocks_base+12(FP), SI
 	MOVL blocks_len+16(FP), DX
 
-	MOVL SP, BP
 	MOVL SP, DI
 	ADDL $15, DI
 	ANDL $~15, DI
-	MOVL DI, SP
 
-	MOVL CX, 8(SP)
+	MOVL CX, 8(DI)
 	MOVL 0(BX), CX
-	MOVL CX, 0(SP)
+	MOVL CX, 0(DI)
 	MOVL 4(BX), CX
-	MOVL CX, 4(SP)
+	MOVL CX, 4(DI)
 	XORL CX, CX
-	MOVL CX, 12(SP)
+	MOVL CX, 12(DI)
 
 	MOVOU 0(AX), X0
 	MOVOU 16(AX), X1
 	MOVOU counter<>(SB), X2
 
 loop:
-	MOVO  X0, 656(SP)
-	MOVO  X1, 672(SP)
+	MOVO  X0, 656(DI)
+	MOVO  X1, 672(DI)
 	MOVO  X0, X4
 	MOVO  X1, X5
 	MOVOU iv0<>(SB), X6
 	MOVOU iv1<>(SB), X7
 
-	MOVO  0(SP), X3
+	MOVO  0(DI), X3
 	PADDQ X2, X3
 	PXOR  X3, X7
-	MOVO  X3, 0(SP)
+	MOVO  X3, 0(DI)
 
 	MOVOU rol16<>(SB), X0
 	MOVOU rol8<>(SB), X1
 
-	PRECOMPUTE(SP, 16, SI, CX)
-	ROUND_SSSE3(X4, X5, X6, X7, 16(SP), 32(SP), 48(SP), 64(SP), X3, X0, X1)
-	ROUND_SSSE3(X4, X5, X6, X7, 16+64(SP), 32+64(SP), 48+64(SP), 64+64(SP), X3, X0, X1)
-	ROUND_SSSE3(X4, X5, X6, X7, 16+128(SP), 32+128(SP), 48+128(SP), 64+128(SP), X3, X0, X1)
-	ROUND_SSSE3(X4, X5, X6, X7, 16+192(SP), 32+192(SP), 48+192(SP), 64+192(SP), X3, X0, X1)
-	ROUND_SSSE3(X4, X5, X6, X7, 16+256(SP), 32+256(SP), 48+256(SP), 64+256(SP), X3, X0, X1)
-	ROUND_SSSE3(X4, X5, X6, X7, 16+320(SP), 32+320(SP), 48+320(SP), 64+320(SP), X3, X0, X1)
-	ROUND_SSSE3(X4, X5, X6, X7, 16+384(SP), 32+384(SP), 48+384(SP), 64+384(SP), X3, X0, X1)
-	ROUND_SSSE3(X4, X5, X6, X7, 16+448(SP), 32+448(SP), 48+448(SP), 64+448(SP), X3, X0, X1)
-	ROUND_SSSE3(X4, X5, X6, X7, 16+512(SP), 32+512(SP), 48+512(SP), 64+512(SP), X3, X0, X1)
-	ROUND_SSSE3(X4, X5, X6, X7, 16+576(SP), 32+576(SP), 48+576(SP), 64+576(SP), X3, X0, X1)
+	PRECOMPUTE(DI, 16, SI, CX)
+	ROUND_SSSE3(X4, X5, X6, X7, 16(DI), 32(DI), 48(DI), 64(DI), X3, X0, X1)
+	ROUND_SSSE3(X4, X5, X6, X7, 16+64(DI), 32+64(DI), 48+64(DI), 64+64(DI), X3, X0, X1)
+	ROUND_SSSE3(X4, X5, X6, X7, 16+128(DI), 32+128(DI), 48+128(DI), 64+128(DI), X3, X0, X1)
+	ROUND_SSSE3(X4, X5, X6, X7, 16+192(DI), 32+192(DI), 48+192(DI), 64+192(DI), X3, X0, X1)
+	ROUND_SSSE3(X4, X5, X6, X7, 16+256(DI), 32+256(DI), 48+256(DI), 64+256(DI), X3, X0, X1)
+	ROUND_SSSE3(X4, X5, X6, X7, 16+320(DI), 32+320(DI), 48+320(DI), 64+320(DI), X3, X0, X1)
+	ROUND_SSSE3(X4, X5, X6, X7, 16+384(DI), 32+384(DI), 48+384(DI), 64+384(DI), X3, X0, X1)
+	ROUND_SSSE3(X4, X5, X6, X7, 16+448(DI), 32+448(DI), 48+448(DI), 64+448(DI), X3, X0, X1)
+	ROUND_SSSE3(X4, X5, X6, X7, 16+512(DI), 32+512(DI), 48+512(DI), 64+512(DI), X3, X0, X1)
+	ROUND_SSSE3(X4, X5, X6, X7, 16+576(DI), 32+576(DI), 48+576(DI), 64+576(DI), X3, X0, X1)
 
-	MOVO 656(SP), X0
-	MOVO 672(SP), X1
+	MOVO 656(DI), X0
+	MOVO 672(DI), X1
 	PXOR X4, X0
 	PXOR X5, X1
 	PXOR X6, X0
@@ -423,13 +418,12 @@
 	SUBL $64, DX
 	JNE  loop
 
-	MOVL 0(SP), CX
+	MOVL 0(DI), CX
 	MOVL CX, 0(BX)
-	MOVL 4(SP), CX
+	MOVL 4(DI), CX
 	MOVL CX, 4(BX)
 
 	MOVOU X0, 0(AX)
 	MOVOU X1, 16(AX)
 
-	MOVL BP, SP
 	RET