blake2s: fix amd64 assembly not to smash SP
For golang/go#44269.
Change-Id: I877a8056dbd8ab1dedadb562aa1b3d9e1e0d55da
Reviewed-on: https://go-review.googlesource.com/c/crypto/+/292049
Trust: Russ Cox <rsc@golang.org>
Trust: Jason A. Donenfeld <Jason@zx2c4.com>
Reviewed-by: Jason A. Donenfeld <Jason@zx2c4.com>
diff --git a/blake2s/blake2s_amd64.s b/blake2s/blake2s_amd64.s
index b905944..9b8824f 100644
--- a/blake2s/blake2s_amd64.s
+++ b/blake2s/blake2s_amd64.s
@@ -317,30 +317,30 @@
MOVL R15, 8*4+off+576(dst)
#define BLAKE2s_SSE2() \
- PRECOMPUTE_MSG(SP, 16, SI, R8, R9, R10, R11, R12, R13, R14, R15); \
- ROUND_SSE2(X4, X5, X6, X7, 16(SP), 32(SP), 48(SP), 64(SP), X8); \
- ROUND_SSE2(X4, X5, X6, X7, 16+64(SP), 32+64(SP), 48+64(SP), 64+64(SP), X8); \
- ROUND_SSE2(X4, X5, X6, X7, 16+128(SP), 32+128(SP), 48+128(SP), 64+128(SP), X8); \
- ROUND_SSE2(X4, X5, X6, X7, 16+192(SP), 32+192(SP), 48+192(SP), 64+192(SP), X8); \
- ROUND_SSE2(X4, X5, X6, X7, 16+256(SP), 32+256(SP), 48+256(SP), 64+256(SP), X8); \
- ROUND_SSE2(X4, X5, X6, X7, 16+320(SP), 32+320(SP), 48+320(SP), 64+320(SP), X8); \
- ROUND_SSE2(X4, X5, X6, X7, 16+384(SP), 32+384(SP), 48+384(SP), 64+384(SP), X8); \
- ROUND_SSE2(X4, X5, X6, X7, 16+448(SP), 32+448(SP), 48+448(SP), 64+448(SP), X8); \
- ROUND_SSE2(X4, X5, X6, X7, 16+512(SP), 32+512(SP), 48+512(SP), 64+512(SP), X8); \
- ROUND_SSE2(X4, X5, X6, X7, 16+576(SP), 32+576(SP), 48+576(SP), 64+576(SP), X8)
+ PRECOMPUTE_MSG(BP, 16, SI, R8, R9, R10, R11, R12, R13, R14, R15); \
+ ROUND_SSE2(X4, X5, X6, X7, 16(BP), 32(BP), 48(BP), 64(BP), X8); \
+ ROUND_SSE2(X4, X5, X6, X7, 16+64(BP), 32+64(BP), 48+64(BP), 64+64(BP), X8); \
+ ROUND_SSE2(X4, X5, X6, X7, 16+128(BP), 32+128(BP), 48+128(BP), 64+128(BP), X8); \
+ ROUND_SSE2(X4, X5, X6, X7, 16+192(BP), 32+192(BP), 48+192(BP), 64+192(BP), X8); \
+ ROUND_SSE2(X4, X5, X6, X7, 16+256(BP), 32+256(BP), 48+256(BP), 64+256(BP), X8); \
+ ROUND_SSE2(X4, X5, X6, X7, 16+320(BP), 32+320(BP), 48+320(BP), 64+320(BP), X8); \
+ ROUND_SSE2(X4, X5, X6, X7, 16+384(BP), 32+384(BP), 48+384(BP), 64+384(BP), X8); \
+ ROUND_SSE2(X4, X5, X6, X7, 16+448(BP), 32+448(BP), 48+448(BP), 64+448(BP), X8); \
+ ROUND_SSE2(X4, X5, X6, X7, 16+512(BP), 32+512(BP), 48+512(BP), 64+512(BP), X8); \
+ ROUND_SSE2(X4, X5, X6, X7, 16+576(BP), 32+576(BP), 48+576(BP), 64+576(BP), X8)
#define BLAKE2s_SSSE3() \
- PRECOMPUTE_MSG(SP, 16, SI, R8, R9, R10, R11, R12, R13, R14, R15); \
- ROUND_SSSE3(X4, X5, X6, X7, 16(SP), 32(SP), 48(SP), 64(SP), X8, X13, X14); \
- ROUND_SSSE3(X4, X5, X6, X7, 16+64(SP), 32+64(SP), 48+64(SP), 64+64(SP), X8, X13, X14); \
- ROUND_SSSE3(X4, X5, X6, X7, 16+128(SP), 32+128(SP), 48+128(SP), 64+128(SP), X8, X13, X14); \
- ROUND_SSSE3(X4, X5, X6, X7, 16+192(SP), 32+192(SP), 48+192(SP), 64+192(SP), X8, X13, X14); \
- ROUND_SSSE3(X4, X5, X6, X7, 16+256(SP), 32+256(SP), 48+256(SP), 64+256(SP), X8, X13, X14); \
- ROUND_SSSE3(X4, X5, X6, X7, 16+320(SP), 32+320(SP), 48+320(SP), 64+320(SP), X8, X13, X14); \
- ROUND_SSSE3(X4, X5, X6, X7, 16+384(SP), 32+384(SP), 48+384(SP), 64+384(SP), X8, X13, X14); \
- ROUND_SSSE3(X4, X5, X6, X7, 16+448(SP), 32+448(SP), 48+448(SP), 64+448(SP), X8, X13, X14); \
- ROUND_SSSE3(X4, X5, X6, X7, 16+512(SP), 32+512(SP), 48+512(SP), 64+512(SP), X8, X13, X14); \
- ROUND_SSSE3(X4, X5, X6, X7, 16+576(SP), 32+576(SP), 48+576(SP), 64+576(SP), X8, X13, X14)
+ PRECOMPUTE_MSG(BP, 16, SI, R8, R9, R10, R11, R12, R13, R14, R15); \
+ ROUND_SSSE3(X4, X5, X6, X7, 16(BP), 32(BP), 48(BP), 64(BP), X8, X13, X14); \
+ ROUND_SSSE3(X4, X5, X6, X7, 16+64(BP), 32+64(BP), 48+64(BP), 64+64(BP), X8, X13, X14); \
+ ROUND_SSSE3(X4, X5, X6, X7, 16+128(BP), 32+128(BP), 48+128(BP), 64+128(BP), X8, X13, X14); \
+ ROUND_SSSE3(X4, X5, X6, X7, 16+192(BP), 32+192(BP), 48+192(BP), 64+192(BP), X8, X13, X14); \
+ ROUND_SSSE3(X4, X5, X6, X7, 16+256(BP), 32+256(BP), 48+256(BP), 64+256(BP), X8, X13, X14); \
+ ROUND_SSSE3(X4, X5, X6, X7, 16+320(BP), 32+320(BP), 48+320(BP), 64+320(BP), X8, X13, X14); \
+ ROUND_SSSE3(X4, X5, X6, X7, 16+384(BP), 32+384(BP), 48+384(BP), 64+384(BP), X8, X13, X14); \
+ ROUND_SSSE3(X4, X5, X6, X7, 16+448(BP), 32+448(BP), 48+448(BP), 64+448(BP), X8, X13, X14); \
+ ROUND_SSSE3(X4, X5, X6, X7, 16+512(BP), 32+512(BP), 48+512(BP), 64+512(BP), X8, X13, X14); \
+ ROUND_SSSE3(X4, X5, X6, X7, 16+576(BP), 32+576(BP), 48+576(BP), 64+576(BP), X8, X13, X14)
#define BLAKE2s_SSE4() \
LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15); \
@@ -372,16 +372,12 @@
MOVQ blocks_len, DX; \
\
MOVQ SP, BP; \
- MOVQ SP, R9; \
- ADDQ $15, R9; \
- ANDQ $~15, R9; \
- MOVQ R9, SP; \
+ ADDQ $15, BP; \
+ ANDQ $~15, BP; \
\
MOVQ 0(BX), R9; \
- MOVQ R9, 0(SP); \
- XORQ R9, R9; \
- MOVQ R9, 8(SP); \
- MOVL CX, 8(SP); \
+ MOVQ R9, 0(BP); \
+ MOVQ CX, 8(BP); \
\
MOVOU 0(AX), X0; \
MOVOU 16(AX), X1; \
@@ -391,7 +387,7 @@
MOVOU counter<>(SB), X12; \
MOVOU rol16<>(SB), X13; \
MOVOU rol8<>(SB), X14; \
- MOVO 0(SP), X15; \
+ MOVO 0(BP), X15; \
\
loop: \
MOVO X0, X4; \
@@ -413,14 +409,12 @@
SUBQ $64, DX; \
JNE loop; \
\
- MOVO X15, 0(SP); \
- MOVQ 0(SP), R9; \
+ MOVO X15, 0(BP); \
+ MOVQ 0(BP), R9; \
MOVQ R9, 0(BX); \
\
MOVOU X0, 0(AX); \
- MOVOU X1, 16(AX); \
- \
- MOVQ BP, SP
+ MOVOU X1, 16(AX)
// func hashBlocksSSE2(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte)
TEXT ·hashBlocksSSE2(SB), 0, $672-48 // frame = 656 + 16 byte alignment