argon2: avoid clobbering BP

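go vet was reporting the following problems in mixBlocksSSE2 and
xorBlocksSSE2. Fix them by referring to the argument at offset 24 as
c+24(FP) and by using DI instead of BP as the loop counter: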

  blamka_amd64.s:203:1: [amd64] mixBlocksSSE2: invalid offset a+24(FP); expected a+8(FP)
  blamka_amd64.s:226:1: [amd64] xorBlocksSSE2: invalid offset a+24(FP); expected a+8(FP)
  blamka_amd64.s:204:1: frame pointer is clobbered before saving
  blamka_amd64.s:227:1: frame pointer is clobbered before saving

Also fix a similar argument naming issue in sha3, where the assembly
referred to the argument as state instead of a:

  sha3\keccakf_amd64.s:325:1: [amd64] keccakF1600: unknown variable state; offset 0 is a+0(FP)

Updates golang/go#47027

Change-Id: Ia74852cdb0721ae0216787054197b0cac9e1c0f8
Reviewed-on: https://go-review.googlesource.com/c/crypto/+/332289
Reviewed-by: Michael Knyszek <mknyszek@google.com>
Reviewed-by: Nicola Murino <nicola.murino@gmail.com>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Auto-Submit: Dmitri Shuralyov <dmitshur@golang.org>
Reviewed-by: Filippo Valsorda <filippo@golang.org>
diff --git a/argon2/blamka_amd64.s b/argon2/blamka_amd64.s
index f3b653a..6713acc 100644
--- a/argon2/blamka_amd64.s
+++ b/argon2/blamka_amd64.s
@@ -199,8 +199,8 @@
 	MOVQ out+0(FP), DX
 	MOVQ a+8(FP), AX
 	MOVQ b+16(FP), BX
-	MOVQ a+24(FP), CX
-	MOVQ $128, BP
+	MOVQ c+24(FP), CX
+	MOVQ $128, DI
 
 loop:
 	MOVOU 0(AX), X0
@@ -213,7 +213,7 @@
 	ADDQ  $16, BX
 	ADDQ  $16, CX
 	ADDQ  $16, DX
-	SUBQ  $2, BP
+	SUBQ  $2, DI
 	JA    loop
 	RET
 
@@ -222,8 +222,8 @@
 	MOVQ out+0(FP), DX
 	MOVQ a+8(FP), AX
 	MOVQ b+16(FP), BX
-	MOVQ a+24(FP), CX
-	MOVQ $128, BP
+	MOVQ c+24(FP), CX
+	MOVQ $128, DI
 
 loop:
 	MOVOU 0(AX), X0
@@ -238,6 +238,6 @@
 	ADDQ  $16, BX
 	ADDQ  $16, CX
 	ADDQ  $16, DX
-	SUBQ  $2, BP
+	SUBQ  $2, DI
 	JA    loop
 	RET
diff --git a/sha3/keccakf_amd64.s b/sha3/keccakf_amd64.s
index 8fb26ae..1f53938 100644
--- a/sha3/keccakf_amd64.s
+++ b/sha3/keccakf_amd64.s
@@ -319,9 +319,9 @@
 	MOVQ rDi, _si(oState); \
 	MOVQ rDo, _so(oState)  \
 
-// func keccakF1600(state *[25]uint64)
+// func keccakF1600(a *[25]uint64)
 TEXT ·keccakF1600(SB), 0, $200-8
-	MOVQ state+0(FP), rpState
+	MOVQ a+0(FP), rpState
 
 	// Convert the user state into an internal state
 	NOTQ _be(rpState)