blake2b: port blake2b_amd64.s to Avo

This implementation utilizes the same registers found in the reference
implementation, aiming to produce a minimal semantic diff between the
Avo-generated output and the original hand-written assembly.

To verify the Avo implementation, the reference and Avo-generated
assembly files are fed to `go tool asm`, capturing the debug output into
corresponding temp files. The debug output contains supplementary
metadata (line numbers, instruction offsets, and source file references)
that must be removed in order to obtain a semantic diff of the two
files. This is accomplished via a small utility script written in awk.

Commands used to verify Avo output:

GOROOT=$(go env GOROOT)
ASM_PATH="blake2b/blake2b_amd64.s"
REFERENCE="b2d3a6a4b4d36521cd7f653879cf6981e7c5c340"

go tool asm -o /dev/null -I "$GOROOT"/src/runtime -debug \
  <(git cat-file -p "$REFERENCE:$ASM_PATH") \
  > /tmp/reference.s

go tool asm -o /dev/null -I "$GOROOT"/src/runtime -debug \
  "$ASM_PATH" \
  > /tmp/avo.s

normalize(){
  awk '{
    $1=$2=$3="";
    print substr($0,4)
  }'
}

diff <(normalize < /tmp/reference.s) <(normalize < /tmp/avo.s)

Change-Id: I6dd59fb0b0365674aa5e43b69a57ea60fbcc4ba1
Reviewed-on: https://go-review.googlesource.com/c/crypto/+/600456
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
Reviewed-by: Roland Shoemaker <roland@golang.org>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Filippo Valsorda <filippo@golang.org>
diff --git a/blake2b/_asm/standard/blake2b_amd64_asm.go b/blake2b/_asm/standard/blake2b_amd64_asm.go
new file mode 100644
index 0000000..a34db3f
--- /dev/null
+++ b/blake2b/_asm/standard/blake2b_amd64_asm.go
@@ -0,0 +1,361 @@
+// Copyright 2024 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+import (
+	. "github.com/mmcloughlin/avo/build"
+	. "github.com/mmcloughlin/avo/operand"
+	. "github.com/mmcloughlin/avo/reg"
+	_ "golang.org/x/crypto/blake2b"
+)
+
+//go:generate go run . -out ../../blake2b_amd64.s -pkg blake2b
+
+const ThatPeskyUnicodeDot = "\u00b7" // U+00B7 middle dot; prepended to the name of every data symbol declared by the *_DATA functions in this file
+
+var iv0_DATA_ptr, iv1_DATA_ptr, iv2_DATA_ptr, iv3_DATA_ptr, c40_DATA_ptr, c48_DATA_ptr *Mem // cached operands; each stays nil until the matching *_DATA function has declared its symbol
+
+func main() { // generator entry point: describes hashBlocksSSE4 and hands the result to Avo
+	Package("golang.org/x/crypto/blake2b")
+	ConstraintExpr("amd64,gc,!purego") // shows up as "//go:build amd64 && gc && !purego" in the generated file
+	hashBlocksSSE4()
+	Generate() // presumably honours the -out/-pkg flags passed by the go:generate line — see the Avo build docs
+}
+
+func SHUFFLE(v2, v3, v4, v5, v6, v7, t1, t2 VecPhysical) { // emits the 15 instructions of the reference SHUFFLE macro, in the same order; t1 and t2 are scratch registers
+	MOVO(v4, t1)
+	MOVO(v5, v4)
+	MOVO(t1, v5) // v4 and v5 are now exchanged (via t1)
+	MOVO(v6, t1)
+	PUNPCKLQDQ(v6, t2)
+	PUNPCKHQDQ(v7, v6)
+	PUNPCKHQDQ(t2, v6)
+	PUNPCKLQDQ(v7, t2)
+	MOVO(t1, v7)
+	MOVO(v2, t1)
+	PUNPCKHQDQ(t2, v7) // the four 64-bit words held in v6:v7 are now rotated by one word position
+	PUNPCKLQDQ(v3, t2)
+	PUNPCKHQDQ(t2, v2)
+	PUNPCKLQDQ(t1, t2)
+	PUNPCKHQDQ(t2, v3) // v2:v3 are rotated by one word position in the opposite direction; presumably the BLAKE2b diagonalisation step — confirm against the spec
+}
+
+func SHUFFLE_INV(v2, v3, v4, v5, v6, v7, t1, t2 VecPhysical) { // same instruction sequence as SHUFFLE with the roles of v2:v3 and v6:v7 exchanged; t1 and t2 are scratch registers
+	MOVO(v4, t1)
+	MOVO(v5, v4)
+	MOVO(t1, v5) // v4 and v5 are now exchanged (via t1)
+	MOVO(v2, t1)
+	PUNPCKLQDQ(v2, t2)
+	PUNPCKHQDQ(v3, v2)
+	PUNPCKHQDQ(t2, v2)
+	PUNPCKLQDQ(v3, t2)
+	MOVO(t1, v3)
+	MOVO(v6, t1)
+	PUNPCKHQDQ(t2, v3) // v2:v3 rearranged the way SHUFFLE rearranges v6:v7
+	PUNPCKLQDQ(v7, t2)
+	PUNPCKHQDQ(t2, v6)
+	PUNPCKLQDQ(t1, t2)
+	PUNPCKHQDQ(t2, v7) // v6:v7 rearranged the way SHUFFLE rearranges v2:v3, so this undoes a preceding SHUFFLE
+}
+
+func HALF_ROUND(v0, v1, v2, v3, v4, v5, v6, v7 VecPhysical, m0, m1, m2, m3 Op, t0, c40, c48 VecPhysical) { // emits the reference HALF_ROUND macro: v0..v7 are updated in place, m0..m3 are message operands (register or memory), t0 is scratch
+	PADDQ(m0, v0)
+	PADDQ(m1, v1)
+	PADDQ(v2, v0)
+	PADDQ(v3, v1)
+	PXOR(v0, v6)
+	PXOR(v1, v7)
+	PSHUFD(Imm(0xB1), v6, v6)
+	PSHUFD(Imm(0xB1), v7, v7) // 0xB1 swaps the two 32-bit halves of every 64-bit lane, i.e. a rotation by 32 bits
+	PADDQ(v6, v4)
+	PADDQ(v7, v5)
+	PXOR(v4, v2)
+	PXOR(v5, v3)
+	PSHUFB(c40, v2)
+	PSHUFB(c40, v3) // byte shuffle by the mask in c40 (the caller passes the register loaded from the c40 symbol)
+	PADDQ(m2, v0)
+	PADDQ(m3, v1)
+	PADDQ(v2, v0)
+	PADDQ(v3, v1)
+	PXOR(v0, v6)
+	PXOR(v1, v7)
+	PSHUFB(c48, v6)
+	PSHUFB(c48, v7) // byte shuffle by the mask in c48
+	PADDQ(v6, v4)
+	PADDQ(v7, v5)
+	PXOR(v4, v2)
+	PXOR(v5, v3)
+	MOVOU(v2, t0)
+	PADDQ(v2, t0)
+	PSRLQ(Imm(63), v2)
+	PXOR(t0, v2) // (v2+v2) ^ (v2>>63): every 64-bit lane of v2 rotated left by one bit
+	MOVOU(v3, t0)
+	PADDQ(v3, t0)
+	PSRLQ(Imm(63), v3)
+	PXOR(t0, v3) // same one-bit rotation for v3; t0 is clobbered
+}
+
+func LOAD_MSG(m0, m1, m2, m3 VecPhysical, src GPPhysical, i0, i1, i2, i3, i4, i5, i6, i7 int) { // fills m0..m3 with eight 64-bit words read from src; iN is a word index, so the byte offset is iN*8
+	MOVQ(Mem{Base: src}.Offset(i0*8), m0)
+	PINSRQ(Imm(1), Mem{Base: src}.Offset(i1*8), m0) // m0 = (word i0, word i1): MOVQ fills the low lane, PINSRQ $1 the high lane
+	MOVQ(Mem{Base: src}.Offset(i2*8), m1)
+	PINSRQ(Imm(1), Mem{Base: src}.Offset(i3*8), m1) // m1 = (word i2, word i3)
+	MOVQ(Mem{Base: src}.Offset(i4*8), m2)
+	PINSRQ(Imm(1), Mem{Base: src}.Offset(i5*8), m2) // m2 = (word i4, word i5)
+	MOVQ(Mem{Base: src}.Offset(i6*8), m3)
+	PINSRQ(Imm(1), Mem{Base: src}.Offset(i7*8), m3) // m3 = (word i6, word i7)
+}
+
+func hashBlocksSSE4() { // emits TEXT hashBlocksSSE4(h, c, flag, blocks): every pass of "loop" consumes 128 bytes of blocks and runs 12 rounds (24 HALF_ROUND calls) over X0..X7
+	Implement("hashBlocksSSE4")
+	Attributes(4) // 4 is rendered as NOSPLIT in the generated TEXT directive
+	AllocLocal(288) // frame size = 272 + 16 byte alignment
+
+	Load(Param("h"), RAX) // this and the next four loads use the same registers as the reference assembly: AX, BX, CX, SI, DI
+	Load(Param("c"), RBX)
+	Load(Param("flag"), RCX)
+	Load(Param("blocks").Base(), RSI)
+	Load(Param("blocks").Len(), RDI)
+
+	MOVQ(RSP, R10)
+	ADDQ(Imm(15), R10)
+	ANDQ(I32(-16), R10) // R10 = SP rounded up to a multiple of 16; base of the scratch area addressed with MOVO below
+
+	iv3 := iv3_DATA()
+	MOVOU(iv3, X0)
+	MOVO(X0, Mem{Base: R10}.Offset(0))
+	XORQ(RCX, Mem{Base: R10}.Offset(0)) // 0(R10) = ·iv3 ^ (CX || 0)
+
+	c40 := c40_DATA()
+	c48 := c48_DATA()
+	MOVOU(c40, X13)
+	MOVOU(c48, X14) // X13 and X14 are passed to every HALF_ROUND as its c40 and c48 arguments
+
+	MOVOU(Mem{Base: AX}.Offset(0), X12)
+	MOVOU(Mem{Base: AX}.Offset(16), X15) // X12, X15 = first 32 bytes of h; they stay in registers until the stores after the loop
+
+	MOVQ(Mem{Base: BX}.Offset(0), R8)
+	MOVQ(Mem{Base: BX}.Offset(8), R9) // R8, R9 = the two words of c; written back after the loop
+
+	Label("loop")
+	ADDQ(Imm(128), R8) // advance the counter by one 128-byte block
+	CMPQ(R8, Imm(128))
+	JGE(LabelRef("noinc")) // INCQ below runs only when R8 < 128 after the add; NOTE(review): JGE is a signed comparison — confirm that is intended for R8 >= 1<<63
+	INCQ(R9)
+
+	Label("noinc")
+	MOVQ(R8, X8)
+	PINSRQ(Imm(1), R9, X8) // X8 = (R8, R9)
+
+	iv0 := iv0_DATA()
+	iv1 := iv1_DATA()
+	iv2 := iv2_DATA()
+
+	MOVO(X12, X0)
+	MOVO(X15, X1)
+	MOVOU(Mem{Base: AX}.Offset(32), X2)
+	MOVOU(Mem{Base: AX}.Offset(48), X3) // X0..X3 = working copy of the 64 bytes of h
+	MOVOU(iv0, X4)
+	MOVOU(iv1, X5)
+	MOVOU(iv2, X6)
+
+	PXOR(X8, X6)
+	MOVO(Mem{Base: R10}.Offset(0), X7) // X6 = iv2 ^ counter (PXOR above), X7 = iv3 ^ flag prepared before the loop
+
+	LOAD_MSG(X8, X9, X10, X11, SI, 0, 2, 4, 6, 1, 3, 5, 7) // round 1, first half
+	MOVO(X8, Mem{Base: R10}.Offset(16))
+	MOVO(X9, Mem{Base: R10}.Offset(32))
+	MOVO(X10, Mem{Base: R10}.Offset(48))
+	MOVO(X11, Mem{Base: R10}.Offset(64)) // message vectors saved at 16..64(R10); read back by the first HALF_ROUND of round 11
+	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
+	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
+	LOAD_MSG(X8, X9, X10, X11, SI, 8, 10, 12, 14, 9, 11, 13, 15)
+	MOVO(X8, Mem{Base: R10}.Offset(80))
+	MOVO(X9, Mem{Base: R10}.Offset(96))
+	MOVO(X10, Mem{Base: R10}.Offset(112))
+	MOVO(X11, Mem{Base: R10}.Offset(128)) // saved at 80..128(R10) for the second HALF_ROUND of round 11
+	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
+	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
+
+	LOAD_MSG(X8, X9, X10, X11, SI, 14, 4, 9, 13, 10, 8, 15, 6) // round 2
+	MOVO(X8, Mem{Base: R10}.Offset(144))
+	MOVO(X9, Mem{Base: R10}.Offset(160))
+	MOVO(X10, Mem{Base: R10}.Offset(176))
+	MOVO(X11, Mem{Base: R10}.Offset(192)) // saved at 144..192(R10) for the first HALF_ROUND of round 12
+	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
+	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
+	LOAD_MSG(X8, X9, X10, X11, SI, 1, 0, 11, 5, 12, 2, 7, 3)
+	MOVO(X8, Mem{Base: R10}.Offset(208))
+	MOVO(X9, Mem{Base: R10}.Offset(224))
+	MOVO(X10, Mem{Base: R10}.Offset(240))
+	MOVO(X11, Mem{Base: R10}.Offset(256)) // saved at 208..256(R10) for the second HALF_ROUND of round 12
+	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
+	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
+
+	LOAD_MSG(X8, X9, X10, X11, SI, 11, 12, 5, 15, 8, 0, 2, 13) // rounds 3 to 10 load their message words without saving them
+	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
+	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
+	LOAD_MSG(X8, X9, X10, X11, SI, 10, 3, 7, 9, 14, 6, 1, 4)
+	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
+	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
+
+	LOAD_MSG(X8, X9, X10, X11, SI, 7, 3, 13, 11, 9, 1, 12, 14)
+	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
+	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
+	LOAD_MSG(X8, X9, X10, X11, SI, 2, 5, 4, 15, 6, 10, 0, 8)
+	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
+	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
+
+	LOAD_MSG(X8, X9, X10, X11, SI, 9, 5, 2, 10, 0, 7, 4, 15)
+	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
+	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
+	LOAD_MSG(X8, X9, X10, X11, SI, 14, 11, 6, 3, 1, 12, 8, 13)
+	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
+	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
+
+	LOAD_MSG(X8, X9, X10, X11, SI, 2, 6, 0, 8, 12, 10, 11, 3)
+	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
+	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
+	LOAD_MSG(X8, X9, X10, X11, SI, 4, 7, 15, 1, 13, 5, 14, 9)
+	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
+	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
+
+	LOAD_MSG(X8, X9, X10, X11, SI, 12, 1, 14, 4, 5, 15, 13, 10)
+	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
+	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
+	LOAD_MSG(X8, X9, X10, X11, SI, 0, 6, 9, 8, 7, 3, 2, 11)
+	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
+	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
+
+	LOAD_MSG(X8, X9, X10, X11, SI, 13, 7, 12, 3, 11, 14, 1, 9)
+	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
+	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
+	LOAD_MSG(X8, X9, X10, X11, SI, 5, 15, 8, 2, 0, 4, 6, 10)
+	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
+	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
+
+	LOAD_MSG(X8, X9, X10, X11, SI, 6, 14, 11, 0, 15, 9, 3, 8)
+	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
+	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
+	LOAD_MSG(X8, X9, X10, X11, SI, 12, 13, 1, 10, 2, 7, 4, 5)
+	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
+	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
+
+	LOAD_MSG(X8, X9, X10, X11, SI, 10, 8, 7, 1, 2, 4, 6, 5)
+	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
+	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
+	LOAD_MSG(X8, X9, X10, X11, SI, 15, 9, 3, 13, 11, 14, 12, 0)
+	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
+	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
+
+	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, Mem{Base: R10}.Offset(16), Mem{Base: R10}.Offset(32), Mem{Base: R10}.Offset(48), Mem{Base: R10}.Offset(64), X11, X13, X14) // round 11: message operands are the vectors saved during round 1
+	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
+	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, Mem{Base: R10}.Offset(80), Mem{Base: R10}.Offset(96), Mem{Base: R10}.Offset(112), Mem{Base: R10}.Offset(128), X11, X13, X14)
+	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
+
+	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, Mem{Base: R10}.Offset(144), Mem{Base: R10}.Offset(160), Mem{Base: R10}.Offset(176), Mem{Base: R10}.Offset(192), X11, X13, X14) // round 12: message operands are the vectors saved during round 2
+	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
+	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, Mem{Base: R10}.Offset(208), Mem{Base: R10}.Offset(224), Mem{Base: R10}.Offset(240), Mem{Base: R10}.Offset(256), X11, X13, X14)
+	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
+
+	MOVOU(Mem{Base: AX}.Offset(32), X10)
+	MOVOU(Mem{Base: AX}.Offset(48), X11) // X10, X11 = second 32 bytes of h, reloaded because X10/X11 were used for message words
+	PXOR(X0, X12)
+	PXOR(X1, X15)
+	PXOR(X2, X10)
+	PXOR(X3, X11)
+	PXOR(X4, X12)
+	PXOR(X5, X15)
+	PXOR(X6, X10)
+	PXOR(X7, X11) // net effect: X12 ^= X0^X4, X15 ^= X1^X5, X10 ^= X2^X6, X11 ^= X3^X7
+	MOVOU(X10, Mem{Base: AX}.Offset(32))
+	MOVOU(X11, Mem{Base: AX}.Offset(48)) // this half of h goes back to memory on every pass; X12/X15 are stored once, after the loop
+
+	LEAQ(Mem{Base: SI}.Offset(128), RSI) // step to the next 128-byte block
+	SUBQ(Imm(128), RDI)
+	JNE(LabelRef("loop")) // repeat until the remaining length in DI reaches zero
+
+	MOVOU(X12, Mem{Base: AX}.Offset(0))
+	MOVOU(X15, Mem{Base: AX}.Offset(16)) // first 32 bytes of h written back
+
+	MOVQ(R8, Mem{Base: BX}.Offset(0))
+	MOVQ(R9, Mem{Base: BX}.Offset(8)) // updated counter written back to c
+
+	RET()
+}
+
+// #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~DATA SECTION~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~##
+
+func iv0_DATA() Mem { // returns the operand for the read-only iv0 symbol; the symbol is declared on the first call only
+	if iv0_DATA_ptr != nil {
+		return *iv0_DATA_ptr
+	}
+
+	iv0 := GLOBL(ThatPeskyUnicodeDot+"iv0", NOPTR|RODATA)
+	iv0_DATA_ptr = &iv0 // cache the operand so later calls take the early return above
+	DATA(0x00, U64(0x6a09e667f3bcc908))
+	DATA(0x08, U64(0xbb67ae8584caa73b)) // second 8-byte word, 16 bytes in total
+	return iv0
+}
+
+func iv1_DATA() Mem { // returns the operand for the read-only iv1 symbol; the symbol is declared on the first call only
+	if iv1_DATA_ptr != nil {
+		return *iv1_DATA_ptr
+	}
+
+	iv1 := GLOBL(ThatPeskyUnicodeDot+"iv1", NOPTR|RODATA)
+	iv1_DATA_ptr = &iv1 // cache the operand so later calls take the early return above
+	DATA(0x00, U64(0x3c6ef372fe94f82b))
+	DATA(0x08, U64(0xa54ff53a5f1d36f1)) // second 8-byte word, 16 bytes in total
+	return iv1
+}
+
+func iv2_DATA() Mem { // returns the operand for the read-only iv2 symbol; the symbol is declared on the first call only
+	if iv2_DATA_ptr != nil {
+		return *iv2_DATA_ptr
+	}
+
+	iv2 := GLOBL(ThatPeskyUnicodeDot+"iv2", NOPTR|RODATA)
+	iv2_DATA_ptr = &iv2 // cache the operand so later calls take the early return above
+	DATA(0x00, U64(0x510e527fade682d1))
+	DATA(0x08, U64(0x9b05688c2b3e6c1f)) // second 8-byte word, 16 bytes in total
+	return iv2
+}
+
+func iv3_DATA() Mem { // returns the operand for the read-only iv3 symbol; the symbol is declared on the first call only
+	if iv3_DATA_ptr != nil {
+		return *iv3_DATA_ptr
+	}
+
+	iv3 := GLOBL(ThatPeskyUnicodeDot+"iv3", NOPTR|RODATA)
+	iv3_DATA_ptr = &iv3 // cache the operand so later calls take the early return above
+	DATA(0x00, U64(0x1f83d9abfb41bd6b))
+	DATA(0x08, U64(0x5be0cd19137e2179)) // second 8-byte word, 16 bytes in total
+	return iv3
+}
+
+func c40_DATA() Mem { // returns the operand for the read-only c40 symbol (a PSHUFB mask); the symbol is declared on the first call only
+	if c40_DATA_ptr != nil {
+		return *c40_DATA_ptr
+	}
+
+	c40 := GLOBL(ThatPeskyUnicodeDot+"c40", NOPTR|RODATA)
+	c40_DATA_ptr = &c40 // cache the operand so later calls take the early return above
+	DATA(0x00, U64(0x0201000706050403))
+	DATA(0x08, U64(0x0a09080f0e0d0c0b)) // mask byte i of each 8-byte half selects source byte (i+3) mod 8 of that half
+	return c40
+}
+
+func c48_DATA() Mem { // returns the operand for the read-only c48 symbol (a PSHUFB mask); the symbol is declared on the first call only
+	if c48_DATA_ptr != nil {
+		return *c48_DATA_ptr
+	}
+
+	c48 := GLOBL(ThatPeskyUnicodeDot+"c48", NOPTR|RODATA)
+	c48_DATA_ptr = &c48 // cache the operand so later calls take the early return above
+	DATA(0x00, U64(0x0100070605040302))
+	DATA(0x08, U64(0x09080f0e0d0c0b0a)) // mask byte i of each 8-byte half selects source byte (i+2) mod 8 of that half
+	return c48
+}
diff --git a/blake2b/_asm/standard/go.mod b/blake2b/_asm/standard/go.mod
new file mode 100644
index 0000000..8063f1b
--- /dev/null
+++ b/blake2b/_asm/standard/go.mod
@@ -0,0 +1,15 @@
+module blake2b/_asm
+
+go 1.23
+
+require (
+	github.com/mmcloughlin/avo v0.6.0
+	golang.org/x/crypto v0.26.0
+)
+
+require (
+	golang.org/x/mod v0.20.0 // indirect
+	golang.org/x/sync v0.8.0 // indirect
+	golang.org/x/sys v0.24.0 // indirect
+	golang.org/x/tools v0.24.0 // indirect
+)
diff --git a/blake2b/_asm/standard/go.sum b/blake2b/_asm/standard/go.sum
new file mode 100644
index 0000000..62ea9df
--- /dev/null
+++ b/blake2b/_asm/standard/go.sum
@@ -0,0 +1,12 @@
+github.com/mmcloughlin/avo v0.6.0 h1:QH6FU8SKoTLaVs80GA8TJuLNkUYl4VokHKlPhVDg4YY=
+github.com/mmcloughlin/avo v0.6.0/go.mod h1:8CoAGaCSYXtCPR+8y18Y9aB/kxb8JSS6FRI7mSkvD+8=
+golang.org/x/crypto v0.26.0 h1:RrRspgV4mU+YwB4FYnuBoKsUapNIL5cohGAmSH3azsw=
+golang.org/x/crypto v0.26.0/go.mod h1:GY7jblb9wI+FOo5y8/S2oY4zWP07AkOJ4+jxCqdqn54=
+golang.org/x/mod v0.20.0 h1:utOm6MM3R3dnawAiJgn0y+xvuYRsm1RKM/4giyfDgV0=
+golang.org/x/mod v0.20.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
+golang.org/x/sync v0.8.0 h1:3NFvSEYkUoMifnESzZl15y791HH1qU2xm6eCJU5ZPXQ=
+golang.org/x/sync v0.8.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
+golang.org/x/sys v0.24.0 h1:Twjiwq9dn6R1fQcyiK+wQyHWfaz/BJB+YIpzU/Cv3Xg=
+golang.org/x/sys v0.24.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
+golang.org/x/tools v0.24.0 h1:J1shsA93PJUEVaUSaay7UXAyE8aimq3GW0pjlolpa24=
+golang.org/x/tools v0.24.0/go.mod h1:YhNqVBIfWHdzvTLs0d8LCuMhkKUgSUKldakyV7W/WDQ=
diff --git a/blake2b/blake2b_amd64.s b/blake2b/blake2b_amd64.s
index adfac00..9a0ce21 100644
--- a/blake2b/blake2b_amd64.s
+++ b/blake2b/blake2b_amd64.s
@@ -1,278 +1,1441 @@
-// Copyright 2016 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
+// Code generated by command: go run blake2b_amd64_asm.go -out ../../blake2b_amd64.s -pkg blake2b. DO NOT EDIT.
 
 //go:build amd64 && gc && !purego
 
 #include "textflag.h"
 
-DATA ·iv0<>+0x00(SB)/8, $0x6a09e667f3bcc908
-DATA ·iv0<>+0x08(SB)/8, $0xbb67ae8584caa73b
-GLOBL ·iv0<>(SB), (NOPTR+RODATA), $16
-
-DATA ·iv1<>+0x00(SB)/8, $0x3c6ef372fe94f82b
-DATA ·iv1<>+0x08(SB)/8, $0xa54ff53a5f1d36f1
-GLOBL ·iv1<>(SB), (NOPTR+RODATA), $16
-
-DATA ·iv2<>+0x00(SB)/8, $0x510e527fade682d1
-DATA ·iv2<>+0x08(SB)/8, $0x9b05688c2b3e6c1f
-GLOBL ·iv2<>(SB), (NOPTR+RODATA), $16
-
-DATA ·iv3<>+0x00(SB)/8, $0x1f83d9abfb41bd6b
-DATA ·iv3<>+0x08(SB)/8, $0x5be0cd19137e2179
-GLOBL ·iv3<>(SB), (NOPTR+RODATA), $16
-
-DATA ·c40<>+0x00(SB)/8, $0x0201000706050403
-DATA ·c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b
-GLOBL ·c40<>(SB), (NOPTR+RODATA), $16
-
-DATA ·c48<>+0x00(SB)/8, $0x0100070605040302
-DATA ·c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a
-GLOBL ·c48<>(SB), (NOPTR+RODATA), $16
-
-#define SHUFFLE(v2, v3, v4, v5, v6, v7, t1, t2) \
-	MOVO       v4, t1; \
-	MOVO       v5, v4; \
-	MOVO       t1, v5; \
-	MOVO       v6, t1; \
-	PUNPCKLQDQ v6, t2; \
-	PUNPCKHQDQ v7, v6; \
-	PUNPCKHQDQ t2, v6; \
-	PUNPCKLQDQ v7, t2; \
-	MOVO       t1, v7; \
-	MOVO       v2, t1; \
-	PUNPCKHQDQ t2, v7; \
-	PUNPCKLQDQ v3, t2; \
-	PUNPCKHQDQ t2, v2; \
-	PUNPCKLQDQ t1, t2; \
-	PUNPCKHQDQ t2, v3
-
-#define SHUFFLE_INV(v2, v3, v4, v5, v6, v7, t1, t2) \
-	MOVO       v4, t1; \
-	MOVO       v5, v4; \
-	MOVO       t1, v5; \
-	MOVO       v2, t1; \
-	PUNPCKLQDQ v2, t2; \
-	PUNPCKHQDQ v3, v2; \
-	PUNPCKHQDQ t2, v2; \
-	PUNPCKLQDQ v3, t2; \
-	MOVO       t1, v3; \
-	MOVO       v6, t1; \
-	PUNPCKHQDQ t2, v3; \
-	PUNPCKLQDQ v7, t2; \
-	PUNPCKHQDQ t2, v6; \
-	PUNPCKLQDQ t1, t2; \
-	PUNPCKHQDQ t2, v7
-
-#define HALF_ROUND(v0, v1, v2, v3, v4, v5, v6, v7, m0, m1, m2, m3, t0, c40, c48) \
-	PADDQ  m0, v0;        \
-	PADDQ  m1, v1;        \
-	PADDQ  v2, v0;        \
-	PADDQ  v3, v1;        \
-	PXOR   v0, v6;        \
-	PXOR   v1, v7;        \
-	PSHUFD $0xB1, v6, v6; \
-	PSHUFD $0xB1, v7, v7; \
-	PADDQ  v6, v4;        \
-	PADDQ  v7, v5;        \
-	PXOR   v4, v2;        \
-	PXOR   v5, v3;        \
-	PSHUFB c40, v2;       \
-	PSHUFB c40, v3;       \
-	PADDQ  m2, v0;        \
-	PADDQ  m3, v1;        \
-	PADDQ  v2, v0;        \
-	PADDQ  v3, v1;        \
-	PXOR   v0, v6;        \
-	PXOR   v1, v7;        \
-	PSHUFB c48, v6;       \
-	PSHUFB c48, v7;       \
-	PADDQ  v6, v4;        \
-	PADDQ  v7, v5;        \
-	PXOR   v4, v2;        \
-	PXOR   v5, v3;        \
-	MOVOU  v2, t0;        \
-	PADDQ  v2, t0;        \
-	PSRLQ  $63, v2;       \
-	PXOR   t0, v2;        \
-	MOVOU  v3, t0;        \
-	PADDQ  v3, t0;        \
-	PSRLQ  $63, v3;       \
-	PXOR   t0, v3
-
-#define LOAD_MSG(m0, m1, m2, m3, src, i0, i1, i2, i3, i4, i5, i6, i7) \
-	MOVQ   i0*8(src), m0;     \
-	PINSRQ $1, i1*8(src), m0; \
-	MOVQ   i2*8(src), m1;     \
-	PINSRQ $1, i3*8(src), m1; \
-	MOVQ   i4*8(src), m2;     \
-	PINSRQ $1, i5*8(src), m2; \
-	MOVQ   i6*8(src), m3;     \
-	PINSRQ $1, i7*8(src), m3
-
 // func hashBlocksSSE4(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte)
-TEXT ·hashBlocksSSE4(SB), 4, $288-48 // frame size = 272 + 16 byte alignment
-	MOVQ h+0(FP), AX
-	MOVQ c+8(FP), BX
-	MOVQ flag+16(FP), CX
-	MOVQ blocks_base+24(FP), SI
-	MOVQ blocks_len+32(FP), DI
-
-	MOVQ SP, R10
-	ADDQ $15, R10
-	ANDQ $~15, R10
-
-	MOVOU ·iv3<>(SB), X0
-	MOVO  X0, 0(R10)
-	XORQ  CX, 0(R10)     // 0(R10) = ·iv3 ^ (CX || 0)
-
-	MOVOU ·c40<>(SB), X13
-	MOVOU ·c48<>(SB), X14
-
-	MOVOU 0(AX), X12
+// Requires: SSE2, SSE4.1, SSSE3
+TEXT ·hashBlocksSSE4(SB), NOSPLIT, $288-48
+	MOVQ  h+0(FP), AX
+	MOVQ  c+8(FP), BX
+	MOVQ  flag+16(FP), CX
+	MOVQ  blocks_base+24(FP), SI
+	MOVQ  blocks_len+32(FP), DI
+	MOVQ  SP, R10
+	ADDQ  $0x0f, R10
+	ANDQ  $-16, R10
+	MOVOU ·iv3<>+0(SB), X0
+	MOVO  X0, (R10)
+	XORQ  CX, (R10)
+	MOVOU ·c40<>+0(SB), X13
+	MOVOU ·c48<>+0(SB), X14
+	MOVOU (AX), X12
 	MOVOU 16(AX), X15
-
-	MOVQ 0(BX), R8
-	MOVQ 8(BX), R9
+	MOVQ  (BX), R8
+	MOVQ  8(BX), R9
 
 loop:
-	ADDQ $128, R8
-	CMPQ R8, $128
+	ADDQ $0x80, R8
+	CMPQ R8, $0x80
 	JGE  noinc
 	INCQ R9
 
 noinc:
-	MOVQ R8, X8
-	PINSRQ $1, R9, X8
-
-	MOVO X12, X0
-	MOVO X15, X1
-	MOVOU 32(AX), X2
-	MOVOU 48(AX), X3
-	MOVOU ·iv0<>(SB), X4
-	MOVOU ·iv1<>(SB), X5
-	MOVOU ·iv2<>(SB), X6
-
-	PXOR X8, X6
-	MOVO 0(R10), X7
-
-	LOAD_MSG(X8, X9, X10, X11, SI, 0, 2, 4, 6, 1, 3, 5, 7)
-	MOVO X8, 16(R10)
-	MOVO X9, 32(R10)
-	MOVO X10, 48(R10)
-	MOVO X11, 64(R10)
-	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
-	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
-	LOAD_MSG(X8, X9, X10, X11, SI, 8, 10, 12, 14, 9, 11, 13, 15)
-	MOVO X8, 80(R10)
-	MOVO X9, 96(R10)
-	MOVO X10, 112(R10)
-	MOVO X11, 128(R10)
-	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
-	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
-
-	LOAD_MSG(X8, X9, X10, X11, SI, 14, 4, 9, 13, 10, 8, 15, 6)
-	MOVO X8, 144(R10)
-	MOVO X9, 160(R10)
-	MOVO X10, 176(R10)
-	MOVO X11, 192(R10)
-	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
-	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
-	LOAD_MSG(X8, X9, X10, X11, SI, 1, 0, 11, 5, 12, 2, 7, 3)
-	MOVO X8, 208(R10)
-	MOVO X9, 224(R10)
-	MOVO X10, 240(R10)
-	MOVO X11, 256(R10)
-	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
-	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
-
-	LOAD_MSG(X8, X9, X10, X11, SI, 11, 12, 5, 15, 8, 0, 2, 13)
-	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
-	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
-	LOAD_MSG(X8, X9, X10, X11, SI, 10, 3, 7, 9, 14, 6, 1, 4)
-	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
-	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
-
-	LOAD_MSG(X8, X9, X10, X11, SI, 7, 3, 13, 11, 9, 1, 12, 14)
-	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
-	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
-	LOAD_MSG(X8, X9, X10, X11, SI, 2, 5, 4, 15, 6, 10, 0, 8)
-	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
-	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
-
-	LOAD_MSG(X8, X9, X10, X11, SI, 9, 5, 2, 10, 0, 7, 4, 15)
-	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
-	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
-	LOAD_MSG(X8, X9, X10, X11, SI, 14, 11, 6, 3, 1, 12, 8, 13)
-	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
-	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
-
-	LOAD_MSG(X8, X9, X10, X11, SI, 2, 6, 0, 8, 12, 10, 11, 3)
-	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
-	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
-	LOAD_MSG(X8, X9, X10, X11, SI, 4, 7, 15, 1, 13, 5, 14, 9)
-	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
-	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
-
-	LOAD_MSG(X8, X9, X10, X11, SI, 12, 1, 14, 4, 5, 15, 13, 10)
-	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
-	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
-	LOAD_MSG(X8, X9, X10, X11, SI, 0, 6, 9, 8, 7, 3, 2, 11)
-	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
-	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
-
-	LOAD_MSG(X8, X9, X10, X11, SI, 13, 7, 12, 3, 11, 14, 1, 9)
-	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
-	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
-	LOAD_MSG(X8, X9, X10, X11, SI, 5, 15, 8, 2, 0, 4, 6, 10)
-	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
-	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
-
-	LOAD_MSG(X8, X9, X10, X11, SI, 6, 14, 11, 0, 15, 9, 3, 8)
-	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
-	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
-	LOAD_MSG(X8, X9, X10, X11, SI, 12, 13, 1, 10, 2, 7, 4, 5)
-	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
-	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
-
-	LOAD_MSG(X8, X9, X10, X11, SI, 10, 8, 7, 1, 2, 4, 6, 5)
-	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
-	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
-	LOAD_MSG(X8, X9, X10, X11, SI, 15, 9, 3, 13, 11, 14, 12, 0)
-	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
-	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
-
-	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 16(R10), 32(R10), 48(R10), 64(R10), X11, X13, X14)
-	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
-	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 80(R10), 96(R10), 112(R10), 128(R10), X11, X13, X14)
-	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
-
-	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 144(R10), 160(R10), 176(R10), 192(R10), X11, X13, X14)
-	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
-	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 208(R10), 224(R10), 240(R10), 256(R10), X11, X13, X14)
-	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
-
-	MOVOU 32(AX), X10
-	MOVOU 48(AX), X11
-	PXOR  X0, X12
-	PXOR  X1, X15
-	PXOR  X2, X10
-	PXOR  X3, X11
-	PXOR  X4, X12
-	PXOR  X5, X15
-	PXOR  X6, X10
-	PXOR  X7, X11
-	MOVOU X10, 32(AX)
-	MOVOU X11, 48(AX)
-
-	LEAQ 128(SI), SI
-	SUBQ $128, DI
-	JNE  loop
-
-	MOVOU X12, 0(AX)
-	MOVOU X15, 16(AX)
-
-	MOVQ R8, 0(BX)
-	MOVQ R9, 8(BX)
-
+	MOVQ       R8, X8
+	PINSRQ     $0x01, R9, X8
+	MOVO       X12, X0
+	MOVO       X15, X1
+	MOVOU      32(AX), X2
+	MOVOU      48(AX), X3
+	MOVOU      ·iv0<>+0(SB), X4
+	MOVOU      ·iv1<>+0(SB), X5
+	MOVOU      ·iv2<>+0(SB), X6
+	PXOR       X8, X6
+	MOVO       (R10), X7
+	MOVQ       (SI), X8
+	PINSRQ     $0x01, 16(SI), X8
+	MOVQ       32(SI), X9
+	PINSRQ     $0x01, 48(SI), X9
+	MOVQ       8(SI), X10
+	PINSRQ     $0x01, 24(SI), X10
+	MOVQ       40(SI), X11
+	PINSRQ     $0x01, 56(SI), X11
+	MOVO       X8, 16(R10)
+	MOVO       X9, 32(R10)
+	MOVO       X10, 48(R10)
+	MOVO       X11, 64(R10)
+	PADDQ      X8, X0
+	PADDQ      X9, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFD     $0xb1, X6, X6
+	PSHUFD     $0xb1, X7, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	PSHUFB     X13, X2
+	PSHUFB     X13, X3
+	PADDQ      X10, X0
+	PADDQ      X11, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFB     X14, X6
+	PSHUFB     X14, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	MOVOU      X2, X11
+	PADDQ      X2, X11
+	PSRLQ      $0x3f, X2
+	PXOR       X11, X2
+	MOVOU      X3, X11
+	PADDQ      X3, X11
+	PSRLQ      $0x3f, X3
+	PXOR       X11, X3
+	MOVO       X4, X8
+	MOVO       X5, X4
+	MOVO       X8, X5
+	MOVO       X6, X8
+	PUNPCKLQDQ X6, X9
+	PUNPCKHQDQ X7, X6
+	PUNPCKHQDQ X9, X6
+	PUNPCKLQDQ X7, X9
+	MOVO       X8, X7
+	MOVO       X2, X8
+	PUNPCKHQDQ X9, X7
+	PUNPCKLQDQ X3, X9
+	PUNPCKHQDQ X9, X2
+	PUNPCKLQDQ X8, X9
+	PUNPCKHQDQ X9, X3
+	MOVQ       64(SI), X8
+	PINSRQ     $0x01, 80(SI), X8
+	MOVQ       96(SI), X9
+	PINSRQ     $0x01, 112(SI), X9
+	MOVQ       72(SI), X10
+	PINSRQ     $0x01, 88(SI), X10
+	MOVQ       104(SI), X11
+	PINSRQ     $0x01, 120(SI), X11
+	MOVO       X8, 80(R10)
+	MOVO       X9, 96(R10)
+	MOVO       X10, 112(R10)
+	MOVO       X11, 128(R10)
+	PADDQ      X8, X0
+	PADDQ      X9, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFD     $0xb1, X6, X6
+	PSHUFD     $0xb1, X7, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	PSHUFB     X13, X2
+	PSHUFB     X13, X3
+	PADDQ      X10, X0
+	PADDQ      X11, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFB     X14, X6
+	PSHUFB     X14, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	MOVOU      X2, X11
+	PADDQ      X2, X11
+	PSRLQ      $0x3f, X2
+	PXOR       X11, X2
+	MOVOU      X3, X11
+	PADDQ      X3, X11
+	PSRLQ      $0x3f, X3
+	PXOR       X11, X3
+	MOVO       X4, X8
+	MOVO       X5, X4
+	MOVO       X8, X5
+	MOVO       X2, X8
+	PUNPCKLQDQ X2, X9
+	PUNPCKHQDQ X3, X2
+	PUNPCKHQDQ X9, X2
+	PUNPCKLQDQ X3, X9
+	MOVO       X8, X3
+	MOVO       X6, X8
+	PUNPCKHQDQ X9, X3
+	PUNPCKLQDQ X7, X9
+	PUNPCKHQDQ X9, X6
+	PUNPCKLQDQ X8, X9
+	PUNPCKHQDQ X9, X7
+	MOVQ       112(SI), X8
+	PINSRQ     $0x01, 32(SI), X8
+	MOVQ       72(SI), X9
+	PINSRQ     $0x01, 104(SI), X9
+	MOVQ       80(SI), X10
+	PINSRQ     $0x01, 64(SI), X10
+	MOVQ       120(SI), X11
+	PINSRQ     $0x01, 48(SI), X11
+	MOVO       X8, 144(R10)
+	MOVO       X9, 160(R10)
+	MOVO       X10, 176(R10)
+	MOVO       X11, 192(R10)
+	PADDQ      X8, X0
+	PADDQ      X9, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFD     $0xb1, X6, X6
+	PSHUFD     $0xb1, X7, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	PSHUFB     X13, X2
+	PSHUFB     X13, X3
+	PADDQ      X10, X0
+	PADDQ      X11, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFB     X14, X6
+	PSHUFB     X14, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	MOVOU      X2, X11
+	PADDQ      X2, X11
+	PSRLQ      $0x3f, X2
+	PXOR       X11, X2
+	MOVOU      X3, X11
+	PADDQ      X3, X11
+	PSRLQ      $0x3f, X3
+	PXOR       X11, X3
+	MOVO       X4, X8
+	MOVO       X5, X4
+	MOVO       X8, X5
+	MOVO       X6, X8
+	PUNPCKLQDQ X6, X9
+	PUNPCKHQDQ X7, X6
+	PUNPCKHQDQ X9, X6
+	PUNPCKLQDQ X7, X9
+	MOVO       X8, X7
+	MOVO       X2, X8
+	PUNPCKHQDQ X9, X7
+	PUNPCKLQDQ X3, X9
+	PUNPCKHQDQ X9, X2
+	PUNPCKLQDQ X8, X9
+	PUNPCKHQDQ X9, X3
+	MOVQ       8(SI), X8
+	PINSRQ     $0x01, (SI), X8
+	MOVQ       88(SI), X9
+	PINSRQ     $0x01, 40(SI), X9
+	MOVQ       96(SI), X10
+	PINSRQ     $0x01, 16(SI), X10
+	MOVQ       56(SI), X11
+	PINSRQ     $0x01, 24(SI), X11
+	MOVO       X8, 208(R10)
+	MOVO       X9, 224(R10)
+	MOVO       X10, 240(R10)
+	MOVO       X11, 256(R10)
+	PADDQ      X8, X0
+	PADDQ      X9, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFD     $0xb1, X6, X6
+	PSHUFD     $0xb1, X7, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	PSHUFB     X13, X2
+	PSHUFB     X13, X3
+	PADDQ      X10, X0
+	PADDQ      X11, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFB     X14, X6
+	PSHUFB     X14, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	MOVOU      X2, X11
+	PADDQ      X2, X11
+	PSRLQ      $0x3f, X2
+	PXOR       X11, X2
+	MOVOU      X3, X11
+	PADDQ      X3, X11
+	PSRLQ      $0x3f, X3
+	PXOR       X11, X3
+	MOVO       X4, X8
+	MOVO       X5, X4
+	MOVO       X8, X5
+	MOVO       X2, X8
+	PUNPCKLQDQ X2, X9
+	PUNPCKHQDQ X3, X2
+	PUNPCKHQDQ X9, X2
+	PUNPCKLQDQ X3, X9
+	MOVO       X8, X3
+	MOVO       X6, X8
+	PUNPCKHQDQ X9, X3
+	PUNPCKLQDQ X7, X9
+	PUNPCKHQDQ X9, X6
+	PUNPCKLQDQ X8, X9
+	PUNPCKHQDQ X9, X7
+	MOVQ       88(SI), X8
+	PINSRQ     $0x01, 96(SI), X8
+	MOVQ       40(SI), X9
+	PINSRQ     $0x01, 120(SI), X9
+	MOVQ       64(SI), X10
+	PINSRQ     $0x01, (SI), X10
+	MOVQ       16(SI), X11
+	PINSRQ     $0x01, 104(SI), X11
+	PADDQ      X8, X0
+	PADDQ      X9, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFD     $0xb1, X6, X6
+	PSHUFD     $0xb1, X7, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	PSHUFB     X13, X2
+	PSHUFB     X13, X3
+	PADDQ      X10, X0
+	PADDQ      X11, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFB     X14, X6
+	PSHUFB     X14, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	MOVOU      X2, X11
+	PADDQ      X2, X11
+	PSRLQ      $0x3f, X2
+	PXOR       X11, X2
+	MOVOU      X3, X11
+	PADDQ      X3, X11
+	PSRLQ      $0x3f, X3
+	PXOR       X11, X3
+	MOVO       X4, X8
+	MOVO       X5, X4
+	MOVO       X8, X5
+	MOVO       X6, X8
+	PUNPCKLQDQ X6, X9
+	PUNPCKHQDQ X7, X6
+	PUNPCKHQDQ X9, X6
+	PUNPCKLQDQ X7, X9
+	MOVO       X8, X7
+	MOVO       X2, X8
+	PUNPCKHQDQ X9, X7
+	PUNPCKLQDQ X3, X9
+	PUNPCKHQDQ X9, X2
+	PUNPCKLQDQ X8, X9
+	PUNPCKHQDQ X9, X3
+	MOVQ       80(SI), X8
+	PINSRQ     $0x01, 24(SI), X8
+	MOVQ       56(SI), X9
+	PINSRQ     $0x01, 72(SI), X9
+	MOVQ       112(SI), X10
+	PINSRQ     $0x01, 48(SI), X10
+	MOVQ       8(SI), X11
+	PINSRQ     $0x01, 32(SI), X11
+	PADDQ      X8, X0
+	PADDQ      X9, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFD     $0xb1, X6, X6
+	PSHUFD     $0xb1, X7, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	PSHUFB     X13, X2
+	PSHUFB     X13, X3
+	PADDQ      X10, X0
+	PADDQ      X11, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFB     X14, X6
+	PSHUFB     X14, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	MOVOU      X2, X11
+	PADDQ      X2, X11
+	PSRLQ      $0x3f, X2
+	PXOR       X11, X2
+	MOVOU      X3, X11
+	PADDQ      X3, X11
+	PSRLQ      $0x3f, X3
+	PXOR       X11, X3
+	MOVO       X4, X8
+	MOVO       X5, X4
+	MOVO       X8, X5
+	MOVO       X2, X8
+	PUNPCKLQDQ X2, X9
+	PUNPCKHQDQ X3, X2
+	PUNPCKHQDQ X9, X2
+	PUNPCKLQDQ X3, X9
+	MOVO       X8, X3
+	MOVO       X6, X8
+	PUNPCKHQDQ X9, X3
+	PUNPCKLQDQ X7, X9
+	PUNPCKHQDQ X9, X6
+	PUNPCKLQDQ X8, X9
+	PUNPCKHQDQ X9, X7
+	MOVQ       56(SI), X8
+	PINSRQ     $0x01, 24(SI), X8
+	MOVQ       104(SI), X9
+	PINSRQ     $0x01, 88(SI), X9
+	MOVQ       72(SI), X10
+	PINSRQ     $0x01, 8(SI), X10
+	MOVQ       96(SI), X11
+	PINSRQ     $0x01, 112(SI), X11
+	PADDQ      X8, X0
+	PADDQ      X9, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFD     $0xb1, X6, X6
+	PSHUFD     $0xb1, X7, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	PSHUFB     X13, X2
+	PSHUFB     X13, X3
+	PADDQ      X10, X0
+	PADDQ      X11, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFB     X14, X6
+	PSHUFB     X14, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	MOVOU      X2, X11
+	PADDQ      X2, X11
+	PSRLQ      $0x3f, X2
+	PXOR       X11, X2
+	MOVOU      X3, X11
+	PADDQ      X3, X11
+	PSRLQ      $0x3f, X3
+	PXOR       X11, X3
+	MOVO       X4, X8
+	MOVO       X5, X4
+	MOVO       X8, X5
+	MOVO       X6, X8
+	PUNPCKLQDQ X6, X9
+	PUNPCKHQDQ X7, X6
+	PUNPCKHQDQ X9, X6
+	PUNPCKLQDQ X7, X9
+	MOVO       X8, X7
+	MOVO       X2, X8
+	PUNPCKHQDQ X9, X7
+	PUNPCKLQDQ X3, X9
+	PUNPCKHQDQ X9, X2
+	PUNPCKLQDQ X8, X9
+	PUNPCKHQDQ X9, X3
+	MOVQ       16(SI), X8
+	PINSRQ     $0x01, 40(SI), X8
+	MOVQ       32(SI), X9
+	PINSRQ     $0x01, 120(SI), X9
+	MOVQ       48(SI), X10
+	PINSRQ     $0x01, 80(SI), X10
+	MOVQ       (SI), X11
+	PINSRQ     $0x01, 64(SI), X11
+	PADDQ      X8, X0
+	PADDQ      X9, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFD     $0xb1, X6, X6
+	PSHUFD     $0xb1, X7, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	PSHUFB     X13, X2
+	PSHUFB     X13, X3
+	PADDQ      X10, X0
+	PADDQ      X11, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFB     X14, X6
+	PSHUFB     X14, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	MOVOU      X2, X11
+	PADDQ      X2, X11
+	PSRLQ      $0x3f, X2
+	PXOR       X11, X2
+	MOVOU      X3, X11
+	PADDQ      X3, X11
+	PSRLQ      $0x3f, X3
+	PXOR       X11, X3
+	MOVO       X4, X8
+	MOVO       X5, X4
+	MOVO       X8, X5
+	MOVO       X2, X8
+	PUNPCKLQDQ X2, X9
+	PUNPCKHQDQ X3, X2
+	PUNPCKHQDQ X9, X2
+	PUNPCKLQDQ X3, X9
+	MOVO       X8, X3
+	MOVO       X6, X8
+	PUNPCKHQDQ X9, X3
+	PUNPCKLQDQ X7, X9
+	PUNPCKHQDQ X9, X6
+	PUNPCKLQDQ X8, X9
+	PUNPCKHQDQ X9, X7
+	MOVQ       72(SI), X8
+	PINSRQ     $0x01, 40(SI), X8
+	MOVQ       16(SI), X9
+	PINSRQ     $0x01, 80(SI), X9
+	MOVQ       (SI), X10
+	PINSRQ     $0x01, 56(SI), X10
+	MOVQ       32(SI), X11
+	PINSRQ     $0x01, 120(SI), X11
+	PADDQ      X8, X0
+	PADDQ      X9, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFD     $0xb1, X6, X6
+	PSHUFD     $0xb1, X7, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	PSHUFB     X13, X2
+	PSHUFB     X13, X3
+	PADDQ      X10, X0
+	PADDQ      X11, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFB     X14, X6
+	PSHUFB     X14, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	MOVOU      X2, X11
+	PADDQ      X2, X11
+	PSRLQ      $0x3f, X2
+	PXOR       X11, X2
+	MOVOU      X3, X11
+	PADDQ      X3, X11
+	PSRLQ      $0x3f, X3
+	PXOR       X11, X3
+	MOVO       X4, X8
+	MOVO       X5, X4
+	MOVO       X8, X5
+	MOVO       X6, X8
+	PUNPCKLQDQ X6, X9
+	PUNPCKHQDQ X7, X6
+	PUNPCKHQDQ X9, X6
+	PUNPCKLQDQ X7, X9
+	MOVO       X8, X7
+	MOVO       X2, X8
+	PUNPCKHQDQ X9, X7
+	PUNPCKLQDQ X3, X9
+	PUNPCKHQDQ X9, X2
+	PUNPCKLQDQ X8, X9
+	PUNPCKHQDQ X9, X3
+	MOVQ       112(SI), X8
+	PINSRQ     $0x01, 88(SI), X8
+	MOVQ       48(SI), X9
+	PINSRQ     $0x01, 24(SI), X9
+	MOVQ       8(SI), X10
+	PINSRQ     $0x01, 96(SI), X10
+	MOVQ       64(SI), X11
+	PINSRQ     $0x01, 104(SI), X11
+	PADDQ      X8, X0
+	PADDQ      X9, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFD     $0xb1, X6, X6
+	PSHUFD     $0xb1, X7, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	PSHUFB     X13, X2
+	PSHUFB     X13, X3
+	PADDQ      X10, X0
+	PADDQ      X11, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFB     X14, X6
+	PSHUFB     X14, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	MOVOU      X2, X11
+	PADDQ      X2, X11
+	PSRLQ      $0x3f, X2
+	PXOR       X11, X2
+	MOVOU      X3, X11
+	PADDQ      X3, X11
+	PSRLQ      $0x3f, X3
+	PXOR       X11, X3
+	MOVO       X4, X8
+	MOVO       X5, X4
+	MOVO       X8, X5
+	MOVO       X2, X8
+	PUNPCKLQDQ X2, X9
+	PUNPCKHQDQ X3, X2
+	PUNPCKHQDQ X9, X2
+	PUNPCKLQDQ X3, X9
+	MOVO       X8, X3
+	MOVO       X6, X8
+	PUNPCKHQDQ X9, X3
+	PUNPCKLQDQ X7, X9
+	PUNPCKHQDQ X9, X6
+	PUNPCKLQDQ X8, X9
+	PUNPCKHQDQ X9, X7
+	MOVQ       16(SI), X8
+	PINSRQ     $0x01, 48(SI), X8
+	MOVQ       (SI), X9
+	PINSRQ     $0x01, 64(SI), X9
+	MOVQ       96(SI), X10
+	PINSRQ     $0x01, 80(SI), X10
+	MOVQ       88(SI), X11
+	PINSRQ     $0x01, 24(SI), X11
+	PADDQ      X8, X0
+	PADDQ      X9, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFD     $0xb1, X6, X6
+	PSHUFD     $0xb1, X7, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	PSHUFB     X13, X2
+	PSHUFB     X13, X3
+	PADDQ      X10, X0
+	PADDQ      X11, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFB     X14, X6
+	PSHUFB     X14, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	MOVOU      X2, X11
+	PADDQ      X2, X11
+	PSRLQ      $0x3f, X2
+	PXOR       X11, X2
+	MOVOU      X3, X11
+	PADDQ      X3, X11
+	PSRLQ      $0x3f, X3
+	PXOR       X11, X3
+	MOVO       X4, X8
+	MOVO       X5, X4
+	MOVO       X8, X5
+	MOVO       X6, X8
+	PUNPCKLQDQ X6, X9
+	PUNPCKHQDQ X7, X6
+	PUNPCKHQDQ X9, X6
+	PUNPCKLQDQ X7, X9
+	MOVO       X8, X7
+	MOVO       X2, X8
+	PUNPCKHQDQ X9, X7
+	PUNPCKLQDQ X3, X9
+	PUNPCKHQDQ X9, X2
+	PUNPCKLQDQ X8, X9
+	PUNPCKHQDQ X9, X3
+	MOVQ       32(SI), X8
+	PINSRQ     $0x01, 56(SI), X8
+	MOVQ       120(SI), X9
+	PINSRQ     $0x01, 8(SI), X9
+	MOVQ       104(SI), X10
+	PINSRQ     $0x01, 40(SI), X10
+	MOVQ       112(SI), X11
+	PINSRQ     $0x01, 72(SI), X11
+	PADDQ      X8, X0
+	PADDQ      X9, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFD     $0xb1, X6, X6
+	PSHUFD     $0xb1, X7, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	PSHUFB     X13, X2
+	PSHUFB     X13, X3
+	PADDQ      X10, X0
+	PADDQ      X11, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFB     X14, X6
+	PSHUFB     X14, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	MOVOU      X2, X11
+	PADDQ      X2, X11
+	PSRLQ      $0x3f, X2
+	PXOR       X11, X2
+	MOVOU      X3, X11
+	PADDQ      X3, X11
+	PSRLQ      $0x3f, X3
+	PXOR       X11, X3
+	MOVO       X4, X8
+	MOVO       X5, X4
+	MOVO       X8, X5
+	MOVO       X2, X8
+	PUNPCKLQDQ X2, X9
+	PUNPCKHQDQ X3, X2
+	PUNPCKHQDQ X9, X2
+	PUNPCKLQDQ X3, X9
+	MOVO       X8, X3
+	MOVO       X6, X8
+	PUNPCKHQDQ X9, X3
+	PUNPCKLQDQ X7, X9
+	PUNPCKHQDQ X9, X6
+	PUNPCKLQDQ X8, X9
+	PUNPCKHQDQ X9, X7
+	MOVQ       96(SI), X8
+	PINSRQ     $0x01, 8(SI), X8
+	MOVQ       112(SI), X9
+	PINSRQ     $0x01, 32(SI), X9
+	MOVQ       40(SI), X10
+	PINSRQ     $0x01, 120(SI), X10
+	MOVQ       104(SI), X11
+	PINSRQ     $0x01, 80(SI), X11
+	PADDQ      X8, X0
+	PADDQ      X9, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFD     $0xb1, X6, X6
+	PSHUFD     $0xb1, X7, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	PSHUFB     X13, X2
+	PSHUFB     X13, X3
+	PADDQ      X10, X0
+	PADDQ      X11, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFB     X14, X6
+	PSHUFB     X14, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	MOVOU      X2, X11
+	PADDQ      X2, X11
+	PSRLQ      $0x3f, X2
+	PXOR       X11, X2
+	MOVOU      X3, X11
+	PADDQ      X3, X11
+	PSRLQ      $0x3f, X3
+	PXOR       X11, X3
+	MOVO       X4, X8
+	MOVO       X5, X4
+	MOVO       X8, X5
+	MOVO       X6, X8
+	PUNPCKLQDQ X6, X9
+	PUNPCKHQDQ X7, X6
+	PUNPCKHQDQ X9, X6
+	PUNPCKLQDQ X7, X9
+	MOVO       X8, X7
+	MOVO       X2, X8
+	PUNPCKHQDQ X9, X7
+	PUNPCKLQDQ X3, X9
+	PUNPCKHQDQ X9, X2
+	PUNPCKLQDQ X8, X9
+	PUNPCKHQDQ X9, X3
+	MOVQ       (SI), X8
+	PINSRQ     $0x01, 48(SI), X8
+	MOVQ       72(SI), X9
+	PINSRQ     $0x01, 64(SI), X9
+	MOVQ       56(SI), X10
+	PINSRQ     $0x01, 24(SI), X10
+	MOVQ       16(SI), X11
+	PINSRQ     $0x01, 88(SI), X11
+	PADDQ      X8, X0
+	PADDQ      X9, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFD     $0xb1, X6, X6
+	PSHUFD     $0xb1, X7, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	PSHUFB     X13, X2
+	PSHUFB     X13, X3
+	PADDQ      X10, X0
+	PADDQ      X11, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFB     X14, X6
+	PSHUFB     X14, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	MOVOU      X2, X11
+	PADDQ      X2, X11
+	PSRLQ      $0x3f, X2
+	PXOR       X11, X2
+	MOVOU      X3, X11
+	PADDQ      X3, X11
+	PSRLQ      $0x3f, X3
+	PXOR       X11, X3
+	MOVO       X4, X8
+	MOVO       X5, X4
+	MOVO       X8, X5
+	MOVO       X2, X8
+	PUNPCKLQDQ X2, X9
+	PUNPCKHQDQ X3, X2
+	PUNPCKHQDQ X9, X2
+	PUNPCKLQDQ X3, X9
+	MOVO       X8, X3
+	MOVO       X6, X8
+	PUNPCKHQDQ X9, X3
+	PUNPCKLQDQ X7, X9
+	PUNPCKHQDQ X9, X6
+	PUNPCKLQDQ X8, X9
+	PUNPCKHQDQ X9, X7
+	MOVQ       104(SI), X8
+	PINSRQ     $0x01, 56(SI), X8
+	MOVQ       96(SI), X9
+	PINSRQ     $0x01, 24(SI), X9
+	MOVQ       88(SI), X10
+	PINSRQ     $0x01, 112(SI), X10
+	MOVQ       8(SI), X11
+	PINSRQ     $0x01, 72(SI), X11
+	PADDQ      X8, X0
+	PADDQ      X9, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFD     $0xb1, X6, X6
+	PSHUFD     $0xb1, X7, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	PSHUFB     X13, X2
+	PSHUFB     X13, X3
+	PADDQ      X10, X0
+	PADDQ      X11, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFB     X14, X6
+	PSHUFB     X14, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	MOVOU      X2, X11
+	PADDQ      X2, X11
+	PSRLQ      $0x3f, X2
+	PXOR       X11, X2
+	MOVOU      X3, X11
+	PADDQ      X3, X11
+	PSRLQ      $0x3f, X3
+	PXOR       X11, X3
+	MOVO       X4, X8
+	MOVO       X5, X4
+	MOVO       X8, X5
+	MOVO       X6, X8
+	PUNPCKLQDQ X6, X9
+	PUNPCKHQDQ X7, X6
+	PUNPCKHQDQ X9, X6
+	PUNPCKLQDQ X7, X9
+	MOVO       X8, X7
+	MOVO       X2, X8
+	PUNPCKHQDQ X9, X7
+	PUNPCKLQDQ X3, X9
+	PUNPCKHQDQ X9, X2
+	PUNPCKLQDQ X8, X9
+	PUNPCKHQDQ X9, X3
+	MOVQ       40(SI), X8
+	PINSRQ     $0x01, 120(SI), X8
+	MOVQ       64(SI), X9
+	PINSRQ     $0x01, 16(SI), X9
+	MOVQ       (SI), X10
+	PINSRQ     $0x01, 32(SI), X10
+	MOVQ       48(SI), X11
+	PINSRQ     $0x01, 80(SI), X11
+	PADDQ      X8, X0
+	PADDQ      X9, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFD     $0xb1, X6, X6
+	PSHUFD     $0xb1, X7, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	PSHUFB     X13, X2
+	PSHUFB     X13, X3
+	PADDQ      X10, X0
+	PADDQ      X11, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFB     X14, X6
+	PSHUFB     X14, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	MOVOU      X2, X11
+	PADDQ      X2, X11
+	PSRLQ      $0x3f, X2
+	PXOR       X11, X2
+	MOVOU      X3, X11
+	PADDQ      X3, X11
+	PSRLQ      $0x3f, X3
+	PXOR       X11, X3
+	MOVO       X4, X8
+	MOVO       X5, X4
+	MOVO       X8, X5
+	MOVO       X2, X8
+	PUNPCKLQDQ X2, X9
+	PUNPCKHQDQ X3, X2
+	PUNPCKHQDQ X9, X2
+	PUNPCKLQDQ X3, X9
+	MOVO       X8, X3
+	MOVO       X6, X8
+	PUNPCKHQDQ X9, X3
+	PUNPCKLQDQ X7, X9
+	PUNPCKHQDQ X9, X6
+	PUNPCKLQDQ X8, X9
+	PUNPCKHQDQ X9, X7
+	MOVQ       48(SI), X8
+	PINSRQ     $0x01, 112(SI), X8
+	MOVQ       88(SI), X9
+	PINSRQ     $0x01, (SI), X9
+	MOVQ       120(SI), X10
+	PINSRQ     $0x01, 72(SI), X10
+	MOVQ       24(SI), X11
+	PINSRQ     $0x01, 64(SI), X11
+	PADDQ      X8, X0
+	PADDQ      X9, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFD     $0xb1, X6, X6
+	PSHUFD     $0xb1, X7, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	PSHUFB     X13, X2
+	PSHUFB     X13, X3
+	PADDQ      X10, X0
+	PADDQ      X11, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFB     X14, X6
+	PSHUFB     X14, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	MOVOU      X2, X11
+	PADDQ      X2, X11
+	PSRLQ      $0x3f, X2
+	PXOR       X11, X2
+	MOVOU      X3, X11
+	PADDQ      X3, X11
+	PSRLQ      $0x3f, X3
+	PXOR       X11, X3
+	MOVO       X4, X8
+	MOVO       X5, X4
+	MOVO       X8, X5
+	MOVO       X6, X8
+	PUNPCKLQDQ X6, X9
+	PUNPCKHQDQ X7, X6
+	PUNPCKHQDQ X9, X6
+	PUNPCKLQDQ X7, X9
+	MOVO       X8, X7
+	MOVO       X2, X8
+	PUNPCKHQDQ X9, X7
+	PUNPCKLQDQ X3, X9
+	PUNPCKHQDQ X9, X2
+	PUNPCKLQDQ X8, X9
+	PUNPCKHQDQ X9, X3
+	MOVQ       96(SI), X8
+	PINSRQ     $0x01, 104(SI), X8
+	MOVQ       8(SI), X9
+	PINSRQ     $0x01, 80(SI), X9
+	MOVQ       16(SI), X10
+	PINSRQ     $0x01, 56(SI), X10
+	MOVQ       32(SI), X11
+	PINSRQ     $0x01, 40(SI), X11
+	PADDQ      X8, X0
+	PADDQ      X9, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFD     $0xb1, X6, X6
+	PSHUFD     $0xb1, X7, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	PSHUFB     X13, X2
+	PSHUFB     X13, X3
+	PADDQ      X10, X0
+	PADDQ      X11, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFB     X14, X6
+	PSHUFB     X14, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	MOVOU      X2, X11
+	PADDQ      X2, X11
+	PSRLQ      $0x3f, X2
+	PXOR       X11, X2
+	MOVOU      X3, X11
+	PADDQ      X3, X11
+	PSRLQ      $0x3f, X3
+	PXOR       X11, X3
+	MOVO       X4, X8
+	MOVO       X5, X4
+	MOVO       X8, X5
+	MOVO       X2, X8
+	PUNPCKLQDQ X2, X9
+	PUNPCKHQDQ X3, X2
+	PUNPCKHQDQ X9, X2
+	PUNPCKLQDQ X3, X9
+	MOVO       X8, X3
+	MOVO       X6, X8
+	PUNPCKHQDQ X9, X3
+	PUNPCKLQDQ X7, X9
+	PUNPCKHQDQ X9, X6
+	PUNPCKLQDQ X8, X9
+	PUNPCKHQDQ X9, X7
+	MOVQ       80(SI), X8
+	PINSRQ     $0x01, 64(SI), X8
+	MOVQ       56(SI), X9
+	PINSRQ     $0x01, 8(SI), X9
+	MOVQ       16(SI), X10
+	PINSRQ     $0x01, 32(SI), X10
+	MOVQ       48(SI), X11
+	PINSRQ     $0x01, 40(SI), X11
+	PADDQ      X8, X0
+	PADDQ      X9, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFD     $0xb1, X6, X6
+	PSHUFD     $0xb1, X7, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	PSHUFB     X13, X2
+	PSHUFB     X13, X3
+	PADDQ      X10, X0
+	PADDQ      X11, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFB     X14, X6
+	PSHUFB     X14, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	MOVOU      X2, X11
+	PADDQ      X2, X11
+	PSRLQ      $0x3f, X2
+	PXOR       X11, X2
+	MOVOU      X3, X11
+	PADDQ      X3, X11
+	PSRLQ      $0x3f, X3
+	PXOR       X11, X3
+	MOVO       X4, X8
+	MOVO       X5, X4
+	MOVO       X8, X5
+	MOVO       X6, X8
+	PUNPCKLQDQ X6, X9
+	PUNPCKHQDQ X7, X6
+	PUNPCKHQDQ X9, X6
+	PUNPCKLQDQ X7, X9
+	MOVO       X8, X7
+	MOVO       X2, X8
+	PUNPCKHQDQ X9, X7
+	PUNPCKLQDQ X3, X9
+	PUNPCKHQDQ X9, X2
+	PUNPCKLQDQ X8, X9
+	PUNPCKHQDQ X9, X3
+	MOVQ       120(SI), X8
+	PINSRQ     $0x01, 72(SI), X8
+	MOVQ       24(SI), X9
+	PINSRQ     $0x01, 104(SI), X9
+	MOVQ       88(SI), X10
+	PINSRQ     $0x01, 112(SI), X10
+	MOVQ       96(SI), X11
+	PINSRQ     $0x01, (SI), X11
+	PADDQ      X8, X0
+	PADDQ      X9, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFD     $0xb1, X6, X6
+	PSHUFD     $0xb1, X7, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	PSHUFB     X13, X2
+	PSHUFB     X13, X3
+	PADDQ      X10, X0
+	PADDQ      X11, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFB     X14, X6
+	PSHUFB     X14, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	MOVOU      X2, X11
+	PADDQ      X2, X11
+	PSRLQ      $0x3f, X2
+	PXOR       X11, X2
+	MOVOU      X3, X11
+	PADDQ      X3, X11
+	PSRLQ      $0x3f, X3
+	PXOR       X11, X3
+	MOVO       X4, X8
+	MOVO       X5, X4
+	MOVO       X8, X5
+	MOVO       X2, X8
+	PUNPCKLQDQ X2, X9
+	PUNPCKHQDQ X3, X2
+	PUNPCKHQDQ X9, X2
+	PUNPCKLQDQ X3, X9
+	MOVO       X8, X3
+	MOVO       X6, X8
+	PUNPCKHQDQ X9, X3
+	PUNPCKLQDQ X7, X9
+	PUNPCKHQDQ X9, X6
+	PUNPCKLQDQ X8, X9
+	PUNPCKHQDQ X9, X7
+	PADDQ      16(R10), X0
+	PADDQ      32(R10), X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFD     $0xb1, X6, X6
+	PSHUFD     $0xb1, X7, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	PSHUFB     X13, X2
+	PSHUFB     X13, X3
+	PADDQ      48(R10), X0
+	PADDQ      64(R10), X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFB     X14, X6
+	PSHUFB     X14, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	MOVOU      X2, X11
+	PADDQ      X2, X11
+	PSRLQ      $0x3f, X2
+	PXOR       X11, X2
+	MOVOU      X3, X11
+	PADDQ      X3, X11
+	PSRLQ      $0x3f, X3
+	PXOR       X11, X3
+	MOVO       X4, X8
+	MOVO       X5, X4
+	MOVO       X8, X5
+	MOVO       X6, X8
+	PUNPCKLQDQ X6, X9
+	PUNPCKHQDQ X7, X6
+	PUNPCKHQDQ X9, X6
+	PUNPCKLQDQ X7, X9
+	MOVO       X8, X7
+	MOVO       X2, X8
+	PUNPCKHQDQ X9, X7
+	PUNPCKLQDQ X3, X9
+	PUNPCKHQDQ X9, X2
+	PUNPCKLQDQ X8, X9
+	PUNPCKHQDQ X9, X3
+	PADDQ      80(R10), X0
+	PADDQ      96(R10), X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFD     $0xb1, X6, X6
+	PSHUFD     $0xb1, X7, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	PSHUFB     X13, X2
+	PSHUFB     X13, X3
+	PADDQ      112(R10), X0
+	PADDQ      128(R10), X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFB     X14, X6
+	PSHUFB     X14, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	MOVOU      X2, X11
+	PADDQ      X2, X11
+	PSRLQ      $0x3f, X2
+	PXOR       X11, X2
+	MOVOU      X3, X11
+	PADDQ      X3, X11
+	PSRLQ      $0x3f, X3
+	PXOR       X11, X3
+	MOVO       X4, X8
+	MOVO       X5, X4
+	MOVO       X8, X5
+	MOVO       X2, X8
+	PUNPCKLQDQ X2, X9
+	PUNPCKHQDQ X3, X2
+	PUNPCKHQDQ X9, X2
+	PUNPCKLQDQ X3, X9
+	MOVO       X8, X3
+	MOVO       X6, X8
+	PUNPCKHQDQ X9, X3
+	PUNPCKLQDQ X7, X9
+	PUNPCKHQDQ X9, X6
+	PUNPCKLQDQ X8, X9
+	PUNPCKHQDQ X9, X7
+	PADDQ      144(R10), X0
+	PADDQ      160(R10), X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFD     $0xb1, X6, X6
+	PSHUFD     $0xb1, X7, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	PSHUFB     X13, X2
+	PSHUFB     X13, X3
+	PADDQ      176(R10), X0
+	PADDQ      192(R10), X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFB     X14, X6
+	PSHUFB     X14, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	MOVOU      X2, X11
+	PADDQ      X2, X11
+	PSRLQ      $0x3f, X2
+	PXOR       X11, X2
+	MOVOU      X3, X11
+	PADDQ      X3, X11
+	PSRLQ      $0x3f, X3
+	PXOR       X11, X3
+	MOVO       X4, X8
+	MOVO       X5, X4
+	MOVO       X8, X5
+	MOVO       X6, X8
+	PUNPCKLQDQ X6, X9
+	PUNPCKHQDQ X7, X6
+	PUNPCKHQDQ X9, X6
+	PUNPCKLQDQ X7, X9
+	MOVO       X8, X7
+	MOVO       X2, X8
+	PUNPCKHQDQ X9, X7
+	PUNPCKLQDQ X3, X9
+	PUNPCKHQDQ X9, X2
+	PUNPCKLQDQ X8, X9
+	PUNPCKHQDQ X9, X3
+	PADDQ      208(R10), X0
+	PADDQ      224(R10), X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFD     $0xb1, X6, X6
+	PSHUFD     $0xb1, X7, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	PSHUFB     X13, X2
+	PSHUFB     X13, X3
+	PADDQ      240(R10), X0
+	PADDQ      256(R10), X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFB     X14, X6
+	PSHUFB     X14, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	MOVOU      X2, X11
+	PADDQ      X2, X11
+	PSRLQ      $0x3f, X2
+	PXOR       X11, X2
+	MOVOU      X3, X11
+	PADDQ      X3, X11
+	PSRLQ      $0x3f, X3
+	PXOR       X11, X3
+	MOVO       X4, X8
+	MOVO       X5, X4
+	MOVO       X8, X5
+	MOVO       X2, X8
+	PUNPCKLQDQ X2, X9
+	PUNPCKHQDQ X3, X2
+	PUNPCKHQDQ X9, X2
+	PUNPCKLQDQ X3, X9
+	MOVO       X8, X3
+	MOVO       X6, X8
+	PUNPCKHQDQ X9, X3
+	PUNPCKLQDQ X7, X9
+	PUNPCKHQDQ X9, X6
+	PUNPCKLQDQ X8, X9
+	PUNPCKHQDQ X9, X7
+	MOVOU      32(AX), X10
+	MOVOU      48(AX), X11
+	PXOR       X0, X12
+	PXOR       X1, X15
+	PXOR       X2, X10
+	PXOR       X3, X11
+	PXOR       X4, X12
+	PXOR       X5, X15
+	PXOR       X6, X10
+	PXOR       X7, X11
+	MOVOU      X10, 32(AX)
+	MOVOU      X11, 48(AX)
+	LEAQ       128(SI), SI
+	SUBQ       $0x80, DI
+	JNE        loop
+	MOVOU      X12, (AX)
+	MOVOU      X15, 16(AX)
+	MOVQ       R8, (BX)
+	MOVQ       R9, 8(BX)
 	RET
+
+DATA ·iv3<>+0(SB)/8, $0x1f83d9abfb41bd6b // low quadword; value matches BLAKE2b IV word 6
+DATA ·iv3<>+8(SB)/8, $0x5be0cd19137e2179 // high quadword; value matches BLAKE2b IV word 7
+GLOBL ·iv3<>(SB), RODATA|NOPTR, $16 // file-local (<>) 16-byte symbol, read-only and pointer-free
+
+DATA ·c40<>+0(SB)/8, $0x0201000706050403 // byte indices 3,4,5,6,7,0,1,2: as a PSHUFB control this rotates the low 64-bit lane right by 24 bits (left by 40)
+DATA ·c40<>+8(SB)/8, $0x0a09080f0e0d0c0b // same byte rotation for the high 64-bit lane (indices 11..15,8,9,10)
+GLOBL ·c40<>(SB), RODATA|NOPTR, $16 // file-local (<>) 16-byte symbol, read-only and pointer-free; NOTE(review): presumably the PSHUFB mask held in X13 or X14 — the load is outside this view
+
+DATA ·c48<>+0(SB)/8, $0x0100070605040302 // byte indices 2,3,4,5,6,7,0,1: as a PSHUFB control this rotates the low 64-bit lane right by 16 bits (left by 48)
+DATA ·c48<>+8(SB)/8, $0x09080f0e0d0c0b0a // same byte rotation for the high 64-bit lane (indices 10..15,8,9)
+GLOBL ·c48<>(SB), RODATA|NOPTR, $16 // file-local (<>) 16-byte symbol, read-only and pointer-free; NOTE(review): presumably the PSHUFB mask held in X13 or X14 — the load is outside this view
+
+DATA ·iv0<>+0(SB)/8, $0x6a09e667f3bcc908 // low quadword; value matches BLAKE2b IV word 0
+DATA ·iv0<>+8(SB)/8, $0xbb67ae8584caa73b // high quadword; value matches BLAKE2b IV word 1
+GLOBL ·iv0<>(SB), RODATA|NOPTR, $16 // file-local (<>) 16-byte symbol, read-only and pointer-free
+
+DATA ·iv1<>+0(SB)/8, $0x3c6ef372fe94f82b // low quadword; value matches BLAKE2b IV word 2
+DATA ·iv1<>+8(SB)/8, $0xa54ff53a5f1d36f1 // high quadword; value matches BLAKE2b IV word 3
+GLOBL ·iv1<>(SB), RODATA|NOPTR, $16 // file-local (<>) 16-byte symbol, read-only and pointer-free
+
+DATA ·iv2<>+0(SB)/8, $0x510e527fade682d1 // low quadword; value matches BLAKE2b IV word 4
+DATA ·iv2<>+8(SB)/8, $0x9b05688c2b3e6c1f // high quadword; value matches BLAKE2b IV word 5
+GLOBL ·iv2<>(SB), RODATA|NOPTR, $16 // file-local (<>) 16-byte symbol, read-only and pointer-free