salsa20/salsa: Port salsa20_amd64.s to Avo

This implementation utilizes the same registers found in the reference
implementation, aiming to produce a minimal semantic diff between the
Avo-generated output and the original hand-written assembly.

To verify the Avo implementation, the reference and Avo-generated
assembly files are fed to `go tool asm`, capturing the debug output into
corresponding temp files. The debug output contains supplementary
metadata (line numbers, instruction offsets, and source file references)
that must be removed in order to obtain a semantic diff of the two
files. This is accomplished via a small utility script written in awk.

Commands used to verify Avo output:

GOROOT=$(go env GOROOT)
REFERENCE="b2d3a6a4b4d36521cd7f653879cf6981e7c5c340"

go tool asm -o /dev/null -I "$GOROOT"/src/runtime -debug \
  <(git cat-file -p "$REFERENCE":salsa20/salsa/salsa20_amd64.s) \
  > /tmp/reference.s

go tool asm -o /dev/null -I "$GOROOT"/src/runtime -debug \
  salsa20/salsa/salsa20_amd64.s \
  > /tmp/avo.s

normalize(){
  awk '{
    $1=$2=$3="";
    print substr($0,4)
  }'
}

diff <(normalize < /tmp/reference.s) <(normalize < /tmp/avo.s)

Change-Id: Ica0bb06f8b074ad566a979d33ddc81d8a38491b1
Reviewed-on: https://go-review.googlesource.com/c/crypto/+/601217
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Filippo Valsorda <filippo@golang.org>
Reviewed-by: Roland Shoemaker <roland@golang.org>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
diff --git a/salsa20/salsa/_asm/go.mod b/salsa20/salsa/_asm/go.mod
new file mode 100644
index 0000000..0cf7f76
--- /dev/null
+++ b/salsa20/salsa/_asm/go.mod
@@ -0,0 +1,14 @@
+module salsa20/salsa/_asm
+
+go 1.23
+
+require (
+	github.com/mmcloughlin/avo v0.6.0
+	golang.org/x/crypto v0.26.0
+)
+
+require (
+	golang.org/x/mod v0.20.0 // indirect
+	golang.org/x/sync v0.8.0 // indirect
+	golang.org/x/tools v0.24.0 // indirect
+)
diff --git a/salsa20/salsa/_asm/go.sum b/salsa20/salsa/_asm/go.sum
new file mode 100644
index 0000000..e597080
--- /dev/null
+++ b/salsa20/salsa/_asm/go.sum
@@ -0,0 +1,10 @@
+github.com/mmcloughlin/avo v0.6.0 h1:QH6FU8SKoTLaVs80GA8TJuLNkUYl4VokHKlPhVDg4YY=
+github.com/mmcloughlin/avo v0.6.0/go.mod h1:8CoAGaCSYXtCPR+8y18Y9aB/kxb8JSS6FRI7mSkvD+8=
+golang.org/x/crypto v0.26.0 h1:RrRspgV4mU+YwB4FYnuBoKsUapNIL5cohGAmSH3azsw=
+golang.org/x/crypto v0.26.0/go.mod h1:GY7jblb9wI+FOo5y8/S2oY4zWP07AkOJ4+jxCqdqn54=
+golang.org/x/mod v0.20.0 h1:utOm6MM3R3dnawAiJgn0y+xvuYRsm1RKM/4giyfDgV0=
+golang.org/x/mod v0.20.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
+golang.org/x/sync v0.8.0 h1:3NFvSEYkUoMifnESzZl15y791HH1qU2xm6eCJU5ZPXQ=
+golang.org/x/sync v0.8.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
+golang.org/x/tools v0.24.0 h1:J1shsA93PJUEVaUSaay7UXAyE8aimq3GW0pjlolpa24=
+golang.org/x/tools v0.24.0/go.mod h1:YhNqVBIfWHdzvTLs0d8LCuMhkKUgSUKldakyV7W/WDQ=
diff --git a/salsa20/salsa/_asm/salsa20_amd64_asm.go b/salsa20/salsa/_asm/salsa20_amd64_asm.go
new file mode 100644
index 0000000..6546791
--- /dev/null
+++ b/salsa20/salsa/_asm/salsa20_amd64_asm.go
@@ -0,0 +1,932 @@
+// Copyright 2024 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// This code was translated into a form compatible with 6a from the public
+// domain sources in SUPERCOP: https://bench.cr.yp.to/supercop.html
+
+package main
+
+import (
+	. "github.com/mmcloughlin/avo/build"
+	"github.com/mmcloughlin/avo/ir"
+	. "github.com/mmcloughlin/avo/operand"
+	. "github.com/mmcloughlin/avo/reg"
+	_ "golang.org/x/crypto/salsa20/salsa"
+)
+
+//go:generate go run . -out ../salsa20_amd64.s -pkg salsa
+
+func main() { // Generator entry point: describes salsa2020XORKeyStream to Avo and writes the assembly.
+	Package("golang.org/x/crypto/salsa20/salsa")
+	ConstraintExpr("amd64,!purego,gc") // appears in the generated file as "//go:build amd64 && !purego && gc"
+	salsa2020XORKeyStream()
+	Generate()
+}
+
+func salsa2020XORKeyStream() { // Emits the whole TEXT body; the helpers below append their labelled sections in call order.
+	Implement("salsa2020XORKeyStream")
+	Attributes(0) // no TEXT flags
+	AllocLocal(456) // frame = 424 + 32 byte alignment
+	Comment("This needs up to 64 bytes at 360(R12); hence the non-obvious frame size.")
+
+	Load(Param("out"), RDI)
+	Load(Param("in"), RSI)
+	Load(Param("n"), RDX)
+	Load(Param("nonce"), RCX)
+	Load(Param("key"), R8)
+
+	MOVQ(RSP, R12)
+	ADDQ(Imm(31), R12)
+	ANDQ(I32(^31), R12) // R12 = SP rounded up to a multiple of 32; base of all scratch storage below
+
+	MOVQ(RDX, R9) // R9 = n, the number of bytes still to process
+	MOVQ(RCX, RDX) // RDX = nonce pointer
+	MOVQ(R8, R10) // R10 = key pointer
+	CMPQ(R9, Imm(0))
+	JBE(LabelRef("DONE")) // n == 0: nothing to do
+
+	START() // sections are emitted back to back, so execution falls through from one to the next
+	BYTESATLEAST256()
+	MAINLOOP1()
+	BYTESBETWEEN1AND255()
+	NOCOPY()
+	MAINLOOP2()
+
+	Label("BYTESATLEAST64") // reached from MAINLOOP2 when exactly 64 bytes remained
+	Label("DONE")
+	RET()
+	Label("BYTESATLEAST65") // reached from MAINLOOP2 when more than 64 bytes remained
+	SUBQ(Imm(64), R9) // account for the 64-byte block just produced
+	ADDQ(Imm(64), RDI)
+	ADDQ(Imm(64), RSI)
+	JMP(LabelRef("BYTESBETWEEN1AND255"))
+}
+
+func START() { // Builds the 64-byte state at 0(R12)..63(R12) and, for n >= 256, per-word broadcast vectors at 64(R12)..287(R12).
+	Label("START")
+	MOVL(Mem{Base: R10}.Offset(20), ECX) // R10 = key pointer, EDX/RDX = nonce pointer (set up by the caller section)
+	MOVL(Mem{Base: R10}.Offset(0), R8L)
+	MOVL(Mem{Base: EDX}.Offset(0), EAX)
+	MOVL(Mem{Base: R10}.Offset(16), R11L)
+	MOVL(ECX, Mem{Base: R12}.Offset(0))
+	MOVL(R8L, Mem{Base: R12}.Offset(4))
+	MOVL(EAX, Mem{Base: R12}.Offset(8))
+	MOVL(R11L, Mem{Base: R12}.Offset(12))
+	MOVL(Mem{Base: EDX}.Offset(8), ECX)
+	MOVL(Mem{Base: R10}.Offset(24), R8L)
+	MOVL(Mem{Base: R10}.Offset(4), EAX)
+	MOVL(Mem{Base: EDX}.Offset(4), R11L)
+	MOVL(ECX, Mem{Base: R12}.Offset(16))
+	MOVL(R8L, Mem{Base: R12}.Offset(20))
+	MOVL(EAX, Mem{Base: R12}.Offset(24))
+	MOVL(R11L, Mem{Base: R12}.Offset(28))
+	MOVL(Mem{Base: EDX}.Offset(12), ECX) // last read through the nonce pointer; the next line overwrites EDX
+	MOVL(Mem{Base: R10}.Offset(12), EDX)
+	MOVL(Mem{Base: R10}.Offset(28), R8L)
+	MOVL(Mem{Base: R10}.Offset(8), EAX)
+	MOVL(EDX, Mem{Base: R12}.Offset(32))
+	MOVL(ECX, Mem{Base: R12}.Offset(36))
+	MOVL(R8L, Mem{Base: R12}.Offset(40))
+	MOVL(EAX, Mem{Base: R12}.Offset(44))
+	MOVQ(Imm(1634760805), RDX) // 0x61707865, little-endian bytes "expa"
+	MOVQ(Imm(857760878), RCX) // 0x3320646e, "nd 3"
+	MOVQ(Imm(2036477234), R8) // 0x79622d32, "2-by"
+	MOVQ(Imm(1797285236), RAX) // 0x6b206574, "te k"
+	MOVL(EDX, Mem{Base: R12}.Offset(48))
+	MOVL(ECX, Mem{Base: R12}.Offset(52))
+	MOVL(R8L, Mem{Base: R12}.Offset(56))
+	MOVL(EAX, Mem{Base: R12}.Offset(60))
+	CMPQ(R9, U32(256))
+	JB(LabelRef("BYTESBETWEEN1AND255")) // fewer than 256 bytes: skip the broadcast setup
+	MOVOA(Mem{Base: R12}.Offset(48), X0)
+	PSHUFL(Imm(0x55), X0, X1) // 0x55/0xAA/0xFF/0x00 replicate dword 1/2/3/0 of the source into all four lanes
+	PSHUFL(Imm(0xAA), X0, X2)
+	PSHUFL(Imm(0xFF), X0, X3)
+	PSHUFL(Imm(0x00), X0, X0)
+	MOVOA(X1, Mem{Base: R12}.Offset(64))
+	MOVOA(X2, Mem{Base: R12}.Offset(80))
+	MOVOA(X3, Mem{Base: R12}.Offset(96))
+	MOVOA(X0, Mem{Base: R12}.Offset(112))
+	MOVOA(Mem{Base: R12}.Offset(0), X0)
+	PSHUFL(Imm(0xAA), X0, X1)
+	PSHUFL(Imm(0xFF), X0, X2)
+	PSHUFL(Imm(0x00), X0, X3)
+	PSHUFL(Imm(0x55), X0, X0)
+	MOVOA(X1, Mem{Base: R12}.Offset(128))
+	MOVOA(X2, Mem{Base: R12}.Offset(144))
+	MOVOA(X3, Mem{Base: R12}.Offset(160))
+	MOVOA(X0, Mem{Base: R12}.Offset(176))
+	MOVOA(Mem{Base: R12}.Offset(16), X0) // dword 0 (at 16(R12)) is not broadcast here; BYTESATLEAST256 fills 288(R12) per lane
+	PSHUFL(Imm(0xFF), X0, X1)
+	PSHUFL(Imm(0x55), X0, X2)
+	PSHUFL(Imm(0xAA), X0, X0)
+	MOVOA(X1, Mem{Base: R12}.Offset(192))
+	MOVOA(X2, Mem{Base: R12}.Offset(208))
+	MOVOA(X0, Mem{Base: R12}.Offset(224))
+	MOVOA(Mem{Base: R12}.Offset(32), X0) // dword 1 (at 36(R12)) is not broadcast here; BYTESATLEAST256 fills 304(R12) per lane
+	PSHUFL(Imm(0x00), X0, X1)
+	PSHUFL(Imm(0xAA), X0, X2)
+	PSHUFL(Imm(0xFF), X0, X0)
+	MOVOA(X1, Mem{Base: R12}.Offset(240))
+	MOVOA(X2, Mem{Base: R12}.Offset(256))
+	MOVOA(X0, Mem{Base: R12}.Offset(272))
+
+}
+
+func BYTESATLEAST256() { // Prepares four consecutive values of the 64-bit word held at 16(R12)/36(R12) and loads X0..X15.
+	Label("BYTESATLEAST256")
+	MOVL(Mem{Base: R12}.Offset(16), EDX) // low 32 bits (presumably the block counter - confirm against the Go caller)
+	MOVL(Mem{Base: R12}.Offset(36), ECX) // high 32 bits
+	MOVL(EDX, Mem{Base: R12}.Offset(288)) // lane 0 keeps the current value
+	MOVL(ECX, Mem{Base: R12}.Offset(304))
+	SHLQ(Imm(32), RCX)
+	ADDQ(RCX, RDX) // RDX = high<<32 + low
+	ADDQ(Imm(1), RDX)
+	MOVQ(RDX, RCX)
+	SHRQ(Imm(32), RCX)
+	MOVL(EDX, Mem{Base: R12}.Offset(292)) // lane 1 = value + 1
+	MOVL(ECX, Mem{Base: R12}.Offset(308))
+	ADDQ(Imm(1), RDX)
+	MOVQ(RDX, RCX)
+	SHRQ(Imm(32), RCX)
+	MOVL(EDX, Mem{Base: R12}.Offset(296)) // lane 2 = value + 2
+	MOVL(ECX, Mem{Base: R12}.Offset(312))
+	ADDQ(Imm(1), RDX)
+	MOVQ(RDX, RCX)
+	SHRQ(Imm(32), RCX)
+	MOVL(EDX, Mem{Base: R12}.Offset(300)) // lane 3 = value + 3
+	MOVL(ECX, Mem{Base: R12}.Offset(316))
+	ADDQ(Imm(1), RDX)
+	MOVQ(RDX, RCX)
+	SHRQ(Imm(32), RCX)
+	MOVL(EDX, Mem{Base: R12}.Offset(16)) // write value + 4 back into the state
+	MOVL(ECX, Mem{Base: R12}.Offset(36))
+	MOVQ(R9, Mem{Base: R12}.Offset(352)) // spill the remaining byte count; MAINLOOP1 reuses R9 and reloads it at the end
+	MOVQ(U32(20), RDX) // loop counter; MAINLOOP1 subtracts 2 per iteration
+	MOVOA(Mem{Base: R12}.Offset(64), X0)
+	MOVOA(Mem{Base: R12}.Offset(80), X1)
+	MOVOA(Mem{Base: R12}.Offset(96), X2)
+	MOVOA(Mem{Base: R12}.Offset(256), X3)
+	MOVOA(Mem{Base: R12}.Offset(272), X4)
+	MOVOA(Mem{Base: R12}.Offset(128), X5)
+	MOVOA(Mem{Base: R12}.Offset(144), X6)
+	MOVOA(Mem{Base: R12}.Offset(176), X7)
+	MOVOA(Mem{Base: R12}.Offset(192), X8)
+	MOVOA(Mem{Base: R12}.Offset(208), X9)
+	MOVOA(Mem{Base: R12}.Offset(224), X10)
+	MOVOA(Mem{Base: R12}.Offset(304), X11)
+	MOVOA(Mem{Base: R12}.Offset(112), X12)
+	MOVOA(Mem{Base: R12}.Offset(160), X13)
+	MOVOA(Mem{Base: R12}.Offset(240), X14)
+	MOVOA(Mem{Base: R12}.Offset(288), X15)
+}
+
+func MAINLOOP1() { // Round loop over X0..X15 for four 64-byte blocks at once, then XORs the result with 256 bytes of input.
+	Label("MAINLOOP1")
+	MOVOA(X1, Mem{Base: R12}.Offset(320)) // spill X1 and X2 so they can serve as temporaries
+	MOVOA(X2, Mem{Base: R12}.Offset(336))
+	MOVOA(X13, X1) // recurring pattern: dst ^= rotl32(a + b, k), built from PSLLL k and PSRLL 32-k; here X14 ^= rotl32(X13 + X12, 7)
+	PADDL(X12, X1)
+	MOVOA(X1, X2)
+	PSLLL(Imm(7), X1)
+	PXOR(X1, X14)
+	PSRLL(Imm(25), X2)
+	PXOR(X2, X14)
+	MOVOA(X7, X1)
+	PADDL(X0, X1)
+	MOVOA(X1, X2)
+	PSLLL(Imm(7), X1)
+	PXOR(X1, X11)
+	PSRLL(Imm(25), X2)
+	PXOR(X2, X11)
+	MOVOA(X12, X1)
+	PADDL(X14, X1)
+	MOVOA(X1, X2)
+	PSLLL(Imm(9), X1)
+	PXOR(X1, X15)
+	PSRLL(Imm(23), X2)
+	PXOR(X2, X15)
+	MOVOA(X0, X1)
+	PADDL(X11, X1)
+	MOVOA(X1, X2)
+	PSLLL(Imm(9), X1)
+	PXOR(X1, X9)
+	PSRLL(Imm(23), X2)
+	PXOR(X2, X9)
+	MOVOA(X14, X1)
+	PADDL(X15, X1)
+	MOVOA(X1, X2)
+	PSLLL(Imm(13), X1)
+	PXOR(X1, X13)
+	PSRLL(Imm(19), X2)
+	PXOR(X2, X13)
+	MOVOA(X11, X1)
+	PADDL(X9, X1)
+	MOVOA(X1, X2)
+	PSLLL(Imm(13), X1)
+	PXOR(X1, X7)
+	PSRLL(Imm(19), X2)
+	PXOR(X2, X7)
+	MOVOA(X15, X1)
+	PADDL(X13, X1)
+	MOVOA(X1, X2)
+	PSLLL(Imm(18), X1)
+	PXOR(X1, X12)
+	PSRLL(Imm(14), X2)
+	PXOR(X2, X12)
+	MOVOA(Mem{Base: R12}.Offset(320), X1) // reload the vector spilled at 320(R12); X12 takes over the slot
+	MOVOA(X12, Mem{Base: R12}.Offset(320))
+	MOVOA(X9, X2)
+	PADDL(X7, X2)
+	MOVOA(X2, X12)
+	PSLLL(Imm(18), X2)
+	PXOR(X2, X0)
+	PSRLL(Imm(14), X12)
+	PXOR(X12, X0)
+	MOVOA(X5, X2)
+	PADDL(X1, X2)
+	MOVOA(X2, X12)
+	PSLLL(Imm(7), X2)
+	PXOR(X2, X3)
+	PSRLL(Imm(25), X12)
+	PXOR(X12, X3)
+	MOVOA(Mem{Base: R12}.Offset(336), X2) // reload the vector spilled at 336(R12); X0 takes over the slot
+	MOVOA(X0, Mem{Base: R12}.Offset(336))
+	MOVOA(X6, X0)
+	PADDL(X2, X0)
+	MOVOA(X0, X12)
+	PSLLL(Imm(7), X0)
+	PXOR(X0, X4)
+	PSRLL(Imm(25), X12)
+	PXOR(X12, X4)
+	MOVOA(X1, X0)
+	PADDL(X3, X0)
+	MOVOA(X0, X12)
+	PSLLL(Imm(9), X0)
+	PXOR(X0, X10)
+	PSRLL(Imm(23), X12)
+	PXOR(X12, X10)
+	MOVOA(X2, X0)
+	PADDL(X4, X0)
+	MOVOA(X0, X12)
+	PSLLL(Imm(9), X0)
+	PXOR(X0, X8)
+	PSRLL(Imm(23), X12)
+	PXOR(X12, X8)
+	MOVOA(X3, X0)
+	PADDL(X10, X0)
+	MOVOA(X0, X12)
+	PSLLL(Imm(13), X0)
+	PXOR(X0, X5)
+	PSRLL(Imm(19), X12)
+	PXOR(X12, X5)
+	MOVOA(X4, X0)
+	PADDL(X8, X0)
+	MOVOA(X0, X12)
+	PSLLL(Imm(13), X0)
+	PXOR(X0, X6)
+	PSRLL(Imm(19), X12)
+	PXOR(X12, X6)
+	MOVOA(X10, X0)
+	PADDL(X5, X0)
+	MOVOA(X0, X12)
+	PSLLL(Imm(18), X0)
+	PXOR(X0, X1)
+	PSRLL(Imm(14), X12)
+	PXOR(X12, X1)
+	MOVOA(Mem{Base: R12}.Offset(320), X0)
+	MOVOA(X1, Mem{Base: R12}.Offset(320))
+	MOVOA(X4, X1)
+	PADDL(X0, X1)
+	MOVOA(X1, X12)
+	PSLLL(Imm(7), X1)
+	PXOR(X1, X7)
+	PSRLL(Imm(25), X12)
+	PXOR(X12, X7)
+	MOVOA(X8, X1)
+	PADDL(X6, X1)
+	MOVOA(X1, X12)
+	PSLLL(Imm(18), X1)
+	PXOR(X1, X2)
+	PSRLL(Imm(14), X12)
+	PXOR(X12, X2)
+	MOVOA(Mem{Base: R12}.Offset(336), X12)
+	MOVOA(X2, Mem{Base: R12}.Offset(336))
+	MOVOA(X14, X1)
+	PADDL(X12, X1)
+	MOVOA(X1, X2)
+	PSLLL(Imm(7), X1)
+	PXOR(X1, X5)
+	PSRLL(Imm(25), X2)
+	PXOR(X2, X5)
+	MOVOA(X0, X1)
+	PADDL(X7, X1)
+	MOVOA(X1, X2)
+	PSLLL(Imm(9), X1)
+	PXOR(X1, X10)
+	PSRLL(Imm(23), X2)
+	PXOR(X2, X10)
+	MOVOA(X12, X1)
+	PADDL(X5, X1)
+	MOVOA(X1, X2)
+	PSLLL(Imm(9), X1)
+	PXOR(X1, X8)
+	PSRLL(Imm(23), X2)
+	PXOR(X2, X8)
+	MOVOA(X7, X1)
+	PADDL(X10, X1)
+	MOVOA(X1, X2)
+	PSLLL(Imm(13), X1)
+	PXOR(X1, X4)
+	PSRLL(Imm(19), X2)
+	PXOR(X2, X4)
+	MOVOA(X5, X1)
+	PADDL(X8, X1)
+	MOVOA(X1, X2)
+	PSLLL(Imm(13), X1)
+	PXOR(X1, X14)
+	PSRLL(Imm(19), X2)
+	PXOR(X2, X14)
+	MOVOA(X10, X1)
+	PADDL(X4, X1)
+	MOVOA(X1, X2)
+	PSLLL(Imm(18), X1)
+	PXOR(X1, X0)
+	PSRLL(Imm(14), X2)
+	PXOR(X2, X0)
+	MOVOA(Mem{Base: R12}.Offset(320), X1)
+	MOVOA(X0, Mem{Base: R12}.Offset(320))
+	MOVOA(X8, X0)
+	PADDL(X14, X0)
+	MOVOA(X0, X2)
+	PSLLL(Imm(18), X0)
+	PXOR(X0, X12)
+	PSRLL(Imm(14), X2)
+	PXOR(X2, X12)
+	MOVOA(X11, X0)
+	PADDL(X1, X0)
+	MOVOA(X0, X2)
+	PSLLL(Imm(7), X0)
+	PXOR(X0, X6)
+	PSRLL(Imm(25), X2)
+	PXOR(X2, X6)
+	MOVOA(Mem{Base: R12}.Offset(336), X2)
+	MOVOA(X12, Mem{Base: R12}.Offset(336))
+	MOVOA(X3, X0)
+	PADDL(X2, X0)
+	MOVOA(X0, X12)
+	PSLLL(Imm(7), X0)
+	PXOR(X0, X13)
+	PSRLL(Imm(25), X12)
+	PXOR(X12, X13)
+	MOVOA(X1, X0)
+	PADDL(X6, X0)
+	MOVOA(X0, X12)
+	PSLLL(Imm(9), X0)
+	PXOR(X0, X15)
+	PSRLL(Imm(23), X12)
+	PXOR(X12, X15)
+	MOVOA(X2, X0)
+	PADDL(X13, X0)
+	MOVOA(X0, X12)
+	PSLLL(Imm(9), X0)
+	PXOR(X0, X9)
+	PSRLL(Imm(23), X12)
+	PXOR(X12, X9)
+	MOVOA(X6, X0)
+	PADDL(X15, X0)
+	MOVOA(X0, X12)
+	PSLLL(Imm(13), X0)
+	PXOR(X0, X11)
+	PSRLL(Imm(19), X12)
+	PXOR(X12, X11)
+	MOVOA(X13, X0)
+	PADDL(X9, X0)
+	MOVOA(X0, X12)
+	PSLLL(Imm(13), X0)
+	PXOR(X0, X3)
+	PSRLL(Imm(19), X12)
+	PXOR(X12, X3)
+	MOVOA(X15, X0)
+	PADDL(X11, X0)
+	MOVOA(X0, X12)
+	PSLLL(Imm(18), X0)
+	PXOR(X0, X1)
+	PSRLL(Imm(14), X12)
+	PXOR(X12, X1)
+	MOVOA(X9, X0)
+	PADDL(X3, X0)
+	MOVOA(X0, X12)
+	PSLLL(Imm(18), X0)
+	PXOR(X0, X2)
+	PSRLL(Imm(14), X12)
+	PXOR(X12, X2)
+	MOVOA(Mem{Base: R12}.Offset(320), X12) // restore X12 and X0 from the spill slots before the back edge
+	MOVOA(Mem{Base: R12}.Offset(336), X0)
+	SUBQ(Imm(2), RDX) // RDX starts at 20 (set in BYTESATLEAST256), so the loop body runs ten times
+	JA(LabelRef("MAINLOOP1"))
+	PADDL(Mem{Base: R12}.Offset(112), X12) // add the pre-loop values stored by START / BYTESATLEAST256
+	PADDL(Mem{Base: R12}.Offset(176), X7)
+	PADDL(Mem{Base: R12}.Offset(224), X10)
+	PADDL(Mem{Base: R12}.Offset(272), X4)
+	MOVD(X12, EDX) // lane 0 of each vector goes to the first 64-byte block (offsets 0..63)
+	MOVD(X7, ECX)
+	MOVD(X10, R8)
+	MOVD(X4, R9)
+	PSHUFL(Imm(0x39), X12, X12) // 0x39 rotates the lanes down by one, bringing the next block's word into lane 0
+	PSHUFL(Imm(0x39), X7, X7)
+	PSHUFL(Imm(0x39), X10, X10)
+	PSHUFL(Imm(0x39), X4, X4)
+	XORL(Mem{Base: SI}.Offset(0), EDX) // XOR with the input at SI, then store to the output at DI
+	XORL(Mem{Base: SI}.Offset(4), ECX)
+	XORL(Mem{Base: SI}.Offset(8), R8L)
+	XORL(Mem{Base: SI}.Offset(12), R9L)
+	MOVL(EDX, Mem{Base: DI}.Offset(0))
+	MOVL(ECX, Mem{Base: DI}.Offset(4))
+	MOVL(R8L, Mem{Base: DI}.Offset(8))
+	MOVL(R9L, Mem{Base: DI}.Offset(12))
+	MOVD(X12, EDX) // second block (offsets 64..127)
+	MOVD(X7, ECX)
+	MOVD(X10, R8)
+	MOVD(X4, R9)
+	PSHUFL(Imm(0x39), X12, X12)
+	PSHUFL(Imm(0x39), X7, X7)
+	PSHUFL(Imm(0x39), X10, X10)
+	PSHUFL(Imm(0x39), X4, X4)
+	XORL(Mem{Base: SI}.Offset(64), EDX)
+	XORL(Mem{Base: SI}.Offset(68), ECX)
+	XORL(Mem{Base: SI}.Offset(72), R8L)
+	XORL(Mem{Base: SI}.Offset(76), R9L)
+	MOVL(EDX, Mem{Base: DI}.Offset(64))
+	MOVL(ECX, Mem{Base: DI}.Offset(68))
+	MOVL(R8L, Mem{Base: DI}.Offset(72))
+	MOVL(R9L, Mem{Base: DI}.Offset(76))
+	MOVD(X12, EDX) // third block (offsets 128..191)
+	MOVD(X7, ECX)
+	MOVD(X10, R8)
+	MOVD(X4, R9)
+	PSHUFL(Imm(0x39), X12, X12)
+	PSHUFL(Imm(0x39), X7, X7)
+	PSHUFL(Imm(0x39), X10, X10)
+	PSHUFL(Imm(0x39), X4, X4)
+	XORL(Mem{Base: SI}.Offset(128), EDX)
+	XORL(Mem{Base: SI}.Offset(132), ECX)
+	XORL(Mem{Base: SI}.Offset(136), R8L)
+	XORL(Mem{Base: SI}.Offset(140), R9L)
+	MOVL(EDX, Mem{Base: DI}.Offset(128))
+	MOVL(ECX, Mem{Base: DI}.Offset(132))
+	MOVL(R8L, Mem{Base: DI}.Offset(136))
+	MOVL(R9L, Mem{Base: DI}.Offset(140))
+	MOVD(X12, EDX) // fourth block (offsets 192..255); no further shuffle needed
+	MOVD(X7, ECX)
+	MOVD(X10, R8)
+	MOVD(X4, R9)
+	XORL(Mem{Base: SI}.Offset(192), EDX)
+	XORL(Mem{Base: SI}.Offset(196), ECX)
+	XORL(Mem{Base: SI}.Offset(200), R8L)
+	XORL(Mem{Base: SI}.Offset(204), R9L)
+	MOVL(EDX, Mem{Base: DI}.Offset(192))
+	MOVL(ECX, Mem{Base: DI}.Offset(196))
+	MOVL(R8L, Mem{Base: DI}.Offset(200))
+	MOVL(R9L, Mem{Base: DI}.Offset(204))
+	PADDL(Mem{Base: R12}.Offset(240), X14) // same extract/XOR/store sequence for bytes 16..31 of each block
+	PADDL(Mem{Base: R12}.Offset(64), X0)
+	PADDL(Mem{Base: R12}.Offset(128), X5)
+	PADDL(Mem{Base: R12}.Offset(192), X8)
+	MOVD(X14, EDX)
+	MOVD(X0, ECX)
+	MOVD(X5, R8)
+	MOVD(X8, R9)
+	PSHUFL(Imm(0x39), X14, X14)
+	PSHUFL(Imm(0x39), X0, X0)
+	PSHUFL(Imm(0x39), X5, X5)
+	PSHUFL(Imm(0x39), X8, X8)
+	XORL(Mem{Base: SI}.Offset(16), EDX)
+	XORL(Mem{Base: SI}.Offset(20), ECX)
+	XORL(Mem{Base: SI}.Offset(24), R8L)
+	XORL(Mem{Base: SI}.Offset(28), R9L)
+	MOVL(EDX, Mem{Base: DI}.Offset(16))
+	MOVL(ECX, Mem{Base: DI}.Offset(20))
+	MOVL(R8L, Mem{Base: DI}.Offset(24))
+	MOVL(R9L, Mem{Base: DI}.Offset(28))
+	MOVD(X14, EDX)
+	MOVD(X0, ECX)
+	MOVD(X5, R8)
+	MOVD(X8, R9)
+	PSHUFL(Imm(0x39), X14, X14)
+	PSHUFL(Imm(0x39), X0, X0)
+	PSHUFL(Imm(0x39), X5, X5)
+	PSHUFL(Imm(0x39), X8, X8)
+	XORL(Mem{Base: SI}.Offset(80), EDX)
+	XORL(Mem{Base: SI}.Offset(84), ECX)
+	XORL(Mem{Base: SI}.Offset(88), R8L)
+	XORL(Mem{Base: SI}.Offset(92), R9L)
+	MOVL(EDX, Mem{Base: DI}.Offset(80))
+	MOVL(ECX, Mem{Base: DI}.Offset(84))
+	MOVL(R8L, Mem{Base: DI}.Offset(88))
+	MOVL(R9L, Mem{Base: DI}.Offset(92))
+	MOVD(X14, EDX)
+	MOVD(X0, ECX)
+	MOVD(X5, R8)
+	MOVD(X8, R9)
+	PSHUFL(Imm(0x39), X14, X14)
+	PSHUFL(Imm(0x39), X0, X0)
+	PSHUFL(Imm(0x39), X5, X5)
+	PSHUFL(Imm(0x39), X8, X8)
+	XORL(Mem{Base: SI}.Offset(144), EDX)
+	XORL(Mem{Base: SI}.Offset(148), ECX)
+	XORL(Mem{Base: SI}.Offset(152), R8L)
+	XORL(Mem{Base: SI}.Offset(156), R9L)
+	MOVL(EDX, Mem{Base: DI}.Offset(144))
+	MOVL(ECX, Mem{Base: DI}.Offset(148))
+	MOVL(R8L, Mem{Base: DI}.Offset(152))
+	MOVL(R9L, Mem{Base: DI}.Offset(156))
+	MOVD(X14, EDX)
+	MOVD(X0, ECX)
+	MOVD(X5, R8)
+	MOVD(X8, R9)
+	XORL(Mem{Base: SI}.Offset(208), EDX)
+	XORL(Mem{Base: SI}.Offset(212), ECX)
+	XORL(Mem{Base: SI}.Offset(216), R8L)
+	XORL(Mem{Base: SI}.Offset(220), R9L)
+	MOVL(EDX, Mem{Base: DI}.Offset(208))
+	MOVL(ECX, Mem{Base: DI}.Offset(212))
+	MOVL(R8L, Mem{Base: DI}.Offset(216))
+	MOVL(R9L, Mem{Base: DI}.Offset(220))
+	PADDL(Mem{Base: R12}.Offset(288), X15) // bytes 32..47 of each block; 288/304(R12) hold the per-lane values from BYTESATLEAST256
+	PADDL(Mem{Base: R12}.Offset(304), X11)
+	PADDL(Mem{Base: R12}.Offset(80), X1)
+	PADDL(Mem{Base: R12}.Offset(144), X6)
+	MOVD(X15, EDX)
+	MOVD(X11, ECX)
+	MOVD(X1, R8)
+	MOVD(X6, R9)
+	PSHUFL(Imm(0x39), X15, X15)
+	PSHUFL(Imm(0x39), X11, X11)
+	PSHUFL(Imm(0x39), X1, X1)
+	PSHUFL(Imm(0x39), X6, X6)
+	XORL(Mem{Base: SI}.Offset(32), EDX)
+	XORL(Mem{Base: SI}.Offset(36), ECX)
+	XORL(Mem{Base: SI}.Offset(40), R8L)
+	XORL(Mem{Base: SI}.Offset(44), R9L)
+	MOVL(EDX, Mem{Base: DI}.Offset(32))
+	MOVL(ECX, Mem{Base: DI}.Offset(36))
+	MOVL(R8L, Mem{Base: DI}.Offset(40))
+	MOVL(R9L, Mem{Base: DI}.Offset(44))
+	MOVD(X15, EDX)
+	MOVD(X11, ECX)
+	MOVD(X1, R8)
+	MOVD(X6, R9)
+	PSHUFL(Imm(0x39), X15, X15)
+	PSHUFL(Imm(0x39), X11, X11)
+	PSHUFL(Imm(0x39), X1, X1)
+	PSHUFL(Imm(0x39), X6, X6)
+	XORL(Mem{Base: SI}.Offset(96), EDX)
+	XORL(Mem{Base: SI}.Offset(100), ECX)
+	XORL(Mem{Base: SI}.Offset(104), R8L)
+	XORL(Mem{Base: SI}.Offset(108), R9L)
+	MOVL(EDX, Mem{Base: DI}.Offset(96))
+	MOVL(ECX, Mem{Base: DI}.Offset(100))
+	MOVL(R8L, Mem{Base: DI}.Offset(104))
+	MOVL(R9L, Mem{Base: DI}.Offset(108))
+	MOVD(X15, EDX)
+	MOVD(X11, ECX)
+	MOVD(X1, R8)
+	MOVD(X6, R9)
+	PSHUFL(Imm(0x39), X15, X15)
+	PSHUFL(Imm(0x39), X11, X11)
+	PSHUFL(Imm(0x39), X1, X1)
+	PSHUFL(Imm(0x39), X6, X6)
+	XORL(Mem{Base: SI}.Offset(160), EDX)
+	XORL(Mem{Base: SI}.Offset(164), ECX)
+	XORL(Mem{Base: SI}.Offset(168), R8L)
+	XORL(Mem{Base: SI}.Offset(172), R9L)
+	MOVL(EDX, Mem{Base: DI}.Offset(160))
+	MOVL(ECX, Mem{Base: DI}.Offset(164))
+	MOVL(R8L, Mem{Base: DI}.Offset(168))
+	MOVL(R9L, Mem{Base: DI}.Offset(172))
+	MOVD(X15, EDX)
+	MOVD(X11, ECX)
+	MOVD(X1, R8)
+	MOVD(X6, R9)
+	XORL(Mem{Base: SI}.Offset(224), EDX)
+	XORL(Mem{Base: SI}.Offset(228), ECX)
+	XORL(Mem{Base: SI}.Offset(232), R8L)
+	XORL(Mem{Base: SI}.Offset(236), R9L)
+	MOVL(EDX, Mem{Base: DI}.Offset(224))
+	MOVL(ECX, Mem{Base: DI}.Offset(228))
+	MOVL(R8L, Mem{Base: DI}.Offset(232))
+	MOVL(R9L, Mem{Base: DI}.Offset(236))
+	PADDL(Mem{Base: R12}.Offset(160), X13) // bytes 48..63 of each block
+	PADDL(Mem{Base: R12}.Offset(208), X9)
+	PADDL(Mem{Base: R12}.Offset(256), X3)
+	PADDL(Mem{Base: R12}.Offset(96), X2)
+	MOVD(X13, EDX)
+	MOVD(X9, ECX)
+	MOVD(X3, R8)
+	MOVD(X2, R9)
+	PSHUFL(Imm(0x39), X13, X13)
+	PSHUFL(Imm(0x39), X9, X9)
+	PSHUFL(Imm(0x39), X3, X3)
+	PSHUFL(Imm(0x39), X2, X2)
+	XORL(Mem{Base: SI}.Offset(48), EDX)
+	XORL(Mem{Base: SI}.Offset(52), ECX)
+	XORL(Mem{Base: SI}.Offset(56), R8L)
+	XORL(Mem{Base: SI}.Offset(60), R9L)
+	MOVL(EDX, Mem{Base: DI}.Offset(48))
+	MOVL(ECX, Mem{Base: DI}.Offset(52))
+	MOVL(R8L, Mem{Base: DI}.Offset(56))
+	MOVL(R9L, Mem{Base: DI}.Offset(60))
+	MOVD(X13, EDX)
+	MOVD(X9, ECX)
+	MOVD(X3, R8)
+	MOVD(X2, R9)
+	PSHUFL(Imm(0x39), X13, X13)
+	PSHUFL(Imm(0x39), X9, X9)
+	PSHUFL(Imm(0x39), X3, X3)
+	PSHUFL(Imm(0x39), X2, X2)
+	XORL(Mem{Base: SI}.Offset(112), EDX)
+	XORL(Mem{Base: SI}.Offset(116), ECX)
+	XORL(Mem{Base: SI}.Offset(120), R8L)
+	XORL(Mem{Base: SI}.Offset(124), R9L)
+	MOVL(EDX, Mem{Base: DI}.Offset(112))
+	MOVL(ECX, Mem{Base: DI}.Offset(116))
+	MOVL(R8L, Mem{Base: DI}.Offset(120))
+	MOVL(R9L, Mem{Base: DI}.Offset(124))
+	MOVD(X13, EDX)
+	MOVD(X9, ECX)
+	MOVD(X3, R8)
+	MOVD(X2, R9)
+	PSHUFL(Imm(0x39), X13, X13)
+	PSHUFL(Imm(0x39), X9, X9)
+	PSHUFL(Imm(0x39), X3, X3)
+	PSHUFL(Imm(0x39), X2, X2)
+	XORL(Mem{Base: SI}.Offset(176), EDX)
+	XORL(Mem{Base: SI}.Offset(180), ECX)
+	XORL(Mem{Base: SI}.Offset(184), R8L)
+	XORL(Mem{Base: SI}.Offset(188), R9L)
+	MOVL(EDX, Mem{Base: DI}.Offset(176))
+	MOVL(ECX, Mem{Base: DI}.Offset(180))
+	MOVL(R8L, Mem{Base: DI}.Offset(184))
+	MOVL(R9L, Mem{Base: DI}.Offset(188))
+	MOVD(X13, EDX)
+	MOVD(X9, ECX)
+	MOVD(X3, R8)
+	MOVD(X2, R9)
+	XORL(Mem{Base: SI}.Offset(240), EDX)
+	XORL(Mem{Base: SI}.Offset(244), ECX)
+	XORL(Mem{Base: SI}.Offset(248), R8L)
+	XORL(Mem{Base: SI}.Offset(252), R9L)
+	MOVL(EDX, Mem{Base: DI}.Offset(240))
+	MOVL(ECX, Mem{Base: DI}.Offset(244))
+	MOVL(R8L, Mem{Base: DI}.Offset(248))
+	MOVL(R9L, Mem{Base: DI}.Offset(252))
+	MOVQ(Mem{Base: R12}.Offset(352), R9) // reload the byte count spilled by BYTESATLEAST256
+	SUBQ(U32(256), R9)
+	ADDQ(U32(256), RSI)
+	ADDQ(U32(256), RDI)
+	CMPQ(R9, U32(256))
+	JAE(LabelRef("BYTESATLEAST256")) // at least 256 bytes left: process another four blocks
+	CMPQ(R9, Imm(0))
+	JBE(LabelRef("DONE")) // otherwise fall through to the single-block path for the 1..255 remaining bytes
+}
+
+func BYTESBETWEEN1AND255() { // For a final block shorter than 64 bytes, redirects input and output to the scratch buffer at 360(R12).
+	Label("BYTESBETWEEN1AND255")
+	CMPQ(R9, Imm(64))
+	JAE(LabelRef("NOCOPY")) // a full 64-byte block is available: work directly on the caller's buffers
+	MOVQ(RDI, RDX) // remember the real output pointer; MAINLOOP2 copies the result back to it
+	LEAQ(Mem{Base: R12}.Offset(360), RDI)
+	MOVQ(R9, RCX)
+	// Hack to get Avo to emit:
+	// 	REP; MOVSB
+	Instruction(&ir.Instruction{Opcode: "REP; MOVSB"}) // copies RCX (= R9) input bytes from (RSI) into the scratch buffer
+	LEAQ(Mem{Base: R12}.Offset(360), RDI) // REP MOVSB advanced RDI, so point both RDI and RSI at the scratch buffer again
+	LEAQ(Mem{Base: R12}.Offset(360), RSI)
+}
+
+func NOCOPY() { // Loads the four 16-byte rows of the state into X0..X3 ahead of MAINLOOP2.
+	Label("NOCOPY")
+	MOVQ(R9, Mem{Base: R12}.Offset(352)) // spill the remaining byte count; MAINLOOP2 reuses R9 and reloads it afterwards
+	MOVOA(Mem{Base: R12}.Offset(48), X0)
+	MOVOA(Mem{Base: R12}.Offset(0), X1)
+	MOVOA(Mem{Base: R12}.Offset(16), X2)
+	MOVOA(Mem{Base: R12}.Offset(32), X3)
+	MOVOA(X1, X4)
+	MOVQ(U32(20), RCX) // loop counter; MAINLOOP2 subtracts 4 per iteration
+}
+
+func MAINLOOP2() { // Single-block round loop on X0..X3, then XORs 64 bytes of input and advances the 64-bit word at 16/36(R12).
+	Label("MAINLOOP2")
+	PADDL(X0, X4) // X4 holds a copy of X1 on entry, so this starts X3 ^= rotl32(X1 + X0, 7)
+	MOVOA(X0, X5)
+	MOVOA(X4, X6)
+	PSLLL(Imm(7), X4)
+	PSRLL(Imm(25), X6)
+	PXOR(X4, X3)
+	PXOR(X6, X3)
+	PADDL(X3, X5)
+	MOVOA(X3, X4)
+	MOVOA(X5, X6)
+	PSLLL(Imm(9), X5)
+	PSRLL(Imm(23), X6)
+	PXOR(X5, X2)
+	PSHUFL(Imm(0x93), X3, X3) // lane rotations (0x93, 0x4E, 0x39) are interleaved with the arithmetic between steps
+	PXOR(X6, X2)
+	PADDL(X2, X4)
+	MOVOA(X2, X5)
+	MOVOA(X4, X6)
+	PSLLL(Imm(13), X4)
+	PSRLL(Imm(19), X6)
+	PXOR(X4, X1)
+	PSHUFL(Imm(0x4E), X2, X2)
+	PXOR(X6, X1)
+	PADDL(X1, X5)
+	MOVOA(X3, X4)
+	MOVOA(X5, X6)
+	PSLLL(Imm(18), X5)
+	PSRLL(Imm(14), X6)
+	PXOR(X5, X0)
+	PSHUFL(Imm(0x39), X1, X1)
+	PXOR(X6, X0)
+	PADDL(X0, X4)
+	MOVOA(X0, X5)
+	MOVOA(X4, X6)
+	PSLLL(Imm(7), X4)
+	PSRLL(Imm(25), X6)
+	PXOR(X4, X1)
+	PXOR(X6, X1)
+	PADDL(X1, X5)
+	MOVOA(X1, X4)
+	MOVOA(X5, X6)
+	PSLLL(Imm(9), X5)
+	PSRLL(Imm(23), X6)
+	PXOR(X5, X2)
+	PSHUFL(Imm(0x93), X1, X1)
+	PXOR(X6, X2)
+	PADDL(X2, X4)
+	MOVOA(X2, X5)
+	MOVOA(X4, X6)
+	PSLLL(Imm(13), X4)
+	PSRLL(Imm(19), X6)
+	PXOR(X4, X3)
+	PSHUFL(Imm(0x4E), X2, X2)
+	PXOR(X6, X3)
+	PADDL(X3, X5)
+	MOVOA(X1, X4)
+	MOVOA(X5, X6)
+	PSLLL(Imm(18), X5)
+	PSRLL(Imm(14), X6)
+	PXOR(X5, X0)
+	PSHUFL(Imm(0x39), X3, X3)
+	PXOR(X6, X0)
+	PADDL(X0, X4)
+	MOVOA(X0, X5)
+	MOVOA(X4, X6)
+	PSLLL(Imm(7), X4)
+	PSRLL(Imm(25), X6)
+	PXOR(X4, X3)
+	PXOR(X6, X3)
+	PADDL(X3, X5)
+	MOVOA(X3, X4)
+	MOVOA(X5, X6)
+	PSLLL(Imm(9), X5)
+	PSRLL(Imm(23), X6)
+	PXOR(X5, X2)
+	PSHUFL(Imm(0x93), X3, X3)
+	PXOR(X6, X2)
+	PADDL(X2, X4)
+	MOVOA(X2, X5)
+	MOVOA(X4, X6)
+	PSLLL(Imm(13), X4)
+	PSRLL(Imm(19), X6)
+	PXOR(X4, X1)
+	PSHUFL(Imm(0x4E), X2, X2)
+	PXOR(X6, X1)
+	PADDL(X1, X5)
+	MOVOA(X3, X4)
+	MOVOA(X5, X6)
+	PSLLL(Imm(18), X5)
+	PSRLL(Imm(14), X6)
+	PXOR(X5, X0)
+	PSHUFL(Imm(0x39), X1, X1)
+	PXOR(X6, X0)
+	PADDL(X0, X4)
+	MOVOA(X0, X5)
+	MOVOA(X4, X6)
+	PSLLL(Imm(7), X4)
+	PSRLL(Imm(25), X6)
+	PXOR(X4, X1)
+	PXOR(X6, X1)
+	PADDL(X1, X5)
+	MOVOA(X1, X4)
+	MOVOA(X5, X6)
+	PSLLL(Imm(9), X5)
+	PSRLL(Imm(23), X6)
+	PXOR(X5, X2)
+	PSHUFL(Imm(0x93), X1, X1)
+	PXOR(X6, X2)
+	PADDL(X2, X4)
+	MOVOA(X2, X5)
+	MOVOA(X4, X6)
+	PSLLL(Imm(13), X4)
+	PSRLL(Imm(19), X6)
+	PXOR(X4, X3)
+	PSHUFL(Imm(0x4E), X2, X2)
+	PXOR(X6, X3)
+	SUBQ(Imm(4), RCX) // sets the flags tested by the JA below; the SSE instructions in between do not modify them
+	PADDL(X3, X5)
+	MOVOA(X1, X4)
+	MOVOA(X5, X6)
+	PSLLL(Imm(18), X5)
+	PXOR(X7, X7) // zeroes X7; X7 is not read anywhere else in this section
+	PSRLL(Imm(14), X6)
+	PXOR(X5, X0)
+	PSHUFL(Imm(0x39), X3, X3)
+	PXOR(X6, X0)
+	JA(LabelRef("MAINLOOP2")) // RCX starts at 20 (set in NOCOPY), so the loop body runs five times
+	PADDL(Mem{Base: R12}.Offset(48), X0) // add the original state rows back in
+	PADDL(Mem{Base: R12}.Offset(0), X1)
+	PADDL(Mem{Base: R12}.Offset(16), X2)
+	PADDL(Mem{Base: R12}.Offset(32), X3)
+	MOVD(X0, ECX) // extract lane 0 of each row, XOR with input at SI, store at DI; offsets follow the rows' lane layout
+	MOVD(X1, R8)
+	MOVD(X2, R9)
+	MOVD(X3, EAX)
+	PSHUFL(Imm(0x39), X0, X0) // rotate lanes down by one so the next word lands in lane 0
+	PSHUFL(Imm(0x39), X1, X1)
+	PSHUFL(Imm(0x39), X2, X2)
+	PSHUFL(Imm(0x39), X3, X3)
+	XORL(Mem{Base: SI}.Offset(0), ECX)
+	XORL(Mem{Base: SI}.Offset(48), R8L)
+	XORL(Mem{Base: SI}.Offset(32), R9L)
+	XORL(Mem{Base: SI}.Offset(16), EAX)
+	MOVL(ECX, Mem{Base: DI}.Offset(0))
+	MOVL(R8L, Mem{Base: DI}.Offset(48))
+	MOVL(R9L, Mem{Base: DI}.Offset(32))
+	MOVL(EAX, Mem{Base: DI}.Offset(16))
+	MOVD(X0, ECX)
+	MOVD(X1, R8)
+	MOVD(X2, R9)
+	MOVD(X3, EAX)
+	PSHUFL(Imm(0x39), X0, X0)
+	PSHUFL(Imm(0x39), X1, X1)
+	PSHUFL(Imm(0x39), X2, X2)
+	PSHUFL(Imm(0x39), X3, X3)
+	XORL(Mem{Base: SI}.Offset(20), ECX)
+	XORL(Mem{Base: SI}.Offset(4), R8L)
+	XORL(Mem{Base: SI}.Offset(52), R9L)
+	XORL(Mem{Base: SI}.Offset(36), EAX)
+	MOVL(ECX, Mem{Base: DI}.Offset(20))
+	MOVL(R8L, Mem{Base: DI}.Offset(4))
+	MOVL(R9L, Mem{Base: DI}.Offset(52))
+	MOVL(EAX, Mem{Base: DI}.Offset(36))
+	MOVD(X0, ECX)
+	MOVD(X1, R8)
+	MOVD(X2, R9)
+	MOVD(X3, EAX)
+	PSHUFL(Imm(0x39), X0, X0)
+	PSHUFL(Imm(0x39), X1, X1)
+	PSHUFL(Imm(0x39), X2, X2)
+	PSHUFL(Imm(0x39), X3, X3)
+	XORL(Mem{Base: SI}.Offset(40), ECX)
+	XORL(Mem{Base: SI}.Offset(24), R8L)
+	XORL(Mem{Base: SI}.Offset(8), R9L)
+	XORL(Mem{Base: SI}.Offset(56), EAX)
+	MOVL(ECX, Mem{Base: DI}.Offset(40))
+	MOVL(R8L, Mem{Base: DI}.Offset(24))
+	MOVL(R9L, Mem{Base: DI}.Offset(8))
+	MOVL(EAX, Mem{Base: DI}.Offset(56))
+	MOVD(X0, ECX)
+	MOVD(X1, R8)
+	MOVD(X2, R9)
+	MOVD(X3, EAX)
+	XORL(Mem{Base: SI}.Offset(60), ECX)
+	XORL(Mem{Base: SI}.Offset(44), R8L)
+	XORL(Mem{Base: SI}.Offset(28), R9L)
+	XORL(Mem{Base: SI}.Offset(12), EAX)
+	MOVL(ECX, Mem{Base: DI}.Offset(60))
+	MOVL(R8L, Mem{Base: DI}.Offset(44))
+	MOVL(R9L, Mem{Base: DI}.Offset(28))
+	MOVL(EAX, Mem{Base: DI}.Offset(12))
+	MOVQ(Mem{Base: R12}.Offset(352), R9) // reload the byte count spilled by NOCOPY
+	MOVL(Mem{Base: R12}.Offset(16), ECX) // low half of the 64-bit word (presumably the block counter - confirm)
+	MOVL(Mem{Base: R12}.Offset(36), R8L) // high half
+	ADDQ(Imm(1), RCX)
+	SHLQ(Imm(32), R8)
+	ADDQ(R8, RCX) // RCX = (high<<32 + low) + 1
+	MOVQ(RCX, R8)
+	SHRQ(Imm(32), R8)
+	MOVL(ECX, Mem{Base: R12}.Offset(16))
+	MOVL(R8L, Mem{Base: R12}.Offset(36))
+	CMPQ(R9, Imm(64))
+	JA(LabelRef("BYTESATLEAST65")) // more than 64 bytes were pending: advance and produce another block
+	JAE(LabelRef("BYTESATLEAST64")) // exactly 64: finished
+	MOVQ(RDI, RSI) // partial block: RDI points at the scratch buffer, RDX holds the real output pointer saved earlier
+	MOVQ(RDX, RDI)
+	MOVQ(R9, RCX)
+	// Hack to get Avo to emit:
+	// 	REP; MOVSB
+	Instruction(&ir.Instruction{Opcode: "REP; MOVSB"}) // copy the R9 valid bytes from the scratch buffer to the caller's output
+}
diff --git a/salsa20/salsa/salsa20_amd64.s b/salsa20/salsa/salsa20_amd64.s
index fcce023..3883e0e 100644
--- a/salsa20/salsa/salsa20_amd64.s
+++ b/salsa20/salsa/salsa20_amd64.s
@@ -1,880 +1,880 @@
-// Copyright 2012 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
+// Code generated by command: go run salsa20_amd64_asm.go -out ../salsa20_amd64.s -pkg salsa. DO NOT EDIT.
 
 //go:build amd64 && !purego && gc
 
-// This code was translated into a form compatible with 6a from the public
-// domain sources in SUPERCOP: https://bench.cr.yp.to/supercop.html
+// func salsa2020XORKeyStream(out *byte, in *byte, n uint64, nonce *byte, key *byte)
+// Requires: SSE2
+TEXT ·salsa2020XORKeyStream(SB), $456-40
+	// This needs up to 64 bytes at 360(R12); hence the non-obvious frame size (360 + 64 = 424 bytes of scratch, plus 32 for aligning R12).
+	MOVQ   out+0(FP), DI // DI = out
+	MOVQ   in+8(FP), SI // SI = in
+	MOVQ   n+16(FP), DX // DX = n (moved to R9 below)
+	MOVQ   nonce+24(FP), CX // CX = nonce (moved to DX below)
+	MOVQ   key+32(FP), R8 // R8 = key (moved to R10 below)
+	MOVQ   SP, R12 // R12 = SP rounded up to a 32-byte boundary: base of the scratch area
+	ADDQ   $0x1f, R12
+	ANDQ   $-32, R12
+	MOVQ   DX, R9 // R9 = number of bytes still to process
+	MOVQ   CX, DX // DX = nonce pointer
+	MOVQ   R8, R10 // R10 = key pointer
+	CMPQ   R9, $0x00
+	JBE    DONE // unsigned compare against 0: return immediately when n == 0
+	MOVL   20(R10), CX // key[20:24]
+	MOVL   (R10), R8 // key[0:4]
+	MOVL   (DX), AX // nonce[0:4]
+	MOVL   16(R10), R11 // key[16:20]
+	MOVL   CX, (R12) // first 16-byte row of the state, at 0(R12)
+	MOVL   R8, 4(R12)
+	MOVL   AX, 8(R12)
+	MOVL   R11, 12(R12)
+	MOVL   8(DX), CX // nonce[8:12]; stored at 16(R12), the low half of the 64-bit counter incremented later
+	MOVL   24(R10), R8 // key[24:28]
+	MOVL   4(R10), AX // key[4:8]
+	MOVL   4(DX), R11 // nonce[4:8]
+	MOVL   CX, 16(R12) // second row, at 16(R12)
+	MOVL   R8, 20(R12)
+	MOVL   AX, 24(R12)
+	MOVL   R11, 28(R12)
+	MOVL   12(DX), CX // nonce[12:16]; stored at 36(R12), the high half of the counter
+	MOVL   12(R10), DX // key[12:16]; overwrites the nonce pointer, which is not needed after this point
+	MOVL   28(R10), R8 // key[28:32]
+	MOVL   8(R10), AX // key[8:12]
+	MOVL   DX, 32(R12) // third row, at 32(R12)
+	MOVL   CX, 36(R12)
+	MOVL   R8, 40(R12)
+	MOVL   AX, 44(R12)
+	MOVQ   $0x61707865, DX // "expa" (little-endian ASCII); the four constants spell "expand 32-byte k"
+	MOVQ   $0x3320646e, CX // "nd 3"
+	MOVQ   $0x79622d32, R8 // "2-by"
+	MOVQ   $0x6b206574, AX // "te k"
+	MOVL   DX, 48(R12) // fourth row (the constants), at 48(R12)
+	MOVL   CX, 52(R12)
+	MOVL   R8, 56(R12)
+	MOVL   AX, 60(R12)
+	CMPQ   R9, $0x00000100
+	JB     BYTESBETWEEN1AND255 // fewer than 256 bytes: skip the broadcast setup used by the four-block path
+	MOVOA  48(R12), X0 // broadcast each 32-bit word of the constants row across all four lanes
+	PSHUFL $0x55, X0, X1 // all lanes = word at 52(R12)
+	PSHUFL $0xaa, X0, X2 // all lanes = word at 56(R12)
+	PSHUFL $0xff, X0, X3 // all lanes = word at 60(R12)
+	PSHUFL $0x00, X0, X0 // all lanes = word at 48(R12)
+	MOVOA  X1, 64(R12)
+	MOVOA  X2, 80(R12)
+	MOVOA  X3, 96(R12)
+	MOVOA  X0, 112(R12)
+	MOVOA  (R12), X0 // same for the row at 0(R12)
+	PSHUFL $0xaa, X0, X1 // all lanes = word at 8(R12)
+	PSHUFL $0xff, X0, X2 // all lanes = word at 12(R12)
+	PSHUFL $0x00, X0, X3 // all lanes = word at 0(R12)
+	PSHUFL $0x55, X0, X0 // all lanes = word at 4(R12)
+	MOVOA  X1, 128(R12)
+	MOVOA  X2, 144(R12)
+	MOVOA  X3, 160(R12)
+	MOVOA  X0, 176(R12)
+	MOVOA  16(R12), X0 // row at 16(R12); the counter word at 16(R12) is not broadcast (it is filled per lane at 288(R12))
+	PSHUFL $0xff, X0, X1 // all lanes = word at 28(R12)
+	PSHUFL $0x55, X0, X2 // all lanes = word at 20(R12)
+	PSHUFL $0xaa, X0, X0 // all lanes = word at 24(R12)
+	MOVOA  X1, 192(R12)
+	MOVOA  X2, 208(R12)
+	MOVOA  X0, 224(R12)
+	MOVOA  32(R12), X0 // row at 32(R12); the counter word at 36(R12) is not broadcast (it is filled per lane at 304(R12))
+	PSHUFL $0x00, X0, X1 // all lanes = word at 32(R12)
+	PSHUFL $0xaa, X0, X2 // all lanes = word at 40(R12)
+	PSHUFL $0xff, X0, X0 // all lanes = word at 44(R12)
+	MOVOA  X1, 240(R12)
+	MOVOA  X2, 256(R12)
+	MOVOA  X0, 272(R12)
 
-// func salsa2020XORKeyStream(out, in *byte, n uint64, nonce, key *byte)
-// This needs up to 64 bytes at 360(R12); hence the non-obvious frame size.
-TEXT ·salsa2020XORKeyStream(SB),0,$456-40 // frame = 424 + 32 byte alignment
-	MOVQ out+0(FP),DI
-	MOVQ in+8(FP),SI
-	MOVQ n+16(FP),DX
-	MOVQ nonce+24(FP),CX
-	MOVQ key+32(FP),R8
+BYTESATLEAST256: // at least 256 bytes remain: set up four consecutive blocks, one per 32-bit lane
+	MOVL  16(R12), DX // low 32 bits of the block counter
+	MOVL  36(R12), CX // high 32 bits of the block counter
+	MOVL  DX, 288(R12) // lane 0 counter, low word
+	MOVL  CX, 304(R12) // lane 0 counter, high word
+	SHLQ  $0x20, CX
+	ADDQ  CX, DX // DX = full 64-bit counter
+	ADDQ  $0x01, DX // counter + 1
+	MOVQ  DX, CX
+	SHRQ  $0x20, CX
+	MOVL  DX, 292(R12) // lane 1 counter, low word
+	MOVL  CX, 308(R12) // lane 1 counter, high word
+	ADDQ  $0x01, DX // counter + 2
+	MOVQ  DX, CX
+	SHRQ  $0x20, CX
+	MOVL  DX, 296(R12) // lane 2 counter, low word
+	MOVL  CX, 312(R12) // lane 2 counter, high word
+	ADDQ  $0x01, DX // counter + 3
+	MOVQ  DX, CX
+	SHRQ  $0x20, CX
+	MOVL  DX, 300(R12) // lane 3 counter, low word
+	MOVL  CX, 316(R12) // lane 3 counter, high word
+	ADDQ  $0x01, DX // counter + 4: value for the next batch
+	MOVQ  DX, CX
+	SHRQ  $0x20, CX
+	MOVL  DX, 16(R12) // write the advanced counter back into the state
+	MOVL  CX, 36(R12)
+	MOVQ  R9, 352(R12) // save the remaining byte count; R9 is used as scratch when the output is written
+	MOVQ  $0x00000014, DX // DX = 20, the round counter (MAINLOOP1 subtracts 2 per iteration)
+	MOVOA 64(R12), X0 // load the sixteen per-word vectors (one lane per block) into X0..X15
+	MOVOA 80(R12), X1
+	MOVOA 96(R12), X2
+	MOVOA 256(R12), X3
+	MOVOA 272(R12), X4
+	MOVOA 128(R12), X5
+	MOVOA 144(R12), X6
+	MOVOA 176(R12), X7
+	MOVOA 192(R12), X8
+	MOVOA 208(R12), X9
+	MOVOA 224(R12), X10
+	MOVOA 304(R12), X11 // per-lane counter, high words
+	MOVOA 112(R12), X12
+	MOVOA 160(R12), X13
+	MOVOA 240(R12), X14
+	MOVOA 288(R12), X15 // per-lane counter, low words
 
-	MOVQ SP,R12
-	ADDQ $31, R12
-	ANDQ $~31, R12
+MAINLOOP1: // two rounds per iteration on four blocks at once; each 7-instruction group computes d ^= rotl32(a + b, k)
+	MOVOA  X1, 320(R12) // park X1 and X2 in scratch so they can serve as temporaries
+	MOVOA  X2, 336(R12)
+	MOVOA  X13, X1 // X14 ^= rotl32(X13 + X12, 7): shift left 7, shift right 25, XOR both in
+	PADDL  X12, X1
+	MOVOA  X1, X2
+	PSLLL  $0x07, X1
+	PXOR   X1, X14
+	PSRLL  $0x19, X2
+	PXOR   X2, X14
+	MOVOA  X7, X1 // X11 ^= rotl32(X7 + X0, 7)
+	PADDL  X0, X1
+	MOVOA  X1, X2
+	PSLLL  $0x07, X1
+	PXOR   X1, X11
+	PSRLL  $0x19, X2
+	PXOR   X2, X11
+	MOVOA  X12, X1 // X15 ^= rotl32(X12 + X14, 9)
+	PADDL  X14, X1
+	MOVOA  X1, X2
+	PSLLL  $0x09, X1
+	PXOR   X1, X15
+	PSRLL  $0x17, X2
+	PXOR   X2, X15
+	MOVOA  X0, X1 // X9 ^= rotl32(X0 + X11, 9)
+	PADDL  X11, X1
+	MOVOA  X1, X2
+	PSLLL  $0x09, X1
+	PXOR   X1, X9
+	PSRLL  $0x17, X2
+	PXOR   X2, X9
+	MOVOA  X14, X1 // X13 ^= rotl32(X14 + X15, 13)
+	PADDL  X15, X1
+	MOVOA  X1, X2
+	PSLLL  $0x0d, X1
+	PXOR   X1, X13
+	PSRLL  $0x13, X2
+	PXOR   X2, X13
+	MOVOA  X11, X1 // X7 ^= rotl32(X11 + X9, 13)
+	PADDL  X9, X1
+	MOVOA  X1, X2
+	PSLLL  $0x0d, X1
+	PXOR   X1, X7
+	PSRLL  $0x13, X2
+	PXOR   X2, X7
+	MOVOA  X15, X1 // X12 ^= rotl32(X15 + X13, 18)
+	PADDL  X13, X1
+	MOVOA  X1, X2
+	PSLLL  $0x12, X1
+	PXOR   X1, X12
+	PSRLL  $0x0e, X2
+	PXOR   X2, X12
+	MOVOA  320(R12), X1 // reload the vector parked at 320(R12) into X1
+	MOVOA  X12, 320(R12) // park X12 there instead; X12 becomes a temporary
+	MOVOA  X9, X2 // X0 ^= rotl32(X9 + X7, 18)
+	PADDL  X7, X2
+	MOVOA  X2, X12
+	PSLLL  $0x12, X2
+	PXOR   X2, X0
+	PSRLL  $0x0e, X12
+	PXOR   X12, X0
+	MOVOA  X5, X2 // X3 ^= rotl32(X5 + X1, 7)
+	PADDL  X1, X2
+	MOVOA  X2, X12
+	PSLLL  $0x07, X2
+	PXOR   X2, X3
+	PSRLL  $0x19, X12
+	PXOR   X12, X3
+	MOVOA  336(R12), X2 // reload the vector parked at 336(R12) into X2
+	MOVOA  X0, 336(R12) // park X0 there instead; X0 becomes a temporary
+	MOVOA  X6, X0 // X4 ^= rotl32(X6 + X2, 7)
+	PADDL  X2, X0
+	MOVOA  X0, X12
+	PSLLL  $0x07, X0
+	PXOR   X0, X4
+	PSRLL  $0x19, X12
+	PXOR   X12, X4
+	MOVOA  X1, X0 // X10 ^= rotl32(X1 + X3, 9)
+	PADDL  X3, X0
+	MOVOA  X0, X12
+	PSLLL  $0x09, X0
+	PXOR   X0, X10
+	PSRLL  $0x17, X12
+	PXOR   X12, X10
+	MOVOA  X2, X0 // X8 ^= rotl32(X2 + X4, 9)
+	PADDL  X4, X0
+	MOVOA  X0, X12
+	PSLLL  $0x09, X0
+	PXOR   X0, X8
+	PSRLL  $0x17, X12
+	PXOR   X12, X8
+	MOVOA  X3, X0 // X5 ^= rotl32(X3 + X10, 13)
+	PADDL  X10, X0
+	MOVOA  X0, X12
+	PSLLL  $0x0d, X0
+	PXOR   X0, X5
+	PSRLL  $0x13, X12
+	PXOR   X12, X5
+	MOVOA  X4, X0 // X6 ^= rotl32(X4 + X8, 13)
+	PADDL  X8, X0
+	MOVOA  X0, X12
+	PSLLL  $0x0d, X0
+	PXOR   X0, X6
+	PSRLL  $0x13, X12
+	PXOR   X12, X6
+	MOVOA  X10, X0 // X1 ^= rotl32(X10 + X5, 18)
+	PADDL  X5, X0
+	MOVOA  X0, X12
+	PSLLL  $0x12, X0
+	PXOR   X0, X1
+	PSRLL  $0x0e, X12
+	PXOR   X12, X1
+	MOVOA  320(R12), X0 // X0 = vector parked at 320(R12) (stored from X12 above)
+	MOVOA  X1, 320(R12) // park X1; X1 becomes a temporary
+	MOVOA  X4, X1 // X7 ^= rotl32(X4 + X0, 7)
+	PADDL  X0, X1
+	MOVOA  X1, X12
+	PSLLL  $0x07, X1
+	PXOR   X1, X7
+	PSRLL  $0x19, X12
+	PXOR   X12, X7
+	MOVOA  X8, X1 // X2 ^= rotl32(X8 + X6, 18)
+	PADDL  X6, X1
+	MOVOA  X1, X12
+	PSLLL  $0x12, X1
+	PXOR   X1, X2
+	PSRLL  $0x0e, X12
+	PXOR   X12, X2
+	MOVOA  336(R12), X12 // X12 = vector parked at 336(R12) (stored from X0 above)
+	MOVOA  X2, 336(R12) // park X2; X2 becomes a temporary
+	MOVOA  X14, X1 // X5 ^= rotl32(X14 + X12, 7)
+	PADDL  X12, X1
+	MOVOA  X1, X2
+	PSLLL  $0x07, X1
+	PXOR   X1, X5
+	PSRLL  $0x19, X2
+	PXOR   X2, X5
+	MOVOA  X0, X1 // X10 ^= rotl32(X0 + X7, 9)
+	PADDL  X7, X1
+	MOVOA  X1, X2
+	PSLLL  $0x09, X1
+	PXOR   X1, X10
+	PSRLL  $0x17, X2
+	PXOR   X2, X10
+	MOVOA  X12, X1 // X8 ^= rotl32(X12 + X5, 9)
+	PADDL  X5, X1
+	MOVOA  X1, X2
+	PSLLL  $0x09, X1
+	PXOR   X1, X8
+	PSRLL  $0x17, X2
+	PXOR   X2, X8
+	MOVOA  X7, X1 // X4 ^= rotl32(X7 + X10, 13)
+	PADDL  X10, X1
+	MOVOA  X1, X2
+	PSLLL  $0x0d, X1
+	PXOR   X1, X4
+	PSRLL  $0x13, X2
+	PXOR   X2, X4
+	MOVOA  X5, X1 // X14 ^= rotl32(X5 + X8, 13)
+	PADDL  X8, X1
+	MOVOA  X1, X2
+	PSLLL  $0x0d, X1
+	PXOR   X1, X14
+	PSRLL  $0x13, X2
+	PXOR   X2, X14
+	MOVOA  X10, X1 // X0 ^= rotl32(X10 + X4, 18)
+	PADDL  X4, X1
+	MOVOA  X1, X2
+	PSLLL  $0x12, X1
+	PXOR   X1, X0
+	PSRLL  $0x0e, X2
+	PXOR   X2, X0
+	MOVOA  320(R12), X1 // reload the vector parked at 320(R12) into X1
+	MOVOA  X0, 320(R12) // park X0; X0 becomes a temporary
+	MOVOA  X8, X0 // X12 ^= rotl32(X8 + X14, 18)
+	PADDL  X14, X0
+	MOVOA  X0, X2
+	PSLLL  $0x12, X0
+	PXOR   X0, X12
+	PSRLL  $0x0e, X2
+	PXOR   X2, X12
+	MOVOA  X11, X0 // X6 ^= rotl32(X11 + X1, 7)
+	PADDL  X1, X0
+	MOVOA  X0, X2
+	PSLLL  $0x07, X0
+	PXOR   X0, X6
+	PSRLL  $0x19, X2
+	PXOR   X2, X6
+	MOVOA  336(R12), X2 // reload the vector parked at 336(R12) into X2
+	MOVOA  X12, 336(R12) // park X12; X12 becomes a temporary
+	MOVOA  X3, X0 // X13 ^= rotl32(X3 + X2, 7)
+	PADDL  X2, X0
+	MOVOA  X0, X12
+	PSLLL  $0x07, X0
+	PXOR   X0, X13
+	PSRLL  $0x19, X12
+	PXOR   X12, X13
+	MOVOA  X1, X0 // X15 ^= rotl32(X1 + X6, 9)
+	PADDL  X6, X0
+	MOVOA  X0, X12
+	PSLLL  $0x09, X0
+	PXOR   X0, X15
+	PSRLL  $0x17, X12
+	PXOR   X12, X15
+	MOVOA  X2, X0 // X9 ^= rotl32(X2 + X13, 9)
+	PADDL  X13, X0
+	MOVOA  X0, X12
+	PSLLL  $0x09, X0
+	PXOR   X0, X9
+	PSRLL  $0x17, X12
+	PXOR   X12, X9
+	MOVOA  X6, X0 // X11 ^= rotl32(X6 + X15, 13)
+	PADDL  X15, X0
+	MOVOA  X0, X12
+	PSLLL  $0x0d, X0
+	PXOR   X0, X11
+	PSRLL  $0x13, X12
+	PXOR   X12, X11
+	MOVOA  X13, X0 // X3 ^= rotl32(X13 + X9, 13)
+	PADDL  X9, X0
+	MOVOA  X0, X12
+	PSLLL  $0x0d, X0
+	PXOR   X0, X3
+	PSRLL  $0x13, X12
+	PXOR   X12, X3
+	MOVOA  X15, X0 // X1 ^= rotl32(X15 + X11, 18)
+	PADDL  X11, X0
+	MOVOA  X0, X12
+	PSLLL  $0x12, X0
+	PXOR   X0, X1
+	PSRLL  $0x0e, X12
+	PXOR   X12, X1
+	MOVOA  X9, X0 // X2 ^= rotl32(X9 + X3, 18)
+	PADDL  X3, X0
+	MOVOA  X0, X12
+	PSLLL  $0x12, X0
+	PXOR   X0, X2
+	PSRLL  $0x0e, X12
+	PXOR   X12, X2
+	MOVOA  320(R12), X12 // bring both parked vectors back so all sixteen live in X0..X15 again
+	MOVOA  336(R12), X0
+	SUBQ   $0x02, DX // two rounds done
+	JA     MAINLOOP1 // loop while the round counter is still above zero
+	PADDL  112(R12), X12 // add back the input values these four registers were loaded with before the rounds
+	PADDL  176(R12), X7
+	PADDL  224(R12), X10
+	PADDL  272(R12), X4
+	MOVD   X12, DX // lane 0 of each vector: words at bytes 0..15 of the first block
+	MOVD   X7, CX
+	MOVD   X10, R8
+	MOVD   X4, R9 // R9 is scratch here; the byte count was saved at 352(R12)
+	PSHUFL $0x39, X12, X12 // rotate lanes so the next block's word moves into lane 0
+	PSHUFL $0x39, X7, X7
+	PSHUFL $0x39, X10, X10
+	PSHUFL $0x39, X4, X4
+	XORL   (SI), DX // XOR the keystream words with the input
+	XORL   4(SI), CX
+	XORL   8(SI), R8
+	XORL   12(SI), R9
+	MOVL   DX, (DI) // and store them to the output
+	MOVL   CX, 4(DI)
+	MOVL   R8, 8(DI)
+	MOVL   R9, 12(DI)
+	MOVD   X12, DX // second block (byte offset 64)
+	MOVD   X7, CX
+	MOVD   X10, R8
+	MOVD   X4, R9
+	PSHUFL $0x39, X12, X12
+	PSHUFL $0x39, X7, X7
+	PSHUFL $0x39, X10, X10
+	PSHUFL $0x39, X4, X4
+	XORL   64(SI), DX
+	XORL   68(SI), CX
+	XORL   72(SI), R8
+	XORL   76(SI), R9
+	MOVL   DX, 64(DI)
+	MOVL   CX, 68(DI)
+	MOVL   R8, 72(DI)
+	MOVL   R9, 76(DI)
+	MOVD   X12, DX // third block (byte offset 128)
+	MOVD   X7, CX
+	MOVD   X10, R8
+	MOVD   X4, R9
+	PSHUFL $0x39, X12, X12
+	PSHUFL $0x39, X7, X7
+	PSHUFL $0x39, X10, X10
+	PSHUFL $0x39, X4, X4
+	XORL   128(SI), DX
+	XORL   132(SI), CX
+	XORL   136(SI), R8
+	XORL   140(SI), R9
+	MOVL   DX, 128(DI)
+	MOVL   CX, 132(DI)
+	MOVL   R8, 136(DI)
+	MOVL   R9, 140(DI)
+	MOVD   X12, DX // fourth block (byte offset 192); no further lane rotation needed
+	MOVD   X7, CX
+	MOVD   X10, R8
+	MOVD   X4, R9
+	XORL   192(SI), DX
+	XORL   196(SI), CX
+	XORL   200(SI), R8
+	XORL   204(SI), R9
+	MOVL   DX, 192(DI)
+	MOVL   CX, 196(DI)
+	MOVL   R8, 200(DI)
+	MOVL   R9, 204(DI)
+	PADDL  240(R12), X14 // same sequence for the words at bytes 16..31 of each block
+	PADDL  64(R12), X0
+	PADDL  128(R12), X5
+	PADDL  192(R12), X8
+	MOVD   X14, DX
+	MOVD   X0, CX
+	MOVD   X5, R8
+	MOVD   X8, R9
+	PSHUFL $0x39, X14, X14
+	PSHUFL $0x39, X0, X0
+	PSHUFL $0x39, X5, X5
+	PSHUFL $0x39, X8, X8
+	XORL   16(SI), DX
+	XORL   20(SI), CX
+	XORL   24(SI), R8
+	XORL   28(SI), R9
+	MOVL   DX, 16(DI)
+	MOVL   CX, 20(DI)
+	MOVL   R8, 24(DI)
+	MOVL   R9, 28(DI)
+	MOVD   X14, DX
+	MOVD   X0, CX
+	MOVD   X5, R8
+	MOVD   X8, R9
+	PSHUFL $0x39, X14, X14
+	PSHUFL $0x39, X0, X0
+	PSHUFL $0x39, X5, X5
+	PSHUFL $0x39, X8, X8
+	XORL   80(SI), DX
+	XORL   84(SI), CX
+	XORL   88(SI), R8
+	XORL   92(SI), R9
+	MOVL   DX, 80(DI)
+	MOVL   CX, 84(DI)
+	MOVL   R8, 88(DI)
+	MOVL   R9, 92(DI)
+	MOVD   X14, DX
+	MOVD   X0, CX
+	MOVD   X5, R8
+	MOVD   X8, R9
+	PSHUFL $0x39, X14, X14
+	PSHUFL $0x39, X0, X0
+	PSHUFL $0x39, X5, X5
+	PSHUFL $0x39, X8, X8
+	XORL   144(SI), DX
+	XORL   148(SI), CX
+	XORL   152(SI), R8
+	XORL   156(SI), R9
+	MOVL   DX, 144(DI)
+	MOVL   CX, 148(DI)
+	MOVL   R8, 152(DI)
+	MOVL   R9, 156(DI)
+	MOVD   X14, DX
+	MOVD   X0, CX
+	MOVD   X5, R8
+	MOVD   X8, R9
+	XORL   208(SI), DX
+	XORL   212(SI), CX
+	XORL   216(SI), R8
+	XORL   220(SI), R9
+	MOVL   DX, 208(DI)
+	MOVL   CX, 212(DI)
+	MOVL   R8, 216(DI)
+	MOVL   R9, 220(DI)
+	PADDL  288(R12), X15 // words at bytes 32..47 of each block; 288/304(R12) hold the per-lane counter halves
+	PADDL  304(R12), X11
+	PADDL  80(R12), X1
+	PADDL  144(R12), X6
+	MOVD   X15, DX
+	MOVD   X11, CX
+	MOVD   X1, R8
+	MOVD   X6, R9
+	PSHUFL $0x39, X15, X15
+	PSHUFL $0x39, X11, X11
+	PSHUFL $0x39, X1, X1
+	PSHUFL $0x39, X6, X6
+	XORL   32(SI), DX
+	XORL   36(SI), CX
+	XORL   40(SI), R8
+	XORL   44(SI), R9
+	MOVL   DX, 32(DI)
+	MOVL   CX, 36(DI)
+	MOVL   R8, 40(DI)
+	MOVL   R9, 44(DI)
+	MOVD   X15, DX
+	MOVD   X11, CX
+	MOVD   X1, R8
+	MOVD   X6, R9
+	PSHUFL $0x39, X15, X15
+	PSHUFL $0x39, X11, X11
+	PSHUFL $0x39, X1, X1
+	PSHUFL $0x39, X6, X6
+	XORL   96(SI), DX
+	XORL   100(SI), CX
+	XORL   104(SI), R8
+	XORL   108(SI), R9
+	MOVL   DX, 96(DI)
+	MOVL   CX, 100(DI)
+	MOVL   R8, 104(DI)
+	MOVL   R9, 108(DI)
+	MOVD   X15, DX
+	MOVD   X11, CX
+	MOVD   X1, R8
+	MOVD   X6, R9
+	PSHUFL $0x39, X15, X15
+	PSHUFL $0x39, X11, X11
+	PSHUFL $0x39, X1, X1
+	PSHUFL $0x39, X6, X6
+	XORL   160(SI), DX
+	XORL   164(SI), CX
+	XORL   168(SI), R8
+	XORL   172(SI), R9
+	MOVL   DX, 160(DI)
+	MOVL   CX, 164(DI)
+	MOVL   R8, 168(DI)
+	MOVL   R9, 172(DI)
+	MOVD   X15, DX
+	MOVD   X11, CX
+	MOVD   X1, R8
+	MOVD   X6, R9
+	XORL   224(SI), DX
+	XORL   228(SI), CX
+	XORL   232(SI), R8
+	XORL   236(SI), R9
+	MOVL   DX, 224(DI)
+	MOVL   CX, 228(DI)
+	MOVL   R8, 232(DI)
+	MOVL   R9, 236(DI)
+	PADDL  160(R12), X13 // words at bytes 48..63 of each block
+	PADDL  208(R12), X9
+	PADDL  256(R12), X3
+	PADDL  96(R12), X2
+	MOVD   X13, DX
+	MOVD   X9, CX
+	MOVD   X3, R8
+	MOVD   X2, R9
+	PSHUFL $0x39, X13, X13
+	PSHUFL $0x39, X9, X9
+	PSHUFL $0x39, X3, X3
+	PSHUFL $0x39, X2, X2
+	XORL   48(SI), DX
+	XORL   52(SI), CX
+	XORL   56(SI), R8
+	XORL   60(SI), R9
+	MOVL   DX, 48(DI)
+	MOVL   CX, 52(DI)
+	MOVL   R8, 56(DI)
+	MOVL   R9, 60(DI)
+	MOVD   X13, DX
+	MOVD   X9, CX
+	MOVD   X3, R8
+	MOVD   X2, R9
+	PSHUFL $0x39, X13, X13
+	PSHUFL $0x39, X9, X9
+	PSHUFL $0x39, X3, X3
+	PSHUFL $0x39, X2, X2
+	XORL   112(SI), DX
+	XORL   116(SI), CX
+	XORL   120(SI), R8
+	XORL   124(SI), R9
+	MOVL   DX, 112(DI)
+	MOVL   CX, 116(DI)
+	MOVL   R8, 120(DI)
+	MOVL   R9, 124(DI)
+	MOVD   X13, DX
+	MOVD   X9, CX
+	MOVD   X3, R8
+	MOVD   X2, R9
+	PSHUFL $0x39, X13, X13
+	PSHUFL $0x39, X9, X9
+	PSHUFL $0x39, X3, X3
+	PSHUFL $0x39, X2, X2
+	XORL   176(SI), DX
+	XORL   180(SI), CX
+	XORL   184(SI), R8
+	XORL   188(SI), R9
+	MOVL   DX, 176(DI)
+	MOVL   CX, 180(DI)
+	MOVL   R8, 184(DI)
+	MOVL   R9, 188(DI)
+	MOVD   X13, DX
+	MOVD   X9, CX
+	MOVD   X3, R8
+	MOVD   X2, R9
+	XORL   240(SI), DX
+	XORL   244(SI), CX
+	XORL   248(SI), R8
+	XORL   252(SI), R9
+	MOVL   DX, 240(DI)
+	MOVL   CX, 244(DI)
+	MOVL   R8, 248(DI)
+	MOVL   R9, 252(DI)
+	MOVQ   352(R12), R9 // restore the remaining byte count
+	SUBQ   $0x00000100, R9 // 256 bytes were just produced
+	ADDQ   $0x00000100, SI // advance input pointer
+	ADDQ   $0x00000100, DI // advance output pointer
+	CMPQ   R9, $0x00000100
+	JAE    BYTESATLEAST256 // another full 256-byte batch is available
+	CMPQ   R9, $0x00
+	JBE    DONE // nothing left; otherwise fall through to the single-block path
 
-	MOVQ DX,R9
-	MOVQ CX,DX
-	MOVQ R8,R10
-	CMPQ R9,$0
-	JBE DONE
-	START:
-	MOVL 20(R10),CX
-	MOVL 0(R10),R8
-	MOVL 0(DX),AX
-	MOVL 16(R10),R11
-	MOVL CX,0(R12)
-	MOVL R8, 4 (R12)
-	MOVL AX, 8 (R12)
-	MOVL R11, 12 (R12)
-	MOVL 8(DX),CX
-	MOVL 24(R10),R8
-	MOVL 4(R10),AX
-	MOVL 4(DX),R11
-	MOVL CX,16(R12)
-	MOVL R8, 20 (R12)
-	MOVL AX, 24 (R12)
-	MOVL R11, 28 (R12)
-	MOVL 12(DX),CX
-	MOVL 12(R10),DX
-	MOVL 28(R10),R8
-	MOVL 8(R10),AX
-	MOVL DX,32(R12)
-	MOVL CX, 36 (R12)
-	MOVL R8, 40 (R12)
-	MOVL AX, 44 (R12)
-	MOVQ $1634760805,DX
-	MOVQ $857760878,CX
-	MOVQ $2036477234,R8
-	MOVQ $1797285236,AX
-	MOVL DX,48(R12)
-	MOVL CX, 52 (R12)
-	MOVL R8, 56 (R12)
-	MOVL AX, 60 (R12)
-	CMPQ R9,$256
-	JB BYTESBETWEEN1AND255
-	MOVOA 48(R12),X0
-	PSHUFL $0X55,X0,X1
-	PSHUFL $0XAA,X0,X2
-	PSHUFL $0XFF,X0,X3
-	PSHUFL $0X00,X0,X0
-	MOVOA X1,64(R12)
-	MOVOA X2,80(R12)
-	MOVOA X3,96(R12)
-	MOVOA X0,112(R12)
-	MOVOA 0(R12),X0
-	PSHUFL $0XAA,X0,X1
-	PSHUFL $0XFF,X0,X2
-	PSHUFL $0X00,X0,X3
-	PSHUFL $0X55,X0,X0
-	MOVOA X1,128(R12)
-	MOVOA X2,144(R12)
-	MOVOA X3,160(R12)
-	MOVOA X0,176(R12)
-	MOVOA 16(R12),X0
-	PSHUFL $0XFF,X0,X1
-	PSHUFL $0X55,X0,X2
-	PSHUFL $0XAA,X0,X0
-	MOVOA X1,192(R12)
-	MOVOA X2,208(R12)
-	MOVOA X0,224(R12)
-	MOVOA 32(R12),X0
-	PSHUFL $0X00,X0,X1
-	PSHUFL $0XAA,X0,X2
-	PSHUFL $0XFF,X0,X0
-	MOVOA X1,240(R12)
-	MOVOA X2,256(R12)
-	MOVOA X0,272(R12)
-	BYTESATLEAST256:
-	MOVL 16(R12),DX
-	MOVL  36 (R12),CX
-	MOVL DX,288(R12)
-	MOVL CX,304(R12)
-	SHLQ $32,CX
-	ADDQ CX,DX
-	ADDQ $1,DX
-	MOVQ DX,CX
-	SHRQ $32,CX
-	MOVL DX, 292 (R12)
-	MOVL CX, 308 (R12)
-	ADDQ $1,DX
-	MOVQ DX,CX
-	SHRQ $32,CX
-	MOVL DX, 296 (R12)
-	MOVL CX, 312 (R12)
-	ADDQ $1,DX
-	MOVQ DX,CX
-	SHRQ $32,CX
-	MOVL DX, 300 (R12)
-	MOVL CX, 316 (R12)
-	ADDQ $1,DX
-	MOVQ DX,CX
-	SHRQ $32,CX
-	MOVL DX,16(R12)
-	MOVL CX, 36 (R12)
-	MOVQ R9,352(R12)
-	MOVQ $20,DX
-	MOVOA 64(R12),X0
-	MOVOA 80(R12),X1
-	MOVOA 96(R12),X2
-	MOVOA 256(R12),X3
-	MOVOA 272(R12),X4
-	MOVOA 128(R12),X5
-	MOVOA 144(R12),X6
-	MOVOA 176(R12),X7
-	MOVOA 192(R12),X8
-	MOVOA 208(R12),X9
-	MOVOA 224(R12),X10
-	MOVOA 304(R12),X11
-	MOVOA 112(R12),X12
-	MOVOA 160(R12),X13
-	MOVOA 240(R12),X14
-	MOVOA 288(R12),X15
-	MAINLOOP1:
-	MOVOA X1,320(R12)
-	MOVOA X2,336(R12)
-	MOVOA X13,X1
-	PADDL X12,X1
-	MOVOA X1,X2
-	PSLLL $7,X1
-	PXOR X1,X14
-	PSRLL $25,X2
-	PXOR X2,X14
-	MOVOA X7,X1
-	PADDL X0,X1
-	MOVOA X1,X2
-	PSLLL $7,X1
-	PXOR X1,X11
-	PSRLL $25,X2
-	PXOR X2,X11
-	MOVOA X12,X1
-	PADDL X14,X1
-	MOVOA X1,X2
-	PSLLL $9,X1
-	PXOR X1,X15
-	PSRLL $23,X2
-	PXOR X2,X15
-	MOVOA X0,X1
-	PADDL X11,X1
-	MOVOA X1,X2
-	PSLLL $9,X1
-	PXOR X1,X9
-	PSRLL $23,X2
-	PXOR X2,X9
-	MOVOA X14,X1
-	PADDL X15,X1
-	MOVOA X1,X2
-	PSLLL $13,X1
-	PXOR X1,X13
-	PSRLL $19,X2
-	PXOR X2,X13
-	MOVOA X11,X1
-	PADDL X9,X1
-	MOVOA X1,X2
-	PSLLL $13,X1
-	PXOR X1,X7
-	PSRLL $19,X2
-	PXOR X2,X7
-	MOVOA X15,X1
-	PADDL X13,X1
-	MOVOA X1,X2
-	PSLLL $18,X1
-	PXOR X1,X12
-	PSRLL $14,X2
-	PXOR X2,X12
-	MOVOA 320(R12),X1
-	MOVOA X12,320(R12)
-	MOVOA X9,X2
-	PADDL X7,X2
-	MOVOA X2,X12
-	PSLLL $18,X2
-	PXOR X2,X0
-	PSRLL $14,X12
-	PXOR X12,X0
-	MOVOA X5,X2
-	PADDL X1,X2
-	MOVOA X2,X12
-	PSLLL $7,X2
-	PXOR X2,X3
-	PSRLL $25,X12
-	PXOR X12,X3
-	MOVOA 336(R12),X2
-	MOVOA X0,336(R12)
-	MOVOA X6,X0
-	PADDL X2,X0
-	MOVOA X0,X12
-	PSLLL $7,X0
-	PXOR X0,X4
-	PSRLL $25,X12
-	PXOR X12,X4
-	MOVOA X1,X0
-	PADDL X3,X0
-	MOVOA X0,X12
-	PSLLL $9,X0
-	PXOR X0,X10
-	PSRLL $23,X12
-	PXOR X12,X10
-	MOVOA X2,X0
-	PADDL X4,X0
-	MOVOA X0,X12
-	PSLLL $9,X0
-	PXOR X0,X8
-	PSRLL $23,X12
-	PXOR X12,X8
-	MOVOA X3,X0
-	PADDL X10,X0
-	MOVOA X0,X12
-	PSLLL $13,X0
-	PXOR X0,X5
-	PSRLL $19,X12
-	PXOR X12,X5
-	MOVOA X4,X0
-	PADDL X8,X0
-	MOVOA X0,X12
-	PSLLL $13,X0
-	PXOR X0,X6
-	PSRLL $19,X12
-	PXOR X12,X6
-	MOVOA X10,X0
-	PADDL X5,X0
-	MOVOA X0,X12
-	PSLLL $18,X0
-	PXOR X0,X1
-	PSRLL $14,X12
-	PXOR X12,X1
-	MOVOA 320(R12),X0
-	MOVOA X1,320(R12)
-	MOVOA X4,X1
-	PADDL X0,X1
-	MOVOA X1,X12
-	PSLLL $7,X1
-	PXOR X1,X7
-	PSRLL $25,X12
-	PXOR X12,X7
-	MOVOA X8,X1
-	PADDL X6,X1
-	MOVOA X1,X12
-	PSLLL $18,X1
-	PXOR X1,X2
-	PSRLL $14,X12
-	PXOR X12,X2
-	MOVOA 336(R12),X12
-	MOVOA X2,336(R12)
-	MOVOA X14,X1
-	PADDL X12,X1
-	MOVOA X1,X2
-	PSLLL $7,X1
-	PXOR X1,X5
-	PSRLL $25,X2
-	PXOR X2,X5
-	MOVOA X0,X1
-	PADDL X7,X1
-	MOVOA X1,X2
-	PSLLL $9,X1
-	PXOR X1,X10
-	PSRLL $23,X2
-	PXOR X2,X10
-	MOVOA X12,X1
-	PADDL X5,X1
-	MOVOA X1,X2
-	PSLLL $9,X1
-	PXOR X1,X8
-	PSRLL $23,X2
-	PXOR X2,X8
-	MOVOA X7,X1
-	PADDL X10,X1
-	MOVOA X1,X2
-	PSLLL $13,X1
-	PXOR X1,X4
-	PSRLL $19,X2
-	PXOR X2,X4
-	MOVOA X5,X1
-	PADDL X8,X1
-	MOVOA X1,X2
-	PSLLL $13,X1
-	PXOR X1,X14
-	PSRLL $19,X2
-	PXOR X2,X14
-	MOVOA X10,X1
-	PADDL X4,X1
-	MOVOA X1,X2
-	PSLLL $18,X1
-	PXOR X1,X0
-	PSRLL $14,X2
-	PXOR X2,X0
-	MOVOA 320(R12),X1
-	MOVOA X0,320(R12)
-	MOVOA X8,X0
-	PADDL X14,X0
-	MOVOA X0,X2
-	PSLLL $18,X0
-	PXOR X0,X12
-	PSRLL $14,X2
-	PXOR X2,X12
-	MOVOA X11,X0
-	PADDL X1,X0
-	MOVOA X0,X2
-	PSLLL $7,X0
-	PXOR X0,X6
-	PSRLL $25,X2
-	PXOR X2,X6
-	MOVOA 336(R12),X2
-	MOVOA X12,336(R12)
-	MOVOA X3,X0
-	PADDL X2,X0
-	MOVOA X0,X12
-	PSLLL $7,X0
-	PXOR X0,X13
-	PSRLL $25,X12
-	PXOR X12,X13
-	MOVOA X1,X0
-	PADDL X6,X0
-	MOVOA X0,X12
-	PSLLL $9,X0
-	PXOR X0,X15
-	PSRLL $23,X12
-	PXOR X12,X15
-	MOVOA X2,X0
-	PADDL X13,X0
-	MOVOA X0,X12
-	PSLLL $9,X0
-	PXOR X0,X9
-	PSRLL $23,X12
-	PXOR X12,X9
-	MOVOA X6,X0
-	PADDL X15,X0
-	MOVOA X0,X12
-	PSLLL $13,X0
-	PXOR X0,X11
-	PSRLL $19,X12
-	PXOR X12,X11
-	MOVOA X13,X0
-	PADDL X9,X0
-	MOVOA X0,X12
-	PSLLL $13,X0
-	PXOR X0,X3
-	PSRLL $19,X12
-	PXOR X12,X3
-	MOVOA X15,X0
-	PADDL X11,X0
-	MOVOA X0,X12
-	PSLLL $18,X0
-	PXOR X0,X1
-	PSRLL $14,X12
-	PXOR X12,X1
-	MOVOA X9,X0
-	PADDL X3,X0
-	MOVOA X0,X12
-	PSLLL $18,X0
-	PXOR X0,X2
-	PSRLL $14,X12
-	PXOR X12,X2
-	MOVOA 320(R12),X12
-	MOVOA 336(R12),X0
-	SUBQ $2,DX
-	JA MAINLOOP1
-	PADDL 112(R12),X12
-	PADDL 176(R12),X7
-	PADDL 224(R12),X10
-	PADDL 272(R12),X4
-	MOVD X12,DX
-	MOVD X7,CX
-	MOVD X10,R8
-	MOVD X4,R9
-	PSHUFL $0X39,X12,X12
-	PSHUFL $0X39,X7,X7
-	PSHUFL $0X39,X10,X10
-	PSHUFL $0X39,X4,X4
-	XORL 0(SI),DX
-	XORL 4(SI),CX
-	XORL 8(SI),R8
-	XORL 12(SI),R9
-	MOVL DX,0(DI)
-	MOVL CX,4(DI)
-	MOVL R8,8(DI)
-	MOVL R9,12(DI)
-	MOVD X12,DX
-	MOVD X7,CX
-	MOVD X10,R8
-	MOVD X4,R9
-	PSHUFL $0X39,X12,X12
-	PSHUFL $0X39,X7,X7
-	PSHUFL $0X39,X10,X10
-	PSHUFL $0X39,X4,X4
-	XORL 64(SI),DX
-	XORL 68(SI),CX
-	XORL 72(SI),R8
-	XORL 76(SI),R9
-	MOVL DX,64(DI)
-	MOVL CX,68(DI)
-	MOVL R8,72(DI)
-	MOVL R9,76(DI)
-	MOVD X12,DX
-	MOVD X7,CX
-	MOVD X10,R8
-	MOVD X4,R9
-	PSHUFL $0X39,X12,X12
-	PSHUFL $0X39,X7,X7
-	PSHUFL $0X39,X10,X10
-	PSHUFL $0X39,X4,X4
-	XORL 128(SI),DX
-	XORL 132(SI),CX
-	XORL 136(SI),R8
-	XORL 140(SI),R9
-	MOVL DX,128(DI)
-	MOVL CX,132(DI)
-	MOVL R8,136(DI)
-	MOVL R9,140(DI)
-	MOVD X12,DX
-	MOVD X7,CX
-	MOVD X10,R8
-	MOVD X4,R9
-	XORL 192(SI),DX
-	XORL 196(SI),CX
-	XORL 200(SI),R8
-	XORL 204(SI),R9
-	MOVL DX,192(DI)
-	MOVL CX,196(DI)
-	MOVL R8,200(DI)
-	MOVL R9,204(DI)
-	PADDL 240(R12),X14
-	PADDL 64(R12),X0
-	PADDL 128(R12),X5
-	PADDL 192(R12),X8
-	MOVD X14,DX
-	MOVD X0,CX
-	MOVD X5,R8
-	MOVD X8,R9
-	PSHUFL $0X39,X14,X14
-	PSHUFL $0X39,X0,X0
-	PSHUFL $0X39,X5,X5
-	PSHUFL $0X39,X8,X8
-	XORL 16(SI),DX
-	XORL 20(SI),CX
-	XORL 24(SI),R8
-	XORL 28(SI),R9
-	MOVL DX,16(DI)
-	MOVL CX,20(DI)
-	MOVL R8,24(DI)
-	MOVL R9,28(DI)
-	MOVD X14,DX
-	MOVD X0,CX
-	MOVD X5,R8
-	MOVD X8,R9
-	PSHUFL $0X39,X14,X14
-	PSHUFL $0X39,X0,X0
-	PSHUFL $0X39,X5,X5
-	PSHUFL $0X39,X8,X8
-	XORL 80(SI),DX
-	XORL 84(SI),CX
-	XORL 88(SI),R8
-	XORL 92(SI),R9
-	MOVL DX,80(DI)
-	MOVL CX,84(DI)
-	MOVL R8,88(DI)
-	MOVL R9,92(DI)
-	MOVD X14,DX
-	MOVD X0,CX
-	MOVD X5,R8
-	MOVD X8,R9
-	PSHUFL $0X39,X14,X14
-	PSHUFL $0X39,X0,X0
-	PSHUFL $0X39,X5,X5
-	PSHUFL $0X39,X8,X8
-	XORL 144(SI),DX
-	XORL 148(SI),CX
-	XORL 152(SI),R8
-	XORL 156(SI),R9
-	MOVL DX,144(DI)
-	MOVL CX,148(DI)
-	MOVL R8,152(DI)
-	MOVL R9,156(DI)
-	MOVD X14,DX
-	MOVD X0,CX
-	MOVD X5,R8
-	MOVD X8,R9
-	XORL 208(SI),DX
-	XORL 212(SI),CX
-	XORL 216(SI),R8
-	XORL 220(SI),R9
-	MOVL DX,208(DI)
-	MOVL CX,212(DI)
-	MOVL R8,216(DI)
-	MOVL R9,220(DI)
-	PADDL 288(R12),X15
-	PADDL 304(R12),X11
-	PADDL 80(R12),X1
-	PADDL 144(R12),X6
-	MOVD X15,DX
-	MOVD X11,CX
-	MOVD X1,R8
-	MOVD X6,R9
-	PSHUFL $0X39,X15,X15
-	PSHUFL $0X39,X11,X11
-	PSHUFL $0X39,X1,X1
-	PSHUFL $0X39,X6,X6
-	XORL 32(SI),DX
-	XORL 36(SI),CX
-	XORL 40(SI),R8
-	XORL 44(SI),R9
-	MOVL DX,32(DI)
-	MOVL CX,36(DI)
-	MOVL R8,40(DI)
-	MOVL R9,44(DI)
-	MOVD X15,DX
-	MOVD X11,CX
-	MOVD X1,R8
-	MOVD X6,R9
-	PSHUFL $0X39,X15,X15
-	PSHUFL $0X39,X11,X11
-	PSHUFL $0X39,X1,X1
-	PSHUFL $0X39,X6,X6
-	XORL 96(SI),DX
-	XORL 100(SI),CX
-	XORL 104(SI),R8
-	XORL 108(SI),R9
-	MOVL DX,96(DI)
-	MOVL CX,100(DI)
-	MOVL R8,104(DI)
-	MOVL R9,108(DI)
-	MOVD X15,DX
-	MOVD X11,CX
-	MOVD X1,R8
-	MOVD X6,R9
-	PSHUFL $0X39,X15,X15
-	PSHUFL $0X39,X11,X11
-	PSHUFL $0X39,X1,X1
-	PSHUFL $0X39,X6,X6
-	XORL 160(SI),DX
-	XORL 164(SI),CX
-	XORL 168(SI),R8
-	XORL 172(SI),R9
-	MOVL DX,160(DI)
-	MOVL CX,164(DI)
-	MOVL R8,168(DI)
-	MOVL R9,172(DI)
-	MOVD X15,DX
-	MOVD X11,CX
-	MOVD X1,R8
-	MOVD X6,R9
-	XORL 224(SI),DX
-	XORL 228(SI),CX
-	XORL 232(SI),R8
-	XORL 236(SI),R9
-	MOVL DX,224(DI)
-	MOVL CX,228(DI)
-	MOVL R8,232(DI)
-	MOVL R9,236(DI)
-	PADDL 160(R12),X13
-	PADDL 208(R12),X9
-	PADDL 256(R12),X3
-	PADDL 96(R12),X2
-	MOVD X13,DX
-	MOVD X9,CX
-	MOVD X3,R8
-	MOVD X2,R9
-	PSHUFL $0X39,X13,X13
-	PSHUFL $0X39,X9,X9
-	PSHUFL $0X39,X3,X3
-	PSHUFL $0X39,X2,X2
-	XORL 48(SI),DX
-	XORL 52(SI),CX
-	XORL 56(SI),R8
-	XORL 60(SI),R9
-	MOVL DX,48(DI)
-	MOVL CX,52(DI)
-	MOVL R8,56(DI)
-	MOVL R9,60(DI)
-	MOVD X13,DX
-	MOVD X9,CX
-	MOVD X3,R8
-	MOVD X2,R9
-	PSHUFL $0X39,X13,X13
-	PSHUFL $0X39,X9,X9
-	PSHUFL $0X39,X3,X3
-	PSHUFL $0X39,X2,X2
-	XORL 112(SI),DX
-	XORL 116(SI),CX
-	XORL 120(SI),R8
-	XORL 124(SI),R9
-	MOVL DX,112(DI)
-	MOVL CX,116(DI)
-	MOVL R8,120(DI)
-	MOVL R9,124(DI)
-	MOVD X13,DX
-	MOVD X9,CX
-	MOVD X3,R8
-	MOVD X2,R9
-	PSHUFL $0X39,X13,X13
-	PSHUFL $0X39,X9,X9
-	PSHUFL $0X39,X3,X3
-	PSHUFL $0X39,X2,X2
-	XORL 176(SI),DX
-	XORL 180(SI),CX
-	XORL 184(SI),R8
-	XORL 188(SI),R9
-	MOVL DX,176(DI)
-	MOVL CX,180(DI)
-	MOVL R8,184(DI)
-	MOVL R9,188(DI)
-	MOVD X13,DX
-	MOVD X9,CX
-	MOVD X3,R8
-	MOVD X2,R9
-	XORL 240(SI),DX
-	XORL 244(SI),CX
-	XORL 248(SI),R8
-	XORL 252(SI),R9
-	MOVL DX,240(DI)
-	MOVL CX,244(DI)
-	MOVL R8,248(DI)
-	MOVL R9,252(DI)
-	MOVQ 352(R12),R9
-	SUBQ $256,R9
-	ADDQ $256,SI
-	ADDQ $256,DI
-	CMPQ R9,$256
-	JAE BYTESATLEAST256
-	CMPQ R9,$0
-	JBE DONE
-	BYTESBETWEEN1AND255:
-	CMPQ R9,$64
-	JAE NOCOPY
-	MOVQ DI,DX
-	LEAQ 360(R12),DI
-	MOVQ R9,CX
+BYTESBETWEEN1AND255: // single-block path: handles one 64-byte block (or a shorter tail) per pass
+	CMPQ R9, $0x40
+	JAE  NOCOPY // a full block is available: work directly on the caller's buffers
+	MOVQ DI, DX // fewer than 64 bytes left: remember the real output pointer in DX
+	LEAQ 360(R12), DI // destination = the 64-byte scratch buffer at 360(R12)
+	MOVQ R9, CX // byte count for the REP; MOVSB that follows (copies the remaining input into the scratch buffer)
 	REP; MOVSB
-	LEAQ 360(R12),DI
-	LEAQ 360(R12),SI
-	NOCOPY:
-	MOVQ R9,352(R12)
-	MOVOA 48(R12),X0
-	MOVOA 0(R12),X1
-	MOVOA 16(R12),X2
-	MOVOA 32(R12),X3
-	MOVOA X1,X4
-	MOVQ $20,CX
-	MAINLOOP2:
-	PADDL X0,X4
-	MOVOA X0,X5
-	MOVOA X4,X6
-	PSLLL $7,X4
-	PSRLL $25,X6
-	PXOR X4,X3
-	PXOR X6,X3
-	PADDL X3,X5
-	MOVOA X3,X4
-	MOVOA X5,X6
-	PSLLL $9,X5
-	PSRLL $23,X6
-	PXOR X5,X2
-	PSHUFL $0X93,X3,X3
-	PXOR X6,X2
-	PADDL X2,X4
-	MOVOA X2,X5
-	MOVOA X4,X6
-	PSLLL $13,X4
-	PSRLL $19,X6
-	PXOR X4,X1
-	PSHUFL $0X4E,X2,X2
-	PXOR X6,X1
-	PADDL X1,X5
-	MOVOA X3,X4
-	MOVOA X5,X6
-	PSLLL $18,X5
-	PSRLL $14,X6
-	PXOR X5,X0
-	PSHUFL $0X39,X1,X1
-	PXOR X6,X0
-	PADDL X0,X4
-	MOVOA X0,X5
-	MOVOA X4,X6
-	PSLLL $7,X4
-	PSRLL $25,X6
-	PXOR X4,X1
-	PXOR X6,X1
-	PADDL X1,X5
-	MOVOA X1,X4
-	MOVOA X5,X6
-	PSLLL $9,X5
-	PSRLL $23,X6
-	PXOR X5,X2
-	PSHUFL $0X93,X1,X1
-	PXOR X6,X2
-	PADDL X2,X4
-	MOVOA X2,X5
-	MOVOA X4,X6
-	PSLLL $13,X4
-	PSRLL $19,X6
-	PXOR X4,X3
-	PSHUFL $0X4E,X2,X2
-	PXOR X6,X3
-	PADDL X3,X5
-	MOVOA X1,X4
-	MOVOA X5,X6
-	PSLLL $18,X5
-	PSRLL $14,X6
-	PXOR X5,X0
-	PSHUFL $0X39,X3,X3
-	PXOR X6,X0
-	PADDL X0,X4
-	MOVOA X0,X5
-	MOVOA X4,X6
-	PSLLL $7,X4
-	PSRLL $25,X6
-	PXOR X4,X3
-	PXOR X6,X3
-	PADDL X3,X5
-	MOVOA X3,X4
-	MOVOA X5,X6
-	PSLLL $9,X5
-	PSRLL $23,X6
-	PXOR X5,X2
-	PSHUFL $0X93,X3,X3
-	PXOR X6,X2
-	PADDL X2,X4
-	MOVOA X2,X5
-	MOVOA X4,X6
-	PSLLL $13,X4
-	PSRLL $19,X6
-	PXOR X4,X1
-	PSHUFL $0X4E,X2,X2
-	PXOR X6,X1
-	PADDL X1,X5
-	MOVOA X3,X4
-	MOVOA X5,X6
-	PSLLL $18,X5
-	PSRLL $14,X6
-	PXOR X5,X0
-	PSHUFL $0X39,X1,X1
-	PXOR X6,X0
-	PADDL X0,X4
-	MOVOA X0,X5
-	MOVOA X4,X6
-	PSLLL $7,X4
-	PSRLL $25,X6
-	PXOR X4,X1
-	PXOR X6,X1
-	PADDL X1,X5
-	MOVOA X1,X4
-	MOVOA X5,X6
-	PSLLL $9,X5
-	PSRLL $23,X6
-	PXOR X5,X2
-	PSHUFL $0X93,X1,X1
-	PXOR X6,X2
-	PADDL X2,X4
-	MOVOA X2,X5
-	MOVOA X4,X6
-	PSLLL $13,X4
-	PSRLL $19,X6
-	PXOR X4,X3
-	PSHUFL $0X4E,X2,X2
-	PXOR X6,X3
-	SUBQ $4,CX
-	PADDL X3,X5
-	MOVOA X1,X4
-	MOVOA X5,X6
-	PSLLL $18,X5
-	PXOR X7,X7
-	PSRLL $14,X6
-	PXOR X5,X0
-	PSHUFL $0X39,X3,X3
-	PXOR X6,X0
-	JA MAINLOOP2
-	PADDL 48(R12),X0
-	PADDL 0(R12),X1
-	PADDL 16(R12),X2
-	PADDL 32(R12),X3
-	MOVD X0,CX
-	MOVD X1,R8
-	MOVD X2,R9
-	MOVD X3,AX
-	PSHUFL $0X39,X0,X0
-	PSHUFL $0X39,X1,X1
-	PSHUFL $0X39,X2,X2
-	PSHUFL $0X39,X3,X3
-	XORL 0(SI),CX
-	XORL 48(SI),R8
-	XORL 32(SI),R9
-	XORL 16(SI),AX
-	MOVL CX,0(DI)
-	MOVL R8,48(DI)
-	MOVL R9,32(DI)
-	MOVL AX,16(DI)
-	MOVD X0,CX
-	MOVD X1,R8
-	MOVD X2,R9
-	MOVD X3,AX
-	PSHUFL $0X39,X0,X0
-	PSHUFL $0X39,X1,X1
-	PSHUFL $0X39,X2,X2
-	PSHUFL $0X39,X3,X3
-	XORL 20(SI),CX
-	XORL 4(SI),R8
-	XORL 52(SI),R9
-	XORL 36(SI),AX
-	MOVL CX,20(DI)
-	MOVL R8,4(DI)
-	MOVL R9,52(DI)
-	MOVL AX,36(DI)
-	MOVD X0,CX
-	MOVD X1,R8
-	MOVD X2,R9
-	MOVD X3,AX
-	PSHUFL $0X39,X0,X0
-	PSHUFL $0X39,X1,X1
-	PSHUFL $0X39,X2,X2
-	PSHUFL $0X39,X3,X3
-	XORL 40(SI),CX
-	XORL 24(SI),R8
-	XORL 8(SI),R9
-	XORL 56(SI),AX
-	MOVL CX,40(DI)
-	MOVL R8,24(DI)
-	MOVL R9,8(DI)
-	MOVL AX,56(DI)
-	MOVD X0,CX
-	MOVD X1,R8
-	MOVD X2,R9
-	MOVD X3,AX
-	XORL 60(SI),CX
-	XORL 44(SI),R8
-	XORL 28(SI),R9
-	XORL 12(SI),AX
-	MOVL CX,60(DI)
-	MOVL R8,44(DI)
-	MOVL R9,28(DI)
-	MOVL AX,12(DI)
-	MOVQ 352(R12),R9
-	MOVL 16(R12),CX
-	MOVL  36 (R12),R8
-	ADDQ $1,CX
-	SHLQ $32,R8
-	ADDQ R8,CX
-	MOVQ CX,R8
-	SHRQ $32,R8
-	MOVL CX,16(R12)
-	MOVL R8, 36 (R12)
-	CMPQ R9,$64
-	JA BYTESATLEAST65
-	JAE BYTESATLEAST64
-	MOVQ DI,SI
-	MOVQ DX,DI
-	MOVQ R9,CX
+	LEAQ 360(R12), DI // short-input case: after the copy, both output and input point at the scratch buffer
+	LEAQ 360(R12), SI
+
+NOCOPY:
+	MOVQ  R9, 352(R12) // save the remaining byte count; R9 is used as scratch when the output is written
+	MOVOA 48(R12), X0 // X0..X3 = the four 16-byte rows of the state stored at 48, 0, 16 and 32(R12)
+	MOVOA (R12), X1
+	MOVOA 16(R12), X2
+	MOVOA 32(R12), X3
+	MOVOA X1, X4 // X4 = copy of X1, the first addend MAINLOOP2 consumes
+	MOVQ  $0x00000014, CX // CX = 20, the round counter (MAINLOOP2 subtracts 4 per iteration)
+
+MAINLOOP2: // four rounds per iteration on a single block held in X0..X3; X4, X5, X6 are temporaries
+	PADDL  X0, X4 // X4 = X1 + X0 (X4 enters as a copy of X1)
+	MOVOA  X0, X5
+	MOVOA  X4, X6
+	PSLLL  $0x07, X4
+	PSRLL  $0x19, X6
+	PXOR   X4, X3
+	PXOR   X6, X3 // X3 ^= rotl32(X1 + X0, 7)
+	PADDL  X3, X5 // X5 = X0 + X3
+	MOVOA  X3, X4 // keep X3 for the next step before its lanes are rotated
+	MOVOA  X5, X6
+	PSLLL  $0x09, X5
+	PSRLL  $0x17, X6
+	PXOR   X5, X2
+	PSHUFL $0x93, X3, X3 // rotate X3's lanes: lane i takes the value of lane i-1 (mod 4)
+	PXOR   X6, X2 // X2 ^= rotl32(X0 + X3, 9)
+	PADDL  X2, X4 // X4 = (X3 before rotation) + X2
+	MOVOA  X2, X5 // keep X2 for the next step before its halves are swapped
+	MOVOA  X4, X6
+	PSLLL  $0x0d, X4
+	PSRLL  $0x13, X6
+	PXOR   X4, X1
+	PSHUFL $0x4e, X2, X2 // swap the two 64-bit halves of X2
+	PXOR   X6, X1 // X1 ^= rotl32(X3 + X2, 13)
+	PADDL  X1, X5 // X5 = (X2 before swap) + X1
+	MOVOA  X3, X4 // X4 = rotated X3, addend for the next round
+	MOVOA  X5, X6
+	PSLLL  $0x12, X5
+	PSRLL  $0x0e, X6
+	PXOR   X5, X0
+	PSHUFL $0x39, X1, X1 // rotate X1's lanes: lane i takes the value of lane i+1 (mod 4)
+	PXOR   X6, X0 // X0 ^= rotl32(X2 + X1, 18)
+	PADDL  X0, X4 // second round: X4 = X3 + X0
+	MOVOA  X0, X5
+	MOVOA  X4, X6
+	PSLLL  $0x07, X4
+	PSRLL  $0x19, X6
+	PXOR   X4, X1
+	PXOR   X6, X1 // X1 ^= rotl32(X3 + X0, 7)
+	PADDL  X1, X5 // X5 = X0 + X1
+	MOVOA  X1, X4
+	MOVOA  X5, X6
+	PSLLL  $0x09, X5
+	PSRLL  $0x17, X6
+	PXOR   X5, X2
+	PSHUFL $0x93, X1, X1
+	PXOR   X6, X2 // X2 ^= rotl32(X0 + X1, 9)
+	PADDL  X2, X4 // X4 = (X1 before rotation) + X2
+	MOVOA  X2, X5
+	MOVOA  X4, X6
+	PSLLL  $0x0d, X4
+	PSRLL  $0x13, X6
+	PXOR   X4, X3
+	PSHUFL $0x4e, X2, X2
+	PXOR   X6, X3 // X3 ^= rotl32(X1 + X2, 13)
+	PADDL  X3, X5 // X5 = (X2 before swap) + X3
+	MOVOA  X1, X4 // X4 = rotated X1, addend for the next round
+	MOVOA  X5, X6
+	PSLLL  $0x12, X5
+	PSRLL  $0x0e, X6
+	PXOR   X5, X0
+	PSHUFL $0x39, X3, X3
+	PXOR   X6, X0 // X0 ^= rotl32(X2 + X3, 18)
+	PADDL  X0, X4 // third round: same instruction sequence as the first
+	MOVOA  X0, X5
+	MOVOA  X4, X6
+	PSLLL  $0x07, X4
+	PSRLL  $0x19, X6
+	PXOR   X4, X3
+	PXOR   X6, X3 // X3 ^= rotl32(X1 + X0, 7)
+	PADDL  X3, X5
+	MOVOA  X3, X4
+	MOVOA  X5, X6
+	PSLLL  $0x09, X5
+	PSRLL  $0x17, X6
+	PXOR   X5, X2
+	PSHUFL $0x93, X3, X3
+	PXOR   X6, X2 // X2 ^= rotl32(X0 + X3, 9)
+	PADDL  X2, X4
+	MOVOA  X2, X5
+	MOVOA  X4, X6
+	PSLLL  $0x0d, X4
+	PSRLL  $0x13, X6
+	PXOR   X4, X1
+	PSHUFL $0x4e, X2, X2
+	PXOR   X6, X1 // X1 ^= rotl32(X3 + X2, 13)
+	PADDL  X1, X5
+	MOVOA  X3, X4
+	MOVOA  X5, X6
+	PSLLL  $0x12, X5
+	PSRLL  $0x0e, X6
+	PXOR   X5, X0
+	PSHUFL $0x39, X1, X1
+	PXOR   X6, X0 // X0 ^= rotl32(X2 + X1, 18)
+	PADDL  X0, X4 // fourth round: same computation as the second, with the loop bookkeeping interleaved
+	MOVOA  X0, X5
+	MOVOA  X4, X6
+	PSLLL  $0x07, X4
+	PSRLL  $0x19, X6
+	PXOR   X4, X1
+	PXOR   X6, X1 // X1 ^= rotl32(X3 + X0, 7)
+	PADDL  X1, X5
+	MOVOA  X1, X4
+	MOVOA  X5, X6
+	PSLLL  $0x09, X5
+	PSRLL  $0x17, X6
+	PXOR   X5, X2
+	PSHUFL $0x93, X1, X1
+	PXOR   X6, X2 // X2 ^= rotl32(X0 + X1, 9)
+	PADDL  X2, X4
+	MOVOA  X2, X5
+	MOVOA  X4, X6
+	PSLLL  $0x0d, X4
+	PSRLL  $0x13, X6
+	PXOR   X4, X3
+	PSHUFL $0x4e, X2, X2
+	PXOR   X6, X3 // X3 ^= rotl32(X1 + X2, 13)
+	SUBQ   $0x04, CX // four rounds done; the flags set here are the ones tested by the JA below
+	PADDL  X3, X5
+	MOVOA  X1, X4 // X4 = X1 for the first step of the next iteration
+	MOVOA  X5, X6
+	PSLLL  $0x12, X5
+	PXOR   X7, X7 // zeroes X7; NOTE(review): X7 is not read anywhere on this path, presumably a leftover of the original code
+	PSRLL  $0x0e, X6
+	PXOR   X5, X0
+	PSHUFL $0x39, X3, X3
+	PXOR   X6, X0 // X0 ^= rotl32(X2 + X3, 18)
+	JA     MAINLOOP2 // loop while the round counter is still above zero
+	PADDL  48(R12), X0 // add back the input rows X0..X3 were loaded from before the rounds
+	PADDL  (R12), X1
+	PADDL  16(R12), X2
+	PADDL  32(R12), X3
+	MOVD   X0, CX // lane 0 of each row
+	MOVD   X1, R8
+	MOVD   X2, R9 // R9 is scratch here; the byte count was saved at 352(R12)
+	MOVD   X3, AX
+	PSHUFL $0x39, X0, X0 // rotate lanes so the next word moves into lane 0
+	PSHUFL $0x39, X1, X1
+	PSHUFL $0x39, X2, X2
+	PSHUFL $0x39, X3, X3
+	XORL   (SI), CX // XOR with the input; the byte offsets (0, 48, 32, 16) follow the register layout of the state
+	XORL   48(SI), R8
+	XORL   32(SI), R9
+	XORL   16(SI), AX
+	MOVL   CX, (DI) // store to the output at the same offsets
+	MOVL   R8, 48(DI)
+	MOVL   R9, 32(DI)
+	MOVL   AX, 16(DI)
+	MOVD   X0, CX // second word of each row: offsets 20, 4, 52, 36
+	MOVD   X1, R8
+	MOVD   X2, R9
+	MOVD   X3, AX
+	PSHUFL $0x39, X0, X0
+	PSHUFL $0x39, X1, X1
+	PSHUFL $0x39, X2, X2
+	PSHUFL $0x39, X3, X3
+	XORL   20(SI), CX
+	XORL   4(SI), R8
+	XORL   52(SI), R9
+	XORL   36(SI), AX
+	MOVL   CX, 20(DI)
+	MOVL   R8, 4(DI)
+	MOVL   R9, 52(DI)
+	MOVL   AX, 36(DI)
+	MOVD   X0, CX // third word of each row: offsets 40, 24, 8, 56
+	MOVD   X1, R8
+	MOVD   X2, R9
+	MOVD   X3, AX
+	PSHUFL $0x39, X0, X0
+	PSHUFL $0x39, X1, X1
+	PSHUFL $0x39, X2, X2
+	PSHUFL $0x39, X3, X3
+	XORL   40(SI), CX
+	XORL   24(SI), R8
+	XORL   8(SI), R9
+	XORL   56(SI), AX
+	MOVL   CX, 40(DI)
+	MOVL   R8, 24(DI)
+	MOVL   R9, 8(DI)
+	MOVL   AX, 56(DI)
+	MOVD   X0, CX // fourth word of each row: offsets 60, 44, 28, 12
+	MOVD   X1, R8
+	MOVD   X2, R9
+	MOVD   X3, AX
+	XORL   60(SI), CX
+	XORL   44(SI), R8
+	XORL   28(SI), R9
+	XORL   12(SI), AX
+	MOVL   CX, 60(DI)
+	MOVL   R8, 44(DI)
+	MOVL   R9, 28(DI)
+	MOVL   AX, 12(DI)
+	MOVQ   352(R12), R9 // restore the remaining byte count
+	MOVL   16(R12), CX // low 32 bits of the block counter
+	MOVL   36(R12), R8 // high 32 bits of the block counter
+	ADDQ   $0x01, CX // increment the counter as one 64-bit value
+	SHLQ   $0x20, R8
+	ADDQ   R8, CX
+	MOVQ   CX, R8
+	SHRQ   $0x20, R8
+	MOVL   CX, 16(R12) // write both halves back into the state
+	MOVL   R8, 36(R12)
+	CMPQ   R9, $0x40
+	JA     BYTESATLEAST65 // more than one block was left: advance and go around again
+	JAE    BYTESATLEAST64 // exactly 64 bytes were left: done
+	MOVQ   DI, SI // fewer than 64: the result sits in the scratch buffer DI points at
+	MOVQ   DX, DI // DX holds the real output pointer saved at BYTESBETWEEN1AND255
+	MOVQ   R9, CX // byte count for the REP; MOVSB that follows (copies the result out to the caller)
 	REP; MOVSB
-	BYTESATLEAST64:
-	DONE:
+
+BYTESATLEAST64:
+DONE:
 	RET
-	BYTESATLEAST65:
-	SUBQ $64,R9
-	ADDQ $64,DI
-	ADDQ $64,SI
-	JMP BYTESBETWEEN1AND255
+
+BYTESATLEAST65: // more than 64 bytes were left before the block just written
+	SUBQ $0x40, R9 // 64 bytes consumed
+	ADDQ $0x40, DI // advance output pointer
+	ADDQ $0x40, SI // advance input pointer
+	JMP  BYTESBETWEEN1AND255 // handle the next block (or the final short tail)