chacha20poly1305: Avo port of chacha20poly1305_amd64.s
This implementation utilizes the same registers found in the reference
implementation, aiming to produce a minimal semantic diff between the
Avo-generated output and the original hand-written assembly.
To verify the Avo implementation, the reference and Avo-generated
assembly files are fed to `go tool asm`, and the debug output is captured
into corresponding temp files. The debug output contains supplementary
metadata (line numbers, instruction offsets, and source file references)
that must be removed to obtain a semantic diff of the two files. This is
accomplished with a small awk utility script.
Parameter metadata not found in the reference assembly file has been
added, leading to a diff on the lines where those symbols are
referenced.
Commands used to verify Avo output:
GOROOT=$(go env GOROOT)
ASM_PATH="chacha20poly1305/chacha20poly1305_amd64.s"
REFERENCE="b2d3a6a4b4d36521cd7f653879cf6981e7c5c340"
go tool asm -o /dev/null -I "$GOROOT"/src/runtime -debug \
<(git cat-file -p "$REFERENCE:$ASM_PATH") \
> /tmp/reference.s
go tool asm -o /dev/null -I "$GOROOT"/src/runtime -debug \
"$ASM_PATH" \
> /tmp/avo.s
normalize(){
awk '{
$1=$2=$3="";
print substr($0,4)
}'
}
diff <(normalize < /tmp/reference.s) <(normalize < /tmp/avo.s)
155,157c155,157
< MOVQ dst(FP), DI
< MOVQ key+24(FP), R8
< MOVQ src+48(FP), SI
---
> MOVQ dst_base(FP), DI
> MOVQ key_base+24(FP), R8
> MOVQ src_base+48(FP), SI
159c159
< MOVQ ad+72(FP), CX
---
> MOVQ ad_base+72(FP), CX
4684,4686c4684,4686
< MOVQ dst(FP), DI
< MOVQ key+24(FP), R8
< MOVQ src+48(FP), SI
---
> MOVQ dst_base(FP), DI
> MOVQ key_base+24(FP), R8
> MOVQ src_base+48(FP), SI
4688c4688
< MOVQ ad+72(FP), CX
---
> MOVQ ad_base+72(FP), CX
Change-Id: Ia3a8e70b7440944ee739499c41ddceb70e054ef9
Reviewed-on: https://go-review.googlesource.com/c/crypto/+/601442
Reviewed-by: Filippo Valsorda <filippo@golang.org>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Roland Shoemaker <roland@golang.org>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
diff --git a/chacha20poly1305/_asm/chacha20poly1305_amd64_asm.go b/chacha20poly1305/_asm/chacha20poly1305_amd64_asm.go
new file mode 100644
index 0000000..e9ba153
--- /dev/null
+++ b/chacha20poly1305/_asm/chacha20poly1305_amd64_asm.go
@@ -0,0 +1,5516 @@
+// Copyright 2024 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// This assembly implementation was originally from https://golang.org/cl/24717 by Vlad Krasnov of CloudFlare.
+
+package main
+
+import (
+ "fmt"
+ "os"
+ "strings"
+
+ . "github.com/mmcloughlin/avo/build"
+ "github.com/mmcloughlin/avo/ir"
+ . "github.com/mmcloughlin/avo/operand"
+ . "github.com/mmcloughlin/avo/reg"
+ _ "golang.org/x/crypto/chacha20poly1305"
+)
+
+//go:generate go run . -out ../chacha20poly1305_amd64.s -pkg chacha20poly1305
+
+var (
+ // General register allocation
+ oup GPPhysical = RDI
+ inp = RSI
+ inl = RBX
+ adp = RCX // free to reuse, after we hash the additional data
+ keyp = R8 // free to reuse, when we copy the key to stack
+ itr2 = R9 // general iterator
+ itr1 = RCX // general iterator
+ acc0 = R10
+ acc1 = R11
+ acc2 = R12
+ t0 = R13
+ t1 = R14
+ t2 = R15
+ t3 = R8
+
+ // Register and stack allocation for the SSE code
+ rStore Mem = Mem{Base: BP}.Offset(0 * 16)
+ sStore = Mem{Base: BP}.Offset(1 * 16)
+ state1Store = Mem{Base: BP}.Offset(2 * 16)
+ state2Store = Mem{Base: BP}.Offset(3 * 16)
+ tmpStore = Mem{Base: BP}.Offset(4 * 16)
+ ctr0Store = Mem{Base: BP}.Offset(5 * 16)
+ ctr1Store = Mem{Base: BP}.Offset(6 * 16)
+ ctr2Store = Mem{Base: BP}.Offset(7 * 16)
+ ctr3Store = Mem{Base: BP}.Offset(8 * 16)
+ A0 VecPhysical = X0
+ A1 = X1
+ A2 = X2
+ B0 = X3
+ B1 = X4
+ B2 = X5
+ C0 = X6
+ C1 = X7
+ C2 = X8
+ D0 = X9
+ D1 = X10
+ D2 = X11
+ T0 = X12
+ T1 = X13
+ T2 = X14
+ T3 = X15
+ A3 = T0
+ B3 = T1
+ C3 = T2
+ D3 = T3
+
+ // Register and stack allocation for the AVX2 code
+ rsStoreAVX2 Mem = Mem{Base: BP}.Offset(0 * 32)
+ state1StoreAVX2 = Mem{Base: BP}.Offset(1 * 32)
+ state2StoreAVX2 = Mem{Base: BP}.Offset(2 * 32)
+ ctr0StoreAVX2 = Mem{Base: BP}.Offset(3 * 32)
+ ctr1StoreAVX2 = Mem{Base: BP}.Offset(4 * 32)
+ ctr2StoreAVX2 = Mem{Base: BP}.Offset(5 * 32)
+ ctr3StoreAVX2 = Mem{Base: BP}.Offset(6 * 32)
+ tmpStoreAVX2 = Mem{Base: BP}.Offset(7 * 32) // 256 bytes on stack
+ AA0 VecPhysical = Y0
+ AA1 = Y5
+ AA2 = Y6
+ AA3 = Y7
+ BB0 = Y14
+ BB1 = Y9
+ BB2 = Y10
+ BB3 = Y11
+ CC0 = Y12
+ CC1 = Y13
+ CC2 = Y8
+ CC3 = Y15
+ DD0 = Y4
+ DD1 = Y1
+ DD2 = Y2
+ DD3 = Y3
+ TT0 = DD3
+ TT1 = AA3
+ TT2 = BB3
+ TT3 = CC3
+)
+
+const ThatPeskyUnicodeDot = "\u00b7"
+
+func main() {
+ Package("golang.org/x/crypto/chacha20poly1305")
+ ConstraintExpr("gc,!purego")
+ polyHashADInternal()
+ chacha20Poly1305Open()
+ chacha20Poly1305Seal()
+ Generate()
+
+ var internalFunctions []string = []string{"·polyHashADInternal"}
+ removePeskyUnicodeDot(internalFunctions, "../chacha20poly1305_amd64.s")
+}
+
+// Utility function to emit BYTE instruction
+func BYTE(u8 U8) {
+ Instruction(&ir.Instruction{Opcode: "BYTE", Operands: []Op{u8}})
+}
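+
+// The shift helpers below emit hand-encoded PALIGNR instructions as raw BYTE
+// sequences (rotating the rows of the ChaCha state), mirroring the
+// BYTE-encoded macros in the reference assembly.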
+
+// PALIGNR $4, X3, X3
+func shiftB0Left() {
+ BYTE(U8(0x66))
+ BYTE(U8(0x0f))
+ BYTE(U8(0x3a))
+ BYTE(U8(0x0f))
+ BYTE(U8(0xdb))
+ BYTE(U8(0x04))
+}
+
+// PALIGNR $4, X4, X4
+func shiftB1Left() {
+ BYTE(U8(0x66))
+ BYTE(U8(0x0f))
+ BYTE(U8(0x3a))
+ BYTE(U8(0x0f))
+ BYTE(U8(0xe4))
+ BYTE(U8(0x04))
+}
+
+// PALIGNR $4, X5, X5
+func shiftB2Left() {
+ BYTE(U8(0x66))
+ BYTE(U8(0x0f))
+ BYTE(U8(0x3a))
+ BYTE(U8(0x0f))
+ BYTE(U8(0xed))
+ BYTE(U8(0x04))
+}
+
+// PALIGNR $4, X13, X13
+func shiftB3Left() {
+ BYTE(U8(0x66))
+ BYTE(U8(0x45))
+ BYTE(U8(0x0f))
+ BYTE(U8(0x3a))
+ BYTE(U8(0x0f))
+ BYTE(U8(0xed))
+ BYTE(U8(0x04))
+}
+
+// PALIGNR $8, X6, X6
+func shiftC0Left() {
+ BYTE(U8(0x66))
+ BYTE(U8(0x0f))
+ BYTE(U8(0x3a))
+ BYTE(U8(0x0f))
+ BYTE(U8(0xf6))
+ BYTE(U8(0x08))
+}
+
+// PALIGNR $8, X7, X7
+func shiftC1Left() {
+ BYTE(U8(0x66))
+ BYTE(U8(0x0f))
+ BYTE(U8(0x3a))
+ BYTE(U8(0x0f))
+ BYTE(U8(0xff))
+ BYTE(U8(0x08))
+}
+
+// PALIGNR $8, X8, X8
+func shiftC2Left() {
+ BYTE(U8(0x66))
+ BYTE(U8(0x45))
+ BYTE(U8(0x0f))
+ BYTE(U8(0x3a))
+ BYTE(U8(0x0f))
+ BYTE(U8(0xc0))
+ BYTE(U8(0x08))
+}
+
+// PALIGNR $8, X14, X14
+func shiftC3Left() {
+ BYTE(U8(0x66))
+ BYTE(U8(0x45))
+ BYTE(U8(0x0f))
+ BYTE(U8(0x3a))
+ BYTE(U8(0x0f))
+ BYTE(U8(0xf6))
+ BYTE(U8(0x08))
+}
+
+// PALIGNR $12, X9, X9
+func shiftD0Left() {
+ BYTE(U8(0x66))
+ BYTE(U8(0x45))
+ BYTE(U8(0x0f))
+ BYTE(U8(0x3a))
+ BYTE(U8(0x0f))
+ BYTE(U8(0xc9))
+ BYTE(U8(0x0c))
+}
+
+// PALIGNR $12, X10, X10
+func shiftD1Left() {
+ BYTE(U8(0x66))
+ BYTE(U8(0x45))
+ BYTE(U8(0x0f))
+ BYTE(U8(0x3a))
+ BYTE(U8(0x0f))
+ BYTE(U8(0xd2))
+ BYTE(U8(0x0c))
+}
+
+// PALIGNR $12, X11, X11
+func shiftD2Left() {
+ BYTE(U8(0x66))
+ BYTE(U8(0x45))
+ BYTE(U8(0x0f))
+ BYTE(U8(0x3a))
+ BYTE(U8(0x0f))
+ BYTE(U8(0xdb))
+ BYTE(U8(0x0c))
+}
+
+// PALIGNR $12, X15, X15
+func shiftD3Left() {
+ BYTE(U8(0x66))
+ BYTE(U8(0x45))
+ BYTE(U8(0x0f))
+ BYTE(U8(0x3a))
+ BYTE(U8(0x0f))
+ BYTE(U8(0xff))
+ BYTE(U8(0x0c))
+}
+
+// PALIGNR $12, X3, X3
+func shiftB0Right() {
+ BYTE(U8(0x66))
+ BYTE(U8(0x0f))
+ BYTE(U8(0x3a))
+ BYTE(U8(0x0f))
+ BYTE(U8(0xdb))
+ BYTE(U8(0x0c))
+}
+
+// PALIGNR $12, X4, X4
+func shiftB1Right() {
+ BYTE(U8(0x66))
+ BYTE(U8(0x0f))
+ BYTE(U8(0x3a))
+ BYTE(U8(0x0f))
+ BYTE(U8(0xe4))
+ BYTE(U8(0x0c))
+}
+
+// PALIGNR $12, X5, X5
+func shiftB2Right() {
+ BYTE(U8(0x66))
+ BYTE(U8(0x0f))
+ BYTE(U8(0x3a))
+ BYTE(U8(0x0f))
+ BYTE(U8(0xed))
+ BYTE(U8(0x0c))
+}
+
+// PALIGNR $12, X13, X13
+func shiftB3Right() {
+ BYTE(U8(0x66))
+ BYTE(U8(0x45))
+ BYTE(U8(0x0f))
+ BYTE(U8(0x3a))
+ BYTE(U8(0x0f))
+ BYTE(U8(0xed))
+ BYTE(U8(0x0c))
+}
+
+func shiftC0Right() {
+ shiftC0Left()
+}
+
+func shiftC1Right() {
+ shiftC1Left()
+}
+
+func shiftC2Right() {
+ shiftC2Left()
+}
+
+func shiftC3Right() {
+ shiftC3Left()
+}
+
+// PALIGNR $4, X9, X9
+func shiftD0Right() {
+ BYTE(U8(0x66))
+ BYTE(U8(0x45))
+ BYTE(U8(0x0f))
+ BYTE(U8(0x3a))
+ BYTE(U8(0x0f))
+ BYTE(U8(0xc9))
+ BYTE(U8(0x04))
+}
+
+// PALIGNR $4, X10, X10
+func shiftD1Right() {
+ BYTE(U8(0x66))
+ BYTE(U8(0x45))
+ BYTE(U8(0x0f))
+ BYTE(U8(0x3a))
+ BYTE(U8(0x0f))
+ BYTE(U8(0xd2))
+ BYTE(U8(0x04))
+}
+
+// PALIGNR $4, X11, X11
+func shiftD2Right() {
+ BYTE(U8(0x66))
+ BYTE(U8(0x45))
+ BYTE(U8(0x0f))
+ BYTE(U8(0x3a))
+ BYTE(U8(0x0f))
+ BYTE(U8(0xdb))
+ BYTE(U8(0x04))
+}
+
+// PALIGNR $4, X15, X15
+func shiftD3Right() {
+ BYTE(U8(0x66))
+ BYTE(U8(0x45))
+ BYTE(U8(0x0f))
+ BYTE(U8(0x3a))
+ BYTE(U8(0x0f))
+ BYTE(U8(0xff))
+ BYTE(U8(0x04))
+}
+
+// ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~SOME MACROS~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~##
+
+// Hack: ROL must be a #define macro as it is referenced by other macros
+func defineROL() {
+ definition :=
+ `#define ROL(N, R, T) \
+ MOVO R, T; \
+ PSLLL $(N), T; \
+ PSRLL $(32-(N)), R; \
+ PXOR T, R`
+ Comment("ROL rotates the uint32s in register R left by N bits, using temporary T.")
+ Instruction(&ir.Instruction{Opcode: definition})
+}
+
+// ROL rotates the uint32s in register R left by N bits, using temporary T.
+func ROL(N uint64, R, T VecPhysical) {
+ // Hack: ROL must be a #define macro as it is referenced by other macros
+ Instruction(&ir.Instruction{Opcode: fmt.Sprintf("ROL(%s, %s, %s)", I8(N).Asm(), R.Asm(), T.Asm())})
+}
+
+// Hack to get Avo to generate an #ifdef
+//
+// ROL16(R, T) definition depends on a compiler flag that specifies the amd64 architectural level.
+func defineROL16() {
+ definition :=
+ `#ifdef GOAMD64_v2
+ #define ROL16(R, T) PSHUFB ·rol16<>(SB), R
+ #else
+ #define ROL16(R, T) ROL(16, R, T)
+ #endif`
+
+ Comment("ROL16 rotates the uint32s in register R left by 16, using temporary T if needed.")
+ Instruction(&ir.Instruction{Opcode: definition})
+}
+
+// Hack to emit macro call
+//
+// ROL16 rotates the uint32s in register R left by 16, using temporary T if needed.
+func ROL16(R, T VecPhysical) {
+ Instruction(&ir.Instruction{Opcode: fmt.Sprintf("ROL16(%s, %s)", R.Asm(), T.Asm())})
+}
+
+// Hack to get Avo to generate an #ifdef
+//
+// ROL8(R, T) definition depends on a compiler flag that specifies the amd64 architectural level.
+func defineROL8() {
+ definition :=
+ `#ifdef GOAMD64_v2
+ #define ROL8(R, T) PSHUFB ·rol8<>(SB), R
+ #else
+ #define ROL8(R, T) ROL(8, R, T)
+ #endif`
+
+ Comment("ROL8 rotates the uint32s in register R left by 8, using temporary T if needed.")
+ Instruction(&ir.Instruction{Opcode: definition})
+}
+
+// Hack to emit macro call
+//
+// ROL8 rotates the uint32s in register R left by 8, using temporary T if needed.
+func ROL8(R, T VecPhysical) {
+ Instruction(&ir.Instruction{Opcode: fmt.Sprintf("ROL8(%s, %s)", R.Asm(), T.Asm())})
+}
+
+func chachaQR(A, B, C, D, T VecPhysical) {
+ PADDD(B, A)
+ PXOR(A, D)
+ ROL16(D, T)
+ PADDD(D, C)
+ PXOR(C, B)
+ MOVO(B, T)
+ PSLLL(Imm(12), T)
+ PSRLL(Imm(20), B)
+ PXOR(T, B)
+ PADDD(B, A)
+ PXOR(A, D)
+ ROL8(D, T)
+ PADDD(D, C)
+ PXOR(C, B)
+ MOVO(B, T)
+ PSLLL(Imm(7), T)
+ PSRLL(Imm(25), B)
+ PXOR(T, B)
+}
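+
+// chachaQRRef is an illustrative, unused plain-Go sketch of the ChaCha20
+// quarter round that chachaQR emits above, per 32-bit lane, with SSE
+// instructions; it is included only as documentation and is not called by
+// the generator.
+func chachaQRRef(a, b, c, d uint32) (uint32, uint32, uint32, uint32) {
+ a += b
+ d ^= a
+ d = d<<16 | d>>16 // ROL16
+ c += d
+ b ^= c
+ b = b<<12 | b>>20
+ a += b
+ d ^= a
+ d = d<<8 | d>>24 // ROL8
+ c += d
+ b ^= c
+ b = b<<7 | b>>25
+ return a, b, c, d
+}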
+
+func chachaQR_AVX2(A, B, C, D, T VecPhysical) {
+ VPADDD(B, A, A)
+ VPXOR(A, D, D)
+ rol16 := rol16_DATA()
+ VPSHUFB(rol16, D, D)
+ VPADDD(D, C, C)
+ VPXOR(C, B, B)
+ VPSLLD(Imm(12), B, T)
+ VPSRLD(Imm(20), B, B)
+ VPXOR(T, B, B)
+ VPADDD(B, A, A)
+ VPXOR(A, D, D)
+ rol8 := rol8_DATA()
+ VPSHUFB(rol8, D, D)
+ VPADDD(D, C, C)
+ VPXOR(C, B, B)
+ VPSLLD(Imm(7), B, T)
+ VPSRLD(Imm(25), B, B)
+ VPXOR(T, B, B)
+}
+
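+// polyAdd adds the 16-byte block at S, together with Poly1305's 2^128
+// padding bit (the trailing ADCQ of 1), into the accumulator held in
+// acc0|acc1|acc2.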
+func polyAdd(S Mem) {
+ ADDQ(S, acc0)
+ ADCQ(S.Offset(8), acc1)
+ ADCQ(Imm(1), acc2)
+}
+
+func polyMulStage1() {
+ MOVQ(Mem{Base: BP}.Offset(0*8), RAX)
+ MOVQ(RAX, t2)
+ MULQ(acc0)
+ MOVQ(RAX, t0)
+ MOVQ(RDX, t1)
+ MOVQ(Mem{Base: BP}.Offset(0*8), RAX)
+ MULQ(acc1)
+ IMULQ(acc2, t2)
+ ADDQ(RAX, t1)
+ ADCQ(RDX, t2)
+}
+
+func polyMulStage2() {
+ MOVQ(Mem{Base: BP}.Offset(1*8), RAX)
+ MOVQ(RAX, t3)
+ MULQ(acc0)
+ ADDQ(RAX, t1)
+ ADCQ(Imm(0), RDX)
+ MOVQ(RDX, acc0)
+ MOVQ(Mem{Base: BP}.Offset(1*8), RAX)
+ MULQ(acc1)
+ ADDQ(RAX, t2)
+ ADCQ(Imm(0), RDX)
+}
+
+func polyMulStage3() {
+ IMULQ(acc2, t3)
+ ADDQ(acc0, t2)
+ ADCQ(RDX, t3)
+}
+
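+// polyMulReduceStage folds the product held in t0..t3 back into the 130-bit
+// accumulator acc0|acc1|acc2 using the identity 2^130 = 5 mod (2^130 - 5):
+// the part above 2^130, g, is added back in as 4*g + g.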
+func polyMulReduceStage() {
+ MOVQ(t0, acc0)
+ MOVQ(t1, acc1)
+ MOVQ(t2, acc2)
+ ANDQ(Imm(3), acc2)
+ MOVQ(t2, t0)
+ ANDQ(I8(-4), t0)
+ MOVQ(t3, t1)
+ SHRQ(Imm(2), t3, t2)
+ SHRQ(Imm(2), t3)
+ ADDQ(t0, acc0)
+ ADCQ(t1, acc1)
+ ADCQ(Imm(0), acc2)
+ ADDQ(t2, acc0)
+ ADCQ(t3, acc1)
+ ADCQ(Imm(0), acc2)
+}
+
+func polyMulStage1_AVX2() {
+ MOVQ(Mem{Base: BP}.Offset(0*8), RDX)
+ MOVQ(RDX, t2)
+ MULXQ(acc0, t0, t1)
+ IMULQ(acc2, t2)
+ MULXQ(acc1, RAX, RDX)
+ ADDQ(RAX, t1)
+ ADCQ(RDX, t2)
+}
+
+func polyMulStage2_AVX2() {
+ MOVQ(Mem{Base: BP}.Offset(1*8), RDX)
+ MULXQ(acc0, acc0, RAX)
+ ADDQ(acc0, t1)
+ MULXQ(acc1, acc1, t3)
+ ADCQ(acc1, t2)
+ ADCQ(Imm(0), t3)
+}
+
+func polyMulStage3_AVX2() {
+ IMULQ(acc2, RDX)
+ ADDQ(RAX, t2)
+ ADCQ(RDX, t3)
+}
+
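+// polyMul multiplies the accumulator acc0|acc1|acc2 by the clamped r held at
+// rStore (0(BP)) and partially reduces the result modulo 2^130 - 5; the full
+// reduction happens at finalization.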
+func polyMul() {
+ polyMulStage1()
+ polyMulStage2()
+ polyMulStage3()
+ polyMulReduceStage()
+}
+
+func polyMulAVX2() {
+ polyMulStage1_AVX2()
+ polyMulStage2_AVX2()
+ polyMulStage3_AVX2()
+ polyMulReduceStage()
+}
+
+// ----------------------------------------------------------------------------
+// ----------------------------------------------------------------------------
+
+func polyHashADInternal() {
+ Function("polyHashADInternal<>")
+ Attributes(NOSPLIT)
+ AllocLocal(0)
+
+ Comment("Hack: Must declare #define macros inside of a function due to Avo constraints")
+ defineROL()
+ defineROL8()
+ defineROL16()
+
+ // adp points to beginning of additional data
+ // itr2 holds ad length
+ XORQ(acc0, acc0)
+ XORQ(acc1, acc1)
+ XORQ(acc2, acc2)
+ CMPQ(itr2, Imm(13))
+ JNE(LabelRef("hashADLoop"))
+
+ openFastTLSAD()
+ hashADLoop()
+ hashADTail()
+ hashADTailLoop()
+ hashADTailFinish()
+ hashADDone()
+}
+
+// Special treatment for the TLS case of 13 bytes
+func openFastTLSAD() {
+ Label("openFastTLSAD")
+ MOVQ(Mem{Base: adp}, acc0)
+ MOVQ(Mem{Base: adp}.Offset(5), acc1)
+ SHRQ(Imm(24), acc1)
+ MOVQ(U32(1), acc2)
+ polyMul()
+ RET()
+}
+
+// Hash in 16 byte chunks
+func hashADLoop() {
+ Label("hashADLoop")
+ Comment("Hash in 16 byte chunks")
+ CMPQ(itr2, Imm(16))
+ JB(LabelRef("hashADTail"))
+ polyAdd(Mem{Base: adp}.Offset(0))
+ LEAQ(Mem{Base: adp}.Offset(1*16), adp)
+ SUBQ(Imm(16), itr2)
+ polyMul()
+ JMP(LabelRef("hashADLoop"))
+}
+
+func hashADTail() {
+ Label("hashADTail")
+ CMPQ(itr2, Imm(0))
+ JE(LabelRef("hashADDone"))
+
+ Comment("Hash last < 16 byte tail")
+ XORQ(t0, t0)
+ XORQ(t1, t1)
+ XORQ(t2, t2)
+ ADDQ(itr2, adp)
+}
+
+func hashADTailLoop() {
+ Label("hashADTailLoop")
+ SHLQ(Imm(8), t0, t1)
+ SHLQ(Imm(8), t0)
+ // Hack to get Avo to emit:
+ // MOVB -1(adp), t2
+ Instruction(&ir.Instruction{Opcode: "MOVB", Operands: []Op{Mem{Base: adp}.Offset(-1), t2}})
+ XORQ(t2, t0)
+ DECQ(adp)
+ DECQ(itr2)
+ JNE(LabelRef("hashADTailLoop"))
+}
+
+func hashADTailFinish() {
+ ADDQ(t0, acc0)
+ ADCQ(t1, acc1)
+ ADCQ(Imm(1), acc2)
+ polyMul()
+}
+
+// Finished AD
+func hashADDone() {
+ Label("hashADDone")
+ RET()
+}
+
+// ----------------------------------------------------------------------------
+// ----------------------------------------------------------------------------
+
+// Implements the following function signature:
+//
+// func chacha20Poly1305Open(dst []byte, key []uint32, src []byte, ad []byte) bool
+func chacha20Poly1305Open() {
+ Implement("chacha20Poly1305Open")
+ Attributes(0)
+ AllocLocal(288)
+
+ Comment("For aligned stack access")
+ MOVQ(RSP, RBP)
+ ADDQ(Imm(32), RBP)
+ ANDQ(I8(-32), RBP)
+
+ Load(Param("dst").Base(), oup)
+ Load(Param("key").Base(), keyp)
+ Load(Param("src").Base(), inp)
+ Load(Param("src").Len(), inl)
+ Load(Param("ad").Base(), adp)
+
+ Comment("Check for AVX2 support")
+ CMPB(Mem{Symbol: Symbol{Name: ThatPeskyUnicodeDot + "useAVX2"}, Base: StaticBase}, Imm(1))
+ JE(LabelRef("chacha20Poly1305Open_AVX2"))
+
+ Comment("Special optimization, for very short buffers")
+ CMPQ(inl, Imm(128))
+ JBE(LabelRef("openSSE128")) // About 16% faster
+
+ Comment("For long buffers, prepare the poly key first")
+ chacha20Constants := chacha20Constants_DATA()
+ MOVOU(chacha20Constants, A0)
+ MOVOU(Mem{Base: keyp}.Offset(1*16), B0)
+ MOVOU(Mem{Base: keyp}.Offset(2*16), C0)
+ MOVOU(Mem{Base: keyp}.Offset(3*16), D0)
+ MOVO(D0, T1)
+
+ Comment("Store state on stack for future use")
+ MOVO(B0, state1Store)
+ MOVO(C0, state2Store)
+ MOVO(D0, ctr3Store)
+ MOVQ(U32(10), itr2)
+
+ openSSEPreparePolyKey()
+ openSSEMainLoop()
+ openSSEInternalLoop()
+ openSSEMainLoopDone()
+ openSSEFinalize()
+
+ // ----------------------------------------------------------------------------
+ // Special optimization for buffers smaller than 129 bytes
+ openSSE128()
+ openSSE128InnerCipherLoop()
+ openSSE128Open()
+ openSSETail16()
+ openSSETail16Store()
+
+ // ----------------------------------------------------------------------------
+ // Special optimization for the last 64 bytes of ciphertext
+ openSSETail64()
+ openSSETail64LoopA()
+ openSSETail64LoopB()
+ openSSETail64DecLoop()
+ openSSETail64DecLoopDone()
+
+ // ----------------------------------------------------------------------------
+ // Special optimization for the last 128 bytes of ciphertext
+ openSSETail128()
+ openSSETail128LoopA()
+ openSSETail128LoopB()
+
+ // ----------------------------------------------------------------------------
+ // Special optimization for the last 192 bytes of ciphertext
+ openSSETail192()
+ openSSLTail192LoopA()
+ openSSLTail192LoopB()
+ openSSLTail192Store()
+
+ // ----------------------------------------------------------------------------
+ // Special optimization for the last 256 bytes of ciphertext
+ openSSETail256()
+ openSSETail256Loop()
+ openSSETail256HashLoop()
+
+ // ----------------------------------------------------------------------------
+ // ------------------------- AVX2 Code ----------------------------------------
+ chacha20Poly1305Open_AVX2()
+ openAVX2PreparePolyKey()
+ openAVX2InitialHash64()
+ openAVX2MainLoop()
+ openAVX2InternalLoop()
+ openAVX2MainLoopDone()
+
+ // ----------------------------------------------------------------------------
+ // Special optimization for buffers smaller than 193 bytes
+ openAVX2192()
+ openAVX2192InnerCipherLoop()
+ openAVX2ShortOpen()
+ openAVX2ShortOpenLoop()
+ openAVX2ShortTail32()
+ openAVX2ShortDone()
+
+ // ----------------------------------------------------------------------------
+ // Special optimization for buffers smaller than 321 bytes
+ openAVX2320()
+ openAVX2320InnerCipherLoop()
+
+ // ----------------------------------------------------------------------------
+ // Special optimization for the last 128 bytes of ciphertext
+ openAVX2Tail128()
+ openAVX2Tail128LoopA()
+ openAVX2Tail128LoopB()
+ openAVX2TailLoop()
+ openAVX2Tail()
+ openAVX2TailDone()
+
+ // ----------------------------------------------------------------------------
+ // Special optimization for the last 256 bytes of ciphertext
+ openAVX2Tail256()
+ openAVX2Tail256LoopA()
+ openAVX2Tail256LoopB()
+ openAVX2Tail256Hash()
+ openAVX2Tail256HashEnd()
+
+ // ----------------------------------------------------------------------------
+ // Special optimization for the last 384 bytes of ciphertext
+ openAVX2Tail384()
+ openAVX2Tail384LoopB()
+ openAVX2Tail384LoopA()
+ openAVX2Tail384Hash()
+ openAVX2Tail384HashEnd()
+
+ // ----------------------------------------------------------------------------
+ // Special optimization for the last 512 bytes of ciphertext
+ openAVX2Tail512()
+ openAVX2Tail512LoopB()
+ openAVX2Tail512LoopA()
+ openAVX2Tail512HashLoop()
+ openAVX2Tail512HashEnd()
+}
+
+func openSSEPreparePolyKey() {
+ Label("openSSEPreparePolyKey")
+ chachaQR(A0, B0, C0, D0, T0)
+ shiftB0Left()
+ shiftC0Left()
+ shiftD0Left()
+ chachaQR(A0, B0, C0, D0, T0)
+ shiftB0Right()
+ shiftC0Right()
+ shiftD0Right()
+ DECQ(itr2)
+ JNE(LabelRef("openSSEPreparePolyKey"))
+
+ Comment("A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded")
+ chacha20Constants := chacha20Constants_DATA()
+ PADDL(chacha20Constants, A0)
+ PADDL(state1Store, B0)
+
+ Comment("Clamp and store the key")
+ polyClampMask := polyClampMask_DATA()
+ PAND(polyClampMask, A0)
+ MOVO(A0, rStore)
+ MOVO(B0, sStore)
+
+ Comment("Hash AAD")
+ Load(Param("ad").Len(), itr2)
+ CALL(LabelRef("polyHashADInternal<>(SB)"))
+}
+
+func openSSEMainLoop() {
+ Label("openSSEMainLoop")
+ CMPQ(inl, U32(256))
+ JB(LabelRef("openSSEMainLoopDone"))
+
+ chacha20Constants := chacha20Constants_DATA()
+ sseIncMask := sseIncMask_DATA()
+
+ Comment("Load state, increment counter blocks")
+ MOVO(chacha20Constants, A0)
+ MOVO(state1Store, B0)
+ MOVO(state2Store, C0)
+ MOVO(ctr3Store, D0)
+ PADDL(sseIncMask, D0)
+ MOVO(A0, A1)
+ MOVO(B0, B1)
+ MOVO(C0, C1)
+ MOVO(D0, D1)
+ PADDL(sseIncMask, D1)
+ MOVO(A1, A2)
+ MOVO(B1, B2)
+ MOVO(C1, C2)
+ MOVO(D1, D2)
+ PADDL(sseIncMask, D2)
+ MOVO(A2, A3)
+ MOVO(B2, B3)
+ MOVO(C2, C3)
+ MOVO(D2, D3)
+ PADDL(sseIncMask, D3)
+
+ Comment("Store counters")
+ MOVO(D0, ctr0Store)
+ MOVO(D1, ctr1Store)
+ MOVO(D2, ctr2Store)
+ MOVO(D3, ctr3Store)
+
+ Comment("There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we hash")
+ Comment("2 blocks, and for the remaining 4 only 1 block - for a total of 16")
+ MOVQ(U32(4), itr1)
+ MOVQ(inp, itr2)
+}
+
+func openSSEInternalLoop() {
+ Label("openSSEInternalLoop")
+ MOVO(C3, tmpStore)
+ chachaQR(A0, B0, C0, D0, C3)
+ chachaQR(A1, B1, C1, D1, C3)
+ chachaQR(A2, B2, C2, D2, C3)
+ MOVO(tmpStore, C3)
+ MOVO(C1, tmpStore)
+ chachaQR(A3, B3, C3, D3, C1)
+ MOVO(tmpStore, C1)
+ polyAdd(Mem{Base: itr2}.Offset(0))
+ shiftB0Left()
+ shiftB1Left()
+ shiftB2Left()
+ shiftB3Left()
+ shiftC0Left()
+ shiftC1Left()
+ shiftC2Left()
+ shiftC3Left()
+ shiftD0Left()
+ shiftD1Left()
+ shiftD2Left()
+ shiftD3Left()
+ polyMulStage1()
+ polyMulStage2()
+ LEAQ(Mem{Base: itr2}.Offset(2*8), itr2)
+ MOVO(C3, tmpStore)
+ chachaQR(A0, B0, C0, D0, C3)
+ chachaQR(A1, B1, C1, D1, C3)
+ chachaQR(A2, B2, C2, D2, C3)
+ MOVO(tmpStore, C3)
+ MOVO(C1, tmpStore)
+ polyMulStage3()
+ chachaQR(A3, B3, C3, D3, C1)
+ MOVO(tmpStore, C1)
+ polyMulReduceStage()
+ shiftB0Right()
+ shiftB1Right()
+ shiftB2Right()
+ shiftB3Right()
+ shiftC0Right()
+ shiftC1Right()
+ shiftC2Right()
+ shiftC3Right()
+ shiftD0Right()
+ shiftD1Right()
+ shiftD2Right()
+ shiftD3Right()
+ DECQ(itr1)
+ JGE(LabelRef("openSSEInternalLoop"))
+
+ polyAdd(Mem{Base: itr2}.Offset(0))
+ polyMul()
+ LEAQ(Mem{Base: itr2}.Offset(2*8), itr2)
+
+ CMPQ(itr1, I8(-6))
+ JG(LabelRef("openSSEInternalLoop"))
+
+ chacha20Constants := chacha20Constants_DATA()
+ Comment("Add in the state")
+ PADDD(chacha20Constants, A0)
+ PADDD(chacha20Constants, A1)
+ PADDD(chacha20Constants, A2)
+ PADDD(chacha20Constants, A3)
+ PADDD(state1Store, B0)
+ PADDD(state1Store, B1)
+ PADDD(state1Store, B2)
+ PADDD(state1Store, B3)
+ PADDD(state2Store, C0)
+ PADDD(state2Store, C1)
+ PADDD(state2Store, C2)
+ PADDD(state2Store, C3)
+ PADDD(ctr0Store, D0)
+ PADDD(ctr1Store, D1)
+ PADDD(ctr2Store, D2)
+ PADDD(ctr3Store, D3)
+
+ Comment("Load - xor - store")
+ MOVO(D3, tmpStore)
+ MOVOU(Mem{Base: inp}.Offset(0*16), D3)
+ PXOR(D3, A0)
+ MOVOU(A0, Mem{Base: oup}.Offset(0*16))
+ MOVOU(Mem{Base: inp}.Offset(1*16), D3)
+ PXOR(D3, B0)
+ MOVOU(B0, Mem{Base: oup}.Offset(1*16))
+ MOVOU(Mem{Base: inp}.Offset(2*16), D3)
+ PXOR(D3, C0)
+ MOVOU(C0, Mem{Base: oup}.Offset(2*16))
+ MOVOU(Mem{Base: inp}.Offset(3*16), D3)
+ PXOR(D3, D0)
+ MOVOU(D0, Mem{Base: oup}.Offset(3*16))
+ MOVOU(Mem{Base: inp}.Offset(4*16), D0)
+ PXOR(D0, A1)
+ MOVOU(A1, Mem{Base: oup}.Offset(4*16))
+ MOVOU(Mem{Base: inp}.Offset(5*16), D0)
+ PXOR(D0, B1)
+ MOVOU(B1, Mem{Base: oup}.Offset(5*16))
+ MOVOU(Mem{Base: inp}.Offset(6*16), D0)
+ PXOR(D0, C1)
+ MOVOU(C1, Mem{Base: oup}.Offset(6*16))
+ MOVOU(Mem{Base: inp}.Offset(7*16), D0)
+ PXOR(D0, D1)
+ MOVOU(D1, Mem{Base: oup}.Offset(7*16))
+ MOVOU(Mem{Base: inp}.Offset(8*16), D0)
+ PXOR(D0, A2)
+ MOVOU(A2, Mem{Base: oup}.Offset(8*16))
+ MOVOU(Mem{Base: inp}.Offset(9*16), D0)
+ PXOR(D0, B2)
+ MOVOU(B2, Mem{Base: oup}.Offset(9*16))
+ MOVOU(Mem{Base: inp}.Offset(10*16), D0)
+ PXOR(D0, C2)
+ MOVOU(C2, Mem{Base: oup}.Offset(10*16))
+ MOVOU(Mem{Base: inp}.Offset(11*16), D0)
+ PXOR(D0, D2)
+ MOVOU(D2, Mem{Base: oup}.Offset(11*16))
+ MOVOU(Mem{Base: inp}.Offset(12*16), D0)
+ PXOR(D0, A3)
+ MOVOU(A3, Mem{Base: oup}.Offset(12*16))
+ MOVOU(Mem{Base: inp}.Offset(13*16), D0)
+ PXOR(D0, B3)
+ MOVOU(B3, Mem{Base: oup}.Offset(13*16))
+ MOVOU(Mem{Base: inp}.Offset(14*16), D0)
+ PXOR(D0, C3)
+ MOVOU(C3, Mem{Base: oup}.Offset(14*16))
+ MOVOU(Mem{Base: inp}.Offset(15*16), D0)
+ PXOR(tmpStore, D0)
+ MOVOU(D0, Mem{Base: oup}.Offset(15*16))
+ LEAQ(Mem{Base: inp}.Offset(256), inp)
+ LEAQ(Mem{Base: oup}.Offset(256), oup)
+ SUBQ(U32(256), inl)
+ JMP(LabelRef("openSSEMainLoop"))
+}
+
+func openSSEMainLoopDone() {
+ Label("openSSEMainLoopDone")
+ Comment("Handle the various tail sizes efficiently")
+ TESTQ(inl, inl)
+ JE(LabelRef("openSSEFinalize"))
+ CMPQ(inl, Imm(64))
+ JBE(LabelRef("openSSETail64"))
+ CMPQ(inl, Imm(128))
+ JBE(LabelRef("openSSETail128"))
+ CMPQ(inl, Imm(192))
+ JBE(LabelRef("openSSETail192"))
+ JMP(LabelRef("openSSETail256"))
+}
+
+func openSSEFinalize() {
+ Label("openSSEFinalize")
+ Comment("Hash in the PT, AAD lengths")
+ ADDQ(NewParamAddr("ad_len", 80), acc0)
+ ADCQ(NewParamAddr("src_len", 56), acc1)
+ ADCQ(Imm(1), acc2)
+ polyMul()
+
+ Comment("Final reduce")
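+ // Subtracting the 192-bit constant (-5, -1, 3) subtracts exactly
+ // p = 2^130 - 5; CMOVQCS keeps the unreduced value when the subtraction
+ // borrows, i.e. when the accumulator is already below p.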
+ MOVQ(acc0, t0)
+ MOVQ(acc1, t1)
+ MOVQ(acc2, t2)
+ SUBQ(I8(-5), acc0)
+ SBBQ(I8(-1), acc1)
+ SBBQ(Imm(3), acc2)
+ CMOVQCS(t0, acc0)
+ CMOVQCS(t1, acc1)
+ CMOVQCS(t2, acc2)
+
+ Comment("Add in the \"s\" part of the key")
+ ADDQ(sStore.Offset(0), acc0)
+ ADCQ(sStore.Offset(8), acc1)
+
+ Comment("Finally, constant time compare to the tag at the end of the message")
+ XORQ(RAX, RAX)
+ MOVQ(U32(1), RDX)
+ XORQ(Mem{Base: inp}.Offset(0*8), acc0)
+ XORQ(Mem{Base: inp}.Offset(1*8), acc1)
+ ORQ(acc1, acc0)
+ CMOVQEQ(RDX, RAX)
+
+ Comment("Return true iff tags are equal")
+ // Hack to get Avo to emit:
+ // MOVB AX, ret+96(FP)
+ Instruction(&ir.Instruction{Opcode: "MOVB", Operands: []Op{AX, NewParamAddr("ret", 96)}})
+ RET()
+}
+
+// ----------------------------------------------------------------------------
+// Special optimization for buffers smaller than 129 bytes
+
+// For up to 128 bytes of ciphertext and 64 bytes for the poly key, we process three blocks
+func openSSE128() {
+ Label("openSSE128")
+
+ chacha20Constants := chacha20Constants_DATA()
+ sseIncMask := sseIncMask_DATA()
+
+ MOVOU(chacha20Constants, A0)
+ MOVOU(Mem{Base: keyp}.Offset(1*16), B0)
+ MOVOU(Mem{Base: keyp}.Offset(2*16), C0)
+ MOVOU(Mem{Base: keyp}.Offset(3*16), D0)
+ MOVO(A0, A1)
+ MOVO(B0, B1)
+ MOVO(C0, C1)
+ MOVO(D0, D1)
+ PADDL(sseIncMask, D1)
+ MOVO(A1, A2)
+ MOVO(B1, B2)
+ MOVO(C1, C2)
+ MOVO(D1, D2)
+ PADDL(sseIncMask, D2)
+ MOVO(B0, T1)
+ MOVO(C0, T2)
+ MOVO(D1, T3)
+ MOVQ(U32(10), itr2)
+}
+
+func openSSE128InnerCipherLoop() {
+ Label("openSSE128InnerCipherLoop")
+ chachaQR(A0, B0, C0, D0, T0)
+ chachaQR(A1, B1, C1, D1, T0)
+ chachaQR(A2, B2, C2, D2, T0)
+ shiftB0Left()
+ shiftB1Left()
+ shiftB2Left()
+ shiftC0Left()
+ shiftC1Left()
+ shiftC2Left()
+ shiftD0Left()
+ shiftD1Left()
+ shiftD2Left()
+ chachaQR(A0, B0, C0, D0, T0)
+ chachaQR(A1, B1, C1, D1, T0)
+ chachaQR(A2, B2, C2, D2, T0)
+ shiftB0Right()
+ shiftB1Right()
+ shiftB2Right()
+ shiftC0Right()
+ shiftC1Right()
+ shiftC2Right()
+ shiftD0Right()
+ shiftD1Right()
+ shiftD2Right()
+ DECQ(itr2)
+ JNE(LabelRef("openSSE128InnerCipherLoop"))
+
+ Comment("A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded")
+
+ chacha20Constants := chacha20Constants_DATA()
+ PADDL(chacha20Constants, A0)
+ PADDL(chacha20Constants, A1)
+ PADDL(chacha20Constants, A2)
+ PADDL(T1, B0)
+ PADDL(T1, B1)
+ PADDL(T1, B2)
+ PADDL(T2, C1)
+ PADDL(T2, C2)
+ PADDL(T3, D1)
+ sseIncMask := sseIncMask_DATA()
+ PADDL(sseIncMask, T3)
+ PADDL(T3, D2)
+
+ Comment("Clamp and store the key")
+ polyClampMask := polyClampMask_DATA()
+ PAND(polyClampMask, A0)
+ MOVOU(A0, rStore)
+ MOVOU(B0, sStore)
+
+ Comment("Hash")
+ Load(Param("ad").Len(), itr2)
+ CALL(LabelRef("polyHashADInternal<>(SB)"))
+}
+
+func openSSE128Open() {
+ Label("openSSE128Open")
+ CMPQ(inl, Imm(16))
+ JB(LabelRef("openSSETail16"))
+ SUBQ(Imm(16), inl)
+
+ Comment("Load for hashing")
+ polyAdd(Mem{Base: inp}.Offset(0))
+
+ Comment("Load for decryption")
+ MOVOU(Mem{Base: inp}, T0)
+ PXOR(T0, A1)
+ MOVOU(A1, Mem{Base: oup})
+ LEAQ(Mem{Base: inp}.Offset(1*16), inp)
+ LEAQ(Mem{Base: oup}.Offset(1*16), oup)
+ polyMul()
+
+ Comment("Shift the stream \"left\"")
+ MOVO(B1, A1)
+ MOVO(C1, B1)
+ MOVO(D1, C1)
+ MOVO(A2, D1)
+ MOVO(B2, A2)
+ MOVO(C2, B2)
+ MOVO(D2, C2)
+ JMP(LabelRef("openSSE128Open"))
+}
+
+func openSSETail16() {
+ Label("openSSETail16")
+ TESTQ(inl, inl)
+ JE(LabelRef("openSSEFinalize"))
+
+ Comment("We can safely load the CT from the end, because it is padded with the MAC")
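+ // itr2 = inl*16 indexes the andMask table; the PAND below keeps only the
+ // low inl bytes of the loaded block (assuming each successive 16-byte
+ // entry of andMask has one more leading 0xff byte).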
+ MOVQ(inl, itr2)
+ SHLQ(Imm(4), itr2)
+ andMask := andMask_DATA()
+ LEAQ(andMask, t0)
+ MOVOU(Mem{Base: inp}, T0)
+ ADDQ(inl, inp)
+ PAND(Mem{Base: t0, Index: itr2, Scale: 1}.Offset(-16), T0)
+ MOVO(T0, tmpStore.Offset(0))
+ MOVQ(T0, t0)
+ MOVQ(tmpStore.Offset(8), t1)
+ PXOR(A1, T0)
+}
+
+func openSSETail16Store() {
+ Comment("We can only store one byte at a time, since plaintext can be shorter than 16 bytes")
+ Label("openSSETail16Store")
+ MOVQ(T0, t3)
+ // Hack to get Avo to emit:
+ // MOVB t3, (oup)
+ Instruction(&ir.Instruction{Opcode: "MOVB", Operands: []Op{t3, Mem{Base: oup}}})
+ PSRLDQ(Imm(1), T0)
+ INCQ(oup)
+ DECQ(inl)
+ JNE(LabelRef("openSSETail16Store"))
+ ADDQ(t0, acc0)
+ ADCQ(t1, acc1)
+ ADCQ(Imm(1), acc2)
+ polyMul()
+ JMP(LabelRef("openSSEFinalize"))
+}
+
+// ----------------------------------------------------------------------------
+// Special optimization for the last 64 bytes of ciphertext
+
+// Need to decrypt up to 64 bytes - prepare single block
+func openSSETail64() {
+ Label("openSSETail64")
+ chacha20Constants := chacha20Constants_DATA()
+ MOVO(chacha20Constants, A0)
+ MOVO(state1Store, B0)
+ MOVO(state2Store, C0)
+ MOVO(ctr3Store, D0)
+ sseIncMask := sseIncMask_DATA()
+ PADDL(sseIncMask, D0)
+ MOVO(D0, ctr0Store)
+ XORQ(itr2, itr2)
+ MOVQ(inl, itr1)
+ CMPQ(itr1, Imm(16))
+ JB(LabelRef("openSSETail64LoopB"))
+}
+
+// Perform ChaCha rounds, while hashing the remaining input
+func openSSETail64LoopA() {
+ Label("openSSETail64LoopA")
+ polyAdd(Mem{Base: inp, Index: itr2, Scale: 1}.Offset(0))
+ polyMul()
+ SUBQ(Imm(16), itr1)
+}
+
+func openSSETail64LoopB() {
+ Label("openSSETail64LoopB")
+ ADDQ(Imm(16), itr2)
+ chachaQR(A0, B0, C0, D0, T0)
+ shiftB0Left()
+ shiftC0Left()
+ shiftD0Left()
+ chachaQR(A0, B0, C0, D0, T0)
+ shiftB0Right()
+ shiftC0Right()
+ shiftD0Right()
+
+ CMPQ(itr1, Imm(16))
+ JAE(LabelRef("openSSETail64LoopA"))
+
+ CMPQ(itr2, Imm(160))
+ JNE(LabelRef("openSSETail64LoopB"))
+
+ chacha20Constants := chacha20Constants_DATA()
+ PADDL(chacha20Constants, A0)
+ PADDL(state1Store, B0)
+ PADDL(state2Store, C0)
+ PADDL(ctr0Store, D0)
+}
+
+func openSSETail64DecLoop() {
+ Label("openSSETail64DecLoop")
+ CMPQ(inl, Imm(16))
+ JB(LabelRef("openSSETail64DecLoopDone"))
+ SUBQ(Imm(16), inl)
+ MOVOU(Mem{Base: inp}, T0)
+ PXOR(T0, A0)
+ MOVOU(A0, Mem{Base: oup})
+ LEAQ(Mem{Base: inp}.Offset(16), inp)
+ LEAQ(Mem{Base: oup}.Offset(16), oup)
+ MOVO(B0, A0)
+ MOVO(C0, B0)
+ MOVO(D0, C0)
+ JMP(LabelRef("openSSETail64DecLoop"))
+}
+
+func openSSETail64DecLoopDone() {
+ Label("openSSETail64DecLoopDone")
+ MOVO(A0, A1)
+ JMP(LabelRef("openSSETail16"))
+}
+
+// ----------------------------------------------------------------------------
+// Special optimization for the last 128 bytes of ciphertext
+
+// Need to decrypt up to 128 bytes - prepare two blocks
+func openSSETail128() {
+ Label("openSSETail128")
+ chacha20Constants := chacha20Constants_DATA()
+ MOVO(chacha20Constants, A1)
+ MOVO(state1Store, B1)
+ MOVO(state2Store, C1)
+ MOVO(ctr3Store, D1)
+ sseIncMask := sseIncMask_DATA()
+ PADDL(sseIncMask, D1)
+ MOVO(D1, ctr0Store)
+ MOVO(A1, A0)
+ MOVO(B1, B0)
+ MOVO(C1, C0)
+ MOVO(D1, D0)
+ PADDL(sseIncMask, D0)
+ MOVO(D0, ctr1Store)
+ XORQ(itr2, itr2)
+ MOVQ(inl, itr1)
+ ANDQ(I8(-16), itr1)
+}
+
+// Perform ChaCha rounds, while hashing the remaining input
+func openSSETail128LoopA() {
+ Label("openSSETail128LoopA")
+ polyAdd(Mem{Base: inp, Index: itr2, Scale: 1}.Offset(0))
+ polyMul()
+}
+
+func openSSETail128LoopB() {
+ Label("openSSETail128LoopB")
+ ADDQ(Imm(16), itr2)
+ chachaQR(A0, B0, C0, D0, T0)
+ chachaQR(A1, B1, C1, D1, T0)
+ shiftB0Left()
+ shiftC0Left()
+ shiftD0Left()
+ shiftB1Left()
+ shiftC1Left()
+ shiftD1Left()
+ chachaQR(A0, B0, C0, D0, T0)
+ chachaQR(A1, B1, C1, D1, T0)
+ shiftB0Right()
+ shiftC0Right()
+ shiftD0Right()
+ shiftB1Right()
+ shiftC1Right()
+ shiftD1Right()
+
+ CMPQ(itr2, itr1)
+ JB(LabelRef("openSSETail128LoopA"))
+
+ CMPQ(itr2, Imm(160))
+ JNE(LabelRef("openSSETail128LoopB"))
+
+ chacha20Constants := chacha20Constants_DATA()
+ PADDL(chacha20Constants, A0)
+ PADDL(chacha20Constants, A1)
+ PADDL(state1Store, B0)
+ PADDL(state1Store, B1)
+ PADDL(state2Store, C0)
+ PADDL(state2Store, C1)
+ PADDL(ctr1Store, D0)
+ PADDL(ctr0Store, D1)
+
+ MOVOU(Mem{Base: inp}.Offset(0*16), T0)
+ MOVOU(Mem{Base: inp}.Offset(1*16), T1)
+ MOVOU(Mem{Base: inp}.Offset(2*16), T2)
+ MOVOU(Mem{Base: inp}.Offset(3*16), T3)
+ PXOR(T0, A1)
+ PXOR(T1, B1)
+ PXOR(T2, C1)
+ PXOR(T3, D1)
+ MOVOU(A1, Mem{Base: oup}.Offset(0*16))
+ MOVOU(B1, Mem{Base: oup}.Offset(1*16))
+ MOVOU(C1, Mem{Base: oup}.Offset(2*16))
+ MOVOU(D1, Mem{Base: oup}.Offset(3*16))
+
+ SUBQ(Imm(64), inl)
+ LEAQ(Mem{Base: inp}.Offset(64), inp)
+ LEAQ(Mem{Base: oup}.Offset(64), oup)
+ JMP(LabelRef("openSSETail64DecLoop"))
+}
+
+// ----------------------------------------------------------------------------
+// Special optimization for the last 192 bytes of ciphertext
+
+// Need to decrypt up to 192 bytes - prepare three blocks
+func openSSETail192() {
+ Label("openSSETail192")
+ chacha20Constants := chacha20Constants_DATA()
+ MOVO(chacha20Constants, A2)
+ MOVO(state1Store, B2)
+ MOVO(state2Store, C2)
+ MOVO(ctr3Store, D2)
+ sseIncMask := sseIncMask_DATA()
+ PADDL(sseIncMask, D2)
+ MOVO(D2, ctr0Store)
+ MOVO(A2, A1)
+ MOVO(B2, B1)
+ MOVO(C2, C1)
+ MOVO(D2, D1)
+ PADDL(sseIncMask, D1)
+ MOVO(D1, ctr1Store)
+ MOVO(A1, A0)
+ MOVO(B1, B0)
+ MOVO(C1, C0)
+ MOVO(D1, D0)
+ PADDL(sseIncMask, D0)
+ MOVO(D0, ctr2Store)
+
+ MOVQ(inl, itr1)
+ MOVQ(U32(160), itr2)
+ CMPQ(itr1, Imm(160))
+ CMOVQGT(itr2, itr1)
+ ANDQ(I8(-16), itr1)
+ XORQ(itr2, itr2)
+}
+
+// Perform ChaCha rounds, while hashing the remaining input
+func openSSLTail192LoopA() {
+ Label("openSSLTail192LoopA")
+ polyAdd(Mem{Base: inp, Index: itr2, Scale: 1}.Offset(0))
+ polyMul()
+}
+
+func openSSLTail192LoopB() {
+ Label("openSSLTail192LoopB")
+ ADDQ(Imm(16), itr2)
+ chachaQR(A0, B0, C0, D0, T0)
+ chachaQR(A1, B1, C1, D1, T0)
+ chachaQR(A2, B2, C2, D2, T0)
+ shiftB0Left()
+ shiftC0Left()
+ shiftD0Left()
+ shiftB1Left()
+ shiftC1Left()
+ shiftD1Left()
+ shiftB2Left()
+ shiftC2Left()
+ shiftD2Left()
+
+ chachaQR(A0, B0, C0, D0, T0)
+ chachaQR(A1, B1, C1, D1, T0)
+ chachaQR(A2, B2, C2, D2, T0)
+ shiftB0Right()
+ shiftC0Right()
+ shiftD0Right()
+ shiftB1Right()
+ shiftC1Right()
+ shiftD1Right()
+ shiftB2Right()
+ shiftC2Right()
+ shiftD2Right()
+
+ CMPQ(itr2, itr1)
+ JB(LabelRef("openSSLTail192LoopA"))
+
+ CMPQ(itr2, Imm(160))
+ JNE(LabelRef("openSSLTail192LoopB"))
+
+ CMPQ(inl, Imm(176))
+ JB(LabelRef("openSSLTail192Store"))
+
+ polyAdd(Mem{Base: inp}.Offset(160))
+ polyMul()
+
+ CMPQ(inl, Imm(192))
+ JB(LabelRef("openSSLTail192Store"))
+
+ polyAdd(Mem{Base: inp}.Offset(176))
+ polyMul()
+}
+
+func openSSLTail192Store() {
+ Label("openSSLTail192Store")
+ chacha20Constants := chacha20Constants_DATA()
+ PADDL(chacha20Constants, A0)
+ PADDL(chacha20Constants, A1)
+ PADDL(chacha20Constants, A2)
+ PADDL(state1Store, B0)
+ PADDL(state1Store, B1)
+ PADDL(state1Store, B2)
+ PADDL(state2Store, C0)
+ PADDL(state2Store, C1)
+ PADDL(state2Store, C2)
+ PADDL(ctr2Store, D0)
+ PADDL(ctr1Store, D1)
+ PADDL(ctr0Store, D2)
+
+ MOVOU(Mem{Base: inp}.Offset(0*16), T0)
+ MOVOU(Mem{Base: inp}.Offset(1*16), T1)
+ MOVOU(Mem{Base: inp}.Offset(2*16), T2)
+ MOVOU(Mem{Base: inp}.Offset(3*16), T3)
+ PXOR(T0, A2)
+ PXOR(T1, B2)
+ PXOR(T2, C2)
+ PXOR(T3, D2)
+ MOVOU(A2, Mem{Base: oup}.Offset(0*16))
+ MOVOU(B2, Mem{Base: oup}.Offset(1*16))
+ MOVOU(C2, Mem{Base: oup}.Offset(2*16))
+ MOVOU(D2, Mem{Base: oup}.Offset(3*16))
+
+ MOVOU(Mem{Base: inp}.Offset(4*16), T0)
+ MOVOU(Mem{Base: inp}.Offset(5*16), T1)
+ MOVOU(Mem{Base: inp}.Offset(6*16), T2)
+ MOVOU(Mem{Base: inp}.Offset(7*16), T3)
+ PXOR(T0, A1)
+ PXOR(T1, B1)
+ PXOR(T2, C1)
+ PXOR(T3, D1)
+ MOVOU(A1, Mem{Base: oup}.Offset(4*16))
+ MOVOU(B1, Mem{Base: oup}.Offset(5*16))
+ MOVOU(C1, Mem{Base: oup}.Offset(6*16))
+ MOVOU(D1, Mem{Base: oup}.Offset(7*16))
+
+ SUBQ(Imm(128), inl)
+ LEAQ(Mem{Base: inp}.Offset(128), inp)
+ LEAQ(Mem{Base: oup}.Offset(128), oup)
+ JMP(LabelRef("openSSETail64DecLoop"))
+}
+
+// ----------------------------------------------------------------------------
+// Special optimization for the last 256 bytes of ciphertext
+
+// Need to decrypt up to 256 bytes - prepare four blocks
+func openSSETail256() {
+ Label("openSSETail256")
+ chacha20Constants := chacha20Constants_DATA()
+ MOVO(chacha20Constants, A0)
+ MOVO(state1Store, B0)
+ MOVO(state2Store, C0)
+ MOVO(ctr3Store, D0)
+ sseIncMask := sseIncMask_DATA()
+ PADDL(sseIncMask, D0)
+ MOVO(A0, A1)
+ MOVO(B0, B1)
+ MOVO(C0, C1)
+ MOVO(D0, D1)
+ PADDL(sseIncMask, D1)
+ MOVO(A1, A2)
+ MOVO(B1, B2)
+ MOVO(C1, C2)
+ MOVO(D1, D2)
+ PADDL(sseIncMask, D2)
+ MOVO(A2, A3)
+ MOVO(B2, B3)
+ MOVO(C2, C3)
+ MOVO(D2, D3)
+ PADDL(sseIncMask, D3)
+
+ Comment("Store counters")
+ MOVO(D0, ctr0Store)
+ MOVO(D1, ctr1Store)
+ MOVO(D2, ctr2Store)
+ MOVO(D3, ctr3Store)
+ XORQ(itr2, itr2)
+}
+
+// This loop interleaves 8 ChaCha quarter rounds with 1 poly multiplication
+func openSSETail256Loop() {
+ Label("openSSETail256Loop")
+ polyAdd(Mem{Base: inp, Index: itr2, Scale: 1}.Offset(0))
+ MOVO(C3, tmpStore)
+ chachaQR(A0, B0, C0, D0, C3)
+ chachaQR(A1, B1, C1, D1, C3)
+ chachaQR(A2, B2, C2, D2, C3)
+ MOVO(tmpStore, C3)
+ MOVO(C1, tmpStore)
+ chachaQR(A3, B3, C3, D3, C1)
+ MOVO(tmpStore, C1)
+ shiftB0Left()
+ shiftB1Left()
+ shiftB2Left()
+ shiftB3Left()
+ shiftC0Left()
+ shiftC1Left()
+ shiftC2Left()
+ shiftC3Left()
+ shiftD0Left()
+ shiftD1Left()
+ shiftD2Left()
+ shiftD3Left()
+ polyMulStage1()
+ polyMulStage2()
+ MOVO(C3, tmpStore)
+ chachaQR(A0, B0, C0, D0, C3)
+ chachaQR(A1, B1, C1, D1, C3)
+ chachaQR(A2, B2, C2, D2, C3)
+ MOVO(tmpStore, C3)
+ MOVO(C1, tmpStore)
+ chachaQR(A3, B3, C3, D3, C1)
+ MOVO(tmpStore, C1)
+ polyMulStage3()
+ polyMulReduceStage()
+ shiftB0Right()
+ shiftB1Right()
+ shiftB2Right()
+ shiftB3Right()
+ shiftC0Right()
+ shiftC1Right()
+ shiftC2Right()
+ shiftC3Right()
+ shiftD0Right()
+ shiftD1Right()
+ shiftD2Right()
+ shiftD3Right()
+ ADDQ(Imm(2*8), itr2)
+ CMPQ(itr2, Imm(160))
+ JB(LabelRef("openSSETail256Loop"))
+ MOVQ(inl, itr1)
+ ANDQ(I8(-16), itr1)
+}
+
+func openSSETail256HashLoop() {
+ Label("openSSETail256HashLoop")
+ polyAdd(Mem{Base: inp, Index: itr2, Scale: 1}.Offset(0))
+ polyMul()
+ ADDQ(Imm(2*8), itr2)
+ CMPQ(itr2, itr1)
+ JB(LabelRef("openSSETail256HashLoop"))
+
+ Comment("Add in the state")
+ chacha20Constants := chacha20Constants_DATA()
+ PADDD(chacha20Constants, A0)
+ PADDD(chacha20Constants, A1)
+ PADDD(chacha20Constants, A2)
+ PADDD(chacha20Constants, A3)
+ PADDD(state1Store, B0)
+ PADDD(state1Store, B1)
+ PADDD(state1Store, B2)
+ PADDD(state1Store, B3)
+ PADDD(state2Store, C0)
+ PADDD(state2Store, C1)
+ PADDD(state2Store, C2)
+ PADDD(state2Store, C3)
+ PADDD(ctr0Store, D0)
+ PADDD(ctr1Store, D1)
+ PADDD(ctr2Store, D2)
+ PADDD(ctr3Store, D3)
+ MOVO(D3, tmpStore)
+
+ Comment("Load - xor - store")
+ MOVOU(Mem{Base: inp}.Offset(0*16), D3)
+ PXOR(D3, A0)
+ MOVOU(Mem{Base: inp}.Offset(1*16), D3)
+ PXOR(D3, B0)
+ MOVOU(Mem{Base: inp}.Offset(2*16), D3)
+ PXOR(D3, C0)
+ MOVOU(Mem{Base: inp}.Offset(3*16), D3)
+ PXOR(D3, D0)
+ MOVOU(A0, Mem{Base: oup}.Offset(0*16))
+ MOVOU(B0, Mem{Base: oup}.Offset(1*16))
+ MOVOU(C0, Mem{Base: oup}.Offset(2*16))
+ MOVOU(D0, Mem{Base: oup}.Offset(3*16))
+ MOVOU(Mem{Base: inp}.Offset(4*16), A0)
+ MOVOU(Mem{Base: inp}.Offset(5*16), B0)
+ MOVOU(Mem{Base: inp}.Offset(6*16), C0)
+ MOVOU(Mem{Base: inp}.Offset(7*16), D0)
+ PXOR(A0, A1)
+ PXOR(B0, B1)
+ PXOR(C0, C1)
+ PXOR(D0, D1)
+ MOVOU(A1, Mem{Base: oup}.Offset(4*16))
+ MOVOU(B1, Mem{Base: oup}.Offset(5*16))
+ MOVOU(C1, Mem{Base: oup}.Offset(6*16))
+ MOVOU(D1, Mem{Base: oup}.Offset(7*16))
+ MOVOU(Mem{Base: inp}.Offset(8*16), A0)
+ MOVOU(Mem{Base: inp}.Offset(9*16), B0)
+ MOVOU(Mem{Base: inp}.Offset(10*16), C0)
+ MOVOU(Mem{Base: inp}.Offset(11*16), D0)
+ PXOR(A0, A2)
+ PXOR(B0, B2)
+ PXOR(C0, C2)
+ PXOR(D0, D2)
+ MOVOU(A2, Mem{Base: oup}.Offset(8*16))
+ MOVOU(B2, Mem{Base: oup}.Offset(9*16))
+ MOVOU(C2, Mem{Base: oup}.Offset(10*16))
+ MOVOU(D2, Mem{Base: oup}.Offset(11*16))
+ LEAQ(Mem{Base: inp}.Offset(192), inp)
+ LEAQ(Mem{Base: oup}.Offset(192), oup)
+ SUBQ(Imm(192), inl)
+ MOVO(A3, A0)
+ MOVO(B3, B0)
+ MOVO(C3, C0)
+ MOVO(tmpStore, D0)
+
+ JMP(LabelRef("openSSETail64DecLoop"))
+}
+
+// Functions to emit AVX instructions via BYTE directive
+
+// broadcasti128 16(r8), ymm14
+func VBROADCASTI128_16_R8_YMM14() {
+ BYTE(U8(0xc4))
+ BYTE(U8(0x42))
+ BYTE(U8(0x7d))
+ BYTE(U8(0x5a))
+ BYTE(U8(0x70))
+ BYTE(U8(0x10))
+}
+
+// broadcasti128 32(r8), ymm12
+func VBROADCASTI128_32_R8_YMM12() {
+ BYTE(U8(0xc4))
+ BYTE(U8(0x42))
+ BYTE(U8(0x7d))
+ BYTE(U8(0x5a))
+ BYTE(U8(0x60))
+ BYTE(U8(0x20))
+}
+
+// broadcasti128 48(r8), ymm4
+func VBROADCASTI128_48_R8_YMM4() {
+ BYTE(U8(0xc4))
+ BYTE(U8(0xc2))
+ BYTE(U8(0x7d))
+ BYTE(U8(0x5a))
+ BYTE(U8(0x60))
+ BYTE(U8(0x30))
+}
+
+// ----------------------------------------------------------------------------
+// ------------------------- AVX2 Code ----------------------------------------
+
+func chacha20Poly1305Open_AVX2() {
+ Label("chacha20Poly1305Open_AVX2")
+ VZEROUPPER()
+ chacha20Constants := chacha20Constants_DATA()
+ VMOVDQU(chacha20Constants, AA0)
+ VBROADCASTI128_16_R8_YMM14()
+ VBROADCASTI128_32_R8_YMM12()
+ VBROADCASTI128_48_R8_YMM4()
+ avx2InitMask := avx2InitMask_DATA()
+ VPADDD(avx2InitMask, DD0, DD0)
+
+ Comment("Special optimization, for very short buffers")
+ CMPQ(inl, Imm(192))
+ JBE(LabelRef("openAVX2192"))
+ CMPQ(inl, U32(320))
+ JBE(LabelRef("openAVX2320"))
+
+ Comment("For the general key prepare the key first - as a byproduct we have 64 bytes of cipher stream")
+ VMOVDQA(BB0, state1StoreAVX2)
+ VMOVDQA(CC0, state2StoreAVX2)
+ VMOVDQA(DD0, ctr3StoreAVX2)
+ MOVQ(U32(10), itr2)
+}
+
+func openAVX2PreparePolyKey() {
+ Label("openAVX2PreparePolyKey")
+ chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
+ VPALIGNR(Imm(4), BB0, BB0, BB0)
+ VPALIGNR(Imm(8), CC0, CC0, CC0)
+ VPALIGNR(Imm(12), DD0, DD0, DD0)
+ chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
+ VPALIGNR(Imm(12), BB0, BB0, BB0)
+ VPALIGNR(Imm(8), CC0, CC0, CC0)
+ VPALIGNR(Imm(4), DD0, DD0, DD0)
+ DECQ(itr2)
+ JNE(LabelRef("openAVX2PreparePolyKey"))
+
+ chacha20Constants := chacha20Constants_DATA()
+ VPADDD(chacha20Constants, AA0, AA0)
+ VPADDD(state1StoreAVX2, BB0, BB0)
+ VPADDD(state2StoreAVX2, CC0, CC0)
+ VPADDD(ctr3StoreAVX2, DD0, DD0)
+
+ VPERM2I128(Imm(0x02), AA0, BB0, TT0)
+
+ Comment("Clamp and store poly key")
+ polyClampMask := polyClampMask_DATA()
+ VPAND(polyClampMask, TT0, TT0)
+ VMOVDQA(TT0, rsStoreAVX2)
+
+ Comment("Stream for the first 64 bytes")
+ VPERM2I128(Imm(0x13), AA0, BB0, AA0)
+ VPERM2I128(Imm(0x13), CC0, DD0, BB0)
+
+ Comment("Hash AD + first 64 bytes")
+ // MOVQ ad_len+80(FP), itr2
+ MOVQ(NewParamAddr("ad_len", 80), itr2)
+ CALL(LabelRef("polyHashADInternal<>(SB)"))
+ XORQ(itr1, itr1)
+}
+
+func openAVX2InitialHash64() {
+ Label("openAVX2InitialHash64")
+ // polyAdd(0(inp)(itr1*1))
+ polyAdd(Mem{Base: inp, Index: itr1, Scale: 1}.Offset(0))
+ polyMulAVX2()
+ ADDQ(Imm(16), itr1)
+ CMPQ(itr1, Imm(64))
+ JNE(LabelRef("openAVX2InitialHash64"))
+
+ Comment("Decrypt the first 64 bytes")
+ VPXOR(Mem{Base: inp}.Offset(0*32), AA0, AA0)
+ VPXOR(Mem{Base: inp}.Offset(1*32), BB0, BB0)
+ VMOVDQU(AA0, Mem{Base: oup}.Offset(0*32))
+ VMOVDQU(BB0, Mem{Base: oup}.Offset(1*32))
+ LEAQ(Mem{Base: inp}.Offset(2*32), inp)
+ LEAQ(Mem{Base: oup}.Offset(2*32), oup)
+ SUBQ(Imm(64), inl)
+}
+
+func openAVX2MainLoop() {
+ Label("openAVX2MainLoop")
+ CMPQ(inl, U32(512))
+ JB(LabelRef("openAVX2MainLoopDone"))
+
+ Comment("Load state, increment counter blocks, store the incremented counters")
+ chacha20Constants := chacha20Constants_DATA()
+ VMOVDQU(chacha20Constants, AA0)
+ VMOVDQA(AA0, AA1)
+ VMOVDQA(AA0, AA2)
+ VMOVDQA(AA0, AA3)
+ VMOVDQA(state1StoreAVX2, BB0)
+ VMOVDQA(BB0, BB1)
+ VMOVDQA(BB0, BB2)
+ VMOVDQA(BB0, BB3)
+ VMOVDQA(state2StoreAVX2, CC0)
+ VMOVDQA(CC0, CC1)
+ VMOVDQA(CC0, CC2)
+ VMOVDQA(CC0, CC3)
+ VMOVDQA(ctr3StoreAVX2, DD0)
+ avx2IncMask := avx2IncMask_DATA()
+ VPADDD(avx2IncMask, DD0, DD0)
+ VPADDD(avx2IncMask, DD0, DD1)
+ VPADDD(avx2IncMask, DD1, DD2)
+ VPADDD(avx2IncMask, DD2, DD3)
+ VMOVDQA(DD0, ctr0StoreAVX2)
+ VMOVDQA(DD1, ctr1StoreAVX2)
+ VMOVDQA(DD2, ctr2StoreAVX2)
+ VMOVDQA(DD3, ctr3StoreAVX2)
+ XORQ(itr1, itr1)
+}
+
+// Let's just say this spaghetti loop interleaves 2 quarter rounds with 3 poly multiplications
+// Effectively per 512 bytes of stream we hash 480 bytes of ciphertext
+func openAVX2InternalLoop() {
+ Label("openAVX2InternalLoop")
+ polyAdd(Mem{Base: inp, Index: itr1, Scale: 1}.Offset(0 * 8))
+ VPADDD(BB0, AA0, AA0)
+ VPADDD(BB1, AA1, AA1)
+ VPADDD(BB2, AA2, AA2)
+ VPADDD(BB3, AA3, AA3)
+ polyMulStage1_AVX2()
+ VPXOR(AA0, DD0, DD0)
+ VPXOR(AA1, DD1, DD1)
+ VPXOR(AA2, DD2, DD2)
+ VPXOR(AA3, DD3, DD3)
+ rol16 := rol16_DATA()
+ VPSHUFB(rol16, DD0, DD0)
+ VPSHUFB(rol16, DD1, DD1)
+ VPSHUFB(rol16, DD2, DD2)
+ VPSHUFB(rol16, DD3, DD3)
+ polyMulStage2_AVX2()
+ VPADDD(DD0, CC0, CC0)
+ VPADDD(DD1, CC1, CC1)
+ VPADDD(DD2, CC2, CC2)
+ VPADDD(DD3, CC3, CC3)
+ VPXOR(CC0, BB0, BB0)
+ VPXOR(CC1, BB1, BB1)
+ VPXOR(CC2, BB2, BB2)
+ VPXOR(CC3, BB3, BB3)
+ polyMulStage3_AVX2()
+ VMOVDQA(CC3, tmpStoreAVX2)
+ VPSLLD(Imm(12), BB0, CC3)
+ VPSRLD(Imm(20), BB0, BB0)
+ VPXOR(CC3, BB0, BB0)
+ VPSLLD(Imm(12), BB1, CC3)
+ VPSRLD(Imm(20), BB1, BB1)
+ VPXOR(CC3, BB1, BB1)
+ VPSLLD(Imm(12), BB2, CC3)
+ VPSRLD(Imm(20), BB2, BB2)
+ VPXOR(CC3, BB2, BB2)
+ VPSLLD(Imm(12), BB3, CC3)
+ VPSRLD(Imm(20), BB3, BB3)
+ VPXOR(CC3, BB3, BB3)
+ VMOVDQA(tmpStoreAVX2, CC3)
+ polyMulReduceStage()
+ VPADDD(BB0, AA0, AA0)
+ VPADDD(BB1, AA1, AA1)
+ VPADDD(BB2, AA2, AA2)
+ VPADDD(BB3, AA3, AA3)
+ VPXOR(AA0, DD0, DD0)
+ VPXOR(AA1, DD1, DD1)
+ VPXOR(AA2, DD2, DD2)
+ VPXOR(AA3, DD3, DD3)
+ rol8 := rol8_DATA()
+ VPSHUFB(rol8, DD0, DD0)
+ VPSHUFB(rol8, DD1, DD1)
+ VPSHUFB(rol8, DD2, DD2)
+ VPSHUFB(rol8, DD3, DD3)
+ polyAdd(Mem{Base: inp, Index: itr1, Scale: 1}.Offset(2 * 8))
+ VPADDD(DD0, CC0, CC0)
+ VPADDD(DD1, CC1, CC1)
+ VPADDD(DD2, CC2, CC2)
+ VPADDD(DD3, CC3, CC3)
+ polyMulStage1_AVX2()
+ VPXOR(CC0, BB0, BB0)
+ VPXOR(CC1, BB1, BB1)
+ VPXOR(CC2, BB2, BB2)
+ VPXOR(CC3, BB3, BB3)
+ VMOVDQA(CC3, tmpStoreAVX2)
+ VPSLLD(Imm(7), BB0, CC3)
+ VPSRLD(Imm(25), BB0, BB0)
+ VPXOR(CC3, BB0, BB0)
+ VPSLLD(Imm(7), BB1, CC3)
+ VPSRLD(Imm(25), BB1, BB1)
+ VPXOR(CC3, BB1, BB1)
+ VPSLLD(Imm(7), BB2, CC3)
+ VPSRLD(Imm(25), BB2, BB2)
+ VPXOR(CC3, BB2, BB2)
+ VPSLLD(Imm(7), BB3, CC3)
+ VPSRLD(Imm(25), BB3, BB3)
+ VPXOR(CC3, BB3, BB3)
+ VMOVDQA(tmpStoreAVX2, CC3)
+ polyMulStage2_AVX2()
+ VPALIGNR(Imm(4), BB0, BB0, BB0)
+ VPALIGNR(Imm(4), BB1, BB1, BB1)
+ VPALIGNR(Imm(4), BB2, BB2, BB2)
+ VPALIGNR(Imm(4), BB3, BB3, BB3)
+ VPALIGNR(Imm(8), CC0, CC0, CC0)
+ VPALIGNR(Imm(8), CC1, CC1, CC1)
+ VPALIGNR(Imm(8), CC2, CC2, CC2)
+ VPALIGNR(Imm(8), CC3, CC3, CC3)
+ VPALIGNR(Imm(12), DD0, DD0, DD0)
+ VPALIGNR(Imm(12), DD1, DD1, DD1)
+ VPALIGNR(Imm(12), DD2, DD2, DD2)
+ VPALIGNR(Imm(12), DD3, DD3, DD3)
+ VPADDD(BB0, AA0, AA0)
+ VPADDD(BB1, AA1, AA1)
+ VPADDD(BB2, AA2, AA2)
+ VPADDD(BB3, AA3, AA3)
+ polyMulStage3_AVX2()
+ VPXOR(AA0, DD0, DD0)
+ VPXOR(AA1, DD1, DD1)
+ VPXOR(AA2, DD2, DD2)
+ VPXOR(AA3, DD3, DD3)
+ VPSHUFB(rol16, DD0, DD0)
+ VPSHUFB(rol16, DD1, DD1)
+ VPSHUFB(rol16, DD2, DD2)
+ VPSHUFB(rol16, DD3, DD3)
+ polyMulReduceStage()
+ VPADDD(DD0, CC0, CC0)
+ VPADDD(DD1, CC1, CC1)
+ VPADDD(DD2, CC2, CC2)
+ VPADDD(DD3, CC3, CC3)
+ VPXOR(CC0, BB0, BB0)
+ VPXOR(CC1, BB1, BB1)
+ VPXOR(CC2, BB2, BB2)
+ VPXOR(CC3, BB3, BB3)
+ polyAdd(Mem{Base: inp, Index: itr1, Scale: 1}.Offset(4 * 8))
+ LEAQ(Mem{Base: itr1}.Offset(6*8), itr1)
+ VMOVDQA(CC3, tmpStoreAVX2)
+ VPSLLD(Imm(12), BB0, CC3)
+ VPSRLD(Imm(20), BB0, BB0)
+ VPXOR(CC3, BB0, BB0)
+ VPSLLD(Imm(12), BB1, CC3)
+ VPSRLD(Imm(20), BB1, BB1)
+ VPXOR(CC3, BB1, BB1)
+ VPSLLD(Imm(12), BB2, CC3)
+ VPSRLD(Imm(20), BB2, BB2)
+ VPXOR(CC3, BB2, BB2)
+ VPSLLD(Imm(12), BB3, CC3)
+ VPSRLD(Imm(20), BB3, BB3)
+ VPXOR(CC3, BB3, BB3)
+ VMOVDQA(tmpStoreAVX2, CC3)
+ polyMulStage1_AVX2()
+ VPADDD(BB0, AA0, AA0)
+ VPADDD(BB1, AA1, AA1)
+ VPADDD(BB2, AA2, AA2)
+ VPADDD(BB3, AA3, AA3)
+ VPXOR(AA0, DD0, DD0)
+ VPXOR(AA1, DD1, DD1)
+ VPXOR(AA2, DD2, DD2)
+ VPXOR(AA3, DD3, DD3)
+ polyMulStage2_AVX2()
+ VPSHUFB(rol8, DD0, DD0)
+ VPSHUFB(rol8, DD1, DD1)
+ VPSHUFB(rol8, DD2, DD2)
+ VPSHUFB(rol8, DD3, DD3)
+ VPADDD(DD0, CC0, CC0)
+ VPADDD(DD1, CC1, CC1)
+ VPADDD(DD2, CC2, CC2)
+ VPADDD(DD3, CC3, CC3)
+ polyMulStage3_AVX2()
+ VPXOR(CC0, BB0, BB0)
+ VPXOR(CC1, BB1, BB1)
+ VPXOR(CC2, BB2, BB2)
+ VPXOR(CC3, BB3, BB3)
+ VMOVDQA(CC3, tmpStoreAVX2)
+ VPSLLD(Imm(7), BB0, CC3)
+ VPSRLD(Imm(25), BB0, BB0)
+ VPXOR(CC3, BB0, BB0)
+ VPSLLD(Imm(7), BB1, CC3)
+ VPSRLD(Imm(25), BB1, BB1)
+ VPXOR(CC3, BB1, BB1)
+ VPSLLD(Imm(7), BB2, CC3)
+ VPSRLD(Imm(25), BB2, BB2)
+ VPXOR(CC3, BB2, BB2)
+ VPSLLD(Imm(7), BB3, CC3)
+ VPSRLD(Imm(25), BB3, BB3)
+ VPXOR(CC3, BB3, BB3)
+ VMOVDQA(tmpStoreAVX2, CC3)
+ polyMulReduceStage()
+ VPALIGNR(Imm(12), BB0, BB0, BB0)
+ VPALIGNR(Imm(12), BB1, BB1, BB1)
+ VPALIGNR(Imm(12), BB2, BB2, BB2)
+ VPALIGNR(Imm(12), BB3, BB3, BB3)
+ VPALIGNR(Imm(8), CC0, CC0, CC0)
+ VPALIGNR(Imm(8), CC1, CC1, CC1)
+ VPALIGNR(Imm(8), CC2, CC2, CC2)
+ VPALIGNR(Imm(8), CC3, CC3, CC3)
+ VPALIGNR(Imm(4), DD0, DD0, DD0)
+ VPALIGNR(Imm(4), DD1, DD1, DD1)
+ VPALIGNR(Imm(4), DD2, DD2, DD2)
+ VPALIGNR(Imm(4), DD3, DD3, DD3)
+ CMPQ(itr1, U32(480))
+ JNE(LabelRef("openAVX2InternalLoop"))
+
+ chacha20Constants := chacha20Constants_DATA()
+ VPADDD(chacha20Constants, AA0, AA0)
+ VPADDD(chacha20Constants, AA1, AA1)
+ VPADDD(chacha20Constants, AA2, AA2)
+ VPADDD(chacha20Constants, AA3, AA3)
+ VPADDD(state1StoreAVX2, BB0, BB0)
+ VPADDD(state1StoreAVX2, BB1, BB1)
+ VPADDD(state1StoreAVX2, BB2, BB2)
+ VPADDD(state1StoreAVX2, BB3, BB3)
+ VPADDD(state2StoreAVX2, CC0, CC0)
+ VPADDD(state2StoreAVX2, CC1, CC1)
+ VPADDD(state2StoreAVX2, CC2, CC2)
+ VPADDD(state2StoreAVX2, CC3, CC3)
+ VPADDD(ctr0StoreAVX2, DD0, DD0)
+ VPADDD(ctr1StoreAVX2, DD1, DD1)
+ VPADDD(ctr2StoreAVX2, DD2, DD2)
+ VPADDD(ctr3StoreAVX2, DD3, DD3)
+ VMOVDQA(CC3, tmpStoreAVX2)
+
+ Comment("We only hashed 480 of the 512 bytes available - hash the remaining 32 here")
+ polyAdd(Mem{Base: inp}.Offset(480))
+ polyMulAVX2()
+ VPERM2I128(Imm(0x02), AA0, BB0, CC3)
+ VPERM2I128(Imm(0x13), AA0, BB0, BB0)
+ VPERM2I128(Imm(0x02), CC0, DD0, AA0)
+ VPERM2I128(Imm(0x13), CC0, DD0, CC0)
+ VPXOR(Mem{Base: inp}.Offset(0*32), CC3, CC3)
+ VPXOR(Mem{Base: inp}.Offset(1*32), AA0, AA0)
+ VPXOR(Mem{Base: inp}.Offset(2*32), BB0, BB0)
+ VPXOR(Mem{Base: inp}.Offset(3*32), CC0, CC0)
+ VMOVDQU(CC3, Mem{Base: oup}.Offset(0*32))
+ VMOVDQU(AA0, Mem{Base: oup}.Offset(1*32))
+ VMOVDQU(BB0, Mem{Base: oup}.Offset(2*32))
+ VMOVDQU(CC0, Mem{Base: oup}.Offset(3*32))
+ VPERM2I128(Imm(0x02), AA1, BB1, AA0)
+ VPERM2I128(Imm(0x02), CC1, DD1, BB0)
+ VPERM2I128(Imm(0x13), AA1, BB1, CC0)
+ VPERM2I128(Imm(0x13), CC1, DD1, DD0)
+ VPXOR(Mem{Base: inp}.Offset(4*32), AA0, AA0)
+ VPXOR(Mem{Base: inp}.Offset(5*32), BB0, BB0)
+ VPXOR(Mem{Base: inp}.Offset(6*32), CC0, CC0)
+ VPXOR(Mem{Base: inp}.Offset(7*32), DD0, DD0)
+ VMOVDQU(AA0, Mem{Base: oup}.Offset(4*32))
+ VMOVDQU(BB0, Mem{Base: oup}.Offset(5*32))
+ VMOVDQU(CC0, Mem{Base: oup}.Offset(6*32))
+ VMOVDQU(DD0, Mem{Base: oup}.Offset(7*32))
+
+ Comment("and here")
+ polyAdd(Mem{Base: inp}.Offset(496))
+ polyMulAVX2()
+ VPERM2I128(Imm(0x02), AA2, BB2, AA0)
+ VPERM2I128(Imm(0x02), CC2, DD2, BB0)
+ VPERM2I128(Imm(0x13), AA2, BB2, CC0)
+ VPERM2I128(Imm(0x13), CC2, DD2, DD0)
+ VPXOR(Mem{Base: inp}.Offset(8*32), AA0, AA0)
+ VPXOR(Mem{Base: inp}.Offset(9*32), BB0, BB0)
+ VPXOR(Mem{Base: inp}.Offset(10*32), CC0, CC0)
+ VPXOR(Mem{Base: inp}.Offset(11*32), DD0, DD0)
+ VMOVDQU(AA0, Mem{Base: oup}.Offset(8*32))
+ VMOVDQU(BB0, Mem{Base: oup}.Offset(9*32))
+ VMOVDQU(CC0, Mem{Base: oup}.Offset(10*32))
+ VMOVDQU(DD0, Mem{Base: oup}.Offset(11*32))
+ VPERM2I128(Imm(0x02), AA3, BB3, AA0)
+ VPERM2I128(Imm(0x02), tmpStoreAVX2, DD3, BB0)
+ VPERM2I128(Imm(0x13), AA3, BB3, CC0)
+ VPERM2I128(Imm(0x13), tmpStoreAVX2, DD3, DD0)
+ VPXOR(Mem{Base: inp}.Offset(12*32), AA0, AA0)
+ VPXOR(Mem{Base: inp}.Offset(13*32), BB0, BB0)
+ VPXOR(Mem{Base: inp}.Offset(14*32), CC0, CC0)
+ VPXOR(Mem{Base: inp}.Offset(15*32), DD0, DD0)
+ VMOVDQU(AA0, Mem{Base: oup}.Offset(12*32))
+ VMOVDQU(BB0, Mem{Base: oup}.Offset(13*32))
+ VMOVDQU(CC0, Mem{Base: oup}.Offset(14*32))
+ VMOVDQU(DD0, Mem{Base: oup}.Offset(15*32))
+ LEAQ(Mem{Base: inp}.Offset(32*16), inp)
+ LEAQ(Mem{Base: oup}.Offset(32*16), oup)
+ SUBQ(U32(32*16), inl)
+ JMP(LabelRef("openAVX2MainLoop"))
+}
+
+// Handle the various tail sizes efficiently
+func openAVX2MainLoopDone() {
+ Label("openAVX2MainLoopDone")
+ Comment("Handle the various tail sizes efficiently")
+ TESTQ(inl, inl)
+ JE(LabelRef("openSSEFinalize"))
+ CMPQ(inl, Imm(128))
+ JBE(LabelRef("openAVX2Tail128"))
+ CMPQ(inl, U32(256))
+ JBE(LabelRef("openAVX2Tail256"))
+ CMPQ(inl, U32(384))
+ JBE(LabelRef("openAVX2Tail384"))
+ JMP(LabelRef("openAVX2Tail512"))
+}
+
+// ----------------------------------------------------------------------------
+// Special optimization for buffers smaller than 193 bytes
+
+// For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks
+func openAVX2192() {
+ Label("openAVX2192")
+ VMOVDQA(AA0, AA1)
+ VMOVDQA(BB0, BB1)
+ VMOVDQA(CC0, CC1)
+ avx2IncMask := avx2IncMask_DATA()
+ VPADDD(avx2IncMask, DD0, DD1)
+ VMOVDQA(AA0, AA2)
+ VMOVDQA(BB0, BB2)
+ VMOVDQA(CC0, CC2)
+ VMOVDQA(DD0, DD2)
+ VMOVDQA(DD1, TT3)
+ MOVQ(U32(10), itr2)
+}
+
+func openAVX2192InnerCipherLoop() {
+ Label("openAVX2192InnerCipherLoop")
+ chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
+ chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
+ VPALIGNR(Imm(4), BB0, BB0, BB0)
+ VPALIGNR(Imm(4), BB1, BB1, BB1)
+ VPALIGNR(Imm(8), CC0, CC0, CC0)
+ VPALIGNR(Imm(8), CC1, CC1, CC1)
+ VPALIGNR(Imm(12), DD0, DD0, DD0)
+ VPALIGNR(Imm(12), DD1, DD1, DD1)
+ chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
+ chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
+ VPALIGNR(Imm(12), BB0, BB0, BB0)
+ VPALIGNR(Imm(12), BB1, BB1, BB1)
+ VPALIGNR(Imm(8), CC0, CC0, CC0)
+ VPALIGNR(Imm(8), CC1, CC1, CC1)
+ VPALIGNR(Imm(4), DD0, DD0, DD0)
+ VPALIGNR(Imm(4), DD1, DD1, DD1)
+ DECQ(itr2)
+ JNE(LabelRef("openAVX2192InnerCipherLoop"))
+ VPADDD(AA2, AA0, AA0)
+ VPADDD(AA2, AA1, AA1)
+ VPADDD(BB2, BB0, BB0)
+ VPADDD(BB2, BB1, BB1)
+ VPADDD(CC2, CC0, CC0)
+ VPADDD(CC2, CC1, CC1)
+ VPADDD(DD2, DD0, DD0)
+ VPADDD(TT3, DD1, DD1)
+ VPERM2I128(Imm(0x02), AA0, BB0, TT0)
+
+ Comment("Clamp and store poly key")
+ polyClampMask := polyClampMask_DATA()
+ VPAND(polyClampMask, TT0, TT0)
+ VMOVDQA(TT0, rsStoreAVX2)
+
+ Comment("Stream for up to 192 bytes")
+ VPERM2I128(Imm(0x13), AA0, BB0, AA0)
+ VPERM2I128(Imm(0x13), CC0, DD0, BB0)
+ VPERM2I128(Imm(0x02), AA1, BB1, CC0)
+ VPERM2I128(Imm(0x02), CC1, DD1, DD0)
+ VPERM2I128(Imm(0x13), AA1, BB1, AA1)
+ VPERM2I128(Imm(0x13), CC1, DD1, BB1)
+}
+
+func openAVX2ShortOpen() {
+ Label("openAVX2ShortOpen")
+ Comment("Hash")
+ Load(Param("ad").Len(), itr2)
+ CALL(LabelRef("polyHashADInternal<>(SB)"))
+}
+
+func openAVX2ShortOpenLoop() {
+ Label("openAVX2ShortOpenLoop")
+ CMPQ(inl, Imm(32))
+ JB(LabelRef("openAVX2ShortTail32"))
+ SUBQ(Imm(32), inl)
+
+ Comment("Load for hashing")
+ polyAdd(Mem{Base: inp}.Offset(0 * 8))
+ polyMulAVX2()
+ polyAdd(Mem{Base: inp}.Offset(2 * 8))
+ polyMulAVX2()
+
+ Comment("Load for decryption")
+ VPXOR(Mem{Base: inp}, AA0, AA0)
+ VMOVDQU(AA0, Mem{Base: oup})
+ LEAQ(Mem{Base: inp}.Offset(1*32), inp)
+ LEAQ(Mem{Base: oup}.Offset(1*32), oup)
+
+ Comment("Shift stream left")
+ VMOVDQA(BB0, AA0)
+ VMOVDQA(CC0, BB0)
+ VMOVDQA(DD0, CC0)
+ VMOVDQA(AA1, DD0)
+ VMOVDQA(BB1, AA1)
+ VMOVDQA(CC1, BB1)
+ VMOVDQA(DD1, CC1)
+ VMOVDQA(AA2, DD1)
+ VMOVDQA(BB2, AA2)
+ JMP(LabelRef("openAVX2ShortOpenLoop"))
+}
+
+func openAVX2ShortTail32() {
+ Label("openAVX2ShortTail32")
+ CMPQ(inl, Imm(16))
+ VMOVDQA(A0, A1)
+ JB(LabelRef("openAVX2ShortDone"))
+
+ SUBQ(Imm(16), inl)
+
+ Comment("Load for hashing")
+ polyAdd(Mem{Base: inp}.Offset(0 * 8))
+ polyMulAVX2()
+
+ Comment("Load for decryption")
+ VPXOR(Mem{Base: inp}, A0, T0)
+ VMOVDQU(T0, Mem{Base: oup})
+ LEAQ(Mem{Base: inp}.Offset(1*16), inp)
+ LEAQ(Mem{Base: oup}.Offset(1*16), oup)
+ VPERM2I128(Imm(0x11), AA0, AA0, AA0)
+ VMOVDQA(A0, A1)
+}
+
+func openAVX2ShortDone() {
+ Label("openAVX2ShortDone")
+ VZEROUPPER()
+ JMP(LabelRef("openSSETail16"))
+}
+
+// ----------------------------------------------------------------------------
+// Special optimization for buffers smaller than 321 bytes
+
+// For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks
+func openAVX2320() {
+ Label("openAVX2320")
+ VMOVDQA(AA0, AA1)
+ VMOVDQA(BB0, BB1)
+ VMOVDQA(CC0, CC1)
+ avx2IncMask := avx2IncMask_DATA()
+ VPADDD(avx2IncMask, DD0, DD1)
+ VMOVDQA(AA0, AA2)
+ VMOVDQA(BB0, BB2)
+ VMOVDQA(CC0, CC2)
+ VPADDD(avx2IncMask, DD1, DD2)
+ VMOVDQA(BB0, TT1)
+ VMOVDQA(CC0, TT2)
+ VMOVDQA(DD0, TT3)
+ MOVQ(U32(10), itr2)
+}
+
+func openAVX2320InnerCipherLoop() {
+ Label("openAVX2320InnerCipherLoop")
+ chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
+ chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
+ chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
+ VPALIGNR(Imm(4), BB0, BB0, BB0)
+ VPALIGNR(Imm(4), BB1, BB1, BB1)
+ VPALIGNR(Imm(4), BB2, BB2, BB2)
+ VPALIGNR(Imm(8), CC0, CC0, CC0)
+ VPALIGNR(Imm(8), CC1, CC1, CC1)
+ VPALIGNR(Imm(8), CC2, CC2, CC2)
+ VPALIGNR(Imm(12), DD0, DD0, DD0)
+ VPALIGNR(Imm(12), DD1, DD1, DD1)
+ VPALIGNR(Imm(12), DD2, DD2, DD2)
+ chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
+ chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
+ chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
+ VPALIGNR(Imm(12), BB0, BB0, BB0)
+ VPALIGNR(Imm(12), BB1, BB1, BB1)
+ VPALIGNR(Imm(12), BB2, BB2, BB2)
+ VPALIGNR(Imm(8), CC0, CC0, CC0)
+ VPALIGNR(Imm(8), CC1, CC1, CC1)
+ VPALIGNR(Imm(8), CC2, CC2, CC2)
+ VPALIGNR(Imm(4), DD0, DD0, DD0)
+ VPALIGNR(Imm(4), DD1, DD1, DD1)
+ VPALIGNR(Imm(4), DD2, DD2, DD2)
+ DECQ(itr2)
+ JNE(LabelRef("openAVX2320InnerCipherLoop"))
+
+ chacha20Constants := chacha20Constants_DATA()
+ VMOVDQA(chacha20Constants, TT0)
+ VPADDD(TT0, AA0, AA0)
+ VPADDD(TT0, AA1, AA1)
+ VPADDD(TT0, AA2, AA2)
+ VPADDD(TT1, BB0, BB0)
+ VPADDD(TT1, BB1, BB1)
+ VPADDD(TT1, BB2, BB2)
+ VPADDD(TT2, CC0, CC0)
+ VPADDD(TT2, CC1, CC1)
+ VPADDD(TT2, CC2, CC2)
+ avx2IncMask := avx2IncMask_DATA()
+ VMOVDQA(avx2IncMask, TT0)
+ VPADDD(TT3, DD0, DD0)
+ VPADDD(TT0, TT3, TT3)
+ VPADDD(TT3, DD1, DD1)
+ VPADDD(TT0, TT3, TT3)
+ VPADDD(TT3, DD2, DD2)
+
+ Comment("Clamp and store poly key")
+ VPERM2I128(Imm(0x02), AA0, BB0, TT0)
+ polyClampMask := polyClampMask_DATA()
+ VPAND(polyClampMask, TT0, TT0)
+ VMOVDQA(TT0, rsStoreAVX2)
+
+ Comment("Stream for up to 320 bytes")
+ VPERM2I128(Imm(0x13), AA0, BB0, AA0)
+ VPERM2I128(Imm(0x13), CC0, DD0, BB0)
+ VPERM2I128(Imm(0x02), AA1, BB1, CC0)
+ VPERM2I128(Imm(0x02), CC1, DD1, DD0)
+ VPERM2I128(Imm(0x13), AA1, BB1, AA1)
+ VPERM2I128(Imm(0x13), CC1, DD1, BB1)
+ VPERM2I128(Imm(0x02), AA2, BB2, CC1)
+ VPERM2I128(Imm(0x02), CC2, DD2, DD1)
+ VPERM2I128(Imm(0x13), AA2, BB2, AA2)
+ VPERM2I128(Imm(0x13), CC2, DD2, BB2)
+ JMP(LabelRef("openAVX2ShortOpen"))
+}
+
+// ----------------------------------------------------------------------------
+// Special optimization for the last 128 bytes of ciphertext
+
+// Need to decrypt up to 128 bytes - prepare two blocks
+func openAVX2Tail128() {
+ Label("openAVX2Tail128")
+ Comment("Need to decrypt up to 128 bytes - prepare two blocks")
+ chacha20Constants := chacha20Constants_DATA()
+ VMOVDQA(chacha20Constants, AA1)
+ VMOVDQA(state1StoreAVX2, BB1)
+ VMOVDQA(state2StoreAVX2, CC1)
+ VMOVDQA(ctr3StoreAVX2, DD1)
+ avx2IncMask := avx2IncMask_DATA()
+ VPADDD(avx2IncMask, DD1, DD1)
+ VMOVDQA(DD1, DD0)
+
+ XORQ(itr2, itr2)
+ MOVQ(inl, itr1)
+ ANDQ(I8(-16), itr1)
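+ // itr1 is inl rounded down to a 16-byte multiple: the amount of ciphertext
+ // hashed while the ten double rounds below run (itr2 advances by 16 per
+ // double round and stops at 160).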
+ TESTQ(itr1, itr1)
+ JE(LabelRef("openAVX2Tail128LoopB"))
+}
+
+// Perform ChaCha rounds, while hashing the remaining input
+func openAVX2Tail128LoopA() {
+ Label("openAVX2Tail128LoopA")
+ polyAdd(Mem{Base: inp, Index: itr2, Scale: 1}.Offset(0))
+ polyMulAVX2()
+}
+
+func openAVX2Tail128LoopB() {
+ Label("openAVX2Tail128LoopB")
+ ADDQ(Imm(16), itr2)
+ chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
+ VPALIGNR(Imm(4), BB1, BB1, BB1)
+ VPALIGNR(Imm(8), CC1, CC1, CC1)
+ VPALIGNR(Imm(12), DD1, DD1, DD1)
+ chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
+ VPALIGNR(Imm(12), BB1, BB1, BB1)
+ VPALIGNR(Imm(8), CC1, CC1, CC1)
+ VPALIGNR(Imm(4), DD1, DD1, DD1)
+ CMPQ(itr2, itr1)
+ JB(LabelRef("openAVX2Tail128LoopA"))
+ CMPQ(itr2, Imm(160))
+ JNE(LabelRef("openAVX2Tail128LoopB"))
+
+ chacha20Constants := chacha20Constants_DATA()
+ VPADDD(chacha20Constants, AA1, AA1)
+ VPADDD(state1StoreAVX2, BB1, BB1)
+ VPADDD(state2StoreAVX2, CC1, CC1)
+ VPADDD(DD0, DD1, DD1)
+ VPERM2I128(Imm(0x02), AA1, BB1, AA0)
+ VPERM2I128(Imm(0x02), CC1, DD1, BB0)
+ VPERM2I128(Imm(0x13), AA1, BB1, CC0)
+ VPERM2I128(Imm(0x13), CC1, DD1, DD0)
+}
+
+func openAVX2TailLoop() {
+ Label("openAVX2TailLoop")
+ CMPQ(inl, Imm(32))
+ JB(LabelRef("openAVX2Tail"))
+ SUBQ(Imm(32), inl)
+
+ Comment("Load for decryption")
+ VPXOR(Mem{Base: inp}, AA0, AA0)
+ VMOVDQU(AA0, Mem{Base: oup})
+ LEAQ(Mem{Base: inp}.Offset(1*32), inp)
+ LEAQ(Mem{Base: oup}.Offset(1*32), oup)
+ VMOVDQA(BB0, AA0)
+ VMOVDQA(CC0, BB0)
+ VMOVDQA(DD0, CC0)
+ JMP(LabelRef("openAVX2TailLoop"))
+}
+
+func openAVX2Tail() {
+ Label("openAVX2Tail")
+ CMPQ(inl, Imm(16))
+ VMOVDQA(A0, A1)
+ JB(LabelRef("openAVX2TailDone"))
+ SUBQ(Imm(16), inl)
+
+ Comment("Load for decryption")
+ VPXOR(Mem{Base: inp}, A0, T0)
+ VMOVDQU(T0, Mem{Base: oup})
+ LEAQ(Mem{Base: inp}.Offset(1*16), inp)
+ LEAQ(Mem{Base: oup}.Offset(1*16), oup)
+ VPERM2I128(Imm(0x11), AA0, AA0, AA0)
+ VMOVDQA(A0, A1)
+}
+
+func openAVX2TailDone() {
+ Label("openAVX2TailDone")
+ VZEROUPPER()
+ JMP(LabelRef("openSSETail16"))
+}
+
+// ----------------------------------------------------------------------------
+// Special optimization for the last 256 bytes of ciphertext
+
+// Need to decrypt up to 256 bytes - prepare four blocks
+func openAVX2Tail256() {
+ Label("openAVX2Tail256")
+ chacha20Constants := chacha20Constants_DATA()
+ VMOVDQA(chacha20Constants, AA0)
+ VMOVDQA(AA0, AA1)
+ VMOVDQA(state1StoreAVX2, BB0)
+ VMOVDQA(BB0, BB1)
+ VMOVDQA(state2StoreAVX2, CC0)
+ VMOVDQA(CC0, CC1)
+ VMOVDQA(ctr3StoreAVX2, DD0)
+ avx2IncMask := avx2IncMask_DATA()
+ VPADDD(avx2IncMask, DD0, DD0)
+ VPADDD(avx2IncMask, DD0, DD1)
+ VMOVDQA(DD0, TT1)
+ VMOVDQA(DD1, TT2)
+
+ Comment("Compute the number of iterations that will hash data")
+ MOVQ(inl, tmpStoreAVX2)
+ MOVQ(inl, itr1)
+ SUBQ(Imm(128), itr1)
+ SHRQ(Imm(4), itr1)
+ MOVQ(U32(10), itr2)
+ CMPQ(itr1, Imm(10))
+ CMOVQGT(itr2, itr1)
+ MOVQ(inp, inl)
+ XORQ(itr2, itr2)
+}
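+
+// A rough Go sketch of the CMOVQGT computation above (illustrative only, not
+// called by the generator): the count of 16-byte ciphertext blocks hashed
+// while the ChaCha double rounds run, capped at ten, the number of double
+// rounds the cipher performs.
+func openAVX2Tail256HashIters(inl int) int {
+ iters := (inl - 128) / 16
+ if iters > 10 {
+  iters = 10
+ }
+ return iters
+}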
+
+func openAVX2Tail256LoopA() {
+ Label("openAVX2Tail256LoopA")
+ polyAdd(Mem{Base: inl}.Offset(0))
+ polyMulAVX2()
+ LEAQ(Mem{Base: inl}.Offset(16), inl)
+}
+
+// Perform ChaCha rounds, while hashing the remaining input
+func openAVX2Tail256LoopB() {
+ Label("openAVX2Tail256LoopB")
+ chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
+ chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
+ VPALIGNR(Imm(4), BB0, BB0, BB0)
+ VPALIGNR(Imm(4), BB1, BB1, BB1)
+ VPALIGNR(Imm(8), CC0, CC0, CC0)
+ VPALIGNR(Imm(8), CC1, CC1, CC1)
+ VPALIGNR(Imm(12), DD0, DD0, DD0)
+ VPALIGNR(Imm(12), DD1, DD1, DD1)
+ INCQ(itr2)
+ chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
+ chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
+ VPALIGNR(Imm(12), BB0, BB0, BB0)
+ VPALIGNR(Imm(12), BB1, BB1, BB1)
+ VPALIGNR(Imm(8), CC0, CC0, CC0)
+ VPALIGNR(Imm(8), CC1, CC1, CC1)
+ VPALIGNR(Imm(4), DD0, DD0, DD0)
+ VPALIGNR(Imm(4), DD1, DD1, DD1)
+ CMPQ(itr2, itr1)
+ JB(LabelRef("openAVX2Tail256LoopA"))
+
+ CMPQ(itr2, Imm(10))
+ JNE(LabelRef("openAVX2Tail256LoopB"))
+
+ MOVQ(inl, itr2)
+ SUBQ(inp, inl)
+ MOVQ(inl, itr1)
+ MOVQ(tmpStoreAVX2, inl)
+}
+
+// Hash the remainder of data (if any)
+func openAVX2Tail256Hash() {
+ Label("openAVX2Tail256Hash")
+ ADDQ(Imm(16), itr1)
+ CMPQ(itr1, inl)
+ JGT(LabelRef("openAVX2Tail256HashEnd"))
+ polyAdd(Mem{Base: itr2}.Offset(0))
+ polyMulAVX2()
+ LEAQ(Mem{Base: itr2}.Offset(16), itr2)
+ JMP(LabelRef("openAVX2Tail256Hash"))
+}
+
+// Store 128 bytes safely, then go to store loop
+func openAVX2Tail256HashEnd() {
+ Label("openAVX2Tail256HashEnd")
+ chacha20Constants := chacha20Constants_DATA()
+ VPADDD(chacha20Constants, AA0, AA0)
+ VPADDD(chacha20Constants, AA1, AA1)
+ VPADDD(state1StoreAVX2, BB0, BB0)
+ VPADDD(state1StoreAVX2, BB1, BB1)
+ VPADDD(state2StoreAVX2, CC0, CC0)
+ VPADDD(state2StoreAVX2, CC1, CC1)
+ VPADDD(TT1, DD0, DD0)
+ VPADDD(TT2, DD1, DD1)
+ VPERM2I128(Imm(0x02), AA0, BB0, AA2)
+ VPERM2I128(Imm(0x02), CC0, DD0, BB2)
+ VPERM2I128(Imm(0x13), AA0, BB0, CC2)
+ VPERM2I128(Imm(0x13), CC0, DD0, DD2)
+ VPERM2I128(Imm(0x02), AA1, BB1, AA0)
+ VPERM2I128(Imm(0x02), CC1, DD1, BB0)
+ VPERM2I128(Imm(0x13), AA1, BB1, CC0)
+ VPERM2I128(Imm(0x13), CC1, DD1, DD0)
+
+ VPXOR(Mem{Base: inp}.Offset(0*32), AA2, AA2)
+ VPXOR(Mem{Base: inp}.Offset(1*32), BB2, BB2)
+ VPXOR(Mem{Base: inp}.Offset(2*32), CC2, CC2)
+ VPXOR(Mem{Base: inp}.Offset(3*32), DD2, DD2)
+ VMOVDQU(AA2, Mem{Base: oup}.Offset(0*32))
+ VMOVDQU(BB2, Mem{Base: oup}.Offset(1*32))
+ VMOVDQU(CC2, Mem{Base: oup}.Offset(2*32))
+ VMOVDQU(DD2, Mem{Base: oup}.Offset(3*32))
+ LEAQ(Mem{Base: inp}.Offset(4*32), inp)
+ LEAQ(Mem{Base: oup}.Offset(4*32), oup)
+ SUBQ(Imm(4*32), inl)
+
+ JMP(LabelRef("openAVX2TailLoop"))
+}
+
+// ----------------------------------------------------------------------------
+// Special optimization for the last 384 bytes of ciphertext
+
+// Need to decrypt up to 384 bytes - prepare six blocks
+func openAVX2Tail384() {
+ Label("openAVX2Tail384")
+ Comment("Need to decrypt up to 384 bytes - prepare six blocks")
+ chacha20Constants := chacha20Constants_DATA()
+ VMOVDQA(chacha20Constants, AA0)
+ VMOVDQA(AA0, AA1)
+ VMOVDQA(AA0, AA2)
+ VMOVDQA(state1StoreAVX2, BB0)
+ VMOVDQA(BB0, BB1)
+ VMOVDQA(BB0, BB2)
+ VMOVDQA(state2StoreAVX2, CC0)
+ VMOVDQA(CC0, CC1)
+ VMOVDQA(CC0, CC2)
+ VMOVDQA(ctr3StoreAVX2, DD0)
+ avx2IncMask := avx2IncMask_DATA()
+ VPADDD(avx2IncMask, DD0, DD0)
+ VPADDD(avx2IncMask, DD0, DD1)
+ VPADDD(avx2IncMask, DD1, DD2)
+ VMOVDQA(DD0, ctr0StoreAVX2)
+ VMOVDQA(DD1, ctr1StoreAVX2)
+ VMOVDQA(DD2, ctr2StoreAVX2)
+
+ Comment("Compute the number of iterations that will hash two blocks of data")
+ MOVQ(inl, tmpStoreAVX2)
+ MOVQ(inl, itr1)
+ SUBQ(U32(256), itr1)
+ SHRQ(Imm(4), itr1)
+ ADDQ(Imm(6), itr1)
+ MOVQ(U32(10), itr2)
+ CMPQ(itr1, Imm(10))
+ CMOVQGT(itr2, itr1)
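+ // At this point itr1 = min((inl-256)/16 + 6, 10): the number of the ten
+ // double-round iterations that hash two 16-byte blocks of ciphertext instead
+ // of one.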
+ MOVQ(inp, inl)
+ XORQ(itr2, itr2)
+}
+
+// Perform ChaCha rounds, while hashing the remaining input
+func openAVX2Tail384LoopB() {
+ Label("openAVX2Tail384LoopB")
+ polyAdd(Mem{Base: inl}.Offset(0))
+ polyMulAVX2()
+ LEAQ(Mem{Base: inl}.Offset(16), inl)
+}
+
+func openAVX2Tail384LoopA() {
+ Label("openAVX2Tail384LoopA")
+ chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
+ chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
+ chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
+ VPALIGNR(Imm(4), BB0, BB0, BB0)
+ VPALIGNR(Imm(4), BB1, BB1, BB1)
+ VPALIGNR(Imm(4), BB2, BB2, BB2)
+ VPALIGNR(Imm(8), CC0, CC0, CC0)
+ VPALIGNR(Imm(8), CC1, CC1, CC1)
+ VPALIGNR(Imm(8), CC2, CC2, CC2)
+ VPALIGNR(Imm(12), DD0, DD0, DD0)
+ VPALIGNR(Imm(12), DD1, DD1, DD1)
+ VPALIGNR(Imm(12), DD2, DD2, DD2)
+ polyAdd(Mem{Base: inl}.Offset(0))
+ polyMulAVX2()
+ LEAQ(Mem{Base: inl}.Offset(16), inl)
+ INCQ(itr2)
+ chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
+ chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
+ chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
+ VPALIGNR(Imm(12), BB0, BB0, BB0)
+ VPALIGNR(Imm(12), BB1, BB1, BB1)
+ VPALIGNR(Imm(12), BB2, BB2, BB2)
+ VPALIGNR(Imm(8), CC0, CC0, CC0)
+ VPALIGNR(Imm(8), CC1, CC1, CC1)
+ VPALIGNR(Imm(8), CC2, CC2, CC2)
+ VPALIGNR(Imm(4), DD0, DD0, DD0)
+ VPALIGNR(Imm(4), DD1, DD1, DD1)
+ VPALIGNR(Imm(4), DD2, DD2, DD2)
+
+ CMPQ(itr2, itr1)
+ JB(LabelRef("openAVX2Tail384LoopB"))
+
+ CMPQ(itr2, Imm(10))
+ JNE(LabelRef("openAVX2Tail384LoopA"))
+
+ MOVQ(inl, itr2)
+ SUBQ(inp, inl)
+ MOVQ(inl, itr1)
+ MOVQ(tmpStoreAVX2, inl)
+}
+
+func openAVX2Tail384Hash() {
+ Label("openAVX2Tail384Hash")
+ ADDQ(Imm(16), itr1)
+ CMPQ(itr1, inl)
+ JGT(LabelRef("openAVX2Tail384HashEnd"))
+ polyAdd(Mem{Base: itr2}.Offset(0))
+ polyMulAVX2()
+ LEAQ(Mem{Base: itr2}.Offset(16), itr2)
+ JMP(LabelRef("openAVX2Tail384Hash"))
+}
+
+// Store 256 bytes safely, then go to store loop
+func openAVX2Tail384HashEnd() {
+ Label("openAVX2Tail384HashEnd")
+ chacha20Constants := chacha20Constants_DATA()
+ VPADDD(chacha20Constants, AA0, AA0)
+ VPADDD(chacha20Constants, AA1, AA1)
+ VPADDD(chacha20Constants, AA2, AA2)
+ VPADDD(state1StoreAVX2, BB0, BB0)
+ VPADDD(state1StoreAVX2, BB1, BB1)
+ VPADDD(state1StoreAVX2, BB2, BB2)
+ VPADDD(state2StoreAVX2, CC0, CC0)
+ VPADDD(state2StoreAVX2, CC1, CC1)
+ VPADDD(state2StoreAVX2, CC2, CC2)
+ VPADDD(ctr0StoreAVX2, DD0, DD0)
+ VPADDD(ctr1StoreAVX2, DD1, DD1)
+ VPADDD(ctr2StoreAVX2, DD2, DD2)
+ VPERM2I128(Imm(0x02), AA0, BB0, TT0)
+ VPERM2I128(Imm(0x02), CC0, DD0, TT1)
+ VPERM2I128(Imm(0x13), AA0, BB0, TT2)
+ VPERM2I128(Imm(0x13), CC0, DD0, TT3)
+ VPXOR(Mem{Base: inp}.Offset(0*32), TT0, TT0)
+ VPXOR(Mem{Base: inp}.Offset(1*32), TT1, TT1)
+ VPXOR(Mem{Base: inp}.Offset(2*32), TT2, TT2)
+ VPXOR(Mem{Base: inp}.Offset(3*32), TT3, TT3)
+ VMOVDQU(TT0, Mem{Base: oup}.Offset(0*32))
+ VMOVDQU(TT1, Mem{Base: oup}.Offset(1*32))
+ VMOVDQU(TT2, Mem{Base: oup}.Offset(2*32))
+ VMOVDQU(TT3, Mem{Base: oup}.Offset(3*32))
+ VPERM2I128(Imm(0x02), AA1, BB1, TT0)
+ VPERM2I128(Imm(0x02), CC1, DD1, TT1)
+ VPERM2I128(Imm(0x13), AA1, BB1, TT2)
+ VPERM2I128(Imm(0x13), CC1, DD1, TT3)
+ VPXOR(Mem{Base: inp}.Offset(4*32), TT0, TT0)
+ VPXOR(Mem{Base: inp}.Offset(5*32), TT1, TT1)
+ VPXOR(Mem{Base: inp}.Offset(6*32), TT2, TT2)
+ VPXOR(Mem{Base: inp}.Offset(7*32), TT3, TT3)
+ VMOVDQU(TT0, Mem{Base: oup}.Offset(4*32))
+ VMOVDQU(TT1, Mem{Base: oup}.Offset(5*32))
+ VMOVDQU(TT2, Mem{Base: oup}.Offset(6*32))
+ VMOVDQU(TT3, Mem{Base: oup}.Offset(7*32))
+ VPERM2I128(Imm(0x02), AA2, BB2, AA0)
+ VPERM2I128(Imm(0x02), CC2, DD2, BB0)
+ VPERM2I128(Imm(0x13), AA2, BB2, CC0)
+ VPERM2I128(Imm(0x13), CC2, DD2, DD0)
+ LEAQ(Mem{Base: inp}.Offset(8*32), inp)
+ LEAQ(Mem{Base: oup}.Offset(8*32), oup)
+ SUBQ(U32(8*32), inl)
+ JMP(LabelRef("openAVX2TailLoop"))
+}
+
+// ----------------------------------------------------------------------------
+// Special optimization for the last 512 bytes of ciphertext
+
+func openAVX2Tail512() {
+ Label("openAVX2Tail512")
+ chacha20Constants := chacha20Constants_DATA()
+ VMOVDQU(chacha20Constants, AA0)
+ VMOVDQA(AA0, AA1)
+ VMOVDQA(AA0, AA2)
+ VMOVDQA(AA0, AA3)
+ VMOVDQA(state1StoreAVX2, BB0)
+ VMOVDQA(BB0, BB1)
+ VMOVDQA(BB0, BB2)
+ VMOVDQA(BB0, BB3)
+ VMOVDQA(state2StoreAVX2, CC0)
+ VMOVDQA(CC0, CC1)
+ VMOVDQA(CC0, CC2)
+ VMOVDQA(CC0, CC3)
+ VMOVDQA(ctr3StoreAVX2, DD0)
+ avx2IncMask := avx2IncMask_DATA()
+ VPADDD(avx2IncMask, DD0, DD0)
+ VPADDD(avx2IncMask, DD0, DD1)
+ VPADDD(avx2IncMask, DD1, DD2)
+ VPADDD(avx2IncMask, DD2, DD3)
+ VMOVDQA(DD0, ctr0StoreAVX2)
+ VMOVDQA(DD1, ctr1StoreAVX2)
+ VMOVDQA(DD2, ctr2StoreAVX2)
+ VMOVDQA(DD3, ctr3StoreAVX2)
+ XORQ(itr1, itr1)
+ MOVQ(inp, itr2)
+}
+
+func openAVX2Tail512LoopB() {
+ Label("openAVX2Tail512LoopB")
+ polyAdd(Mem{Base: itr2}.Offset(0))
+ polyMulAVX2()
+ LEAQ(Mem{Base: itr2}.Offset(2*8), itr2)
+}
+
+func openAVX2Tail512LoopA() {
+ Label("openAVX2Tail512LoopA")
+ VPADDD(BB0, AA0, AA0)
+ VPADDD(BB1, AA1, AA1)
+ VPADDD(BB2, AA2, AA2)
+ VPADDD(BB3, AA3, AA3)
+ VPXOR(AA0, DD0, DD0)
+ VPXOR(AA1, DD1, DD1)
+ VPXOR(AA2, DD2, DD2)
+ VPXOR(AA3, DD3, DD3)
+ rol16 := rol16_DATA()
+ VPSHUFB(rol16, DD0, DD0)
+ VPSHUFB(rol16, DD1, DD1)
+ VPSHUFB(rol16, DD2, DD2)
+ VPSHUFB(rol16, DD3, DD3)
+ VPADDD(DD0, CC0, CC0)
+ VPADDD(DD1, CC1, CC1)
+ VPADDD(DD2, CC2, CC2)
+ VPADDD(DD3, CC3, CC3)
+ VPXOR(CC0, BB0, BB0)
+ VPXOR(CC1, BB1, BB1)
+ VPXOR(CC2, BB2, BB2)
+ VPXOR(CC3, BB3, BB3)
+ VMOVDQA(CC3, tmpStoreAVX2)
+ VPSLLD(Imm(12), BB0, CC3)
+ VPSRLD(Imm(20), BB0, BB0)
+ VPXOR(CC3, BB0, BB0)
+ VPSLLD(Imm(12), BB1, CC3)
+ VPSRLD(Imm(20), BB1, BB1)
+ VPXOR(CC3, BB1, BB1)
+ VPSLLD(Imm(12), BB2, CC3)
+ VPSRLD(Imm(20), BB2, BB2)
+ VPXOR(CC3, BB2, BB2)
+ VPSLLD(Imm(12), BB3, CC3)
+ VPSRLD(Imm(20), BB3, BB3)
+ VPXOR(CC3, BB3, BB3)
+ VMOVDQA(tmpStoreAVX2, CC3)
+ polyAdd(Mem{Base: itr2}.Offset(0 * 8))
+ polyMulAVX2()
+ VPADDD(BB0, AA0, AA0)
+ VPADDD(BB1, AA1, AA1)
+ VPADDD(BB2, AA2, AA2)
+ VPADDD(BB3, AA3, AA3)
+ VPXOR(AA0, DD0, DD0)
+ VPXOR(AA1, DD1, DD1)
+ VPXOR(AA2, DD2, DD2)
+ VPXOR(AA3, DD3, DD3)
+ rol8 := rol8_DATA()
+ VPSHUFB(rol8, DD0, DD0)
+ VPSHUFB(rol8, DD1, DD1)
+ VPSHUFB(rol8, DD2, DD2)
+ VPSHUFB(rol8, DD3, DD3)
+ VPADDD(DD0, CC0, CC0)
+ VPADDD(DD1, CC1, CC1)
+ VPADDD(DD2, CC2, CC2)
+ VPADDD(DD3, CC3, CC3)
+ VPXOR(CC0, BB0, BB0)
+ VPXOR(CC1, BB1, BB1)
+ VPXOR(CC2, BB2, BB2)
+ VPXOR(CC3, BB3, BB3)
+ VMOVDQA(CC3, tmpStoreAVX2)
+ VPSLLD(Imm(7), BB0, CC3)
+ VPSRLD(Imm(25), BB0, BB0)
+ VPXOR(CC3, BB0, BB0)
+ VPSLLD(Imm(7), BB1, CC3)
+ VPSRLD(Imm(25), BB1, BB1)
+ VPXOR(CC3, BB1, BB1)
+ VPSLLD(Imm(7), BB2, CC3)
+ VPSRLD(Imm(25), BB2, BB2)
+ VPXOR(CC3, BB2, BB2)
+ VPSLLD(Imm(7), BB3, CC3)
+ VPSRLD(Imm(25), BB3, BB3)
+ VPXOR(CC3, BB3, BB3)
+ VMOVDQA(tmpStoreAVX2, CC3)
+ VPALIGNR(Imm(4), BB0, BB0, BB0)
+ VPALIGNR(Imm(4), BB1, BB1, BB1)
+ VPALIGNR(Imm(4), BB2, BB2, BB2)
+ VPALIGNR(Imm(4), BB3, BB3, BB3)
+ VPALIGNR(Imm(8), CC0, CC0, CC0)
+ VPALIGNR(Imm(8), CC1, CC1, CC1)
+ VPALIGNR(Imm(8), CC2, CC2, CC2)
+ VPALIGNR(Imm(8), CC3, CC3, CC3)
+ VPALIGNR(Imm(12), DD0, DD0, DD0)
+ VPALIGNR(Imm(12), DD1, DD1, DD1)
+ VPALIGNR(Imm(12), DD2, DD2, DD2)
+ VPALIGNR(Imm(12), DD3, DD3, DD3)
+ VPADDD(BB0, AA0, AA0)
+ VPADDD(BB1, AA1, AA1)
+ VPADDD(BB2, AA2, AA2)
+ VPADDD(BB3, AA3, AA3)
+ VPXOR(AA0, DD0, DD0)
+ VPXOR(AA1, DD1, DD1)
+ VPXOR(AA2, DD2, DD2)
+ VPXOR(AA3, DD3, DD3)
+ VPSHUFB(rol16, DD0, DD0)
+ VPSHUFB(rol16, DD1, DD1)
+ VPSHUFB(rol16, DD2, DD2)
+ VPSHUFB(rol16, DD3, DD3)
+ VPADDD(DD0, CC0, CC0)
+ VPADDD(DD1, CC1, CC1)
+ VPADDD(DD2, CC2, CC2)
+ VPADDD(DD3, CC3, CC3)
+ VPXOR(CC0, BB0, BB0)
+ VPXOR(CC1, BB1, BB1)
+ VPXOR(CC2, BB2, BB2)
+ VPXOR(CC3, BB3, BB3)
+ polyAdd(Mem{Base: itr2}.Offset(2 * 8))
+ polyMulAVX2()
+ LEAQ(Mem{Base: itr2}.Offset(4*8), itr2)
+ VMOVDQA(CC3, tmpStoreAVX2)
+ VPSLLD(Imm(12), BB0, CC3)
+ VPSRLD(Imm(20), BB0, BB0)
+ VPXOR(CC3, BB0, BB0)
+ VPSLLD(Imm(12), BB1, CC3)
+ VPSRLD(Imm(20), BB1, BB1)
+ VPXOR(CC3, BB1, BB1)
+ VPSLLD(Imm(12), BB2, CC3)
+ VPSRLD(Imm(20), BB2, BB2)
+ VPXOR(CC3, BB2, BB2)
+ VPSLLD(Imm(12), BB3, CC3)
+ VPSRLD(Imm(20), BB3, BB3)
+ VPXOR(CC3, BB3, BB3)
+ VMOVDQA(tmpStoreAVX2, CC3)
+ VPADDD(BB0, AA0, AA0)
+ VPADDD(BB1, AA1, AA1)
+ VPADDD(BB2, AA2, AA2)
+ VPADDD(BB3, AA3, AA3)
+ VPXOR(AA0, DD0, DD0)
+ VPXOR(AA1, DD1, DD1)
+ VPXOR(AA2, DD2, DD2)
+ VPXOR(AA3, DD3, DD3)
+ VPSHUFB(rol8, DD0, DD0)
+ VPSHUFB(rol8, DD1, DD1)
+ VPSHUFB(rol8, DD2, DD2)
+ VPSHUFB(rol8, DD3, DD3)
+ VPADDD(DD0, CC0, CC0)
+ VPADDD(DD1, CC1, CC1)
+ VPADDD(DD2, CC2, CC2)
+ VPADDD(DD3, CC3, CC3)
+ VPXOR(CC0, BB0, BB0)
+ VPXOR(CC1, BB1, BB1)
+ VPXOR(CC2, BB2, BB2)
+ VPXOR(CC3, BB3, BB3)
+ VMOVDQA(CC3, tmpStoreAVX2)
+ VPSLLD(Imm(7), BB0, CC3)
+ VPSRLD(Imm(25), BB0, BB0)
+ VPXOR(CC3, BB0, BB0)
+ VPSLLD(Imm(7), BB1, CC3)
+ VPSRLD(Imm(25), BB1, BB1)
+ VPXOR(CC3, BB1, BB1)
+ VPSLLD(Imm(7), BB2, CC3)
+ VPSRLD(Imm(25), BB2, BB2)
+ VPXOR(CC3, BB2, BB2)
+ VPSLLD(Imm(7), BB3, CC3)
+ VPSRLD(Imm(25), BB3, BB3)
+ VPXOR(CC3, BB3, BB3)
+ VMOVDQA(tmpStoreAVX2, CC3)
+ VPALIGNR(Imm(12), BB0, BB0, BB0)
+ VPALIGNR(Imm(12), BB1, BB1, BB1)
+ VPALIGNR(Imm(12), BB2, BB2, BB2)
+ VPALIGNR(Imm(12), BB3, BB3, BB3)
+ VPALIGNR(Imm(8), CC0, CC0, CC0)
+ VPALIGNR(Imm(8), CC1, CC1, CC1)
+ VPALIGNR(Imm(8), CC2, CC2, CC2)
+ VPALIGNR(Imm(8), CC3, CC3, CC3)
+ VPALIGNR(Imm(4), DD0, DD0, DD0)
+ VPALIGNR(Imm(4), DD1, DD1, DD1)
+ VPALIGNR(Imm(4), DD2, DD2, DD2)
+ VPALIGNR(Imm(4), DD3, DD3, DD3)
+ INCQ(itr1)
+ CMPQ(itr1, Imm(4))
+ JLT(LabelRef("openAVX2Tail512LoopB"))
+
+ CMPQ(itr1, Imm(10))
+ JNE(LabelRef("openAVX2Tail512LoopA"))
+
+ MOVQ(inl, itr1)
+ SUBQ(U32(384), itr1)
+ ANDQ(I8(-16), itr1)
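+ // The rounds above hashed 384 bytes of ciphertext (four iterations hashing
+ // three 16-byte blocks each, then six hashing two each); itr1 now holds the
+ // remaining length to hash, rounded down to a 16-byte multiple.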
+}
+
+func openAVX2Tail512HashLoop() {
+ Label("openAVX2Tail512HashLoop")
+ TESTQ(itr1, itr1)
+ JE(LabelRef("openAVX2Tail512HashEnd"))
+ polyAdd(Mem{Base: itr2}.Offset(0))
+ polyMulAVX2()
+ LEAQ(Mem{Base: itr2}.Offset(16), itr2)
+ SUBQ(Imm(16), itr1)
+ JMP(LabelRef("openAVX2Tail512HashLoop"))
+}
+
+func openAVX2Tail512HashEnd() {
+ Label("openAVX2Tail512HashEnd")
+ chacha20Constants := chacha20Constants_DATA()
+ VPADDD(chacha20Constants, AA0, AA0)
+ VPADDD(chacha20Constants, AA1, AA1)
+ VPADDD(chacha20Constants, AA2, AA2)
+ VPADDD(chacha20Constants, AA3, AA3)
+ VPADDD(state1StoreAVX2, BB0, BB0)
+ VPADDD(state1StoreAVX2, BB1, BB1)
+ VPADDD(state1StoreAVX2, BB2, BB2)
+ VPADDD(state1StoreAVX2, BB3, BB3)
+ VPADDD(state2StoreAVX2, CC0, CC0)
+ VPADDD(state2StoreAVX2, CC1, CC1)
+ VPADDD(state2StoreAVX2, CC2, CC2)
+ VPADDD(state2StoreAVX2, CC3, CC3)
+ VPADDD(ctr0StoreAVX2, DD0, DD0)
+ VPADDD(ctr1StoreAVX2, DD1, DD1)
+ VPADDD(ctr2StoreAVX2, DD2, DD2)
+ VPADDD(ctr3StoreAVX2, DD3, DD3)
+ VMOVDQA(CC3, tmpStoreAVX2)
+ VPERM2I128(Imm(0x02), AA0, BB0, CC3)
+ VPERM2I128(Imm(0x13), AA0, BB0, BB0)
+ VPERM2I128(Imm(0x02), CC0, DD0, AA0)
+ VPERM2I128(Imm(0x13), CC0, DD0, CC0)
+ VPXOR(Mem{Base: inp}.Offset(0*32), CC3, CC3)
+ VPXOR(Mem{Base: inp}.Offset(1*32), AA0, AA0)
+ VPXOR(Mem{Base: inp}.Offset(2*32), BB0, BB0)
+ VPXOR(Mem{Base: inp}.Offset(3*32), CC0, CC0)
+ VMOVDQU(CC3, Mem{Base: oup}.Offset(0*32))
+ VMOVDQU(AA0, Mem{Base: oup}.Offset(1*32))
+ VMOVDQU(BB0, Mem{Base: oup}.Offset(2*32))
+ VMOVDQU(CC0, Mem{Base: oup}.Offset(3*32))
+ VPERM2I128(Imm(0x02), AA1, BB1, AA0)
+ VPERM2I128(Imm(0x02), CC1, DD1, BB0)
+ VPERM2I128(Imm(0x13), AA1, BB1, CC0)
+ VPERM2I128(Imm(0x13), CC1, DD1, DD0)
+ VPXOR(Mem{Base: inp}.Offset(4*32), AA0, AA0)
+ VPXOR(Mem{Base: inp}.Offset(5*32), BB0, BB0)
+ VPXOR(Mem{Base: inp}.Offset(6*32), CC0, CC0)
+ VPXOR(Mem{Base: inp}.Offset(7*32), DD0, DD0)
+ VMOVDQU(AA0, Mem{Base: oup}.Offset(4*32))
+ VMOVDQU(BB0, Mem{Base: oup}.Offset(5*32))
+ VMOVDQU(CC0, Mem{Base: oup}.Offset(6*32))
+ VMOVDQU(DD0, Mem{Base: oup}.Offset(7*32))
+ VPERM2I128(Imm(0x02), AA2, BB2, AA0)
+ VPERM2I128(Imm(0x02), CC2, DD2, BB0)
+ VPERM2I128(Imm(0x13), AA2, BB2, CC0)
+ VPERM2I128(Imm(0x13), CC2, DD2, DD0)
+ VPXOR(Mem{Base: inp}.Offset(8*32), AA0, AA0)
+ VPXOR(Mem{Base: inp}.Offset(9*32), BB0, BB0)
+ VPXOR(Mem{Base: inp}.Offset(10*32), CC0, CC0)
+ VPXOR(Mem{Base: inp}.Offset(11*32), DD0, DD0)
+ VMOVDQU(AA0, Mem{Base: oup}.Offset(8*32))
+ VMOVDQU(BB0, Mem{Base: oup}.Offset(9*32))
+ VMOVDQU(CC0, Mem{Base: oup}.Offset(10*32))
+ VMOVDQU(DD0, Mem{Base: oup}.Offset(11*32))
+ VPERM2I128(Imm(0x02), AA3, BB3, AA0)
+ VPERM2I128(Imm(0x02), tmpStoreAVX2, DD3, BB0)
+ VPERM2I128(Imm(0x13), AA3, BB3, CC0)
+ VPERM2I128(Imm(0x13), tmpStoreAVX2, DD3, DD0)
+
+ LEAQ(Mem{Base: inp}.Offset(12*32), inp)
+ LEAQ(Mem{Base: oup}.Offset(12*32), oup)
+ SUBQ(U32(12*32), inl)
+
+ JMP(LabelRef("openAVX2TailLoop"))
+}
+
+// ----------------------------------------------------------------------------
+// ----------------------------------------------------------------------------
+
+// Implements the following function signature:
+//
+// func chacha20Poly1305Seal(dst []byte, key []uint32, src, ad []byte)
+func chacha20Poly1305Seal() {
+ Implement("chacha20Poly1305Seal")
+ Attributes(0)
+ AllocLocal(288)
+
+ MOVQ(RSP, RBP)
+ ADDQ(Imm(32), RBP)
+ ANDQ(I32(-32), RBP)
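+ // BP now points at 32-byte aligned scratch space inside the 288-byte frame,
+ // so the MOVO/VMOVDQA stores to the BP-based state slots use aligned
+ // addresses.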
+ Load(Param("dst").Base(), oup)
+ Load(Param("key").Base(), keyp)
+ Load(Param("src").Base(), inp)
+ Load(Param("src").Len(), inl)
+ Load(Param("ad").Base(), adp)
+
+ CMPB(Mem{Symbol: Symbol{Name: ThatPeskyUnicodeDot + "useAVX2"}, Base: StaticBase}, Imm(1))
+ JE(LabelRef("chacha20Poly1305Seal_AVX2"))
+
+ Comment("Special optimization, for very short buffers")
+ CMPQ(inl, Imm(128))
+ JBE(LabelRef("sealSSE128"))
+
+ Comment("In the seal case - prepare the poly key + 3 blocks of stream in the first iteration")
+ chacha20Constants := chacha20Constants_DATA()
+ MOVOU(chacha20Constants, A0)
+ MOVOU(Mem{Base: keyp}.Offset(1*16), B0)
+ MOVOU(Mem{Base: keyp}.Offset(2*16), C0)
+ MOVOU(Mem{Base: keyp}.Offset(3*16), D0)
+
+ Comment("Store state on stack for future use")
+ MOVO(B0, state1Store)
+ MOVO(C0, state2Store)
+
+ Comment("Load state, increment counter blocks")
+ MOVO(A0, A1)
+ MOVO(B0, B1)
+ MOVO(C0, C1)
+ MOVO(D0, D1)
+ sseIncMask := sseIncMask_DATA()
+ PADDL(sseIncMask, D1)
+ MOVO(A1, A2)
+ MOVO(B1, B2)
+ MOVO(C1, C2)
+ MOVO(D1, D2)
+ PADDL(sseIncMask, D2)
+ MOVO(A2, A3)
+ MOVO(B2, B3)
+ MOVO(C2, C3)
+ MOVO(D2, D3)
+ PADDL(sseIncMask, D3)
+
+ Comment("Store counters")
+ MOVO(D0, ctr0Store)
+ MOVO(D1, ctr1Store)
+ MOVO(D2, ctr2Store)
+ MOVO(D3, ctr3Store)
+ MOVQ(U32(10), itr2)
+
+ sealSSEIntroLoop()
+ sealSSEMainLoop()
+
+ // ----------------------------------------------------------------------------
+ // Special optimization for the last 64 bytes of plaintext
+ sealSSETail64()
+ sealSSETail64LoopA()
+ sealSSETail64LoopB()
+
+ // ----------------------------------------------------------------------------
+ // Special optimization for the last 128 bytes of plaintext
+ sealSSETail128()
+ sealSSETail128LoopA()
+ sealSSETail128LoopB()
+
+ // ----------------------------------------------------------------------------
+ // Special optimization for the last 192 bytes of plaintext
+ sealSSETail192()
+ sealSSETail192LoopA()
+ sealSSETail192LoopB()
+
+ // ----------------------------------------------------------------------------
+ // Special seal optimization for buffers smaller than 129 bytes
+ sealSSE128()
+ sealSSE128SealHash()
+ sealSSE128Seal()
+ sealSSETail()
+ sealSSETailLoadLoop()
+ sealSSEFinalize()
+
+ // ----------------------------------------------------------------------------
+ // ------------------------- AVX2 Code ----------------------------------------
+ chacha20Poly1305Seal_AVX2()
+ sealAVX2IntroLoop()
+ sealAVX2MainLoop()
+ sealAVX2InternalLoop()
+ sealAVX2InternalLoopStart()
+
+ // ----------------------------------------------------------------------------
+ // Special optimization for buffers smaller than 193 bytes
+ seal192AVX2()
+ sealAVX2192InnerCipherLoop()
+ sealAVX2ShortSeal()
+ sealAVX2SealHash()
+ sealAVX2ShortSealLoop()
+ sealAVX2ShortTail32()
+ sealAVX2ShortDone()
+
+ // ----------------------------------------------------------------------------
+ // Special optimization for buffers smaller than 321 bytes
+ seal320AVX2()
+ sealAVX2320InnerCipherLoop()
+
+ // ----------------------------------------------------------------------------
+ // Special optimization for the last 128 bytes of ciphertext
+ sealAVX2Tail128()
+ sealAVX2Tail128LoopA()
+ sealAVX2Tail128LoopB()
+
+ // ----------------------------------------------------------------------------
+ // Special optimization for the last 256 bytes of ciphertext
+ sealAVX2Tail256()
+ sealAVX2Tail256LoopA()
+ sealAVX2Tail256LoopB()
+
+ // ----------------------------------------------------------------------------
+ // Special optimization for the last 384 bytes of ciphertext
+ sealAVX2Tail384()
+ sealAVX2Tail384LoopA()
+ sealAVX2Tail384LoopB()
+
+ // ----------------------------------------------------------------------------
+ // Special optimization for the last 512 bytes of ciphertext
+ sealAVX2Tail512()
+ sealAVX2Tail512LoopA()
+ sealAVX2Tail512LoopB()
+}
+
+func sealSSEIntroLoop() {
+ Label("sealSSEIntroLoop")
+ MOVO(C3, tmpStore)
+ chachaQR(A0, B0, C0, D0, C3)
+ chachaQR(A1, B1, C1, D1, C3)
+ chachaQR(A2, B2, C2, D2, C3)
+ MOVO(tmpStore, C3)
+ MOVO(C1, tmpStore)
+ chachaQR(A3, B3, C3, D3, C1)
+ MOVO(tmpStore, C1)
+ shiftB0Left()
+ shiftB1Left()
+ shiftB2Left()
+ shiftB3Left()
+ shiftC0Left()
+ shiftC1Left()
+ shiftC2Left()
+ shiftC3Left()
+ shiftD0Left()
+ shiftD1Left()
+ shiftD2Left()
+ shiftD3Left()
+
+ MOVO(C3, tmpStore)
+ chachaQR(A0, B0, C0, D0, C3)
+ chachaQR(A1, B1, C1, D1, C3)
+ chachaQR(A2, B2, C2, D2, C3)
+ MOVO(tmpStore, C3)
+ MOVO(C1, tmpStore)
+ chachaQR(A3, B3, C3, D3, C1)
+ MOVO(tmpStore, C1)
+ shiftB0Right()
+ shiftB1Right()
+ shiftB2Right()
+ shiftB3Right()
+ shiftC0Right()
+ shiftC1Right()
+ shiftC2Right()
+ shiftC3Right()
+ shiftD0Right()
+ shiftD1Right()
+ shiftD2Right()
+ shiftD3Right()
+ DECQ(itr2)
+ JNE(LabelRef("sealSSEIntroLoop"))
+
+ Comment("Add in the state")
+ chacha20Constants := chacha20Constants_DATA()
+ PADDD(chacha20Constants, A0)
+ PADDD(chacha20Constants, A1)
+ PADDD(chacha20Constants, A2)
+ PADDD(chacha20Constants, A3)
+ PADDD(state1Store, B0)
+ PADDD(state1Store, B1)
+ PADDD(state1Store, B2)
+ PADDD(state1Store, B3)
+ PADDD(state2Store, C1)
+ PADDD(state2Store, C2)
+ PADDD(state2Store, C3)
+ PADDD(ctr1Store, D1)
+ PADDD(ctr2Store, D2)
+ PADDD(ctr3Store, D3)
+
+ Comment("Clamp and store the key")
+ polyClampMask := polyClampMask_DATA()
+ PAND(polyClampMask, A0)
+ MOVO(A0, rStore)
+ MOVO(B0, sStore)
+
+ Comment("Hash AAD")
+ MOVQ(NewParamAddr("ad_len", 80), itr2)
+ CALL(LabelRef("polyHashADInternal<>(SB)"))
+
+ MOVOU(Mem{Base: inp}.Offset(0*16), A0)
+ MOVOU(Mem{Base: inp}.Offset(1*16), B0)
+ MOVOU(Mem{Base: inp}.Offset(2*16), C0)
+ MOVOU(Mem{Base: inp}.Offset(3*16), D0)
+ PXOR(A0, A1)
+ PXOR(B0, B1)
+ PXOR(C0, C1)
+ PXOR(D0, D1)
+ MOVOU(A1, Mem{Base: oup}.Offset(0*16))
+ MOVOU(B1, Mem{Base: oup}.Offset(1*16))
+ MOVOU(C1, Mem{Base: oup}.Offset(2*16))
+ MOVOU(D1, Mem{Base: oup}.Offset(3*16))
+ MOVOU(Mem{Base: inp}.Offset(4*16), A0)
+ MOVOU(Mem{Base: inp}.Offset(5*16), B0)
+ MOVOU(Mem{Base: inp}.Offset(6*16), C0)
+ MOVOU(Mem{Base: inp}.Offset(7*16), D0)
+ PXOR(A0, A2)
+ PXOR(B0, B2)
+ PXOR(C0, C2)
+ PXOR(D0, D2)
+ MOVOU(A2, Mem{Base: oup}.Offset(4*16))
+ MOVOU(B2, Mem{Base: oup}.Offset(5*16))
+ MOVOU(C2, Mem{Base: oup}.Offset(6*16))
+ MOVOU(D2, Mem{Base: oup}.Offset(7*16))
+
+ MOVQ(U32(128), itr1)
+ SUBQ(Imm(128), inl)
+ LEAQ(Mem{Base: inp}.Offset(128), inp)
+
+ MOVO(A3, A1)
+ MOVO(B3, B1)
+ MOVO(C3, C1)
+ MOVO(D3, D1)
+
+ CMPQ(inl, Imm(64))
+ JBE(LabelRef("sealSSE128SealHash"))
+
+ MOVOU(Mem{Base: inp}.Offset(0*16), A0)
+ MOVOU(Mem{Base: inp}.Offset(1*16), B0)
+ MOVOU(Mem{Base: inp}.Offset(2*16), C0)
+ MOVOU(Mem{Base: inp}.Offset(3*16), D0)
+ PXOR(A0, A3)
+ PXOR(B0, B3)
+ PXOR(C0, C3)
+ PXOR(D0, D3)
+ MOVOU(A3, Mem{Base: oup}.Offset(8*16))
+ MOVOU(B3, Mem{Base: oup}.Offset(9*16))
+ MOVOU(C3, Mem{Base: oup}.Offset(10*16))
+ MOVOU(D3, Mem{Base: oup}.Offset(11*16))
+
+ ADDQ(Imm(64), itr1)
+ SUBQ(Imm(64), inl)
+ LEAQ(Mem{Base: inp}.Offset(64), inp)
+
+ MOVQ(U32(2), itr1)
+ MOVQ(U32(8), itr2)
+
+ CMPQ(inl, Imm(64))
+ JBE(LabelRef("sealSSETail64"))
+ CMPQ(inl, Imm(128))
+ JBE(LabelRef("sealSSETail128"))
+ CMPQ(inl, Imm(192))
+ JBE(LabelRef("sealSSETail192"))
+}
+
+func sealSSEMainLoop() {
+ Label("sealSSEMainLoop")
+ Comment("Load state, increment counter blocks")
+ chacha20Constants := chacha20Constants_DATA()
+ MOVO(chacha20Constants, A0)
+ MOVO(state1Store, B0)
+ MOVO(state2Store, C0)
+ MOVO(ctr3Store, D0)
+ sseIncMask := sseIncMask_DATA()
+ PADDL(sseIncMask, D0)
+ MOVO(A0, A1)
+ MOVO(B0, B1)
+ MOVO(C0, C1)
+ MOVO(D0, D1)
+ PADDL(sseIncMask, D1)
+ MOVO(A1, A2)
+ MOVO(B1, B2)
+ MOVO(C1, C2)
+ MOVO(D1, D2)
+ PADDL(sseIncMask, D2)
+ MOVO(A2, A3)
+ MOVO(B2, B3)
+ MOVO(C2, C3)
+ MOVO(D2, D3)
+ PADDL(sseIncMask, D3)
+
+ Comment("Store counters")
+ MOVO(D0, ctr0Store)
+ MOVO(D1, ctr1Store)
+ MOVO(D2, ctr2Store)
+ MOVO(D3, ctr3Store)
+
+ Label("sealSSEInnerLoop")
+ MOVO(C3, tmpStore)
+ chachaQR(A0, B0, C0, D0, C3)
+ chachaQR(A1, B1, C1, D1, C3)
+ chachaQR(A2, B2, C2, D2, C3)
+ MOVO(tmpStore, C3)
+ MOVO(C1, tmpStore)
+ chachaQR(A3, B3, C3, D3, C1)
+ MOVO(tmpStore, C1)
+ polyAdd(Mem{Base: oup}.Offset(0))
+ shiftB0Left()
+ shiftB1Left()
+ shiftB2Left()
+ shiftB3Left()
+ shiftC0Left()
+ shiftC1Left()
+ shiftC2Left()
+ shiftC3Left()
+ shiftD0Left()
+ shiftD1Left()
+ shiftD2Left()
+ shiftD3Left()
+ polyMulStage1()
+ polyMulStage2()
+ LEAQ(Mem{Base: oup}.Offset(2*8), oup)
+ MOVO(C3, tmpStore)
+ chachaQR(A0, B0, C0, D0, C3)
+ chachaQR(A1, B1, C1, D1, C3)
+ chachaQR(A2, B2, C2, D2, C3)
+ MOVO(tmpStore, C3)
+ MOVO(C1, tmpStore)
+ polyMulStage3()
+ chachaQR(A3, B3, C3, D3, C1)
+ MOVO(tmpStore, C1)
+ polyMulReduceStage()
+ shiftB0Right()
+ shiftB1Right()
+ shiftB2Right()
+ shiftB3Right()
+ shiftC0Right()
+ shiftC1Right()
+ shiftC2Right()
+ shiftC3Right()
+ shiftD0Right()
+ shiftD1Right()
+ shiftD2Right()
+ shiftD3Right()
+ DECQ(itr2)
+ JGE(LabelRef("sealSSEInnerLoop"))
+ polyAdd(Mem{Base: oup}.Offset(0))
+ polyMul()
+ LEAQ(Mem{Base: oup}.Offset(2*8), oup)
+ DECQ(itr1)
+ JG(LabelRef("sealSSEInnerLoop"))
+
+ Comment("Add in the state")
+ PADDD(chacha20Constants, A0)
+ PADDD(chacha20Constants, A1)
+ PADDD(chacha20Constants, A2)
+ PADDD(chacha20Constants, A3)
+ PADDD(state1Store, B0)
+ PADDD(state1Store, B1)
+ PADDD(state1Store, B2)
+ PADDD(state1Store, B3)
+ PADDD(state2Store, C0)
+ PADDD(state2Store, C1)
+ PADDD(state2Store, C2)
+ PADDD(state2Store, C3)
+ PADDD(ctr0Store, D0)
+ PADDD(ctr1Store, D1)
+ PADDD(ctr2Store, D2)
+ PADDD(ctr3Store, D3)
+ MOVO(D3, tmpStore)
+
+ Comment("Load - xor - store")
+ MOVOU(Mem{Base: inp}.Offset(0*16), D3)
+ PXOR(D3, A0)
+ MOVOU(Mem{Base: inp}.Offset(1*16), D3)
+ PXOR(D3, B0)
+ MOVOU(Mem{Base: inp}.Offset(2*16), D3)
+ PXOR(D3, C0)
+ MOVOU(Mem{Base: inp}.Offset(3*16), D3)
+ PXOR(D3, D0)
+ MOVOU(A0, Mem{Base: oup}.Offset(0*16))
+ MOVOU(B0, Mem{Base: oup}.Offset(1*16))
+ MOVOU(C0, Mem{Base: oup}.Offset(2*16))
+ MOVOU(D0, Mem{Base: oup}.Offset(3*16))
+ MOVO(tmpStore, D3)
+
+ MOVOU(Mem{Base: inp}.Offset(4*16), A0)
+ MOVOU(Mem{Base: inp}.Offset(5*16), B0)
+ MOVOU(Mem{Base: inp}.Offset(6*16), C0)
+ MOVOU(Mem{Base: inp}.Offset(7*16), D0)
+ PXOR(A0, A1)
+ PXOR(B0, B1)
+ PXOR(C0, C1)
+ PXOR(D0, D1)
+ MOVOU(A1, Mem{Base: oup}.Offset(4*16))
+ MOVOU(B1, Mem{Base: oup}.Offset(5*16))
+ MOVOU(C1, Mem{Base: oup}.Offset(6*16))
+ MOVOU(D1, Mem{Base: oup}.Offset(7*16))
+ MOVOU(Mem{Base: inp}.Offset(8*16), A0)
+ MOVOU(Mem{Base: inp}.Offset(9*16), B0)
+ MOVOU(Mem{Base: inp}.Offset(10*16), C0)
+ MOVOU(Mem{Base: inp}.Offset(11*16), D0)
+ PXOR(A0, A2)
+ PXOR(B0, B2)
+ PXOR(C0, C2)
+ PXOR(D0, D2)
+ MOVOU(A2, Mem{Base: oup}.Offset(8*16))
+ MOVOU(B2, Mem{Base: oup}.Offset(9*16))
+ MOVOU(C2, Mem{Base: oup}.Offset(10*16))
+ MOVOU(D2, Mem{Base: oup}.Offset(11*16))
+ ADDQ(Imm(192), inp)
+ MOVQ(U32(192), itr1)
+ SUBQ(Imm(192), inl)
+ MOVO(A3, A1)
+ MOVO(B3, B1)
+ MOVO(C3, C1)
+ MOVO(D3, D1)
+ CMPQ(inl, Imm(64))
+ JBE(LabelRef("sealSSE128SealHash"))
+ MOVOU(Mem{Base: inp}.Offset(0*16), A0)
+ MOVOU(Mem{Base: inp}.Offset(1*16), B0)
+ MOVOU(Mem{Base: inp}.Offset(2*16), C0)
+ MOVOU(Mem{Base: inp}.Offset(3*16), D0)
+ PXOR(A0, A3)
+ PXOR(B0, B3)
+ PXOR(C0, C3)
+ PXOR(D0, D3)
+ MOVOU(A3, Mem{Base: oup}.Offset(12*16))
+ MOVOU(B3, Mem{Base: oup}.Offset(13*16))
+ MOVOU(C3, Mem{Base: oup}.Offset(14*16))
+ MOVOU(D3, Mem{Base: oup}.Offset(15*16))
+ LEAQ(Mem{Base: inp}.Offset(64), inp)
+ SUBQ(Imm(64), inl)
+ MOVQ(U32(6), itr1)
+ MOVQ(U32(4), itr2)
+ CMPQ(inl, Imm(192))
+ JG(LabelRef("sealSSEMainLoop"))
+
+ MOVQ(inl, itr1)
+ TESTQ(inl, inl)
+ JE(LabelRef("sealSSE128SealHash"))
+ MOVQ(U32(6), itr1)
+ CMPQ(inl, Imm(64))
+ JBE(LabelRef("sealSSETail64"))
+ CMPQ(inl, Imm(128))
+ JBE(LabelRef("sealSSETail128"))
+ JMP(LabelRef("sealSSETail192"))
+}
+
+// ----------------------------------------------------------------------------
+// Special optimization for the last 64 bytes of plaintext
+
+// Need to encrypt up to 64 bytes - prepare single block, hash 192 or 256 bytes
+func sealSSETail64() {
+ Label("sealSSETail64")
+ chacha20Constants := chacha20Constants_DATA()
+ MOVO(chacha20Constants, A1)
+ MOVO(state1Store, B1)
+ MOVO(state2Store, C1)
+ MOVO(ctr3Store, D1)
+ sseIncMask := sseIncMask_DATA()
+ PADDL(sseIncMask, D1)
+ MOVO(D1, ctr0Store)
+}
+
+// Perform ChaCha rounds, while hashing the previously encrypted ciphertext
+func sealSSETail64LoopA() {
+ Label("sealSSETail64LoopA")
+ polyAdd(Mem{Base: oup}.Offset(0))
+ polyMul()
+ LEAQ(Mem{Base: oup}.Offset(16), oup)
+}
+
+func sealSSETail64LoopB() {
+ Label("sealSSETail64LoopB")
+ chachaQR(A1, B1, C1, D1, T1)
+ shiftB1Left()
+ shiftC1Left()
+ shiftD1Left()
+ chachaQR(A1, B1, C1, D1, T1)
+ shiftB1Right()
+ shiftC1Right()
+ shiftD1Right()
+ polyAdd(Mem{Base: oup}.Offset(0))
+ polyMul()
+ LEAQ(Mem{Base: oup}.Offset(16), oup)
+
+ DECQ(itr1)
+ JG(LabelRef("sealSSETail64LoopA"))
+
+ DECQ(itr2)
+ JGE(LabelRef("sealSSETail64LoopB"))
+ chacha20Constants := chacha20Constants_DATA()
+ PADDL(chacha20Constants, A1)
+ PADDL(state1Store, B1)
+ PADDL(state2Store, C1)
+ PADDL(ctr0Store, D1)
+
+ JMP(LabelRef("sealSSE128Seal"))
+}
+
+// ----------------------------------------------------------------------------
+// Special optimization for the last 128 bytes of plaintext
+
+// Need to encrypt up to 128 bytes - prepare two blocks, hash 192 or 256 bytes
+func sealSSETail128() {
+ Label("sealSSETail128")
+ chacha20Constants := chacha20Constants_DATA()
+ MOVO(chacha20Constants, A0)
+ MOVO(state1Store, B0)
+ MOVO(state2Store, C0)
+ MOVO(ctr3Store, D0)
+ sseIncMask := sseIncMask_DATA()
+ PADDL(sseIncMask, D0)
+ MOVO(D0, ctr0Store)
+ MOVO(A0, A1)
+ MOVO(B0, B1)
+ MOVO(C0, C1)
+ MOVO(D0, D1)
+ PADDL(sseIncMask, D1)
+ MOVO(D1, ctr1Store)
+}
+
+// Perform ChaCha rounds, while hashing the previously encrypted ciphertext
+func sealSSETail128LoopA() {
+ Label("sealSSETail128LoopA")
+ polyAdd(Mem{Base: oup}.Offset(0))
+ polyMul()
+ LEAQ(Mem{Base: oup}.Offset(16), oup)
+}
+
+func sealSSETail128LoopB() {
+ Label("sealSSETail128LoopB")
+ chachaQR(A0, B0, C0, D0, T0)
+ chachaQR(A1, B1, C1, D1, T0)
+ shiftB0Left()
+ shiftC0Left()
+ shiftD0Left()
+ shiftB1Left()
+ shiftC1Left()
+ shiftD1Left()
+ polyAdd(Mem{Base: oup}.Offset(0))
+ polyMul()
+ LEAQ(Mem{Base: oup}.Offset(16), oup)
+ chachaQR(A0, B0, C0, D0, T0)
+ chachaQR(A1, B1, C1, D1, T0)
+ shiftB0Right()
+ shiftC0Right()
+ shiftD0Right()
+ shiftB1Right()
+ shiftC1Right()
+ shiftD1Right()
+
+ DECQ(itr1)
+ JG(LabelRef("sealSSETail128LoopA"))
+
+ DECQ(itr2)
+ JGE(LabelRef("sealSSETail128LoopB"))
+
+ chacha20Constants := chacha20Constants_DATA()
+ PADDL(chacha20Constants, A0)
+ PADDL(chacha20Constants, A1)
+ PADDL(state1Store, B0)
+ PADDL(state1Store, B1)
+ PADDL(state2Store, C0)
+ PADDL(state2Store, C1)
+ PADDL(ctr0Store, D0)
+ PADDL(ctr1Store, D1)
+
+ MOVOU(Mem{Base: inp}.Offset(0*16), T0)
+ MOVOU(Mem{Base: inp}.Offset(1*16), T1)
+ MOVOU(Mem{Base: inp}.Offset(2*16), T2)
+ MOVOU(Mem{Base: inp}.Offset(3*16), T3)
+ PXOR(T0, A0)
+ PXOR(T1, B0)
+ PXOR(T2, C0)
+ PXOR(T3, D0)
+ MOVOU(A0, Mem{Base: oup}.Offset(0*16))
+ MOVOU(B0, Mem{Base: oup}.Offset(1*16))
+ MOVOU(C0, Mem{Base: oup}.Offset(2*16))
+ MOVOU(D0, Mem{Base: oup}.Offset(3*16))
+
+ MOVQ(U32(64), itr1)
+ LEAQ(Mem{Base: inp}.Offset(64), inp)
+ SUBQ(Imm(64), inl)
+
+ JMP(LabelRef("sealSSE128SealHash"))
+}
+
+// ----------------------------------------------------------------------------
+// Special optimization for the last 192 bytes of plaintext
+
+// Need to encrypt up to 192 bytes - prepare three blocks, hash 192 or 256 bytes
+func sealSSETail192() {
+ Label("sealSSETail192")
+ chacha20Constants := chacha20Constants_DATA()
+ MOVO(chacha20Constants, A0)
+ MOVO(state1Store, B0)
+ MOVO(state2Store, C0)
+ MOVO(ctr3Store, D0)
+ sseIncMask := sseIncMask_DATA()
+ PADDL(sseIncMask, D0)
+ MOVO(D0, ctr0Store)
+ MOVO(A0, A1)
+ MOVO(B0, B1)
+ MOVO(C0, C1)
+ MOVO(D0, D1)
+ PADDL(sseIncMask, D1)
+ MOVO(D1, ctr1Store)
+ MOVO(A1, A2)
+ MOVO(B1, B2)
+ MOVO(C1, C2)
+ MOVO(D1, D2)
+ PADDL(sseIncMask, D2)
+ MOVO(D2, ctr2Store)
+}
+
+// Perform ChaCha rounds, while hashing the previously encrypted ciphertext
+func sealSSETail192LoopA() {
+ Label("sealSSETail192LoopA")
+ polyAdd(Mem{Base: oup}.Offset(0))
+ polyMul()
+ LEAQ(Mem{Base: oup}.Offset(16), oup)
+}
+
+func sealSSETail192LoopB() {
+ Label("sealSSETail192LoopB")
+ chachaQR(A0, B0, C0, D0, T0)
+ chachaQR(A1, B1, C1, D1, T0)
+ chachaQR(A2, B2, C2, D2, T0)
+ shiftB0Left()
+ shiftC0Left()
+ shiftD0Left()
+ shiftB1Left()
+ shiftC1Left()
+ shiftD1Left()
+ shiftB2Left()
+ shiftC2Left()
+ shiftD2Left()
+
+ polyAdd(Mem{Base: oup}.Offset(0))
+ polyMul()
+ LEAQ(Mem{Base: oup}.Offset(16), oup)
+
+ chachaQR(A0, B0, C0, D0, T0)
+ chachaQR(A1, B1, C1, D1, T0)
+ chachaQR(A2, B2, C2, D2, T0)
+ shiftB0Right()
+ shiftC0Right()
+ shiftD0Right()
+ shiftB1Right()
+ shiftC1Right()
+ shiftD1Right()
+ shiftB2Right()
+ shiftC2Right()
+ shiftD2Right()
+
+ DECQ(itr1)
+ JG(LabelRef("sealSSETail192LoopA"))
+
+ DECQ(itr2)
+ JGE(LabelRef("sealSSETail192LoopB"))
+
+ chacha20Constants := chacha20Constants_DATA()
+ PADDL(chacha20Constants, A0)
+ PADDL(chacha20Constants, A1)
+ PADDL(chacha20Constants, A2)
+ PADDL(state1Store, B0)
+ PADDL(state1Store, B1)
+ PADDL(state1Store, B2)
+ PADDL(state2Store, C0)
+ PADDL(state2Store, C1)
+ PADDL(state2Store, C2)
+ PADDL(ctr0Store, D0)
+ PADDL(ctr1Store, D1)
+ PADDL(ctr2Store, D2)
+
+ MOVOU(Mem{Base: inp}.Offset(0*16), T0)
+ MOVOU(Mem{Base: inp}.Offset(1*16), T1)
+ MOVOU(Mem{Base: inp}.Offset(2*16), T2)
+ MOVOU(Mem{Base: inp}.Offset(3*16), T3)
+ PXOR(T0, A0)
+ PXOR(T1, B0)
+ PXOR(T2, C0)
+ PXOR(T3, D0)
+ MOVOU(A0, Mem{Base: oup}.Offset(0*16))
+ MOVOU(B0, Mem{Base: oup}.Offset(1*16))
+ MOVOU(C0, Mem{Base: oup}.Offset(2*16))
+ MOVOU(D0, Mem{Base: oup}.Offset(3*16))
+ MOVOU(Mem{Base: inp}.Offset(4*16), T0)
+ MOVOU(Mem{Base: inp}.Offset(5*16), T1)
+ MOVOU(Mem{Base: inp}.Offset(6*16), T2)
+ MOVOU(Mem{Base: inp}.Offset(7*16), T3)
+ PXOR(T0, A1)
+ PXOR(T1, B1)
+ PXOR(T2, C1)
+ PXOR(T3, D1)
+ MOVOU(A1, Mem{Base: oup}.Offset(4*16))
+ MOVOU(B1, Mem{Base: oup}.Offset(5*16))
+ MOVOU(C1, Mem{Base: oup}.Offset(6*16))
+ MOVOU(D1, Mem{Base: oup}.Offset(7*16))
+
+ MOVO(A2, A1)
+ MOVO(B2, B1)
+ MOVO(C2, C1)
+ MOVO(D2, D1)
+ MOVQ(U32(128), itr1)
+ LEAQ(Mem{Base: inp}.Offset(128), inp)
+ SUBQ(Imm(128), inl)
+
+ JMP(LabelRef("sealSSE128SealHash"))
+}
+
+// ----------------------------------------------------------------------------
+// Special seal optimization for buffers smaller than 129 bytes
+
+// For up to 128 bytes of ciphertext and 64 bytes for the poly key, we need to process three blocks
+func sealSSE128() {
+ Label("sealSSE128")
+ chacha20Constants := chacha20Constants_DATA()
+ MOVOU(chacha20Constants, A0)
+ MOVOU(Mem{Base: keyp}.Offset(1*16), B0)
+ MOVOU(Mem{Base: keyp}.Offset(2*16), C0)
+ MOVOU(Mem{Base: keyp}.Offset(3*16), D0)
+ MOVO(A0, A1)
+ MOVO(B0, B1)
+ MOVO(C0, C1)
+ MOVO(D0, D1)
+ sseIncMask := sseIncMask_DATA()
+ PADDL(sseIncMask, D1)
+ MOVO(A1, A2)
+ MOVO(B1, B2)
+ MOVO(C1, C2)
+ MOVO(D1, D2)
+ PADDL(sseIncMask, D2)
+ MOVO(B0, T1)
+ MOVO(C0, T2)
+ MOVO(D1, T3)
+ MOVQ(U32(10), itr2)
+
+ Label("sealSSE128InnerCipherLoop")
+ chachaQR(A0, B0, C0, D0, T0)
+ chachaQR(A1, B1, C1, D1, T0)
+ chachaQR(A2, B2, C2, D2, T0)
+ shiftB0Left()
+ shiftB1Left()
+ shiftB2Left()
+ shiftC0Left()
+ shiftC1Left()
+ shiftC2Left()
+ shiftD0Left()
+ shiftD1Left()
+ shiftD2Left()
+ chachaQR(A0, B0, C0, D0, T0)
+ chachaQR(A1, B1, C1, D1, T0)
+ chachaQR(A2, B2, C2, D2, T0)
+ shiftB0Right()
+ shiftB1Right()
+ shiftB2Right()
+ shiftC0Right()
+ shiftC1Right()
+ shiftC2Right()
+ shiftD0Right()
+ shiftD1Right()
+ shiftD2Right()
+ DECQ(itr2)
+ JNE(LabelRef("sealSSE128InnerCipherLoop"))
+
+ Comment("A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded")
+ PADDL(chacha20Constants, A0)
+ PADDL(chacha20Constants, A1)
+ PADDL(chacha20Constants, A2)
+ PADDL(T1, B0)
+ PADDL(T1, B1)
+ PADDL(T1, B2)
+ PADDL(T2, C1)
+ PADDL(T2, C2)
+ PADDL(T3, D1)
+ PADDL(sseIncMask, T3)
+ PADDL(T3, D2)
+ polyClampMask := polyClampMask_DATA()
+ PAND(polyClampMask, A0)
+ MOVOU(A0, rStore)
+ MOVOU(B0, sStore)
+
+ Comment("Hash")
+ MOVQ(NewParamAddr("ad_len", 80), itr2)
+ CALL(LabelRef("polyHashADInternal<>(SB)"))
+ XORQ(itr1, itr1)
+}
+
+// itr1 holds the number of bytes encrypted but not yet hashed
+func sealSSE128SealHash() {
+ Label("sealSSE128SealHash")
+ CMPQ(itr1, Imm(16))
+ JB(LabelRef("sealSSE128Seal"))
+ polyAdd(Mem{Base: oup}.Offset(0))
+ polyMul()
+
+ SUBQ(Imm(16), itr1)
+ ADDQ(Imm(16), oup)
+
+ JMP(LabelRef("sealSSE128SealHash"))
+}
+
+func sealSSE128Seal() {
+ Label("sealSSE128Seal")
+ CMPQ(inl, Imm(16))
+ JB(LabelRef("sealSSETail"))
+ SUBQ(Imm(16), inl)
+
+ Comment("Load for decryption")
+ MOVOU(Mem{Base: inp}, T0)
+ PXOR(T0, A1)
+ MOVOU(A1, Mem{Base: oup})
+ LEAQ(Mem{Base: inp}.Offset(1*16), inp)
+ LEAQ(Mem{Base: oup}.Offset(1*16), oup)
+
+ Comment("Extract for hashing")
+ MOVQ(A1, t0)
+ PSRLDQ(Imm(8), A1)
+ MOVQ(A1, t1)
+ ADDQ(t0, acc0)
+ ADCQ(t1, acc1)
+ ADCQ(Imm(1), acc2)
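+ // The ADDQ/ADCQ sequence above folds the ciphertext block into the Poly1305
+ // accumulator, with the final ADCQ adding the 2^128 padding bit that
+ // Poly1305 appends to every full 16-byte block.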
+ polyMul()
+
+ Comment("Shift the stream \"left\"")
+ MOVO(B1, A1)
+ MOVO(C1, B1)
+ MOVO(D1, C1)
+ MOVO(A2, D1)
+ MOVO(B2, A2)
+ MOVO(C2, B2)
+ MOVO(D2, C2)
+ JMP(LabelRef("sealSSE128Seal"))
+}
+
+func sealSSETail() {
+ Label("sealSSETail")
+ TESTQ(inl, inl)
+ JE(LabelRef("sealSSEFinalize"))
+
+ Comment("We can only load the PT one byte at a time to avoid read after end of buffer")
+ MOVQ(inl, itr2)
+ SHLQ(Imm(4), itr2)
+ andMask := andMask_DATA()
+ LEAQ(andMask, t0)
+ MOVQ(inl, itr1)
+ LEAQ(Mem{Base: inp, Index: inl, Scale: 1}.Offset(-1), inp)
+ XORQ(t2, t2)
+ XORQ(t3, t3)
+ XORQ(RAX, RAX)
+}
+
+func sealSSETailLoadLoop() {
+ Label("sealSSETailLoadLoop")
+ SHLQ(Imm(8), t2, t3)
+ SHLQ(Imm(8), t2)
+ // Hack to get Avo to emit:
+ // MOVB (inp), AX
+ Instruction(&ir.Instruction{Opcode: "MOVB", Operands: []Op{Mem{Base: inp}, AX}})
+ XORQ(RAX, t2)
+ LEAQ(Mem{Base: inp}.Offset(-1), inp)
+ DECQ(itr1)
+ JNE(LabelRef("sealSSETailLoadLoop"))
+ MOVQ(t2, tmpStore.Offset(0))
+ MOVQ(t3, tmpStore.Offset(8))
+ PXOR(tmpStore.Offset(0), A1)
+ MOVOU(A1, Mem{Base: oup})
+ MOVOU(Mem{Base: t0, Index: itr2, Scale: 1}.Offset(-16), T0)
+ PAND(T0, A1)
+ MOVQ(A1, t0)
+ PSRLDQ(Imm(8), A1)
+ MOVQ(A1, t1)
+ ADDQ(t0, acc0)
+ ADCQ(t1, acc1)
+ ADCQ(Imm(1), acc2)
+ polyMul()
+
+ ADDQ(inl, oup)
+}
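+
+// A rough Go sketch of what sealSSETail/sealSSETailLoadLoop assemble
+// (illustrative only, not called by the generator): the final bytes of
+// plaintext are gathered into a 16-byte little-endian block one byte at a
+// time, walking backwards from the last byte, so nothing past the end of the
+// input buffer is ever read.
+func loadPartialBlock(src []byte) (lo, hi uint64) {
+ for i := len(src) - 1; i >= 0; i-- {
+  // Shift the 128-bit accumulator left by one byte and insert src[i] at the
+  // bottom, mirroring the SHLQ/MOVB/XORQ sequence above.
+  hi = hi<<8 | lo>>56
+  lo = lo<<8 | uint64(src[i])
+ }
+ return lo, hi
+}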
+
+func sealSSEFinalize() {
+ Label("sealSSEFinalize")
+ Comment("Hash in the buffer lengths")
+ ADDQ(NewParamAddr("ad_len", 80), acc0)
+ ADCQ(NewParamAddr("src_len", 56), acc1)
+ ADCQ(Imm(1), acc2)
+ polyMul()
+
+ Comment("Final reduce")
+ MOVQ(acc0, t0)
+ MOVQ(acc1, t1)
+ MOVQ(acc2, t2)
+ SUBQ(I8(-5), acc0)
+ SBBQ(I8(-1), acc1)
+ SBBQ(Imm(3), acc2)
+ CMOVQCS(t0, acc0)
+ CMOVQCS(t1, acc1)
+ CMOVQCS(t2, acc2)
+
+ Comment("Add in the \"s\" part of the key")
+ ADDQ(sStore.Offset(0), acc0)
+ ADCQ(sStore.Offset(8), acc1)
+
+ Comment("Finally store the tag at the end of the message")
+ MOVQ(acc0, Mem{Base: oup}.Offset(0*8))
+ MOVQ(acc1, Mem{Base: oup}.Offset(1*8))
+ RET()
+}
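+
+// A minimal sketch of the finalization above (illustrative only, not called by
+// the generator, and assuming the math/bits package, which this file does not
+// import): the accumulator is reduced modulo 2^130-5 by a conditional
+// subtraction of the prime, and the "s" half of the one-time key is added
+// modulo 2^128 to form the tag.
+func poly1305Tag(acc0, acc1, acc2, s0, s1 uint64) (uint64, uint64) {
+ // t = acc - (2^130 - 5), limb by limb; the prime's 64-bit limbs are
+ // 0xFFFFFFFFFFFFFFFB, 0xFFFFFFFFFFFFFFFF, 3, matching the sign-extended
+ // -5/-1/3 immediates in the SUBQ/SBBQ chain above.
+ t0, b := bits.Sub64(acc0, 0xFFFFFFFFFFFFFFFB, 0)
+ t1, b := bits.Sub64(acc1, 0xFFFFFFFFFFFFFFFF, b)
+ _, b = bits.Sub64(acc2, 3, b)
+ if b == 0 {
+  // No borrow: acc >= 2^130-5, so keep the reduced value (the CMOVQCS case
+  // above keeps the original only when a borrow occurred).
+  acc0, acc1 = t0, t1
+ }
+ // Add the "s" half of the key; the carry out of the low 128 bits is
+ // discarded, as in the assembly.
+ tag0, c := bits.Add64(acc0, s0, 0)
+ tag1, _ := bits.Add64(acc1, s1, c)
+ return tag0, tag1
+}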
+
+// ----------------------------------------------------------------------------
+// ------------------------- AVX2 Code ----------------------------------------
+
+func chacha20Poly1305Seal_AVX2() {
+ Label("chacha20Poly1305Seal_AVX2")
+ VZEROUPPER()
+ chacha20Constants := chacha20Constants_DATA()
+ VMOVDQU(chacha20Constants, AA0)
+ VBROADCASTI128_16_R8_YMM14()
+ VBROADCASTI128_32_R8_YMM12()
+ VBROADCASTI128_48_R8_YMM4()
+ avx2InitMask := avx2InitMask_DATA()
+ VPADDD(avx2InitMask, DD0, DD0)
+
+ Comment("Special optimizations, for very short buffers")
+ CMPQ(inl, U32(192))
+ JBE(LabelRef("seal192AVX2"))
+ CMPQ(inl, U32(320))
+ JBE(LabelRef("seal320AVX2"))
+
+ Comment("For the general key prepare the key first - as a byproduct we have 64 bytes of cipher stream")
+ VMOVDQA(AA0, AA1)
+ VMOVDQA(AA0, AA2)
+ VMOVDQA(AA0, AA3)
+ VMOVDQA(BB0, BB1)
+ VMOVDQA(BB0, BB2)
+ VMOVDQA(BB0, BB3)
+ VMOVDQA(BB0, state1StoreAVX2)
+ VMOVDQA(CC0, CC1)
+ VMOVDQA(CC0, CC2)
+ VMOVDQA(CC0, CC3)
+ VMOVDQA(CC0, state2StoreAVX2)
+ avx2IncMask := avx2IncMask_DATA()
+ VPADDD(avx2IncMask, DD0, DD1)
+ VMOVDQA(DD0, ctr0StoreAVX2)
+ VPADDD(avx2IncMask, DD1, DD2)
+ VMOVDQA(DD1, ctr1StoreAVX2)
+ VPADDD(avx2IncMask, DD2, DD3)
+ VMOVDQA(DD2, ctr2StoreAVX2)
+ VMOVDQA(DD3, ctr3StoreAVX2)
+ MOVQ(U32(10), itr2)
+}
+
+func sealAVX2IntroLoop() {
+ Label("sealAVX2IntroLoop")
+ VMOVDQA(CC3, tmpStoreAVX2)
+ chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3)
+ chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3)
+ chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
+ VMOVDQA(tmpStoreAVX2, CC3)
+ VMOVDQA(CC1, tmpStoreAVX2)
+ chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
+ VMOVDQA(tmpStoreAVX2, CC1)
+
+ VPALIGNR(Imm(4), BB0, BB0, BB0)
+ VPALIGNR(Imm(8), CC0, CC0, CC0)
+ VPALIGNR(Imm(12), DD0, DD0, DD0)
+ VPALIGNR(Imm(4), BB1, BB1, BB1)
+ VPALIGNR(Imm(8), CC1, CC1, CC1)
+ VPALIGNR(Imm(12), DD1, DD1, DD1)
+ VPALIGNR(Imm(4), BB2, BB2, BB2)
+ VPALIGNR(Imm(8), CC2, CC2, CC2)
+ VPALIGNR(Imm(12), DD2, DD2, DD2)
+ VPALIGNR(Imm(4), BB3, BB3, BB3)
+ VPALIGNR(Imm(8), CC3, CC3, CC3)
+ VPALIGNR(Imm(12), DD3, DD3, DD3)
+
+ VMOVDQA(CC3, tmpStoreAVX2)
+ chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3)
+ chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3)
+ chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
+ VMOVDQA(tmpStoreAVX2, CC3)
+ VMOVDQA(CC1, tmpStoreAVX2)
+ chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
+ VMOVDQA(tmpStoreAVX2, CC1)
+
+ VPALIGNR(Imm(12), BB0, BB0, BB0)
+ VPALIGNR(Imm(8), CC0, CC0, CC0)
+ VPALIGNR(Imm(4), DD0, DD0, DD0)
+ VPALIGNR(Imm(12), BB1, BB1, BB1)
+ VPALIGNR(Imm(8), CC1, CC1, CC1)
+ VPALIGNR(Imm(4), DD1, DD1, DD1)
+ VPALIGNR(Imm(12), BB2, BB2, BB2)
+ VPALIGNR(Imm(8), CC2, CC2, CC2)
+ VPALIGNR(Imm(4), DD2, DD2, DD2)
+ VPALIGNR(Imm(12), BB3, BB3, BB3)
+ VPALIGNR(Imm(8), CC3, CC3, CC3)
+ VPALIGNR(Imm(4), DD3, DD3, DD3)
+ DECQ(itr2)
+ JNE(LabelRef("sealAVX2IntroLoop"))
+
+ chacha20Constants := chacha20Constants_DATA()
+ VPADDD(chacha20Constants, AA0, AA0)
+ VPADDD(chacha20Constants, AA1, AA1)
+ VPADDD(chacha20Constants, AA2, AA2)
+ VPADDD(chacha20Constants, AA3, AA3)
+ VPADDD(state1StoreAVX2, BB0, BB0)
+ VPADDD(state1StoreAVX2, BB1, BB1)
+ VPADDD(state1StoreAVX2, BB2, BB2)
+ VPADDD(state1StoreAVX2, BB3, BB3)
+ VPADDD(state2StoreAVX2, CC0, CC0)
+ VPADDD(state2StoreAVX2, CC1, CC1)
+ VPADDD(state2StoreAVX2, CC2, CC2)
+ VPADDD(state2StoreAVX2, CC3, CC3)
+ VPADDD(ctr0StoreAVX2, DD0, DD0)
+ VPADDD(ctr1StoreAVX2, DD1, DD1)
+ VPADDD(ctr2StoreAVX2, DD2, DD2)
+ VPADDD(ctr3StoreAVX2, DD3, DD3)
+
+ VPERM2I128(Imm(0x13), CC0, DD0, CC0)
+ VPERM2I128(Imm(0x02), AA0, BB0, DD0)
+ VPERM2I128(Imm(0x13), AA0, BB0, AA0)
+
+ Comment("Clamp and store poly key")
+ polyClampMask := polyClampMask_DATA()
+ VPAND(polyClampMask, DD0, DD0)
+ VMOVDQA(DD0, rsStoreAVX2)
+
+ Comment("Hash AD")
+ MOVQ(NewParamAddr("ad_len", 80), itr2)
+ CALL(LabelRef("polyHashADInternal<>(SB)"))
+
+ Comment("Can store at least 320 bytes")
+ VPXOR(Mem{Base: inp}.Offset(0*32), AA0, AA0)
+ VPXOR(Mem{Base: inp}.Offset(1*32), CC0, CC0)
+ VMOVDQU(AA0, Mem{Base: oup}.Offset(0*32))
+ VMOVDQU(CC0, Mem{Base: oup}.Offset(1*32))
+
+ VPERM2I128(Imm(0x02), AA1, BB1, AA0)
+ VPERM2I128(Imm(0x02), CC1, DD1, BB0)
+ VPERM2I128(Imm(0x13), AA1, BB1, CC0)
+ VPERM2I128(Imm(0x13), CC1, DD1, DD0)
+ VPXOR(Mem{Base: inp}.Offset(2*32), AA0, AA0)
+ VPXOR(Mem{Base: inp}.Offset(3*32), BB0, BB0)
+ VPXOR(Mem{Base: inp}.Offset(4*32), CC0, CC0)
+ VPXOR(Mem{Base: inp}.Offset(5*32), DD0, DD0)
+ VMOVDQU(AA0, Mem{Base: oup}.Offset(2*32))
+ VMOVDQU(BB0, Mem{Base: oup}.Offset(3*32))
+ VMOVDQU(CC0, Mem{Base: oup}.Offset(4*32))
+ VMOVDQU(DD0, Mem{Base: oup}.Offset(5*32))
+ VPERM2I128(Imm(0x02), AA2, BB2, AA0)
+ VPERM2I128(Imm(0x02), CC2, DD2, BB0)
+ VPERM2I128(Imm(0x13), AA2, BB2, CC0)
+ VPERM2I128(Imm(0x13), CC2, DD2, DD0)
+ VPXOR(Mem{Base: inp}.Offset(6*32), AA0, AA0)
+ VPXOR(Mem{Base: inp}.Offset(7*32), BB0, BB0)
+ VPXOR(Mem{Base: inp}.Offset(8*32), CC0, CC0)
+ VPXOR(Mem{Base: inp}.Offset(9*32), DD0, DD0)
+ VMOVDQU(AA0, Mem{Base: oup}.Offset(6*32))
+ VMOVDQU(BB0, Mem{Base: oup}.Offset(7*32))
+ VMOVDQU(CC0, Mem{Base: oup}.Offset(8*32))
+ VMOVDQU(DD0, Mem{Base: oup}.Offset(9*32))
+
+ MOVQ(U32(320), itr1)
+ SUBQ(U32(320), inl)
+ LEAQ(Mem{Base: inp}.Offset(320), inp)
+
+ VPERM2I128(Imm(0x02), AA3, BB3, AA0)
+ VPERM2I128(Imm(0x02), CC3, DD3, BB0)
+ VPERM2I128(Imm(0x13), AA3, BB3, CC0)
+ VPERM2I128(Imm(0x13), CC3, DD3, DD0)
+ CMPQ(inl, Imm(128))
+ JBE(LabelRef("sealAVX2SealHash"))
+
+ VPXOR(Mem{Base: inp}.Offset(0*32), AA0, AA0)
+ VPXOR(Mem{Base: inp}.Offset(1*32), BB0, BB0)
+ VPXOR(Mem{Base: inp}.Offset(2*32), CC0, CC0)
+ VPXOR(Mem{Base: inp}.Offset(3*32), DD0, DD0)
+ VMOVDQU(AA0, Mem{Base: oup}.Offset(10*32))
+ VMOVDQU(BB0, Mem{Base: oup}.Offset(11*32))
+ VMOVDQU(CC0, Mem{Base: oup}.Offset(12*32))
+ VMOVDQU(DD0, Mem{Base: oup}.Offset(13*32))
+ SUBQ(Imm(128), inl)
+ LEAQ(Mem{Base: inp}.Offset(128), inp)
+
+ MOVQ(U32(8), itr1)
+ MOVQ(U32(2), itr2)
+
+ CMPQ(inl, Imm(128))
+ JBE(LabelRef("sealAVX2Tail128"))
+ CMPQ(inl, U32(256))
+ JBE(LabelRef("sealAVX2Tail256"))
+ CMPQ(inl, U32(384))
+ JBE(LabelRef("sealAVX2Tail384"))
+ CMPQ(inl, U32(512))
+ JBE(LabelRef("sealAVX2Tail512"))
+
+ Comment("We have 448 bytes to hash, but main loop hashes 512 bytes at a time - perform some rounds, before the main loop")
+ VMOVDQA(chacha20Constants, AA0)
+ VMOVDQA(AA0, AA1)
+ VMOVDQA(AA0, AA2)
+ VMOVDQA(AA0, AA3)
+ VMOVDQA(state1StoreAVX2, BB0)
+ VMOVDQA(BB0, BB1)
+ VMOVDQA(BB0, BB2)
+ VMOVDQA(BB0, BB3)
+ VMOVDQA(state2StoreAVX2, CC0)
+ VMOVDQA(CC0, CC1)
+ VMOVDQA(CC0, CC2)
+ VMOVDQA(CC0, CC3)
+ VMOVDQA(ctr3StoreAVX2, DD0)
+ avx2IncMask := avx2IncMask_DATA()
+ VPADDD(avx2IncMask, DD0, DD0)
+ VPADDD(avx2IncMask, DD0, DD1)
+ VPADDD(avx2IncMask, DD1, DD2)
+ VPADDD(avx2IncMask, DD2, DD3)
+ VMOVDQA(DD0, ctr0StoreAVX2)
+ VMOVDQA(DD1, ctr1StoreAVX2)
+ VMOVDQA(DD2, ctr2StoreAVX2)
+ VMOVDQA(DD3, ctr3StoreAVX2)
+
+ VMOVDQA(CC3, tmpStoreAVX2)
+ chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3)
+ chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3)
+ chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
+ VMOVDQA(tmpStoreAVX2, CC3)
+ VMOVDQA(CC1, tmpStoreAVX2)
+ chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
+ VMOVDQA(tmpStoreAVX2, CC1)
+
+ VPALIGNR(Imm(4), BB0, BB0, BB0)
+ VPALIGNR(Imm(8), CC0, CC0, CC0)
+ VPALIGNR(Imm(12), DD0, DD0, DD0)
+ VPALIGNR(Imm(4), BB1, BB1, BB1)
+ VPALIGNR(Imm(8), CC1, CC1, CC1)
+ VPALIGNR(Imm(12), DD1, DD1, DD1)
+ VPALIGNR(Imm(4), BB2, BB2, BB2)
+ VPALIGNR(Imm(8), CC2, CC2, CC2)
+ VPALIGNR(Imm(12), DD2, DD2, DD2)
+ VPALIGNR(Imm(4), BB3, BB3, BB3)
+ VPALIGNR(Imm(8), CC3, CC3, CC3)
+ VPALIGNR(Imm(12), DD3, DD3, DD3)
+
+ VMOVDQA(CC3, tmpStoreAVX2)
+ chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3)
+ chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3)
+ chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
+ VMOVDQA(tmpStoreAVX2, CC3)
+ VMOVDQA(CC1, tmpStoreAVX2)
+ chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
+ VMOVDQA(tmpStoreAVX2, CC1)
+
+ VPALIGNR(Imm(12), BB0, BB0, BB0)
+ VPALIGNR(Imm(8), CC0, CC0, CC0)
+ VPALIGNR(Imm(4), DD0, DD0, DD0)
+ VPALIGNR(Imm(12), BB1, BB1, BB1)
+ VPALIGNR(Imm(8), CC1, CC1, CC1)
+ VPALIGNR(Imm(4), DD1, DD1, DD1)
+ VPALIGNR(Imm(12), BB2, BB2, BB2)
+ VPALIGNR(Imm(8), CC2, CC2, CC2)
+ VPALIGNR(Imm(4), DD2, DD2, DD2)
+ VPALIGNR(Imm(12), BB3, BB3, BB3)
+ VPALIGNR(Imm(8), CC3, CC3, CC3)
+ VPALIGNR(Imm(4), DD3, DD3, DD3)
+ VPADDD(BB0, AA0, AA0)
+ VPADDD(BB1, AA1, AA1)
+ VPADDD(BB2, AA2, AA2)
+ VPADDD(BB3, AA3, AA3)
+ VPXOR(AA0, DD0, DD0)
+ VPXOR(AA1, DD1, DD1)
+ VPXOR(AA2, DD2, DD2)
+ VPXOR(AA3, DD3, DD3)
+ rol16 := rol16_DATA()
+ VPSHUFB(rol16, DD0, DD0)
+ VPSHUFB(rol16, DD1, DD1)
+ VPSHUFB(rol16, DD2, DD2)
+ VPSHUFB(rol16, DD3, DD3)
+ VPADDD(DD0, CC0, CC0)
+ VPADDD(DD1, CC1, CC1)
+ VPADDD(DD2, CC2, CC2)
+ VPADDD(DD3, CC3, CC3)
+ VPXOR(CC0, BB0, BB0)
+ VPXOR(CC1, BB1, BB1)
+ VPXOR(CC2, BB2, BB2)
+ VPXOR(CC3, BB3, BB3)
+ VMOVDQA(CC3, tmpStoreAVX2)
+ VPSLLD(Imm(12), BB0, CC3)
+ VPSRLD(Imm(20), BB0, BB0)
+ VPXOR(CC3, BB0, BB0)
+ VPSLLD(Imm(12), BB1, CC3)
+ VPSRLD(Imm(20), BB1, BB1)
+ VPXOR(CC3, BB1, BB1)
+ VPSLLD(Imm(12), BB2, CC3)
+ VPSRLD(Imm(20), BB2, BB2)
+ VPXOR(CC3, BB2, BB2)
+ VPSLLD(Imm(12), BB3, CC3)
+ VPSRLD(Imm(20), BB3, BB3)
+ VPXOR(CC3, BB3, BB3)
+ VMOVDQA(tmpStoreAVX2, CC3)
+
+ SUBQ(Imm(16), oup) // Adjust the pointer
+ MOVQ(U32(9), itr1)
+ JMP(LabelRef("sealAVX2InternalLoopStart"))
+}
+
+// Load state, increment counter blocks, store the incremented counters
+func sealAVX2MainLoop() {
+ Label("sealAVX2MainLoop")
+ chacha20Constants := chacha20Constants_DATA()
+ VMOVDQU(chacha20Constants, AA0)
+ VMOVDQA(AA0, AA1)
+ VMOVDQA(AA0, AA2)
+ VMOVDQA(AA0, AA3)
+ VMOVDQA(state1StoreAVX2, BB0)
+ VMOVDQA(BB0, BB1)
+ VMOVDQA(BB0, BB2)
+ VMOVDQA(BB0, BB3)
+ VMOVDQA(state2StoreAVX2, CC0)
+ VMOVDQA(CC0, CC1)
+ VMOVDQA(CC0, CC2)
+ VMOVDQA(CC0, CC3)
+ VMOVDQA(ctr3StoreAVX2, DD0)
+ avx2IncMask := avx2IncMask_DATA()
+ VPADDD(avx2IncMask, DD0, DD0)
+ VPADDD(avx2IncMask, DD0, DD1)
+ VPADDD(avx2IncMask, DD1, DD2)
+ VPADDD(avx2IncMask, DD2, DD3)
+ VMOVDQA(DD0, ctr0StoreAVX2)
+ VMOVDQA(DD1, ctr1StoreAVX2)
+ VMOVDQA(DD2, ctr2StoreAVX2)
+ VMOVDQA(DD3, ctr3StoreAVX2)
+ MOVQ(U32(10), itr1)
+}
+
+func sealAVX2InternalLoop() {
+ Label("sealAVX2InternalLoop")
+ polyAdd(Mem{Base: oup}.Offset(0 * 8))
+ VPADDD(BB0, AA0, AA0)
+ VPADDD(BB1, AA1, AA1)
+ VPADDD(BB2, AA2, AA2)
+ VPADDD(BB3, AA3, AA3)
+ polyMulStage1_AVX2()
+ VPXOR(AA0, DD0, DD0)
+ VPXOR(AA1, DD1, DD1)
+ VPXOR(AA2, DD2, DD2)
+ VPXOR(AA3, DD3, DD3)
+ rol16 := rol16_DATA()
+ VPSHUFB(rol16, DD0, DD0)
+ VPSHUFB(rol16, DD1, DD1)
+ VPSHUFB(rol16, DD2, DD2)
+ VPSHUFB(rol16, DD3, DD3)
+ polyMulStage2_AVX2()
+ VPADDD(DD0, CC0, CC0)
+ VPADDD(DD1, CC1, CC1)
+ VPADDD(DD2, CC2, CC2)
+ VPADDD(DD3, CC3, CC3)
+ VPXOR(CC0, BB0, BB0)
+ VPXOR(CC1, BB1, BB1)
+ VPXOR(CC2, BB2, BB2)
+ VPXOR(CC3, BB3, BB3)
+ polyMulStage3_AVX2()
+ VMOVDQA(CC3, tmpStoreAVX2)
+ VPSLLD(Imm(12), BB0, CC3)
+ VPSRLD(Imm(20), BB0, BB0)
+ VPXOR(CC3, BB0, BB0)
+ VPSLLD(Imm(12), BB1, CC3)
+ VPSRLD(Imm(20), BB1, BB1)
+ VPXOR(CC3, BB1, BB1)
+ VPSLLD(Imm(12), BB2, CC3)
+ VPSRLD(Imm(20), BB2, BB2)
+ VPXOR(CC3, BB2, BB2)
+ VPSLLD(Imm(12), BB3, CC3)
+ VPSRLD(Imm(20), BB3, BB3)
+ VPXOR(CC3, BB3, BB3)
+ VMOVDQA(tmpStoreAVX2, CC3)
+ polyMulReduceStage()
+}
+
+func sealAVX2InternalLoopStart() {
+ Label("sealAVX2InternalLoopStart")
+ VPADDD(BB0, AA0, AA0)
+ VPADDD(BB1, AA1, AA1)
+ VPADDD(BB2, AA2, AA2)
+ VPADDD(BB3, AA3, AA3)
+ VPXOR(AA0, DD0, DD0)
+ VPXOR(AA1, DD1, DD1)
+ VPXOR(AA2, DD2, DD2)
+ VPXOR(AA3, DD3, DD3)
+ rol8 := rol8_DATA()
+ VPSHUFB(rol8, DD0, DD0)
+ VPSHUFB(rol8, DD1, DD1)
+ VPSHUFB(rol8, DD2, DD2)
+ VPSHUFB(rol8, DD3, DD3)
+ polyAdd(Mem{Base: oup}.Offset(2 * 8))
+ VPADDD(DD0, CC0, CC0)
+ VPADDD(DD1, CC1, CC1)
+ VPADDD(DD2, CC2, CC2)
+ VPADDD(DD3, CC3, CC3)
+ polyMulStage1_AVX2()
+ VPXOR(CC0, BB0, BB0)
+ VPXOR(CC1, BB1, BB1)
+ VPXOR(CC2, BB2, BB2)
+ VPXOR(CC3, BB3, BB3)
+ VMOVDQA(CC3, tmpStoreAVX2)
+ VPSLLD(Imm(7), BB0, CC3)
+ VPSRLD(Imm(25), BB0, BB0)
+ VPXOR(CC3, BB0, BB0)
+ VPSLLD(Imm(7), BB1, CC3)
+ VPSRLD(Imm(25), BB1, BB1)
+ VPXOR(CC3, BB1, BB1)
+ VPSLLD(Imm(7), BB2, CC3)
+ VPSRLD(Imm(25), BB2, BB2)
+ VPXOR(CC3, BB2, BB2)
+ VPSLLD(Imm(7), BB3, CC3)
+ VPSRLD(Imm(25), BB3, BB3)
+ VPXOR(CC3, BB3, BB3)
+ VMOVDQA(tmpStoreAVX2, CC3)
+ polyMulStage2_AVX2()
+ VPALIGNR(Imm(4), BB0, BB0, BB0)
+ VPALIGNR(Imm(4), BB1, BB1, BB1)
+ VPALIGNR(Imm(4), BB2, BB2, BB2)
+ VPALIGNR(Imm(4), BB3, BB3, BB3)
+ VPALIGNR(Imm(8), CC0, CC0, CC0)
+ VPALIGNR(Imm(8), CC1, CC1, CC1)
+ VPALIGNR(Imm(8), CC2, CC2, CC2)
+ VPALIGNR(Imm(8), CC3, CC3, CC3)
+ VPALIGNR(Imm(12), DD0, DD0, DD0)
+ VPALIGNR(Imm(12), DD1, DD1, DD1)
+ VPALIGNR(Imm(12), DD2, DD2, DD2)
+ VPALIGNR(Imm(12), DD3, DD3, DD3)
+ VPADDD(BB0, AA0, AA0)
+ VPADDD(BB1, AA1, AA1)
+ VPADDD(BB2, AA2, AA2)
+ VPADDD(BB3, AA3, AA3)
+ polyMulStage3_AVX2()
+ VPXOR(AA0, DD0, DD0)
+ VPXOR(AA1, DD1, DD1)
+ VPXOR(AA2, DD2, DD2)
+ VPXOR(AA3, DD3, DD3)
+ rol16 := rol16_DATA()
+ VPSHUFB(rol16, DD0, DD0)
+ VPSHUFB(rol16, DD1, DD1)
+ VPSHUFB(rol16, DD2, DD2)
+ VPSHUFB(rol16, DD3, DD3)
+ polyMulReduceStage()
+ VPADDD(DD0, CC0, CC0)
+ VPADDD(DD1, CC1, CC1)
+ VPADDD(DD2, CC2, CC2)
+ VPADDD(DD3, CC3, CC3)
+ VPXOR(CC0, BB0, BB0)
+ VPXOR(CC1, BB1, BB1)
+ VPXOR(CC2, BB2, BB2)
+ VPXOR(CC3, BB3, BB3)
+ polyAdd(Mem{Base: oup}.Offset(4 * 8))
+ LEAQ(Mem{Base: oup}.Offset(6*8), oup)
+ VMOVDQA(CC3, tmpStoreAVX2)
+ VPSLLD(Imm(12), BB0, CC3)
+ VPSRLD(Imm(20), BB0, BB0)
+ VPXOR(CC3, BB0, BB0)
+ VPSLLD(Imm(12), BB1, CC3)
+ VPSRLD(Imm(20), BB1, BB1)
+ VPXOR(CC3, BB1, BB1)
+ VPSLLD(Imm(12), BB2, CC3)
+ VPSRLD(Imm(20), BB2, BB2)
+ VPXOR(CC3, BB2, BB2)
+ VPSLLD(Imm(12), BB3, CC3)
+ VPSRLD(Imm(20), BB3, BB3)
+ VPXOR(CC3, BB3, BB3)
+ VMOVDQA(tmpStoreAVX2, CC3)
+ polyMulStage1_AVX2()
+ VPADDD(BB0, AA0, AA0)
+ VPADDD(BB1, AA1, AA1)
+ VPADDD(BB2, AA2, AA2)
+ VPADDD(BB3, AA3, AA3)
+ VPXOR(AA0, DD0, DD0)
+ VPXOR(AA1, DD1, DD1)
+ VPXOR(AA2, DD2, DD2)
+ VPXOR(AA3, DD3, DD3)
+ polyMulStage2_AVX2()
+ VPSHUFB(rol8, DD0, DD0)
+ VPSHUFB(rol8, DD1, DD1)
+ VPSHUFB(rol8, DD2, DD2)
+ VPSHUFB(rol8, DD3, DD3)
+ VPADDD(DD0, CC0, CC0)
+ VPADDD(DD1, CC1, CC1)
+ VPADDD(DD2, CC2, CC2)
+ VPADDD(DD3, CC3, CC3)
+ polyMulStage3_AVX2()
+ VPXOR(CC0, BB0, BB0)
+ VPXOR(CC1, BB1, BB1)
+ VPXOR(CC2, BB2, BB2)
+ VPXOR(CC3, BB3, BB3)
+ VMOVDQA(CC3, tmpStoreAVX2)
+ VPSLLD(Imm(7), BB0, CC3)
+ VPSRLD(Imm(25), BB0, BB0)
+ VPXOR(CC3, BB0, BB0)
+ VPSLLD(Imm(7), BB1, CC3)
+ VPSRLD(Imm(25), BB1, BB1)
+ VPXOR(CC3, BB1, BB1)
+ VPSLLD(Imm(7), BB2, CC3)
+ VPSRLD(Imm(25), BB2, BB2)
+ VPXOR(CC3, BB2, BB2)
+ VPSLLD(Imm(7), BB3, CC3)
+ VPSRLD(Imm(25), BB3, BB3)
+ VPXOR(CC3, BB3, BB3)
+ VMOVDQA(tmpStoreAVX2, CC3)
+ polyMulReduceStage()
+ VPALIGNR(Imm(12), BB0, BB0, BB0)
+ VPALIGNR(Imm(12), BB1, BB1, BB1)
+ VPALIGNR(Imm(12), BB2, BB2, BB2)
+ VPALIGNR(Imm(12), BB3, BB3, BB3)
+ VPALIGNR(Imm(8), CC0, CC0, CC0)
+ VPALIGNR(Imm(8), CC1, CC1, CC1)
+ VPALIGNR(Imm(8), CC2, CC2, CC2)
+ VPALIGNR(Imm(8), CC3, CC3, CC3)
+ VPALIGNR(Imm(4), DD0, DD0, DD0)
+ VPALIGNR(Imm(4), DD1, DD1, DD1)
+ VPALIGNR(Imm(4), DD2, DD2, DD2)
+ VPALIGNR(Imm(4), DD3, DD3, DD3)
+ DECQ(itr1)
+ JNE(LabelRef("sealAVX2InternalLoop"))
+
+ chacha20Constants := chacha20Constants_DATA()
+ VPADDD(chacha20Constants, AA0, AA0)
+ VPADDD(chacha20Constants, AA1, AA1)
+ VPADDD(chacha20Constants, AA2, AA2)
+ VPADDD(chacha20Constants, AA3, AA3)
+ VPADDD(state1StoreAVX2, BB0, BB0)
+ VPADDD(state1StoreAVX2, BB1, BB1)
+ VPADDD(state1StoreAVX2, BB2, BB2)
+ VPADDD(state1StoreAVX2, BB3, BB3)
+ VPADDD(state2StoreAVX2, CC0, CC0)
+ VPADDD(state2StoreAVX2, CC1, CC1)
+ VPADDD(state2StoreAVX2, CC2, CC2)
+ VPADDD(state2StoreAVX2, CC3, CC3)
+ VPADDD(ctr0StoreAVX2, DD0, DD0)
+ VPADDD(ctr1StoreAVX2, DD1, DD1)
+ VPADDD(ctr2StoreAVX2, DD2, DD2)
+ VPADDD(ctr3StoreAVX2, DD3, DD3)
+ VMOVDQA(CC3, tmpStoreAVX2)
+
+ Comment("We only hashed 480 of the 512 bytes available - hash the remaining 32 here")
+ polyAdd(Mem{Base: oup}.Offset(0 * 8))
+ polyMulAVX2()
+ LEAQ(Mem{Base: oup}.Offset(4*8), oup)
+ VPERM2I128(Imm(0x02), AA0, BB0, CC3)
+ VPERM2I128(Imm(0x13), AA0, BB0, BB0)
+ VPERM2I128(Imm(0x02), CC0, DD0, AA0)
+ VPERM2I128(Imm(0x13), CC0, DD0, CC0)
+ VPXOR(Mem{Base: inp}.Offset(0*32), CC3, CC3)
+ VPXOR(Mem{Base: inp}.Offset(1*32), AA0, AA0)
+ VPXOR(Mem{Base: inp}.Offset(2*32), BB0, BB0)
+ VPXOR(Mem{Base: inp}.Offset(3*32), CC0, CC0)
+ VMOVDQU(CC3, Mem{Base: oup}.Offset(0*32))
+ VMOVDQU(AA0, Mem{Base: oup}.Offset(1*32))
+ VMOVDQU(BB0, Mem{Base: oup}.Offset(2*32))
+ VMOVDQU(CC0, Mem{Base: oup}.Offset(3*32))
+ VPERM2I128(Imm(0x02), AA1, BB1, AA0)
+ VPERM2I128(Imm(0x02), CC1, DD1, BB0)
+ VPERM2I128(Imm(0x13), AA1, BB1, CC0)
+ VPERM2I128(Imm(0x13), CC1, DD1, DD0)
+ VPXOR(Mem{Base: inp}.Offset(4*32), AA0, AA0)
+ VPXOR(Mem{Base: inp}.Offset(5*32), BB0, BB0)
+ VPXOR(Mem{Base: inp}.Offset(6*32), CC0, CC0)
+ VPXOR(Mem{Base: inp}.Offset(7*32), DD0, DD0)
+ VMOVDQU(AA0, Mem{Base: oup}.Offset(4*32))
+ VMOVDQU(BB0, Mem{Base: oup}.Offset(5*32))
+ VMOVDQU(CC0, Mem{Base: oup}.Offset(6*32))
+ VMOVDQU(DD0, Mem{Base: oup}.Offset(7*32))
+
+ Comment("and here")
+ polyAdd(Mem{Base: oup}.Offset(-2 * 8))
+ polyMulAVX2()
+ VPERM2I128(Imm(0x02), AA2, BB2, AA0)
+ VPERM2I128(Imm(0x02), CC2, DD2, BB0)
+ VPERM2I128(Imm(0x13), AA2, BB2, CC0)
+ VPERM2I128(Imm(0x13), CC2, DD2, DD0)
+ VPXOR(Mem{Base: inp}.Offset(8*32), AA0, AA0)
+ VPXOR(Mem{Base: inp}.Offset(9*32), BB0, BB0)
+ VPXOR(Mem{Base: inp}.Offset(10*32), CC0, CC0)
+ VPXOR(Mem{Base: inp}.Offset(11*32), DD0, DD0)
+ VMOVDQU(AA0, Mem{Base: oup}.Offset(8*32))
+ VMOVDQU(BB0, Mem{Base: oup}.Offset(9*32))
+ VMOVDQU(CC0, Mem{Base: oup}.Offset(10*32))
+ VMOVDQU(DD0, Mem{Base: oup}.Offset(11*32))
+ VPERM2I128(Imm(0x02), AA3, BB3, AA0)
+ VPERM2I128(Imm(0x02), tmpStoreAVX2, DD3, BB0)
+ VPERM2I128(Imm(0x13), AA3, BB3, CC0)
+ VPERM2I128(Imm(0x13), tmpStoreAVX2, DD3, DD0)
+ VPXOR(Mem{Base: inp}.Offset(12*32), AA0, AA0)
+ VPXOR(Mem{Base: inp}.Offset(13*32), BB0, BB0)
+ VPXOR(Mem{Base: inp}.Offset(14*32), CC0, CC0)
+ VPXOR(Mem{Base: inp}.Offset(15*32), DD0, DD0)
+ VMOVDQU(AA0, Mem{Base: oup}.Offset(12*32))
+ VMOVDQU(BB0, Mem{Base: oup}.Offset(13*32))
+ VMOVDQU(CC0, Mem{Base: oup}.Offset(14*32))
+ VMOVDQU(DD0, Mem{Base: oup}.Offset(15*32))
+ LEAQ(Mem{Base: inp}.Offset(32*16), inp)
+ SUBQ(U32(32*16), inl)
+ CMPQ(inl, U32(512))
+ JG(LabelRef("sealAVX2MainLoop"))
+
+ Comment("Tail can only hash 480 bytes")
+ polyAdd(Mem{Base: oup}.Offset(0 * 8))
+ polyMulAVX2()
+ polyAdd(Mem{Base: oup}.Offset(2 * 8))
+ polyMulAVX2()
+ LEAQ(Mem{Base: oup}.Offset(32), oup)
+
+ MOVQ(U32(10), itr1)
+ MOVQ(U32(0), itr2)
+ CMPQ(inl, Imm(128))
+ JBE(LabelRef("sealAVX2Tail128"))
+ CMPQ(inl, U32(256))
+ JBE(LabelRef("sealAVX2Tail256"))
+ CMPQ(inl, U32(384))
+ JBE(LabelRef("sealAVX2Tail384"))
+ JMP(LabelRef("sealAVX2Tail512"))
+}
+
+// ----------------------------------------------------------------------------
+// Special optimization for buffers smaller than 193 bytes
+
+// For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks
+func seal192AVX2() {
+ Label("seal192AVX2")
+ VMOVDQA(AA0, AA1)
+ VMOVDQA(BB0, BB1)
+ VMOVDQA(CC0, CC1)
+ avx2IncMask := avx2IncMask_DATA()
+ VPADDD(avx2IncMask, DD0, DD1)
+ VMOVDQA(AA0, AA2)
+ VMOVDQA(BB0, BB2)
+ VMOVDQA(CC0, CC2)
+ VMOVDQA(DD0, DD2)
+ VMOVDQA(DD1, TT3)
+ MOVQ(U32(10), itr2)
+}
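The four-block figure in the comment above follows from reserving one 64-byte ChaCha20 block for the Poly1305 key and rounding the remaining keystream up to whole blocks; a minimal sketch of that arithmetic (blocksNeeded is an illustrative name, not a symbol in this file):

	// blocksNeeded returns how many 64-byte ChaCha20 blocks the short-buffer
	// seal paths generate: one block for the Poly1305 key plus enough keystream
	// to cover the plaintext, rounded up to a whole block.
	func blocksNeeded(plaintextLen int) int {
		const blockSize = 64
		return 1 + (plaintextLen+blockSize-1)/blockSize
	}

	// blocksNeeded(192) == 4 (seal192AVX2); blocksNeeded(320) == 6 (seal320AVX2).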
+
+func sealAVX2192InnerCipherLoop() {
+ Label("sealAVX2192InnerCipherLoop")
+ chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
+ chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
+ VPALIGNR(Imm(4), BB0, BB0, BB0)
+ VPALIGNR(Imm(4), BB1, BB1, BB1)
+ VPALIGNR(Imm(8), CC0, CC0, CC0)
+ VPALIGNR(Imm(8), CC1, CC1, CC1)
+ VPALIGNR(Imm(12), DD0, DD0, DD0)
+ VPALIGNR(Imm(12), DD1, DD1, DD1)
+ chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
+ chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
+ VPALIGNR(Imm(12), BB0, BB0, BB0)
+ VPALIGNR(Imm(12), BB1, BB1, BB1)
+ VPALIGNR(Imm(8), CC0, CC0, CC0)
+ VPALIGNR(Imm(8), CC1, CC1, CC1)
+ VPALIGNR(Imm(4), DD0, DD0, DD0)
+ VPALIGNR(Imm(4), DD1, DD1, DD1)
+ DECQ(itr2)
+ JNE(LabelRef("sealAVX2192InnerCipherLoop"))
+ VPADDD(AA2, AA0, AA0)
+ VPADDD(AA2, AA1, AA1)
+ VPADDD(BB2, BB0, BB0)
+ VPADDD(BB2, BB1, BB1)
+ VPADDD(CC2, CC0, CC0)
+ VPADDD(CC2, CC1, CC1)
+ VPADDD(DD2, DD0, DD0)
+ VPADDD(TT3, DD1, DD1)
+ VPERM2I128(Imm(0x02), AA0, BB0, TT0)
+
+ Comment("Clamp and store poly key")
+ polyClampMask := polyClampMask_DATA()
+ VPAND(polyClampMask, TT0, TT0)
+ VMOVDQA(TT0, rsStoreAVX2)
+
+ Comment("Stream for up to 192 bytes")
+ VPERM2I128(Imm(0x13), AA0, BB0, AA0)
+ VPERM2I128(Imm(0x13), CC0, DD0, BB0)
+ VPERM2I128(Imm(0x02), AA1, BB1, CC0)
+ VPERM2I128(Imm(0x02), CC1, DD1, DD0)
+ VPERM2I128(Imm(0x13), AA1, BB1, AA1)
+ VPERM2I128(Imm(0x13), CC1, DD1, BB1)
+}
+
+func sealAVX2ShortSeal() {
+ Label("sealAVX2ShortSeal")
+ Comment("Hash aad")
+ MOVQ(NewParamAddr("ad_len", 80), itr2)
+ CALL(LabelRef("polyHashADInternal<>(SB)"))
+ XORQ(itr1, itr1)
+}
+
+func sealAVX2SealHash() {
+ Label("sealAVX2SealHash")
+ Comment("itr1 holds the number of bytes encrypted but not yet hashed")
+ CMPQ(itr1, Imm(16))
+ JB(LabelRef("sealAVX2ShortSealLoop"))
+ polyAdd(Mem{Base: oup}.Offset(0))
+ polyMul()
+ SUBQ(Imm(16), itr1)
+ ADDQ(Imm(16), oup)
+ JMP(LabelRef("sealAVX2SealHash"))
+}
+
+func sealAVX2ShortSealLoop() {
+ Label("sealAVX2ShortSealLoop")
+ CMPQ(inl, Imm(32))
+ JB(LabelRef("sealAVX2ShortTail32"))
+ SUBQ(Imm(32), inl)
+
+ Comment("Load for encryption")
+ VPXOR(Mem{Base: inp}, AA0, AA0)
+ VMOVDQU(AA0, Mem{Base: oup})
+ LEAQ(Mem{Base: inp}.Offset(1*32), inp)
+
+ Comment("Now can hash")
+ polyAdd(Mem{Base: oup}.Offset(0 * 8))
+ polyMulAVX2()
+ polyAdd(Mem{Base: oup}.Offset(2 * 8))
+ polyMulAVX2()
+ LEAQ(Mem{Base: oup}.Offset(1*32), oup)
+
+ Comment("Shift stream left")
+ VMOVDQA(BB0, AA0)
+ VMOVDQA(CC0, BB0)
+ VMOVDQA(DD0, CC0)
+ VMOVDQA(AA1, DD0)
+ VMOVDQA(BB1, AA1)
+ VMOVDQA(CC1, BB1)
+ VMOVDQA(DD1, CC1)
+ VMOVDQA(AA2, DD1)
+ VMOVDQA(BB2, AA2)
+ JMP(LabelRef("sealAVX2ShortSealLoop"))
+}
+
+func sealAVX2ShortTail32() {
+ Label("sealAVX2ShortTail32")
+ CMPQ(inl, Imm(16))
+ VMOVDQA(A0, A1)
+ JB(LabelRef("sealAVX2ShortDone"))
+
+ SUBQ(Imm(16), inl)
+
+ Comment("Load for encryption")
+ VPXOR(Mem{Base: inp}, A0, T0)
+ VMOVDQU(T0, Mem{Base: oup})
+ LEAQ(Mem{Base: inp}.Offset(1*16), inp)
+
+ Comment("Hash")
+ polyAdd(Mem{Base: oup}.Offset(0 * 8))
+ polyMulAVX2()
+ LEAQ(Mem{Base: oup}.Offset(1*16), oup)
+ VPERM2I128(Imm(0x11), AA0, AA0, AA0)
+ VMOVDQA(A0, A1)
+}
+
+func sealAVX2ShortDone() {
+ Label("sealAVX2ShortDone")
+ VZEROUPPER()
+ JMP(LabelRef("sealSSETail"))
+}
+
+// ----------------------------------------------------------------------------
+// Special optimization for buffers smaller than 321 bytes
+
+// For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks
+func seal320AVX2() {
+ Label("seal320AVX2")
+ VMOVDQA(AA0, AA1)
+ VMOVDQA(BB0, BB1)
+ VMOVDQA(CC0, CC1)
+ avx2IncMask := avx2IncMask_DATA()
+ VPADDD(avx2IncMask, DD0, DD1)
+ VMOVDQA(AA0, AA2)
+ VMOVDQA(BB0, BB2)
+ VMOVDQA(CC0, CC2)
+ VPADDD(avx2IncMask, DD1, DD2)
+ VMOVDQA(BB0, TT1)
+ VMOVDQA(CC0, TT2)
+ VMOVDQA(DD0, TT3)
+ MOVQ(U32(10), itr2)
+}
+
+func sealAVX2320InnerCipherLoop() {
+ Label("sealAVX2320InnerCipherLoop")
+ chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
+ chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
+ chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
+ VPALIGNR(Imm(4), BB0, BB0, BB0)
+ VPALIGNR(Imm(4), BB1, BB1, BB1)
+ VPALIGNR(Imm(4), BB2, BB2, BB2)
+ VPALIGNR(Imm(8), CC0, CC0, CC0)
+ VPALIGNR(Imm(8), CC1, CC1, CC1)
+ VPALIGNR(Imm(8), CC2, CC2, CC2)
+ VPALIGNR(Imm(12), DD0, DD0, DD0)
+ VPALIGNR(Imm(12), DD1, DD1, DD1)
+ VPALIGNR(Imm(12), DD2, DD2, DD2)
+ chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
+ chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
+ chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
+ VPALIGNR(Imm(12), BB0, BB0, BB0)
+ VPALIGNR(Imm(12), BB1, BB1, BB1)
+ VPALIGNR(Imm(12), BB2, BB2, BB2)
+ VPALIGNR(Imm(8), CC0, CC0, CC0)
+ VPALIGNR(Imm(8), CC1, CC1, CC1)
+ VPALIGNR(Imm(8), CC2, CC2, CC2)
+ VPALIGNR(Imm(4), DD0, DD0, DD0)
+ VPALIGNR(Imm(4), DD1, DD1, DD1)
+ VPALIGNR(Imm(4), DD2, DD2, DD2)
+ DECQ(itr2)
+ JNE(LabelRef("sealAVX2320InnerCipherLoop"))
+
+ chacha20Constants := chacha20Constants_DATA()
+ VMOVDQA(chacha20Constants, TT0)
+ VPADDD(TT0, AA0, AA0)
+ VPADDD(TT0, AA1, AA1)
+ VPADDD(TT0, AA2, AA2)
+ VPADDD(TT1, BB0, BB0)
+ VPADDD(TT1, BB1, BB1)
+ VPADDD(TT1, BB2, BB2)
+ VPADDD(TT2, CC0, CC0)
+ VPADDD(TT2, CC1, CC1)
+ VPADDD(TT2, CC2, CC2)
+ avx2IncMask := avx2IncMask_DATA()
+ VMOVDQA(avx2IncMask, TT0)
+ VPADDD(TT3, DD0, DD0)
+ VPADDD(TT0, TT3, TT3)
+ VPADDD(TT3, DD1, DD1)
+ VPADDD(TT0, TT3, TT3)
+ VPADDD(TT3, DD2, DD2)
+
+ Comment("Clamp and store poly key")
+ VPERM2I128(Imm(0x02), AA0, BB0, TT0)
+ polyClampMask := polyClampMask_DATA()
+ VPAND(polyClampMask, TT0, TT0)
+ VMOVDQA(TT0, rsStoreAVX2)
+
+ Comment("Stream for up to 320 bytes")
+ VPERM2I128(Imm(0x13), AA0, BB0, AA0)
+ VPERM2I128(Imm(0x13), CC0, DD0, BB0)
+ VPERM2I128(Imm(0x02), AA1, BB1, CC0)
+ VPERM2I128(Imm(0x02), CC1, DD1, DD0)
+ VPERM2I128(Imm(0x13), AA1, BB1, AA1)
+ VPERM2I128(Imm(0x13), CC1, DD1, BB1)
+ VPERM2I128(Imm(0x02), AA2, BB2, CC1)
+ VPERM2I128(Imm(0x02), CC2, DD2, DD1)
+ VPERM2I128(Imm(0x13), AA2, BB2, AA2)
+ VPERM2I128(Imm(0x13), CC2, DD2, BB2)
+ JMP(LabelRef("sealAVX2ShortSeal"))
+}
+
+// Need to encrypt up to 128 bytes - prepare two blocks:
+// - If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed.
+// - If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed.
+func sealAVX2Tail128() {
+ Label("sealAVX2Tail128")
+ chacha20Constants := chacha20Constants_DATA()
+ VMOVDQA(chacha20Constants, AA0)
+ VMOVDQA(state1StoreAVX2, BB0)
+ VMOVDQA(state2StoreAVX2, CC0)
+ VMOVDQA(ctr3StoreAVX2, DD0)
+ avx2IncMask := avx2IncMask_DATA()
+ VPADDD(avx2IncMask, DD0, DD0)
+ VMOVDQA(DD0, DD1)
+}
+
+func sealAVX2Tail128LoopA() {
+ Label("sealAVX2Tail128LoopA")
+ polyAdd(Mem{Base: oup}.Offset(0))
+ polyMul()
+ LEAQ(Mem{Base: oup}.Offset(16), oup)
+}
+
+func sealAVX2Tail128LoopB() {
+ Label("sealAVX2Tail128LoopB")
+ chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
+ polyAdd(Mem{Base: oup}.Offset(0))
+ polyMul()
+ VPALIGNR(Imm(4), BB0, BB0, BB0)
+ VPALIGNR(Imm(8), CC0, CC0, CC0)
+ VPALIGNR(Imm(12), DD0, DD0, DD0)
+ chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
+ polyAdd(Mem{Base: oup}.Offset(16))
+ polyMul()
+ LEAQ(Mem{Base: oup}.Offset(32), oup)
+ VPALIGNR(Imm(12), BB0, BB0, BB0)
+ VPALIGNR(Imm(8), CC0, CC0, CC0)
+ VPALIGNR(Imm(4), DD0, DD0, DD0)
+ DECQ(itr1)
+ JG(LabelRef("sealAVX2Tail128LoopA"))
+ DECQ(itr2)
+ JGE(LabelRef("sealAVX2Tail128LoopB"))
+
+ chacha20Constants := chacha20Constants_DATA()
+ VPADDD(chacha20Constants, AA0, AA1)
+ VPADDD(state1StoreAVX2, BB0, BB1)
+ VPADDD(state2StoreAVX2, CC0, CC1)
+ VPADDD(DD1, DD0, DD1)
+
+ VPERM2I128(Imm(0x02), AA1, BB1, AA0)
+ VPERM2I128(Imm(0x02), CC1, DD1, BB0)
+ VPERM2I128(Imm(0x13), AA1, BB1, CC0)
+ VPERM2I128(Imm(0x13), CC1, DD1, DD0)
+ JMP(LabelRef("sealAVX2ShortSealLoop"))
+}
+
+// ----------------------------------------------------------------------------
+// Special optimization for the last 256 bytes of ciphertext
+
+// Need to encrypt up to 256 bytes - prepare four blocks
+// - If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
+// - If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
+func sealAVX2Tail256() {
+ Label("sealAVX2Tail256")
+ chacha20Constants := chacha20Constants_DATA()
+ VMOVDQA(chacha20Constants, AA0)
+ VMOVDQA(chacha20Constants, AA1)
+ VMOVDQA(state1StoreAVX2, BB0)
+ VMOVDQA(state1StoreAVX2, BB1)
+ VMOVDQA(state2StoreAVX2, CC0)
+ VMOVDQA(state2StoreAVX2, CC1)
+ VMOVDQA(ctr3StoreAVX2, DD0)
+ avx2IncMask := avx2IncMask_DATA()
+ VPADDD(avx2IncMask, DD0, DD0)
+ VPADDD(avx2IncMask, DD0, DD1)
+ VMOVDQA(DD0, TT1)
+ VMOVDQA(DD1, TT2)
+}
+
+func sealAVX2Tail256LoopA() {
+ Label("sealAVX2Tail256LoopA")
+ polyAdd(Mem{Base: oup}.Offset(0))
+ polyMul()
+ LEAQ(Mem{Base: oup}.Offset(16), oup)
+}
+
+// Line 2493
+func sealAVX2Tail256LoopB() {
+ Label("sealAVX2Tail256LoopB")
+ chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
+ chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
+ polyAdd(Mem{Base: oup}.Offset(0))
+ polyMul()
+ VPALIGNR(Imm(4), BB0, BB0, BB0)
+ VPALIGNR(Imm(4), BB1, BB1, BB1)
+ VPALIGNR(Imm(8), CC0, CC0, CC0)
+ VPALIGNR(Imm(8), CC1, CC1, CC1)
+ VPALIGNR(Imm(12), DD0, DD0, DD0)
+ VPALIGNR(Imm(12), DD1, DD1, DD1)
+ chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
+ chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
+ polyAdd(Mem{Base: oup}.Offset(16))
+ polyMul()
+ LEAQ(Mem{Base: oup}.Offset(32), oup)
+ VPALIGNR(Imm(12), BB0, BB0, BB0)
+ VPALIGNR(Imm(12), BB1, BB1, BB1)
+ VPALIGNR(Imm(8), CC0, CC0, CC0)
+ VPALIGNR(Imm(8), CC1, CC1, CC1)
+ VPALIGNR(Imm(4), DD0, DD0, DD0)
+ VPALIGNR(Imm(4), DD1, DD1, DD1)
+ DECQ(itr1)
+ JG(LabelRef("sealAVX2Tail256LoopA"))
+ DECQ(itr2)
+ JGE(LabelRef("sealAVX2Tail256LoopB"))
+
+ chacha20Constants := chacha20Constants_DATA()
+ VPADDD(chacha20Constants, AA0, AA0)
+ VPADDD(chacha20Constants, AA1, AA1)
+ VPADDD(state1StoreAVX2, BB0, BB0)
+ VPADDD(state1StoreAVX2, BB1, BB1)
+ VPADDD(state2StoreAVX2, CC0, CC0)
+ VPADDD(state2StoreAVX2, CC1, CC1)
+ VPADDD(TT1, DD0, DD0)
+ VPADDD(TT2, DD1, DD1)
+ VPERM2I128(Imm(0x02), AA0, BB0, TT0)
+ VPERM2I128(Imm(0x02), CC0, DD0, TT1)
+ VPERM2I128(Imm(0x13), AA0, BB0, TT2)
+ VPERM2I128(Imm(0x13), CC0, DD0, TT3)
+ VPXOR(Mem{Base: inp}.Offset(0*32), TT0, TT0)
+ VPXOR(Mem{Base: inp}.Offset(1*32), TT1, TT1)
+ VPXOR(Mem{Base: inp}.Offset(2*32), TT2, TT2)
+ VPXOR(Mem{Base: inp}.Offset(3*32), TT3, TT3)
+ VMOVDQU(TT0, Mem{Base: oup}.Offset(0*32))
+ VMOVDQU(TT1, Mem{Base: oup}.Offset(1*32))
+ VMOVDQU(TT2, Mem{Base: oup}.Offset(2*32))
+ VMOVDQU(TT3, Mem{Base: oup}.Offset(3*32))
+ MOVQ(U32(128), itr1)
+ LEAQ(Mem{Base: inp}.Offset(128), inp)
+ SUBQ(Imm(128), inl)
+ VPERM2I128(Imm(0x02), AA1, BB1, AA0)
+ VPERM2I128(Imm(0x02), CC1, DD1, BB0)
+ VPERM2I128(Imm(0x13), AA1, BB1, CC0)
+ VPERM2I128(Imm(0x13), CC1, DD1, DD0)
+
+ JMP(LabelRef("sealAVX2SealHash"))
+}
+
+// ----------------------------------------------------------------------------
+// Special optimization for the last 384 bytes of ciphertext
+
+// Need to encrypt up to 384 bytes - prepare six blocks
+// - If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
+// - If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
+func sealAVX2Tail384() {
+ Label("sealAVX2Tail384")
+ chacha20Constants := chacha20Constants_DATA()
+ VMOVDQA(chacha20Constants, AA0)
+ VMOVDQA(AA0, AA1)
+ VMOVDQA(AA0, AA2)
+ VMOVDQA(state1StoreAVX2, BB0)
+ VMOVDQA(BB0, BB1)
+ VMOVDQA(BB0, BB2)
+ VMOVDQA(state2StoreAVX2, CC0)
+ VMOVDQA(CC0, CC1)
+ VMOVDQA(CC0, CC2)
+ VMOVDQA(ctr3StoreAVX2, DD0)
+ avx2IncMask := avx2IncMask_DATA()
+ VPADDD(avx2IncMask, DD0, DD0)
+ VPADDD(avx2IncMask, DD0, DD1)
+ VPADDD(avx2IncMask, DD1, DD2)
+ VMOVDQA(DD0, TT1)
+ VMOVDQA(DD1, TT2)
+ VMOVDQA(DD2, TT3)
+}
+
+func sealAVX2Tail384LoopA() {
+ Label("sealAVX2Tail384LoopA")
+ polyAdd(Mem{Base: oup}.Offset(0))
+ polyMul()
+ LEAQ(Mem{Base: oup}.Offset(16), oup)
+}
+
+func sealAVX2Tail384LoopB() {
+ Label("sealAVX2Tail384LoopB")
+ chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
+ chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
+ chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
+ polyAdd(Mem{Base: oup}.Offset(0))
+ polyMul()
+ VPALIGNR(Imm(4), BB0, BB0, BB0)
+ VPALIGNR(Imm(4), BB1, BB1, BB1)
+ VPALIGNR(Imm(4), BB2, BB2, BB2)
+ VPALIGNR(Imm(8), CC0, CC0, CC0)
+ VPALIGNR(Imm(8), CC1, CC1, CC1)
+ VPALIGNR(Imm(8), CC2, CC2, CC2)
+ VPALIGNR(Imm(12), DD0, DD0, DD0)
+ VPALIGNR(Imm(12), DD1, DD1, DD1)
+ VPALIGNR(Imm(12), DD2, DD2, DD2)
+ chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
+ chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
+ chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
+ polyAdd(Mem{Base: oup}.Offset(16))
+ polyMul()
+ LEAQ(Mem{Base: oup}.Offset(32), oup)
+ VPALIGNR(Imm(12), BB0, BB0, BB0)
+ VPALIGNR(Imm(12), BB1, BB1, BB1)
+ VPALIGNR(Imm(12), BB2, BB2, BB2)
+ VPALIGNR(Imm(8), CC0, CC0, CC0)
+ VPALIGNR(Imm(8), CC1, CC1, CC1)
+ VPALIGNR(Imm(8), CC2, CC2, CC2)
+ VPALIGNR(Imm(4), DD0, DD0, DD0)
+ VPALIGNR(Imm(4), DD1, DD1, DD1)
+ VPALIGNR(Imm(4), DD2, DD2, DD2)
+ DECQ(itr1)
+ JG(LabelRef("sealAVX2Tail384LoopA"))
+ DECQ(itr2)
+ JGE(LabelRef("sealAVX2Tail384LoopB"))
+
+ chacha20Constants := chacha20Constants_DATA()
+ VPADDD(chacha20Constants, AA0, AA0)
+ VPADDD(chacha20Constants, AA1, AA1)
+ VPADDD(chacha20Constants, AA2, AA2)
+ VPADDD(state1StoreAVX2, BB0, BB0)
+ VPADDD(state1StoreAVX2, BB1, BB1)
+ VPADDD(state1StoreAVX2, BB2, BB2)
+ VPADDD(state2StoreAVX2, CC0, CC0)
+ VPADDD(state2StoreAVX2, CC1, CC1)
+ VPADDD(state2StoreAVX2, CC2, CC2)
+ VPADDD(TT1, DD0, DD0)
+ VPADDD(TT2, DD1, DD1)
+ VPADDD(TT3, DD2, DD2)
+ VPERM2I128(Imm(0x02), AA0, BB0, TT0)
+ VPERM2I128(Imm(0x02), CC0, DD0, TT1)
+ VPERM2I128(Imm(0x13), AA0, BB0, TT2)
+ VPERM2I128(Imm(0x13), CC0, DD0, TT3)
+ VPXOR(Mem{Base: inp}.Offset(0*32), TT0, TT0)
+ VPXOR(Mem{Base: inp}.Offset(1*32), TT1, TT1)
+ VPXOR(Mem{Base: inp}.Offset(2*32), TT2, TT2)
+ VPXOR(Mem{Base: inp}.Offset(3*32), TT3, TT3)
+ VMOVDQU(TT0, Mem{Base: oup}.Offset(0*32))
+ VMOVDQU(TT1, Mem{Base: oup}.Offset(1*32))
+ VMOVDQU(TT2, Mem{Base: oup}.Offset(2*32))
+ VMOVDQU(TT3, Mem{Base: oup}.Offset(3*32))
+ VPERM2I128(Imm(0x02), AA1, BB1, TT0)
+ VPERM2I128(Imm(0x02), CC1, DD1, TT1)
+ VPERM2I128(Imm(0x13), AA1, BB1, TT2)
+ VPERM2I128(Imm(0x13), CC1, DD1, TT3)
+ VPXOR(Mem{Base: inp}.Offset(4*32), TT0, TT0)
+ VPXOR(Mem{Base: inp}.Offset(5*32), TT1, TT1)
+ VPXOR(Mem{Base: inp}.Offset(6*32), TT2, TT2)
+ VPXOR(Mem{Base: inp}.Offset(7*32), TT3, TT3)
+ VMOVDQU(TT0, Mem{Base: oup}.Offset(4*32))
+ VMOVDQU(TT1, Mem{Base: oup}.Offset(5*32))
+ VMOVDQU(TT2, Mem{Base: oup}.Offset(6*32))
+ VMOVDQU(TT3, Mem{Base: oup}.Offset(7*32))
+ MOVQ(U32(256), itr1)
+ LEAQ(Mem{Base: inp}.Offset(256), inp)
+ SUBQ(U32(256), inl)
+ VPERM2I128(Imm(0x02), AA2, BB2, AA0)
+ VPERM2I128(Imm(0x02), CC2, DD2, BB0)
+ VPERM2I128(Imm(0x13), AA2, BB2, CC0)
+ VPERM2I128(Imm(0x13), CC2, DD2, DD0)
+
+ JMP(LabelRef("sealAVX2SealHash"))
+}
+
+// ----------------------------------------------------------------------------
+// Special optimization for the last 512 bytes of ciphertext
+
+// Need to encrypt up to 512 bytes - prepare eight blocks
+// - If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
+// - If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
+func sealAVX2Tail512() {
+ Label("sealAVX2Tail512")
+ chacha20Constants := chacha20Constants_DATA()
+ VMOVDQA(chacha20Constants, AA0)
+ VMOVDQA(AA0, AA1)
+ VMOVDQA(AA0, AA2)
+ VMOVDQA(AA0, AA3)
+ VMOVDQA(state1StoreAVX2, BB0)
+ VMOVDQA(BB0, BB1)
+ VMOVDQA(BB0, BB2)
+ VMOVDQA(BB0, BB3)
+ VMOVDQA(state2StoreAVX2, CC0)
+ VMOVDQA(CC0, CC1)
+ VMOVDQA(CC0, CC2)
+ VMOVDQA(CC0, CC3)
+ VMOVDQA(ctr3StoreAVX2, DD0)
+ avx2IncMask := avx2IncMask_DATA()
+ VPADDD(avx2IncMask, DD0, DD0)
+ VPADDD(avx2IncMask, DD0, DD1)
+ VPADDD(avx2IncMask, DD1, DD2)
+ VPADDD(avx2IncMask, DD2, DD3)
+ VMOVDQA(DD0, ctr0StoreAVX2)
+ VMOVDQA(DD1, ctr1StoreAVX2)
+ VMOVDQA(DD2, ctr2StoreAVX2)
+ VMOVDQA(DD3, ctr3StoreAVX2)
+}
+
+func sealAVX2Tail512LoopA() {
+ Label("sealAVX2Tail512LoopA")
+ polyAdd(Mem{Base: oup}.Offset(0))
+ polyMul()
+ LEAQ(Mem{Base: oup}.Offset(16), oup)
+}
+
+func sealAVX2Tail512LoopB() {
+ Label("sealAVX2Tail512LoopB")
+ VPADDD(BB0, AA0, AA0)
+ VPADDD(BB1, AA1, AA1)
+ VPADDD(BB2, AA2, AA2)
+ VPADDD(BB3, AA3, AA3)
+ VPXOR(AA0, DD0, DD0)
+ VPXOR(AA1, DD1, DD1)
+ VPXOR(AA2, DD2, DD2)
+ VPXOR(AA3, DD3, DD3)
+ rol16 := rol16_DATA()
+ VPSHUFB(rol16, DD0, DD0)
+ VPSHUFB(rol16, DD1, DD1)
+ VPSHUFB(rol16, DD2, DD2)
+ VPSHUFB(rol16, DD3, DD3)
+ VPADDD(DD0, CC0, CC0)
+ VPADDD(DD1, CC1, CC1)
+ VPADDD(DD2, CC2, CC2)
+ VPADDD(DD3, CC3, CC3)
+ VPXOR(CC0, BB0, BB0)
+ VPXOR(CC1, BB1, BB1)
+ VPXOR(CC2, BB2, BB2)
+ VPXOR(CC3, BB3, BB3)
+ VMOVDQA(CC3, tmpStoreAVX2)
+ VPSLLD(Imm(12), BB0, CC3)
+ VPSRLD(Imm(20), BB0, BB0)
+ VPXOR(CC3, BB0, BB0)
+ VPSLLD(Imm(12), BB1, CC3)
+ VPSRLD(Imm(20), BB1, BB1)
+ VPXOR(CC3, BB1, BB1)
+ VPSLLD(Imm(12), BB2, CC3)
+ VPSRLD(Imm(20), BB2, BB2)
+ VPXOR(CC3, BB2, BB2)
+ VPSLLD(Imm(12), BB3, CC3)
+ VPSRLD(Imm(20), BB3, BB3)
+ VPXOR(CC3, BB3, BB3)
+ VMOVDQA(tmpStoreAVX2, CC3)
+ polyAdd(Mem{Base: oup}.Offset(0 * 8))
+ polyMulAVX2()
+ VPADDD(BB0, AA0, AA0)
+ VPADDD(BB1, AA1, AA1)
+ VPADDD(BB2, AA2, AA2)
+ VPADDD(BB3, AA3, AA3)
+ VPXOR(AA0, DD0, DD0)
+ VPXOR(AA1, DD1, DD1)
+ VPXOR(AA2, DD2, DD2)
+ VPXOR(AA3, DD3, DD3)
+ rol8 := rol8_DATA()
+ VPSHUFB(rol8, DD0, DD0)
+ VPSHUFB(rol8, DD1, DD1)
+ VPSHUFB(rol8, DD2, DD2)
+ VPSHUFB(rol8, DD3, DD3)
+ VPADDD(DD0, CC0, CC0)
+ VPADDD(DD1, CC1, CC1)
+ VPADDD(DD2, CC2, CC2)
+ VPADDD(DD3, CC3, CC3)
+ VPXOR(CC0, BB0, BB0)
+ VPXOR(CC1, BB1, BB1)
+ VPXOR(CC2, BB2, BB2)
+ VPXOR(CC3, BB3, BB3)
+ VMOVDQA(CC3, tmpStoreAVX2)
+ VPSLLD(Imm(7), BB0, CC3)
+ VPSRLD(Imm(25), BB0, BB0)
+ VPXOR(CC3, BB0, BB0)
+ VPSLLD(Imm(7), BB1, CC3)
+ VPSRLD(Imm(25), BB1, BB1)
+ VPXOR(CC3, BB1, BB1)
+ VPSLLD(Imm(7), BB2, CC3)
+ VPSRLD(Imm(25), BB2, BB2)
+ VPXOR(CC3, BB2, BB2)
+ VPSLLD(Imm(7), BB3, CC3)
+ VPSRLD(Imm(25), BB3, BB3)
+ VPXOR(CC3, BB3, BB3)
+ VMOVDQA(tmpStoreAVX2, CC3)
+ VPALIGNR(Imm(4), BB0, BB0, BB0)
+ VPALIGNR(Imm(4), BB1, BB1, BB1)
+ VPALIGNR(Imm(4), BB2, BB2, BB2)
+ VPALIGNR(Imm(4), BB3, BB3, BB3)
+ VPALIGNR(Imm(8), CC0, CC0, CC0)
+ VPALIGNR(Imm(8), CC1, CC1, CC1)
+ VPALIGNR(Imm(8), CC2, CC2, CC2)
+ VPALIGNR(Imm(8), CC3, CC3, CC3)
+ VPALIGNR(Imm(12), DD0, DD0, DD0)
+ VPALIGNR(Imm(12), DD1, DD1, DD1)
+ VPALIGNR(Imm(12), DD2, DD2, DD2)
+ VPALIGNR(Imm(12), DD3, DD3, DD3)
+ VPADDD(BB0, AA0, AA0)
+ VPADDD(BB1, AA1, AA1)
+ VPADDD(BB2, AA2, AA2)
+ VPADDD(BB3, AA3, AA3)
+ VPXOR(AA0, DD0, DD0)
+ VPXOR(AA1, DD1, DD1)
+ VPXOR(AA2, DD2, DD2)
+ VPXOR(AA3, DD3, DD3)
+ VPSHUFB(rol16, DD0, DD0)
+ VPSHUFB(rol16, DD1, DD1)
+ VPSHUFB(rol16, DD2, DD2)
+ VPSHUFB(rol16, DD3, DD3)
+ VPADDD(DD0, CC0, CC0)
+ VPADDD(DD1, CC1, CC1)
+ VPADDD(DD2, CC2, CC2)
+ VPADDD(DD3, CC3, CC3)
+ VPXOR(CC0, BB0, BB0)
+ VPXOR(CC1, BB1, BB1)
+ VPXOR(CC2, BB2, BB2)
+ VPXOR(CC3, BB3, BB3)
+ polyAdd(Mem{Base: oup}.Offset(2 * 8))
+ polyMulAVX2()
+ LEAQ(Mem{Base: oup}.Offset(4*8), oup)
+ VMOVDQA(CC3, tmpStoreAVX2)
+ VPSLLD(Imm(12), BB0, CC3)
+ VPSRLD(Imm(20), BB0, BB0)
+ VPXOR(CC3, BB0, BB0)
+ VPSLLD(Imm(12), BB1, CC3)
+ VPSRLD(Imm(20), BB1, BB1)
+ VPXOR(CC3, BB1, BB1)
+ VPSLLD(Imm(12), BB2, CC3)
+ VPSRLD(Imm(20), BB2, BB2)
+ VPXOR(CC3, BB2, BB2)
+ VPSLLD(Imm(12), BB3, CC3)
+ VPSRLD(Imm(20), BB3, BB3)
+ VPXOR(CC3, BB3, BB3)
+ VMOVDQA(tmpStoreAVX2, CC3)
+ VPADDD(BB0, AA0, AA0)
+ VPADDD(BB1, AA1, AA1)
+ VPADDD(BB2, AA2, AA2)
+ VPADDD(BB3, AA3, AA3)
+ VPXOR(AA0, DD0, DD0)
+ VPXOR(AA1, DD1, DD1)
+ VPXOR(AA2, DD2, DD2)
+ VPXOR(AA3, DD3, DD3)
+ VPSHUFB(rol8, DD0, DD0)
+ VPSHUFB(rol8, DD1, DD1)
+ VPSHUFB(rol8, DD2, DD2)
+ VPSHUFB(rol8, DD3, DD3)
+ VPADDD(DD0, CC0, CC0)
+ VPADDD(DD1, CC1, CC1)
+ VPADDD(DD2, CC2, CC2)
+ VPADDD(DD3, CC3, CC3)
+ VPXOR(CC0, BB0, BB0)
+ VPXOR(CC1, BB1, BB1)
+ VPXOR(CC2, BB2, BB2)
+ VPXOR(CC3, BB3, BB3)
+ VMOVDQA(CC3, tmpStoreAVX2)
+ VPSLLD(Imm(7), BB0, CC3)
+ VPSRLD(Imm(25), BB0, BB0)
+ VPXOR(CC3, BB0, BB0)
+ VPSLLD(Imm(7), BB1, CC3)
+ VPSRLD(Imm(25), BB1, BB1)
+ VPXOR(CC3, BB1, BB1)
+ VPSLLD(Imm(7), BB2, CC3)
+ VPSRLD(Imm(25), BB2, BB2)
+ VPXOR(CC3, BB2, BB2)
+ VPSLLD(Imm(7), BB3, CC3)
+ VPSRLD(Imm(25), BB3, BB3)
+ VPXOR(CC3, BB3, BB3)
+ VMOVDQA(tmpStoreAVX2, CC3)
+ VPALIGNR(Imm(12), BB0, BB0, BB0)
+ VPALIGNR(Imm(12), BB1, BB1, BB1)
+ VPALIGNR(Imm(12), BB2, BB2, BB2)
+ VPALIGNR(Imm(12), BB3, BB3, BB3)
+ VPALIGNR(Imm(8), CC0, CC0, CC0)
+ VPALIGNR(Imm(8), CC1, CC1, CC1)
+ VPALIGNR(Imm(8), CC2, CC2, CC2)
+ VPALIGNR(Imm(8), CC3, CC3, CC3)
+ VPALIGNR(Imm(4), DD0, DD0, DD0)
+ VPALIGNR(Imm(4), DD1, DD1, DD1)
+ VPALIGNR(Imm(4), DD2, DD2, DD2)
+ VPALIGNR(Imm(4), DD3, DD3, DD3)
+
+ DECQ(itr1)
+ JG(LabelRef("sealAVX2Tail512LoopA"))
+ DECQ(itr2)
+ JGE(LabelRef("sealAVX2Tail512LoopB"))
+
+ chacha20Constants := chacha20Constants_DATA()
+ VPADDD(chacha20Constants, AA0, AA0)
+ VPADDD(chacha20Constants, AA1, AA1)
+ VPADDD(chacha20Constants, AA2, AA2)
+ VPADDD(chacha20Constants, AA3, AA3)
+ VPADDD(state1StoreAVX2, BB0, BB0)
+ VPADDD(state1StoreAVX2, BB1, BB1)
+ VPADDD(state1StoreAVX2, BB2, BB2)
+ VPADDD(state1StoreAVX2, BB3, BB3)
+ VPADDD(state2StoreAVX2, CC0, CC0)
+ VPADDD(state2StoreAVX2, CC1, CC1)
+ VPADDD(state2StoreAVX2, CC2, CC2)
+ VPADDD(state2StoreAVX2, CC3, CC3)
+ VPADDD(ctr0StoreAVX2, DD0, DD0)
+ VPADDD(ctr1StoreAVX2, DD1, DD1)
+ VPADDD(ctr2StoreAVX2, DD2, DD2)
+ VPADDD(ctr3StoreAVX2, DD3, DD3)
+ VMOVDQA(CC3, tmpStoreAVX2)
+ VPERM2I128(Imm(0x02), AA0, BB0, CC3)
+ VPXOR(Mem{Base: inp}.Offset(0*32), CC3, CC3)
+ VMOVDQU(CC3, Mem{Base: oup}.Offset(0*32))
+ VPERM2I128(Imm(0x02), CC0, DD0, CC3)
+ VPXOR(Mem{Base: inp}.Offset(1*32), CC3, CC3)
+ VMOVDQU(CC3, Mem{Base: oup}.Offset(1*32))
+ VPERM2I128(Imm(0x13), AA0, BB0, CC3)
+ VPXOR(Mem{Base: inp}.Offset(2*32), CC3, CC3)
+ VMOVDQU(CC3, Mem{Base: oup}.Offset(2*32))
+ VPERM2I128(Imm(0x13), CC0, DD0, CC3)
+ VPXOR(Mem{Base: inp}.Offset(3*32), CC3, CC3)
+ VMOVDQU(CC3, Mem{Base: oup}.Offset(3*32))
+
+ VPERM2I128(Imm(0x02), AA1, BB1, AA0)
+ VPERM2I128(Imm(0x02), CC1, DD1, BB0)
+ VPERM2I128(Imm(0x13), AA1, BB1, CC0)
+ VPERM2I128(Imm(0x13), CC1, DD1, DD0)
+ VPXOR(Mem{Base: inp}.Offset(4*32), AA0, AA0)
+ VPXOR(Mem{Base: inp}.Offset(5*32), BB0, BB0)
+ VPXOR(Mem{Base: inp}.Offset(6*32), CC0, CC0)
+ VPXOR(Mem{Base: inp}.Offset(7*32), DD0, DD0)
+ VMOVDQU(AA0, Mem{Base: oup}.Offset(4*32))
+ VMOVDQU(BB0, Mem{Base: oup}.Offset(5*32))
+ VMOVDQU(CC0, Mem{Base: oup}.Offset(6*32))
+ VMOVDQU(DD0, Mem{Base: oup}.Offset(7*32))
+
+ VPERM2I128(Imm(0x02), AA2, BB2, AA0)
+ VPERM2I128(Imm(0x02), CC2, DD2, BB0)
+ VPERM2I128(Imm(0x13), AA2, BB2, CC0)
+ VPERM2I128(Imm(0x13), CC2, DD2, DD0)
+ VPXOR(Mem{Base: inp}.Offset(8*32), AA0, AA0)
+ VPXOR(Mem{Base: inp}.Offset(9*32), BB0, BB0)
+ VPXOR(Mem{Base: inp}.Offset(10*32), CC0, CC0)
+ VPXOR(Mem{Base: inp}.Offset(11*32), DD0, DD0)
+ VMOVDQU(AA0, Mem{Base: oup}.Offset(8*32))
+ VMOVDQU(BB0, Mem{Base: oup}.Offset(9*32))
+ VMOVDQU(CC0, Mem{Base: oup}.Offset(10*32))
+ VMOVDQU(DD0, Mem{Base: oup}.Offset(11*32))
+
+ MOVQ(U32(384), itr1)
+ LEAQ(Mem{Base: inp}.Offset(384), inp)
+ SUBQ(U32(384), inl)
+ VPERM2I128(Imm(0x02), AA3, BB3, AA0)
+ VPERM2I128(Imm(0x02), tmpStoreAVX2, DD3, BB0)
+ VPERM2I128(Imm(0x13), AA3, BB3, CC0)
+ VPERM2I128(Imm(0x13), tmpStoreAVX2, DD3, DD0)
+
+ JMP(LabelRef("sealAVX2SealHash"))
+}
+
+// ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~DATA SECTION~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~##
+
+var (
+ // Pointers for memoizing DATA section symbols
+ chacha20Constants_DATA_ptr,
+ rol16_DATA_ptr,
+ rol8_DATA_ptr,
+ sseIncMask_DATA_ptr,
+ avx2IncMask_DATA_ptr,
+ avx2InitMask_DATA_ptr,
+ polyClampMask_DATA_ptr,
+ andMask_DATA_ptr *Mem
+)
+
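+// The ChaCha20 "expand 32-byte k" constant words, repeated twice to fill a 32-byte register.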
+var nothingUpMySleeve = [8]uint32{
+ 0x61707865,
+ 0x3320646e,
+ 0x79622d32,
+ 0x6b206574,
+ 0x61707865,
+ 0x3320646e,
+ 0x79622d32,
+ 0x6b206574,
+}
+
+// ChaCha20 constants
+func chacha20Constants_DATA() Mem {
+ if chacha20Constants_DATA_ptr != nil {
+ return *chacha20Constants_DATA_ptr
+ }
+
+ chacha20Constants := GLOBL(ThatPeskyUnicodeDot+"chacha20Constants", NOPTR|RODATA)
+ chacha20Constants_DATA_ptr = &chacha20Constants
+ for i, v := range nothingUpMySleeve {
+ DATA(i*4, U32(v))
+ }
+ return chacha20Constants
+}
+
+var rol16Consts = [4]uint64{
+ 0x0504070601000302,
+ 0x0D0C0F0E09080B0A,
+ 0x0504070601000302,
+ 0x0D0C0F0E09080B0A,
+}
+
+// <<< 16 with PSHUFB
+func rol16_DATA() Mem {
+ if rol16_DATA_ptr != nil {
+ return *rol16_DATA_ptr
+ }
+
+ rol16 := GLOBL(ThatPeskyUnicodeDot+"rol16", NOPTR|RODATA)
+ rol16_DATA_ptr = &rol16
+ for i, v := range rol16Consts {
+ DATA(i*8, U64(v))
+ }
+ return rol16
+}
+
+var rol8Consts = [4]uint64{
+ 0x0605040702010003,
+ 0x0E0D0C0F0A09080B,
+ 0x0605040702010003,
+ 0x0E0D0C0F0A09080B,
+}
+
+// <<< 8 with PSHUFB
+func rol8_DATA() Mem {
+ if rol8_DATA_ptr != nil {
+ return *rol8_DATA_ptr
+ }
+
+ rol8 := GLOBL(ThatPeskyUnicodeDot+"rol8", NOPTR|RODATA)
+ rol8_DATA_ptr = &rol8
+ for i, v := range rol8Consts {
+ DATA(i*8, U64(v))
+ }
+ return rol8
+}
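The rol16/rol8 comments above depend on PSHUFB doing an in-lane, per-byte shuffle; a small standalone sketch (illustrative only, not generator code) that checks these masks rotate every 32-bit little-endian lane left by 16 and 8 bits respectively:

	package main

	import (
		"encoding/binary"
		"fmt"
		"math/bits"
	)

	// pshufb mimics PSHUFB on one 16-byte lane: dst[i] = src[mask[i]&0x0f],
	// or 0 when the high bit of the mask byte is set.
	func pshufb(src, mask [16]byte) (dst [16]byte) {
		for i, m := range mask {
			if m&0x80 == 0 {
				dst[i] = src[m&0x0f]
			}
		}
		return
	}

	func maskFromWords(lo, hi uint64) (m [16]byte) {
		binary.LittleEndian.PutUint64(m[0:8], lo)
		binary.LittleEndian.PutUint64(m[8:16], hi)
		return
	}

	func main() {
		rol16Mask := maskFromWords(0x0504070601000302, 0x0D0C0F0E09080B0A)
		rol8Mask := maskFromWords(0x0605040702010003, 0x0E0D0C0F0A09080B)

		words := [4]uint32{0x01234567, 0x89abcdef, 0xdeadbeef, 0xcafef00d}
		var src [16]byte
		for i, w := range words {
			binary.LittleEndian.PutUint32(src[i*4:], w)
		}

		check := func(name string, mask [16]byte, rot int) {
			out := pshufb(src, mask)
			for i, w := range words {
				got := binary.LittleEndian.Uint32(out[i*4:])
				want := bits.RotateLeft32(w, rot)
				fmt.Printf("%s lane %d: got %08x want %08x ok=%v\n", name, i, got, want, got == want)
			}
		}
		check("rol16", rol16Mask, 16)
		check("rol8", rol8Mask, 8)
	}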
+
+var avx2InitMaskConsts = [4]uint64{
+ 0x0,
+ 0x0,
+ 0x1,
+ 0x0,
+}
+
+func avx2InitMask_DATA() Mem {
+ if avx2InitMask_DATA_ptr != nil {
+ return *avx2InitMask_DATA_ptr
+ }
+
+ avx2InitMask := GLOBL(ThatPeskyUnicodeDot+"avx2InitMask", NOPTR|RODATA)
+ avx2InitMask_DATA_ptr = &avx2InitMask
+ for i, v := range avx2InitMaskConsts {
+ DATA(i*8, U64(v))
+ }
+ return avx2InitMask
+}
+
+var avx2IncMaskConsts = [4]uint64{
+ 0x2,
+ 0x0,
+ 0x2,
+ 0x0,
+}
+
+func avx2IncMask_DATA() Mem {
+ if avx2IncMask_DATA_ptr != nil {
+ return *avx2IncMask_DATA_ptr
+ }
+
+ avx2IncMask := GLOBL(ThatPeskyUnicodeDot+"avx2IncMask", NOPTR|RODATA)
+ avx2IncMask_DATA_ptr = &avx2IncMask
+ for i, v := range avx2IncMaskConsts {
+ DATA(i*8, U64(v))
+ }
+ return avx2IncMask
+}
+
+var polyClampMaskConsts = [4]uint64{
+ 0x0FFFFFFC0FFFFFFF,
+ 0x0FFFFFFC0FFFFFFC,
+ 0xFFFFFFFFFFFFFFFF,
+ 0xFFFFFFFFFFFFFFFF,
+}
+
+// Poly1305 key clamp
+func polyClampMask_DATA() Mem {
+ if polyClampMask_DATA_ptr != nil {
+ return *polyClampMask_DATA_ptr
+ }
+
+ polyClampMask := GLOBL(ThatPeskyUnicodeDot+"polyClampMask", NOPTR|RODATA)
+ polyClampMask_DATA_ptr = &polyClampMask
+ for i, v := range polyClampMaskConsts {
+ DATA(i*8, U64(v))
+ }
+ return polyClampMask
+}
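The lower 16 bytes of this mask are the standard Poly1305 r-clamp (RFC 8439); the upper 16 bytes are all ones so the s half of the key passes through the 32-byte VPAND unchanged. A byte-wise sketch of the same clamp (clampR is an illustrative name only):

	// clampR applies the r-clamp encoded by polyClampMask: clear the top four
	// bits of bytes 3, 7, 11 and 15, and the low two bits of bytes 4, 8 and 12.
	func clampR(r [16]byte) [16]byte {
		mask := [16]byte{
			0xff, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f,
			0xfc, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f,
		}
		for i := range r {
			r[i] &= mask[i]
		}
		return r
	}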
+
+var sseIncMaskConsts = [2]uint64{
+ 0x1,
+ 0x0,
+}
+
+func sseIncMask_DATA() Mem {
+ if sseIncMask_DATA_ptr != nil {
+ return *sseIncMask_DATA_ptr
+ }
+
+ sseIncMask := GLOBL(ThatPeskyUnicodeDot+"sseIncMask", NOPTR|RODATA)
+ sseIncMask_DATA_ptr = &sseIncMask
+ for i, v := range sseIncMaskConsts {
+ DATA(i*8, U64(v))
+ }
+ return sseIncMask
+}
+
+var andMaskConsts = [30]uint64{
+ 0x00000000000000ff,
+ 0x0000000000000000,
+ 0x000000000000ffff,
+ 0x0000000000000000,
+ 0x0000000000ffffff,
+ 0x0000000000000000,
+ 0x00000000ffffffff,
+ 0x0000000000000000,
+ 0x000000ffffffffff,
+ 0x0000000000000000,
+ 0x0000ffffffffffff,
+ 0x0000000000000000,
+ 0x00ffffffffffffff,
+ 0x0000000000000000,
+ 0xffffffffffffffff,
+ 0x0000000000000000,
+ 0xffffffffffffffff,
+ 0x00000000000000ff,
+ 0xffffffffffffffff,
+ 0x000000000000ffff,
+ 0xffffffffffffffff,
+ 0x0000000000ffffff,
+ 0xffffffffffffffff,
+ 0x00000000ffffffff,
+ 0xffffffffffffffff,
+ 0x000000ffffffffff,
+ 0xffffffffffffffff,
+ 0x0000ffffffffffff,
+ 0xffffffffffffffff,
+ 0x00ffffffffffffff,
+}
+
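+// To load/store the last < 16 bytes in a buffer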
+func andMask_DATA() Mem {
+ if andMask_DATA_ptr != nil {
+ return *andMask_DATA_ptr
+ }
+
+ andMask := GLOBL(ThatPeskyUnicodeDot+"andMask", NOPTR|RODATA)
+ andMask_DATA_ptr = &andMask
+ for i, v := range andMaskConsts {
+ DATA(i*8, U64(v))
+ }
+ return andMask
+}
+
+// removePeskyUnicodeDot strips the dot from the relevant TEXT directives such that they
+// can exist as internal assembly functions
+//
+// Avo v0.6.0 does not support the generation of internal assembly functions. Go's unicode
+// dot tells the compiler to link a TEXT symbol to a function in the current Go package
+// (or another package if specified). Avo unconditionally prepends the unicode dot to all
+// TEXT symbols, making it impossible to emit an internal function without this hack.
+//
+// There is a pending PR to add internal functions to Avo:
+// https://github.com/mmcloughlin/avo/pull/443
+//
+// If merged, it should allow using InternalFunction("NAME") for the specified functions.
+func removePeskyUnicodeDot(internalFunctions []string, target string) {
+ bytes, err := os.ReadFile(target)
+ if err != nil {
+ panic(err)
+ }
+
+ content := string(bytes)
+
+ for _, from := range internalFunctions {
+ to := strings.ReplaceAll(from, ThatPeskyUnicodeDot, "")
+ content = strings.ReplaceAll(content, from, to)
+ }
+
+ err = os.WriteFile(target, []byte(content), 0644)
+ if err != nil {
+ panic(err)
+ }
+}
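For orientation, a sketch of how this helper is meant to be called; the actual call site is outside this hunk, so the wrapper name and exact arguments are assumptions (the target path mirrors the -out value recorded in the generated file header, and polyHashADInternal<> is the one internal TEXT symbol in this package):

	// Hypothetical call site, living in this same package so the Avo build
	// dot-imports above are already available. Generate() must have written
	// the assembly before the dot-stripping pass runs.
	func generateAndFixUp() {
		Generate()
		internal := []string{ThatPeskyUnicodeDot + "polyHashADInternal<>"}
		removePeskyUnicodeDot(internal, "../chacha20poly1305_amd64.s")
	}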
diff --git a/chacha20poly1305/_asm/go.mod b/chacha20poly1305/_asm/go.mod
new file mode 100644
index 0000000..957baf2
--- /dev/null
+++ b/chacha20poly1305/_asm/go.mod
@@ -0,0 +1,15 @@
+module chacha20poly1305/_asm
+
+go 1.23
+
+require (
+ github.com/mmcloughlin/avo v0.6.0
+ golang.org/x/crypto v0.26.0
+)
+
+require (
+ golang.org/x/mod v0.20.0 // indirect
+ golang.org/x/sync v0.8.0 // indirect
+ golang.org/x/sys v0.24.0 // indirect
+ golang.org/x/tools v0.24.0 // indirect
+)
diff --git a/chacha20poly1305/_asm/go.sum b/chacha20poly1305/_asm/go.sum
new file mode 100644
index 0000000..62ea9df
--- /dev/null
+++ b/chacha20poly1305/_asm/go.sum
@@ -0,0 +1,12 @@
+github.com/mmcloughlin/avo v0.6.0 h1:QH6FU8SKoTLaVs80GA8TJuLNkUYl4VokHKlPhVDg4YY=
+github.com/mmcloughlin/avo v0.6.0/go.mod h1:8CoAGaCSYXtCPR+8y18Y9aB/kxb8JSS6FRI7mSkvD+8=
+golang.org/x/crypto v0.26.0 h1:RrRspgV4mU+YwB4FYnuBoKsUapNIL5cohGAmSH3azsw=
+golang.org/x/crypto v0.26.0/go.mod h1:GY7jblb9wI+FOo5y8/S2oY4zWP07AkOJ4+jxCqdqn54=
+golang.org/x/mod v0.20.0 h1:utOm6MM3R3dnawAiJgn0y+xvuYRsm1RKM/4giyfDgV0=
+golang.org/x/mod v0.20.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
+golang.org/x/sync v0.8.0 h1:3NFvSEYkUoMifnESzZl15y791HH1qU2xm6eCJU5ZPXQ=
+golang.org/x/sync v0.8.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
+golang.org/x/sys v0.24.0 h1:Twjiwq9dn6R1fQcyiK+wQyHWfaz/BJB+YIpzU/Cv3Xg=
+golang.org/x/sys v0.24.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
+golang.org/x/tools v0.24.0 h1:J1shsA93PJUEVaUSaay7UXAyE8aimq3GW0pjlolpa24=
+golang.org/x/tools v0.24.0/go.mod h1:YhNqVBIfWHdzvTLs0d8LCuMhkKUgSUKldakyV7W/WDQ=
diff --git a/chacha20poly1305/chacha20poly1305_amd64.s b/chacha20poly1305/chacha20poly1305_amd64.s
index 731d2ac..fd5ee84 100644
--- a/chacha20poly1305/chacha20poly1305_amd64.s
+++ b/chacha20poly1305/chacha20poly1305_amd64.s
@@ -1,2715 +1,9762 @@
-// Copyright 2016 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// This file was originally from https://golang.org/cl/24717 by Vlad Krasnov of CloudFlare.
+// Code generated by command: go run chacha20poly1305_amd64_asm.go -out ../chacha20poly1305_amd64.s -pkg chacha20poly1305. DO NOT EDIT.
//go:build gc && !purego
#include "textflag.h"
-// General register allocation
-#define oup DI
-#define inp SI
-#define inl BX
-#define adp CX // free to reuse, after we hash the additional data
-#define keyp R8 // free to reuse, when we copy the key to stack
-#define itr2 R9 // general iterator
-#define itr1 CX // general iterator
-#define acc0 R10
-#define acc1 R11
-#define acc2 R12
-#define t0 R13
-#define t1 R14
-#define t2 R15
-#define t3 R8
-// Register and stack allocation for the SSE code
-#define rStore (0*16)(BP)
-#define sStore (1*16)(BP)
-#define state1Store (2*16)(BP)
-#define state2Store (3*16)(BP)
-#define tmpStore (4*16)(BP)
-#define ctr0Store (5*16)(BP)
-#define ctr1Store (6*16)(BP)
-#define ctr2Store (7*16)(BP)
-#define ctr3Store (8*16)(BP)
-#define A0 X0
-#define A1 X1
-#define A2 X2
-#define B0 X3
-#define B1 X4
-#define B2 X5
-#define C0 X6
-#define C1 X7
-#define C2 X8
-#define D0 X9
-#define D1 X10
-#define D2 X11
-#define T0 X12
-#define T1 X13
-#define T2 X14
-#define T3 X15
-#define A3 T0
-#define B3 T1
-#define C3 T2
-#define D3 T3
-// Register and stack allocation for the AVX2 code
-#define rsStoreAVX2 (0*32)(BP)
-#define state1StoreAVX2 (1*32)(BP)
-#define state2StoreAVX2 (2*32)(BP)
-#define ctr0StoreAVX2 (3*32)(BP)
-#define ctr1StoreAVX2 (4*32)(BP)
-#define ctr2StoreAVX2 (5*32)(BP)
-#define ctr3StoreAVX2 (6*32)(BP)
-#define tmpStoreAVX2 (7*32)(BP) // 256 bytes on stack
-#define AA0 Y0
-#define AA1 Y5
-#define AA2 Y6
-#define AA3 Y7
-#define BB0 Y14
-#define BB1 Y9
-#define BB2 Y10
-#define BB3 Y11
-#define CC0 Y12
-#define CC1 Y13
-#define CC2 Y8
-#define CC3 Y15
-#define DD0 Y4
-#define DD1 Y1
-#define DD2 Y2
-#define DD3 Y3
-#define TT0 DD3
-#define TT1 AA3
-#define TT2 BB3
-#define TT3 CC3
-// ChaCha20 constants
-DATA ·chacha20Constants<>+0x00(SB)/4, $0x61707865
-DATA ·chacha20Constants<>+0x04(SB)/4, $0x3320646e
-DATA ·chacha20Constants<>+0x08(SB)/4, $0x79622d32
-DATA ·chacha20Constants<>+0x0c(SB)/4, $0x6b206574
-DATA ·chacha20Constants<>+0x10(SB)/4, $0x61707865
-DATA ·chacha20Constants<>+0x14(SB)/4, $0x3320646e
-DATA ·chacha20Constants<>+0x18(SB)/4, $0x79622d32
-DATA ·chacha20Constants<>+0x1c(SB)/4, $0x6b206574
-// <<< 16 with PSHUFB
-DATA ·rol16<>+0x00(SB)/8, $0x0504070601000302
-DATA ·rol16<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
-DATA ·rol16<>+0x10(SB)/8, $0x0504070601000302
-DATA ·rol16<>+0x18(SB)/8, $0x0D0C0F0E09080B0A
-// <<< 8 with PSHUFB
-DATA ·rol8<>+0x00(SB)/8, $0x0605040702010003
-DATA ·rol8<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
-DATA ·rol8<>+0x10(SB)/8, $0x0605040702010003
-DATA ·rol8<>+0x18(SB)/8, $0x0E0D0C0F0A09080B
-DATA ·avx2InitMask<>+0x00(SB)/8, $0x0
-DATA ·avx2InitMask<>+0x08(SB)/8, $0x0
-DATA ·avx2InitMask<>+0x10(SB)/8, $0x1
-DATA ·avx2InitMask<>+0x18(SB)/8, $0x0
-
-DATA ·avx2IncMask<>+0x00(SB)/8, $0x2
-DATA ·avx2IncMask<>+0x08(SB)/8, $0x0
-DATA ·avx2IncMask<>+0x10(SB)/8, $0x2
-DATA ·avx2IncMask<>+0x18(SB)/8, $0x0
-// Poly1305 key clamp
-DATA ·polyClampMask<>+0x00(SB)/8, $0x0FFFFFFC0FFFFFFF
-DATA ·polyClampMask<>+0x08(SB)/8, $0x0FFFFFFC0FFFFFFC
-DATA ·polyClampMask<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF
-DATA ·polyClampMask<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF
-
-DATA ·sseIncMask<>+0x00(SB)/8, $0x1
-DATA ·sseIncMask<>+0x08(SB)/8, $0x0
-// To load/store the last < 16 bytes in a buffer
-DATA ·andMask<>+0x00(SB)/8, $0x00000000000000ff
-DATA ·andMask<>+0x08(SB)/8, $0x0000000000000000
-DATA ·andMask<>+0x10(SB)/8, $0x000000000000ffff
-DATA ·andMask<>+0x18(SB)/8, $0x0000000000000000
-DATA ·andMask<>+0x20(SB)/8, $0x0000000000ffffff
-DATA ·andMask<>+0x28(SB)/8, $0x0000000000000000
-DATA ·andMask<>+0x30(SB)/8, $0x00000000ffffffff
-DATA ·andMask<>+0x38(SB)/8, $0x0000000000000000
-DATA ·andMask<>+0x40(SB)/8, $0x000000ffffffffff
-DATA ·andMask<>+0x48(SB)/8, $0x0000000000000000
-DATA ·andMask<>+0x50(SB)/8, $0x0000ffffffffffff
-DATA ·andMask<>+0x58(SB)/8, $0x0000000000000000
-DATA ·andMask<>+0x60(SB)/8, $0x00ffffffffffffff
-DATA ·andMask<>+0x68(SB)/8, $0x0000000000000000
-DATA ·andMask<>+0x70(SB)/8, $0xffffffffffffffff
-DATA ·andMask<>+0x78(SB)/8, $0x0000000000000000
-DATA ·andMask<>+0x80(SB)/8, $0xffffffffffffffff
-DATA ·andMask<>+0x88(SB)/8, $0x00000000000000ff
-DATA ·andMask<>+0x90(SB)/8, $0xffffffffffffffff
-DATA ·andMask<>+0x98(SB)/8, $0x000000000000ffff
-DATA ·andMask<>+0xa0(SB)/8, $0xffffffffffffffff
-DATA ·andMask<>+0xa8(SB)/8, $0x0000000000ffffff
-DATA ·andMask<>+0xb0(SB)/8, $0xffffffffffffffff
-DATA ·andMask<>+0xb8(SB)/8, $0x00000000ffffffff
-DATA ·andMask<>+0xc0(SB)/8, $0xffffffffffffffff
-DATA ·andMask<>+0xc8(SB)/8, $0x000000ffffffffff
-DATA ·andMask<>+0xd0(SB)/8, $0xffffffffffffffff
-DATA ·andMask<>+0xd8(SB)/8, $0x0000ffffffffffff
-DATA ·andMask<>+0xe0(SB)/8, $0xffffffffffffffff
-DATA ·andMask<>+0xe8(SB)/8, $0x00ffffffffffffff
-
-GLOBL ·chacha20Constants<>(SB), (NOPTR+RODATA), $32
-GLOBL ·rol16<>(SB), (NOPTR+RODATA), $32
-GLOBL ·rol8<>(SB), (NOPTR+RODATA), $32
-GLOBL ·sseIncMask<>(SB), (NOPTR+RODATA), $16
-GLOBL ·avx2IncMask<>(SB), (NOPTR+RODATA), $32
-GLOBL ·avx2InitMask<>(SB), (NOPTR+RODATA), $32
-GLOBL ·polyClampMask<>(SB), (NOPTR+RODATA), $32
-GLOBL ·andMask<>(SB), (NOPTR+RODATA), $240
-// No PALIGNR in Go ASM yet (but VPALIGNR is present).
-#define shiftB0Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x04 // PALIGNR $4, X3, X3
-#define shiftB1Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xe4; BYTE $0x04 // PALIGNR $4, X4, X4
-#define shiftB2Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x04 // PALIGNR $4, X5, X5
-#define shiftB3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x04 // PALIGNR $4, X13, X13
-#define shiftC0Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xf6; BYTE $0x08 // PALIGNR $8, X6, X6
-#define shiftC1Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x08 // PALIGNR $8, X7, X7
-#define shiftC2Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc0; BYTE $0x08 // PALIGNR $8, X8, X8
-#define shiftC3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xf6; BYTE $0x08 // PALIGNR $8, X14, X14
-#define shiftD0Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc9; BYTE $0x0c // PALIGNR $12, X9, X9
-#define shiftD1Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xd2; BYTE $0x0c // PALIGNR $12, X10, X10
-#define shiftD2Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x0c // PALIGNR $12, X11, X11
-#define shiftD3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x0c // PALIGNR $12, X15, X15
-#define shiftB0Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x0c // PALIGNR $12, X3, X3
-#define shiftB1Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xe4; BYTE $0x0c // PALIGNR $12, X4, X4
-#define shiftB2Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x0c // PALIGNR $12, X5, X5
-#define shiftB3Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x0c // PALIGNR $12, X13, X13
-#define shiftC0Right shiftC0Left
-#define shiftC1Right shiftC1Left
-#define shiftC2Right shiftC2Left
-#define shiftC3Right shiftC3Left
-#define shiftD0Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc9; BYTE $0x04 // PALIGNR $4, X9, X9
-#define shiftD1Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xd2; BYTE $0x04 // PALIGNR $4, X10, X10
-#define shiftD2Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x04 // PALIGNR $4, X11, X11
-#define shiftD3Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x04 // PALIGNR $4, X15, X15
-
-// Some macros
-
-// ROL rotates the uint32s in register R left by N bits, using temporary T.
-#define ROL(N, R, T) \
- MOVO R, T; PSLLL $(N), T; PSRLL $(32-(N)), R; PXOR T, R
-
-// ROL16 rotates the uint32s in register R left by 16, using temporary T if needed.
-#ifdef GOAMD64_v2
-#define ROL16(R, T) PSHUFB ·rol16<>(SB), R
-#else
-#define ROL16(R, T) ROL(16, R, T)
-#endif
-
-// ROL8 rotates the uint32s in register R left by 8, using temporary T if needed.
-#ifdef GOAMD64_v2
-#define ROL8(R, T) PSHUFB ·rol8<>(SB), R
-#else
-#define ROL8(R, T) ROL(8, R, T)
-#endif
-
-#define chachaQR(A, B, C, D, T) \
- PADDD B, A; PXOR A, D; ROL16(D, T) \
- PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $12, T; PSRLL $20, B; PXOR T, B \
- PADDD B, A; PXOR A, D; ROL8(D, T) \
- PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $7, T; PSRLL $25, B; PXOR T, B
-
-#define chachaQR_AVX2(A, B, C, D, T) \
- VPADDD B, A, A; VPXOR A, D, D; VPSHUFB ·rol16<>(SB), D, D \
- VPADDD D, C, C; VPXOR C, B, B; VPSLLD $12, B, T; VPSRLD $20, B, B; VPXOR T, B, B \
- VPADDD B, A, A; VPXOR A, D, D; VPSHUFB ·rol8<>(SB), D, D \
- VPADDD D, C, C; VPXOR C, B, B; VPSLLD $7, B, T; VPSRLD $25, B, B; VPXOR T, B, B
-
-#define polyAdd(S) ADDQ S, acc0; ADCQ 8+S, acc1; ADCQ $1, acc2
-#define polyMulStage1 MOVQ (0*8)(BP), AX; MOVQ AX, t2; MULQ acc0; MOVQ AX, t0; MOVQ DX, t1; MOVQ (0*8)(BP), AX; MULQ acc1; IMULQ acc2, t2; ADDQ AX, t1; ADCQ DX, t2
-#define polyMulStage2 MOVQ (1*8)(BP), AX; MOVQ AX, t3; MULQ acc0; ADDQ AX, t1; ADCQ $0, DX; MOVQ DX, acc0; MOVQ (1*8)(BP), AX; MULQ acc1; ADDQ AX, t2; ADCQ $0, DX
-#define polyMulStage3 IMULQ acc2, t3; ADDQ acc0, t2; ADCQ DX, t3
-#define polyMulReduceStage MOVQ t0, acc0; MOVQ t1, acc1; MOVQ t2, acc2; ANDQ $3, acc2; MOVQ t2, t0; ANDQ $-4, t0; MOVQ t3, t1; SHRQ $2, t3, t2; SHRQ $2, t3; ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $0, acc2; ADDQ t2, acc0; ADCQ t3, acc1; ADCQ $0, acc2
-
-#define polyMulStage1_AVX2 MOVQ (0*8)(BP), DX; MOVQ DX, t2; MULXQ acc0, t0, t1; IMULQ acc2, t2; MULXQ acc1, AX, DX; ADDQ AX, t1; ADCQ DX, t2
-#define polyMulStage2_AVX2 MOVQ (1*8)(BP), DX; MULXQ acc0, acc0, AX; ADDQ acc0, t1; MULXQ acc1, acc1, t3; ADCQ acc1, t2; ADCQ $0, t3
-#define polyMulStage3_AVX2 IMULQ acc2, DX; ADDQ AX, t2; ADCQ DX, t3
-
-#define polyMul polyMulStage1; polyMulStage2; polyMulStage3; polyMulReduceStage
-#define polyMulAVX2 polyMulStage1_AVX2; polyMulStage2_AVX2; polyMulStage3_AVX2; polyMulReduceStage
-// ----------------------------------------------------------------------------
+// func polyHashADInternal<>()
TEXT polyHashADInternal<>(SB), NOSPLIT, $0
- // adp points to beginning of additional data
- // itr2 holds ad length
- XORQ acc0, acc0
- XORQ acc1, acc1
- XORQ acc2, acc2
- CMPQ itr2, $13
- JNE hashADLoop
+ // Hack: Must declare #define macros inside of a function due to Avo constraints
+ // ROL rotates the uint32s in register R left by N bits, using temporary T.
+ #define ROL(N, R, T) \
+ MOVO R, T; \
+ PSLLL $(N), T; \
+ PSRLL $(32-(N)), R; \
+ PXOR T, R
-openFastTLSAD:
- // Special treatment for the TLS case of 13 bytes
- MOVQ (adp), acc0
- MOVQ 5(adp), acc1
- SHRQ $24, acc1
- MOVQ $1, acc2
- polyMul
+ // ROL8 rotates the uint32s in register R left by 8, using temporary T if needed.
+ #ifdef GOAMD64_v2
+ #define ROL8(R, T) PSHUFB ·rol8<>(SB), R
+ #else
+ #define ROL8(R, T) ROL(8, R, T)
+ #endif
+
+ // ROL16 rotates the uint32s in register R left by 16, using temporary T if needed.
+ #ifdef GOAMD64_v2
+ #define ROL16(R, T) PSHUFB ·rol16<>(SB), R
+ #else
+ #define ROL16(R, T) ROL(16, R, T)
+ #endif
+ XORQ R10, R10
+ XORQ R11, R11
+ XORQ R12, R12
+ CMPQ R9, $0x0d
+ JNE hashADLoop
+ MOVQ (CX), R10
+ MOVQ 5(CX), R11
+ SHRQ $0x18, R11
+ MOVQ $0x00000001, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
RET
hashADLoop:
// Hash in 16 byte chunks
- CMPQ itr2, $16
- JB hashADTail
- polyAdd(0(adp))
- LEAQ (1*16)(adp), adp
- SUBQ $16, itr2
- polyMul
- JMP hashADLoop
+ CMPQ R9, $0x10
+ JB hashADTail
+ ADDQ (CX), R10
+ ADCQ 8(CX), R11
+ ADCQ $0x01, R12
+ LEAQ 16(CX), CX
+ SUBQ $0x10, R9
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ JMP hashADLoop
hashADTail:
- CMPQ itr2, $0
+ CMPQ R9, $0x00
JE hashADDone
// Hash last < 16 byte tail
- XORQ t0, t0
- XORQ t1, t1
- XORQ t2, t2
- ADDQ itr2, adp
+ XORQ R13, R13
+ XORQ R14, R14
+ XORQ R15, R15
+ ADDQ R9, CX
hashADTailLoop:
- SHLQ $8, t0, t1
- SHLQ $8, t0
- MOVB -1(adp), t2
- XORQ t2, t0
- DECQ adp
- DECQ itr2
- JNE hashADTailLoop
+ SHLQ $0x08, R13, R14
+ SHLQ $0x08, R13
+ MOVB -1(CX), R15
+ XORQ R15, R13
+ DECQ CX
+ DECQ R9
+ JNE hashADTailLoop
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
-hashADTailFinish:
- ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
- polyMul
-
- // Finished AD
hashADDone:
RET
-// ----------------------------------------------------------------------------
-// func chacha20Poly1305Open(dst, key, src, ad []byte) bool
-TEXT ·chacha20Poly1305Open(SB), 0, $288-97
+// func chacha20Poly1305Open(dst []byte, key []uint32, src []byte, ad []byte) bool
+// Requires: AVX, AVX2, BMI2, CMOV, SSE2
+TEXT ·chacha20Poly1305Open(SB), $288-97
// For aligned stack access
MOVQ SP, BP
- ADDQ $32, BP
+ ADDQ $0x20, BP
ANDQ $-32, BP
- MOVQ dst+0(FP), oup
- MOVQ key+24(FP), keyp
- MOVQ src+48(FP), inp
- MOVQ src_len+56(FP), inl
- MOVQ ad+72(FP), adp
+ MOVQ dst_base+0(FP), DI
+ MOVQ key_base+24(FP), R8
+ MOVQ src_base+48(FP), SI
+ MOVQ src_len+56(FP), BX
+ MOVQ ad_base+72(FP), CX
// Check for AVX2 support
- CMPB ·useAVX2(SB), $1
+ CMPB ·useAVX2+0(SB), $0x01
JE chacha20Poly1305Open_AVX2
// Special optimization, for very short buffers
- CMPQ inl, $128
- JBE openSSE128 // About 16% faster
+ CMPQ BX, $0x80
+ JBE openSSE128
// For long buffers, prepare the poly key first
- MOVOU ·chacha20Constants<>(SB), A0
- MOVOU (1*16)(keyp), B0
- MOVOU (2*16)(keyp), C0
- MOVOU (3*16)(keyp), D0
- MOVO D0, T1
+ MOVOU ·chacha20Constants<>+0(SB), X0
+ MOVOU 16(R8), X3
+ MOVOU 32(R8), X6
+ MOVOU 48(R8), X9
+ MOVO X9, X13
// Store state on stack for future use
- MOVO B0, state1Store
- MOVO C0, state2Store
- MOVO D0, ctr3Store
- MOVQ $10, itr2
+ MOVO X3, 32(BP)
+ MOVO X6, 48(BP)
+ MOVO X9, 128(BP)
+ MOVQ $0x0000000a, R9
openSSEPreparePolyKey:
- chachaQR(A0, B0, C0, D0, T0)
- shiftB0Left; shiftC0Left; shiftD0Left
- chachaQR(A0, B0, C0, D0, T0)
- shiftB0Right; shiftC0Right; shiftD0Right
- DECQ itr2
- JNE openSSEPreparePolyKey
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL16(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X3
+ PXOR X12, X3
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL8(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X3
+ PXOR X12, X3
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc9
+ BYTE $0x0c
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL16(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X3
+ PXOR X12, X3
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL8(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X3
+ PXOR X12, X3
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc9
+ BYTE $0x04
+ DECQ R9
+ JNE openSSEPreparePolyKey
// A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
- PADDL ·chacha20Constants<>(SB), A0; PADDL state1Store, B0
+ PADDL ·chacha20Constants<>+0(SB), X0
+ PADDL 32(BP), X3
// Clamp and store the key
- PAND ·polyClampMask<>(SB), A0
- MOVO A0, rStore; MOVO B0, sStore
+ PAND ·polyClampMask<>+0(SB), X0
+ MOVO X0, (BP)
+ MOVO X3, 16(BP)
// Hash AAD
- MOVQ ad_len+80(FP), itr2
+ MOVQ ad_len+80(FP), R9
CALL polyHashADInternal<>(SB)
openSSEMainLoop:
- CMPQ inl, $256
+ CMPQ BX, $0x00000100
JB openSSEMainLoopDone
// Load state, increment counter blocks
- MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0
- MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
- MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
- MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
+ MOVO ·chacha20Constants<>+0(SB), X0
+ MOVO 32(BP), X3
+ MOVO 48(BP), X6
+ MOVO 128(BP), X9
+ PADDL ·sseIncMask<>+0(SB), X9
+ MOVO X0, X1
+ MOVO X3, X4
+ MOVO X6, X7
+ MOVO X9, X10
+ PADDL ·sseIncMask<>+0(SB), X10
+ MOVO X1, X2
+ MOVO X4, X5
+ MOVO X7, X8
+ MOVO X10, X11
+ PADDL ·sseIncMask<>+0(SB), X11
+ MOVO X2, X12
+ MOVO X5, X13
+ MOVO X8, X14
+ MOVO X11, X15
+ PADDL ·sseIncMask<>+0(SB), X15
// Store counters
- MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
+ MOVO X9, 80(BP)
+ MOVO X10, 96(BP)
+ MOVO X11, 112(BP)
+ MOVO X15, 128(BP)
- // There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we hash 2 blocks, and for the remaining 4 only 1 block - for a total of 16
- MOVQ $4, itr1
- MOVQ inp, itr2
+ // There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we hash
+ // 2 blocks, and for the remaining 4 only 1 block - for a total of 16
+ MOVQ $0x00000004, CX
+ MOVQ SI, R9
openSSEInternalLoop:
- MOVO C3, tmpStore
- chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
- MOVO tmpStore, C3
- MOVO C1, tmpStore
- chachaQR(A3, B3, C3, D3, C1)
- MOVO tmpStore, C1
- polyAdd(0(itr2))
- shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left
- shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left
- shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left
- polyMulStage1
- polyMulStage2
- LEAQ (2*8)(itr2), itr2
- MOVO C3, tmpStore
- chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
- MOVO tmpStore, C3
- MOVO C1, tmpStore
- polyMulStage3
- chachaQR(A3, B3, C3, D3, C1)
- MOVO tmpStore, C1
- polyMulReduceStage
- shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
- shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
- shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
- DECQ itr1
- JGE openSSEInternalLoop
-
- polyAdd(0(itr2))
- polyMul
- LEAQ (2*8)(itr2), itr2
-
- CMPQ itr1, $-6
- JG openSSEInternalLoop
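+	// Expanded chachaQR macros: four ChaCha20 blocks advance in parallel, while one
+	// Poly1305 block is absorbed and multiplied in (polyAdd/polyMul stages) between
+	// the vector instructions.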
+ MOVO X14, 64(BP)
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL16(X9, X14)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X14
+ PSLLL $0x0c, X14
+ PSRLL $0x14, X3
+ PXOR X14, X3
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL8(X9, X14)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X14
+ PSLLL $0x07, X14
+ PSRLL $0x19, X3
+ PXOR X14, X3
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL16(X10, X14)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X14
+ PSLLL $0x0c, X14
+ PSRLL $0x14, X4
+ PXOR X14, X4
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL8(X10, X14)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X14
+ PSLLL $0x07, X14
+ PSRLL $0x19, X4
+ PXOR X14, X4
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL16(X11, X14)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X14
+ PSLLL $0x0c, X14
+ PSRLL $0x14, X5
+ PXOR X14, X5
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL8(X11, X14)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X14
+ PSLLL $0x07, X14
+ PSRLL $0x19, X5
+ PXOR X14, X5
+ MOVO 64(BP), X14
+ MOVO X7, 64(BP)
+ PADDD X13, X12
+ PXOR X12, X15
+ ROL16(X15, X7)
+ PADDD X15, X14
+ PXOR X14, X13
+ MOVO X13, X7
+ PSLLL $0x0c, X7
+ PSRLL $0x14, X13
+ PXOR X7, X13
+ PADDD X13, X12
+ PXOR X12, X15
+ ROL8(X15, X7)
+ PADDD X15, X14
+ PXOR X14, X13
+ MOVO X13, X7
+ PSLLL $0x07, X7
+ PSRLL $0x19, X13
+ PXOR X7, X13
+ MOVO 64(BP), X7
+ ADDQ (R9), R10
+ ADCQ 8(R9), R11
+ ADCQ $0x01, R12
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xe4
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xed
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xed
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc0
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc9
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xd2
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x0c
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ LEAQ 16(R9), R9
+ MOVO X14, 64(BP)
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL16(X9, X14)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X14
+ PSLLL $0x0c, X14
+ PSRLL $0x14, X3
+ PXOR X14, X3
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL8(X9, X14)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X14
+ PSLLL $0x07, X14
+ PSRLL $0x19, X3
+ PXOR X14, X3
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL16(X10, X14)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X14
+ PSLLL $0x0c, X14
+ PSRLL $0x14, X4
+ PXOR X14, X4
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL8(X10, X14)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X14
+ PSLLL $0x07, X14
+ PSRLL $0x19, X4
+ PXOR X14, X4
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL16(X11, X14)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X14
+ PSLLL $0x0c, X14
+ PSRLL $0x14, X5
+ PXOR X14, X5
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL8(X11, X14)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X14
+ PSLLL $0x07, X14
+ PSRLL $0x19, X5
+ PXOR X14, X5
+ MOVO 64(BP), X14
+ MOVO X7, 64(BP)
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ PADDD X13, X12
+ PXOR X12, X15
+ ROL16(X15, X7)
+ PADDD X15, X14
+ PXOR X14, X13
+ MOVO X13, X7
+ PSLLL $0x0c, X7
+ PSRLL $0x14, X13
+ PXOR X7, X13
+ PADDD X13, X12
+ PXOR X12, X15
+ ROL8(X15, X7)
+ PADDD X15, X14
+ PXOR X14, X13
+ MOVO X13, X7
+ PSLLL $0x07, X7
+ PSRLL $0x19, X13
+ PXOR X7, X13
+ MOVO 64(BP), X7
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xe4
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xed
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xed
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc0
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc9
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xd2
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x04
+ DECQ CX
+ JGE openSSEInternalLoop
+ ADDQ (R9), R10
+ ADCQ 8(R9), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ LEAQ 16(R9), R9
+ CMPQ CX, $-6
+ JG openSSEInternalLoop
// Add in the state
- PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
- PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
- PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
- PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
+ PADDD ·chacha20Constants<>+0(SB), X0
+ PADDD ·chacha20Constants<>+0(SB), X1
+ PADDD ·chacha20Constants<>+0(SB), X2
+ PADDD ·chacha20Constants<>+0(SB), X12
+ PADDD 32(BP), X3
+ PADDD 32(BP), X4
+ PADDD 32(BP), X5
+ PADDD 32(BP), X13
+ PADDD 48(BP), X6
+ PADDD 48(BP), X7
+ PADDD 48(BP), X8
+ PADDD 48(BP), X14
+ PADDD 80(BP), X9
+ PADDD 96(BP), X10
+ PADDD 112(BP), X11
+ PADDD 128(BP), X15
// Load - xor - store
- MOVO D3, tmpStore
- MOVOU (0*16)(inp), D3; PXOR D3, A0; MOVOU A0, (0*16)(oup)
- MOVOU (1*16)(inp), D3; PXOR D3, B0; MOVOU B0, (1*16)(oup)
- MOVOU (2*16)(inp), D3; PXOR D3, C0; MOVOU C0, (2*16)(oup)
- MOVOU (3*16)(inp), D3; PXOR D3, D0; MOVOU D0, (3*16)(oup)
- MOVOU (4*16)(inp), D0; PXOR D0, A1; MOVOU A1, (4*16)(oup)
- MOVOU (5*16)(inp), D0; PXOR D0, B1; MOVOU B1, (5*16)(oup)
- MOVOU (6*16)(inp), D0; PXOR D0, C1; MOVOU C1, (6*16)(oup)
- MOVOU (7*16)(inp), D0; PXOR D0, D1; MOVOU D1, (7*16)(oup)
- MOVOU (8*16)(inp), D0; PXOR D0, A2; MOVOU A2, (8*16)(oup)
- MOVOU (9*16)(inp), D0; PXOR D0, B2; MOVOU B2, (9*16)(oup)
- MOVOU (10*16)(inp), D0; PXOR D0, C2; MOVOU C2, (10*16)(oup)
- MOVOU (11*16)(inp), D0; PXOR D0, D2; MOVOU D2, (11*16)(oup)
- MOVOU (12*16)(inp), D0; PXOR D0, A3; MOVOU A3, (12*16)(oup)
- MOVOU (13*16)(inp), D0; PXOR D0, B3; MOVOU B3, (13*16)(oup)
- MOVOU (14*16)(inp), D0; PXOR D0, C3; MOVOU C3, (14*16)(oup)
- MOVOU (15*16)(inp), D0; PXOR tmpStore, D0; MOVOU D0, (15*16)(oup)
- LEAQ 256(inp), inp
- LEAQ 256(oup), oup
- SUBQ $256, inl
+ MOVO X15, 64(BP)
+ MOVOU (SI), X15
+ PXOR X15, X0
+ MOVOU X0, (DI)
+ MOVOU 16(SI), X15
+ PXOR X15, X3
+ MOVOU X3, 16(DI)
+ MOVOU 32(SI), X15
+ PXOR X15, X6
+ MOVOU X6, 32(DI)
+ MOVOU 48(SI), X15
+ PXOR X15, X9
+ MOVOU X9, 48(DI)
+ MOVOU 64(SI), X9
+ PXOR X9, X1
+ MOVOU X1, 64(DI)
+ MOVOU 80(SI), X9
+ PXOR X9, X4
+ MOVOU X4, 80(DI)
+ MOVOU 96(SI), X9
+ PXOR X9, X7
+ MOVOU X7, 96(DI)
+ MOVOU 112(SI), X9
+ PXOR X9, X10
+ MOVOU X10, 112(DI)
+ MOVOU 128(SI), X9
+ PXOR X9, X2
+ MOVOU X2, 128(DI)
+ MOVOU 144(SI), X9
+ PXOR X9, X5
+ MOVOU X5, 144(DI)
+ MOVOU 160(SI), X9
+ PXOR X9, X8
+ MOVOU X8, 160(DI)
+ MOVOU 176(SI), X9
+ PXOR X9, X11
+ MOVOU X11, 176(DI)
+ MOVOU 192(SI), X9
+ PXOR X9, X12
+ MOVOU X12, 192(DI)
+ MOVOU 208(SI), X9
+ PXOR X9, X13
+ MOVOU X13, 208(DI)
+ MOVOU 224(SI), X9
+ PXOR X9, X14
+ MOVOU X14, 224(DI)
+ MOVOU 240(SI), X9
+ PXOR 64(BP), X9
+ MOVOU X9, 240(DI)
+ LEAQ 256(SI), SI
+ LEAQ 256(DI), DI
+ SUBQ $0x00000100, BX
JMP openSSEMainLoop
openSSEMainLoopDone:
// Handle the various tail sizes efficiently
- TESTQ inl, inl
+ TESTQ BX, BX
JE openSSEFinalize
- CMPQ inl, $64
+ CMPQ BX, $0x40
JBE openSSETail64
- CMPQ inl, $128
+ CMPQ BX, $0x80
JBE openSSETail128
- CMPQ inl, $192
+ CMPQ BX, $0xc0
JBE openSSETail192
JMP openSSETail256
openSSEFinalize:
// Hash in the PT, AAD lengths
- ADDQ ad_len+80(FP), acc0; ADCQ src_len+56(FP), acc1; ADCQ $1, acc2
- polyMul
+ ADDQ ad_len+80(FP), R10
+ ADCQ src_len+56(FP), R11
+ ADCQ $0x01, R12
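+	// Expanded polyMul: multiply the accumulator by the clamped key r and reduce
+	// modulo 2^130 - 5.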
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
// Final reduce
- MOVQ acc0, t0
- MOVQ acc1, t1
- MOVQ acc2, t2
- SUBQ $-5, acc0
- SBBQ $-1, acc1
- SBBQ $3, acc2
- CMOVQCS t0, acc0
- CMOVQCS t1, acc1
- CMOVQCS t2, acc2
+ MOVQ R10, R13
+ MOVQ R11, R14
+ MOVQ R12, R15
+ SUBQ $-5, R10
+ SBBQ $-1, R11
+ SBBQ $0x03, R12
+ CMOVQCS R13, R10
+ CMOVQCS R14, R11
+ CMOVQCS R15, R12
// Add in the "s" part of the key
- ADDQ 0+sStore, acc0
- ADCQ 8+sStore, acc1
+ ADDQ 16(BP), R10
+ ADCQ 24(BP), R11
// Finally, constant time compare to the tag at the end of the message
XORQ AX, AX
- MOVQ $1, DX
- XORQ (0*8)(inp), acc0
- XORQ (1*8)(inp), acc1
- ORQ acc1, acc0
+ MOVQ $0x00000001, DX
+ XORQ (SI), R10
+ XORQ 8(SI), R11
+ ORQ R11, R10
CMOVQEQ DX, AX
// Return true iff tags are equal
MOVB AX, ret+96(FP)
RET
-// ----------------------------------------------------------------------------
-// Special optimization for buffers smaller than 129 bytes
openSSE128:
- // For up to 128 bytes of ciphertext and 64 bytes for the poly key, we require to process three blocks
- MOVOU ·chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0
- MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
- MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
- MOVO B0, T1; MOVO C0, T2; MOVO D1, T3
- MOVQ $10, itr2
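+	// Special optimization for buffers smaller than 129 bytes: up to 128 bytes of
+	// ciphertext plus 64 bytes for the poly key need only three ChaCha20 blocks.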
+ MOVOU ·chacha20Constants<>+0(SB), X0
+ MOVOU 16(R8), X3
+ MOVOU 32(R8), X6
+ MOVOU 48(R8), X9
+ MOVO X0, X1
+ MOVO X3, X4
+ MOVO X6, X7
+ MOVO X9, X10
+ PADDL ·sseIncMask<>+0(SB), X10
+ MOVO X1, X2
+ MOVO X4, X5
+ MOVO X7, X8
+ MOVO X10, X11
+ PADDL ·sseIncMask<>+0(SB), X11
+ MOVO X3, X13
+ MOVO X6, X14
+ MOVO X10, X15
+ MOVQ $0x0000000a, R9
openSSE128InnerCipherLoop:
- chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
- shiftB0Left; shiftB1Left; shiftB2Left
- shiftC0Left; shiftC1Left; shiftC2Left
- shiftD0Left; shiftD1Left; shiftD2Left
- chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
- shiftB0Right; shiftB1Right; shiftB2Right
- shiftC0Right; shiftC1Right; shiftC2Right
- shiftD0Right; shiftD1Right; shiftD2Right
- DECQ itr2
- JNE openSSE128InnerCipherLoop
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL16(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X3
+ PXOR X12, X3
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL8(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X3
+ PXOR X12, X3
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL16(X10, X12)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X4
+ PXOR X12, X4
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL8(X10, X12)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X4
+ PXOR X12, X4
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL16(X11, X12)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X5
+ PXOR X12, X5
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL8(X11, X12)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X5
+ PXOR X12, X5
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xe4
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xed
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc0
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc9
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xd2
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x0c
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL16(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X3
+ PXOR X12, X3
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL8(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X3
+ PXOR X12, X3
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL16(X10, X12)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X4
+ PXOR X12, X4
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL8(X10, X12)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X4
+ PXOR X12, X4
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL16(X11, X12)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X5
+ PXOR X12, X5
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL8(X11, X12)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X5
+ PXOR X12, X5
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xe4
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xed
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc0
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc9
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xd2
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x04
+ DECQ R9
+ JNE openSSE128InnerCipherLoop
// A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
- PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
- PADDL T1, B0; PADDL T1, B1; PADDL T1, B2
- PADDL T2, C1; PADDL T2, C2
- PADDL T3, D1; PADDL ·sseIncMask<>(SB), T3; PADDL T3, D2
+ PADDL ·chacha20Constants<>+0(SB), X0
+ PADDL ·chacha20Constants<>+0(SB), X1
+ PADDL ·chacha20Constants<>+0(SB), X2
+ PADDL X13, X3
+ PADDL X13, X4
+ PADDL X13, X5
+ PADDL X14, X7
+ PADDL X14, X8
+ PADDL X15, X10
+ PADDL ·sseIncMask<>+0(SB), X15
+ PADDL X15, X11
// Clamp and store the key
- PAND ·polyClampMask<>(SB), A0
- MOVOU A0, rStore; MOVOU B0, sStore
+ PAND ·polyClampMask<>+0(SB), X0
+ MOVOU X0, (BP)
+ MOVOU X3, 16(BP)
// Hash
- MOVQ ad_len+80(FP), itr2
+ MOVQ ad_len+80(FP), R9
CALL polyHashADInternal<>(SB)
openSSE128Open:
- CMPQ inl, $16
+ CMPQ BX, $0x10
JB openSSETail16
- SUBQ $16, inl
+ SUBQ $0x10, BX
// Load for hashing
- polyAdd(0(inp))
+ ADDQ (SI), R10
+ ADCQ 8(SI), R11
+ ADCQ $0x01, R12
// Load for decryption
- MOVOU (inp), T0; PXOR T0, A1; MOVOU A1, (oup)
- LEAQ (1*16)(inp), inp
- LEAQ (1*16)(oup), oup
- polyMul
+ MOVOU (SI), X12
+ PXOR X12, X1
+ MOVOU X1, (DI)
+ LEAQ 16(SI), SI
+ LEAQ 16(DI), DI
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
// Shift the stream "left"
- MOVO B1, A1
- MOVO C1, B1
- MOVO D1, C1
- MOVO A2, D1
- MOVO B2, A2
- MOVO C2, B2
- MOVO D2, C2
+ MOVO X4, X1
+ MOVO X7, X4
+ MOVO X10, X7
+ MOVO X2, X10
+ MOVO X5, X2
+ MOVO X8, X5
+ MOVO X11, X8
JMP openSSE128Open
openSSETail16:
- TESTQ inl, inl
+ TESTQ BX, BX
JE openSSEFinalize
// We can safely load the CT from the end, because it is padded with the MAC
- MOVQ inl, itr2
- SHLQ $4, itr2
- LEAQ ·andMask<>(SB), t0
- MOVOU (inp), T0
- ADDQ inl, inp
- PAND -16(t0)(itr2*1), T0
- MOVO T0, 0+tmpStore
- MOVQ T0, t0
- MOVQ 8+tmpStore, t1
- PXOR A1, T0
+ MOVQ BX, R9
+ SHLQ $0x04, R9
+ LEAQ ·andMask<>+0(SB), R13
+ MOVOU (SI), X12
+ ADDQ BX, SI
+ PAND -16(R13)(R9*1), X12
+ MOVO X12, 64(BP)
+ MOVQ X12, R13
+ MOVQ 72(BP), R14
+ PXOR X1, X12
// We can only store one byte at a time, since plaintext can be shorter than 16 bytes
openSSETail16Store:
- MOVQ T0, t3
- MOVB t3, (oup)
- PSRLDQ $1, T0
- INCQ oup
- DECQ inl
+ MOVQ X12, R8
+ MOVB R8, (DI)
+ PSRLDQ $0x01, X12
+ INCQ DI
+ DECQ BX
JNE openSSETail16Store
- ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
- polyMul
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
JMP openSSEFinalize
-// ----------------------------------------------------------------------------
-// Special optimization for the last 64 bytes of ciphertext
openSSETail64:
- // Need to decrypt up to 64 bytes - prepare single block
- MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store
- XORQ itr2, itr2
- MOVQ inl, itr1
- CMPQ itr1, $16
- JB openSSETail64LoopB
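+	// Special optimization for the last 64 bytes of ciphertext: prepare a single block.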
+ MOVO ·chacha20Constants<>+0(SB), X0
+ MOVO 32(BP), X3
+ MOVO 48(BP), X6
+ MOVO 128(BP), X9
+ PADDL ·sseIncMask<>+0(SB), X9
+ MOVO X9, 80(BP)
+ XORQ R9, R9
+ MOVQ BX, CX
+ CMPQ CX, $0x10
+ JB openSSETail64LoopB
openSSETail64LoopA:
- // Perform ChaCha rounds, while hashing the remaining input
- polyAdd(0(inp)(itr2*1))
- polyMul
- SUBQ $16, itr1
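+	// Perform ChaCha rounds, while hashing the remaining input.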
+ ADDQ (SI)(R9*1), R10
+ ADCQ 8(SI)(R9*1), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ SUBQ $0x10, CX
openSSETail64LoopB:
- ADDQ $16, itr2
- chachaQR(A0, B0, C0, D0, T0)
- shiftB0Left; shiftC0Left; shiftD0Left
- chachaQR(A0, B0, C0, D0, T0)
- shiftB0Right; shiftC0Right; shiftD0Right
-
- CMPQ itr1, $16
- JAE openSSETail64LoopA
-
- CMPQ itr2, $160
- JNE openSSETail64LoopB
-
- PADDL ·chacha20Constants<>(SB), A0; PADDL state1Store, B0; PADDL state2Store, C0; PADDL ctr0Store, D0
+ ADDQ $0x10, R9
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL16(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X3
+ PXOR X12, X3
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL8(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X3
+ PXOR X12, X3
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc9
+ BYTE $0x0c
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL16(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X3
+ PXOR X12, X3
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL8(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X3
+ PXOR X12, X3
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc9
+ BYTE $0x04
+ CMPQ CX, $0x10
+ JAE openSSETail64LoopA
+ CMPQ R9, $0xa0
+ JNE openSSETail64LoopB
+ PADDL ·chacha20Constants<>+0(SB), X0
+ PADDL 32(BP), X3
+ PADDL 48(BP), X6
+ PADDL 80(BP), X9
openSSETail64DecLoop:
- CMPQ inl, $16
+ CMPQ BX, $0x10
JB openSSETail64DecLoopDone
- SUBQ $16, inl
- MOVOU (inp), T0
- PXOR T0, A0
- MOVOU A0, (oup)
- LEAQ 16(inp), inp
- LEAQ 16(oup), oup
- MOVO B0, A0
- MOVO C0, B0
- MOVO D0, C0
+ SUBQ $0x10, BX
+ MOVOU (SI), X12
+ PXOR X12, X0
+ MOVOU X0, (DI)
+ LEAQ 16(SI), SI
+ LEAQ 16(DI), DI
+ MOVO X3, X0
+ MOVO X6, X3
+ MOVO X9, X6
JMP openSSETail64DecLoop
openSSETail64DecLoopDone:
- MOVO A0, A1
+ MOVO X0, X1
JMP openSSETail16
-// ----------------------------------------------------------------------------
-// Special optimization for the last 128 bytes of ciphertext
openSSETail128:
- // Need to decrypt up to 128 bytes - prepare two blocks
- MOVO ·chacha20Constants<>(SB), A1; MOVO state1Store, B1; MOVO state2Store, C1; MOVO ctr3Store, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr0Store
- MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr1Store
- XORQ itr2, itr2
- MOVQ inl, itr1
- ANDQ $-16, itr1
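+	// Special optimization for the last 128 bytes of ciphertext: prepare two blocks.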
+ MOVO ·chacha20Constants<>+0(SB), X1
+ MOVO 32(BP), X4
+ MOVO 48(BP), X7
+ MOVO 128(BP), X10
+ PADDL ·sseIncMask<>+0(SB), X10
+ MOVO X10, 80(BP)
+ MOVO X1, X0
+ MOVO X4, X3
+ MOVO X7, X6
+ MOVO X10, X9
+ PADDL ·sseIncMask<>+0(SB), X9
+ MOVO X9, 96(BP)
+ XORQ R9, R9
+ MOVQ BX, CX
+ ANDQ $-16, CX
openSSETail128LoopA:
- // Perform ChaCha rounds, while hashing the remaining input
- polyAdd(0(inp)(itr2*1))
- polyMul
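+	// Perform ChaCha rounds, while hashing the remaining input.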
+ ADDQ (SI)(R9*1), R10
+ ADCQ 8(SI)(R9*1), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
openSSETail128LoopB:
- ADDQ $16, itr2
- chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
- shiftB0Left; shiftC0Left; shiftD0Left
- shiftB1Left; shiftC1Left; shiftD1Left
- chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
- shiftB0Right; shiftC0Right; shiftD0Right
- shiftB1Right; shiftC1Right; shiftD1Right
+ ADDQ $0x10, R9
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL16(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X3
+ PXOR X12, X3
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL8(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X3
+ PXOR X12, X3
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL16(X10, X12)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X4
+ PXOR X12, X4
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL8(X10, X12)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X4
+ PXOR X12, X4
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc9
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xe4
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xd2
+ BYTE $0x0c
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL16(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X3
+ PXOR X12, X3
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL8(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X3
+ PXOR X12, X3
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL16(X10, X12)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X4
+ PXOR X12, X4
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL8(X10, X12)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X4
+ PXOR X12, X4
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc9
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xe4
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xd2
+ BYTE $0x04
+ CMPQ R9, CX
+ JB openSSETail128LoopA
+ CMPQ R9, $0xa0
+ JNE openSSETail128LoopB
+ PADDL ·chacha20Constants<>+0(SB), X0
+ PADDL ·chacha20Constants<>+0(SB), X1
+ PADDL 32(BP), X3
+ PADDL 32(BP), X4
+ PADDL 48(BP), X6
+ PADDL 48(BP), X7
+ PADDL 96(BP), X9
+ PADDL 80(BP), X10
+ MOVOU (SI), X12
+ MOVOU 16(SI), X13
+ MOVOU 32(SI), X14
+ MOVOU 48(SI), X15
+ PXOR X12, X1
+ PXOR X13, X4
+ PXOR X14, X7
+ PXOR X15, X10
+ MOVOU X1, (DI)
+ MOVOU X4, 16(DI)
+ MOVOU X7, 32(DI)
+ MOVOU X10, 48(DI)
+ SUBQ $0x40, BX
+ LEAQ 64(SI), SI
+ LEAQ 64(DI), DI
+ JMP openSSETail64DecLoop
- CMPQ itr2, itr1
- JB openSSETail128LoopA
-
- CMPQ itr2, $160
- JNE openSSETail128LoopB
-
- PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1
- PADDL state1Store, B0; PADDL state1Store, B1
- PADDL state2Store, C0; PADDL state2Store, C1
- PADDL ctr1Store, D0; PADDL ctr0Store, D1
-
- MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
- PXOR T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1
- MOVOU A1, (0*16)(oup); MOVOU B1, (1*16)(oup); MOVOU C1, (2*16)(oup); MOVOU D1, (3*16)(oup)
-
- SUBQ $64, inl
- LEAQ 64(inp), inp
- LEAQ 64(oup), oup
- JMP openSSETail64DecLoop
-
-// ----------------------------------------------------------------------------
-// Special optimization for the last 192 bytes of ciphertext
openSSETail192:
- // Need to decrypt up to 192 bytes - prepare three blocks
- MOVO ·chacha20Constants<>(SB), A2; MOVO state1Store, B2; MOVO state2Store, C2; MOVO ctr3Store, D2; PADDL ·sseIncMask<>(SB), D2; MOVO D2, ctr0Store
- MOVO A2, A1; MOVO B2, B1; MOVO C2, C1; MOVO D2, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store
- MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr2Store
-
- MOVQ inl, itr1
- MOVQ $160, itr2
- CMPQ itr1, $160
- CMOVQGT itr2, itr1
- ANDQ $-16, itr1
- XORQ itr2, itr2
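+	// Special optimization for the last 192 bytes of ciphertext: prepare three blocks.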
+ MOVO ·chacha20Constants<>+0(SB), X2
+ MOVO 32(BP), X5
+ MOVO 48(BP), X8
+ MOVO 128(BP), X11
+ PADDL ·sseIncMask<>+0(SB), X11
+ MOVO X11, 80(BP)
+ MOVO X2, X1
+ MOVO X5, X4
+ MOVO X8, X7
+ MOVO X11, X10
+ PADDL ·sseIncMask<>+0(SB), X10
+ MOVO X10, 96(BP)
+ MOVO X1, X0
+ MOVO X4, X3
+ MOVO X7, X6
+ MOVO X10, X9
+ PADDL ·sseIncMask<>+0(SB), X9
+ MOVO X9, 112(BP)
+ MOVQ BX, CX
+ MOVQ $0x000000a0, R9
+ CMPQ CX, $0xa0
+ CMOVQGT R9, CX
+ ANDQ $-16, CX
+ XORQ R9, R9
openSSLTail192LoopA:
- // Perform ChaCha rounds, while hashing the remaining input
- polyAdd(0(inp)(itr2*1))
- polyMul
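+	// Perform ChaCha rounds, while hashing the remaining input.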
+ ADDQ (SI)(R9*1), R10
+ ADCQ 8(SI)(R9*1), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
openSSLTail192LoopB:
- ADDQ $16, itr2
- chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
- shiftB0Left; shiftC0Left; shiftD0Left
- shiftB1Left; shiftC1Left; shiftD1Left
- shiftB2Left; shiftC2Left; shiftD2Left
-
- chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
- shiftB0Right; shiftC0Right; shiftD0Right
- shiftB1Right; shiftC1Right; shiftD1Right
- shiftB2Right; shiftC2Right; shiftD2Right
-
- CMPQ itr2, itr1
- JB openSSLTail192LoopA
-
- CMPQ itr2, $160
- JNE openSSLTail192LoopB
-
- CMPQ inl, $176
- JB openSSLTail192Store
-
- polyAdd(160(inp))
- polyMul
-
- CMPQ inl, $192
- JB openSSLTail192Store
-
- polyAdd(176(inp))
- polyMul
+ ADDQ $0x10, R9
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL16(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X3
+ PXOR X12, X3
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL8(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X3
+ PXOR X12, X3
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL16(X10, X12)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X4
+ PXOR X12, X4
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL8(X10, X12)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X4
+ PXOR X12, X4
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL16(X11, X12)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X5
+ PXOR X12, X5
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL8(X11, X12)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X5
+ PXOR X12, X5
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc9
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xe4
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xd2
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xed
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc0
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x0c
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL16(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X3
+ PXOR X12, X3
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL8(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X3
+ PXOR X12, X3
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL16(X10, X12)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X4
+ PXOR X12, X4
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL8(X10, X12)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X4
+ PXOR X12, X4
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL16(X11, X12)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X5
+ PXOR X12, X5
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL8(X11, X12)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X5
+ PXOR X12, X5
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc9
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xe4
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xd2
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xed
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc0
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x04
+ CMPQ R9, CX
+ JB openSSLTail192LoopA
+ CMPQ R9, $0xa0
+ JNE openSSLTail192LoopB
+ CMPQ BX, $0xb0
+ JB openSSLTail192Store
+ ADDQ 160(SI), R10
+ ADCQ 168(SI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ CMPQ BX, $0xc0
+ JB openSSLTail192Store
+ ADDQ 176(SI), R10
+ ADCQ 184(SI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
openSSLTail192Store:
- PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
- PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2
- PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2
- PADDL ctr2Store, D0; PADDL ctr1Store, D1; PADDL ctr0Store, D2
+ PADDL ·chacha20Constants<>+0(SB), X0
+ PADDL ·chacha20Constants<>+0(SB), X1
+ PADDL ·chacha20Constants<>+0(SB), X2
+ PADDL 32(BP), X3
+ PADDL 32(BP), X4
+ PADDL 32(BP), X5
+ PADDL 48(BP), X6
+ PADDL 48(BP), X7
+ PADDL 48(BP), X8
+ PADDL 112(BP), X9
+ PADDL 96(BP), X10
+ PADDL 80(BP), X11
+ MOVOU (SI), X12
+ MOVOU 16(SI), X13
+ MOVOU 32(SI), X14
+ MOVOU 48(SI), X15
+ PXOR X12, X2
+ PXOR X13, X5
+ PXOR X14, X8
+ PXOR X15, X11
+ MOVOU X2, (DI)
+ MOVOU X5, 16(DI)
+ MOVOU X8, 32(DI)
+ MOVOU X11, 48(DI)
+ MOVOU 64(SI), X12
+ MOVOU 80(SI), X13
+ MOVOU 96(SI), X14
+ MOVOU 112(SI), X15
+ PXOR X12, X1
+ PXOR X13, X4
+ PXOR X14, X7
+ PXOR X15, X10
+ MOVOU X1, 64(DI)
+ MOVOU X4, 80(DI)
+ MOVOU X7, 96(DI)
+ MOVOU X10, 112(DI)
+ SUBQ $0x80, BX
+ LEAQ 128(SI), SI
+ LEAQ 128(DI), DI
+ JMP openSSETail64DecLoop
- MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
- PXOR T0, A2; PXOR T1, B2; PXOR T2, C2; PXOR T3, D2
- MOVOU A2, (0*16)(oup); MOVOU B2, (1*16)(oup); MOVOU C2, (2*16)(oup); MOVOU D2, (3*16)(oup)
-
- MOVOU (4*16)(inp), T0; MOVOU (5*16)(inp), T1; MOVOU (6*16)(inp), T2; MOVOU (7*16)(inp), T3
- PXOR T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1
- MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
-
- SUBQ $128, inl
- LEAQ 128(inp), inp
- LEAQ 128(oup), oup
- JMP openSSETail64DecLoop
-
-// ----------------------------------------------------------------------------
-// Special optimization for the last 256 bytes of ciphertext
openSSETail256:
- // Need to decrypt up to 256 bytes - prepare four blocks
- MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0
- MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
- MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
- MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
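+	// Special optimization for the last 256 bytes of ciphertext: prepare four blocks.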
+ MOVO ·chacha20Constants<>+0(SB), X0
+ MOVO 32(BP), X3
+ MOVO 48(BP), X6
+ MOVO 128(BP), X9
+ PADDL ·sseIncMask<>+0(SB), X9
+ MOVO X0, X1
+ MOVO X3, X4
+ MOVO X6, X7
+ MOVO X9, X10
+ PADDL ·sseIncMask<>+0(SB), X10
+ MOVO X1, X2
+ MOVO X4, X5
+ MOVO X7, X8
+ MOVO X10, X11
+ PADDL ·sseIncMask<>+0(SB), X11
+ MOVO X2, X12
+ MOVO X5, X13
+ MOVO X8, X14
+ MOVO X11, X15
+ PADDL ·sseIncMask<>+0(SB), X15
// Store counters
- MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
- XORQ itr2, itr2
+ MOVO X9, 80(BP)
+ MOVO X10, 96(BP)
+ MOVO X11, 112(BP)
+ MOVO X15, 128(BP)
+ XORQ R9, R9
openSSETail256Loop:
- // This loop inteleaves 8 ChaCha quarter rounds with 1 poly multiplication
- polyAdd(0(inp)(itr2*1))
- MOVO C3, tmpStore
- chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
- MOVO tmpStore, C3
- MOVO C1, tmpStore
- chachaQR(A3, B3, C3, D3, C1)
- MOVO tmpStore, C1
- shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left
- shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left
- shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left
- polyMulStage1
- polyMulStage2
- MOVO C3, tmpStore
- chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
- MOVO tmpStore, C3
- MOVO C1, tmpStore
- chachaQR(A3, B3, C3, D3, C1)
- MOVO tmpStore, C1
- polyMulStage3
- polyMulReduceStage
- shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
- shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
- shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
- ADDQ $2*8, itr2
- CMPQ itr2, $160
- JB openSSETail256Loop
- MOVQ inl, itr1
- ANDQ $-16, itr1
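+	// This loop interleaves 8 ChaCha quarter rounds with 1 Poly1305 multiplication.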
+ ADDQ (SI)(R9*1), R10
+ ADCQ 8(SI)(R9*1), R11
+ ADCQ $0x01, R12
+ MOVO X14, 64(BP)
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL16(X9, X14)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X14
+ PSLLL $0x0c, X14
+ PSRLL $0x14, X3
+ PXOR X14, X3
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL8(X9, X14)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X14
+ PSLLL $0x07, X14
+ PSRLL $0x19, X3
+ PXOR X14, X3
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL16(X10, X14)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X14
+ PSLLL $0x0c, X14
+ PSRLL $0x14, X4
+ PXOR X14, X4
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL8(X10, X14)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X14
+ PSLLL $0x07, X14
+ PSRLL $0x19, X4
+ PXOR X14, X4
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL16(X11, X14)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X14
+ PSLLL $0x0c, X14
+ PSRLL $0x14, X5
+ PXOR X14, X5
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL8(X11, X14)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X14
+ PSLLL $0x07, X14
+ PSRLL $0x19, X5
+ PXOR X14, X5
+ MOVO 64(BP), X14
+ MOVO X7, 64(BP)
+ PADDD X13, X12
+ PXOR X12, X15
+ ROL16(X15, X7)
+ PADDD X15, X14
+ PXOR X14, X13
+ MOVO X13, X7
+ PSLLL $0x0c, X7
+ PSRLL $0x14, X13
+ PXOR X7, X13
+ PADDD X13, X12
+ PXOR X12, X15
+ ROL8(X15, X7)
+ PADDD X15, X14
+ PXOR X14, X13
+ MOVO X13, X7
+ PSLLL $0x07, X7
+ PSRLL $0x19, X13
+ PXOR X7, X13
+ MOVO 64(BP), X7
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xe4
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xed
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xed
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc0
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc9
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xd2
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x0c
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ MOVO X14, 64(BP)
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL16(X9, X14)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X14
+ PSLLL $0x0c, X14
+ PSRLL $0x14, X3
+ PXOR X14, X3
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL8(X9, X14)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X14
+ PSLLL $0x07, X14
+ PSRLL $0x19, X3
+ PXOR X14, X3
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL16(X10, X14)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X14
+ PSLLL $0x0c, X14
+ PSRLL $0x14, X4
+ PXOR X14, X4
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL8(X10, X14)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X14
+ PSLLL $0x07, X14
+ PSRLL $0x19, X4
+ PXOR X14, X4
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL16(X11, X14)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X14
+ PSLLL $0x0c, X14
+ PSRLL $0x14, X5
+ PXOR X14, X5
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL8(X11, X14)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X14
+ PSLLL $0x07, X14
+ PSRLL $0x19, X5
+ PXOR X14, X5
+ MOVO 64(BP), X14
+ MOVO X7, 64(BP)
+ PADDD X13, X12
+ PXOR X12, X15
+ ROL16(X15, X7)
+ PADDD X15, X14
+ PXOR X14, X13
+ MOVO X13, X7
+ PSLLL $0x0c, X7
+ PSRLL $0x14, X13
+ PXOR X7, X13
+ PADDD X13, X12
+ PXOR X12, X15
+ ROL8(X15, X7)
+ PADDD X15, X14
+ PXOR X14, X13
+ MOVO X13, X7
+ PSLLL $0x07, X7
+ PSRLL $0x19, X13
+ PXOR X7, X13
+ MOVO 64(BP), X7
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xe4
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xed
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xed
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc0
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc9
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xd2
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x04
+ ADDQ $0x10, R9
+ CMPQ R9, $0xa0
+ JB openSSETail256Loop
+ MOVQ BX, CX
+ ANDQ $-16, CX
openSSETail256HashLoop:
- polyAdd(0(inp)(itr2*1))
- polyMul
- ADDQ $2*8, itr2
- CMPQ itr2, itr1
- JB openSSETail256HashLoop
+ ADDQ (SI)(R9*1), R10
+ ADCQ 8(SI)(R9*1), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ ADDQ $0x10, R9
+ CMPQ R9, CX
+ JB openSSETail256HashLoop
// Add in the state
- PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
- PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
- PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
- PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
- MOVO D3, tmpStore
+ PADDD ·chacha20Constants<>+0(SB), X0
+ PADDD ·chacha20Constants<>+0(SB), X1
+ PADDD ·chacha20Constants<>+0(SB), X2
+ PADDD ·chacha20Constants<>+0(SB), X12
+ PADDD 32(BP), X3
+ PADDD 32(BP), X4
+ PADDD 32(BP), X5
+ PADDD 32(BP), X13
+ PADDD 48(BP), X6
+ PADDD 48(BP), X7
+ PADDD 48(BP), X8
+ PADDD 48(BP), X14
+ PADDD 80(BP), X9
+ PADDD 96(BP), X10
+ PADDD 112(BP), X11
+ PADDD 128(BP), X15
+ MOVO X15, 64(BP)
// Load - xor - store
- MOVOU (0*16)(inp), D3; PXOR D3, A0
- MOVOU (1*16)(inp), D3; PXOR D3, B0
- MOVOU (2*16)(inp), D3; PXOR D3, C0
- MOVOU (3*16)(inp), D3; PXOR D3, D0
- MOVOU A0, (0*16)(oup)
- MOVOU B0, (1*16)(oup)
- MOVOU C0, (2*16)(oup)
- MOVOU D0, (3*16)(oup)
- MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0
- PXOR A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1
- MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
- MOVOU (8*16)(inp), A0; MOVOU (9*16)(inp), B0; MOVOU (10*16)(inp), C0; MOVOU (11*16)(inp), D0
- PXOR A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2
- MOVOU A2, (8*16)(oup); MOVOU B2, (9*16)(oup); MOVOU C2, (10*16)(oup); MOVOU D2, (11*16)(oup)
- LEAQ 192(inp), inp
- LEAQ 192(oup), oup
- SUBQ $192, inl
- MOVO A3, A0
- MOVO B3, B0
- MOVO C3, C0
- MOVO tmpStore, D0
+ MOVOU (SI), X15
+ PXOR X15, X0
+ MOVOU 16(SI), X15
+ PXOR X15, X3
+ MOVOU 32(SI), X15
+ PXOR X15, X6
+ MOVOU 48(SI), X15
+ PXOR X15, X9
+ MOVOU X0, (DI)
+ MOVOU X3, 16(DI)
+ MOVOU X6, 32(DI)
+ MOVOU X9, 48(DI)
+ MOVOU 64(SI), X0
+ MOVOU 80(SI), X3
+ MOVOU 96(SI), X6
+ MOVOU 112(SI), X9
+ PXOR X0, X1
+ PXOR X3, X4
+ PXOR X6, X7
+ PXOR X9, X10
+ MOVOU X1, 64(DI)
+ MOVOU X4, 80(DI)
+ MOVOU X7, 96(DI)
+ MOVOU X10, 112(DI)
+ MOVOU 128(SI), X0
+ MOVOU 144(SI), X3
+ MOVOU 160(SI), X6
+ MOVOU 176(SI), X9
+ PXOR X0, X2
+ PXOR X3, X5
+ PXOR X6, X8
+ PXOR X9, X11
+ MOVOU X2, 128(DI)
+ MOVOU X5, 144(DI)
+ MOVOU X8, 160(DI)
+ MOVOU X11, 176(DI)
+ LEAQ 192(SI), SI
+ LEAQ 192(DI), DI
+ SUBQ $0xc0, BX
+ MOVO X12, X0
+ MOVO X13, X3
+ MOVO X14, X6
+ MOVO 64(BP), X9
+ JMP openSSETail64DecLoop
- JMP openSSETail64DecLoop
-
-// ----------------------------------------------------------------------------
-// ------------------------- AVX2 Code ----------------------------------------
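+// AVX2 code path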
chacha20Poly1305Open_AVX2:
VZEROUPPER
- VMOVDQU ·chacha20Constants<>(SB), AA0
- BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14
- BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12
- BYTE $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4
- VPADDD ·avx2InitMask<>(SB), DD0, DD0
+ VMOVDQU ·chacha20Constants<>+0(SB), Y0
+ BYTE $0xc4
+ BYTE $0x42
+ BYTE $0x7d
+ BYTE $0x5a
+ BYTE $0x70
+ BYTE $0x10
+ BYTE $0xc4
+ BYTE $0x42
+ BYTE $0x7d
+ BYTE $0x5a
+ BYTE $0x60
+ BYTE $0x20
+ BYTE $0xc4
+ BYTE $0xc2
+ BYTE $0x7d
+ BYTE $0x5a
+ BYTE $0x60
+ BYTE $0x30
+ VPADDD ·avx2InitMask<>+0(SB), Y4, Y4
// Special optimization, for very short buffers
- CMPQ inl, $192
+ CMPQ BX, $0xc0
JBE openAVX2192
- CMPQ inl, $320
+ CMPQ BX, $0x00000140
JBE openAVX2320
// For the general key prepare the key first - as a byproduct we have 64 bytes of cipher stream
- VMOVDQA BB0, state1StoreAVX2
- VMOVDQA CC0, state2StoreAVX2
- VMOVDQA DD0, ctr3StoreAVX2
- MOVQ $10, itr2
+ VMOVDQA Y14, 32(BP)
+ VMOVDQA Y12, 64(BP)
+ VMOVDQA Y4, 192(BP)
+ MOVQ $0x0000000a, R9
openAVX2PreparePolyKey:
- chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
- VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0
- chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
- VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0
- DECQ itr2
- JNE openAVX2PreparePolyKey
-
- VPADDD ·chacha20Constants<>(SB), AA0, AA0
- VPADDD state1StoreAVX2, BB0, BB0
- VPADDD state2StoreAVX2, CC0, CC0
- VPADDD ctr3StoreAVX2, DD0, DD0
-
- VPERM2I128 $0x02, AA0, BB0, TT0
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x0c, Y14, Y3
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x07, Y14, Y3
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPALIGNR $0x04, Y14, Y14, Y14
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x0c, Y4, Y4, Y4
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x0c, Y14, Y3
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x07, Y14, Y3
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPALIGNR $0x0c, Y14, Y14, Y14
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x04, Y4, Y4, Y4
+ DECQ R9
+ JNE openAVX2PreparePolyKey
+ VPADDD ·chacha20Constants<>+0(SB), Y0, Y0
+ VPADDD 32(BP), Y14, Y14
+ VPADDD 64(BP), Y12, Y12
+ VPADDD 192(BP), Y4, Y4
+ VPERM2I128 $0x02, Y0, Y14, Y3
// Clamp and store poly key
- VPAND ·polyClampMask<>(SB), TT0, TT0
- VMOVDQA TT0, rsStoreAVX2
+ VPAND ·polyClampMask<>+0(SB), Y3, Y3
+ VMOVDQA Y3, (BP)
// Stream for the first 64 bytes
- VPERM2I128 $0x13, AA0, BB0, AA0
- VPERM2I128 $0x13, CC0, DD0, BB0
+ VPERM2I128 $0x13, Y0, Y14, Y0
+ VPERM2I128 $0x13, Y12, Y4, Y14
// Hash AD + first 64 bytes
- MOVQ ad_len+80(FP), itr2
+ MOVQ ad_len+80(FP), R9
CALL polyHashADInternal<>(SB)
- XORQ itr1, itr1
+ XORQ CX, CX
openAVX2InitialHash64:
- polyAdd(0(inp)(itr1*1))
- polyMulAVX2
- ADDQ $16, itr1
- CMPQ itr1, $64
- JNE openAVX2InitialHash64
+ ADDQ (SI)(CX*1), R10
+ ADCQ 8(SI)(CX*1), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ ADDQ $0x10, CX
+ CMPQ CX, $0x40
+ JNE openAVX2InitialHash64
// Decrypt the first 64 bytes
- VPXOR (0*32)(inp), AA0, AA0
- VPXOR (1*32)(inp), BB0, BB0
- VMOVDQU AA0, (0*32)(oup)
- VMOVDQU BB0, (1*32)(oup)
- LEAQ (2*32)(inp), inp
- LEAQ (2*32)(oup), oup
- SUBQ $64, inl
+ VPXOR (SI), Y0, Y0
+ VPXOR 32(SI), Y14, Y14
+ VMOVDQU Y0, (DI)
+ VMOVDQU Y14, 32(DI)
+ LEAQ 64(SI), SI
+ LEAQ 64(DI), DI
+ SUBQ $0x40, BX
openAVX2MainLoop:
- CMPQ inl, $512
+ CMPQ BX, $0x00000200
JB openAVX2MainLoopDone
// Load state, increment counter blocks, store the incremented counters
- VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
- VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
- VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
- VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
- VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
- XORQ itr1, itr1
+ VMOVDQU ·chacha20Constants<>+0(SB), Y0
+ VMOVDQA Y0, Y5
+ VMOVDQA Y0, Y6
+ VMOVDQA Y0, Y7
+ VMOVDQA 32(BP), Y14
+ VMOVDQA Y14, Y9
+ VMOVDQA Y14, Y10
+ VMOVDQA Y14, Y11
+ VMOVDQA 64(BP), Y12
+ VMOVDQA Y12, Y13
+ VMOVDQA Y12, Y8
+ VMOVDQA Y12, Y15
+ VMOVDQA 192(BP), Y4
+ VPADDD ·avx2IncMask<>+0(SB), Y4, Y4
+ VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
+ VPADDD ·avx2IncMask<>+0(SB), Y1, Y2
+ VPADDD ·avx2IncMask<>+0(SB), Y2, Y3
+ VMOVDQA Y4, 96(BP)
+ VMOVDQA Y1, 128(BP)
+ VMOVDQA Y2, 160(BP)
+ VMOVDQA Y3, 192(BP)
+ XORQ CX, CX
openAVX2InternalLoop:
- // Lets just say this spaghetti loop interleaves 2 quarter rounds with 3 poly multiplications
- // Effectively per 512 bytes of stream we hash 480 bytes of ciphertext
- polyAdd(0*8(inp)(itr1*1))
- VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
- polyMulStage1_AVX2
- VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
- VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
- polyMulStage2_AVX2
- VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
- VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
- polyMulStage3_AVX2
- VMOVDQA CC3, tmpStoreAVX2
- VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
- VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
- VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
- VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
- VMOVDQA tmpStoreAVX2, CC3
- polyMulReduceStage
- VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
- VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
- VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
- polyAdd(2*8(inp)(itr1*1))
- VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
- polyMulStage1_AVX2
- VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
- VMOVDQA CC3, tmpStoreAVX2
- VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
- VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
- VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
- VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
- VMOVDQA tmpStoreAVX2, CC3
- polyMulStage2_AVX2
- VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
- VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
- VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
- VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
- polyMulStage3_AVX2
- VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
- VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
- polyMulReduceStage
- VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
- VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
- polyAdd(4*8(inp)(itr1*1))
- LEAQ (6*8)(itr1), itr1
- VMOVDQA CC3, tmpStoreAVX2
- VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
- VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
- VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
- VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
- VMOVDQA tmpStoreAVX2, CC3
- polyMulStage1_AVX2
- VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
- VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
- polyMulStage2_AVX2
- VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
- VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
- polyMulStage3_AVX2
- VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
- VMOVDQA CC3, tmpStoreAVX2
- VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
- VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
- VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
- VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
- VMOVDQA tmpStoreAVX2, CC3
- polyMulReduceStage
- VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
- VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
- VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
- CMPQ itr1, $480
+ ADDQ (SI)(CX*1), R10
+ ADCQ 8(SI)(CX*1), R11
+ ADCQ $0x01, R12
+ VPADDD Y14, Y0, Y0
+ VPADDD Y9, Y5, Y5
+ VPADDD Y10, Y6, Y6
+ VPADDD Y11, Y7, Y7
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ VPXOR Y0, Y4, Y4
+ VPXOR Y5, Y1, Y1
+ VPXOR Y6, Y2, Y2
+ VPXOR Y7, Y3, Y3
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y2, Y2
+ VPSHUFB ·rol16<>+0(SB), Y3, Y3
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ VPADDD Y4, Y12, Y12
+ VPADDD Y1, Y13, Y13
+ VPADDD Y2, Y8, Y8
+ VPADDD Y3, Y15, Y15
+ VPXOR Y12, Y14, Y14
+ VPXOR Y13, Y9, Y9
+ VPXOR Y8, Y10, Y10
+ VPXOR Y15, Y11, Y11
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ VMOVDQA Y15, 224(BP)
+ VPSLLD $0x0c, Y14, Y15
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y15, Y14, Y14
+ VPSLLD $0x0c, Y9, Y15
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y15, Y9, Y9
+ VPSLLD $0x0c, Y10, Y15
+ VPSRLD $0x14, Y10, Y10
+ VPXOR Y15, Y10, Y10
+ VPSLLD $0x0c, Y11, Y15
+ VPSRLD $0x14, Y11, Y11
+ VPXOR Y15, Y11, Y11
+ VMOVDQA 224(BP), Y15
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ VPADDD Y14, Y0, Y0
+ VPADDD Y9, Y5, Y5
+ VPADDD Y10, Y6, Y6
+ VPADDD Y11, Y7, Y7
+ VPXOR Y0, Y4, Y4
+ VPXOR Y5, Y1, Y1
+ VPXOR Y6, Y2, Y2
+ VPXOR Y7, Y3, Y3
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y2, Y2
+ VPSHUFB ·rol8<>+0(SB), Y3, Y3
+ ADDQ 16(SI)(CX*1), R10
+ ADCQ 24(SI)(CX*1), R11
+ ADCQ $0x01, R12
+ VPADDD Y4, Y12, Y12
+ VPADDD Y1, Y13, Y13
+ VPADDD Y2, Y8, Y8
+ VPADDD Y3, Y15, Y15
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ VPXOR Y12, Y14, Y14
+ VPXOR Y13, Y9, Y9
+ VPXOR Y8, Y10, Y10
+ VPXOR Y15, Y11, Y11
+ VMOVDQA Y15, 224(BP)
+ VPSLLD $0x07, Y14, Y15
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y15, Y14, Y14
+ VPSLLD $0x07, Y9, Y15
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y15, Y9, Y9
+ VPSLLD $0x07, Y10, Y15
+ VPSRLD $0x19, Y10, Y10
+ VPXOR Y15, Y10, Y10
+ VPSLLD $0x07, Y11, Y15
+ VPSRLD $0x19, Y11, Y11
+ VPXOR Y15, Y11, Y11
+ VMOVDQA 224(BP), Y15
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ VPALIGNR $0x04, Y14, Y14, Y14
+ VPALIGNR $0x04, Y9, Y9, Y9
+ VPALIGNR $0x04, Y10, Y10, Y10
+ VPALIGNR $0x04, Y11, Y11, Y11
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x08, Y8, Y8, Y8
+ VPALIGNR $0x08, Y15, Y15, Y15
+ VPALIGNR $0x0c, Y4, Y4, Y4
+ VPALIGNR $0x0c, Y1, Y1, Y1
+ VPALIGNR $0x0c, Y2, Y2, Y2
+ VPALIGNR $0x0c, Y3, Y3, Y3
+ VPADDD Y14, Y0, Y0
+ VPADDD Y9, Y5, Y5
+ VPADDD Y10, Y6, Y6
+ VPADDD Y11, Y7, Y7
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ VPXOR Y0, Y4, Y4
+ VPXOR Y5, Y1, Y1
+ VPXOR Y6, Y2, Y2
+ VPXOR Y7, Y3, Y3
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y2, Y2
+ VPSHUFB ·rol16<>+0(SB), Y3, Y3
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ VPADDD Y4, Y12, Y12
+ VPADDD Y1, Y13, Y13
+ VPADDD Y2, Y8, Y8
+ VPADDD Y3, Y15, Y15
+ VPXOR Y12, Y14, Y14
+ VPXOR Y13, Y9, Y9
+ VPXOR Y8, Y10, Y10
+ VPXOR Y15, Y11, Y11
+ ADDQ 32(SI)(CX*1), R10
+ ADCQ 40(SI)(CX*1), R11
+ ADCQ $0x01, R12
+ LEAQ 48(CX), CX
+ VMOVDQA Y15, 224(BP)
+ VPSLLD $0x0c, Y14, Y15
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y15, Y14, Y14
+ VPSLLD $0x0c, Y9, Y15
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y15, Y9, Y9
+ VPSLLD $0x0c, Y10, Y15
+ VPSRLD $0x14, Y10, Y10
+ VPXOR Y15, Y10, Y10
+ VPSLLD $0x0c, Y11, Y15
+ VPSRLD $0x14, Y11, Y11
+ VPXOR Y15, Y11, Y11
+ VMOVDQA 224(BP), Y15
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ VPADDD Y14, Y0, Y0
+ VPADDD Y9, Y5, Y5
+ VPADDD Y10, Y6, Y6
+ VPADDD Y11, Y7, Y7
+ VPXOR Y0, Y4, Y4
+ VPXOR Y5, Y1, Y1
+ VPXOR Y6, Y2, Y2
+ VPXOR Y7, Y3, Y3
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y2, Y2
+ VPSHUFB ·rol8<>+0(SB), Y3, Y3
+ VPADDD Y4, Y12, Y12
+ VPADDD Y1, Y13, Y13
+ VPADDD Y2, Y8, Y8
+ VPADDD Y3, Y15, Y15
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ VPXOR Y12, Y14, Y14
+ VPXOR Y13, Y9, Y9
+ VPXOR Y8, Y10, Y10
+ VPXOR Y15, Y11, Y11
+ VMOVDQA Y15, 224(BP)
+ VPSLLD $0x07, Y14, Y15
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y15, Y14, Y14
+ VPSLLD $0x07, Y9, Y15
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y15, Y9, Y9
+ VPSLLD $0x07, Y10, Y15
+ VPSRLD $0x19, Y10, Y10
+ VPXOR Y15, Y10, Y10
+ VPSLLD $0x07, Y11, Y15
+ VPSRLD $0x19, Y11, Y11
+ VPXOR Y15, Y11, Y11
+ VMOVDQA 224(BP), Y15
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ VPALIGNR $0x0c, Y14, Y14, Y14
+ VPALIGNR $0x0c, Y9, Y9, Y9
+ VPALIGNR $0x0c, Y10, Y10, Y10
+ VPALIGNR $0x0c, Y11, Y11, Y11
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x08, Y8, Y8, Y8
+ VPALIGNR $0x08, Y15, Y15, Y15
+ VPALIGNR $0x04, Y4, Y4, Y4
+ VPALIGNR $0x04, Y1, Y1, Y1
+ VPALIGNR $0x04, Y2, Y2, Y2
+ VPALIGNR $0x04, Y3, Y3, Y3
+ CMPQ CX, $0x000001e0
JNE openAVX2InternalLoop
-
- VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
- VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
- VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
- VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
- VMOVDQA CC3, tmpStoreAVX2
+ VPADDD ·chacha20Constants<>+0(SB), Y0, Y0
+ VPADDD ·chacha20Constants<>+0(SB), Y5, Y5
+ VPADDD ·chacha20Constants<>+0(SB), Y6, Y6
+ VPADDD ·chacha20Constants<>+0(SB), Y7, Y7
+ VPADDD 32(BP), Y14, Y14
+ VPADDD 32(BP), Y9, Y9
+ VPADDD 32(BP), Y10, Y10
+ VPADDD 32(BP), Y11, Y11
+ VPADDD 64(BP), Y12, Y12
+ VPADDD 64(BP), Y13, Y13
+ VPADDD 64(BP), Y8, Y8
+ VPADDD 64(BP), Y15, Y15
+ VPADDD 96(BP), Y4, Y4
+ VPADDD 128(BP), Y1, Y1
+ VPADDD 160(BP), Y2, Y2
+ VPADDD 192(BP), Y3, Y3
+ VMOVDQA Y15, 224(BP)
// We only hashed 480 of the 512 bytes available - hash the remaining 32 here
- polyAdd(480(inp))
- polyMulAVX2
- VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0
- VPXOR (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0
- VMOVDQU CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup)
- VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
- VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
- VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
+ ADDQ 480(SI), R10
+ ADCQ 488(SI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ VPERM2I128 $0x02, Y0, Y14, Y15
+ VPERM2I128 $0x13, Y0, Y14, Y14
+ VPERM2I128 $0x02, Y12, Y4, Y0
+ VPERM2I128 $0x13, Y12, Y4, Y12
+ VPXOR (SI), Y15, Y15
+ VPXOR 32(SI), Y0, Y0
+ VPXOR 64(SI), Y14, Y14
+ VPXOR 96(SI), Y12, Y12
+ VMOVDQU Y15, (DI)
+ VMOVDQU Y0, 32(DI)
+ VMOVDQU Y14, 64(DI)
+ VMOVDQU Y12, 96(DI)
+ VPERM2I128 $0x02, Y5, Y9, Y0
+ VPERM2I128 $0x02, Y13, Y1, Y14
+ VPERM2I128 $0x13, Y5, Y9, Y12
+ VPERM2I128 $0x13, Y13, Y1, Y4
+ VPXOR 128(SI), Y0, Y0
+ VPXOR 160(SI), Y14, Y14
+ VPXOR 192(SI), Y12, Y12
+ VPXOR 224(SI), Y4, Y4
+ VMOVDQU Y0, 128(DI)
+ VMOVDQU Y14, 160(DI)
+ VMOVDQU Y12, 192(DI)
+ VMOVDQU Y4, 224(DI)
// and here
- polyAdd(496(inp))
- polyMulAVX2
- VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
- VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
- VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
- VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
- VPXOR (12*32)(inp), AA0, AA0; VPXOR (13*32)(inp), BB0, BB0; VPXOR (14*32)(inp), CC0, CC0; VPXOR (15*32)(inp), DD0, DD0
- VMOVDQU AA0, (12*32)(oup); VMOVDQU BB0, (13*32)(oup); VMOVDQU CC0, (14*32)(oup); VMOVDQU DD0, (15*32)(oup)
- LEAQ (32*16)(inp), inp
- LEAQ (32*16)(oup), oup
- SUBQ $(32*16), inl
+ ADDQ 496(SI), R10
+ ADCQ 504(SI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ VPERM2I128 $0x02, Y6, Y10, Y0
+ VPERM2I128 $0x02, Y8, Y2, Y14
+ VPERM2I128 $0x13, Y6, Y10, Y12
+ VPERM2I128 $0x13, Y8, Y2, Y4
+ VPXOR 256(SI), Y0, Y0
+ VPXOR 288(SI), Y14, Y14
+ VPXOR 320(SI), Y12, Y12
+ VPXOR 352(SI), Y4, Y4
+ VMOVDQU Y0, 256(DI)
+ VMOVDQU Y14, 288(DI)
+ VMOVDQU Y12, 320(DI)
+ VMOVDQU Y4, 352(DI)
+ VPERM2I128 $0x02, Y7, Y11, Y0
+ VPERM2I128 $0x02, 224(BP), Y3, Y14
+ VPERM2I128 $0x13, Y7, Y11, Y12
+ VPERM2I128 $0x13, 224(BP), Y3, Y4
+ VPXOR 384(SI), Y0, Y0
+ VPXOR 416(SI), Y14, Y14
+ VPXOR 448(SI), Y12, Y12
+ VPXOR 480(SI), Y4, Y4
+ VMOVDQU Y0, 384(DI)
+ VMOVDQU Y14, 416(DI)
+ VMOVDQU Y12, 448(DI)
+ VMOVDQU Y4, 480(DI)
+ LEAQ 512(SI), SI
+ LEAQ 512(DI), DI
+ SUBQ $0x00000200, BX
JMP openAVX2MainLoop
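
For reference, the VPADDD/VPXOR/VPSHUFB (rol16, rol8) and VPSLLD/VPSRLD/VPXOR (12/20, 7/25) groups in the loop above are ChaCha20 quarter rounds run lane-parallel across the Y registers, interleaved with the scalar Poly1305 multiplications. A minimal scalar Go sketch of one quarter round:

package main

import "math/bits"

// quarterRound is the scalar form of what each shuffle/shift group above
// computes on packed 32-bit lanes of the Y registers.
func quarterRound(a, b, c, d uint32) (uint32, uint32, uint32, uint32) {
	a += b
	d = bits.RotateLeft32(d^a, 16) // the ·rol16<> VPSHUFB
	c += d
	b = bits.RotateLeft32(b^c, 12) // VPSLLD $0x0c / VPSRLD $0x14 / VPXOR
	a += b
	d = bits.RotateLeft32(d^a, 8) // the ·rol8<> VPSHUFB
	c += d
	b = bits.RotateLeft32(b^c, 7) // VPSLLD $0x07 / VPSRLD $0x19 / VPXOR
	return a, b, c, d
}
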
openAVX2MainLoopDone:
// Handle the various tail sizes efficiently
- TESTQ inl, inl
+ TESTQ BX, BX
JE openSSEFinalize
- CMPQ inl, $128
+ CMPQ BX, $0x80
JBE openAVX2Tail128
- CMPQ inl, $256
+ CMPQ BX, $0x00000100
JBE openAVX2Tail256
- CMPQ inl, $384
+ CMPQ BX, $0x00000180
JBE openAVX2Tail384
JMP openAVX2Tail512
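
The comparisons above pick a tail routine by how much ciphertext remains in BX once full 512-byte iterations are exhausted; expressed as plain Go (case names are illustrative only):

package main

// openTail mirrors the openAVX2MainLoopDone dispatch above. After the main
// loop, fewer than 512 bytes remain.
func openTail(remaining int) string {
	switch {
	case remaining == 0:
		return "finalize"
	case remaining <= 128:
		return "tail128"
	case remaining <= 256:
		return "tail256"
	case remaining <= 384:
		return "tail384"
	default:
		return "tail512"
	}
}
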
-// ----------------------------------------------------------------------------
-// Special optimization for buffers smaller than 193 bytes
openAVX2192:
- // For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks
- VMOVDQA AA0, AA1
- VMOVDQA BB0, BB1
- VMOVDQA CC0, CC1
- VPADDD ·avx2IncMask<>(SB), DD0, DD1
- VMOVDQA AA0, AA2
- VMOVDQA BB0, BB2
- VMOVDQA CC0, CC2
- VMOVDQA DD0, DD2
- VMOVDQA DD1, TT3
- MOVQ $10, itr2
+ VMOVDQA Y0, Y5
+ VMOVDQA Y14, Y9
+ VMOVDQA Y12, Y13
+ VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
+ VMOVDQA Y0, Y6
+ VMOVDQA Y14, Y10
+ VMOVDQA Y12, Y8
+ VMOVDQA Y4, Y2
+ VMOVDQA Y1, Y15
+ MOVQ $0x0000000a, R9
openAVX2192InnerCipherLoop:
- chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
- VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
- VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
- VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
- chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
- VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
- VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
- VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
- DECQ itr2
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x0c, Y14, Y3
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x07, Y14, Y3
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x0c, Y9, Y3
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x07, Y9, Y3
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPALIGNR $0x04, Y14, Y14, Y14
+ VPALIGNR $0x04, Y9, Y9, Y9
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x0c, Y4, Y4, Y4
+ VPALIGNR $0x0c, Y1, Y1, Y1
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x0c, Y14, Y3
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x07, Y14, Y3
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x0c, Y9, Y3
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x07, Y9, Y3
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPALIGNR $0x0c, Y14, Y14, Y14
+ VPALIGNR $0x0c, Y9, Y9, Y9
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x04, Y4, Y4, Y4
+ VPALIGNR $0x04, Y1, Y1, Y1
+ DECQ R9
JNE openAVX2192InnerCipherLoop
- VPADDD AA2, AA0, AA0; VPADDD AA2, AA1, AA1
- VPADDD BB2, BB0, BB0; VPADDD BB2, BB1, BB1
- VPADDD CC2, CC0, CC0; VPADDD CC2, CC1, CC1
- VPADDD DD2, DD0, DD0; VPADDD TT3, DD1, DD1
- VPERM2I128 $0x02, AA0, BB0, TT0
+ VPADDD Y6, Y0, Y0
+ VPADDD Y6, Y5, Y5
+ VPADDD Y10, Y14, Y14
+ VPADDD Y10, Y9, Y9
+ VPADDD Y8, Y12, Y12
+ VPADDD Y8, Y13, Y13
+ VPADDD Y2, Y4, Y4
+ VPADDD Y15, Y1, Y1
+ VPERM2I128 $0x02, Y0, Y14, Y3
// Clamp and store poly key
- VPAND ·polyClampMask<>(SB), TT0, TT0
- VMOVDQA TT0, rsStoreAVX2
+ VPAND ·polyClampMask<>+0(SB), Y3, Y3
+ VMOVDQA Y3, (BP)
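
The ·polyClampMask<> VPAND above clamps the first 16 bytes of keystream into the Poly1305 r value before it is stored at (BP). The equivalent byte-level operation, as a sketch:

package main

// clampR applies the standard Poly1305 clamp that ·polyClampMask<> encodes:
// clear the top four bits of bytes 3, 7, 11, 15 and the low two bits of
// bytes 4, 8, 12 of the little-endian key half.
func clampR(key [16]byte) [16]byte {
	r := key
	for _, i := range []int{3, 7, 11, 15} {
		r[i] &= 0x0f
	}
	for _, i := range []int{4, 8, 12} {
		r[i] &= 0xfc
	}
	return r
}
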
// Stream for up to 192 bytes
- VPERM2I128 $0x13, AA0, BB0, AA0
- VPERM2I128 $0x13, CC0, DD0, BB0
- VPERM2I128 $0x02, AA1, BB1, CC0
- VPERM2I128 $0x02, CC1, DD1, DD0
- VPERM2I128 $0x13, AA1, BB1, AA1
- VPERM2I128 $0x13, CC1, DD1, BB1
+ VPERM2I128 $0x13, Y0, Y14, Y0
+ VPERM2I128 $0x13, Y12, Y4, Y14
+ VPERM2I128 $0x02, Y5, Y9, Y12
+ VPERM2I128 $0x02, Y13, Y1, Y4
+ VPERM2I128 $0x13, Y5, Y9, Y5
+ VPERM2I128 $0x13, Y13, Y1, Y9
openAVX2ShortOpen:
// Hash
- MOVQ ad_len+80(FP), itr2
+ MOVQ ad_len+80(FP), R9
CALL polyHashADInternal<>(SB)
openAVX2ShortOpenLoop:
- CMPQ inl, $32
+ CMPQ BX, $0x20
JB openAVX2ShortTail32
- SUBQ $32, inl
+ SUBQ $0x20, BX
// Load for hashing
- polyAdd(0*8(inp))
- polyMulAVX2
- polyAdd(2*8(inp))
- polyMulAVX2
+ ADDQ (SI), R10
+ ADCQ 8(SI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ ADDQ 16(SI), R10
+ ADCQ 24(SI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
// Load for decryption
- VPXOR (inp), AA0, AA0
- VMOVDQU AA0, (oup)
- LEAQ (1*32)(inp), inp
- LEAQ (1*32)(oup), oup
+ VPXOR (SI), Y0, Y0
+ VMOVDQU Y0, (DI)
+ LEAQ 32(SI), SI
+ LEAQ 32(DI), DI
// Shift stream left
- VMOVDQA BB0, AA0
- VMOVDQA CC0, BB0
- VMOVDQA DD0, CC0
- VMOVDQA AA1, DD0
- VMOVDQA BB1, AA1
- VMOVDQA CC1, BB1
- VMOVDQA DD1, CC1
- VMOVDQA AA2, DD1
- VMOVDQA BB2, AA2
+ VMOVDQA Y14, Y0
+ VMOVDQA Y12, Y14
+ VMOVDQA Y4, Y12
+ VMOVDQA Y5, Y4
+ VMOVDQA Y9, Y5
+ VMOVDQA Y13, Y9
+ VMOVDQA Y1, Y13
+ VMOVDQA Y6, Y1
+ VMOVDQA Y10, Y6
JMP openAVX2ShortOpenLoop
openAVX2ShortTail32:
- CMPQ inl, $16
- VMOVDQA A0, A1
+ CMPQ BX, $0x10
+ VMOVDQA X0, X1
JB openAVX2ShortDone
-
- SUBQ $16, inl
+ SUBQ $0x10, BX
// Load for hashing
- polyAdd(0*8(inp))
- polyMulAVX2
+ ADDQ (SI), R10
+ ADCQ 8(SI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
// Load for decryption
- VPXOR (inp), A0, T0
- VMOVDQU T0, (oup)
- LEAQ (1*16)(inp), inp
- LEAQ (1*16)(oup), oup
- VPERM2I128 $0x11, AA0, AA0, AA0
- VMOVDQA A0, A1
+ VPXOR (SI), X0, X12
+ VMOVDQU X12, (DI)
+ LEAQ 16(SI), SI
+ LEAQ 16(DI), DI
+ VPERM2I128 $0x11, Y0, Y0, Y0
+ VMOVDQA X0, X1
openAVX2ShortDone:
VZEROUPPER
JMP openSSETail16
-// ----------------------------------------------------------------------------
-// Special optimization for buffers smaller than 321 bytes
openAVX2320:
- // For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks
- VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD ·avx2IncMask<>(SB), DD0, DD1
- VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD ·avx2IncMask<>(SB), DD1, DD2
- VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3
- MOVQ $10, itr2
+ VMOVDQA Y0, Y5
+ VMOVDQA Y14, Y9
+ VMOVDQA Y12, Y13
+ VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
+ VMOVDQA Y0, Y6
+ VMOVDQA Y14, Y10
+ VMOVDQA Y12, Y8
+ VPADDD ·avx2IncMask<>+0(SB), Y1, Y2
+ VMOVDQA Y14, Y7
+ VMOVDQA Y12, Y11
+ VMOVDQA Y4, Y15
+ MOVQ $0x0000000a, R9
openAVX2320InnerCipherLoop:
- chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
- VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
- VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
- VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
- chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
- VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
- VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
- VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
- DECQ itr2
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x0c, Y14, Y3
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x07, Y14, Y3
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x0c, Y9, Y3
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x07, Y9, Y3
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPADDD Y10, Y6, Y6
+ VPXOR Y6, Y2, Y2
+ VPSHUFB ·rol16<>+0(SB), Y2, Y2
+ VPADDD Y2, Y8, Y8
+ VPXOR Y8, Y10, Y10
+ VPSLLD $0x0c, Y10, Y3
+ VPSRLD $0x14, Y10, Y10
+ VPXOR Y3, Y10, Y10
+ VPADDD Y10, Y6, Y6
+ VPXOR Y6, Y2, Y2
+ VPSHUFB ·rol8<>+0(SB), Y2, Y2
+ VPADDD Y2, Y8, Y8
+ VPXOR Y8, Y10, Y10
+ VPSLLD $0x07, Y10, Y3
+ VPSRLD $0x19, Y10, Y10
+ VPXOR Y3, Y10, Y10
+ VPALIGNR $0x04, Y14, Y14, Y14
+ VPALIGNR $0x04, Y9, Y9, Y9
+ VPALIGNR $0x04, Y10, Y10, Y10
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x08, Y8, Y8, Y8
+ VPALIGNR $0x0c, Y4, Y4, Y4
+ VPALIGNR $0x0c, Y1, Y1, Y1
+ VPALIGNR $0x0c, Y2, Y2, Y2
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x0c, Y14, Y3
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x07, Y14, Y3
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x0c, Y9, Y3
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x07, Y9, Y3
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPADDD Y10, Y6, Y6
+ VPXOR Y6, Y2, Y2
+ VPSHUFB ·rol16<>+0(SB), Y2, Y2
+ VPADDD Y2, Y8, Y8
+ VPXOR Y8, Y10, Y10
+ VPSLLD $0x0c, Y10, Y3
+ VPSRLD $0x14, Y10, Y10
+ VPXOR Y3, Y10, Y10
+ VPADDD Y10, Y6, Y6
+ VPXOR Y6, Y2, Y2
+ VPSHUFB ·rol8<>+0(SB), Y2, Y2
+ VPADDD Y2, Y8, Y8
+ VPXOR Y8, Y10, Y10
+ VPSLLD $0x07, Y10, Y3
+ VPSRLD $0x19, Y10, Y10
+ VPXOR Y3, Y10, Y10
+ VPALIGNR $0x0c, Y14, Y14, Y14
+ VPALIGNR $0x0c, Y9, Y9, Y9
+ VPALIGNR $0x0c, Y10, Y10, Y10
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x08, Y8, Y8, Y8
+ VPALIGNR $0x04, Y4, Y4, Y4
+ VPALIGNR $0x04, Y1, Y1, Y1
+ VPALIGNR $0x04, Y2, Y2, Y2
+ DECQ R9
JNE openAVX2320InnerCipherLoop
-
- VMOVDQA ·chacha20Constants<>(SB), TT0
- VPADDD TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2
- VPADDD TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2
- VPADDD TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2
- VMOVDQA ·avx2IncMask<>(SB), TT0
- VPADDD TT3, DD0, DD0; VPADDD TT0, TT3, TT3
- VPADDD TT3, DD1, DD1; VPADDD TT0, TT3, TT3
- VPADDD TT3, DD2, DD2
+ VMOVDQA ·chacha20Constants<>+0(SB), Y3
+ VPADDD Y3, Y0, Y0
+ VPADDD Y3, Y5, Y5
+ VPADDD Y3, Y6, Y6
+ VPADDD Y7, Y14, Y14
+ VPADDD Y7, Y9, Y9
+ VPADDD Y7, Y10, Y10
+ VPADDD Y11, Y12, Y12
+ VPADDD Y11, Y13, Y13
+ VPADDD Y11, Y8, Y8
+ VMOVDQA ·avx2IncMask<>+0(SB), Y3
+ VPADDD Y15, Y4, Y4
+ VPADDD Y3, Y15, Y15
+ VPADDD Y15, Y1, Y1
+ VPADDD Y3, Y15, Y15
+ VPADDD Y15, Y2, Y2
// Clamp and store poly key
- VPERM2I128 $0x02, AA0, BB0, TT0
- VPAND ·polyClampMask<>(SB), TT0, TT0
- VMOVDQA TT0, rsStoreAVX2
+ VPERM2I128 $0x02, Y0, Y14, Y3
+ VPAND ·polyClampMask<>+0(SB), Y3, Y3
+ VMOVDQA Y3, (BP)
// Stream for up to 320 bytes
- VPERM2I128 $0x13, AA0, BB0, AA0
- VPERM2I128 $0x13, CC0, DD0, BB0
- VPERM2I128 $0x02, AA1, BB1, CC0
- VPERM2I128 $0x02, CC1, DD1, DD0
- VPERM2I128 $0x13, AA1, BB1, AA1
- VPERM2I128 $0x13, CC1, DD1, BB1
- VPERM2I128 $0x02, AA2, BB2, CC1
- VPERM2I128 $0x02, CC2, DD2, DD1
- VPERM2I128 $0x13, AA2, BB2, AA2
- VPERM2I128 $0x13, CC2, DD2, BB2
+ VPERM2I128 $0x13, Y0, Y14, Y0
+ VPERM2I128 $0x13, Y12, Y4, Y14
+ VPERM2I128 $0x02, Y5, Y9, Y12
+ VPERM2I128 $0x02, Y13, Y1, Y4
+ VPERM2I128 $0x13, Y5, Y9, Y5
+ VPERM2I128 $0x13, Y13, Y1, Y9
+ VPERM2I128 $0x02, Y6, Y10, Y13
+ VPERM2I128 $0x02, Y8, Y2, Y1
+ VPERM2I128 $0x13, Y6, Y10, Y6
+ VPERM2I128 $0x13, Y8, Y2, Y10
JMP openAVX2ShortOpen
-// ----------------------------------------------------------------------------
-// Special optimization for the last 128 bytes of ciphertext
openAVX2Tail128:
// Need to decrypt up to 128 bytes - prepare two blocks
- VMOVDQA ·chacha20Constants<>(SB), AA1
- VMOVDQA state1StoreAVX2, BB1
- VMOVDQA state2StoreAVX2, CC1
- VMOVDQA ctr3StoreAVX2, DD1
- VPADDD ·avx2IncMask<>(SB), DD1, DD1
- VMOVDQA DD1, DD0
-
- XORQ itr2, itr2
- MOVQ inl, itr1
- ANDQ $-16, itr1
- TESTQ itr1, itr1
- JE openAVX2Tail128LoopB
+ VMOVDQA ·chacha20Constants<>+0(SB), Y5
+ VMOVDQA 32(BP), Y9
+ VMOVDQA 64(BP), Y13
+ VMOVDQA 192(BP), Y1
+ VPADDD ·avx2IncMask<>+0(SB), Y1, Y1
+ VMOVDQA Y1, Y4
+ XORQ R9, R9
+ MOVQ BX, CX
+ ANDQ $-16, CX
+ TESTQ CX, CX
+ JE openAVX2Tail128LoopB
openAVX2Tail128LoopA:
- // Perform ChaCha rounds, while hashing the remaining input
- polyAdd(0(inp)(itr2*1))
- polyMulAVX2
+ ADDQ (SI)(R9*1), R10
+ ADCQ 8(SI)(R9*1), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
openAVX2Tail128LoopB:
- ADDQ $16, itr2
- chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
- VPALIGNR $4, BB1, BB1, BB1
- VPALIGNR $8, CC1, CC1, CC1
- VPALIGNR $12, DD1, DD1, DD1
- chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
- VPALIGNR $12, BB1, BB1, BB1
- VPALIGNR $8, CC1, CC1, CC1
- VPALIGNR $4, DD1, DD1, DD1
- CMPQ itr2, itr1
- JB openAVX2Tail128LoopA
- CMPQ itr2, $160
- JNE openAVX2Tail128LoopB
-
- VPADDD ·chacha20Constants<>(SB), AA1, AA1
- VPADDD state1StoreAVX2, BB1, BB1
- VPADDD state2StoreAVX2, CC1, CC1
- VPADDD DD0, DD1, DD1
- VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
+ ADDQ $0x10, R9
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x0c, Y9, Y3
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x07, Y9, Y3
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPALIGNR $0x04, Y9, Y9, Y9
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x0c, Y1, Y1, Y1
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x0c, Y9, Y3
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x07, Y9, Y3
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPALIGNR $0x0c, Y9, Y9, Y9
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x04, Y1, Y1, Y1
+ CMPQ R9, CX
+ JB openAVX2Tail128LoopA
+ CMPQ R9, $0xa0
+ JNE openAVX2Tail128LoopB
+ VPADDD ·chacha20Constants<>+0(SB), Y5, Y5
+ VPADDD 32(BP), Y9, Y9
+ VPADDD 64(BP), Y13, Y13
+ VPADDD Y4, Y1, Y1
+ VPERM2I128 $0x02, Y5, Y9, Y0
+ VPERM2I128 $0x02, Y13, Y1, Y14
+ VPERM2I128 $0x13, Y5, Y9, Y12
+ VPERM2I128 $0x13, Y13, Y1, Y4
openAVX2TailLoop:
- CMPQ inl, $32
+ CMPQ BX, $0x20
JB openAVX2Tail
- SUBQ $32, inl
+ SUBQ $0x20, BX
// Load for decryption
- VPXOR (inp), AA0, AA0
- VMOVDQU AA0, (oup)
- LEAQ (1*32)(inp), inp
- LEAQ (1*32)(oup), oup
- VMOVDQA BB0, AA0
- VMOVDQA CC0, BB0
- VMOVDQA DD0, CC0
+ VPXOR (SI), Y0, Y0
+ VMOVDQU Y0, (DI)
+ LEAQ 32(SI), SI
+ LEAQ 32(DI), DI
+ VMOVDQA Y14, Y0
+ VMOVDQA Y12, Y14
+ VMOVDQA Y4, Y12
JMP openAVX2TailLoop
openAVX2Tail:
- CMPQ inl, $16
- VMOVDQA A0, A1
+ CMPQ BX, $0x10
+ VMOVDQA X0, X1
JB openAVX2TailDone
- SUBQ $16, inl
+ SUBQ $0x10, BX
// Load for decryption
- VPXOR (inp), A0, T0
- VMOVDQU T0, (oup)
- LEAQ (1*16)(inp), inp
- LEAQ (1*16)(oup), oup
- VPERM2I128 $0x11, AA0, AA0, AA0
- VMOVDQA A0, A1
+ VPXOR (SI), X0, X12
+ VMOVDQU X12, (DI)
+ LEAQ 16(SI), SI
+ LEAQ 16(DI), DI
+ VPERM2I128 $0x11, Y0, Y0, Y0
+ VMOVDQA X0, X1
openAVX2TailDone:
VZEROUPPER
JMP openSSETail16
-// ----------------------------------------------------------------------------
-// Special optimization for the last 256 bytes of ciphertext
openAVX2Tail256:
- // Need to decrypt up to 256 bytes - prepare four blocks
- VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1
- VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1
- VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1
- VMOVDQA ctr3StoreAVX2, DD0
- VPADDD ·avx2IncMask<>(SB), DD0, DD0
- VPADDD ·avx2IncMask<>(SB), DD0, DD1
- VMOVDQA DD0, TT1
- VMOVDQA DD1, TT2
+ VMOVDQA ·chacha20Constants<>+0(SB), Y0
+ VMOVDQA Y0, Y5
+ VMOVDQA 32(BP), Y14
+ VMOVDQA Y14, Y9
+ VMOVDQA 64(BP), Y12
+ VMOVDQA Y12, Y13
+ VMOVDQA 192(BP), Y4
+ VPADDD ·avx2IncMask<>+0(SB), Y4, Y4
+ VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
+ VMOVDQA Y4, Y7
+ VMOVDQA Y1, Y11
// Compute the number of iterations that will hash data
- MOVQ inl, tmpStoreAVX2
- MOVQ inl, itr1
- SUBQ $128, itr1
- SHRQ $4, itr1
- MOVQ $10, itr2
- CMPQ itr1, $10
- CMOVQGT itr2, itr1
- MOVQ inp, inl
- XORQ itr2, itr2
+ MOVQ BX, 224(BP)
+ MOVQ BX, CX
+ SUBQ $0x80, CX
+ SHRQ $0x04, CX
+ MOVQ $0x0000000a, R9
+ CMPQ CX, $0x0a
+ CMOVQGT R9, CX
+ MOVQ SI, BX
+ XORQ R9, R9
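
The SUBQ/SHRQ/CMOVQGT sequence above caps the number of in-round hash iterations for this tail at the ten double rounds available; any leftover blocks are hashed afterwards in openAVX2Tail256Hash. As plain Go (this path handles 128 < remaining <= 256):

package main

// tail256HashIters mirrors the computation above: one 16-byte Poly1305 block
// is absorbed per double round, but never more than the ten double rounds.
func tail256HashIters(remaining int) int {
	n := (remaining - 128) / 16
	if n > 10 {
		n = 10
	}
	return n
}
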
openAVX2Tail256LoopA:
- polyAdd(0(inl))
- polyMulAVX2
- LEAQ 16(inl), inl
+ ADDQ (BX), R10
+ ADCQ 8(BX), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ LEAQ 16(BX), BX
- // Perform ChaCha rounds, while hashing the remaining input
openAVX2Tail256LoopB:
- chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
- VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
- VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
- VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
- INCQ itr2
- chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
- VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
- VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
- VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
- CMPQ itr2, itr1
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x0c, Y14, Y3
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x07, Y14, Y3
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x0c, Y9, Y3
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x07, Y9, Y3
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPALIGNR $0x04, Y14, Y14, Y14
+ VPALIGNR $0x04, Y9, Y9, Y9
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x0c, Y4, Y4, Y4
+ VPALIGNR $0x0c, Y1, Y1, Y1
+ INCQ R9
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x0c, Y14, Y3
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x07, Y14, Y3
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x0c, Y9, Y3
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x07, Y9, Y3
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPALIGNR $0x0c, Y14, Y14, Y14
+ VPALIGNR $0x0c, Y9, Y9, Y9
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x04, Y4, Y4, Y4
+ VPALIGNR $0x04, Y1, Y1, Y1
+ CMPQ R9, CX
JB openAVX2Tail256LoopA
+ CMPQ R9, $0x0a
+ JNE openAVX2Tail256LoopB
+ MOVQ BX, R9
+ SUBQ SI, BX
+ MOVQ BX, CX
+ MOVQ 224(BP), BX
- CMPQ itr2, $10
- JNE openAVX2Tail256LoopB
-
- MOVQ inl, itr2
- SUBQ inp, inl
- MOVQ inl, itr1
- MOVQ tmpStoreAVX2, inl
-
- // Hash the remainder of data (if any)
openAVX2Tail256Hash:
- ADDQ $16, itr1
- CMPQ itr1, inl
- JGT openAVX2Tail256HashEnd
- polyAdd (0(itr2))
- polyMulAVX2
- LEAQ 16(itr2), itr2
- JMP openAVX2Tail256Hash
+ ADDQ $0x10, CX
+ CMPQ CX, BX
+ JGT openAVX2Tail256HashEnd
+ ADDQ (R9), R10
+ ADCQ 8(R9), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ LEAQ 16(R9), R9
+ JMP openAVX2Tail256Hash
-// Store 128 bytes safely, then go to store loop
openAVX2Tail256HashEnd:
- VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1
- VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1
- VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1
- VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1
- VPERM2I128 $0x02, AA0, BB0, AA2; VPERM2I128 $0x02, CC0, DD0, BB2; VPERM2I128 $0x13, AA0, BB0, CC2; VPERM2I128 $0x13, CC0, DD0, DD2
- VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
-
- VPXOR (0*32)(inp), AA2, AA2; VPXOR (1*32)(inp), BB2, BB2; VPXOR (2*32)(inp), CC2, CC2; VPXOR (3*32)(inp), DD2, DD2
- VMOVDQU AA2, (0*32)(oup); VMOVDQU BB2, (1*32)(oup); VMOVDQU CC2, (2*32)(oup); VMOVDQU DD2, (3*32)(oup)
- LEAQ (4*32)(inp), inp
- LEAQ (4*32)(oup), oup
- SUBQ $4*32, inl
-
- JMP openAVX2TailLoop
-
-// ----------------------------------------------------------------------------
-// Special optimization for the last 384 bytes of ciphertext
-openAVX2Tail384:
- // Need to decrypt up to 384 bytes - prepare six blocks
- VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2
- VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2
- VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2
- VMOVDQA ctr3StoreAVX2, DD0
- VPADDD ·avx2IncMask<>(SB), DD0, DD0
- VPADDD ·avx2IncMask<>(SB), DD0, DD1
- VPADDD ·avx2IncMask<>(SB), DD1, DD2
- VMOVDQA DD0, ctr0StoreAVX2
- VMOVDQA DD1, ctr1StoreAVX2
- VMOVDQA DD2, ctr2StoreAVX2
-
- // Compute the number of iterations that will hash two blocks of data
- MOVQ inl, tmpStoreAVX2
- MOVQ inl, itr1
- SUBQ $256, itr1
- SHRQ $4, itr1
- ADDQ $6, itr1
- MOVQ $10, itr2
- CMPQ itr1, $10
- CMOVQGT itr2, itr1
- MOVQ inp, inl
- XORQ itr2, itr2
-
- // Perform ChaCha rounds, while hashing the remaining input
-openAVX2Tail384LoopB:
- polyAdd(0(inl))
- polyMulAVX2
- LEAQ 16(inl), inl
-
-openAVX2Tail384LoopA:
- chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
- VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
- VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
- VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
- polyAdd(0(inl))
- polyMulAVX2
- LEAQ 16(inl), inl
- INCQ itr2
- chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
- VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
- VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
- VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
-
- CMPQ itr2, itr1
- JB openAVX2Tail384LoopB
-
- CMPQ itr2, $10
- JNE openAVX2Tail384LoopA
-
- MOVQ inl, itr2
- SUBQ inp, inl
- MOVQ inl, itr1
- MOVQ tmpStoreAVX2, inl
-
-openAVX2Tail384Hash:
- ADDQ $16, itr1
- CMPQ itr1, inl
- JGT openAVX2Tail384HashEnd
- polyAdd(0(itr2))
- polyMulAVX2
- LEAQ 16(itr2), itr2
- JMP openAVX2Tail384Hash
-
-// Store 256 bytes safely, then go to store loop
-openAVX2Tail384HashEnd:
- VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2
- VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2
- VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2
- VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2
- VPERM2I128 $0x02, AA0, BB0, TT0; VPERM2I128 $0x02, CC0, DD0, TT1; VPERM2I128 $0x13, AA0, BB0, TT2; VPERM2I128 $0x13, CC0, DD0, TT3
- VPXOR (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3
- VMOVDQU TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup)
- VPERM2I128 $0x02, AA1, BB1, TT0; VPERM2I128 $0x02, CC1, DD1, TT1; VPERM2I128 $0x13, AA1, BB1, TT2; VPERM2I128 $0x13, CC1, DD1, TT3
- VPXOR (4*32)(inp), TT0, TT0; VPXOR (5*32)(inp), TT1, TT1; VPXOR (6*32)(inp), TT2, TT2; VPXOR (7*32)(inp), TT3, TT3
- VMOVDQU TT0, (4*32)(oup); VMOVDQU TT1, (5*32)(oup); VMOVDQU TT2, (6*32)(oup); VMOVDQU TT3, (7*32)(oup)
- VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
- LEAQ (8*32)(inp), inp
- LEAQ (8*32)(oup), oup
- SUBQ $8*32, inl
+ VPADDD ·chacha20Constants<>+0(SB), Y0, Y0
+ VPADDD ·chacha20Constants<>+0(SB), Y5, Y5
+ VPADDD 32(BP), Y14, Y14
+ VPADDD 32(BP), Y9, Y9
+ VPADDD 64(BP), Y12, Y12
+ VPADDD 64(BP), Y13, Y13
+ VPADDD Y7, Y4, Y4
+ VPADDD Y11, Y1, Y1
+ VPERM2I128 $0x02, Y0, Y14, Y6
+ VPERM2I128 $0x02, Y12, Y4, Y10
+ VPERM2I128 $0x13, Y0, Y14, Y8
+ VPERM2I128 $0x13, Y12, Y4, Y2
+ VPERM2I128 $0x02, Y5, Y9, Y0
+ VPERM2I128 $0x02, Y13, Y1, Y14
+ VPERM2I128 $0x13, Y5, Y9, Y12
+ VPERM2I128 $0x13, Y13, Y1, Y4
+ VPXOR (SI), Y6, Y6
+ VPXOR 32(SI), Y10, Y10
+ VPXOR 64(SI), Y8, Y8
+ VPXOR 96(SI), Y2, Y2
+ VMOVDQU Y6, (DI)
+ VMOVDQU Y10, 32(DI)
+ VMOVDQU Y8, 64(DI)
+ VMOVDQU Y2, 96(DI)
+ LEAQ 128(SI), SI
+ LEAQ 128(DI), DI
+ SUBQ $0x80, BX
JMP openAVX2TailLoop
-// ----------------------------------------------------------------------------
-// Special optimization for the last 512 bytes of ciphertext
+openAVX2Tail384:
+ // Need to decrypt up to 384 bytes - prepare six blocks
+ VMOVDQA ·chacha20Constants<>+0(SB), Y0
+ VMOVDQA Y0, Y5
+ VMOVDQA Y0, Y6
+ VMOVDQA 32(BP), Y14
+ VMOVDQA Y14, Y9
+ VMOVDQA Y14, Y10
+ VMOVDQA 64(BP), Y12
+ VMOVDQA Y12, Y13
+ VMOVDQA Y12, Y8
+ VMOVDQA 192(BP), Y4
+ VPADDD ·avx2IncMask<>+0(SB), Y4, Y4
+ VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
+ VPADDD ·avx2IncMask<>+0(SB), Y1, Y2
+ VMOVDQA Y4, 96(BP)
+ VMOVDQA Y1, 128(BP)
+ VMOVDQA Y2, 160(BP)
+
+ // Compute the number of iterations that will hash two blocks of data
+ MOVQ BX, 224(BP)
+ MOVQ BX, CX
+ SUBQ $0x00000100, CX
+ SHRQ $0x04, CX
+ ADDQ $0x06, CX
+ MOVQ $0x0000000a, R9
+ CMPQ CX, $0x0a
+ CMOVQGT R9, CX
+ MOVQ SI, BX
+ XORQ R9, R9
+
+openAVX2Tail384LoopB:
+ ADDQ (BX), R10
+ ADCQ 8(BX), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ LEAQ 16(BX), BX
+
+openAVX2Tail384LoopA:
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x0c, Y14, Y3
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x07, Y14, Y3
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x0c, Y9, Y3
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x07, Y9, Y3
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPADDD Y10, Y6, Y6
+ VPXOR Y6, Y2, Y2
+ VPSHUFB ·rol16<>+0(SB), Y2, Y2
+ VPADDD Y2, Y8, Y8
+ VPXOR Y8, Y10, Y10
+ VPSLLD $0x0c, Y10, Y3
+ VPSRLD $0x14, Y10, Y10
+ VPXOR Y3, Y10, Y10
+ VPADDD Y10, Y6, Y6
+ VPXOR Y6, Y2, Y2
+ VPSHUFB ·rol8<>+0(SB), Y2, Y2
+ VPADDD Y2, Y8, Y8
+ VPXOR Y8, Y10, Y10
+ VPSLLD $0x07, Y10, Y3
+ VPSRLD $0x19, Y10, Y10
+ VPXOR Y3, Y10, Y10
+ VPALIGNR $0x04, Y14, Y14, Y14
+ VPALIGNR $0x04, Y9, Y9, Y9
+ VPALIGNR $0x04, Y10, Y10, Y10
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x08, Y8, Y8, Y8
+ VPALIGNR $0x0c, Y4, Y4, Y4
+ VPALIGNR $0x0c, Y1, Y1, Y1
+ VPALIGNR $0x0c, Y2, Y2, Y2
+ ADDQ (BX), R10
+ ADCQ 8(BX), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ LEAQ 16(BX), BX
+ INCQ R9
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x0c, Y14, Y3
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x07, Y14, Y3
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x0c, Y9, Y3
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x07, Y9, Y3
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPADDD Y10, Y6, Y6
+ VPXOR Y6, Y2, Y2
+ VPSHUFB ·rol16<>+0(SB), Y2, Y2
+ VPADDD Y2, Y8, Y8
+ VPXOR Y8, Y10, Y10
+ VPSLLD $0x0c, Y10, Y3
+ VPSRLD $0x14, Y10, Y10
+ VPXOR Y3, Y10, Y10
+ VPADDD Y10, Y6, Y6
+ VPXOR Y6, Y2, Y2
+ VPSHUFB ·rol8<>+0(SB), Y2, Y2
+ VPADDD Y2, Y8, Y8
+ VPXOR Y8, Y10, Y10
+ VPSLLD $0x07, Y10, Y3
+ VPSRLD $0x19, Y10, Y10
+ VPXOR Y3, Y10, Y10
+ VPALIGNR $0x0c, Y14, Y14, Y14
+ VPALIGNR $0x0c, Y9, Y9, Y9
+ VPALIGNR $0x0c, Y10, Y10, Y10
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x08, Y8, Y8, Y8
+ VPALIGNR $0x04, Y4, Y4, Y4
+ VPALIGNR $0x04, Y1, Y1, Y1
+ VPALIGNR $0x04, Y2, Y2, Y2
+ CMPQ R9, CX
+ JB openAVX2Tail384LoopB
+ CMPQ R9, $0x0a
+ JNE openAVX2Tail384LoopA
+ MOVQ BX, R9
+ SUBQ SI, BX
+ MOVQ BX, CX
+ MOVQ 224(BP), BX
+
+openAVX2Tail384Hash:
+ ADDQ $0x10, CX
+ CMPQ CX, BX
+ JGT openAVX2Tail384HashEnd
+ ADDQ (R9), R10
+ ADCQ 8(R9), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ LEAQ 16(R9), R9
+ JMP openAVX2Tail384Hash
+
+openAVX2Tail384HashEnd:
+ VPADDD ·chacha20Constants<>+0(SB), Y0, Y0
+ VPADDD ·chacha20Constants<>+0(SB), Y5, Y5
+ VPADDD ·chacha20Constants<>+0(SB), Y6, Y6
+ VPADDD 32(BP), Y14, Y14
+ VPADDD 32(BP), Y9, Y9
+ VPADDD 32(BP), Y10, Y10
+ VPADDD 64(BP), Y12, Y12
+ VPADDD 64(BP), Y13, Y13
+ VPADDD 64(BP), Y8, Y8
+ VPADDD 96(BP), Y4, Y4
+ VPADDD 128(BP), Y1, Y1
+ VPADDD 160(BP), Y2, Y2
+ VPERM2I128 $0x02, Y0, Y14, Y3
+ VPERM2I128 $0x02, Y12, Y4, Y7
+ VPERM2I128 $0x13, Y0, Y14, Y11
+ VPERM2I128 $0x13, Y12, Y4, Y15
+ VPXOR (SI), Y3, Y3
+ VPXOR 32(SI), Y7, Y7
+ VPXOR 64(SI), Y11, Y11
+ VPXOR 96(SI), Y15, Y15
+ VMOVDQU Y3, (DI)
+ VMOVDQU Y7, 32(DI)
+ VMOVDQU Y11, 64(DI)
+ VMOVDQU Y15, 96(DI)
+ VPERM2I128 $0x02, Y5, Y9, Y3
+ VPERM2I128 $0x02, Y13, Y1, Y7
+ VPERM2I128 $0x13, Y5, Y9, Y11
+ VPERM2I128 $0x13, Y13, Y1, Y15
+ VPXOR 128(SI), Y3, Y3
+ VPXOR 160(SI), Y7, Y7
+ VPXOR 192(SI), Y11, Y11
+ VPXOR 224(SI), Y15, Y15
+ VMOVDQU Y3, 128(DI)
+ VMOVDQU Y7, 160(DI)
+ VMOVDQU Y11, 192(DI)
+ VMOVDQU Y15, 224(DI)
+ VPERM2I128 $0x02, Y6, Y10, Y0
+ VPERM2I128 $0x02, Y8, Y2, Y14
+ VPERM2I128 $0x13, Y6, Y10, Y12
+ VPERM2I128 $0x13, Y8, Y2, Y4
+ LEAQ 256(SI), SI
+ LEAQ 256(DI), DI
+ SUBQ $0x00000100, BX
+ JMP openAVX2TailLoop
+
openAVX2Tail512:
- VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
- VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
- VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
- VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
- VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
- XORQ itr1, itr1
- MOVQ inp, itr2
+ VMOVDQU ·chacha20Constants<>+0(SB), Y0
+ VMOVDQA Y0, Y5
+ VMOVDQA Y0, Y6
+ VMOVDQA Y0, Y7
+ VMOVDQA 32(BP), Y14
+ VMOVDQA Y14, Y9
+ VMOVDQA Y14, Y10
+ VMOVDQA Y14, Y11
+ VMOVDQA 64(BP), Y12
+ VMOVDQA Y12, Y13
+ VMOVDQA Y12, Y8
+ VMOVDQA Y12, Y15
+ VMOVDQA 192(BP), Y4
+ VPADDD ·avx2IncMask<>+0(SB), Y4, Y4
+ VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
+ VPADDD ·avx2IncMask<>+0(SB), Y1, Y2
+ VPADDD ·avx2IncMask<>+0(SB), Y2, Y3
+ VMOVDQA Y4, 96(BP)
+ VMOVDQA Y1, 128(BP)
+ VMOVDQA Y2, 160(BP)
+ VMOVDQA Y3, 192(BP)
+ XORQ CX, CX
+ MOVQ SI, R9
openAVX2Tail512LoopB:
- polyAdd(0(itr2))
- polyMulAVX2
- LEAQ (2*8)(itr2), itr2
+ ADDQ (R9), R10
+ ADCQ 8(R9), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ LEAQ 16(R9), R9
openAVX2Tail512LoopA:
- VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
- VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
- VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
- VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
- VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
- VMOVDQA CC3, tmpStoreAVX2
- VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
- VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
- VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
- VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
- VMOVDQA tmpStoreAVX2, CC3
- polyAdd(0*8(itr2))
- polyMulAVX2
- VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
- VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
- VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
- VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
- VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
- VMOVDQA CC3, tmpStoreAVX2
- VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
- VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
- VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
- VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
- VMOVDQA tmpStoreAVX2, CC3
- VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
- VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
- VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
- VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
- VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
- VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
- VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
- VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
- polyAdd(2*8(itr2))
- polyMulAVX2
- LEAQ (4*8)(itr2), itr2
- VMOVDQA CC3, tmpStoreAVX2
- VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
- VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
- VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
- VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
- VMOVDQA tmpStoreAVX2, CC3
- VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
- VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
- VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
- VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
- VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
- VMOVDQA CC3, tmpStoreAVX2
- VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
- VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
- VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
- VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
- VMOVDQA tmpStoreAVX2, CC3
- VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
- VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
- VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
- INCQ itr1
- CMPQ itr1, $4
+ VPADDD Y14, Y0, Y0
+ VPADDD Y9, Y5, Y5
+ VPADDD Y10, Y6, Y6
+ VPADDD Y11, Y7, Y7
+ VPXOR Y0, Y4, Y4
+ VPXOR Y5, Y1, Y1
+ VPXOR Y6, Y2, Y2
+ VPXOR Y7, Y3, Y3
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y2, Y2
+ VPSHUFB ·rol16<>+0(SB), Y3, Y3
+ VPADDD Y4, Y12, Y12
+ VPADDD Y1, Y13, Y13
+ VPADDD Y2, Y8, Y8
+ VPADDD Y3, Y15, Y15
+ VPXOR Y12, Y14, Y14
+ VPXOR Y13, Y9, Y9
+ VPXOR Y8, Y10, Y10
+ VPXOR Y15, Y11, Y11
+ VMOVDQA Y15, 224(BP)
+ VPSLLD $0x0c, Y14, Y15
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y15, Y14, Y14
+ VPSLLD $0x0c, Y9, Y15
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y15, Y9, Y9
+ VPSLLD $0x0c, Y10, Y15
+ VPSRLD $0x14, Y10, Y10
+ VPXOR Y15, Y10, Y10
+ VPSLLD $0x0c, Y11, Y15
+ VPSRLD $0x14, Y11, Y11
+ VPXOR Y15, Y11, Y11
+ VMOVDQA 224(BP), Y15
+ ADDQ (R9), R10
+ ADCQ 8(R9), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ VPADDD Y14, Y0, Y0
+ VPADDD Y9, Y5, Y5
+ VPADDD Y10, Y6, Y6
+ VPADDD Y11, Y7, Y7
+ VPXOR Y0, Y4, Y4
+ VPXOR Y5, Y1, Y1
+ VPXOR Y6, Y2, Y2
+ VPXOR Y7, Y3, Y3
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y2, Y2
+ VPSHUFB ·rol8<>+0(SB), Y3, Y3
+ VPADDD Y4, Y12, Y12
+ VPADDD Y1, Y13, Y13
+ VPADDD Y2, Y8, Y8
+ VPADDD Y3, Y15, Y15
+ VPXOR Y12, Y14, Y14
+ VPXOR Y13, Y9, Y9
+ VPXOR Y8, Y10, Y10
+ VPXOR Y15, Y11, Y11
+ VMOVDQA Y15, 224(BP)
+ VPSLLD $0x07, Y14, Y15
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y15, Y14, Y14
+ VPSLLD $0x07, Y9, Y15
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y15, Y9, Y9
+ VPSLLD $0x07, Y10, Y15
+ VPSRLD $0x19, Y10, Y10
+ VPXOR Y15, Y10, Y10
+ VPSLLD $0x07, Y11, Y15
+ VPSRLD $0x19, Y11, Y11
+ VPXOR Y15, Y11, Y11
+ VMOVDQA 224(BP), Y15
+ VPALIGNR $0x04, Y14, Y14, Y14
+ VPALIGNR $0x04, Y9, Y9, Y9
+ VPALIGNR $0x04, Y10, Y10, Y10
+ VPALIGNR $0x04, Y11, Y11, Y11
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x08, Y8, Y8, Y8
+ VPALIGNR $0x08, Y15, Y15, Y15
+ VPALIGNR $0x0c, Y4, Y4, Y4
+ VPALIGNR $0x0c, Y1, Y1, Y1
+ VPALIGNR $0x0c, Y2, Y2, Y2
+ VPALIGNR $0x0c, Y3, Y3, Y3
+ VPADDD Y14, Y0, Y0
+ VPADDD Y9, Y5, Y5
+ VPADDD Y10, Y6, Y6
+ VPADDD Y11, Y7, Y7
+ VPXOR Y0, Y4, Y4
+ VPXOR Y5, Y1, Y1
+ VPXOR Y6, Y2, Y2
+ VPXOR Y7, Y3, Y3
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y2, Y2
+ VPSHUFB ·rol16<>+0(SB), Y3, Y3
+ VPADDD Y4, Y12, Y12
+ VPADDD Y1, Y13, Y13
+ VPADDD Y2, Y8, Y8
+ VPADDD Y3, Y15, Y15
+ VPXOR Y12, Y14, Y14
+ VPXOR Y13, Y9, Y9
+ VPXOR Y8, Y10, Y10
+ VPXOR Y15, Y11, Y11
+ ADDQ 16(R9), R10
+ ADCQ 24(R9), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ LEAQ 32(R9), R9
+ VMOVDQA Y15, 224(BP)
+ VPSLLD $0x0c, Y14, Y15
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y15, Y14, Y14
+ VPSLLD $0x0c, Y9, Y15
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y15, Y9, Y9
+ VPSLLD $0x0c, Y10, Y15
+ VPSRLD $0x14, Y10, Y10
+ VPXOR Y15, Y10, Y10
+ VPSLLD $0x0c, Y11, Y15
+ VPSRLD $0x14, Y11, Y11
+ VPXOR Y15, Y11, Y11
+ VMOVDQA 224(BP), Y15
+ VPADDD Y14, Y0, Y0
+ VPADDD Y9, Y5, Y5
+ VPADDD Y10, Y6, Y6
+ VPADDD Y11, Y7, Y7
+ VPXOR Y0, Y4, Y4
+ VPXOR Y5, Y1, Y1
+ VPXOR Y6, Y2, Y2
+ VPXOR Y7, Y3, Y3
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y2, Y2
+ VPSHUFB ·rol8<>+0(SB), Y3, Y3
+ VPADDD Y4, Y12, Y12
+ VPADDD Y1, Y13, Y13
+ VPADDD Y2, Y8, Y8
+ VPADDD Y3, Y15, Y15
+ VPXOR Y12, Y14, Y14
+ VPXOR Y13, Y9, Y9
+ VPXOR Y8, Y10, Y10
+ VPXOR Y15, Y11, Y11
+ VMOVDQA Y15, 224(BP)
+ VPSLLD $0x07, Y14, Y15
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y15, Y14, Y14
+ VPSLLD $0x07, Y9, Y15
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y15, Y9, Y9
+ VPSLLD $0x07, Y10, Y15
+ VPSRLD $0x19, Y10, Y10
+ VPXOR Y15, Y10, Y10
+ VPSLLD $0x07, Y11, Y15
+ VPSRLD $0x19, Y11, Y11
+ VPXOR Y15, Y11, Y11
+ VMOVDQA 224(BP), Y15
+ VPALIGNR $0x0c, Y14, Y14, Y14
+ VPALIGNR $0x0c, Y9, Y9, Y9
+ VPALIGNR $0x0c, Y10, Y10, Y10
+ VPALIGNR $0x0c, Y11, Y11, Y11
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x08, Y8, Y8, Y8
+ VPALIGNR $0x08, Y15, Y15, Y15
+ VPALIGNR $0x04, Y4, Y4, Y4
+ VPALIGNR $0x04, Y1, Y1, Y1
+ VPALIGNR $0x04, Y2, Y2, Y2
+ VPALIGNR $0x04, Y3, Y3, Y3
+ INCQ CX
+ CMPQ CX, $0x04
JLT openAVX2Tail512LoopB
-
- CMPQ itr1, $10
- JNE openAVX2Tail512LoopA
-
- MOVQ inl, itr1
- SUBQ $384, itr1
- ANDQ $-16, itr1
+ CMPQ CX, $0x0a
+ JNE openAVX2Tail512LoopA
+ MOVQ BX, CX
+ SUBQ $0x00000180, CX
+ ANDQ $-16, CX
openAVX2Tail512HashLoop:
- TESTQ itr1, itr1
+ TESTQ CX, CX
JE openAVX2Tail512HashEnd
- polyAdd(0(itr2))
- polyMulAVX2
- LEAQ 16(itr2), itr2
- SUBQ $16, itr1
+ ADDQ (R9), R10
+ ADCQ 8(R9), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ LEAQ 16(R9), R9
+ SUBQ $0x10, CX
JMP openAVX2Tail512HashLoop
openAVX2Tail512HashEnd:
- VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
- VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
- VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
- VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
- VMOVDQA CC3, tmpStoreAVX2
- VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0
- VPXOR (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0
- VMOVDQU CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup)
- VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
- VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
- VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
- VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
- VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
- VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
- VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
+ VPADDD ·chacha20Constants<>+0(SB), Y0, Y0
+ VPADDD ·chacha20Constants<>+0(SB), Y5, Y5
+ VPADDD ·chacha20Constants<>+0(SB), Y6, Y6
+ VPADDD ·chacha20Constants<>+0(SB), Y7, Y7
+ VPADDD 32(BP), Y14, Y14
+ VPADDD 32(BP), Y9, Y9
+ VPADDD 32(BP), Y10, Y10
+ VPADDD 32(BP), Y11, Y11
+ VPADDD 64(BP), Y12, Y12
+ VPADDD 64(BP), Y13, Y13
+ VPADDD 64(BP), Y8, Y8
+ VPADDD 64(BP), Y15, Y15
+ VPADDD 96(BP), Y4, Y4
+ VPADDD 128(BP), Y1, Y1
+ VPADDD 160(BP), Y2, Y2
+ VPADDD 192(BP), Y3, Y3
+ VMOVDQA Y15, 224(BP)
+ VPERM2I128 $0x02, Y0, Y14, Y15
+ VPERM2I128 $0x13, Y0, Y14, Y14
+ VPERM2I128 $0x02, Y12, Y4, Y0
+ VPERM2I128 $0x13, Y12, Y4, Y12
+ VPXOR (SI), Y15, Y15
+ VPXOR 32(SI), Y0, Y0
+ VPXOR 64(SI), Y14, Y14
+ VPXOR 96(SI), Y12, Y12
+ VMOVDQU Y15, (DI)
+ VMOVDQU Y0, 32(DI)
+ VMOVDQU Y14, 64(DI)
+ VMOVDQU Y12, 96(DI)
+ VPERM2I128 $0x02, Y5, Y9, Y0
+ VPERM2I128 $0x02, Y13, Y1, Y14
+ VPERM2I128 $0x13, Y5, Y9, Y12
+ VPERM2I128 $0x13, Y13, Y1, Y4
+ VPXOR 128(SI), Y0, Y0
+ VPXOR 160(SI), Y14, Y14
+ VPXOR 192(SI), Y12, Y12
+ VPXOR 224(SI), Y4, Y4
+ VMOVDQU Y0, 128(DI)
+ VMOVDQU Y14, 160(DI)
+ VMOVDQU Y12, 192(DI)
+ VMOVDQU Y4, 224(DI)
+ VPERM2I128 $0x02, Y6, Y10, Y0
+ VPERM2I128 $0x02, Y8, Y2, Y14
+ VPERM2I128 $0x13, Y6, Y10, Y12
+ VPERM2I128 $0x13, Y8, Y2, Y4
+ VPXOR 256(SI), Y0, Y0
+ VPXOR 288(SI), Y14, Y14
+ VPXOR 320(SI), Y12, Y12
+ VPXOR 352(SI), Y4, Y4
+ VMOVDQU Y0, 256(DI)
+ VMOVDQU Y14, 288(DI)
+ VMOVDQU Y12, 320(DI)
+ VMOVDQU Y4, 352(DI)
+ VPERM2I128 $0x02, Y7, Y11, Y0
+ VPERM2I128 $0x02, 224(BP), Y3, Y14
+ VPERM2I128 $0x13, Y7, Y11, Y12
+ VPERM2I128 $0x13, 224(BP), Y3, Y4
+ LEAQ 384(SI), SI
+ LEAQ 384(DI), DI
+ SUBQ $0x00000180, BX
+ JMP openAVX2TailLoop
- LEAQ (12*32)(inp), inp
- LEAQ (12*32)(oup), oup
- SUBQ $12*32, inl
+DATA ·chacha20Constants<>+0(SB)/4, $0x61707865
+DATA ·chacha20Constants<>+4(SB)/4, $0x3320646e
+DATA ·chacha20Constants<>+8(SB)/4, $0x79622d32
+DATA ·chacha20Constants<>+12(SB)/4, $0x6b206574
+DATA ·chacha20Constants<>+16(SB)/4, $0x61707865
+DATA ·chacha20Constants<>+20(SB)/4, $0x3320646e
+DATA ·chacha20Constants<>+24(SB)/4, $0x79622d32
+DATA ·chacha20Constants<>+28(SB)/4, $0x6b206574
+GLOBL ·chacha20Constants<>(SB), RODATA|NOPTR, $32
- JMP openAVX2TailLoop
+DATA ·polyClampMask<>+0(SB)/8, $0x0ffffffc0fffffff
+DATA ·polyClampMask<>+8(SB)/8, $0x0ffffffc0ffffffc
+DATA ·polyClampMask<>+16(SB)/8, $0xffffffffffffffff
+DATA ·polyClampMask<>+24(SB)/8, $0xffffffffffffffff
+GLOBL ·polyClampMask<>(SB), RODATA|NOPTR, $32
-// ----------------------------------------------------------------------------
-// ----------------------------------------------------------------------------
-// func chacha20Poly1305Seal(dst, key, src, ad []byte)
-TEXT ·chacha20Poly1305Seal(SB), 0, $288-96
- // For aligned stack access
+DATA ·sseIncMask<>+0(SB)/8, $0x0000000000000001
+DATA ·sseIncMask<>+8(SB)/8, $0x0000000000000000
+GLOBL ·sseIncMask<>(SB), RODATA|NOPTR, $16
+
+DATA ·andMask<>+0(SB)/8, $0x00000000000000ff
+DATA ·andMask<>+8(SB)/8, $0x0000000000000000
+DATA ·andMask<>+16(SB)/8, $0x000000000000ffff
+DATA ·andMask<>+24(SB)/8, $0x0000000000000000
+DATA ·andMask<>+32(SB)/8, $0x0000000000ffffff
+DATA ·andMask<>+40(SB)/8, $0x0000000000000000
+DATA ·andMask<>+48(SB)/8, $0x00000000ffffffff
+DATA ·andMask<>+56(SB)/8, $0x0000000000000000
+DATA ·andMask<>+64(SB)/8, $0x000000ffffffffff
+DATA ·andMask<>+72(SB)/8, $0x0000000000000000
+DATA ·andMask<>+80(SB)/8, $0x0000ffffffffffff
+DATA ·andMask<>+88(SB)/8, $0x0000000000000000
+DATA ·andMask<>+96(SB)/8, $0x00ffffffffffffff
+DATA ·andMask<>+104(SB)/8, $0x0000000000000000
+DATA ·andMask<>+112(SB)/8, $0xffffffffffffffff
+DATA ·andMask<>+120(SB)/8, $0x0000000000000000
+DATA ·andMask<>+128(SB)/8, $0xffffffffffffffff
+DATA ·andMask<>+136(SB)/8, $0x00000000000000ff
+DATA ·andMask<>+144(SB)/8, $0xffffffffffffffff
+DATA ·andMask<>+152(SB)/8, $0x000000000000ffff
+DATA ·andMask<>+160(SB)/8, $0xffffffffffffffff
+DATA ·andMask<>+168(SB)/8, $0x0000000000ffffff
+DATA ·andMask<>+176(SB)/8, $0xffffffffffffffff
+DATA ·andMask<>+184(SB)/8, $0x00000000ffffffff
+DATA ·andMask<>+192(SB)/8, $0xffffffffffffffff
+DATA ·andMask<>+200(SB)/8, $0x000000ffffffffff
+DATA ·andMask<>+208(SB)/8, $0xffffffffffffffff
+DATA ·andMask<>+216(SB)/8, $0x0000ffffffffffff
+DATA ·andMask<>+224(SB)/8, $0xffffffffffffffff
+DATA ·andMask<>+232(SB)/8, $0x00ffffffffffffff
+GLOBL ·andMask<>(SB), RODATA|NOPTR, $240
+
+DATA ·avx2InitMask<>+0(SB)/8, $0x0000000000000000
+DATA ·avx2InitMask<>+8(SB)/8, $0x0000000000000000
+DATA ·avx2InitMask<>+16(SB)/8, $0x0000000000000001
+DATA ·avx2InitMask<>+24(SB)/8, $0x0000000000000000
+GLOBL ·avx2InitMask<>(SB), RODATA|NOPTR, $32
+
+DATA ·rol16<>+0(SB)/8, $0x0504070601000302
+DATA ·rol16<>+8(SB)/8, $0x0d0c0f0e09080b0a
+DATA ·rol16<>+16(SB)/8, $0x0504070601000302
+DATA ·rol16<>+24(SB)/8, $0x0d0c0f0e09080b0a
+GLOBL ·rol16<>(SB), RODATA|NOPTR, $32
+
+DATA ·rol8<>+0(SB)/8, $0x0605040702010003
+DATA ·rol8<>+8(SB)/8, $0x0e0d0c0f0a09080b
+DATA ·rol8<>+16(SB)/8, $0x0605040702010003
+DATA ·rol8<>+24(SB)/8, $0x0e0d0c0f0a09080b
+GLOBL ·rol8<>(SB), RODATA|NOPTR, $32
+
+DATA ·avx2IncMask<>+0(SB)/8, $0x0000000000000002
+DATA ·avx2IncMask<>+8(SB)/8, $0x0000000000000000
+DATA ·avx2IncMask<>+16(SB)/8, $0x0000000000000002
+DATA ·avx2IncMask<>+24(SB)/8, $0x0000000000000000
+GLOBL ·avx2IncMask<>(SB), RODATA|NOPTR, $32
+
+// func chacha20Poly1305Seal(dst []byte, key []uint32, src []byte, ad []byte)
+// Requires: AVX, AVX2, BMI2, CMOV, SSE2
+TEXT ·chacha20Poly1305Seal(SB), $288-96
MOVQ SP, BP
- ADDQ $32, BP
+ ADDQ $0x20, BP
ANDQ $-32, BP
- MOVQ dst+0(FP), oup
- MOVQ key+24(FP), keyp
- MOVQ src+48(FP), inp
- MOVQ src_len+56(FP), inl
- MOVQ ad+72(FP), adp
-
- CMPB ·useAVX2(SB), $1
+ MOVQ dst_base+0(FP), DI
+ MOVQ key_base+24(FP), R8
+ MOVQ src_base+48(FP), SI
+ MOVQ src_len+56(FP), BX
+ MOVQ ad_base+72(FP), CX
+ CMPB ·useAVX2+0(SB), $0x01
JE chacha20Poly1305Seal_AVX2
// Special optimization, for very short buffers
- CMPQ inl, $128
- JBE sealSSE128 // About 15% faster
+ CMPQ BX, $0x80
+ JBE sealSSE128
// In the seal case - prepare the poly key + 3 blocks of stream in the first iteration
- MOVOU ·chacha20Constants<>(SB), A0
- MOVOU (1*16)(keyp), B0
- MOVOU (2*16)(keyp), C0
- MOVOU (3*16)(keyp), D0
+ MOVOU ·chacha20Constants<>+0(SB), X0
+ MOVOU 16(R8), X3
+ MOVOU 32(R8), X6
+ MOVOU 48(R8), X9
// Store state on stack for future use
- MOVO B0, state1Store
- MOVO C0, state2Store
+ MOVO X3, 32(BP)
+ MOVO X6, 48(BP)
// Load state, increment counter blocks
- MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
- MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
- MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
+ MOVO X0, X1
+ MOVO X3, X4
+ MOVO X6, X7
+ MOVO X9, X10
+ PADDL ·sseIncMask<>+0(SB), X10
+ MOVO X1, X2
+ MOVO X4, X5
+ MOVO X7, X8
+ MOVO X10, X11
+ PADDL ·sseIncMask<>+0(SB), X11
+ MOVO X2, X12
+ MOVO X5, X13
+ MOVO X8, X14
+ MOVO X11, X15
+ PADDL ·sseIncMask<>+0(SB), X15
// Store counters
- MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
- MOVQ $10, itr2
+ MOVO X9, 80(BP)
+ MOVO X10, 96(BP)
+ MOVO X11, 112(BP)
+ MOVO X15, 128(BP)
+ MOVQ $0x0000000a, R9
sealSSEIntroLoop:
- MOVO C3, tmpStore
- chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
- MOVO tmpStore, C3
- MOVO C1, tmpStore
- chachaQR(A3, B3, C3, D3, C1)
- MOVO tmpStore, C1
- shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left
- shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left
- shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left
-
- MOVO C3, tmpStore
- chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
- MOVO tmpStore, C3
- MOVO C1, tmpStore
- chachaQR(A3, B3, C3, D3, C1)
- MOVO tmpStore, C1
- shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
- shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
- shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
- DECQ itr2
- JNE sealSSEIntroLoop
+ MOVO X14, 64(BP)
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL16(X9, X14)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X14
+ PSLLL $0x0c, X14
+ PSRLL $0x14, X3
+ PXOR X14, X3
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL8(X9, X14)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X14
+ PSLLL $0x07, X14
+ PSRLL $0x19, X3
+ PXOR X14, X3
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL16(X10, X14)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X14
+ PSLLL $0x0c, X14
+ PSRLL $0x14, X4
+ PXOR X14, X4
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL8(X10, X14)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X14
+ PSLLL $0x07, X14
+ PSRLL $0x19, X4
+ PXOR X14, X4
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL16(X11, X14)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X14
+ PSLLL $0x0c, X14
+ PSRLL $0x14, X5
+ PXOR X14, X5
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL8(X11, X14)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X14
+ PSLLL $0x07, X14
+ PSRLL $0x19, X5
+ PXOR X14, X5
+ MOVO 64(BP), X14
+ MOVO X7, 64(BP)
+ PADDD X13, X12
+ PXOR X12, X15
+ ROL16(X15, X7)
+ PADDD X15, X14
+ PXOR X14, X13
+ MOVO X13, X7
+ PSLLL $0x0c, X7
+ PSRLL $0x14, X13
+ PXOR X7, X13
+ PADDD X13, X12
+ PXOR X12, X15
+ ROL8(X15, X7)
+ PADDD X15, X14
+ PXOR X14, X13
+ MOVO X13, X7
+ PSLLL $0x07, X7
+ PSRLL $0x19, X13
+ PXOR X7, X13
+ MOVO 64(BP), X7
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xe4
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xed
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xed
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc0
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc9
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xd2
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x0c
+ MOVO X14, 64(BP)
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL16(X9, X14)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X14
+ PSLLL $0x0c, X14
+ PSRLL $0x14, X3
+ PXOR X14, X3
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL8(X9, X14)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X14
+ PSLLL $0x07, X14
+ PSRLL $0x19, X3
+ PXOR X14, X3
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL16(X10, X14)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X14
+ PSLLL $0x0c, X14
+ PSRLL $0x14, X4
+ PXOR X14, X4
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL8(X10, X14)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X14
+ PSLLL $0x07, X14
+ PSRLL $0x19, X4
+ PXOR X14, X4
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL16(X11, X14)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X14
+ PSLLL $0x0c, X14
+ PSRLL $0x14, X5
+ PXOR X14, X5
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL8(X11, X14)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X14
+ PSLLL $0x07, X14
+ PSRLL $0x19, X5
+ PXOR X14, X5
+ MOVO 64(BP), X14
+ MOVO X7, 64(BP)
+ PADDD X13, X12
+ PXOR X12, X15
+ ROL16(X15, X7)
+ PADDD X15, X14
+ PXOR X14, X13
+ MOVO X13, X7
+ PSLLL $0x0c, X7
+ PSRLL $0x14, X13
+ PXOR X7, X13
+ PADDD X13, X12
+ PXOR X12, X15
+ ROL8(X15, X7)
+ PADDD X15, X14
+ PXOR X14, X13
+ MOVO X13, X7
+ PSLLL $0x07, X7
+ PSRLL $0x19, X13
+ PXOR X7, X13
+ MOVO 64(BP), X7
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xe4
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xed
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xed
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc0
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc9
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xd2
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x04
+ DECQ R9
+ JNE sealSSEIntroLoop
// Add in the state
- PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
- PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
- PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
- PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
+ PADDD ·chacha20Constants<>+0(SB), X0
+ PADDD ·chacha20Constants<>+0(SB), X1
+ PADDD ·chacha20Constants<>+0(SB), X2
+ PADDD ·chacha20Constants<>+0(SB), X12
+ PADDD 32(BP), X3
+ PADDD 32(BP), X4
+ PADDD 32(BP), X5
+ PADDD 32(BP), X13
+ PADDD 48(BP), X7
+ PADDD 48(BP), X8
+ PADDD 48(BP), X14
+ PADDD 96(BP), X10
+ PADDD 112(BP), X11
+ PADDD 128(BP), X15
// Clamp and store the key
- PAND ·polyClampMask<>(SB), A0
- MOVO A0, rStore
- MOVO B0, sStore
+ PAND ·polyClampMask<>+0(SB), X0
+ MOVO X0, (BP)
+ MOVO X3, 16(BP)
// Hash AAD
- MOVQ ad_len+80(FP), itr2
- CALL polyHashADInternal<>(SB)
-
- MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0
- PXOR A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1
- MOVOU A1, (0*16)(oup); MOVOU B1, (1*16)(oup); MOVOU C1, (2*16)(oup); MOVOU D1, (3*16)(oup)
- MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0
- PXOR A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2
- MOVOU A2, (4*16)(oup); MOVOU B2, (5*16)(oup); MOVOU C2, (6*16)(oup); MOVOU D2, (7*16)(oup)
-
- MOVQ $128, itr1
- SUBQ $128, inl
- LEAQ 128(inp), inp
-
- MOVO A3, A1; MOVO B3, B1; MOVO C3, C1; MOVO D3, D1
-
- CMPQ inl, $64
- JBE sealSSE128SealHash
-
- MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0
- PXOR A0, A3; PXOR B0, B3; PXOR C0, C3; PXOR D0, D3
- MOVOU A3, (8*16)(oup); MOVOU B3, (9*16)(oup); MOVOU C3, (10*16)(oup); MOVOU D3, (11*16)(oup)
-
- ADDQ $64, itr1
- SUBQ $64, inl
- LEAQ 64(inp), inp
-
- MOVQ $2, itr1
- MOVQ $8, itr2
-
- CMPQ inl, $64
- JBE sealSSETail64
- CMPQ inl, $128
- JBE sealSSETail128
- CMPQ inl, $192
- JBE sealSSETail192
+ MOVQ ad_len+80(FP), R9
+ CALL polyHashADInternal<>(SB)
+ MOVOU (SI), X0
+ MOVOU 16(SI), X3
+ MOVOU 32(SI), X6
+ MOVOU 48(SI), X9
+ PXOR X0, X1
+ PXOR X3, X4
+ PXOR X6, X7
+ PXOR X9, X10
+ MOVOU X1, (DI)
+ MOVOU X4, 16(DI)
+ MOVOU X7, 32(DI)
+ MOVOU X10, 48(DI)
+ MOVOU 64(SI), X0
+ MOVOU 80(SI), X3
+ MOVOU 96(SI), X6
+ MOVOU 112(SI), X9
+ PXOR X0, X2
+ PXOR X3, X5
+ PXOR X6, X8
+ PXOR X9, X11
+ MOVOU X2, 64(DI)
+ MOVOU X5, 80(DI)
+ MOVOU X8, 96(DI)
+ MOVOU X11, 112(DI)
+ MOVQ $0x00000080, CX
+ SUBQ $0x80, BX
+ LEAQ 128(SI), SI
+ MOVO X12, X1
+ MOVO X13, X4
+ MOVO X14, X7
+ MOVO X15, X10
+ CMPQ BX, $0x40
+ JBE sealSSE128SealHash
+ MOVOU (SI), X0
+ MOVOU 16(SI), X3
+ MOVOU 32(SI), X6
+ MOVOU 48(SI), X9
+ PXOR X0, X12
+ PXOR X3, X13
+ PXOR X6, X14
+ PXOR X9, X15
+ MOVOU X12, 128(DI)
+ MOVOU X13, 144(DI)
+ MOVOU X14, 160(DI)
+ MOVOU X15, 176(DI)
+ ADDQ $0x40, CX
+ SUBQ $0x40, BX
+ LEAQ 64(SI), SI
+ MOVQ $0x00000002, CX
+ MOVQ $0x00000008, R9
+ CMPQ BX, $0x40
+ JBE sealSSETail64
+ CMPQ BX, $0x80
+ JBE sealSSETail128
+ CMPQ BX, $0xc0
+ JBE sealSSETail192
sealSSEMainLoop:
// Load state, increment counter blocks
- MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0
- MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
- MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
- MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
+ MOVO ·chacha20Constants<>+0(SB), X0
+ MOVO 32(BP), X3
+ MOVO 48(BP), X6
+ MOVO 128(BP), X9
+ PADDL ·sseIncMask<>+0(SB), X9
+ MOVO X0, X1
+ MOVO X3, X4
+ MOVO X6, X7
+ MOVO X9, X10
+ PADDL ·sseIncMask<>+0(SB), X10
+ MOVO X1, X2
+ MOVO X4, X5
+ MOVO X7, X8
+ MOVO X10, X11
+ PADDL ·sseIncMask<>+0(SB), X11
+ MOVO X2, X12
+ MOVO X5, X13
+ MOVO X8, X14
+ MOVO X11, X15
+ PADDL ·sseIncMask<>+0(SB), X15
// Store counters
- MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
+ MOVO X9, 80(BP)
+ MOVO X10, 96(BP)
+ MOVO X11, 112(BP)
+ MOVO X15, 128(BP)
sealSSEInnerLoop:
- MOVO C3, tmpStore
- chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
- MOVO tmpStore, C3
- MOVO C1, tmpStore
- chachaQR(A3, B3, C3, D3, C1)
- MOVO tmpStore, C1
- polyAdd(0(oup))
- shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left
- shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left
- shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left
- polyMulStage1
- polyMulStage2
- LEAQ (2*8)(oup), oup
- MOVO C3, tmpStore
- chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
- MOVO tmpStore, C3
- MOVO C1, tmpStore
- polyMulStage3
- chachaQR(A3, B3, C3, D3, C1)
- MOVO tmpStore, C1
- polyMulReduceStage
- shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
- shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
- shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
- DECQ itr2
- JGE sealSSEInnerLoop
- polyAdd(0(oup))
- polyMul
- LEAQ (2*8)(oup), oup
- DECQ itr1
- JG sealSSEInnerLoop
+ MOVO X14, 64(BP)
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL16(X9, X14)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X14
+ PSLLL $0x0c, X14
+ PSRLL $0x14, X3
+ PXOR X14, X3
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL8(X9, X14)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X14
+ PSLLL $0x07, X14
+ PSRLL $0x19, X3
+ PXOR X14, X3
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL16(X10, X14)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X14
+ PSLLL $0x0c, X14
+ PSRLL $0x14, X4
+ PXOR X14, X4
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL8(X10, X14)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X14
+ PSLLL $0x07, X14
+ PSRLL $0x19, X4
+ PXOR X14, X4
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL16(X11, X14)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X14
+ PSLLL $0x0c, X14
+ PSRLL $0x14, X5
+ PXOR X14, X5
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL8(X11, X14)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X14
+ PSLLL $0x07, X14
+ PSRLL $0x19, X5
+ PXOR X14, X5
+ MOVO 64(BP), X14
+ MOVO X7, 64(BP)
+ PADDD X13, X12
+ PXOR X12, X15
+ ROL16(X15, X7)
+ PADDD X15, X14
+ PXOR X14, X13
+ MOVO X13, X7
+ PSLLL $0x0c, X7
+ PSRLL $0x14, X13
+ PXOR X7, X13
+ PADDD X13, X12
+ PXOR X12, X15
+ ROL8(X15, X7)
+ PADDD X15, X14
+ PXOR X14, X13
+ MOVO X13, X7
+ PSLLL $0x07, X7
+ PSRLL $0x19, X13
+ PXOR X7, X13
+ MOVO 64(BP), X7
+ ADDQ (DI), R10
+ ADCQ 8(DI), R11
+ ADCQ $0x01, R12
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xe4
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xed
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xed
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc0
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc9
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xd2
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x0c
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ LEAQ 16(DI), DI
+ MOVO X14, 64(BP)
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL16(X9, X14)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X14
+ PSLLL $0x0c, X14
+ PSRLL $0x14, X3
+ PXOR X14, X3
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL8(X9, X14)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X14
+ PSLLL $0x07, X14
+ PSRLL $0x19, X3
+ PXOR X14, X3
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL16(X10, X14)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X14
+ PSLLL $0x0c, X14
+ PSRLL $0x14, X4
+ PXOR X14, X4
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL8(X10, X14)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X14
+ PSLLL $0x07, X14
+ PSRLL $0x19, X4
+ PXOR X14, X4
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL16(X11, X14)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X14
+ PSLLL $0x0c, X14
+ PSRLL $0x14, X5
+ PXOR X14, X5
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL8(X11, X14)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X14
+ PSLLL $0x07, X14
+ PSRLL $0x19, X5
+ PXOR X14, X5
+ MOVO 64(BP), X14
+ MOVO X7, 64(BP)
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ PADDD X13, X12
+ PXOR X12, X15
+ ROL16(X15, X7)
+ PADDD X15, X14
+ PXOR X14, X13
+ MOVO X13, X7
+ PSLLL $0x0c, X7
+ PSRLL $0x14, X13
+ PXOR X7, X13
+ PADDD X13, X12
+ PXOR X12, X15
+ ROL8(X15, X7)
+ PADDD X15, X14
+ PXOR X14, X13
+ MOVO X13, X7
+ PSLLL $0x07, X7
+ PSRLL $0x19, X13
+ PXOR X7, X13
+ MOVO 64(BP), X7
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xe4
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xed
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xed
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc0
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc9
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xd2
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x04
+ DECQ R9
+ JGE sealSSEInnerLoop
+ ADDQ (DI), R10
+ ADCQ 8(DI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ LEAQ 16(DI), DI
+ DECQ CX
+ JG sealSSEInnerLoop
// Add in the state
- PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
- PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
- PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
- PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
- MOVO D3, tmpStore
+ PADDD ·chacha20Constants<>+0(SB), X0
+ PADDD ·chacha20Constants<>+0(SB), X1
+ PADDD ·chacha20Constants<>+0(SB), X2
+ PADDD ·chacha20Constants<>+0(SB), X12
+ PADDD 32(BP), X3
+ PADDD 32(BP), X4
+ PADDD 32(BP), X5
+ PADDD 32(BP), X13
+ PADDD 48(BP), X6
+ PADDD 48(BP), X7
+ PADDD 48(BP), X8
+ PADDD 48(BP), X14
+ PADDD 80(BP), X9
+ PADDD 96(BP), X10
+ PADDD 112(BP), X11
+ PADDD 128(BP), X15
+ MOVO X15, 64(BP)
// Load - xor - store
- MOVOU (0*16)(inp), D3; PXOR D3, A0
- MOVOU (1*16)(inp), D3; PXOR D3, B0
- MOVOU (2*16)(inp), D3; PXOR D3, C0
- MOVOU (3*16)(inp), D3; PXOR D3, D0
- MOVOU A0, (0*16)(oup)
- MOVOU B0, (1*16)(oup)
- MOVOU C0, (2*16)(oup)
- MOVOU D0, (3*16)(oup)
- MOVO tmpStore, D3
-
- MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0
- PXOR A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1
- MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
- MOVOU (8*16)(inp), A0; MOVOU (9*16)(inp), B0; MOVOU (10*16)(inp), C0; MOVOU (11*16)(inp), D0
- PXOR A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2
- MOVOU A2, (8*16)(oup); MOVOU B2, (9*16)(oup); MOVOU C2, (10*16)(oup); MOVOU D2, (11*16)(oup)
- ADDQ $192, inp
- MOVQ $192, itr1
- SUBQ $192, inl
- MOVO A3, A1
- MOVO B3, B1
- MOVO C3, C1
- MOVO D3, D1
- CMPQ inl, $64
+ MOVOU (SI), X15
+ PXOR X15, X0
+ MOVOU 16(SI), X15
+ PXOR X15, X3
+ MOVOU 32(SI), X15
+ PXOR X15, X6
+ MOVOU 48(SI), X15
+ PXOR X15, X9
+ MOVOU X0, (DI)
+ MOVOU X3, 16(DI)
+ MOVOU X6, 32(DI)
+ MOVOU X9, 48(DI)
+ MOVO 64(BP), X15
+ MOVOU 64(SI), X0
+ MOVOU 80(SI), X3
+ MOVOU 96(SI), X6
+ MOVOU 112(SI), X9
+ PXOR X0, X1
+ PXOR X3, X4
+ PXOR X6, X7
+ PXOR X9, X10
+ MOVOU X1, 64(DI)
+ MOVOU X4, 80(DI)
+ MOVOU X7, 96(DI)
+ MOVOU X10, 112(DI)
+ MOVOU 128(SI), X0
+ MOVOU 144(SI), X3
+ MOVOU 160(SI), X6
+ MOVOU 176(SI), X9
+ PXOR X0, X2
+ PXOR X3, X5
+ PXOR X6, X8
+ PXOR X9, X11
+ MOVOU X2, 128(DI)
+ MOVOU X5, 144(DI)
+ MOVOU X8, 160(DI)
+ MOVOU X11, 176(DI)
+ ADDQ $0xc0, SI
+ MOVQ $0x000000c0, CX
+ SUBQ $0xc0, BX
+ MOVO X12, X1
+ MOVO X13, X4
+ MOVO X14, X7
+ MOVO X15, X10
+ CMPQ BX, $0x40
JBE sealSSE128SealHash
- MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0
- PXOR A0, A3; PXOR B0, B3; PXOR C0, C3; PXOR D0, D3
- MOVOU A3, (12*16)(oup); MOVOU B3, (13*16)(oup); MOVOU C3, (14*16)(oup); MOVOU D3, (15*16)(oup)
- LEAQ 64(inp), inp
- SUBQ $64, inl
- MOVQ $6, itr1
- MOVQ $4, itr2
- CMPQ inl, $192
+ MOVOU (SI), X0
+ MOVOU 16(SI), X3
+ MOVOU 32(SI), X6
+ MOVOU 48(SI), X9
+ PXOR X0, X12
+ PXOR X3, X13
+ PXOR X6, X14
+ PXOR X9, X15
+ MOVOU X12, 192(DI)
+ MOVOU X13, 208(DI)
+ MOVOU X14, 224(DI)
+ MOVOU X15, 240(DI)
+ LEAQ 64(SI), SI
+ SUBQ $0x40, BX
+ MOVQ $0x00000006, CX
+ MOVQ $0x00000004, R9
+ CMPQ BX, $0xc0
JG sealSSEMainLoop
-
- MOVQ inl, itr1
- TESTQ inl, inl
+ MOVQ BX, CX
+ TESTQ BX, BX
JE sealSSE128SealHash
- MOVQ $6, itr1
- CMPQ inl, $64
+ MOVQ $0x00000006, CX
+ CMPQ BX, $0x40
JBE sealSSETail64
- CMPQ inl, $128
+ CMPQ BX, $0x80
JBE sealSSETail128
JMP sealSSETail192
-// ----------------------------------------------------------------------------
-// Special optimization for the last 64 bytes of plaintext
sealSSETail64:
- // Need to encrypt up to 64 bytes - prepare single block, hash 192 or 256 bytes
- MOVO ·chacha20Constants<>(SB), A1
- MOVO state1Store, B1
- MOVO state2Store, C1
- MOVO ctr3Store, D1
- PADDL ·sseIncMask<>(SB), D1
- MOVO D1, ctr0Store
+ MOVO ·chacha20Constants<>+0(SB), X1
+ MOVO 32(BP), X4
+ MOVO 48(BP), X7
+ MOVO 128(BP), X10
+ PADDL ·sseIncMask<>+0(SB), X10
+ MOVO X10, 80(BP)
sealSSETail64LoopA:
- // Perform ChaCha rounds, while hashing the previously encrypted ciphertext
- polyAdd(0(oup))
- polyMul
- LEAQ 16(oup), oup
+ ADDQ (DI), R10
+ ADCQ 8(DI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ LEAQ 16(DI), DI
sealSSETail64LoopB:
- chachaQR(A1, B1, C1, D1, T1)
- shiftB1Left; shiftC1Left; shiftD1Left
- chachaQR(A1, B1, C1, D1, T1)
- shiftB1Right; shiftC1Right; shiftD1Right
- polyAdd(0(oup))
- polyMul
- LEAQ 16(oup), oup
-
- DECQ itr1
- JG sealSSETail64LoopA
-
- DECQ itr2
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL16(X10, X13)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X13
+ PSLLL $0x0c, X13
+ PSRLL $0x14, X4
+ PXOR X13, X4
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL8(X10, X13)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X13
+ PSLLL $0x07, X13
+ PSRLL $0x19, X4
+ PXOR X13, X4
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xe4
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xd2
+ BYTE $0x0c
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL16(X10, X13)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X13
+ PSLLL $0x0c, X13
+ PSRLL $0x14, X4
+ PXOR X13, X4
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL8(X10, X13)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X13
+ PSLLL $0x07, X13
+ PSRLL $0x19, X4
+ PXOR X13, X4
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xe4
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xd2
+ BYTE $0x04
+ ADDQ (DI), R10
+ ADCQ 8(DI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ LEAQ 16(DI), DI
+ DECQ CX
+ JG sealSSETail64LoopA
+ DECQ R9
JGE sealSSETail64LoopB
- PADDL ·chacha20Constants<>(SB), A1
- PADDL state1Store, B1
- PADDL state2Store, C1
- PADDL ctr0Store, D1
+ PADDL ·chacha20Constants<>+0(SB), X1
+ PADDL 32(BP), X4
+ PADDL 48(BP), X7
+ PADDL 80(BP), X10
+ JMP sealSSE128Seal
- JMP sealSSE128Seal
-
-// ----------------------------------------------------------------------------
-// Special optimization for the last 128 bytes of plaintext
sealSSETail128:
- // Need to encrypt up to 128 bytes - prepare two blocks, hash 192 or 256 bytes
- MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store
- MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store
+ MOVO ·chacha20Constants<>+0(SB), X0
+ MOVO 32(BP), X3
+ MOVO 48(BP), X6
+ MOVO 128(BP), X9
+ PADDL ·sseIncMask<>+0(SB), X9
+ MOVO X9, 80(BP)
+ MOVO X0, X1
+ MOVO X3, X4
+ MOVO X6, X7
+ MOVO X9, X10
+ PADDL ·sseIncMask<>+0(SB), X10
+ MOVO X10, 96(BP)
sealSSETail128LoopA:
- // Perform ChaCha rounds, while hashing the previously encrypted ciphertext
- polyAdd(0(oup))
- polyMul
- LEAQ 16(oup), oup
+ ADDQ (DI), R10
+ ADCQ 8(DI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ LEAQ 16(DI), DI
sealSSETail128LoopB:
- chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
- shiftB0Left; shiftC0Left; shiftD0Left
- shiftB1Left; shiftC1Left; shiftD1Left
- polyAdd(0(oup))
- polyMul
- LEAQ 16(oup), oup
- chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
- shiftB0Right; shiftC0Right; shiftD0Right
- shiftB1Right; shiftC1Right; shiftD1Right
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL16(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X3
+ PXOR X12, X3
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL8(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X3
+ PXOR X12, X3
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL16(X10, X12)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X4
+ PXOR X12, X4
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL8(X10, X12)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X4
+ PXOR X12, X4
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc9
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xe4
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xd2
+ BYTE $0x0c
+ ADDQ (DI), R10
+ ADCQ 8(DI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ LEAQ 16(DI), DI
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL16(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X3
+ PXOR X12, X3
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL8(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X3
+ PXOR X12, X3
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL16(X10, X12)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X4
+ PXOR X12, X4
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL8(X10, X12)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X4
+ PXOR X12, X4
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc9
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xe4
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xd2
+ BYTE $0x04
+ DECQ CX
+ JG sealSSETail128LoopA
+ DECQ R9
+ JGE sealSSETail128LoopB
+ PADDL ·chacha20Constants<>+0(SB), X0
+ PADDL ·chacha20Constants<>+0(SB), X1
+ PADDL 32(BP), X3
+ PADDL 32(BP), X4
+ PADDL 48(BP), X6
+ PADDL 48(BP), X7
+ PADDL 80(BP), X9
+ PADDL 96(BP), X10
+ MOVOU (SI), X12
+ MOVOU 16(SI), X13
+ MOVOU 32(SI), X14
+ MOVOU 48(SI), X15
+ PXOR X12, X0
+ PXOR X13, X3
+ PXOR X14, X6
+ PXOR X15, X9
+ MOVOU X0, (DI)
+ MOVOU X3, 16(DI)
+ MOVOU X6, 32(DI)
+ MOVOU X9, 48(DI)
+ MOVQ $0x00000040, CX
+ LEAQ 64(SI), SI
+ SUBQ $0x40, BX
+ JMP sealSSE128SealHash
- DECQ itr1
- JG sealSSETail128LoopA
-
- DECQ itr2
- JGE sealSSETail128LoopB
-
- PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1
- PADDL state1Store, B0; PADDL state1Store, B1
- PADDL state2Store, C0; PADDL state2Store, C1
- PADDL ctr0Store, D0; PADDL ctr1Store, D1
-
- MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
- PXOR T0, A0; PXOR T1, B0; PXOR T2, C0; PXOR T3, D0
- MOVOU A0, (0*16)(oup); MOVOU B0, (1*16)(oup); MOVOU C0, (2*16)(oup); MOVOU D0, (3*16)(oup)
-
- MOVQ $64, itr1
- LEAQ 64(inp), inp
- SUBQ $64, inl
-
- JMP sealSSE128SealHash
-
-// ----------------------------------------------------------------------------
-// Special optimization for the last 192 bytes of plaintext
sealSSETail192:
- // Need to encrypt up to 192 bytes - prepare three blocks, hash 192 or 256 bytes
- MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store
- MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store
- MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2; MOVO D2, ctr2Store
+ MOVO ·chacha20Constants<>+0(SB), X0
+ MOVO 32(BP), X3
+ MOVO 48(BP), X6
+ MOVO 128(BP), X9
+ PADDL ·sseIncMask<>+0(SB), X9
+ MOVO X9, 80(BP)
+ MOVO X0, X1
+ MOVO X3, X4
+ MOVO X6, X7
+ MOVO X9, X10
+ PADDL ·sseIncMask<>+0(SB), X10
+ MOVO X10, 96(BP)
+ MOVO X1, X2
+ MOVO X4, X5
+ MOVO X7, X8
+ MOVO X10, X11
+ PADDL ·sseIncMask<>+0(SB), X11
+ MOVO X11, 112(BP)
sealSSETail192LoopA:
- // Perform ChaCha rounds, while hashing the previously encrypted ciphertext
- polyAdd(0(oup))
- polyMul
- LEAQ 16(oup), oup
+ ADDQ (DI), R10
+ ADCQ 8(DI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ LEAQ 16(DI), DI
sealSSETail192LoopB:
- chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
- shiftB0Left; shiftC0Left; shiftD0Left
- shiftB1Left; shiftC1Left; shiftD1Left
- shiftB2Left; shiftC2Left; shiftD2Left
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL16(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X3
+ PXOR X12, X3
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL8(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X3
+ PXOR X12, X3
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL16(X10, X12)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X4
+ PXOR X12, X4
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL8(X10, X12)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X4
+ PXOR X12, X4
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL16(X11, X12)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X5
+ PXOR X12, X5
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL8(X11, X12)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X5
+ PXOR X12, X5
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc9
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xe4
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xd2
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xed
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc0
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x0c
+ ADDQ (DI), R10
+ ADCQ 8(DI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ LEAQ 16(DI), DI
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL16(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X3
+ PXOR X12, X3
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL8(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X3
+ PXOR X12, X3
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL16(X10, X12)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X4
+ PXOR X12, X4
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL8(X10, X12)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X4
+ PXOR X12, X4
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL16(X11, X12)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X5
+ PXOR X12, X5
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL8(X11, X12)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X5
+ PXOR X12, X5
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc9
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xe4
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xd2
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xed
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc0
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x04
+ DECQ CX
+ JG sealSSETail192LoopA
+ DECQ R9
+ JGE sealSSETail192LoopB
+ PADDL ·chacha20Constants<>+0(SB), X0
+ PADDL ·chacha20Constants<>+0(SB), X1
+ PADDL ·chacha20Constants<>+0(SB), X2
+ PADDL 32(BP), X3
+ PADDL 32(BP), X4
+ PADDL 32(BP), X5
+ PADDL 48(BP), X6
+ PADDL 48(BP), X7
+ PADDL 48(BP), X8
+ PADDL 80(BP), X9
+ PADDL 96(BP), X10
+ PADDL 112(BP), X11
+ MOVOU (SI), X12
+ MOVOU 16(SI), X13
+ MOVOU 32(SI), X14
+ MOVOU 48(SI), X15
+ PXOR X12, X0
+ PXOR X13, X3
+ PXOR X14, X6
+ PXOR X15, X9
+ MOVOU X0, (DI)
+ MOVOU X3, 16(DI)
+ MOVOU X6, 32(DI)
+ MOVOU X9, 48(DI)
+ MOVOU 64(SI), X12
+ MOVOU 80(SI), X13
+ MOVOU 96(SI), X14
+ MOVOU 112(SI), X15
+ PXOR X12, X1
+ PXOR X13, X4
+ PXOR X14, X7
+ PXOR X15, X10
+ MOVOU X1, 64(DI)
+ MOVOU X4, 80(DI)
+ MOVOU X7, 96(DI)
+ MOVOU X10, 112(DI)
+ MOVO X2, X1
+ MOVO X5, X4
+ MOVO X8, X7
+ MOVO X11, X10
+ MOVQ $0x00000080, CX
+ LEAQ 128(SI), SI
+ SUBQ $0x80, BX
+ JMP sealSSE128SealHash
- polyAdd(0(oup))
- polyMul
- LEAQ 16(oup), oup
-
- chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
- shiftB0Right; shiftC0Right; shiftD0Right
- shiftB1Right; shiftC1Right; shiftD1Right
- shiftB2Right; shiftC2Right; shiftD2Right
-
- DECQ itr1
- JG sealSSETail192LoopA
-
- DECQ itr2
- JGE sealSSETail192LoopB
-
- PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
- PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2
- PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2
- PADDL ctr0Store, D0; PADDL ctr1Store, D1; PADDL ctr2Store, D2
-
- MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
- PXOR T0, A0; PXOR T1, B0; PXOR T2, C0; PXOR T3, D0
- MOVOU A0, (0*16)(oup); MOVOU B0, (1*16)(oup); MOVOU C0, (2*16)(oup); MOVOU D0, (3*16)(oup)
- MOVOU (4*16)(inp), T0; MOVOU (5*16)(inp), T1; MOVOU (6*16)(inp), T2; MOVOU (7*16)(inp), T3
- PXOR T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1
- MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
-
- MOVO A2, A1
- MOVO B2, B1
- MOVO C2, C1
- MOVO D2, D1
- MOVQ $128, itr1
- LEAQ 128(inp), inp
- SUBQ $128, inl
-
- JMP sealSSE128SealHash
-
-// ----------------------------------------------------------------------------
-// Special seal optimization for buffers smaller than 129 bytes
sealSSE128:
- // For up to 128 bytes of ciphertext and 64 bytes for the poly key, we require to process three blocks
- MOVOU ·chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0
- MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
- MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
- MOVO B0, T1; MOVO C0, T2; MOVO D1, T3
- MOVQ $10, itr2
+ MOVOU ·chacha20Constants<>+0(SB), X0
+ MOVOU 16(R8), X3
+ MOVOU 32(R8), X6
+ MOVOU 48(R8), X9
+ MOVO X0, X1
+ MOVO X3, X4
+ MOVO X6, X7
+ MOVO X9, X10
+ PADDL ·sseIncMask<>+0(SB), X10
+ MOVO X1, X2
+ MOVO X4, X5
+ MOVO X7, X8
+ MOVO X10, X11
+ PADDL ·sseIncMask<>+0(SB), X11
+ MOVO X3, X13
+ MOVO X6, X14
+ MOVO X10, X15
+ MOVQ $0x0000000a, R9
sealSSE128InnerCipherLoop:
- chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
- shiftB0Left; shiftB1Left; shiftB2Left
- shiftC0Left; shiftC1Left; shiftC2Left
- shiftD0Left; shiftD1Left; shiftD2Left
- chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
- shiftB0Right; shiftB1Right; shiftB2Right
- shiftC0Right; shiftC1Right; shiftC2Right
- shiftD0Right; shiftD1Right; shiftD2Right
- DECQ itr2
- JNE sealSSE128InnerCipherLoop
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL16(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X3
+ PXOR X12, X3
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL8(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X3
+ PXOR X12, X3
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL16(X10, X12)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X4
+ PXOR X12, X4
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL8(X10, X12)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X4
+ PXOR X12, X4
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL16(X11, X12)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X5
+ PXOR X12, X5
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL8(X11, X12)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X5
+ PXOR X12, X5
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xe4
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xed
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc0
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc9
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xd2
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x0c
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL16(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X3
+ PXOR X12, X3
+ PADDD X3, X0
+ PXOR X0, X9
+ ROL8(X9, X12)
+ PADDD X9, X6
+ PXOR X6, X3
+ MOVO X3, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X3
+ PXOR X12, X3
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL16(X10, X12)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X4
+ PXOR X12, X4
+ PADDD X4, X1
+ PXOR X1, X10
+ ROL8(X10, X12)
+ PADDD X10, X7
+ PXOR X7, X4
+ MOVO X4, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X4
+ PXOR X12, X4
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL16(X11, X12)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X12
+ PSLLL $0x0c, X12
+ PSRLL $0x14, X5
+ PXOR X12, X5
+ PADDD X5, X2
+ PXOR X2, X11
+ ROL8(X11, X12)
+ PADDD X11, X8
+ PXOR X8, X5
+ MOVO X5, X12
+ PSLLL $0x07, X12
+ PSRLL $0x19, X5
+ PXOR X12, X5
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xe4
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xed
+ BYTE $0x0c
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xf6
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xff
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc0
+ BYTE $0x08
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xc9
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xd2
+ BYTE $0x04
+ BYTE $0x66
+ BYTE $0x45
+ BYTE $0x0f
+ BYTE $0x3a
+ BYTE $0x0f
+ BYTE $0xdb
+ BYTE $0x04
+ DECQ R9
+ JNE sealSSE128InnerCipherLoop
// A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
- PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
- PADDL T1, B0; PADDL T1, B1; PADDL T1, B2
- PADDL T2, C1; PADDL T2, C2
- PADDL T3, D1; PADDL ·sseIncMask<>(SB), T3; PADDL T3, D2
- PAND ·polyClampMask<>(SB), A0
- MOVOU A0, rStore
- MOVOU B0, sStore
+ PADDL ·chacha20Constants<>+0(SB), X0
+ PADDL ·chacha20Constants<>+0(SB), X1
+ PADDL ·chacha20Constants<>+0(SB), X2
+ PADDL X13, X3
+ PADDL X13, X4
+ PADDL X13, X5
+ PADDL X14, X7
+ PADDL X14, X8
+ PADDL X15, X10
+ PADDL ·sseIncMask<>+0(SB), X15
+ PADDL X15, X11
+ PAND ·polyClampMask<>+0(SB), X0
+ MOVOU X0, (BP)
+ MOVOU X3, 16(BP)
// Hash
- MOVQ ad_len+80(FP), itr2
+ MOVQ ad_len+80(FP), R9
CALL polyHashADInternal<>(SB)
- XORQ itr1, itr1
+ XORQ CX, CX
sealSSE128SealHash:
- // itr1 holds the number of bytes encrypted but not yet hashed
- CMPQ itr1, $16
- JB sealSSE128Seal
- polyAdd(0(oup))
- polyMul
-
- SUBQ $16, itr1
- ADDQ $16, oup
-
- JMP sealSSE128SealHash
+ CMPQ CX, $0x10
+ JB sealSSE128Seal
+ ADDQ (DI), R10
+ ADCQ 8(DI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ SUBQ $0x10, CX
+ ADDQ $0x10, DI
+ JMP sealSSE128SealHash
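
The reference file expressed the Poly1305 block update with the polyAdd/polyMul assembler macros; Avo has no assembler macros, so the generator emits the same instructions from ordinary Go helpers, which is why the right-hand side of the diff shows the expanded ADDQ/ADCQ/MULQ runs instead of macro invocations. A minimal Avo sketch of what such a helper can look like (the function name, register choice, and demo symbol are illustrative, not the actual generator code):

package main

import (
	. "github.com/mmcloughlin/avo/build"
	. "github.com/mmcloughlin/avo/operand"
	. "github.com/mmcloughlin/avo/reg"
)

// polyAdd emits the accumulator update h += m + 2^128 into R10:R11:R12,
// mirroring the reference's polyAdd macro. Registers here are illustrative.
func polyAdd(msg Mem) {
	ADDQ(msg.Offset(0), R10)
	ADCQ(msg.Offset(8), R11)
	ADCQ(Imm(1), R12)
}

func main() {
	TEXT("polyAddDemo", NOSPLIT, "func(msg *[16]byte)")
	p := Load(Param("msg"), GP64())
	polyAdd(Mem{Base: p})
	RET()
	Generate()
}
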
sealSSE128Seal:
- CMPQ inl, $16
+ CMPQ BX, $0x10
JB sealSSETail
- SUBQ $16, inl
+ SUBQ $0x10, BX
// Load for decryption
- MOVOU (inp), T0
- PXOR T0, A1
- MOVOU A1, (oup)
- LEAQ (1*16)(inp), inp
- LEAQ (1*16)(oup), oup
+ MOVOU (SI), X12
+ PXOR X12, X1
+ MOVOU X1, (DI)
+ LEAQ 16(SI), SI
+ LEAQ 16(DI), DI
// Extract for hashing
- MOVQ A1, t0
- PSRLDQ $8, A1
- MOVQ A1, t1
- ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
- polyMul
+ MOVQ X1, R13
+ PSRLDQ $0x08, X1
+ MOVQ X1, R14
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
// Shift the stream "left"
- MOVO B1, A1
- MOVO C1, B1
- MOVO D1, C1
- MOVO A2, D1
- MOVO B2, A2
- MOVO C2, B2
- MOVO D2, C2
+ MOVO X4, X1
+ MOVO X7, X4
+ MOVO X10, X7
+ MOVO X2, X10
+ MOVO X5, X2
+ MOVO X8, X5
+ MOVO X11, X8
JMP sealSSE128Seal
sealSSETail:
- TESTQ inl, inl
+ TESTQ BX, BX
JE sealSSEFinalize
// We can only load the PT one byte at a time to avoid read after end of buffer
- MOVQ inl, itr2
- SHLQ $4, itr2
- LEAQ ·andMask<>(SB), t0
- MOVQ inl, itr1
- LEAQ -1(inp)(inl*1), inp
- XORQ t2, t2
- XORQ t3, t3
+ MOVQ BX, R9
+ SHLQ $0x04, R9
+ LEAQ ·andMask<>+0(SB), R13
+ MOVQ BX, CX
+ LEAQ -1(SI)(BX*1), SI
+ XORQ R15, R15
+ XORQ R8, R8
XORQ AX, AX
sealSSETailLoadLoop:
- SHLQ $8, t2, t3
- SHLQ $8, t2
- MOVB (inp), AX
- XORQ AX, t2
- LEAQ -1(inp), inp
- DECQ itr1
+ SHLQ $0x08, R15, R8
+ SHLQ $0x08, R15
+ MOVB (SI), AX
+ XORQ AX, R15
+ LEAQ -1(SI), SI
+ DECQ CX
JNE sealSSETailLoadLoop
- MOVQ t2, 0+tmpStore
- MOVQ t3, 8+tmpStore
- PXOR 0+tmpStore, A1
- MOVOU A1, (oup)
- MOVOU -16(t0)(itr2*1), T0
- PAND T0, A1
- MOVQ A1, t0
- PSRLDQ $8, A1
- MOVQ A1, t1
- ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
- polyMul
-
- ADDQ inl, oup
+ MOVQ R15, 64(BP)
+ MOVQ R8, 72(BP)
+ PXOR 64(BP), X1
+ MOVOU X1, (DI)
+ MOVOU -16(R13)(R9*1), X12
+ PAND X12, X1
+ MOVQ X1, R13
+ PSRLDQ $0x08, X1
+ MOVQ X1, R14
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ ADDQ BX, DI
sealSSEFinalize:
// Hash in the buffer lengths
- ADDQ ad_len+80(FP), acc0
- ADCQ src_len+56(FP), acc1
- ADCQ $1, acc2
- polyMul
+ ADDQ ad_len+80(FP), R10
+ ADCQ src_len+56(FP), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
// Final reduce
- MOVQ acc0, t0
- MOVQ acc1, t1
- MOVQ acc2, t2
- SUBQ $-5, acc0
- SBBQ $-1, acc1
- SBBQ $3, acc2
- CMOVQCS t0, acc0
- CMOVQCS t1, acc1
- CMOVQCS t2, acc2
+ MOVQ R10, R13
+ MOVQ R11, R14
+ MOVQ R12, R15
+ SUBQ $-5, R10
+ SBBQ $-1, R11
+ SBBQ $0x03, R12
+ CMOVQCS R13, R10
+ CMOVQCS R14, R11
+ CMOVQCS R15, R12
// Add in the "s" part of the key
- ADDQ 0+sStore, acc0
- ADCQ 8+sStore, acc1
+ ADDQ 16(BP), R10
+ ADCQ 24(BP), R11
// Finally store the tag at the end of the message
- MOVQ acc0, (0*8)(oup)
- MOVQ acc1, (1*8)(oup)
+ MOVQ R10, (DI)
+ MOVQ R11, 8(DI)
RET
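
The "Final reduce" and "Add in the 's' part of the key" blocks above produce the 16-byte Poly1305 tag. A rough Go equivalent of that finalization, written with the subtract-p formulation rather than the add-5/CMOVQCS trick the assembly uses (a sketch, not the library's code):

package main

import (
	"encoding/binary"
	"fmt"
	"math/bits"
)

// finalize computes the Poly1305 tag from the accumulator h (three 64-bit
// limbs, with only a few live bits in h2) and the "s" half of the one-time
// key: reduce h modulo p = 2^130 - 5, then add s modulo 2^128.
func finalize(h0, h1, h2, s0, s1 uint64) [16]byte {
	// t = h - p, remembering whether the subtraction borrowed.
	t0, b := bits.Sub64(h0, 0xFFFFFFFFFFFFFFFB, 0)
	t1, b := bits.Sub64(h1, 0xFFFFFFFFFFFFFFFF, b)
	_, b = bits.Sub64(h2, 3, b)

	// If it borrowed, h was already < p: keep h. Otherwise keep h - p.
	// (The assembly reaches the same constant-time selection with CMOVQCS.)
	mask := b - 1 // all ones when there was no borrow
	h0 = (h0 &^ mask) | (t0 & mask)
	h1 = (h1 &^ mask) | (t1 & mask)

	// tag = (h + s) mod 2^128
	var c uint64
	h0, c = bits.Add64(h0, s0, 0)
	h1, _ = bits.Add64(h1, s1, c)

	var tag [16]byte
	binary.LittleEndian.PutUint64(tag[0:8], h0)
	binary.LittleEndian.PutUint64(tag[8:16], h1)
	return tag
}

func main() {
	// Purely illustrative limb values.
	fmt.Printf("% x\n", finalize(1, 2, 0, 3, 4))
}
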
-// ----------------------------------------------------------------------------
-// ------------------------- AVX2 Code ----------------------------------------
chacha20Poly1305Seal_AVX2:
VZEROUPPER
- VMOVDQU ·chacha20Constants<>(SB), AA0
- BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14
- BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12
- BYTE $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4
- VPADDD ·avx2InitMask<>(SB), DD0, DD0
+ VMOVDQU ·chacha20Constants<>+0(SB), Y0
+ BYTE $0xc4
+ BYTE $0x42
+ BYTE $0x7d
+ BYTE $0x5a
+ BYTE $0x70
+ BYTE $0x10
+ BYTE $0xc4
+ BYTE $0x42
+ BYTE $0x7d
+ BYTE $0x5a
+ BYTE $0x60
+ BYTE $0x20
+ BYTE $0xc4
+ BYTE $0xc2
+ BYTE $0x7d
+ BYTE $0x5a
+ BYTE $0x60
+ BYTE $0x30
+ VPADDD ·avx2InitMask<>+0(SB), Y4, Y4
// Special optimizations, for very short buffers
- CMPQ inl, $192
- JBE seal192AVX2 // 33% faster
- CMPQ inl, $320
- JBE seal320AVX2 // 17% faster
+ CMPQ BX, $0x000000c0
+ JBE seal192AVX2
+ CMPQ BX, $0x00000140
+ JBE seal320AVX2
// For the general key prepare the key first - as a byproduct we have 64 bytes of cipher stream
- VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
- VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3; VMOVDQA BB0, state1StoreAVX2
- VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3; VMOVDQA CC0, state2StoreAVX2
- VPADDD ·avx2IncMask<>(SB), DD0, DD1; VMOVDQA DD0, ctr0StoreAVX2
- VPADDD ·avx2IncMask<>(SB), DD1, DD2; VMOVDQA DD1, ctr1StoreAVX2
- VPADDD ·avx2IncMask<>(SB), DD2, DD3; VMOVDQA DD2, ctr2StoreAVX2
- VMOVDQA DD3, ctr3StoreAVX2
- MOVQ $10, itr2
+ VMOVDQA Y0, Y5
+ VMOVDQA Y0, Y6
+ VMOVDQA Y0, Y7
+ VMOVDQA Y14, Y9
+ VMOVDQA Y14, Y10
+ VMOVDQA Y14, Y11
+ VMOVDQA Y14, 32(BP)
+ VMOVDQA Y12, Y13
+ VMOVDQA Y12, Y8
+ VMOVDQA Y12, Y15
+ VMOVDQA Y12, 64(BP)
+ VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
+ VMOVDQA Y4, 96(BP)
+ VPADDD ·avx2IncMask<>+0(SB), Y1, Y2
+ VMOVDQA Y1, 128(BP)
+ VPADDD ·avx2IncMask<>+0(SB), Y2, Y3
+ VMOVDQA Y2, 160(BP)
+ VMOVDQA Y3, 192(BP)
+ MOVQ $0x0000000a, R9
sealAVX2IntroLoop:
- VMOVDQA CC3, tmpStoreAVX2
- chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
- VMOVDQA tmpStoreAVX2, CC3
- VMOVDQA CC1, tmpStoreAVX2
- chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
- VMOVDQA tmpStoreAVX2, CC1
-
- VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0
- VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $12, DD1, DD1, DD1
- VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $12, DD2, DD2, DD2
- VPALIGNR $4, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $12, DD3, DD3, DD3
-
- VMOVDQA CC3, tmpStoreAVX2
- chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
- VMOVDQA tmpStoreAVX2, CC3
- VMOVDQA CC1, tmpStoreAVX2
- chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
- VMOVDQA tmpStoreAVX2, CC1
-
- VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0
- VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $4, DD1, DD1, DD1
- VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $4, DD2, DD2, DD2
- VPALIGNR $12, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $4, DD3, DD3, DD3
- DECQ itr2
- JNE sealAVX2IntroLoop
-
- VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
- VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
- VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
- VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
-
- VPERM2I128 $0x13, CC0, DD0, CC0 // Stream bytes 96 - 127
- VPERM2I128 $0x02, AA0, BB0, DD0 // The Poly1305 key
- VPERM2I128 $0x13, AA0, BB0, AA0 // Stream bytes 64 - 95
+ VMOVDQA Y15, 224(BP)
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x0c, Y14, Y15
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y15, Y14, Y14
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x07, Y14, Y15
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y15, Y14, Y14
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x0c, Y9, Y15
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y15, Y9, Y9
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x07, Y9, Y15
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y15, Y9, Y9
+ VPADDD Y10, Y6, Y6
+ VPXOR Y6, Y2, Y2
+ VPSHUFB ·rol16<>+0(SB), Y2, Y2
+ VPADDD Y2, Y8, Y8
+ VPXOR Y8, Y10, Y10
+ VPSLLD $0x0c, Y10, Y15
+ VPSRLD $0x14, Y10, Y10
+ VPXOR Y15, Y10, Y10
+ VPADDD Y10, Y6, Y6
+ VPXOR Y6, Y2, Y2
+ VPSHUFB ·rol8<>+0(SB), Y2, Y2
+ VPADDD Y2, Y8, Y8
+ VPXOR Y8, Y10, Y10
+ VPSLLD $0x07, Y10, Y15
+ VPSRLD $0x19, Y10, Y10
+ VPXOR Y15, Y10, Y10
+ VMOVDQA 224(BP), Y15
+ VMOVDQA Y13, 224(BP)
+ VPADDD Y11, Y7, Y7
+ VPXOR Y7, Y3, Y3
+ VPSHUFB ·rol16<>+0(SB), Y3, Y3
+ VPADDD Y3, Y15, Y15
+ VPXOR Y15, Y11, Y11
+ VPSLLD $0x0c, Y11, Y13
+ VPSRLD $0x14, Y11, Y11
+ VPXOR Y13, Y11, Y11
+ VPADDD Y11, Y7, Y7
+ VPXOR Y7, Y3, Y3
+ VPSHUFB ·rol8<>+0(SB), Y3, Y3
+ VPADDD Y3, Y15, Y15
+ VPXOR Y15, Y11, Y11
+ VPSLLD $0x07, Y11, Y13
+ VPSRLD $0x19, Y11, Y11
+ VPXOR Y13, Y11, Y11
+ VMOVDQA 224(BP), Y13
+ VPALIGNR $0x04, Y14, Y14, Y14
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x0c, Y4, Y4, Y4
+ VPALIGNR $0x04, Y9, Y9, Y9
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x0c, Y1, Y1, Y1
+ VPALIGNR $0x04, Y10, Y10, Y10
+ VPALIGNR $0x08, Y8, Y8, Y8
+ VPALIGNR $0x0c, Y2, Y2, Y2
+ VPALIGNR $0x04, Y11, Y11, Y11
+ VPALIGNR $0x08, Y15, Y15, Y15
+ VPALIGNR $0x0c, Y3, Y3, Y3
+ VMOVDQA Y15, 224(BP)
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x0c, Y14, Y15
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y15, Y14, Y14
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x07, Y14, Y15
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y15, Y14, Y14
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x0c, Y9, Y15
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y15, Y9, Y9
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x07, Y9, Y15
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y15, Y9, Y9
+ VPADDD Y10, Y6, Y6
+ VPXOR Y6, Y2, Y2
+ VPSHUFB ·rol16<>+0(SB), Y2, Y2
+ VPADDD Y2, Y8, Y8
+ VPXOR Y8, Y10, Y10
+ VPSLLD $0x0c, Y10, Y15
+ VPSRLD $0x14, Y10, Y10
+ VPXOR Y15, Y10, Y10
+ VPADDD Y10, Y6, Y6
+ VPXOR Y6, Y2, Y2
+ VPSHUFB ·rol8<>+0(SB), Y2, Y2
+ VPADDD Y2, Y8, Y8
+ VPXOR Y8, Y10, Y10
+ VPSLLD $0x07, Y10, Y15
+ VPSRLD $0x19, Y10, Y10
+ VPXOR Y15, Y10, Y10
+ VMOVDQA 224(BP), Y15
+ VMOVDQA Y13, 224(BP)
+ VPADDD Y11, Y7, Y7
+ VPXOR Y7, Y3, Y3
+ VPSHUFB ·rol16<>+0(SB), Y3, Y3
+ VPADDD Y3, Y15, Y15
+ VPXOR Y15, Y11, Y11
+ VPSLLD $0x0c, Y11, Y13
+ VPSRLD $0x14, Y11, Y11
+ VPXOR Y13, Y11, Y11
+ VPADDD Y11, Y7, Y7
+ VPXOR Y7, Y3, Y3
+ VPSHUFB ·rol8<>+0(SB), Y3, Y3
+ VPADDD Y3, Y15, Y15
+ VPXOR Y15, Y11, Y11
+ VPSLLD $0x07, Y11, Y13
+ VPSRLD $0x19, Y11, Y11
+ VPXOR Y13, Y11, Y11
+ VMOVDQA 224(BP), Y13
+ VPALIGNR $0x0c, Y14, Y14, Y14
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x04, Y4, Y4, Y4
+ VPALIGNR $0x0c, Y9, Y9, Y9
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x04, Y1, Y1, Y1
+ VPALIGNR $0x0c, Y10, Y10, Y10
+ VPALIGNR $0x08, Y8, Y8, Y8
+ VPALIGNR $0x04, Y2, Y2, Y2
+ VPALIGNR $0x0c, Y11, Y11, Y11
+ VPALIGNR $0x08, Y15, Y15, Y15
+ VPALIGNR $0x04, Y3, Y3, Y3
+ DECQ R9
+ JNE sealAVX2IntroLoop
+ VPADDD ·chacha20Constants<>+0(SB), Y0, Y0
+ VPADDD ·chacha20Constants<>+0(SB), Y5, Y5
+ VPADDD ·chacha20Constants<>+0(SB), Y6, Y6
+ VPADDD ·chacha20Constants<>+0(SB), Y7, Y7
+ VPADDD 32(BP), Y14, Y14
+ VPADDD 32(BP), Y9, Y9
+ VPADDD 32(BP), Y10, Y10
+ VPADDD 32(BP), Y11, Y11
+ VPADDD 64(BP), Y12, Y12
+ VPADDD 64(BP), Y13, Y13
+ VPADDD 64(BP), Y8, Y8
+ VPADDD 64(BP), Y15, Y15
+ VPADDD 96(BP), Y4, Y4
+ VPADDD 128(BP), Y1, Y1
+ VPADDD 160(BP), Y2, Y2
+ VPADDD 192(BP), Y3, Y3
+ VPERM2I128 $0x13, Y12, Y4, Y12
+ VPERM2I128 $0x02, Y0, Y14, Y4
+ VPERM2I128 $0x13, Y0, Y14, Y0
// Clamp and store poly key
- VPAND ·polyClampMask<>(SB), DD0, DD0
- VMOVDQA DD0, rsStoreAVX2
+ VPAND ·polyClampMask<>+0(SB), Y4, Y4
+ VMOVDQA Y4, (BP)
// Hash AD
- MOVQ ad_len+80(FP), itr2
+ MOVQ ad_len+80(FP), R9
CALL polyHashADInternal<>(SB)
// Can store at least 320 bytes
- VPXOR (0*32)(inp), AA0, AA0
- VPXOR (1*32)(inp), CC0, CC0
- VMOVDQU AA0, (0*32)(oup)
- VMOVDQU CC0, (1*32)(oup)
-
- VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
- VPXOR (2*32)(inp), AA0, AA0; VPXOR (3*32)(inp), BB0, BB0; VPXOR (4*32)(inp), CC0, CC0; VPXOR (5*32)(inp), DD0, DD0
- VMOVDQU AA0, (2*32)(oup); VMOVDQU BB0, (3*32)(oup); VMOVDQU CC0, (4*32)(oup); VMOVDQU DD0, (5*32)(oup)
- VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
- VPXOR (6*32)(inp), AA0, AA0; VPXOR (7*32)(inp), BB0, BB0; VPXOR (8*32)(inp), CC0, CC0; VPXOR (9*32)(inp), DD0, DD0
- VMOVDQU AA0, (6*32)(oup); VMOVDQU BB0, (7*32)(oup); VMOVDQU CC0, (8*32)(oup); VMOVDQU DD0, (9*32)(oup)
-
- MOVQ $320, itr1
- SUBQ $320, inl
- LEAQ 320(inp), inp
-
- VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, CC3, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, CC3, DD3, DD0
- CMPQ inl, $128
+ VPXOR (SI), Y0, Y0
+ VPXOR 32(SI), Y12, Y12
+ VMOVDQU Y0, (DI)
+ VMOVDQU Y12, 32(DI)
+ VPERM2I128 $0x02, Y5, Y9, Y0
+ VPERM2I128 $0x02, Y13, Y1, Y14
+ VPERM2I128 $0x13, Y5, Y9, Y12
+ VPERM2I128 $0x13, Y13, Y1, Y4
+ VPXOR 64(SI), Y0, Y0
+ VPXOR 96(SI), Y14, Y14
+ VPXOR 128(SI), Y12, Y12
+ VPXOR 160(SI), Y4, Y4
+ VMOVDQU Y0, 64(DI)
+ VMOVDQU Y14, 96(DI)
+ VMOVDQU Y12, 128(DI)
+ VMOVDQU Y4, 160(DI)
+ VPERM2I128 $0x02, Y6, Y10, Y0
+ VPERM2I128 $0x02, Y8, Y2, Y14
+ VPERM2I128 $0x13, Y6, Y10, Y12
+ VPERM2I128 $0x13, Y8, Y2, Y4
+ VPXOR 192(SI), Y0, Y0
+ VPXOR 224(SI), Y14, Y14
+ VPXOR 256(SI), Y12, Y12
+ VPXOR 288(SI), Y4, Y4
+ VMOVDQU Y0, 192(DI)
+ VMOVDQU Y14, 224(DI)
+ VMOVDQU Y12, 256(DI)
+ VMOVDQU Y4, 288(DI)
+ MOVQ $0x00000140, CX
+ SUBQ $0x00000140, BX
+ LEAQ 320(SI), SI
+ VPERM2I128 $0x02, Y7, Y11, Y0
+ VPERM2I128 $0x02, Y15, Y3, Y14
+ VPERM2I128 $0x13, Y7, Y11, Y12
+ VPERM2I128 $0x13, Y15, Y3, Y4
+ CMPQ BX, $0x80
JBE sealAVX2SealHash
-
- VPXOR (0*32)(inp), AA0, AA0; VPXOR (1*32)(inp), BB0, BB0; VPXOR (2*32)(inp), CC0, CC0; VPXOR (3*32)(inp), DD0, DD0
- VMOVDQU AA0, (10*32)(oup); VMOVDQU BB0, (11*32)(oup); VMOVDQU CC0, (12*32)(oup); VMOVDQU DD0, (13*32)(oup)
- SUBQ $128, inl
- LEAQ 128(inp), inp
-
- MOVQ $8, itr1
- MOVQ $2, itr2
-
- CMPQ inl, $128
- JBE sealAVX2Tail128
- CMPQ inl, $256
- JBE sealAVX2Tail256
- CMPQ inl, $384
- JBE sealAVX2Tail384
- CMPQ inl, $512
- JBE sealAVX2Tail512
+ VPXOR (SI), Y0, Y0
+ VPXOR 32(SI), Y14, Y14
+ VPXOR 64(SI), Y12, Y12
+ VPXOR 96(SI), Y4, Y4
+ VMOVDQU Y0, 320(DI)
+ VMOVDQU Y14, 352(DI)
+ VMOVDQU Y12, 384(DI)
+ VMOVDQU Y4, 416(DI)
+ SUBQ $0x80, BX
+ LEAQ 128(SI), SI
+ MOVQ $0x00000008, CX
+ MOVQ $0x00000002, R9
+ CMPQ BX, $0x80
+ JBE sealAVX2Tail128
+ CMPQ BX, $0x00000100
+ JBE sealAVX2Tail256
+ CMPQ BX, $0x00000180
+ JBE sealAVX2Tail384
+ CMPQ BX, $0x00000200
+ JBE sealAVX2Tail512
// We have 448 bytes to hash, but main loop hashes 512 bytes at a time - perform some rounds, before the main loop
- VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
- VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
- VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
- VMOVDQA ctr3StoreAVX2, DD0
- VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
- VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
-
- VMOVDQA CC3, tmpStoreAVX2
- chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
- VMOVDQA tmpStoreAVX2, CC3
- VMOVDQA CC1, tmpStoreAVX2
- chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
- VMOVDQA tmpStoreAVX2, CC1
-
- VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0
- VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $12, DD1, DD1, DD1
- VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $12, DD2, DD2, DD2
- VPALIGNR $4, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $12, DD3, DD3, DD3
-
- VMOVDQA CC3, tmpStoreAVX2
- chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
- VMOVDQA tmpStoreAVX2, CC3
- VMOVDQA CC1, tmpStoreAVX2
- chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
- VMOVDQA tmpStoreAVX2, CC1
-
- VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0
- VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $4, DD1, DD1, DD1
- VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $4, DD2, DD2, DD2
- VPALIGNR $12, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $4, DD3, DD3, DD3
- VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
- VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
- VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
- VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
- VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
- VMOVDQA CC3, tmpStoreAVX2
- VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
- VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
- VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
- VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
- VMOVDQA tmpStoreAVX2, CC3
-
- SUBQ $16, oup // Adjust the pointer
- MOVQ $9, itr1
- JMP sealAVX2InternalLoopStart
+ VMOVDQA ·chacha20Constants<>+0(SB), Y0
+ VMOVDQA Y0, Y5
+ VMOVDQA Y0, Y6
+ VMOVDQA Y0, Y7
+ VMOVDQA 32(BP), Y14
+ VMOVDQA Y14, Y9
+ VMOVDQA Y14, Y10
+ VMOVDQA Y14, Y11
+ VMOVDQA 64(BP), Y12
+ VMOVDQA Y12, Y13
+ VMOVDQA Y12, Y8
+ VMOVDQA Y12, Y15
+ VMOVDQA 192(BP), Y4
+ VPADDD ·avx2IncMask<>+0(SB), Y4, Y4
+ VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
+ VPADDD ·avx2IncMask<>+0(SB), Y1, Y2
+ VPADDD ·avx2IncMask<>+0(SB), Y2, Y3
+ VMOVDQA Y4, 96(BP)
+ VMOVDQA Y1, 128(BP)
+ VMOVDQA Y2, 160(BP)
+ VMOVDQA Y3, 192(BP)
+ VMOVDQA Y15, 224(BP)
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x0c, Y14, Y15
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y15, Y14, Y14
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x07, Y14, Y15
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y15, Y14, Y14
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x0c, Y9, Y15
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y15, Y9, Y9
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x07, Y9, Y15
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y15, Y9, Y9
+ VPADDD Y10, Y6, Y6
+ VPXOR Y6, Y2, Y2
+ VPSHUFB ·rol16<>+0(SB), Y2, Y2
+ VPADDD Y2, Y8, Y8
+ VPXOR Y8, Y10, Y10
+ VPSLLD $0x0c, Y10, Y15
+ VPSRLD $0x14, Y10, Y10
+ VPXOR Y15, Y10, Y10
+ VPADDD Y10, Y6, Y6
+ VPXOR Y6, Y2, Y2
+ VPSHUFB ·rol8<>+0(SB), Y2, Y2
+ VPADDD Y2, Y8, Y8
+ VPXOR Y8, Y10, Y10
+ VPSLLD $0x07, Y10, Y15
+ VPSRLD $0x19, Y10, Y10
+ VPXOR Y15, Y10, Y10
+ VMOVDQA 224(BP), Y15
+ VMOVDQA Y13, 224(BP)
+ VPADDD Y11, Y7, Y7
+ VPXOR Y7, Y3, Y3
+ VPSHUFB ·rol16<>+0(SB), Y3, Y3
+ VPADDD Y3, Y15, Y15
+ VPXOR Y15, Y11, Y11
+ VPSLLD $0x0c, Y11, Y13
+ VPSRLD $0x14, Y11, Y11
+ VPXOR Y13, Y11, Y11
+ VPADDD Y11, Y7, Y7
+ VPXOR Y7, Y3, Y3
+ VPSHUFB ·rol8<>+0(SB), Y3, Y3
+ VPADDD Y3, Y15, Y15
+ VPXOR Y15, Y11, Y11
+ VPSLLD $0x07, Y11, Y13
+ VPSRLD $0x19, Y11, Y11
+ VPXOR Y13, Y11, Y11
+ VMOVDQA 224(BP), Y13
+ VPALIGNR $0x04, Y14, Y14, Y14
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x0c, Y4, Y4, Y4
+ VPALIGNR $0x04, Y9, Y9, Y9
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x0c, Y1, Y1, Y1
+ VPALIGNR $0x04, Y10, Y10, Y10
+ VPALIGNR $0x08, Y8, Y8, Y8
+ VPALIGNR $0x0c, Y2, Y2, Y2
+ VPALIGNR $0x04, Y11, Y11, Y11
+ VPALIGNR $0x08, Y15, Y15, Y15
+ VPALIGNR $0x0c, Y3, Y3, Y3
+ VMOVDQA Y15, 224(BP)
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x0c, Y14, Y15
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y15, Y14, Y14
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x07, Y14, Y15
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y15, Y14, Y14
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x0c, Y9, Y15
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y15, Y9, Y9
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x07, Y9, Y15
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y15, Y9, Y9
+ VPADDD Y10, Y6, Y6
+ VPXOR Y6, Y2, Y2
+ VPSHUFB ·rol16<>+0(SB), Y2, Y2
+ VPADDD Y2, Y8, Y8
+ VPXOR Y8, Y10, Y10
+ VPSLLD $0x0c, Y10, Y15
+ VPSRLD $0x14, Y10, Y10
+ VPXOR Y15, Y10, Y10
+ VPADDD Y10, Y6, Y6
+ VPXOR Y6, Y2, Y2
+ VPSHUFB ·rol8<>+0(SB), Y2, Y2
+ VPADDD Y2, Y8, Y8
+ VPXOR Y8, Y10, Y10
+ VPSLLD $0x07, Y10, Y15
+ VPSRLD $0x19, Y10, Y10
+ VPXOR Y15, Y10, Y10
+ VMOVDQA 224(BP), Y15
+ VMOVDQA Y13, 224(BP)
+ VPADDD Y11, Y7, Y7
+ VPXOR Y7, Y3, Y3
+ VPSHUFB ·rol16<>+0(SB), Y3, Y3
+ VPADDD Y3, Y15, Y15
+ VPXOR Y15, Y11, Y11
+ VPSLLD $0x0c, Y11, Y13
+ VPSRLD $0x14, Y11, Y11
+ VPXOR Y13, Y11, Y11
+ VPADDD Y11, Y7, Y7
+ VPXOR Y7, Y3, Y3
+ VPSHUFB ·rol8<>+0(SB), Y3, Y3
+ VPADDD Y3, Y15, Y15
+ VPXOR Y15, Y11, Y11
+ VPSLLD $0x07, Y11, Y13
+ VPSRLD $0x19, Y11, Y11
+ VPXOR Y13, Y11, Y11
+ VMOVDQA 224(BP), Y13
+ VPALIGNR $0x0c, Y14, Y14, Y14
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x04, Y4, Y4, Y4
+ VPALIGNR $0x0c, Y9, Y9, Y9
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x04, Y1, Y1, Y1
+ VPALIGNR $0x0c, Y10, Y10, Y10
+ VPALIGNR $0x08, Y8, Y8, Y8
+ VPALIGNR $0x04, Y2, Y2, Y2
+ VPALIGNR $0x0c, Y11, Y11, Y11
+ VPALIGNR $0x08, Y15, Y15, Y15
+ VPALIGNR $0x04, Y3, Y3, Y3
+ VPADDD Y14, Y0, Y0
+ VPADDD Y9, Y5, Y5
+ VPADDD Y10, Y6, Y6
+ VPADDD Y11, Y7, Y7
+ VPXOR Y0, Y4, Y4
+ VPXOR Y5, Y1, Y1
+ VPXOR Y6, Y2, Y2
+ VPXOR Y7, Y3, Y3
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y2, Y2
+ VPSHUFB ·rol16<>+0(SB), Y3, Y3
+ VPADDD Y4, Y12, Y12
+ VPADDD Y1, Y13, Y13
+ VPADDD Y2, Y8, Y8
+ VPADDD Y3, Y15, Y15
+ VPXOR Y12, Y14, Y14
+ VPXOR Y13, Y9, Y9
+ VPXOR Y8, Y10, Y10
+ VPXOR Y15, Y11, Y11
+ VMOVDQA Y15, 224(BP)
+ VPSLLD $0x0c, Y14, Y15
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y15, Y14, Y14
+ VPSLLD $0x0c, Y9, Y15
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y15, Y9, Y9
+ VPSLLD $0x0c, Y10, Y15
+ VPSRLD $0x14, Y10, Y10
+ VPXOR Y15, Y10, Y10
+ VPSLLD $0x0c, Y11, Y15
+ VPSRLD $0x14, Y11, Y11
+ VPXOR Y15, Y11, Y11
+ VMOVDQA 224(BP), Y15
+ SUBQ $0x10, DI
+ MOVQ $0x00000009, CX
+ JMP sealAVX2InternalLoopStart
sealAVX2MainLoop:
- // Load state, increment counter blocks, store the incremented counters
- VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
- VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
- VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
- VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
- VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
- MOVQ $10, itr1
+ VMOVDQU ·chacha20Constants<>+0(SB), Y0
+ VMOVDQA Y0, Y5
+ VMOVDQA Y0, Y6
+ VMOVDQA Y0, Y7
+ VMOVDQA 32(BP), Y14
+ VMOVDQA Y14, Y9
+ VMOVDQA Y14, Y10
+ VMOVDQA Y14, Y11
+ VMOVDQA 64(BP), Y12
+ VMOVDQA Y12, Y13
+ VMOVDQA Y12, Y8
+ VMOVDQA Y12, Y15
+ VMOVDQA 192(BP), Y4
+ VPADDD ·avx2IncMask<>+0(SB), Y4, Y4
+ VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
+ VPADDD ·avx2IncMask<>+0(SB), Y1, Y2
+ VPADDD ·avx2IncMask<>+0(SB), Y2, Y3
+ VMOVDQA Y4, 96(BP)
+ VMOVDQA Y1, 128(BP)
+ VMOVDQA Y2, 160(BP)
+ VMOVDQA Y3, 192(BP)
+ MOVQ $0x0000000a, CX
sealAVX2InternalLoop:
- polyAdd(0*8(oup))
- VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
- polyMulStage1_AVX2
- VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
- VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
- polyMulStage2_AVX2
- VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
- VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
- polyMulStage3_AVX2
- VMOVDQA CC3, tmpStoreAVX2
- VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
- VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
- VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
- VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
- VMOVDQA tmpStoreAVX2, CC3
- polyMulReduceStage
+ ADDQ (DI), R10
+ ADCQ 8(DI), R11
+ ADCQ $0x01, R12
+ VPADDD Y14, Y0, Y0
+ VPADDD Y9, Y5, Y5
+ VPADDD Y10, Y6, Y6
+ VPADDD Y11, Y7, Y7
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ VPXOR Y0, Y4, Y4
+ VPXOR Y5, Y1, Y1
+ VPXOR Y6, Y2, Y2
+ VPXOR Y7, Y3, Y3
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y2, Y2
+ VPSHUFB ·rol16<>+0(SB), Y3, Y3
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ VPADDD Y4, Y12, Y12
+ VPADDD Y1, Y13, Y13
+ VPADDD Y2, Y8, Y8
+ VPADDD Y3, Y15, Y15
+ VPXOR Y12, Y14, Y14
+ VPXOR Y13, Y9, Y9
+ VPXOR Y8, Y10, Y10
+ VPXOR Y15, Y11, Y11
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ VMOVDQA Y15, 224(BP)
+ VPSLLD $0x0c, Y14, Y15
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y15, Y14, Y14
+ VPSLLD $0x0c, Y9, Y15
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y15, Y9, Y9
+ VPSLLD $0x0c, Y10, Y15
+ VPSRLD $0x14, Y10, Y10
+ VPXOR Y15, Y10, Y10
+ VPSLLD $0x0c, Y11, Y15
+ VPSRLD $0x14, Y11, Y11
+ VPXOR Y15, Y11, Y11
+ VMOVDQA 224(BP), Y15
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
sealAVX2InternalLoopStart:
- VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
- VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
- VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
- polyAdd(2*8(oup))
- VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
- polyMulStage1_AVX2
- VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
- VMOVDQA CC3, tmpStoreAVX2
- VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
- VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
- VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
- VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
- VMOVDQA tmpStoreAVX2, CC3
- polyMulStage2_AVX2
- VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
- VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
- VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
- VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
- polyMulStage3_AVX2
- VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
- VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
- polyMulReduceStage
- VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
- VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
- polyAdd(4*8(oup))
- LEAQ (6*8)(oup), oup
- VMOVDQA CC3, tmpStoreAVX2
- VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
- VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
- VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
- VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
- VMOVDQA tmpStoreAVX2, CC3
- polyMulStage1_AVX2
- VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
- VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
- polyMulStage2_AVX2
- VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
- VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
- polyMulStage3_AVX2
- VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
- VMOVDQA CC3, tmpStoreAVX2
- VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
- VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
- VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
- VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
- VMOVDQA tmpStoreAVX2, CC3
- polyMulReduceStage
- VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
- VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
- VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
- DECQ itr1
+ VPADDD Y14, Y0, Y0
+ VPADDD Y9, Y5, Y5
+ VPADDD Y10, Y6, Y6
+ VPADDD Y11, Y7, Y7
+ VPXOR Y0, Y4, Y4
+ VPXOR Y5, Y1, Y1
+ VPXOR Y6, Y2, Y2
+ VPXOR Y7, Y3, Y3
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y2, Y2
+ VPSHUFB ·rol8<>+0(SB), Y3, Y3
+ ADDQ 16(DI), R10
+ ADCQ 24(DI), R11
+ ADCQ $0x01, R12
+ VPADDD Y4, Y12, Y12
+ VPADDD Y1, Y13, Y13
+ VPADDD Y2, Y8, Y8
+ VPADDD Y3, Y15, Y15
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ VPXOR Y12, Y14, Y14
+ VPXOR Y13, Y9, Y9
+ VPXOR Y8, Y10, Y10
+ VPXOR Y15, Y11, Y11
+ VMOVDQA Y15, 224(BP)
+ VPSLLD $0x07, Y14, Y15
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y15, Y14, Y14
+ VPSLLD $0x07, Y9, Y15
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y15, Y9, Y9
+ VPSLLD $0x07, Y10, Y15
+ VPSRLD $0x19, Y10, Y10
+ VPXOR Y15, Y10, Y10
+ VPSLLD $0x07, Y11, Y15
+ VPSRLD $0x19, Y11, Y11
+ VPXOR Y15, Y11, Y11
+ VMOVDQA 224(BP), Y15
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ VPALIGNR $0x04, Y14, Y14, Y14
+ VPALIGNR $0x04, Y9, Y9, Y9
+ VPALIGNR $0x04, Y10, Y10, Y10
+ VPALIGNR $0x04, Y11, Y11, Y11
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x08, Y8, Y8, Y8
+ VPALIGNR $0x08, Y15, Y15, Y15
+ VPALIGNR $0x0c, Y4, Y4, Y4
+ VPALIGNR $0x0c, Y1, Y1, Y1
+ VPALIGNR $0x0c, Y2, Y2, Y2
+ VPALIGNR $0x0c, Y3, Y3, Y3
+ VPADDD Y14, Y0, Y0
+ VPADDD Y9, Y5, Y5
+ VPADDD Y10, Y6, Y6
+ VPADDD Y11, Y7, Y7
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ VPXOR Y0, Y4, Y4
+ VPXOR Y5, Y1, Y1
+ VPXOR Y6, Y2, Y2
+ VPXOR Y7, Y3, Y3
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y2, Y2
+ VPSHUFB ·rol16<>+0(SB), Y3, Y3
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ VPADDD Y4, Y12, Y12
+ VPADDD Y1, Y13, Y13
+ VPADDD Y2, Y8, Y8
+ VPADDD Y3, Y15, Y15
+ VPXOR Y12, Y14, Y14
+ VPXOR Y13, Y9, Y9
+ VPXOR Y8, Y10, Y10
+ VPXOR Y15, Y11, Y11
+ ADDQ 32(DI), R10
+ ADCQ 40(DI), R11
+ ADCQ $0x01, R12
+ LEAQ 48(DI), DI
+ VMOVDQA Y15, 224(BP)
+ VPSLLD $0x0c, Y14, Y15
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y15, Y14, Y14
+ VPSLLD $0x0c, Y9, Y15
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y15, Y9, Y9
+ VPSLLD $0x0c, Y10, Y15
+ VPSRLD $0x14, Y10, Y10
+ VPXOR Y15, Y10, Y10
+ VPSLLD $0x0c, Y11, Y15
+ VPSRLD $0x14, Y11, Y11
+ VPXOR Y15, Y11, Y11
+ VMOVDQA 224(BP), Y15
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ VPADDD Y14, Y0, Y0
+ VPADDD Y9, Y5, Y5
+ VPADDD Y10, Y6, Y6
+ VPADDD Y11, Y7, Y7
+ VPXOR Y0, Y4, Y4
+ VPXOR Y5, Y1, Y1
+ VPXOR Y6, Y2, Y2
+ VPXOR Y7, Y3, Y3
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y2, Y2
+ VPSHUFB ·rol8<>+0(SB), Y3, Y3
+ VPADDD Y4, Y12, Y12
+ VPADDD Y1, Y13, Y13
+ VPADDD Y2, Y8, Y8
+ VPADDD Y3, Y15, Y15
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ VPXOR Y12, Y14, Y14
+ VPXOR Y13, Y9, Y9
+ VPXOR Y8, Y10, Y10
+ VPXOR Y15, Y11, Y11
+ VMOVDQA Y15, 224(BP)
+ VPSLLD $0x07, Y14, Y15
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y15, Y14, Y14
+ VPSLLD $0x07, Y9, Y15
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y15, Y9, Y9
+ VPSLLD $0x07, Y10, Y15
+ VPSRLD $0x19, Y10, Y10
+ VPXOR Y15, Y10, Y10
+ VPSLLD $0x07, Y11, Y15
+ VPSRLD $0x19, Y11, Y11
+ VPXOR Y15, Y11, Y11
+ VMOVDQA 224(BP), Y15
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ VPALIGNR $0x0c, Y14, Y14, Y14
+ VPALIGNR $0x0c, Y9, Y9, Y9
+ VPALIGNR $0x0c, Y10, Y10, Y10
+ VPALIGNR $0x0c, Y11, Y11, Y11
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x08, Y8, Y8, Y8
+ VPALIGNR $0x08, Y15, Y15, Y15
+ VPALIGNR $0x04, Y4, Y4, Y4
+ VPALIGNR $0x04, Y1, Y1, Y1
+ VPALIGNR $0x04, Y2, Y2, Y2
+ VPALIGNR $0x04, Y3, Y3, Y3
+ DECQ CX
JNE sealAVX2InternalLoop
-
- VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
- VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
- VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
- VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
- VMOVDQA CC3, tmpStoreAVX2
+ VPADDD ·chacha20Constants<>+0(SB), Y0, Y0
+ VPADDD ·chacha20Constants<>+0(SB), Y5, Y5
+ VPADDD ·chacha20Constants<>+0(SB), Y6, Y6
+ VPADDD ·chacha20Constants<>+0(SB), Y7, Y7
+ VPADDD 32(BP), Y14, Y14
+ VPADDD 32(BP), Y9, Y9
+ VPADDD 32(BP), Y10, Y10
+ VPADDD 32(BP), Y11, Y11
+ VPADDD 64(BP), Y12, Y12
+ VPADDD 64(BP), Y13, Y13
+ VPADDD 64(BP), Y8, Y8
+ VPADDD 64(BP), Y15, Y15
+ VPADDD 96(BP), Y4, Y4
+ VPADDD 128(BP), Y1, Y1
+ VPADDD 160(BP), Y2, Y2
+ VPADDD 192(BP), Y3, Y3
+ VMOVDQA Y15, 224(BP)
// We only hashed 480 of the 512 bytes available - hash the remaining 32 here
- polyAdd(0*8(oup))
- polyMulAVX2
- LEAQ (4*8)(oup), oup
- VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0
- VPXOR (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0
- VMOVDQU CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup)
- VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
- VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
- VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
+ ADDQ (DI), R10
+ ADCQ 8(DI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ LEAQ 32(DI), DI
+ VPERM2I128 $0x02, Y0, Y14, Y15
+ VPERM2I128 $0x13, Y0, Y14, Y14
+ VPERM2I128 $0x02, Y12, Y4, Y0
+ VPERM2I128 $0x13, Y12, Y4, Y12
+ VPXOR (SI), Y15, Y15
+ VPXOR 32(SI), Y0, Y0
+ VPXOR 64(SI), Y14, Y14
+ VPXOR 96(SI), Y12, Y12
+ VMOVDQU Y15, (DI)
+ VMOVDQU Y0, 32(DI)
+ VMOVDQU Y14, 64(DI)
+ VMOVDQU Y12, 96(DI)
+ VPERM2I128 $0x02, Y5, Y9, Y0
+ VPERM2I128 $0x02, Y13, Y1, Y14
+ VPERM2I128 $0x13, Y5, Y9, Y12
+ VPERM2I128 $0x13, Y13, Y1, Y4
+ VPXOR 128(SI), Y0, Y0
+ VPXOR 160(SI), Y14, Y14
+ VPXOR 192(SI), Y12, Y12
+ VPXOR 224(SI), Y4, Y4
+ VMOVDQU Y0, 128(DI)
+ VMOVDQU Y14, 160(DI)
+ VMOVDQU Y12, 192(DI)
+ VMOVDQU Y4, 224(DI)
// and here
- polyAdd(-2*8(oup))
- polyMulAVX2
- VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
- VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
- VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
- VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
- VPXOR (12*32)(inp), AA0, AA0; VPXOR (13*32)(inp), BB0, BB0; VPXOR (14*32)(inp), CC0, CC0; VPXOR (15*32)(inp), DD0, DD0
- VMOVDQU AA0, (12*32)(oup); VMOVDQU BB0, (13*32)(oup); VMOVDQU CC0, (14*32)(oup); VMOVDQU DD0, (15*32)(oup)
- LEAQ (32*16)(inp), inp
- SUBQ $(32*16), inl
- CMPQ inl, $512
+ ADDQ -16(DI), R10
+ ADCQ -8(DI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ VPERM2I128 $0x02, Y6, Y10, Y0
+ VPERM2I128 $0x02, Y8, Y2, Y14
+ VPERM2I128 $0x13, Y6, Y10, Y12
+ VPERM2I128 $0x13, Y8, Y2, Y4
+ VPXOR 256(SI), Y0, Y0
+ VPXOR 288(SI), Y14, Y14
+ VPXOR 320(SI), Y12, Y12
+ VPXOR 352(SI), Y4, Y4
+ VMOVDQU Y0, 256(DI)
+ VMOVDQU Y14, 288(DI)
+ VMOVDQU Y12, 320(DI)
+ VMOVDQU Y4, 352(DI)
+ VPERM2I128 $0x02, Y7, Y11, Y0
+ VPERM2I128 $0x02, 224(BP), Y3, Y14
+ VPERM2I128 $0x13, Y7, Y11, Y12
+ VPERM2I128 $0x13, 224(BP), Y3, Y4
+ VPXOR 384(SI), Y0, Y0
+ VPXOR 416(SI), Y14, Y14
+ VPXOR 448(SI), Y12, Y12
+ VPXOR 480(SI), Y4, Y4
+ VMOVDQU Y0, 384(DI)
+ VMOVDQU Y14, 416(DI)
+ VMOVDQU Y12, 448(DI)
+ VMOVDQU Y4, 480(DI)
+ LEAQ 512(SI), SI
+ SUBQ $0x00000200, BX
+ CMPQ BX, $0x00000200
JG sealAVX2MainLoop
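
In the AVX2 main loop above, the scalar MULXQ/IMULQ/ADCQ runs are the BMI2 form of the same Poly1305 update, split into stages and interleaved with the vector instructions so the multiplies overlap the ChaCha20 rounds. Per 16-byte block the arithmetic is simply h = (h + m + 2^128) * r mod 2^130 - 5; a whole-number Go sketch with illustrative values:

package main

import (
	"fmt"
	"math/big"
)

// p is the Poly1305 prime 2^130 - 5.
var p = new(big.Int).Sub(new(big.Int).Lsh(big.NewInt(1), 130), big.NewInt(5))

// polyStep is the whole-number view of one Poly1305 block update: the
// accumulator picks up the 16-byte block m plus the 2^128 pad bit (the
// ADCQ $0x01 in the assembly) and is multiplied by the clamped key half r.
func polyStep(h, r, m *big.Int) *big.Int {
	acc := new(big.Int).Add(h, m)
	acc.Add(acc, new(big.Int).Lsh(big.NewInt(1), 128))
	acc.Mul(acc, r)
	return acc.Mod(acc, p)
}

func main() {
	// Purely illustrative values, not a real clamped key.
	h := big.NewInt(0)
	r := new(big.Int).SetUint64(0x0806050403020100)
	m := new(big.Int).SetUint64(0x42)
	fmt.Println(polyStep(h, r, m))
}
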
// Tail can only hash 480 bytes
- polyAdd(0*8(oup))
- polyMulAVX2
- polyAdd(2*8(oup))
- polyMulAVX2
- LEAQ 32(oup), oup
+ ADDQ (DI), R10
+ ADCQ 8(DI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ ADDQ 16(DI), R10
+ ADCQ 24(DI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ LEAQ 32(DI), DI
+ MOVQ $0x0000000a, CX
+ MOVQ $0x00000000, R9
+ CMPQ BX, $0x80
+ JBE sealAVX2Tail128
+ CMPQ BX, $0x00000100
+ JBE sealAVX2Tail256
+ CMPQ BX, $0x00000180
+ JBE sealAVX2Tail384
+ JMP sealAVX2Tail512
- MOVQ $10, itr1
- MOVQ $0, itr2
- CMPQ inl, $128
- JBE sealAVX2Tail128
- CMPQ inl, $256
- JBE sealAVX2Tail256
- CMPQ inl, $384
- JBE sealAVX2Tail384
- JMP sealAVX2Tail512
-
-// ----------------------------------------------------------------------------
-// Special optimization for buffers smaller than 193 bytes
seal192AVX2:
- // For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks
- VMOVDQA AA0, AA1
- VMOVDQA BB0, BB1
- VMOVDQA CC0, CC1
- VPADDD ·avx2IncMask<>(SB), DD0, DD1
- VMOVDQA AA0, AA2
- VMOVDQA BB0, BB2
- VMOVDQA CC0, CC2
- VMOVDQA DD0, DD2
- VMOVDQA DD1, TT3
- MOVQ $10, itr2
+ VMOVDQA Y0, Y5
+ VMOVDQA Y14, Y9
+ VMOVDQA Y12, Y13
+ VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
+ VMOVDQA Y0, Y6
+ VMOVDQA Y14, Y10
+ VMOVDQA Y12, Y8
+ VMOVDQA Y4, Y2
+ VMOVDQA Y1, Y15
+ MOVQ $0x0000000a, R9
sealAVX2192InnerCipherLoop:
- chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
- VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
- VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
- VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
- chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
- VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
- VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
- VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
- DECQ itr2
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x0c, Y14, Y3
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x07, Y14, Y3
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x0c, Y9, Y3
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x07, Y9, Y3
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPALIGNR $0x04, Y14, Y14, Y14
+ VPALIGNR $0x04, Y9, Y9, Y9
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x0c, Y4, Y4, Y4
+ VPALIGNR $0x0c, Y1, Y1, Y1
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x0c, Y14, Y3
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x07, Y14, Y3
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x0c, Y9, Y3
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x07, Y9, Y3
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPALIGNR $0x0c, Y14, Y14, Y14
+ VPALIGNR $0x0c, Y9, Y9, Y9
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x04, Y4, Y4, Y4
+ VPALIGNR $0x04, Y1, Y1, Y1
+ DECQ R9
JNE sealAVX2192InnerCipherLoop
- VPADDD AA2, AA0, AA0; VPADDD AA2, AA1, AA1
- VPADDD BB2, BB0, BB0; VPADDD BB2, BB1, BB1
- VPADDD CC2, CC0, CC0; VPADDD CC2, CC1, CC1
- VPADDD DD2, DD0, DD0; VPADDD TT3, DD1, DD1
- VPERM2I128 $0x02, AA0, BB0, TT0
+ VPADDD Y6, Y0, Y0
+ VPADDD Y6, Y5, Y5
+ VPADDD Y10, Y14, Y14
+ VPADDD Y10, Y9, Y9
+ VPADDD Y8, Y12, Y12
+ VPADDD Y8, Y13, Y13
+ VPADDD Y2, Y4, Y4
+ VPADDD Y15, Y1, Y1
+ VPERM2I128 $0x02, Y0, Y14, Y3
// Clamp and store poly key
- VPAND ·polyClampMask<>(SB), TT0, TT0
- VMOVDQA TT0, rsStoreAVX2
+ VPAND ·polyClampMask<>+0(SB), Y3, Y3
+ VMOVDQA Y3, (BP)
// Stream for up to 192 bytes
- VPERM2I128 $0x13, AA0, BB0, AA0
- VPERM2I128 $0x13, CC0, DD0, BB0
- VPERM2I128 $0x02, AA1, BB1, CC0
- VPERM2I128 $0x02, CC1, DD1, DD0
- VPERM2I128 $0x13, AA1, BB1, AA1
- VPERM2I128 $0x13, CC1, DD1, BB1
+ VPERM2I128 $0x13, Y0, Y14, Y0
+ VPERM2I128 $0x13, Y12, Y4, Y14
+ VPERM2I128 $0x02, Y5, Y9, Y12
+ VPERM2I128 $0x02, Y13, Y1, Y4
+ VPERM2I128 $0x13, Y5, Y9, Y5
+ VPERM2I128 $0x13, Y13, Y1, Y9
sealAVX2ShortSeal:
// Hash aad
- MOVQ ad_len+80(FP), itr2
+ MOVQ ad_len+80(FP), R9
CALL polyHashADInternal<>(SB)
- XORQ itr1, itr1
+ XORQ CX, CX
sealAVX2SealHash:
// itr1 holds the number of bytes encrypted but not yet hashed
- CMPQ itr1, $16
- JB sealAVX2ShortSealLoop
- polyAdd(0(oup))
- polyMul
- SUBQ $16, itr1
- ADDQ $16, oup
- JMP sealAVX2SealHash
+ CMPQ CX, $0x10
+ JB sealAVX2ShortSealLoop
+ ADDQ (DI), R10
+ ADCQ 8(DI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ SUBQ $0x10, CX
+ ADDQ $0x10, DI
+ JMP sealAVX2SealHash
sealAVX2ShortSealLoop:
- CMPQ inl, $32
+ CMPQ BX, $0x20
JB sealAVX2ShortTail32
- SUBQ $32, inl
+ SUBQ $0x20, BX
// Load for encryption
- VPXOR (inp), AA0, AA0
- VMOVDQU AA0, (oup)
- LEAQ (1*32)(inp), inp
+ VPXOR (SI), Y0, Y0
+ VMOVDQU Y0, (DI)
+ LEAQ 32(SI), SI
// Now can hash
- polyAdd(0*8(oup))
- polyMulAVX2
- polyAdd(2*8(oup))
- polyMulAVX2
- LEAQ (1*32)(oup), oup
+ ADDQ (DI), R10
+ ADCQ 8(DI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ ADDQ 16(DI), R10
+ ADCQ 24(DI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ LEAQ 32(DI), DI
// Shift stream left
- VMOVDQA BB0, AA0
- VMOVDQA CC0, BB0
- VMOVDQA DD0, CC0
- VMOVDQA AA1, DD0
- VMOVDQA BB1, AA1
- VMOVDQA CC1, BB1
- VMOVDQA DD1, CC1
- VMOVDQA AA2, DD1
- VMOVDQA BB2, AA2
+ VMOVDQA Y14, Y0
+ VMOVDQA Y12, Y14
+ VMOVDQA Y4, Y12
+ VMOVDQA Y5, Y4
+ VMOVDQA Y9, Y5
+ VMOVDQA Y13, Y9
+ VMOVDQA Y1, Y13
+ VMOVDQA Y6, Y1
+ VMOVDQA Y10, Y6
JMP sealAVX2ShortSealLoop
sealAVX2ShortTail32:
- CMPQ inl, $16
- VMOVDQA A0, A1
+ CMPQ BX, $0x10
+ VMOVDQA X0, X1
JB sealAVX2ShortDone
-
- SUBQ $16, inl
+ SUBQ $0x10, BX
// Load for encryption
- VPXOR (inp), A0, T0
- VMOVDQU T0, (oup)
- LEAQ (1*16)(inp), inp
+ VPXOR (SI), X0, X12
+ VMOVDQU X12, (DI)
+ LEAQ 16(SI), SI
// Hash
- polyAdd(0*8(oup))
- polyMulAVX2
- LEAQ (1*16)(oup), oup
- VPERM2I128 $0x11, AA0, AA0, AA0
- VMOVDQA A0, A1
+ ADDQ (DI), R10
+ ADCQ 8(DI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ LEAQ 16(DI), DI
+ VPERM2I128 $0x11, Y0, Y0, Y0
+ VMOVDQA X0, X1
sealAVX2ShortDone:
VZEROUPPER
JMP sealSSETail
-// ----------------------------------------------------------------------------
-// Special optimization for buffers smaller than 321 bytes
seal320AVX2:
- // For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks
- VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD ·avx2IncMask<>(SB), DD0, DD1
- VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD ·avx2IncMask<>(SB), DD1, DD2
- VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3
- MOVQ $10, itr2
+ VMOVDQA Y0, Y5
+ VMOVDQA Y14, Y9
+ VMOVDQA Y12, Y13
+ VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
+ VMOVDQA Y0, Y6
+ VMOVDQA Y14, Y10
+ VMOVDQA Y12, Y8
+ VPADDD ·avx2IncMask<>+0(SB), Y1, Y2
+ VMOVDQA Y14, Y7
+ VMOVDQA Y12, Y11
+ VMOVDQA Y4, Y15
+ MOVQ $0x0000000a, R9
sealAVX2320InnerCipherLoop:
- chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
- VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
- VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
- VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
- chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
- VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
- VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
- VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
- DECQ itr2
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x0c, Y14, Y3
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x07, Y14, Y3
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x0c, Y9, Y3
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x07, Y9, Y3
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPADDD Y10, Y6, Y6
+ VPXOR Y6, Y2, Y2
+ VPSHUFB ·rol16<>+0(SB), Y2, Y2
+ VPADDD Y2, Y8, Y8
+ VPXOR Y8, Y10, Y10
+ VPSLLD $0x0c, Y10, Y3
+ VPSRLD $0x14, Y10, Y10
+ VPXOR Y3, Y10, Y10
+ VPADDD Y10, Y6, Y6
+ VPXOR Y6, Y2, Y2
+ VPSHUFB ·rol8<>+0(SB), Y2, Y2
+ VPADDD Y2, Y8, Y8
+ VPXOR Y8, Y10, Y10
+ VPSLLD $0x07, Y10, Y3
+ VPSRLD $0x19, Y10, Y10
+ VPXOR Y3, Y10, Y10
+ VPALIGNR $0x04, Y14, Y14, Y14
+ VPALIGNR $0x04, Y9, Y9, Y9
+ VPALIGNR $0x04, Y10, Y10, Y10
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x08, Y8, Y8, Y8
+ VPALIGNR $0x0c, Y4, Y4, Y4
+ VPALIGNR $0x0c, Y1, Y1, Y1
+ VPALIGNR $0x0c, Y2, Y2, Y2
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x0c, Y14, Y3
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x07, Y14, Y3
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x0c, Y9, Y3
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x07, Y9, Y3
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPADDD Y10, Y6, Y6
+ VPXOR Y6, Y2, Y2
+ VPSHUFB ·rol16<>+0(SB), Y2, Y2
+ VPADDD Y2, Y8, Y8
+ VPXOR Y8, Y10, Y10
+ VPSLLD $0x0c, Y10, Y3
+ VPSRLD $0x14, Y10, Y10
+ VPXOR Y3, Y10, Y10
+ VPADDD Y10, Y6, Y6
+ VPXOR Y6, Y2, Y2
+ VPSHUFB ·rol8<>+0(SB), Y2, Y2
+ VPADDD Y2, Y8, Y8
+ VPXOR Y8, Y10, Y10
+ VPSLLD $0x07, Y10, Y3
+ VPSRLD $0x19, Y10, Y10
+ VPXOR Y3, Y10, Y10
+ VPALIGNR $0x0c, Y14, Y14, Y14
+ VPALIGNR $0x0c, Y9, Y9, Y9
+ VPALIGNR $0x0c, Y10, Y10, Y10
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x08, Y8, Y8, Y8
+ VPALIGNR $0x04, Y4, Y4, Y4
+ VPALIGNR $0x04, Y1, Y1, Y1
+ VPALIGNR $0x04, Y2, Y2, Y2
+ DECQ R9
JNE sealAVX2320InnerCipherLoop
-
- VMOVDQA ·chacha20Constants<>(SB), TT0
- VPADDD TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2
- VPADDD TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2
- VPADDD TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2
- VMOVDQA ·avx2IncMask<>(SB), TT0
- VPADDD TT3, DD0, DD0; VPADDD TT0, TT3, TT3
- VPADDD TT3, DD1, DD1; VPADDD TT0, TT3, TT3
- VPADDD TT3, DD2, DD2
+ VMOVDQA ·chacha20Constants<>+0(SB), Y3
+ VPADDD Y3, Y0, Y0
+ VPADDD Y3, Y5, Y5
+ VPADDD Y3, Y6, Y6
+ VPADDD Y7, Y14, Y14
+ VPADDD Y7, Y9, Y9
+ VPADDD Y7, Y10, Y10
+ VPADDD Y11, Y12, Y12
+ VPADDD Y11, Y13, Y13
+ VPADDD Y11, Y8, Y8
+ VMOVDQA ·avx2IncMask<>+0(SB), Y3
+ VPADDD Y15, Y4, Y4
+ VPADDD Y3, Y15, Y15
+ VPADDD Y15, Y1, Y1
+ VPADDD Y3, Y15, Y15
+ VPADDD Y15, Y2, Y2
// Clamp and store poly key
- VPERM2I128 $0x02, AA0, BB0, TT0
- VPAND ·polyClampMask<>(SB), TT0, TT0
- VMOVDQA TT0, rsStoreAVX2
+ VPERM2I128 $0x02, Y0, Y14, Y3
+ VPAND ·polyClampMask<>+0(SB), Y3, Y3
+ VMOVDQA Y3, (BP)
// Stream for up to 320 bytes
- VPERM2I128 $0x13, AA0, BB0, AA0
- VPERM2I128 $0x13, CC0, DD0, BB0
- VPERM2I128 $0x02, AA1, BB1, CC0
- VPERM2I128 $0x02, CC1, DD1, DD0
- VPERM2I128 $0x13, AA1, BB1, AA1
- VPERM2I128 $0x13, CC1, DD1, BB1
- VPERM2I128 $0x02, AA2, BB2, CC1
- VPERM2I128 $0x02, CC2, DD2, DD1
- VPERM2I128 $0x13, AA2, BB2, AA2
- VPERM2I128 $0x13, CC2, DD2, BB2
+ VPERM2I128 $0x13, Y0, Y14, Y0
+ VPERM2I128 $0x13, Y12, Y4, Y14
+ VPERM2I128 $0x02, Y5, Y9, Y12
+ VPERM2I128 $0x02, Y13, Y1, Y4
+ VPERM2I128 $0x13, Y5, Y9, Y5
+ VPERM2I128 $0x13, Y13, Y1, Y9
+ VPERM2I128 $0x02, Y6, Y10, Y13
+ VPERM2I128 $0x02, Y8, Y2, Y1
+ VPERM2I128 $0x13, Y6, Y10, Y6
+ VPERM2I128 $0x13, Y8, Y2, Y10
JMP sealAVX2ShortSeal
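The "Clamp and store poly key" step above masks the first half of the keystream with ·polyClampMask<> and stores the result at (BP), where the scalar Poly1305 code later reads it back as (r0, r1) and (s0, s1). A minimal sketch of that clamping as defined by RFC 8439 (assumed layout: little-endian limbs, r in the first 16 bytes, s in the next 16):

package sketch

import "encoding/binary"

// clampPolyKey mirrors the VPAND ·polyClampMask<> above: the first 16 bytes
// of ChaCha20 keystream become the Poly1305 "r" limbs with the bits required
// by RFC 8439 cleared; the next 16 bytes are used verbatim as "s".
func clampPolyKey(stream [32]byte) (r0, r1, s0, s1 uint64) {
	r0 = binary.LittleEndian.Uint64(stream[0:8]) & 0x0ffffffc0fffffff
	r1 = binary.LittleEndian.Uint64(stream[8:16]) & 0x0ffffffc0ffffffc
	s0 = binary.LittleEndian.Uint64(stream[16:24])
	s1 = binary.LittleEndian.Uint64(stream[24:32])
	return
}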
-// ----------------------------------------------------------------------------
-// Special optimization for the last 128 bytes of ciphertext
sealAVX2Tail128:
- // Need to decrypt up to 128 bytes - prepare two blocks
- // If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
- // If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
- VMOVDQA ·chacha20Constants<>(SB), AA0
- VMOVDQA state1StoreAVX2, BB0
- VMOVDQA state2StoreAVX2, CC0
- VMOVDQA ctr3StoreAVX2, DD0
- VPADDD ·avx2IncMask<>(SB), DD0, DD0
- VMOVDQA DD0, DD1
+ VMOVDQA ·chacha20Constants<>+0(SB), Y0
+ VMOVDQA 32(BP), Y14
+ VMOVDQA 64(BP), Y12
+ VMOVDQA 192(BP), Y4
+ VPADDD ·avx2IncMask<>+0(SB), Y4, Y4
+ VMOVDQA Y4, Y1
sealAVX2Tail128LoopA:
- polyAdd(0(oup))
- polyMul
- LEAQ 16(oup), oup
+ ADDQ (DI), R10
+ ADCQ 8(DI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ LEAQ 16(DI), DI
sealAVX2Tail128LoopB:
- chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
- polyAdd(0(oup))
- polyMul
- VPALIGNR $4, BB0, BB0, BB0
- VPALIGNR $8, CC0, CC0, CC0
- VPALIGNR $12, DD0, DD0, DD0
- chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
- polyAdd(16(oup))
- polyMul
- LEAQ 32(oup), oup
- VPALIGNR $12, BB0, BB0, BB0
- VPALIGNR $8, CC0, CC0, CC0
- VPALIGNR $4, DD0, DD0, DD0
- DECQ itr1
- JG sealAVX2Tail128LoopA
- DECQ itr2
- JGE sealAVX2Tail128LoopB
-
- VPADDD ·chacha20Constants<>(SB), AA0, AA1
- VPADDD state1StoreAVX2, BB0, BB1
- VPADDD state2StoreAVX2, CC0, CC1
- VPADDD DD1, DD0, DD1
-
- VPERM2I128 $0x02, AA1, BB1, AA0
- VPERM2I128 $0x02, CC1, DD1, BB0
- VPERM2I128 $0x13, AA1, BB1, CC0
- VPERM2I128 $0x13, CC1, DD1, DD0
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x0c, Y14, Y3
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x07, Y14, Y3
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ ADDQ (DI), R10
+ ADCQ 8(DI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ VPALIGNR $0x04, Y14, Y14, Y14
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x0c, Y4, Y4, Y4
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x0c, Y14, Y3
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x07, Y14, Y3
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ ADDQ 16(DI), R10
+ ADCQ 24(DI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ LEAQ 32(DI), DI
+ VPALIGNR $0x0c, Y14, Y14, Y14
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x04, Y4, Y4, Y4
+ DECQ CX
+ JG sealAVX2Tail128LoopA
+ DECQ R9
+ JGE sealAVX2Tail128LoopB
+ VPADDD ·chacha20Constants<>+0(SB), Y0, Y5
+ VPADDD 32(BP), Y14, Y9
+ VPADDD 64(BP), Y12, Y13
+ VPADDD Y1, Y4, Y1
+ VPERM2I128 $0x02, Y5, Y9, Y0
+ VPERM2I128 $0x02, Y13, Y1, Y14
+ VPERM2I128 $0x13, Y5, Y9, Y12
+ VPERM2I128 $0x13, Y13, Y1, Y4
JMP sealAVX2ShortSealLoop
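The long ADDQ/MULQ/ADCQ runs interleaved with the vector code above are the polyAdd/polyMul macros expanded by the generator: each one absorbs 16 bytes of ciphertext at (DI) into the Poly1305 accumulator held in R10-R12 and multiplies by the clamped key at (BP)/8(BP). A Go sketch of that arithmetic, with the limb layout taken from the registers used above (an illustration, not the generated code):

package sketch

import "math/bits"

// polyStep absorbs one 16-byte block (m1:m0) into the accumulator (h2:h1:h0)
// and multiplies by the clamped key half (r1:r0), reducing mod 2^130 - 5.
// h0, h1, h2 model R10, R11, R12; r0, r1 model (BP), 8(BP).
func polyStep(h0, h1, h2, r0, r1, m0, m1 uint64) (uint64, uint64, uint64) {
	// h += m || 1 (ADDQ/ADCQ/ADCQ $0x01): the pad bit of a full block.
	var c uint64
	h0, c = bits.Add64(h0, m0, 0)
	h1, c = bits.Add64(h1, m1, c)
	h2 += c + 1

	// Schoolbook multiply (the MULQ/IMULQ cascade). Because r is clamped and
	// h2 stays tiny, the h2*r0 and h2*r1 products fit in 64 bits (the IMULQs).
	t1, t0 := bits.Mul64(r0, h0)
	hi, lo := bits.Mul64(r0, h1)
	t1, c = bits.Add64(t1, lo, 0)
	t2 := r0*h2 + hi + c
	hi, lo = bits.Mul64(r1, h0)
	t1, c = bits.Add64(t1, lo, 0)
	tmp := hi + c
	hi, lo = bits.Mul64(r1, h1)
	t2, c = bits.Add64(t2, lo, 0)
	hi += c
	t2, c = bits.Add64(t2, tmp, 0)
	t3 := r1*h2 + hi + c

	// Reduce: the bits above 2^130 are folded back in times 5 (= 4 + 1);
	// the SHRQ $0x02, R8, R15 double shift is the 128-bit shift right by two.
	h0, h1, h2 = t0, t1, t2&3
	h0, c = bits.Add64(h0, t2&^3, 0)
	h1, c = bits.Add64(h1, t3, c)
	h2 += c
	h0, c = bits.Add64(h0, t2>>2|t3<<62, 0)
	h1, c = bits.Add64(h1, t3>>2, c)
	h2 += c
	return h0, h1, h2
}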
-// ----------------------------------------------------------------------------
-// Special optimization for the last 256 bytes of ciphertext
sealAVX2Tail256:
- // Need to decrypt up to 256 bytes - prepare two blocks
- // If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
- // If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
- VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA ·chacha20Constants<>(SB), AA1
- VMOVDQA state1StoreAVX2, BB0; VMOVDQA state1StoreAVX2, BB1
- VMOVDQA state2StoreAVX2, CC0; VMOVDQA state2StoreAVX2, CC1
- VMOVDQA ctr3StoreAVX2, DD0
- VPADDD ·avx2IncMask<>(SB), DD0, DD0
- VPADDD ·avx2IncMask<>(SB), DD0, DD1
- VMOVDQA DD0, TT1
- VMOVDQA DD1, TT2
+ VMOVDQA ·chacha20Constants<>+0(SB), Y0
+ VMOVDQA ·chacha20Constants<>+0(SB), Y5
+ VMOVDQA 32(BP), Y14
+ VMOVDQA 32(BP), Y9
+ VMOVDQA 64(BP), Y12
+ VMOVDQA 64(BP), Y13
+ VMOVDQA 192(BP), Y4
+ VPADDD ·avx2IncMask<>+0(SB), Y4, Y4
+ VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
+ VMOVDQA Y4, Y7
+ VMOVDQA Y1, Y11
sealAVX2Tail256LoopA:
- polyAdd(0(oup))
- polyMul
- LEAQ 16(oup), oup
+ ADDQ (DI), R10
+ ADCQ 8(DI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ LEAQ 16(DI), DI
sealAVX2Tail256LoopB:
- chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
- polyAdd(0(oup))
- polyMul
- VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
- VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
- VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
- chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
- polyAdd(16(oup))
- polyMul
- LEAQ 32(oup), oup
- VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
- VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
- VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
- DECQ itr1
- JG sealAVX2Tail256LoopA
- DECQ itr2
- JGE sealAVX2Tail256LoopB
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x0c, Y14, Y3
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x07, Y14, Y3
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x0c, Y9, Y3
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x07, Y9, Y3
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ ADDQ (DI), R10
+ ADCQ 8(DI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ VPALIGNR $0x04, Y14, Y14, Y14
+ VPALIGNR $0x04, Y9, Y9, Y9
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x0c, Y4, Y4, Y4
+ VPALIGNR $0x0c, Y1, Y1, Y1
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x0c, Y14, Y3
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x07, Y14, Y3
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x0c, Y9, Y3
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x07, Y9, Y3
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ ADDQ 16(DI), R10
+ ADCQ 24(DI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ LEAQ 32(DI), DI
+ VPALIGNR $0x0c, Y14, Y14, Y14
+ VPALIGNR $0x0c, Y9, Y9, Y9
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x04, Y4, Y4, Y4
+ VPALIGNR $0x04, Y1, Y1, Y1
+ DECQ CX
+ JG sealAVX2Tail256LoopA
+ DECQ R9
+ JGE sealAVX2Tail256LoopB
+ VPADDD ·chacha20Constants<>+0(SB), Y0, Y0
+ VPADDD ·chacha20Constants<>+0(SB), Y5, Y5
+ VPADDD 32(BP), Y14, Y14
+ VPADDD 32(BP), Y9, Y9
+ VPADDD 64(BP), Y12, Y12
+ VPADDD 64(BP), Y13, Y13
+ VPADDD Y7, Y4, Y4
+ VPADDD Y11, Y1, Y1
+ VPERM2I128 $0x02, Y0, Y14, Y3
+ VPERM2I128 $0x02, Y12, Y4, Y7
+ VPERM2I128 $0x13, Y0, Y14, Y11
+ VPERM2I128 $0x13, Y12, Y4, Y15
+ VPXOR (SI), Y3, Y3
+ VPXOR 32(SI), Y7, Y7
+ VPXOR 64(SI), Y11, Y11
+ VPXOR 96(SI), Y15, Y15
+ VMOVDQU Y3, (DI)
+ VMOVDQU Y7, 32(DI)
+ VMOVDQU Y11, 64(DI)
+ VMOVDQU Y15, 96(DI)
+ MOVQ $0x00000080, CX
+ LEAQ 128(SI), SI
+ SUBQ $0x80, BX
+ VPERM2I128 $0x02, Y5, Y9, Y0
+ VPERM2I128 $0x02, Y13, Y1, Y14
+ VPERM2I128 $0x13, Y5, Y9, Y12
+ VPERM2I128 $0x13, Y13, Y1, Y4
+ JMP sealAVX2SealHash
- VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1
- VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1
- VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1
- VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1
- VPERM2I128 $0x02, AA0, BB0, TT0
- VPERM2I128 $0x02, CC0, DD0, TT1
- VPERM2I128 $0x13, AA0, BB0, TT2
- VPERM2I128 $0x13, CC0, DD0, TT3
- VPXOR (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3
- VMOVDQU TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup)
- MOVQ $128, itr1
- LEAQ 128(inp), inp
- SUBQ $128, inl
- VPERM2I128 $0x02, AA1, BB1, AA0
- VPERM2I128 $0x02, CC1, DD1, BB0
- VPERM2I128 $0x13, AA1, BB1, CC0
- VPERM2I128 $0x13, CC1, DD1, DD0
-
- JMP sealAVX2SealHash
-
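Throughout the AVX2 seal path each YMM register carries the same ChaCha state row for two consecutive blocks: the low 128-bit lane belongs to block n, the high lane to block n+1. The VPERM2I128 $0x02/$0x13 pairs above regroup those lanes into contiguous 64-byte keystream blocks before the VPXOR with the plaintext and the VMOVDQU store. A small Go model of that regrouping (illustrative only):

package sketch

// lane is one 128-bit half of a YMM register; a ymm holds the same state row
// for two adjacent ChaCha20 blocks.
type lane [16]byte

type ymm struct{ lo, hi lane }

// deinterleave plays the role of the VPERM2I128 $0x02/$0x13 pairs: given the
// four row registers of a double block, it reassembles the two contiguous
// 64-byte keystream blocks that get XORed into the plaintext.
func deinterleave(rows [4]ymm) (block0, block1 [64]byte) {
	for i, r := range rows {
		copy(block0[i*16:], r.lo[:])
		copy(block1[i*16:], r.hi[:])
	}
	return
}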
-// ----------------------------------------------------------------------------
-// Special optimization for the last 384 bytes of ciphertext
sealAVX2Tail384:
- // Need to decrypt up to 384 bytes - prepare two blocks
- // If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
- // If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
- VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2
- VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2
- VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2
- VMOVDQA ctr3StoreAVX2, DD0
- VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2
- VMOVDQA DD0, TT1; VMOVDQA DD1, TT2; VMOVDQA DD2, TT3
+ VMOVDQA ·chacha20Constants<>+0(SB), Y0
+ VMOVDQA Y0, Y5
+ VMOVDQA Y0, Y6
+ VMOVDQA 32(BP), Y14
+ VMOVDQA Y14, Y9
+ VMOVDQA Y14, Y10
+ VMOVDQA 64(BP), Y12
+ VMOVDQA Y12, Y13
+ VMOVDQA Y12, Y8
+ VMOVDQA 192(BP), Y4
+ VPADDD ·avx2IncMask<>+0(SB), Y4, Y4
+ VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
+ VPADDD ·avx2IncMask<>+0(SB), Y1, Y2
+ VMOVDQA Y4, Y7
+ VMOVDQA Y1, Y11
+ VMOVDQA Y2, Y15
sealAVX2Tail384LoopA:
- polyAdd(0(oup))
- polyMul
- LEAQ 16(oup), oup
+ ADDQ (DI), R10
+ ADCQ 8(DI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ LEAQ 16(DI), DI
sealAVX2Tail384LoopB:
- chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
- polyAdd(0(oup))
- polyMul
- VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
- VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
- VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
- chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
- polyAdd(16(oup))
- polyMul
- LEAQ 32(oup), oup
- VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
- VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
- VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
- DECQ itr1
- JG sealAVX2Tail384LoopA
- DECQ itr2
- JGE sealAVX2Tail384LoopB
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x0c, Y14, Y3
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x07, Y14, Y3
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x0c, Y9, Y3
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x07, Y9, Y3
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPADDD Y10, Y6, Y6
+ VPXOR Y6, Y2, Y2
+ VPSHUFB ·rol16<>+0(SB), Y2, Y2
+ VPADDD Y2, Y8, Y8
+ VPXOR Y8, Y10, Y10
+ VPSLLD $0x0c, Y10, Y3
+ VPSRLD $0x14, Y10, Y10
+ VPXOR Y3, Y10, Y10
+ VPADDD Y10, Y6, Y6
+ VPXOR Y6, Y2, Y2
+ VPSHUFB ·rol8<>+0(SB), Y2, Y2
+ VPADDD Y2, Y8, Y8
+ VPXOR Y8, Y10, Y10
+ VPSLLD $0x07, Y10, Y3
+ VPSRLD $0x19, Y10, Y10
+ VPXOR Y3, Y10, Y10
+ ADDQ (DI), R10
+ ADCQ 8(DI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ VPALIGNR $0x04, Y14, Y14, Y14
+ VPALIGNR $0x04, Y9, Y9, Y9
+ VPALIGNR $0x04, Y10, Y10, Y10
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x08, Y8, Y8, Y8
+ VPALIGNR $0x0c, Y4, Y4, Y4
+ VPALIGNR $0x0c, Y1, Y1, Y1
+ VPALIGNR $0x0c, Y2, Y2, Y2
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x0c, Y14, Y3
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y14, Y0, Y0
+ VPXOR Y0, Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPADDD Y4, Y12, Y12
+ VPXOR Y12, Y14, Y14
+ VPSLLD $0x07, Y14, Y3
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y3, Y14, Y14
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x0c, Y9, Y3
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPADDD Y9, Y5, Y5
+ VPXOR Y5, Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPADDD Y1, Y13, Y13
+ VPXOR Y13, Y9, Y9
+ VPSLLD $0x07, Y9, Y3
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y3, Y9, Y9
+ VPADDD Y10, Y6, Y6
+ VPXOR Y6, Y2, Y2
+ VPSHUFB ·rol16<>+0(SB), Y2, Y2
+ VPADDD Y2, Y8, Y8
+ VPXOR Y8, Y10, Y10
+ VPSLLD $0x0c, Y10, Y3
+ VPSRLD $0x14, Y10, Y10
+ VPXOR Y3, Y10, Y10
+ VPADDD Y10, Y6, Y6
+ VPXOR Y6, Y2, Y2
+ VPSHUFB ·rol8<>+0(SB), Y2, Y2
+ VPADDD Y2, Y8, Y8
+ VPXOR Y8, Y10, Y10
+ VPSLLD $0x07, Y10, Y3
+ VPSRLD $0x19, Y10, Y10
+ VPXOR Y3, Y10, Y10
+ ADDQ 16(DI), R10
+ ADCQ 24(DI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ LEAQ 32(DI), DI
+ VPALIGNR $0x0c, Y14, Y14, Y14
+ VPALIGNR $0x0c, Y9, Y9, Y9
+ VPALIGNR $0x0c, Y10, Y10, Y10
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x08, Y8, Y8, Y8
+ VPALIGNR $0x04, Y4, Y4, Y4
+ VPALIGNR $0x04, Y1, Y1, Y1
+ VPALIGNR $0x04, Y2, Y2, Y2
+ DECQ CX
+ JG sealAVX2Tail384LoopA
+ DECQ R9
+ JGE sealAVX2Tail384LoopB
+ VPADDD ·chacha20Constants<>+0(SB), Y0, Y0
+ VPADDD ·chacha20Constants<>+0(SB), Y5, Y5
+ VPADDD ·chacha20Constants<>+0(SB), Y6, Y6
+ VPADDD 32(BP), Y14, Y14
+ VPADDD 32(BP), Y9, Y9
+ VPADDD 32(BP), Y10, Y10
+ VPADDD 64(BP), Y12, Y12
+ VPADDD 64(BP), Y13, Y13
+ VPADDD 64(BP), Y8, Y8
+ VPADDD Y7, Y4, Y4
+ VPADDD Y11, Y1, Y1
+ VPADDD Y15, Y2, Y2
+ VPERM2I128 $0x02, Y0, Y14, Y3
+ VPERM2I128 $0x02, Y12, Y4, Y7
+ VPERM2I128 $0x13, Y0, Y14, Y11
+ VPERM2I128 $0x13, Y12, Y4, Y15
+ VPXOR (SI), Y3, Y3
+ VPXOR 32(SI), Y7, Y7
+ VPXOR 64(SI), Y11, Y11
+ VPXOR 96(SI), Y15, Y15
+ VMOVDQU Y3, (DI)
+ VMOVDQU Y7, 32(DI)
+ VMOVDQU Y11, 64(DI)
+ VMOVDQU Y15, 96(DI)
+ VPERM2I128 $0x02, Y5, Y9, Y3
+ VPERM2I128 $0x02, Y13, Y1, Y7
+ VPERM2I128 $0x13, Y5, Y9, Y11
+ VPERM2I128 $0x13, Y13, Y1, Y15
+ VPXOR 128(SI), Y3, Y3
+ VPXOR 160(SI), Y7, Y7
+ VPXOR 192(SI), Y11, Y11
+ VPXOR 224(SI), Y15, Y15
+ VMOVDQU Y3, 128(DI)
+ VMOVDQU Y7, 160(DI)
+ VMOVDQU Y11, 192(DI)
+ VMOVDQU Y15, 224(DI)
+ MOVQ $0x00000100, CX
+ LEAQ 256(SI), SI
+ SUBQ $0x00000100, BX
+ VPERM2I128 $0x02, Y6, Y10, Y0
+ VPERM2I128 $0x02, Y8, Y2, Y14
+ VPERM2I128 $0x13, Y6, Y10, Y12
+ VPERM2I128 $0x13, Y8, Y2, Y4
+ JMP sealAVX2SealHash
- VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2
- VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2
- VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2
- VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1; VPADDD TT3, DD2, DD2
- VPERM2I128 $0x02, AA0, BB0, TT0
- VPERM2I128 $0x02, CC0, DD0, TT1
- VPERM2I128 $0x13, AA0, BB0, TT2
- VPERM2I128 $0x13, CC0, DD0, TT3
- VPXOR (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3
- VMOVDQU TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup)
- VPERM2I128 $0x02, AA1, BB1, TT0
- VPERM2I128 $0x02, CC1, DD1, TT1
- VPERM2I128 $0x13, AA1, BB1, TT2
- VPERM2I128 $0x13, CC1, DD1, TT3
- VPXOR (4*32)(inp), TT0, TT0; VPXOR (5*32)(inp), TT1, TT1; VPXOR (6*32)(inp), TT2, TT2; VPXOR (7*32)(inp), TT3, TT3
- VMOVDQU TT0, (4*32)(oup); VMOVDQU TT1, (5*32)(oup); VMOVDQU TT2, (6*32)(oup); VMOVDQU TT3, (7*32)(oup)
- MOVQ $256, itr1
- LEAQ 256(inp), inp
- SUBQ $256, inl
- VPERM2I128 $0x02, AA2, BB2, AA0
- VPERM2I128 $0x02, CC2, DD2, BB0
- VPERM2I128 $0x13, AA2, BB2, CC0
- VPERM2I128 $0x13, CC2, DD2, DD0
-
- JMP sealAVX2SealHash
-
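Each of these tails ends with the same bookkeeping seen just above: CX records how many bytes of ciphertext were just written (128, 256, or 384 depending on the tail), SI advances and BX shrinks by the same amount, and control jumps to sealAVX2SealHash, which Poly1305-hashes the freshly written ciphertext before the short-seal path finishes the rest. A schematic of that hand-off; hashWritten and sealRest are hypothetical stand-ins for the assembly labels, and only the bookkeeping is taken from the code:

package sketch

// sealTailEpilogue models the MOVQ $n, CX / LEAQ n(SI), SI / SUBQ $n, BX
// sequence: hash the n bytes of ciphertext already produced, then seal the
// remaining input. hashWritten and sealRest are hypothetical placeholders.
func sealTailEpilogue(dst, src []byte, n int, hashWritten func([]byte), sealRest func(dst, src []byte)) {
	hashWritten(dst[:n]) // sealAVX2SealHash over the n bytes already written
	sealRest(dst[n:], src[n:])
}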
-// ----------------------------------------------------------------------------
-// Special optimization for the last 512 bytes of ciphertext
sealAVX2Tail512:
- // Need to decrypt up to 512 bytes - prepare two blocks
- // If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
- // If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
- VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
- VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
- VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
- VMOVDQA ctr3StoreAVX2, DD0
- VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
- VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
+ VMOVDQA ·chacha20Constants<>+0(SB), Y0
+ VMOVDQA Y0, Y5
+ VMOVDQA Y0, Y6
+ VMOVDQA Y0, Y7
+ VMOVDQA 32(BP), Y14
+ VMOVDQA Y14, Y9
+ VMOVDQA Y14, Y10
+ VMOVDQA Y14, Y11
+ VMOVDQA 64(BP), Y12
+ VMOVDQA Y12, Y13
+ VMOVDQA Y12, Y8
+ VMOVDQA Y12, Y15
+ VMOVDQA 192(BP), Y4
+ VPADDD ·avx2IncMask<>+0(SB), Y4, Y4
+ VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
+ VPADDD ·avx2IncMask<>+0(SB), Y1, Y2
+ VPADDD ·avx2IncMask<>+0(SB), Y2, Y3
+ VMOVDQA Y4, 96(BP)
+ VMOVDQA Y1, 128(BP)
+ VMOVDQA Y2, 160(BP)
+ VMOVDQA Y3, 192(BP)
sealAVX2Tail512LoopA:
- polyAdd(0(oup))
- polyMul
- LEAQ 16(oup), oup
+ ADDQ (DI), R10
+ ADCQ 8(DI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), AX
+ MOVQ AX, R15
+ MULQ R10
+ MOVQ AX, R13
+ MOVQ DX, R14
+ MOVQ (BP), AX
+ MULQ R11
+ IMULQ R12, R15
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), AX
+ MOVQ AX, R8
+ MULQ R10
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 8(BP), AX
+ MULQ R11
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ IMULQ R12, R8
+ ADDQ R10, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ LEAQ 16(DI), DI
sealAVX2Tail512LoopB:
- VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
- VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
- VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
- VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
- VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
- VMOVDQA CC3, tmpStoreAVX2
- VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
- VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
- VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
- VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
- VMOVDQA tmpStoreAVX2, CC3
- polyAdd(0*8(oup))
- polyMulAVX2
- VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
- VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
- VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
- VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
- VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
- VMOVDQA CC3, tmpStoreAVX2
- VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
- VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
- VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
- VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
- VMOVDQA tmpStoreAVX2, CC3
- VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
- VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
- VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
- VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
- VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
- VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
- VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
- VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
- polyAdd(2*8(oup))
- polyMulAVX2
- LEAQ (4*8)(oup), oup
- VMOVDQA CC3, tmpStoreAVX2
- VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
- VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
- VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
- VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
- VMOVDQA tmpStoreAVX2, CC3
- VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
- VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
- VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
- VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
- VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
- VMOVDQA CC3, tmpStoreAVX2
- VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
- VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
- VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
- VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
- VMOVDQA tmpStoreAVX2, CC3
- VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
- VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
- VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
-
- DECQ itr1
- JG sealAVX2Tail512LoopA
- DECQ itr2
- JGE sealAVX2Tail512LoopB
-
- VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
- VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
- VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
- VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
- VMOVDQA CC3, tmpStoreAVX2
- VPERM2I128 $0x02, AA0, BB0, CC3
- VPXOR (0*32)(inp), CC3, CC3
- VMOVDQU CC3, (0*32)(oup)
- VPERM2I128 $0x02, CC0, DD0, CC3
- VPXOR (1*32)(inp), CC3, CC3
- VMOVDQU CC3, (1*32)(oup)
- VPERM2I128 $0x13, AA0, BB0, CC3
- VPXOR (2*32)(inp), CC3, CC3
- VMOVDQU CC3, (2*32)(oup)
- VPERM2I128 $0x13, CC0, DD0, CC3
- VPXOR (3*32)(inp), CC3, CC3
- VMOVDQU CC3, (3*32)(oup)
-
- VPERM2I128 $0x02, AA1, BB1, AA0
- VPERM2I128 $0x02, CC1, DD1, BB0
- VPERM2I128 $0x13, AA1, BB1, CC0
- VPERM2I128 $0x13, CC1, DD1, DD0
- VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
- VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
-
- VPERM2I128 $0x02, AA2, BB2, AA0
- VPERM2I128 $0x02, CC2, DD2, BB0
- VPERM2I128 $0x13, AA2, BB2, CC0
- VPERM2I128 $0x13, CC2, DD2, DD0
- VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
- VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
-
- MOVQ $384, itr1
- LEAQ 384(inp), inp
- SUBQ $384, inl
- VPERM2I128 $0x02, AA3, BB3, AA0
- VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0
- VPERM2I128 $0x13, AA3, BB3, CC0
- VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
-
- JMP sealAVX2SealHash
+ VPADDD Y14, Y0, Y0
+ VPADDD Y9, Y5, Y5
+ VPADDD Y10, Y6, Y6
+ VPADDD Y11, Y7, Y7
+ VPXOR Y0, Y4, Y4
+ VPXOR Y5, Y1, Y1
+ VPXOR Y6, Y2, Y2
+ VPXOR Y7, Y3, Y3
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y2, Y2
+ VPSHUFB ·rol16<>+0(SB), Y3, Y3
+ VPADDD Y4, Y12, Y12
+ VPADDD Y1, Y13, Y13
+ VPADDD Y2, Y8, Y8
+ VPADDD Y3, Y15, Y15
+ VPXOR Y12, Y14, Y14
+ VPXOR Y13, Y9, Y9
+ VPXOR Y8, Y10, Y10
+ VPXOR Y15, Y11, Y11
+ VMOVDQA Y15, 224(BP)
+ VPSLLD $0x0c, Y14, Y15
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y15, Y14, Y14
+ VPSLLD $0x0c, Y9, Y15
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y15, Y9, Y9
+ VPSLLD $0x0c, Y10, Y15
+ VPSRLD $0x14, Y10, Y10
+ VPXOR Y15, Y10, Y10
+ VPSLLD $0x0c, Y11, Y15
+ VPSRLD $0x14, Y11, Y11
+ VPXOR Y15, Y11, Y11
+ VMOVDQA 224(BP), Y15
+ ADDQ (DI), R10
+ ADCQ 8(DI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ VPADDD Y14, Y0, Y0
+ VPADDD Y9, Y5, Y5
+ VPADDD Y10, Y6, Y6
+ VPADDD Y11, Y7, Y7
+ VPXOR Y0, Y4, Y4
+ VPXOR Y5, Y1, Y1
+ VPXOR Y6, Y2, Y2
+ VPXOR Y7, Y3, Y3
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y2, Y2
+ VPSHUFB ·rol8<>+0(SB), Y3, Y3
+ VPADDD Y4, Y12, Y12
+ VPADDD Y1, Y13, Y13
+ VPADDD Y2, Y8, Y8
+ VPADDD Y3, Y15, Y15
+ VPXOR Y12, Y14, Y14
+ VPXOR Y13, Y9, Y9
+ VPXOR Y8, Y10, Y10
+ VPXOR Y15, Y11, Y11
+ VMOVDQA Y15, 224(BP)
+ VPSLLD $0x07, Y14, Y15
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y15, Y14, Y14
+ VPSLLD $0x07, Y9, Y15
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y15, Y9, Y9
+ VPSLLD $0x07, Y10, Y15
+ VPSRLD $0x19, Y10, Y10
+ VPXOR Y15, Y10, Y10
+ VPSLLD $0x07, Y11, Y15
+ VPSRLD $0x19, Y11, Y11
+ VPXOR Y15, Y11, Y11
+ VMOVDQA 224(BP), Y15
+ VPALIGNR $0x04, Y14, Y14, Y14
+ VPALIGNR $0x04, Y9, Y9, Y9
+ VPALIGNR $0x04, Y10, Y10, Y10
+ VPALIGNR $0x04, Y11, Y11, Y11
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x08, Y8, Y8, Y8
+ VPALIGNR $0x08, Y15, Y15, Y15
+ VPALIGNR $0x0c, Y4, Y4, Y4
+ VPALIGNR $0x0c, Y1, Y1, Y1
+ VPALIGNR $0x0c, Y2, Y2, Y2
+ VPALIGNR $0x0c, Y3, Y3, Y3
+ VPADDD Y14, Y0, Y0
+ VPADDD Y9, Y5, Y5
+ VPADDD Y10, Y6, Y6
+ VPADDD Y11, Y7, Y7
+ VPXOR Y0, Y4, Y4
+ VPXOR Y5, Y1, Y1
+ VPXOR Y6, Y2, Y2
+ VPXOR Y7, Y3, Y3
+ VPSHUFB ·rol16<>+0(SB), Y4, Y4
+ VPSHUFB ·rol16<>+0(SB), Y1, Y1
+ VPSHUFB ·rol16<>+0(SB), Y2, Y2
+ VPSHUFB ·rol16<>+0(SB), Y3, Y3
+ VPADDD Y4, Y12, Y12
+ VPADDD Y1, Y13, Y13
+ VPADDD Y2, Y8, Y8
+ VPADDD Y3, Y15, Y15
+ VPXOR Y12, Y14, Y14
+ VPXOR Y13, Y9, Y9
+ VPXOR Y8, Y10, Y10
+ VPXOR Y15, Y11, Y11
+ ADDQ 16(DI), R10
+ ADCQ 24(DI), R11
+ ADCQ $0x01, R12
+ MOVQ (BP), DX
+ MOVQ DX, R15
+ MULXQ R10, R13, R14
+ IMULQ R12, R15
+ MULXQ R11, AX, DX
+ ADDQ AX, R14
+ ADCQ DX, R15
+ MOVQ 8(BP), DX
+ MULXQ R10, R10, AX
+ ADDQ R10, R14
+ MULXQ R11, R11, R8
+ ADCQ R11, R15
+ ADCQ $0x00, R8
+ IMULQ R12, DX
+ ADDQ AX, R15
+ ADCQ DX, R8
+ MOVQ R13, R10
+ MOVQ R14, R11
+ MOVQ R15, R12
+ ANDQ $0x03, R12
+ MOVQ R15, R13
+ ANDQ $-4, R13
+ MOVQ R8, R14
+ SHRQ $0x02, R8, R15
+ SHRQ $0x02, R8
+ ADDQ R13, R10
+ ADCQ R14, R11
+ ADCQ $0x00, R12
+ ADDQ R15, R10
+ ADCQ R8, R11
+ ADCQ $0x00, R12
+ LEAQ 32(DI), DI
+ VMOVDQA Y15, 224(BP)
+ VPSLLD $0x0c, Y14, Y15
+ VPSRLD $0x14, Y14, Y14
+ VPXOR Y15, Y14, Y14
+ VPSLLD $0x0c, Y9, Y15
+ VPSRLD $0x14, Y9, Y9
+ VPXOR Y15, Y9, Y9
+ VPSLLD $0x0c, Y10, Y15
+ VPSRLD $0x14, Y10, Y10
+ VPXOR Y15, Y10, Y10
+ VPSLLD $0x0c, Y11, Y15
+ VPSRLD $0x14, Y11, Y11
+ VPXOR Y15, Y11, Y11
+ VMOVDQA 224(BP), Y15
+ VPADDD Y14, Y0, Y0
+ VPADDD Y9, Y5, Y5
+ VPADDD Y10, Y6, Y6
+ VPADDD Y11, Y7, Y7
+ VPXOR Y0, Y4, Y4
+ VPXOR Y5, Y1, Y1
+ VPXOR Y6, Y2, Y2
+ VPXOR Y7, Y3, Y3
+ VPSHUFB ·rol8<>+0(SB), Y4, Y4
+ VPSHUFB ·rol8<>+0(SB), Y1, Y1
+ VPSHUFB ·rol8<>+0(SB), Y2, Y2
+ VPSHUFB ·rol8<>+0(SB), Y3, Y3
+ VPADDD Y4, Y12, Y12
+ VPADDD Y1, Y13, Y13
+ VPADDD Y2, Y8, Y8
+ VPADDD Y3, Y15, Y15
+ VPXOR Y12, Y14, Y14
+ VPXOR Y13, Y9, Y9
+ VPXOR Y8, Y10, Y10
+ VPXOR Y15, Y11, Y11
+ VMOVDQA Y15, 224(BP)
+ VPSLLD $0x07, Y14, Y15
+ VPSRLD $0x19, Y14, Y14
+ VPXOR Y15, Y14, Y14
+ VPSLLD $0x07, Y9, Y15
+ VPSRLD $0x19, Y9, Y9
+ VPXOR Y15, Y9, Y9
+ VPSLLD $0x07, Y10, Y15
+ VPSRLD $0x19, Y10, Y10
+ VPXOR Y15, Y10, Y10
+ VPSLLD $0x07, Y11, Y15
+ VPSRLD $0x19, Y11, Y11
+ VPXOR Y15, Y11, Y11
+ VMOVDQA 224(BP), Y15
+ VPALIGNR $0x0c, Y14, Y14, Y14
+ VPALIGNR $0x0c, Y9, Y9, Y9
+ VPALIGNR $0x0c, Y10, Y10, Y10
+ VPALIGNR $0x0c, Y11, Y11, Y11
+ VPALIGNR $0x08, Y12, Y12, Y12
+ VPALIGNR $0x08, Y13, Y13, Y13
+ VPALIGNR $0x08, Y8, Y8, Y8
+ VPALIGNR $0x08, Y15, Y15, Y15
+ VPALIGNR $0x04, Y4, Y4, Y4
+ VPALIGNR $0x04, Y1, Y1, Y1
+ VPALIGNR $0x04, Y2, Y2, Y2
+ VPALIGNR $0x04, Y3, Y3, Y3
+ DECQ CX
+ JG sealAVX2Tail512LoopA
+ DECQ R9
+ JGE sealAVX2Tail512LoopB
+ VPADDD ·chacha20Constants<>+0(SB), Y0, Y0
+ VPADDD ·chacha20Constants<>+0(SB), Y5, Y5
+ VPADDD ·chacha20Constants<>+0(SB), Y6, Y6
+ VPADDD ·chacha20Constants<>+0(SB), Y7, Y7
+ VPADDD 32(BP), Y14, Y14
+ VPADDD 32(BP), Y9, Y9
+ VPADDD 32(BP), Y10, Y10
+ VPADDD 32(BP), Y11, Y11
+ VPADDD 64(BP), Y12, Y12
+ VPADDD 64(BP), Y13, Y13
+ VPADDD 64(BP), Y8, Y8
+ VPADDD 64(BP), Y15, Y15
+ VPADDD 96(BP), Y4, Y4
+ VPADDD 128(BP), Y1, Y1
+ VPADDD 160(BP), Y2, Y2
+ VPADDD 192(BP), Y3, Y3
+ VMOVDQA Y15, 224(BP)
+ VPERM2I128 $0x02, Y0, Y14, Y15
+ VPXOR (SI), Y15, Y15
+ VMOVDQU Y15, (DI)
+ VPERM2I128 $0x02, Y12, Y4, Y15
+ VPXOR 32(SI), Y15, Y15
+ VMOVDQU Y15, 32(DI)
+ VPERM2I128 $0x13, Y0, Y14, Y15
+ VPXOR 64(SI), Y15, Y15
+ VMOVDQU Y15, 64(DI)
+ VPERM2I128 $0x13, Y12, Y4, Y15
+ VPXOR 96(SI), Y15, Y15
+ VMOVDQU Y15, 96(DI)
+ VPERM2I128 $0x02, Y5, Y9, Y0
+ VPERM2I128 $0x02, Y13, Y1, Y14
+ VPERM2I128 $0x13, Y5, Y9, Y12
+ VPERM2I128 $0x13, Y13, Y1, Y4
+ VPXOR 128(SI), Y0, Y0
+ VPXOR 160(SI), Y14, Y14
+ VPXOR 192(SI), Y12, Y12
+ VPXOR 224(SI), Y4, Y4
+ VMOVDQU Y0, 128(DI)
+ VMOVDQU Y14, 160(DI)
+ VMOVDQU Y12, 192(DI)
+ VMOVDQU Y4, 224(DI)
+ VPERM2I128 $0x02, Y6, Y10, Y0
+ VPERM2I128 $0x02, Y8, Y2, Y14
+ VPERM2I128 $0x13, Y6, Y10, Y12
+ VPERM2I128 $0x13, Y8, Y2, Y4
+ VPXOR 256(SI), Y0, Y0
+ VPXOR 288(SI), Y14, Y14
+ VPXOR 320(SI), Y12, Y12
+ VPXOR 352(SI), Y4, Y4
+ VMOVDQU Y0, 256(DI)
+ VMOVDQU Y14, 288(DI)
+ VMOVDQU Y12, 320(DI)
+ VMOVDQU Y4, 352(DI)
+ MOVQ $0x00000180, CX
+ LEAQ 384(SI), SI
+ SUBQ $0x00000180, BX
+ VPERM2I128 $0x02, Y7, Y11, Y0
+ VPERM2I128 $0x02, 224(BP), Y3, Y14
+ VPERM2I128 $0x13, Y7, Y11, Y12
+ VPERM2I128 $0x13, 224(BP), Y3, Y4
+ JMP sealAVX2SealHash
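Unlike the smaller tails, the Poly1305 hunks in the 512-byte tail use the BMI2 form of the multiply: MULXQ takes one factor implicitly from DX and writes the 128-bit product without touching the flags, so the surrounding ADDQ/ADCQ carry chain is left intact; the arithmetic is otherwise the same as sketched in polyStep above. In Go terms each MULXQ is still just a 64x64 to 128-bit multiply:

package sketch

import "math/bits"

// mulx models a single MULXQ: hi:lo = x*y with no effect on the carry flags,
// which is why the generator can interleave it with an open ADCQ chain.
func mulx(x, y uint64) (hi, lo uint64) {
	return bits.Mul64(x, y)
}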