vendor: update golang.org/x/crypto/chacha20poly1305
This change updates the vendored chacha20poly1305 package to match
revision 14f9af67c679edd414f72f13d67c917447113df2 of x/crypto.
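The only change in this file is symbol naming: every file-local
DATA/GLOBL symbol, and each reference to it, gains the Go assembler's
package-qualifying middle dot, so that for example

    DATA chacha20Constants<>+0x00(SB)/4, $0x61707865
    GLOBL chacha20Constants<>(SB), (NOPTR+RODATA), $32

becomes

    DATA ·chacha20Constants<>+0x00(SB)/4, $0x61707865
    GLOBL ·chacha20Constants<>(SB), (NOPTR+RODATA), $32

The <> suffix already kept these symbols file-local; the leading ·
additionally qualifies them with the package's import path. (A short
reference sketch of the quarter round and key clamp behind these
tables follows the diff.)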
Change-Id: I05a4ba86578b0f0cdb1ed7dd50fee3b38bb48cf5
Reviewed-on: https://go-review.googlesource.com/31312
Run-TryBot: Adam Langley <agl@golang.org>
Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
diff --git a/src/vendor/golang_org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.s b/src/vendor/golang_org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.s
index a1812b7..ac95844 100644
--- a/src/vendor/golang_org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.s
+++ b/src/vendor/golang_org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.s
@@ -82,82 +82,82 @@
#define TT2 BB3
#define TT3 CC3
// ChaCha20 constants
-DATA chacha20Constants<>+0x00(SB)/4, $0x61707865
-DATA chacha20Constants<>+0x04(SB)/4, $0x3320646e
-DATA chacha20Constants<>+0x08(SB)/4, $0x79622d32
-DATA chacha20Constants<>+0x0c(SB)/4, $0x6b206574
-DATA chacha20Constants<>+0x10(SB)/4, $0x61707865
-DATA chacha20Constants<>+0x14(SB)/4, $0x3320646e
-DATA chacha20Constants<>+0x18(SB)/4, $0x79622d32
-DATA chacha20Constants<>+0x1c(SB)/4, $0x6b206574
+DATA ·chacha20Constants<>+0x00(SB)/4, $0x61707865
+DATA ·chacha20Constants<>+0x04(SB)/4, $0x3320646e
+DATA ·chacha20Constants<>+0x08(SB)/4, $0x79622d32
+DATA ·chacha20Constants<>+0x0c(SB)/4, $0x6b206574
+DATA ·chacha20Constants<>+0x10(SB)/4, $0x61707865
+DATA ·chacha20Constants<>+0x14(SB)/4, $0x3320646e
+DATA ·chacha20Constants<>+0x18(SB)/4, $0x79622d32
+DATA ·chacha20Constants<>+0x1c(SB)/4, $0x6b206574
// <<< 16 with PSHUFB
-DATA rol16<>+0x00(SB)/8, $0x0504070601000302
-DATA rol16<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
-DATA rol16<>+0x10(SB)/8, $0x0504070601000302
-DATA rol16<>+0x18(SB)/8, $0x0D0C0F0E09080B0A
+DATA ·rol16<>+0x00(SB)/8, $0x0504070601000302
+DATA ·rol16<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
+DATA ·rol16<>+0x10(SB)/8, $0x0504070601000302
+DATA ·rol16<>+0x18(SB)/8, $0x0D0C0F0E09080B0A
// <<< 8 with PSHUFB
-DATA rol8<>+0x00(SB)/8, $0x0605040702010003
-DATA rol8<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
-DATA rol8<>+0x10(SB)/8, $0x0605040702010003
-DATA rol8<>+0x18(SB)/8, $0x0E0D0C0F0A09080B
+DATA ·rol8<>+0x00(SB)/8, $0x0605040702010003
+DATA ·rol8<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
+DATA ·rol8<>+0x10(SB)/8, $0x0605040702010003
+DATA ·rol8<>+0x18(SB)/8, $0x0E0D0C0F0A09080B
-DATA avx2InitMask<>+0x00(SB)/8, $0x0
-DATA avx2InitMask<>+0x08(SB)/8, $0x0
-DATA avx2InitMask<>+0x10(SB)/8, $0x1
-DATA avx2InitMask<>+0x18(SB)/8, $0x0
+DATA ·avx2InitMask<>+0x00(SB)/8, $0x0
+DATA ·avx2InitMask<>+0x08(SB)/8, $0x0
+DATA ·avx2InitMask<>+0x10(SB)/8, $0x1
+DATA ·avx2InitMask<>+0x18(SB)/8, $0x0
-DATA avx2IncMask<>+0x00(SB)/8, $0x2
-DATA avx2IncMask<>+0x08(SB)/8, $0x0
-DATA avx2IncMask<>+0x10(SB)/8, $0x2
-DATA avx2IncMask<>+0x18(SB)/8, $0x0
+DATA ·avx2IncMask<>+0x00(SB)/8, $0x2
+DATA ·avx2IncMask<>+0x08(SB)/8, $0x0
+DATA ·avx2IncMask<>+0x10(SB)/8, $0x2
+DATA ·avx2IncMask<>+0x18(SB)/8, $0x0
// Poly1305 key clamp
-DATA polyClampMask<>+0x00(SB)/8, $0x0FFFFFFC0FFFFFFF
-DATA polyClampMask<>+0x08(SB)/8, $0x0FFFFFFC0FFFFFFC
-DATA polyClampMask<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF
-DATA polyClampMask<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF
+DATA ·polyClampMask<>+0x00(SB)/8, $0x0FFFFFFC0FFFFFFF
+DATA ·polyClampMask<>+0x08(SB)/8, $0x0FFFFFFC0FFFFFFC
+DATA ·polyClampMask<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF
+DATA ·polyClampMask<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF
-DATA sseIncMask<>+0x00(SB)/8, $0x1
-DATA sseIncMask<>+0x08(SB)/8, $0x0
+DATA ·sseIncMask<>+0x00(SB)/8, $0x1
+DATA ·sseIncMask<>+0x08(SB)/8, $0x0
// To load/store the last < 16 bytes in a buffer
-DATA andMask<>+0x00(SB)/8, $0x00000000000000ff
-DATA andMask<>+0x08(SB)/8, $0x0000000000000000
-DATA andMask<>+0x10(SB)/8, $0x000000000000ffff
-DATA andMask<>+0x18(SB)/8, $0x0000000000000000
-DATA andMask<>+0x20(SB)/8, $0x0000000000ffffff
-DATA andMask<>+0x28(SB)/8, $0x0000000000000000
-DATA andMask<>+0x30(SB)/8, $0x00000000ffffffff
-DATA andMask<>+0x38(SB)/8, $0x0000000000000000
-DATA andMask<>+0x40(SB)/8, $0x000000ffffffffff
-DATA andMask<>+0x48(SB)/8, $0x0000000000000000
-DATA andMask<>+0x50(SB)/8, $0x0000ffffffffffff
-DATA andMask<>+0x58(SB)/8, $0x0000000000000000
-DATA andMask<>+0x60(SB)/8, $0x00ffffffffffffff
-DATA andMask<>+0x68(SB)/8, $0x0000000000000000
-DATA andMask<>+0x70(SB)/8, $0xffffffffffffffff
-DATA andMask<>+0x78(SB)/8, $0x0000000000000000
-DATA andMask<>+0x80(SB)/8, $0xffffffffffffffff
-DATA andMask<>+0x88(SB)/8, $0x00000000000000ff
-DATA andMask<>+0x90(SB)/8, $0xffffffffffffffff
-DATA andMask<>+0x98(SB)/8, $0x000000000000ffff
-DATA andMask<>+0xa0(SB)/8, $0xffffffffffffffff
-DATA andMask<>+0xa8(SB)/8, $0x0000000000ffffff
-DATA andMask<>+0xb0(SB)/8, $0xffffffffffffffff
-DATA andMask<>+0xb8(SB)/8, $0x00000000ffffffff
-DATA andMask<>+0xc0(SB)/8, $0xffffffffffffffff
-DATA andMask<>+0xc8(SB)/8, $0x000000ffffffffff
-DATA andMask<>+0xd0(SB)/8, $0xffffffffffffffff
-DATA andMask<>+0xd8(SB)/8, $0x0000ffffffffffff
-DATA andMask<>+0xe0(SB)/8, $0xffffffffffffffff
-DATA andMask<>+0xe8(SB)/8, $0x00ffffffffffffff
+DATA ·andMask<>+0x00(SB)/8, $0x00000000000000ff
+DATA ·andMask<>+0x08(SB)/8, $0x0000000000000000
+DATA ·andMask<>+0x10(SB)/8, $0x000000000000ffff
+DATA ·andMask<>+0x18(SB)/8, $0x0000000000000000
+DATA ·andMask<>+0x20(SB)/8, $0x0000000000ffffff
+DATA ·andMask<>+0x28(SB)/8, $0x0000000000000000
+DATA ·andMask<>+0x30(SB)/8, $0x00000000ffffffff
+DATA ·andMask<>+0x38(SB)/8, $0x0000000000000000
+DATA ·andMask<>+0x40(SB)/8, $0x000000ffffffffff
+DATA ·andMask<>+0x48(SB)/8, $0x0000000000000000
+DATA ·andMask<>+0x50(SB)/8, $0x0000ffffffffffff
+DATA ·andMask<>+0x58(SB)/8, $0x0000000000000000
+DATA ·andMask<>+0x60(SB)/8, $0x00ffffffffffffff
+DATA ·andMask<>+0x68(SB)/8, $0x0000000000000000
+DATA ·andMask<>+0x70(SB)/8, $0xffffffffffffffff
+DATA ·andMask<>+0x78(SB)/8, $0x0000000000000000
+DATA ·andMask<>+0x80(SB)/8, $0xffffffffffffffff
+DATA ·andMask<>+0x88(SB)/8, $0x00000000000000ff
+DATA ·andMask<>+0x90(SB)/8, $0xffffffffffffffff
+DATA ·andMask<>+0x98(SB)/8, $0x000000000000ffff
+DATA ·andMask<>+0xa0(SB)/8, $0xffffffffffffffff
+DATA ·andMask<>+0xa8(SB)/8, $0x0000000000ffffff
+DATA ·andMask<>+0xb0(SB)/8, $0xffffffffffffffff
+DATA ·andMask<>+0xb8(SB)/8, $0x00000000ffffffff
+DATA ·andMask<>+0xc0(SB)/8, $0xffffffffffffffff
+DATA ·andMask<>+0xc8(SB)/8, $0x000000ffffffffff
+DATA ·andMask<>+0xd0(SB)/8, $0xffffffffffffffff
+DATA ·andMask<>+0xd8(SB)/8, $0x0000ffffffffffff
+DATA ·andMask<>+0xe0(SB)/8, $0xffffffffffffffff
+DATA ·andMask<>+0xe8(SB)/8, $0x00ffffffffffffff
-GLOBL chacha20Constants<>(SB), (NOPTR+RODATA), $32
-GLOBL rol16<>(SB), (NOPTR+RODATA), $32
-GLOBL rol8<>(SB), (NOPTR+RODATA), $32
-GLOBL sseIncMask<>(SB), (NOPTR+RODATA), $16
-GLOBL avx2IncMask<>(SB), (NOPTR+RODATA), $32
-GLOBL avx2InitMask<>(SB), (NOPTR+RODATA), $32
-GLOBL polyClampMask<>(SB), (NOPTR+RODATA), $32
-GLOBL andMask<>(SB), (NOPTR+RODATA), $240
+GLOBL ·chacha20Constants<>(SB), (NOPTR+RODATA), $32
+GLOBL ·rol16<>(SB), (NOPTR+RODATA), $32
+GLOBL ·rol8<>(SB), (NOPTR+RODATA), $32
+GLOBL ·sseIncMask<>(SB), (NOPTR+RODATA), $16
+GLOBL ·avx2IncMask<>(SB), (NOPTR+RODATA), $32
+GLOBL ·avx2InitMask<>(SB), (NOPTR+RODATA), $32
+GLOBL ·polyClampMask<>(SB), (NOPTR+RODATA), $32
+GLOBL ·andMask<>(SB), (NOPTR+RODATA), $240
// No PALIGNR in Go ASM yet (but VPALIGNR is present).
#define shiftB0Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x04 // PALIGNR $4, X3, X3
#define shiftB1Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xe4; BYTE $0x04 // PALIGNR $4, X4, X4
@@ -185,15 +185,15 @@
#define shiftD3Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x04 // PALIGNR $4, X15, X15
// Some macros
#define chachaQR(A, B, C, D, T) \
- PADDD B, A; PXOR A, D; PSHUFB rol16<>(SB), D \
+ PADDD B, A; PXOR A, D; PSHUFB ·rol16<>(SB), D \
PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $12, T; PSRLL $20, B; PXOR T, B \
- PADDD B, A; PXOR A, D; PSHUFB rol8<>(SB), D \
+ PADDD B, A; PXOR A, D; PSHUFB ·rol8<>(SB), D \
PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $7, T; PSRLL $25, B; PXOR T, B
#define chachaQR_AVX2(A, B, C, D, T) \
- VPADDD B, A, A; VPXOR A, D, D; VPSHUFB rol16<>(SB), D, D \
+ VPADDD B, A, A; VPXOR A, D, D; VPSHUFB ·rol16<>(SB), D, D \
VPADDD D, C, C; VPXOR C, B, B; VPSLLD $12, B, T; VPSRLD $20, B, B; VPXOR T, B, B \
- VPADDD B, A, A; VPXOR A, D, D; VPSHUFB rol8<>(SB), D, D \
+ VPADDD B, A, A; VPXOR A, D, D; VPSHUFB ·rol8<>(SB), D, D \
VPADDD D, C, C; VPXOR C, B, B; VPSLLD $7, B, T; VPSRLD $25, B, B; VPXOR T, B, B
#define polyAdd(S) ADDQ S, acc0; ADCQ 8+S, acc1; ADCQ $1, acc2
@@ -286,7 +286,7 @@
JBE openSSE128 // About 16% faster
// For long buffers, prepare the poly key first
- MOVOU chacha20Constants<>(SB), A0
+ MOVOU ·chacha20Constants<>(SB), A0
MOVOU (1*16)(keyp), B0
MOVOU (2*16)(keyp), C0
MOVOU (3*16)(keyp), D0
@@ -307,10 +307,10 @@
JNE openSSEPreparePolyKey
// A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
- PADDL chacha20Constants<>(SB), A0; PADDL state1Store, B0
+ PADDL ·chacha20Constants<>(SB), A0; PADDL state1Store, B0
// Clamp and store the key
- PAND polyClampMask<>(SB), A0
+ PAND ·polyClampMask<>(SB), A0
MOVO A0, rStore; MOVO B0, sStore
// Hash AAD
@@ -322,10 +322,10 @@
JB openSSEMainLoopDone
// Load state, increment counter blocks
- MOVO chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL sseIncMask<>(SB), D0
- MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL sseIncMask<>(SB), D1
- MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL sseIncMask<>(SB), D2
- MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL sseIncMask<>(SB), D3
+ MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0
+ MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
+ MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
+ MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
// Store counters
MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
@@ -370,7 +370,7 @@
JG openSSEInternalLoop
// Add in the state
- PADDD chacha20Constants<>(SB), A0; PADDD chacha20Constants<>(SB), A1; PADDD chacha20Constants<>(SB), A2; PADDD chacha20Constants<>(SB), A3
+ PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
@@ -446,9 +446,9 @@
// Special optimization for buffers smaller than 129 bytes
openSSE128:
// For up to 128 bytes of ciphertext and 64 bytes for the poly key, we need to process three blocks
- MOVOU chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0
- MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL sseIncMask<>(SB), D1
- MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL sseIncMask<>(SB), D2
+ MOVOU ·chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0
+ MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
+ MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
MOVO B0, T1; MOVO C0, T2; MOVO D1, T3
MOVQ $10, itr2
@@ -465,13 +465,13 @@
JNE openSSE128InnerCipherLoop
// A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
- PADDL chacha20Constants<>(SB), A0; PADDL chacha20Constants<>(SB), A1; PADDL chacha20Constants<>(SB), A2
+ PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
PADDL T1, B0; PADDL T1, B1; PADDL T1, B2
PADDL T2, C1; PADDL T2, C2
- PADDL T3, D1; PADDL sseIncMask<>(SB), T3; PADDL T3, D2
+ PADDL T3, D1; PADDL ·sseIncMask<>(SB), T3; PADDL T3, D2
// Clamp and store the key
- PAND polyClampMask<>(SB), A0
+ PAND ·polyClampMask<>(SB), A0
MOVOU A0, rStore; MOVOU B0, sStore
// Hash
@@ -509,7 +509,7 @@
// We can safely load the CT from the end, because it is padded with the MAC
MOVQ inl, itr2
SHLQ $4, itr2
- LEAQ andMask<>(SB), t0
+ LEAQ ·andMask<>(SB), t0
MOVOU (inp), T0
ADDQ inl, inp
PAND -16(t0)(itr2*1), T0
@@ -534,7 +534,7 @@
// Special optimization for the last 64 bytes of ciphertext
openSSETail64:
// Need to decrypt up to 64 bytes - prepare single block
- MOVO chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL sseIncMask<>(SB), D0; MOVO D0, ctr0Store
+ MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store
XORQ itr2, itr2
MOVQ inl, itr1
CMPQ itr1, $16
@@ -559,7 +559,7 @@
CMPQ itr2, $160
JNE openSSETail64LoopB
- PADDL chacha20Constants<>(SB), A0; PADDL state1Store, B0; PADDL state2Store, C0; PADDL ctr0Store, D0
+ PADDL ·chacha20Constants<>(SB), A0; PADDL state1Store, B0; PADDL state2Store, C0; PADDL ctr0Store, D0
openSSETail64DecLoop:
CMPQ inl, $16
@@ -583,8 +583,8 @@
// Special optimization for the last 128 bytes of ciphertext
openSSETail128:
// Need to decrypt up to 128 bytes - prepare two blocks
- MOVO chacha20Constants<>(SB), A1; MOVO state1Store, B1; MOVO state2Store, C1; MOVO ctr3Store, D1; PADDL sseIncMask<>(SB), D1; MOVO D1, ctr0Store
- MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL sseIncMask<>(SB), D0; MOVO D0, ctr1Store
+ MOVO ·chacha20Constants<>(SB), A1; MOVO state1Store, B1; MOVO state2Store, C1; MOVO ctr3Store, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr0Store
+ MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr1Store
XORQ itr2, itr2
MOVQ inl, itr1
ANDQ $-16, itr1
@@ -609,7 +609,7 @@
CMPQ itr2, $160
JNE openSSETail128LoopB
- PADDL chacha20Constants<>(SB), A0; PADDL chacha20Constants<>(SB), A1
+ PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1
PADDL state1Store, B0; PADDL state1Store, B1
PADDL state2Store, C0; PADDL state2Store, C1
PADDL ctr1Store, D0; PADDL ctr0Store, D1
@@ -627,9 +627,9 @@
// Special optimization for the last 192 bytes of ciphertext
openSSETail192:
// Need to decrypt up to 192 bytes - prepare three blocks
- MOVO chacha20Constants<>(SB), A2; MOVO state1Store, B2; MOVO state2Store, C2; MOVO ctr3Store, D2; PADDL sseIncMask<>(SB), D2; MOVO D2, ctr0Store
- MOVO A2, A1; MOVO B2, B1; MOVO C2, C1; MOVO D2, D1; PADDL sseIncMask<>(SB), D1; MOVO D1, ctr1Store
- MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL sseIncMask<>(SB), D0; MOVO D0, ctr2Store
+ MOVO ·chacha20Constants<>(SB), A2; MOVO state1Store, B2; MOVO state2Store, C2; MOVO ctr3Store, D2; PADDL ·sseIncMask<>(SB), D2; MOVO D2, ctr0Store
+ MOVO A2, A1; MOVO B2, B1; MOVO C2, C1; MOVO D2, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store
+ MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr2Store
MOVQ inl, itr1
MOVQ $160, itr2
@@ -674,7 +674,7 @@
polyMul
openSSLTail192Store:
- PADDL chacha20Constants<>(SB), A0; PADDL chacha20Constants<>(SB), A1; PADDL chacha20Constants<>(SB), A2
+ PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2
PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2
PADDL ctr2Store, D0; PADDL ctr1Store, D1; PADDL ctr0Store, D2
@@ -696,10 +696,10 @@
// Special optimization for the last 256 bytes of ciphertext
openSSETail256:
// Need to decrypt up to 256 bytes - prepare four blocks
- MOVO chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL sseIncMask<>(SB), D0
- MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL sseIncMask<>(SB), D1
- MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL sseIncMask<>(SB), D2
- MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL sseIncMask<>(SB), D3
+ MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0
+ MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
+ MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
+ MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
// Store counters
MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
@@ -744,7 +744,7 @@
JB openSSETail256HashLoop
// Add in the state
- PADDD chacha20Constants<>(SB), A0; PADDD chacha20Constants<>(SB), A1; PADDD chacha20Constants<>(SB), A2; PADDD chacha20Constants<>(SB), A3
+ PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
@@ -779,11 +779,11 @@
// ------------------------- AVX2 Code ----------------------------------------
chacha20Poly1305Open_AVX2:
VZEROUPPER
- VMOVDQU chacha20Constants<>(SB), AA0
+ VMOVDQU ·chacha20Constants<>(SB), AA0
BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14
BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12
BYTE $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4
- VPADDD avx2InitMask<>(SB), DD0, DD0
+ VPADDD ·avx2InitMask<>(SB), DD0, DD0
// Special optimization for very short buffers
CMPQ inl, $192
@@ -805,7 +805,7 @@
DECQ itr2
JNE openAVX2PreparePolyKey
- VPADDD chacha20Constants<>(SB), AA0, AA0
+ VPADDD ·chacha20Constants<>(SB), AA0, AA0
VPADDD state1StoreAVX2, BB0, BB0
VPADDD state2StoreAVX2, CC0, CC0
VPADDD ctr3StoreAVX2, DD0, DD0
@@ -813,7 +813,7 @@
VPERM2I128 $0x02, AA0, BB0, TT0
// Clamp and store poly key
- VPAND polyClampMask<>(SB), TT0, TT0
+ VPAND ·polyClampMask<>(SB), TT0, TT0
VMOVDQA TT0, rsStoreAVX2
// Stream for the first 64 bytes
@@ -846,10 +846,10 @@
JB openAVX2MainLoopDone
// Load state, increment counter blocks, store the incremented counters
- VMOVDQU chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
+ VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
- VMOVDQA ctr3StoreAVX2, DD0; VPADDD avx2IncMask<>(SB), DD0, DD0; VPADDD avx2IncMask<>(SB), DD0, DD1; VPADDD avx2IncMask<>(SB), DD1, DD2; VPADDD avx2IncMask<>(SB), DD2, DD3
+ VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
XORQ itr1, itr1
@@ -860,7 +860,7 @@
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
polyMulStage1_AVX2
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
- VPSHUFB rol16<>(SB), DD0, DD0; VPSHUFB rol16<>(SB), DD1, DD1; VPSHUFB rol16<>(SB), DD2, DD2; VPSHUFB rol16<>(SB), DD3, DD3
+ VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
polyMulStage2_AVX2
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
@@ -874,7 +874,7 @@
polyMulReduceStage
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
- VPSHUFB rol8<>(SB), DD0, DD0; VPSHUFB rol8<>(SB), DD1, DD1; VPSHUFB rol8<>(SB), DD2, DD2; VPSHUFB rol8<>(SB), DD3, DD3
+ VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
polyAdd(2*8(inp)(itr1*1))
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
polyMulStage1_AVX2
@@ -892,7 +892,7 @@
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
polyMulStage3_AVX2
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
- VPSHUFB rol16<>(SB), DD0, DD0; VPSHUFB rol16<>(SB), DD1, DD1; VPSHUFB rol16<>(SB), DD2, DD2; VPSHUFB rol16<>(SB), DD3, DD3
+ VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
polyMulReduceStage
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
@@ -908,7 +908,7 @@
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
polyMulStage2_AVX2
- VPSHUFB rol8<>(SB), DD0, DD0; VPSHUFB rol8<>(SB), DD1, DD1; VPSHUFB rol8<>(SB), DD2, DD2; VPSHUFB rol8<>(SB), DD3, DD3
+ VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
polyMulStage3_AVX2
VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
@@ -925,7 +925,7 @@
CMPQ itr1, $480
JNE openAVX2InternalLoop
- VPADDD chacha20Constants<>(SB), AA0, AA0; VPADDD chacha20Constants<>(SB), AA1, AA1; VPADDD chacha20Constants<>(SB), AA2, AA2; VPADDD chacha20Constants<>(SB), AA3, AA3
+ VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
@@ -974,7 +974,7 @@
VMOVDQA AA0, AA1
VMOVDQA BB0, BB1
VMOVDQA CC0, CC1
- VPADDD avx2IncMask<>(SB), DD0, DD1
+ VPADDD ·avx2IncMask<>(SB), DD0, DD1
VMOVDQA AA0, AA2
VMOVDQA BB0, BB2
VMOVDQA CC0, CC2
@@ -1000,7 +1000,7 @@
VPERM2I128 $0x02, AA0, BB0, TT0
// Clamp and store poly key
- VPAND polyClampMask<>(SB), TT0, TT0
+ VPAND ·polyClampMask<>(SB), TT0, TT0
VMOVDQA TT0, rsStoreAVX2
// Stream for up to 192 bytes
@@ -1072,8 +1072,8 @@
// Special optimization for buffers smaller than 321 bytes
openAVX2320:
// For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks
- VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD avx2IncMask<>(SB), DD0, DD1
- VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD avx2IncMask<>(SB), DD1, DD2
+ VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD ·avx2IncMask<>(SB), DD0, DD1
+ VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD ·avx2IncMask<>(SB), DD1, DD2
VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3
MOVQ $10, itr2
@@ -1089,18 +1089,18 @@
DECQ itr2
JNE openAVX2320InnerCipherLoop
- VMOVDQA chacha20Constants<>(SB), TT0
+ VMOVDQA ·chacha20Constants<>(SB), TT0
VPADDD TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2
VPADDD TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2
VPADDD TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2
- VMOVDQA avx2IncMask<>(SB), TT0
+ VMOVDQA ·avx2IncMask<>(SB), TT0
VPADDD TT3, DD0, DD0; VPADDD TT0, TT3, TT3
VPADDD TT3, DD1, DD1; VPADDD TT0, TT3, TT3
VPADDD TT3, DD2, DD2
// Clamp and store poly key
VPERM2I128 $0x02, AA0, BB0, TT0
- VPAND polyClampMask<>(SB), TT0, TT0
+ VPAND ·polyClampMask<>(SB), TT0, TT0
VMOVDQA TT0, rsStoreAVX2
// Stream for up to 320 bytes
@@ -1120,11 +1120,11 @@
// Special optimization for the last 128 bytes of ciphertext
openAVX2Tail128:
// Need to decrypt up to 128 bytes - prepare two blocks
- VMOVDQA chacha20Constants<>(SB), AA1
+ VMOVDQA ·chacha20Constants<>(SB), AA1
VMOVDQA state1StoreAVX2, BB1
VMOVDQA state2StoreAVX2, CC1
VMOVDQA ctr3StoreAVX2, DD1
- VPADDD avx2IncMask<>(SB), DD1, DD1
+ VPADDD ·avx2IncMask<>(SB), DD1, DD1
VMOVDQA DD1, DD0
XORQ itr2, itr2
@@ -1153,7 +1153,7 @@
CMPQ itr2, $160
JNE openAVX2Tail128LoopB
- VPADDD chacha20Constants<>(SB), AA1, AA1
+ VPADDD ·chacha20Constants<>(SB), AA1, AA1
VPADDD state1StoreAVX2, BB1, BB1
VPADDD state2StoreAVX2, CC1, CC1
VPADDD DD0, DD1, DD1
@@ -1196,12 +1196,12 @@
// Special optimization for the last 256 bytes of ciphertext
openAVX2Tail256:
// Need to decrypt up to 256 bytes - prepare four blocks
- VMOVDQA chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1
+ VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1
VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1
VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1
VMOVDQA ctr3StoreAVX2, DD0
- VPADDD avx2IncMask<>(SB), DD0, DD0
- VPADDD avx2IncMask<>(SB), DD0, DD1
+ VPADDD ·avx2IncMask<>(SB), DD0, DD0
+ VPADDD ·avx2IncMask<>(SB), DD0, DD1
VMOVDQA DD0, TT1
VMOVDQA DD1, TT2
@@ -1255,7 +1255,7 @@
// Store 128 bytes safely, then go to store loop
openAVX2Tail256HashEnd:
- VPADDD chacha20Constants<>(SB), AA0, AA0; VPADDD chacha20Constants<>(SB), AA1, AA1
+ VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1
VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1
VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1
VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1
@@ -1274,13 +1274,13 @@
// Special optimization for the last 384 bytes of ciphertext
openAVX2Tail384:
// Need to decrypt up to 384 bytes - prepare six blocks
- VMOVDQA chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2
+ VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2
VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2
VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2
VMOVDQA ctr3StoreAVX2, DD0
- VPADDD avx2IncMask<>(SB), DD0, DD0
- VPADDD avx2IncMask<>(SB), DD0, DD1
- VPADDD avx2IncMask<>(SB), DD1, DD2
+ VPADDD ·avx2IncMask<>(SB), DD0, DD0
+ VPADDD ·avx2IncMask<>(SB), DD0, DD1
+ VPADDD ·avx2IncMask<>(SB), DD1, DD2
VMOVDQA DD0, ctr0StoreAVX2
VMOVDQA DD1, ctr1StoreAVX2
VMOVDQA DD2, ctr2StoreAVX2
@@ -1339,7 +1339,7 @@
// Store 256 bytes safely, then go to store loop
openAVX2Tail384HashEnd:
- VPADDD chacha20Constants<>(SB), AA0, AA0; VPADDD chacha20Constants<>(SB), AA1, AA1; VPADDD chacha20Constants<>(SB), AA2, AA2
+ VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2
VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2
VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2
VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2
@@ -1358,10 +1358,10 @@
// ----------------------------------------------------------------------------
// Special optimization for the last 512 bytes of ciphertext
openAVX2Tail512:
- VMOVDQU chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
+ VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
- VMOVDQA ctr3StoreAVX2, DD0; VPADDD avx2IncMask<>(SB), DD0, DD0; VPADDD avx2IncMask<>(SB), DD0, DD1; VPADDD avx2IncMask<>(SB), DD1, DD2; VPADDD avx2IncMask<>(SB), DD2, DD3
+ VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
XORQ itr1, itr1
MOVQ inp, itr2
@@ -1374,7 +1374,7 @@
openAVX2Tail512LoopA:
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
- VPSHUFB rol16<>(SB), DD0, DD0; VPSHUFB rol16<>(SB), DD1, DD1; VPSHUFB rol16<>(SB), DD2, DD2; VPSHUFB rol16<>(SB), DD3, DD3
+ VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
VMOVDQA CC3, tmpStoreAVX2
@@ -1387,7 +1387,7 @@
polyMulAVX2
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
- VPSHUFB rol8<>(SB), DD0, DD0; VPSHUFB rol8<>(SB), DD1, DD1; VPSHUFB rol8<>(SB), DD2, DD2; VPSHUFB rol8<>(SB), DD3, DD3
+ VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
VMOVDQA CC3, tmpStoreAVX2
@@ -1401,7 +1401,7 @@
VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
- VPSHUFB rol16<>(SB), DD0, DD0; VPSHUFB rol16<>(SB), DD1, DD1; VPSHUFB rol16<>(SB), DD2, DD2; VPSHUFB rol16<>(SB), DD3, DD3
+ VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
polyAdd(2*8(itr2))
@@ -1415,7 +1415,7 @@
VMOVDQA tmpStoreAVX2, CC3
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
- VPSHUFB rol8<>(SB), DD0, DD0; VPSHUFB rol8<>(SB), DD1, DD1; VPSHUFB rol8<>(SB), DD2, DD2; VPSHUFB rol8<>(SB), DD3, DD3
+ VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
VMOVDQA CC3, tmpStoreAVX2
@@ -1448,7 +1448,7 @@
JMP openAVX2Tail512HashLoop
openAVX2Tail512HashEnd:
- VPADDD chacha20Constants<>(SB), AA0, AA0; VPADDD chacha20Constants<>(SB), AA1, AA1; VPADDD chacha20Constants<>(SB), AA2, AA2; VPADDD chacha20Constants<>(SB), AA3, AA3
+ VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
@@ -1493,7 +1493,7 @@
JBE sealSSE128 // About 15% faster
// In the seal case - prepare the poly key + 3 blocks of stream in the first iteration
- MOVOU chacha20Constants<>(SB), A0
+ MOVOU ·chacha20Constants<>(SB), A0
MOVOU (1*16)(keyp), B0
MOVOU (2*16)(keyp), C0
MOVOU (3*16)(keyp), D0
@@ -1503,9 +1503,9 @@
MOVO C0, state2Store
// Load state, increment counter blocks
- MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL sseIncMask<>(SB), D1
- MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL sseIncMask<>(SB), D2
- MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL sseIncMask<>(SB), D3
+ MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
+ MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
+ MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
// Store counters
MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
@@ -1535,13 +1535,13 @@
JNE sealSSEIntroLoop
// Add in the state
- PADDD chacha20Constants<>(SB), A0; PADDD chacha20Constants<>(SB), A1; PADDD chacha20Constants<>(SB), A2; PADDD chacha20Constants<>(SB), A3
+ PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
// Clamp and store the key
- PAND polyClampMask<>(SB), A0
+ PAND ·polyClampMask<>(SB), A0
MOVO A0, rStore
MOVO B0, sStore
@@ -1585,10 +1585,10 @@
sealSSEMainLoop:
// Load state, increment counter blocks
- MOVO chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL sseIncMask<>(SB), D0
- MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL sseIncMask<>(SB), D1
- MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL sseIncMask<>(SB), D2
- MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL sseIncMask<>(SB), D3
+ MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0
+ MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
+ MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
+ MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
// Store counters
MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
@@ -1627,7 +1627,7 @@
JG sealSSEInnerLoop
// Add in the state
- PADDD chacha20Constants<>(SB), A0; PADDD chacha20Constants<>(SB), A1; PADDD chacha20Constants<>(SB), A2; PADDD chacha20Constants<>(SB), A3
+ PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
@@ -1683,11 +1683,11 @@
// Special optimization for the last 64 bytes of plaintext
sealSSETail64:
// Need to encrypt up to 64 bytes - prepare single block, hash 192 or 256 bytes
- MOVO chacha20Constants<>(SB), A1
+ MOVO ·chacha20Constants<>(SB), A1
MOVO state1Store, B1
MOVO state2Store, C1
MOVO ctr3Store, D1
- PADDL sseIncMask<>(SB), D1
+ PADDL ·sseIncMask<>(SB), D1
MOVO D1, ctr0Store
sealSSETail64LoopA:
@@ -1710,7 +1710,7 @@
DECQ itr2
JGE sealSSETail64LoopB
- PADDL chacha20Constants<>(SB), A1
+ PADDL ·chacha20Constants<>(SB), A1
PADDL state1Store, B1
PADDL state2Store, C1
PADDL ctr0Store, D1
@@ -1721,8 +1721,8 @@
// Special optimization for the last 128 bytes of plaintext
sealSSETail128:
// Need to encrypt up to 128 bytes - prepare two blocks, hash 192 or 256 bytes
- MOVO chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL sseIncMask<>(SB), D0; MOVO D0, ctr0Store
- MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL sseIncMask<>(SB), D1; MOVO D1, ctr1Store
+ MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store
+ MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store
sealSSETail128LoopA:
// Perform ChaCha rounds, while hashing the previously encrypted ciphertext
@@ -1747,7 +1747,7 @@
DECQ itr2
JGE sealSSETail128LoopB
- PADDL chacha20Constants<>(SB), A0; PADDL chacha20Constants<>(SB), A1
+ PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1
PADDL state1Store, B0; PADDL state1Store, B1
PADDL state2Store, C0; PADDL state2Store, C1
PADDL ctr0Store, D0; PADDL ctr1Store, D1
@@ -1766,9 +1766,9 @@
// Special optimization for the last 192 bytes of plaintext
sealSSETail192:
// Need to encrypt up to 192 bytes - prepare three blocks, hash 192 or 256 bytes
- MOVO chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL sseIncMask<>(SB), D0; MOVO D0, ctr0Store
- MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL sseIncMask<>(SB), D1; MOVO D1, ctr1Store
- MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL sseIncMask<>(SB), D2; MOVO D2, ctr2Store
+ MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store
+ MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store
+ MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2; MOVO D2, ctr2Store
sealSSETail192LoopA:
// Perform ChaCha rounds, while hashing the previously encrypted ciphertext
@@ -1797,7 +1797,7 @@
DECQ itr2
JGE sealSSETail192LoopB
- PADDL chacha20Constants<>(SB), A0; PADDL chacha20Constants<>(SB), A1; PADDL chacha20Constants<>(SB), A2
+ PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2
PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2
PADDL ctr0Store, D0; PADDL ctr1Store, D1; PADDL ctr2Store, D2
@@ -1823,9 +1823,9 @@
// Special seal optimization for buffers smaller than 129 bytes
sealSSE128:
// For up to 128 bytes of ciphertext and 64 bytes for the poly key, we need to process three blocks
- MOVOU chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0
- MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL sseIncMask<>(SB), D1
- MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL sseIncMask<>(SB), D2
+ MOVOU ·chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0
+ MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
+ MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
MOVO B0, T1; MOVO C0, T2; MOVO D1, T3
MOVQ $10, itr2
@@ -1842,11 +1842,11 @@
JNE sealSSE128InnerCipherLoop
// A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
- PADDL chacha20Constants<>(SB), A0; PADDL chacha20Constants<>(SB), A1; PADDL chacha20Constants<>(SB), A2
+ PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
PADDL T1, B0; PADDL T1, B1; PADDL T1, B2
PADDL T2, C1; PADDL T2, C2
- PADDL T3, D1; PADDL sseIncMask<>(SB), T3; PADDL T3, D2
- PAND polyClampMask<>(SB), A0
+ PADDL T3, D1; PADDL ·sseIncMask<>(SB), T3; PADDL T3, D2
+ PAND ·polyClampMask<>(SB), A0
MOVOU A0, rStore
MOVOU B0, sStore
@@ -1903,7 +1903,7 @@
// We can only load the PT one byte at a time to avoid read after end of buffer
MOVQ inl, itr2
SHLQ $4, itr2
- LEAQ andMask<>(SB), t0
+ LEAQ ·andMask<>(SB), t0
MOVQ inl, itr1
LEAQ -1(inp)(inl*1), inp
XORQ t2, t2
@@ -1963,11 +1963,11 @@
// ------------------------- AVX2 Code ----------------------------------------
chacha20Poly1305Seal_AVX2:
VZEROUPPER
- VMOVDQU chacha20Constants<>(SB), AA0
+ VMOVDQU ·chacha20Constants<>(SB), AA0
BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14
BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12
BYTE $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4
- VPADDD avx2InitMask<>(SB), DD0, DD0
+ VPADDD ·avx2InitMask<>(SB), DD0, DD0
// Special optimization for very short buffers
CMPQ inl, $192
@@ -1979,9 +1979,9 @@
VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3; VMOVDQA BB0, state1StoreAVX2
VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3; VMOVDQA CC0, state2StoreAVX2
- VPADDD avx2IncMask<>(SB), DD0, DD1; VMOVDQA DD0, ctr0StoreAVX2
- VPADDD avx2IncMask<>(SB), DD1, DD2; VMOVDQA DD1, ctr1StoreAVX2
- VPADDD avx2IncMask<>(SB), DD2, DD3; VMOVDQA DD2, ctr2StoreAVX2
+ VPADDD ·avx2IncMask<>(SB), DD0, DD1; VMOVDQA DD0, ctr0StoreAVX2
+ VPADDD ·avx2IncMask<>(SB), DD1, DD2; VMOVDQA DD1, ctr1StoreAVX2
+ VPADDD ·avx2IncMask<>(SB), DD2, DD3; VMOVDQA DD2, ctr2StoreAVX2
VMOVDQA DD3, ctr3StoreAVX2
MOVQ $10, itr2
@@ -2012,7 +2012,7 @@
DECQ itr2
JNE sealAVX2IntroLoop
- VPADDD chacha20Constants<>(SB), AA0, AA0; VPADDD chacha20Constants<>(SB), AA1, AA1; VPADDD chacha20Constants<>(SB), AA2, AA2; VPADDD chacha20Constants<>(SB), AA3, AA3
+ VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
@@ -2022,7 +2022,7 @@
VPERM2I128 $0x13, AA0, BB0, AA0 // Stream bytes 64 - 95
// Clamp and store poly key
- VPAND polyClampMask<>(SB), DD0, DD0
+ VPAND ·polyClampMask<>(SB), DD0, DD0
VMOVDQA DD0, rsStoreAVX2
// Hash AD
@@ -2068,11 +2068,11 @@
JBE sealAVX2Tail512
// We have 448 bytes to hash, but the main loop hashes 512 bytes at a time - perform some rounds before the main loop
- VMOVDQA chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
+ VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
VMOVDQA ctr3StoreAVX2, DD0
- VPADDD avx2IncMask<>(SB), DD0, DD0; VPADDD avx2IncMask<>(SB), DD0, DD1; VPADDD avx2IncMask<>(SB), DD1, DD2; VPADDD avx2IncMask<>(SB), DD2, DD3
+ VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
VMOVDQA CC3, tmpStoreAVX2
@@ -2100,7 +2100,7 @@
VPALIGNR $12, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $4, DD3, DD3, DD3
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
- VPSHUFB rol16<>(SB), DD0, DD0; VPSHUFB rol16<>(SB), DD1, DD1; VPSHUFB rol16<>(SB), DD2, DD2; VPSHUFB rol16<>(SB), DD3, DD3
+ VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
VMOVDQA CC3, tmpStoreAVX2
@@ -2116,10 +2116,10 @@
sealAVX2MainLoop:
// Load state, increment counter blocks, store the incremented counters
- VMOVDQU chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
+ VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
- VMOVDQA ctr3StoreAVX2, DD0; VPADDD avx2IncMask<>(SB), DD0, DD0; VPADDD avx2IncMask<>(SB), DD0, DD1; VPADDD avx2IncMask<>(SB), DD1, DD2; VPADDD avx2IncMask<>(SB), DD2, DD3
+ VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
MOVQ $10, itr1
@@ -2128,7 +2128,7 @@
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
polyMulStage1_AVX2
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
- VPSHUFB rol16<>(SB), DD0, DD0; VPSHUFB rol16<>(SB), DD1, DD1; VPSHUFB rol16<>(SB), DD2, DD2; VPSHUFB rol16<>(SB), DD3, DD3
+ VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
polyMulStage2_AVX2
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
@@ -2144,7 +2144,7 @@
sealAVX2InternalLoopStart:
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
- VPSHUFB rol8<>(SB), DD0, DD0; VPSHUFB rol8<>(SB), DD1, DD1; VPSHUFB rol8<>(SB), DD2, DD2; VPSHUFB rol8<>(SB), DD3, DD3
+ VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
polyAdd(2*8(oup))
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
polyMulStage1_AVX2
@@ -2162,7 +2162,7 @@
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
polyMulStage3_AVX2
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
- VPSHUFB rol16<>(SB), DD0, DD0; VPSHUFB rol16<>(SB), DD1, DD1; VPSHUFB rol16<>(SB), DD2, DD2; VPSHUFB rol16<>(SB), DD3, DD3
+ VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
polyMulReduceStage
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
@@ -2178,7 +2178,7 @@
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
polyMulStage2_AVX2
- VPSHUFB rol8<>(SB), DD0, DD0; VPSHUFB rol8<>(SB), DD1, DD1; VPSHUFB rol8<>(SB), DD2, DD2; VPSHUFB rol8<>(SB), DD3, DD3
+ VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
polyMulStage3_AVX2
VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
@@ -2195,7 +2195,7 @@
DECQ itr1
JNE sealAVX2InternalLoop
- VPADDD chacha20Constants<>(SB), AA0, AA0; VPADDD chacha20Constants<>(SB), AA1, AA1; VPADDD chacha20Constants<>(SB), AA2, AA2; VPADDD chacha20Constants<>(SB), AA3, AA3
+ VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
@@ -2250,7 +2250,7 @@
VMOVDQA AA0, AA1
VMOVDQA BB0, BB1
VMOVDQA CC0, CC1
- VPADDD avx2IncMask<>(SB), DD0, DD1
+ VPADDD ·avx2IncMask<>(SB), DD0, DD1
VMOVDQA AA0, AA2
VMOVDQA BB0, BB2
VMOVDQA CC0, CC2
@@ -2276,7 +2276,7 @@
VPERM2I128 $0x02, AA0, BB0, TT0
// Clamp and store poly key
- VPAND polyClampMask<>(SB), TT0, TT0
+ VPAND ·polyClampMask<>(SB), TT0, TT0
VMOVDQA TT0, rsStoreAVX2
// Stream for up to 192 bytes
@@ -2359,8 +2359,8 @@
// Special optimization for buffers smaller than 321 bytes
seal320AVX2:
// For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks
- VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD avx2IncMask<>(SB), DD0, DD1
- VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD avx2IncMask<>(SB), DD1, DD2
+ VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD ·avx2IncMask<>(SB), DD0, DD1
+ VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD ·avx2IncMask<>(SB), DD1, DD2
VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3
MOVQ $10, itr2
@@ -2376,18 +2376,18 @@
DECQ itr2
JNE sealAVX2320InnerCipherLoop
- VMOVDQA chacha20Constants<>(SB), TT0
+ VMOVDQA ·chacha20Constants<>(SB), TT0
VPADDD TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2
VPADDD TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2
VPADDD TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2
- VMOVDQA avx2IncMask<>(SB), TT0
+ VMOVDQA ·avx2IncMask<>(SB), TT0
VPADDD TT3, DD0, DD0; VPADDD TT0, TT3, TT3
VPADDD TT3, DD1, DD1; VPADDD TT0, TT3, TT3
VPADDD TT3, DD2, DD2
// Clamp and store poly key
VPERM2I128 $0x02, AA0, BB0, TT0
- VPAND polyClampMask<>(SB), TT0, TT0
+ VPAND ·polyClampMask<>(SB), TT0, TT0
VMOVDQA TT0, rsStoreAVX2
// Stream for up to 320 bytes
@@ -2409,11 +2409,11 @@
// Need to encrypt up to 128 bytes - prepare two blocks
// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
// If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
- VMOVDQA chacha20Constants<>(SB), AA0
+ VMOVDQA ·chacha20Constants<>(SB), AA0
VMOVDQA state1StoreAVX2, BB0
VMOVDQA state2StoreAVX2, CC0
VMOVDQA ctr3StoreAVX2, DD0
- VPADDD avx2IncMask<>(SB), DD0, DD0
+ VPADDD ·avx2IncMask<>(SB), DD0, DD0
VMOVDQA DD0, DD1
sealAVX2Tail128LoopA:
@@ -2440,7 +2440,7 @@
DECQ itr2
JGE sealAVX2Tail128LoopB
- VPADDD chacha20Constants<>(SB), AA0, AA1
+ VPADDD ·chacha20Constants<>(SB), AA0, AA1
VPADDD state1StoreAVX2, BB0, BB1
VPADDD state2StoreAVX2, CC0, CC1
VPADDD DD1, DD0, DD1
@@ -2457,12 +2457,12 @@
// Need to encrypt up to 256 bytes - prepare four blocks
// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
// If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
- VMOVDQA chacha20Constants<>(SB), AA0; VMOVDQA chacha20Constants<>(SB), AA1
+ VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA ·chacha20Constants<>(SB), AA1
VMOVDQA state1StoreAVX2, BB0; VMOVDQA state1StoreAVX2, BB1
VMOVDQA state2StoreAVX2, CC0; VMOVDQA state2StoreAVX2, CC1
VMOVDQA ctr3StoreAVX2, DD0
- VPADDD avx2IncMask<>(SB), DD0, DD0
- VPADDD avx2IncMask<>(SB), DD0, DD1
+ VPADDD ·avx2IncMask<>(SB), DD0, DD0
+ VPADDD ·avx2IncMask<>(SB), DD0, DD1
VMOVDQA DD0, TT1
VMOVDQA DD1, TT2
@@ -2490,7 +2490,7 @@
DECQ itr2
JGE sealAVX2Tail256LoopB
- VPADDD chacha20Constants<>(SB), AA0, AA0; VPADDD chacha20Constants<>(SB), AA1, AA1
+ VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1
VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1
VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1
VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1
@@ -2516,11 +2516,11 @@
// Need to encrypt up to 384 bytes - prepare six blocks
// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
// If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
- VMOVDQA chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2
+ VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2
VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2
VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2
VMOVDQA ctr3StoreAVX2, DD0
- VPADDD avx2IncMask<>(SB), DD0, DD0; VPADDD avx2IncMask<>(SB), DD0, DD1; VPADDD avx2IncMask<>(SB), DD1, DD2
+ VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2
VMOVDQA DD0, TT1; VMOVDQA DD1, TT2; VMOVDQA DD2, TT3
sealAVX2Tail384LoopA:
@@ -2547,7 +2547,7 @@
DECQ itr2
JGE sealAVX2Tail384LoopB
- VPADDD chacha20Constants<>(SB), AA0, AA0; VPADDD chacha20Constants<>(SB), AA1, AA1; VPADDD chacha20Constants<>(SB), AA2, AA2
+ VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2
VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2
VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2
VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1; VPADDD TT3, DD2, DD2
@@ -2579,11 +2579,11 @@
// Need to encrypt up to 512 bytes - prepare eight blocks
// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
// If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
- VMOVDQA chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
+ VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
VMOVDQA ctr3StoreAVX2, DD0
- VPADDD avx2IncMask<>(SB), DD0, DD0; VPADDD avx2IncMask<>(SB), DD0, DD1; VPADDD avx2IncMask<>(SB), DD1, DD2; VPADDD avx2IncMask<>(SB), DD2, DD3
+ VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
sealAVX2Tail512LoopA:
@@ -2594,7 +2594,7 @@
sealAVX2Tail512LoopB:
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
- VPSHUFB rol16<>(SB), DD0, DD0; VPSHUFB rol16<>(SB), DD1, DD1; VPSHUFB rol16<>(SB), DD2, DD2; VPSHUFB rol16<>(SB), DD3, DD3
+ VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
VMOVDQA CC3, tmpStoreAVX2
@@ -2607,7 +2607,7 @@
polyMulAVX2
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
- VPSHUFB rol8<>(SB), DD0, DD0; VPSHUFB rol8<>(SB), DD1, DD1; VPSHUFB rol8<>(SB), DD2, DD2; VPSHUFB rol8<>(SB), DD3, DD3
+ VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
VMOVDQA CC3, tmpStoreAVX2
@@ -2621,7 +2621,7 @@
VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
- VPSHUFB rol16<>(SB), DD0, DD0; VPSHUFB rol16<>(SB), DD1, DD1; VPSHUFB rol16<>(SB), DD2, DD2; VPSHUFB rol16<>(SB), DD3, DD3
+ VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
polyAdd(2*8(oup))
@@ -2635,7 +2635,7 @@
VMOVDQA tmpStoreAVX2, CC3
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
- VPSHUFB rol8<>(SB), DD0, DD0; VPSHUFB rol8<>(SB), DD1, DD1; VPSHUFB rol8<>(SB), DD2, DD2; VPSHUFB rol8<>(SB), DD3, DD3
+ VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
VMOVDQA CC3, tmpStoreAVX2
@@ -2653,7 +2653,7 @@
DECQ itr2
JGE sealAVX2Tail512LoopB
- VPADDD chacha20Constants<>(SB), AA0, AA0; VPADDD chacha20Constants<>(SB), AA1, AA1; VPADDD chacha20Constants<>(SB), AA2, AA2; VPADDD chacha20Constants<>(SB), AA3, AA3
+ VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
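For reference, a minimal Go sketch of the two primitives the renamed
constants serve; this is illustrative only (not part of this CL, and
the names rotl, quarterRound, and clampR are hypothetical). The
chachaQR/chachaQR_AVX2 macros above vectorize the ChaCha20 quarter
round, performing the 16- and 8-bit rotations as PSHUFB byte shuffles
with the ·rol16<>/·rol8<> masks and the 12- and 7-bit rotations with
shift pairs, while ·polyClampMask<> encodes the standard Poly1305 "r"
clamp (its upper 16 bytes are all-ones, so the 32-byte AVX2 VPAND
leaves the "s" half of the key untouched).

    package sketch

    // rotl rotates a 32-bit word left by n bits.
    func rotl(x uint32, n uint) uint32 { return x<<n | x>>(32-n) }

    // quarterRound is the scalar ChaCha20 quarter round that the
    // chachaQR and chachaQR_AVX2 macros apply to all lanes at once.
    func quarterRound(a, b, c, d uint32) (uint32, uint32, uint32, uint32) {
            a += b
            d = rotl(d^a, 16) // PSHUFB with ·rol16<>
            c += d
            b = rotl(b^c, 12) // PSLLL $12 / PSRLL $20
            a += b
            d = rotl(d^a, 8) // PSHUFB with ·rol8<>
            c += d
            b = rotl(b^c, 7) // PSLLL $7 / PSRLL $25
            return a, b, c, d
    }

    // clampR applies the Poly1305 clamp encoded by the first 16 bytes
    // of ·polyClampMask<>: the little-endian halves of r are masked
    // with 0x0FFFFFFC0FFFFFFF and 0x0FFFFFFC0FFFFFFC.
    func clampR(rLo, rHi uint64) (uint64, uint64) {
            return rLo & 0x0FFFFFFC0FFFFFFF, rHi & 0x0FFFFFFC0FFFFFFC
    }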