| // Code generated by command: go run chacha20poly1305_amd64_asm.go -out ../chacha20poly1305_amd64.s -pkg chacha20poly1305. DO NOT EDIT. |
| |
| //go:build gc && !purego |
| |
| #include "textflag.h" |
| |
| // func polyHashADInternal<>() |
| TEXT polyHashADInternal<>(SB), NOSPLIT, $0 |
// Hack: the #define macros must be declared inside a function due to Avo constraints
| // ROL rotates the uint32s in register R left by N bits, using temporary T. |
| #define ROL(N, R, T) \ |
| MOVO R, T; \ |
| PSLLL $(N), T; \ |
| PSRLL $(32-(N)), R; \ |
| PXOR T, R |
| |
| // ROL8 rotates the uint32s in register R left by 8, using temporary T if needed. |
| #ifdef GOAMD64_v2 |
| #define ROL8(R, T) PSHUFB ·rol8<>(SB), R |
| #else |
| #define ROL8(R, T) ROL(8, R, T) |
| #endif |
| |
| // ROL16 rotates the uint32s in register R left by 16, using temporary T if needed. |
| #ifdef GOAMD64_v2 |
| #define ROL16(R, T) PSHUFB ·rol16<>(SB), R |
| #else |
| #define ROL16(R, T) ROL(16, R, T) |
| #endif |
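// Zero the Poly1305 accumulator h, kept as h0:h1:h2 in R10:R11:R12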
| XORQ R10, R10 |
| XORQ R11, R11 |
| XORQ R12, R12 |
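// Fast path when the AAD is exactly 13 bytes (likely the common TLS record-header case):
// load the 13 bytes, zero-padded to a full 16-byte block, into R10:R11 and set the 2^128 bit in R12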
| CMPQ R9, $0x0d |
| JNE hashADLoop |
| MOVQ (CX), R10 |
| MOVQ 5(CX), R11 |
| SHRQ $0x18, R11 |
| MOVQ $0x00000001, R12 |
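// Single Poly1305 block: multiply the accumulator h (R10:R11:R12) by r (held at 0(BP) and 8(BP))
// and reduce modulo 2^130 - 5; this inlined multiply/reduce sequence repeats throughout the file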
| MOVQ (BP), AX |
| MOVQ AX, R15 |
| MULQ R10 |
| MOVQ AX, R13 |
| MOVQ DX, R14 |
| MOVQ (BP), AX |
| MULQ R11 |
| IMULQ R12, R15 |
| ADDQ AX, R14 |
| ADCQ DX, R15 |
| MOVQ 8(BP), AX |
| MOVQ AX, R8 |
| MULQ R10 |
| ADDQ AX, R14 |
| ADCQ $0x00, DX |
| MOVQ DX, R10 |
| MOVQ 8(BP), AX |
| MULQ R11 |
| ADDQ AX, R15 |
| ADCQ $0x00, DX |
| IMULQ R12, R8 |
| ADDQ R10, R15 |
| ADCQ DX, R8 |
| MOVQ R13, R10 |
| MOVQ R14, R11 |
| MOVQ R15, R12 |
| ANDQ $0x03, R12 |
| MOVQ R15, R13 |
| ANDQ $-4, R13 |
| MOVQ R8, R14 |
| SHRQ $0x02, R8, R15 |
| SHRQ $0x02, R8 |
| ADDQ R13, R10 |
| ADCQ R14, R11 |
| ADCQ $0x00, R12 |
| ADDQ R15, R10 |
| ADCQ R8, R11 |
| ADCQ $0x00, R12 |
| RET |
| |
| hashADLoop: |
// Hash in 16-byte chunks
| CMPQ R9, $0x10 |
| JB hashADTail |
| ADDQ (CX), R10 |
| ADCQ 8(CX), R11 |
| ADCQ $0x01, R12 |
| LEAQ 16(CX), CX |
| SUBQ $0x10, R9 |
| MOVQ (BP), AX |
| MOVQ AX, R15 |
| MULQ R10 |
| MOVQ AX, R13 |
| MOVQ DX, R14 |
| MOVQ (BP), AX |
| MULQ R11 |
| IMULQ R12, R15 |
| ADDQ AX, R14 |
| ADCQ DX, R15 |
| MOVQ 8(BP), AX |
| MOVQ AX, R8 |
| MULQ R10 |
| ADDQ AX, R14 |
| ADCQ $0x00, DX |
| MOVQ DX, R10 |
| MOVQ 8(BP), AX |
| MULQ R11 |
| ADDQ AX, R15 |
| ADCQ $0x00, DX |
| IMULQ R12, R8 |
| ADDQ R10, R15 |
| ADCQ DX, R8 |
| MOVQ R13, R10 |
| MOVQ R14, R11 |
| MOVQ R15, R12 |
| ANDQ $0x03, R12 |
| MOVQ R15, R13 |
| ANDQ $-4, R13 |
| MOVQ R8, R14 |
| SHRQ $0x02, R8, R15 |
| SHRQ $0x02, R8 |
| ADDQ R13, R10 |
| ADCQ R14, R11 |
| ADCQ $0x00, R12 |
| ADDQ R15, R10 |
| ADCQ R8, R11 |
| ADCQ $0x00, R12 |
| JMP hashADLoop |
| |
| hashADTail: |
| CMPQ R9, $0x00 |
| JE hashADDone |
| |
| // Hash last < 16 byte tail |
| XORQ R13, R13 |
| XORQ R14, R14 |
| XORQ R15, R15 |
| ADDQ R9, CX |
| |
| hashADTailLoop: |
| SHLQ $0x08, R13, R14 |
| SHLQ $0x08, R13 |
| MOVB -1(CX), R15 |
| XORQ R15, R13 |
| DECQ CX |
| DECQ R9 |
| JNE hashADTailLoop |
| ADDQ R13, R10 |
| ADCQ R14, R11 |
| ADCQ $0x01, R12 |
| MOVQ (BP), AX |
| MOVQ AX, R15 |
| MULQ R10 |
| MOVQ AX, R13 |
| MOVQ DX, R14 |
| MOVQ (BP), AX |
| MULQ R11 |
| IMULQ R12, R15 |
| ADDQ AX, R14 |
| ADCQ DX, R15 |
| MOVQ 8(BP), AX |
| MOVQ AX, R8 |
| MULQ R10 |
| ADDQ AX, R14 |
| ADCQ $0x00, DX |
| MOVQ DX, R10 |
| MOVQ 8(BP), AX |
| MULQ R11 |
| ADDQ AX, R15 |
| ADCQ $0x00, DX |
| IMULQ R12, R8 |
| ADDQ R10, R15 |
| ADCQ DX, R8 |
| MOVQ R13, R10 |
| MOVQ R14, R11 |
| MOVQ R15, R12 |
| ANDQ $0x03, R12 |
| MOVQ R15, R13 |
| ANDQ $-4, R13 |
| MOVQ R8, R14 |
| SHRQ $0x02, R8, R15 |
| SHRQ $0x02, R8 |
| ADDQ R13, R10 |
| ADCQ R14, R11 |
| ADCQ $0x00, R12 |
| ADDQ R15, R10 |
| ADCQ R8, R11 |
| ADCQ $0x00, R12 |
| |
| hashADDone: |
| RET |
| |
| // func chacha20Poly1305Open(dst []byte, key []uint32, src []byte, ad []byte) bool |
| // Requires: AVX, AVX2, BMI2, CMOV, SSE2 |
| TEXT ·chacha20Poly1305Open(SB), $288-97 |
| // For aligned stack access |
| MOVQ SP, BP |
| ADDQ $0x20, BP |
| ANDQ $-32, BP |
| MOVQ dst_base+0(FP), DI |
| MOVQ key_base+24(FP), R8 |
| MOVQ src_base+48(FP), SI |
| MOVQ src_len+56(FP), BX |
| MOVQ ad_base+72(FP), CX |
| |
| // Check for AVX2 support |
| CMPB ·useAVX2+0(SB), $0x01 |
| JE chacha20Poly1305Open_AVX2 |
| |
// Special optimization for very short buffers
| CMPQ BX, $0x80 |
| JBE openSSE128 |
| |
| // For long buffers, prepare the poly key first |
| MOVOU ·chacha20Constants<>+0(SB), X0 |
| MOVOU 16(R8), X3 |
| MOVOU 32(R8), X6 |
| MOVOU 48(R8), X9 |
| MOVO X9, X13 |
| |
| // Store state on stack for future use |
| MOVO X3, 32(BP) |
| MOVO X6, 48(BP) |
| MOVO X9, 128(BP) |
| MOVQ $0x0000000a, R9 |
| |
| openSSEPreparePolyKey: |
| PADDD X3, X0 |
| PXOR X0, X9 |
| ROL16(X9, X12) |
| PADDD X9, X6 |
| PXOR X6, X3 |
| MOVO X3, X12 |
| PSLLL $0x0c, X12 |
| PSRLL $0x14, X3 |
| PXOR X12, X3 |
| PADDD X3, X0 |
| PXOR X0, X9 |
| ROL8(X9, X12) |
| PADDD X9, X6 |
| PXOR X6, X3 |
| MOVO X3, X12 |
| PSLLL $0x07, X12 |
| PSRLL $0x19, X3 |
| PXOR X12, X3 |
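// The BYTE runs below appear to encode PALIGNR $0x04, X3, X3; PALIGNR $0x08, X6, X6;
// PALIGNR $0x0c, X9, X9 - rotating the rows into diagonal form for the second half of the double round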
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xdb |
| BYTE $0x04 |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xf6 |
| BYTE $0x08 |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xc9 |
| BYTE $0x0c |
| PADDD X3, X0 |
| PXOR X0, X9 |
| ROL16(X9, X12) |
| PADDD X9, X6 |
| PXOR X6, X3 |
| MOVO X3, X12 |
| PSLLL $0x0c, X12 |
| PSRLL $0x14, X3 |
| PXOR X12, X3 |
| PADDD X3, X0 |
| PXOR X0, X9 |
| ROL8(X9, X12) |
| PADDD X9, X6 |
| PXOR X6, X3 |
| MOVO X3, X12 |
| PSLLL $0x07, X12 |
| PSRLL $0x19, X3 |
| PXOR X12, X3 |
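// These BYTE runs appear to encode PALIGNR $0x0c, X3, X3; PALIGNR $0x08, X6, X6;
// PALIGNR $0x04, X9, X9 - rotating the rows back into column form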
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xdb |
| BYTE $0x0c |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xf6 |
| BYTE $0x08 |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xc9 |
| BYTE $0x04 |
| DECQ R9 |
| JNE openSSEPreparePolyKey |
| |
// A0|B0 hold the 32-byte Poly1305 key; C0 and D0 can be discarded
| PADDL ·chacha20Constants<>+0(SB), X0 |
| PADDL 32(BP), X3 |
| |
| // Clamp and store the key |
| PAND ·polyClampMask<>+0(SB), X0 |
| MOVO X0, (BP) |
| MOVO X3, 16(BP) |
| |
| // Hash AAD |
| MOVQ ad_len+80(FP), R9 |
| CALL polyHashADInternal<>(SB) |
| |
| openSSEMainLoop: |
| CMPQ BX, $0x00000100 |
| JB openSSEMainLoopDone |
| |
| // Load state, increment counter blocks |
| MOVO ·chacha20Constants<>+0(SB), X0 |
| MOVO 32(BP), X3 |
| MOVO 48(BP), X6 |
| MOVO 128(BP), X9 |
| PADDL ·sseIncMask<>+0(SB), X9 |
| MOVO X0, X1 |
| MOVO X3, X4 |
| MOVO X6, X7 |
| MOVO X9, X10 |
| PADDL ·sseIncMask<>+0(SB), X10 |
| MOVO X1, X2 |
| MOVO X4, X5 |
| MOVO X7, X8 |
| MOVO X10, X11 |
| PADDL ·sseIncMask<>+0(SB), X11 |
| MOVO X2, X12 |
| MOVO X5, X13 |
| MOVO X8, X14 |
| MOVO X11, X15 |
| PADDL ·sseIncMask<>+0(SB), X15 |
| |
| // Store counters |
| MOVO X9, 80(BP) |
| MOVO X10, 96(BP) |
| MOVO X11, 112(BP) |
| MOVO X15, 128(BP) |
| |
// There are 10 ChaCha20 iterations of 2 quarter-rounds each; for 6 of them we hash
// 2 Poly1305 blocks, and for the remaining 4 only 1 block - for a total of 16
| MOVQ $0x00000004, CX |
| MOVQ SI, R9 |
| |
| openSSEInternalLoop: |
| MOVO X14, 64(BP) |
| PADDD X3, X0 |
| PXOR X0, X9 |
| ROL16(X9, X14) |
| PADDD X9, X6 |
| PXOR X6, X3 |
| MOVO X3, X14 |
| PSLLL $0x0c, X14 |
| PSRLL $0x14, X3 |
| PXOR X14, X3 |
| PADDD X3, X0 |
| PXOR X0, X9 |
| ROL8(X9, X14) |
| PADDD X9, X6 |
| PXOR X6, X3 |
| MOVO X3, X14 |
| PSLLL $0x07, X14 |
| PSRLL $0x19, X3 |
| PXOR X14, X3 |
| PADDD X4, X1 |
| PXOR X1, X10 |
| ROL16(X10, X14) |
| PADDD X10, X7 |
| PXOR X7, X4 |
| MOVO X4, X14 |
| PSLLL $0x0c, X14 |
| PSRLL $0x14, X4 |
| PXOR X14, X4 |
| PADDD X4, X1 |
| PXOR X1, X10 |
| ROL8(X10, X14) |
| PADDD X10, X7 |
| PXOR X7, X4 |
| MOVO X4, X14 |
| PSLLL $0x07, X14 |
| PSRLL $0x19, X4 |
| PXOR X14, X4 |
| PADDD X5, X2 |
| PXOR X2, X11 |
| ROL16(X11, X14) |
| PADDD X11, X8 |
| PXOR X8, X5 |
| MOVO X5, X14 |
| PSLLL $0x0c, X14 |
| PSRLL $0x14, X5 |
| PXOR X14, X5 |
| PADDD X5, X2 |
| PXOR X2, X11 |
| ROL8(X11, X14) |
| PADDD X11, X8 |
| PXOR X8, X5 |
| MOVO X5, X14 |
| PSLLL $0x07, X14 |
| PSRLL $0x19, X5 |
| PXOR X14, X5 |
| MOVO 64(BP), X14 |
| MOVO X7, 64(BP) |
| PADDD X13, X12 |
| PXOR X12, X15 |
| ROL16(X15, X7) |
| PADDD X15, X14 |
| PXOR X14, X13 |
| MOVO X13, X7 |
| PSLLL $0x0c, X7 |
| PSRLL $0x14, X13 |
| PXOR X7, X13 |
| PADDD X13, X12 |
| PXOR X12, X15 |
| ROL8(X15, X7) |
| PADDD X15, X14 |
| PXOR X14, X13 |
| MOVO X13, X7 |
| PSLLL $0x07, X7 |
| PSRLL $0x19, X13 |
| PXOR X7, X13 |
| MOVO 64(BP), X7 |
| ADDQ (R9), R10 |
| ADCQ 8(R9), R11 |
| ADCQ $0x01, R12 |
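// The interleaved BYTE runs below appear to encode PALIGNR shuffles that diagonalize all four states:
// $0x04 on X3/X4/X5/X13, $0x08 on X6/X7/X8/X14, $0x0c on X9/X10/X11/X15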
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xdb |
| BYTE $0x04 |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xe4 |
| BYTE $0x04 |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xed |
| BYTE $0x04 |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xed |
| BYTE $0x04 |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xf6 |
| BYTE $0x08 |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xff |
| BYTE $0x08 |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xc0 |
| BYTE $0x08 |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xf6 |
| BYTE $0x08 |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xc9 |
| BYTE $0x0c |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xd2 |
| BYTE $0x0c |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xdb |
| BYTE $0x0c |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xff |
| BYTE $0x0c |
| MOVQ (BP), AX |
| MOVQ AX, R15 |
| MULQ R10 |
| MOVQ AX, R13 |
| MOVQ DX, R14 |
| MOVQ (BP), AX |
| MULQ R11 |
| IMULQ R12, R15 |
| ADDQ AX, R14 |
| ADCQ DX, R15 |
| MOVQ 8(BP), AX |
| MOVQ AX, R8 |
| MULQ R10 |
| ADDQ AX, R14 |
| ADCQ $0x00, DX |
| MOVQ DX, R10 |
| MOVQ 8(BP), AX |
| MULQ R11 |
| ADDQ AX, R15 |
| ADCQ $0x00, DX |
| LEAQ 16(R9), R9 |
| MOVO X14, 64(BP) |
| PADDD X3, X0 |
| PXOR X0, X9 |
| ROL16(X9, X14) |
| PADDD X9, X6 |
| PXOR X6, X3 |
| MOVO X3, X14 |
| PSLLL $0x0c, X14 |
| PSRLL $0x14, X3 |
| PXOR X14, X3 |
| PADDD X3, X0 |
| PXOR X0, X9 |
| ROL8(X9, X14) |
| PADDD X9, X6 |
| PXOR X6, X3 |
| MOVO X3, X14 |
| PSLLL $0x07, X14 |
| PSRLL $0x19, X3 |
| PXOR X14, X3 |
| PADDD X4, X1 |
| PXOR X1, X10 |
| ROL16(X10, X14) |
| PADDD X10, X7 |
| PXOR X7, X4 |
| MOVO X4, X14 |
| PSLLL $0x0c, X14 |
| PSRLL $0x14, X4 |
| PXOR X14, X4 |
| PADDD X4, X1 |
| PXOR X1, X10 |
| ROL8(X10, X14) |
| PADDD X10, X7 |
| PXOR X7, X4 |
| MOVO X4, X14 |
| PSLLL $0x07, X14 |
| PSRLL $0x19, X4 |
| PXOR X14, X4 |
| PADDD X5, X2 |
| PXOR X2, X11 |
| ROL16(X11, X14) |
| PADDD X11, X8 |
| PXOR X8, X5 |
| MOVO X5, X14 |
| PSLLL $0x0c, X14 |
| PSRLL $0x14, X5 |
| PXOR X14, X5 |
| PADDD X5, X2 |
| PXOR X2, X11 |
| ROL8(X11, X14) |
| PADDD X11, X8 |
| PXOR X8, X5 |
| MOVO X5, X14 |
| PSLLL $0x07, X14 |
| PSRLL $0x19, X5 |
| PXOR X14, X5 |
| MOVO 64(BP), X14 |
| MOVO X7, 64(BP) |
| IMULQ R12, R8 |
| ADDQ R10, R15 |
| ADCQ DX, R8 |
| PADDD X13, X12 |
| PXOR X12, X15 |
| ROL16(X15, X7) |
| PADDD X15, X14 |
| PXOR X14, X13 |
| MOVO X13, X7 |
| PSLLL $0x0c, X7 |
| PSRLL $0x14, X13 |
| PXOR X7, X13 |
| PADDD X13, X12 |
| PXOR X12, X15 |
| ROL8(X15, X7) |
| PADDD X15, X14 |
| PXOR X14, X13 |
| MOVO X13, X7 |
| PSLLL $0x07, X7 |
| PSRLL $0x19, X13 |
| PXOR X7, X13 |
| MOVO 64(BP), X7 |
| MOVQ R13, R10 |
| MOVQ R14, R11 |
| MOVQ R15, R12 |
| ANDQ $0x03, R12 |
| MOVQ R15, R13 |
| ANDQ $-4, R13 |
| MOVQ R8, R14 |
| SHRQ $0x02, R8, R15 |
| SHRQ $0x02, R8 |
| ADDQ R13, R10 |
| ADCQ R14, R11 |
| ADCQ $0x00, R12 |
| ADDQ R15, R10 |
| ADCQ R8, R11 |
| ADCQ $0x00, R12 |
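// The BYTE runs below appear to encode the inverse PALIGNR shuffles ($0x0c on X3/X4/X5/X13,
// $0x08 on X6/X7/X8/X14, $0x04 on X9/X10/X11/X15), returning the four states to column form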
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xdb |
| BYTE $0x0c |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xe4 |
| BYTE $0x0c |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xed |
| BYTE $0x0c |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xed |
| BYTE $0x0c |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xf6 |
| BYTE $0x08 |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xff |
| BYTE $0x08 |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xc0 |
| BYTE $0x08 |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xf6 |
| BYTE $0x08 |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xc9 |
| BYTE $0x04 |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xd2 |
| BYTE $0x04 |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xdb |
| BYTE $0x04 |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xff |
| BYTE $0x04 |
| DECQ CX |
| JGE openSSEInternalLoop |
| ADDQ (R9), R10 |
| ADCQ 8(R9), R11 |
| ADCQ $0x01, R12 |
| MOVQ (BP), AX |
| MOVQ AX, R15 |
| MULQ R10 |
| MOVQ AX, R13 |
| MOVQ DX, R14 |
| MOVQ (BP), AX |
| MULQ R11 |
| IMULQ R12, R15 |
| ADDQ AX, R14 |
| ADCQ DX, R15 |
| MOVQ 8(BP), AX |
| MOVQ AX, R8 |
| MULQ R10 |
| ADDQ AX, R14 |
| ADCQ $0x00, DX |
| MOVQ DX, R10 |
| MOVQ 8(BP), AX |
| MULQ R11 |
| ADDQ AX, R15 |
| ADCQ $0x00, DX |
| IMULQ R12, R8 |
| ADDQ R10, R15 |
| ADCQ DX, R8 |
| MOVQ R13, R10 |
| MOVQ R14, R11 |
| MOVQ R15, R12 |
| ANDQ $0x03, R12 |
| MOVQ R15, R13 |
| ANDQ $-4, R13 |
| MOVQ R8, R14 |
| SHRQ $0x02, R8, R15 |
| SHRQ $0x02, R8 |
| ADDQ R13, R10 |
| ADCQ R14, R11 |
| ADCQ $0x00, R12 |
| ADDQ R15, R10 |
| ADCQ R8, R11 |
| ADCQ $0x00, R12 |
| LEAQ 16(R9), R9 |
| CMPQ CX, $-6 |
| JG openSSEInternalLoop |
| |
| // Add in the state |
| PADDD ·chacha20Constants<>+0(SB), X0 |
| PADDD ·chacha20Constants<>+0(SB), X1 |
| PADDD ·chacha20Constants<>+0(SB), X2 |
| PADDD ·chacha20Constants<>+0(SB), X12 |
| PADDD 32(BP), X3 |
| PADDD 32(BP), X4 |
| PADDD 32(BP), X5 |
| PADDD 32(BP), X13 |
| PADDD 48(BP), X6 |
| PADDD 48(BP), X7 |
| PADDD 48(BP), X8 |
| PADDD 48(BP), X14 |
| PADDD 80(BP), X9 |
| PADDD 96(BP), X10 |
| PADDD 112(BP), X11 |
| PADDD 128(BP), X15 |
| |
| // Load - xor - store |
| MOVO X15, 64(BP) |
| MOVOU (SI), X15 |
| PXOR X15, X0 |
| MOVOU X0, (DI) |
| MOVOU 16(SI), X15 |
| PXOR X15, X3 |
| MOVOU X3, 16(DI) |
| MOVOU 32(SI), X15 |
| PXOR X15, X6 |
| MOVOU X6, 32(DI) |
| MOVOU 48(SI), X15 |
| PXOR X15, X9 |
| MOVOU X9, 48(DI) |
| MOVOU 64(SI), X9 |
| PXOR X9, X1 |
| MOVOU X1, 64(DI) |
| MOVOU 80(SI), X9 |
| PXOR X9, X4 |
| MOVOU X4, 80(DI) |
| MOVOU 96(SI), X9 |
| PXOR X9, X7 |
| MOVOU X7, 96(DI) |
| MOVOU 112(SI), X9 |
| PXOR X9, X10 |
| MOVOU X10, 112(DI) |
| MOVOU 128(SI), X9 |
| PXOR X9, X2 |
| MOVOU X2, 128(DI) |
| MOVOU 144(SI), X9 |
| PXOR X9, X5 |
| MOVOU X5, 144(DI) |
| MOVOU 160(SI), X9 |
| PXOR X9, X8 |
| MOVOU X8, 160(DI) |
| MOVOU 176(SI), X9 |
| PXOR X9, X11 |
| MOVOU X11, 176(DI) |
| MOVOU 192(SI), X9 |
| PXOR X9, X12 |
| MOVOU X12, 192(DI) |
| MOVOU 208(SI), X9 |
| PXOR X9, X13 |
| MOVOU X13, 208(DI) |
| MOVOU 224(SI), X9 |
| PXOR X9, X14 |
| MOVOU X14, 224(DI) |
| MOVOU 240(SI), X9 |
| PXOR 64(BP), X9 |
| MOVOU X9, 240(DI) |
| LEAQ 256(SI), SI |
| LEAQ 256(DI), DI |
| SUBQ $0x00000100, BX |
| JMP openSSEMainLoop |
| |
| openSSEMainLoopDone: |
| // Handle the various tail sizes efficiently |
| TESTQ BX, BX |
| JE openSSEFinalize |
| CMPQ BX, $0x40 |
| JBE openSSETail64 |
| CMPQ BX, $0x80 |
| JBE openSSETail128 |
| CMPQ BX, $0xc0 |
| JBE openSSETail192 |
| JMP openSSETail256 |
| |
| openSSEFinalize: |
// Hash in the plaintext (PT) and AAD lengths
| ADDQ ad_len+80(FP), R10 |
| ADCQ src_len+56(FP), R11 |
| ADCQ $0x01, R12 |
| MOVQ (BP), AX |
| MOVQ AX, R15 |
| MULQ R10 |
| MOVQ AX, R13 |
| MOVQ DX, R14 |
| MOVQ (BP), AX |
| MULQ R11 |
| IMULQ R12, R15 |
| ADDQ AX, R14 |
| ADCQ DX, R15 |
| MOVQ 8(BP), AX |
| MOVQ AX, R8 |
| MULQ R10 |
| ADDQ AX, R14 |
| ADCQ $0x00, DX |
| MOVQ DX, R10 |
| MOVQ 8(BP), AX |
| MULQ R11 |
| ADDQ AX, R15 |
| ADCQ $0x00, DX |
| IMULQ R12, R8 |
| ADDQ R10, R15 |
| ADCQ DX, R8 |
| MOVQ R13, R10 |
| MOVQ R14, R11 |
| MOVQ R15, R12 |
| ANDQ $0x03, R12 |
| MOVQ R15, R13 |
| ANDQ $-4, R13 |
| MOVQ R8, R14 |
| SHRQ $0x02, R8, R15 |
| SHRQ $0x02, R8 |
| ADDQ R13, R10 |
| ADCQ R14, R11 |
| ADCQ $0x00, R12 |
| ADDQ R15, R10 |
| ADCQ R8, R11 |
| ADCQ $0x00, R12 |
| |
| // Final reduce |
| MOVQ R10, R13 |
| MOVQ R11, R14 |
| MOVQ R12, R15 |
| SUBQ $-5, R10 |
| SBBQ $-1, R11 |
| SBBQ $0x03, R12 |
| CMOVQCS R13, R10 |
| CMOVQCS R14, R11 |
| CMOVQCS R15, R12 |
| |
| // Add in the "s" part of the key |
| ADDQ 16(BP), R10 |
| ADCQ 24(BP), R11 |
| |
// Finally, constant-time compare to the tag at the end of the message
| XORQ AX, AX |
| MOVQ $0x00000001, DX |
| XORQ (SI), R10 |
| XORQ 8(SI), R11 |
| ORQ R11, R10 |
| CMOVQEQ DX, AX |
| |
| // Return true iff tags are equal |
| MOVB AX, ret+96(FP) |
| RET |
| |
| openSSE128: |
| MOVOU ·chacha20Constants<>+0(SB), X0 |
| MOVOU 16(R8), X3 |
| MOVOU 32(R8), X6 |
| MOVOU 48(R8), X9 |
| MOVO X0, X1 |
| MOVO X3, X4 |
| MOVO X6, X7 |
| MOVO X9, X10 |
| PADDL ·sseIncMask<>+0(SB), X10 |
| MOVO X1, X2 |
| MOVO X4, X5 |
| MOVO X7, X8 |
| MOVO X10, X11 |
| PADDL ·sseIncMask<>+0(SB), X11 |
| MOVO X3, X13 |
| MOVO X6, X14 |
| MOVO X10, X15 |
| MOVQ $0x0000000a, R9 |
| |
| openSSE128InnerCipherLoop: |
| PADDD X3, X0 |
| PXOR X0, X9 |
| ROL16(X9, X12) |
| PADDD X9, X6 |
| PXOR X6, X3 |
| MOVO X3, X12 |
| PSLLL $0x0c, X12 |
| PSRLL $0x14, X3 |
| PXOR X12, X3 |
| PADDD X3, X0 |
| PXOR X0, X9 |
| ROL8(X9, X12) |
| PADDD X9, X6 |
| PXOR X6, X3 |
| MOVO X3, X12 |
| PSLLL $0x07, X12 |
| PSRLL $0x19, X3 |
| PXOR X12, X3 |
| PADDD X4, X1 |
| PXOR X1, X10 |
| ROL16(X10, X12) |
| PADDD X10, X7 |
| PXOR X7, X4 |
| MOVO X4, X12 |
| PSLLL $0x0c, X12 |
| PSRLL $0x14, X4 |
| PXOR X12, X4 |
| PADDD X4, X1 |
| PXOR X1, X10 |
| ROL8(X10, X12) |
| PADDD X10, X7 |
| PXOR X7, X4 |
| MOVO X4, X12 |
| PSLLL $0x07, X12 |
| PSRLL $0x19, X4 |
| PXOR X12, X4 |
| PADDD X5, X2 |
| PXOR X2, X11 |
| ROL16(X11, X12) |
| PADDD X11, X8 |
| PXOR X8, X5 |
| MOVO X5, X12 |
| PSLLL $0x0c, X12 |
| PSRLL $0x14, X5 |
| PXOR X12, X5 |
| PADDD X5, X2 |
| PXOR X2, X11 |
| ROL8(X11, X12) |
| PADDD X11, X8 |
| PXOR X8, X5 |
| MOVO X5, X12 |
| PSLLL $0x07, X12 |
| PSRLL $0x19, X5 |
| PXOR X12, X5 |
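// The BYTE runs below appear to encode PALIGNR $0x04 on X3/X4/X5, $0x08 on X6/X7/X8 and
// $0x0c on X9/X10/X11, diagonalizing the three states; the runs after the second half undo the shuffle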
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xdb |
| BYTE $0x04 |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xe4 |
| BYTE $0x04 |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xed |
| BYTE $0x04 |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xf6 |
| BYTE $0x08 |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xff |
| BYTE $0x08 |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xc0 |
| BYTE $0x08 |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xc9 |
| BYTE $0x0c |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xd2 |
| BYTE $0x0c |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xdb |
| BYTE $0x0c |
| PADDD X3, X0 |
| PXOR X0, X9 |
| ROL16(X9, X12) |
| PADDD X9, X6 |
| PXOR X6, X3 |
| MOVO X3, X12 |
| PSLLL $0x0c, X12 |
| PSRLL $0x14, X3 |
| PXOR X12, X3 |
| PADDD X3, X0 |
| PXOR X0, X9 |
| ROL8(X9, X12) |
| PADDD X9, X6 |
| PXOR X6, X3 |
| MOVO X3, X12 |
| PSLLL $0x07, X12 |
| PSRLL $0x19, X3 |
| PXOR X12, X3 |
| PADDD X4, X1 |
| PXOR X1, X10 |
| ROL16(X10, X12) |
| PADDD X10, X7 |
| PXOR X7, X4 |
| MOVO X4, X12 |
| PSLLL $0x0c, X12 |
| PSRLL $0x14, X4 |
| PXOR X12, X4 |
| PADDD X4, X1 |
| PXOR X1, X10 |
| ROL8(X10, X12) |
| PADDD X10, X7 |
| PXOR X7, X4 |
| MOVO X4, X12 |
| PSLLL $0x07, X12 |
| PSRLL $0x19, X4 |
| PXOR X12, X4 |
| PADDD X5, X2 |
| PXOR X2, X11 |
| ROL16(X11, X12) |
| PADDD X11, X8 |
| PXOR X8, X5 |
| MOVO X5, X12 |
| PSLLL $0x0c, X12 |
| PSRLL $0x14, X5 |
| PXOR X12, X5 |
| PADDD X5, X2 |
| PXOR X2, X11 |
| ROL8(X11, X12) |
| PADDD X11, X8 |
| PXOR X8, X5 |
| MOVO X5, X12 |
| PSLLL $0x07, X12 |
| PSRLL $0x19, X5 |
| PXOR X12, X5 |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xdb |
| BYTE $0x0c |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xe4 |
| BYTE $0x0c |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xed |
| BYTE $0x0c |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xf6 |
| BYTE $0x08 |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xff |
| BYTE $0x08 |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xc0 |
| BYTE $0x08 |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xc9 |
| BYTE $0x04 |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xd2 |
| BYTE $0x04 |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xdb |
| BYTE $0x04 |
| DECQ R9 |
| JNE openSSE128InnerCipherLoop |
| |
// A0|B0 hold the 32-byte Poly1305 key; C0 and D0 can be discarded
| PADDL ·chacha20Constants<>+0(SB), X0 |
| PADDL ·chacha20Constants<>+0(SB), X1 |
| PADDL ·chacha20Constants<>+0(SB), X2 |
| PADDL X13, X3 |
| PADDL X13, X4 |
| PADDL X13, X5 |
| PADDL X14, X7 |
| PADDL X14, X8 |
| PADDL X15, X10 |
| PADDL ·sseIncMask<>+0(SB), X15 |
| PADDL X15, X11 |
| |
| // Clamp and store the key |
| PAND ·polyClampMask<>+0(SB), X0 |
| MOVOU X0, (BP) |
| MOVOU X3, 16(BP) |
| |
| // Hash |
| MOVQ ad_len+80(FP), R9 |
| CALL polyHashADInternal<>(SB) |
| |
| openSSE128Open: |
| CMPQ BX, $0x10 |
| JB openSSETail16 |
| SUBQ $0x10, BX |
| |
| // Load for hashing |
| ADDQ (SI), R10 |
| ADCQ 8(SI), R11 |
| ADCQ $0x01, R12 |
| |
| // Load for decryption |
| MOVOU (SI), X12 |
| PXOR X12, X1 |
| MOVOU X1, (DI) |
| LEAQ 16(SI), SI |
| LEAQ 16(DI), DI |
| MOVQ (BP), AX |
| MOVQ AX, R15 |
| MULQ R10 |
| MOVQ AX, R13 |
| MOVQ DX, R14 |
| MOVQ (BP), AX |
| MULQ R11 |
| IMULQ R12, R15 |
| ADDQ AX, R14 |
| ADCQ DX, R15 |
| MOVQ 8(BP), AX |
| MOVQ AX, R8 |
| MULQ R10 |
| ADDQ AX, R14 |
| ADCQ $0x00, DX |
| MOVQ DX, R10 |
| MOVQ 8(BP), AX |
| MULQ R11 |
| ADDQ AX, R15 |
| ADCQ $0x00, DX |
| IMULQ R12, R8 |
| ADDQ R10, R15 |
| ADCQ DX, R8 |
| MOVQ R13, R10 |
| MOVQ R14, R11 |
| MOVQ R15, R12 |
| ANDQ $0x03, R12 |
| MOVQ R15, R13 |
| ANDQ $-4, R13 |
| MOVQ R8, R14 |
| SHRQ $0x02, R8, R15 |
| SHRQ $0x02, R8 |
| ADDQ R13, R10 |
| ADCQ R14, R11 |
| ADCQ $0x00, R12 |
| ADDQ R15, R10 |
| ADCQ R8, R11 |
| ADCQ $0x00, R12 |
| |
| // Shift the stream "left" |
| MOVO X4, X1 |
| MOVO X7, X4 |
| MOVO X10, X7 |
| MOVO X2, X10 |
| MOVO X5, X2 |
| MOVO X8, X5 |
| MOVO X11, X8 |
| JMP openSSE128Open |
| |
| openSSETail16: |
| TESTQ BX, BX |
| JE openSSEFinalize |
| |
// We can safely load the ciphertext (CT) from the end, because it is padded with the MAC
| MOVQ BX, R9 |
| SHLQ $0x04, R9 |
| LEAQ ·andMask<>+0(SB), R13 |
| MOVOU (SI), X12 |
| ADDQ BX, SI |
| PAND -16(R13)(R9*1), X12 |
| MOVO X12, 64(BP) |
| MOVQ X12, R13 |
| MOVQ 72(BP), R14 |
| PXOR X1, X12 |
| |
| // We can only store one byte at a time, since plaintext can be shorter than 16 bytes |
| openSSETail16Store: |
| MOVQ X12, R8 |
| MOVB R8, (DI) |
| PSRLDQ $0x01, X12 |
| INCQ DI |
| DECQ BX |
| JNE openSSETail16Store |
| ADDQ R13, R10 |
| ADCQ R14, R11 |
| ADCQ $0x01, R12 |
| MOVQ (BP), AX |
| MOVQ AX, R15 |
| MULQ R10 |
| MOVQ AX, R13 |
| MOVQ DX, R14 |
| MOVQ (BP), AX |
| MULQ R11 |
| IMULQ R12, R15 |
| ADDQ AX, R14 |
| ADCQ DX, R15 |
| MOVQ 8(BP), AX |
| MOVQ AX, R8 |
| MULQ R10 |
| ADDQ AX, R14 |
| ADCQ $0x00, DX |
| MOVQ DX, R10 |
| MOVQ 8(BP), AX |
| MULQ R11 |
| ADDQ AX, R15 |
| ADCQ $0x00, DX |
| IMULQ R12, R8 |
| ADDQ R10, R15 |
| ADCQ DX, R8 |
| MOVQ R13, R10 |
| MOVQ R14, R11 |
| MOVQ R15, R12 |
| ANDQ $0x03, R12 |
| MOVQ R15, R13 |
| ANDQ $-4, R13 |
| MOVQ R8, R14 |
| SHRQ $0x02, R8, R15 |
| SHRQ $0x02, R8 |
| ADDQ R13, R10 |
| ADCQ R14, R11 |
| ADCQ $0x00, R12 |
| ADDQ R15, R10 |
| ADCQ R8, R11 |
| ADCQ $0x00, R12 |
| JMP openSSEFinalize |
| |
| openSSETail64: |
| MOVO ·chacha20Constants<>+0(SB), X0 |
| MOVO 32(BP), X3 |
| MOVO 48(BP), X6 |
| MOVO 128(BP), X9 |
| PADDL ·sseIncMask<>+0(SB), X9 |
| MOVO X9, 80(BP) |
| XORQ R9, R9 |
| MOVQ BX, CX |
| CMPQ CX, $0x10 |
| JB openSSETail64LoopB |
| |
| openSSETail64LoopA: |
| ADDQ (SI)(R9*1), R10 |
| ADCQ 8(SI)(R9*1), R11 |
| ADCQ $0x01, R12 |
| MOVQ (BP), AX |
| MOVQ AX, R15 |
| MULQ R10 |
| MOVQ AX, R13 |
| MOVQ DX, R14 |
| MOVQ (BP), AX |
| MULQ R11 |
| IMULQ R12, R15 |
| ADDQ AX, R14 |
| ADCQ DX, R15 |
| MOVQ 8(BP), AX |
| MOVQ AX, R8 |
| MULQ R10 |
| ADDQ AX, R14 |
| ADCQ $0x00, DX |
| MOVQ DX, R10 |
| MOVQ 8(BP), AX |
| MULQ R11 |
| ADDQ AX, R15 |
| ADCQ $0x00, DX |
| IMULQ R12, R8 |
| ADDQ R10, R15 |
| ADCQ DX, R8 |
| MOVQ R13, R10 |
| MOVQ R14, R11 |
| MOVQ R15, R12 |
| ANDQ $0x03, R12 |
| MOVQ R15, R13 |
| ANDQ $-4, R13 |
| MOVQ R8, R14 |
| SHRQ $0x02, R8, R15 |
| SHRQ $0x02, R8 |
| ADDQ R13, R10 |
| ADCQ R14, R11 |
| ADCQ $0x00, R12 |
| ADDQ R15, R10 |
| ADCQ R8, R11 |
| ADCQ $0x00, R12 |
| SUBQ $0x10, CX |
| |
| openSSETail64LoopB: |
| ADDQ $0x10, R9 |
| PADDD X3, X0 |
| PXOR X0, X9 |
| ROL16(X9, X12) |
| PADDD X9, X6 |
| PXOR X6, X3 |
| MOVO X3, X12 |
| PSLLL $0x0c, X12 |
| PSRLL $0x14, X3 |
| PXOR X12, X3 |
| PADDD X3, X0 |
| PXOR X0, X9 |
| ROL8(X9, X12) |
| PADDD X9, X6 |
| PXOR X6, X3 |
| MOVO X3, X12 |
| PSLLL $0x07, X12 |
| PSRLL $0x19, X3 |
| PXOR X12, X3 |
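// These BYTE runs appear to encode PALIGNR $0x04, X3, X3; $0x08, X6, X6; $0x0c, X9, X9
// (diagonalize); the runs after the second half encode the inverse shuffle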
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xdb |
| BYTE $0x04 |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xf6 |
| BYTE $0x08 |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xc9 |
| BYTE $0x0c |
| PADDD X3, X0 |
| PXOR X0, X9 |
| ROL16(X9, X12) |
| PADDD X9, X6 |
| PXOR X6, X3 |
| MOVO X3, X12 |
| PSLLL $0x0c, X12 |
| PSRLL $0x14, X3 |
| PXOR X12, X3 |
| PADDD X3, X0 |
| PXOR X0, X9 |
| ROL8(X9, X12) |
| PADDD X9, X6 |
| PXOR X6, X3 |
| MOVO X3, X12 |
| PSLLL $0x07, X12 |
| PSRLL $0x19, X3 |
| PXOR X12, X3 |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xdb |
| BYTE $0x0c |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xf6 |
| BYTE $0x08 |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xc9 |
| BYTE $0x04 |
| CMPQ CX, $0x10 |
| JAE openSSETail64LoopA |
| CMPQ R9, $0xa0 |
| JNE openSSETail64LoopB |
| PADDL ·chacha20Constants<>+0(SB), X0 |
| PADDL 32(BP), X3 |
| PADDL 48(BP), X6 |
| PADDL 80(BP), X9 |
| |
| openSSETail64DecLoop: |
| CMPQ BX, $0x10 |
| JB openSSETail64DecLoopDone |
| SUBQ $0x10, BX |
| MOVOU (SI), X12 |
| PXOR X12, X0 |
| MOVOU X0, (DI) |
| LEAQ 16(SI), SI |
| LEAQ 16(DI), DI |
| MOVO X3, X0 |
| MOVO X6, X3 |
| MOVO X9, X6 |
| JMP openSSETail64DecLoop |
| |
| openSSETail64DecLoopDone: |
| MOVO X0, X1 |
| JMP openSSETail16 |
| |
| openSSETail128: |
| MOVO ·chacha20Constants<>+0(SB), X1 |
| MOVO 32(BP), X4 |
| MOVO 48(BP), X7 |
| MOVO 128(BP), X10 |
| PADDL ·sseIncMask<>+0(SB), X10 |
| MOVO X10, 80(BP) |
| MOVO X1, X0 |
| MOVO X4, X3 |
| MOVO X7, X6 |
| MOVO X10, X9 |
| PADDL ·sseIncMask<>+0(SB), X9 |
| MOVO X9, 96(BP) |
| XORQ R9, R9 |
| MOVQ BX, CX |
| ANDQ $-16, CX |
| |
| openSSETail128LoopA: |
| ADDQ (SI)(R9*1), R10 |
| ADCQ 8(SI)(R9*1), R11 |
| ADCQ $0x01, R12 |
| MOVQ (BP), AX |
| MOVQ AX, R15 |
| MULQ R10 |
| MOVQ AX, R13 |
| MOVQ DX, R14 |
| MOVQ (BP), AX |
| MULQ R11 |
| IMULQ R12, R15 |
| ADDQ AX, R14 |
| ADCQ DX, R15 |
| MOVQ 8(BP), AX |
| MOVQ AX, R8 |
| MULQ R10 |
| ADDQ AX, R14 |
| ADCQ $0x00, DX |
| MOVQ DX, R10 |
| MOVQ 8(BP), AX |
| MULQ R11 |
| ADDQ AX, R15 |
| ADCQ $0x00, DX |
| IMULQ R12, R8 |
| ADDQ R10, R15 |
| ADCQ DX, R8 |
| MOVQ R13, R10 |
| MOVQ R14, R11 |
| MOVQ R15, R12 |
| ANDQ $0x03, R12 |
| MOVQ R15, R13 |
| ANDQ $-4, R13 |
| MOVQ R8, R14 |
| SHRQ $0x02, R8, R15 |
| SHRQ $0x02, R8 |
| ADDQ R13, R10 |
| ADCQ R14, R11 |
| ADCQ $0x00, R12 |
| ADDQ R15, R10 |
| ADCQ R8, R11 |
| ADCQ $0x00, R12 |
| |
| openSSETail128LoopB: |
| ADDQ $0x10, R9 |
| PADDD X3, X0 |
| PXOR X0, X9 |
| ROL16(X9, X12) |
| PADDD X9, X6 |
| PXOR X6, X3 |
| MOVO X3, X12 |
| PSLLL $0x0c, X12 |
| PSRLL $0x14, X3 |
| PXOR X12, X3 |
| PADDD X3, X0 |
| PXOR X0, X9 |
| ROL8(X9, X12) |
| PADDD X9, X6 |
| PXOR X6, X3 |
| MOVO X3, X12 |
| PSLLL $0x07, X12 |
| PSRLL $0x19, X3 |
| PXOR X12, X3 |
| PADDD X4, X1 |
| PXOR X1, X10 |
| ROL16(X10, X12) |
| PADDD X10, X7 |
| PXOR X7, X4 |
| MOVO X4, X12 |
| PSLLL $0x0c, X12 |
| PSRLL $0x14, X4 |
| PXOR X12, X4 |
| PADDD X4, X1 |
| PXOR X1, X10 |
| ROL8(X10, X12) |
| PADDD X10, X7 |
| PXOR X7, X4 |
| MOVO X4, X12 |
| PSLLL $0x07, X12 |
| PSRLL $0x19, X4 |
| PXOR X12, X4 |
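// These BYTE runs appear to encode the PALIGNR diagonalization of both states: $0x04 on X3/X4,
// $0x08 on X6/X7, $0x0c on X9/X10; the runs after the second half encode the inverse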
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xdb |
| BYTE $0x04 |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xf6 |
| BYTE $0x08 |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xc9 |
| BYTE $0x0c |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xe4 |
| BYTE $0x04 |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xff |
| BYTE $0x08 |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xd2 |
| BYTE $0x0c |
| PADDD X3, X0 |
| PXOR X0, X9 |
| ROL16(X9, X12) |
| PADDD X9, X6 |
| PXOR X6, X3 |
| MOVO X3, X12 |
| PSLLL $0x0c, X12 |
| PSRLL $0x14, X3 |
| PXOR X12, X3 |
| PADDD X3, X0 |
| PXOR X0, X9 |
| ROL8(X9, X12) |
| PADDD X9, X6 |
| PXOR X6, X3 |
| MOVO X3, X12 |
| PSLLL $0x07, X12 |
| PSRLL $0x19, X3 |
| PXOR X12, X3 |
| PADDD X4, X1 |
| PXOR X1, X10 |
| ROL16(X10, X12) |
| PADDD X10, X7 |
| PXOR X7, X4 |
| MOVO X4, X12 |
| PSLLL $0x0c, X12 |
| PSRLL $0x14, X4 |
| PXOR X12, X4 |
| PADDD X4, X1 |
| PXOR X1, X10 |
| ROL8(X10, X12) |
| PADDD X10, X7 |
| PXOR X7, X4 |
| MOVO X4, X12 |
| PSLLL $0x07, X12 |
| PSRLL $0x19, X4 |
| PXOR X12, X4 |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xdb |
| BYTE $0x0c |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xf6 |
| BYTE $0x08 |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xc9 |
| BYTE $0x04 |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xe4 |
| BYTE $0x0c |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xff |
| BYTE $0x08 |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xd2 |
| BYTE $0x04 |
| CMPQ R9, CX |
| JB openSSETail128LoopA |
| CMPQ R9, $0xa0 |
| JNE openSSETail128LoopB |
| PADDL ·chacha20Constants<>+0(SB), X0 |
| PADDL ·chacha20Constants<>+0(SB), X1 |
| PADDL 32(BP), X3 |
| PADDL 32(BP), X4 |
| PADDL 48(BP), X6 |
| PADDL 48(BP), X7 |
| PADDL 96(BP), X9 |
| PADDL 80(BP), X10 |
| MOVOU (SI), X12 |
| MOVOU 16(SI), X13 |
| MOVOU 32(SI), X14 |
| MOVOU 48(SI), X15 |
| PXOR X12, X1 |
| PXOR X13, X4 |
| PXOR X14, X7 |
| PXOR X15, X10 |
| MOVOU X1, (DI) |
| MOVOU X4, 16(DI) |
| MOVOU X7, 32(DI) |
| MOVOU X10, 48(DI) |
| SUBQ $0x40, BX |
| LEAQ 64(SI), SI |
| LEAQ 64(DI), DI |
| JMP openSSETail64DecLoop |
| |
| openSSETail192: |
| MOVO ·chacha20Constants<>+0(SB), X2 |
| MOVO 32(BP), X5 |
| MOVO 48(BP), X8 |
| MOVO 128(BP), X11 |
| PADDL ·sseIncMask<>+0(SB), X11 |
| MOVO X11, 80(BP) |
| MOVO X2, X1 |
| MOVO X5, X4 |
| MOVO X8, X7 |
| MOVO X11, X10 |
| PADDL ·sseIncMask<>+0(SB), X10 |
| MOVO X10, 96(BP) |
| MOVO X1, X0 |
| MOVO X4, X3 |
| MOVO X7, X6 |
| MOVO X10, X9 |
| PADDL ·sseIncMask<>+0(SB), X9 |
| MOVO X9, 112(BP) |
| MOVQ BX, CX |
| MOVQ $0x000000a0, R9 |
| CMPQ CX, $0xa0 |
| CMOVQGT R9, CX |
| ANDQ $-16, CX |
| XORQ R9, R9 |
| |
| openSSLTail192LoopA: |
| ADDQ (SI)(R9*1), R10 |
| ADCQ 8(SI)(R9*1), R11 |
| ADCQ $0x01, R12 |
| MOVQ (BP), AX |
| MOVQ AX, R15 |
| MULQ R10 |
| MOVQ AX, R13 |
| MOVQ DX, R14 |
| MOVQ (BP), AX |
| MULQ R11 |
| IMULQ R12, R15 |
| ADDQ AX, R14 |
| ADCQ DX, R15 |
| MOVQ 8(BP), AX |
| MOVQ AX, R8 |
| MULQ R10 |
| ADDQ AX, R14 |
| ADCQ $0x00, DX |
| MOVQ DX, R10 |
| MOVQ 8(BP), AX |
| MULQ R11 |
| ADDQ AX, R15 |
| ADCQ $0x00, DX |
| IMULQ R12, R8 |
| ADDQ R10, R15 |
| ADCQ DX, R8 |
| MOVQ R13, R10 |
| MOVQ R14, R11 |
| MOVQ R15, R12 |
| ANDQ $0x03, R12 |
| MOVQ R15, R13 |
| ANDQ $-4, R13 |
| MOVQ R8, R14 |
| SHRQ $0x02, R8, R15 |
| SHRQ $0x02, R8 |
| ADDQ R13, R10 |
| ADCQ R14, R11 |
| ADCQ $0x00, R12 |
| ADDQ R15, R10 |
| ADCQ R8, R11 |
| ADCQ $0x00, R12 |
| |
| openSSLTail192LoopB: |
| ADDQ $0x10, R9 |
| PADDD X3, X0 |
| PXOR X0, X9 |
| ROL16(X9, X12) |
| PADDD X9, X6 |
| PXOR X6, X3 |
| MOVO X3, X12 |
| PSLLL $0x0c, X12 |
| PSRLL $0x14, X3 |
| PXOR X12, X3 |
| PADDD X3, X0 |
| PXOR X0, X9 |
| ROL8(X9, X12) |
| PADDD X9, X6 |
| PXOR X6, X3 |
| MOVO X3, X12 |
| PSLLL $0x07, X12 |
| PSRLL $0x19, X3 |
| PXOR X12, X3 |
| PADDD X4, X1 |
| PXOR X1, X10 |
| ROL16(X10, X12) |
| PADDD X10, X7 |
| PXOR X7, X4 |
| MOVO X4, X12 |
| PSLLL $0x0c, X12 |
| PSRLL $0x14, X4 |
| PXOR X12, X4 |
| PADDD X4, X1 |
| PXOR X1, X10 |
| ROL8(X10, X12) |
| PADDD X10, X7 |
| PXOR X7, X4 |
| MOVO X4, X12 |
| PSLLL $0x07, X12 |
| PSRLL $0x19, X4 |
| PXOR X12, X4 |
| PADDD X5, X2 |
| PXOR X2, X11 |
| ROL16(X11, X12) |
| PADDD X11, X8 |
| PXOR X8, X5 |
| MOVO X5, X12 |
| PSLLL $0x0c, X12 |
| PSRLL $0x14, X5 |
| PXOR X12, X5 |
| PADDD X5, X2 |
| PXOR X2, X11 |
| ROL8(X11, X12) |
| PADDD X11, X8 |
| PXOR X8, X5 |
| MOVO X5, X12 |
| PSLLL $0x07, X12 |
| PSRLL $0x19, X5 |
| PXOR X12, X5 |
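// These BYTE runs appear to encode the PALIGNR diagonalization of the three states: $0x04 on
// X3/X4/X5, $0x08 on X6/X7/X8, $0x0c on X9/X10/X11; the runs after the second half encode the inverse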
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xdb |
| BYTE $0x04 |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xf6 |
| BYTE $0x08 |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xc9 |
| BYTE $0x0c |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xe4 |
| BYTE $0x04 |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xff |
| BYTE $0x08 |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xd2 |
| BYTE $0x0c |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xed |
| BYTE $0x04 |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xc0 |
| BYTE $0x08 |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xdb |
| BYTE $0x0c |
| PADDD X3, X0 |
| PXOR X0, X9 |
| ROL16(X9, X12) |
| PADDD X9, X6 |
| PXOR X6, X3 |
| MOVO X3, X12 |
| PSLLL $0x0c, X12 |
| PSRLL $0x14, X3 |
| PXOR X12, X3 |
| PADDD X3, X0 |
| PXOR X0, X9 |
| ROL8(X9, X12) |
| PADDD X9, X6 |
| PXOR X6, X3 |
| MOVO X3, X12 |
| PSLLL $0x07, X12 |
| PSRLL $0x19, X3 |
| PXOR X12, X3 |
| PADDD X4, X1 |
| PXOR X1, X10 |
| ROL16(X10, X12) |
| PADDD X10, X7 |
| PXOR X7, X4 |
| MOVO X4, X12 |
| PSLLL $0x0c, X12 |
| PSRLL $0x14, X4 |
| PXOR X12, X4 |
| PADDD X4, X1 |
| PXOR X1, X10 |
| ROL8(X10, X12) |
| PADDD X10, X7 |
| PXOR X7, X4 |
| MOVO X4, X12 |
| PSLLL $0x07, X12 |
| PSRLL $0x19, X4 |
| PXOR X12, X4 |
| PADDD X5, X2 |
| PXOR X2, X11 |
| ROL16(X11, X12) |
| PADDD X11, X8 |
| PXOR X8, X5 |
| MOVO X5, X12 |
| PSLLL $0x0c, X12 |
| PSRLL $0x14, X5 |
| PXOR X12, X5 |
| PADDD X5, X2 |
| PXOR X2, X11 |
| ROL8(X11, X12) |
| PADDD X11, X8 |
| PXOR X8, X5 |
| MOVO X5, X12 |
| PSLLL $0x07, X12 |
| PSRLL $0x19, X5 |
| PXOR X12, X5 |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xdb |
| BYTE $0x0c |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xf6 |
| BYTE $0x08 |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xc9 |
| BYTE $0x04 |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xe4 |
| BYTE $0x0c |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xff |
| BYTE $0x08 |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xd2 |
| BYTE $0x04 |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xed |
| BYTE $0x0c |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xc0 |
| BYTE $0x08 |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xdb |
| BYTE $0x04 |
| CMPQ R9, CX |
| JB openSSLTail192LoopA |
| CMPQ R9, $0xa0 |
| JNE openSSLTail192LoopB |
| CMPQ BX, $0xb0 |
| JB openSSLTail192Store |
| ADDQ 160(SI), R10 |
| ADCQ 168(SI), R11 |
| ADCQ $0x01, R12 |
| MOVQ (BP), AX |
| MOVQ AX, R15 |
| MULQ R10 |
| MOVQ AX, R13 |
| MOVQ DX, R14 |
| MOVQ (BP), AX |
| MULQ R11 |
| IMULQ R12, R15 |
| ADDQ AX, R14 |
| ADCQ DX, R15 |
| MOVQ 8(BP), AX |
| MOVQ AX, R8 |
| MULQ R10 |
| ADDQ AX, R14 |
| ADCQ $0x00, DX |
| MOVQ DX, R10 |
| MOVQ 8(BP), AX |
| MULQ R11 |
| ADDQ AX, R15 |
| ADCQ $0x00, DX |
| IMULQ R12, R8 |
| ADDQ R10, R15 |
| ADCQ DX, R8 |
| MOVQ R13, R10 |
| MOVQ R14, R11 |
| MOVQ R15, R12 |
| ANDQ $0x03, R12 |
| MOVQ R15, R13 |
| ANDQ $-4, R13 |
| MOVQ R8, R14 |
| SHRQ $0x02, R8, R15 |
| SHRQ $0x02, R8 |
| ADDQ R13, R10 |
| ADCQ R14, R11 |
| ADCQ $0x00, R12 |
| ADDQ R15, R10 |
| ADCQ R8, R11 |
| ADCQ $0x00, R12 |
| CMPQ BX, $0xc0 |
| JB openSSLTail192Store |
| ADDQ 176(SI), R10 |
| ADCQ 184(SI), R11 |
| ADCQ $0x01, R12 |
| MOVQ (BP), AX |
| MOVQ AX, R15 |
| MULQ R10 |
| MOVQ AX, R13 |
| MOVQ DX, R14 |
| MOVQ (BP), AX |
| MULQ R11 |
| IMULQ R12, R15 |
| ADDQ AX, R14 |
| ADCQ DX, R15 |
| MOVQ 8(BP), AX |
| MOVQ AX, R8 |
| MULQ R10 |
| ADDQ AX, R14 |
| ADCQ $0x00, DX |
| MOVQ DX, R10 |
| MOVQ 8(BP), AX |
| MULQ R11 |
| ADDQ AX, R15 |
| ADCQ $0x00, DX |
| IMULQ R12, R8 |
| ADDQ R10, R15 |
| ADCQ DX, R8 |
| MOVQ R13, R10 |
| MOVQ R14, R11 |
| MOVQ R15, R12 |
| ANDQ $0x03, R12 |
| MOVQ R15, R13 |
| ANDQ $-4, R13 |
| MOVQ R8, R14 |
| SHRQ $0x02, R8, R15 |
| SHRQ $0x02, R8 |
| ADDQ R13, R10 |
| ADCQ R14, R11 |
| ADCQ $0x00, R12 |
| ADDQ R15, R10 |
| ADCQ R8, R11 |
| ADCQ $0x00, R12 |
| |
| openSSLTail192Store: |
| PADDL ·chacha20Constants<>+0(SB), X0 |
| PADDL ·chacha20Constants<>+0(SB), X1 |
| PADDL ·chacha20Constants<>+0(SB), X2 |
| PADDL 32(BP), X3 |
| PADDL 32(BP), X4 |
| PADDL 32(BP), X5 |
| PADDL 48(BP), X6 |
| PADDL 48(BP), X7 |
| PADDL 48(BP), X8 |
| PADDL 112(BP), X9 |
| PADDL 96(BP), X10 |
| PADDL 80(BP), X11 |
| MOVOU (SI), X12 |
| MOVOU 16(SI), X13 |
| MOVOU 32(SI), X14 |
| MOVOU 48(SI), X15 |
| PXOR X12, X2 |
| PXOR X13, X5 |
| PXOR X14, X8 |
| PXOR X15, X11 |
| MOVOU X2, (DI) |
| MOVOU X5, 16(DI) |
| MOVOU X8, 32(DI) |
| MOVOU X11, 48(DI) |
| MOVOU 64(SI), X12 |
| MOVOU 80(SI), X13 |
| MOVOU 96(SI), X14 |
| MOVOU 112(SI), X15 |
| PXOR X12, X1 |
| PXOR X13, X4 |
| PXOR X14, X7 |
| PXOR X15, X10 |
| MOVOU X1, 64(DI) |
| MOVOU X4, 80(DI) |
| MOVOU X7, 96(DI) |
| MOVOU X10, 112(DI) |
| SUBQ $0x80, BX |
| LEAQ 128(SI), SI |
| LEAQ 128(DI), DI |
| JMP openSSETail64DecLoop |
| |
| openSSETail256: |
| MOVO ·chacha20Constants<>+0(SB), X0 |
| MOVO 32(BP), X3 |
| MOVO 48(BP), X6 |
| MOVO 128(BP), X9 |
| PADDL ·sseIncMask<>+0(SB), X9 |
| MOVO X0, X1 |
| MOVO X3, X4 |
| MOVO X6, X7 |
| MOVO X9, X10 |
| PADDL ·sseIncMask<>+0(SB), X10 |
| MOVO X1, X2 |
| MOVO X4, X5 |
| MOVO X7, X8 |
| MOVO X10, X11 |
| PADDL ·sseIncMask<>+0(SB), X11 |
| MOVO X2, X12 |
| MOVO X5, X13 |
| MOVO X8, X14 |
| MOVO X11, X15 |
| PADDL ·sseIncMask<>+0(SB), X15 |
| |
| // Store counters |
| MOVO X9, 80(BP) |
| MOVO X10, 96(BP) |
| MOVO X11, 112(BP) |
| MOVO X15, 128(BP) |
| XORQ R9, R9 |
| |
| openSSETail256Loop: |
| ADDQ (SI)(R9*1), R10 |
| ADCQ 8(SI)(R9*1), R11 |
| ADCQ $0x01, R12 |
| MOVO X14, 64(BP) |
| PADDD X3, X0 |
| PXOR X0, X9 |
| ROL16(X9, X14) |
| PADDD X9, X6 |
| PXOR X6, X3 |
| MOVO X3, X14 |
| PSLLL $0x0c, X14 |
| PSRLL $0x14, X3 |
| PXOR X14, X3 |
| PADDD X3, X0 |
| PXOR X0, X9 |
| ROL8(X9, X14) |
| PADDD X9, X6 |
| PXOR X6, X3 |
| MOVO X3, X14 |
| PSLLL $0x07, X14 |
| PSRLL $0x19, X3 |
| PXOR X14, X3 |
| PADDD X4, X1 |
| PXOR X1, X10 |
| ROL16(X10, X14) |
| PADDD X10, X7 |
| PXOR X7, X4 |
| MOVO X4, X14 |
| PSLLL $0x0c, X14 |
| PSRLL $0x14, X4 |
| PXOR X14, X4 |
| PADDD X4, X1 |
| PXOR X1, X10 |
| ROL8(X10, X14) |
| PADDD X10, X7 |
| PXOR X7, X4 |
| MOVO X4, X14 |
| PSLLL $0x07, X14 |
| PSRLL $0x19, X4 |
| PXOR X14, X4 |
| PADDD X5, X2 |
| PXOR X2, X11 |
| ROL16(X11, X14) |
| PADDD X11, X8 |
| PXOR X8, X5 |
| MOVO X5, X14 |
| PSLLL $0x0c, X14 |
| PSRLL $0x14, X5 |
| PXOR X14, X5 |
| PADDD X5, X2 |
| PXOR X2, X11 |
| ROL8(X11, X14) |
| PADDD X11, X8 |
| PXOR X8, X5 |
| MOVO X5, X14 |
| PSLLL $0x07, X14 |
| PSRLL $0x19, X5 |
| PXOR X14, X5 |
| MOVO 64(BP), X14 |
| MOVO X7, 64(BP) |
| PADDD X13, X12 |
| PXOR X12, X15 |
| ROL16(X15, X7) |
| PADDD X15, X14 |
| PXOR X14, X13 |
| MOVO X13, X7 |
| PSLLL $0x0c, X7 |
| PSRLL $0x14, X13 |
| PXOR X7, X13 |
| PADDD X13, X12 |
| PXOR X12, X15 |
| ROL8(X15, X7) |
| PADDD X15, X14 |
| PXOR X14, X13 |
| MOVO X13, X7 |
| PSLLL $0x07, X7 |
| PSRLL $0x19, X13 |
| PXOR X7, X13 |
| MOVO 64(BP), X7 |
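// The BYTE runs below appear to encode the PALIGNR diagonalization of all four states: $0x04 on
// X3/X4/X5/X13, $0x08 on X6/X7/X8/X14, $0x0c on X9/X10/X11/X15; the later runs encode the inverse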
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xdb |
| BYTE $0x04 |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xe4 |
| BYTE $0x04 |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xed |
| BYTE $0x04 |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xed |
| BYTE $0x04 |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xf6 |
| BYTE $0x08 |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xff |
| BYTE $0x08 |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xc0 |
| BYTE $0x08 |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xf6 |
| BYTE $0x08 |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xc9 |
| BYTE $0x0c |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xd2 |
| BYTE $0x0c |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xdb |
| BYTE $0x0c |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xff |
| BYTE $0x0c |
| MOVQ (BP), AX |
| MOVQ AX, R15 |
| MULQ R10 |
| MOVQ AX, R13 |
| MOVQ DX, R14 |
| MOVQ (BP), AX |
| MULQ R11 |
| IMULQ R12, R15 |
| ADDQ AX, R14 |
| ADCQ DX, R15 |
| MOVQ 8(BP), AX |
| MOVQ AX, R8 |
| MULQ R10 |
| ADDQ AX, R14 |
| ADCQ $0x00, DX |
| MOVQ DX, R10 |
| MOVQ 8(BP), AX |
| MULQ R11 |
| ADDQ AX, R15 |
| ADCQ $0x00, DX |
| MOVO X14, 64(BP) |
| PADDD X3, X0 |
| PXOR X0, X9 |
| ROL16(X9, X14) |
| PADDD X9, X6 |
| PXOR X6, X3 |
| MOVO X3, X14 |
| PSLLL $0x0c, X14 |
| PSRLL $0x14, X3 |
| PXOR X14, X3 |
| PADDD X3, X0 |
| PXOR X0, X9 |
| ROL8(X9, X14) |
| PADDD X9, X6 |
| PXOR X6, X3 |
| MOVO X3, X14 |
| PSLLL $0x07, X14 |
| PSRLL $0x19, X3 |
| PXOR X14, X3 |
| PADDD X4, X1 |
| PXOR X1, X10 |
| ROL16(X10, X14) |
| PADDD X10, X7 |
| PXOR X7, X4 |
| MOVO X4, X14 |
| PSLLL $0x0c, X14 |
| PSRLL $0x14, X4 |
| PXOR X14, X4 |
| PADDD X4, X1 |
| PXOR X1, X10 |
| ROL8(X10, X14) |
| PADDD X10, X7 |
| PXOR X7, X4 |
| MOVO X4, X14 |
| PSLLL $0x07, X14 |
| PSRLL $0x19, X4 |
| PXOR X14, X4 |
| PADDD X5, X2 |
| PXOR X2, X11 |
| ROL16(X11, X14) |
| PADDD X11, X8 |
| PXOR X8, X5 |
| MOVO X5, X14 |
| PSLLL $0x0c, X14 |
| PSRLL $0x14, X5 |
| PXOR X14, X5 |
| PADDD X5, X2 |
| PXOR X2, X11 |
| ROL8(X11, X14) |
| PADDD X11, X8 |
| PXOR X8, X5 |
| MOVO X5, X14 |
| PSLLL $0x07, X14 |
| PSRLL $0x19, X5 |
| PXOR X14, X5 |
| MOVO 64(BP), X14 |
| MOVO X7, 64(BP) |
| PADDD X13, X12 |
| PXOR X12, X15 |
| ROL16(X15, X7) |
| PADDD X15, X14 |
| PXOR X14, X13 |
| MOVO X13, X7 |
| PSLLL $0x0c, X7 |
| PSRLL $0x14, X13 |
| PXOR X7, X13 |
| PADDD X13, X12 |
| PXOR X12, X15 |
| ROL8(X15, X7) |
| PADDD X15, X14 |
| PXOR X14, X13 |
| MOVO X13, X7 |
| PSLLL $0x07, X7 |
| PSRLL $0x19, X13 |
| PXOR X7, X13 |
| MOVO 64(BP), X7 |
| IMULQ R12, R8 |
| ADDQ R10, R15 |
| ADCQ DX, R8 |
| MOVQ R13, R10 |
| MOVQ R14, R11 |
| MOVQ R15, R12 |
| ANDQ $0x03, R12 |
| MOVQ R15, R13 |
| ANDQ $-4, R13 |
| MOVQ R8, R14 |
| SHRQ $0x02, R8, R15 |
| SHRQ $0x02, R8 |
| ADDQ R13, R10 |
| ADCQ R14, R11 |
| ADCQ $0x00, R12 |
| ADDQ R15, R10 |
| ADCQ R8, R11 |
| ADCQ $0x00, R12 |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xdb |
| BYTE $0x0c |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xe4 |
| BYTE $0x0c |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xed |
| BYTE $0x0c |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xed |
| BYTE $0x0c |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xf6 |
| BYTE $0x08 |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xff |
| BYTE $0x08 |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xc0 |
| BYTE $0x08 |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xf6 |
| BYTE $0x08 |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xc9 |
| BYTE $0x04 |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xd2 |
| BYTE $0x04 |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xdb |
| BYTE $0x04 |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xff |
| BYTE $0x04 |
| ADDQ $0x10, R9 |
| CMPQ R9, $0xa0 |
| JB openSSETail256Loop |
| MOVQ BX, CX |
| ANDQ $-16, CX |
| |
| openSSETail256HashLoop: |
| ADDQ (SI)(R9*1), R10 |
| ADCQ 8(SI)(R9*1), R11 |
| ADCQ $0x01, R12 |
| MOVQ (BP), AX |
| MOVQ AX, R15 |
| MULQ R10 |
| MOVQ AX, R13 |
| MOVQ DX, R14 |
| MOVQ (BP), AX |
| MULQ R11 |
| IMULQ R12, R15 |
| ADDQ AX, R14 |
| ADCQ DX, R15 |
| MOVQ 8(BP), AX |
| MOVQ AX, R8 |
| MULQ R10 |
| ADDQ AX, R14 |
| ADCQ $0x00, DX |
| MOVQ DX, R10 |
| MOVQ 8(BP), AX |
| MULQ R11 |
| ADDQ AX, R15 |
| ADCQ $0x00, DX |
| IMULQ R12, R8 |
| ADDQ R10, R15 |
| ADCQ DX, R8 |
| MOVQ R13, R10 |
| MOVQ R14, R11 |
| MOVQ R15, R12 |
| ANDQ $0x03, R12 |
| MOVQ R15, R13 |
| ANDQ $-4, R13 |
| MOVQ R8, R14 |
| SHRQ $0x02, R8, R15 |
| SHRQ $0x02, R8 |
| ADDQ R13, R10 |
| ADCQ R14, R11 |
| ADCQ $0x00, R12 |
| ADDQ R15, R10 |
| ADCQ R8, R11 |
| ADCQ $0x00, R12 |
| ADDQ $0x10, R9 |
| CMPQ R9, CX |
| JB openSSETail256HashLoop |
| |
| // Add in the state |
| PADDD ·chacha20Constants<>+0(SB), X0 |
| PADDD ·chacha20Constants<>+0(SB), X1 |
| PADDD ·chacha20Constants<>+0(SB), X2 |
| PADDD ·chacha20Constants<>+0(SB), X12 |
| PADDD 32(BP), X3 |
| PADDD 32(BP), X4 |
| PADDD 32(BP), X5 |
| PADDD 32(BP), X13 |
| PADDD 48(BP), X6 |
| PADDD 48(BP), X7 |
| PADDD 48(BP), X8 |
| PADDD 48(BP), X14 |
| PADDD 80(BP), X9 |
| PADDD 96(BP), X10 |
| PADDD 112(BP), X11 |
| PADDD 128(BP), X15 |
| MOVO X15, 64(BP) |
| |
| // Load - xor - store |
| MOVOU (SI), X15 |
| PXOR X15, X0 |
| MOVOU 16(SI), X15 |
| PXOR X15, X3 |
| MOVOU 32(SI), X15 |
| PXOR X15, X6 |
| MOVOU 48(SI), X15 |
| PXOR X15, X9 |
| MOVOU X0, (DI) |
| MOVOU X3, 16(DI) |
| MOVOU X6, 32(DI) |
| MOVOU X9, 48(DI) |
| MOVOU 64(SI), X0 |
| MOVOU 80(SI), X3 |
| MOVOU 96(SI), X6 |
| MOVOU 112(SI), X9 |
| PXOR X0, X1 |
| PXOR X3, X4 |
| PXOR X6, X7 |
| PXOR X9, X10 |
| MOVOU X1, 64(DI) |
| MOVOU X4, 80(DI) |
| MOVOU X7, 96(DI) |
| MOVOU X10, 112(DI) |
| MOVOU 128(SI), X0 |
| MOVOU 144(SI), X3 |
| MOVOU 160(SI), X6 |
| MOVOU 176(SI), X9 |
| PXOR X0, X2 |
| PXOR X3, X5 |
| PXOR X6, X8 |
| PXOR X9, X11 |
| MOVOU X2, 128(DI) |
| MOVOU X5, 144(DI) |
| MOVOU X8, 160(DI) |
| MOVOU X11, 176(DI) |
| LEAQ 192(SI), SI |
| LEAQ 192(DI), DI |
| SUBQ $0xc0, BX |
| MOVO X12, X0 |
| MOVO X13, X3 |
| MOVO X14, X6 |
| MOVO 64(BP), X9 |
| JMP openSSETail64DecLoop |
| |
| chacha20Poly1305Open_AVX2: |
| VZEROUPPER |
| VMOVDQU ·chacha20Constants<>+0(SB), Y0 |
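// The three BYTE runs below appear to encode VBROADCASTI128 16(R8), Y14; VBROADCASTI128 32(R8), Y12;
// VBROADCASTI128 48(R8), Y4, broadcasting the key and counter/nonce rows into both 128-bit lanes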
| BYTE $0xc4 |
| BYTE $0x42 |
| BYTE $0x7d |
| BYTE $0x5a |
| BYTE $0x70 |
| BYTE $0x10 |
| BYTE $0xc4 |
| BYTE $0x42 |
| BYTE $0x7d |
| BYTE $0x5a |
| BYTE $0x60 |
| BYTE $0x20 |
| BYTE $0xc4 |
| BYTE $0xc2 |
| BYTE $0x7d |
| BYTE $0x5a |
| BYTE $0x60 |
| BYTE $0x30 |
| VPADDD ·avx2InitMask<>+0(SB), Y4, Y4 |
| |
// Special optimization for very short buffers
| CMPQ BX, $0xc0 |
| JBE openAVX2192 |
| CMPQ BX, $0x00000140 |
| JBE openAVX2320 |
| |
// For the general case, prepare the poly key first; as a byproduct we get 64 bytes of cipher stream
| VMOVDQA Y14, 32(BP) |
| VMOVDQA Y12, 64(BP) |
| VMOVDQA Y4, 192(BP) |
| MOVQ $0x0000000a, R9 |
| |
| openAVX2PreparePolyKey: |
| VPADDD Y14, Y0, Y0 |
| VPXOR Y0, Y4, Y4 |
| VPSHUFB ·rol16<>+0(SB), Y4, Y4 |
| VPADDD Y4, Y12, Y12 |
| VPXOR Y12, Y14, Y14 |
| VPSLLD $0x0c, Y14, Y3 |
| VPSRLD $0x14, Y14, Y14 |
| VPXOR Y3, Y14, Y14 |
| VPADDD Y14, Y0, Y0 |
| VPXOR Y0, Y4, Y4 |
| VPSHUFB ·rol8<>+0(SB), Y4, Y4 |
| VPADDD Y4, Y12, Y12 |
| VPXOR Y12, Y14, Y14 |
| VPSLLD $0x07, Y14, Y3 |
| VPSRLD $0x19, Y14, Y14 |
| VPXOR Y3, Y14, Y14 |
| VPALIGNR $0x04, Y14, Y14, Y14 |
| VPALIGNR $0x08, Y12, Y12, Y12 |
| VPALIGNR $0x0c, Y4, Y4, Y4 |
| VPADDD Y14, Y0, Y0 |
| VPXOR Y0, Y4, Y4 |
| VPSHUFB ·rol16<>+0(SB), Y4, Y4 |
| VPADDD Y4, Y12, Y12 |
| VPXOR Y12, Y14, Y14 |
| VPSLLD $0x0c, Y14, Y3 |
| VPSRLD $0x14, Y14, Y14 |
| VPXOR Y3, Y14, Y14 |
| VPADDD Y14, Y0, Y0 |
| VPXOR Y0, Y4, Y4 |
| VPSHUFB ·rol8<>+0(SB), Y4, Y4 |
| VPADDD Y4, Y12, Y12 |
| VPXOR Y12, Y14, Y14 |
| VPSLLD $0x07, Y14, Y3 |
| VPSRLD $0x19, Y14, Y14 |
| VPXOR Y3, Y14, Y14 |
| VPALIGNR $0x0c, Y14, Y14, Y14 |
| VPALIGNR $0x08, Y12, Y12, Y12 |
| VPALIGNR $0x04, Y4, Y4, Y4 |
| DECQ R9 |
| JNE openAVX2PreparePolyKey |
| VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 |
| VPADDD 32(BP), Y14, Y14 |
| VPADDD 64(BP), Y12, Y12 |
| VPADDD 192(BP), Y4, Y4 |
| VPERM2I128 $0x02, Y0, Y14, Y3 |
| |
| // Clamp and store poly key |
| VPAND ·polyClampMask<>+0(SB), Y3, Y3 |
| VMOVDQA Y3, (BP) |
| |
| // Stream for the first 64 bytes |
| VPERM2I128 $0x13, Y0, Y14, Y0 |
| VPERM2I128 $0x13, Y12, Y4, Y14 |
| |
| // Hash AD + first 64 bytes |
| MOVQ ad_len+80(FP), R9 |
| CALL polyHashADInternal<>(SB) |
| XORQ CX, CX |
| |
| openAVX2InitialHash64: |
| ADDQ (SI)(CX*1), R10 |
| ADCQ 8(SI)(CX*1), R11 |
| ADCQ $0x01, R12 |
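// Poly1305 block on the AVX2 path: the (h + m) * r multiply uses BMI2 MULXQ, with the same
// modulo 2^130 - 5 reduction as the SSE path above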
| MOVQ (BP), DX |
| MOVQ DX, R15 |
| MULXQ R10, R13, R14 |
| IMULQ R12, R15 |
| MULXQ R11, AX, DX |
| ADDQ AX, R14 |
| ADCQ DX, R15 |
| MOVQ 8(BP), DX |
| MULXQ R10, R10, AX |
| ADDQ R10, R14 |
| MULXQ R11, R11, R8 |
| ADCQ R11, R15 |
| ADCQ $0x00, R8 |
| IMULQ R12, DX |
| ADDQ AX, R15 |
| ADCQ DX, R8 |
| MOVQ R13, R10 |
| MOVQ R14, R11 |
| MOVQ R15, R12 |
| ANDQ $0x03, R12 |
| MOVQ R15, R13 |
| ANDQ $-4, R13 |
| MOVQ R8, R14 |
| SHRQ $0x02, R8, R15 |
| SHRQ $0x02, R8 |
| ADDQ R13, R10 |
| ADCQ R14, R11 |
| ADCQ $0x00, R12 |
| ADDQ R15, R10 |
| ADCQ R8, R11 |
| ADCQ $0x00, R12 |
| ADDQ $0x10, CX |
| CMPQ CX, $0x40 |
| JNE openAVX2InitialHash64 |
| |
| // Decrypt the first 64 bytes |
| VPXOR (SI), Y0, Y0 |
| VPXOR 32(SI), Y14, Y14 |
| VMOVDQU Y0, (DI) |
| VMOVDQU Y14, 32(DI) |
| LEAQ 64(SI), SI |
| LEAQ 64(DI), DI |
| SUBQ $0x40, BX |
| |
| openAVX2MainLoop: |
| CMPQ BX, $0x00000200 |
| JB openAVX2MainLoopDone |
| |
| // Load state, increment counter blocks, store the incremented counters |
| VMOVDQU ·chacha20Constants<>+0(SB), Y0 |
| VMOVDQA Y0, Y5 |
| VMOVDQA Y0, Y6 |
| VMOVDQA Y0, Y7 |
| VMOVDQA 32(BP), Y14 |
| VMOVDQA Y14, Y9 |
| VMOVDQA Y14, Y10 |
| VMOVDQA Y14, Y11 |
| VMOVDQA 64(BP), Y12 |
| VMOVDQA Y12, Y13 |
| VMOVDQA Y12, Y8 |
| VMOVDQA Y12, Y15 |
| VMOVDQA 192(BP), Y4 |
| VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 |
| VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 |
| VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 |
| VPADDD ·avx2IncMask<>+0(SB), Y2, Y3 |
| VMOVDQA Y4, 96(BP) |
| VMOVDQA Y1, 128(BP) |
| VMOVDQA Y2, 160(BP) |
| VMOVDQA Y3, 192(BP) |
| XORQ CX, CX |
| |
| openAVX2InternalLoop: |
| ADDQ (SI)(CX*1), R10 |
| ADCQ 8(SI)(CX*1), R11 |
| ADCQ $0x01, R12 |
| VPADDD Y14, Y0, Y0 |
| VPADDD Y9, Y5, Y5 |
| VPADDD Y10, Y6, Y6 |
| VPADDD Y11, Y7, Y7 |
| MOVQ (BP), DX |
| MOVQ DX, R15 |
| MULXQ R10, R13, R14 |
| IMULQ R12, R15 |
| MULXQ R11, AX, DX |
| ADDQ AX, R14 |
| ADCQ DX, R15 |
| VPXOR Y0, Y4, Y4 |
| VPXOR Y5, Y1, Y1 |
| VPXOR Y6, Y2, Y2 |
| VPXOR Y7, Y3, Y3 |
| VPSHUFB ·rol16<>+0(SB), Y4, Y4 |
| VPSHUFB ·rol16<>+0(SB), Y1, Y1 |
| VPSHUFB ·rol16<>+0(SB), Y2, Y2 |
| VPSHUFB ·rol16<>+0(SB), Y3, Y3 |
| MOVQ 8(BP), DX |
| MULXQ R10, R10, AX |
| ADDQ R10, R14 |
| MULXQ R11, R11, R8 |
| ADCQ R11, R15 |
| ADCQ $0x00, R8 |
| VPADDD Y4, Y12, Y12 |
| VPADDD Y1, Y13, Y13 |
| VPADDD Y2, Y8, Y8 |
| VPADDD Y3, Y15, Y15 |
| VPXOR Y12, Y14, Y14 |
| VPXOR Y13, Y9, Y9 |
| VPXOR Y8, Y10, Y10 |
| VPXOR Y15, Y11, Y11 |
| IMULQ R12, DX |
| ADDQ AX, R15 |
| ADCQ DX, R8 |
| VMOVDQA Y15, 224(BP) |
| VPSLLD $0x0c, Y14, Y15 |
| VPSRLD $0x14, Y14, Y14 |
| VPXOR Y15, Y14, Y14 |
| VPSLLD $0x0c, Y9, Y15 |
| VPSRLD $0x14, Y9, Y9 |
| VPXOR Y15, Y9, Y9 |
| VPSLLD $0x0c, Y10, Y15 |
| VPSRLD $0x14, Y10, Y10 |
| VPXOR Y15, Y10, Y10 |
| VPSLLD $0x0c, Y11, Y15 |
| VPSRLD $0x14, Y11, Y11 |
| VPXOR Y15, Y11, Y11 |
| VMOVDQA 224(BP), Y15 |
| MOVQ R13, R10 |
| MOVQ R14, R11 |
| MOVQ R15, R12 |
| ANDQ $0x03, R12 |
| MOVQ R15, R13 |
| ANDQ $-4, R13 |
| MOVQ R8, R14 |
| SHRQ $0x02, R8, R15 |
| SHRQ $0x02, R8 |
| ADDQ R13, R10 |
| ADCQ R14, R11 |
| ADCQ $0x00, R12 |
| ADDQ R15, R10 |
| ADCQ R8, R11 |
| ADCQ $0x00, R12 |
| VPADDD Y14, Y0, Y0 |
| VPADDD Y9, Y5, Y5 |
| VPADDD Y10, Y6, Y6 |
| VPADDD Y11, Y7, Y7 |
| VPXOR Y0, Y4, Y4 |
| VPXOR Y5, Y1, Y1 |
| VPXOR Y6, Y2, Y2 |
| VPXOR Y7, Y3, Y3 |
| VPSHUFB ·rol8<>+0(SB), Y4, Y4 |
| VPSHUFB ·rol8<>+0(SB), Y1, Y1 |
| VPSHUFB ·rol8<>+0(SB), Y2, Y2 |
| VPSHUFB ·rol8<>+0(SB), Y3, Y3 |
| ADDQ 16(SI)(CX*1), R10 |
| ADCQ 24(SI)(CX*1), R11 |
| ADCQ $0x01, R12 |
| VPADDD Y4, Y12, Y12 |
| VPADDD Y1, Y13, Y13 |
| VPADDD Y2, Y8, Y8 |
| VPADDD Y3, Y15, Y15 |
| MOVQ (BP), DX |
| MOVQ DX, R15 |
| MULXQ R10, R13, R14 |
| IMULQ R12, R15 |
| MULXQ R11, AX, DX |
| ADDQ AX, R14 |
| ADCQ DX, R15 |
| VPXOR Y12, Y14, Y14 |
| VPXOR Y13, Y9, Y9 |
| VPXOR Y8, Y10, Y10 |
| VPXOR Y15, Y11, Y11 |
| VMOVDQA Y15, 224(BP) |
| VPSLLD $0x07, Y14, Y15 |
| VPSRLD $0x19, Y14, Y14 |
| VPXOR Y15, Y14, Y14 |
| VPSLLD $0x07, Y9, Y15 |
| VPSRLD $0x19, Y9, Y9 |
| VPXOR Y15, Y9, Y9 |
| VPSLLD $0x07, Y10, Y15 |
| VPSRLD $0x19, Y10, Y10 |
| VPXOR Y15, Y10, Y10 |
| VPSLLD $0x07, Y11, Y15 |
| VPSRLD $0x19, Y11, Y11 |
| VPXOR Y15, Y11, Y11 |
| VMOVDQA 224(BP), Y15 |
| MOVQ 8(BP), DX |
| MULXQ R10, R10, AX |
| ADDQ R10, R14 |
| MULXQ R11, R11, R8 |
| ADCQ R11, R15 |
| ADCQ $0x00, R8 |
| VPALIGNR $0x04, Y14, Y14, Y14 |
| VPALIGNR $0x04, Y9, Y9, Y9 |
| VPALIGNR $0x04, Y10, Y10, Y10 |
| VPALIGNR $0x04, Y11, Y11, Y11 |
| VPALIGNR $0x08, Y12, Y12, Y12 |
| VPALIGNR $0x08, Y13, Y13, Y13 |
| VPALIGNR $0x08, Y8, Y8, Y8 |
| VPALIGNR $0x08, Y15, Y15, Y15 |
| VPALIGNR $0x0c, Y4, Y4, Y4 |
| VPALIGNR $0x0c, Y1, Y1, Y1 |
| VPALIGNR $0x0c, Y2, Y2, Y2 |
| VPALIGNR $0x0c, Y3, Y3, Y3 |
| VPADDD Y14, Y0, Y0 |
| VPADDD Y9, Y5, Y5 |
| VPADDD Y10, Y6, Y6 |
| VPADDD Y11, Y7, Y7 |
| IMULQ R12, DX |
| ADDQ AX, R15 |
| ADCQ DX, R8 |
| VPXOR Y0, Y4, Y4 |
| VPXOR Y5, Y1, Y1 |
| VPXOR Y6, Y2, Y2 |
| VPXOR Y7, Y3, Y3 |
| VPSHUFB ·rol16<>+0(SB), Y4, Y4 |
| VPSHUFB ·rol16<>+0(SB), Y1, Y1 |
| VPSHUFB ·rol16<>+0(SB), Y2, Y2 |
| VPSHUFB ·rol16<>+0(SB), Y3, Y3 |
| MOVQ R13, R10 |
| MOVQ R14, R11 |
| MOVQ R15, R12 |
| ANDQ $0x03, R12 |
| MOVQ R15, R13 |
| ANDQ $-4, R13 |
| MOVQ R8, R14 |
| SHRQ $0x02, R8, R15 |
| SHRQ $0x02, R8 |
| ADDQ R13, R10 |
| ADCQ R14, R11 |
| ADCQ $0x00, R12 |
| ADDQ R15, R10 |
| ADCQ R8, R11 |
| ADCQ $0x00, R12 |
| VPADDD Y4, Y12, Y12 |
| VPADDD Y1, Y13, Y13 |
| VPADDD Y2, Y8, Y8 |
| VPADDD Y3, Y15, Y15 |
| VPXOR Y12, Y14, Y14 |
| VPXOR Y13, Y9, Y9 |
| VPXOR Y8, Y10, Y10 |
| VPXOR Y15, Y11, Y11 |
| ADDQ 32(SI)(CX*1), R10 |
| ADCQ 40(SI)(CX*1), R11 |
| ADCQ $0x01, R12 |
| LEAQ 48(CX), CX |
| VMOVDQA Y15, 224(BP) |
| VPSLLD $0x0c, Y14, Y15 |
| VPSRLD $0x14, Y14, Y14 |
| VPXOR Y15, Y14, Y14 |
| VPSLLD $0x0c, Y9, Y15 |
| VPSRLD $0x14, Y9, Y9 |
| VPXOR Y15, Y9, Y9 |
| VPSLLD $0x0c, Y10, Y15 |
| VPSRLD $0x14, Y10, Y10 |
| VPXOR Y15, Y10, Y10 |
| VPSLLD $0x0c, Y11, Y15 |
| VPSRLD $0x14, Y11, Y11 |
| VPXOR Y15, Y11, Y11 |
| VMOVDQA 224(BP), Y15 |
| MOVQ (BP), DX |
| MOVQ DX, R15 |
| MULXQ R10, R13, R14 |
| IMULQ R12, R15 |
| MULXQ R11, AX, DX |
| ADDQ AX, R14 |
| ADCQ DX, R15 |
| VPADDD Y14, Y0, Y0 |
| VPADDD Y9, Y5, Y5 |
| VPADDD Y10, Y6, Y6 |
| VPADDD Y11, Y7, Y7 |
| VPXOR Y0, Y4, Y4 |
| VPXOR Y5, Y1, Y1 |
| VPXOR Y6, Y2, Y2 |
| VPXOR Y7, Y3, Y3 |
| MOVQ 8(BP), DX |
| MULXQ R10, R10, AX |
| ADDQ R10, R14 |
| MULXQ R11, R11, R8 |
| ADCQ R11, R15 |
| ADCQ $0x00, R8 |
| VPSHUFB ·rol8<>+0(SB), Y4, Y4 |
| VPSHUFB ·rol8<>+0(SB), Y1, Y1 |
| VPSHUFB ·rol8<>+0(SB), Y2, Y2 |
| VPSHUFB ·rol8<>+0(SB), Y3, Y3 |
| VPADDD Y4, Y12, Y12 |
| VPADDD Y1, Y13, Y13 |
| VPADDD Y2, Y8, Y8 |
| VPADDD Y3, Y15, Y15 |
| IMULQ R12, DX |
| ADDQ AX, R15 |
| ADCQ DX, R8 |
| VPXOR Y12, Y14, Y14 |
| VPXOR Y13, Y9, Y9 |
| VPXOR Y8, Y10, Y10 |
| VPXOR Y15, Y11, Y11 |
| VMOVDQA Y15, 224(BP) |
| VPSLLD $0x07, Y14, Y15 |
| VPSRLD $0x19, Y14, Y14 |
| VPXOR Y15, Y14, Y14 |
| VPSLLD $0x07, Y9, Y15 |
| VPSRLD $0x19, Y9, Y9 |
| VPXOR Y15, Y9, Y9 |
| VPSLLD $0x07, Y10, Y15 |
| VPSRLD $0x19, Y10, Y10 |
| VPXOR Y15, Y10, Y10 |
| VPSLLD $0x07, Y11, Y15 |
| VPSRLD $0x19, Y11, Y11 |
| VPXOR Y15, Y11, Y11 |
| VMOVDQA 224(BP), Y15 |
| MOVQ R13, R10 |
| MOVQ R14, R11 |
| MOVQ R15, R12 |
| ANDQ $0x03, R12 |
| MOVQ R15, R13 |
| ANDQ $-4, R13 |
| MOVQ R8, R14 |
| SHRQ $0x02, R8, R15 |
| SHRQ $0x02, R8 |
| ADDQ R13, R10 |
| ADCQ R14, R11 |
| ADCQ $0x00, R12 |
| ADDQ R15, R10 |
| ADCQ R8, R11 |
| ADCQ $0x00, R12 |
| VPALIGNR $0x0c, Y14, Y14, Y14 |
| VPALIGNR $0x0c, Y9, Y9, Y9 |
| VPALIGNR $0x0c, Y10, Y10, Y10 |
| VPALIGNR $0x0c, Y11, Y11, Y11 |
| VPALIGNR $0x08, Y12, Y12, Y12 |
| VPALIGNR $0x08, Y13, Y13, Y13 |
| VPALIGNR $0x08, Y8, Y8, Y8 |
| VPALIGNR $0x08, Y15, Y15, Y15 |
| VPALIGNR $0x04, Y4, Y4, Y4 |
| VPALIGNR $0x04, Y1, Y1, Y1 |
| VPALIGNR $0x04, Y2, Y2, Y2 |
| VPALIGNR $0x04, Y3, Y3, Y3 |
| CMPQ CX, $0x000001e0 |
| JNE openAVX2InternalLoop |
| VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 |
| VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 |
| VPADDD ·chacha20Constants<>+0(SB), Y6, Y6 |
| VPADDD ·chacha20Constants<>+0(SB), Y7, Y7 |
| VPADDD 32(BP), Y14, Y14 |
| VPADDD 32(BP), Y9, Y9 |
| VPADDD 32(BP), Y10, Y10 |
| VPADDD 32(BP), Y11, Y11 |
| VPADDD 64(BP), Y12, Y12 |
| VPADDD 64(BP), Y13, Y13 |
| VPADDD 64(BP), Y8, Y8 |
| VPADDD 64(BP), Y15, Y15 |
| VPADDD 96(BP), Y4, Y4 |
| VPADDD 128(BP), Y1, Y1 |
| VPADDD 160(BP), Y2, Y2 |
| VPADDD 192(BP), Y3, Y3 |
| VMOVDQA Y15, 224(BP) |
| |
| // We only hashed 480 of the 512 bytes available - hash 16 of the remaining 32 bytes here ... |
| ADDQ 480(SI), R10 |
| ADCQ 488(SI), R11 |
| ADCQ $0x01, R12 |
| MOVQ (BP), DX |
| MOVQ DX, R15 |
| MULXQ R10, R13, R14 |
| IMULQ R12, R15 |
| MULXQ R11, AX, DX |
| ADDQ AX, R14 |
| ADCQ DX, R15 |
| MOVQ 8(BP), DX |
| MULXQ R10, R10, AX |
| ADDQ R10, R14 |
| MULXQ R11, R11, R8 |
| ADCQ R11, R15 |
| ADCQ $0x00, R8 |
| IMULQ R12, DX |
| ADDQ AX, R15 |
| ADCQ DX, R8 |
| MOVQ R13, R10 |
| MOVQ R14, R11 |
| MOVQ R15, R12 |
| ANDQ $0x03, R12 |
| MOVQ R15, R13 |
| ANDQ $-4, R13 |
| MOVQ R8, R14 |
| SHRQ $0x02, R8, R15 |
| SHRQ $0x02, R8 |
| ADDQ R13, R10 |
| ADCQ R14, R11 |
| ADCQ $0x00, R12 |
| ADDQ R15, R10 |
| ADCQ R8, R11 |
| ADCQ $0x00, R12 |
| VPERM2I128 $0x02, Y0, Y14, Y15 |
| VPERM2I128 $0x13, Y0, Y14, Y14 |
| VPERM2I128 $0x02, Y12, Y4, Y0 |
| VPERM2I128 $0x13, Y12, Y4, Y12 |
| VPXOR (SI), Y15, Y15 |
| VPXOR 32(SI), Y0, Y0 |
| VPXOR 64(SI), Y14, Y14 |
| VPXOR 96(SI), Y12, Y12 |
| VMOVDQU Y15, (DI) |
| VMOVDQU Y0, 32(DI) |
| VMOVDQU Y14, 64(DI) |
| VMOVDQU Y12, 96(DI) |
| VPERM2I128 $0x02, Y5, Y9, Y0 |
| VPERM2I128 $0x02, Y13, Y1, Y14 |
| VPERM2I128 $0x13, Y5, Y9, Y12 |
| VPERM2I128 $0x13, Y13, Y1, Y4 |
| VPXOR 128(SI), Y0, Y0 |
| VPXOR 160(SI), Y14, Y14 |
| VPXOR 192(SI), Y12, Y12 |
| VPXOR 224(SI), Y4, Y4 |
| VMOVDQU Y0, 128(DI) |
| VMOVDQU Y14, 160(DI) |
| VMOVDQU Y12, 192(DI) |
| VMOVDQU Y4, 224(DI) |
| |
| // ... and the final 16 bytes here |
| ADDQ 496(SI), R10 |
| ADCQ 504(SI), R11 |
| ADCQ $0x01, R12 |
| MOVQ (BP), DX |
| MOVQ DX, R15 |
| MULXQ R10, R13, R14 |
| IMULQ R12, R15 |
| MULXQ R11, AX, DX |
| ADDQ AX, R14 |
| ADCQ DX, R15 |
| MOVQ 8(BP), DX |
| MULXQ R10, R10, AX |
| ADDQ R10, R14 |
| MULXQ R11, R11, R8 |
| ADCQ R11, R15 |
| ADCQ $0x00, R8 |
| IMULQ R12, DX |
| ADDQ AX, R15 |
| ADCQ DX, R8 |
| MOVQ R13, R10 |
| MOVQ R14, R11 |
| MOVQ R15, R12 |
| ANDQ $0x03, R12 |
| MOVQ R15, R13 |
| ANDQ $-4, R13 |
| MOVQ R8, R14 |
| SHRQ $0x02, R8, R15 |
| SHRQ $0x02, R8 |
| ADDQ R13, R10 |
| ADCQ R14, R11 |
| ADCQ $0x00, R12 |
| ADDQ R15, R10 |
| ADCQ R8, R11 |
| ADCQ $0x00, R12 |
| VPERM2I128 $0x02, Y6, Y10, Y0 |
| VPERM2I128 $0x02, Y8, Y2, Y14 |
| VPERM2I128 $0x13, Y6, Y10, Y12 |
| VPERM2I128 $0x13, Y8, Y2, Y4 |
| VPXOR 256(SI), Y0, Y0 |
| VPXOR 288(SI), Y14, Y14 |
| VPXOR 320(SI), Y12, Y12 |
| VPXOR 352(SI), Y4, Y4 |
| VMOVDQU Y0, 256(DI) |
| VMOVDQU Y14, 288(DI) |
| VMOVDQU Y12, 320(DI) |
| VMOVDQU Y4, 352(DI) |
| VPERM2I128 $0x02, Y7, Y11, Y0 |
| VPERM2I128 $0x02, 224(BP), Y3, Y14 |
| VPERM2I128 $0x13, Y7, Y11, Y12 |
| VPERM2I128 $0x13, 224(BP), Y3, Y4 |
| VPXOR 384(SI), Y0, Y0 |
| VPXOR 416(SI), Y14, Y14 |
| VPXOR 448(SI), Y12, Y12 |
| VPXOR 480(SI), Y4, Y4 |
| VMOVDQU Y0, 384(DI) |
| VMOVDQU Y14, 416(DI) |
| VMOVDQU Y12, 448(DI) |
| VMOVDQU Y4, 480(DI) |
| LEAQ 512(SI), SI |
| LEAQ 512(DI), DI |
| SUBQ $0x00000200, BX |
| JMP openAVX2MainLoop |
| |
| openAVX2MainLoopDone: |
| // Handle the various tail sizes efficiently |
| TESTQ BX, BX |
| JE openSSEFinalize |
| CMPQ BX, $0x80 |
| JBE openAVX2Tail128 |
| CMPQ BX, $0x00000100 |
| JBE openAVX2Tail256 |
| CMPQ BX, $0x00000180 |
| JBE openAVX2Tail384 |
| JMP openAVX2Tail512 |
| |
| openAVX2192: |
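| // Prepare four ChaCha20 blocks for a short buffer: the first block supplies the Poly1305 key, |
| // the remaining three up to 192 bytes of keystream. |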
| VMOVDQA Y0, Y5 |
| VMOVDQA Y14, Y9 |
| VMOVDQA Y12, Y13 |
| VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 |
| VMOVDQA Y0, Y6 |
| VMOVDQA Y14, Y10 |
| VMOVDQA Y12, Y8 |
| VMOVDQA Y4, Y2 |
| VMOVDQA Y1, Y15 |
| MOVQ $0x0000000a, R9 |
| |
| openAVX2192InnerCipherLoop: |
| VPADDD Y14, Y0, Y0 |
| VPXOR Y0, Y4, Y4 |
| VPSHUFB ·rol16<>+0(SB), Y4, Y4 |
| VPADDD Y4, Y12, Y12 |
| VPXOR Y12, Y14, Y14 |
| VPSLLD $0x0c, Y14, Y3 |
| VPSRLD $0x14, Y14, Y14 |
| VPXOR Y3, Y14, Y14 |
| VPADDD Y14, Y0, Y0 |
| VPXOR Y0, Y4, Y4 |
| VPSHUFB ·rol8<>+0(SB), Y4, Y4 |
| VPADDD Y4, Y12, Y12 |
| VPXOR Y12, Y14, Y14 |
| VPSLLD $0x07, Y14, Y3 |
| VPSRLD $0x19, Y14, Y14 |
| VPXOR Y3, Y14, Y14 |
| VPADDD Y9, Y5, Y5 |
| VPXOR Y5, Y1, Y1 |
| VPSHUFB ·rol16<>+0(SB), Y1, Y1 |
| VPADDD Y1, Y13, Y13 |
| VPXOR Y13, Y9, Y9 |
| VPSLLD $0x0c, Y9, Y3 |
| VPSRLD $0x14, Y9, Y9 |
| VPXOR Y3, Y9, Y9 |
| VPADDD Y9, Y5, Y5 |
| VPXOR Y5, Y1, Y1 |
| VPSHUFB ·rol8<>+0(SB), Y1, Y1 |
| VPADDD Y1, Y13, Y13 |
| VPXOR Y13, Y9, Y9 |
| VPSLLD $0x07, Y9, Y3 |
| VPSRLD $0x19, Y9, Y9 |
| VPXOR Y3, Y9, Y9 |
| VPALIGNR $0x04, Y14, Y14, Y14 |
| VPALIGNR $0x04, Y9, Y9, Y9 |
| VPALIGNR $0x08, Y12, Y12, Y12 |
| VPALIGNR $0x08, Y13, Y13, Y13 |
| VPALIGNR $0x0c, Y4, Y4, Y4 |
| VPALIGNR $0x0c, Y1, Y1, Y1 |
| VPADDD Y14, Y0, Y0 |
| VPXOR Y0, Y4, Y4 |
| VPSHUFB ·rol16<>+0(SB), Y4, Y4 |
| VPADDD Y4, Y12, Y12 |
| VPXOR Y12, Y14, Y14 |
| VPSLLD $0x0c, Y14, Y3 |
| VPSRLD $0x14, Y14, Y14 |
| VPXOR Y3, Y14, Y14 |
| VPADDD Y14, Y0, Y0 |
| VPXOR Y0, Y4, Y4 |
| VPSHUFB ·rol8<>+0(SB), Y4, Y4 |
| VPADDD Y4, Y12, Y12 |
| VPXOR Y12, Y14, Y14 |
| VPSLLD $0x07, Y14, Y3 |
| VPSRLD $0x19, Y14, Y14 |
| VPXOR Y3, Y14, Y14 |
| VPADDD Y9, Y5, Y5 |
| VPXOR Y5, Y1, Y1 |
| VPSHUFB ·rol16<>+0(SB), Y1, Y1 |
| VPADDD Y1, Y13, Y13 |
| VPXOR Y13, Y9, Y9 |
| VPSLLD $0x0c, Y9, Y3 |
| VPSRLD $0x14, Y9, Y9 |
| VPXOR Y3, Y9, Y9 |
| VPADDD Y9, Y5, Y5 |
| VPXOR Y5, Y1, Y1 |
| VPSHUFB ·rol8<>+0(SB), Y1, Y1 |
| VPADDD Y1, Y13, Y13 |
| VPXOR Y13, Y9, Y9 |
| VPSLLD $0x07, Y9, Y3 |
| VPSRLD $0x19, Y9, Y9 |
| VPXOR Y3, Y9, Y9 |
| VPALIGNR $0x0c, Y14, Y14, Y14 |
| VPALIGNR $0x0c, Y9, Y9, Y9 |
| VPALIGNR $0x08, Y12, Y12, Y12 |
| VPALIGNR $0x08, Y13, Y13, Y13 |
| VPALIGNR $0x04, Y4, Y4, Y4 |
| VPALIGNR $0x04, Y1, Y1, Y1 |
| DECQ R9 |
| JNE openAVX2192InnerCipherLoop |
| VPADDD Y6, Y0, Y0 |
| VPADDD Y6, Y5, Y5 |
| VPADDD Y10, Y14, Y14 |
| VPADDD Y10, Y9, Y9 |
| VPADDD Y8, Y12, Y12 |
| VPADDD Y8, Y13, Y13 |
| VPADDD Y2, Y4, Y4 |
| VPADDD Y15, Y1, Y1 |
| VPERM2I128 $0x02, Y0, Y14, Y3 |
| |
| // Clamp and store poly key |
| VPAND ·polyClampMask<>+0(SB), Y3, Y3 |
| VMOVDQA Y3, (BP) |
| |
| // Stream for up to 192 bytes |
| VPERM2I128 $0x13, Y0, Y14, Y0 |
| VPERM2I128 $0x13, Y12, Y4, Y14 |
| VPERM2I128 $0x02, Y5, Y9, Y12 |
| VPERM2I128 $0x02, Y13, Y1, Y4 |
| VPERM2I128 $0x13, Y5, Y9, Y5 |
| VPERM2I128 $0x13, Y13, Y1, Y9 |
| |
| openAVX2ShortOpen: |
| // Hash the additional data |
| MOVQ ad_len+80(FP), R9 |
| CALL polyHashADInternal<>(SB) |
| |
| openAVX2ShortOpenLoop: |
| CMPQ BX, $0x20 |
| JB openAVX2ShortTail32 |
| SUBQ $0x20, BX |
| |
| // Load for hashing |
| ADDQ (SI), R10 |
| ADCQ 8(SI), R11 |
| ADCQ $0x01, R12 |
| MOVQ (BP), DX |
| MOVQ DX, R15 |
| MULXQ R10, R13, R14 |
| IMULQ R12, R15 |
| MULXQ R11, AX, DX |
| ADDQ AX, R14 |
| ADCQ DX, R15 |
| MOVQ 8(BP), DX |
| MULXQ R10, R10, AX |
| ADDQ R10, R14 |
| MULXQ R11, R11, R8 |
| ADCQ R11, R15 |
| ADCQ $0x00, R8 |
| IMULQ R12, DX |
| ADDQ AX, R15 |
| ADCQ DX, R8 |
| MOVQ R13, R10 |
| MOVQ R14, R11 |
| MOVQ R15, R12 |
| ANDQ $0x03, R12 |
| MOVQ R15, R13 |
| ANDQ $-4, R13 |
| MOVQ R8, R14 |
| SHRQ $0x02, R8, R15 |
| SHRQ $0x02, R8 |
| ADDQ R13, R10 |
| ADCQ R14, R11 |
| ADCQ $0x00, R12 |
| ADDQ R15, R10 |
| ADCQ R8, R11 |
| ADCQ $0x00, R12 |
| ADDQ 16(SI), R10 |
| ADCQ 24(SI), R11 |
| ADCQ $0x01, R12 |
| MOVQ (BP), DX |
| MOVQ DX, R15 |
| MULXQ R10, R13, R14 |
| IMULQ R12, R15 |
| MULXQ R11, AX, DX |
| ADDQ AX, R14 |
| ADCQ DX, R15 |
| MOVQ 8(BP), DX |
| MULXQ R10, R10, AX |
| ADDQ R10, R14 |
| MULXQ R11, R11, R8 |
| ADCQ R11, R15 |
| ADCQ $0x00, R8 |
| IMULQ R12, DX |
| ADDQ AX, R15 |
| ADCQ DX, R8 |
| MOVQ R13, R10 |
| MOVQ R14, R11 |
| MOVQ R15, R12 |
| ANDQ $0x03, R12 |
| MOVQ R15, R13 |
| ANDQ $-4, R13 |
| MOVQ R8, R14 |
| SHRQ $0x02, R8, R15 |
| SHRQ $0x02, R8 |
| ADDQ R13, R10 |
| ADCQ R14, R11 |
| ADCQ $0x00, R12 |
| ADDQ R15, R10 |
| ADCQ R8, R11 |
| ADCQ $0x00, R12 |
| |
| // Load for decryption |
| VPXOR (SI), Y0, Y0 |
| VMOVDQU Y0, (DI) |
| LEAQ 32(SI), SI |
| LEAQ 32(DI), DI |
| |
| // Shift stream left |
| VMOVDQA Y14, Y0 |
| VMOVDQA Y12, Y14 |
| VMOVDQA Y4, Y12 |
| VMOVDQA Y5, Y4 |
| VMOVDQA Y9, Y5 |
| VMOVDQA Y13, Y9 |
| VMOVDQA Y1, Y13 |
| VMOVDQA Y6, Y1 |
| VMOVDQA Y10, Y6 |
| JMP openAVX2ShortOpenLoop |
| |
| openAVX2ShortTail32: |
| CMPQ BX, $0x10 |
| VMOVDQA X0, X1 |
| JB openAVX2ShortDone |
| SUBQ $0x10, BX |
| |
| // Load for hashing |
| ADDQ (SI), R10 |
| ADCQ 8(SI), R11 |
| ADCQ $0x01, R12 |
| MOVQ (BP), DX |
| MOVQ DX, R15 |
| MULXQ R10, R13, R14 |
| IMULQ R12, R15 |
| MULXQ R11, AX, DX |
| ADDQ AX, R14 |
| ADCQ DX, R15 |
| MOVQ 8(BP), DX |
| MULXQ R10, R10, AX |
| ADDQ R10, R14 |
| MULXQ R11, R11, R8 |
| ADCQ R11, R15 |
| ADCQ $0x00, R8 |
| IMULQ R12, DX |
| ADDQ AX, R15 |
| ADCQ DX, R8 |
| MOVQ R13, R10 |
| MOVQ R14, R11 |
| MOVQ R15, R12 |
| ANDQ $0x03, R12 |
| MOVQ R15, R13 |
| ANDQ $-4, R13 |
| MOVQ R8, R14 |
| SHRQ $0x02, R8, R15 |
| SHRQ $0x02, R8 |
| ADDQ R13, R10 |
| ADCQ R14, R11 |
| ADCQ $0x00, R12 |
| ADDQ R15, R10 |
| ADCQ R8, R11 |
| ADCQ $0x00, R12 |
| |
| // Load for decryption |
| VPXOR (SI), X0, X12 |
| VMOVDQU X12, (DI) |
| LEAQ 16(SI), SI |
| LEAQ 16(DI), DI |
| VPERM2I128 $0x11, Y0, Y0, Y0 |
| VMOVDQA X0, X1 |
| |
| openAVX2ShortDone: |
| VZEROUPPER |
| JMP openSSETail16 |
| |
| openAVX2320: |
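| // Prepare six ChaCha20 blocks for a short buffer: the first block supplies the Poly1305 key, |
| // the remaining five up to 320 bytes of keystream. |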
| VMOVDQA Y0, Y5 |
| VMOVDQA Y14, Y9 |
| VMOVDQA Y12, Y13 |
| VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 |
| VMOVDQA Y0, Y6 |
| VMOVDQA Y14, Y10 |
| VMOVDQA Y12, Y8 |
| VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 |
| VMOVDQA Y14, Y7 |
| VMOVDQA Y12, Y11 |
| VMOVDQA Y4, Y15 |
| MOVQ $0x0000000a, R9 |
| |
| openAVX2320InnerCipherLoop: |
| VPADDD Y14, Y0, Y0 |
| VPXOR Y0, Y4, Y4 |
| VPSHUFB ·rol16<>+0(SB), Y4, Y4 |
| VPADDD Y4, Y12, Y12 |
| VPXOR Y12, Y14, Y14 |
| VPSLLD $0x0c, Y14, Y3 |
| VPSRLD $0x14, Y14, Y14 |
| VPXOR Y3, Y14, Y14 |
| VPADDD Y14, Y0, Y0 |
| VPXOR Y0, Y4, Y4 |
| VPSHUFB ·rol8<>+0(SB), Y4, Y4 |
| VPADDD Y4, Y12, Y12 |
| VPXOR Y12, Y14, Y14 |
| VPSLLD $0x07, Y14, Y3 |
| VPSRLD $0x19, Y14, Y14 |
| VPXOR Y3, Y14, Y14 |
| VPADDD Y9, Y5, Y5 |
| VPXOR Y5, Y1, Y1 |
| VPSHUFB ·rol16<>+0(SB), Y1, Y1 |
| VPADDD Y1, Y13, Y13 |
| VPXOR Y13, Y9, Y9 |
| VPSLLD $0x0c, Y9, Y3 |
| VPSRLD $0x14, Y9, Y9 |
| VPXOR Y3, Y9, Y9 |
| VPADDD Y9, Y5, Y5 |
| VPXOR Y5, Y1, Y1 |
| VPSHUFB ·rol8<>+0(SB), Y1, Y1 |
| VPADDD Y1, Y13, Y13 |
| VPXOR Y13, Y9, Y9 |
| VPSLLD $0x07, Y9, Y3 |
| VPSRLD $0x19, Y9, Y9 |
| VPXOR Y3, Y9, Y9 |
| VPADDD Y10, Y6, Y6 |
| VPXOR Y6, Y2, Y2 |
| VPSHUFB ·rol16<>+0(SB), Y2, Y2 |
| VPADDD Y2, Y8, Y8 |
| VPXOR Y8, Y10, Y10 |
| VPSLLD $0x0c, Y10, Y3 |
| VPSRLD $0x14, Y10, Y10 |
| VPXOR Y3, Y10, Y10 |
| VPADDD Y10, Y6, Y6 |
| VPXOR Y6, Y2, Y2 |
| VPSHUFB ·rol8<>+0(SB), Y2, Y2 |
| VPADDD Y2, Y8, Y8 |
| VPXOR Y8, Y10, Y10 |
| VPSLLD $0x07, Y10, Y3 |
| VPSRLD $0x19, Y10, Y10 |
| VPXOR Y3, Y10, Y10 |
| VPALIGNR $0x04, Y14, Y14, Y14 |
| VPALIGNR $0x04, Y9, Y9, Y9 |
| VPALIGNR $0x04, Y10, Y10, Y10 |
| VPALIGNR $0x08, Y12, Y12, Y12 |
| VPALIGNR $0x08, Y13, Y13, Y13 |
| VPALIGNR $0x08, Y8, Y8, Y8 |
| VPALIGNR $0x0c, Y4, Y4, Y4 |
| VPALIGNR $0x0c, Y1, Y1, Y1 |
| VPALIGNR $0x0c, Y2, Y2, Y2 |
| VPADDD Y14, Y0, Y0 |
| VPXOR Y0, Y4, Y4 |
| VPSHUFB ·rol16<>+0(SB), Y4, Y4 |
| VPADDD Y4, Y12, Y12 |
| VPXOR Y12, Y14, Y14 |
| VPSLLD $0x0c, Y14, Y3 |
| VPSRLD $0x14, Y14, Y14 |
| VPXOR Y3, Y14, Y14 |
| VPADDD Y14, Y0, Y0 |
| VPXOR Y0, Y4, Y4 |
| VPSHUFB ·rol8<>+0(SB), Y4, Y4 |
| VPADDD Y4, Y12, Y12 |
| VPXOR Y12, Y14, Y14 |
| VPSLLD $0x07, Y14, Y3 |
| VPSRLD $0x19, Y14, Y14 |
| VPXOR Y3, Y14, Y14 |
| VPADDD Y9, Y5, Y5 |
| VPXOR Y5, Y1, Y1 |
| VPSHUFB ·rol16<>+0(SB), Y1, Y1 |
| VPADDD Y1, Y13, Y13 |
| VPXOR Y13, Y9, Y9 |
| VPSLLD $0x0c, Y9, Y3 |
| VPSRLD $0x14, Y9, Y9 |
| VPXOR Y3, Y9, Y9 |
| VPADDD Y9, Y5, Y5 |
| VPXOR Y5, Y1, Y1 |
| VPSHUFB ·rol8<>+0(SB), Y1, Y1 |
| VPADDD Y1, Y13, Y13 |
| VPXOR Y13, Y9, Y9 |
| VPSLLD $0x07, Y9, Y3 |
| VPSRLD $0x19, Y9, Y9 |
| VPXOR Y3, Y9, Y9 |
| VPADDD Y10, Y6, Y6 |
| VPXOR Y6, Y2, Y2 |
| VPSHUFB ·rol16<>+0(SB), Y2, Y2 |
| VPADDD Y2, Y8, Y8 |
| VPXOR Y8, Y10, Y10 |
| VPSLLD $0x0c, Y10, Y3 |
| VPSRLD $0x14, Y10, Y10 |
| VPXOR Y3, Y10, Y10 |
| VPADDD Y10, Y6, Y6 |
| VPXOR Y6, Y2, Y2 |
| VPSHUFB ·rol8<>+0(SB), Y2, Y2 |
| VPADDD Y2, Y8, Y8 |
| VPXOR Y8, Y10, Y10 |
| VPSLLD $0x07, Y10, Y3 |
| VPSRLD $0x19, Y10, Y10 |
| VPXOR Y3, Y10, Y10 |
| VPALIGNR $0x0c, Y14, Y14, Y14 |
| VPALIGNR $0x0c, Y9, Y9, Y9 |
| VPALIGNR $0x0c, Y10, Y10, Y10 |
| VPALIGNR $0x08, Y12, Y12, Y12 |
| VPALIGNR $0x08, Y13, Y13, Y13 |
| VPALIGNR $0x08, Y8, Y8, Y8 |
| VPALIGNR $0x04, Y4, Y4, Y4 |
| VPALIGNR $0x04, Y1, Y1, Y1 |
| VPALIGNR $0x04, Y2, Y2, Y2 |
| DECQ R9 |
| JNE openAVX2320InnerCipherLoop |
| VMOVDQA ·chacha20Constants<>+0(SB), Y3 |
| VPADDD Y3, Y0, Y0 |
| VPADDD Y3, Y5, Y5 |
| VPADDD Y3, Y6, Y6 |
| VPADDD Y7, Y14, Y14 |
| VPADDD Y7, Y9, Y9 |
| VPADDD Y7, Y10, Y10 |
| VPADDD Y11, Y12, Y12 |
| VPADDD Y11, Y13, Y13 |
| VPADDD Y11, Y8, Y8 |
| VMOVDQA ·avx2IncMask<>+0(SB), Y3 |
| VPADDD Y15, Y4, Y4 |
| VPADDD Y3, Y15, Y15 |
| VPADDD Y15, Y1, Y1 |
| VPADDD Y3, Y15, Y15 |
| VPADDD Y15, Y2, Y2 |
| |
| // Clamp and store poly key |
| VPERM2I128 $0x02, Y0, Y14, Y3 |
| VPAND ·polyClampMask<>+0(SB), Y3, Y3 |
| VMOVDQA Y3, (BP) |
| |
| // Stream for up to 320 bytes |
| VPERM2I128 $0x13, Y0, Y14, Y0 |
| VPERM2I128 $0x13, Y12, Y4, Y14 |
| VPERM2I128 $0x02, Y5, Y9, Y12 |
| VPERM2I128 $0x02, Y13, Y1, Y4 |
| VPERM2I128 $0x13, Y5, Y9, Y5 |
| VPERM2I128 $0x13, Y13, Y1, Y9 |
| VPERM2I128 $0x02, Y6, Y10, Y13 |
| VPERM2I128 $0x02, Y8, Y2, Y1 |
| VPERM2I128 $0x13, Y6, Y10, Y6 |
| VPERM2I128 $0x13, Y8, Y2, Y10 |
| JMP openAVX2ShortOpen |
| |
| openAVX2Tail128: |
| // Need to decrypt up to 128 bytes - prepare two blocks |
| VMOVDQA ·chacha20Constants<>+0(SB), Y5 |
| VMOVDQA 32(BP), Y9 |
| VMOVDQA 64(BP), Y13 |
| VMOVDQA 192(BP), Y1 |
| VPADDD ·avx2IncMask<>+0(SB), Y1, Y1 |
| VMOVDQA Y1, Y4 |
| XORQ R9, R9 |
| MOVQ BX, CX |
| ANDQ $-16, CX |
| TESTQ CX, CX |
| JE openAVX2Tail128LoopB |
| |
| openAVX2Tail128LoopA: |
| ADDQ (SI)(R9*1), R10 |
| ADCQ 8(SI)(R9*1), R11 |
| ADCQ $0x01, R12 |
| MOVQ (BP), DX |
| MOVQ DX, R15 |
| MULXQ R10, R13, R14 |
| IMULQ R12, R15 |
| MULXQ R11, AX, DX |
| ADDQ AX, R14 |
| ADCQ DX, R15 |
| MOVQ 8(BP), DX |
| MULXQ R10, R10, AX |
| ADDQ R10, R14 |
| MULXQ R11, R11, R8 |
| ADCQ R11, R15 |
| ADCQ $0x00, R8 |
| IMULQ R12, DX |
| ADDQ AX, R15 |
| ADCQ DX, R8 |
| MOVQ R13, R10 |
| MOVQ R14, R11 |
| MOVQ R15, R12 |
| ANDQ $0x03, R12 |
| MOVQ R15, R13 |
| ANDQ $-4, R13 |
| MOVQ R8, R14 |
| SHRQ $0x02, R8, R15 |
| SHRQ $0x02, R8 |
| ADDQ R13, R10 |
| ADCQ R14, R11 |
| ADCQ $0x00, R12 |
| ADDQ R15, R10 |
| ADCQ R8, R11 |
| ADCQ $0x00, R12 |
| |
| openAVX2Tail128LoopB: |
| ADDQ $0x10, R9 |
| VPADDD Y9, Y5, Y5 |
| VPXOR Y5, Y1, Y1 |
| VPSHUFB ·rol16<>+0(SB), Y1, Y1 |
| VPADDD Y1, Y13, Y13 |
| VPXOR Y13, Y9, Y9 |
| VPSLLD $0x0c, Y9, Y3 |
| VPSRLD $0x14, Y9, Y9 |
| VPXOR Y3, Y9, Y9 |
| VPADDD Y9, Y5, Y5 |
| VPXOR Y5, Y1, Y1 |
| VPSHUFB ·rol8<>+0(SB), Y1, Y1 |
| VPADDD Y1, Y13, Y13 |
| VPXOR Y13, Y9, Y9 |
| VPSLLD $0x07, Y9, Y3 |
| VPSRLD $0x19, Y9, Y9 |
| VPXOR Y3, Y9, Y9 |
| VPALIGNR $0x04, Y9, Y9, Y9 |
| VPALIGNR $0x08, Y13, Y13, Y13 |
| VPALIGNR $0x0c, Y1, Y1, Y1 |
| VPADDD Y9, Y5, Y5 |
| VPXOR Y5, Y1, Y1 |
| VPSHUFB ·rol16<>+0(SB), Y1, Y1 |
| VPADDD Y1, Y13, Y13 |
| VPXOR Y13, Y9, Y9 |
| VPSLLD $0x0c, Y9, Y3 |
| VPSRLD $0x14, Y9, Y9 |
| VPXOR Y3, Y9, Y9 |
| VPADDD Y9, Y5, Y5 |
| VPXOR Y5, Y1, Y1 |
| VPSHUFB ·rol8<>+0(SB), Y1, Y1 |
| VPADDD Y1, Y13, Y13 |
| VPXOR Y13, Y9, Y9 |
| VPSLLD $0x07, Y9, Y3 |
| VPSRLD $0x19, Y9, Y9 |
| VPXOR Y3, Y9, Y9 |
| VPALIGNR $0x0c, Y9, Y9, Y9 |
| VPALIGNR $0x08, Y13, Y13, Y13 |
| VPALIGNR $0x04, Y1, Y1, Y1 |
| CMPQ R9, CX |
| JB openAVX2Tail128LoopA |
| CMPQ R9, $0xa0 |
| JNE openAVX2Tail128LoopB |
| VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 |
| VPADDD 32(BP), Y9, Y9 |
| VPADDD 64(BP), Y13, Y13 |
| VPADDD Y4, Y1, Y1 |
| VPERM2I128 $0x02, Y5, Y9, Y0 |
| VPERM2I128 $0x02, Y13, Y1, Y14 |
| VPERM2I128 $0x13, Y5, Y9, Y12 |
| VPERM2I128 $0x13, Y13, Y1, Y4 |
| |
| openAVX2TailLoop: |
| CMPQ BX, $0x20 |
| JB openAVX2Tail |
| SUBQ $0x20, BX |
| |
| // Load for decryption |
| VPXOR (SI), Y0, Y0 |
| VMOVDQU Y0, (DI) |
| LEAQ 32(SI), SI |
| LEAQ 32(DI), DI |
| VMOVDQA Y14, Y0 |
| VMOVDQA Y12, Y14 |
| VMOVDQA Y4, Y12 |
| JMP openAVX2TailLoop |
| |
| openAVX2Tail: |
| CMPQ BX, $0x10 |
| VMOVDQA X0, X1 |
| JB openAVX2TailDone |
| SUBQ $0x10, BX |
| |
| // Load for decryption |
| VPXOR (SI), X0, X12 |
| VMOVDQU X12, (DI) |
| LEAQ 16(SI), SI |
| LEAQ 16(DI), DI |
| VPERM2I128 $0x11, Y0, Y0, Y0 |
| VMOVDQA X0, X1 |
| |
| openAVX2TailDone: |
| VZEROUPPER |
| JMP openSSETail16 |
| |
| openAVX2Tail256: |
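| // Need to decrypt up to 256 bytes - prepare four blocks |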
| VMOVDQA ·chacha20Constants<>+0(SB), Y0 |
| VMOVDQA Y0, Y5 |
| VMOVDQA 32(BP), Y14 |
| VMOVDQA Y14, Y9 |
| VMOVDQA 64(BP), Y12 |
| VMOVDQA Y12, Y13 |
| VMOVDQA 192(BP), Y4 |
| VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 |
| VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 |
| VMOVDQA Y4, Y7 |
| VMOVDQA Y1, Y11 |
| |
| // Compute the number of iterations that will hash data |
| MOVQ BX, 224(BP) |
| MOVQ BX, CX |
| SUBQ $0x80, CX |
| SHRQ $0x04, CX |
| MOVQ $0x0000000a, R9 |
| CMPQ CX, $0x0a |
| CMOVQGT R9, CX |
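| // CX = min((BX - 128) / 16, 10): double-round iterations that also hash a 16-byte block |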
| MOVQ SI, BX |
| XORQ R9, R9 |
| |
| openAVX2Tail256LoopA: |
| ADDQ (BX), R10 |
| ADCQ 8(BX), R11 |
| ADCQ $0x01, R12 |
| MOVQ (BP), DX |
| MOVQ DX, R15 |
| MULXQ R10, R13, R14 |
| IMULQ R12, R15 |
| MULXQ R11, AX, DX |
| ADDQ AX, R14 |
| ADCQ DX, R15 |
| MOVQ 8(BP), DX |
| MULXQ R10, R10, AX |
| ADDQ R10, R14 |
| MULXQ R11, R11, R8 |
| ADCQ R11, R15 |
| ADCQ $0x00, R8 |
| IMULQ R12, DX |
| ADDQ AX, R15 |
| ADCQ DX, R8 |
| MOVQ R13, R10 |
| MOVQ R14, R11 |
| MOVQ R15, R12 |
| ANDQ $0x03, R12 |
| MOVQ R15, R13 |
| ANDQ $-4, R13 |
| MOVQ R8, R14 |
| SHRQ $0x02, R8, R15 |
| SHRQ $0x02, R8 |
| ADDQ R13, R10 |
| ADCQ R14, R11 |
| ADCQ $0x00, R12 |
| ADDQ R15, R10 |
| ADCQ R8, R11 |
| ADCQ $0x00, R12 |
| LEAQ 16(BX), BX |
| |
| openAVX2Tail256LoopB: |
| VPADDD Y14, Y0, Y0 |
| VPXOR Y0, Y4, Y4 |
| VPSHUFB ·rol16<>+0(SB), Y4, Y4 |
| VPADDD Y4, Y12, Y12 |
| VPXOR Y12, Y14, Y14 |
| VPSLLD $0x0c, Y14, Y3 |
| VPSRLD $0x14, Y14, Y14 |
| VPXOR Y3, Y14, Y14 |
| VPADDD Y14, Y0, Y0 |
| VPXOR Y0, Y4, Y4 |
| VPSHUFB ·rol8<>+0(SB), Y4, Y4 |
| VPADDD Y4, Y12, Y12 |
| VPXOR Y12, Y14, Y14 |
| VPSLLD $0x07, Y14, Y3 |
| VPSRLD $0x19, Y14, Y14 |
| VPXOR Y3, Y14, Y14 |
| VPADDD Y9, Y5, Y5 |
| VPXOR Y5, Y1, Y1 |
| VPSHUFB ·rol16<>+0(SB), Y1, Y1 |
| VPADDD Y1, Y13, Y13 |
| VPXOR Y13, Y9, Y9 |
| VPSLLD $0x0c, Y9, Y3 |
| VPSRLD $0x14, Y9, Y9 |
| VPXOR Y3, Y9, Y9 |
| VPADDD Y9, Y5, Y5 |
| VPXOR Y5, Y1, Y1 |
| VPSHUFB ·rol8<>+0(SB), Y1, Y1 |
| VPADDD Y1, Y13, Y13 |
| VPXOR Y13, Y9, Y9 |
| VPSLLD $0x07, Y9, Y3 |
| VPSRLD $0x19, Y9, Y9 |
| VPXOR Y3, Y9, Y9 |
| VPALIGNR $0x04, Y14, Y14, Y14 |
| VPALIGNR $0x04, Y9, Y9, Y9 |
| VPALIGNR $0x08, Y12, Y12, Y12 |
| VPALIGNR $0x08, Y13, Y13, Y13 |
| VPALIGNR $0x0c, Y4, Y4, Y4 |
| VPALIGNR $0x0c, Y1, Y1, Y1 |
| INCQ R9 |
| VPADDD Y14, Y0, Y0 |
| VPXOR Y0, Y4, Y4 |
| VPSHUFB ·rol16<>+0(SB), Y4, Y4 |
| VPADDD Y4, Y12, Y12 |
| VPXOR Y12, Y14, Y14 |
| VPSLLD $0x0c, Y14, Y3 |
| VPSRLD $0x14, Y14, Y14 |
| VPXOR Y3, Y14, Y14 |
| VPADDD Y14, Y0, Y0 |
| VPXOR Y0, Y4, Y4 |
| VPSHUFB ·rol8<>+0(SB), Y4, Y4 |
| VPADDD Y4, Y12, Y12 |
| VPXOR Y12, Y14, Y14 |
| VPSLLD $0x07, Y14, Y3 |
| VPSRLD $0x19, Y14, Y14 |
| VPXOR Y3, Y14, Y14 |
| VPADDD Y9, Y5, Y5 |
| VPXOR Y5, Y1, Y1 |
| VPSHUFB ·rol16<>+0(SB), Y1, Y1 |
| VPADDD Y1, Y13, Y13 |
| VPXOR Y13, Y9, Y9 |
| VPSLLD $0x0c, Y9, Y3 |
| VPSRLD $0x14, Y9, Y9 |
| VPXOR Y3, Y9, Y9 |
| VPADDD Y9, Y5, Y5 |
| VPXOR Y5, Y1, Y1 |
| VPSHUFB ·rol8<>+0(SB), Y1, Y1 |
| VPADDD Y1, Y13, Y13 |
| VPXOR Y13, Y9, Y9 |
| VPSLLD $0x07, Y9, Y3 |
| VPSRLD $0x19, Y9, Y9 |
| VPXOR Y3, Y9, Y9 |
| VPALIGNR $0x0c, Y14, Y14, Y14 |
| VPALIGNR $0x0c, Y9, Y9, Y9 |
| VPALIGNR $0x08, Y12, Y12, Y12 |
| VPALIGNR $0x08, Y13, Y13, Y13 |
| VPALIGNR $0x04, Y4, Y4, Y4 |
| VPALIGNR $0x04, Y1, Y1, Y1 |
| CMPQ R9, CX |
| JB openAVX2Tail256LoopA |
| CMPQ R9, $0x0a |
| JNE openAVX2Tail256LoopB |
| MOVQ BX, R9 |
| SUBQ SI, BX |
| MOVQ BX, CX |
| MOVQ 224(BP), BX |
| |
| openAVX2Tail256Hash: |
| ADDQ $0x10, CX |
| CMPQ CX, BX |
| JGT openAVX2Tail256HashEnd |
| ADDQ (R9), R10 |
| ADCQ 8(R9), R11 |
| ADCQ $0x01, R12 |
| MOVQ (BP), DX |
| MOVQ DX, R15 |
| MULXQ R10, R13, R14 |
| IMULQ R12, R15 |
| MULXQ R11, AX, DX |
| ADDQ AX, R14 |
| ADCQ DX, R15 |
| MOVQ 8(BP), DX |
| MULXQ R10, R10, AX |
| ADDQ R10, R14 |
| MULXQ R11, R11, R8 |
| ADCQ R11, R15 |
| ADCQ $0x00, R8 |
| IMULQ R12, DX |
| ADDQ AX, R15 |
| ADCQ DX, R8 |
| MOVQ R13, R10 |
| MOVQ R14, R11 |
| MOVQ R15, R12 |
| ANDQ $0x03, R12 |
| MOVQ R15, R13 |
| ANDQ $-4, R13 |
| MOVQ R8, R14 |
| SHRQ $0x02, R8, R15 |
| SHRQ $0x02, R8 |
| ADDQ R13, R10 |
| ADCQ R14, R11 |
| ADCQ $0x00, R12 |
| ADDQ R15, R10 |
| ADCQ R8, R11 |
| ADCQ $0x00, R12 |
| LEAQ 16(R9), R9 |
| JMP openAVX2Tail256Hash |
| |
| openAVX2Tail256HashEnd: |
| VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 |
| VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 |
| VPADDD 32(BP), Y14, Y14 |
| VPADDD 32(BP), Y9, Y9 |
| VPADDD 64(BP), Y12, Y12 |
| VPADDD 64(BP), Y13, Y13 |
| VPADDD Y7, Y4, Y4 |
| VPADDD Y11, Y1, Y1 |
| VPERM2I128 $0x02, Y0, Y14, Y6 |
| VPERM2I128 $0x02, Y12, Y4, Y10 |
| VPERM2I128 $0x13, Y0, Y14, Y8 |
| VPERM2I128 $0x13, Y12, Y4, Y2 |
| VPERM2I128 $0x02, Y5, Y9, Y0 |
| VPERM2I128 $0x02, Y13, Y1, Y14 |
| VPERM2I128 $0x13, Y5, Y9, Y12 |
| VPERM2I128 $0x13, Y13, Y1, Y4 |
| VPXOR (SI), Y6, Y6 |
| VPXOR 32(SI), Y10, Y10 |
| VPXOR 64(SI), Y8, Y8 |
| VPXOR 96(SI), Y2, Y2 |
| VMOVDQU Y6, (DI) |
| VMOVDQU Y10, 32(DI) |
| VMOVDQU Y8, 64(DI) |
| VMOVDQU Y2, 96(DI) |
| LEAQ 128(SI), SI |
| LEAQ 128(DI), DI |
| SUBQ $0x80, BX |
| JMP openAVX2TailLoop |
| |
| openAVX2Tail384: |
| // Need to decrypt up to 384 bytes - prepare six blocks |
| VMOVDQA ·chacha20Constants<>+0(SB), Y0 |
| VMOVDQA Y0, Y5 |
| VMOVDQA Y0, Y6 |
| VMOVDQA 32(BP), Y14 |
| VMOVDQA Y14, Y9 |
| VMOVDQA Y14, Y10 |
| VMOVDQA 64(BP), Y12 |
| VMOVDQA Y12, Y13 |
| VMOVDQA Y12, Y8 |
| VMOVDQA 192(BP), Y4 |
| VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 |
| VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 |
| VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 |
| VMOVDQA Y4, 96(BP) |
| VMOVDQA Y1, 128(BP) |
| VMOVDQA Y2, 160(BP) |
| |
| // Compute the number of iterations that will hash two blocks of data |
| MOVQ BX, 224(BP) |
| MOVQ BX, CX |
| SUBQ $0x00000100, CX |
| SHRQ $0x04, CX |
| ADDQ $0x06, CX |
| MOVQ $0x0000000a, R9 |
| CMPQ CX, $0x0a |
| CMOVQGT R9, CX |
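| // CX = min((BX - 256) / 16 + 6, 10): double-round iterations that also hash two 16-byte blocks |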
| MOVQ SI, BX |
| XORQ R9, R9 |
| |
| openAVX2Tail384LoopB: |
| ADDQ (BX), R10 |
| ADCQ 8(BX), R11 |
| ADCQ $0x01, R12 |
| MOVQ (BP), DX |
| MOVQ DX, R15 |
| MULXQ R10, R13, R14 |
| IMULQ R12, R15 |
| MULXQ R11, AX, DX |
| ADDQ AX, R14 |
| ADCQ DX, R15 |
| MOVQ 8(BP), DX |
| MULXQ R10, R10, AX |
| ADDQ R10, R14 |
| MULXQ R11, R11, R8 |
| ADCQ R11, R15 |
| ADCQ $0x00, R8 |
| IMULQ R12, DX |
| ADDQ AX, R15 |
| ADCQ DX, R8 |
| MOVQ R13, R10 |
| MOVQ R14, R11 |
| MOVQ R15, R12 |
| ANDQ $0x03, R12 |
| MOVQ R15, R13 |
| ANDQ $-4, R13 |
| MOVQ R8, R14 |
| SHRQ $0x02, R8, R15 |
| SHRQ $0x02, R8 |
| ADDQ R13, R10 |
| ADCQ R14, R11 |
| ADCQ $0x00, R12 |
| ADDQ R15, R10 |
| ADCQ R8, R11 |
| ADCQ $0x00, R12 |
| LEAQ 16(BX), BX |
| |
| openAVX2Tail384LoopA: |
| VPADDD Y14, Y0, Y0 |
| VPXOR Y0, Y4, Y4 |
| VPSHUFB ·rol16<>+0(SB), Y4, Y4 |
| VPADDD Y4, Y12, Y12 |
| VPXOR Y12, Y14, Y14 |
| VPSLLD $0x0c, Y14, Y3 |
| VPSRLD $0x14, Y14, Y14 |
| VPXOR Y3, Y14, Y14 |
| VPADDD Y14, Y0, Y0 |
| VPXOR Y0, Y4, Y4 |
| VPSHUFB ·rol8<>+0(SB), Y4, Y4 |
| VPADDD Y4, Y12, Y12 |
| VPXOR Y12, Y14, Y14 |
| VPSLLD $0x07, Y14, Y3 |
| VPSRLD $0x19, Y14, Y14 |
| VPXOR Y3, Y14, Y14 |
| VPADDD Y9, Y5, Y5 |
| VPXOR Y5, Y1, Y1 |
| VPSHUFB ·rol16<>+0(SB), Y1, Y1 |
| VPADDD Y1, Y13, Y13 |
| VPXOR Y13, Y9, Y9 |
| VPSLLD $0x0c, Y9, Y3 |
| VPSRLD $0x14, Y9, Y9 |
| VPXOR Y3, Y9, Y9 |
| VPADDD Y9, Y5, Y5 |
| VPXOR Y5, Y1, Y1 |
| VPSHUFB ·rol8<>+0(SB), Y1, Y1 |
| VPADDD Y1, Y13, Y13 |
| VPXOR Y13, Y9, Y9 |
| VPSLLD $0x07, Y9, Y3 |
| VPSRLD $0x19, Y9, Y9 |
| VPXOR Y3, Y9, Y9 |
| VPADDD Y10, Y6, Y6 |
| VPXOR Y6, Y2, Y2 |
| VPSHUFB ·rol16<>+0(SB), Y2, Y2 |
| VPADDD Y2, Y8, Y8 |
| VPXOR Y8, Y10, Y10 |
| VPSLLD $0x0c, Y10, Y3 |
| VPSRLD $0x14, Y10, Y10 |
| VPXOR Y3, Y10, Y10 |
| VPADDD Y10, Y6, Y6 |
| VPXOR Y6, Y2, Y2 |
| VPSHUFB ·rol8<>+0(SB), Y2, Y2 |
| VPADDD Y2, Y8, Y8 |
| VPXOR Y8, Y10, Y10 |
| VPSLLD $0x07, Y10, Y3 |
| VPSRLD $0x19, Y10, Y10 |
| VPXOR Y3, Y10, Y10 |
| VPALIGNR $0x04, Y14, Y14, Y14 |
| VPALIGNR $0x04, Y9, Y9, Y9 |
| VPALIGNR $0x04, Y10, Y10, Y10 |
| VPALIGNR $0x08, Y12, Y12, Y12 |
| VPALIGNR $0x08, Y13, Y13, Y13 |
| VPALIGNR $0x08, Y8, Y8, Y8 |
| VPALIGNR $0x0c, Y4, Y4, Y4 |
| VPALIGNR $0x0c, Y1, Y1, Y1 |
| VPALIGNR $0x0c, Y2, Y2, Y2 |
| ADDQ (BX), R10 |
| ADCQ 8(BX), R11 |
| ADCQ $0x01, R12 |
| MOVQ (BP), DX |
| MOVQ DX, R15 |
| MULXQ R10, R13, R14 |
| IMULQ R12, R15 |
| MULXQ R11, AX, DX |
| ADDQ AX, R14 |
| ADCQ DX, R15 |
| MOVQ 8(BP), DX |
| MULXQ R10, R10, AX |
| ADDQ R10, R14 |
| MULXQ R11, R11, R8 |
| ADCQ R11, R15 |
| ADCQ $0x00, R8 |
| IMULQ R12, DX |
| ADDQ AX, R15 |
| ADCQ DX, R8 |
| MOVQ R13, R10 |
| MOVQ R14, R11 |
| MOVQ R15, R12 |
| ANDQ $0x03, R12 |
| MOVQ R15, R13 |
| ANDQ $-4, R13 |
| MOVQ R8, R14 |
| SHRQ $0x02, R8, R15 |
| SHRQ $0x02, R8 |
| ADDQ R13, R10 |
| ADCQ R14, R11 |
| ADCQ $0x00, R12 |
| ADDQ R15, R10 |
| ADCQ R8, R11 |
| ADCQ $0x00, R12 |
| LEAQ 16(BX), BX |
| INCQ R9 |
| VPADDD Y14, Y0, Y0 |
| VPXOR Y0, Y4, Y4 |
| VPSHUFB ·rol16<>+0(SB), Y4, Y4 |
| VPADDD Y4, Y12, Y12 |
| VPXOR Y12, Y14, Y14 |
| VPSLLD $0x0c, Y14, Y3 |
| VPSRLD $0x14, Y14, Y14 |
| VPXOR Y3, Y14, Y14 |
| VPADDD Y14, Y0, Y0 |
| VPXOR Y0, Y4, Y4 |
| VPSHUFB ·rol8<>+0(SB), Y4, Y4 |
| VPADDD Y4, Y12, Y12 |
| VPXOR Y12, Y14, Y14 |
| VPSLLD $0x07, Y14, Y3 |
| VPSRLD $0x19, Y14, Y14 |
| VPXOR Y3, Y14, Y14 |
| VPADDD Y9, Y5, Y5 |
| VPXOR Y5, Y1, Y1 |
| VPSHUFB ·rol16<>+0(SB), Y1, Y1 |
| VPADDD Y1, Y13, Y13 |
| VPXOR Y13, Y9, Y9 |
| VPSLLD $0x0c, Y9, Y3 |
| VPSRLD $0x14, Y9, Y9 |
| VPXOR Y3, Y9, Y9 |
| VPADDD Y9, Y5, Y5 |
| VPXOR Y5, Y1, Y1 |
| VPSHUFB ·rol8<>+0(SB), Y1, Y1 |
| VPADDD Y1, Y13, Y13 |
| VPXOR Y13, Y9, Y9 |
| VPSLLD $0x07, Y9, Y3 |
| VPSRLD $0x19, Y9, Y9 |
| VPXOR Y3, Y9, Y9 |
| VPADDD Y10, Y6, Y6 |
| VPXOR Y6, Y2, Y2 |
| VPSHUFB ·rol16<>+0(SB), Y2, Y2 |
| VPADDD Y2, Y8, Y8 |
| VPXOR Y8, Y10, Y10 |
| VPSLLD $0x0c, Y10, Y3 |
| VPSRLD $0x14, Y10, Y10 |
| VPXOR Y3, Y10, Y10 |
| VPADDD Y10, Y6, Y6 |
| VPXOR Y6, Y2, Y2 |
| VPSHUFB ·rol8<>+0(SB), Y2, Y2 |
| VPADDD Y2, Y8, Y8 |
| VPXOR Y8, Y10, Y10 |
| VPSLLD $0x07, Y10, Y3 |
| VPSRLD $0x19, Y10, Y10 |
| VPXOR Y3, Y10, Y10 |
| VPALIGNR $0x0c, Y14, Y14, Y14 |
| VPALIGNR $0x0c, Y9, Y9, Y9 |
| VPALIGNR $0x0c, Y10, Y10, Y10 |
| VPALIGNR $0x08, Y12, Y12, Y12 |
| VPALIGNR $0x08, Y13, Y13, Y13 |
| VPALIGNR $0x08, Y8, Y8, Y8 |
| VPALIGNR $0x04, Y4, Y4, Y4 |
| VPALIGNR $0x04, Y1, Y1, Y1 |
| VPALIGNR $0x04, Y2, Y2, Y2 |
| CMPQ R9, CX |
| JB openAVX2Tail384LoopB |
| CMPQ R9, $0x0a |
| JNE openAVX2Tail384LoopA |
| MOVQ BX, R9 |
| SUBQ SI, BX |
| MOVQ BX, CX |
| MOVQ 224(BP), BX |
| |
| openAVX2Tail384Hash: |
| ADDQ $0x10, CX |
| CMPQ CX, BX |
| JGT openAVX2Tail384HashEnd |
| ADDQ (R9), R10 |
| ADCQ 8(R9), R11 |
| ADCQ $0x01, R12 |
| MOVQ (BP), DX |
| MOVQ DX, R15 |
| MULXQ R10, R13, R14 |
| IMULQ R12, R15 |
| MULXQ R11, AX, DX |
| ADDQ AX, R14 |
| ADCQ DX, R15 |
| MOVQ 8(BP), DX |
| MULXQ R10, R10, AX |
| ADDQ R10, R14 |
| MULXQ R11, R11, R8 |
| ADCQ R11, R15 |
| ADCQ $0x00, R8 |
| IMULQ R12, DX |
| ADDQ AX, R15 |
| ADCQ DX, R8 |
| MOVQ R13, R10 |
| MOVQ R14, R11 |
| MOVQ R15, R12 |
| ANDQ $0x03, R12 |
| MOVQ R15, R13 |
| ANDQ $-4, R13 |
| MOVQ R8, R14 |
| SHRQ $0x02, R8, R15 |
| SHRQ $0x02, R8 |
| ADDQ R13, R10 |
| ADCQ R14, R11 |
| ADCQ $0x00, R12 |
| ADDQ R15, R10 |
| ADCQ R8, R11 |
| ADCQ $0x00, R12 |
| LEAQ 16(R9), R9 |
| JMP openAVX2Tail384Hash |
| |
| openAVX2Tail384HashEnd: |
| VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 |
| VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 |
| VPADDD ·chacha20Constants<>+0(SB), Y6, Y6 |
| VPADDD 32(BP), Y14, Y14 |
| VPADDD 32(BP), Y9, Y9 |
| VPADDD 32(BP), Y10, Y10 |
| VPADDD 64(BP), Y12, Y12 |
| VPADDD 64(BP), Y13, Y13 |
| VPADDD 64(BP), Y8, Y8 |
| VPADDD 96(BP), Y4, Y4 |
| VPADDD 128(BP), Y1, Y1 |
| VPADDD 160(BP), Y2, Y2 |
| VPERM2I128 $0x02, Y0, Y14, Y3 |
| VPERM2I128 $0x02, Y12, Y4, Y7 |
| VPERM2I128 $0x13, Y0, Y14, Y11 |
| VPERM2I128 $0x13, Y12, Y4, Y15 |
| VPXOR (SI), Y3, Y3 |
| VPXOR 32(SI), Y7, Y7 |
| VPXOR 64(SI), Y11, Y11 |
| VPXOR 96(SI), Y15, Y15 |
| VMOVDQU Y3, (DI) |
| VMOVDQU Y7, 32(DI) |
| VMOVDQU Y11, 64(DI) |
| VMOVDQU Y15, 96(DI) |
| VPERM2I128 $0x02, Y5, Y9, Y3 |
| VPERM2I128 $0x02, Y13, Y1, Y7 |
| VPERM2I128 $0x13, Y5, Y9, Y11 |
| VPERM2I128 $0x13, Y13, Y1, Y15 |
| VPXOR 128(SI), Y3, Y3 |
| VPXOR 160(SI), Y7, Y7 |
| VPXOR 192(SI), Y11, Y11 |
| VPXOR 224(SI), Y15, Y15 |
| VMOVDQU Y3, 128(DI) |
| VMOVDQU Y7, 160(DI) |
| VMOVDQU Y11, 192(DI) |
| VMOVDQU Y15, 224(DI) |
| VPERM2I128 $0x02, Y6, Y10, Y0 |
| VPERM2I128 $0x02, Y8, Y2, Y14 |
| VPERM2I128 $0x13, Y6, Y10, Y12 |
| VPERM2I128 $0x13, Y8, Y2, Y4 |
| LEAQ 256(SI), SI |
| LEAQ 256(DI), DI |
| SUBQ $0x00000100, BX |
| JMP openAVX2TailLoop |
| |
| openAVX2Tail512: |
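| // Need to decrypt up to 512 bytes - prepare eight blocks |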
| VMOVDQU ·chacha20Constants<>+0(SB), Y0 |
| VMOVDQA Y0, Y5 |
| VMOVDQA Y0, Y6 |
| VMOVDQA Y0, Y7 |
| VMOVDQA 32(BP), Y14 |
| VMOVDQA Y14, Y9 |
| VMOVDQA Y14, Y10 |
| VMOVDQA Y14, Y11 |
| VMOVDQA 64(BP), Y12 |
| VMOVDQA Y12, Y13 |
| VMOVDQA Y12, Y8 |
| VMOVDQA Y12, Y15 |
| VMOVDQA 192(BP), Y4 |
| VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 |
| VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 |
| VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 |
| VPADDD ·avx2IncMask<>+0(SB), Y2, Y3 |
| VMOVDQA Y4, 96(BP) |
| VMOVDQA Y1, 128(BP) |
| VMOVDQA Y2, 160(BP) |
| VMOVDQA Y3, 192(BP) |
| XORQ CX, CX |
| MOVQ SI, R9 |
| |
| openAVX2Tail512LoopB: |
| ADDQ (R9), R10 |
| ADCQ 8(R9), R11 |
| ADCQ $0x01, R12 |
| MOVQ (BP), DX |
| MOVQ DX, R15 |
| MULXQ R10, R13, R14 |
| IMULQ R12, R15 |
| MULXQ R11, AX, DX |
| ADDQ AX, R14 |
| ADCQ DX, R15 |
| MOVQ 8(BP), DX |
| MULXQ R10, R10, AX |
| ADDQ R10, R14 |
| MULXQ R11, R11, R8 |
| ADCQ R11, R15 |
| ADCQ $0x00, R8 |
| IMULQ R12, DX |
| ADDQ AX, R15 |
| ADCQ DX, R8 |
| MOVQ R13, R10 |
| MOVQ R14, R11 |
| MOVQ R15, R12 |
| ANDQ $0x03, R12 |
| MOVQ R15, R13 |
| ANDQ $-4, R13 |
| MOVQ R8, R14 |
| SHRQ $0x02, R8, R15 |
| SHRQ $0x02, R8 |
| ADDQ R13, R10 |
| ADCQ R14, R11 |
| ADCQ $0x00, R12 |
| ADDQ R15, R10 |
| ADCQ R8, R11 |
| ADCQ $0x00, R12 |
| LEAQ 16(R9), R9 |
| |
| openAVX2Tail512LoopA: |
| VPADDD Y14, Y0, Y0 |
| VPADDD Y9, Y5, Y5 |
| VPADDD Y10, Y6, Y6 |
| VPADDD Y11, Y7, Y7 |
| VPXOR Y0, Y4, Y4 |
| VPXOR Y5, Y1, Y1 |
| VPXOR Y6, Y2, Y2 |
| VPXOR Y7, Y3, Y3 |
| VPSHUFB ·rol16<>+0(SB), Y4, Y4 |
| VPSHUFB ·rol16<>+0(SB), Y1, Y1 |
| VPSHUFB ·rol16<>+0(SB), Y2, Y2 |
| VPSHUFB ·rol16<>+0(SB), Y3, Y3 |
| VPADDD Y4, Y12, Y12 |
| VPADDD Y1, Y13, Y13 |
| VPADDD Y2, Y8, Y8 |
| VPADDD Y3, Y15, Y15 |
| VPXOR Y12, Y14, Y14 |
| VPXOR Y13, Y9, Y9 |
| VPXOR Y8, Y10, Y10 |
| VPXOR Y15, Y11, Y11 |
| VMOVDQA Y15, 224(BP) |
| VPSLLD $0x0c, Y14, Y15 |
| VPSRLD $0x14, Y14, Y14 |
| VPXOR Y15, Y14, Y14 |
| VPSLLD $0x0c, Y9, Y15 |
| VPSRLD $0x14, Y9, Y9 |
| VPXOR Y15, Y9, Y9 |
| VPSLLD $0x0c, Y10, Y15 |
| VPSRLD $0x14, Y10, Y10 |
| VPXOR Y15, Y10, Y10 |
| VPSLLD $0x0c, Y11, Y15 |
| VPSRLD $0x14, Y11, Y11 |
| VPXOR Y15, Y11, Y11 |
| VMOVDQA 224(BP), Y15 |
| ADDQ (R9), R10 |
| ADCQ 8(R9), R11 |
| ADCQ $0x01, R12 |
| MOVQ (BP), DX |
| MOVQ DX, R15 |
| MULXQ R10, R13, R14 |
| IMULQ R12, R15 |
| MULXQ R11, AX, DX |
| ADDQ AX, R14 |
| ADCQ DX, R15 |
| MOVQ 8(BP), DX |
| MULXQ R10, R10, AX |
| ADDQ R10, R14 |
| MULXQ R11, R11, R8 |
| ADCQ R11, R15 |
| ADCQ $0x00, R8 |
| IMULQ R12, DX |
| ADDQ AX, R15 |
| ADCQ DX, R8 |
| MOVQ R13, R10 |
| MOVQ R14, R11 |
| MOVQ R15, R12 |
| ANDQ $0x03, R12 |
| MOVQ R15, R13 |
| ANDQ $-4, R13 |
| MOVQ R8, R14 |
| SHRQ $0x02, R8, R15 |
| SHRQ $0x02, R8 |
| ADDQ R13, R10 |
| ADCQ R14, R11 |
| ADCQ $0x00, R12 |
| ADDQ R15, R10 |
| ADCQ R8, R11 |
| ADCQ $0x00, R12 |
| VPADDD Y14, Y0, Y0 |
| VPADDD Y9, Y5, Y5 |
| VPADDD Y10, Y6, Y6 |
| VPADDD Y11, Y7, Y7 |
| VPXOR Y0, Y4, Y4 |
| VPXOR Y5, Y1, Y1 |
| VPXOR Y6, Y2, Y2 |
| VPXOR Y7, Y3, Y3 |
| VPSHUFB ·rol8<>+0(SB), Y4, Y4 |
| VPSHUFB ·rol8<>+0(SB), Y1, Y1 |
| VPSHUFB ·rol8<>+0(SB), Y2, Y2 |
| VPSHUFB ·rol8<>+0(SB), Y3, Y3 |
| VPADDD Y4, Y12, Y12 |
| VPADDD Y1, Y13, Y13 |
| VPADDD Y2, Y8, Y8 |
| VPADDD Y3, Y15, Y15 |
| VPXOR Y12, Y14, Y14 |
| VPXOR Y13, Y9, Y9 |
| VPXOR Y8, Y10, Y10 |
| VPXOR Y15, Y11, Y11 |
| VMOVDQA Y15, 224(BP) |
| VPSLLD $0x07, Y14, Y15 |
| VPSRLD $0x19, Y14, Y14 |
| VPXOR Y15, Y14, Y14 |
| VPSLLD $0x07, Y9, Y15 |
| VPSRLD $0x19, Y9, Y9 |
| VPXOR Y15, Y9, Y9 |
| VPSLLD $0x07, Y10, Y15 |
| VPSRLD $0x19, Y10, Y10 |
| VPXOR Y15, Y10, Y10 |
| VPSLLD $0x07, Y11, Y15 |
| VPSRLD $0x19, Y11, Y11 |
| VPXOR Y15, Y11, Y11 |
| VMOVDQA 224(BP), Y15 |
| VPALIGNR $0x04, Y14, Y14, Y14 |
| VPALIGNR $0x04, Y9, Y9, Y9 |
| VPALIGNR $0x04, Y10, Y10, Y10 |
| VPALIGNR $0x04, Y11, Y11, Y11 |
| VPALIGNR $0x08, Y12, Y12, Y12 |
| VPALIGNR $0x08, Y13, Y13, Y13 |
| VPALIGNR $0x08, Y8, Y8, Y8 |
| VPALIGNR $0x08, Y15, Y15, Y15 |
| VPALIGNR $0x0c, Y4, Y4, Y4 |
| VPALIGNR $0x0c, Y1, Y1, Y1 |
| VPALIGNR $0x0c, Y2, Y2, Y2 |
| VPALIGNR $0x0c, Y3, Y3, Y3 |
| VPADDD Y14, Y0, Y0 |
| VPADDD Y9, Y5, Y5 |
| VPADDD Y10, Y6, Y6 |
| VPADDD Y11, Y7, Y7 |
| VPXOR Y0, Y4, Y4 |
| VPXOR Y5, Y1, Y1 |
| VPXOR Y6, Y2, Y2 |
| VPXOR Y7, Y3, Y3 |
| VPSHUFB ·rol16<>+0(SB), Y4, Y4 |
| VPSHUFB ·rol16<>+0(SB), Y1, Y1 |
| VPSHUFB ·rol16<>+0(SB), Y2, Y2 |
| VPSHUFB ·rol16<>+0(SB), Y3, Y3 |
| VPADDD Y4, Y12, Y12 |
| VPADDD Y1, Y13, Y13 |
| VPADDD Y2, Y8, Y8 |
| VPADDD Y3, Y15, Y15 |
| VPXOR Y12, Y14, Y14 |
| VPXOR Y13, Y9, Y9 |
| VPXOR Y8, Y10, Y10 |
| VPXOR Y15, Y11, Y11 |
| ADDQ 16(R9), R10 |
| ADCQ 24(R9), R11 |
| ADCQ $0x01, R12 |
| MOVQ (BP), DX |
| MOVQ DX, R15 |
| MULXQ R10, R13, R14 |
| IMULQ R12, R15 |
| MULXQ R11, AX, DX |
| ADDQ AX, R14 |
| ADCQ DX, R15 |
| MOVQ 8(BP), DX |
| MULXQ R10, R10, AX |
| ADDQ R10, R14 |
| MULXQ R11, R11, R8 |
| ADCQ R11, R15 |
| ADCQ $0x00, R8 |
| IMULQ R12, DX |
| ADDQ AX, R15 |
| ADCQ DX, R8 |
| MOVQ R13, R10 |
| MOVQ R14, R11 |
| MOVQ R15, R12 |
| ANDQ $0x03, R12 |
| MOVQ R15, R13 |
| ANDQ $-4, R13 |
| MOVQ R8, R14 |
| SHRQ $0x02, R8, R15 |
| SHRQ $0x02, R8 |
| ADDQ R13, R10 |
| ADCQ R14, R11 |
| ADCQ $0x00, R12 |
| ADDQ R15, R10 |
| ADCQ R8, R11 |
| ADCQ $0x00, R12 |
| LEAQ 32(R9), R9 |
| VMOVDQA Y15, 224(BP) |
| VPSLLD $0x0c, Y14, Y15 |
| VPSRLD $0x14, Y14, Y14 |
| VPXOR Y15, Y14, Y14 |
| VPSLLD $0x0c, Y9, Y15 |
| VPSRLD $0x14, Y9, Y9 |
| VPXOR Y15, Y9, Y9 |
| VPSLLD $0x0c, Y10, Y15 |
| VPSRLD $0x14, Y10, Y10 |
| VPXOR Y15, Y10, Y10 |
| VPSLLD $0x0c, Y11, Y15 |
| VPSRLD $0x14, Y11, Y11 |
| VPXOR Y15, Y11, Y11 |
| VMOVDQA 224(BP), Y15 |
| VPADDD Y14, Y0, Y0 |
| VPADDD Y9, Y5, Y5 |
| VPADDD Y10, Y6, Y6 |
| VPADDD Y11, Y7, Y7 |
| VPXOR Y0, Y4, Y4 |
| VPXOR Y5, Y1, Y1 |
| VPXOR Y6, Y2, Y2 |
| VPXOR Y7, Y3, Y3 |
| VPSHUFB ·rol8<>+0(SB), Y4, Y4 |
| VPSHUFB ·rol8<>+0(SB), Y1, Y1 |
| VPSHUFB ·rol8<>+0(SB), Y2, Y2 |
| VPSHUFB ·rol8<>+0(SB), Y3, Y3 |
| VPADDD Y4, Y12, Y12 |
| VPADDD Y1, Y13, Y13 |
| VPADDD Y2, Y8, Y8 |
| VPADDD Y3, Y15, Y15 |
| VPXOR Y12, Y14, Y14 |
| VPXOR Y13, Y9, Y9 |
| VPXOR Y8, Y10, Y10 |
| VPXOR Y15, Y11, Y11 |
| VMOVDQA Y15, 224(BP) |
| VPSLLD $0x07, Y14, Y15 |
| VPSRLD $0x19, Y14, Y14 |
| VPXOR Y15, Y14, Y14 |
| VPSLLD $0x07, Y9, Y15 |
| VPSRLD $0x19, Y9, Y9 |
| VPXOR Y15, Y9, Y9 |
| VPSLLD $0x07, Y10, Y15 |
| VPSRLD $0x19, Y10, Y10 |
| VPXOR Y15, Y10, Y10 |
| VPSLLD $0x07, Y11, Y15 |
| VPSRLD $0x19, Y11, Y11 |
| VPXOR Y15, Y11, Y11 |
| VMOVDQA 224(BP), Y15 |
| VPALIGNR $0x0c, Y14, Y14, Y14 |
| VPALIGNR $0x0c, Y9, Y9, Y9 |
| VPALIGNR $0x0c, Y10, Y10, Y10 |
| VPALIGNR $0x0c, Y11, Y11, Y11 |
| VPALIGNR $0x08, Y12, Y12, Y12 |
| VPALIGNR $0x08, Y13, Y13, Y13 |
| VPALIGNR $0x08, Y8, Y8, Y8 |
| VPALIGNR $0x08, Y15, Y15, Y15 |
| VPALIGNR $0x04, Y4, Y4, Y4 |
| VPALIGNR $0x04, Y1, Y1, Y1 |
| VPALIGNR $0x04, Y2, Y2, Y2 |
| VPALIGNR $0x04, Y3, Y3, Y3 |
| INCQ CX |
| CMPQ CX, $0x04 |
| JLT openAVX2Tail512LoopB |
| CMPQ CX, $0x0a |
| JNE openAVX2Tail512LoopA |
| MOVQ BX, CX |
| SUBQ $0x00000180, CX |
| ANDQ $-16, CX |
| |
| openAVX2Tail512HashLoop: |
| TESTQ CX, CX |
| JE openAVX2Tail512HashEnd |
| ADDQ (R9), R10 |
| ADCQ 8(R9), R11 |
| ADCQ $0x01, R12 |
| MOVQ (BP), DX |
| MOVQ DX, R15 |
| MULXQ R10, R13, R14 |
| IMULQ R12, R15 |
| MULXQ R11, AX, DX |
| ADDQ AX, R14 |
| ADCQ DX, R15 |
| MOVQ 8(BP), DX |
| MULXQ R10, R10, AX |
| ADDQ R10, R14 |
| MULXQ R11, R11, R8 |
| ADCQ R11, R15 |
| ADCQ $0x00, R8 |
| IMULQ R12, DX |
| ADDQ AX, R15 |
| ADCQ DX, R8 |
| MOVQ R13, R10 |
| MOVQ R14, R11 |
| MOVQ R15, R12 |
| ANDQ $0x03, R12 |
| MOVQ R15, R13 |
| ANDQ $-4, R13 |
| MOVQ R8, R14 |
| SHRQ $0x02, R8, R15 |
| SHRQ $0x02, R8 |
| ADDQ R13, R10 |
| ADCQ R14, R11 |
| ADCQ $0x00, R12 |
| ADDQ R15, R10 |
| ADCQ R8, R11 |
| ADCQ $0x00, R12 |
| LEAQ 16(R9), R9 |
| SUBQ $0x10, CX |
| JMP openAVX2Tail512HashLoop |
| |
| openAVX2Tail512HashEnd: |
| VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 |
| VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 |
| VPADDD ·chacha20Constants<>+0(SB), Y6, Y6 |
| VPADDD ·chacha20Constants<>+0(SB), Y7, Y7 |
| VPADDD 32(BP), Y14, Y14 |
| VPADDD 32(BP), Y9, Y9 |
| VPADDD 32(BP), Y10, Y10 |
| VPADDD 32(BP), Y11, Y11 |
| VPADDD 64(BP), Y12, Y12 |
| VPADDD 64(BP), Y13, Y13 |
| VPADDD 64(BP), Y8, Y8 |
| VPADDD 64(BP), Y15, Y15 |
| VPADDD 96(BP), Y4, Y4 |
| VPADDD 128(BP), Y1, Y1 |
| VPADDD 160(BP), Y2, Y2 |
| VPADDD 192(BP), Y3, Y3 |
| VMOVDQA Y15, 224(BP) |
| VPERM2I128 $0x02, Y0, Y14, Y15 |
| VPERM2I128 $0x13, Y0, Y14, Y14 |
| VPERM2I128 $0x02, Y12, Y4, Y0 |
| VPERM2I128 $0x13, Y12, Y4, Y12 |
| VPXOR (SI), Y15, Y15 |
| VPXOR 32(SI), Y0, Y0 |
| VPXOR 64(SI), Y14, Y14 |
| VPXOR 96(SI), Y12, Y12 |
| VMOVDQU Y15, (DI) |
| VMOVDQU Y0, 32(DI) |
| VMOVDQU Y14, 64(DI) |
| VMOVDQU Y12, 96(DI) |
| VPERM2I128 $0x02, Y5, Y9, Y0 |
| VPERM2I128 $0x02, Y13, Y1, Y14 |
| VPERM2I128 $0x13, Y5, Y9, Y12 |
| VPERM2I128 $0x13, Y13, Y1, Y4 |
| VPXOR 128(SI), Y0, Y0 |
| VPXOR 160(SI), Y14, Y14 |
| VPXOR 192(SI), Y12, Y12 |
| VPXOR 224(SI), Y4, Y4 |
| VMOVDQU Y0, 128(DI) |
| VMOVDQU Y14, 160(DI) |
| VMOVDQU Y12, 192(DI) |
| VMOVDQU Y4, 224(DI) |
| VPERM2I128 $0x02, Y6, Y10, Y0 |
| VPERM2I128 $0x02, Y8, Y2, Y14 |
| VPERM2I128 $0x13, Y6, Y10, Y12 |
| VPERM2I128 $0x13, Y8, Y2, Y4 |
| VPXOR 256(SI), Y0, Y0 |
| VPXOR 288(SI), Y14, Y14 |
| VPXOR 320(SI), Y12, Y12 |
| VPXOR 352(SI), Y4, Y4 |
| VMOVDQU Y0, 256(DI) |
| VMOVDQU Y14, 288(DI) |
| VMOVDQU Y12, 320(DI) |
| VMOVDQU Y4, 352(DI) |
| VPERM2I128 $0x02, Y7, Y11, Y0 |
| VPERM2I128 $0x02, 224(BP), Y3, Y14 |
| VPERM2I128 $0x13, Y7, Y11, Y12 |
| VPERM2I128 $0x13, 224(BP), Y3, Y4 |
| LEAQ 384(SI), SI |
| LEAQ 384(DI), DI |
| SUBQ $0x00000180, BX |
| JMP openAVX2TailLoop |
| |
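| // The four ChaCha20 constant words ("expand 32-byte k"), duplicated for both AVX2 lanes |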
| DATA ·chacha20Constants<>+0(SB)/4, $0x61707865 |
| DATA ·chacha20Constants<>+4(SB)/4, $0x3320646e |
| DATA ·chacha20Constants<>+8(SB)/4, $0x79622d32 |
| DATA ·chacha20Constants<>+12(SB)/4, $0x6b206574 |
| DATA ·chacha20Constants<>+16(SB)/4, $0x61707865 |
| DATA ·chacha20Constants<>+20(SB)/4, $0x3320646e |
| DATA ·chacha20Constants<>+24(SB)/4, $0x79622d32 |
| DATA ·chacha20Constants<>+28(SB)/4, $0x6b206574 |
| GLOBL ·chacha20Constants<>(SB), RODATA|NOPTR, $32 |
| |
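| // Clamp mask for the Poly1305 r value; the all-ones upper half leaves s unchanged |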
| DATA ·polyClampMask<>+0(SB)/8, $0x0ffffffc0fffffff |
| DATA ·polyClampMask<>+8(SB)/8, $0x0ffffffc0ffffffc |
| DATA ·polyClampMask<>+16(SB)/8, $0xffffffffffffffff |
| DATA ·polyClampMask<>+24(SB)/8, $0xffffffffffffffff |
| GLOBL ·polyClampMask<>(SB), RODATA|NOPTR, $32 |
| |
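| // Adds one to the block counter of a single SSE ChaCha20 state |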
| DATA ·sseIncMask<>+0(SB)/8, $0x0000000000000001 |
| DATA ·sseIncMask<>+8(SB)/8, $0x0000000000000000 |
| GLOBL ·sseIncMask<>(SB), RODATA|NOPTR, $16 |
| |
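| // Masks selecting the low 1 to 15 bytes of a 16-byte block, used for partial tails |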
| DATA ·andMask<>+0(SB)/8, $0x00000000000000ff |
| DATA ·andMask<>+8(SB)/8, $0x0000000000000000 |
| DATA ·andMask<>+16(SB)/8, $0x000000000000ffff |
| DATA ·andMask<>+24(SB)/8, $0x0000000000000000 |
| DATA ·andMask<>+32(SB)/8, $0x0000000000ffffff |
| DATA ·andMask<>+40(SB)/8, $0x0000000000000000 |
| DATA ·andMask<>+48(SB)/8, $0x00000000ffffffff |
| DATA ·andMask<>+56(SB)/8, $0x0000000000000000 |
| DATA ·andMask<>+64(SB)/8, $0x000000ffffffffff |
| DATA ·andMask<>+72(SB)/8, $0x0000000000000000 |
| DATA ·andMask<>+80(SB)/8, $0x0000ffffffffffff |
| DATA ·andMask<>+88(SB)/8, $0x0000000000000000 |
| DATA ·andMask<>+96(SB)/8, $0x00ffffffffffffff |
| DATA ·andMask<>+104(SB)/8, $0x0000000000000000 |
| DATA ·andMask<>+112(SB)/8, $0xffffffffffffffff |
| DATA ·andMask<>+120(SB)/8, $0x0000000000000000 |
| DATA ·andMask<>+128(SB)/8, $0xffffffffffffffff |
| DATA ·andMask<>+136(SB)/8, $0x00000000000000ff |
| DATA ·andMask<>+144(SB)/8, $0xffffffffffffffff |
| DATA ·andMask<>+152(SB)/8, $0x000000000000ffff |
| DATA ·andMask<>+160(SB)/8, $0xffffffffffffffff |
| DATA ·andMask<>+168(SB)/8, $0x0000000000ffffff |
| DATA ·andMask<>+176(SB)/8, $0xffffffffffffffff |
| DATA ·andMask<>+184(SB)/8, $0x00000000ffffffff |
| DATA ·andMask<>+192(SB)/8, $0xffffffffffffffff |
| DATA ·andMask<>+200(SB)/8, $0x000000ffffffffff |
| DATA ·andMask<>+208(SB)/8, $0xffffffffffffffff |
| DATA ·andMask<>+216(SB)/8, $0x0000ffffffffffff |
| DATA ·andMask<>+224(SB)/8, $0xffffffffffffffff |
| DATA ·andMask<>+232(SB)/8, $0x00ffffffffffffff |
| GLOBL ·andMask<>(SB), RODATA|NOPTR, $240 |
| |
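| // Adds 0 and 1 to the block counters of the two halves of an AVX2 state |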
| DATA ·avx2InitMask<>+0(SB)/8, $0x0000000000000000 |
| DATA ·avx2InitMask<>+8(SB)/8, $0x0000000000000000 |
| DATA ·avx2InitMask<>+16(SB)/8, $0x0000000000000001 |
| DATA ·avx2InitMask<>+24(SB)/8, $0x0000000000000000 |
| GLOBL ·avx2InitMask<>(SB), RODATA|NOPTR, $32 |
| |
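| // PSHUFB mask rotating every 32-bit word left by 16 bits |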
| DATA ·rol16<>+0(SB)/8, $0x0504070601000302 |
| DATA ·rol16<>+8(SB)/8, $0x0d0c0f0e09080b0a |
| DATA ·rol16<>+16(SB)/8, $0x0504070601000302 |
| DATA ·rol16<>+24(SB)/8, $0x0d0c0f0e09080b0a |
| GLOBL ·rol16<>(SB), RODATA|NOPTR, $32 |
| |
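| // PSHUFB mask rotating every 32-bit word left by 8 bits |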
| DATA ·rol8<>+0(SB)/8, $0x0605040702010003 |
| DATA ·rol8<>+8(SB)/8, $0x0e0d0c0f0a09080b |
| DATA ·rol8<>+16(SB)/8, $0x0605040702010003 |
| DATA ·rol8<>+24(SB)/8, $0x0e0d0c0f0a09080b |
| GLOBL ·rol8<>(SB), RODATA|NOPTR, $32 |
| |
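| // Adds two to both block counters of an AVX2 state (each state holds two blocks) |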
| DATA ·avx2IncMask<>+0(SB)/8, $0x0000000000000002 |
| DATA ·avx2IncMask<>+8(SB)/8, $0x0000000000000000 |
| DATA ·avx2IncMask<>+16(SB)/8, $0x0000000000000002 |
| DATA ·avx2IncMask<>+24(SB)/8, $0x0000000000000000 |
| GLOBL ·avx2IncMask<>(SB), RODATA|NOPTR, $32 |
| |
| // func chacha20Poly1305Seal(dst []byte, key []uint32, src []byte, ad []byte) |
| // Requires: AVX, AVX2, BMI2, CMOV, SSE2 |
| TEXT ·chacha20Poly1305Seal(SB), $288-96 |
| MOVQ SP, BP |
| ADDQ $0x20, BP |
| ANDQ $-32, BP |
| MOVQ dst_base+0(FP), DI |
| MOVQ key_base+24(FP), R8 |
| MOVQ src_base+48(FP), SI |
| MOVQ src_len+56(FP), BX |
| MOVQ ad_base+72(FP), CX |
| CMPB ·useAVX2+0(SB), $0x01 |
| JE chacha20Poly1305Seal_AVX2 |
| |
| // Special optimization for very short buffers |
| CMPQ BX, $0x80 |
| JBE sealSSE128 |
| |
| // In the seal case - prepare the poly key + 3 blocks of stream in the first iteration |
| MOVOU ·chacha20Constants<>+0(SB), X0 |
| MOVOU 16(R8), X3 |
| MOVOU 32(R8), X6 |
| MOVOU 48(R8), X9 |
| |
| // Store state on stack for future use |
| MOVO X3, 32(BP) |
| MOVO X6, 48(BP) |
| |
| // Load state, increment counter blocks |
| MOVO X0, X1 |
| MOVO X3, X4 |
| MOVO X6, X7 |
| MOVO X9, X10 |
| PADDL ·sseIncMask<>+0(SB), X10 |
| MOVO X1, X2 |
| MOVO X4, X5 |
| MOVO X7, X8 |
| MOVO X10, X11 |
| PADDL ·sseIncMask<>+0(SB), X11 |
| MOVO X2, X12 |
| MOVO X5, X13 |
| MOVO X8, X14 |
| MOVO X11, X15 |
| PADDL ·sseIncMask<>+0(SB), X15 |
| |
| // Store counters |
| MOVO X9, 80(BP) |
| MOVO X10, 96(BP) |
| MOVO X11, 112(BP) |
| MOVO X15, 128(BP) |
| MOVQ $0x0000000a, R9 |
| |
| sealSSEIntroLoop: |
| MOVO X14, 64(BP) |
| PADDD X3, X0 |
| PXOR X0, X9 |
| ROL16(X9, X14) |
| PADDD X9, X6 |
| PXOR X6, X3 |
| MOVO X3, X14 |
| PSLLL $0x0c, X14 |
| PSRLL $0x14, X3 |
| PXOR X14, X3 |
| PADDD X3, X0 |
| PXOR X0, X9 |
| ROL8(X9, X14) |
| PADDD X9, X6 |
| PXOR X6, X3 |
| MOVO X3, X14 |
| PSLLL $0x07, X14 |
| PSRLL $0x19, X3 |
| PXOR X14, X3 |
| PADDD X4, X1 |
| PXOR X1, X10 |
| ROL16(X10, X14) |
| PADDD X10, X7 |
| PXOR X7, X4 |
| MOVO X4, X14 |
| PSLLL $0x0c, X14 |
| PSRLL $0x14, X4 |
| PXOR X14, X4 |
| PADDD X4, X1 |
| PXOR X1, X10 |
| ROL8(X10, X14) |
| PADDD X10, X7 |
| PXOR X7, X4 |
| MOVO X4, X14 |
| PSLLL $0x07, X14 |
| PSRLL $0x19, X4 |
| PXOR X14, X4 |
| PADDD X5, X2 |
| PXOR X2, X11 |
| ROL16(X11, X14) |
| PADDD X11, X8 |
| PXOR X8, X5 |
| MOVO X5, X14 |
| PSLLL $0x0c, X14 |
| PSRLL $0x14, X5 |
| PXOR X14, X5 |
| PADDD X5, X2 |
| PXOR X2, X11 |
| ROL8(X11, X14) |
| PADDD X11, X8 |
| PXOR X8, X5 |
| MOVO X5, X14 |
| PSLLL $0x07, X14 |
| PSRLL $0x19, X5 |
| PXOR X14, X5 |
| MOVO 64(BP), X14 |
| MOVO X7, 64(BP) |
| PADDD X13, X12 |
| PXOR X12, X15 |
| ROL16(X15, X7) |
| PADDD X15, X14 |
| PXOR X14, X13 |
| MOVO X13, X7 |
| PSLLL $0x0c, X7 |
| PSRLL $0x14, X13 |
| PXOR X7, X13 |
| PADDD X13, X12 |
| PXOR X12, X15 |
| ROL8(X15, X7) |
| PADDD X15, X14 |
| PXOR X14, X13 |
| MOVO X13, X7 |
| PSLLL $0x07, X7 |
| PSRLL $0x19, X13 |
| PXOR X7, X13 |
| MOVO 64(BP), X7 |
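| // Raw-encoded PALIGNR instructions (66 0F 3A 0F /r ib): rotate the b, c and d rows of the |
| // four states left by 4, 8 and 12 bytes to diagonalize them for the second half of the round |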
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xdb |
| BYTE $0x04 |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xe4 |
| BYTE $0x04 |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xed |
| BYTE $0x04 |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xed |
| BYTE $0x04 |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xf6 |
| BYTE $0x08 |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xff |
| BYTE $0x08 |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xc0 |
| BYTE $0x08 |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xf6 |
| BYTE $0x08 |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xc9 |
| BYTE $0x0c |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xd2 |
| BYTE $0x0c |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xdb |
| BYTE $0x0c |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xff |
| BYTE $0x0c |
| MOVO X14, 64(BP) |
| PADDD X3, X0 |
| PXOR X0, X9 |
| ROL16(X9, X14) |
| PADDD X9, X6 |
| PXOR X6, X3 |
| MOVO X3, X14 |
| PSLLL $0x0c, X14 |
| PSRLL $0x14, X3 |
| PXOR X14, X3 |
| PADDD X3, X0 |
| PXOR X0, X9 |
| ROL8(X9, X14) |
| PADDD X9, X6 |
| PXOR X6, X3 |
| MOVO X3, X14 |
| PSLLL $0x07, X14 |
| PSRLL $0x19, X3 |
| PXOR X14, X3 |
| PADDD X4, X1 |
| PXOR X1, X10 |
| ROL16(X10, X14) |
| PADDD X10, X7 |
| PXOR X7, X4 |
| MOVO X4, X14 |
| PSLLL $0x0c, X14 |
| PSRLL $0x14, X4 |
| PXOR X14, X4 |
| PADDD X4, X1 |
| PXOR X1, X10 |
| ROL8(X10, X14) |
| PADDD X10, X7 |
| PXOR X7, X4 |
| MOVO X4, X14 |
| PSLLL $0x07, X14 |
| PSRLL $0x19, X4 |
| PXOR X14, X4 |
| PADDD X5, X2 |
| PXOR X2, X11 |
| ROL16(X11, X14) |
| PADDD X11, X8 |
| PXOR X8, X5 |
| MOVO X5, X14 |
| PSLLL $0x0c, X14 |
| PSRLL $0x14, X5 |
| PXOR X14, X5 |
| PADDD X5, X2 |
| PXOR X2, X11 |
| ROL8(X11, X14) |
| PADDD X11, X8 |
| PXOR X8, X5 |
| MOVO X5, X14 |
| PSLLL $0x07, X14 |
| PSRLL $0x19, X5 |
| PXOR X14, X5 |
| MOVO 64(BP), X14 |
| MOVO X7, 64(BP) |
| PADDD X13, X12 |
| PXOR X12, X15 |
| ROL16(X15, X7) |
| PADDD X15, X14 |
| PXOR X14, X13 |
| MOVO X13, X7 |
| PSLLL $0x0c, X7 |
| PSRLL $0x14, X13 |
| PXOR X7, X13 |
| PADDD X13, X12 |
| PXOR X12, X15 |
| ROL8(X15, X7) |
| PADDD X15, X14 |
| PXOR X14, X13 |
| MOVO X13, X7 |
| PSLLL $0x07, X7 |
| PSRLL $0x19, X13 |
| PXOR X7, X13 |
| MOVO 64(BP), X7 |
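| // Raw-encoded PALIGNR instructions: rotate the rows back by 12, 8 and 4 bytes to undo the |
| // diagonalization |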
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xdb |
| BYTE $0x0c |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xe4 |
| BYTE $0x0c |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xed |
| BYTE $0x0c |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xed |
| BYTE $0x0c |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xf6 |
| BYTE $0x08 |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xff |
| BYTE $0x08 |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xc0 |
| BYTE $0x08 |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xf6 |
| BYTE $0x08 |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xc9 |
| BYTE $0x04 |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xd2 |
| BYTE $0x04 |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xdb |
| BYTE $0x04 |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xff |
| BYTE $0x04 |
| DECQ R9 |
| JNE sealSSEIntroLoop |
| |
| // Add in the state |
| PADDD ·chacha20Constants<>+0(SB), X0 |
| PADDD ·chacha20Constants<>+0(SB), X1 |
| PADDD ·chacha20Constants<>+0(SB), X2 |
| PADDD ·chacha20Constants<>+0(SB), X12 |
| PADDD 32(BP), X3 |
| PADDD 32(BP), X4 |
| PADDD 32(BP), X5 |
| PADDD 32(BP), X13 |
| PADDD 48(BP), X7 |
| PADDD 48(BP), X8 |
| PADDD 48(BP), X14 |
| PADDD 96(BP), X10 |
| PADDD 112(BP), X11 |
| PADDD 128(BP), X15 |
| |
| // Clamp and store the key |
| PAND ·polyClampMask<>+0(SB), X0 |
| MOVO X0, (BP) |
| MOVO X3, 16(BP) |
| |
| // Hash AAD |
| MOVQ ad_len+80(FP), R9 |
| CALL polyHashADInternal<>(SB) |
| MOVOU (SI), X0 |
| MOVOU 16(SI), X3 |
| MOVOU 32(SI), X6 |
| MOVOU 48(SI), X9 |
| PXOR X0, X1 |
| PXOR X3, X4 |
| PXOR X6, X7 |
| PXOR X9, X10 |
| MOVOU X1, (DI) |
| MOVOU X4, 16(DI) |
| MOVOU X7, 32(DI) |
| MOVOU X10, 48(DI) |
| MOVOU 64(SI), X0 |
| MOVOU 80(SI), X3 |
| MOVOU 96(SI), X6 |
| MOVOU 112(SI), X9 |
| PXOR X0, X2 |
| PXOR X3, X5 |
| PXOR X6, X8 |
| PXOR X9, X11 |
| MOVOU X2, 64(DI) |
| MOVOU X5, 80(DI) |
| MOVOU X8, 96(DI) |
| MOVOU X11, 112(DI) |
| MOVQ $0x00000080, CX |
| SUBQ $0x80, BX |
| LEAQ 128(SI), SI |
| MOVO X12, X1 |
| MOVO X13, X4 |
| MOVO X14, X7 |
| MOVO X15, X10 |
| CMPQ BX, $0x40 |
| JBE sealSSE128SealHash |
| MOVOU (SI), X0 |
| MOVOU 16(SI), X3 |
| MOVOU 32(SI), X6 |
| MOVOU 48(SI), X9 |
| PXOR X0, X12 |
| PXOR X3, X13 |
| PXOR X6, X14 |
| PXOR X9, X15 |
| MOVOU X12, 128(DI) |
| MOVOU X13, 144(DI) |
| MOVOU X14, 160(DI) |
| MOVOU X15, 176(DI) |
| ADDQ $0x40, CX |
| SUBQ $0x40, BX |
| LEAQ 64(SI), SI |
| MOVQ $0x00000002, CX |
| MOVQ $0x00000008, R9 |
| CMPQ BX, $0x40 |
| JBE sealSSETail64 |
| CMPQ BX, $0x80 |
| JBE sealSSETail128 |
| CMPQ BX, $0xc0 |
| JBE sealSSETail192 |
| |
| sealSSEMainLoop: |
| // Load state, increment counter blocks |
| MOVO ·chacha20Constants<>+0(SB), X0 |
| MOVO 32(BP), X3 |
| MOVO 48(BP), X6 |
| MOVO 128(BP), X9 |
| PADDL ·sseIncMask<>+0(SB), X9 |
| MOVO X0, X1 |
| MOVO X3, X4 |
| MOVO X6, X7 |
| MOVO X9, X10 |
| PADDL ·sseIncMask<>+0(SB), X10 |
| MOVO X1, X2 |
| MOVO X4, X5 |
| MOVO X7, X8 |
| MOVO X10, X11 |
| PADDL ·sseIncMask<>+0(SB), X11 |
| MOVO X2, X12 |
| MOVO X5, X13 |
| MOVO X8, X14 |
| MOVO X11, X15 |
| PADDL ·sseIncMask<>+0(SB), X15 |
| |
| // Store counters |
| MOVO X9, 80(BP) |
| MOVO X10, 96(BP) |
| MOVO X11, 112(BP) |
| MOVO X15, 128(BP) |
| |
| sealSSEInnerLoop: |
| MOVO X14, 64(BP) |
| PADDD X3, X0 |
| PXOR X0, X9 |
| ROL16(X9, X14) |
| PADDD X9, X6 |
| PXOR X6, X3 |
| MOVO X3, X14 |
| PSLLL $0x0c, X14 |
| PSRLL $0x14, X3 |
| PXOR X14, X3 |
| PADDD X3, X0 |
| PXOR X0, X9 |
| ROL8(X9, X14) |
| PADDD X9, X6 |
| PXOR X6, X3 |
| MOVO X3, X14 |
| PSLLL $0x07, X14 |
| PSRLL $0x19, X3 |
| PXOR X14, X3 |
| PADDD X4, X1 |
| PXOR X1, X10 |
| ROL16(X10, X14) |
| PADDD X10, X7 |
| PXOR X7, X4 |
| MOVO X4, X14 |
| PSLLL $0x0c, X14 |
| PSRLL $0x14, X4 |
| PXOR X14, X4 |
| PADDD X4, X1 |
| PXOR X1, X10 |
| ROL8(X10, X14) |
| PADDD X10, X7 |
| PXOR X7, X4 |
| MOVO X4, X14 |
| PSLLL $0x07, X14 |
| PSRLL $0x19, X4 |
| PXOR X14, X4 |
| PADDD X5, X2 |
| PXOR X2, X11 |
| ROL16(X11, X14) |
| PADDD X11, X8 |
| PXOR X8, X5 |
| MOVO X5, X14 |
| PSLLL $0x0c, X14 |
| PSRLL $0x14, X5 |
| PXOR X14, X5 |
| PADDD X5, X2 |
| PXOR X2, X11 |
| ROL8(X11, X14) |
| PADDD X11, X8 |
| PXOR X8, X5 |
| MOVO X5, X14 |
| PSLLL $0x07, X14 |
| PSRLL $0x19, X5 |
| PXOR X14, X5 |
| MOVO 64(BP), X14 |
| MOVO X7, 64(BP) |
| PADDD X13, X12 |
| PXOR X12, X15 |
| ROL16(X15, X7) |
| PADDD X15, X14 |
| PXOR X14, X13 |
| MOVO X13, X7 |
| PSLLL $0x0c, X7 |
| PSRLL $0x14, X13 |
| PXOR X7, X13 |
| PADDD X13, X12 |
| PXOR X12, X15 |
| ROL8(X15, X7) |
| PADDD X15, X14 |
| PXOR X14, X13 |
| MOVO X13, X7 |
| PSLLL $0x07, X7 |
| PSRLL $0x19, X13 |
| PXOR X7, X13 |
| MOVO 64(BP), X7 |
| ADDQ (DI), R10 |
| ADCQ 8(DI), R11 |
| ADCQ $0x01, R12 |
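| // Raw-encoded PALIGNR instructions: diagonalize the four states (rotate rows by 4, 8 and 12 bytes) |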
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xdb |
| BYTE $0x04 |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xe4 |
| BYTE $0x04 |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xed |
| BYTE $0x04 |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xed |
| BYTE $0x04 |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xf6 |
| BYTE $0x08 |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xff |
| BYTE $0x08 |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xc0 |
| BYTE $0x08 |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xf6 |
| BYTE $0x08 |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xc9 |
| BYTE $0x0c |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xd2 |
| BYTE $0x0c |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xdb |
| BYTE $0x0c |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xff |
| BYTE $0x0c |
| MOVQ (BP), AX |
| MOVQ AX, R15 |
| MULQ R10 |
| MOVQ AX, R13 |
| MOVQ DX, R14 |
| MOVQ (BP), AX |
| MULQ R11 |
| IMULQ R12, R15 |
| ADDQ AX, R14 |
| ADCQ DX, R15 |
| MOVQ 8(BP), AX |
| MOVQ AX, R8 |
| MULQ R10 |
| ADDQ AX, R14 |
| ADCQ $0x00, DX |
| MOVQ DX, R10 |
| MOVQ 8(BP), AX |
| MULQ R11 |
| ADDQ AX, R15 |
| ADCQ $0x00, DX |
| LEAQ 16(DI), DI |
| MOVO X14, 64(BP) |
| PADDD X3, X0 |
| PXOR X0, X9 |
| ROL16(X9, X14) |
| PADDD X9, X6 |
| PXOR X6, X3 |
| MOVO X3, X14 |
| PSLLL $0x0c, X14 |
| PSRLL $0x14, X3 |
| PXOR X14, X3 |
| PADDD X3, X0 |
| PXOR X0, X9 |
| ROL8(X9, X14) |
| PADDD X9, X6 |
| PXOR X6, X3 |
| MOVO X3, X14 |
| PSLLL $0x07, X14 |
| PSRLL $0x19, X3 |
| PXOR X14, X3 |
| PADDD X4, X1 |
| PXOR X1, X10 |
| ROL16(X10, X14) |
| PADDD X10, X7 |
| PXOR X7, X4 |
| MOVO X4, X14 |
| PSLLL $0x0c, X14 |
| PSRLL $0x14, X4 |
| PXOR X14, X4 |
| PADDD X4, X1 |
| PXOR X1, X10 |
| ROL8(X10, X14) |
| PADDD X10, X7 |
| PXOR X7, X4 |
| MOVO X4, X14 |
| PSLLL $0x07, X14 |
| PSRLL $0x19, X4 |
| PXOR X14, X4 |
| PADDD X5, X2 |
| PXOR X2, X11 |
| ROL16(X11, X14) |
| PADDD X11, X8 |
| PXOR X8, X5 |
| MOVO X5, X14 |
| PSLLL $0x0c, X14 |
| PSRLL $0x14, X5 |
| PXOR X14, X5 |
| PADDD X5, X2 |
| PXOR X2, X11 |
| ROL8(X11, X14) |
| PADDD X11, X8 |
| PXOR X8, X5 |
| MOVO X5, X14 |
| PSLLL $0x07, X14 |
| PSRLL $0x19, X5 |
| PXOR X14, X5 |
| MOVO 64(BP), X14 |
| MOVO X7, 64(BP) |
| IMULQ R12, R8 |
| ADDQ R10, R15 |
| ADCQ DX, R8 |
| PADDD X13, X12 |
| PXOR X12, X15 |
| ROL16(X15, X7) |
| PADDD X15, X14 |
| PXOR X14, X13 |
| MOVO X13, X7 |
| PSLLL $0x0c, X7 |
| PSRLL $0x14, X13 |
| PXOR X7, X13 |
| PADDD X13, X12 |
| PXOR X12, X15 |
| ROL8(X15, X7) |
| PADDD X15, X14 |
| PXOR X14, X13 |
| MOVO X13, X7 |
| PSLLL $0x07, X7 |
| PSRLL $0x19, X13 |
| PXOR X7, X13 |
| MOVO 64(BP), X7 |
| MOVQ R13, R10 |
| MOVQ R14, R11 |
| MOVQ R15, R12 |
| ANDQ $0x03, R12 |
| MOVQ R15, R13 |
| ANDQ $-4, R13 |
| MOVQ R8, R14 |
| SHRQ $0x02, R8, R15 |
| SHRQ $0x02, R8 |
| ADDQ R13, R10 |
| ADCQ R14, R11 |
| ADCQ $0x00, R12 |
| ADDQ R15, R10 |
| ADCQ R8, R11 |
| ADCQ $0x00, R12 |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xdb |
| BYTE $0x0c |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xe4 |
| BYTE $0x0c |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xed |
| BYTE $0x0c |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xed |
| BYTE $0x0c |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xf6 |
| BYTE $0x08 |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xff |
| BYTE $0x08 |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xc0 |
| BYTE $0x08 |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xf6 |
| BYTE $0x08 |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xc9 |
| BYTE $0x04 |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xd2 |
| BYTE $0x04 |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xdb |
| BYTE $0x04 |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xff |
| BYTE $0x04 |
| DECQ R9 |
| JGE sealSSEInnerLoop |
| ADDQ (DI), R10 |
| ADCQ 8(DI), R11 |
| ADCQ $0x01, R12 |
| MOVQ (BP), AX |
| MOVQ AX, R15 |
| MULQ R10 |
| MOVQ AX, R13 |
| MOVQ DX, R14 |
| MOVQ (BP), AX |
| MULQ R11 |
| IMULQ R12, R15 |
| ADDQ AX, R14 |
| ADCQ DX, R15 |
| MOVQ 8(BP), AX |
| MOVQ AX, R8 |
| MULQ R10 |
| ADDQ AX, R14 |
| ADCQ $0x00, DX |
| MOVQ DX, R10 |
| MOVQ 8(BP), AX |
| MULQ R11 |
| ADDQ AX, R15 |
| ADCQ $0x00, DX |
| IMULQ R12, R8 |
| ADDQ R10, R15 |
| ADCQ DX, R8 |
| MOVQ R13, R10 |
| MOVQ R14, R11 |
| MOVQ R15, R12 |
| ANDQ $0x03, R12 |
| MOVQ R15, R13 |
| ANDQ $-4, R13 |
| MOVQ R8, R14 |
| SHRQ $0x02, R8, R15 |
| SHRQ $0x02, R8 |
| ADDQ R13, R10 |
| ADCQ R14, R11 |
| ADCQ $0x00, R12 |
| ADDQ R15, R10 |
| ADCQ R8, R11 |
| ADCQ $0x00, R12 |
| LEAQ 16(DI), DI |
| DECQ CX |
| JG sealSSEInnerLoop |
| |
| // Add in the state |
| PADDD ·chacha20Constants<>+0(SB), X0 |
| PADDD ·chacha20Constants<>+0(SB), X1 |
| PADDD ·chacha20Constants<>+0(SB), X2 |
| PADDD ·chacha20Constants<>+0(SB), X12 |
| PADDD 32(BP), X3 |
| PADDD 32(BP), X4 |
| PADDD 32(BP), X5 |
| PADDD 32(BP), X13 |
| PADDD 48(BP), X6 |
| PADDD 48(BP), X7 |
| PADDD 48(BP), X8 |
| PADDD 48(BP), X14 |
| PADDD 80(BP), X9 |
| PADDD 96(BP), X10 |
| PADDD 112(BP), X11 |
| PADDD 128(BP), X15 |
| MOVO X15, 64(BP) |
| |
| // Load - xor - store |
| MOVOU (SI), X15 |
| PXOR X15, X0 |
| MOVOU 16(SI), X15 |
| PXOR X15, X3 |
| MOVOU 32(SI), X15 |
| PXOR X15, X6 |
| MOVOU 48(SI), X15 |
| PXOR X15, X9 |
| MOVOU X0, (DI) |
| MOVOU X3, 16(DI) |
| MOVOU X6, 32(DI) |
| MOVOU X9, 48(DI) |
| MOVO 64(BP), X15 |
| MOVOU 64(SI), X0 |
| MOVOU 80(SI), X3 |
| MOVOU 96(SI), X6 |
| MOVOU 112(SI), X9 |
| PXOR X0, X1 |
| PXOR X3, X4 |
| PXOR X6, X7 |
| PXOR X9, X10 |
| MOVOU X1, 64(DI) |
| MOVOU X4, 80(DI) |
| MOVOU X7, 96(DI) |
| MOVOU X10, 112(DI) |
| MOVOU 128(SI), X0 |
| MOVOU 144(SI), X3 |
| MOVOU 160(SI), X6 |
| MOVOU 176(SI), X9 |
| PXOR X0, X2 |
| PXOR X3, X5 |
| PXOR X6, X8 |
| PXOR X9, X11 |
| MOVOU X2, 128(DI) |
| MOVOU X5, 144(DI) |
| MOVOU X8, 160(DI) |
| MOVOU X11, 176(DI) |
| ADDQ $0xc0, SI |
| MOVQ $0x000000c0, CX |
| SUBQ $0xc0, BX |
| MOVO X12, X1 |
| MOVO X13, X4 |
| MOVO X14, X7 |
| MOVO X15, X10 |
| CMPQ BX, $0x40 |
| JBE sealSSE128SealHash |
| MOVOU (SI), X0 |
| MOVOU 16(SI), X3 |
| MOVOU 32(SI), X6 |
| MOVOU 48(SI), X9 |
| PXOR X0, X12 |
| PXOR X3, X13 |
| PXOR X6, X14 |
| PXOR X9, X15 |
| MOVOU X12, 192(DI) |
| MOVOU X13, 208(DI) |
| MOVOU X14, 224(DI) |
| MOVOU X15, 240(DI) |
| LEAQ 64(SI), SI |
| SUBQ $0x40, BX |
| MOVQ $0x00000006, CX |
| MOVQ $0x00000004, R9 |
| CMPQ BX, $0xc0 |
| JG sealSSEMainLoop |
| MOVQ BX, CX |
| TESTQ BX, BX |
| JE sealSSE128SealHash |
| MOVQ $0x00000006, CX |
| CMPQ BX, $0x40 |
| JBE sealSSETail64 |
| CMPQ BX, $0x80 |
| JBE sealSSETail128 |
| JMP sealSSETail192 |
| |
| sealSSETail64: |
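	// Need to encrypt up to 64 bytes - prepare a single block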
| MOVO ·chacha20Constants<>+0(SB), X1 |
| MOVO 32(BP), X4 |
| MOVO 48(BP), X7 |
| MOVO 128(BP), X10 |
| PADDL ·sseIncMask<>+0(SB), X10 |
| MOVO X10, 80(BP) |
| |
| sealSSETail64LoopA: |
| ADDQ (DI), R10 |
| ADCQ 8(DI), R11 |
| ADCQ $0x01, R12 |
| MOVQ (BP), AX |
| MOVQ AX, R15 |
| MULQ R10 |
| MOVQ AX, R13 |
| MOVQ DX, R14 |
| MOVQ (BP), AX |
| MULQ R11 |
| IMULQ R12, R15 |
| ADDQ AX, R14 |
| ADCQ DX, R15 |
| MOVQ 8(BP), AX |
| MOVQ AX, R8 |
| MULQ R10 |
| ADDQ AX, R14 |
| ADCQ $0x00, DX |
| MOVQ DX, R10 |
| MOVQ 8(BP), AX |
| MULQ R11 |
| ADDQ AX, R15 |
| ADCQ $0x00, DX |
| IMULQ R12, R8 |
| ADDQ R10, R15 |
| ADCQ DX, R8 |
| MOVQ R13, R10 |
| MOVQ R14, R11 |
| MOVQ R15, R12 |
| ANDQ $0x03, R12 |
| MOVQ R15, R13 |
| ANDQ $-4, R13 |
| MOVQ R8, R14 |
| SHRQ $0x02, R8, R15 |
| SHRQ $0x02, R8 |
| ADDQ R13, R10 |
| ADCQ R14, R11 |
| ADCQ $0x00, R12 |
| ADDQ R15, R10 |
| ADCQ R8, R11 |
| ADCQ $0x00, R12 |
| LEAQ 16(DI), DI |
| |
| sealSSETail64LoopB: |
| PADDD X4, X1 |
| PXOR X1, X10 |
| ROL16(X10, X13) |
| PADDD X10, X7 |
| PXOR X7, X4 |
| MOVO X4, X13 |
| PSLLL $0x0c, X13 |
| PSRLL $0x14, X4 |
| PXOR X13, X4 |
| PADDD X4, X1 |
| PXOR X1, X10 |
| ROL8(X10, X13) |
| PADDD X10, X7 |
| PXOR X7, X4 |
| MOVO X4, X13 |
| PSLLL $0x07, X13 |
| PSRLL $0x19, X4 |
| PXOR X13, X4 |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xe4 |
| BYTE $0x04 |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xff |
| BYTE $0x08 |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xd2 |
| BYTE $0x0c |
| PADDD X4, X1 |
| PXOR X1, X10 |
| ROL16(X10, X13) |
| PADDD X10, X7 |
| PXOR X7, X4 |
| MOVO X4, X13 |
| PSLLL $0x0c, X13 |
| PSRLL $0x14, X4 |
| PXOR X13, X4 |
| PADDD X4, X1 |
| PXOR X1, X10 |
| ROL8(X10, X13) |
| PADDD X10, X7 |
| PXOR X7, X4 |
| MOVO X4, X13 |
| PSLLL $0x07, X13 |
| PSRLL $0x19, X4 |
| PXOR X13, X4 |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xe4 |
| BYTE $0x0c |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xff |
| BYTE $0x08 |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xd2 |
| BYTE $0x04 |
| ADDQ (DI), R10 |
| ADCQ 8(DI), R11 |
| ADCQ $0x01, R12 |
| MOVQ (BP), AX |
| MOVQ AX, R15 |
| MULQ R10 |
| MOVQ AX, R13 |
| MOVQ DX, R14 |
| MOVQ (BP), AX |
| MULQ R11 |
| IMULQ R12, R15 |
| ADDQ AX, R14 |
| ADCQ DX, R15 |
| MOVQ 8(BP), AX |
| MOVQ AX, R8 |
| MULQ R10 |
| ADDQ AX, R14 |
| ADCQ $0x00, DX |
| MOVQ DX, R10 |
| MOVQ 8(BP), AX |
| MULQ R11 |
| ADDQ AX, R15 |
| ADCQ $0x00, DX |
| IMULQ R12, R8 |
| ADDQ R10, R15 |
| ADCQ DX, R8 |
| MOVQ R13, R10 |
| MOVQ R14, R11 |
| MOVQ R15, R12 |
| ANDQ $0x03, R12 |
| MOVQ R15, R13 |
| ANDQ $-4, R13 |
| MOVQ R8, R14 |
| SHRQ $0x02, R8, R15 |
| SHRQ $0x02, R8 |
| ADDQ R13, R10 |
| ADCQ R14, R11 |
| ADCQ $0x00, R12 |
| ADDQ R15, R10 |
| ADCQ R8, R11 |
| ADCQ $0x00, R12 |
| LEAQ 16(DI), DI |
| DECQ CX |
| JG sealSSETail64LoopA |
| DECQ R9 |
| JGE sealSSETail64LoopB |
| PADDL ·chacha20Constants<>+0(SB), X1 |
| PADDL 32(BP), X4 |
| PADDL 48(BP), X7 |
| PADDL 80(BP), X10 |
| JMP sealSSE128Seal |
| |
| sealSSETail128: |
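	// Need to encrypt up to 128 bytes - prepare two blocks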
| MOVO ·chacha20Constants<>+0(SB), X0 |
| MOVO 32(BP), X3 |
| MOVO 48(BP), X6 |
| MOVO 128(BP), X9 |
| PADDL ·sseIncMask<>+0(SB), X9 |
| MOVO X9, 80(BP) |
| MOVO X0, X1 |
| MOVO X3, X4 |
| MOVO X6, X7 |
| MOVO X9, X10 |
| PADDL ·sseIncMask<>+0(SB), X10 |
| MOVO X10, 96(BP) |
| |
| sealSSETail128LoopA: |
| ADDQ (DI), R10 |
| ADCQ 8(DI), R11 |
| ADCQ $0x01, R12 |
| MOVQ (BP), AX |
| MOVQ AX, R15 |
| MULQ R10 |
| MOVQ AX, R13 |
| MOVQ DX, R14 |
| MOVQ (BP), AX |
| MULQ R11 |
| IMULQ R12, R15 |
| ADDQ AX, R14 |
| ADCQ DX, R15 |
| MOVQ 8(BP), AX |
| MOVQ AX, R8 |
| MULQ R10 |
| ADDQ AX, R14 |
| ADCQ $0x00, DX |
| MOVQ DX, R10 |
| MOVQ 8(BP), AX |
| MULQ R11 |
| ADDQ AX, R15 |
| ADCQ $0x00, DX |
| IMULQ R12, R8 |
| ADDQ R10, R15 |
| ADCQ DX, R8 |
| MOVQ R13, R10 |
| MOVQ R14, R11 |
| MOVQ R15, R12 |
| ANDQ $0x03, R12 |
| MOVQ R15, R13 |
| ANDQ $-4, R13 |
| MOVQ R8, R14 |
| SHRQ $0x02, R8, R15 |
| SHRQ $0x02, R8 |
| ADDQ R13, R10 |
| ADCQ R14, R11 |
| ADCQ $0x00, R12 |
| ADDQ R15, R10 |
| ADCQ R8, R11 |
| ADCQ $0x00, R12 |
| LEAQ 16(DI), DI |
| |
| sealSSETail128LoopB: |
| PADDD X3, X0 |
| PXOR X0, X9 |
| ROL16(X9, X12) |
| PADDD X9, X6 |
| PXOR X6, X3 |
| MOVO X3, X12 |
| PSLLL $0x0c, X12 |
| PSRLL $0x14, X3 |
| PXOR X12, X3 |
| PADDD X3, X0 |
| PXOR X0, X9 |
| ROL8(X9, X12) |
| PADDD X9, X6 |
| PXOR X6, X3 |
| MOVO X3, X12 |
| PSLLL $0x07, X12 |
| PSRLL $0x19, X3 |
| PXOR X12, X3 |
| PADDD X4, X1 |
| PXOR X1, X10 |
| ROL16(X10, X12) |
| PADDD X10, X7 |
| PXOR X7, X4 |
| MOVO X4, X12 |
| PSLLL $0x0c, X12 |
| PSRLL $0x14, X4 |
| PXOR X12, X4 |
| PADDD X4, X1 |
| PXOR X1, X10 |
| ROL8(X10, X12) |
| PADDD X10, X7 |
| PXOR X7, X4 |
| MOVO X4, X12 |
| PSLLL $0x07, X12 |
| PSRLL $0x19, X4 |
| PXOR X12, X4 |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xdb |
| BYTE $0x04 |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xf6 |
| BYTE $0x08 |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xc9 |
| BYTE $0x0c |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xe4 |
| BYTE $0x04 |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xff |
| BYTE $0x08 |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xd2 |
| BYTE $0x0c |
| ADDQ (DI), R10 |
| ADCQ 8(DI), R11 |
| ADCQ $0x01, R12 |
| MOVQ (BP), AX |
| MOVQ AX, R15 |
| MULQ R10 |
| MOVQ AX, R13 |
| MOVQ DX, R14 |
| MOVQ (BP), AX |
| MULQ R11 |
| IMULQ R12, R15 |
| ADDQ AX, R14 |
| ADCQ DX, R15 |
| MOVQ 8(BP), AX |
| MOVQ AX, R8 |
| MULQ R10 |
| ADDQ AX, R14 |
| ADCQ $0x00, DX |
| MOVQ DX, R10 |
| MOVQ 8(BP), AX |
| MULQ R11 |
| ADDQ AX, R15 |
| ADCQ $0x00, DX |
| IMULQ R12, R8 |
| ADDQ R10, R15 |
| ADCQ DX, R8 |
| MOVQ R13, R10 |
| MOVQ R14, R11 |
| MOVQ R15, R12 |
| ANDQ $0x03, R12 |
| MOVQ R15, R13 |
| ANDQ $-4, R13 |
| MOVQ R8, R14 |
| SHRQ $0x02, R8, R15 |
| SHRQ $0x02, R8 |
| ADDQ R13, R10 |
| ADCQ R14, R11 |
| ADCQ $0x00, R12 |
| ADDQ R15, R10 |
| ADCQ R8, R11 |
| ADCQ $0x00, R12 |
| LEAQ 16(DI), DI |
| PADDD X3, X0 |
| PXOR X0, X9 |
| ROL16(X9, X12) |
| PADDD X9, X6 |
| PXOR X6, X3 |
| MOVO X3, X12 |
| PSLLL $0x0c, X12 |
| PSRLL $0x14, X3 |
| PXOR X12, X3 |
| PADDD X3, X0 |
| PXOR X0, X9 |
| ROL8(X9, X12) |
| PADDD X9, X6 |
| PXOR X6, X3 |
| MOVO X3, X12 |
| PSLLL $0x07, X12 |
| PSRLL $0x19, X3 |
| PXOR X12, X3 |
| PADDD X4, X1 |
| PXOR X1, X10 |
| ROL16(X10, X12) |
| PADDD X10, X7 |
| PXOR X7, X4 |
| MOVO X4, X12 |
| PSLLL $0x0c, X12 |
| PSRLL $0x14, X4 |
| PXOR X12, X4 |
| PADDD X4, X1 |
| PXOR X1, X10 |
| ROL8(X10, X12) |
| PADDD X10, X7 |
| PXOR X7, X4 |
| MOVO X4, X12 |
| PSLLL $0x07, X12 |
| PSRLL $0x19, X4 |
| PXOR X12, X4 |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xdb |
| BYTE $0x0c |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xf6 |
| BYTE $0x08 |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xc9 |
| BYTE $0x04 |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xe4 |
| BYTE $0x0c |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xff |
| BYTE $0x08 |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xd2 |
| BYTE $0x04 |
| DECQ CX |
| JG sealSSETail128LoopA |
| DECQ R9 |
| JGE sealSSETail128LoopB |
| PADDL ·chacha20Constants<>+0(SB), X0 |
| PADDL ·chacha20Constants<>+0(SB), X1 |
| PADDL 32(BP), X3 |
| PADDL 32(BP), X4 |
| PADDL 48(BP), X6 |
| PADDL 48(BP), X7 |
| PADDL 80(BP), X9 |
| PADDL 96(BP), X10 |
| MOVOU (SI), X12 |
| MOVOU 16(SI), X13 |
| MOVOU 32(SI), X14 |
| MOVOU 48(SI), X15 |
| PXOR X12, X0 |
| PXOR X13, X3 |
| PXOR X14, X6 |
| PXOR X15, X9 |
| MOVOU X0, (DI) |
| MOVOU X3, 16(DI) |
| MOVOU X6, 32(DI) |
| MOVOU X9, 48(DI) |
| MOVQ $0x00000040, CX |
| LEAQ 64(SI), SI |
| SUBQ $0x40, BX |
| JMP sealSSE128SealHash |
| |
| sealSSETail192: |
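	// Need to encrypt up to 192 bytes - prepare three blocks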
| MOVO ·chacha20Constants<>+0(SB), X0 |
| MOVO 32(BP), X3 |
| MOVO 48(BP), X6 |
| MOVO 128(BP), X9 |
| PADDL ·sseIncMask<>+0(SB), X9 |
| MOVO X9, 80(BP) |
| MOVO X0, X1 |
| MOVO X3, X4 |
| MOVO X6, X7 |
| MOVO X9, X10 |
| PADDL ·sseIncMask<>+0(SB), X10 |
| MOVO X10, 96(BP) |
| MOVO X1, X2 |
| MOVO X4, X5 |
| MOVO X7, X8 |
| MOVO X10, X11 |
| PADDL ·sseIncMask<>+0(SB), X11 |
| MOVO X11, 112(BP) |
| |
| sealSSETail192LoopA: |
| ADDQ (DI), R10 |
| ADCQ 8(DI), R11 |
| ADCQ $0x01, R12 |
| MOVQ (BP), AX |
| MOVQ AX, R15 |
| MULQ R10 |
| MOVQ AX, R13 |
| MOVQ DX, R14 |
| MOVQ (BP), AX |
| MULQ R11 |
| IMULQ R12, R15 |
| ADDQ AX, R14 |
| ADCQ DX, R15 |
| MOVQ 8(BP), AX |
| MOVQ AX, R8 |
| MULQ R10 |
| ADDQ AX, R14 |
| ADCQ $0x00, DX |
| MOVQ DX, R10 |
| MOVQ 8(BP), AX |
| MULQ R11 |
| ADDQ AX, R15 |
| ADCQ $0x00, DX |
| IMULQ R12, R8 |
| ADDQ R10, R15 |
| ADCQ DX, R8 |
| MOVQ R13, R10 |
| MOVQ R14, R11 |
| MOVQ R15, R12 |
| ANDQ $0x03, R12 |
| MOVQ R15, R13 |
| ANDQ $-4, R13 |
| MOVQ R8, R14 |
| SHRQ $0x02, R8, R15 |
| SHRQ $0x02, R8 |
| ADDQ R13, R10 |
| ADCQ R14, R11 |
| ADCQ $0x00, R12 |
| ADDQ R15, R10 |
| ADCQ R8, R11 |
| ADCQ $0x00, R12 |
| LEAQ 16(DI), DI |
| |
| sealSSETail192LoopB: |
| PADDD X3, X0 |
| PXOR X0, X9 |
| ROL16(X9, X12) |
| PADDD X9, X6 |
| PXOR X6, X3 |
| MOVO X3, X12 |
| PSLLL $0x0c, X12 |
| PSRLL $0x14, X3 |
| PXOR X12, X3 |
| PADDD X3, X0 |
| PXOR X0, X9 |
| ROL8(X9, X12) |
| PADDD X9, X6 |
| PXOR X6, X3 |
| MOVO X3, X12 |
| PSLLL $0x07, X12 |
| PSRLL $0x19, X3 |
| PXOR X12, X3 |
| PADDD X4, X1 |
| PXOR X1, X10 |
| ROL16(X10, X12) |
| PADDD X10, X7 |
| PXOR X7, X4 |
| MOVO X4, X12 |
| PSLLL $0x0c, X12 |
| PSRLL $0x14, X4 |
| PXOR X12, X4 |
| PADDD X4, X1 |
| PXOR X1, X10 |
| ROL8(X10, X12) |
| PADDD X10, X7 |
| PXOR X7, X4 |
| MOVO X4, X12 |
| PSLLL $0x07, X12 |
| PSRLL $0x19, X4 |
| PXOR X12, X4 |
| PADDD X5, X2 |
| PXOR X2, X11 |
| ROL16(X11, X12) |
| PADDD X11, X8 |
| PXOR X8, X5 |
| MOVO X5, X12 |
| PSLLL $0x0c, X12 |
| PSRLL $0x14, X5 |
| PXOR X12, X5 |
| PADDD X5, X2 |
| PXOR X2, X11 |
| ROL8(X11, X12) |
| PADDD X11, X8 |
| PXOR X8, X5 |
| MOVO X5, X12 |
| PSLLL $0x07, X12 |
| PSRLL $0x19, X5 |
| PXOR X12, X5 |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xdb |
| BYTE $0x04 |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xf6 |
| BYTE $0x08 |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xc9 |
| BYTE $0x0c |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xe4 |
| BYTE $0x04 |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xff |
| BYTE $0x08 |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xd2 |
| BYTE $0x0c |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xed |
| BYTE $0x04 |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xc0 |
| BYTE $0x08 |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xdb |
| BYTE $0x0c |
| ADDQ (DI), R10 |
| ADCQ 8(DI), R11 |
| ADCQ $0x01, R12 |
| MOVQ (BP), AX |
| MOVQ AX, R15 |
| MULQ R10 |
| MOVQ AX, R13 |
| MOVQ DX, R14 |
| MOVQ (BP), AX |
| MULQ R11 |
| IMULQ R12, R15 |
| ADDQ AX, R14 |
| ADCQ DX, R15 |
| MOVQ 8(BP), AX |
| MOVQ AX, R8 |
| MULQ R10 |
| ADDQ AX, R14 |
| ADCQ $0x00, DX |
| MOVQ DX, R10 |
| MOVQ 8(BP), AX |
| MULQ R11 |
| ADDQ AX, R15 |
| ADCQ $0x00, DX |
| IMULQ R12, R8 |
| ADDQ R10, R15 |
| ADCQ DX, R8 |
| MOVQ R13, R10 |
| MOVQ R14, R11 |
| MOVQ R15, R12 |
| ANDQ $0x03, R12 |
| MOVQ R15, R13 |
| ANDQ $-4, R13 |
| MOVQ R8, R14 |
| SHRQ $0x02, R8, R15 |
| SHRQ $0x02, R8 |
| ADDQ R13, R10 |
| ADCQ R14, R11 |
| ADCQ $0x00, R12 |
| ADDQ R15, R10 |
| ADCQ R8, R11 |
| ADCQ $0x00, R12 |
| LEAQ 16(DI), DI |
| PADDD X3, X0 |
| PXOR X0, X9 |
| ROL16(X9, X12) |
| PADDD X9, X6 |
| PXOR X6, X3 |
| MOVO X3, X12 |
| PSLLL $0x0c, X12 |
| PSRLL $0x14, X3 |
| PXOR X12, X3 |
| PADDD X3, X0 |
| PXOR X0, X9 |
| ROL8(X9, X12) |
| PADDD X9, X6 |
| PXOR X6, X3 |
| MOVO X3, X12 |
| PSLLL $0x07, X12 |
| PSRLL $0x19, X3 |
| PXOR X12, X3 |
| PADDD X4, X1 |
| PXOR X1, X10 |
| ROL16(X10, X12) |
| PADDD X10, X7 |
| PXOR X7, X4 |
| MOVO X4, X12 |
| PSLLL $0x0c, X12 |
| PSRLL $0x14, X4 |
| PXOR X12, X4 |
| PADDD X4, X1 |
| PXOR X1, X10 |
| ROL8(X10, X12) |
| PADDD X10, X7 |
| PXOR X7, X4 |
| MOVO X4, X12 |
| PSLLL $0x07, X12 |
| PSRLL $0x19, X4 |
| PXOR X12, X4 |
| PADDD X5, X2 |
| PXOR X2, X11 |
| ROL16(X11, X12) |
| PADDD X11, X8 |
| PXOR X8, X5 |
| MOVO X5, X12 |
| PSLLL $0x0c, X12 |
| PSRLL $0x14, X5 |
| PXOR X12, X5 |
| PADDD X5, X2 |
| PXOR X2, X11 |
| ROL8(X11, X12) |
| PADDD X11, X8 |
| PXOR X8, X5 |
| MOVO X5, X12 |
| PSLLL $0x07, X12 |
| PSRLL $0x19, X5 |
| PXOR X12, X5 |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xdb |
| BYTE $0x0c |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xf6 |
| BYTE $0x08 |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xc9 |
| BYTE $0x04 |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xe4 |
| BYTE $0x0c |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xff |
| BYTE $0x08 |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xd2 |
| BYTE $0x04 |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xed |
| BYTE $0x0c |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xc0 |
| BYTE $0x08 |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xdb |
| BYTE $0x04 |
| DECQ CX |
| JG sealSSETail192LoopA |
| DECQ R9 |
| JGE sealSSETail192LoopB |
| PADDL ·chacha20Constants<>+0(SB), X0 |
| PADDL ·chacha20Constants<>+0(SB), X1 |
| PADDL ·chacha20Constants<>+0(SB), X2 |
| PADDL 32(BP), X3 |
| PADDL 32(BP), X4 |
| PADDL 32(BP), X5 |
| PADDL 48(BP), X6 |
| PADDL 48(BP), X7 |
| PADDL 48(BP), X8 |
| PADDL 80(BP), X9 |
| PADDL 96(BP), X10 |
| PADDL 112(BP), X11 |
| MOVOU (SI), X12 |
| MOVOU 16(SI), X13 |
| MOVOU 32(SI), X14 |
| MOVOU 48(SI), X15 |
| PXOR X12, X0 |
| PXOR X13, X3 |
| PXOR X14, X6 |
| PXOR X15, X9 |
| MOVOU X0, (DI) |
| MOVOU X3, 16(DI) |
| MOVOU X6, 32(DI) |
| MOVOU X9, 48(DI) |
| MOVOU 64(SI), X12 |
| MOVOU 80(SI), X13 |
| MOVOU 96(SI), X14 |
| MOVOU 112(SI), X15 |
| PXOR X12, X1 |
| PXOR X13, X4 |
| PXOR X14, X7 |
| PXOR X15, X10 |
| MOVOU X1, 64(DI) |
| MOVOU X4, 80(DI) |
| MOVOU X7, 96(DI) |
| MOVOU X10, 112(DI) |
| MOVO X2, X1 |
| MOVO X5, X4 |
| MOVO X8, X7 |
| MOVO X11, X10 |
| MOVQ $0x00000080, CX |
| LEAQ 128(SI), SI |
| SUBQ $0x80, BX |
| JMP sealSSE128SealHash |
| |
| sealSSE128: |
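	// Short-input path: generate three blocks of keystream; the first becomes the
	// Poly1305 key, the rest encrypt the message.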
| MOVOU ·chacha20Constants<>+0(SB), X0 |
| MOVOU 16(R8), X3 |
| MOVOU 32(R8), X6 |
| MOVOU 48(R8), X9 |
| MOVO X0, X1 |
| MOVO X3, X4 |
| MOVO X6, X7 |
| MOVO X9, X10 |
| PADDL ·sseIncMask<>+0(SB), X10 |
| MOVO X1, X2 |
| MOVO X4, X5 |
| MOVO X7, X8 |
| MOVO X10, X11 |
| PADDL ·sseIncMask<>+0(SB), X11 |
| MOVO X3, X13 |
| MOVO X6, X14 |
| MOVO X10, X15 |
| MOVQ $0x0000000a, R9 |
| |
| sealSSE128InnerCipherLoop: |
| PADDD X3, X0 |
| PXOR X0, X9 |
| ROL16(X9, X12) |
| PADDD X9, X6 |
| PXOR X6, X3 |
| MOVO X3, X12 |
| PSLLL $0x0c, X12 |
| PSRLL $0x14, X3 |
| PXOR X12, X3 |
| PADDD X3, X0 |
| PXOR X0, X9 |
| ROL8(X9, X12) |
| PADDD X9, X6 |
| PXOR X6, X3 |
| MOVO X3, X12 |
| PSLLL $0x07, X12 |
| PSRLL $0x19, X3 |
| PXOR X12, X3 |
| PADDD X4, X1 |
| PXOR X1, X10 |
| ROL16(X10, X12) |
| PADDD X10, X7 |
| PXOR X7, X4 |
| MOVO X4, X12 |
| PSLLL $0x0c, X12 |
| PSRLL $0x14, X4 |
| PXOR X12, X4 |
| PADDD X4, X1 |
| PXOR X1, X10 |
| ROL8(X10, X12) |
| PADDD X10, X7 |
| PXOR X7, X4 |
| MOVO X4, X12 |
| PSLLL $0x07, X12 |
| PSRLL $0x19, X4 |
| PXOR X12, X4 |
| PADDD X5, X2 |
| PXOR X2, X11 |
| ROL16(X11, X12) |
| PADDD X11, X8 |
| PXOR X8, X5 |
| MOVO X5, X12 |
| PSLLL $0x0c, X12 |
| PSRLL $0x14, X5 |
| PXOR X12, X5 |
| PADDD X5, X2 |
| PXOR X2, X11 |
| ROL8(X11, X12) |
| PADDD X11, X8 |
| PXOR X8, X5 |
| MOVO X5, X12 |
| PSLLL $0x07, X12 |
| PSRLL $0x19, X5 |
| PXOR X12, X5 |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xdb |
| BYTE $0x04 |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xe4 |
| BYTE $0x04 |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xed |
| BYTE $0x04 |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xf6 |
| BYTE $0x08 |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xff |
| BYTE $0x08 |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xc0 |
| BYTE $0x08 |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xc9 |
| BYTE $0x0c |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xd2 |
| BYTE $0x0c |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xdb |
| BYTE $0x0c |
| PADDD X3, X0 |
| PXOR X0, X9 |
| ROL16(X9, X12) |
| PADDD X9, X6 |
| PXOR X6, X3 |
| MOVO X3, X12 |
| PSLLL $0x0c, X12 |
| PSRLL $0x14, X3 |
| PXOR X12, X3 |
| PADDD X3, X0 |
| PXOR X0, X9 |
| ROL8(X9, X12) |
| PADDD X9, X6 |
| PXOR X6, X3 |
| MOVO X3, X12 |
| PSLLL $0x07, X12 |
| PSRLL $0x19, X3 |
| PXOR X12, X3 |
| PADDD X4, X1 |
| PXOR X1, X10 |
| ROL16(X10, X12) |
| PADDD X10, X7 |
| PXOR X7, X4 |
| MOVO X4, X12 |
| PSLLL $0x0c, X12 |
| PSRLL $0x14, X4 |
| PXOR X12, X4 |
| PADDD X4, X1 |
| PXOR X1, X10 |
| ROL8(X10, X12) |
| PADDD X10, X7 |
| PXOR X7, X4 |
| MOVO X4, X12 |
| PSLLL $0x07, X12 |
| PSRLL $0x19, X4 |
| PXOR X12, X4 |
| PADDD X5, X2 |
| PXOR X2, X11 |
| ROL16(X11, X12) |
| PADDD X11, X8 |
| PXOR X8, X5 |
| MOVO X5, X12 |
| PSLLL $0x0c, X12 |
| PSRLL $0x14, X5 |
| PXOR X12, X5 |
| PADDD X5, X2 |
| PXOR X2, X11 |
| ROL8(X11, X12) |
| PADDD X11, X8 |
| PXOR X8, X5 |
| MOVO X5, X12 |
| PSLLL $0x07, X12 |
| PSRLL $0x19, X5 |
| PXOR X12, X5 |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xdb |
| BYTE $0x0c |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xe4 |
| BYTE $0x0c |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xed |
| BYTE $0x0c |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xf6 |
| BYTE $0x08 |
| BYTE $0x66 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xff |
| BYTE $0x08 |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xc0 |
| BYTE $0x08 |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xc9 |
| BYTE $0x04 |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xd2 |
| BYTE $0x04 |
| BYTE $0x66 |
| BYTE $0x45 |
| BYTE $0x0f |
| BYTE $0x3a |
| BYTE $0x0f |
| BYTE $0xdb |
| BYTE $0x04 |
| DECQ R9 |
| JNE sealSSE128InnerCipherLoop |
| |
	// X0 and X3 (A0|B0) hold the 32-byte Poly1305 key; X6 and X9 (C0, D0) can be discarded
| PADDL ·chacha20Constants<>+0(SB), X0 |
| PADDL ·chacha20Constants<>+0(SB), X1 |
| PADDL ·chacha20Constants<>+0(SB), X2 |
| PADDL X13, X3 |
| PADDL X13, X4 |
| PADDL X13, X5 |
| PADDL X14, X7 |
| PADDL X14, X8 |
| PADDL X15, X10 |
| PADDL ·sseIncMask<>+0(SB), X15 |
| PADDL X15, X11 |
| PAND ·polyClampMask<>+0(SB), X0 |
| MOVOU X0, (BP) |
| MOVOU X3, 16(BP) |
| |
| // Hash |
| MOVQ ad_len+80(FP), R9 |
| CALL polyHashADInternal<>(SB) |
| XORQ CX, CX |
| |
| sealSSE128SealHash: |
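	// CX holds the number of ciphertext bytes written but not yet hashed; hash them 16 bytes at a time.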
| CMPQ CX, $0x10 |
| JB sealSSE128Seal |
| ADDQ (DI), R10 |
| ADCQ 8(DI), R11 |
| ADCQ $0x01, R12 |
| MOVQ (BP), AX |
| MOVQ AX, R15 |
| MULQ R10 |
| MOVQ AX, R13 |
| MOVQ DX, R14 |
| MOVQ (BP), AX |
| MULQ R11 |
| IMULQ R12, R15 |
| ADDQ AX, R14 |
| ADCQ DX, R15 |
| MOVQ 8(BP), AX |
| MOVQ AX, R8 |
| MULQ R10 |
| ADDQ AX, R14 |
| ADCQ $0x00, DX |
| MOVQ DX, R10 |
| MOVQ 8(BP), AX |
| MULQ R11 |
| ADDQ AX, R15 |
| ADCQ $0x00, DX |
| IMULQ R12, R8 |
| ADDQ R10, R15 |
| ADCQ DX, R8 |
| MOVQ R13, R10 |
| MOVQ R14, R11 |
| MOVQ R15, R12 |
| ANDQ $0x03, R12 |
| MOVQ R15, R13 |
| ANDQ $-4, R13 |
| MOVQ R8, R14 |
| SHRQ $0x02, R8, R15 |
| SHRQ $0x02, R8 |
| ADDQ R13, R10 |
| ADCQ R14, R11 |
| ADCQ $0x00, R12 |
| ADDQ R15, R10 |
| ADCQ R8, R11 |
| ADCQ $0x00, R12 |
| SUBQ $0x10, CX |
| ADDQ $0x10, DI |
| JMP sealSSE128SealHash |
| |
| sealSSE128Seal: |
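	// Encrypt the remaining plaintext 16 bytes at a time with the buffered keystream,
	// hashing each ciphertext block as it is written.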
| CMPQ BX, $0x10 |
| JB sealSSETail |
| SUBQ $0x10, BX |
| |
	// Load for encryption
| MOVOU (SI), X12 |
| PXOR X12, X1 |
| MOVOU X1, (DI) |
| LEAQ 16(SI), SI |
| LEAQ 16(DI), DI |
| |
| // Extract for hashing |
| MOVQ X1, R13 |
| PSRLDQ $0x08, X1 |
| MOVQ X1, R14 |
| ADDQ R13, R10 |
| ADCQ R14, R11 |
| ADCQ $0x01, R12 |
| MOVQ (BP), AX |
| MOVQ AX, R15 |
| MULQ R10 |
| MOVQ AX, R13 |
| MOVQ DX, R14 |
| MOVQ (BP), AX |
| MULQ R11 |
| IMULQ R12, R15 |
| ADDQ AX, R14 |
| ADCQ DX, R15 |
| MOVQ 8(BP), AX |
| MOVQ AX, R8 |
| MULQ R10 |
| ADDQ AX, R14 |
| ADCQ $0x00, DX |
| MOVQ DX, R10 |
| MOVQ 8(BP), AX |
| MULQ R11 |
| ADDQ AX, R15 |
| ADCQ $0x00, DX |
| IMULQ R12, R8 |
| ADDQ R10, R15 |
| ADCQ DX, R8 |
| MOVQ R13, R10 |
| MOVQ R14, R11 |
| MOVQ R15, R12 |
| ANDQ $0x03, R12 |
| MOVQ R15, R13 |
| ANDQ $-4, R13 |
| MOVQ R8, R14 |
| SHRQ $0x02, R8, R15 |
| SHRQ $0x02, R8 |
| ADDQ R13, R10 |
| ADCQ R14, R11 |
| ADCQ $0x00, R12 |
| ADDQ R15, R10 |
| ADCQ R8, R11 |
| ADCQ $0x00, R12 |
| |
| // Shift the stream "left" |
| MOVO X4, X1 |
| MOVO X7, X4 |
| MOVO X10, X7 |
| MOVO X2, X10 |
| MOVO X5, X2 |
| MOVO X8, X5 |
| MOVO X11, X8 |
| JMP sealSSE128Seal |
| |
| sealSSETail: |
| TESTQ BX, BX |
| JE sealSSEFinalize |
| |
	// We can only load the plaintext one byte at a time, to avoid reading past the end of the buffer
| MOVQ BX, R9 |
| SHLQ $0x04, R9 |
| LEAQ ·andMask<>+0(SB), R13 |
| MOVQ BX, CX |
| LEAQ -1(SI)(BX*1), SI |
| XORQ R15, R15 |
| XORQ R8, R8 |
| XORQ AX, AX |
| |
| sealSSETailLoadLoop: |
| SHLQ $0x08, R15, R8 |
| SHLQ $0x08, R15 |
| MOVB (SI), AX |
| XORQ AX, R15 |
| LEAQ -1(SI), SI |
| DECQ CX |
| JNE sealSSETailLoadLoop |
| MOVQ R15, 64(BP) |
| MOVQ R8, 72(BP) |
| PXOR 64(BP), X1 |
| MOVOU X1, (DI) |
| MOVOU -16(R13)(R9*1), X12 |
| PAND X12, X1 |
| MOVQ X1, R13 |
| PSRLDQ $0x08, X1 |
| MOVQ X1, R14 |
| ADDQ R13, R10 |
| ADCQ R14, R11 |
| ADCQ $0x01, R12 |
| MOVQ (BP), AX |
| MOVQ AX, R15 |
| MULQ R10 |
| MOVQ AX, R13 |
| MOVQ DX, R14 |
| MOVQ (BP), AX |
| MULQ R11 |
| IMULQ R12, R15 |
| ADDQ AX, R14 |
| ADCQ DX, R15 |
| MOVQ 8(BP), AX |
| MOVQ AX, R8 |
| MULQ R10 |
| ADDQ AX, R14 |
| ADCQ $0x00, DX |
| MOVQ DX, R10 |
| MOVQ 8(BP), AX |
| MULQ R11 |
| ADDQ AX, R15 |
| ADCQ $0x00, DX |
| IMULQ R12, R8 |
| ADDQ R10, R15 |
| ADCQ DX, R8 |
| MOVQ R13, R10 |
| MOVQ R14, R11 |
| MOVQ R15, R12 |
| ANDQ $0x03, R12 |
| MOVQ R15, R13 |
| ANDQ $-4, R13 |
| MOVQ R8, R14 |
| SHRQ $0x02, R8, R15 |
| SHRQ $0x02, R8 |
| ADDQ R13, R10 |
| ADCQ R14, R11 |
| ADCQ $0x00, R12 |
| ADDQ R15, R10 |
| ADCQ R8, R11 |
| ADCQ $0x00, R12 |
| ADDQ BX, DI |
| |
| sealSSEFinalize: |
| // Hash in the buffer lengths |
| ADDQ ad_len+80(FP), R10 |
| ADCQ src_len+56(FP), R11 |
| ADCQ $0x01, R12 |
| MOVQ (BP), AX |
| MOVQ AX, R15 |
| MULQ R10 |
| MOVQ AX, R13 |
| MOVQ DX, R14 |
| MOVQ (BP), AX |
| MULQ R11 |
| IMULQ R12, R15 |
| ADDQ AX, R14 |
| ADCQ DX, R15 |
| MOVQ 8(BP), AX |
| MOVQ AX, R8 |
| MULQ R10 |
| ADDQ AX, R14 |
| ADCQ $0x00, DX |
| MOVQ DX, R10 |
| MOVQ 8(BP), AX |
| MULQ R11 |
| ADDQ AX, R15 |
| ADCQ $0x00, DX |
| IMULQ R12, R8 |
| ADDQ R10, R15 |
| ADCQ DX, R8 |
| MOVQ R13, R10 |
| MOVQ R14, R11 |
| MOVQ R15, R12 |
| ANDQ $0x03, R12 |
| MOVQ R15, R13 |
| ANDQ $-4, R13 |
| MOVQ R8, R14 |
| SHRQ $0x02, R8, R15 |
| SHRQ $0x02, R8 |
| ADDQ R13, R10 |
| ADCQ R14, R11 |
| ADCQ $0x00, R12 |
| ADDQ R15, R10 |
| ADCQ R8, R11 |
| ADCQ $0x00, R12 |
| |
| // Final reduce |
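	// Compute acc - (2^130 - 5); if the subtraction borrows, keep the original accumulator (CMOVQCS)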
| MOVQ R10, R13 |
| MOVQ R11, R14 |
| MOVQ R12, R15 |
| SUBQ $-5, R10 |
| SBBQ $-1, R11 |
| SBBQ $0x03, R12 |
| CMOVQCS R13, R10 |
| CMOVQCS R14, R11 |
| CMOVQCS R15, R12 |
| |
| // Add in the "s" part of the key |
| ADDQ 16(BP), R10 |
| ADCQ 24(BP), R11 |
| |
| // Finally store the tag at the end of the message |
| MOVQ R10, (DI) |
| MOVQ R11, 8(DI) |
| RET |
| |
| chacha20Poly1305Seal_AVX2: |
| VZEROUPPER |
| VMOVDQU ·chacha20Constants<>+0(SB), Y0 |
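	// The BYTE sequences encode VBROADCASTI128 16(R8), Y14; VBROADCASTI128 32(R8), Y12;
	// and VBROADCASTI128 48(R8), Y4, broadcasting the two key rows and the counter/nonce
	// row into both halves of the YMM state.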
| BYTE $0xc4 |
| BYTE $0x42 |
| BYTE $0x7d |
| BYTE $0x5a |
| BYTE $0x70 |
| BYTE $0x10 |
| BYTE $0xc4 |
| BYTE $0x42 |
| BYTE $0x7d |
| BYTE $0x5a |
| BYTE $0x60 |
| BYTE $0x20 |
| BYTE $0xc4 |
| BYTE $0xc2 |
| BYTE $0x7d |
| BYTE $0x5a |
| BYTE $0x60 |
| BYTE $0x30 |
| VPADDD ·avx2InitMask<>+0(SB), Y4, Y4 |
| |
	// Special optimization for very short buffers
| CMPQ BX, $0x000000c0 |
| JBE seal192AVX2 |
| CMPQ BX, $0x00000140 |
| JBE seal320AVX2 |
| |
	// For the general case, prepare the Poly1305 key first - as a byproduct we have 64 bytes of cipher stream
| VMOVDQA Y0, Y5 |
| VMOVDQA Y0, Y6 |
| VMOVDQA Y0, Y7 |
| VMOVDQA Y14, Y9 |
| VMOVDQA Y14, Y10 |
| VMOVDQA Y14, Y11 |
| VMOVDQA Y14, 32(BP) |
| VMOVDQA Y12, Y13 |
| VMOVDQA Y12, Y8 |
| VMOVDQA Y12, Y15 |
| VMOVDQA Y12, 64(BP) |
| VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 |
| VMOVDQA Y4, 96(BP) |
| VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 |
| VMOVDQA Y1, 128(BP) |
| VPADDD ·avx2IncMask<>+0(SB), Y2, Y3 |
| VMOVDQA Y2, 160(BP) |
| VMOVDQA Y3, 192(BP) |
| MOVQ $0x0000000a, R9 |
| |
| sealAVX2IntroLoop: |
| VMOVDQA Y15, 224(BP) |
| VPADDD Y14, Y0, Y0 |
| VPXOR Y0, Y4, Y4 |
| VPSHUFB ·rol16<>+0(SB), Y4, Y4 |
| VPADDD Y4, Y12, Y12 |
| VPXOR Y12, Y14, Y14 |
| VPSLLD $0x0c, Y14, Y15 |
| VPSRLD $0x14, Y14, Y14 |
| VPXOR Y15, Y14, Y14 |
| VPADDD Y14, Y0, Y0 |
| VPXOR Y0, Y4, Y4 |
| VPSHUFB ·rol8<>+0(SB), Y4, Y4 |
| VPADDD Y4, Y12, Y12 |
| VPXOR Y12, Y14, Y14 |
| VPSLLD $0x07, Y14, Y15 |
| VPSRLD $0x19, Y14, Y14 |
| VPXOR Y15, Y14, Y14 |
| VPADDD Y9, Y5, Y5 |
| VPXOR Y5, Y1, Y1 |
| VPSHUFB ·rol16<>+0(SB), Y1, Y1 |
| VPADDD Y1, Y13, Y13 |
| VPXOR Y13, Y9, Y9 |
| VPSLLD $0x0c, Y9, Y15 |
| VPSRLD $0x14, Y9, Y9 |
| VPXOR Y15, Y9, Y9 |
| VPADDD Y9, Y5, Y5 |
| VPXOR Y5, Y1, Y1 |
| VPSHUFB ·rol8<>+0(SB), Y1, Y1 |
| VPADDD Y1, Y13, Y13 |
| VPXOR Y13, Y9, Y9 |
| VPSLLD $0x07, Y9, Y15 |
| VPSRLD $0x19, Y9, Y9 |
| VPXOR Y15, Y9, Y9 |
| VPADDD Y10, Y6, Y6 |
| VPXOR Y6, Y2, Y2 |
| VPSHUFB ·rol16<>+0(SB), Y2, Y2 |
| VPADDD Y2, Y8, Y8 |
| VPXOR Y8, Y10, Y10 |
| VPSLLD $0x0c, Y10, Y15 |
| VPSRLD $0x14, Y10, Y10 |
| VPXOR Y15, Y10, Y10 |
| VPADDD Y10, Y6, Y6 |
| VPXOR Y6, Y2, Y2 |
| VPSHUFB ·rol8<>+0(SB), Y2, Y2 |
| VPADDD Y2, Y8, Y8 |
| VPXOR Y8, Y10, Y10 |
| VPSLLD $0x07, Y10, Y15 |
| VPSRLD $0x19, Y10, Y10 |
| VPXOR Y15, Y10, Y10 |
| VMOVDQA 224(BP), Y15 |
| VMOVDQA Y13, 224(BP) |
| VPADDD Y11, Y7, Y7 |
| VPXOR Y7, Y3, Y3 |
| VPSHUFB ·rol16<>+0(SB), Y3, Y3 |
| VPADDD Y3, Y15, Y15 |
| VPXOR Y15, Y11, Y11 |
| VPSLLD $0x0c, Y11, Y13 |
| VPSRLD $0x14, Y11, Y11 |
| VPXOR Y13, Y11, Y11 |
| VPADDD Y11, Y7, Y7 |
| VPXOR Y7, Y3, Y3 |
| VPSHUFB ·rol8<>+0(SB), Y3, Y3 |
| VPADDD Y3, Y15, Y15 |
| VPXOR Y15, Y11, Y11 |
| VPSLLD $0x07, Y11, Y13 |
| VPSRLD $0x19, Y11, Y11 |
| VPXOR Y13, Y11, Y11 |
| VMOVDQA 224(BP), Y13 |
| VPALIGNR $0x04, Y14, Y14, Y14 |
| VPALIGNR $0x08, Y12, Y12, Y12 |
| VPALIGNR $0x0c, Y4, Y4, Y4 |
| VPALIGNR $0x04, Y9, Y9, Y9 |
| VPALIGNR $0x08, Y13, Y13, Y13 |
| VPALIGNR $0x0c, Y1, Y1, Y1 |
| VPALIGNR $0x04, Y10, Y10, Y10 |
| VPALIGNR $0x08, Y8, Y8, Y8 |
| VPALIGNR $0x0c, Y2, Y2, Y2 |
| VPALIGNR $0x04, Y11, Y11, Y11 |
| VPALIGNR $0x08, Y15, Y15, Y15 |
| VPALIGNR $0x0c, Y3, Y3, Y3 |
| VMOVDQA Y15, 224(BP) |
| VPADDD Y14, Y0, Y0 |
| VPXOR Y0, Y4, Y4 |
| VPSHUFB ·rol16<>+0(SB), Y4, Y4 |
| VPADDD Y4, Y12, Y12 |
| VPXOR Y12, Y14, Y14 |
| VPSLLD $0x0c, Y14, Y15 |
| VPSRLD $0x14, Y14, Y14 |
| VPXOR Y15, Y14, Y14 |
| VPADDD Y14, Y0, Y0 |
| VPXOR Y0, Y4, Y4 |
| VPSHUFB ·rol8<>+0(SB), Y4, Y4 |
| VPADDD Y4, Y12, Y12 |
| VPXOR Y12, Y14, Y14 |
| VPSLLD $0x07, Y14, Y15 |
| VPSRLD $0x19, Y14, Y14 |
| VPXOR Y15, Y14, Y14 |
| VPADDD Y9, Y5, Y5 |
| VPXOR Y5, Y1, Y1 |
| VPSHUFB ·rol16<>+0(SB), Y1, Y1 |
| VPADDD Y1, Y13, Y13 |
| VPXOR Y13, Y9, Y9 |
| VPSLLD $0x0c, Y9, Y15 |
| VPSRLD $0x14, Y9, Y9 |
| VPXOR Y15, Y9, Y9 |
| VPADDD Y9, Y5, Y5 |
| VPXOR Y5, Y1, Y1 |
| VPSHUFB ·rol8<>+0(SB), Y1, Y1 |
| VPADDD Y1, Y13, Y13 |
| VPXOR Y13, Y9, Y9 |
| VPSLLD $0x07, Y9, Y15 |
| VPSRLD $0x19, Y9, Y9 |
| VPXOR Y15, Y9, Y9 |
| VPADDD Y10, Y6, Y6 |
| VPXOR Y6, Y2, Y2 |
| VPSHUFB ·rol16<>+0(SB), Y2, Y2 |
| VPADDD Y2, Y8, Y8 |
| VPXOR Y8, Y10, Y10 |
| VPSLLD $0x0c, Y10, Y15 |
| VPSRLD $0x14, Y10, Y10 |
| VPXOR Y15, Y10, Y10 |
| VPADDD Y10, Y6, Y6 |
| VPXOR Y6, Y2, Y2 |
| VPSHUFB ·rol8<>+0(SB), Y2, Y2 |
| VPADDD Y2, Y8, Y8 |
| VPXOR Y8, Y10, Y10 |
| VPSLLD $0x07, Y10, Y15 |
| VPSRLD $0x19, Y10, Y10 |
| VPXOR Y15, Y10, Y10 |
| VMOVDQA 224(BP), Y15 |
| VMOVDQA Y13, 224(BP) |
| VPADDD Y11, Y7, Y7 |
| VPXOR Y7, Y3, Y3 |
| VPSHUFB ·rol16<>+0(SB), Y3, Y3 |
| VPADDD Y3, Y15, Y15 |
| VPXOR Y15, Y11, Y11 |
| VPSLLD $0x0c, Y11, Y13 |
| VPSRLD $0x14, Y11, Y11 |
| VPXOR Y13, Y11, Y11 |
| VPADDD Y11, Y7, Y7 |
| VPXOR Y7, Y3, Y3 |
| VPSHUFB ·rol8<>+0(SB), Y3, Y3 |
| VPADDD Y3, Y15, Y15 |
| VPXOR Y15, Y11, Y11 |
| VPSLLD $0x07, Y11, Y13 |
| VPSRLD $0x19, Y11, Y11 |
| VPXOR Y13, Y11, Y11 |
| VMOVDQA 224(BP), Y13 |
| VPALIGNR $0x0c, Y14, Y14, Y14 |
| VPALIGNR $0x08, Y12, Y12, Y12 |
| VPALIGNR $0x04, Y4, Y4, Y4 |
| VPALIGNR $0x0c, Y9, Y9, Y9 |
| VPALIGNR $0x08, Y13, Y13, Y13 |
| VPALIGNR $0x04, Y1, Y1, Y1 |
| VPALIGNR $0x0c, Y10, Y10, Y10 |
| VPALIGNR $0x08, Y8, Y8, Y8 |
| VPALIGNR $0x04, Y2, Y2, Y2 |
| VPALIGNR $0x0c, Y11, Y11, Y11 |
| VPALIGNR $0x08, Y15, Y15, Y15 |
| VPALIGNR $0x04, Y3, Y3, Y3 |
| DECQ R9 |
| JNE sealAVX2IntroLoop |
| VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 |
| VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 |
| VPADDD ·chacha20Constants<>+0(SB), Y6, Y6 |
| VPADDD ·chacha20Constants<>+0(SB), Y7, Y7 |
| VPADDD 32(BP), Y14, Y14 |
| VPADDD 32(BP), Y9, Y9 |
| VPADDD 32(BP), Y10, Y10 |
| VPADDD 32(BP), Y11, Y11 |
| VPADDD 64(BP), Y12, Y12 |
| VPADDD 64(BP), Y13, Y13 |
| VPADDD 64(BP), Y8, Y8 |
| VPADDD 64(BP), Y15, Y15 |
| VPADDD 96(BP), Y4, Y4 |
| VPADDD 128(BP), Y1, Y1 |
| VPADDD 160(BP), Y2, Y2 |
| VPADDD 192(BP), Y3, Y3 |
| VPERM2I128 $0x13, Y12, Y4, Y12 |
| VPERM2I128 $0x02, Y0, Y14, Y4 |
| VPERM2I128 $0x13, Y0, Y14, Y0 |
| |
| // Clamp and store poly key |
| VPAND ·polyClampMask<>+0(SB), Y4, Y4 |
| VMOVDQA Y4, (BP) |
| |
| // Hash AD |
| MOVQ ad_len+80(FP), R9 |
| CALL polyHashADInternal<>(SB) |
| |
| // Can store at least 320 bytes |
| VPXOR (SI), Y0, Y0 |
| VPXOR 32(SI), Y12, Y12 |
| VMOVDQU Y0, (DI) |
| VMOVDQU Y12, 32(DI) |
| VPERM2I128 $0x02, Y5, Y9, Y0 |
| VPERM2I128 $0x02, Y13, Y1, Y14 |
| VPERM2I128 $0x13, Y5, Y9, Y12 |
| VPERM2I128 $0x13, Y13, Y1, Y4 |
| VPXOR 64(SI), Y0, Y0 |
| VPXOR 96(SI), Y14, Y14 |
| VPXOR 128(SI), Y12, Y12 |
| VPXOR 160(SI), Y4, Y4 |
| VMOVDQU Y0, 64(DI) |
| VMOVDQU Y14, 96(DI) |
| VMOVDQU Y12, 128(DI) |
| VMOVDQU Y4, 160(DI) |
| VPERM2I128 $0x02, Y6, Y10, Y0 |
| VPERM2I128 $0x02, Y8, Y2, Y14 |
| VPERM2I128 $0x13, Y6, Y10, Y12 |
| VPERM2I128 $0x13, Y8, Y2, Y4 |
| VPXOR 192(SI), Y0, Y0 |
| VPXOR 224(SI), Y14, Y14 |
| VPXOR 256(SI), Y12, Y12 |
| VPXOR 288(SI), Y4, Y4 |
| VMOVDQU Y0, 192(DI) |
| VMOVDQU Y14, 224(DI) |
| VMOVDQU Y12, 256(DI) |
| VMOVDQU Y4, 288(DI) |
| MOVQ $0x00000140, CX |
| SUBQ $0x00000140, BX |
| LEAQ 320(SI), SI |
| VPERM2I128 $0x02, Y7, Y11, Y0 |
| VPERM2I128 $0x02, Y15, Y3, Y14 |
| VPERM2I128 $0x13, Y7, Y11, Y12 |
| VPERM2I128 $0x13, Y15, Y3, Y4 |
| CMPQ BX, $0x80 |
| JBE sealAVX2SealHash |
| VPXOR (SI), Y0, Y0 |
| VPXOR 32(SI), Y14, Y14 |
| VPXOR 64(SI), Y12, Y12 |
| VPXOR 96(SI), Y4, Y4 |
| VMOVDQU Y0, 320(DI) |
| VMOVDQU Y14, 352(DI) |
| VMOVDQU Y12, 384(DI) |
| VMOVDQU Y4, 416(DI) |
| SUBQ $0x80, BX |
| LEAQ 128(SI), SI |
| MOVQ $0x00000008, CX |
| MOVQ $0x00000002, R9 |
| CMPQ BX, $0x80 |
| JBE sealAVX2Tail128 |
| CMPQ BX, $0x00000100 |
| JBE sealAVX2Tail256 |
| CMPQ BX, $0x00000180 |
| JBE sealAVX2Tail384 |
| CMPQ BX, $0x00000200 |
| JBE sealAVX2Tail512 |
| |
	// We have 448 bytes to hash, but the main loop hashes 512 bytes at a time - perform some rounds before entering the main loop
| VMOVDQA ·chacha20Constants<>+0(SB), Y0 |
| VMOVDQA Y0, Y5 |
| VMOVDQA Y0, Y6 |
| VMOVDQA Y0, Y7 |
| VMOVDQA 32(BP), Y14 |
| VMOVDQA Y14, Y9 |
| VMOVDQA Y14, Y10 |
| VMOVDQA Y14, Y11 |
| VMOVDQA 64(BP), Y12 |
| VMOVDQA Y12, Y13 |
| VMOVDQA Y12, Y8 |
| VMOVDQA Y12, Y15 |
| VMOVDQA 192(BP), Y4 |
| VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 |
| VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 |
| VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 |
| VPADDD ·avx2IncMask<>+0(SB), Y2, Y3 |
| VMOVDQA Y4, 96(BP) |
| VMOVDQA Y1, 128(BP) |
| VMOVDQA Y2, 160(BP) |
| VMOVDQA Y3, 192(BP) |
| VMOVDQA Y15, 224(BP) |
| VPADDD Y14, Y0, Y0 |
| VPXOR Y0, Y4, Y4 |
| VPSHUFB ·rol16<>+0(SB), Y4, Y4 |
| VPADDD Y4, Y12, Y12 |
| VPXOR Y12, Y14, Y14 |
| VPSLLD $0x0c, Y14, Y15 |
| VPSRLD $0x14, Y14, Y14 |
| VPXOR Y15, Y14, Y14 |
| VPADDD Y14, Y0, Y0 |
| VPXOR Y0, Y4, Y4 |
| VPSHUFB ·rol8<>+0(SB), Y4, Y4 |
| VPADDD Y4, Y12, Y12 |
| VPXOR Y12, Y14, Y14 |
| VPSLLD $0x07, Y14, Y15 |
| VPSRLD $0x19, Y14, Y14 |
| VPXOR Y15, Y14, Y14 |
| VPADDD Y9, Y5, Y5 |
| VPXOR Y5, Y1, Y1 |
| VPSHUFB ·rol16<>+0(SB), Y1, Y1 |
| VPADDD Y1, Y13, Y13 |
| VPXOR Y13, Y9, Y9 |
| VPSLLD $0x0c, Y9, Y15 |
| VPSRLD $0x14, Y9, Y9 |
| VPXOR Y15, Y9, Y9 |
| VPADDD Y9, Y5, Y5 |
| VPXOR Y5, Y1, Y1 |
| VPSHUFB ·rol8<>+0(SB), Y1, Y1 |
| VPADDD Y1, Y13, Y13 |
| VPXOR Y13, Y9, Y9 |
| VPSLLD $0x07, Y9, Y15 |
| VPSRLD $0x19, Y9, Y9 |
| VPXOR Y15, Y9, Y9 |
| VPADDD Y10, Y6, Y6 |
| VPXOR Y6, Y2, Y2 |
| VPSHUFB ·rol16<>+0(SB), Y2, Y2 |
| VPADDD Y2, Y8, Y8 |
| VPXOR Y8, Y10, Y10 |
| VPSLLD $0x0c, Y10, Y15 |
| VPSRLD $0x14, Y10, Y10 |
| VPXOR Y15, Y10, Y10 |
| VPADDD Y10, Y6, Y6 |
| VPXOR Y6, Y2, Y2 |
| VPSHUFB ·rol8<>+0(SB), Y2, Y2 |
| VPADDD Y2, Y8, Y8 |
| VPXOR Y8, Y10, Y10 |
| VPSLLD $0x07, Y10, Y15 |
| VPSRLD $0x19, Y10, Y10 |
| VPXOR Y15, Y10, Y10 |
| VMOVDQA 224(BP), Y15 |
| VMOVDQA Y13, 224(BP) |
| VPADDD Y11, Y7, Y7 |
| VPXOR Y7, Y3, Y3 |
| VPSHUFB ·rol16<>+0(SB), Y3, Y3 |
| VPADDD Y3, Y15, Y15 |
| VPXOR Y15, Y11, Y11 |
| VPSLLD $0x0c, Y11, Y13 |
| VPSRLD $0x14, Y11, Y11 |
| VPXOR Y13, Y11, Y11 |
| VPADDD Y11, Y7, Y7 |
| VPXOR Y7, Y3, Y3 |
| VPSHUFB ·rol8<>+0(SB), Y3, Y3 |
| VPADDD Y3, Y15, Y15 |
| VPXOR Y15, Y11, Y11 |
| VPSLLD $0x07, Y11, Y13 |
| VPSRLD $0x19, Y11, Y11 |
| VPXOR Y13, Y11, Y11 |
| VMOVDQA 224(BP), Y13 |
| VPALIGNR $0x04, Y14, Y14, Y14 |
| VPALIGNR $0x08, Y12, Y12, Y12 |
| VPALIGNR $0x0c, Y4, Y4, Y4 |
| VPALIGNR $0x04, Y9, Y9, Y9 |
| VPALIGNR $0x08, Y13, Y13, Y13 |
| VPALIGNR $0x0c, Y1, Y1, Y1 |
| VPALIGNR $0x04, Y10, Y10, Y10 |
| VPALIGNR $0x08, Y8, Y8, Y8 |
| VPALIGNR $0x0c, Y2, Y2, Y2 |
| VPALIGNR $0x04, Y11, Y11, Y11 |
| VPALIGNR $0x08, Y15, Y15, Y15 |
| VPALIGNR $0x0c, Y3, Y3, Y3 |
| VMOVDQA Y15, 224(BP) |
| VPADDD Y14, Y0, Y0 |
| VPXOR Y0, Y4, Y4 |
| VPSHUFB ·rol16<>+0(SB), Y4, Y4 |
| VPADDD Y4, Y12, Y12 |
| VPXOR Y12, Y14, Y14 |
| VPSLLD $0x0c, Y14, Y15 |
| VPSRLD $0x14, Y14, Y14 |
| VPXOR Y15, Y14, Y14 |
| VPADDD Y14, Y0, Y0 |
| VPXOR Y0, Y4, Y4 |
| VPSHUFB ·rol8<>+0(SB), Y4, Y4 |
| VPADDD Y4, Y12, Y12 |
| VPXOR Y12, Y14, Y14 |
| VPSLLD $0x07, Y14, Y15 |
| VPSRLD $0x19, Y14, Y14 |
| VPXOR Y15, Y14, Y14 |
| VPADDD Y9, Y5, Y5 |
| VPXOR Y5, Y1, Y1 |
| VPSHUFB ·rol16<>+0(SB), Y1, Y1 |
| VPADDD Y1, Y13, Y13 |
| VPXOR Y13, Y9, Y9 |
| VPSLLD $0x0c, Y9, Y15 |
| VPSRLD $0x14, Y9, Y9 |
| VPXOR Y15, Y9, Y9 |
| VPADDD Y9, Y5, Y5 |
| VPXOR Y5, Y1, Y1 |
| VPSHUFB ·rol8<>+0(SB), Y1, Y1 |
| VPADDD Y1, Y13, Y13 |
| VPXOR Y13, Y9, Y9 |
| VPSLLD $0x07, Y9, Y15 |
| VPSRLD $0x19, Y9, Y9 |
| VPXOR Y15, Y9, Y9 |
| VPADDD Y10, Y6, Y6 |
| VPXOR Y6, Y2, Y2 |
| VPSHUFB ·rol16<>+0(SB), Y2, Y2 |
| VPADDD Y2, Y8, Y8 |
| VPXOR Y8, Y10, Y10 |
| VPSLLD $0x0c, Y10, Y15 |
| VPSRLD $0x14, Y10, Y10 |
| VPXOR Y15, Y10, Y10 |
| VPADDD Y10, Y6, Y6 |
| VPXOR Y6, Y2, Y2 |
| VPSHUFB ·rol8<>+0(SB), Y2, Y2 |
| VPADDD Y2, Y8, Y8 |
| VPXOR Y8, Y10, Y10 |
| VPSLLD $0x07, Y10, Y15 |
| VPSRLD $0x19, Y10, Y10 |
| VPXOR Y15, Y10, Y10 |
| VMOVDQA 224(BP), Y15 |
| VMOVDQA Y13, 224(BP) |
| VPADDD Y11, Y7, Y7 |
| VPXOR Y7, Y3, Y3 |
| VPSHUFB ·rol16<>+0(SB), Y3, Y3 |
| VPADDD Y3, Y15, Y15 |
| VPXOR Y15, Y11, Y11 |
| VPSLLD $0x0c, Y11, Y13 |
| VPSRLD $0x14, Y11, Y11 |
| VPXOR Y13, Y11, Y11 |
| VPADDD Y11, Y7, Y7 |
| VPXOR Y7, Y3, Y3 |
| VPSHUFB ·rol8<>+0(SB), Y3, Y3 |
| VPADDD Y3, Y15, Y15 |
| VPXOR Y15, Y11, Y11 |
| VPSLLD $0x07, Y11, Y13 |
| VPSRLD $0x19, Y11, Y11 |
| VPXOR Y13, Y11, Y11 |
| VMOVDQA 224(BP), Y13 |
| VPALIGNR $0x0c, Y14, Y14, Y14 |
| VPALIGNR $0x08, Y12, Y12, Y12 |
| VPALIGNR $0x04, Y4, Y4, Y4 |
| VPALIGNR $0x0c, Y9, Y9, Y9 |
| VPALIGNR $0x08, Y13, Y13, Y13 |
| VPALIGNR $0x04, Y1, Y1, Y1 |
| VPALIGNR $0x0c, Y10, Y10, Y10 |
| VPALIGNR $0x08, Y8, Y8, Y8 |
| VPALIGNR $0x04, Y2, Y2, Y2 |
| VPALIGNR $0x0c, Y11, Y11, Y11 |
| VPALIGNR $0x08, Y15, Y15, Y15 |
| VPALIGNR $0x04, Y3, Y3, Y3 |
| VPADDD Y14, Y0, Y0 |
| VPADDD Y9, Y5, Y5 |
| VPADDD Y10, Y6, Y6 |
| VPADDD Y11, Y7, Y7 |
| VPXOR Y0, Y4, Y4 |
| VPXOR Y5, Y1, Y1 |
| VPXOR Y6, Y2, Y2 |
| VPXOR Y7, Y3, Y3 |
| VPSHUFB ·rol16<>+0(SB), Y4, Y4 |
| VPSHUFB ·rol16<>+0(SB), Y1, Y1 |
| VPSHUFB ·rol16<>+0(SB), Y2, Y2 |
| VPSHUFB ·rol16<>+0(SB), Y3, Y3 |
| VPADDD Y4, Y12, Y12 |
| VPADDD Y1, Y13, Y13 |
| VPADDD Y2, Y8, Y8 |
| VPADDD Y3, Y15, Y15 |
| VPXOR Y12, Y14, Y14 |
| VPXOR Y13, Y9, Y9 |
| VPXOR Y8, Y10, Y10 |
| VPXOR Y15, Y11, Y11 |
| VMOVDQA Y15, 224(BP) |
| VPSLLD $0x0c, Y14, Y15 |
| VPSRLD $0x14, Y14, Y14 |
| VPXOR Y15, Y14, Y14 |
| VPSLLD $0x0c, Y9, Y15 |
| VPSRLD $0x14, Y9, Y9 |
| VPXOR Y15, Y9, Y9 |
| VPSLLD $0x0c, Y10, Y15 |
| VPSRLD $0x14, Y10, Y10 |
| VPXOR Y15, Y10, Y10 |
| VPSLLD $0x0c, Y11, Y15 |
| VPSRLD $0x14, Y11, Y11 |
| VPXOR Y15, Y11, Y11 |
| VMOVDQA 224(BP), Y15 |
| SUBQ $0x10, DI |
| MOVQ $0x00000009, CX |
| JMP sealAVX2InternalLoopStart |
| |
| sealAVX2MainLoop: |
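	// The main loop encrypts 512 bytes per iteration: four ChaCha states, with each YMM
	// register holding the corresponding row of two blocks.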
| VMOVDQU ·chacha20Constants<>+0(SB), Y0 |
| VMOVDQA Y0, Y5 |
| VMOVDQA Y0, Y6 |
| VMOVDQA Y0, Y7 |
| VMOVDQA 32(BP), Y14 |
| VMOVDQA Y14, Y9 |
| VMOVDQA Y14, Y10 |
| VMOVDQA Y14, Y11 |
| VMOVDQA 64(BP), Y12 |
| VMOVDQA Y12, Y13 |
| VMOVDQA Y12, Y8 |
| VMOVDQA Y12, Y15 |
| VMOVDQA 192(BP), Y4 |
| VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 |
| VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 |
| VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 |
| VPADDD ·avx2IncMask<>+0(SB), Y2, Y3 |
| VMOVDQA Y4, 96(BP) |
| VMOVDQA Y1, 128(BP) |
| VMOVDQA Y2, 160(BP) |
| VMOVDQA Y3, 192(BP) |
| MOVQ $0x0000000a, CX |
| |
| sealAVX2InternalLoop: |
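	// One ChaCha double round per iteration, interleaved with Poly1305 hashing of 48 bytes
	// of previously written ciphertext (ten iterations hash 480 of the 512 bytes).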
| ADDQ (DI), R10 |
| ADCQ 8(DI), R11 |
| ADCQ $0x01, R12 |
| VPADDD Y14, Y0, Y0 |
| VPADDD Y9, Y5, Y5 |
| VPADDD Y10, Y6, Y6 |
| VPADDD Y11, Y7, Y7 |
| MOVQ (BP), DX |
| MOVQ DX, R15 |
| MULXQ R10, R13, R14 |
| IMULQ R12, R15 |
| MULXQ R11, AX, DX |
| ADDQ AX, R14 |
| ADCQ DX, R15 |
| VPXOR Y0, Y4, Y4 |
| VPXOR Y5, Y1, Y1 |
| VPXOR Y6, Y2, Y2 |
| VPXOR Y7, Y3, Y3 |
| VPSHUFB ·rol16<>+0(SB), Y4, Y4 |
| VPSHUFB ·rol16<>+0(SB), Y1, Y1 |
| VPSHUFB ·rol16<>+0(SB), Y2, Y2 |
| VPSHUFB ·rol16<>+0(SB), Y3, Y3 |
| MOVQ 8(BP), DX |
| MULXQ R10, R10, AX |
| ADDQ R10, R14 |
| MULXQ R11, R11, R8 |
| ADCQ R11, R15 |
| ADCQ $0x00, R8 |
| VPADDD Y4, Y12, Y12 |
| VPADDD Y1, Y13, Y13 |
| VPADDD Y2, Y8, Y8 |
| VPADDD Y3, Y15, Y15 |
| VPXOR Y12, Y14, Y14 |
| VPXOR Y13, Y9, Y9 |
| VPXOR Y8, Y10, Y10 |
| VPXOR Y15, Y11, Y11 |
| IMULQ R12, DX |
| ADDQ AX, R15 |
| ADCQ DX, R8 |
| VMOVDQA Y15, 224(BP) |
| VPSLLD $0x0c, Y14, Y15 |
| VPSRLD $0x14, Y14, Y14 |
| VPXOR Y15, Y14, Y14 |
| VPSLLD $0x0c, Y9, Y15 |
| VPSRLD $0x14, Y9, Y9 |
| VPXOR Y15, Y9, Y9 |
| VPSLLD $0x0c, Y10, Y15 |
| VPSRLD $0x14, Y10, Y10 |
| VPXOR Y15, Y10, Y10 |
| VPSLLD $0x0c, Y11, Y15 |
| VPSRLD $0x14, Y11, Y11 |
| VPXOR Y15, Y11, Y11 |
| VMOVDQA 224(BP), Y15 |
| MOVQ R13, R10 |
| MOVQ R14, R11 |
| MOVQ R15, R12 |
| ANDQ $0x03, R12 |
| MOVQ R15, R13 |
| ANDQ $-4, R13 |
| MOVQ R8, R14 |
| SHRQ $0x02, R8, R15 |
| SHRQ $0x02, R8 |
| ADDQ R13, R10 |
| ADCQ R14, R11 |
| ADCQ $0x00, R12 |
| ADDQ R15, R10 |
| ADCQ R8, R11 |
| ADCQ $0x00, R12 |
| |
| sealAVX2InternalLoopStart: |
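	// The pre-main-loop rounds above jump here with CX = 9, joining the loop mid double round.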
| VPADDD Y14, Y0, Y0 |
| VPADDD Y9, Y5, Y5 |
| VPADDD Y10, Y6, Y6 |
| VPADDD Y11, Y7, Y7 |
| VPXOR Y0, Y4, Y4 |
| VPXOR Y5, Y1, Y1 |
| VPXOR Y6, Y2, Y2 |
| VPXOR Y7, Y3, Y3 |
| VPSHUFB ·rol8<>+0(SB), Y4, Y4 |
| VPSHUFB ·rol8<>+0(SB), Y1, Y1 |
| VPSHUFB ·rol8<>+0(SB), Y2, Y2 |
| VPSHUFB ·rol8<>+0(SB), Y3, Y3 |
| ADDQ 16(DI), R10 |
| ADCQ 24(DI), R11 |
| ADCQ $0x01, R12 |
| VPADDD Y4, Y12, Y12 |
| VPADDD Y1, Y13, Y13 |
| VPADDD Y2, Y8, Y8 |
| VPADDD Y3, Y15, Y15 |
| MOVQ (BP), DX |
| MOVQ DX, R15 |
| MULXQ R10, R13, R14 |
| IMULQ R12, R15 |
| MULXQ R11, AX, DX |
| ADDQ AX, R14 |
| ADCQ DX, R15 |
| VPXOR Y12, Y14, Y14 |
| VPXOR Y13, Y9, Y9 |
| VPXOR Y8, Y10, Y10 |
| VPXOR Y15, Y11, Y11 |
| VMOVDQA Y15, 224(BP) |
| VPSLLD $0x07, Y14, Y15 |
| VPSRLD $0x19, Y14, Y14 |
| VPXOR Y15, Y14, Y14 |
| VPSLLD $0x07, Y9, Y15 |
| VPSRLD $0x19, Y9, Y9 |
| VPXOR Y15, Y9, Y9 |
| VPSLLD $0x07, Y10, Y15 |
| VPSRLD $0x19, Y10, Y10 |
| VPXOR Y15, Y10, Y10 |
| VPSLLD $0x07, Y11, Y15 |
| VPSRLD $0x19, Y11, Y11 |
| VPXOR Y15, Y11, Y11 |
| VMOVDQA 224(BP), Y15 |
| MOVQ 8(BP), DX |
| MULXQ R10, R10, AX |
| ADDQ R10, R14 |
| MULXQ R11, R11, R8 |
| ADCQ R11, R15 |
| ADCQ $0x00, R8 |
| VPALIGNR $0x04, Y14, Y14, Y14 |
| VPALIGNR $0x04, Y9, Y9, Y9 |
| VPALIGNR $0x04, Y10, Y10, Y10 |
| VPALIGNR $0x04, Y11, Y11, Y11 |
| VPALIGNR $0x08, Y12, Y12, Y12 |
| VPALIGNR $0x08, Y13, Y13, Y13 |
| VPALIGNR $0x08, Y8, Y8, Y8 |
| VPALIGNR $0x08, Y15, Y15, Y15 |
| VPALIGNR $0x0c, Y4, Y4, Y4 |
| VPALIGNR $0x0c, Y1, Y1, Y1 |
| VPALIGNR $0x0c, Y2, Y2, Y2 |
| VPALIGNR $0x0c, Y3, Y3, Y3 |
| VPADDD Y14, Y0, Y0 |
| VPADDD Y9, Y5, Y5 |
| VPADDD Y10, Y6, Y6 |
| VPADDD Y11, Y7, Y7 |
| IMULQ R12, DX |
| ADDQ AX, R15 |
| ADCQ DX, R8 |
| VPXOR Y0, Y4, Y4 |
| VPXOR Y5, Y1, Y1 |
| VPXOR Y6, Y2, Y2 |
| VPXOR Y7, Y3, Y3 |
| VPSHUFB ·rol16<>+0(SB), Y4, Y4 |
| VPSHUFB ·rol16<>+0(SB), Y1, Y1 |
| VPSHUFB ·rol16<>+0(SB), Y2, Y2 |
| VPSHUFB ·rol16<>+0(SB), Y3, Y3 |
| MOVQ R13, R10 |
| MOVQ R14, R11 |
| MOVQ R15, R12 |
| ANDQ $0x03, R12 |
| MOVQ R15, R13 |
| ANDQ $-4, R13 |
| MOVQ R8, R14 |
| SHRQ $0x02, R8, R15 |
| SHRQ $0x02, R8 |
| ADDQ R13, R10 |
| ADCQ R14, R11 |
| ADCQ $0x00, R12 |
| ADDQ R15, R10 |
| ADCQ R8, R11 |
| ADCQ $0x00, R12 |
| VPADDD Y4, Y12, Y12 |
| VPADDD Y1, Y13, Y13 |
| VPADDD Y2, Y8, Y8 |
| VPADDD Y3, Y15, Y15 |
| VPXOR Y12, Y14, Y14 |
| VPXOR Y13, Y9, Y9 |
| VPXOR Y8, Y10, Y10 |
| VPXOR Y15, Y11, Y11 |
| ADDQ 32(DI), R10 |
| ADCQ 40(DI), R11 |
| ADCQ $0x01, R12 |
| LEAQ 48(DI), DI |
| VMOVDQA Y15, 224(BP) |
| VPSLLD $0x0c, Y14, Y15 |
| VPSRLD $0x14, Y14, Y14 |
| VPXOR Y15, Y14, Y14 |
| VPSLLD $0x0c, Y9, Y15 |
| VPSRLD $0x14, Y9, Y9 |
| VPXOR Y15, Y9, Y9 |
| VPSLLD $0x0c, Y10, Y15 |
| VPSRLD $0x14, Y10, Y10 |
| VPXOR Y15, Y10, Y10 |
| VPSLLD $0x0c, Y11, Y15 |
| VPSRLD $0x14, Y11, Y11 |
| VPXOR Y15, Y11, Y11 |
| VMOVDQA 224(BP), Y15 |
| MOVQ (BP), DX |
| MOVQ DX, R15 |
| MULXQ R10, R13, R14 |
| IMULQ R12, R15 |
| MULXQ R11, AX, DX |
| ADDQ AX, R14 |
| ADCQ DX, R15 |
| VPADDD Y14, Y0, Y0 |
| VPADDD Y9, Y5, Y5 |
| VPADDD Y10, Y6, Y6 |
| VPADDD Y11, Y7, Y7 |
| VPXOR Y0, Y4, Y4 |
| VPXOR Y5, Y1, Y1 |
| VPXOR Y6, Y2, Y2 |
| VPXOR Y7, Y3, Y3 |
| MOVQ 8(BP), DX |
| MULXQ R10, R10, AX |
| ADDQ R10, R14 |
| MULXQ R11, R11, R8 |
| ADCQ R11, R15 |
| ADCQ $0x00, R8 |
| VPSHUFB ·rol8<>+0(SB), Y4, Y4 |
| VPSHUFB ·rol8<>+0(SB), Y1, Y1 |
| VPSHUFB ·rol8<>+0(SB), Y2, Y2 |
| VPSHUFB ·rol8<>+0(SB), Y3, Y3 |
| VPADDD Y4, Y12, Y12 |
| VPADDD Y1, Y13, Y13 |
| VPADDD Y2, Y8, Y8 |
| VPADDD Y3, Y15, Y15 |
| IMULQ R12, DX |
| ADDQ AX, R15 |
| ADCQ DX, R8 |
| VPXOR Y12, Y14, Y14 |
| VPXOR Y13, Y9, Y9 |
| VPXOR Y8, Y10, Y10 |
| VPXOR Y15, Y11, Y11 |
| VMOVDQA Y15, 224(BP) |
| VPSLLD $0x07, Y14, Y15 |
| VPSRLD $0x19, Y14, Y14 |
| VPXOR Y15, Y14, Y14 |
| VPSLLD $0x07, Y9, Y15 |
| VPSRLD $0x19, Y9, Y9 |
| VPXOR Y15, Y9, Y9 |
| VPSLLD $0x07, Y10, Y15 |
| VPSRLD $0x19, Y10, Y10 |
| VPXOR Y15, Y10, Y10 |
| VPSLLD $0x07, Y11, Y15 |
| VPSRLD $0x19, Y11, Y11 |
| VPXOR Y15, Y11, Y11 |
| VMOVDQA 224(BP), Y15 |
| MOVQ R13, R10 |
| MOVQ R14, R11 |
| MOVQ R15, R12 |
| ANDQ $0x03, R12 |
| MOVQ R15, R13 |
| ANDQ $-4, R13 |
| MOVQ R8, R14 |
| SHRQ $0x02, R8, R15 |
| SHRQ $0x02, R8 |
| ADDQ R13, R10 |
| ADCQ R14, R11 |
| ADCQ $0x00, R12 |
| ADDQ R15, R10 |
| ADCQ R8, R11 |
| ADCQ $0x00, R12 |
| VPALIGNR $0x0c, Y14, Y14, Y14 |
| VPALIGNR $0x0c, Y9, Y9, Y9 |
| VPALIGNR $0x0c, Y10, Y10, Y10 |
| VPALIGNR $0x0c, Y11, Y11, Y11 |
| VPALIGNR $0x08, Y12, Y12, Y12 |
| VPALIGNR $0x08, Y13, Y13, Y13 |
| VPALIGNR $0x08, Y8, Y8, Y8 |
| VPALIGNR $0x08, Y15, Y15, Y15 |
| VPALIGNR $0x04, Y4, Y4, Y4 |
| VPALIGNR $0x04, Y1, Y1, Y1 |
| VPALIGNR $0x04, Y2, Y2, Y2 |
| VPALIGNR $0x04, Y3, Y3, Y3 |
| DECQ CX |
| JNE sealAVX2InternalLoop |
| VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 |
| VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 |
| VPADDD ·chacha20Constants<>+0(SB), Y6, Y6 |
| VPADDD ·chacha20Constants<>+0(SB), Y7, Y7 |
| VPADDD 32(BP), Y14, Y14 |
| VPADDD 32(BP), Y9, Y9 |
| VPADDD 32(BP), Y10, Y10 |
| VPADDD 32(BP), Y11, Y11 |
| VPADDD 64(BP), Y12, Y12 |
| VPADDD 64(BP), Y13, Y13 |
| VPADDD 64(BP), Y8, Y8 |
| VPADDD 64(BP), Y15, Y15 |
| VPADDD 96(BP), Y4, Y4 |
| VPADDD 128(BP), Y1, Y1 |
| VPADDD 160(BP), Y2, Y2 |
| VPADDD 192(BP), Y3, Y3 |
| VMOVDQA Y15, 224(BP) |
| |
| // We only hashed 480 of the 512 bytes available - hash the remaining 32 here |
| ADDQ (DI), R10 |
| ADCQ 8(DI), R11 |
| ADCQ $0x01, R12 |
| MOVQ (BP), DX |
| MOVQ DX, R15 |
| MULXQ R10, R13, R14 |
| IMULQ R12, R15 |
| MULXQ R11, AX, DX |
| ADDQ AX, R14 |
| ADCQ DX, R15 |
| MOVQ 8(BP), DX |
| MULXQ R10, R10, AX |
| ADDQ R10, R14 |
| MULXQ R11, R11, R8 |
| ADCQ R11, R15 |
| ADCQ $0x00, R8 |
| IMULQ R12, DX |
| ADDQ AX, R15 |
| ADCQ DX, R8 |
| MOVQ R13, R10 |
| MOVQ R14, R11 |
| MOVQ R15, R12 |
| ANDQ $0x03, R12 |
| MOVQ R15, R13 |
| ANDQ $-4, R13 |
| MOVQ R8, R14 |
| SHRQ $0x02, R8, R15 |
| SHRQ $0x02, R8 |
| ADDQ R13, R10 |
| ADCQ R14, R11 |
| ADCQ $0x00, R12 |
| ADDQ R15, R10 |
| ADCQ R8, R11 |
| ADCQ $0x00, R12 |
| LEAQ 32(DI), DI |
| VPERM2I128 $0x02, Y0, Y14, Y15 |
| VPERM2I128 $0x13, Y0, Y14, Y14 |
| VPERM2I128 $0x02, Y12, Y4, Y0 |
| VPERM2I128 $0x13, Y12, Y4, Y12 |
| VPXOR (SI), Y15, Y15 |
| VPXOR 32(SI), Y0, Y0 |
| VPXOR 64(SI), Y14, Y14 |
| VPXOR 96(SI), Y12, Y12 |
| VMOVDQU Y15, (DI) |
| VMOVDQU Y0, 32(DI) |
| VMOVDQU Y14, 64(DI) |
| VMOVDQU Y12, 96(DI) |
| VPERM2I128 $0x02, Y5, Y9, Y0 |
| VPERM2I128 $0x02, Y13, Y1, Y14 |
| VPERM2I128 $0x13, Y5, Y9, Y12 |
| VPERM2I128 $0x13, Y13, Y1, Y4 |
| VPXOR 128(SI), Y0, Y0 |
| VPXOR 160(SI), Y14, Y14 |
| VPXOR 192(SI), Y12, Y12 |
| VPXOR 224(SI), Y4, Y4 |
| VMOVDQU Y0, 128(DI) |
| VMOVDQU Y14, 160(DI) |
| VMOVDQU Y12, 192(DI) |
| VMOVDQU Y4, 224(DI) |
| |
	// ... and the other 16 bytes here
| ADDQ -16(DI), R10 |
| ADCQ -8(DI), R11 |
| ADCQ $0x01, R12 |
| MOVQ (BP), DX |
| MOVQ DX, R15 |
| MULXQ R10, R13, R14 |
| IMULQ R12, R15 |
| MULXQ R11, AX, DX |
| ADDQ AX, R14 |
| ADCQ DX, R15 |
| MOVQ 8(BP), DX |
| MULXQ R10, R10, AX |
| ADDQ R10, R14 |
| MULXQ R11, R11, R8 |
| ADCQ R11, R15 |
| ADCQ $0x00, R8 |
| IMULQ R12, DX |
| ADDQ AX, R15 |
| ADCQ DX, R8 |
| MOVQ R13, R10 |
| MOVQ R14, R11 |
| MOVQ R15, R12 |
| ANDQ $0x03, R12 |
| MOVQ R15, R13 |
| ANDQ $-4, R13 |
| MOVQ R8, R14 |
| SHRQ $0x02, R8, R15 |
| SHRQ $0x02, R8 |
| ADDQ R13, R10 |
| ADCQ R14, R11 |
| ADCQ $0x00, R12 |
| ADDQ R15, R10 |
| ADCQ R8, R11 |
| ADCQ $0x00, R12 |
| VPERM2I128 $0x02, Y6, Y10, Y0 |
| VPERM2I128 $0x02, Y8, Y2, Y14 |
| VPERM2I128 $0x13, Y6, Y10, Y12 |
| VPERM2I128 $0x13, Y8, Y2, Y4 |
| VPXOR 256(SI), Y0, Y0 |
| VPXOR 288(SI), Y14, Y14 |
| VPXOR 320(SI), Y12, Y12 |
| VPXOR 352(SI), Y4, Y4 |
| VMOVDQU Y0, 256(DI) |
| VMOVDQU Y14, 288(DI) |
| VMOVDQU Y12, 320(DI) |
| VMOVDQU Y4, 352(DI) |
| VPERM2I128 $0x02, Y7, Y11, Y0 |
| VPERM2I128 $0x02, 224(BP), Y3, Y14 |
| VPERM2I128 $0x13, Y7, Y11, Y12 |
| VPERM2I128 $0x13, 224(BP), Y3, Y4 |
| VPXOR 384(SI), Y0, Y0 |
| VPXOR 416(SI), Y14, Y14 |
| VPXOR 448(SI), Y12, Y12 |
| VPXOR 480(SI), Y4, Y4 |
| VMOVDQU Y0, 384(DI) |
| VMOVDQU Y14, 416(DI) |
| VMOVDQU Y12, 448(DI) |
| VMOVDQU Y4, 480(DI) |
| LEAQ 512(SI), SI |
| SUBQ $0x00000200, BX |
| CMPQ BX, $0x00000200 |
| JG sealAVX2MainLoop |
| |
| // Tail can only hash 480 bytes |
| ADDQ (DI), R10 |
| ADCQ 8(DI), R11 |
| ADCQ $0x01, R12 |
| MOVQ (BP), DX |
| MOVQ DX, R15 |
| MULXQ R10, R13, R14 |
| IMULQ R12, R15 |
| MULXQ R11, AX, DX |
| ADDQ AX, R14 |
| ADCQ DX, R15 |
| MOVQ 8(BP), DX |
| MULXQ R10, R10, AX |
| ADDQ R10, R14 |
| MULXQ R11, R11, R8 |
| ADCQ R11, R15 |
| ADCQ $0x00, R8 |
| IMULQ R12, DX |
| ADDQ AX, R15 |
| ADCQ DX, R8 |
| MOVQ R13, R10 |
| MOVQ R14, R11 |
| MOVQ R15, R12 |
| ANDQ $0x03, R12 |
| MOVQ R15, R13 |
| ANDQ $-4, R13 |
| MOVQ R8, R14 |
| SHRQ $0x02, R8, R15 |
| SHRQ $0x02, R8 |
| ADDQ R13, R10 |
| ADCQ R14, R11 |
| ADCQ $0x00, R12 |
| ADDQ R15, R10 |
| ADCQ R8, R11 |
| ADCQ $0x00, R12 |
| ADDQ 16(DI), R10 |
| ADCQ 24(DI), R11 |
| ADCQ $0x01, R12 |
| MOVQ (BP), DX |
| MOVQ DX, R15 |
| MULXQ R10, R13, R14 |
| IMULQ R12, R15 |
| MULXQ R11, AX, DX |
| ADDQ AX, R14 |
| ADCQ DX, R15 |
| MOVQ 8(BP), DX |
| MULXQ R10, R10, AX |
| ADDQ R10, R14 |
| MULXQ R11, R11, R8 |
| ADCQ R11, R15 |
| ADCQ $0x00, R8 |
| IMULQ R12, DX |
| ADDQ AX, R15 |
| ADCQ DX, R8 |
| MOVQ R13, R10 |
| MOVQ R14, R11 |
| MOVQ R15, R12 |
| ANDQ $0x03, R12 |
| MOVQ R15, R13 |
| ANDQ $-4, R13 |
| MOVQ R8, R14 |
| SHRQ $0x02, R8, R15 |
| SHRQ $0x02, R8 |
| ADDQ R13, R10 |
| ADCQ R14, R11 |
| ADCQ $0x00, R12 |
| ADDQ R15, R10 |
| ADCQ R8, R11 |
| ADCQ $0x00, R12 |
| LEAQ 32(DI), DI |
| MOVQ $0x0000000a, CX |
| MOVQ $0x00000000, R9 |
| CMPQ BX, $0x80 |
| JBE sealAVX2Tail128 |
| CMPQ BX, $0x00000100 |
| JBE sealAVX2Tail256 |
| CMPQ BX, $0x00000180 |
| JBE sealAVX2Tail384 |
| JMP sealAVX2Tail512 |
| |
| seal192AVX2: |
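	// For buffers of at most 192 bytes: two two-block states yield the Poly1305 key plus
	// up to 192 bytes of keystream.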
| VMOVDQA Y0, Y5 |
| VMOVDQA Y14, Y9 |
| VMOVDQA Y12, Y13 |
| VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 |
| VMOVDQA Y0, Y6 |
| VMOVDQA Y14, Y10 |
| VMOVDQA Y12, Y8 |
| VMOVDQA Y4, Y2 |
| VMOVDQA Y1, Y15 |
| MOVQ $0x0000000a, R9 |
| |
| sealAVX2192InnerCipherLoop: |
| VPADDD Y14, Y0, Y0 |
| VPXOR Y0, Y4, Y4 |
| VPSHUFB ·rol16<>+0(SB), Y4, Y4 |
| VPADDD Y4, Y12, Y12 |
| VPXOR Y12, Y14, Y14 |
| VPSLLD $0x0c, Y14, Y3 |
| VPSRLD $0x14, Y14, Y14 |
| VPXOR Y3, Y14, Y14 |
| VPADDD Y14, Y0, Y0 |
| VPXOR Y0, Y4, Y4 |
| VPSHUFB ·rol8<>+0(SB), Y4, Y4 |
| VPADDD Y4, Y12, Y12 |
| VPXOR Y12, Y14, Y14 |
| VPSLLD $0x07, Y14, Y3 |
| VPSRLD $0x19, Y14, Y14 |
| VPXOR Y3, Y14, Y14 |
| VPADDD Y9, Y5, Y5 |
| VPXOR Y5, Y1, Y1 |
| VPSHUFB ·rol16<>+0(SB), Y1, Y1 |
| VPADDD Y1, Y13, Y13 |
| VPXOR Y13, Y9, Y9 |
| VPSLLD $0x0c, Y9, Y3 |
| VPSRLD $0x14, Y9, Y9 |
| VPXOR Y3, Y9, Y9 |
| VPADDD Y9, Y5, Y5 |
| VPXOR Y5, Y1, Y1 |
| VPSHUFB ·rol8<>+0(SB), Y1, Y1 |
| VPADDD Y1, Y13, Y13 |
| VPXOR Y13, Y9, Y9 |
| VPSLLD $0x07, Y9, Y3 |
| VPSRLD $0x19, Y9, Y9 |
| VPXOR Y3, Y9, Y9 |
| VPALIGNR $0x04, Y14, Y14, Y14 |
| VPALIGNR $0x04, Y9, Y9, Y9 |
| VPALIGNR $0x08, Y12, Y12, Y12 |
| VPALIGNR $0x08, Y13, Y13, Y13 |
| VPALIGNR $0x0c, Y4, Y4, Y4 |
| VPALIGNR $0x0c, Y1, Y1, Y1 |
| VPADDD Y14, Y0, Y0 |
| VPXOR Y0, Y4, Y4 |
| VPSHUFB ·rol16<>+0(SB), Y4, Y4 |
| VPADDD Y4, Y12, Y12 |
| VPXOR Y12, Y14, Y14 |
| VPSLLD $0x0c, Y14, Y3 |
| VPSRLD $0x14, Y14, Y14 |
| VPXOR Y3, Y14, Y14 |
| VPADDD Y14, Y0, Y0 |
| VPXOR Y0, Y4, Y4 |
| VPSHUFB ·rol8<>+0(SB), Y4, Y4 |
| VPADDD Y4, Y12, Y12 |
| VPXOR Y12, Y14, Y14 |
| VPSLLD $0x07, Y14, Y3 |
| VPSRLD $0x19, Y14, Y14 |
| VPXOR Y3, Y14, Y14 |
| VPADDD Y9, Y5, Y5 |
| VPXOR Y5, Y1, Y1 |
| VPSHUFB ·rol16<>+0(SB), Y1, Y1 |
| VPADDD Y1, Y13, Y13 |
| VPXOR Y13, Y9, Y9 |
| VPSLLD $0x0c, Y9, Y3 |
| VPSRLD $0x14, Y9, Y9 |
| VPXOR Y3, Y9, Y9 |
| VPADDD Y9, Y5, Y5 |
| VPXOR Y5, Y1, Y1 |
| VPSHUFB ·rol8<>+0(SB), Y1, Y1 |
| VPADDD Y1, Y13, Y13 |
| VPXOR Y13, Y9, Y9 |
| VPSLLD $0x07, Y9, Y3 |
| VPSRLD $0x19, Y9, Y9 |
| VPXOR Y3, Y9, Y9 |
| VPALIGNR $0x0c, Y14, Y14, Y14 |
| VPALIGNR $0x0c, Y9, Y9, Y9 |
| VPALIGNR $0x08, Y12, Y12, Y12 |
| VPALIGNR $0x08, Y13, Y13, Y13 |
| VPALIGNR $0x04, Y4, Y4, Y4 |
| VPALIGNR $0x04, Y1, Y1, Y1 |
| DECQ R9 |
| JNE sealAVX2192InnerCipherLoop |
| VPADDD Y6, Y0, Y0 |
| VPADDD Y6, Y5, Y5 |
| VPADDD Y10, Y14, Y14 |
| VPADDD Y10, Y9, Y9 |
| VPADDD Y8, Y12, Y12 |
| VPADDD Y8, Y13, Y13 |
| VPADDD Y2, Y4, Y4 |
| VPADDD Y15, Y1, Y1 |
| VPERM2I128 $0x02, Y0, Y14, Y3 |
| |
| // Clamp and store poly key |
| VPAND ·polyClampMask<>+0(SB), Y3, Y3 |
| VMOVDQA Y3, (BP) |
| |
| // Stream for up to 192 bytes |
| VPERM2I128 $0x13, Y0, Y14, Y0 |
| VPERM2I128 $0x13, Y12, Y4, Y14 |
| VPERM2I128 $0x02, Y5, Y9, Y12 |
| VPERM2I128 $0x02, Y13, Y1, Y4 |
| VPERM2I128 $0x13, Y5, Y9, Y5 |
| VPERM2I128 $0x13, Y13, Y1, Y9 |
| |
| sealAVX2ShortSeal: |
	// Hash AD
| MOVQ ad_len+80(FP), R9 |
| CALL polyHashADInternal<>(SB) |
| XORQ CX, CX |
| |
| sealAVX2SealHash: |
	// CX holds the number of bytes encrypted but not yet hashed
| CMPQ CX, $0x10 |
| JB sealAVX2ShortSealLoop |
| ADDQ (DI), R10 |
| ADCQ 8(DI), R11 |
| ADCQ $0x01, R12 |
| MOVQ (BP), AX |
| MOVQ AX, R15 |
| MULQ R10 |
| MOVQ AX, R13 |
| MOVQ DX, R14 |
| MOVQ (BP), AX |
| MULQ R11 |
| IMULQ R12, R15 |
| ADDQ AX, R14 |
| ADCQ DX, R15 |
| MOVQ 8(BP), AX |
| MOVQ AX, R8 |
| MULQ R10 |
| ADDQ AX, R14 |
| ADCQ $0x00, DX |
| MOVQ DX, R10 |
| MOVQ 8(BP), AX |
| MULQ R11 |
| ADDQ AX, R15 |
| ADCQ $0x00, DX |
| IMULQ R12, R8 |
| ADDQ R10, R15 |
| ADCQ DX, R8 |
| MOVQ R13, R10 |
| MOVQ R14, R11 |
| MOVQ R15, R12 |
| ANDQ $0x03, R12 |
| MOVQ R15, R13 |
| ANDQ $-4, R13 |
| MOVQ R8, R14 |
| SHRQ $0x02, R8, R15 |
| SHRQ $0x02, R8 |
| ADDQ R13, R10 |
| ADCQ R14, R11 |
| ADCQ $0x00, R12 |
| ADDQ R15, R10 |
| ADCQ R8, R11 |
| ADCQ $0x00, R12 |
| SUBQ $0x10, CX |
| ADDQ $0x10, DI |
| JMP sealAVX2SealHash |
| |
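| // While at least 32 bytes of plaintext remain, XOR them with the next 32 bytes of keystream in Y0, write the ciphertext, and absorb it into Poly1305 as two 16-byte blocks. |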
| sealAVX2ShortSealLoop: |
| CMPQ BX, $0x20 |
| JB sealAVX2ShortTail32 |
| SUBQ $0x20, BX |
| |
| // Encrypt 32 bytes: XOR the plaintext with the keystream in Y0 and store the ciphertext |
| VPXOR (SI), Y0, Y0 |
| VMOVDQU Y0, (DI) |
| LEAQ 32(SI), SI |
| |
| // Now hash the 32 bytes of ciphertext just written as two Poly1305 blocks |
| ADDQ (DI), R10 |
| ADCQ 8(DI), R11 |
| ADCQ $0x01, R12 |
| MOVQ (BP), DX |
| MOVQ DX, R15 |
| MULXQ R10, R13, R14 |
| IMULQ R12, R15 |
| MULXQ R11, AX, DX |
| ADDQ AX, R14 |
| ADCQ DX, R15 |
| MOVQ 8(BP), DX |
| MULXQ R10, R10, AX |
| ADDQ R10, R14 |
| MULXQ R11, R11, R8 |
| ADCQ R11, R15 |
| ADCQ $0x00, R8 |
| IMULQ R12, DX |
| ADDQ AX, R15 |
| ADCQ DX, R8 |
| MOVQ R13, R10 |
| MOVQ R14, R11 |
| MOVQ R15, R12 |
| ANDQ $0x03, R12 |
| MOVQ R15, R13 |
| ANDQ $-4, R13 |
| MOVQ R8, R14 |
| SHRQ $0x02, R8, R15 |
| SHRQ $0x02, R8 |
| ADDQ R13, R10 |
| ADCQ R14, R11 |
| ADCQ $0x00, R12 |
| ADDQ R15, R10 |
| ADCQ R8, R11 |
| ADCQ $0x00, R12 |
| ADDQ 16(DI), R10 |
| ADCQ 24(DI), R11 |
| ADCQ $0x01, R12 |
| MOVQ (BP), DX |
| MOVQ DX, R15 |
| MULXQ R10, R13, R14 |
| IMULQ R12, R15 |
| MULXQ R11, AX, DX |
| ADDQ AX, R14 |
| ADCQ DX, R15 |
| MOVQ 8(BP), DX |
| MULXQ R10, R10, AX |
| ADDQ R10, R14 |
| MULXQ R11, R11, R8 |
| ADCQ R11, R15 |
| ADCQ $0x00, R8 |
| IMULQ R12, DX |
| ADDQ AX, R15 |
| ADCQ DX, R8 |
| MOVQ R13, R10 |
| MOVQ R14, R11 |
| MOVQ R15, R12 |
| ANDQ $0x03, R12 |
| MOVQ R15, R13 |
| ANDQ $-4, R13 |
| MOVQ R8, R14 |
| SHRQ $0x02, R8, R15 |
| SHRQ $0x02, R8 |
| ADDQ R13, R10 |
| ADCQ R14, R11 |
| ADCQ $0x00, R12 |
| ADDQ R15, R10 |
| ADCQ R8, R11 |
| ADCQ $0x00, R12 |
| LEAQ 32(DI), DI |
| |
| // Shift the keystream register queue down by 32 bytes so Y0 holds the next chunk |
| VMOVDQA Y14, Y0 |
| VMOVDQA Y12, Y14 |
| VMOVDQA Y4, Y12 |
| VMOVDQA Y5, Y4 |
| VMOVDQA Y9, Y5 |
| VMOVDQA Y13, Y9 |
| VMOVDQA Y1, Y13 |
| VMOVDQA Y6, Y1 |
| VMOVDQA Y10, Y6 |
| JMP sealAVX2ShortSealLoop |
| |
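| // Fewer than 32 bytes remain. If at least 16, encrypt them with the low 128 bits of the keystream and hash the result; the next 16 bytes of keystream are left in X1 for the SSE tail below. |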
| sealAVX2ShortTail32: |
| CMPQ BX, $0x10 |
| VMOVDQA X0, X1 |
| JB sealAVX2ShortDone |
| SUBQ $0x10, BX |
| |
| // Encrypt 16 bytes: XOR the plaintext with the low 128 bits of the keystream and store the ciphertext |
| VPXOR (SI), X0, X12 |
| VMOVDQU X12, (DI) |
| LEAQ 16(SI), SI |
| |
| // Hash the 16 bytes of ciphertext just written |
| ADDQ (DI), R10 |
| ADCQ 8(DI), R11 |
| ADCQ $0x01, R12 |
| MOVQ (BP), DX |
| MOVQ DX, R15 |
| MULXQ R10, R13, R14 |
| IMULQ R12, R15 |
| MULXQ R11, AX, DX |
| ADDQ AX, R14 |
| ADCQ DX, R15 |
| MOVQ 8(BP), DX |
| MULXQ R10, R10, AX |
| ADDQ R10, R14 |
| MULXQ R11, R11, R8 |
| ADCQ R11, R15 |
| ADCQ $0x00, R8 |
| IMULQ R12, DX |
| ADDQ AX, R15 |
| ADCQ DX, R8 |
| MOVQ R13, R10 |
| MOVQ R14, R11 |
| MOVQ R15, R12 |
| ANDQ $0x03, R12 |
| MOVQ R15, R13 |
| ANDQ $-4, R13 |
| MOVQ R8, R14 |
| SHRQ $0x02, R8, R15 |
| SHRQ $0x02, R8 |
| ADDQ R13, R10 |
| ADCQ R14, R11 |
| ADCQ $0x00, R12 |
| ADDQ R15, R10 |
| ADCQ R8, R11 |
| ADCQ $0x00, R12 |
| LEAQ 16(DI), DI |
| VPERM2I128 $0x11, Y0, Y0, Y0 |
| VMOVDQA X0, X1 |
| |
| sealAVX2ShortDone: |
| VZEROUPPER |
| JMP sealSSETail |
| |
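| // seal320AVX2: short-message path using three interleaved two-block ChaCha20 states; one block supplies the Poly1305 key and up to 320 bytes of keystream cover the message. |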
| seal320AVX2: |
| VMOVDQA Y0, Y5 |
| VMOVDQA Y14, Y9 |
| VMOVDQA Y12, Y13 |
| VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 |
| VMOVDQA Y0, Y6 |
| VMOVDQA Y14, Y10 |
| VMOVDQA Y12, Y8 |
| VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 |
| VMOVDQA Y14, Y7 |
| VMOVDQA Y12, Y11 |
| VMOVDQA Y4, Y15 |
| MOVQ $0x0000000a, R9 |
| |
| sealAVX2320InnerCipherLoop: |
| VPADDD Y14, Y0, Y0 |
| VPXOR Y0, Y4, Y4 |
| VPSHUFB ·rol16<>+0(SB), Y4, Y4 |
| VPADDD Y4, Y12, Y12 |
| VPXOR Y12, Y14, Y14 |
| VPSLLD $0x0c, Y14, Y3 |
| VPSRLD $0x14, Y14, Y14 |
| VPXOR Y3, Y14, Y14 |
| VPADDD Y14, Y0, Y0 |
| VPXOR Y0, Y4, Y4 |
| VPSHUFB ·rol8<>+0(SB), Y4, Y4 |
| VPADDD Y4, Y12, Y12 |
| VPXOR Y12, Y14, Y14 |
| VPSLLD $0x07, Y14, Y3 |
| VPSRLD $0x19, Y14, Y14 |
| VPXOR Y3, Y14, Y14 |
| VPADDD Y9, Y5, Y5 |
| VPXOR Y5, Y1, Y1 |
| VPSHUFB ·rol16<>+0(SB), Y1, Y1 |
| VPADDD Y1, Y13, Y13 |
| VPXOR Y13, Y9, Y9 |
| VPSLLD $0x0c, Y9, Y3 |
| VPSRLD $0x14, Y9, Y9 |
| VPXOR Y3, Y9, Y9 |
| VPADDD Y9, Y5, Y5 |
| VPXOR Y5, Y1, Y1 |
| VPSHUFB ·rol8<>+0(SB), Y1, Y1 |
| VPADDD Y1, Y13, Y13 |
| VPXOR Y13, Y9, Y9 |
| VPSLLD $0x07, Y9, Y3 |
| VPSRLD $0x19, Y9, Y9 |
| VPXOR Y3, Y9, Y9 |
| VPADDD Y10, Y6, Y6 |
| VPXOR Y6, Y2, Y2 |
| VPSHUFB ·rol16<>+0(SB), Y2, Y2 |
| VPADDD Y2, Y8, Y8 |
| VPXOR Y8, Y10, Y10 |
| VPSLLD $0x0c, Y10, Y3 |
| VPSRLD $0x14, Y10, Y10 |
| VPXOR Y3, Y10, Y10 |
| VPADDD Y10, Y6, Y6 |
| VPXOR Y6, Y2, Y2 |
| VPSHUFB ·rol8<>+0(SB), Y2, Y2 |
| VPADDD Y2, Y8, Y8 |
| VPXOR Y8, Y10, Y10 |
| VPSLLD $0x07, Y10, Y3 |
| VPSRLD $0x19, Y10, Y10 |
| VPXOR Y3, Y10, Y10 |
| VPALIGNR $0x04, Y14, Y14, Y14 |
| VPALIGNR $0x04, Y9, Y9, Y9 |
| VPALIGNR $0x04, Y10, Y10, Y10 |
| VPALIGNR $0x08, Y12, Y12, Y12 |
| VPALIGNR $0x08, Y13, Y13, Y13 |
| VPALIGNR $0x08, Y8, Y8, Y8 |
| VPALIGNR $0x0c, Y4, Y4, Y4 |
| VPALIGNR $0x0c, Y1, Y1, Y1 |
| VPALIGNR $0x0c, Y2, Y2, Y2 |
| VPADDD Y14, Y0, Y0 |
| VPXOR Y0, Y4, Y4 |
| VPSHUFB ·rol16<>+0(SB), Y4, Y4 |
| VPADDD Y4, Y12, Y12 |
| VPXOR Y12, Y14, Y14 |
| VPSLLD $0x0c, Y14, Y3 |
| VPSRLD $0x14, Y14, Y14 |
| VPXOR Y3, Y14, Y14 |
| VPADDD Y14, Y0, Y0 |
| VPXOR Y0, Y4, Y4 |
| VPSHUFB ·rol8<>+0(SB), Y4, Y4 |
| VPADDD Y4, Y12, Y12 |
| VPXOR Y12, Y14, Y14 |
| VPSLLD $0x07, Y14, Y3 |
| VPSRLD $0x19, Y14, Y14 |
| VPXOR Y3, Y14, Y14 |
| VPADDD Y9, Y5, Y5 |
| VPXOR Y5, Y1, Y1 |
| VPSHUFB ·rol16<>+0(SB), Y1, Y1 |
| VPADDD Y1, Y13, Y13 |
| VPXOR Y13, Y9, Y9 |
| VPSLLD $0x0c, Y9, Y3 |
| VPSRLD $0x14, Y9, Y9 |
| VPXOR Y3, Y9, Y9 |
| VPADDD Y9, Y5, Y5 |
| VPXOR Y5, Y1, Y1 |
| VPSHUFB ·rol8<>+0(SB), Y1, Y1 |
| VPADDD Y1, Y13, Y13 |
| VPXOR Y13, Y9, Y9 |
| VPSLLD $0x07, Y9, Y3 |
| VPSRLD $0x19, Y9, Y9 |
| VPXOR Y3, Y9, Y9 |
| VPADDD Y10, Y6, Y6 |
| VPXOR Y6, Y2, Y2 |
| VPSHUFB ·rol16<>+0(SB), Y2, Y2 |
| VPADDD Y2, Y8, Y8 |
| VPXOR Y8, Y10, Y10 |
| VPSLLD $0x0c, Y10, Y3 |
| VPSRLD $0x14, Y10, Y10 |
| VPXOR Y3, Y10, Y10 |
| VPADDD Y10, Y6, Y6 |
| VPXOR Y6, Y2, Y2 |
| VPSHUFB ·rol8<>+0(SB), Y2, Y2 |
| VPADDD Y2, Y8, Y8 |
| VPXOR Y8, Y10, Y10 |
| VPSLLD $0x07, Y10, Y3 |
| VPSRLD $0x19, Y10, Y10 |
| VPXOR Y3, Y10, Y10 |
| VPALIGNR $0x0c, Y14, Y14, Y14 |
| VPALIGNR $0x0c, Y9, Y9, Y9 |
| VPALIGNR $0x0c, Y10, Y10, Y10 |
| VPALIGNR $0x08, Y12, Y12, Y12 |
| VPALIGNR $0x08, Y13, Y13, Y13 |
| VPALIGNR $0x08, Y8, Y8, Y8 |
| VPALIGNR $0x04, Y4, Y4, Y4 |
| VPALIGNR $0x04, Y1, Y1, Y1 |
| VPALIGNR $0x04, Y2, Y2, Y2 |
| DECQ R9 |
| JNE sealAVX2320InnerCipherLoop |
| VMOVDQA ·chacha20Constants<>+0(SB), Y3 |
| VPADDD Y3, Y0, Y0 |
| VPADDD Y3, Y5, Y5 |
| VPADDD Y3, Y6, Y6 |
| VPADDD Y7, Y14, Y14 |
| VPADDD Y7, Y9, Y9 |
| VPADDD Y7, Y10, Y10 |
| VPADDD Y11, Y12, Y12 |
| VPADDD Y11, Y13, Y13 |
| VPADDD Y11, Y8, Y8 |
| VMOVDQA ·avx2IncMask<>+0(SB), Y3 |
| VPADDD Y15, Y4, Y4 |
| VPADDD Y3, Y15, Y15 |
| VPADDD Y15, Y1, Y1 |
| VPADDD Y3, Y15, Y15 |
| VPADDD Y15, Y2, Y2 |
| |
| // Clamp and store poly key |
| VPERM2I128 $0x02, Y0, Y14, Y3 |
| VPAND ·polyClampMask<>+0(SB), Y3, Y3 |
| VMOVDQA Y3, (BP) |
| |
| // Stream for up to 320 bytes |
| VPERM2I128 $0x13, Y0, Y14, Y0 |
| VPERM2I128 $0x13, Y12, Y4, Y14 |
| VPERM2I128 $0x02, Y5, Y9, Y12 |
| VPERM2I128 $0x02, Y13, Y1, Y4 |
| VPERM2I128 $0x13, Y5, Y9, Y5 |
| VPERM2I128 $0x13, Y13, Y1, Y9 |
| VPERM2I128 $0x02, Y6, Y10, Y13 |
| VPERM2I128 $0x02, Y8, Y2, Y1 |
| VPERM2I128 $0x13, Y6, Y10, Y6 |
| VPERM2I128 $0x13, Y8, Y2, Y10 |
| JMP sealAVX2ShortSeal |
| |
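| // sealAVX2Tail128: seal the last 128 bytes or fewer. Regenerate one two-block state for the tail keystream while interleaving Poly1305 over ciphertext that was written earlier but not yet hashed, then finish in the 32-byte short-seal loop. |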
| sealAVX2Tail128: |
| VMOVDQA ·chacha20Constants<>+0(SB), Y0 |
| VMOVDQA 32(BP), Y14 |
| VMOVDQA 64(BP), Y12 |
| VMOVDQA 192(BP), Y4 |
| VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 |
| VMOVDQA Y4, Y1 |
| |
| sealAVX2Tail128LoopA: |
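| // Absorb one 16-byte block of pending ciphertext, then multiply the accumulator (R10:R11:R12) by r at 0(BP)/8(BP) and reduce modulo 2^130 - 5. |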
| ADDQ (DI), R10 |
| ADCQ 8(DI), R11 |
| ADCQ $0x01, R12 |
| MOVQ (BP), AX |
| MOVQ AX, R15 |
| MULQ R10 |
| MOVQ AX, R13 |
| MOVQ DX, R14 |
| MOVQ (BP), AX |
| MULQ R11 |
| IMULQ R12, R15 |
| ADDQ AX, R14 |
| ADCQ DX, R15 |
| MOVQ 8(BP), AX |
| MOVQ AX, R8 |
| MULQ R10 |
| ADDQ AX, R14 |
| ADCQ $0x00, DX |
| MOVQ DX, R10 |
| MOVQ 8(BP), AX |
| MULQ R11 |
| ADDQ AX, R15 |
| ADCQ $0x00, DX |
| IMULQ R12, R8 |
| ADDQ R10, R15 |
| ADCQ DX, R8 |
| MOVQ R13, R10 |
| MOVQ R14, R11 |
| MOVQ R15, R12 |
| ANDQ $0x03, R12 |
| MOVQ R15, R13 |
| ANDQ $-4, R13 |
| MOVQ R8, R14 |
| SHRQ $0x02, R8, R15 |
| SHRQ $0x02, R8 |
| ADDQ R13, R10 |
| ADCQ R14, R11 |
| ADCQ $0x00, R12 |
| ADDQ R15, R10 |
| ADCQ R8, R11 |
| ADCQ $0x00, R12 |
| LEAQ 16(DI), DI |
| |
| sealAVX2Tail128LoopB: |
| VPADDD Y14, Y0, Y0 |
| VPXOR Y0, Y4, Y4 |
| VPSHUFB ·rol16<>+0(SB), Y4, Y4 |
| VPADDD Y4, Y12, Y12 |
| VPXOR Y12, Y14, Y14 |
| VPSLLD $0x0c, Y14, Y3 |
| VPSRLD $0x14, Y14, Y14 |
| VPXOR Y3, Y14, Y14 |
| VPADDD Y14, Y0, Y0 |
| VPXOR Y0, Y4, Y4 |
| VPSHUFB ·rol8<>+0(SB), Y4, Y4 |
| VPADDD Y4, Y12, Y12 |
| VPXOR Y12, Y14, Y14 |
| VPSLLD $0x07, Y14, Y3 |
| VPSRLD $0x19, Y14, Y14 |
| VPXOR Y3, Y14, Y14 |
| ADDQ (DI), R10 |
| ADCQ 8(DI), R11 |
| ADCQ $0x01, R12 |
| MOVQ (BP), AX |
| MOVQ AX, R15 |
| MULQ R10 |
| MOVQ AX, R13 |
| MOVQ DX, R14 |
| MOVQ (BP), AX |
| MULQ R11 |
| IMULQ R12, R15 |
| ADDQ AX, R14 |
| ADCQ DX, R15 |
| MOVQ 8(BP), AX |
| MOVQ AX, R8 |
| MULQ R10 |
| ADDQ AX, R14 |
| ADCQ $0x00, DX |
| MOVQ DX, R10 |
| MOVQ 8(BP), AX |
| MULQ R11 |
| ADDQ AX, R15 |
| ADCQ $0x00, DX |
| IMULQ R12, R8 |
| ADDQ R10, R15 |
| ADCQ DX, R8 |
| MOVQ R13, R10 |
| MOVQ R14, R11 |
| MOVQ R15, R12 |
| ANDQ $0x03, R12 |
| MOVQ R15, R13 |
| ANDQ $-4, R13 |
| MOVQ R8, R14 |
| SHRQ $0x02, R8, R15 |
| SHRQ $0x02, R8 |
| ADDQ R13, R10 |
| ADCQ R14, R11 |
| ADCQ $0x00, R12 |
| ADDQ R15, R10 |
| ADCQ R8, R11 |
| ADCQ $0x00, R12 |
| VPALIGNR $0x04, Y14, Y14, Y14 |
| VPALIGNR $0x08, Y12, Y12, Y12 |
| VPALIGNR $0x0c, Y4, Y4, Y4 |
| VPADDD Y14, Y0, Y0 |
| VPXOR Y0, Y4, Y4 |
| VPSHUFB ·rol16<>+0(SB), Y4, Y4 |
| VPADDD Y4, Y12, Y12 |
| VPXOR Y12, Y14, Y14 |
| VPSLLD $0x0c, Y14, Y3 |
| VPSRLD $0x14, Y14, Y14 |
| VPXOR Y3, Y14, Y14 |
| VPADDD Y14, Y0, Y0 |
| VPXOR Y0, Y4, Y4 |
| VPSHUFB ·rol8<>+0(SB), Y4, Y4 |
| VPADDD Y4, Y12, Y12 |
| VPXOR Y12, Y14, Y14 |
| VPSLLD $0x07, Y14, Y3 |
| VPSRLD $0x19, Y14, Y14 |
| VPXOR Y3, Y14, Y14 |
| ADDQ 16(DI), R10 |
| ADCQ 24(DI), R11 |
| ADCQ $0x01, R12 |
| MOVQ (BP), AX |
| MOVQ AX, R15 |
| MULQ R10 |
| MOVQ AX, R13 |
| MOVQ DX, R14 |
| MOVQ (BP), AX |
| MULQ R11 |
| IMULQ R12, R15 |
| ADDQ AX, R14 |
| ADCQ DX, R15 |
| MOVQ 8(BP), AX |
| MOVQ AX, R8 |
| MULQ R10 |
| ADDQ AX, R14 |
| ADCQ $0x00, DX |
| MOVQ DX, R10 |
| MOVQ 8(BP), AX |
| MULQ R11 |
| ADDQ AX, R15 |
| ADCQ $0x00, DX |
| IMULQ R12, R8 |
| ADDQ R10, R15 |
| ADCQ DX, R8 |
| MOVQ R13, R10 |
| MOVQ R14, R11 |
| MOVQ R15, R12 |
| ANDQ $0x03, R12 |
| MOVQ R15, R13 |
| ANDQ $-4, R13 |
| MOVQ R8, R14 |
| SHRQ $0x02, R8, R15 |
| SHRQ $0x02, R8 |
| ADDQ R13, R10 |
| ADCQ R14, R11 |
| ADCQ $0x00, R12 |
| ADDQ R15, R10 |
| ADCQ R8, R11 |
| ADCQ $0x00, R12 |
| LEAQ 32(DI), DI |
| VPALIGNR $0x0c, Y14, Y14, Y14 |
| VPALIGNR $0x08, Y12, Y12, Y12 |
| VPALIGNR $0x04, Y4, Y4, Y4 |
| DECQ CX |
| JG sealAVX2Tail128LoopA |
| DECQ R9 |
| JGE sealAVX2Tail128LoopB |
| VPADDD ·chacha20Constants<>+0(SB), Y0, Y5 |
| VPADDD 32(BP), Y14, Y9 |
| VPADDD 64(BP), Y12, Y13 |
| VPADDD Y1, Y4, Y1 |
| VPERM2I128 $0x02, Y5, Y9, Y0 |
| VPERM2I128 $0x02, Y13, Y1, Y14 |
| VPERM2I128 $0x13, Y5, Y9, Y12 |
| VPERM2I128 $0x13, Y13, Y1, Y4 |
| JMP sealAVX2ShortSealLoop |
| |
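| // sealAVX2Tail256: seal the last 256 bytes or fewer with two two-block states, hashing pending ciphertext between rounds. The first 128 bytes are encrypted and written here; the rest goes through sealAVX2SealHash (CX = 128 bytes of ciphertext still to hash) and the short-seal loop. |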
| sealAVX2Tail256: |
| VMOVDQA ·chacha20Constants<>+0(SB), Y0 |
| VMOVDQA ·chacha20Constants<>+0(SB), Y5 |
| VMOVDQA 32(BP), Y14 |
| VMOVDQA 32(BP), Y9 |
| VMOVDQA 64(BP), Y12 |
| VMOVDQA 64(BP), Y13 |
| VMOVDQA 192(BP), Y4 |
| VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 |
| VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 |
| VMOVDQA Y4, Y7 |
| VMOVDQA Y1, Y11 |
| |
| sealAVX2Tail256LoopA: |
| ADDQ (DI), R10 |
| ADCQ 8(DI), R11 |
| ADCQ $0x01, R12 |
| MOVQ (BP), AX |
| MOVQ AX, R15 |
| MULQ R10 |
| MOVQ AX, R13 |
| MOVQ DX, R14 |
| MOVQ (BP), AX |
| MULQ R11 |
| IMULQ R12, R15 |
| ADDQ AX, R14 |
| ADCQ DX, R15 |
| MOVQ 8(BP), AX |
| MOVQ AX, R8 |
| MULQ R10 |
| ADDQ AX, R14 |
| ADCQ $0x00, DX |
| MOVQ DX, R10 |
| MOVQ 8(BP), AX |
| MULQ R11 |
| ADDQ AX, R15 |
| ADCQ $0x00, DX |
| IMULQ R12, R8 |
| ADDQ R10, R15 |
| ADCQ DX, R8 |
| MOVQ R13, R10 |
| MOVQ R14, R11 |
| MOVQ R15, R12 |
| ANDQ $0x03, R12 |
| MOVQ R15, R13 |
| ANDQ $-4, R13 |
| MOVQ R8, R14 |
| SHRQ $0x02, R8, R15 |
| SHRQ $0x02, R8 |
| ADDQ R13, R10 |
| ADCQ R14, R11 |
| ADCQ $0x00, R12 |
| ADDQ R15, R10 |
| ADCQ R8, R11 |
| ADCQ $0x00, R12 |
| LEAQ 16(DI), DI |
| |
| sealAVX2Tail256LoopB: |
| VPADDD Y14, Y0, Y0 |
| VPXOR Y0, Y4, Y4 |
| VPSHUFB ·rol16<>+0(SB), Y4, Y4 |
| VPADDD Y4, Y12, Y12 |
| VPXOR Y12, Y14, Y14 |
| VPSLLD $0x0c, Y14, Y3 |
| VPSRLD $0x14, Y14, Y14 |
| VPXOR Y3, Y14, Y14 |
| VPADDD Y14, Y0, Y0 |
| VPXOR Y0, Y4, Y4 |
| VPSHUFB ·rol8<>+0(SB), Y4, Y4 |
| VPADDD Y4, Y12, Y12 |
| VPXOR Y12, Y14, Y14 |
| VPSLLD $0x07, Y14, Y3 |
| VPSRLD $0x19, Y14, Y14 |
| VPXOR Y3, Y14, Y14 |
| VPADDD Y9, Y5, Y5 |
| VPXOR Y5, Y1, Y1 |
| VPSHUFB ·rol16<>+0(SB), Y1, Y1 |
| VPADDD Y1, Y13, Y13 |
| VPXOR Y13, Y9, Y9 |
| VPSLLD $0x0c, Y9, Y3 |
| VPSRLD $0x14, Y9, Y9 |
| VPXOR Y3, Y9, Y9 |
| VPADDD Y9, Y5, Y5 |
| VPXOR Y5, Y1, Y1 |
| VPSHUFB ·rol8<>+0(SB), Y1, Y1 |
| VPADDD Y1, Y13, Y13 |
| VPXOR Y13, Y9, Y9 |
| VPSLLD $0x07, Y9, Y3 |
| VPSRLD $0x19, Y9, Y9 |
| VPXOR Y3, Y9, Y9 |
| ADDQ (DI), R10 |
| ADCQ 8(DI), R11 |
| ADCQ $0x01, R12 |
| MOVQ (BP), AX |
| MOVQ AX, R15 |
| MULQ R10 |
| MOVQ AX, R13 |
| MOVQ DX, R14 |
| MOVQ (BP), AX |
| MULQ R11 |
| IMULQ R12, R15 |
| ADDQ AX, R14 |
| ADCQ DX, R15 |
| MOVQ 8(BP), AX |
| MOVQ AX, R8 |
| MULQ R10 |
| ADDQ AX, R14 |
| ADCQ $0x00, DX |
| MOVQ DX, R10 |
| MOVQ 8(BP), AX |
| MULQ R11 |
| ADDQ AX, R15 |
| ADCQ $0x00, DX |
| IMULQ R12, R8 |
| ADDQ R10, R15 |
| ADCQ DX, R8 |
| MOVQ R13, R10 |
| MOVQ R14, R11 |
| MOVQ R15, R12 |
| ANDQ $0x03, R12 |
| MOVQ R15, R13 |
| ANDQ $-4, R13 |
| MOVQ R8, R14 |
| SHRQ $0x02, R8, R15 |
| SHRQ $0x02, R8 |
| ADDQ R13, R10 |
| ADCQ R14, R11 |
| ADCQ $0x00, R12 |
| ADDQ R15, R10 |
| ADCQ R8, R11 |
| ADCQ $0x00, R12 |
| VPALIGNR $0x04, Y14, Y14, Y14 |
| VPALIGNR $0x04, Y9, Y9, Y9 |
| VPALIGNR $0x08, Y12, Y12, Y12 |
| VPALIGNR $0x08, Y13, Y13, Y13 |
| VPALIGNR $0x0c, Y4, Y4, Y4 |
| VPALIGNR $0x0c, Y1, Y1, Y1 |
| VPADDD Y14, Y0, Y0 |
| VPXOR Y0, Y4, Y4 |
| VPSHUFB ·rol16<>+0(SB), Y4, Y4 |
| VPADDD Y4, Y12, Y12 |
| VPXOR Y12, Y14, Y14 |
| VPSLLD $0x0c, Y14, Y3 |
| VPSRLD $0x14, Y14, Y14 |
| VPXOR Y3, Y14, Y14 |
| VPADDD Y14, Y0, Y0 |
| VPXOR Y0, Y4, Y4 |
| VPSHUFB ·rol8<>+0(SB), Y4, Y4 |
| VPADDD Y4, Y12, Y12 |
| VPXOR Y12, Y14, Y14 |
| VPSLLD $0x07, Y14, Y3 |
| VPSRLD $0x19, Y14, Y14 |
| VPXOR Y3, Y14, Y14 |
| VPADDD Y9, Y5, Y5 |
| VPXOR Y5, Y1, Y1 |
| VPSHUFB ·rol16<>+0(SB), Y1, Y1 |
| VPADDD Y1, Y13, Y13 |
| VPXOR Y13, Y9, Y9 |
| VPSLLD $0x0c, Y9, Y3 |
| VPSRLD $0x14, Y9, Y9 |
| VPXOR Y3, Y9, Y9 |
| VPADDD Y9, Y5, Y5 |
| VPXOR Y5, Y1, Y1 |
| VPSHUFB ·rol8<>+0(SB), Y1, Y1 |
| VPADDD Y1, Y13, Y13 |
| VPXOR Y13, Y9, Y9 |
| VPSLLD $0x07, Y9, Y3 |
| VPSRLD $0x19, Y9, Y9 |
| VPXOR Y3, Y9, Y9 |
| ADDQ 16(DI), R10 |
| ADCQ 24(DI), R11 |
| ADCQ $0x01, R12 |
| MOVQ (BP), AX |
| MOVQ AX, R15 |
| MULQ R10 |
| MOVQ AX, R13 |
| MOVQ DX, R14 |
| MOVQ (BP), AX |
| MULQ R11 |
| IMULQ R12, R15 |
| ADDQ AX, R14 |
| ADCQ DX, R15 |
| MOVQ 8(BP), AX |
| MOVQ AX, R8 |
| MULQ R10 |
| ADDQ AX, R14 |
| ADCQ $0x00, DX |
| MOVQ DX, R10 |
| MOVQ 8(BP), AX |
| MULQ R11 |
| ADDQ AX, R15 |
| ADCQ $0x00, DX |
| IMULQ R12, R8 |
| ADDQ R10, R15 |
| ADCQ DX, R8 |
| MOVQ R13, R10 |
| MOVQ R14, R11 |
| MOVQ R15, R12 |
| ANDQ $0x03, R12 |
| MOVQ R15, R13 |
| ANDQ $-4, R13 |
| MOVQ R8, R14 |
| SHRQ $0x02, R8, R15 |
| SHRQ $0x02, R8 |
| ADDQ R13, R10 |
| ADCQ R14, R11 |
| ADCQ $0x00, R12 |
| ADDQ R15, R10 |
| ADCQ R8, R11 |
| ADCQ $0x00, R12 |
| LEAQ 32(DI), DI |
| VPALIGNR $0x0c, Y14, Y14, Y14 |
| VPALIGNR $0x0c, Y9, Y9, Y9 |
| VPALIGNR $0x08, Y12, Y12, Y12 |
| VPALIGNR $0x08, Y13, Y13, Y13 |
| VPALIGNR $0x04, Y4, Y4, Y4 |
| VPALIGNR $0x04, Y1, Y1, Y1 |
| DECQ CX |
| JG sealAVX2Tail256LoopA |
| DECQ R9 |
| JGE sealAVX2Tail256LoopB |
| VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 |
| VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 |
| VPADDD 32(BP), Y14, Y14 |
| VPADDD 32(BP), Y9, Y9 |
| VPADDD 64(BP), Y12, Y12 |
| VPADDD 64(BP), Y13, Y13 |
| VPADDD Y7, Y4, Y4 |
| VPADDD Y11, Y1, Y1 |
| VPERM2I128 $0x02, Y0, Y14, Y3 |
| VPERM2I128 $0x02, Y12, Y4, Y7 |
| VPERM2I128 $0x13, Y0, Y14, Y11 |
| VPERM2I128 $0x13, Y12, Y4, Y15 |
| VPXOR (SI), Y3, Y3 |
| VPXOR 32(SI), Y7, Y7 |
| VPXOR 64(SI), Y11, Y11 |
| VPXOR 96(SI), Y15, Y15 |
| VMOVDQU Y3, (DI) |
| VMOVDQU Y7, 32(DI) |
| VMOVDQU Y11, 64(DI) |
| VMOVDQU Y15, 96(DI) |
| MOVQ $0x00000080, CX |
| LEAQ 128(SI), SI |
| SUBQ $0x80, BX |
| VPERM2I128 $0x02, Y5, Y9, Y0 |
| VPERM2I128 $0x02, Y13, Y1, Y14 |
| VPERM2I128 $0x13, Y5, Y9, Y12 |
| VPERM2I128 $0x13, Y13, Y1, Y4 |
| JMP sealAVX2SealHash |
| |
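| // sealAVX2Tail384: seal the last 384 bytes or fewer with three two-block states. The first 256 bytes are encrypted and written here; the remainder goes through sealAVX2SealHash and the short-seal loop. |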
| sealAVX2Tail384: |
| VMOVDQA ·chacha20Constants<>+0(SB), Y0 |
| VMOVDQA Y0, Y5 |
| VMOVDQA Y0, Y6 |
| VMOVDQA 32(BP), Y14 |
| VMOVDQA Y14, Y9 |
| VMOVDQA Y14, Y10 |
| VMOVDQA 64(BP), Y12 |
| VMOVDQA Y12, Y13 |
| VMOVDQA Y12, Y8 |
| VMOVDQA 192(BP), Y4 |
| VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 |
| VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 |
| VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 |
| VMOVDQA Y4, Y7 |
| VMOVDQA Y1, Y11 |
| VMOVDQA Y2, Y15 |
| |
| sealAVX2Tail384LoopA: |
| ADDQ (DI), R10 |
| ADCQ 8(DI), R11 |
| ADCQ $0x01, R12 |
| MOVQ (BP), AX |
| MOVQ AX, R15 |
| MULQ R10 |
| MOVQ AX, R13 |
| MOVQ DX, R14 |
| MOVQ (BP), AX |
| MULQ R11 |
| IMULQ R12, R15 |
| ADDQ AX, R14 |
| ADCQ DX, R15 |
| MOVQ 8(BP), AX |
| MOVQ AX, R8 |
| MULQ R10 |
| ADDQ AX, R14 |
| ADCQ $0x00, DX |
| MOVQ DX, R10 |
| MOVQ 8(BP), AX |
| MULQ R11 |
| ADDQ AX, R15 |
| ADCQ $0x00, DX |
| IMULQ R12, R8 |
| ADDQ R10, R15 |
| ADCQ DX, R8 |
| MOVQ R13, R10 |
| MOVQ R14, R11 |
| MOVQ R15, R12 |
| ANDQ $0x03, R12 |
| MOVQ R15, R13 |
| ANDQ $-4, R13 |
| MOVQ R8, R14 |
| SHRQ $0x02, R8, R15 |
| SHRQ $0x02, R8 |
| ADDQ R13, R10 |
| ADCQ R14, R11 |
| ADCQ $0x00, R12 |
| ADDQ R15, R10 |
| ADCQ R8, R11 |
| ADCQ $0x00, R12 |
| LEAQ 16(DI), DI |
| |
| sealAVX2Tail384LoopB: |
| VPADDD Y14, Y0, Y0 |
| VPXOR Y0, Y4, Y4 |
| VPSHUFB ·rol16<>+0(SB), Y4, Y4 |
| VPADDD Y4, Y12, Y12 |
| VPXOR Y12, Y14, Y14 |
| VPSLLD $0x0c, Y14, Y3 |
| VPSRLD $0x14, Y14, Y14 |
| VPXOR Y3, Y14, Y14 |
| VPADDD Y14, Y0, Y0 |
| VPXOR Y0, Y4, Y4 |
| VPSHUFB ·rol8<>+0(SB), Y4, Y4 |
| VPADDD Y4, Y12, Y12 |
| VPXOR Y12, Y14, Y14 |
| VPSLLD $0x07, Y14, Y3 |
| VPSRLD $0x19, Y14, Y14 |
| VPXOR Y3, Y14, Y14 |
| VPADDD Y9, Y5, Y5 |
| VPXOR Y5, Y1, Y1 |
| VPSHUFB ·rol16<>+0(SB), Y1, Y1 |
| VPADDD Y1, Y13, Y13 |
| VPXOR Y13, Y9, Y9 |
| VPSLLD $0x0c, Y9, Y3 |
| VPSRLD $0x14, Y9, Y9 |
| VPXOR Y3, Y9, Y9 |
| VPADDD Y9, Y5, Y5 |
| VPXOR Y5, Y1, Y1 |
| VPSHUFB ·rol8<>+0(SB), Y1, Y1 |
| VPADDD Y1, Y13, Y13 |
| VPXOR Y13, Y9, Y9 |
| VPSLLD $0x07, Y9, Y3 |
| VPSRLD $0x19, Y9, Y9 |
| VPXOR Y3, Y9, Y9 |
| VPADDD Y10, Y6, Y6 |
| VPXOR Y6, Y2, Y2 |
| VPSHUFB ·rol16<>+0(SB), Y2, Y2 |
| VPADDD Y2, Y8, Y8 |
| VPXOR Y8, Y10, Y10 |
| VPSLLD $0x0c, Y10, Y3 |
| VPSRLD $0x14, Y10, Y10 |
| VPXOR Y3, Y10, Y10 |
| VPADDD Y10, Y6, Y6 |
| VPXOR Y6, Y2, Y2 |
| VPSHUFB ·rol8<>+0(SB), Y2, Y2 |
| VPADDD Y2, Y8, Y8 |
| VPXOR Y8, Y10, Y10 |
| VPSLLD $0x07, Y10, Y3 |
| VPSRLD $0x19, Y10, Y10 |
| VPXOR Y3, Y10, Y10 |
| ADDQ (DI), R10 |
| ADCQ 8(DI), R11 |
| ADCQ $0x01, R12 |
| MOVQ (BP), AX |
| MOVQ AX, R15 |
| MULQ R10 |
| MOVQ AX, R13 |
| MOVQ DX, R14 |
| MOVQ (BP), AX |
| MULQ R11 |
| IMULQ R12, R15 |
| ADDQ AX, R14 |
| ADCQ DX, R15 |
| MOVQ 8(BP), AX |
| MOVQ AX, R8 |
| MULQ R10 |
| ADDQ AX, R14 |
| ADCQ $0x00, DX |
| MOVQ DX, R10 |
| MOVQ 8(BP), AX |
| MULQ R11 |
| ADDQ AX, R15 |
| ADCQ $0x00, DX |
| IMULQ R12, R8 |
| ADDQ R10, R15 |
| ADCQ DX, R8 |
| MOVQ R13, R10 |
| MOVQ R14, R11 |
| MOVQ R15, R12 |
| ANDQ $0x03, R12 |
| MOVQ R15, R13 |
| ANDQ $-4, R13 |
| MOVQ R8, R14 |
| SHRQ $0x02, R8, R15 |
| SHRQ $0x02, R8 |
| ADDQ R13, R10 |
| ADCQ R14, R11 |
| ADCQ $0x00, R12 |
| ADDQ R15, R10 |
| ADCQ R8, R11 |
| ADCQ $0x00, R12 |
| VPALIGNR $0x04, Y14, Y14, Y14 |
| VPALIGNR $0x04, Y9, Y9, Y9 |
| VPALIGNR $0x04, Y10, Y10, Y10 |
| VPALIGNR $0x08, Y12, Y12, Y12 |
| VPALIGNR $0x08, Y13, Y13, Y13 |
| VPALIGNR $0x08, Y8, Y8, Y8 |
| VPALIGNR $0x0c, Y4, Y4, Y4 |
| VPALIGNR $0x0c, Y1, Y1, Y1 |
| VPALIGNR $0x0c, Y2, Y2, Y2 |
| VPADDD Y14, Y0, Y0 |
| VPXOR Y0, Y4, Y4 |
| VPSHUFB ·rol16<>+0(SB), Y4, Y4 |
| VPADDD Y4, Y12, Y12 |
| VPXOR Y12, Y14, Y14 |
| VPSLLD $0x0c, Y14, Y3 |
| VPSRLD $0x14, Y14, Y14 |
| VPXOR Y3, Y14, Y14 |
| VPADDD Y14, Y0, Y0 |
| VPXOR Y0, Y4, Y4 |
| VPSHUFB ·rol8<>+0(SB), Y4, Y4 |
| VPADDD Y4, Y12, Y12 |
| VPXOR Y12, Y14, Y14 |
| VPSLLD $0x07, Y14, Y3 |
| VPSRLD $0x19, Y14, Y14 |
| VPXOR Y3, Y14, Y14 |
| VPADDD Y9, Y5, Y5 |
| VPXOR Y5, Y1, Y1 |
| VPSHUFB ·rol16<>+0(SB), Y1, Y1 |
| VPADDD Y1, Y13, Y13 |
| VPXOR Y13, Y9, Y9 |
| VPSLLD $0x0c, Y9, Y3 |
| VPSRLD $0x14, Y9, Y9 |
| VPXOR Y3, Y9, Y9 |
| VPADDD Y9, Y5, Y5 |
| VPXOR Y5, Y1, Y1 |
| VPSHUFB ·rol8<>+0(SB), Y1, Y1 |
| VPADDD Y1, Y13, Y13 |
| VPXOR Y13, Y9, Y9 |
| VPSLLD $0x07, Y9, Y3 |
| VPSRLD $0x19, Y9, Y9 |
| VPXOR Y3, Y9, Y9 |
| VPADDD Y10, Y6, Y6 |
| VPXOR Y6, Y2, Y2 |
| VPSHUFB ·rol16<>+0(SB), Y2, Y2 |
| VPADDD Y2, Y8, Y8 |
| VPXOR Y8, Y10, Y10 |
| VPSLLD $0x0c, Y10, Y3 |
| VPSRLD $0x14, Y10, Y10 |
| VPXOR Y3, Y10, Y10 |
| VPADDD Y10, Y6, Y6 |
| VPXOR Y6, Y2, Y2 |
| VPSHUFB ·rol8<>+0(SB), Y2, Y2 |
| VPADDD Y2, Y8, Y8 |
| VPXOR Y8, Y10, Y10 |
| VPSLLD $0x07, Y10, Y3 |
| VPSRLD $0x19, Y10, Y10 |
| VPXOR Y3, Y10, Y10 |
| ADDQ 16(DI), R10 |
| ADCQ 24(DI), R11 |
| ADCQ $0x01, R12 |
| MOVQ (BP), AX |
| MOVQ AX, R15 |
| MULQ R10 |
| MOVQ AX, R13 |
| MOVQ DX, R14 |
| MOVQ (BP), AX |
| MULQ R11 |
| IMULQ R12, R15 |
| ADDQ AX, R14 |
| ADCQ DX, R15 |
| MOVQ 8(BP), AX |
| MOVQ AX, R8 |
| MULQ R10 |
| ADDQ AX, R14 |
| ADCQ $0x00, DX |
| MOVQ DX, R10 |
| MOVQ 8(BP), AX |
| MULQ R11 |
| ADDQ AX, R15 |
| ADCQ $0x00, DX |
| IMULQ R12, R8 |
| ADDQ R10, R15 |
| ADCQ DX, R8 |
| MOVQ R13, R10 |
| MOVQ R14, R11 |
| MOVQ R15, R12 |
| ANDQ $0x03, R12 |
| MOVQ R15, R13 |
| ANDQ $-4, R13 |
| MOVQ R8, R14 |
| SHRQ $0x02, R8, R15 |
| SHRQ $0x02, R8 |
| ADDQ R13, R10 |
| ADCQ R14, R11 |
| ADCQ $0x00, R12 |
| ADDQ R15, R10 |
| ADCQ R8, R11 |
| ADCQ $0x00, R12 |
| LEAQ 32(DI), DI |
| VPALIGNR $0x0c, Y14, Y14, Y14 |
| VPALIGNR $0x0c, Y9, Y9, Y9 |
| VPALIGNR $0x0c, Y10, Y10, Y10 |
| VPALIGNR $0x08, Y12, Y12, Y12 |
| VPALIGNR $0x08, Y13, Y13, Y13 |
| VPALIGNR $0x08, Y8, Y8, Y8 |
| VPALIGNR $0x04, Y4, Y4, Y4 |
| VPALIGNR $0x04, Y1, Y1, Y1 |
| VPALIGNR $0x04, Y2, Y2, Y2 |
| DECQ CX |
| JG sealAVX2Tail384LoopA |
| DECQ R9 |
| JGE sealAVX2Tail384LoopB |
| VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 |
| VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 |
| VPADDD ·chacha20Constants<>+0(SB), Y6, Y6 |
| VPADDD 32(BP), Y14, Y14 |
| VPADDD 32(BP), Y9, Y9 |
| VPADDD 32(BP), Y10, Y10 |
| VPADDD 64(BP), Y12, Y12 |
| VPADDD 64(BP), Y13, Y13 |
| VPADDD 64(BP), Y8, Y8 |
| VPADDD Y7, Y4, Y4 |
| VPADDD Y11, Y1, Y1 |
| VPADDD Y15, Y2, Y2 |
| VPERM2I128 $0x02, Y0, Y14, Y3 |
| VPERM2I128 $0x02, Y12, Y4, Y7 |
| VPERM2I128 $0x13, Y0, Y14, Y11 |
| VPERM2I128 $0x13, Y12, Y4, Y15 |
| VPXOR (SI), Y3, Y3 |
| VPXOR 32(SI), Y7, Y7 |
| VPXOR 64(SI), Y11, Y11 |
| VPXOR 96(SI), Y15, Y15 |
| VMOVDQU Y3, (DI) |
| VMOVDQU Y7, 32(DI) |
| VMOVDQU Y11, 64(DI) |
| VMOVDQU Y15, 96(DI) |
| VPERM2I128 $0x02, Y5, Y9, Y3 |
| VPERM2I128 $0x02, Y13, Y1, Y7 |
| VPERM2I128 $0x13, Y5, Y9, Y11 |
| VPERM2I128 $0x13, Y13, Y1, Y15 |
| VPXOR 128(SI), Y3, Y3 |
| VPXOR 160(SI), Y7, Y7 |
| VPXOR 192(SI), Y11, Y11 |
| VPXOR 224(SI), Y15, Y15 |
| VMOVDQU Y3, 128(DI) |
| VMOVDQU Y7, 160(DI) |
| VMOVDQU Y11, 192(DI) |
| VMOVDQU Y15, 224(DI) |
| MOVQ $0x00000100, CX |
| LEAQ 256(SI), SI |
| SUBQ $0x00000100, BX |
| VPERM2I128 $0x02, Y6, Y10, Y0 |
| VPERM2I128 $0x02, Y8, Y2, Y14 |
| VPERM2I128 $0x13, Y6, Y10, Y12 |
| VPERM2I128 $0x13, Y8, Y2, Y4 |
| JMP sealAVX2SealHash |
| |
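| // sealAVX2Tail512: seal the last 512 bytes or fewer with four two-block states; the per-state counters are kept at 96(BP)..192(BP). The first 384 bytes are encrypted and written here; the remainder goes through sealAVX2SealHash and the short-seal loop. |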
| sealAVX2Tail512: |
| VMOVDQA ·chacha20Constants<>+0(SB), Y0 |
| VMOVDQA Y0, Y5 |
| VMOVDQA Y0, Y6 |
| VMOVDQA Y0, Y7 |
| VMOVDQA 32(BP), Y14 |
| VMOVDQA Y14, Y9 |
| VMOVDQA Y14, Y10 |
| VMOVDQA Y14, Y11 |
| VMOVDQA 64(BP), Y12 |
| VMOVDQA Y12, Y13 |
| VMOVDQA Y12, Y8 |
| VMOVDQA Y12, Y15 |
| VMOVDQA 192(BP), Y4 |
| VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 |
| VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 |
| VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 |
| VPADDD ·avx2IncMask<>+0(SB), Y2, Y3 |
| VMOVDQA Y4, 96(BP) |
| VMOVDQA Y1, 128(BP) |
| VMOVDQA Y2, 160(BP) |
| VMOVDQA Y3, 192(BP) |
| |
| sealAVX2Tail512LoopA: |
| ADDQ (DI), R10 |
| ADCQ 8(DI), R11 |
| ADCQ $0x01, R12 |
| MOVQ (BP), AX |
| MOVQ AX, R15 |
| MULQ R10 |
| MOVQ AX, R13 |
| MOVQ DX, R14 |
| MOVQ (BP), AX |
| MULQ R11 |
| IMULQ R12, R15 |
| ADDQ AX, R14 |
| ADCQ DX, R15 |
| MOVQ 8(BP), AX |
| MOVQ AX, R8 |
| MULQ R10 |
| ADDQ AX, R14 |
| ADCQ $0x00, DX |
| MOVQ DX, R10 |
| MOVQ 8(BP), AX |
| MULQ R11 |
| ADDQ AX, R15 |
| ADCQ $0x00, DX |
| IMULQ R12, R8 |
| ADDQ R10, R15 |
| ADCQ DX, R8 |
| MOVQ R13, R10 |
| MOVQ R14, R11 |
| MOVQ R15, R12 |
| ANDQ $0x03, R12 |
| MOVQ R15, R13 |
| ANDQ $-4, R13 |
| MOVQ R8, R14 |
| SHRQ $0x02, R8, R15 |
| SHRQ $0x02, R8 |
| ADDQ R13, R10 |
| ADCQ R14, R11 |
| ADCQ $0x00, R12 |
| ADDQ R15, R10 |
| ADCQ R8, R11 |
| ADCQ $0x00, R12 |
| LEAQ 16(DI), DI |
| |
| sealAVX2Tail512LoopB: |
| VPADDD Y14, Y0, Y0 |
| VPADDD Y9, Y5, Y5 |
| VPADDD Y10, Y6, Y6 |
| VPADDD Y11, Y7, Y7 |
| VPXOR Y0, Y4, Y4 |
| VPXOR Y5, Y1, Y1 |
| VPXOR Y6, Y2, Y2 |
| VPXOR Y7, Y3, Y3 |
| VPSHUFB ·rol16<>+0(SB), Y4, Y4 |
| VPSHUFB ·rol16<>+0(SB), Y1, Y1 |
| VPSHUFB ·rol16<>+0(SB), Y2, Y2 |
| VPSHUFB ·rol16<>+0(SB), Y3, Y3 |
| VPADDD Y4, Y12, Y12 |
| VPADDD Y1, Y13, Y13 |
| VPADDD Y2, Y8, Y8 |
| VPADDD Y3, Y15, Y15 |
| VPXOR Y12, Y14, Y14 |
| VPXOR Y13, Y9, Y9 |
| VPXOR Y8, Y10, Y10 |
| VPXOR Y15, Y11, Y11 |
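| // Spill Y15 so it can serve as scratch for the shift-based rotations; it is reloaded afterwards. |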
| VMOVDQA Y15, 224(BP) |
| VPSLLD $0x0c, Y14, Y15 |
| VPSRLD $0x14, Y14, Y14 |
| VPXOR Y15, Y14, Y14 |
| VPSLLD $0x0c, Y9, Y15 |
| VPSRLD $0x14, Y9, Y9 |
| VPXOR Y15, Y9, Y9 |
| VPSLLD $0x0c, Y10, Y15 |
| VPSRLD $0x14, Y10, Y10 |
| VPXOR Y15, Y10, Y10 |
| VPSLLD $0x0c, Y11, Y15 |
| VPSRLD $0x14, Y11, Y11 |
| VPXOR Y15, Y11, Y11 |
| VMOVDQA 224(BP), Y15 |
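| // Interleaved Poly1305 block: absorb 16 bytes of pending ciphertext and multiply the accumulator by r using the BMI2 MULX form, reducing modulo 2^130 - 5. |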
| ADDQ (DI), R10 |
| ADCQ 8(DI), R11 |
| ADCQ $0x01, R12 |
| MOVQ (BP), DX |
| MOVQ DX, R15 |
| MULXQ R10, R13, R14 |
| IMULQ R12, R15 |
| MULXQ R11, AX, DX |
| ADDQ AX, R14 |
| ADCQ DX, R15 |
| MOVQ 8(BP), DX |
| MULXQ R10, R10, AX |
| ADDQ R10, R14 |
| MULXQ R11, R11, R8 |
| ADCQ R11, R15 |
| ADCQ $0x00, R8 |
| IMULQ R12, DX |
| ADDQ AX, R15 |
| ADCQ DX, R8 |
| MOVQ R13, R10 |
| MOVQ R14, R11 |
| MOVQ R15, R12 |
| ANDQ $0x03, R12 |
| MOVQ R15, R13 |
| ANDQ $-4, R13 |
| MOVQ R8, R14 |
| SHRQ $0x02, R8, R15 |
| SHRQ $0x02, R8 |
| ADDQ R13, R10 |
| ADCQ R14, R11 |
| ADCQ $0x00, R12 |
| ADDQ R15, R10 |
| ADCQ R8, R11 |
| ADCQ $0x00, R12 |
| VPADDD Y14, Y0, Y0 |
| VPADDD Y9, Y5, Y5 |
| VPADDD Y10, Y6, Y6 |
| VPADDD Y11, Y7, Y7 |
| VPXOR Y0, Y4, Y4 |
| VPXOR Y5, Y1, Y1 |
| VPXOR Y6, Y2, Y2 |
| VPXOR Y7, Y3, Y3 |
| VPSHUFB ·rol8<>+0(SB), Y4, Y4 |
| VPSHUFB ·rol8<>+0(SB), Y1, Y1 |
| VPSHUFB ·rol8<>+0(SB), Y2, Y2 |
| VPSHUFB ·rol8<>+0(SB), Y3, Y3 |
| VPADDD Y4, Y12, Y12 |
| VPADDD Y1, Y13, Y13 |
| VPADDD Y2, Y8, Y8 |
| VPADDD Y3, Y15, Y15 |
| VPXOR Y12, Y14, Y14 |
| VPXOR Y13, Y9, Y9 |
| VPXOR Y8, Y10, Y10 |
| VPXOR Y15, Y11, Y11 |
| VMOVDQA Y15, 224(BP) |
| VPSLLD $0x07, Y14, Y15 |
| VPSRLD $0x19, Y14, Y14 |
| VPXOR Y15, Y14, Y14 |
| VPSLLD $0x07, Y9, Y15 |
| VPSRLD $0x19, Y9, Y9 |
| VPXOR Y15, Y9, Y9 |
| VPSLLD $0x07, Y10, Y15 |
| VPSRLD $0x19, Y10, Y10 |
| VPXOR Y15, Y10, Y10 |
| VPSLLD $0x07, Y11, Y15 |
| VPSRLD $0x19, Y11, Y11 |
| VPXOR Y15, Y11, Y11 |
| VMOVDQA 224(BP), Y15 |
| VPALIGNR $0x04, Y14, Y14, Y14 |
| VPALIGNR $0x04, Y9, Y9, Y9 |
| VPALIGNR $0x04, Y10, Y10, Y10 |
| VPALIGNR $0x04, Y11, Y11, Y11 |
| VPALIGNR $0x08, Y12, Y12, Y12 |
| VPALIGNR $0x08, Y13, Y13, Y13 |
| VPALIGNR $0x08, Y8, Y8, Y8 |
| VPALIGNR $0x08, Y15, Y15, Y15 |
| VPALIGNR $0x0c, Y4, Y4, Y4 |
| VPALIGNR $0x0c, Y1, Y1, Y1 |
| VPALIGNR $0x0c, Y2, Y2, Y2 |
| VPALIGNR $0x0c, Y3, Y3, Y3 |
| VPADDD Y14, Y0, Y0 |
| VPADDD Y9, Y5, Y5 |
| VPADDD Y10, Y6, Y6 |
| VPADDD Y11, Y7, Y7 |
| VPXOR Y0, Y4, Y4 |
| VPXOR Y5, Y1, Y1 |
| VPXOR Y6, Y2, Y2 |
| VPXOR Y7, Y3, Y3 |
| VPSHUFB ·rol16<>+0(SB), Y4, Y4 |
| VPSHUFB ·rol16<>+0(SB), Y1, Y1 |
| VPSHUFB ·rol16<>+0(SB), Y2, Y2 |
| VPSHUFB ·rol16<>+0(SB), Y3, Y3 |
| VPADDD Y4, Y12, Y12 |
| VPADDD Y1, Y13, Y13 |
| VPADDD Y2, Y8, Y8 |
| VPADDD Y3, Y15, Y15 |
| VPXOR Y12, Y14, Y14 |
| VPXOR Y13, Y9, Y9 |
| VPXOR Y8, Y10, Y10 |
| VPXOR Y15, Y11, Y11 |
| ADDQ 16(DI), R10 |
| ADCQ 24(DI), R11 |
| ADCQ $0x01, R12 |
| MOVQ (BP), DX |
| MOVQ DX, R15 |
| MULXQ R10, R13, R14 |
| IMULQ R12, R15 |
| MULXQ R11, AX, DX |
| ADDQ AX, R14 |
| ADCQ DX, R15 |
| MOVQ 8(BP), DX |
| MULXQ R10, R10, AX |
| ADDQ R10, R14 |
| MULXQ R11, R11, R8 |
| ADCQ R11, R15 |
| ADCQ $0x00, R8 |
| IMULQ R12, DX |
| ADDQ AX, R15 |
| ADCQ DX, R8 |
| MOVQ R13, R10 |
| MOVQ R14, R11 |
| MOVQ R15, R12 |
| ANDQ $0x03, R12 |
| MOVQ R15, R13 |
| ANDQ $-4, R13 |
| MOVQ R8, R14 |
| SHRQ $0x02, R8, R15 |
| SHRQ $0x02, R8 |
| ADDQ R13, R10 |
| ADCQ R14, R11 |
| ADCQ $0x00, R12 |
| ADDQ R15, R10 |
| ADCQ R8, R11 |
| ADCQ $0x00, R12 |
| LEAQ 32(DI), DI |
| VMOVDQA Y15, 224(BP) |
| VPSLLD $0x0c, Y14, Y15 |
| VPSRLD $0x14, Y14, Y14 |
| VPXOR Y15, Y14, Y14 |
| VPSLLD $0x0c, Y9, Y15 |
| VPSRLD $0x14, Y9, Y9 |
| VPXOR Y15, Y9, Y9 |
| VPSLLD $0x0c, Y10, Y15 |
| VPSRLD $0x14, Y10, Y10 |
| VPXOR Y15, Y10, Y10 |
| VPSLLD $0x0c, Y11, Y15 |
| VPSRLD $0x14, Y11, Y11 |
| VPXOR Y15, Y11, Y11 |
| VMOVDQA 224(BP), Y15 |
| VPADDD Y14, Y0, Y0 |
| VPADDD Y9, Y5, Y5 |
| VPADDD Y10, Y6, Y6 |
| VPADDD Y11, Y7, Y7 |
| VPXOR Y0, Y4, Y4 |
| VPXOR Y5, Y1, Y1 |
| VPXOR Y6, Y2, Y2 |
| VPXOR Y7, Y3, Y3 |
| VPSHUFB ·rol8<>+0(SB), Y4, Y4 |
| VPSHUFB ·rol8<>+0(SB), Y1, Y1 |
| VPSHUFB ·rol8<>+0(SB), Y2, Y2 |
| VPSHUFB ·rol8<>+0(SB), Y3, Y3 |
| VPADDD Y4, Y12, Y12 |
| VPADDD Y1, Y13, Y13 |
| VPADDD Y2, Y8, Y8 |
| VPADDD Y3, Y15, Y15 |
| VPXOR Y12, Y14, Y14 |
| VPXOR Y13, Y9, Y9 |
| VPXOR Y8, Y10, Y10 |
| VPXOR Y15, Y11, Y11 |
| VMOVDQA Y15, 224(BP) |
| VPSLLD $0x07, Y14, Y15 |
| VPSRLD $0x19, Y14, Y14 |
| VPXOR Y15, Y14, Y14 |
| VPSLLD $0x07, Y9, Y15 |
| VPSRLD $0x19, Y9, Y9 |
| VPXOR Y15, Y9, Y9 |
| VPSLLD $0x07, Y10, Y15 |
| VPSRLD $0x19, Y10, Y10 |
| VPXOR Y15, Y10, Y10 |
| VPSLLD $0x07, Y11, Y15 |
| VPSRLD $0x19, Y11, Y11 |
| VPXOR Y15, Y11, Y11 |
| VMOVDQA 224(BP), Y15 |
| VPALIGNR $0x0c, Y14, Y14, Y14 |
| VPALIGNR $0x0c, Y9, Y9, Y9 |
| VPALIGNR $0x0c, Y10, Y10, Y10 |
| VPALIGNR $0x0c, Y11, Y11, Y11 |
| VPALIGNR $0x08, Y12, Y12, Y12 |
| VPALIGNR $0x08, Y13, Y13, Y13 |
| VPALIGNR $0x08, Y8, Y8, Y8 |
| VPALIGNR $0x08, Y15, Y15, Y15 |
| VPALIGNR $0x04, Y4, Y4, Y4 |
| VPALIGNR $0x04, Y1, Y1, Y1 |
| VPALIGNR $0x04, Y2, Y2, Y2 |
| VPALIGNR $0x04, Y3, Y3, Y3 |
| DECQ CX |
| JG sealAVX2Tail512LoopA |
| DECQ R9 |
| JGE sealAVX2Tail512LoopB |
| VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 |
| VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 |
| VPADDD ·chacha20Constants<>+0(SB), Y6, Y6 |
| VPADDD ·chacha20Constants<>+0(SB), Y7, Y7 |
| VPADDD 32(BP), Y14, Y14 |
| VPADDD 32(BP), Y9, Y9 |
| VPADDD 32(BP), Y10, Y10 |
| VPADDD 32(BP), Y11, Y11 |
| VPADDD 64(BP), Y12, Y12 |
| VPADDD 64(BP), Y13, Y13 |
| VPADDD 64(BP), Y8, Y8 |
| VPADDD 64(BP), Y15, Y15 |
| VPADDD 96(BP), Y4, Y4 |
| VPADDD 128(BP), Y1, Y1 |
| VPADDD 160(BP), Y2, Y2 |
| VPADDD 192(BP), Y3, Y3 |
| VMOVDQA Y15, 224(BP) |
| VPERM2I128 $0x02, Y0, Y14, Y15 |
| VPXOR (SI), Y15, Y15 |
| VMOVDQU Y15, (DI) |
| VPERM2I128 $0x02, Y12, Y4, Y15 |
| VPXOR 32(SI), Y15, Y15 |
| VMOVDQU Y15, 32(DI) |
| VPERM2I128 $0x13, Y0, Y14, Y15 |
| VPXOR 64(SI), Y15, Y15 |
| VMOVDQU Y15, 64(DI) |
| VPERM2I128 $0x13, Y12, Y4, Y15 |
| VPXOR 96(SI), Y15, Y15 |
| VMOVDQU Y15, 96(DI) |
| VPERM2I128 $0x02, Y5, Y9, Y0 |
| VPERM2I128 $0x02, Y13, Y1, Y14 |
| VPERM2I128 $0x13, Y5, Y9, Y12 |
| VPERM2I128 $0x13, Y13, Y1, Y4 |
| VPXOR 128(SI), Y0, Y0 |
| VPXOR 160(SI), Y14, Y14 |
| VPXOR 192(SI), Y12, Y12 |
| VPXOR 224(SI), Y4, Y4 |
| VMOVDQU Y0, 128(DI) |
| VMOVDQU Y14, 160(DI) |
| VMOVDQU Y12, 192(DI) |
| VMOVDQU Y4, 224(DI) |
| VPERM2I128 $0x02, Y6, Y10, Y0 |
| VPERM2I128 $0x02, Y8, Y2, Y14 |
| VPERM2I128 $0x13, Y6, Y10, Y12 |
| VPERM2I128 $0x13, Y8, Y2, Y4 |
| VPXOR 256(SI), Y0, Y0 |
| VPXOR 288(SI), Y14, Y14 |
| VPXOR 320(SI), Y12, Y12 |
| VPXOR 352(SI), Y4, Y4 |
| VMOVDQU Y0, 256(DI) |
| VMOVDQU Y14, 288(DI) |
| VMOVDQU Y12, 320(DI) |
| VMOVDQU Y4, 352(DI) |
| MOVQ $0x00000180, CX |
| LEAQ 384(SI), SI |
| SUBQ $0x00000180, BX |
| VPERM2I128 $0x02, Y7, Y11, Y0 |
| VPERM2I128 $0x02, 224(BP), Y3, Y14 |
| VPERM2I128 $0x13, Y7, Y11, Y12 |
| VPERM2I128 $0x13, 224(BP), Y3, Y4 |
| JMP sealAVX2SealHash |