| // Copyright 2015 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| // This is an optimized implementation of AES-GCM using AES-NI and CLMUL-NI |
| // The implementation uses some optimization as described in: |
| // [1] Gueron, S., Kounavis, M.E.: Intel® Carry-Less Multiplication |
| // Instruction and its Usage for Computing the GCM Mode rev. 2.02 |
| // [2] Gueron, S., Krasnov, V.: Speeding up Counter Mode in Software and |
| // Hardware |
| |
| #include "textflag.h" |
| |
// Register aliases shared by every routine in this file.
// B0-B7 name X0-X7 and carry the blocks that are being encrypted or hashed.
| #define B0 X0 |
| #define B1 X1 |
| #define B2 X2 |
| #define B3 X3 |
| #define B4 X4 |
| #define B5 X5 |
| #define B6 X6 |
| #define B7 X7 |
| |
// ACC0, ACC1 and ACCM collect the three carry-less products of one hash
// multiplication: the PCLMULQDQ $0x00 results go to ACC0, the $0x11 results
// to ACC1, and the products of the XOR-folded halves go to ACCM.
| #define ACC0 X8 |
| #define ACC1 X9 |
| #define ACCM X10 |
| |
// T0-T2 are scratch registers. POLY and BSWAP are loaded from gcmPoly and
// bswapMask at the start of each routine.
| #define T0 X11 |
| #define T1 X12 |
| #define T2 X13 |
| #define POLY X14 |
| #define BSWAP X15 |
| |
// PSHUFB control mask. Byte i of the mask is 15-i, so applying it reverses
// the byte order of a 16-byte block.
| DATA bswapMask<>+0x00(SB)/8, $0x08090a0b0c0d0e0f |
| DATA bswapMask<>+0x08(SB)/8, $0x0001020304050607 |
| |
// Constant used by the two folding steps that follow each multiplication and
// by the H * 2 step in gcmAesInit.
| DATA gcmPoly<>+0x00(SB)/8, $0x0000000000000001 |
| DATA gcmPoly<>+0x08(SB)/8, $0xc200000000000000 |
| |
// Fifteen 16-byte masks. The entry at byte offset 16*(n-1) has its low n bytes
// set to 0xff and the remaining bytes zero, for n = 1..15. The tail code of
// gcmAesEnc and gcmAesDec addresses it as -16 + 16*n, where n is the number of
// leftover bytes.
| DATA andMask<>+0x00(SB)/8, $0x00000000000000ff |
| DATA andMask<>+0x08(SB)/8, $0x0000000000000000 |
| DATA andMask<>+0x10(SB)/8, $0x000000000000ffff |
| DATA andMask<>+0x18(SB)/8, $0x0000000000000000 |
| DATA andMask<>+0x20(SB)/8, $0x0000000000ffffff |
| DATA andMask<>+0x28(SB)/8, $0x0000000000000000 |
| DATA andMask<>+0x30(SB)/8, $0x00000000ffffffff |
| DATA andMask<>+0x38(SB)/8, $0x0000000000000000 |
| DATA andMask<>+0x40(SB)/8, $0x000000ffffffffff |
| DATA andMask<>+0x48(SB)/8, $0x0000000000000000 |
| DATA andMask<>+0x50(SB)/8, $0x0000ffffffffffff |
| DATA andMask<>+0x58(SB)/8, $0x0000000000000000 |
| DATA andMask<>+0x60(SB)/8, $0x00ffffffffffffff |
| DATA andMask<>+0x68(SB)/8, $0x0000000000000000 |
| DATA andMask<>+0x70(SB)/8, $0xffffffffffffffff |
| DATA andMask<>+0x78(SB)/8, $0x0000000000000000 |
| DATA andMask<>+0x80(SB)/8, $0xffffffffffffffff |
| DATA andMask<>+0x88(SB)/8, $0x00000000000000ff |
| DATA andMask<>+0x90(SB)/8, $0xffffffffffffffff |
| DATA andMask<>+0x98(SB)/8, $0x000000000000ffff |
| DATA andMask<>+0xa0(SB)/8, $0xffffffffffffffff |
| DATA andMask<>+0xa8(SB)/8, $0x0000000000ffffff |
| DATA andMask<>+0xb0(SB)/8, $0xffffffffffffffff |
| DATA andMask<>+0xb8(SB)/8, $0x00000000ffffffff |
| DATA andMask<>+0xc0(SB)/8, $0xffffffffffffffff |
| DATA andMask<>+0xc8(SB)/8, $0x000000ffffffffff |
| DATA andMask<>+0xd0(SB)/8, $0xffffffffffffffff |
| DATA andMask<>+0xd8(SB)/8, $0x0000ffffffffffff |
| DATA andMask<>+0xe0(SB)/8, $0xffffffffffffffff |
| DATA andMask<>+0xe8(SB)/8, $0x00ffffffffffffff |
| |
// All three tables are read-only and contain no pointers.
| GLOBL bswapMask<>(SB), (NOPTR+RODATA), $16 |
| GLOBL gcmPoly<>(SB), (NOPTR+RODATA), $16 |
| GLOBL andMask<>(SB), (NOPTR+RODATA), $240 |
| |
| // func gcmAesFinish(productTable *[256]byte, tagMask, T *[16]byte, pLen, dLen uint64) |
//
// Completes the value held in T. The two byte counts are turned into bit
// counts and packed into one block (pLen bits in the low quadword, dLen bits
// in the high quadword). That block is XORed with the state read from T and
// multiplied by table entry 14, with entry 15 supplying the folded halves.
// The reduced product is byte-reversed, XORed with tagMask and written back
// to T.
| TEXT ·gcmAesFinish(SB),NOSPLIT,$0 |
| #define hTab DI |
| #define maskPtr SI |
| #define tagPtr DX |
| #define ptBits AX |
| #define adBits CX |
// One folding step of a against POLY. T0 is scratch.
| #define finishFold(a) MOVOU POLY, T0; PCLMULQDQ $0x01, a, T0; PSHUFD $78, a, a; PXOR T0, a |
| |
| MOVOU gcmPoly<>(SB), POLY |
| MOVOU bswapMask<>(SB), BSWAP |
| |
// Build the length block: byte counts times eight.
| MOVQ pLen+24(FP), ptBits |
| MOVQ dLen+32(FP), adBits |
| SHLQ $3, ptBits |
| SHLQ $3, adBits |
| MOVQ ptBits, B0 |
| PINSRQ $1, adBits, B0 |
| |
// Fold the current state into the length block.
| MOVQ T+16(FP), tagPtr |
| MOVOU (tagPtr), ACC0 |
| PXOR ACC0, B0 |
| |
// Multiplier: table entry 14, with entry 15 for the folded-halves product.
| MOVQ productTable+0(FP), hTab |
| MOVOU (16*14)(hTab), ACC0 |
| MOVOU ACC0, ACC1 |
| MOVOU (16*15)(hTab), ACCM |
| |
// Three carry-less products: low halves, high halves, folded halves.
| PSHUFD $78, B0, T0 |
| PXOR B0, T0 |
| PCLMULQDQ $0x00, B0, ACC0 |
| PCLMULQDQ $0x11, B0, ACC1 |
| PCLMULQDQ $0x00, T0, ACCM |
| |
// Recover the middle term and spread it over the low and high results.
| PXOR ACC0, ACCM |
| PXOR ACC1, ACCM |
| MOVOU ACCM, T0 |
| PSLLDQ $8, T0 |
| PSRLDQ $8, ACCM |
| PXOR T0, ACC0 |
| PXOR ACCM, ACC1 |
| |
// Two folding steps, then merge the high half.
| finishFold(ACC0) |
| finishFold(ACC0) |
| PXOR ACC1, ACC0 |
| |
// Byte-reverse, apply the mask and store the result over T.
| MOVQ tagMask+8(FP), maskPtr |
| MOVOU (maskPtr), T2 |
| PSHUFB BSWAP, ACC0 |
| PXOR T2, ACC0 |
| MOVOU ACC0, (tagPtr) |
| |
| RET |
| #undef finishFold |
| #undef adBits |
| #undef ptBits |
| #undef tagPtr |
| #undef maskPtr |
| #undef hTab |
| |
| // func gcmAesInit(productTable *[256]byte, ks []uint32) |
//
// Fills productTable from the expanded key ks. The hash key is obtained by
// running the AES rounds on round key 0 alone (the encryption of an all-zero
// block), byte-reversing the result and applying the H * 2 step. The table
// receives eight pairs of 16-byte entries, written from the top down: entry
// 14 is the key itself and entry 15 the XOR of its two halves, entry 12/13
// the next product by the key, and so on down to entry 0/1.
| TEXT ·gcmAesInit(SB),NOSPLIT,$0 |
| #define tbl DI |
| #define keys SI |
| #define rounds DX |
// Loads round key i into T0 and applies one AES round to B0.
| #define initRnd(i) MOVOU (16*i)(keys), T0; AESENC T0, B0 |
| |
| MOVOU gcmPoly<>(SB), POLY |
| MOVOU bswapMask<>(SB), BSWAP |
| |
| MOVQ productTable+0(FP), tbl |
| MOVQ ks_base+8(FP), keys |
| MOVQ ks_len+16(FP), rounds |
| |
// rounds = len(ks)/4 - 1, compared with 12 below to select the exit point.
| SHRQ $2, rounds |
| DECQ rounds |
| |
// With an all-zero input the state after the initial key XOR is round key 0.
| MOVOU (16*0)(keys), B0 |
| initRnd(1) |
| initRnd(2) |
| initRnd(3) |
| initRnd(4) |
| initRnd(5) |
| initRnd(6) |
| initRnd(7) |
| initRnd(8) |
| initRnd(9) |
| MOVOU (16*10)(keys), T0 |
// The flags set here also drive the JE further down. The instructions in
// between are only AESENC and MOVOU.
| CMPQ rounds, $12 |
| JB initEncLast |
| AESENC T0, B0 |
| initRnd(11) |
| MOVOU (16*12)(keys), T0 |
| JE initEncLast |
| AESENC T0, B0 |
| initRnd(13) |
| MOVOU (16*14)(keys), T0 |
| initEncLast: |
| AESENCLAST T0, B0 |
| |
| PSHUFB BSWAP, B0 |
// H * 2: shift every 32-bit lane left by one, carry the top bit of each lane
// into the next lane (T1), and XOR in POLY when the top bit of the whole
// value was set (T0).
| MOVOU B0, T1 |
| PSRLL $31, T1 |
| PSLLDQ $4, T1 |
| PSHUFD $0xff, B0, T0 |
| PSRAL $31, T0 |
| PAND POLY, T0 |
| PSLLL $1, B0 |
| PXOR T1, B0 |
| PXOR T0, B0 |
// Karatsuba pre-computations: store the key and the XOR of its halves.
| MOVOU B0, (16*14)(tbl) |
| PSHUFD $78, B0, B1 |
| PXOR B0, B1 |
| MOVOU B1, (16*15)(tbl) |
| |
// B2/B3 track the most recent product and its folded halves. Seven more
// products are generated.
| MOVOU B0, B2 |
| MOVOU B1, B3 |
| MOVQ $7, AX |
| |
| initLoop: |
// Multiply the previous product (B2, folded halves in B3) by the key (B0, B1).
| MOVOU B3, T2 |
| PCLMULQDQ $0x00, B1, T2 |
| MOVOU B2, T0 |
| MOVOU B2, T1 |
| PCLMULQDQ $0x00, B0, T0 |
| PCLMULQDQ $0x11, B0, T1 |
| |
| PXOR T0, T2 |
| PXOR T1, T2 |
| MOVOU T2, B4 |
| PSRLDQ $8, T2 |
| PSLLDQ $8, B4 |
| PXOR T2, T1 |
| PXOR B4, T0 |
| |
// Two folding steps, using B2 as the scratch register. The result lands in B2.
| MOVOU POLY, B2 |
| PCLMULQDQ $0x01, T0, B2 |
| PSHUFD $78, T0, T0 |
| PXOR B2, T0 |
| MOVOU POLY, B2 |
| PCLMULQDQ $0x01, T0, B2 |
| PSHUFD $78, T0, T0 |
| PXOR T0, B2 |
| PXOR T1, B2 |
| |
// Store the new product and its folded halves one pair below the previous one.
| MOVOU B2, (16*12)(tbl) |
| PSHUFD $78, B2, B3 |
| PXOR B2, B3 |
| MOVOU B3, (16*13)(tbl) |
| |
| LEAQ (-16*2)(tbl), tbl |
| DECQ AX |
| JNE initLoop |
| |
| RET |
| #undef initRnd |
| #undef rounds |
| #undef keys |
| #undef tbl |
| |
| // func gcmAesData(productTable *[256]byte, data []byte, T *[16]byte) |
//
// Hashes data with the multiples stored in productTable and stores the
// resulting 16-byte state in T. The accumulator starts from zero, and T is only
// written, never read. Input of 128 bytes or more is consumed eight blocks at
// a time with one reduction per group, remaining whole blocks are consumed one
// at a time, and a final partial block is zero-extended to 16 bytes before it
// is hashed. A 13-byte input takes a dedicated load path.
| TEXT ·gcmAesData(SB),NOSPLIT,$0 |
| #define pTbl DI |
| #define aut SI |
| #define tPtr CX |
| #define autLen DX |
| |
// reduceRound(a): one folding step of a against POLY. T0 is clobbered. The
// callers apply it twice and then XOR ACC1 into the result. This macro is not
// undefined at the end of the function and is used by the routines below.
| #define reduceRound(a) MOVOU POLY, T0; PCLMULQDQ $0x01, a, T0; PSHUFD $78, a, a; PXOR T0, a |
// mulRoundAAD(X, i): multiplies block X by table entry 2*i (low and high
// products) and entry 2*i+1 (folded halves) and XORs the three partial
// products into ACC0, ACC1 and ACCM. X, T1 and T2 are clobbered.
| #define mulRoundAAD(X ,i) \ |
| MOVOU (16*(i*2))(pTbl), T1;\ |
| MOVOU T1, T2;\ |
| PCLMULQDQ $0x00, X, T1;\ |
| PXOR T1, ACC0;\ |
| PCLMULQDQ $0x11, X, T2;\ |
| PXOR T2, ACC1;\ |
| PSHUFD $78, X, T1;\ |
| PXOR T1, X;\ |
| MOVOU (16*(i*2+1))(pTbl), T1;\ |
| PCLMULQDQ $0x00, X, T1;\ |
| PXOR T1, ACCM |
| |
| MOVQ productTable+0(FP), pTbl |
| MOVQ data_base+8(FP), aut |
| MOVQ data_len+16(FP), autLen |
| MOVQ T+32(FP), tPtr |
| |
// Start from an all-zero accumulator.
| PXOR ACC0, ACC0 |
| MOVOU bswapMask<>(SB), BSWAP |
| MOVOU gcmPoly<>(SB), POLY |
| |
// Empty input: store the zero state and return.
| TESTQ autLen, autLen |
| JEQ dataBail |
| |
| CMPQ autLen, $13 // optimize the TLS case |
| JE dataTLS |
| CMPQ autLen, $128 |
| JB startSinglesLoop |
| JMP dataOctaLoop |
| |
// Exactly 13 bytes: load 8 + 4 + 1 bytes into a zeroed B0 without touching
// memory past the end of data, clear autLen so that the loop stops after this
// block, and join the common multiply at dataMul.
| dataTLS: |
| MOVOU (16*14)(pTbl), T1 |
| MOVOU (16*15)(pTbl), T2 |
| PXOR B0, B0 |
| MOVQ (aut), B0 |
| PINSRD $2, 8(aut), B0 |
| PINSRB $12, 12(aut), B0 |
| XORQ autLen, autLen |
| JMP dataMul |
| |
// Eight blocks per iteration. Block j of the group is multiplied by table
// entries 2*j and 2*j+1, and the sum is reduced once.
| dataOctaLoop: |
| CMPQ autLen, $128 |
| JB startSinglesLoop |
| SUBQ $128, autLen |
| |
| MOVOU (16*0)(aut), X0 |
| MOVOU (16*1)(aut), X1 |
| MOVOU (16*2)(aut), X2 |
| MOVOU (16*3)(aut), X3 |
| MOVOU (16*4)(aut), X4 |
| MOVOU (16*5)(aut), X5 |
| MOVOU (16*6)(aut), X6 |
| MOVOU (16*7)(aut), X7 |
| LEAQ (16*8)(aut), aut |
| PSHUFB BSWAP, X0 |
| PSHUFB BSWAP, X1 |
| PSHUFB BSWAP, X2 |
| PSHUFB BSWAP, X3 |
| PSHUFB BSWAP, X4 |
| PSHUFB BSWAP, X5 |
| PSHUFB BSWAP, X6 |
| PSHUFB BSWAP, X7 |
// The running state is folded into the first block of the group only.
| PXOR ACC0, X0 |
| |
// First block: initialises ACC0, ACC1 and ACCM from table entries 0 and 1.
| MOVOU (16*0)(pTbl), ACC0 |
| MOVOU (16*1)(pTbl), ACCM |
| MOVOU ACC0, ACC1 |
| PSHUFD $78, X0, T1 |
| PXOR X0, T1 |
| PCLMULQDQ $0x00, X0, ACC0 |
| PCLMULQDQ $0x11, X0, ACC1 |
| PCLMULQDQ $0x00, T1, ACCM |
| |
| mulRoundAAD(X1, 1) |
| mulRoundAAD(X2, 2) |
| mulRoundAAD(X3, 3) |
| mulRoundAAD(X4, 4) |
| mulRoundAAD(X5, 5) |
| mulRoundAAD(X6, 6) |
| mulRoundAAD(X7, 7) |
| |
// Recover the middle term, spread it over the low and high halves, reduce.
| PXOR ACC0, ACCM |
| PXOR ACC1, ACCM |
| MOVOU ACCM, T0 |
| PSRLDQ $8, ACCM |
| PSLLDQ $8, T0 |
| PXOR ACCM, ACC1 |
| PXOR T0, ACC0 |
| reduceRound(ACC0) |
| reduceRound(ACC0) |
| PXOR ACC1, ACC0 |
| JMP dataOctaLoop |
| |
// T1 and T2 keep table entries 14 and 15 in registers for the single-block loop.
| startSinglesLoop: |
| MOVOU (16*14)(pTbl), T1 |
| MOVOU (16*15)(pTbl), T2 |
| |
| dataSinglesLoop: |
| |
| CMPQ autLen, $16 |
| JB dataEnd |
| SUBQ $16, autLen |
| |
| MOVOU (aut), B0 |
// dataMul is also entered from dataTLS and from the partial-block path below,
// with B0 already loaded and autLen equal to zero.
| dataMul: |
| PSHUFB BSWAP, B0 |
| PXOR ACC0, B0 |
| |
| MOVOU T1, ACC0 |
| MOVOU T2, ACCM |
| MOVOU T1, ACC1 |
| |
| PSHUFD $78, B0, T0 |
| PXOR B0, T0 |
| PCLMULQDQ $0x00, B0, ACC0 |
| PCLMULQDQ $0x11, B0, ACC1 |
| PCLMULQDQ $0x00, T0, ACCM |
| |
| PXOR ACC0, ACCM |
| PXOR ACC1, ACCM |
| MOVOU ACCM, T0 |
| PSRLDQ $8, ACCM |
| PSLLDQ $8, T0 |
| PXOR ACCM, ACC1 |
| PXOR T0, ACC0 |
| |
| MOVOU POLY, T0 |
| PCLMULQDQ $0x01, ACC0, T0 |
| PSHUFD $78, ACC0, ACC0 |
| PXOR T0, ACC0 |
| |
| MOVOU POLY, T0 |
| PCLMULQDQ $0x01, ACC0, T0 |
| PSHUFD $78, ACC0, ACC0 |
| PXOR T0, ACC0 |
| PXOR ACC1, ACC0 |
| |
// Only meaningful for the regular loop. When dataMul was entered from one of
// the other paths autLen is zero and aut is not dereferenced again.
| LEAQ 16(aut), aut |
| |
| JMP dataSinglesLoop |
| |
| dataEnd: |
| |
| TESTQ autLen, autLen |
| JEQ dataBail |
| |
// Partial last block: point aut at the final byte and build B0 back to
// front, so that the first leftover byte ends up in byte 0 and the unused
// high bytes stay zero.
| PXOR B0, B0 |
| LEAQ -1(aut)(autLen*1), aut |
| |
| dataLoadLoop: |
| |
| PSLLDQ $1, B0 |
| PINSRB $0, (aut), B0 |
| |
| LEAQ -1(aut), aut |
| DECQ autLen |
| JNE dataLoadLoop |
| |
| JMP dataMul |
| |
| dataBail: |
| MOVOU ACC0, (tPtr) |
| RET |
| #undef pTbl |
| #undef aut |
| #undef tPtr |
| #undef autLen |
| |
| // func gcmAesEnc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32) |
//
// Encrypts src into dst in counter mode and folds the produced ciphertext
// into the hash state at T, which is read on entry and written back on exit.
// ctr is only read. The first keystream block uses ctr with its last 32-bit
// word incremented once.
//
// Frame layout (256 bytes):
//   (0*16)(SP) .. (7*16)(SP)   byte-reversed ciphertext of the previous
//                              eight-block group, waiting to be hashed
//   (8*16)(SP) .. (15*16)(SP)  eight prepared counter blocks, already XORed
//                              with round key 0
//
// Input of 128 bytes or more is handled eight blocks at a time, with the
// hashing of the previous group interleaved with the AES rounds of the
// current one. Remaining whole blocks are handled one at a time, followed by
// a final partial block.
//
// The register aliases and the helper macros other than increment stay
// defined after this function and are used again by gcmAesDec.
| TEXT ·gcmAesEnc(SB),0,$256-96 |
| #define pTbl DI |
| #define ctx DX |
| #define ctrPtr CX |
| #define ptx SI |
| #define ks AX |
| #define tPtr R8 |
| #define ptxLen R9 |
| #define aluCTR R10 |
| #define aluTMP R11 |
| #define aluK R12 |
| #define NR R13 |
| |
// increment(i): adds 1 to the 32-bit counter kept in aluCTR, XORs it with
// aluK, byte-swaps the result and stores it in the last 32-bit word of
// counter slot i. aluTMP is clobbered and the flags are modified.
| #define increment(i) ADDL $1, aluCTR; MOVL aluCTR, aluTMP; XORL aluK, aluTMP; BSWAPL aluTMP; MOVL aluTMP, (3*4 + 8*16 + i*16)(SP) |
// aesRnd(k): one AES round on B0-B7 with the round key held in register k.
| #define aesRnd(k) AESENC k, B0; AESENC k, B1; AESENC k, B2; AESENC k, B3; AESENC k, B4; AESENC k, B5; AESENC k, B6; AESENC k, B7 |
// aesRound(i): loads round key i into T0 and applies one AES round to B0-B7.
| #define aesRound(i) MOVOU (16*i)(ks), T0;AESENC T0, B0; AESENC T0, B1; AESENC T0, B2; AESENC T0, B3; AESENC T0, B4; AESENC T0, B5; AESENC T0, B6; AESENC T0, B7 |
// aesRndLast(k): final AES round on B0-B7 with the round key held in k.
| #define aesRndLast(k) AESENCLAST k, B0; AESENCLAST k, B1; AESENCLAST k, B2; AESENCLAST k, B3; AESENCLAST k, B4; AESENCLAST k, B5; AESENCLAST k, B6; AESENCLAST k, B7 |
// combinedRound(i): AES round i on B0-B7 interleaved with the hash multiply
// of the pending block at (16*i)(SP) by table entries 2*i and 2*i+1. The
// partial products are XORed into ACC0, ACC1 and ACCM. T0, T1 and T2 are
// clobbered.
| #define combinedRound(i) \ |
| MOVOU (16*i)(ks), T0;\ |
| AESENC T0, B0;\ |
| AESENC T0, B1;\ |
| AESENC T0, B2;\ |
| AESENC T0, B3;\ |
| MOVOU (16*(i*2))(pTbl), T1;\ |
| MOVOU T1, T2;\ |
| AESENC T0, B4;\ |
| AESENC T0, B5;\ |
| AESENC T0, B6;\ |
| AESENC T0, B7;\ |
| MOVOU (16*i)(SP), T0;\ |
| PCLMULQDQ $0x00, T0, T1;\ |
| PXOR T1, ACC0;\ |
| PSHUFD $78, T0, T1;\ |
| PCLMULQDQ $0x11, T0, T2;\ |
| PXOR T1, T0;\ |
| PXOR T2, ACC1;\ |
| MOVOU (16*(i*2+1))(pTbl), T2;\ |
| PCLMULQDQ $0x00, T2, T0;\ |
| PXOR T0, ACCM |
// mulRound(i): the hash multiply of combinedRound(i) without the AES round.
| #define mulRound(i) \ |
| MOVOU (16*i)(SP), T0;\ |
| MOVOU (16*(i*2))(pTbl), T1;\ |
| MOVOU T1, T2;\ |
| PCLMULQDQ $0x00, T0, T1;\ |
| PXOR T1, ACC0;\ |
| PCLMULQDQ $0x11, T0, T2;\ |
| PXOR T2, ACC1;\ |
| PSHUFD $78, T0, T1;\ |
| PXOR T1, T0;\ |
| MOVOU (16*(i*2+1))(pTbl), T1;\ |
| PCLMULQDQ $0x00, T0, T1;\ |
| PXOR T1, ACCM |
| |
| MOVQ productTable+0(FP), pTbl |
| MOVQ dst+8(FP), ctx |
| MOVQ src_base+32(FP), ptx |
| MOVQ src_len+40(FP), ptxLen |
| MOVQ ctr+56(FP), ctrPtr |
| MOVQ T+64(FP), tPtr |
| MOVQ ks_base+72(FP), ks |
| MOVQ ks_len+80(FP), NR |
| |
// NR = len(ks)/4 - 1. It is compared with 12 below to choose between the
// 10, 12 and 14 round exits.
| SHRQ $2, NR |
| DECQ NR |
| |
| MOVOU bswapMask<>(SB), BSWAP |
| MOVOU gcmPoly<>(SB), POLY |
| |
// ACC0 starts as the hash state stored in T.
| MOVOU (tPtr), ACC0 |
| PXOR ACC1, ACC1 |
| PXOR ACCM, ACCM |
// aluCTR holds the last word of the counter block and aluK the last word of
// round key 0, both byte-swapped so that ADDL increments the counter.
| MOVOU (ctrPtr), B0 |
| MOVL (3*4)(ctrPtr), aluCTR |
| MOVOU (ks), T0 |
| MOVL (3*4)(ks), aluK |
| BSWAPL aluCTR |
| BSWAPL aluK |
| |
// T0 = counter block XOR round key 0. Slot 0 receives it and its last word
// is immediately replaced by the incremented counter.
| PXOR B0, T0 |
| MOVOU T0, (8*16 + 0*16)(SP) |
| increment(0) |
| |
// Fewer than 128 bytes: skip the eight-block path.
| CMPQ ptxLen, $128 |
| JB gcmAesEncSingles |
| SUBQ $128, ptxLen |
| |
| // We have at least 8 blocks to encrypt, prepare the rest of the counters |
| MOVOU T0, (8*16 + 1*16)(SP) |
| increment(1) |
| MOVOU T0, (8*16 + 2*16)(SP) |
| increment(2) |
| MOVOU T0, (8*16 + 3*16)(SP) |
| increment(3) |
| MOVOU T0, (8*16 + 4*16)(SP) |
| increment(4) |
| MOVOU T0, (8*16 + 5*16)(SP) |
| increment(5) |
| MOVOU T0, (8*16 + 6*16)(SP) |
| increment(6) |
| MOVOU T0, (8*16 + 7*16)(SP) |
| increment(7) |
| |
| MOVOU (8*16 + 0*16)(SP), B0 |
| MOVOU (8*16 + 1*16)(SP), B1 |
| MOVOU (8*16 + 2*16)(SP), B2 |
| MOVOU (8*16 + 3*16)(SP), B3 |
| MOVOU (8*16 + 4*16)(SP), B4 |
| MOVOU (8*16 + 5*16)(SP), B5 |
| MOVOU (8*16 + 6*16)(SP), B6 |
| MOVOU (8*16 + 7*16)(SP), B7 |
| |
// First group: AES rounds only, since there is no earlier ciphertext to hash.
// The counter slots are refilled for the next group between the rounds.
| aesRound(1) |
| increment(0) |
| aesRound(2) |
| increment(1) |
| aesRound(3) |
| increment(2) |
| aesRound(4) |
| increment(3) |
| aesRound(5) |
| increment(4) |
| aesRound(6) |
| increment(5) |
| aesRound(7) |
| increment(6) |
| aesRound(8) |
| increment(7) |
| aesRound(9) |
| MOVOU (16*10)(ks), T0 |
// The flags set by this CMPQ are consumed by both JB and the later JE. Only
// AESENC and MOVOU execute in between.
| CMPQ NR, $12 |
| JB encLast1 |
| aesRnd(T0) |
| aesRound(11) |
| MOVOU (16*12)(ks), T0 |
| JE encLast1 |
| aesRnd(T0) |
| aesRound(13) |
| MOVOU (16*14)(ks), T0 |
| encLast1: |
| aesRndLast(T0) |
| |
// XOR the keystream with the plaintext.
| MOVOU (16*0)(ptx), T0 |
| PXOR T0, B0 |
| MOVOU (16*1)(ptx), T0 |
| PXOR T0, B1 |
| MOVOU (16*2)(ptx), T0 |
| PXOR T0, B2 |
| MOVOU (16*3)(ptx), T0 |
| PXOR T0, B3 |
| MOVOU (16*4)(ptx), T0 |
| PXOR T0, B4 |
| MOVOU (16*5)(ptx), T0 |
| PXOR T0, B5 |
| MOVOU (16*6)(ptx), T0 |
| PXOR T0, B6 |
| MOVOU (16*7)(ptx), T0 |
| PXOR T0, B7 |
| |
// Store the ciphertext, then byte-reverse the copies kept in registers. The
// running hash state is XORed into the first block only.
| MOVOU B0, (16*0)(ctx) |
| PSHUFB BSWAP, B0 |
| PXOR ACC0, B0 |
| MOVOU B1, (16*1)(ctx) |
| PSHUFB BSWAP, B1 |
| MOVOU B2, (16*2)(ctx) |
| PSHUFB BSWAP, B2 |
| MOVOU B3, (16*3)(ctx) |
| PSHUFB BSWAP, B3 |
| MOVOU B4, (16*4)(ctx) |
| PSHUFB BSWAP, B4 |
| MOVOU B5, (16*5)(ctx) |
| PSHUFB BSWAP, B5 |
| MOVOU B6, (16*6)(ctx) |
| PSHUFB BSWAP, B6 |
| MOVOU B7, (16*7)(ctx) |
| PSHUFB BSWAP, B7 |
| |
// Park the reversed ciphertext in the low half of the frame. It is hashed
// while the next group is being encrypted, or at gcmAesEncOctetsEnd.
| MOVOU B0, (16*0)(SP) |
| MOVOU B1, (16*1)(SP) |
| MOVOU B2, (16*2)(SP) |
| MOVOU B3, (16*3)(SP) |
| MOVOU B4, (16*4)(SP) |
| MOVOU B5, (16*5)(SP) |
| MOVOU B6, (16*6)(SP) |
| MOVOU B7, (16*7)(SP) |
| |
| LEAQ 128(ptx), ptx |
| LEAQ 128(ctx), ctx |
| |
// Steady state: encrypt eight new blocks while hashing the eight parked ones.
| gcmAesEncOctetsLoop: |
| |
| CMPQ ptxLen, $128 |
| JB gcmAesEncOctetsEnd |
| SUBQ $128, ptxLen |
| |
| MOVOU (8*16 + 0*16)(SP), B0 |
| MOVOU (8*16 + 1*16)(SP), B1 |
| MOVOU (8*16 + 2*16)(SP), B2 |
| MOVOU (8*16 + 3*16)(SP), B3 |
| MOVOU (8*16 + 4*16)(SP), B4 |
| MOVOU (8*16 + 5*16)(SP), B5 |
| MOVOU (8*16 + 6*16)(SP), B6 |
| MOVOU (8*16 + 7*16)(SP), B7 |
| |
// Parked block 0 times table entries 0 and 1 initialises ACC0, ACC1 and ACCM.
| MOVOU (16*0)(SP), T0 |
| PSHUFD $78, T0, T1 |
| PXOR T0, T1 |
| |
| MOVOU (16*0)(pTbl), ACC0 |
| MOVOU (16*1)(pTbl), ACCM |
| MOVOU ACC0, ACC1 |
| |
| PCLMULQDQ $0x00, T1, ACCM |
| PCLMULQDQ $0x00, T0, ACC0 |
| PCLMULQDQ $0x11, T0, ACC1 |
| |
| combinedRound(1) |
| increment(0) |
| combinedRound(2) |
| increment(1) |
| combinedRound(3) |
| increment(2) |
| combinedRound(4) |
| increment(3) |
| combinedRound(5) |
| increment(4) |
| combinedRound(6) |
| increment(5) |
| combinedRound(7) |
| increment(6) |
| |
| aesRound(8) |
| increment(7) |
| |
// Combine the partial products and reduce, with AES round 9 placed between
// the two folding steps.
| PXOR ACC0, ACCM |
| PXOR ACC1, ACCM |
| MOVOU ACCM, T0 |
| PSRLDQ $8, ACCM |
| PSLLDQ $8, T0 |
| PXOR ACCM, ACC1 |
| PXOR T0, ACC0 |
| |
| reduceRound(ACC0) |
| aesRound(9) |
| |
| reduceRound(ACC0) |
| PXOR ACC1, ACC0 |
| |
| MOVOU (16*10)(ks), T0 |
| CMPQ NR, $12 |
| JB encLast2 |
| aesRnd(T0) |
| aesRound(11) |
| MOVOU (16*12)(ks), T0 |
| JE encLast2 |
| aesRnd(T0) |
| aesRound(13) |
| MOVOU (16*14)(ks), T0 |
| encLast2: |
| aesRndLast(T0) |
| |
| MOVOU (16*0)(ptx), T0 |
| PXOR T0, B0 |
| MOVOU (16*1)(ptx), T0 |
| PXOR T0, B1 |
| MOVOU (16*2)(ptx), T0 |
| PXOR T0, B2 |
| MOVOU (16*3)(ptx), T0 |
| PXOR T0, B3 |
| MOVOU (16*4)(ptx), T0 |
| PXOR T0, B4 |
| MOVOU (16*5)(ptx), T0 |
| PXOR T0, B5 |
| MOVOU (16*6)(ptx), T0 |
| PXOR T0, B6 |
| MOVOU (16*7)(ptx), T0 |
| PXOR T0, B7 |
| |
| MOVOU B0, (16*0)(ctx) |
| PSHUFB BSWAP, B0 |
| PXOR ACC0, B0 |
| MOVOU B1, (16*1)(ctx) |
| PSHUFB BSWAP, B1 |
| MOVOU B2, (16*2)(ctx) |
| PSHUFB BSWAP, B2 |
| MOVOU B3, (16*3)(ctx) |
| PSHUFB BSWAP, B3 |
| MOVOU B4, (16*4)(ctx) |
| PSHUFB BSWAP, B4 |
| MOVOU B5, (16*5)(ctx) |
| PSHUFB BSWAP, B5 |
| MOVOU B6, (16*6)(ctx) |
| PSHUFB BSWAP, B6 |
| MOVOU B7, (16*7)(ctx) |
| PSHUFB BSWAP, B7 |
| |
| MOVOU B0, (16*0)(SP) |
| MOVOU B1, (16*1)(SP) |
| MOVOU B2, (16*2)(SP) |
| MOVOU B3, (16*3)(SP) |
| MOVOU B4, (16*4)(SP) |
| MOVOU B5, (16*5)(SP) |
| MOVOU B6, (16*6)(SP) |
| MOVOU B7, (16*7)(SP) |
| |
| LEAQ 128(ptx), ptx |
| LEAQ 128(ctx), ctx |
| |
| JMP gcmAesEncOctetsLoop |
| |
// Hash the last parked group. No further eight-block group follows.
| gcmAesEncOctetsEnd: |
| |
| MOVOU (16*0)(SP), T0 |
| MOVOU (16*0)(pTbl), ACC0 |
| MOVOU (16*1)(pTbl), ACCM |
| MOVOU ACC0, ACC1 |
| PSHUFD $78, T0, T1 |
| PXOR T0, T1 |
| PCLMULQDQ $0x00, T0, ACC0 |
| PCLMULQDQ $0x11, T0, ACC1 |
| PCLMULQDQ $0x00, T1, ACCM |
| |
| mulRound(1) |
| mulRound(2) |
| mulRound(3) |
| mulRound(4) |
| mulRound(5) |
| mulRound(6) |
| mulRound(7) |
| |
| PXOR ACC0, ACCM |
| PXOR ACC1, ACCM |
| MOVOU ACCM, T0 |
| PSRLDQ $8, ACCM |
| PSLLDQ $8, T0 |
| PXOR ACCM, ACC1 |
| PXOR T0, ACC0 |
| |
| reduceRound(ACC0) |
| reduceRound(ACC0) |
| PXOR ACC1, ACC0 |
| |
| TESTQ ptxLen, ptxLen |
| JE gcmAesEncDone |
| |
// Eight counters were prepared ahead. Rewind aluCTR by 7 so that it matches
// the counter stored in slot 0, which the single-block code continues from.
| SUBQ $7, aluCTR |
| |
// Single-block path: B1-B7 hold round keys 1-7 and T2 holds table entry 14.
| gcmAesEncSingles: |
| |
| MOVOU (16*1)(ks), B1 |
| MOVOU (16*2)(ks), B2 |
| MOVOU (16*3)(ks), B3 |
| MOVOU (16*4)(ks), B4 |
| MOVOU (16*5)(ks), B5 |
| MOVOU (16*6)(ks), B6 |
| MOVOU (16*7)(ks), B7 |
| |
| MOVOU (16*14)(pTbl), T2 |
| |
| gcmAesEncSinglesLoop: |
| |
| CMPQ ptxLen, $16 |
| JB gcmAesEncTail |
| SUBQ $16, ptxLen |
| |
// Take the prepared counter from slot 0 and prepare the next one in place.
| MOVOU (8*16 + 0*16)(SP), B0 |
| increment(0) |
| |
| AESENC B1, B0 |
| AESENC B2, B0 |
| AESENC B3, B0 |
| AESENC B4, B0 |
| AESENC B5, B0 |
| AESENC B6, B0 |
| AESENC B7, B0 |
| MOVOU (16*8)(ks), T0 |
| AESENC T0, B0 |
| MOVOU (16*9)(ks), T0 |
| AESENC T0, B0 |
| MOVOU (16*10)(ks), T0 |
| CMPQ NR, $12 |
| JB encLast3 |
| AESENC T0, B0 |
| MOVOU (16*11)(ks), T0 |
| AESENC T0, B0 |
| MOVOU (16*12)(ks), T0 |
| JE encLast3 |
| AESENC T0, B0 |
| MOVOU (16*13)(ks), T0 |
| AESENC T0, B0 |
| MOVOU (16*14)(ks), T0 |
| encLast3: |
| AESENCLAST T0, B0 |
| |
| MOVOU (ptx), T0 |
| PXOR T0, B0 |
| MOVOU B0, (ctx) |
| |
// Hash the ciphertext block just written.
| PSHUFB BSWAP, B0 |
| PXOR ACC0, B0 |
| |
| MOVOU T2, ACC0 |
| MOVOU T2, ACC1 |
| MOVOU (16*15)(pTbl), ACCM |
| |
| PSHUFD $78, B0, T0 |
| PXOR B0, T0 |
| PCLMULQDQ $0x00, B0, ACC0 |
| PCLMULQDQ $0x11, B0, ACC1 |
| PCLMULQDQ $0x00, T0, ACCM |
| |
| PXOR ACC0, ACCM |
| PXOR ACC1, ACCM |
| MOVOU ACCM, T0 |
| PSRLDQ $8, ACCM |
| PSLLDQ $8, T0 |
| PXOR ACCM, ACC1 |
| PXOR T0, ACC0 |
| |
| reduceRound(ACC0) |
| reduceRound(ACC0) |
| PXOR ACC1, ACC0 |
| |
| LEAQ (16*1)(ptx), ptx |
| LEAQ (16*1)(ctx), ctx |
| |
| JMP gcmAesEncSinglesLoop |
| |
// Partial last block (1 to 15 bytes). The counter is not advanced any more.
| gcmAesEncTail: |
| TESTQ ptxLen, ptxLen |
| JE gcmAesEncDone |
| |
| MOVOU (8*16 + 0*16)(SP), B0 |
| AESENC B1, B0 |
| AESENC B2, B0 |
| AESENC B3, B0 |
| AESENC B4, B0 |
| AESENC B5, B0 |
| AESENC B6, B0 |
| AESENC B7, B0 |
| MOVOU (16*8)(ks), T0 |
| AESENC T0, B0 |
| MOVOU (16*9)(ks), T0 |
| AESENC T0, B0 |
| MOVOU (16*10)(ks), T0 |
| CMPQ NR, $12 |
| JB encLast4 |
| AESENC T0, B0 |
| MOVOU (16*11)(ks), T0 |
| AESENC T0, B0 |
| MOVOU (16*12)(ks), T0 |
| JE encLast4 |
| AESENC T0, B0 |
| MOVOU (16*13)(ks), T0 |
| AESENC T0, B0 |
| MOVOU (16*14)(ks), T0 |
| encLast4: |
| AESENCLAST T0, B0 |
// Keep the keystream in T0. B0 is rebuilt from the plaintext bytes below.
| MOVOU B0, T0 |
| |
// Point ptx at the last plaintext byte so the bytes can be read back to front.
| LEAQ -1(ptx)(ptxLen*1), ptx |
| |
// T1 = andMask entry for ptxLen bytes. aluCTR is reused as the table base.
| MOVQ ptxLen, aluTMP |
| SHLQ $4, aluTMP |
| |
| LEAQ andMask<>(SB), aluCTR |
| MOVOU -16(aluCTR)(aluTMP*1), T1 |
| |
// Assemble the leftover bytes one at a time so that nothing past the end of
// src is read. The first leftover byte ends up in byte 0.
| PXOR B0, B0 |
| ptxLoadLoop: |
| PSLLDQ $1, B0 |
| PINSRB $0, (ptx), B0 |
| LEAQ -1(ptx), ptx |
| DECQ ptxLen |
| JNE ptxLoadLoop |
| |
// XOR with the keystream, then clear the bytes beyond the message length so
// that the hashed block is zero-padded. The store below writes all 16 bytes.
| PXOR T0, B0 |
| PAND T1, B0 |
| MOVOU B0, (ctx) // I assume there is always space, due to TAG in the end of the CT |
| |
| PSHUFB BSWAP, B0 |
| PXOR ACC0, B0 |
| |
| MOVOU T2, ACC0 |
| MOVOU T2, ACC1 |
| MOVOU (16*15)(pTbl), ACCM |
| |
| PSHUFD $78, B0, T0 |
| PXOR B0, T0 |
| PCLMULQDQ $0x00, B0, ACC0 |
| PCLMULQDQ $0x11, B0, ACC1 |
| PCLMULQDQ $0x00, T0, ACCM |
| |
| PXOR ACC0, ACCM |
| PXOR ACC1, ACCM |
| MOVOU ACCM, T0 |
| PSRLDQ $8, ACCM |
| PSLLDQ $8, T0 |
| PXOR ACCM, ACC1 |
| PXOR T0, ACC0 |
| |
| reduceRound(ACC0) |
| reduceRound(ACC0) |
| PXOR ACC1, ACC0 |
| |
// Write the updated hash state back to T.
| gcmAesEncDone: |
| MOVOU ACC0, (tPtr) |
| RET |
| #undef increment |
| |
| // func gcmAesDec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32) |
//
// Decrypts src into dst in counter mode and folds the ciphertext read from
// src into the hash state at T, which is read on entry and written back on
// exit. ctr is only read. The first keystream block uses ctr with its last
// 32-bit word incremented once.
//
// The register aliases and the aesRnd, aesRound, aesRndLast and reduceRound
// macros defined earlier in the file are reused. Note that in this function
// ctx points at src (ciphertext in) and ptx at dst (plaintext out).
//
// Frame layout (128 bytes): eight prepared counter blocks at (i*16)(SP),
// already XORed with round key 0. The ciphertext is hashed straight from
// src, so no staging area is needed.
| TEXT ·gcmAesDec(SB),0,$128-96 |
// increment(i): same as in gcmAesEnc, but counter slot i lives at (i*16)(SP).
| #define increment(i) ADDL $1, aluCTR; MOVL aluCTR, aluTMP; XORL aluK, aluTMP; BSWAPL aluTMP; MOVL aluTMP, (3*4 + i*16)(SP) |
// combinedDecRound(i): AES round i on B0-B7 interleaved with the hash
// multiply of ciphertext block i, which is loaded from (16*i)(ctx) and
// byte-reversed, by table entries 2*i and 2*i+1. T0, T1 and T2 are clobbered.
| #define combinedDecRound(i) \ |
| MOVOU (16*i)(ks), T0;\ |
| AESENC T0, B0;\ |
| AESENC T0, B1;\ |
| AESENC T0, B2;\ |
| AESENC T0, B3;\ |
| MOVOU (16*(i*2))(pTbl), T1;\ |
| MOVOU T1, T2;\ |
| AESENC T0, B4;\ |
| AESENC T0, B5;\ |
| AESENC T0, B6;\ |
| AESENC T0, B7;\ |
| MOVOU (16*i)(ctx), T0;\ |
| PSHUFB BSWAP, T0;\ |
| PCLMULQDQ $0x00, T0, T1;\ |
| PXOR T1, ACC0;\ |
| PSHUFD $78, T0, T1;\ |
| PCLMULQDQ $0x11, T0, T2;\ |
| PXOR T1, T0;\ |
| PXOR T2, ACC1;\ |
| MOVOU (16*(i*2+1))(pTbl), T2;\ |
| PCLMULQDQ $0x00, T2, T0;\ |
| PXOR T0, ACCM |
| |
| MOVQ productTable+0(FP), pTbl |
| MOVQ dst+8(FP), ptx |
| MOVQ src_base+32(FP), ctx |
| MOVQ src_len+40(FP), ptxLen |
| MOVQ ctr+56(FP), ctrPtr |
| MOVQ T+64(FP), tPtr |
| MOVQ ks_base+72(FP), ks |
| MOVQ ks_len+80(FP), NR |
| |
// NR = len(ks)/4 - 1. It is compared with 12 below to choose between the
// 10, 12 and 14 round exits.
| SHRQ $2, NR |
| DECQ NR |
| |
| MOVOU bswapMask<>(SB), BSWAP |
| MOVOU gcmPoly<>(SB), POLY |
| |
// ACC0 starts as the hash state stored in T.
| MOVOU (tPtr), ACC0 |
| PXOR ACC1, ACC1 |
| PXOR ACCM, ACCM |
// aluCTR holds the last word of the counter block and aluK the last word of
// round key 0, both byte-swapped so that ADDL increments the counter.
| MOVOU (ctrPtr), B0 |
| MOVL (3*4)(ctrPtr), aluCTR |
| MOVOU (ks), T0 |
| MOVL (3*4)(ks), aluK |
| BSWAPL aluCTR |
| BSWAPL aluK |
| |
// T0 = counter block XOR round key 0. Slot 0 receives it and its last word
// is immediately replaced by the incremented counter.
| PXOR B0, T0 |
| MOVOU T0, (0*16)(SP) |
| increment(0) |
| |
// Fewer than 128 bytes: skip the eight-block path.
| CMPQ ptxLen, $128 |
| JB gcmAesDecSingles |
| |
// Prepare the remaining seven counter slots.
| MOVOU T0, (1*16)(SP) |
| increment(1) |
| MOVOU T0, (2*16)(SP) |
| increment(2) |
| MOVOU T0, (3*16)(SP) |
| increment(3) |
| MOVOU T0, (4*16)(SP) |
| increment(4) |
| MOVOU T0, (5*16)(SP) |
| increment(5) |
| MOVOU T0, (6*16)(SP) |
| increment(6) |
| MOVOU T0, (7*16)(SP) |
| increment(7) |
| |
// Eight blocks per iteration: hash the ciphertext of the group while its
// keystream is being computed, then XOR and store the plaintext.
| gcmAesDecOctetsLoop: |
| |
| CMPQ ptxLen, $128 |
| JB gcmAesDecEndOctets |
| SUBQ $128, ptxLen |
| |
| MOVOU (0*16)(SP), B0 |
| MOVOU (1*16)(SP), B1 |
| MOVOU (2*16)(SP), B2 |
| MOVOU (3*16)(SP), B3 |
| MOVOU (4*16)(SP), B4 |
| MOVOU (5*16)(SP), B5 |
| MOVOU (6*16)(SP), B6 |
| MOVOU (7*16)(SP), B7 |
| |
// Ciphertext block 0, byte-reversed and XORed with the running state, times
// table entries 0 and 1 initialises ACC0, ACC1 and ACCM.
| MOVOU (16*0)(ctx), T0 |
| PSHUFB BSWAP, T0 |
| PXOR ACC0, T0 |
| PSHUFD $78, T0, T1 |
| PXOR T0, T1 |
| |
| MOVOU (16*0)(pTbl), ACC0 |
| MOVOU (16*1)(pTbl), ACCM |
| MOVOU ACC0, ACC1 |
| |
| PCLMULQDQ $0x00, T1, ACCM |
| PCLMULQDQ $0x00, T0, ACC0 |
| PCLMULQDQ $0x11, T0, ACC1 |
| |
| combinedDecRound(1) |
| increment(0) |
| combinedDecRound(2) |
| increment(1) |
| combinedDecRound(3) |
| increment(2) |
| combinedDecRound(4) |
| increment(3) |
| combinedDecRound(5) |
| increment(4) |
| combinedDecRound(6) |
| increment(5) |
| combinedDecRound(7) |
| increment(6) |
| |
| aesRound(8) |
| increment(7) |
| |
// Combine the partial products and reduce, with AES round 9 placed between
// the two folding steps.
| PXOR ACC0, ACCM |
| PXOR ACC1, ACCM |
| MOVOU ACCM, T0 |
| PSRLDQ $8, ACCM |
| PSLLDQ $8, T0 |
| PXOR ACCM, ACC1 |
| PXOR T0, ACC0 |
| |
| reduceRound(ACC0) |
| aesRound(9) |
| |
| reduceRound(ACC0) |
| PXOR ACC1, ACC0 |
| |
| MOVOU (16*10)(ks), T0 |
// The flags set by this CMPQ are consumed by both JB and the later JE. Only
// AESENC and MOVOU execute in between.
| CMPQ NR, $12 |
| JB decLast1 |
| aesRnd(T0) |
| aesRound(11) |
| MOVOU (16*12)(ks), T0 |
| JE decLast1 |
| aesRnd(T0) |
| aesRound(13) |
| MOVOU (16*14)(ks), T0 |
| decLast1: |
| aesRndLast(T0) |
| |
// XOR the keystream with the ciphertext and store the plaintext.
| MOVOU (16*0)(ctx), T0 |
| PXOR T0, B0 |
| MOVOU (16*1)(ctx), T0 |
| PXOR T0, B1 |
| MOVOU (16*2)(ctx), T0 |
| PXOR T0, B2 |
| MOVOU (16*3)(ctx), T0 |
| PXOR T0, B3 |
| MOVOU (16*4)(ctx), T0 |
| PXOR T0, B4 |
| MOVOU (16*5)(ctx), T0 |
| PXOR T0, B5 |
| MOVOU (16*6)(ctx), T0 |
| PXOR T0, B6 |
| MOVOU (16*7)(ctx), T0 |
| PXOR T0, B7 |
| |
| MOVOU B0, (16*0)(ptx) |
| MOVOU B1, (16*1)(ptx) |
| MOVOU B2, (16*2)(ptx) |
| MOVOU B3, (16*3)(ptx) |
| MOVOU B4, (16*4)(ptx) |
| MOVOU B5, (16*5)(ptx) |
| MOVOU B6, (16*6)(ptx) |
| MOVOU B7, (16*7)(ptx) |
| |
| LEAQ 128(ptx), ptx |
| LEAQ 128(ctx), ctx |
| |
| JMP gcmAesDecOctetsLoop |
| |
| gcmAesDecEndOctets: |
| |
// Eight counters were prepared ahead. Rewind aluCTR by 7 so that it matches
// the counter stored in slot 0, which the single-block code continues from.
| SUBQ $7, aluCTR |
| |
// Single-block path: B1-B7 hold round keys 1-7 and T2 holds table entry 14.
| gcmAesDecSingles: |
| |
| MOVOU (16*1)(ks), B1 |
| MOVOU (16*2)(ks), B2 |
| MOVOU (16*3)(ks), B3 |
| MOVOU (16*4)(ks), B4 |
| MOVOU (16*5)(ks), B5 |
| MOVOU (16*6)(ks), B6 |
| MOVOU (16*7)(ks), B7 |
| |
| MOVOU (16*14)(pTbl), T2 |
| |
| gcmAesDecSinglesLoop: |
| |
| CMPQ ptxLen, $16 |
| JB gcmAesDecTail |
| SUBQ $16, ptxLen |
| |
// Hash the ciphertext block first. T1 keeps an unmodified copy for the XOR
// with the keystream below.
| MOVOU (ctx), B0 |
| MOVOU B0, T1 |
| PSHUFB BSWAP, B0 |
| PXOR ACC0, B0 |
| |
| MOVOU T2, ACC0 |
| MOVOU T2, ACC1 |
| MOVOU (16*15)(pTbl), ACCM |
| |
| PCLMULQDQ $0x00, B0, ACC0 |
| PCLMULQDQ $0x11, B0, ACC1 |
| PSHUFD $78, B0, T0 |
| PXOR B0, T0 |
| PCLMULQDQ $0x00, T0, ACCM |
| |
| PXOR ACC0, ACCM |
| PXOR ACC1, ACCM |
| MOVOU ACCM, T0 |
| PSRLDQ $8, ACCM |
| PSLLDQ $8, T0 |
| PXOR ACCM, ACC1 |
| PXOR T0, ACC0 |
| |
| reduceRound(ACC0) |
| reduceRound(ACC0) |
| PXOR ACC1, ACC0 |
| |
// Take the prepared counter from slot 0 and prepare the next one in place.
| MOVOU (0*16)(SP), B0 |
| increment(0) |
| AESENC B1, B0 |
| AESENC B2, B0 |
| AESENC B3, B0 |
| AESENC B4, B0 |
| AESENC B5, B0 |
| AESENC B6, B0 |
| AESENC B7, B0 |
| MOVOU (16*8)(ks), T0 |
| AESENC T0, B0 |
| MOVOU (16*9)(ks), T0 |
| AESENC T0, B0 |
| MOVOU (16*10)(ks), T0 |
| CMPQ NR, $12 |
| JB decLast2 |
| AESENC T0, B0 |
| MOVOU (16*11)(ks), T0 |
| AESENC T0, B0 |
| MOVOU (16*12)(ks), T0 |
| JE decLast2 |
| AESENC T0, B0 |
| MOVOU (16*13)(ks), T0 |
| AESENC T0, B0 |
| MOVOU (16*14)(ks), T0 |
| decLast2: |
| AESENCLAST T0, B0 |
| |
| PXOR T1, B0 |
| MOVOU B0, (ptx) |
| |
| LEAQ (16*1)(ptx), ptx |
| LEAQ (16*1)(ctx), ctx |
| |
| JMP gcmAesDecSinglesLoop |
| |
// Partial last block (1 to 15 bytes).
| gcmAesDecTail: |
| |
| TESTQ ptxLen, ptxLen |
| JE gcmAesDecDone |
| |
// T1 = andMask entry for ptxLen bytes. aluCTR is reused as the table base,
// so it no longer holds the counter from here on.
| MOVQ ptxLen, aluTMP |
| SHLQ $4, aluTMP |
| LEAQ andMask<>(SB), aluCTR |
| MOVOU -16(aluCTR)(aluTMP*1), T1 |
| |
// Full 16-byte load, masked down to the leftover bytes so that the hashed
// block is zero-padded.
| MOVOU (ctx), B0 // I assume there is TAG attached to the ctx, and there is no read overflow |
| PAND T1, B0 |
| |
// T1 now keeps the masked ciphertext for the XOR with the keystream below.
| MOVOU B0, T1 |
| PSHUFB BSWAP, B0 |
| PXOR ACC0, B0 |
| |
| MOVOU (16*14)(pTbl), ACC0 |
| MOVOU (16*15)(pTbl), ACCM |
| MOVOU ACC0, ACC1 |
| |
| PCLMULQDQ $0x00, B0, ACC0 |
| PCLMULQDQ $0x11, B0, ACC1 |
| PSHUFD $78, B0, T0 |
| PXOR B0, T0 |
| PCLMULQDQ $0x00, T0, ACCM |
| |
| PXOR ACC0, ACCM |
| PXOR ACC1, ACCM |
| MOVOU ACCM, T0 |
| PSRLDQ $8, ACCM |
| PSLLDQ $8, T0 |
| PXOR ACCM, ACC1 |
| PXOR T0, ACC0 |
| |
| reduceRound(ACC0) |
| reduceRound(ACC0) |
| PXOR ACC1, ACC0 |
| |
// NOTE(review): aluCTR holds the andMask address at this point, so the
// increment below stores a meaningless value in slot 0. B0 has already been
// loaded and slot 0 is not read again, so the output is unaffected.
| MOVOU (0*16)(SP), B0 |
| increment(0) |
| AESENC B1, B0 |
| AESENC B2, B0 |
| AESENC B3, B0 |
| AESENC B4, B0 |
| AESENC B5, B0 |
| AESENC B6, B0 |
| AESENC B7, B0 |
| MOVOU (16*8)(ks), T0 |
| AESENC T0, B0 |
| MOVOU (16*9)(ks), T0 |
| AESENC T0, B0 |
| MOVOU (16*10)(ks), T0 |
| CMPQ NR, $12 |
| JB decLast3 |
| AESENC T0, B0 |
| MOVOU (16*11)(ks), T0 |
| AESENC T0, B0 |
| MOVOU (16*12)(ks), T0 |
| JE decLast3 |
| AESENC T0, B0 |
| MOVOU (16*13)(ks), T0 |
| AESENC T0, B0 |
| MOVOU (16*14)(ks), T0 |
| decLast3: |
| AESENCLAST T0, B0 |
| PXOR T1, B0 |
| |
// Store exactly ptxLen plaintext bytes, lowest byte first, so that nothing
// past the end of dst is written.
| ptxStoreLoop: |
| PEXTRB $0, B0, (ptx) |
| PSRLDQ $1, B0 |
| LEAQ 1(ptx), ptx |
| DECQ ptxLen |
| |
| JNE ptxStoreLoop |
| |
// Write the updated hash state back to T.
| gcmAesDecDone: |
| |
| MOVOU ACC0, (tPtr) |
| RET |