| // Copyright 2015 The Go Authors. All rights reserved. | 
 | // Use of this source code is governed by a BSD-style | 
 | // license that can be found in the LICENSE file. | 
 |  | 
| // This is an optimized implementation of AES-GCM using AES-NI and CLMUL-NI. | 
| // The implementation uses some of the optimizations described in: | 
 | // [1] Gueron, S., Kounavis, M.E.: Intel® Carry-Less Multiplication | 
 | //     Instruction and its Usage for Computing the GCM Mode rev. 2.02 | 
 | // [2] Gueron, S., Krasnov, V.: Speeding up Counter Mode in Software and | 
 | //     Hardware | 
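| // | 
| // All GHASH arithmetic below is performed on byte-reversed blocks: inputs are | 
| // passed through PSHUFB with bswapMask on the way in, and the running hash | 
| // state kept in *T stays in that reversed form between calls; gcmAesFinish | 
| // swaps it back when producing the tag. The bulk loops process eight blocks | 
| // per iteration, accumulating eight Karatsuba products and reducing only once | 
| // per batch, while the CTR counter blocks are kept on the stack already XORed | 
| // with round key 0. | 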
 |  | 
 | #include "textflag.h" | 
 |  | 
 | #define B0 X0 | 
 | #define B1 X1 | 
 | #define B2 X2 | 
 | #define B3 X3 | 
 | #define B4 X4 | 
 | #define B5 X5 | 
 | #define B6 X6 | 
 | #define B7 X7 | 
 |  | 
 | #define ACC0 X8 | 
 | #define ACC1 X9 | 
 | #define ACCM X10 | 
 |  | 
 | #define T0 X11 | 
 | #define T1 X12 | 
 | #define T2 X13 | 
 | #define POLY X14 | 
 | #define BSWAP X15 | 
 |  | 
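| // bswapMask is the PSHUFB shuffle control that reverses the byte order of a | 
| // 128-bit block. | 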
 | DATA bswapMask<>+0x00(SB)/8, $0x08090a0b0c0d0e0f | 
 | DATA bswapMask<>+0x08(SB)/8, $0x0001020304050607 | 
 |  | 
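| // gcmPoly is the GHASH reduction constant 0xc2000000000000000000000000000001, | 
| // used when doubling the hash key and in the per-batch reduction (reduceRound). | 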
 | DATA gcmPoly<>+0x00(SB)/8, $0x0000000000000001 | 
 | DATA gcmPoly<>+0x08(SB)/8, $0xc200000000000000 | 
 |  | 
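| // andMask entry i-1 (at offset 16*(i-1), 1 <= i <= 15) keeps the low i bytes | 
| // of a block; it is used to mask the final partial block of a message. | 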
 | DATA andMask<>+0x00(SB)/8, $0x00000000000000ff | 
 | DATA andMask<>+0x08(SB)/8, $0x0000000000000000 | 
 | DATA andMask<>+0x10(SB)/8, $0x000000000000ffff | 
 | DATA andMask<>+0x18(SB)/8, $0x0000000000000000 | 
 | DATA andMask<>+0x20(SB)/8, $0x0000000000ffffff | 
 | DATA andMask<>+0x28(SB)/8, $0x0000000000000000 | 
 | DATA andMask<>+0x30(SB)/8, $0x00000000ffffffff | 
 | DATA andMask<>+0x38(SB)/8, $0x0000000000000000 | 
 | DATA andMask<>+0x40(SB)/8, $0x000000ffffffffff | 
 | DATA andMask<>+0x48(SB)/8, $0x0000000000000000 | 
 | DATA andMask<>+0x50(SB)/8, $0x0000ffffffffffff | 
 | DATA andMask<>+0x58(SB)/8, $0x0000000000000000 | 
 | DATA andMask<>+0x60(SB)/8, $0x00ffffffffffffff | 
 | DATA andMask<>+0x68(SB)/8, $0x0000000000000000 | 
 | DATA andMask<>+0x70(SB)/8, $0xffffffffffffffff | 
 | DATA andMask<>+0x78(SB)/8, $0x0000000000000000 | 
 | DATA andMask<>+0x80(SB)/8, $0xffffffffffffffff | 
 | DATA andMask<>+0x88(SB)/8, $0x00000000000000ff | 
 | DATA andMask<>+0x90(SB)/8, $0xffffffffffffffff | 
 | DATA andMask<>+0x98(SB)/8, $0x000000000000ffff | 
 | DATA andMask<>+0xa0(SB)/8, $0xffffffffffffffff | 
 | DATA andMask<>+0xa8(SB)/8, $0x0000000000ffffff | 
 | DATA andMask<>+0xb0(SB)/8, $0xffffffffffffffff | 
 | DATA andMask<>+0xb8(SB)/8, $0x00000000ffffffff | 
 | DATA andMask<>+0xc0(SB)/8, $0xffffffffffffffff | 
 | DATA andMask<>+0xc8(SB)/8, $0x000000ffffffffff | 
 | DATA andMask<>+0xd0(SB)/8, $0xffffffffffffffff | 
 | DATA andMask<>+0xd8(SB)/8, $0x0000ffffffffffff | 
 | DATA andMask<>+0xe0(SB)/8, $0xffffffffffffffff | 
 | DATA andMask<>+0xe8(SB)/8, $0x00ffffffffffffff | 
 |  | 
 | GLOBL bswapMask<>(SB), (NOPTR+RODATA), $16 | 
 | GLOBL gcmPoly<>(SB), (NOPTR+RODATA), $16 | 
 | GLOBL andMask<>(SB), (NOPTR+RODATA), $240 | 
 |  | 
 | // func gcmAesFinish(productTable *[256]byte, tagMask, T *[16]byte, pLen, dLen uint64) | 
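| // gcmAesFinish folds the bit lengths of the ciphertext (pLen) and the additional | 
| // data (dLen) into the running GHASH state in *T, performs a final multiplication | 
| // by H with reduction, byte-swaps the result back and XORs it with *tagMask to | 
| // produce the authentication tag, which is written back to *T. | 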
 | TEXT ·gcmAesFinish(SB),NOSPLIT,$0 | 
 | #define pTbl DI | 
 | #define tMsk SI | 
 | #define tPtr DX | 
 | #define plen AX | 
 | #define dlen CX | 
 |  | 
 | 	MOVQ productTable+0(FP), pTbl | 
 | 	MOVQ tagMask+8(FP), tMsk | 
 | 	MOVQ T+16(FP), tPtr | 
 | 	MOVQ pLen+24(FP), plen | 
 | 	MOVQ dLen+32(FP), dlen | 
 |  | 
 | 	MOVOU (tPtr), ACC0 | 
 | 	MOVOU (tMsk), T2 | 
 |  | 
 | 	MOVOU bswapMask<>(SB), BSWAP | 
 | 	MOVOU gcmPoly<>(SB), POLY | 
 |  | 
 | 	SHLQ $3, plen | 
 | 	SHLQ $3, dlen | 
 |  | 
 | 	MOVQ plen, B0 | 
 | 	PINSRQ $1, dlen, B0 | 
 |  | 
 | 	PXOR ACC0, B0 | 
 |  | 
 | 	MOVOU (16*14)(pTbl), ACC0 | 
 | 	MOVOU (16*15)(pTbl), ACCM | 
 | 	MOVOU ACC0, ACC1 | 
 |  | 
 | 	PCLMULQDQ $0x00, B0, ACC0 | 
 | 	PCLMULQDQ $0x11, B0, ACC1 | 
 | 	PSHUFD $78, B0, T0 | 
 | 	PXOR B0, T0 | 
 | 	PCLMULQDQ $0x00, T0, ACCM | 
 |  | 
 | 	PXOR ACC0, ACCM | 
 | 	PXOR ACC1, ACCM | 
 | 	MOVOU ACCM, T0 | 
 | 	PSRLDQ $8, ACCM | 
 | 	PSLLDQ $8, T0 | 
 | 	PXOR ACCM, ACC1 | 
 | 	PXOR T0, ACC0 | 
 |  | 
 | 	MOVOU POLY, T0 | 
 | 	PCLMULQDQ $0x01, ACC0, T0 | 
 | 	PSHUFD $78, ACC0, ACC0 | 
 | 	PXOR T0, ACC0 | 
 |  | 
 | 	MOVOU POLY, T0 | 
 | 	PCLMULQDQ $0x01, ACC0, T0 | 
 | 	PSHUFD $78, ACC0, ACC0 | 
 | 	PXOR T0, ACC0 | 
 |  | 
 | 	PXOR ACC1, ACC0 | 
 |  | 
 | 	PSHUFB BSWAP, ACC0 | 
 | 	PXOR T2, ACC0 | 
 | 	MOVOU ACC0, (tPtr) | 
 |  | 
 | 	RET | 
 | #undef pTbl | 
 | #undef tMsk | 
 | #undef tPtr | 
 | #undef plen | 
 | #undef dlen | 
 |  | 
 | // func gcmAesInit(productTable *[256]byte, ks []uint32) | 
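| // gcmAesInit derives the hash key H by encrypting the all-zero block with the | 
| // expanded key ks (len(ks)/4 - 1 gives the number of AES rounds), doubles it | 
| // (the H*2 step below), and fills productTable with H^1..H^8 plus their | 
| // Karatsuba pre-computations (high half XOR low half), stored from H^8 at | 
| // offset 0 down to H at offset 16*14. | 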
 | TEXT ·gcmAesInit(SB),NOSPLIT,$0 | 
 | #define dst DI | 
 | #define KS SI | 
 | #define NR DX | 
 |  | 
 | 	MOVQ productTable+0(FP), dst | 
 | 	MOVQ ks_base+8(FP), KS | 
 | 	MOVQ ks_len+16(FP), NR | 
 |  | 
 | 	SHRQ $2, NR | 
 | 	DECQ NR | 
 |  | 
 | 	MOVOU bswapMask<>(SB), BSWAP | 
 | 	MOVOU gcmPoly<>(SB), POLY | 
 |  | 
| 	// Encrypt an all-zero block with the AES key to generate the hash key H | 
 | 	MOVOU (16*0)(KS), B0 | 
 | 	MOVOU (16*1)(KS), T0 | 
 | 	AESENC T0, B0 | 
 | 	MOVOU (16*2)(KS), T0 | 
 | 	AESENC T0, B0 | 
 | 	MOVOU (16*3)(KS), T0 | 
 | 	AESENC T0, B0 | 
 | 	MOVOU (16*4)(KS), T0 | 
 | 	AESENC T0, B0 | 
 | 	MOVOU (16*5)(KS), T0 | 
 | 	AESENC T0, B0 | 
 | 	MOVOU (16*6)(KS), T0 | 
 | 	AESENC T0, B0 | 
 | 	MOVOU (16*7)(KS), T0 | 
 | 	AESENC T0, B0 | 
 | 	MOVOU (16*8)(KS), T0 | 
 | 	AESENC T0, B0 | 
 | 	MOVOU (16*9)(KS), T0 | 
 | 	AESENC T0, B0 | 
 | 	MOVOU (16*10)(KS), T0 | 
 | 	CMPQ NR, $12 | 
 | 	JB initEncLast | 
 | 	AESENC T0, B0 | 
 | 	MOVOU (16*11)(KS), T0 | 
 | 	AESENC T0, B0 | 
 | 	MOVOU (16*12)(KS), T0 | 
 | 	JE initEncLast | 
 | 	AESENC T0, B0 | 
 | 	MOVOU (16*13)(KS), T0 | 
 | 	AESENC T0, B0 | 
 | 	MOVOU (16*14)(KS), T0 | 
 | initEncLast: | 
 | 	AESENCLAST T0, B0 | 
 |  | 
 | 	PSHUFB BSWAP, B0 | 
| 	// H * 2: multiply H by x in GF(2^128) (shift left by one bit and reduce) | 
 | 	PSHUFD $0xff, B0, T0 | 
 | 	MOVOU B0, T1 | 
 | 	PSRAL $31, T0 | 
 | 	PAND POLY, T0 | 
 | 	PSRLL $31, T1 | 
 | 	PSLLDQ $4, T1 | 
 | 	PSLLL $1, B0 | 
 | 	PXOR T0, B0 | 
 | 	PXOR T1, B0 | 
 | 	// Karatsuba pre-computations | 
 | 	MOVOU B0, (16*14)(dst) | 
 | 	PSHUFD $78, B0, B1 | 
 | 	PXOR B0, B1 | 
 | 	MOVOU B1, (16*15)(dst) | 
 |  | 
 | 	MOVOU B0, B2 | 
 | 	MOVOU B1, B3 | 
 | 	// Now prepare powers of H and pre-computations for them | 
 | 	MOVQ $7, AX | 
 |  | 
 | initLoop: | 
 | 		MOVOU B2, T0 | 
 | 		MOVOU B2, T1 | 
 | 		MOVOU B3, T2 | 
 | 		PCLMULQDQ $0x00, B0, T0 | 
 | 		PCLMULQDQ $0x11, B0, T1 | 
 | 		PCLMULQDQ $0x00, B1, T2 | 
 |  | 
 | 		PXOR T0, T2 | 
 | 		PXOR T1, T2 | 
 | 		MOVOU T2, B4 | 
 | 		PSLLDQ $8, B4 | 
 | 		PSRLDQ $8, T2 | 
 | 		PXOR B4, T0 | 
 | 		PXOR T2, T1 | 
 |  | 
 | 		MOVOU POLY, B2 | 
 | 		PCLMULQDQ $0x01, T0, B2 | 
 | 		PSHUFD $78, T0, T0 | 
 | 		PXOR B2, T0 | 
 | 		MOVOU POLY, B2 | 
 | 		PCLMULQDQ $0x01, T0, B2 | 
 | 		PSHUFD $78, T0, T0 | 
 | 		PXOR T0, B2 | 
 | 		PXOR T1, B2 | 
 |  | 
 | 		MOVOU B2, (16*12)(dst) | 
 | 		PSHUFD $78, B2, B3 | 
 | 		PXOR B2, B3 | 
 | 		MOVOU B3, (16*13)(dst) | 
 |  | 
 | 		DECQ AX | 
 | 		LEAQ (-16*2)(dst), dst | 
 | 	JNE initLoop | 
 |  | 
 | 	RET | 
 | #undef NR | 
 | #undef KS | 
 | #undef dst | 
 |  | 
 | // func gcmAesData(productTable *[256]byte, data []byte, T *[16]byte) | 
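| // gcmAesData hashes the additional authenticated data into *T: eight blocks per | 
| // iteration while at least 128 bytes remain, then single 16-byte blocks, and | 
| // finally a byte-by-byte tail. A 13-byte input, the AAD length used by TLS | 
| // records, takes a dedicated fast path. | 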
 | TEXT ·gcmAesData(SB),NOSPLIT,$0 | 
 | #define pTbl DI | 
 | #define aut SI | 
 | #define tPtr CX | 
 | #define autLen DX | 
 |  | 
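| // reduceRound performs one of the two folding steps of the fast reduction modulo | 
| // the GCM polynomial; it is always applied twice before the high half (ACC1) is | 
| // folded in. mulRoundAAD multiplies the byte-swapped block X by the matching | 
| // pre-computed power of H and accumulates the Karatsuba partial products into | 
| // ACC0, ACC1 and ACCM. | 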
 | #define reduceRound(a) 	MOVOU POLY, T0;	PCLMULQDQ $0x01, a, T0; PSHUFD $78, a, a; PXOR T0, a | 
| #define mulRoundAAD(X, i) \ | 
 | 	MOVOU (16*(i*2))(pTbl), T1;\ | 
 | 	MOVOU T1, T2;\ | 
 | 	PCLMULQDQ $0x00, X, T1;\ | 
 | 	PXOR T1, ACC0;\ | 
 | 	PCLMULQDQ $0x11, X, T2;\ | 
 | 	PXOR T2, ACC1;\ | 
 | 	PSHUFD $78, X, T1;\ | 
 | 	PXOR T1, X;\ | 
 | 	MOVOU (16*(i*2+1))(pTbl), T1;\ | 
 | 	PCLMULQDQ $0x00, X, T1;\ | 
 | 	PXOR T1, ACCM | 
 |  | 
 | 	MOVQ productTable+0(FP), pTbl | 
 | 	MOVQ data_base+8(FP), aut | 
 | 	MOVQ data_len+16(FP), autLen | 
 | 	MOVQ T+32(FP), tPtr | 
 |  | 
 | 	PXOR ACC0, ACC0 | 
 | 	MOVOU bswapMask<>(SB), BSWAP | 
 | 	MOVOU gcmPoly<>(SB), POLY | 
 |  | 
 | 	TESTQ autLen, autLen | 
 | 	JEQ dataBail | 
 |  | 
| 	CMPQ autLen, $13	// fast path for the 13-byte AAD of a TLS record | 
 | 	JE dataTLS | 
 | 	CMPQ autLen, $128 | 
 | 	JB startSinglesLoop | 
 | 	JMP dataOctaLoop | 
 |  | 
 | dataTLS: | 
 | 	MOVOU (16*14)(pTbl), T1 | 
 | 	MOVOU (16*15)(pTbl), T2 | 
 | 	PXOR B0, B0 | 
 | 	MOVQ (aut), B0 | 
 | 	PINSRD $2, 8(aut), B0 | 
 | 	PINSRB $12, 12(aut), B0 | 
 | 	XORQ autLen, autLen | 
 | 	JMP dataMul | 
 |  | 
 | dataOctaLoop: | 
 | 		CMPQ autLen, $128 | 
 | 		JB startSinglesLoop | 
 | 		SUBQ $128, autLen | 
 |  | 
 | 		MOVOU (16*0)(aut), X0 | 
 | 		MOVOU (16*1)(aut), X1 | 
 | 		MOVOU (16*2)(aut), X2 | 
 | 		MOVOU (16*3)(aut), X3 | 
 | 		MOVOU (16*4)(aut), X4 | 
 | 		MOVOU (16*5)(aut), X5 | 
 | 		MOVOU (16*6)(aut), X6 | 
 | 		MOVOU (16*7)(aut), X7 | 
 | 		LEAQ (16*8)(aut), aut | 
 | 		PSHUFB BSWAP, X0 | 
 | 		PSHUFB BSWAP, X1 | 
 | 		PSHUFB BSWAP, X2 | 
 | 		PSHUFB BSWAP, X3 | 
 | 		PSHUFB BSWAP, X4 | 
 | 		PSHUFB BSWAP, X5 | 
 | 		PSHUFB BSWAP, X6 | 
 | 		PSHUFB BSWAP, X7 | 
 | 		PXOR ACC0, X0 | 
 |  | 
 | 		MOVOU (16*0)(pTbl), ACC0 | 
 | 		MOVOU (16*1)(pTbl), ACCM | 
 | 		MOVOU ACC0, ACC1 | 
 | 		PSHUFD $78, X0, T1 | 
 | 		PXOR X0, T1 | 
 | 		PCLMULQDQ $0x00, X0, ACC0 | 
 | 		PCLMULQDQ $0x11, X0, ACC1 | 
 | 		PCLMULQDQ $0x00, T1, ACCM | 
 |  | 
 | 		mulRoundAAD(X1, 1) | 
 | 		mulRoundAAD(X2, 2) | 
 | 		mulRoundAAD(X3, 3) | 
 | 		mulRoundAAD(X4, 4) | 
 | 		mulRoundAAD(X5, 5) | 
 | 		mulRoundAAD(X6, 6) | 
 | 		mulRoundAAD(X7, 7) | 
 |  | 
 | 		PXOR ACC0, ACCM | 
 | 		PXOR ACC1, ACCM | 
 | 		MOVOU ACCM, T0 | 
 | 		PSRLDQ $8, ACCM | 
 | 		PSLLDQ $8, T0 | 
 | 		PXOR ACCM, ACC1 | 
 | 		PXOR T0, ACC0 | 
 | 		reduceRound(ACC0) | 
 | 		reduceRound(ACC0) | 
 | 		PXOR ACC1, ACC0 | 
 | 	JMP dataOctaLoop | 
 |  | 
 | startSinglesLoop: | 
 | 	MOVOU (16*14)(pTbl), T1 | 
 | 	MOVOU (16*15)(pTbl), T2 | 
 |  | 
 | dataSinglesLoop: | 
 |  | 
 | 		CMPQ autLen, $16 | 
 | 		JB dataEnd | 
 | 		SUBQ $16, autLen | 
 |  | 
 | 		MOVOU (aut), B0 | 
 | dataMul: | 
 | 		PSHUFB BSWAP, B0 | 
 | 		PXOR ACC0, B0 | 
 |  | 
 | 		MOVOU T1, ACC0 | 
 | 		MOVOU T2, ACCM | 
 | 		MOVOU T1, ACC1 | 
 |  | 
 | 		PSHUFD $78, B0, T0 | 
 | 		PXOR B0, T0 | 
 | 		PCLMULQDQ $0x00, B0, ACC0 | 
 | 		PCLMULQDQ $0x11, B0, ACC1 | 
 | 		PCLMULQDQ $0x00, T0, ACCM | 
 |  | 
 | 		PXOR ACC0, ACCM | 
 | 		PXOR ACC1, ACCM | 
 | 		MOVOU ACCM, T0 | 
 | 		PSRLDQ $8, ACCM | 
 | 		PSLLDQ $8, T0 | 
 | 		PXOR ACCM, ACC1 | 
 | 		PXOR T0, ACC0 | 
 |  | 
 | 		MOVOU POLY, T0 | 
 | 		PCLMULQDQ $0x01, ACC0, T0 | 
 | 		PSHUFD $78, ACC0, ACC0 | 
 | 		PXOR T0, ACC0 | 
 |  | 
 | 		MOVOU POLY, T0 | 
 | 		PCLMULQDQ $0x01, ACC0, T0 | 
 | 		PSHUFD $78, ACC0, ACC0 | 
 | 		PXOR T0, ACC0 | 
 | 		PXOR ACC1, ACC0 | 
 |  | 
 | 		LEAQ 16(aut), aut | 
 |  | 
 | 	JMP dataSinglesLoop | 
 |  | 
 | dataEnd: | 
 |  | 
 | 	TESTQ autLen, autLen | 
 | 	JEQ dataBail | 
 |  | 
 | 	PXOR B0, B0 | 
 | 	LEAQ -1(aut)(autLen*1), aut | 
 |  | 
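| 	// Gather the remaining autLen bytes into B0, reading backwards from the end | 
| 	// of the data, then hash the partial block like a full one. | 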
 | dataLoadLoop: | 
 |  | 
 | 		PSLLDQ $1, B0 | 
 | 		PINSRB $0, (aut), B0 | 
 |  | 
 | 		LEAQ -1(aut), aut | 
 | 		DECQ autLen | 
 | 		JNE dataLoadLoop | 
 |  | 
 | 	JMP dataMul | 
 |  | 
 | dataBail: | 
 | 	MOVOU ACC0, (tPtr) | 
 | 	RET | 
 | #undef pTbl | 
 | #undef aut | 
 | #undef tPtr | 
 | #undef autLen | 
 |  | 
 | // func gcmAesEnc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32) | 
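| // gcmAesEnc encrypts src into dst in CTR mode starting from *ctr and folds the | 
| // produced ciphertext into the GHASH state in *T. The 256-byte frame is split in | 
| // two halves: the first 8*16 bytes hold byte-swapped ciphertext blocks that are | 
| // hashed one iteration later, the second 8*16 bytes hold the counter blocks for | 
| // the next batch, already XORed with round key 0. | 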
 | TEXT ·gcmAesEnc(SB),0,$256-96 | 
 | #define pTbl DI | 
 | #define ctx DX | 
 | #define ctrPtr CX | 
 | #define ptx SI | 
 | #define ks AX | 
 | #define tPtr R8 | 
 | #define ptxLen R9 | 
 | #define aluCTR R10 | 
 | #define aluTMP R11 | 
 | #define aluK R12 | 
 | #define NR R13 | 
 |  | 
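| // increment bumps the 32-bit big-endian counter and rewrites only the last dword | 
| // of the i-th counter block stored on the stack; the XOR with aluK keeps those | 
| // blocks pre-XORed with round key 0, so the first AddRoundKey is already done. | 
| // aesRound/aesRnd/aesRndLast apply one AES round to all eight blocks, and | 
| // combinedRound interleaves such a round with one GHASH accumulation (mulRound) | 
| // over a ciphertext block saved on the stack. | 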
 | #define increment(i) ADDL $1, aluCTR; MOVL aluCTR, aluTMP; XORL aluK, aluTMP; BSWAPL aluTMP; MOVL aluTMP, (3*4 + 8*16 + i*16)(SP) | 
 | #define aesRnd(k) AESENC k, B0; AESENC k, B1; AESENC k, B2; AESENC k, B3; AESENC k, B4; AESENC k, B5; AESENC k, B6; AESENC k, B7 | 
 | #define aesRound(i) MOVOU (16*i)(ks), T0;AESENC T0, B0; AESENC T0, B1; AESENC T0, B2; AESENC T0, B3; AESENC T0, B4; AESENC T0, B5; AESENC T0, B6; AESENC T0, B7 | 
 | #define aesRndLast(k) AESENCLAST k, B0; AESENCLAST k, B1; AESENCLAST k, B2; AESENCLAST k, B3; AESENCLAST k, B4; AESENCLAST k, B5; AESENCLAST k, B6; AESENCLAST k, B7 | 
 | #define combinedRound(i) \ | 
 | 	MOVOU (16*i)(ks), T0;\ | 
 | 	AESENC T0, B0;\ | 
 | 	AESENC T0, B1;\ | 
 | 	AESENC T0, B2;\ | 
 | 	AESENC T0, B3;\ | 
 | 	 MOVOU (16*(i*2))(pTbl), T1;\ | 
 | 	 MOVOU T1, T2;\ | 
 | 	AESENC T0, B4;\ | 
 | 	AESENC T0, B5;\ | 
 | 	AESENC T0, B6;\ | 
 | 	AESENC T0, B7;\ | 
 | 	 MOVOU (16*i)(SP), T0;\ | 
 | 	 PCLMULQDQ $0x00, T0, T1;\ | 
 | 	 PXOR T1, ACC0;\ | 
 | 	 PSHUFD $78, T0, T1;\ | 
 | 	 PCLMULQDQ $0x11, T0, T2;\ | 
 | 	 PXOR T1, T0;\ | 
 | 	 PXOR T2, ACC1;\ | 
 | 	 MOVOU (16*(i*2+1))(pTbl), T2;\ | 
 | 	 PCLMULQDQ $0x00, T2, T0;\ | 
 | 	 PXOR T0, ACCM | 
 | #define mulRound(i) \ | 
 | 	MOVOU (16*i)(SP), T0;\ | 
 | 	MOVOU (16*(i*2))(pTbl), T1;\ | 
 | 	MOVOU T1, T2;\ | 
 | 	PCLMULQDQ $0x00, T0, T1;\ | 
 | 	PXOR T1, ACC0;\ | 
 | 	PCLMULQDQ $0x11, T0, T2;\ | 
 | 	PXOR T2, ACC1;\ | 
 | 	PSHUFD $78, T0, T1;\ | 
 | 	PXOR T1, T0;\ | 
 | 	MOVOU (16*(i*2+1))(pTbl), T1;\ | 
 | 	PCLMULQDQ $0x00, T0, T1;\ | 
 | 	PXOR T1, ACCM | 
 |  | 
 | 	MOVQ productTable+0(FP), pTbl | 
 | 	MOVQ dst+8(FP), ctx | 
 | 	MOVQ src_base+32(FP), ptx | 
 | 	MOVQ src_len+40(FP), ptxLen | 
 | 	MOVQ ctr+56(FP), ctrPtr | 
 | 	MOVQ T+64(FP), tPtr | 
 | 	MOVQ ks_base+72(FP), ks | 
 | 	MOVQ ks_len+80(FP), NR | 
 |  | 
 | 	SHRQ $2, NR | 
 | 	DECQ NR | 
 |  | 
 | 	MOVOU bswapMask<>(SB), BSWAP | 
 | 	MOVOU gcmPoly<>(SB), POLY | 
 |  | 
 | 	MOVOU (tPtr), ACC0 | 
 | 	PXOR ACC1, ACC1 | 
 | 	PXOR ACCM, ACCM | 
 | 	MOVOU (ctrPtr), B0 | 
 | 	MOVL (3*4)(ctrPtr), aluCTR | 
 | 	MOVOU (ks), T0 | 
 | 	MOVL (3*4)(ks), aluK | 
 | 	BSWAPL aluCTR | 
 | 	BSWAPL aluK | 
 |  | 
 | 	PXOR B0, T0 | 
 | 	MOVOU T0, (8*16 + 0*16)(SP) | 
 | 	increment(0) | 
 |  | 
 | 	CMPQ ptxLen, $128 | 
 | 	JB gcmAesEncSingles | 
 | 	SUBQ $128, ptxLen | 
 |  | 
| 	// We have at least 8 blocks to encrypt; prepare the remaining counter blocks | 
 | 	MOVOU T0, (8*16 + 1*16)(SP) | 
 | 	increment(1) | 
 | 	MOVOU T0, (8*16 + 2*16)(SP) | 
 | 	increment(2) | 
 | 	MOVOU T0, (8*16 + 3*16)(SP) | 
 | 	increment(3) | 
 | 	MOVOU T0, (8*16 + 4*16)(SP) | 
 | 	increment(4) | 
 | 	MOVOU T0, (8*16 + 5*16)(SP) | 
 | 	increment(5) | 
 | 	MOVOU T0, (8*16 + 6*16)(SP) | 
 | 	increment(6) | 
 | 	MOVOU T0, (8*16 + 7*16)(SP) | 
 | 	increment(7) | 
 |  | 
 | 	MOVOU (8*16 + 0*16)(SP), B0 | 
 | 	MOVOU (8*16 + 1*16)(SP), B1 | 
 | 	MOVOU (8*16 + 2*16)(SP), B2 | 
 | 	MOVOU (8*16 + 3*16)(SP), B3 | 
 | 	MOVOU (8*16 + 4*16)(SP), B4 | 
 | 	MOVOU (8*16 + 5*16)(SP), B5 | 
 | 	MOVOU (8*16 + 6*16)(SP), B6 | 
 | 	MOVOU (8*16 + 7*16)(SP), B7 | 
 |  | 
 | 	aesRound(1) | 
 | 	increment(0) | 
 | 	aesRound(2) | 
 | 	increment(1) | 
 | 	aesRound(3) | 
 | 	increment(2) | 
 | 	aesRound(4) | 
 | 	increment(3) | 
 | 	aesRound(5) | 
 | 	increment(4) | 
 | 	aesRound(6) | 
 | 	increment(5) | 
 | 	aesRound(7) | 
 | 	increment(6) | 
 | 	aesRound(8) | 
 | 	increment(7) | 
 | 	aesRound(9) | 
 | 	MOVOU (16*10)(ks), T0 | 
 | 	CMPQ NR, $12 | 
 | 	JB encLast1 | 
 | 	aesRnd(T0) | 
 | 	aesRound(11) | 
 | 	MOVOU (16*12)(ks), T0 | 
 | 	JE encLast1 | 
 | 	aesRnd(T0) | 
 | 	aesRound(13) | 
 | 	MOVOU (16*14)(ks), T0 | 
 | encLast1: | 
 | 	aesRndLast(T0) | 
 |  | 
 | 	MOVOU (16*0)(ptx), T0 | 
 | 	PXOR T0, B0 | 
 | 	MOVOU (16*1)(ptx), T0 | 
 | 	PXOR T0, B1 | 
 | 	MOVOU (16*2)(ptx), T0 | 
 | 	PXOR T0, B2 | 
 | 	MOVOU (16*3)(ptx), T0 | 
 | 	PXOR T0, B3 | 
 | 	MOVOU (16*4)(ptx), T0 | 
 | 	PXOR T0, B4 | 
 | 	MOVOU (16*5)(ptx), T0 | 
 | 	PXOR T0, B5 | 
 | 	MOVOU (16*6)(ptx), T0 | 
 | 	PXOR T0, B6 | 
 | 	MOVOU (16*7)(ptx), T0 | 
 | 	PXOR T0, B7 | 
 |  | 
 | 	MOVOU B0, (16*0)(ctx) | 
 | 	PSHUFB BSWAP, B0 | 
 | 	PXOR ACC0, B0 | 
 | 	MOVOU B1, (16*1)(ctx) | 
 | 	PSHUFB BSWAP, B1 | 
 | 	MOVOU B2, (16*2)(ctx) | 
 | 	PSHUFB BSWAP, B2 | 
 | 	MOVOU B3, (16*3)(ctx) | 
 | 	PSHUFB BSWAP, B3 | 
 | 	MOVOU B4, (16*4)(ctx) | 
 | 	PSHUFB BSWAP, B4 | 
 | 	MOVOU B5, (16*5)(ctx) | 
 | 	PSHUFB BSWAP, B5 | 
 | 	MOVOU B6, (16*6)(ctx) | 
 | 	PSHUFB BSWAP, B6 | 
 | 	MOVOU B7, (16*7)(ctx) | 
 | 	PSHUFB BSWAP, B7 | 
 |  | 
 | 	MOVOU B0, (16*0)(SP) | 
 | 	MOVOU B1, (16*1)(SP) | 
 | 	MOVOU B2, (16*2)(SP) | 
 | 	MOVOU B3, (16*3)(SP) | 
 | 	MOVOU B4, (16*4)(SP) | 
 | 	MOVOU B5, (16*5)(SP) | 
 | 	MOVOU B6, (16*6)(SP) | 
 | 	MOVOU B7, (16*7)(SP) | 
 |  | 
 | 	LEAQ 128(ptx), ptx | 
 | 	LEAQ 128(ctx), ctx | 
 |  | 
 | gcmAesEncOctetsLoop: | 
 |  | 
 | 		CMPQ ptxLen, $128 | 
 | 		JB gcmAesEncOctetsEnd | 
 | 		SUBQ $128, ptxLen | 
 |  | 
 | 		MOVOU (8*16 + 0*16)(SP), B0 | 
 | 		MOVOU (8*16 + 1*16)(SP), B1 | 
 | 		MOVOU (8*16 + 2*16)(SP), B2 | 
 | 		MOVOU (8*16 + 3*16)(SP), B3 | 
 | 		MOVOU (8*16 + 4*16)(SP), B4 | 
 | 		MOVOU (8*16 + 5*16)(SP), B5 | 
 | 		MOVOU (8*16 + 6*16)(SP), B6 | 
 | 		MOVOU (8*16 + 7*16)(SP), B7 | 
 |  | 
 | 		MOVOU (16*0)(SP), T0 | 
 | 		PSHUFD $78, T0, T1 | 
 | 		PXOR T0, T1 | 
 |  | 
 | 		MOVOU (16*0)(pTbl), ACC0 | 
 | 		MOVOU (16*1)(pTbl), ACCM | 
 | 		MOVOU ACC0, ACC1 | 
 |  | 
 | 		PCLMULQDQ $0x00, T1, ACCM | 
 | 		PCLMULQDQ $0x00, T0, ACC0 | 
 | 		PCLMULQDQ $0x11, T0, ACC1 | 
 |  | 
 | 		combinedRound(1) | 
 | 		increment(0) | 
 | 		combinedRound(2) | 
 | 		increment(1) | 
 | 		combinedRound(3) | 
 | 		increment(2) | 
 | 		combinedRound(4) | 
 | 		increment(3) | 
 | 		combinedRound(5) | 
 | 		increment(4) | 
 | 		combinedRound(6) | 
 | 		increment(5) | 
 | 		combinedRound(7) | 
 | 		increment(6) | 
 |  | 
 | 		aesRound(8) | 
 | 		increment(7) | 
 |  | 
 | 		PXOR ACC0, ACCM | 
 | 		PXOR ACC1, ACCM | 
 | 		MOVOU ACCM, T0 | 
 | 		PSRLDQ $8, ACCM | 
 | 		PSLLDQ $8, T0 | 
 | 		PXOR ACCM, ACC1 | 
 | 		PXOR T0, ACC0 | 
 |  | 
 | 		reduceRound(ACC0) | 
 | 		aesRound(9) | 
 |  | 
 | 		reduceRound(ACC0) | 
 | 		PXOR ACC1, ACC0 | 
 |  | 
 | 		MOVOU (16*10)(ks), T0 | 
 | 		CMPQ NR, $12 | 
 | 		JB encLast2 | 
 | 		aesRnd(T0) | 
 | 		aesRound(11) | 
 | 		MOVOU (16*12)(ks), T0 | 
 | 		JE encLast2 | 
 | 		aesRnd(T0) | 
 | 		aesRound(13) | 
 | 		MOVOU (16*14)(ks), T0 | 
 | encLast2: | 
 | 		aesRndLast(T0) | 
 |  | 
 | 		MOVOU (16*0)(ptx), T0 | 
 | 		PXOR T0, B0 | 
 | 		MOVOU (16*1)(ptx), T0 | 
 | 		PXOR T0, B1 | 
 | 		MOVOU (16*2)(ptx), T0 | 
 | 		PXOR T0, B2 | 
 | 		MOVOU (16*3)(ptx), T0 | 
 | 		PXOR T0, B3 | 
 | 		MOVOU (16*4)(ptx), T0 | 
 | 		PXOR T0, B4 | 
 | 		MOVOU (16*5)(ptx), T0 | 
 | 		PXOR T0, B5 | 
 | 		MOVOU (16*6)(ptx), T0 | 
 | 		PXOR T0, B6 | 
 | 		MOVOU (16*7)(ptx), T0 | 
 | 		PXOR T0, B7 | 
 |  | 
 | 		MOVOU B0, (16*0)(ctx) | 
 | 		PSHUFB BSWAP, B0 | 
 | 		PXOR ACC0, B0 | 
 | 		MOVOU B1, (16*1)(ctx) | 
 | 		PSHUFB BSWAP, B1 | 
 | 		MOVOU B2, (16*2)(ctx) | 
 | 		PSHUFB BSWAP, B2 | 
 | 		MOVOU B3, (16*3)(ctx) | 
 | 		PSHUFB BSWAP, B3 | 
 | 		MOVOU B4, (16*4)(ctx) | 
 | 		PSHUFB BSWAP, B4 | 
 | 		MOVOU B5, (16*5)(ctx) | 
 | 		PSHUFB BSWAP, B5 | 
 | 		MOVOU B6, (16*6)(ctx) | 
 | 		PSHUFB BSWAP, B6 | 
 | 		MOVOU B7, (16*7)(ctx) | 
 | 		PSHUFB BSWAP, B7 | 
 |  | 
 | 		MOVOU B0, (16*0)(SP) | 
 | 		MOVOU B1, (16*1)(SP) | 
 | 		MOVOU B2, (16*2)(SP) | 
 | 		MOVOU B3, (16*3)(SP) | 
 | 		MOVOU B4, (16*4)(SP) | 
 | 		MOVOU B5, (16*5)(SP) | 
 | 		MOVOU B6, (16*6)(SP) | 
 | 		MOVOU B7, (16*7)(SP) | 
 |  | 
 | 		LEAQ 128(ptx), ptx | 
 | 		LEAQ 128(ctx), ctx | 
 |  | 
 | 		JMP gcmAesEncOctetsLoop | 
 |  | 
 | gcmAesEncOctetsEnd: | 
 |  | 
 | 	MOVOU (16*0)(SP), T0 | 
 | 	MOVOU (16*0)(pTbl), ACC0 | 
 | 	MOVOU (16*1)(pTbl), ACCM | 
 | 	MOVOU ACC0, ACC1 | 
 | 	PSHUFD $78, T0, T1 | 
 | 	PXOR T0, T1 | 
 | 	PCLMULQDQ $0x00, T0, ACC0 | 
 | 	PCLMULQDQ $0x11, T0, ACC1 | 
 | 	PCLMULQDQ $0x00, T1, ACCM | 
 |  | 
 | 	mulRound(1) | 
 | 	mulRound(2) | 
 | 	mulRound(3) | 
 | 	mulRound(4) | 
 | 	mulRound(5) | 
 | 	mulRound(6) | 
 | 	mulRound(7) | 
 |  | 
 | 	PXOR ACC0, ACCM | 
 | 	PXOR ACC1, ACCM | 
 | 	MOVOU ACCM, T0 | 
 | 	PSRLDQ $8, ACCM | 
 | 	PSLLDQ $8, T0 | 
 | 	PXOR ACCM, ACC1 | 
 | 	PXOR T0, ACC0 | 
 |  | 
 | 	reduceRound(ACC0) | 
 | 	reduceRound(ACC0) | 
 | 	PXOR ACC1, ACC0 | 
 |  | 
 | 	TESTQ ptxLen, ptxLen | 
 | 	JE gcmAesEncDone | 
 |  | 
 | 	SUBQ $7, aluCTR | 
 |  | 
 | gcmAesEncSingles: | 
 |  | 
 | 	MOVOU (16*1)(ks), B1 | 
 | 	MOVOU (16*2)(ks), B2 | 
 | 	MOVOU (16*3)(ks), B3 | 
 | 	MOVOU (16*4)(ks), B4 | 
 | 	MOVOU (16*5)(ks), B5 | 
 | 	MOVOU (16*6)(ks), B6 | 
 | 	MOVOU (16*7)(ks), B7 | 
 |  | 
 | 	MOVOU (16*14)(pTbl), T2 | 
 |  | 
 | gcmAesEncSinglesLoop: | 
 |  | 
 | 		CMPQ ptxLen, $16 | 
 | 		JB gcmAesEncTail | 
 | 		SUBQ $16, ptxLen | 
 |  | 
 | 		MOVOU (8*16 + 0*16)(SP), B0 | 
 | 		increment(0) | 
 |  | 
 | 		AESENC B1, B0 | 
 | 		AESENC B2, B0 | 
 | 		AESENC B3, B0 | 
 | 		AESENC B4, B0 | 
 | 		AESENC B5, B0 | 
 | 		AESENC B6, B0 | 
 | 		AESENC B7, B0 | 
 | 		MOVOU (16*8)(ks), T0 | 
 | 		AESENC T0, B0 | 
 | 		MOVOU (16*9)(ks), T0 | 
 | 		AESENC T0, B0 | 
 | 		MOVOU (16*10)(ks), T0 | 
 | 		CMPQ NR, $12 | 
 | 		JB encLast3 | 
 | 		AESENC T0, B0 | 
 | 		MOVOU (16*11)(ks), T0 | 
 | 		AESENC T0, B0 | 
 | 		MOVOU (16*12)(ks), T0 | 
 | 		JE encLast3 | 
 | 		AESENC T0, B0 | 
 | 		MOVOU (16*13)(ks), T0 | 
 | 		AESENC T0, B0 | 
 | 		MOVOU (16*14)(ks), T0 | 
 | encLast3: | 
 | 		AESENCLAST T0, B0 | 
 |  | 
 | 		MOVOU (ptx), T0 | 
 | 		PXOR T0, B0 | 
 | 		MOVOU B0, (ctx) | 
 |  | 
 | 		PSHUFB BSWAP, B0 | 
 | 		PXOR ACC0, B0 | 
 |  | 
 | 		MOVOU T2, ACC0 | 
 | 		MOVOU T2, ACC1 | 
 | 		MOVOU (16*15)(pTbl), ACCM | 
 |  | 
 | 		PSHUFD $78, B0, T0 | 
 | 		PXOR B0, T0 | 
 | 		PCLMULQDQ $0x00, B0, ACC0 | 
 | 		PCLMULQDQ $0x11, B0, ACC1 | 
 | 		PCLMULQDQ $0x00, T0, ACCM | 
 |  | 
 | 		PXOR ACC0, ACCM | 
 | 		PXOR ACC1, ACCM | 
 | 		MOVOU ACCM, T0 | 
 | 		PSRLDQ $8, ACCM | 
 | 		PSLLDQ $8, T0 | 
 | 		PXOR ACCM, ACC1 | 
 | 		PXOR T0, ACC0 | 
 |  | 
 | 		reduceRound(ACC0) | 
 | 		reduceRound(ACC0) | 
 | 		PXOR ACC1, ACC0 | 
 |  | 
 | 		LEAQ (16*1)(ptx), ptx | 
 | 		LEAQ (16*1)(ctx), ctx | 
 |  | 
 | 	JMP gcmAesEncSinglesLoop | 
 |  | 
 | gcmAesEncTail: | 
 | 	TESTQ ptxLen, ptxLen | 
 | 	JE gcmAesEncDone | 
 |  | 
 | 	MOVOU (8*16 + 0*16)(SP), B0 | 
 | 	AESENC B1, B0 | 
 | 	AESENC B2, B0 | 
 | 	AESENC B3, B0 | 
 | 	AESENC B4, B0 | 
 | 	AESENC B5, B0 | 
 | 	AESENC B6, B0 | 
 | 	AESENC B7, B0 | 
 | 	MOVOU (16*8)(ks), T0 | 
 | 	AESENC T0, B0 | 
 | 	MOVOU (16*9)(ks), T0 | 
 | 	AESENC T0, B0 | 
 | 	MOVOU (16*10)(ks), T0 | 
 | 	CMPQ NR, $12 | 
 | 	JB encLast4 | 
 | 	AESENC T0, B0 | 
 | 	MOVOU (16*11)(ks), T0 | 
 | 	AESENC T0, B0 | 
 | 	MOVOU (16*12)(ks), T0 | 
 | 	JE encLast4 | 
 | 	AESENC T0, B0 | 
 | 	MOVOU (16*13)(ks), T0 | 
 | 	AESENC T0, B0 | 
 | 	MOVOU (16*14)(ks), T0 | 
 | encLast4: | 
 | 	AESENCLAST T0, B0 | 
 | 	MOVOU B0, T0 | 
 |  | 
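| 	// T0 holds the final keystream block. Gather the ptxLen remaining plaintext | 
| 	// bytes into B0 (reading backwards from the end), XOR with the keystream and | 
| 	// mask off everything past the message before storing and hashing it. | 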
 | 	LEAQ -1(ptx)(ptxLen*1), ptx | 
 |  | 
 | 	MOVQ ptxLen, aluTMP | 
 | 	SHLQ $4, aluTMP | 
 |  | 
 | 	LEAQ andMask<>(SB), aluCTR | 
 | 	MOVOU -16(aluCTR)(aluTMP*1), T1 | 
 |  | 
 | 	PXOR B0, B0 | 
 | ptxLoadLoop: | 
 | 		PSLLDQ $1, B0 | 
 | 		PINSRB $0, (ptx), B0 | 
 | 		LEAQ -1(ptx), ptx | 
 | 		DECQ ptxLen | 
 | 	JNE ptxLoadLoop | 
 |  | 
 | 	PXOR T0, B0 | 
 | 	PAND T1, B0 | 
| 	MOVOU B0, (ctx)	// assumes the destination always has room for a full 16-byte store, since the tag follows the ciphertext | 
 |  | 
 | 	PSHUFB BSWAP, B0 | 
 | 	PXOR ACC0, B0 | 
 |  | 
 | 	MOVOU T2, ACC0 | 
 | 	MOVOU T2, ACC1 | 
 | 	MOVOU (16*15)(pTbl), ACCM | 
 |  | 
 | 	PSHUFD $78, B0, T0 | 
 | 	PXOR B0, T0 | 
 | 	PCLMULQDQ $0x00, B0, ACC0 | 
 | 	PCLMULQDQ $0x11, B0, ACC1 | 
 | 	PCLMULQDQ $0x00, T0, ACCM | 
 |  | 
 | 	PXOR ACC0, ACCM | 
 | 	PXOR ACC1, ACCM | 
 | 	MOVOU ACCM, T0 | 
 | 	PSRLDQ $8, ACCM | 
 | 	PSLLDQ $8, T0 | 
 | 	PXOR ACCM, ACC1 | 
 | 	PXOR T0, ACC0 | 
 |  | 
 | 	reduceRound(ACC0) | 
 | 	reduceRound(ACC0) | 
 | 	PXOR ACC1, ACC0 | 
 |  | 
 | gcmAesEncDone: | 
 | 	MOVOU ACC0, (tPtr) | 
 | 	RET | 
 | #undef increment | 
 |  | 
 | // func gcmAesDec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32) | 
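| // gcmAesDec is the decryption counterpart of gcmAesEnc. Since GHASH runs over | 
| // the ciphertext, which is already available, combinedDecRound hashes each block | 
| // straight from src while decrypting it in the same pass, and the 128-byte frame | 
| // only needs to hold the pre-computed counter blocks. | 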
 | TEXT ·gcmAesDec(SB),0,$128-96 | 
 | #define increment(i) ADDL $1, aluCTR; MOVL aluCTR, aluTMP; XORL aluK, aluTMP; BSWAPL aluTMP; MOVL aluTMP, (3*4 + i*16)(SP) | 
 | #define combinedDecRound(i) \ | 
 | 	MOVOU (16*i)(ks), T0;\ | 
 | 	AESENC T0, B0;\ | 
 | 	AESENC T0, B1;\ | 
 | 	AESENC T0, B2;\ | 
 | 	AESENC T0, B3;\ | 
 | 	MOVOU (16*(i*2))(pTbl), T1;\ | 
 | 	MOVOU T1, T2;\ | 
 | 	AESENC T0, B4;\ | 
 | 	AESENC T0, B5;\ | 
 | 	AESENC T0, B6;\ | 
 | 	AESENC T0, B7;\ | 
 | 	MOVOU (16*i)(ctx), T0;\ | 
 | 	PSHUFB BSWAP, T0;\ | 
 | 	PCLMULQDQ $0x00, T0, T1;\ | 
 | 	PXOR T1, ACC0;\ | 
 | 	PSHUFD $78, T0, T1;\ | 
 | 	PCLMULQDQ $0x11, T0, T2;\ | 
 | 	PXOR T1, T0;\ | 
 | 	PXOR T2, ACC1;\ | 
 | 	MOVOU (16*(i*2+1))(pTbl), T2;\ | 
 | 	PCLMULQDQ $0x00, T2, T0;\ | 
 | 	PXOR T0, ACCM | 
 |  | 
 | 	MOVQ productTable+0(FP), pTbl | 
 | 	MOVQ dst+8(FP), ptx | 
 | 	MOVQ src_base+32(FP), ctx | 
 | 	MOVQ src_len+40(FP), ptxLen | 
 | 	MOVQ ctr+56(FP), ctrPtr | 
 | 	MOVQ T+64(FP), tPtr | 
 | 	MOVQ ks_base+72(FP), ks | 
 | 	MOVQ ks_len+80(FP), NR | 
 |  | 
 | 	SHRQ $2, NR | 
 | 	DECQ NR | 
 |  | 
 | 	MOVOU bswapMask<>(SB), BSWAP | 
 | 	MOVOU gcmPoly<>(SB), POLY | 
 |  | 
 | 	MOVOU (tPtr), ACC0 | 
 | 	PXOR ACC1, ACC1 | 
 | 	PXOR ACCM, ACCM | 
 | 	MOVOU (ctrPtr), B0 | 
 | 	MOVL (3*4)(ctrPtr), aluCTR | 
 | 	MOVOU (ks), T0 | 
 | 	MOVL (3*4)(ks), aluK | 
 | 	BSWAPL aluCTR | 
 | 	BSWAPL aluK | 
 |  | 
 | 	PXOR B0, T0 | 
 | 	MOVOU T0, (0*16)(SP) | 
 | 	increment(0) | 
 |  | 
 | 	CMPQ ptxLen, $128 | 
 | 	JB gcmAesDecSingles | 
 |  | 
 | 	MOVOU T0, (1*16)(SP) | 
 | 	increment(1) | 
 | 	MOVOU T0, (2*16)(SP) | 
 | 	increment(2) | 
 | 	MOVOU T0, (3*16)(SP) | 
 | 	increment(3) | 
 | 	MOVOU T0, (4*16)(SP) | 
 | 	increment(4) | 
 | 	MOVOU T0, (5*16)(SP) | 
 | 	increment(5) | 
 | 	MOVOU T0, (6*16)(SP) | 
 | 	increment(6) | 
 | 	MOVOU T0, (7*16)(SP) | 
 | 	increment(7) | 
 |  | 
 | gcmAesDecOctetsLoop: | 
 |  | 
 | 		CMPQ ptxLen, $128 | 
 | 		JB gcmAesDecEndOctets | 
 | 		SUBQ $128, ptxLen | 
 |  | 
 | 		MOVOU (0*16)(SP), B0 | 
 | 		MOVOU (1*16)(SP), B1 | 
 | 		MOVOU (2*16)(SP), B2 | 
 | 		MOVOU (3*16)(SP), B3 | 
 | 		MOVOU (4*16)(SP), B4 | 
 | 		MOVOU (5*16)(SP), B5 | 
 | 		MOVOU (6*16)(SP), B6 | 
 | 		MOVOU (7*16)(SP), B7 | 
 |  | 
 | 		MOVOU (16*0)(ctx), T0 | 
 | 		PSHUFB BSWAP, T0 | 
 | 		PXOR ACC0, T0 | 
 | 		PSHUFD $78, T0, T1 | 
 | 		PXOR T0, T1 | 
 |  | 
 | 		MOVOU (16*0)(pTbl), ACC0 | 
 | 		MOVOU (16*1)(pTbl), ACCM | 
 | 		MOVOU ACC0, ACC1 | 
 |  | 
 | 		PCLMULQDQ $0x00, T1, ACCM | 
 | 		PCLMULQDQ $0x00, T0, ACC0 | 
 | 		PCLMULQDQ $0x11, T0, ACC1 | 
 |  | 
 | 		combinedDecRound(1) | 
 | 		increment(0) | 
 | 		combinedDecRound(2) | 
 | 		increment(1) | 
 | 		combinedDecRound(3) | 
 | 		increment(2) | 
 | 		combinedDecRound(4) | 
 | 		increment(3) | 
 | 		combinedDecRound(5) | 
 | 		increment(4) | 
 | 		combinedDecRound(6) | 
 | 		increment(5) | 
 | 		combinedDecRound(7) | 
 | 		increment(6) | 
 |  | 
 | 		aesRound(8) | 
 | 		increment(7) | 
 |  | 
 | 		PXOR ACC0, ACCM | 
 | 		PXOR ACC1, ACCM | 
 | 		MOVOU ACCM, T0 | 
 | 		PSRLDQ $8, ACCM | 
 | 		PSLLDQ $8, T0 | 
 | 		PXOR ACCM, ACC1 | 
 | 		PXOR T0, ACC0 | 
 |  | 
 | 		reduceRound(ACC0) | 
 | 		aesRound(9) | 
 |  | 
 | 		reduceRound(ACC0) | 
 | 		PXOR ACC1, ACC0 | 
 |  | 
 | 		MOVOU (16*10)(ks), T0 | 
 | 		CMPQ NR, $12 | 
 | 		JB decLast1 | 
 | 		aesRnd(T0) | 
 | 		aesRound(11) | 
 | 		MOVOU (16*12)(ks), T0 | 
 | 		JE decLast1 | 
 | 		aesRnd(T0) | 
 | 		aesRound(13) | 
 | 		MOVOU (16*14)(ks), T0 | 
 | decLast1: | 
 | 		aesRndLast(T0) | 
 |  | 
 | 		MOVOU (16*0)(ctx), T0 | 
 | 		PXOR T0, B0 | 
 | 		MOVOU (16*1)(ctx), T0 | 
 | 		PXOR T0, B1 | 
 | 		MOVOU (16*2)(ctx), T0 | 
 | 		PXOR T0, B2 | 
 | 		MOVOU (16*3)(ctx), T0 | 
 | 		PXOR T0, B3 | 
 | 		MOVOU (16*4)(ctx), T0 | 
 | 		PXOR T0, B4 | 
 | 		MOVOU (16*5)(ctx), T0 | 
 | 		PXOR T0, B5 | 
 | 		MOVOU (16*6)(ctx), T0 | 
 | 		PXOR T0, B6 | 
 | 		MOVOU (16*7)(ctx), T0 | 
 | 		PXOR T0, B7 | 
 |  | 
 | 		MOVOU B0, (16*0)(ptx) | 
 | 		MOVOU B1, (16*1)(ptx) | 
 | 		MOVOU B2, (16*2)(ptx) | 
 | 		MOVOU B3, (16*3)(ptx) | 
 | 		MOVOU B4, (16*4)(ptx) | 
 | 		MOVOU B5, (16*5)(ptx) | 
 | 		MOVOU B6, (16*6)(ptx) | 
 | 		MOVOU B7, (16*7)(ptx) | 
 |  | 
 | 		LEAQ 128(ptx), ptx | 
 | 		LEAQ 128(ctx), ctx | 
 |  | 
 | 		JMP gcmAesDecOctetsLoop | 
 |  | 
 | gcmAesDecEndOctets: | 
 |  | 
 | 	SUBQ $7, aluCTR | 
 |  | 
 | gcmAesDecSingles: | 
 |  | 
 | 	MOVOU (16*1)(ks), B1 | 
 | 	MOVOU (16*2)(ks), B2 | 
 | 	MOVOU (16*3)(ks), B3 | 
 | 	MOVOU (16*4)(ks), B4 | 
 | 	MOVOU (16*5)(ks), B5 | 
 | 	MOVOU (16*6)(ks), B6 | 
 | 	MOVOU (16*7)(ks), B7 | 
 |  | 
 | 	MOVOU (16*14)(pTbl), T2 | 
 |  | 
 | gcmAesDecSinglesLoop: | 
 |  | 
 | 		CMPQ ptxLen, $16 | 
 | 		JB gcmAesDecTail | 
 | 		SUBQ $16, ptxLen | 
 |  | 
 | 		MOVOU (ctx), B0 | 
 | 		MOVOU B0, T1 | 
 | 		PSHUFB BSWAP, B0 | 
 | 		PXOR ACC0, B0 | 
 |  | 
 | 		MOVOU T2, ACC0 | 
 | 		MOVOU T2, ACC1 | 
 | 		MOVOU (16*15)(pTbl), ACCM | 
 |  | 
 | 		PCLMULQDQ $0x00, B0, ACC0 | 
 | 		PCLMULQDQ $0x11, B0, ACC1 | 
 | 		PSHUFD $78, B0, T0 | 
 | 		PXOR B0, T0 | 
 | 		PCLMULQDQ $0x00, T0, ACCM | 
 |  | 
 | 		PXOR ACC0, ACCM | 
 | 		PXOR ACC1, ACCM | 
 | 		MOVOU ACCM, T0 | 
 | 		PSRLDQ $8, ACCM | 
 | 		PSLLDQ $8, T0 | 
 | 		PXOR ACCM, ACC1 | 
 | 		PXOR T0, ACC0 | 
 |  | 
 | 		reduceRound(ACC0) | 
 | 		reduceRound(ACC0) | 
 | 		PXOR ACC1, ACC0 | 
 |  | 
 | 		MOVOU (0*16)(SP), B0 | 
 | 		increment(0) | 
 | 		AESENC B1, B0 | 
 | 		AESENC B2, B0 | 
 | 		AESENC B3, B0 | 
 | 		AESENC B4, B0 | 
 | 		AESENC B5, B0 | 
 | 		AESENC B6, B0 | 
 | 		AESENC B7, B0 | 
 | 		MOVOU (16*8)(ks), T0 | 
 | 		AESENC T0, B0 | 
 | 		MOVOU (16*9)(ks), T0 | 
 | 		AESENC T0, B0 | 
 | 		MOVOU (16*10)(ks), T0 | 
 | 		CMPQ NR, $12 | 
 | 		JB decLast2 | 
 | 		AESENC T0, B0 | 
 | 		MOVOU (16*11)(ks), T0 | 
 | 		AESENC T0, B0 | 
 | 		MOVOU (16*12)(ks), T0 | 
 | 		JE decLast2 | 
 | 		AESENC T0, B0 | 
 | 		MOVOU (16*13)(ks), T0 | 
 | 		AESENC T0, B0 | 
 | 		MOVOU (16*14)(ks), T0 | 
 | decLast2: | 
 | 		AESENCLAST T0, B0 | 
 |  | 
 | 		PXOR T1, B0 | 
 | 		MOVOU B0, (ptx) | 
 |  | 
 | 		LEAQ (16*1)(ptx), ptx | 
 | 		LEAQ (16*1)(ctx), ctx | 
 |  | 
 | 	JMP gcmAesDecSinglesLoop | 
 |  | 
 | gcmAesDecTail: | 
 |  | 
 | 	TESTQ ptxLen, ptxLen | 
 | 	JE gcmAesDecDone | 
 |  | 
 | 	MOVQ ptxLen, aluTMP | 
 | 	SHLQ $4, aluTMP | 
 | 	LEAQ andMask<>(SB), aluCTR | 
 | 	MOVOU -16(aluCTR)(aluTMP*1), T1 | 
 |  | 
| 	MOVOU (ctx), B0	// assumes the tag follows the ciphertext, so a full 16-byte load cannot read out of bounds | 
 | 	PAND T1, B0 | 
 |  | 
 | 	MOVOU B0, T1 | 
 | 	PSHUFB BSWAP, B0 | 
 | 	PXOR ACC0, B0 | 
 |  | 
 | 	MOVOU (16*14)(pTbl), ACC0 | 
 | 	MOVOU (16*15)(pTbl), ACCM | 
 | 	MOVOU ACC0, ACC1 | 
 |  | 
 | 	PCLMULQDQ $0x00, B0, ACC0 | 
 | 	PCLMULQDQ $0x11, B0, ACC1 | 
 | 	PSHUFD $78, B0, T0 | 
 | 	PXOR B0, T0 | 
 | 	PCLMULQDQ $0x00, T0, ACCM | 
 |  | 
 | 	PXOR ACC0, ACCM | 
 | 	PXOR ACC1, ACCM | 
 | 	MOVOU ACCM, T0 | 
 | 	PSRLDQ $8, ACCM | 
 | 	PSLLDQ $8, T0 | 
 | 	PXOR ACCM, ACC1 | 
 | 	PXOR T0, ACC0 | 
 |  | 
 | 	reduceRound(ACC0) | 
 | 	reduceRound(ACC0) | 
 | 	PXOR ACC1, ACC0 | 
 |  | 
 | 	MOVOU (0*16)(SP), B0 | 
 | 	increment(0) | 
 | 	AESENC B1, B0 | 
 | 	AESENC B2, B0 | 
 | 	AESENC B3, B0 | 
 | 	AESENC B4, B0 | 
 | 	AESENC B5, B0 | 
 | 	AESENC B6, B0 | 
 | 	AESENC B7, B0 | 
 | 	MOVOU (16*8)(ks), T0 | 
 | 	AESENC T0, B0 | 
 | 	MOVOU (16*9)(ks), T0 | 
 | 	AESENC T0, B0 | 
 | 	MOVOU (16*10)(ks), T0 | 
 | 	CMPQ NR, $12 | 
 | 	JB decLast3 | 
 | 	AESENC T0, B0 | 
 | 	MOVOU (16*11)(ks), T0 | 
 | 	AESENC T0, B0 | 
 | 	MOVOU (16*12)(ks), T0 | 
 | 	JE decLast3 | 
 | 	AESENC T0, B0 | 
 | 	MOVOU (16*13)(ks), T0 | 
 | 	AESENC T0, B0 | 
 | 	MOVOU (16*14)(ks), T0 | 
 | decLast3: | 
 | 	AESENCLAST T0, B0 | 
 | 	PXOR T1, B0 | 
 |  | 
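| 	// Write out only the ptxLen plaintext bytes of the final partial block, | 
| 	// one byte at a time. | 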
 | ptxStoreLoop: | 
 | 		PEXTRB $0, B0, (ptx) | 
 | 		PSRLDQ $1, B0 | 
 | 		LEAQ 1(ptx), ptx | 
 | 		DECQ ptxLen | 
 |  | 
 | 	JNE ptxStoreLoop | 
 |  | 
 | gcmAesDecDone: | 
 |  | 
 | 	MOVOU ACC0, (tPtr) | 
 | 	RET |