| // Copyright 2013 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| // AVX2 version by Intel, same algorithm as code in Linux kernel: |
| // https://github.com/torvalds/linux/blob/master/arch/x86/crypto/sha1_avx2_x86_64_asm.S |
| // Authors: |
| // Ilya Albrekht <ilya.albrekht@intel.com> |
| // Maxim Locktyukhin <maxim.locktyukhin@intel.com> |
| // Ronen Zohar <ronen.zohar@intel.com> |
| // Chandramouli Narayanan <mouli@linux.intel.com> |
| |
| |
| #include "textflag.h" |
| |
| // SHA-1 block routine. See sha1block.go for Go equivalent. |
| // |
| // There are 80 rounds of 4 types: |
| // - rounds 0-15 are type 1 and load data (ROUND1 macro). |
| // - rounds 16-19 are type 1 and do not load data (ROUND1x macro). |
| // - rounds 20-39 are type 2 and do not load data (ROUND2 macro). |
| // - rounds 40-59 are type 3 and do not load data (ROUND3 macro). |
| // - rounds 60-79 are type 4 and do not load data (ROUND4 macro). |
| // |
| // Each round loads or shuffles the data, then computes a per-round |
| // function of b, c, d, and then mixes the result into and rotates the |
| // five registers a, b, c, d, e holding the intermediate results. |
| // |
| // The register rotation is implemented by rotating the arguments to |
| // the round macros instead of by explicit move instructions. |
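//
// As a reference, here is one round in Go (a sketch mirroring
// sha1block.go; f2 is reused for rounds 60-79 with a different K):
//
//	f1 := (b & c) | (^b & d)          // rounds 0-19,  K = 0x5A827999
//	f2 := b ^ c ^ d                   // rounds 20-39, K = 0x6ED9EBA1 (and 60-79, K = 0xCA62C1D6)
//	f3 := (b & c) | (b & d) | (c & d) // rounds 40-59, K = 0x8F1BBCDC
//
//	// rounds 16-79 first update the 16-word circular schedule (SHUFFLE):
//	w[i&0xf] = bits.RotateLeft32(w[(i-3)&0xf]^w[(i-8)&0xf]^w[(i-14)&0xf]^w[i&0xf], 1)
//	// mixing step common to all rounds (MIX):
//	e += bits.RotateLeft32(a, 5) + f + w[i&0xf] + K
//	b = bits.RotateLeft32(b, 30)
//	a, b, c, d, e = e, a, b, c, d // done here by rotating macro arguments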
| |
| #define LOAD(index) \ |
| MOVL (index*4)(SI), R10; \ |
| BSWAPL R10; \ |
| MOVL R10, (index*4)(SP) |
| |
| #define SHUFFLE(index) \ |
| MOVL (((index)&0xf)*4)(SP), R10; \ |
| XORL (((index-3)&0xf)*4)(SP), R10; \ |
| XORL (((index-8)&0xf)*4)(SP), R10; \ |
| XORL (((index-14)&0xf)*4)(SP), R10; \ |
| ROLL $1, R10; \ |
| MOVL R10, (((index)&0xf)*4)(SP) |
| |
| #define FUNC1(a, b, c, d, e) \ |
| MOVL d, R9; \ |
| XORL c, R9; \ |
| ANDL b, R9; \ |
| XORL d, R9 |
| |
| #define FUNC2(a, b, c, d, e) \ |
| MOVL b, R9; \ |
| XORL c, R9; \ |
| XORL d, R9 |
| |
| #define FUNC3(a, b, c, d, e) \ |
| MOVL b, R8; \ |
| ORL c, R8; \ |
| ANDL d, R8; \ |
| MOVL b, R9; \ |
| ANDL c, R9; \ |
| ORL R8, R9 |
| |
// Rounds 60-79 reuse the parity function with a different K constant.
#define FUNC4 FUNC2
| |
| #define MIX(a, b, c, d, e, const) \ |
| ROLL $30, b; \ |
| ADDL R9, e; \ |
| MOVL a, R8; \ |
| ROLL $5, R8; \ |
| LEAL const(e)(R10*1), e; \ |
| ADDL R8, e |
| |
| #define ROUND1(a, b, c, d, e, index) \ |
| LOAD(index); \ |
| FUNC1(a, b, c, d, e); \ |
| MIX(a, b, c, d, e, 0x5A827999) |
| |
| #define ROUND1x(a, b, c, d, e, index) \ |
| SHUFFLE(index); \ |
| FUNC1(a, b, c, d, e); \ |
| MIX(a, b, c, d, e, 0x5A827999) |
| |
| #define ROUND2(a, b, c, d, e, index) \ |
| SHUFFLE(index); \ |
| FUNC2(a, b, c, d, e); \ |
| MIX(a, b, c, d, e, 0x6ED9EBA1) |
| |
| #define ROUND3(a, b, c, d, e, index) \ |
| SHUFFLE(index); \ |
| FUNC3(a, b, c, d, e); \ |
| MIX(a, b, c, d, e, 0x8F1BBCDC) |
| |
| #define ROUND4(a, b, c, d, e, index) \ |
| SHUFFLE(index); \ |
| FUNC4(a, b, c, d, e); \ |
| MIX(a, b, c, d, e, 0xCA62C1D6) |
| |
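// func blockAMD64(dig *digest, p []byte)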
| TEXT ·blockAMD64(SB),NOSPLIT,$64-32 |
| MOVQ dig+0(FP), BP |
| MOVQ p_base+8(FP), SI |
| MOVQ p_len+16(FP), DX |
	SHRQ $6, DX
	SHLQ $6, DX // DX = p_len &^ 63: process only whole 64-byte blocks
| |
| LEAQ (SI)(DX*1), DI |
| MOVL (0*4)(BP), AX |
| MOVL (1*4)(BP), BX |
| MOVL (2*4)(BP), CX |
| MOVL (3*4)(BP), DX |
| MOVL (4*4)(BP), BP |
| |
| CMPQ SI, DI |
| JEQ end |
| |
| loop: |
| MOVL AX, R11 |
| MOVL BX, R12 |
| MOVL CX, R13 |
| MOVL DX, R14 |
| MOVL BP, R15 |
| |
| ROUND1(AX, BX, CX, DX, BP, 0) |
| ROUND1(BP, AX, BX, CX, DX, 1) |
| ROUND1(DX, BP, AX, BX, CX, 2) |
| ROUND1(CX, DX, BP, AX, BX, 3) |
| ROUND1(BX, CX, DX, BP, AX, 4) |
| ROUND1(AX, BX, CX, DX, BP, 5) |
| ROUND1(BP, AX, BX, CX, DX, 6) |
| ROUND1(DX, BP, AX, BX, CX, 7) |
| ROUND1(CX, DX, BP, AX, BX, 8) |
| ROUND1(BX, CX, DX, BP, AX, 9) |
| ROUND1(AX, BX, CX, DX, BP, 10) |
| ROUND1(BP, AX, BX, CX, DX, 11) |
| ROUND1(DX, BP, AX, BX, CX, 12) |
| ROUND1(CX, DX, BP, AX, BX, 13) |
| ROUND1(BX, CX, DX, BP, AX, 14) |
| ROUND1(AX, BX, CX, DX, BP, 15) |
| |
| ROUND1x(BP, AX, BX, CX, DX, 16) |
| ROUND1x(DX, BP, AX, BX, CX, 17) |
| ROUND1x(CX, DX, BP, AX, BX, 18) |
| ROUND1x(BX, CX, DX, BP, AX, 19) |
| |
| ROUND2(AX, BX, CX, DX, BP, 20) |
| ROUND2(BP, AX, BX, CX, DX, 21) |
| ROUND2(DX, BP, AX, BX, CX, 22) |
| ROUND2(CX, DX, BP, AX, BX, 23) |
| ROUND2(BX, CX, DX, BP, AX, 24) |
| ROUND2(AX, BX, CX, DX, BP, 25) |
| ROUND2(BP, AX, BX, CX, DX, 26) |
| ROUND2(DX, BP, AX, BX, CX, 27) |
| ROUND2(CX, DX, BP, AX, BX, 28) |
| ROUND2(BX, CX, DX, BP, AX, 29) |
| ROUND2(AX, BX, CX, DX, BP, 30) |
| ROUND2(BP, AX, BX, CX, DX, 31) |
| ROUND2(DX, BP, AX, BX, CX, 32) |
| ROUND2(CX, DX, BP, AX, BX, 33) |
| ROUND2(BX, CX, DX, BP, AX, 34) |
| ROUND2(AX, BX, CX, DX, BP, 35) |
| ROUND2(BP, AX, BX, CX, DX, 36) |
| ROUND2(DX, BP, AX, BX, CX, 37) |
| ROUND2(CX, DX, BP, AX, BX, 38) |
| ROUND2(BX, CX, DX, BP, AX, 39) |
| |
| ROUND3(AX, BX, CX, DX, BP, 40) |
| ROUND3(BP, AX, BX, CX, DX, 41) |
| ROUND3(DX, BP, AX, BX, CX, 42) |
| ROUND3(CX, DX, BP, AX, BX, 43) |
| ROUND3(BX, CX, DX, BP, AX, 44) |
| ROUND3(AX, BX, CX, DX, BP, 45) |
| ROUND3(BP, AX, BX, CX, DX, 46) |
| ROUND3(DX, BP, AX, BX, CX, 47) |
| ROUND3(CX, DX, BP, AX, BX, 48) |
| ROUND3(BX, CX, DX, BP, AX, 49) |
| ROUND3(AX, BX, CX, DX, BP, 50) |
| ROUND3(BP, AX, BX, CX, DX, 51) |
| ROUND3(DX, BP, AX, BX, CX, 52) |
| ROUND3(CX, DX, BP, AX, BX, 53) |
| ROUND3(BX, CX, DX, BP, AX, 54) |
| ROUND3(AX, BX, CX, DX, BP, 55) |
| ROUND3(BP, AX, BX, CX, DX, 56) |
| ROUND3(DX, BP, AX, BX, CX, 57) |
| ROUND3(CX, DX, BP, AX, BX, 58) |
| ROUND3(BX, CX, DX, BP, AX, 59) |
| |
| ROUND4(AX, BX, CX, DX, BP, 60) |
| ROUND4(BP, AX, BX, CX, DX, 61) |
| ROUND4(DX, BP, AX, BX, CX, 62) |
| ROUND4(CX, DX, BP, AX, BX, 63) |
| ROUND4(BX, CX, DX, BP, AX, 64) |
| ROUND4(AX, BX, CX, DX, BP, 65) |
| ROUND4(BP, AX, BX, CX, DX, 66) |
| ROUND4(DX, BP, AX, BX, CX, 67) |
| ROUND4(CX, DX, BP, AX, BX, 68) |
| ROUND4(BX, CX, DX, BP, AX, 69) |
| ROUND4(AX, BX, CX, DX, BP, 70) |
| ROUND4(BP, AX, BX, CX, DX, 71) |
| ROUND4(DX, BP, AX, BX, CX, 72) |
| ROUND4(CX, DX, BP, AX, BX, 73) |
| ROUND4(BX, CX, DX, BP, AX, 74) |
| ROUND4(AX, BX, CX, DX, BP, 75) |
| ROUND4(BP, AX, BX, CX, DX, 76) |
| ROUND4(DX, BP, AX, BX, CX, 77) |
| ROUND4(CX, DX, BP, AX, BX, 78) |
| ROUND4(BX, CX, DX, BP, AX, 79) |
| |
| ADDL R11, AX |
| ADDL R12, BX |
| ADDL R13, CX |
| ADDL R14, DX |
| ADDL R15, BP |
| |
| ADDQ $64, SI |
| CMPQ SI, DI |
| JB loop |
| |
| end: |
| MOVQ dig+0(FP), DI |
| MOVL AX, (0*4)(DI) |
| MOVL BX, (1*4)(DI) |
| MOVL CX, (2*4)(DI) |
| MOVL DX, (3*4)(DI) |
| MOVL BP, (4*4)(DI) |
| RET |
| |
| |
| // This is the implementation using AVX2, BMI1 and BMI2. It is based on: |
| // "SHA-1 implementation with Intel(R) AVX2 instruction set extensions" |
| // From http://software.intel.com/en-us/articles |
| // (look for improving-the-performance-of-the-secure-hash-algorithm-1) |
// This implementation is 2x unrolled: it processes two 64-byte blocks at a
// time, interleaving the vector instructions that precompute W+K for the
// next pair of blocks with the scalar computation of the current rounds,
// for better instruction scheduling.
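//
// Temp buffer layout, as implied by the offsets below: each 32-byte slot
// holds W+K for four rounds of both blocks, the even block's four dwords
// in the low 16 bytes and the odd block's in the high 16 bytes. That is
// why the even-block rounds read 0x00-0x0c, 0x20-0x2c, ... while the
// odd-block rounds (CALC_80 onwards) read 0x10-0x1c, 0x30-0x3c, ...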
| |
| // Trivial helper macros. |
| #define UPDATE_HASH(A,TB,C,D,E) \ |
| ADDL (R9), A \ |
| MOVL A, (R9) \ |
| ADDL 4(R9), TB \ |
| MOVL TB, 4(R9) \ |
| ADDL 8(R9), C \ |
| MOVL C, 8(R9) \ |
| ADDL 12(R9), D \ |
| MOVL D, 12(R9) \ |
| ADDL 16(R9), E \ |
| MOVL E, 16(R9) |
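
// In Go terms, UPDATE_HASH is roughly (a sketch; h is the digest state):
//
//	h[0] += a; a = h[0] // and likewise for b, c, d, e
//
// so the registers carry the updated state into the following rounds.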
| |
| |
| |
// Helper macros for PRECALC, which precomputes the W+K values
// that the scalar rounds later consume from the temp buffer.
| #define PRECALC_0(OFFSET) \ |
| VMOVDQU OFFSET(R10),X0 |
| |
| #define PRECALC_1(OFFSET) \ |
| VINSERTI128 $1, OFFSET(R13), Y0, Y0 |
| |
| #define PRECALC_2(YREG) \ |
| VPSHUFB Y10, Y0, YREG |
| |
| #define PRECALC_4(YREG,K_OFFSET) \ |
| VPADDD K_OFFSET(R8), YREG, Y0 |
| |
| #define PRECALC_7(OFFSET) \ |
| VMOVDQU Y0, (OFFSET*2)(R14) |
| |
| |
// Message scheduling pre-compute for rounds 0-15
// R10 is a pointer to the first (even) 64-byte block of the pair
// R13 is a pointer to the second (odd) 64-byte block
// R14 is a pointer to the temp buffer
// X0 is used as a temp register
// YREG receives the byte-swapped message words (used again for rounds 16-31)
// OFFSET chooses a 16-byte chunk within a block
// R8 is a pointer to the constants block
// K_OFFSET chooses the K constants relevant to this round
// Y10 holds the byte-swap shuffle mask
| #define PRECALC_00_15(OFFSET,YREG) \ |
| PRECALC_0(OFFSET) \ |
| PRECALC_1(OFFSET) \ |
| PRECALC_2(YREG) \ |
| PRECALC_4(YREG,0x0) \ |
| PRECALC_7(OFFSET) |
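
// Taken together, PRECALC_00_15 loads the same 16-byte chunk of the even
// and odd blocks into the two 128-bit lanes of a ymm register, byte-swaps
// it into YREG, and stores YREG+K into the temp buffer at (OFFSET*2)(R14).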
| |
| |
| // Helper macros for PRECALC_16_31 |
| #define PRECALC_16(REG_SUB_16,REG_SUB_12,REG_SUB_4,REG) \ |
| VPALIGNR $8, REG_SUB_16, REG_SUB_12, REG \ // w[i-14] |
| VPSRLDQ $4, REG_SUB_4, Y0 // w[i-3] |
| |
| #define PRECALC_17(REG_SUB_16,REG_SUB_8,REG) \ |
| VPXOR REG_SUB_8, REG, REG \ |
| VPXOR REG_SUB_16, Y0, Y0 |
| |
| #define PRECALC_18(REG) \ |
| VPXOR Y0, REG, REG \ |
| VPSLLDQ $12, REG, Y9 |
| |
| #define PRECALC_19(REG) \ |
| VPSLLD $1, REG, Y0 \ |
| VPSRLD $31, REG, REG |
| |
| #define PRECALC_20(REG) \ |
| VPOR REG, Y0, Y0 \ |
| VPSLLD $2, Y9, REG |
| |
| #define PRECALC_21(REG) \ |
| VPSRLD $30, Y9, Y9 \ |
| VPXOR REG, Y0, Y0 |
| |
| #define PRECALC_23(REG,K_OFFSET,OFFSET) \ |
| VPXOR Y9, Y0, REG \ |
| VPADDD K_OFFSET(R8), REG, Y0 \ |
| VMOVDQU Y0, (OFFSET)(R14) |
| |
// Message scheduling pre-compute for rounds 16-31
// calculating the w[i] values for rounds 16-31 (16 per block,
// two blocks at a time) in 4 ymm registers
// pre-calculate K+w[i] values and store them to memory
// for later load by the scalar ADDL instructions.
// "brute force" vectorization for rounds 16-31 only
// due to the w[i]->w[i-3] dependency.
// writes the result into REG, using Y0 and Y9 as temporaries;
// the REG_SUB_* inputs are read-only.
// As always, R8 is a pointer to the constants block
// and R14 is a pointer to the temp buffer
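// Within each group of four new values only the first three have their
// w[i-3] input available (VPSRLDQ leaves a zero in the last slot), so
// the fourth is first computed without that term and patched afterwards:
// VPSLLDQ $12 moves the group's first pre-rotation xor into the top slot,
// and the VPSLLD $2 / VPSRLD $30 pair forms its rol 2, which is xored
// back in before the final K addition (PRECALC_18 through PRECALC_23).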
| #define PRECALC_16_31(REG,REG_SUB_4,REG_SUB_8,REG_SUB_12,REG_SUB_16,K_OFFSET,OFFSET) \ |
| PRECALC_16(REG_SUB_16,REG_SUB_12,REG_SUB_4,REG) \ |
| PRECALC_17(REG_SUB_16,REG_SUB_8,REG) \ |
| PRECALC_18(REG) \ |
| PRECALC_19(REG) \ |
| PRECALC_20(REG) \ |
| PRECALC_21(REG) \ |
| PRECALC_23(REG,K_OFFSET,OFFSET) |
| |
| |
| // Helper macros for PRECALC_32_79 |
| #define PRECALC_32(REG_SUB_8,REG_SUB_4) \ |
| VPALIGNR $8, REG_SUB_8, REG_SUB_4, Y0 |
| |
| #define PRECALC_33(REG_SUB_28,REG) \ |
| VPXOR REG_SUB_28, REG, REG |
| |
| #define PRECALC_34(REG_SUB_16) \ |
| VPXOR REG_SUB_16, Y0, Y0 |
| |
| #define PRECALC_35(REG) \ |
| VPXOR Y0, REG, REG |
| |
| #define PRECALC_36(REG) \ |
| VPSLLD $2, REG, Y0 |
| |
| #define PRECALC_37(REG) \ |
| VPSRLD $30, REG, REG \ |
| VPOR REG, Y0, REG |
| |
| #define PRECALC_39(REG,K_OFFSET,OFFSET) \ |
| VPADDD K_OFFSET(R8), REG, Y0 \ |
| VMOVDQU Y0, (OFFSET)(R14) |
| |
// Message scheduling pre-compute for rounds 32-79
// In the SHA-1 specification we have:
// w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1
// Which, for i >= 32, is equivalent to:
// w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
// This allows for more efficient vectorization,
// since the w[i]->w[i-3] dependency is broken
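//
// A quick way to check the equivalence in Go (a sketch; the first loop
// mirrors the scalar schedule in sha1block.go):
//
//	var w [80]uint32 // w[0:16] filled from the message block
//	for i := 16; i < 80; i++ {
//		w[i] = bits.RotateLeft32(w[i-3]^w[i-8]^w[i-14]^w[i-16], 1)
//	}
//	for i := 32; i < 80; i++ {
//		// alt == w[i] for any initial w[0:16]
//		alt := bits.RotateLeft32(w[i-6]^w[i-16]^w[i-28]^w[i-32], 2)
//		_ = alt
//	}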
| #define PRECALC_32_79(REG,REG_SUB_4,REG_SUB_8,REG_SUB_16,REG_SUB_28,K_OFFSET,OFFSET) \ |
| PRECALC_32(REG_SUB_8,REG_SUB_4) \ |
| PRECALC_33(REG_SUB_28,REG) \ |
| PRECALC_34(REG_SUB_16) \ |
| PRECALC_35(REG) \ |
| PRECALC_36(REG) \ |
| PRECALC_37(REG) \ |
| PRECALC_39(REG,K_OFFSET,OFFSET) |
| |
| #define PRECALC \ |
| PRECALC_00_15(0,Y15) \ |
| PRECALC_00_15(0x10,Y14) \ |
| PRECALC_00_15(0x20,Y13) \ |
| PRECALC_00_15(0x30,Y12) \ |
| PRECALC_16_31(Y8,Y12,Y13,Y14,Y15,0,0x80) \ |
| PRECALC_16_31(Y7,Y8,Y12,Y13,Y14,0x20,0xa0) \ |
| PRECALC_16_31(Y5,Y7,Y8,Y12,Y13,0x20,0xc0) \ |
| PRECALC_16_31(Y3,Y5,Y7,Y8,Y12,0x20,0xe0) \ |
| PRECALC_32_79(Y15,Y3,Y5,Y8,Y14,0x20,0x100) \ |
| PRECALC_32_79(Y14,Y15,Y3,Y7,Y13,0x20,0x120) \ |
| PRECALC_32_79(Y13,Y14,Y15,Y5,Y12,0x40,0x140) \ |
| PRECALC_32_79(Y12,Y13,Y14,Y3,Y8,0x40,0x160) \ |
| PRECALC_32_79(Y8,Y12,Y13,Y15,Y7,0x40,0x180) \ |
| PRECALC_32_79(Y7,Y8,Y12,Y14,Y5,0x40,0x1a0) \ |
| PRECALC_32_79(Y5,Y7,Y8,Y13,Y3,0x40,0x1c0) \ |
| PRECALC_32_79(Y3,Y5,Y7,Y12,Y15,0x60,0x1e0) \ |
| PRECALC_32_79(Y15,Y3,Y5,Y8,Y14,0x60,0x200) \ |
| PRECALC_32_79(Y14,Y15,Y3,Y7,Y13,0x60,0x220) \ |
| PRECALC_32_79(Y13,Y14,Y15,Y5,Y12,0x60,0x240) \ |
| PRECALC_32_79(Y12,Y13,Y14,Y3,Y8,0x60,0x260) |
| |
// Macros calculating individual rounds have the general form
// CALC_ROUND_PRE + PRECALC_ROUND + CALC_ROUND_POST
// CALC_ROUND_{PRE,POST} macros follow
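//
// The rounds are software-pipelined: _PRE adds the F value that the
// previous round's _POST left behind, while _POST computes F for the
// next round. Unwound, each scalar round still amounts to (a sketch):
//
//	e += wk[i] + bits.RotateLeft32(a, 5) + f(b, c, d)
//	b = bits.RotateLeft32(b, 30)
//
// where wk is the precomputed W+K buffer read through R15.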
| |
| #define CALC_F1_PRE(OFFSET,REG_A,REG_B,REG_C,REG_E) \ |
| ADDL OFFSET(R15),REG_E \ |
| ANDNL REG_C,REG_A,BP \ |
| LEAL (REG_E)(REG_B*1), REG_E \ // Add F from the previous round |
| RORXL $0x1b, REG_A, R12 \ |
| RORXL $2, REG_A, REG_B // for next round |
| |
| // Calculate F for the next round |
| #define CALC_F1_POST(REG_A,REG_B,REG_E) \ |
| ANDL REG_B,REG_A \ // b&c |
| XORL BP, REG_A \ // F1 = (b&c) ^ (~b&d) |
	LEAL (REG_E)(R12*1), REG_E // E += A rol 5
| |
| |
| // Registers are cyclically rotated DX -> AX -> DI -> SI -> BX -> CX |
| #define CALC_0 \ |
	MOVL SI, BX \ // precompute F for the first round
| RORXL $2, SI, SI \ |
| ANDNL AX, BX, BP \ |
| ANDL DI, BX \ |
| XORL BP, BX \ |
| CALC_F1_PRE(0x0,CX,BX,DI,DX) \ |
| PRECALC_0(0x80) \ |
| CALC_F1_POST(CX,SI,DX) |
| |
| #define CALC_1 \ |
| CALC_F1_PRE(0x4,DX,CX,SI,AX) \ |
| PRECALC_1(0x80) \ |
| CALC_F1_POST(DX,BX,AX) |
| |
| #define CALC_2 \ |
| CALC_F1_PRE(0x8,AX,DX,BX,DI) \ |
| PRECALC_2(Y15) \ |
| CALC_F1_POST(AX,CX,DI) |
| |
| #define CALC_3 \ |
| CALC_F1_PRE(0xc,DI,AX,CX,SI) \ |
| CALC_F1_POST(DI,DX,SI) |
| |
| #define CALC_4 \ |
| CALC_F1_PRE(0x20,SI,DI,DX,BX) \ |
| PRECALC_4(Y15,0x0) \ |
| CALC_F1_POST(SI,AX,BX) |
| |
| #define CALC_5 \ |
| CALC_F1_PRE(0x24,BX,SI,AX,CX) \ |
| CALC_F1_POST(BX,DI,CX) |
| |
| #define CALC_6 \ |
| CALC_F1_PRE(0x28,CX,BX,DI,DX) \ |
| CALC_F1_POST(CX,SI,DX) |
| |
| #define CALC_7 \ |
| CALC_F1_PRE(0x2c,DX,CX,SI,AX) \ |
| PRECALC_7(0x0) \ |
| CALC_F1_POST(DX,BX,AX) |
| |
| #define CALC_8 \ |
| CALC_F1_PRE(0x40,AX,DX,BX,DI) \ |
| PRECALC_0(0x90) \ |
| CALC_F1_POST(AX,CX,DI) |
| |
| #define CALC_9 \ |
| CALC_F1_PRE(0x44,DI,AX,CX,SI) \ |
| PRECALC_1(0x90) \ |
| CALC_F1_POST(DI,DX,SI) |
| |
| #define CALC_10 \ |
| CALC_F1_PRE(0x48,SI,DI,DX,BX) \ |
| PRECALC_2(Y14) \ |
| CALC_F1_POST(SI,AX,BX) |
| |
| #define CALC_11 \ |
| CALC_F1_PRE(0x4c,BX,SI,AX,CX) \ |
| CALC_F1_POST(BX,DI,CX) |
| |
| #define CALC_12 \ |
| CALC_F1_PRE(0x60,CX,BX,DI,DX) \ |
| PRECALC_4(Y14,0x0) \ |
| CALC_F1_POST(CX,SI,DX) |
| |
| #define CALC_13 \ |
| CALC_F1_PRE(0x64,DX,CX,SI,AX) \ |
| CALC_F1_POST(DX,BX,AX) |
| |
| #define CALC_14 \ |
| CALC_F1_PRE(0x68,AX,DX,BX,DI) \ |
| CALC_F1_POST(AX,CX,DI) |
| |
| #define CALC_15 \ |
| CALC_F1_PRE(0x6c,DI,AX,CX,SI) \ |
| PRECALC_7(0x10) \ |
| CALC_F1_POST(DI,DX,SI) |
| |
| #define CALC_16 \ |
| CALC_F1_PRE(0x80,SI,DI,DX,BX) \ |
| PRECALC_0(0xa0) \ |
| CALC_F1_POST(SI,AX,BX) |
| |
| #define CALC_17 \ |
| CALC_F1_PRE(0x84,BX,SI,AX,CX) \ |
| PRECALC_1(0xa0) \ |
| CALC_F1_POST(BX,DI,CX) |
| |
| #define CALC_18 \ |
| CALC_F1_PRE(0x88,CX,BX,DI,DX) \ |
| PRECALC_2(Y13) \ |
| CALC_F1_POST(CX,SI,DX) |
| |
| |
| #define CALC_F2_PRE(OFFSET,REG_A,REG_B,REG_E) \ |
| ADDL OFFSET(R15),REG_E \ |
| LEAL (REG_E)(REG_B*1), REG_E \ // Add F from the previous round |
| RORXL $0x1b, REG_A, R12 \ |
| RORXL $2, REG_A, REG_B // for next round |
| |
| #define CALC_F2_POST(REG_A,REG_B,REG_C,REG_E) \ |
| XORL REG_B, REG_A \ |
| ADDL R12, REG_E \ |
| XORL REG_C, REG_A |
| |
| #define CALC_19 \ |
| CALC_F2_PRE(0x8c,DX,CX,AX) \ |
| CALC_F2_POST(DX,BX,SI,AX) |
| |
| #define CALC_20 \ |
| CALC_F2_PRE(0xa0,AX,DX,DI) \ |
| PRECALC_4(Y13,0x0) \ |
| CALC_F2_POST(AX,CX,BX,DI) |
| |
| #define CALC_21 \ |
| CALC_F2_PRE(0xa4,DI,AX,SI) \ |
| CALC_F2_POST(DI,DX,CX,SI) |
| |
| #define CALC_22 \ |
| CALC_F2_PRE(0xa8,SI,DI,BX) \ |
| CALC_F2_POST(SI,AX,DX,BX) |
| |
| #define CALC_23 \ |
| CALC_F2_PRE(0xac,BX,SI,CX) \ |
| PRECALC_7(0x20) \ |
| CALC_F2_POST(BX,DI,AX,CX) |
| |
| #define CALC_24 \ |
| CALC_F2_PRE(0xc0,CX,BX,DX) \ |
| PRECALC_0(0xb0) \ |
| CALC_F2_POST(CX,SI,DI,DX) |
| |
| #define CALC_25 \ |
| CALC_F2_PRE(0xc4,DX,CX,AX) \ |
| PRECALC_1(0xb0) \ |
| CALC_F2_POST(DX,BX,SI,AX) |
| |
| #define CALC_26 \ |
| CALC_F2_PRE(0xc8,AX,DX,DI) \ |
| PRECALC_2(Y12) \ |
| CALC_F2_POST(AX,CX,BX,DI) |
| |
| #define CALC_27 \ |
| CALC_F2_PRE(0xcc,DI,AX,SI) \ |
| CALC_F2_POST(DI,DX,CX,SI) |
| |
| #define CALC_28 \ |
| CALC_F2_PRE(0xe0,SI,DI,BX) \ |
| PRECALC_4(Y12,0x0) \ |
| CALC_F2_POST(SI,AX,DX,BX) |
| |
| #define CALC_29 \ |
| CALC_F2_PRE(0xe4,BX,SI,CX) \ |
| CALC_F2_POST(BX,DI,AX,CX) |
| |
| #define CALC_30 \ |
| CALC_F2_PRE(0xe8,CX,BX,DX) \ |
| CALC_F2_POST(CX,SI,DI,DX) |
| |
| #define CALC_31 \ |
| CALC_F2_PRE(0xec,DX,CX,AX) \ |
| PRECALC_7(0x30) \ |
| CALC_F2_POST(DX,BX,SI,AX) |
| |
| #define CALC_32 \ |
| CALC_F2_PRE(0x100,AX,DX,DI) \ |
| PRECALC_16(Y15,Y14,Y12,Y8) \ |
| CALC_F2_POST(AX,CX,BX,DI) |
| |
| #define CALC_33 \ |
| CALC_F2_PRE(0x104,DI,AX,SI) \ |
| PRECALC_17(Y15,Y13,Y8) \ |
| CALC_F2_POST(DI,DX,CX,SI) |
| |
| #define CALC_34 \ |
| CALC_F2_PRE(0x108,SI,DI,BX) \ |
| PRECALC_18(Y8) \ |
| CALC_F2_POST(SI,AX,DX,BX) |
| |
| #define CALC_35 \ |
| CALC_F2_PRE(0x10c,BX,SI,CX) \ |
| PRECALC_19(Y8) \ |
| CALC_F2_POST(BX,DI,AX,CX) |
| |
| #define CALC_36 \ |
| CALC_F2_PRE(0x120,CX,BX,DX) \ |
| PRECALC_20(Y8) \ |
| CALC_F2_POST(CX,SI,DI,DX) |
| |
| #define CALC_37 \ |
| CALC_F2_PRE(0x124,DX,CX,AX) \ |
| PRECALC_21(Y8) \ |
| CALC_F2_POST(DX,BX,SI,AX) |
| |
| #define CALC_38 \ |
| CALC_F2_PRE(0x128,AX,DX,DI) \ |
| CALC_F2_POST(AX,CX,BX,DI) |
| |
| |
| #define CALC_F3_PRE(OFFSET,REG_E) \ |
| ADDL OFFSET(R15),REG_E |
| |
| #define CALC_F3_POST(REG_A,REG_B,REG_C,REG_E,REG_TB) \ |
| LEAL (REG_E)(REG_TB*1), REG_E \ // Add F from the previous round |
| MOVL REG_B, BP \ |
| ORL REG_A, BP \ |
| RORXL $0x1b, REG_A, R12 \ |
| RORXL $2, REG_A, REG_TB \ |
| ANDL REG_C, BP \ // Calculate F for the next round |
| ANDL REG_B, REG_A \ |
| ORL BP, REG_A \ |
| ADDL R12, REG_E |
| |
| #define CALC_39 \ |
| CALC_F3_PRE(0x12c,SI) \ |
| PRECALC_23(Y8,0x0,0x80) \ |
| CALC_F3_POST(DI,DX,CX,SI,AX) |
| |
| #define CALC_40 \ |
| CALC_F3_PRE(0x140,BX) \ |
| PRECALC_16(Y14,Y13,Y8,Y7) \ |
| CALC_F3_POST(SI,AX,DX,BX,DI) |
| |
| #define CALC_41 \ |
| CALC_F3_PRE(0x144,CX) \ |
| PRECALC_17(Y14,Y12,Y7) \ |
| CALC_F3_POST(BX,DI,AX,CX,SI) |
| |
| #define CALC_42 \ |
| CALC_F3_PRE(0x148,DX) \ |
| PRECALC_18(Y7) \ |
| CALC_F3_POST(CX,SI,DI,DX,BX) |
| |
| #define CALC_43 \ |
| CALC_F3_PRE(0x14c,AX) \ |
| PRECALC_19(Y7) \ |
| CALC_F3_POST(DX,BX,SI,AX,CX) |
| |
| #define CALC_44 \ |
| CALC_F3_PRE(0x160,DI) \ |
| PRECALC_20(Y7) \ |
| CALC_F3_POST(AX,CX,BX,DI,DX) |
| |
| #define CALC_45 \ |
| CALC_F3_PRE(0x164,SI) \ |
| PRECALC_21(Y7) \ |
| CALC_F3_POST(DI,DX,CX,SI,AX) |
| |
| #define CALC_46 \ |
| CALC_F3_PRE(0x168,BX) \ |
| CALC_F3_POST(SI,AX,DX,BX,DI) |
| |
| #define CALC_47 \ |
| CALC_F3_PRE(0x16c,CX) \ |
| VPXOR Y9, Y0, Y7 \ |
| VPADDD 0x20(R8), Y7, Y0 \ |
| VMOVDQU Y0, 0xa0(R14) \ |
| CALC_F3_POST(BX,DI,AX,CX,SI) |
| |
| #define CALC_48 \ |
| CALC_F3_PRE(0x180,DX) \ |
| PRECALC_16(Y13,Y12,Y7,Y5) \ |
| CALC_F3_POST(CX,SI,DI,DX,BX) |
| |
| #define CALC_49 \ |
| CALC_F3_PRE(0x184,AX) \ |
| PRECALC_17(Y13,Y8,Y5) \ |
| CALC_F3_POST(DX,BX,SI,AX,CX) |
| |
| #define CALC_50 \ |
| CALC_F3_PRE(0x188,DI) \ |
| PRECALC_18(Y5) \ |
| CALC_F3_POST(AX,CX,BX,DI,DX) |
| |
| #define CALC_51 \ |
| CALC_F3_PRE(0x18c,SI) \ |
| PRECALC_19(Y5) \ |
| CALC_F3_POST(DI,DX,CX,SI,AX) |
| |
| #define CALC_52 \ |
| CALC_F3_PRE(0x1a0,BX) \ |
| PRECALC_20(Y5) \ |
| CALC_F3_POST(SI,AX,DX,BX,DI) |
| |
| #define CALC_53 \ |
| CALC_F3_PRE(0x1a4,CX) \ |
| PRECALC_21(Y5) \ |
| CALC_F3_POST(BX,DI,AX,CX,SI) |
| |
| #define CALC_54 \ |
| CALC_F3_PRE(0x1a8,DX) \ |
| CALC_F3_POST(CX,SI,DI,DX,BX) |
| |
| #define CALC_55 \ |
| CALC_F3_PRE(0x1ac,AX) \ |
| PRECALC_23(Y5,0x20,0xc0) \ |
| CALC_F3_POST(DX,BX,SI,AX,CX) |
| |
| #define CALC_56 \ |
| CALC_F3_PRE(0x1c0,DI) \ |
| PRECALC_16(Y12,Y8,Y5,Y3) \ |
| CALC_F3_POST(AX,CX,BX,DI,DX) |
| |
| #define CALC_57 \ |
| CALC_F3_PRE(0x1c4,SI) \ |
| PRECALC_17(Y12,Y7,Y3) \ |
| CALC_F3_POST(DI,DX,CX,SI,AX) |
| |
| #define CALC_58 \ |
| CALC_F3_PRE(0x1c8,BX) \ |
| PRECALC_18(Y3) \ |
| CALC_F3_POST(SI,AX,DX,BX,DI) |
| |
| #define CALC_59 \ |
| CALC_F2_PRE(0x1cc,BX,SI,CX) \ |
| PRECALC_19(Y3) \ |
| CALC_F2_POST(BX,DI,AX,CX) |
| |
| #define CALC_60 \ |
| CALC_F2_PRE(0x1e0,CX,BX,DX) \ |
| PRECALC_20(Y3) \ |
| CALC_F2_POST(CX,SI,DI,DX) |
| |
| #define CALC_61 \ |
| CALC_F2_PRE(0x1e4,DX,CX,AX) \ |
| PRECALC_21(Y3) \ |
| CALC_F2_POST(DX,BX,SI,AX) |
| |
| #define CALC_62 \ |
| CALC_F2_PRE(0x1e8,AX,DX,DI) \ |
| CALC_F2_POST(AX,CX,BX,DI) |
| |
| #define CALC_63 \ |
| CALC_F2_PRE(0x1ec,DI,AX,SI) \ |
| PRECALC_23(Y3,0x20,0xe0) \ |
| CALC_F2_POST(DI,DX,CX,SI) |
| |
| #define CALC_64 \ |
| CALC_F2_PRE(0x200,SI,DI,BX) \ |
| PRECALC_32(Y5,Y3) \ |
| CALC_F2_POST(SI,AX,DX,BX) |
| |
| #define CALC_65 \ |
| CALC_F2_PRE(0x204,BX,SI,CX) \ |
| PRECALC_33(Y14,Y15) \ |
| CALC_F2_POST(BX,DI,AX,CX) |
| |
| #define CALC_66 \ |
| CALC_F2_PRE(0x208,CX,BX,DX) \ |
| PRECALC_34(Y8) \ |
| CALC_F2_POST(CX,SI,DI,DX) |
| |
| #define CALC_67 \ |
| CALC_F2_PRE(0x20c,DX,CX,AX) \ |
| PRECALC_35(Y15) \ |
| CALC_F2_POST(DX,BX,SI,AX) |
| |
| #define CALC_68 \ |
| CALC_F2_PRE(0x220,AX,DX,DI) \ |
| PRECALC_36(Y15) \ |
| CALC_F2_POST(AX,CX,BX,DI) |
| |
| #define CALC_69 \ |
| CALC_F2_PRE(0x224,DI,AX,SI) \ |
| PRECALC_37(Y15) \ |
| CALC_F2_POST(DI,DX,CX,SI) |
| |
| #define CALC_70 \ |
| CALC_F2_PRE(0x228,SI,DI,BX) \ |
| CALC_F2_POST(SI,AX,DX,BX) |
| |
| #define CALC_71 \ |
| CALC_F2_PRE(0x22c,BX,SI,CX) \ |
| PRECALC_39(Y15,0x20,0x100) \ |
| CALC_F2_POST(BX,DI,AX,CX) |
| |
| #define CALC_72 \ |
| CALC_F2_PRE(0x240,CX,BX,DX) \ |
| PRECALC_32(Y3,Y15) \ |
| CALC_F2_POST(CX,SI,DI,DX) |
| |
| #define CALC_73 \ |
| CALC_F2_PRE(0x244,DX,CX,AX) \ |
| PRECALC_33(Y13,Y14) \ |
| CALC_F2_POST(DX,BX,SI,AX) |
| |
| #define CALC_74 \ |
| CALC_F2_PRE(0x248,AX,DX,DI) \ |
| PRECALC_34(Y7) \ |
| CALC_F2_POST(AX,CX,BX,DI) |
| |
| #define CALC_75 \ |
| CALC_F2_PRE(0x24c,DI,AX,SI) \ |
| PRECALC_35(Y14) \ |
| CALC_F2_POST(DI,DX,CX,SI) |
| |
| #define CALC_76 \ |
| CALC_F2_PRE(0x260,SI,DI,BX) \ |
| PRECALC_36(Y14) \ |
| CALC_F2_POST(SI,AX,DX,BX) |
| |
| #define CALC_77 \ |
| CALC_F2_PRE(0x264,BX,SI,CX) \ |
| PRECALC_37(Y14) \ |
| CALC_F2_POST(BX,DI,AX,CX) |
| |
| #define CALC_78 \ |
| CALC_F2_PRE(0x268,CX,BX,DX) \ |
| CALC_F2_POST(CX,SI,DI,DX) |
| |
| #define CALC_79 \ |
| ADDL 0x26c(R15), AX \ |
| LEAL (AX)(CX*1), AX \ |
| RORXL $0x1b, DX, R12 \ |
| PRECALC_39(Y14,0x20,0x120) \ |
| ADDL R12, AX |
| |
| // Similar to CALC_0 |
| #define CALC_80 \ |
| MOVL CX, DX \ |
| RORXL $2, CX, CX \ |
| ANDNL SI, DX, BP \ |
| ANDL BX, DX \ |
| XORL BP, DX \ |
| CALC_F1_PRE(0x10,AX,DX,BX,DI) \ |
| PRECALC_32(Y15,Y14) \ |
| CALC_F1_POST(AX,CX,DI) |
| |
| #define CALC_81 \ |
| CALC_F1_PRE(0x14,DI,AX,CX,SI) \ |
| PRECALC_33(Y12,Y13) \ |
| CALC_F1_POST(DI,DX,SI) |
| |
| #define CALC_82 \ |
| CALC_F1_PRE(0x18,SI,DI,DX,BX) \ |
| PRECALC_34(Y5) \ |
| CALC_F1_POST(SI,AX,BX) |
| |
| #define CALC_83 \ |
| CALC_F1_PRE(0x1c,BX,SI,AX,CX) \ |
| PRECALC_35(Y13) \ |
| CALC_F1_POST(BX,DI,CX) |
| |
| #define CALC_84 \ |
| CALC_F1_PRE(0x30,CX,BX,DI,DX) \ |
| PRECALC_36(Y13) \ |
| CALC_F1_POST(CX,SI,DX) |
| |
| #define CALC_85 \ |
| CALC_F1_PRE(0x34,DX,CX,SI,AX) \ |
| PRECALC_37(Y13) \ |
| CALC_F1_POST(DX,BX,AX) |
| |
| #define CALC_86 \ |
| CALC_F1_PRE(0x38,AX,DX,BX,DI) \ |
| CALC_F1_POST(AX,CX,DI) |
| |
| #define CALC_87 \ |
| CALC_F1_PRE(0x3c,DI,AX,CX,SI) \ |
| PRECALC_39(Y13,0x40,0x140) \ |
| CALC_F1_POST(DI,DX,SI) |
| |
| #define CALC_88 \ |
| CALC_F1_PRE(0x50,SI,DI,DX,BX) \ |
| PRECALC_32(Y14,Y13) \ |
| CALC_F1_POST(SI,AX,BX) |
| |
| #define CALC_89 \ |
| CALC_F1_PRE(0x54,BX,SI,AX,CX) \ |
| PRECALC_33(Y8,Y12) \ |
| CALC_F1_POST(BX,DI,CX) |
| |
| #define CALC_90 \ |
| CALC_F1_PRE(0x58,CX,BX,DI,DX) \ |
| PRECALC_34(Y3) \ |
| CALC_F1_POST(CX,SI,DX) |
| |
| #define CALC_91 \ |
| CALC_F1_PRE(0x5c,DX,CX,SI,AX) \ |
| PRECALC_35(Y12) \ |
| CALC_F1_POST(DX,BX,AX) |
| |
| #define CALC_92 \ |
| CALC_F1_PRE(0x70,AX,DX,BX,DI) \ |
| PRECALC_36(Y12) \ |
| CALC_F1_POST(AX,CX,DI) |
| |
| #define CALC_93 \ |
| CALC_F1_PRE(0x74,DI,AX,CX,SI) \ |
| PRECALC_37(Y12) \ |
| CALC_F1_POST(DI,DX,SI) |
| |
| #define CALC_94 \ |
| CALC_F1_PRE(0x78,SI,DI,DX,BX) \ |
| CALC_F1_POST(SI,AX,BX) |
| |
| #define CALC_95 \ |
| CALC_F1_PRE(0x7c,BX,SI,AX,CX) \ |
| PRECALC_39(Y12,0x40,0x160) \ |
| CALC_F1_POST(BX,DI,CX) |
| |
| #define CALC_96 \ |
| CALC_F1_PRE(0x90,CX,BX,DI,DX) \ |
| PRECALC_32(Y13,Y12) \ |
| CALC_F1_POST(CX,SI,DX) |
| |
| #define CALC_97 \ |
| CALC_F1_PRE(0x94,DX,CX,SI,AX) \ |
| PRECALC_33(Y7,Y8) \ |
| CALC_F1_POST(DX,BX,AX) |
| |
| #define CALC_98 \ |
| CALC_F1_PRE(0x98,AX,DX,BX,DI) \ |
| PRECALC_34(Y15) \ |
| CALC_F1_POST(AX,CX,DI) |
| |
| #define CALC_99 \ |
| CALC_F2_PRE(0x9c,DI,AX,SI) \ |
| PRECALC_35(Y8) \ |
| CALC_F2_POST(DI,DX,CX,SI) |
| |
| #define CALC_100 \ |
| CALC_F2_PRE(0xb0,SI,DI,BX) \ |
| PRECALC_36(Y8) \ |
| CALC_F2_POST(SI,AX,DX,BX) |
| |
| #define CALC_101 \ |
| CALC_F2_PRE(0xb4,BX,SI,CX) \ |
| PRECALC_37(Y8) \ |
| CALC_F2_POST(BX,DI,AX,CX) |
| |
| #define CALC_102 \ |
| CALC_F2_PRE(0xb8,CX,BX,DX) \ |
| CALC_F2_POST(CX,SI,DI,DX) |
| |
| #define CALC_103 \ |
| CALC_F2_PRE(0xbc,DX,CX,AX) \ |
| PRECALC_39(Y8,0x40,0x180) \ |
| CALC_F2_POST(DX,BX,SI,AX) |
| |
| #define CALC_104 \ |
| CALC_F2_PRE(0xd0,AX,DX,DI) \ |
| PRECALC_32(Y12,Y8) \ |
| CALC_F2_POST(AX,CX,BX,DI) |
| |
| #define CALC_105 \ |
| CALC_F2_PRE(0xd4,DI,AX,SI) \ |
| PRECALC_33(Y5,Y7) \ |
| CALC_F2_POST(DI,DX,CX,SI) |
| |
| #define CALC_106 \ |
| CALC_F2_PRE(0xd8,SI,DI,BX) \ |
| PRECALC_34(Y14) \ |
| CALC_F2_POST(SI,AX,DX,BX) |
| |
| #define CALC_107 \ |
| CALC_F2_PRE(0xdc,BX,SI,CX) \ |
| PRECALC_35(Y7) \ |
| CALC_F2_POST(BX,DI,AX,CX) |
| |
| #define CALC_108 \ |
| CALC_F2_PRE(0xf0,CX,BX,DX) \ |
| PRECALC_36(Y7) \ |
| CALC_F2_POST(CX,SI,DI,DX) |
| |
| #define CALC_109 \ |
| CALC_F2_PRE(0xf4,DX,CX,AX) \ |
| PRECALC_37(Y7) \ |
| CALC_F2_POST(DX,BX,SI,AX) |
| |
| #define CALC_110 \ |
| CALC_F2_PRE(0xf8,AX,DX,DI) \ |
| CALC_F2_POST(AX,CX,BX,DI) |
| |
| #define CALC_111 \ |
| CALC_F2_PRE(0xfc,DI,AX,SI) \ |
| PRECALC_39(Y7,0x40,0x1a0) \ |
| CALC_F2_POST(DI,DX,CX,SI) |
| |
| #define CALC_112 \ |
| CALC_F2_PRE(0x110,SI,DI,BX) \ |
| PRECALC_32(Y8,Y7) \ |
| CALC_F2_POST(SI,AX,DX,BX) |
| |
| #define CALC_113 \ |
| CALC_F2_PRE(0x114,BX,SI,CX) \ |
| PRECALC_33(Y3,Y5) \ |
| CALC_F2_POST(BX,DI,AX,CX) |
| |
| #define CALC_114 \ |
| CALC_F2_PRE(0x118,CX,BX,DX) \ |
| PRECALC_34(Y13) \ |
| CALC_F2_POST(CX,SI,DI,DX) |
| |
| #define CALC_115 \ |
| CALC_F2_PRE(0x11c,DX,CX,AX) \ |
| PRECALC_35(Y5) \ |
| CALC_F2_POST(DX,BX,SI,AX) |
| |
| #define CALC_116 \ |
| CALC_F2_PRE(0x130,AX,DX,DI) \ |
| PRECALC_36(Y5) \ |
| CALC_F2_POST(AX,CX,BX,DI) |
| |
| #define CALC_117 \ |
| CALC_F2_PRE(0x134,DI,AX,SI) \ |
| PRECALC_37(Y5) \ |
| CALC_F2_POST(DI,DX,CX,SI) |
| |
| #define CALC_118 \ |
| CALC_F2_PRE(0x138,SI,DI,BX) \ |
| CALC_F2_POST(SI,AX,DX,BX) |
| |
| #define CALC_119 \ |
| CALC_F3_PRE(0x13c,CX) \ |
| PRECALC_39(Y5,0x40,0x1c0) \ |
| CALC_F3_POST(BX,DI,AX,CX,SI) |
| |
| #define CALC_120 \ |
| CALC_F3_PRE(0x150,DX) \ |
| PRECALC_32(Y7,Y5) \ |
| CALC_F3_POST(CX,SI,DI,DX,BX) |
| |
| #define CALC_121 \ |
| CALC_F3_PRE(0x154,AX) \ |
| PRECALC_33(Y15,Y3) \ |
| CALC_F3_POST(DX,BX,SI,AX,CX) |
| |
| #define CALC_122 \ |
| CALC_F3_PRE(0x158,DI) \ |
| PRECALC_34(Y12) \ |
| CALC_F3_POST(AX,CX,BX,DI,DX) |
| |
| #define CALC_123 \ |
| CALC_F3_PRE(0x15c,SI) \ |
| PRECALC_35(Y3) \ |
| CALC_F3_POST(DI,DX,CX,SI,AX) |
| |
| #define CALC_124 \ |
| CALC_F3_PRE(0x170,BX) \ |
| PRECALC_36(Y3) \ |
| CALC_F3_POST(SI,AX,DX,BX,DI) |
| |
| #define CALC_125 \ |
| CALC_F3_PRE(0x174,CX) \ |
| PRECALC_37(Y3) \ |
| CALC_F3_POST(BX,DI,AX,CX,SI) |
| |
| #define CALC_126 \ |
| CALC_F3_PRE(0x178,DX) \ |
| CALC_F3_POST(CX,SI,DI,DX,BX) |
| |
| #define CALC_127 \ |
| CALC_F3_PRE(0x17c,AX) \ |
| PRECALC_39(Y3,0x60,0x1e0) \ |
| CALC_F3_POST(DX,BX,SI,AX,CX) |
| |
| #define CALC_128 \ |
| CALC_F3_PRE(0x190,DI) \ |
| PRECALC_32(Y5,Y3) \ |
| CALC_F3_POST(AX,CX,BX,DI,DX) |
| |
| #define CALC_129 \ |
| CALC_F3_PRE(0x194,SI) \ |
| PRECALC_33(Y14,Y15) \ |
| CALC_F3_POST(DI,DX,CX,SI,AX) |
| |
| #define CALC_130 \ |
| CALC_F3_PRE(0x198,BX) \ |
| PRECALC_34(Y8) \ |
| CALC_F3_POST(SI,AX,DX,BX,DI) |
| |
| #define CALC_131 \ |
| CALC_F3_PRE(0x19c,CX) \ |
| PRECALC_35(Y15) \ |
| CALC_F3_POST(BX,DI,AX,CX,SI) |
| |
| #define CALC_132 \ |
| CALC_F3_PRE(0x1b0,DX) \ |
| PRECALC_36(Y15) \ |
| CALC_F3_POST(CX,SI,DI,DX,BX) |
| |
| #define CALC_133 \ |
| CALC_F3_PRE(0x1b4,AX) \ |
| PRECALC_37(Y15) \ |
| CALC_F3_POST(DX,BX,SI,AX,CX) |
| |
| #define CALC_134 \ |
| CALC_F3_PRE(0x1b8,DI) \ |
| CALC_F3_POST(AX,CX,BX,DI,DX) |
| |
| #define CALC_135 \ |
| CALC_F3_PRE(0x1bc,SI) \ |
| PRECALC_39(Y15,0x60,0x200) \ |
| CALC_F3_POST(DI,DX,CX,SI,AX) |
| |
| #define CALC_136 \ |
| CALC_F3_PRE(0x1d0,BX) \ |
| PRECALC_32(Y3,Y15) \ |
| CALC_F3_POST(SI,AX,DX,BX,DI) |
| |
| #define CALC_137 \ |
| CALC_F3_PRE(0x1d4,CX) \ |
| PRECALC_33(Y13,Y14) \ |
| CALC_F3_POST(BX,DI,AX,CX,SI) |
| |
| #define CALC_138 \ |
| CALC_F3_PRE(0x1d8,DX) \ |
| PRECALC_34(Y7) \ |
| CALC_F3_POST(CX,SI,DI,DX,BX) |
| |
| #define CALC_139 \ |
| CALC_F2_PRE(0x1dc,DX,CX,AX) \ |
| PRECALC_35(Y14) \ |
| CALC_F2_POST(DX,BX,SI,AX) |
| |
| #define CALC_140 \ |
| CALC_F2_PRE(0x1f0,AX,DX,DI) \ |
| PRECALC_36(Y14) \ |
| CALC_F2_POST(AX,CX,BX,DI) |
| |
| #define CALC_141 \ |
| CALC_F2_PRE(0x1f4,DI,AX,SI) \ |
| PRECALC_37(Y14) \ |
| CALC_F2_POST(DI,DX,CX,SI) |
| |
| #define CALC_142 \ |
| CALC_F2_PRE(0x1f8,SI,DI,BX) \ |
| CALC_F2_POST(SI,AX,DX,BX) |
| |
| #define CALC_143 \ |
| CALC_F2_PRE(0x1fc,BX,SI,CX) \ |
| PRECALC_39(Y14,0x60,0x220) \ |
| CALC_F2_POST(BX,DI,AX,CX) |
| |
| #define CALC_144 \ |
| CALC_F2_PRE(0x210,CX,BX,DX) \ |
| PRECALC_32(Y15,Y14) \ |
| CALC_F2_POST(CX,SI,DI,DX) |
| |
| #define CALC_145 \ |
| CALC_F2_PRE(0x214,DX,CX,AX) \ |
| PRECALC_33(Y12,Y13) \ |
| CALC_F2_POST(DX,BX,SI,AX) |
| |
| #define CALC_146 \ |
| CALC_F2_PRE(0x218,AX,DX,DI) \ |
| PRECALC_34(Y5) \ |
| CALC_F2_POST(AX,CX,BX,DI) |
| |
| #define CALC_147 \ |
| CALC_F2_PRE(0x21c,DI,AX,SI) \ |
| PRECALC_35(Y13) \ |
| CALC_F2_POST(DI,DX,CX,SI) |
| |
| #define CALC_148 \ |
| CALC_F2_PRE(0x230,SI,DI,BX) \ |
| PRECALC_36(Y13) \ |
| CALC_F2_POST(SI,AX,DX,BX) |
| |
| #define CALC_149 \ |
| CALC_F2_PRE(0x234,BX,SI,CX) \ |
| PRECALC_37(Y13) \ |
| CALC_F2_POST(BX,DI,AX,CX) |
| |
| #define CALC_150 \ |
| CALC_F2_PRE(0x238,CX,BX,DX) \ |
| CALC_F2_POST(CX,SI,DI,DX) |
| |
| #define CALC_151 \ |
| CALC_F2_PRE(0x23c,DX,CX,AX) \ |
| PRECALC_39(Y13,0x60,0x240) \ |
| CALC_F2_POST(DX,BX,SI,AX) |
| |
| #define CALC_152 \ |
| CALC_F2_PRE(0x250,AX,DX,DI) \ |
| PRECALC_32(Y14,Y13) \ |
| CALC_F2_POST(AX,CX,BX,DI) |
| |
| #define CALC_153 \ |
| CALC_F2_PRE(0x254,DI,AX,SI) \ |
| PRECALC_33(Y8,Y12) \ |
| CALC_F2_POST(DI,DX,CX,SI) |
| |
| #define CALC_154 \ |
| CALC_F2_PRE(0x258,SI,DI,BX) \ |
| PRECALC_34(Y3) \ |
| CALC_F2_POST(SI,AX,DX,BX) |
| |
| #define CALC_155 \ |
| CALC_F2_PRE(0x25c,BX,SI,CX) \ |
| PRECALC_35(Y12) \ |
| CALC_F2_POST(BX,DI,AX,CX) |
| |
| #define CALC_156 \ |
| CALC_F2_PRE(0x270,CX,BX,DX) \ |
| PRECALC_36(Y12) \ |
| CALC_F2_POST(CX,SI,DI,DX) |
| |
| #define CALC_157 \ |
| CALC_F2_PRE(0x274,DX,CX,AX) \ |
| PRECALC_37(Y12) \ |
| CALC_F2_POST(DX,BX,SI,AX) |
| |
| #define CALC_158 \ |
| CALC_F2_PRE(0x278,AX,DX,DI) \ |
| CALC_F2_POST(AX,CX,BX,DI) |
| |
| #define CALC_159 \ |
| ADDL 0x27c(R15),SI \ |
| LEAL (SI)(AX*1), SI \ |
| RORXL $0x1b, DI, R12 \ |
| PRECALC_39(Y12,0x60,0x260) \ |
| ADDL R12, SI |
| |
| |
| |
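// CALC runs the 160 interleaved scalar rounds for a pair of blocks. Two
// W+K buffers live on the stack: the rounds read the current pair's
// values through R15 while the PRECALC_* fragments embedded in CALC_0
// through CALC_159 fill the buffer at R14 with the next pair's values;
// XCHGQ swaps the two pointers once per iteration.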
| #define CALC \ |
| MOVL (R9), CX \ |
| MOVL 4(R9), SI \ |
| MOVL 8(R9), DI \ |
| MOVL 12(R9), AX \ |
| MOVL 16(R9), DX \ |
	MOVQ SP, R14 \ // R14 = W+K buffer being filled by PRECALC
	LEAQ (2*4*80+32)(SP), R15 \ // R15 = second W+K buffer
	PRECALC \ // precalc WK for the first 2 blocks
	XCHGQ R15, R14 \ // rounds read R15; PRECALC fills R14
loop: \ // this loop is unrolled
	CMPQ R10, R8 \ // R8 (set below, in blockAVX2) serves as the last-block sentinel
| JNE begin \ |
| VZEROUPPER \ |
| RET \ |
| begin: \ |
| CALC_0 \ |
| CALC_1 \ |
| CALC_2 \ |
| CALC_3 \ |
| CALC_4 \ |
| CALC_5 \ |
| CALC_6 \ |
| CALC_7 \ |
| CALC_8 \ |
| CALC_9 \ |
| CALC_10 \ |
| CALC_11 \ |
| CALC_12 \ |
| CALC_13 \ |
| CALC_14 \ |
| CALC_15 \ |
| CALC_16 \ |
| CALC_17 \ |
| CALC_18 \ |
| CALC_19 \ |
| CALC_20 \ |
| CALC_21 \ |
| CALC_22 \ |
| CALC_23 \ |
| CALC_24 \ |
| CALC_25 \ |
| CALC_26 \ |
| CALC_27 \ |
| CALC_28 \ |
| CALC_29 \ |
| CALC_30 \ |
| CALC_31 \ |
| CALC_32 \ |
| CALC_33 \ |
| CALC_34 \ |
| CALC_35 \ |
| CALC_36 \ |
| CALC_37 \ |
| CALC_38 \ |
| CALC_39 \ |
| CALC_40 \ |
| CALC_41 \ |
| CALC_42 \ |
| CALC_43 \ |
| CALC_44 \ |
| CALC_45 \ |
| CALC_46 \ |
| CALC_47 \ |
| CALC_48 \ |
| CALC_49 \ |
| CALC_50 \ |
| CALC_51 \ |
| CALC_52 \ |
| CALC_53 \ |
| CALC_54 \ |
| CALC_55 \ |
| CALC_56 \ |
| CALC_57 \ |
| CALC_58 \ |
| CALC_59 \ |
	ADDQ $128, R10 \ // move to the next even 64-byte block
	CMPQ R10, R11 \ // is the current block the last one?
	CMOVQCC R8, R10 \ // if so, set R10 = R8 to signal the last iteration
| CALC_60 \ |
| CALC_61 \ |
| CALC_62 \ |
| CALC_63 \ |
| CALC_64 \ |
| CALC_65 \ |
| CALC_66 \ |
| CALC_67 \ |
| CALC_68 \ |
| CALC_69 \ |
| CALC_70 \ |
| CALC_71 \ |
| CALC_72 \ |
| CALC_73 \ |
| CALC_74 \ |
| CALC_75 \ |
| CALC_76 \ |
| CALC_77 \ |
| CALC_78 \ |
| CALC_79 \ |
| UPDATE_HASH(AX,DX,BX,SI,DI) \ |
| CMPQ R10, R8 \ // is current block the last one? |
	JE loop \
| MOVL DX, CX \ |
| CALC_80 \ |
| CALC_81 \ |
| CALC_82 \ |
| CALC_83 \ |
| CALC_84 \ |
| CALC_85 \ |
| CALC_86 \ |
| CALC_87 \ |
| CALC_88 \ |
| CALC_89 \ |
| CALC_90 \ |
| CALC_91 \ |
| CALC_92 \ |
| CALC_93 \ |
| CALC_94 \ |
| CALC_95 \ |
| CALC_96 \ |
| CALC_97 \ |
| CALC_98 \ |
| CALC_99 \ |
| CALC_100 \ |
| CALC_101 \ |
| CALC_102 \ |
| CALC_103 \ |
| CALC_104 \ |
| CALC_105 \ |
| CALC_106 \ |
| CALC_107 \ |
| CALC_108 \ |
| CALC_109 \ |
| CALC_110 \ |
| CALC_111 \ |
| CALC_112 \ |
| CALC_113 \ |
| CALC_114 \ |
| CALC_115 \ |
| CALC_116 \ |
| CALC_117 \ |
| CALC_118 \ |
| CALC_119 \ |
| CALC_120 \ |
| CALC_121 \ |
| CALC_122 \ |
| CALC_123 \ |
| CALC_124 \ |
| CALC_125 \ |
| CALC_126 \ |
| CALC_127 \ |
| CALC_128 \ |
| CALC_129 \ |
| CALC_130 \ |
| CALC_131 \ |
| CALC_132 \ |
| CALC_133 \ |
| CALC_134 \ |
| CALC_135 \ |
| CALC_136 \ |
| CALC_137 \ |
| CALC_138 \ |
| CALC_139 \ |
	ADDQ $128, R13 \ // move to the next odd 64-byte block
	CMPQ R13, R11 \ // is the current block the last one?
	CMOVQCC R8, R10 \ // if so, set R10 = R8 to signal the last iteration
| CALC_140 \ |
| CALC_141 \ |
| CALC_142 \ |
| CALC_143 \ |
| CALC_144 \ |
| CALC_145 \ |
| CALC_146 \ |
| CALC_147 \ |
| CALC_148 \ |
| CALC_149 \ |
| CALC_150 \ |
| CALC_151 \ |
| CALC_152 \ |
| CALC_153 \ |
| CALC_154 \ |
| CALC_155 \ |
| CALC_156 \ |
| CALC_157 \ |
| CALC_158 \ |
| CALC_159 \ |
| UPDATE_HASH(SI,DI,DX,CX,BX) \ |
	MOVL SI, R12 \ // reset the state for the AVX2 register permutation
| MOVL DI, SI \ |
| MOVL DX, DI \ |
| MOVL BX, DX \ |
| MOVL CX, AX \ |
| MOVL R12, CX \ |
| XCHGQ R15, R14 \ |
| JMP loop |
| |
| |
| |
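// func blockAVX2(dig *digest, p []byte)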
| TEXT ·blockAVX2(SB),$1408-32 |
| |
| MOVQ dig+0(FP), DI |
| MOVQ p_base+8(FP), SI |
| MOVQ p_len+16(FP), DX |
	SHRQ $6, DX
	SHLQ $6, DX // DX = p_len &^ 63: process only whole 64-byte blocks
| |
| MOVQ $K_XMM_AR<>(SB), R8 |
| |
| MOVQ DI, R9 |
| MOVQ SI, R10 |
| LEAQ 64(SI), R13 |
| |
| ADDQ SI, DX |
| ADDQ $64, DX |
| MOVQ DX, R11 |
| |
| CMPQ R13, R11 |
| CMOVQCC R8, R13 |
| |
	VMOVDQU BSWAP_SHUFB_CTL<>(SB), Y10 // load the byte-swap shuffle mask
| |
| CALC // RET is inside macros |
| |
| DATA K_XMM_AR<>+0x00(SB)/4,$0x5a827999 |
| DATA K_XMM_AR<>+0x04(SB)/4,$0x5a827999 |
| DATA K_XMM_AR<>+0x08(SB)/4,$0x5a827999 |
| DATA K_XMM_AR<>+0x0c(SB)/4,$0x5a827999 |
| DATA K_XMM_AR<>+0x10(SB)/4,$0x5a827999 |
| DATA K_XMM_AR<>+0x14(SB)/4,$0x5a827999 |
| DATA K_XMM_AR<>+0x18(SB)/4,$0x5a827999 |
| DATA K_XMM_AR<>+0x1c(SB)/4,$0x5a827999 |
| DATA K_XMM_AR<>+0x20(SB)/4,$0x6ed9eba1 |
| DATA K_XMM_AR<>+0x24(SB)/4,$0x6ed9eba1 |
| DATA K_XMM_AR<>+0x28(SB)/4,$0x6ed9eba1 |
| DATA K_XMM_AR<>+0x2c(SB)/4,$0x6ed9eba1 |
| DATA K_XMM_AR<>+0x30(SB)/4,$0x6ed9eba1 |
| DATA K_XMM_AR<>+0x34(SB)/4,$0x6ed9eba1 |
| DATA K_XMM_AR<>+0x38(SB)/4,$0x6ed9eba1 |
| DATA K_XMM_AR<>+0x3c(SB)/4,$0x6ed9eba1 |
| DATA K_XMM_AR<>+0x40(SB)/4,$0x8f1bbcdc |
| DATA K_XMM_AR<>+0x44(SB)/4,$0x8f1bbcdc |
| DATA K_XMM_AR<>+0x48(SB)/4,$0x8f1bbcdc |
| DATA K_XMM_AR<>+0x4c(SB)/4,$0x8f1bbcdc |
| DATA K_XMM_AR<>+0x50(SB)/4,$0x8f1bbcdc |
| DATA K_XMM_AR<>+0x54(SB)/4,$0x8f1bbcdc |
| DATA K_XMM_AR<>+0x58(SB)/4,$0x8f1bbcdc |
| DATA K_XMM_AR<>+0x5c(SB)/4,$0x8f1bbcdc |
| DATA K_XMM_AR<>+0x60(SB)/4,$0xca62c1d6 |
| DATA K_XMM_AR<>+0x64(SB)/4,$0xca62c1d6 |
| DATA K_XMM_AR<>+0x68(SB)/4,$0xca62c1d6 |
| DATA K_XMM_AR<>+0x6c(SB)/4,$0xca62c1d6 |
| DATA K_XMM_AR<>+0x70(SB)/4,$0xca62c1d6 |
| DATA K_XMM_AR<>+0x74(SB)/4,$0xca62c1d6 |
| DATA K_XMM_AR<>+0x78(SB)/4,$0xca62c1d6 |
| DATA K_XMM_AR<>+0x7c(SB)/4,$0xca62c1d6 |
| GLOBL K_XMM_AR<>(SB),RODATA,$128 |
| |
| DATA BSWAP_SHUFB_CTL<>+0x00(SB)/4,$0x00010203 |
| DATA BSWAP_SHUFB_CTL<>+0x04(SB)/4,$0x04050607 |
| DATA BSWAP_SHUFB_CTL<>+0x08(SB)/4,$0x08090a0b |
| DATA BSWAP_SHUFB_CTL<>+0x0c(SB)/4,$0x0c0d0e0f |
| DATA BSWAP_SHUFB_CTL<>+0x10(SB)/4,$0x00010203 |
| DATA BSWAP_SHUFB_CTL<>+0x14(SB)/4,$0x04050607 |
| DATA BSWAP_SHUFB_CTL<>+0x18(SB)/4,$0x08090a0b |
| DATA BSWAP_SHUFB_CTL<>+0x1c(SB)/4,$0x0c0d0e0f |
| GLOBL BSWAP_SHUFB_CTL<>(SB),RODATA,$32 |