| // Code generated by command: go run sha256block_amd64_asm.go -out ../sha256block_amd64.s. DO NOT EDIT. |
| |
| //go:build !purego |
| |
| #include "textflag.h" |
| |
| // func blockAVX2(dig *Digest, p []byte) |
| // Requires: AVX, AVX2, BMI2 |
| TEXT ·blockAVX2(SB), $536-32 |
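	// Strategy: process two 64-byte blocks per pass where possible. The
	// message schedules of both blocks are computed together in YMM registers
	// (one block per 128-bit lane) while the 64 scalar rounds for the first
	// block run interleaved with that scheduling; the second block's rounds
	// then replay the W+K values saved on the stack (avx2_loop3). Stack
	// layout: 0-511(SP) interleaved W+K values, 512(SP) address of the final
	// block, 520(SP) saved input pointer.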
| MOVQ dig+0(FP), SI |
| MOVQ p_base+8(FP), DI |
| MOVQ p_len+16(FP), DX |
| LEAQ -64(DI)(DX*1), DX |
| MOVQ DX, 512(SP) |
| CMPQ DX, DI |
| JE avx2_only_one_block |
| |
| // Load initial digest |
| MOVL (SI), AX |
| MOVL 4(SI), BX |
| MOVL 8(SI), CX |
| MOVL 12(SI), R8 |
| MOVL 16(SI), DX |
| MOVL 20(SI), R9 |
| MOVL 24(SI), R10 |
| MOVL 28(SI), R11 |
| |
| avx2_loop0: |
	// each iteration of this loop loads and processes two 512-bit blocks
| VMOVDQU (DI), Y0 |
| VMOVDQU 32(DI), Y1 |
| VMOVDQU 64(DI), Y2 |
| VMOVDQU 96(DI), Y3 |
| VMOVDQU flip_mask<>+0(SB), Y13 |
| |
| // Apply Byte Flip Mask: LE -> BE |
| VPSHUFB Y13, Y0, Y0 |
| VPSHUFB Y13, Y1, Y1 |
| VPSHUFB Y13, Y2, Y2 |
| VPSHUFB Y13, Y3, Y3 |
| |
| // Transpose data into high/low parts |
| VPERM2I128 $0x20, Y2, Y0, Y4 |
| VPERM2I128 $0x31, Y2, Y0, Y5 |
| VPERM2I128 $0x20, Y3, Y1, Y6 |
| VPERM2I128 $0x31, Y3, Y1, Y7 |
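	// Y4-Y7 now each hold four words of the first block in the low 128-bit
	// lane and the corresponding words of the second block in the high lane.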
| LEAQ K256<>+0(SB), BP |
| |
| avx2_last_block_enter: |
| ADDQ $0x40, DI |
| MOVQ DI, 520(SP) |
| XORQ SI, SI |
| |
| avx2_loop1: |
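	// Reference per-round computation (FIPS 180-4), for orientation:
	//   T1 = h + Sigma1(e) + Ch(e,f,g) + K[t] + W[t]
	//   T2 = Sigma0(a) + Maj(a,b,c)
	// where Sigma1(x) = ROTR(x,6)^ROTR(x,11)^ROTR(x,25) (the RORXL
	// $0x06/$0x0b/$0x19 below), Sigma0(x) = ROTR(x,2)^ROTR(x,13)^ROTR(x,22)
	// (RORXL $0x02/$0x0d/$0x16), Ch is computed branch-free as ((f^g)&e)^g,
	// and Maj as ((a|c)&b)|(a&c). The interleaved vector instructions extend
	// the message schedule:
	//   W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16]
	// with sigma0(x) = ROTR(x,7)^ROTR(x,18)^SHR(x,3) and
	// sigma1(x) = ROTR(x,17)^ROTR(x,19)^SHR(x,10). SI advances 0x80 bytes per
	// 16 rounds, indexing both K256 and the stack save area in step.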
| // Do 4 rounds and scheduling |
| VPADDD (BP)(SI*1), Y4, Y9 |
| VMOVDQU Y9, (SP)(SI*1) |
| MOVL AX, DI |
| RORXL $0x19, DX, R13 |
| RORXL $0x0b, DX, R14 |
| ADDL (SP)(SI*1), R11 |
| ORL CX, DI |
| VPALIGNR $0x04, Y6, Y7, Y0 |
| MOVL R9, R15 |
| RORXL $0x0d, AX, R12 |
| XORL R14, R13 |
| XORL R10, R15 |
| VPADDD Y4, Y0, Y0 |
| RORXL $0x06, DX, R14 |
| ANDL DX, R15 |
| XORL R14, R13 |
| RORXL $0x16, AX, R14 |
| ADDL R11, R8 |
| ANDL BX, DI |
| VPALIGNR $0x04, Y4, Y5, Y1 |
| XORL R12, R14 |
| RORXL $0x02, AX, R12 |
| XORL R10, R15 |
| VPSRLD $0x07, Y1, Y2 |
| XORL R12, R14 |
| MOVL AX, R12 |
| ANDL CX, R12 |
| ADDL R13, R15 |
| VPSLLD $0x19, Y1, Y3 |
| ORL R12, DI |
| ADDL R14, R11 |
| ADDL R15, R8 |
| VPOR Y2, Y3, Y3 |
| VPSRLD $0x12, Y1, Y2 |
| ADDL R15, R11 |
| ADDL DI, R11 |
| MOVL R11, DI |
| RORXL $0x19, R8, R13 |
| RORXL $0x0b, R8, R14 |
| ADDL 4(SP)(SI*1), R10 |
| ORL BX, DI |
| VPSRLD $0x03, Y1, Y8 |
| MOVL DX, R15 |
| RORXL $0x0d, R11, R12 |
| XORL R14, R13 |
| XORL R9, R15 |
| RORXL $0x06, R8, R14 |
| XORL R14, R13 |
| RORXL $0x16, R11, R14 |
| ANDL R8, R15 |
| ADDL R10, CX |
| VPSLLD $0x0e, Y1, Y1 |
| ANDL AX, DI |
| XORL R12, R14 |
| VPXOR Y1, Y3, Y3 |
| RORXL $0x02, R11, R12 |
| XORL R9, R15 |
| VPXOR Y2, Y3, Y3 |
| XORL R12, R14 |
| MOVL R11, R12 |
| ANDL BX, R12 |
| ADDL R13, R15 |
| VPXOR Y8, Y3, Y1 |
| VPSHUFD $0xfa, Y7, Y2 |
| ORL R12, DI |
| ADDL R14, R10 |
| VPADDD Y1, Y0, Y0 |
| ADDL R15, CX |
| ADDL R15, R10 |
| ADDL DI, R10 |
| VPSRLD $0x0a, Y2, Y8 |
| MOVL R10, DI |
| RORXL $0x19, CX, R13 |
| ADDL 8(SP)(SI*1), R9 |
| VPSRLQ $0x13, Y2, Y3 |
| RORXL $0x0b, CX, R14 |
| ORL AX, DI |
| MOVL R8, R15 |
| XORL DX, R15 |
| RORXL $0x0d, R10, R12 |
| XORL R14, R13 |
| VPSRLQ $0x11, Y2, Y2 |
| ANDL CX, R15 |
| RORXL $0x06, CX, R14 |
| VPXOR Y3, Y2, Y2 |
| ADDL R9, BX |
| ANDL R11, DI |
| XORL R14, R13 |
| RORXL $0x16, R10, R14 |
| VPXOR Y2, Y8, Y8 |
| XORL DX, R15 |
| VPSHUFB shuff_00BA<>+0(SB), Y8, Y8 |
| XORL R12, R14 |
| RORXL $0x02, R10, R12 |
| VPADDD Y8, Y0, Y0 |
| XORL R12, R14 |
| MOVL R10, R12 |
| ANDL AX, R12 |
| ADDL R13, R15 |
| VPSHUFD $0x50, Y0, Y2 |
| ORL R12, DI |
| ADDL R14, R9 |
| ADDL R15, BX |
| ADDL R15, R9 |
| ADDL DI, R9 |
| MOVL R9, DI |
| RORXL $0x19, BX, R13 |
| RORXL $0x0b, BX, R14 |
| ADDL 12(SP)(SI*1), DX |
| ORL R11, DI |
| VPSRLD $0x0a, Y2, Y11 |
| MOVL CX, R15 |
| RORXL $0x0d, R9, R12 |
| XORL R14, R13 |
| XORL R8, R15 |
| VPSRLQ $0x13, Y2, Y3 |
| RORXL $0x06, BX, R14 |
| ANDL BX, R15 |
| ADDL DX, AX |
| ANDL R10, DI |
| VPSRLQ $0x11, Y2, Y2 |
| XORL R14, R13 |
| XORL R8, R15 |
| VPXOR Y3, Y2, Y2 |
| RORXL $0x16, R9, R14 |
| ADDL R13, R15 |
| VPXOR Y2, Y11, Y11 |
| XORL R12, R14 |
| ADDL R15, AX |
| RORXL $0x02, R9, R12 |
| VPSHUFB shuff_DC00<>+0(SB), Y11, Y11 |
| VPADDD Y0, Y11, Y4 |
| XORL R12, R14 |
| MOVL R9, R12 |
| ANDL R11, R12 |
| ORL R12, DI |
| ADDL R14, DX |
| ADDL R15, DX |
| ADDL DI, DX |
| |
| // Do 4 rounds and scheduling |
| VPADDD 32(BP)(SI*1), Y5, Y9 |
| VMOVDQU Y9, 32(SP)(SI*1) |
| MOVL DX, DI |
| RORXL $0x19, AX, R13 |
| RORXL $0x0b, AX, R14 |
| ADDL 32(SP)(SI*1), R8 |
| ORL R10, DI |
| VPALIGNR $0x04, Y7, Y4, Y0 |
| MOVL BX, R15 |
| RORXL $0x0d, DX, R12 |
| XORL R14, R13 |
| XORL CX, R15 |
| VPADDD Y5, Y0, Y0 |
| RORXL $0x06, AX, R14 |
| ANDL AX, R15 |
| XORL R14, R13 |
| RORXL $0x16, DX, R14 |
| ADDL R8, R11 |
| ANDL R9, DI |
| VPALIGNR $0x04, Y5, Y6, Y1 |
| XORL R12, R14 |
| RORXL $0x02, DX, R12 |
| XORL CX, R15 |
| VPSRLD $0x07, Y1, Y2 |
| XORL R12, R14 |
| MOVL DX, R12 |
| ANDL R10, R12 |
| ADDL R13, R15 |
| VPSLLD $0x19, Y1, Y3 |
| ORL R12, DI |
| ADDL R14, R8 |
| ADDL R15, R11 |
| VPOR Y2, Y3, Y3 |
| VPSRLD $0x12, Y1, Y2 |
| ADDL R15, R8 |
| ADDL DI, R8 |
| MOVL R8, DI |
| RORXL $0x19, R11, R13 |
| RORXL $0x0b, R11, R14 |
| ADDL 36(SP)(SI*1), CX |
| ORL R9, DI |
| VPSRLD $0x03, Y1, Y8 |
| MOVL AX, R15 |
| RORXL $0x0d, R8, R12 |
| XORL R14, R13 |
| XORL BX, R15 |
| RORXL $0x06, R11, R14 |
| XORL R14, R13 |
| RORXL $0x16, R8, R14 |
| ANDL R11, R15 |
| ADDL CX, R10 |
| VPSLLD $0x0e, Y1, Y1 |
| ANDL DX, DI |
| XORL R12, R14 |
| VPXOR Y1, Y3, Y3 |
| RORXL $0x02, R8, R12 |
| XORL BX, R15 |
| VPXOR Y2, Y3, Y3 |
| XORL R12, R14 |
| MOVL R8, R12 |
| ANDL R9, R12 |
| ADDL R13, R15 |
| VPXOR Y8, Y3, Y1 |
| VPSHUFD $0xfa, Y4, Y2 |
| ORL R12, DI |
| ADDL R14, CX |
| VPADDD Y1, Y0, Y0 |
| ADDL R15, R10 |
| ADDL R15, CX |
| ADDL DI, CX |
| VPSRLD $0x0a, Y2, Y8 |
| MOVL CX, DI |
| RORXL $0x19, R10, R13 |
| ADDL 40(SP)(SI*1), BX |
| VPSRLQ $0x13, Y2, Y3 |
| RORXL $0x0b, R10, R14 |
| ORL DX, DI |
| MOVL R11, R15 |
| XORL AX, R15 |
| RORXL $0x0d, CX, R12 |
| XORL R14, R13 |
| VPSRLQ $0x11, Y2, Y2 |
| ANDL R10, R15 |
| RORXL $0x06, R10, R14 |
| VPXOR Y3, Y2, Y2 |
| ADDL BX, R9 |
| ANDL R8, DI |
| XORL R14, R13 |
| RORXL $0x16, CX, R14 |
| VPXOR Y2, Y8, Y8 |
| XORL AX, R15 |
| VPSHUFB shuff_00BA<>+0(SB), Y8, Y8 |
| XORL R12, R14 |
| RORXL $0x02, CX, R12 |
| VPADDD Y8, Y0, Y0 |
| XORL R12, R14 |
| MOVL CX, R12 |
| ANDL DX, R12 |
| ADDL R13, R15 |
| VPSHUFD $0x50, Y0, Y2 |
| ORL R12, DI |
| ADDL R14, BX |
| ADDL R15, R9 |
| ADDL R15, BX |
| ADDL DI, BX |
| MOVL BX, DI |
| RORXL $0x19, R9, R13 |
| RORXL $0x0b, R9, R14 |
| ADDL 44(SP)(SI*1), AX |
| ORL R8, DI |
| VPSRLD $0x0a, Y2, Y11 |
| MOVL R10, R15 |
| RORXL $0x0d, BX, R12 |
| XORL R14, R13 |
| XORL R11, R15 |
| VPSRLQ $0x13, Y2, Y3 |
| RORXL $0x06, R9, R14 |
| ANDL R9, R15 |
| ADDL AX, DX |
| ANDL CX, DI |
| VPSRLQ $0x11, Y2, Y2 |
| XORL R14, R13 |
| XORL R11, R15 |
| VPXOR Y3, Y2, Y2 |
| RORXL $0x16, BX, R14 |
| ADDL R13, R15 |
| VPXOR Y2, Y11, Y11 |
| XORL R12, R14 |
| ADDL R15, DX |
| RORXL $0x02, BX, R12 |
| VPSHUFB shuff_DC00<>+0(SB), Y11, Y11 |
| VPADDD Y0, Y11, Y5 |
| XORL R12, R14 |
| MOVL BX, R12 |
| ANDL R8, R12 |
| ORL R12, DI |
| ADDL R14, AX |
| ADDL R15, AX |
| ADDL DI, AX |
| |
| // Do 4 rounds and scheduling |
| VPADDD 64(BP)(SI*1), Y6, Y9 |
| VMOVDQU Y9, 64(SP)(SI*1) |
| MOVL AX, DI |
| RORXL $0x19, DX, R13 |
| RORXL $0x0b, DX, R14 |
| ADDL 64(SP)(SI*1), R11 |
| ORL CX, DI |
| VPALIGNR $0x04, Y4, Y5, Y0 |
| MOVL R9, R15 |
| RORXL $0x0d, AX, R12 |
| XORL R14, R13 |
| XORL R10, R15 |
| VPADDD Y6, Y0, Y0 |
| RORXL $0x06, DX, R14 |
| ANDL DX, R15 |
| XORL R14, R13 |
| RORXL $0x16, AX, R14 |
| ADDL R11, R8 |
| ANDL BX, DI |
| VPALIGNR $0x04, Y6, Y7, Y1 |
| XORL R12, R14 |
| RORXL $0x02, AX, R12 |
| XORL R10, R15 |
| VPSRLD $0x07, Y1, Y2 |
| XORL R12, R14 |
| MOVL AX, R12 |
| ANDL CX, R12 |
| ADDL R13, R15 |
| VPSLLD $0x19, Y1, Y3 |
| ORL R12, DI |
| ADDL R14, R11 |
| ADDL R15, R8 |
| VPOR Y2, Y3, Y3 |
| VPSRLD $0x12, Y1, Y2 |
| ADDL R15, R11 |
| ADDL DI, R11 |
| MOVL R11, DI |
| RORXL $0x19, R8, R13 |
| RORXL $0x0b, R8, R14 |
| ADDL 68(SP)(SI*1), R10 |
| ORL BX, DI |
| VPSRLD $0x03, Y1, Y8 |
| MOVL DX, R15 |
| RORXL $0x0d, R11, R12 |
| XORL R14, R13 |
| XORL R9, R15 |
| RORXL $0x06, R8, R14 |
| XORL R14, R13 |
| RORXL $0x16, R11, R14 |
| ANDL R8, R15 |
| ADDL R10, CX |
| VPSLLD $0x0e, Y1, Y1 |
| ANDL AX, DI |
| XORL R12, R14 |
| VPXOR Y1, Y3, Y3 |
| RORXL $0x02, R11, R12 |
| XORL R9, R15 |
| VPXOR Y2, Y3, Y3 |
| XORL R12, R14 |
| MOVL R11, R12 |
| ANDL BX, R12 |
| ADDL R13, R15 |
| VPXOR Y8, Y3, Y1 |
| VPSHUFD $0xfa, Y5, Y2 |
| ORL R12, DI |
| ADDL R14, R10 |
| VPADDD Y1, Y0, Y0 |
| ADDL R15, CX |
| ADDL R15, R10 |
| ADDL DI, R10 |
| VPSRLD $0x0a, Y2, Y8 |
| MOVL R10, DI |
| RORXL $0x19, CX, R13 |
| ADDL 72(SP)(SI*1), R9 |
| VPSRLQ $0x13, Y2, Y3 |
| RORXL $0x0b, CX, R14 |
| ORL AX, DI |
| MOVL R8, R15 |
| XORL DX, R15 |
| RORXL $0x0d, R10, R12 |
| XORL R14, R13 |
| VPSRLQ $0x11, Y2, Y2 |
| ANDL CX, R15 |
| RORXL $0x06, CX, R14 |
| VPXOR Y3, Y2, Y2 |
| ADDL R9, BX |
| ANDL R11, DI |
| XORL R14, R13 |
| RORXL $0x16, R10, R14 |
| VPXOR Y2, Y8, Y8 |
| XORL DX, R15 |
| VPSHUFB shuff_00BA<>+0(SB), Y8, Y8 |
| XORL R12, R14 |
| RORXL $0x02, R10, R12 |
| VPADDD Y8, Y0, Y0 |
| XORL R12, R14 |
| MOVL R10, R12 |
| ANDL AX, R12 |
| ADDL R13, R15 |
| VPSHUFD $0x50, Y0, Y2 |
| ORL R12, DI |
| ADDL R14, R9 |
| ADDL R15, BX |
| ADDL R15, R9 |
| ADDL DI, R9 |
| MOVL R9, DI |
| RORXL $0x19, BX, R13 |
| RORXL $0x0b, BX, R14 |
| ADDL 76(SP)(SI*1), DX |
| ORL R11, DI |
| VPSRLD $0x0a, Y2, Y11 |
| MOVL CX, R15 |
| RORXL $0x0d, R9, R12 |
| XORL R14, R13 |
| XORL R8, R15 |
| VPSRLQ $0x13, Y2, Y3 |
| RORXL $0x06, BX, R14 |
| ANDL BX, R15 |
| ADDL DX, AX |
| ANDL R10, DI |
| VPSRLQ $0x11, Y2, Y2 |
| XORL R14, R13 |
| XORL R8, R15 |
| VPXOR Y3, Y2, Y2 |
| RORXL $0x16, R9, R14 |
| ADDL R13, R15 |
| VPXOR Y2, Y11, Y11 |
| XORL R12, R14 |
| ADDL R15, AX |
| RORXL $0x02, R9, R12 |
| VPSHUFB shuff_DC00<>+0(SB), Y11, Y11 |
| VPADDD Y0, Y11, Y6 |
| XORL R12, R14 |
| MOVL R9, R12 |
| ANDL R11, R12 |
| ORL R12, DI |
| ADDL R14, DX |
| ADDL R15, DX |
| ADDL DI, DX |
| |
| // Do 4 rounds and scheduling |
| VPADDD 96(BP)(SI*1), Y7, Y9 |
| VMOVDQU Y9, 96(SP)(SI*1) |
| MOVL DX, DI |
| RORXL $0x19, AX, R13 |
| RORXL $0x0b, AX, R14 |
| ADDL 96(SP)(SI*1), R8 |
| ORL R10, DI |
| VPALIGNR $0x04, Y5, Y6, Y0 |
| MOVL BX, R15 |
| RORXL $0x0d, DX, R12 |
| XORL R14, R13 |
| XORL CX, R15 |
| VPADDD Y7, Y0, Y0 |
| RORXL $0x06, AX, R14 |
| ANDL AX, R15 |
| XORL R14, R13 |
| RORXL $0x16, DX, R14 |
| ADDL R8, R11 |
| ANDL R9, DI |
| VPALIGNR $0x04, Y7, Y4, Y1 |
| XORL R12, R14 |
| RORXL $0x02, DX, R12 |
| XORL CX, R15 |
| VPSRLD $0x07, Y1, Y2 |
| XORL R12, R14 |
| MOVL DX, R12 |
| ANDL R10, R12 |
| ADDL R13, R15 |
| VPSLLD $0x19, Y1, Y3 |
| ORL R12, DI |
| ADDL R14, R8 |
| ADDL R15, R11 |
| VPOR Y2, Y3, Y3 |
| VPSRLD $0x12, Y1, Y2 |
| ADDL R15, R8 |
| ADDL DI, R8 |
| MOVL R8, DI |
| RORXL $0x19, R11, R13 |
| RORXL $0x0b, R11, R14 |
| ADDL 100(SP)(SI*1), CX |
| ORL R9, DI |
| VPSRLD $0x03, Y1, Y8 |
| MOVL AX, R15 |
| RORXL $0x0d, R8, R12 |
| XORL R14, R13 |
| XORL BX, R15 |
| RORXL $0x06, R11, R14 |
| XORL R14, R13 |
| RORXL $0x16, R8, R14 |
| ANDL R11, R15 |
| ADDL CX, R10 |
| VPSLLD $0x0e, Y1, Y1 |
| ANDL DX, DI |
| XORL R12, R14 |
| VPXOR Y1, Y3, Y3 |
| RORXL $0x02, R8, R12 |
| XORL BX, R15 |
| VPXOR Y2, Y3, Y3 |
| XORL R12, R14 |
| MOVL R8, R12 |
| ANDL R9, R12 |
| ADDL R13, R15 |
| VPXOR Y8, Y3, Y1 |
| VPSHUFD $0xfa, Y6, Y2 |
| ORL R12, DI |
| ADDL R14, CX |
| VPADDD Y1, Y0, Y0 |
| ADDL R15, R10 |
| ADDL R15, CX |
| ADDL DI, CX |
| VPSRLD $0x0a, Y2, Y8 |
| MOVL CX, DI |
| RORXL $0x19, R10, R13 |
| ADDL 104(SP)(SI*1), BX |
| VPSRLQ $0x13, Y2, Y3 |
| RORXL $0x0b, R10, R14 |
| ORL DX, DI |
| MOVL R11, R15 |
| XORL AX, R15 |
| RORXL $0x0d, CX, R12 |
| XORL R14, R13 |
| VPSRLQ $0x11, Y2, Y2 |
| ANDL R10, R15 |
| RORXL $0x06, R10, R14 |
| VPXOR Y3, Y2, Y2 |
| ADDL BX, R9 |
| ANDL R8, DI |
| XORL R14, R13 |
| RORXL $0x16, CX, R14 |
| VPXOR Y2, Y8, Y8 |
| XORL AX, R15 |
| VPSHUFB shuff_00BA<>+0(SB), Y8, Y8 |
| XORL R12, R14 |
| RORXL $0x02, CX, R12 |
| VPADDD Y8, Y0, Y0 |
| XORL R12, R14 |
| MOVL CX, R12 |
| ANDL DX, R12 |
| ADDL R13, R15 |
| VPSHUFD $0x50, Y0, Y2 |
| ORL R12, DI |
| ADDL R14, BX |
| ADDL R15, R9 |
| ADDL R15, BX |
| ADDL DI, BX |
| MOVL BX, DI |
| RORXL $0x19, R9, R13 |
| RORXL $0x0b, R9, R14 |
| ADDL 108(SP)(SI*1), AX |
| ORL R8, DI |
| VPSRLD $0x0a, Y2, Y11 |
| MOVL R10, R15 |
| RORXL $0x0d, BX, R12 |
| XORL R14, R13 |
| XORL R11, R15 |
| VPSRLQ $0x13, Y2, Y3 |
| RORXL $0x06, R9, R14 |
| ANDL R9, R15 |
| ADDL AX, DX |
| ANDL CX, DI |
| VPSRLQ $0x11, Y2, Y2 |
| XORL R14, R13 |
| XORL R11, R15 |
| VPXOR Y3, Y2, Y2 |
| RORXL $0x16, BX, R14 |
| ADDL R13, R15 |
| VPXOR Y2, Y11, Y11 |
| XORL R12, R14 |
| ADDL R15, DX |
| RORXL $0x02, BX, R12 |
| VPSHUFB shuff_DC00<>+0(SB), Y11, Y11 |
| VPADDD Y0, Y11, Y7 |
| XORL R12, R14 |
| MOVL BX, R12 |
| ANDL R8, R12 |
| ORL R12, DI |
| ADDL R14, AX |
| ADDL R15, AX |
| ADDL DI, AX |
| ADDQ $0x80, SI |
| CMPQ SI, $0x00000180 |
| JB avx2_loop1 |
| |
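	// Rounds 48-63 of the first block: the message schedule is complete, so
	// only the scalar rounds remain; the VPADDD/VMOVDQU pair still stores W+K
	// so that avx2_loop3 can replay the second block.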
| avx2_loop2: |
| VPADDD (BP)(SI*1), Y4, Y9 |
| VMOVDQU Y9, (SP)(SI*1) |
| MOVL R9, R15 |
| RORXL $0x19, DX, R13 |
| RORXL $0x0b, DX, R14 |
| XORL R10, R15 |
| XORL R14, R13 |
| RORXL $0x06, DX, R14 |
| ANDL DX, R15 |
| XORL R14, R13 |
| RORXL $0x0d, AX, R12 |
| XORL R10, R15 |
| RORXL $0x16, AX, R14 |
| MOVL AX, DI |
| XORL R12, R14 |
| RORXL $0x02, AX, R12 |
| ADDL (SP)(SI*1), R11 |
| ORL CX, DI |
| XORL R12, R14 |
| MOVL AX, R12 |
| ANDL BX, DI |
| ANDL CX, R12 |
| ADDL R13, R15 |
| ADDL R11, R8 |
| ORL R12, DI |
| ADDL R14, R11 |
| ADDL R15, R8 |
| ADDL R15, R11 |
| MOVL DX, R15 |
| RORXL $0x19, R8, R13 |
| RORXL $0x0b, R8, R14 |
| XORL R9, R15 |
| XORL R14, R13 |
| RORXL $0x06, R8, R14 |
| ANDL R8, R15 |
| ADDL DI, R11 |
| XORL R14, R13 |
| RORXL $0x0d, R11, R12 |
| XORL R9, R15 |
| RORXL $0x16, R11, R14 |
| MOVL R11, DI |
| XORL R12, R14 |
| RORXL $0x02, R11, R12 |
| ADDL 4(SP)(SI*1), R10 |
| ORL BX, DI |
| XORL R12, R14 |
| MOVL R11, R12 |
| ANDL AX, DI |
| ANDL BX, R12 |
| ADDL R13, R15 |
| ADDL R10, CX |
| ORL R12, DI |
| ADDL R14, R10 |
| ADDL R15, CX |
| ADDL R15, R10 |
| MOVL R8, R15 |
| RORXL $0x19, CX, R13 |
| RORXL $0x0b, CX, R14 |
| XORL DX, R15 |
| XORL R14, R13 |
| RORXL $0x06, CX, R14 |
| ANDL CX, R15 |
| ADDL DI, R10 |
| XORL R14, R13 |
| RORXL $0x0d, R10, R12 |
| XORL DX, R15 |
| RORXL $0x16, R10, R14 |
| MOVL R10, DI |
| XORL R12, R14 |
| RORXL $0x02, R10, R12 |
| ADDL 8(SP)(SI*1), R9 |
| ORL AX, DI |
| XORL R12, R14 |
| MOVL R10, R12 |
| ANDL R11, DI |
| ANDL AX, R12 |
| ADDL R13, R15 |
| ADDL R9, BX |
| ORL R12, DI |
| ADDL R14, R9 |
| ADDL R15, BX |
| ADDL R15, R9 |
| MOVL CX, R15 |
| RORXL $0x19, BX, R13 |
| RORXL $0x0b, BX, R14 |
| XORL R8, R15 |
| XORL R14, R13 |
| RORXL $0x06, BX, R14 |
| ANDL BX, R15 |
| ADDL DI, R9 |
| XORL R14, R13 |
| RORXL $0x0d, R9, R12 |
| XORL R8, R15 |
| RORXL $0x16, R9, R14 |
| MOVL R9, DI |
| XORL R12, R14 |
| RORXL $0x02, R9, R12 |
| ADDL 12(SP)(SI*1), DX |
| ORL R11, DI |
| XORL R12, R14 |
| MOVL R9, R12 |
| ANDL R10, DI |
| ANDL R11, R12 |
| ADDL R13, R15 |
| ADDL DX, AX |
| ORL R12, DI |
| ADDL R14, DX |
| ADDL R15, AX |
| ADDL R15, DX |
| ADDL DI, DX |
| VPADDD 32(BP)(SI*1), Y5, Y9 |
| VMOVDQU Y9, 32(SP)(SI*1) |
| MOVL BX, R15 |
| RORXL $0x19, AX, R13 |
| RORXL $0x0b, AX, R14 |
| XORL CX, R15 |
| XORL R14, R13 |
| RORXL $0x06, AX, R14 |
| ANDL AX, R15 |
| XORL R14, R13 |
| RORXL $0x0d, DX, R12 |
| XORL CX, R15 |
| RORXL $0x16, DX, R14 |
| MOVL DX, DI |
| XORL R12, R14 |
| RORXL $0x02, DX, R12 |
| ADDL 32(SP)(SI*1), R8 |
| ORL R10, DI |
| XORL R12, R14 |
| MOVL DX, R12 |
| ANDL R9, DI |
| ANDL R10, R12 |
| ADDL R13, R15 |
| ADDL R8, R11 |
| ORL R12, DI |
| ADDL R14, R8 |
| ADDL R15, R11 |
| ADDL R15, R8 |
| MOVL AX, R15 |
| RORXL $0x19, R11, R13 |
| RORXL $0x0b, R11, R14 |
| XORL BX, R15 |
| XORL R14, R13 |
| RORXL $0x06, R11, R14 |
| ANDL R11, R15 |
| ADDL DI, R8 |
| XORL R14, R13 |
| RORXL $0x0d, R8, R12 |
| XORL BX, R15 |
| RORXL $0x16, R8, R14 |
| MOVL R8, DI |
| XORL R12, R14 |
| RORXL $0x02, R8, R12 |
| ADDL 36(SP)(SI*1), CX |
| ORL R9, DI |
| XORL R12, R14 |
| MOVL R8, R12 |
| ANDL DX, DI |
| ANDL R9, R12 |
| ADDL R13, R15 |
| ADDL CX, R10 |
| ORL R12, DI |
| ADDL R14, CX |
| ADDL R15, R10 |
| ADDL R15, CX |
| MOVL R11, R15 |
| RORXL $0x19, R10, R13 |
| RORXL $0x0b, R10, R14 |
| XORL AX, R15 |
| XORL R14, R13 |
| RORXL $0x06, R10, R14 |
| ANDL R10, R15 |
| ADDL DI, CX |
| XORL R14, R13 |
| RORXL $0x0d, CX, R12 |
| XORL AX, R15 |
| RORXL $0x16, CX, R14 |
| MOVL CX, DI |
| XORL R12, R14 |
| RORXL $0x02, CX, R12 |
| ADDL 40(SP)(SI*1), BX |
| ORL DX, DI |
| XORL R12, R14 |
| MOVL CX, R12 |
| ANDL R8, DI |
| ANDL DX, R12 |
| ADDL R13, R15 |
| ADDL BX, R9 |
| ORL R12, DI |
| ADDL R14, BX |
| ADDL R15, R9 |
| ADDL R15, BX |
| MOVL R10, R15 |
| RORXL $0x19, R9, R13 |
| RORXL $0x0b, R9, R14 |
| XORL R11, R15 |
| XORL R14, R13 |
| RORXL $0x06, R9, R14 |
| ANDL R9, R15 |
| ADDL DI, BX |
| XORL R14, R13 |
| RORXL $0x0d, BX, R12 |
| XORL R11, R15 |
| RORXL $0x16, BX, R14 |
| MOVL BX, DI |
| XORL R12, R14 |
| RORXL $0x02, BX, R12 |
| ADDL 44(SP)(SI*1), AX |
| ORL R8, DI |
| XORL R12, R14 |
| MOVL BX, R12 |
| ANDL CX, DI |
| ANDL R8, R12 |
| ADDL R13, R15 |
| ADDL AX, DX |
| ORL R12, DI |
| ADDL R14, AX |
| ADDL R15, DX |
| ADDL R15, AX |
| ADDL DI, AX |
| ADDQ $0x40, SI |
| VMOVDQU Y6, Y4 |
| VMOVDQU Y7, Y5 |
| CMPQ SI, $0x00000200 |
| JB avx2_loop2 |
| MOVQ dig+0(FP), SI |
| MOVQ 520(SP), DI |
| ADDL AX, (SI) |
| MOVL (SI), AX |
| ADDL BX, 4(SI) |
| MOVL 4(SI), BX |
| ADDL CX, 8(SI) |
| MOVL 8(SI), CX |
| ADDL R8, 12(SI) |
| MOVL 12(SI), R8 |
| ADDL DX, 16(SI) |
| MOVL 16(SI), DX |
| ADDL R9, 20(SI) |
| MOVL 20(SI), R9 |
| ADDL R10, 24(SI) |
| MOVL 24(SI), R10 |
| ADDL R11, 28(SI) |
| MOVL 28(SI), R11 |
| CMPQ 512(SP), DI |
| JB done_hash |
| XORQ SI, SI |
| |
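	// All 64 rounds of the second block, replaying the W+K values saved at
	// the +16-byte (high-lane) stack offsets during the first block's pass.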
| avx2_loop3: |
| MOVL R9, R15 |
| RORXL $0x19, DX, R13 |
| RORXL $0x0b, DX, R14 |
| XORL R10, R15 |
| XORL R14, R13 |
| RORXL $0x06, DX, R14 |
| ANDL DX, R15 |
| XORL R14, R13 |
| RORXL $0x0d, AX, R12 |
| XORL R10, R15 |
| RORXL $0x16, AX, R14 |
| MOVL AX, DI |
| XORL R12, R14 |
| RORXL $0x02, AX, R12 |
| ADDL 16(SP)(SI*1), R11 |
| ORL CX, DI |
| XORL R12, R14 |
| MOVL AX, R12 |
| ANDL BX, DI |
| ANDL CX, R12 |
| ADDL R13, R15 |
| ADDL R11, R8 |
| ORL R12, DI |
| ADDL R14, R11 |
| ADDL R15, R8 |
| ADDL R15, R11 |
| MOVL DX, R15 |
| RORXL $0x19, R8, R13 |
| RORXL $0x0b, R8, R14 |
| XORL R9, R15 |
| XORL R14, R13 |
| RORXL $0x06, R8, R14 |
| ANDL R8, R15 |
| ADDL DI, R11 |
| XORL R14, R13 |
| RORXL $0x0d, R11, R12 |
| XORL R9, R15 |
| RORXL $0x16, R11, R14 |
| MOVL R11, DI |
| XORL R12, R14 |
| RORXL $0x02, R11, R12 |
| ADDL 20(SP)(SI*1), R10 |
| ORL BX, DI |
| XORL R12, R14 |
| MOVL R11, R12 |
| ANDL AX, DI |
| ANDL BX, R12 |
| ADDL R13, R15 |
| ADDL R10, CX |
| ORL R12, DI |
| ADDL R14, R10 |
| ADDL R15, CX |
| ADDL R15, R10 |
| MOVL R8, R15 |
| RORXL $0x19, CX, R13 |
| RORXL $0x0b, CX, R14 |
| XORL DX, R15 |
| XORL R14, R13 |
| RORXL $0x06, CX, R14 |
| ANDL CX, R15 |
| ADDL DI, R10 |
| XORL R14, R13 |
| RORXL $0x0d, R10, R12 |
| XORL DX, R15 |
| RORXL $0x16, R10, R14 |
| MOVL R10, DI |
| XORL R12, R14 |
| RORXL $0x02, R10, R12 |
| ADDL 24(SP)(SI*1), R9 |
| ORL AX, DI |
| XORL R12, R14 |
| MOVL R10, R12 |
| ANDL R11, DI |
| ANDL AX, R12 |
| ADDL R13, R15 |
| ADDL R9, BX |
| ORL R12, DI |
| ADDL R14, R9 |
| ADDL R15, BX |
| ADDL R15, R9 |
| MOVL CX, R15 |
| RORXL $0x19, BX, R13 |
| RORXL $0x0b, BX, R14 |
| XORL R8, R15 |
| XORL R14, R13 |
| RORXL $0x06, BX, R14 |
| ANDL BX, R15 |
| ADDL DI, R9 |
| XORL R14, R13 |
| RORXL $0x0d, R9, R12 |
| XORL R8, R15 |
| RORXL $0x16, R9, R14 |
| MOVL R9, DI |
| XORL R12, R14 |
| RORXL $0x02, R9, R12 |
| ADDL 28(SP)(SI*1), DX |
| ORL R11, DI |
| XORL R12, R14 |
| MOVL R9, R12 |
| ANDL R10, DI |
| ANDL R11, R12 |
| ADDL R13, R15 |
| ADDL DX, AX |
| ORL R12, DI |
| ADDL R14, DX |
| ADDL R15, AX |
| ADDL R15, DX |
| ADDL DI, DX |
| MOVL BX, R15 |
| RORXL $0x19, AX, R13 |
| RORXL $0x0b, AX, R14 |
| XORL CX, R15 |
| XORL R14, R13 |
| RORXL $0x06, AX, R14 |
| ANDL AX, R15 |
| XORL R14, R13 |
| RORXL $0x0d, DX, R12 |
| XORL CX, R15 |
| RORXL $0x16, DX, R14 |
| MOVL DX, DI |
| XORL R12, R14 |
| RORXL $0x02, DX, R12 |
| ADDL 48(SP)(SI*1), R8 |
| ORL R10, DI |
| XORL R12, R14 |
| MOVL DX, R12 |
| ANDL R9, DI |
| ANDL R10, R12 |
| ADDL R13, R15 |
| ADDL R8, R11 |
| ORL R12, DI |
| ADDL R14, R8 |
| ADDL R15, R11 |
| ADDL R15, R8 |
| MOVL AX, R15 |
| RORXL $0x19, R11, R13 |
| RORXL $0x0b, R11, R14 |
| XORL BX, R15 |
| XORL R14, R13 |
| RORXL $0x06, R11, R14 |
| ANDL R11, R15 |
| ADDL DI, R8 |
| XORL R14, R13 |
| RORXL $0x0d, R8, R12 |
| XORL BX, R15 |
| RORXL $0x16, R8, R14 |
| MOVL R8, DI |
| XORL R12, R14 |
| RORXL $0x02, R8, R12 |
| ADDL 52(SP)(SI*1), CX |
| ORL R9, DI |
| XORL R12, R14 |
| MOVL R8, R12 |
| ANDL DX, DI |
| ANDL R9, R12 |
| ADDL R13, R15 |
| ADDL CX, R10 |
| ORL R12, DI |
| ADDL R14, CX |
| ADDL R15, R10 |
| ADDL R15, CX |
| MOVL R11, R15 |
| RORXL $0x19, R10, R13 |
| RORXL $0x0b, R10, R14 |
| XORL AX, R15 |
| XORL R14, R13 |
| RORXL $0x06, R10, R14 |
| ANDL R10, R15 |
| ADDL DI, CX |
| XORL R14, R13 |
| RORXL $0x0d, CX, R12 |
| XORL AX, R15 |
| RORXL $0x16, CX, R14 |
| MOVL CX, DI |
| XORL R12, R14 |
| RORXL $0x02, CX, R12 |
| ADDL 56(SP)(SI*1), BX |
| ORL DX, DI |
| XORL R12, R14 |
| MOVL CX, R12 |
| ANDL R8, DI |
| ANDL DX, R12 |
| ADDL R13, R15 |
| ADDL BX, R9 |
| ORL R12, DI |
| ADDL R14, BX |
| ADDL R15, R9 |
| ADDL R15, BX |
| MOVL R10, R15 |
| RORXL $0x19, R9, R13 |
| RORXL $0x0b, R9, R14 |
| XORL R11, R15 |
| XORL R14, R13 |
| RORXL $0x06, R9, R14 |
| ANDL R9, R15 |
| ADDL DI, BX |
| XORL R14, R13 |
| RORXL $0x0d, BX, R12 |
| XORL R11, R15 |
| RORXL $0x16, BX, R14 |
| MOVL BX, DI |
| XORL R12, R14 |
| RORXL $0x02, BX, R12 |
| ADDL 60(SP)(SI*1), AX |
| ORL R8, DI |
| XORL R12, R14 |
| MOVL BX, R12 |
| ANDL CX, DI |
| ANDL R8, R12 |
| ADDL R13, R15 |
| ADDL AX, DX |
| ORL R12, DI |
| ADDL R14, AX |
| ADDL R15, DX |
| ADDL R15, AX |
| ADDL DI, AX |
| ADDQ $0x40, SI |
| CMPQ SI, $0x00000200 |
| JB avx2_loop3 |
| MOVQ dig+0(FP), SI |
| MOVQ 520(SP), DI |
| ADDQ $0x40, DI |
| ADDL AX, (SI) |
| MOVL (SI), AX |
| ADDL BX, 4(SI) |
| MOVL 4(SI), BX |
| ADDL CX, 8(SI) |
| MOVL 8(SI), CX |
| ADDL R8, 12(SI) |
| MOVL 12(SI), R8 |
| ADDL DX, 16(SI) |
| MOVL 16(SI), DX |
| ADDL R9, 20(SI) |
| MOVL 20(SI), R9 |
| ADDL R10, 24(SI) |
| MOVL 24(SI), R10 |
| ADDL R11, 28(SI) |
| MOVL 28(SI), R11 |
| CMPQ 512(SP), DI |
| JA avx2_loop0 |
| JB done_hash |
| |
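	// A single trailing block: load it into the low 128-bit lanes only (the
	// VEX 128-bit moves zero the high lanes, which are never consumed because
	// the input-end check fires before avx2_loop3).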
| avx2_do_last_block: |
| VMOVDQU (DI), X4 |
| VMOVDQU 16(DI), X5 |
| VMOVDQU 32(DI), X6 |
| VMOVDQU 48(DI), X7 |
| VMOVDQU flip_mask<>+0(SB), Y13 |
| VPSHUFB X13, X4, X4 |
| VPSHUFB X13, X5, X5 |
| VPSHUFB X13, X6, X6 |
| VPSHUFB X13, X7, X7 |
| LEAQ K256<>+0(SB), BP |
| JMP avx2_last_block_enter |
| |
| avx2_only_one_block: |
| MOVL (SI), AX |
| MOVL 4(SI), BX |
| MOVL 8(SI), CX |
| MOVL 12(SI), R8 |
| MOVL 16(SI), DX |
| MOVL 20(SI), R9 |
| MOVL 24(SI), R10 |
| MOVL 28(SI), R11 |
| JMP avx2_do_last_block |
| |
| done_hash: |
| VZEROUPPER |
| RET |
| |
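// flip_mask is the VPSHUFB control that byte-swaps each 32-bit word,
// converting little-endian input to the big-endian order SHA-256 uses.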
| DATA flip_mask<>+0(SB)/8, $0x0405060700010203 |
| DATA flip_mask<>+8(SB)/8, $0x0c0d0e0f08090a0b |
| DATA flip_mask<>+16(SB)/8, $0x0405060700010203 |
| DATA flip_mask<>+24(SB)/8, $0x0c0d0e0f08090a0b |
GLOBL flip_mask<>(SB), RODATA|NOPTR, $32
| |
| DATA K256<>+0(SB)/4, $0x428a2f98 |
| DATA K256<>+4(SB)/4, $0x71374491 |
| DATA K256<>+8(SB)/4, $0xb5c0fbcf |
| DATA K256<>+12(SB)/4, $0xe9b5dba5 |
| DATA K256<>+16(SB)/4, $0x428a2f98 |
| DATA K256<>+20(SB)/4, $0x71374491 |
| DATA K256<>+24(SB)/4, $0xb5c0fbcf |
| DATA K256<>+28(SB)/4, $0xe9b5dba5 |
| DATA K256<>+32(SB)/4, $0x3956c25b |
| DATA K256<>+36(SB)/4, $0x59f111f1 |
| DATA K256<>+40(SB)/4, $0x923f82a4 |
| DATA K256<>+44(SB)/4, $0xab1c5ed5 |
| DATA K256<>+48(SB)/4, $0x3956c25b |
| DATA K256<>+52(SB)/4, $0x59f111f1 |
| DATA K256<>+56(SB)/4, $0x923f82a4 |
| DATA K256<>+60(SB)/4, $0xab1c5ed5 |
| DATA K256<>+64(SB)/4, $0xd807aa98 |
| DATA K256<>+68(SB)/4, $0x12835b01 |
| DATA K256<>+72(SB)/4, $0x243185be |
| DATA K256<>+76(SB)/4, $0x550c7dc3 |
| DATA K256<>+80(SB)/4, $0xd807aa98 |
| DATA K256<>+84(SB)/4, $0x12835b01 |
| DATA K256<>+88(SB)/4, $0x243185be |
| DATA K256<>+92(SB)/4, $0x550c7dc3 |
| DATA K256<>+96(SB)/4, $0x72be5d74 |
| DATA K256<>+100(SB)/4, $0x80deb1fe |
| DATA K256<>+104(SB)/4, $0x9bdc06a7 |
| DATA K256<>+108(SB)/4, $0xc19bf174 |
| DATA K256<>+112(SB)/4, $0x72be5d74 |
| DATA K256<>+116(SB)/4, $0x80deb1fe |
| DATA K256<>+120(SB)/4, $0x9bdc06a7 |
| DATA K256<>+124(SB)/4, $0xc19bf174 |
| DATA K256<>+128(SB)/4, $0xe49b69c1 |
| DATA K256<>+132(SB)/4, $0xefbe4786 |
| DATA K256<>+136(SB)/4, $0x0fc19dc6 |
| DATA K256<>+140(SB)/4, $0x240ca1cc |
| DATA K256<>+144(SB)/4, $0xe49b69c1 |
| DATA K256<>+148(SB)/4, $0xefbe4786 |
| DATA K256<>+152(SB)/4, $0x0fc19dc6 |
| DATA K256<>+156(SB)/4, $0x240ca1cc |
| DATA K256<>+160(SB)/4, $0x2de92c6f |
| DATA K256<>+164(SB)/4, $0x4a7484aa |
| DATA K256<>+168(SB)/4, $0x5cb0a9dc |
| DATA K256<>+172(SB)/4, $0x76f988da |
| DATA K256<>+176(SB)/4, $0x2de92c6f |
| DATA K256<>+180(SB)/4, $0x4a7484aa |
| DATA K256<>+184(SB)/4, $0x5cb0a9dc |
| DATA K256<>+188(SB)/4, $0x76f988da |
| DATA K256<>+192(SB)/4, $0x983e5152 |
| DATA K256<>+196(SB)/4, $0xa831c66d |
| DATA K256<>+200(SB)/4, $0xb00327c8 |
| DATA K256<>+204(SB)/4, $0xbf597fc7 |
| DATA K256<>+208(SB)/4, $0x983e5152 |
| DATA K256<>+212(SB)/4, $0xa831c66d |
| DATA K256<>+216(SB)/4, $0xb00327c8 |
| DATA K256<>+220(SB)/4, $0xbf597fc7 |
| DATA K256<>+224(SB)/4, $0xc6e00bf3 |
| DATA K256<>+228(SB)/4, $0xd5a79147 |
| DATA K256<>+232(SB)/4, $0x06ca6351 |
| DATA K256<>+236(SB)/4, $0x14292967 |
| DATA K256<>+240(SB)/4, $0xc6e00bf3 |
| DATA K256<>+244(SB)/4, $0xd5a79147 |
| DATA K256<>+248(SB)/4, $0x06ca6351 |
| DATA K256<>+252(SB)/4, $0x14292967 |
| DATA K256<>+256(SB)/4, $0x27b70a85 |
| DATA K256<>+260(SB)/4, $0x2e1b2138 |
| DATA K256<>+264(SB)/4, $0x4d2c6dfc |
| DATA K256<>+268(SB)/4, $0x53380d13 |
| DATA K256<>+272(SB)/4, $0x27b70a85 |
| DATA K256<>+276(SB)/4, $0x2e1b2138 |
| DATA K256<>+280(SB)/4, $0x4d2c6dfc |
| DATA K256<>+284(SB)/4, $0x53380d13 |
| DATA K256<>+288(SB)/4, $0x650a7354 |
| DATA K256<>+292(SB)/4, $0x766a0abb |
| DATA K256<>+296(SB)/4, $0x81c2c92e |
| DATA K256<>+300(SB)/4, $0x92722c85 |
| DATA K256<>+304(SB)/4, $0x650a7354 |
| DATA K256<>+308(SB)/4, $0x766a0abb |
| DATA K256<>+312(SB)/4, $0x81c2c92e |
| DATA K256<>+316(SB)/4, $0x92722c85 |
| DATA K256<>+320(SB)/4, $0xa2bfe8a1 |
| DATA K256<>+324(SB)/4, $0xa81a664b |
| DATA K256<>+328(SB)/4, $0xc24b8b70 |
| DATA K256<>+332(SB)/4, $0xc76c51a3 |
| DATA K256<>+336(SB)/4, $0xa2bfe8a1 |
| DATA K256<>+340(SB)/4, $0xa81a664b |
| DATA K256<>+344(SB)/4, $0xc24b8b70 |
| DATA K256<>+348(SB)/4, $0xc76c51a3 |
| DATA K256<>+352(SB)/4, $0xd192e819 |
| DATA K256<>+356(SB)/4, $0xd6990624 |
| DATA K256<>+360(SB)/4, $0xf40e3585 |
| DATA K256<>+364(SB)/4, $0x106aa070 |
| DATA K256<>+368(SB)/4, $0xd192e819 |
| DATA K256<>+372(SB)/4, $0xd6990624 |
| DATA K256<>+376(SB)/4, $0xf40e3585 |
| DATA K256<>+380(SB)/4, $0x106aa070 |
| DATA K256<>+384(SB)/4, $0x19a4c116 |
| DATA K256<>+388(SB)/4, $0x1e376c08 |
| DATA K256<>+392(SB)/4, $0x2748774c |
| DATA K256<>+396(SB)/4, $0x34b0bcb5 |
| DATA K256<>+400(SB)/4, $0x19a4c116 |
| DATA K256<>+404(SB)/4, $0x1e376c08 |
| DATA K256<>+408(SB)/4, $0x2748774c |
| DATA K256<>+412(SB)/4, $0x34b0bcb5 |
| DATA K256<>+416(SB)/4, $0x391c0cb3 |
| DATA K256<>+420(SB)/4, $0x4ed8aa4a |
| DATA K256<>+424(SB)/4, $0x5b9cca4f |
| DATA K256<>+428(SB)/4, $0x682e6ff3 |
| DATA K256<>+432(SB)/4, $0x391c0cb3 |
| DATA K256<>+436(SB)/4, $0x4ed8aa4a |
| DATA K256<>+440(SB)/4, $0x5b9cca4f |
| DATA K256<>+444(SB)/4, $0x682e6ff3 |
| DATA K256<>+448(SB)/4, $0x748f82ee |
| DATA K256<>+452(SB)/4, $0x78a5636f |
| DATA K256<>+456(SB)/4, $0x84c87814 |
| DATA K256<>+460(SB)/4, $0x8cc70208 |
| DATA K256<>+464(SB)/4, $0x748f82ee |
| DATA K256<>+468(SB)/4, $0x78a5636f |
| DATA K256<>+472(SB)/4, $0x84c87814 |
| DATA K256<>+476(SB)/4, $0x8cc70208 |
| DATA K256<>+480(SB)/4, $0x90befffa |
| DATA K256<>+484(SB)/4, $0xa4506ceb |
| DATA K256<>+488(SB)/4, $0xbef9a3f7 |
| DATA K256<>+492(SB)/4, $0xc67178f2 |
| DATA K256<>+496(SB)/4, $0x90befffa |
| DATA K256<>+500(SB)/4, $0xa4506ceb |
| DATA K256<>+504(SB)/4, $0xbef9a3f7 |
| DATA K256<>+508(SB)/4, $0xc67178f2 |
| GLOBL K256<>(SB), RODATA|NOPTR, $512 |
| |
| DATA shuff_00BA<>+0(SB)/8, $0x0b0a090803020100 |
| DATA shuff_00BA<>+8(SB)/8, $0xffffffffffffffff |
| DATA shuff_00BA<>+16(SB)/8, $0x0b0a090803020100 |
| DATA shuff_00BA<>+24(SB)/8, $0xffffffffffffffff |
GLOBL shuff_00BA<>(SB), RODATA|NOPTR, $32
| |
| DATA shuff_DC00<>+0(SB)/8, $0xffffffffffffffff |
| DATA shuff_DC00<>+8(SB)/8, $0x0b0a090803020100 |
| DATA shuff_DC00<>+16(SB)/8, $0xffffffffffffffff |
| DATA shuff_DC00<>+24(SB)/8, $0x0b0a090803020100 |
GLOBL shuff_DC00<>(SB), RODATA|NOPTR, $32
| |
| // func blockSHANI(dig *Digest, p []byte) |
| // Requires: AVX, SHA, SSE2, SSE4.1, SSSE3 |
| TEXT ·blockSHANI(SB), $0-32 |
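	// Uses the SHA extensions: SHA256RNDS2 performs two rounds per invocation
	// with the round inputs W+K taken from X0 (the PSHUFD $0x0e between calls
	// moves the high qword down for the second pair of rounds), while
	// SHA256MSG1/SHA256MSG2 compute most of the message schedule. The state
	// is kept in the ABEF/CDGH register layout these instructions require,
	// hence the shuffle sequences on entry and exit.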
| MOVQ dig+0(FP), DI |
| MOVQ p_base+8(FP), SI |
| MOVQ p_len+16(FP), DX |
| SHRQ $0x06, DX |
| SHLQ $0x06, DX |
| CMPQ DX, $0x00 |
| JEQ done |
| ADDQ SI, DX |
| VMOVDQU (DI), X1 |
| VMOVDQU 16(DI), X2 |
| PSHUFD $0xb1, X1, X1 |
| PSHUFD $0x1b, X2, X2 |
| VMOVDQA X1, X7 |
| PALIGNR $0x08, X2, X1 |
| PBLENDW $0xf0, X7, X2 |
| VMOVDQA flip_mask<>+0(SB), X8 |
| LEAQ K256<>+0(SB), AX |
| |
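	// Each iteration consumes one 64-byte block: four 16-byte loads are
	// byte-swapped, added to the K256 constants, and fed through the rounds
	// four at a time; PALIGNR/PADDD supply the W[t-7] term that
	// SHA256MSG1/SHA256MSG2 do not cover. K256 stores each group of four
	// constants twice (for the two-lane AVX2 path), so this code strides 32
	// bytes per group.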
| roundLoop: |
| // save hash values for addition after rounds |
| VMOVDQA X1, X9 |
| VMOVDQA X2, X10 |
| |
| // do rounds 0-59 |
| VMOVDQU (SI), X0 |
| PSHUFB X8, X0 |
| VMOVDQA X0, X3 |
| PADDD (AX), X0 |
| SHA256RNDS2 X0, X1, X2 |
| PSHUFD $0x0e, X0, X0 |
| SHA256RNDS2 X0, X2, X1 |
| VMOVDQU 16(SI), X0 |
| PSHUFB X8, X0 |
| VMOVDQA X0, X4 |
| PADDD 32(AX), X0 |
| SHA256RNDS2 X0, X1, X2 |
| PSHUFD $0x0e, X0, X0 |
| SHA256RNDS2 X0, X2, X1 |
| SHA256MSG1 X4, X3 |
| VMOVDQU 32(SI), X0 |
| PSHUFB X8, X0 |
| VMOVDQA X0, X5 |
| PADDD 64(AX), X0 |
| SHA256RNDS2 X0, X1, X2 |
| PSHUFD $0x0e, X0, X0 |
| SHA256RNDS2 X0, X2, X1 |
| SHA256MSG1 X5, X4 |
| VMOVDQU 48(SI), X0 |
| PSHUFB X8, X0 |
| VMOVDQA X0, X6 |
| PADDD 96(AX), X0 |
| SHA256RNDS2 X0, X1, X2 |
| VMOVDQA X6, X7 |
| PALIGNR $0x04, X5, X7 |
| PADDD X7, X3 |
| SHA256MSG2 X6, X3 |
| PSHUFD $0x0e, X0, X0 |
| SHA256RNDS2 X0, X2, X1 |
| SHA256MSG1 X6, X5 |
| VMOVDQA X3, X0 |
| PADDD 128(AX), X0 |
| SHA256RNDS2 X0, X1, X2 |
| VMOVDQA X3, X7 |
| PALIGNR $0x04, X6, X7 |
| PADDD X7, X4 |
| SHA256MSG2 X3, X4 |
| PSHUFD $0x0e, X0, X0 |
| SHA256RNDS2 X0, X2, X1 |
| SHA256MSG1 X3, X6 |
| VMOVDQA X4, X0 |
| PADDD 160(AX), X0 |
| SHA256RNDS2 X0, X1, X2 |
| VMOVDQA X4, X7 |
| PALIGNR $0x04, X3, X7 |
| PADDD X7, X5 |
| SHA256MSG2 X4, X5 |
| PSHUFD $0x0e, X0, X0 |
| SHA256RNDS2 X0, X2, X1 |
| SHA256MSG1 X4, X3 |
| VMOVDQA X5, X0 |
| PADDD 192(AX), X0 |
| SHA256RNDS2 X0, X1, X2 |
| VMOVDQA X5, X7 |
| PALIGNR $0x04, X4, X7 |
| PADDD X7, X6 |
| SHA256MSG2 X5, X6 |
| PSHUFD $0x0e, X0, X0 |
| SHA256RNDS2 X0, X2, X1 |
| SHA256MSG1 X5, X4 |
| VMOVDQA X6, X0 |
| PADDD 224(AX), X0 |
| SHA256RNDS2 X0, X1, X2 |
| VMOVDQA X6, X7 |
| PALIGNR $0x04, X5, X7 |
| PADDD X7, X3 |
| SHA256MSG2 X6, X3 |
| PSHUFD $0x0e, X0, X0 |
| SHA256RNDS2 X0, X2, X1 |
| SHA256MSG1 X6, X5 |
| VMOVDQA X3, X0 |
| PADDD 256(AX), X0 |
| SHA256RNDS2 X0, X1, X2 |
| VMOVDQA X3, X7 |
| PALIGNR $0x04, X6, X7 |
| PADDD X7, X4 |
| SHA256MSG2 X3, X4 |
| PSHUFD $0x0e, X0, X0 |
| SHA256RNDS2 X0, X2, X1 |
| SHA256MSG1 X3, X6 |
| VMOVDQA X4, X0 |
| PADDD 288(AX), X0 |
| SHA256RNDS2 X0, X1, X2 |
| VMOVDQA X4, X7 |
| PALIGNR $0x04, X3, X7 |
| PADDD X7, X5 |
| SHA256MSG2 X4, X5 |
| PSHUFD $0x0e, X0, X0 |
| SHA256RNDS2 X0, X2, X1 |
| SHA256MSG1 X4, X3 |
| VMOVDQA X5, X0 |
| PADDD 320(AX), X0 |
| SHA256RNDS2 X0, X1, X2 |
| VMOVDQA X5, X7 |
| PALIGNR $0x04, X4, X7 |
| PADDD X7, X6 |
| SHA256MSG2 X5, X6 |
| PSHUFD $0x0e, X0, X0 |
| SHA256RNDS2 X0, X2, X1 |
| SHA256MSG1 X5, X4 |
| VMOVDQA X6, X0 |
| PADDD 352(AX), X0 |
| SHA256RNDS2 X0, X1, X2 |
| VMOVDQA X6, X7 |
| PALIGNR $0x04, X5, X7 |
| PADDD X7, X3 |
| SHA256MSG2 X6, X3 |
| PSHUFD $0x0e, X0, X0 |
| SHA256RNDS2 X0, X2, X1 |
| SHA256MSG1 X6, X5 |
| VMOVDQA X3, X0 |
| PADDD 384(AX), X0 |
| SHA256RNDS2 X0, X1, X2 |
| VMOVDQA X3, X7 |
| PALIGNR $0x04, X6, X7 |
| PADDD X7, X4 |
| SHA256MSG2 X3, X4 |
| PSHUFD $0x0e, X0, X0 |
| SHA256RNDS2 X0, X2, X1 |
| SHA256MSG1 X3, X6 |
| VMOVDQA X4, X0 |
| PADDD 416(AX), X0 |
| SHA256RNDS2 X0, X1, X2 |
| VMOVDQA X4, X7 |
| PALIGNR $0x04, X3, X7 |
| PADDD X7, X5 |
| SHA256MSG2 X4, X5 |
| PSHUFD $0x0e, X0, X0 |
| SHA256RNDS2 X0, X2, X1 |
| VMOVDQA X5, X0 |
| PADDD 448(AX), X0 |
| SHA256RNDS2 X0, X1, X2 |
| VMOVDQA X5, X7 |
| PALIGNR $0x04, X4, X7 |
| PADDD X7, X6 |
| SHA256MSG2 X5, X6 |
| PSHUFD $0x0e, X0, X0 |
| SHA256RNDS2 X0, X2, X1 |
| |
| // do rounds 60-63 |
| VMOVDQA X6, X0 |
| PADDD 480(AX), X0 |
| SHA256RNDS2 X0, X1, X2 |
| PSHUFD $0x0e, X0, X0 |
| SHA256RNDS2 X0, X2, X1 |
| |
	// add the previously saved hash values back into the current state
| PADDD X9, X1 |
| PADDD X10, X2 |
| |
| // advance data pointer; loop until buffer empty |
| ADDQ $0x40, SI |
| CMPQ DX, SI |
| JNE roundLoop |
| |
| // write hash values back in the correct order |
| PSHUFD $0x1b, X1, X1 |
| PSHUFD $0xb1, X2, X2 |
| VMOVDQA X1, X7 |
| PBLENDW $0xf0, X2, X1 |
| PALIGNR $0x08, X7, X2 |
| VMOVDQU X1, (DI) |
| VMOVDQU X2, 16(DI) |
| |
| done: |
| RET |