// blob: 0fa0df2f60e8b733f1875d754643e60945a2a876 [file] [log] [blame]
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "textflag.h"
// SHA512 block routine. See sha512block.go for Go equivalent.
//
// The algorithm is detailed in FIPS 180-4:
//
// https://csrc.nist.gov/publications/fips/fips180-4/fips-180-4.pdf
//
// Wt = Mt; for 0 <= t <= 15
// Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 79
//
// a = H0
// b = H1
// c = H2
// d = H3
// e = H4
// f = H5
// g = H6
// h = H7
//
// for t = 0 to 79 {
// T1 = h + BIGSIGMA1(e) + Ch(e,f,g) + Kt + Wt
// T2 = BIGSIGMA0(a) + Maj(a,b,c)
// h = g
// g = f
// f = e
// e = d + T1
// d = c
// c = b
// b = a
// a = T1 + T2
// }
//
// H0 = a + H0
// H1 = b + H1
// H2 = c + H2
// H3 = d + H3
// H4 = e + H4
// H5 = f + H5
// H6 = g + H6
// H7 = h + H7
// MSGSCHEDULE0 produces message schedule word `index` for rounds 0-15.
// Wt = Mt; for 0 <= t <= 15
// It loads the 64-bit word at (index*8)(SI), reverses its byte order with
// BSWAPQ and stores the result at (index*8)(BP). The word is also left in
// AX, which is where SHA512T1 expects Wt. Clobbers: AX.
#define MSGSCHEDULE0(index) \
MOVQ (index*8)(SI), AX; \
BSWAPQ AX; \
MOVQ AX, (index*8)(BP)
// MSGSCHEDULE1 produces message schedule word `index` for rounds 16-79 from
// earlier schedule words stored relative to BP.
// Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 79
// SIGMA0(x) = ROTR(1,x) XOR ROTR(8,x) XOR SHR(7,x)
// SIGMA1(x) = ROTR(19,x) XOR ROTR(61,x) XOR SHR(6,x)
// AX accumulates SIGMA1(Wt-2) + Wt-7, BX accumulates SIGMA0(Wt-15) + Wt-16,
// and CX/DX hold the intermediate rotate/shift terms. The two halves are
// interleaved instruction by instruction. The sum is stored at
// (index*8)(BP) and is also left in AX, which is where SHA512T1 expects Wt.
// Clobbers: AX, BX, CX, DX.
#define MSGSCHEDULE1(index) \
MOVQ ((index-2)*8)(BP), AX; \
MOVQ AX, CX; \
RORQ $19, AX; \
MOVQ CX, DX; \
RORQ $61, CX; \
SHRQ $6, DX; \
MOVQ ((index-15)*8)(BP), BX; \
XORQ CX, AX; \
MOVQ BX, CX; \
XORQ DX, AX; \
RORQ $1, BX; \
MOVQ CX, DX; \
SHRQ $7, DX; \
RORQ $8, CX; \
ADDQ ((index-7)*8)(BP), AX; \
XORQ CX, BX; \
XORQ DX, BX; \
ADDQ ((index-16)*8)(BP), BX; \
ADDQ BX, AX; \
MOVQ AX, ((index)*8)(BP)
// Calculate T1 in AX - uses AX, CX and DX registers.
// h is also used as an accumulator. Wt is passed in AX.
// T1 = h + BIGSIGMA1(e) + Ch(e, f, g) + Kt + Wt
// BIGSIGMA1(x) = ROTR(14,x) XOR ROTR(18,x) XOR ROTR(41,x)
// Ch(x, y, z) = (x AND y) XOR (NOT x AND z)
//
// Inputs: AX = Wt, const = Kt (loaded into DX before being added), and the
// registers named by e, f, g, h.
// Output: AX = T1.
// Side effects: h is overwritten with h + Wt + Kt + BIGSIGMA1(e); e, f and g
// are only read. CX and DX are clobbered. BX and DI are not touched.
#define SHA512T1(const, e, f, g, h) \
MOVQ $const, DX; \
ADDQ AX, h; \
MOVQ e, AX; \
ADDQ DX, h; \
MOVQ e, CX; \
RORQ $14, AX; \
MOVQ e, DX; \
RORQ $18, CX; \
XORQ CX, AX; \
MOVQ e, CX; \
RORQ $41, DX; \
ANDQ f, CX; \
XORQ AX, DX; \
MOVQ e, AX; \
NOTQ AX; \
ADDQ DX, h; \
ANDQ g, AX; \
XORQ CX, AX; \
ADDQ h, AX
// Calculate T2 in BX - uses BX, CX, DX and DI registers.
// T2 = BIGSIGMA0(a) + Maj(a, b, c)
// BIGSIGMA0(x) = ROTR(28,x) XOR ROTR(34,x) XOR ROTR(39,x)
// Maj(x, y, z) = (x AND y) XOR (x AND z) XOR (y AND z)
//
// DI accumulates BIGSIGMA0(a) and BX accumulates Maj(a, b, c); CX and DX
// hold intermediate terms. The registers named by a, b and c are only read,
// and AX is not touched, so a T1 value left in AX by SHA512T1 survives.
#define SHA512T2(a, b, c) \
MOVQ a, DI; \
MOVQ c, BX; \
RORQ $28, DI; \
MOVQ a, DX; \
ANDQ b, BX; \
RORQ $34, DX; \
MOVQ a, CX; \
ANDQ c, CX; \
XORQ DX, DI; \
XORQ CX, BX; \
MOVQ a, DX; \
MOVQ b, CX; \
RORQ $39, DX; \
ANDQ a, CX; \
XORQ CX, BX; \
XORQ DX, DI; \
ADDQ DI, BX
// Calculate T1 and T2, then e = d + T1 and a = T1 + T2.
// The values for e and a are stored in d and h, ready for rotation.
//
// Expects Wt in AX (see SHA512T1). After the macro, the register passed as d
// holds d + T1 and the register passed as h holds T1 + T2; the registers
// passed as a, b, c, e, f, g are unchanged. No values are moved between
// registers: callers rotate the register arguments by one position on each
// successive round instead. The index parameter is not referenced in this
// macro body. Clobbers: AX, BX, CX, DX, DI.
#define SHA512ROUND(index, const, a, b, c, d, e, f, g, h) \
SHA512T1(const, e, f, g, h); \
SHA512T2(a, b, c); \
MOVQ BX, h; \
ADDQ AX, d; \
ADDQ AX, h
// SHA512ROUND0 is one round for t = 0..15: it takes Wt from the input block
// (MSGSCHEDULE0, read through SI) and then runs SHA512ROUND.
#define SHA512ROUND0(index, const, a, b, c, d, e, f, g, h) \
MSGSCHEDULE0(index); \
SHA512ROUND(index, const, a, b, c, d, e, f, g, h)
// SHA512ROUND1 is one round for t = 16..79: it derives Wt from earlier
// schedule words (MSGSCHEDULE1, read through BP) and then runs SHA512ROUND.
#define SHA512ROUND1(index, const, a, b, c, d, e, f, g, h) \
MSGSCHEDULE1(index); \
SHA512ROUND(index, const, a, b, c, d, e, f, g, h)
// blockAMD64 runs the SHA-512 compression function over every whole 128-byte
// block of p and updates the eight 64-bit hash words addressed by dig in
// place. Arguments are read from dig+0(FP), p_base+8(FP) and p_len+16(FP).
// p_len is rounded down to a multiple of 128, so trailing bytes are ignored;
// if no whole block remains the function returns without touching dig.
//
// Frame layout ($648):
//   0(SP)..639(SP)  message schedule W[0..79] (BP is set to SP in the loop)
//   640(SP)         pointer one past the last whole input block
//
// Register use:
//   SI       current input block
//   BP       message schedule base inside the rounds; dig pointer outside
//   R8..R15  working variables a..h (rotated by one position per round)
//   AX, BX, CX, DX, DI  scratch used by the round macros
TEXT ·blockAMD64(SB),0,$648-32
MOVQ p_base+8(FP), SI
MOVQ p_len+16(FP), DX
// Round the length down to a multiple of the 128-byte block size.
SHRQ $7, DX
SHLQ $7, DX
// DI = end of the last whole block; saved because DI is scratch in the rounds.
LEAQ (SI)(DX*1), DI
MOVQ DI, 640(SP)
CMPQ SI, DI
JEQ end
MOVQ dig+0(FP), BP
MOVQ (0*8)(BP), R8 // a = H0
MOVQ (1*8)(BP), R9 // b = H1
MOVQ (2*8)(BP), R10 // c = H2
MOVQ (3*8)(BP), R11 // d = H3
MOVQ (4*8)(BP), R12 // e = H4
MOVQ (5*8)(BP), R13 // f = H5
MOVQ (6*8)(BP), R14 // g = H6
MOVQ (7*8)(BP), R15 // h = H7
// One pass of loop per 128-byte block. BP is re-pointed at the schedule on
// every pass because it is reloaded with the dig pointer at the bottom.
loop:
MOVQ SP, BP // message schedule
// Rounds 0-15: Wt comes straight from the input block.
SHA512ROUND0(0, 0x428a2f98d728ae22, R8, R9, R10, R11, R12, R13, R14, R15)
SHA512ROUND0(1, 0x7137449123ef65cd, R15, R8, R9, R10, R11, R12, R13, R14)
SHA512ROUND0(2, 0xb5c0fbcfec4d3b2f, R14, R15, R8, R9, R10, R11, R12, R13)
SHA512ROUND0(3, 0xe9b5dba58189dbbc, R13, R14, R15, R8, R9, R10, R11, R12)
SHA512ROUND0(4, 0x3956c25bf348b538, R12, R13, R14, R15, R8, R9, R10, R11)
SHA512ROUND0(5, 0x59f111f1b605d019, R11, R12, R13, R14, R15, R8, R9, R10)
SHA512ROUND0(6, 0x923f82a4af194f9b, R10, R11, R12, R13, R14, R15, R8, R9)
SHA512ROUND0(7, 0xab1c5ed5da6d8118, R9, R10, R11, R12, R13, R14, R15, R8)
SHA512ROUND0(8, 0xd807aa98a3030242, R8, R9, R10, R11, R12, R13, R14, R15)
SHA512ROUND0(9, 0x12835b0145706fbe, R15, R8, R9, R10, R11, R12, R13, R14)
SHA512ROUND0(10, 0x243185be4ee4b28c, R14, R15, R8, R9, R10, R11, R12, R13)
SHA512ROUND0(11, 0x550c7dc3d5ffb4e2, R13, R14, R15, R8, R9, R10, R11, R12)
SHA512ROUND0(12, 0x72be5d74f27b896f, R12, R13, R14, R15, R8, R9, R10, R11)
SHA512ROUND0(13, 0x80deb1fe3b1696b1, R11, R12, R13, R14, R15, R8, R9, R10)
SHA512ROUND0(14, 0x9bdc06a725c71235, R10, R11, R12, R13, R14, R15, R8, R9)
SHA512ROUND0(15, 0xc19bf174cf692694, R9, R10, R11, R12, R13, R14, R15, R8)
// Rounds 16-79: Wt is derived from earlier schedule words.
SHA512ROUND1(16, 0xe49b69c19ef14ad2, R8, R9, R10, R11, R12, R13, R14, R15)
SHA512ROUND1(17, 0xefbe4786384f25e3, R15, R8, R9, R10, R11, R12, R13, R14)
SHA512ROUND1(18, 0x0fc19dc68b8cd5b5, R14, R15, R8, R9, R10, R11, R12, R13)
SHA512ROUND1(19, 0x240ca1cc77ac9c65, R13, R14, R15, R8, R9, R10, R11, R12)
SHA512ROUND1(20, 0x2de92c6f592b0275, R12, R13, R14, R15, R8, R9, R10, R11)
SHA512ROUND1(21, 0x4a7484aa6ea6e483, R11, R12, R13, R14, R15, R8, R9, R10)
SHA512ROUND1(22, 0x5cb0a9dcbd41fbd4, R10, R11, R12, R13, R14, R15, R8, R9)
SHA512ROUND1(23, 0x76f988da831153b5, R9, R10, R11, R12, R13, R14, R15, R8)
SHA512ROUND1(24, 0x983e5152ee66dfab, R8, R9, R10, R11, R12, R13, R14, R15)
SHA512ROUND1(25, 0xa831c66d2db43210, R15, R8, R9, R10, R11, R12, R13, R14)
SHA512ROUND1(26, 0xb00327c898fb213f, R14, R15, R8, R9, R10, R11, R12, R13)
SHA512ROUND1(27, 0xbf597fc7beef0ee4, R13, R14, R15, R8, R9, R10, R11, R12)
SHA512ROUND1(28, 0xc6e00bf33da88fc2, R12, R13, R14, R15, R8, R9, R10, R11)
SHA512ROUND1(29, 0xd5a79147930aa725, R11, R12, R13, R14, R15, R8, R9, R10)
SHA512ROUND1(30, 0x06ca6351e003826f, R10, R11, R12, R13, R14, R15, R8, R9)
SHA512ROUND1(31, 0x142929670a0e6e70, R9, R10, R11, R12, R13, R14, R15, R8)
SHA512ROUND1(32, 0x27b70a8546d22ffc, R8, R9, R10, R11, R12, R13, R14, R15)
SHA512ROUND1(33, 0x2e1b21385c26c926, R15, R8, R9, R10, R11, R12, R13, R14)
SHA512ROUND1(34, 0x4d2c6dfc5ac42aed, R14, R15, R8, R9, R10, R11, R12, R13)
SHA512ROUND1(35, 0x53380d139d95b3df, R13, R14, R15, R8, R9, R10, R11, R12)
SHA512ROUND1(36, 0x650a73548baf63de, R12, R13, R14, R15, R8, R9, R10, R11)
SHA512ROUND1(37, 0x766a0abb3c77b2a8, R11, R12, R13, R14, R15, R8, R9, R10)
SHA512ROUND1(38, 0x81c2c92e47edaee6, R10, R11, R12, R13, R14, R15, R8, R9)
SHA512ROUND1(39, 0x92722c851482353b, R9, R10, R11, R12, R13, R14, R15, R8)
SHA512ROUND1(40, 0xa2bfe8a14cf10364, R8, R9, R10, R11, R12, R13, R14, R15)
SHA512ROUND1(41, 0xa81a664bbc423001, R15, R8, R9, R10, R11, R12, R13, R14)
SHA512ROUND1(42, 0xc24b8b70d0f89791, R14, R15, R8, R9, R10, R11, R12, R13)
SHA512ROUND1(43, 0xc76c51a30654be30, R13, R14, R15, R8, R9, R10, R11, R12)
SHA512ROUND1(44, 0xd192e819d6ef5218, R12, R13, R14, R15, R8, R9, R10, R11)
SHA512ROUND1(45, 0xd69906245565a910, R11, R12, R13, R14, R15, R8, R9, R10)
SHA512ROUND1(46, 0xf40e35855771202a, R10, R11, R12, R13, R14, R15, R8, R9)
SHA512ROUND1(47, 0x106aa07032bbd1b8, R9, R10, R11, R12, R13, R14, R15, R8)
SHA512ROUND1(48, 0x19a4c116b8d2d0c8, R8, R9, R10, R11, R12, R13, R14, R15)
SHA512ROUND1(49, 0x1e376c085141ab53, R15, R8, R9, R10, R11, R12, R13, R14)
SHA512ROUND1(50, 0x2748774cdf8eeb99, R14, R15, R8, R9, R10, R11, R12, R13)
SHA512ROUND1(51, 0x34b0bcb5e19b48a8, R13, R14, R15, R8, R9, R10, R11, R12)
SHA512ROUND1(52, 0x391c0cb3c5c95a63, R12, R13, R14, R15, R8, R9, R10, R11)
SHA512ROUND1(53, 0x4ed8aa4ae3418acb, R11, R12, R13, R14, R15, R8, R9, R10)
SHA512ROUND1(54, 0x5b9cca4f7763e373, R10, R11, R12, R13, R14, R15, R8, R9)
SHA512ROUND1(55, 0x682e6ff3d6b2b8a3, R9, R10, R11, R12, R13, R14, R15, R8)
SHA512ROUND1(56, 0x748f82ee5defb2fc, R8, R9, R10, R11, R12, R13, R14, R15)
SHA512ROUND1(57, 0x78a5636f43172f60, R15, R8, R9, R10, R11, R12, R13, R14)
SHA512ROUND1(58, 0x84c87814a1f0ab72, R14, R15, R8, R9, R10, R11, R12, R13)
SHA512ROUND1(59, 0x8cc702081a6439ec, R13, R14, R15, R8, R9, R10, R11, R12)
SHA512ROUND1(60, 0x90befffa23631e28, R12, R13, R14, R15, R8, R9, R10, R11)
SHA512ROUND1(61, 0xa4506cebde82bde9, R11, R12, R13, R14, R15, R8, R9, R10)
SHA512ROUND1(62, 0xbef9a3f7b2c67915, R10, R11, R12, R13, R14, R15, R8, R9)
SHA512ROUND1(63, 0xc67178f2e372532b, R9, R10, R11, R12, R13, R14, R15, R8)
SHA512ROUND1(64, 0xca273eceea26619c, R8, R9, R10, R11, R12, R13, R14, R15)
SHA512ROUND1(65, 0xd186b8c721c0c207, R15, R8, R9, R10, R11, R12, R13, R14)
SHA512ROUND1(66, 0xeada7dd6cde0eb1e, R14, R15, R8, R9, R10, R11, R12, R13)
SHA512ROUND1(67, 0xf57d4f7fee6ed178, R13, R14, R15, R8, R9, R10, R11, R12)
SHA512ROUND1(68, 0x06f067aa72176fba, R12, R13, R14, R15, R8, R9, R10, R11)
SHA512ROUND1(69, 0x0a637dc5a2c898a6, R11, R12, R13, R14, R15, R8, R9, R10)
SHA512ROUND1(70, 0x113f9804bef90dae, R10, R11, R12, R13, R14, R15, R8, R9)
SHA512ROUND1(71, 0x1b710b35131c471b, R9, R10, R11, R12, R13, R14, R15, R8)
SHA512ROUND1(72, 0x28db77f523047d84, R8, R9, R10, R11, R12, R13, R14, R15)
SHA512ROUND1(73, 0x32caab7b40c72493, R15, R8, R9, R10, R11, R12, R13, R14)
SHA512ROUND1(74, 0x3c9ebe0a15c9bebc, R14, R15, R8, R9, R10, R11, R12, R13)
SHA512ROUND1(75, 0x431d67c49c100d4c, R13, R14, R15, R8, R9, R10, R11, R12)
SHA512ROUND1(76, 0x4cc5d4becb3e42b6, R12, R13, R14, R15, R8, R9, R10, R11)
SHA512ROUND1(77, 0x597f299cfc657e2a, R11, R12, R13, R14, R15, R8, R9, R10)
SHA512ROUND1(78, 0x5fcb6fab3ad6faec, R10, R11, R12, R13, R14, R15, R8, R9)
SHA512ROUND1(79, 0x6c44198c4a475817, R9, R10, R11, R12, R13, R14, R15, R8)
// 80 rounds is a multiple of 8, so a..h are back in R8..R15. Add them into
// the digest and keep the sums in registers for the next block.
MOVQ dig+0(FP), BP
ADDQ (0*8)(BP), R8 // H0 = a + H0
MOVQ R8, (0*8)(BP)
ADDQ (1*8)(BP), R9 // H1 = b + H1
MOVQ R9, (1*8)(BP)
ADDQ (2*8)(BP), R10 // H2 = c + H2
MOVQ R10, (2*8)(BP)
ADDQ (3*8)(BP), R11 // H3 = d + H3
MOVQ R11, (3*8)(BP)
ADDQ (4*8)(BP), R12 // H4 = e + H4
MOVQ R12, (4*8)(BP)
ADDQ (5*8)(BP), R13 // H5 = f + H5
MOVQ R13, (5*8)(BP)
ADDQ (6*8)(BP), R14 // H6 = g + H6
MOVQ R14, (6*8)(BP)
ADDQ (7*8)(BP), R15 // H7 = h + H7
MOVQ R15, (7*8)(BP)
// Advance to the next block; continue while SI is below the saved end pointer.
ADDQ $128, SI
CMPQ SI, 640(SP)
JB loop
end:
RET
// Version below is based on "Fast SHA512 Implementations on Intel
// Architecture Processors" White-paper
// https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-sha512-implementations-ia-processors-paper.pdf
// AVX2 version by Intel, same algorithm in Linux kernel:
// https://github.com/torvalds/linux/blob/master/arch/x86/crypto/sha512-avx2-asm.S
// James Guilford <james.guilford@intel.com>
// Kirk Yap <kirk.s.yap@intel.com>
// Tim Chen <tim.c.chen@linux.intel.com>
// David Cote <david.m.cote@intel.com>
// Aleksey Sidorov <aleksey.sidorov@intel.com>
// Stack frame layout for blockAVX2. The offsets work out to
// frame_YFER = 0, frame_SRND = 32, frame_INP = 40, frame_INPEND = 48;
// with the 8-byte INPEND slot the frame is 56 bytes in total.
//   frame_YFER   32-byte scratch area written from a YMM register and read
//                back 8 bytes at a time by the scalar rounds
//   frame_SRND   loop counter
//   frame_INP    saved input pointer
//   frame_INPEND pointer to the end of the input
#define YFER_SIZE (4*8)
#define SRND_SIZE (1*8)
#define INP_SIZE (1*8)
#define frame_YFER (0)
#define frame_SRND (frame_YFER + YFER_SIZE)
#define frame_INP (frame_SRND + SRND_SIZE)
#define frame_INPEND (frame_INP + INP_SIZE)
// addm adds memory operand p1 into register p2 and stores the sum back to
// p1, so both the register and the memory location end up holding the sum.
#define addm(p1, p2) \
ADDQ p1, p2; \
MOVQ p2, p1
// COPY_YMM_AND_BSWAP loads 32 bytes from p2 into YMM register p1 (unaligned
// load) and then shuffles the bytes of p1 using the mask in p3.
#define COPY_YMM_AND_BSWAP(p1, p2, p3) \
VMOVDQU p2, p1; \
VPSHUFB p3, p1, p1
// MY_VPALIGNR combines a cross-lane permute with a per-lane byte alignment.
// NOTE(review): this appears to treat YSRC1:YSRC2 as one 512-bit value and
// extract 256 bits starting RVAL bytes above the bottom of YSRC2 - confirm
// against the Intel white paper referenced above.
#define MY_VPALIGNR(YDST, YSRC1, YSRC2, RVAL) \
VPERM2F128 $0x3, YSRC2, YSRC1, YDST; \
VPALIGNR $RVAL, YSRC2, YDST, YDST
// PSHUFFLE_BYTE_FLIP_MASK is the 32-byte VPSHUFB control used by
// COPY_YMM_AND_BSWAP. Stored little-endian, each 8-byte group lists its byte
// indices in descending order, so the shuffle reverses the bytes within each
// 64-bit word.
DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x00(SB)/8, $0x0001020304050607
DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x08(SB)/8, $0x08090a0b0c0d0e0f
DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x10(SB)/8, $0x1011121314151617
DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x18(SB)/8, $0x18191a1b1c1d1e1f
GLOBL PSHUFFLE_BYTE_FLIP_MASK<>(SB), (NOPTR+RODATA), $32
// MASK_YMM_LO has its low 16 bytes clear and its high 16 bytes set. ANDing a
// YMM register with it zeroes the lower 128 bits and keeps the upper 128.
DATA MASK_YMM_LO<>+0x00(SB)/8, $0x0000000000000000
DATA MASK_YMM_LO<>+0x08(SB)/8, $0x0000000000000000
DATA MASK_YMM_LO<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF
DATA MASK_YMM_LO<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF
GLOBL MASK_YMM_LO<>(SB), (NOPTR+RODATA), $32
// blockAVX2 runs the SHA-512 compression function over every whole 128-byte
// block of p and updates the eight 64-bit hash words addressed by dig in
// place. Arguments are read from dig+0(FP), p_base+8(FP) and p_len+16(FP).
// p_len is rounded down to a multiple of 128, so trailing bytes are ignored.
//
// Register use:
//   SI   dig pointer
//   DI   input pointer between blocks; scratch inside the rounds (the
//        pointer is parked in frame_INP meanwhile)
//   BP   pointer loaded from ·_K and advanced 32 bytes per four rounds
//        (presumably the round-constant table; _K is defined outside this file)
//   AX, BX, CX, R8, DX, R9, R10, R11   working variables a, b, c, d, e, f, g, h
//   R12-R15, DI   per-round scratch
//   Y4-Y7   the current 16 message schedule words (4 per register)
//   Y9      byte-flip mask
//   Y0-Y3, Y8   vector scratch
//
// Each group of four rounds first stores (BP words + schedule words) to
// frame_YFER and the scalar rounds then add one 8-byte slot each to h.
// The scalar rounds follow the same formulas as the macros earlier in this
// file, with Ch computed as ((f XOR g) AND e) XOR g and Maj computed as
// ((a OR c) AND b) OR (a AND c).
TEXT ·blockAVX2(SB), NOSPLIT, $56-32
MOVQ dig+0(FP), SI
MOVQ p_base+8(FP), DI
MOVQ p_len+16(FP), DX
// Round the length down to a multiple of 128; SHLQ leaves ZF set when no
// whole block remains.
SHRQ $7, DX
SHLQ $7, DX
JZ done_hash
ADDQ DI, DX
MOVQ DX, frame_INPEND(SP)
// Load the working variables a..h from the digest.
MOVQ (0*8)(SI), AX
MOVQ (1*8)(SI), BX
MOVQ (2*8)(SI), CX
MOVQ (3*8)(SI), R8
MOVQ (4*8)(SI), DX
MOVQ (5*8)(SI), R9
MOVQ (6*8)(SI), R10
MOVQ (7*8)(SI), R11
VMOVDQU PSHUFFLE_BYTE_FLIP_MASK<>(SB), Y9
// loop0: one pass per 128-byte input block.
loop0:
MOVQ ·_K+0(SB), BP
// load the 16 64-bit message words of this block into Y4..Y7, byte-swapping each
COPY_YMM_AND_BSWAP(Y4, (0*32)(DI), Y9)
COPY_YMM_AND_BSWAP(Y5, (1*32)(DI), Y9)
COPY_YMM_AND_BSWAP(Y6, (2*32)(DI), Y9)
COPY_YMM_AND_BSWAP(Y7, (3*32)(DI), Y9)
MOVQ DI, frame_INP(SP)
// rounds 0-63: 4 passes of loop1, 16 rounds per pass; each pass also
// computes the next 16 schedule words into Y4..Y7
MOVQ $4, frame_SRND(SP)
loop1:
// Group 1 of 4: four rounds on the words in Y4; the vector code replaces Y4
// with the next four schedule words.
// Vector: stage the round inputs in frame_YFER, then start SIGMA0
// (rotate right 1, shift right 7) on Y1.
VPADDQ (BP), Y4, Y0
VMOVDQU Y0, frame_YFER(SP)
MY_VPALIGNR(Y0, Y7, Y6, 8)
VPADDQ Y4, Y0, Y0
MY_VPALIGNR(Y1, Y5, Y4, 8)
VPSRLQ $1, Y1, Y2
VPSLLQ $(64-1), Y1, Y3
VPOR Y2, Y3, Y3
VPSRLQ $7, Y1, Y8
// Scalar round: a-h = AX BX CX R8 DX R9 R10 R11.
MOVQ AX, DI
RORXQ $41, DX, R13
RORXQ $18, DX, R14
ADDQ frame_YFER(SP), R11
ORQ CX, DI
MOVQ R9, R15
RORXQ $34, AX, R12
XORQ R14, R13
XORQ R10, R15
RORXQ $14, DX, R14
ANDQ DX, R15
XORQ R14, R13
RORXQ $39, AX, R14
ADDQ R11, R8
ANDQ BX, DI
XORQ R12, R14
RORXQ $28, AX, R12
XORQ R10, R15
XORQ R12, R14
MOVQ AX, R12
ANDQ CX, R12
ADDQ R13, R15
ORQ R12, DI
ADDQ R14, R11
ADDQ R15, R8
ADDQ R15, R11
ADDQ DI, R11
// Vector: rotate right 8, XOR the three SIGMA0 terms and add them into Y0;
// then start SIGMA1 (shift right 6) on the upper half of Y7.
VPSRLQ $8, Y1, Y2
VPSLLQ $(64-8), Y1, Y1
VPOR Y2, Y1, Y1
VPXOR Y8, Y3, Y3
VPXOR Y1, Y3, Y1
VPADDQ Y1, Y0, Y0
VPERM2F128 $0x0, Y0, Y0, Y4
VPAND MASK_YMM_LO<>(SB), Y0, Y0
VPERM2F128 $0x11, Y7, Y7, Y2
VPSRLQ $6, Y2, Y8
// Scalar round: a-h = R11 AX BX CX R8 DX R9 R10.
MOVQ R11, DI
RORXQ $41, R8, R13
RORXQ $18, R8, R14
ADDQ 1*8+frame_YFER(SP), R10
ORQ BX, DI
MOVQ DX, R15
RORXQ $34, R11, R12
XORQ R14, R13
XORQ R9, R15
RORXQ $14, R8, R14
XORQ R14, R13
RORXQ $39, R11, R14
ANDQ R8, R15
ADDQ R10, CX
ANDQ AX, DI
XORQ R12, R14
RORXQ $28, R11, R12
XORQ R9, R15
XORQ R12, R14
MOVQ R11, R12
ANDQ BX, R12
ADDQ R13, R15
ORQ R12, DI
ADDQ R14, R10
ADDQ R15, CX
ADDQ R15, R10
ADDQ DI, R10
// Vector: finish SIGMA1 (rotate right 19 and 61, XORed with the shift) and
// add it into Y4; then start SIGMA1 on the updated Y4.
VPSRLQ $19, Y2, Y3
VPSLLQ $(64-19), Y2, Y1
VPOR Y1, Y3, Y3
VPXOR Y3, Y8, Y8
VPSRLQ $61, Y2, Y3
VPSLLQ $(64-61), Y2, Y1
VPOR Y1, Y3, Y3
VPXOR Y3, Y8, Y8
VPADDQ Y8, Y4, Y4
VPSRLQ $6, Y4, Y8
// Scalar round: a-h = R10 R11 AX BX CX R8 DX R9.
MOVQ R10, DI
RORXQ $41, CX, R13
ADDQ 2*8+frame_YFER(SP), R9
RORXQ $18, CX, R14
ORQ AX, DI
MOVQ R8, R15
XORQ DX, R15
RORXQ $34, R10, R12
XORQ R14, R13
ANDQ CX, R15
RORXQ $14, CX, R14
ADDQ R9, BX
ANDQ R11, DI
XORQ R14, R13
RORXQ $39, R10, R14
XORQ DX, R15
XORQ R12, R14
RORXQ $28, R10, R12
XORQ R12, R14
MOVQ R10, R12
ANDQ AX, R12
ADDQ R13, R15
ORQ R12, DI
ADDQ R14, R9
ADDQ R15, BX
ADDQ R15, R9
ADDQ DI, R9
// Vector: second SIGMA1, added to Y0; the upper 128 bits of the sum are
// blended into Y4, completing the new Y4.
VPSRLQ $19, Y4, Y3
VPSLLQ $(64-19), Y4, Y1
VPOR Y1, Y3, Y3
VPXOR Y3, Y8, Y8
VPSRLQ $61, Y4, Y3
VPSLLQ $(64-61), Y4, Y1
VPOR Y1, Y3, Y3
VPXOR Y3, Y8, Y8
VPADDQ Y8, Y0, Y2
VPBLENDD $0xF0, Y2, Y4, Y4
// Scalar round: a-h = R9 R10 R11 AX BX CX R8 DX.
MOVQ R9, DI
RORXQ $41, BX, R13
RORXQ $18, BX, R14
ADDQ 3*8+frame_YFER(SP), DX
ORQ R11, DI
MOVQ CX, R15
RORXQ $34, R9, R12
XORQ R14, R13
XORQ R8, R15
RORXQ $14, BX, R14
ANDQ BX, R15
ADDQ DX, AX
ANDQ R10, DI
XORQ R14, R13
XORQ R8, R15
RORXQ $39, R9, R14
ADDQ R13, R15
XORQ R12, R14
ADDQ R15, AX
RORXQ $28, R9, R12
XORQ R12, R14
MOVQ R9, R12
ANDQ R11, R12
ORQ R12, DI
ADDQ R14, DX
ADDQ R15, DX
ADDQ DI, DX
// Group 2 of 4: same sequence as group 1, with the rounds consuming Y5 and
// the vector code producing the new Y5.
VPADDQ 1*32(BP), Y5, Y0
VMOVDQU Y0, frame_YFER(SP)
MY_VPALIGNR(Y0, Y4, Y7, 8)
VPADDQ Y5, Y0, Y0
MY_VPALIGNR(Y1, Y6, Y5, 8)
VPSRLQ $1, Y1, Y2
VPSLLQ $(64-1), Y1, Y3
VPOR Y2, Y3, Y3
VPSRLQ $7, Y1, Y8
// Scalar round: a-h = DX R9 R10 R11 AX BX CX R8.
MOVQ DX, DI
RORXQ $41, AX, R13
RORXQ $18, AX, R14
ADDQ frame_YFER(SP), R8
ORQ R10, DI
MOVQ BX, R15
RORXQ $34, DX, R12
XORQ R14, R13
XORQ CX, R15
RORXQ $14, AX, R14
ANDQ AX, R15
XORQ R14, R13
RORXQ $39, DX, R14
ADDQ R8, R11
ANDQ R9, DI
XORQ R12, R14
RORXQ $28, DX, R12
XORQ CX, R15
XORQ R12, R14
MOVQ DX, R12
ANDQ R10, R12
ADDQ R13, R15
ORQ R12, DI
ADDQ R14, R8
ADDQ R15, R11
ADDQ R15, R8
ADDQ DI, R8
VPSRLQ $8, Y1, Y2
VPSLLQ $(64-8), Y1, Y1
VPOR Y2, Y1, Y1
VPXOR Y8, Y3, Y3
VPXOR Y1, Y3, Y1
VPADDQ Y1, Y0, Y0
VPERM2F128 $0x0, Y0, Y0, Y5
VPAND MASK_YMM_LO<>(SB), Y0, Y0
VPERM2F128 $0x11, Y4, Y4, Y2
VPSRLQ $6, Y2, Y8
// Scalar round: a-h = R8 DX R9 R10 R11 AX BX CX.
MOVQ R8, DI
RORXQ $41, R11, R13
RORXQ $18, R11, R14
ADDQ 1*8+frame_YFER(SP), CX
ORQ R9, DI
MOVQ AX, R15
RORXQ $34, R8, R12
XORQ R14, R13
XORQ BX, R15
RORXQ $14, R11, R14
XORQ R14, R13
RORXQ $39, R8, R14
ANDQ R11, R15
ADDQ CX, R10
ANDQ DX, DI
XORQ R12, R14
RORXQ $28, R8, R12
XORQ BX, R15
XORQ R12, R14
MOVQ R8, R12
ANDQ R9, R12
ADDQ R13, R15
ORQ R12, DI
ADDQ R14, CX
ADDQ R15, R10
ADDQ R15, CX
ADDQ DI, CX
VPSRLQ $19, Y2, Y3
VPSLLQ $(64-19), Y2, Y1
VPOR Y1, Y3, Y3
VPXOR Y3, Y8, Y8
VPSRLQ $61, Y2, Y3
VPSLLQ $(64-61), Y2, Y1
VPOR Y1, Y3, Y3
VPXOR Y3, Y8, Y8
VPADDQ Y8, Y5, Y5
VPSRLQ $6, Y5, Y8
// Scalar round: a-h = CX R8 DX R9 R10 R11 AX BX.
MOVQ CX, DI
RORXQ $41, R10, R13
ADDQ 2*8+frame_YFER(SP), BX
RORXQ $18, R10, R14
ORQ DX, DI
MOVQ R11, R15
XORQ AX, R15
RORXQ $34, CX, R12
XORQ R14, R13
ANDQ R10, R15
RORXQ $14, R10, R14
ADDQ BX, R9
ANDQ R8, DI
XORQ R14, R13
RORXQ $39, CX, R14
XORQ AX, R15
XORQ R12, R14
RORXQ $28, CX, R12
XORQ R12, R14
MOVQ CX, R12
ANDQ DX, R12
ADDQ R13, R15
ORQ R12, DI
ADDQ R14, BX
ADDQ R15, R9
ADDQ R15, BX
ADDQ DI, BX
VPSRLQ $19, Y5, Y3
VPSLLQ $(64-19), Y5, Y1
VPOR Y1, Y3, Y3
VPXOR Y3, Y8, Y8
VPSRLQ $61, Y5, Y3
VPSLLQ $(64-61), Y5, Y1
VPOR Y1, Y3, Y3
VPXOR Y3, Y8, Y8
VPADDQ Y8, Y0, Y2
VPBLENDD $0xF0, Y2, Y5, Y5
// Scalar round: a-h = BX CX R8 DX R9 R10 R11 AX.
MOVQ BX, DI
RORXQ $41, R9, R13
RORXQ $18, R9, R14
ADDQ 3*8+frame_YFER(SP), AX
ORQ R8, DI
MOVQ R10, R15
RORXQ $34, BX, R12
XORQ R14, R13
XORQ R11, R15
RORXQ $14, R9, R14
ANDQ R9, R15
ADDQ AX, DX
ANDQ CX, DI
XORQ R14, R13
XORQ R11, R15
RORXQ $39, BX, R14
ADDQ R13, R15
XORQ R12, R14
ADDQ R15, DX
RORXQ $28, BX, R12
XORQ R12, R14
MOVQ BX, R12
ANDQ R8, R12
ORQ R12, DI
ADDQ R14, AX
ADDQ R15, AX
ADDQ DI, AX
// Group 3 of 4: same sequence, with the rounds consuming Y6 and the vector
// code producing the new Y6. After eight rounds a..h are back in their
// initial registers.
VPADDQ 2*32(BP), Y6, Y0
VMOVDQU Y0, frame_YFER(SP)
MY_VPALIGNR(Y0, Y5, Y4, 8)
VPADDQ Y6, Y0, Y0
MY_VPALIGNR(Y1, Y7, Y6, 8)
VPSRLQ $1, Y1, Y2
VPSLLQ $(64-1), Y1, Y3
VPOR Y2, Y3, Y3
VPSRLQ $7, Y1, Y8
// Scalar round: a-h = AX BX CX R8 DX R9 R10 R11.
MOVQ AX, DI
RORXQ $41, DX, R13
RORXQ $18, DX, R14
ADDQ frame_YFER(SP), R11
ORQ CX, DI
MOVQ R9, R15
RORXQ $34, AX, R12
XORQ R14, R13
XORQ R10, R15
RORXQ $14, DX, R14
ANDQ DX, R15
XORQ R14, R13
RORXQ $39, AX, R14
ADDQ R11, R8
ANDQ BX, DI
XORQ R12, R14
RORXQ $28, AX, R12
XORQ R10, R15
XORQ R12, R14
MOVQ AX, R12
ANDQ CX, R12
ADDQ R13, R15
ORQ R12, DI
ADDQ R14, R11
ADDQ R15, R8
ADDQ R15, R11
ADDQ DI, R11
VPSRLQ $8, Y1, Y2
VPSLLQ $(64-8), Y1, Y1
VPOR Y2, Y1, Y1
VPXOR Y8, Y3, Y3
VPXOR Y1, Y3, Y1
VPADDQ Y1, Y0, Y0
VPERM2F128 $0x0, Y0, Y0, Y6
VPAND MASK_YMM_LO<>(SB), Y0, Y0
VPERM2F128 $0x11, Y5, Y5, Y2
VPSRLQ $6, Y2, Y8
// Scalar round: a-h = R11 AX BX CX R8 DX R9 R10.
MOVQ R11, DI
RORXQ $41, R8, R13
RORXQ $18, R8, R14
ADDQ 1*8+frame_YFER(SP), R10
ORQ BX, DI
MOVQ DX, R15
RORXQ $34, R11, R12
XORQ R14, R13
XORQ R9, R15
RORXQ $14, R8, R14
XORQ R14, R13
RORXQ $39, R11, R14
ANDQ R8, R15
ADDQ R10, CX
ANDQ AX, DI
XORQ R12, R14
RORXQ $28, R11, R12
XORQ R9, R15
XORQ R12, R14
MOVQ R11, R12
ANDQ BX, R12
ADDQ R13, R15
ORQ R12, DI
ADDQ R14, R10
ADDQ R15, CX
ADDQ R15, R10
ADDQ DI, R10
VPSRLQ $19, Y2, Y3
VPSLLQ $(64-19), Y2, Y1
VPOR Y1, Y3, Y3
VPXOR Y3, Y8, Y8
VPSRLQ $61, Y2, Y3
VPSLLQ $(64-61), Y2, Y1
VPOR Y1, Y3, Y3
VPXOR Y3, Y8, Y8
VPADDQ Y8, Y6, Y6
VPSRLQ $6, Y6, Y8
// Scalar round: a-h = R10 R11 AX BX CX R8 DX R9.
MOVQ R10, DI
RORXQ $41, CX, R13
ADDQ 2*8+frame_YFER(SP), R9
RORXQ $18, CX, R14
ORQ AX, DI
MOVQ R8, R15
XORQ DX, R15
RORXQ $34, R10, R12
XORQ R14, R13
ANDQ CX, R15
RORXQ $14, CX, R14
ADDQ R9, BX
ANDQ R11, DI
XORQ R14, R13
RORXQ $39, R10, R14
XORQ DX, R15
XORQ R12, R14
RORXQ $28, R10, R12
XORQ R12, R14
MOVQ R10, R12
ANDQ AX, R12
ADDQ R13, R15
ORQ R12, DI
ADDQ R14, R9
ADDQ R15, BX
ADDQ R15, R9
ADDQ DI, R9
VPSRLQ $19, Y6, Y3
VPSLLQ $(64-19), Y6, Y1
VPOR Y1, Y3, Y3
VPXOR Y3, Y8, Y8
VPSRLQ $61, Y6, Y3
VPSLLQ $(64-61), Y6, Y1
VPOR Y1, Y3, Y3
VPXOR Y3, Y8, Y8
VPADDQ Y8, Y0, Y2
VPBLENDD $0xF0, Y2, Y6, Y6
// Scalar round: a-h = R9 R10 R11 AX BX CX R8 DX.
MOVQ R9, DI
RORXQ $41, BX, R13
RORXQ $18, BX, R14
ADDQ 3*8+frame_YFER(SP), DX
ORQ R11, DI
MOVQ CX, R15
RORXQ $34, R9, R12
XORQ R14, R13
XORQ R8, R15
RORXQ $14, BX, R14
ANDQ BX, R15
ADDQ DX, AX
ANDQ R10, DI
XORQ R14, R13
XORQ R8, R15
RORXQ $39, R9, R14
ADDQ R13, R15
XORQ R12, R14
ADDQ R15, AX
RORXQ $28, R9, R12
XORQ R12, R14
MOVQ R9, R12
ANDQ R11, R12
ORQ R12, DI
ADDQ R14, DX
ADDQ R15, DX
ADDQ DI, DX
// Group 4 of 4: same sequence, with the rounds consuming Y7 and the vector
// code producing the new Y7. BP is advanced by 4*32 bytes here, past the
// words consumed by this pass.
VPADDQ 3*32(BP), Y7, Y0
VMOVDQU Y0, frame_YFER(SP)
ADDQ $(4*32), BP
MY_VPALIGNR(Y0, Y6, Y5, 8)
VPADDQ Y7, Y0, Y0
MY_VPALIGNR(Y1, Y4, Y7, 8)
VPSRLQ $1, Y1, Y2
VPSLLQ $(64-1), Y1, Y3
VPOR Y2, Y3, Y3
VPSRLQ $7, Y1, Y8
// Scalar round: a-h = DX R9 R10 R11 AX BX CX R8.
MOVQ DX, DI
RORXQ $41, AX, R13
RORXQ $18, AX, R14
ADDQ frame_YFER(SP), R8
ORQ R10, DI
MOVQ BX, R15
RORXQ $34, DX, R12
XORQ R14, R13
XORQ CX, R15
RORXQ $14, AX, R14
ANDQ AX, R15
XORQ R14, R13
RORXQ $39, DX, R14
ADDQ R8, R11
ANDQ R9, DI
XORQ R12, R14
RORXQ $28, DX, R12
XORQ CX, R15
XORQ R12, R14
MOVQ DX, R12
ANDQ R10, R12
ADDQ R13, R15
ORQ R12, DI
ADDQ R14, R8
ADDQ R15, R11
ADDQ R15, R8
ADDQ DI, R8
VPSRLQ $8, Y1, Y2
VPSLLQ $(64-8), Y1, Y1
VPOR Y2, Y1, Y1
VPXOR Y8, Y3, Y3
VPXOR Y1, Y3, Y1
VPADDQ Y1, Y0, Y0
VPERM2F128 $0x0, Y0, Y0, Y7
VPAND MASK_YMM_LO<>(SB), Y0, Y0
VPERM2F128 $0x11, Y6, Y6, Y2
VPSRLQ $6, Y2, Y8
// Scalar round: a-h = R8 DX R9 R10 R11 AX BX CX.
MOVQ R8, DI
RORXQ $41, R11, R13
RORXQ $18, R11, R14
ADDQ 1*8+frame_YFER(SP), CX
ORQ R9, DI
MOVQ AX, R15
RORXQ $34, R8, R12
XORQ R14, R13
XORQ BX, R15
RORXQ $14, R11, R14
XORQ R14, R13
RORXQ $39, R8, R14
ANDQ R11, R15
ADDQ CX, R10
ANDQ DX, DI
XORQ R12, R14
RORXQ $28, R8, R12
XORQ BX, R15
XORQ R12, R14
MOVQ R8, R12
ANDQ R9, R12
ADDQ R13, R15
ORQ R12, DI
ADDQ R14, CX
ADDQ R15, R10
ADDQ R15, CX
ADDQ DI, CX
VPSRLQ $19, Y2, Y3
VPSLLQ $(64-19), Y2, Y1
VPOR Y1, Y3, Y3
VPXOR Y3, Y8, Y8
VPSRLQ $61, Y2, Y3
VPSLLQ $(64-61), Y2, Y1
VPOR Y1, Y3, Y3
VPXOR Y3, Y8, Y8
VPADDQ Y8, Y7, Y7
VPSRLQ $6, Y7, Y8
// Scalar round: a-h = CX R8 DX R9 R10 R11 AX BX.
MOVQ CX, DI
RORXQ $41, R10, R13
ADDQ 2*8+frame_YFER(SP), BX
RORXQ $18, R10, R14
ORQ DX, DI
MOVQ R11, R15
XORQ AX, R15
RORXQ $34, CX, R12
XORQ R14, R13
ANDQ R10, R15
RORXQ $14, R10, R14
ADDQ BX, R9
ANDQ R8, DI
XORQ R14, R13
RORXQ $39, CX, R14
XORQ AX, R15
XORQ R12, R14
RORXQ $28, CX, R12
XORQ R12, R14
MOVQ CX, R12
ANDQ DX, R12
ADDQ R13, R15
ORQ R12, DI
ADDQ R14, BX
ADDQ R15, R9
ADDQ R15, BX
ADDQ DI, BX
VPSRLQ $19, Y7, Y3
VPSLLQ $(64-19), Y7, Y1
VPOR Y1, Y3, Y3
VPXOR Y3, Y8, Y8
VPSRLQ $61, Y7, Y3
VPSLLQ $(64-61), Y7, Y1
VPOR Y1, Y3, Y3
VPXOR Y3, Y8, Y8
VPADDQ Y8, Y0, Y2
VPBLENDD $0xF0, Y2, Y7, Y7
// Scalar round: a-h = BX CX R8 DX R9 R10 R11 AX.
MOVQ BX, DI
RORXQ $41, R9, R13
RORXQ $18, R9, R14
ADDQ 3*8+frame_YFER(SP), AX
ORQ R8, DI
MOVQ R10, R15
RORXQ $34, BX, R12
XORQ R14, R13
XORQ R11, R15
RORXQ $14, R9, R14
ANDQ R9, R15
ADDQ AX, DX
ANDQ CX, DI
XORQ R14, R13
XORQ R11, R15
RORXQ $39, BX, R14
ADDQ R13, R15
XORQ R12, R14
ADDQ R15, DX
RORXQ $28, BX, R12
XORQ R12, R14
MOVQ BX, R12
ANDQ R8, R12
ORQ R12, DI
ADDQ R14, AX
ADDQ R15, AX
ADDQ DI, AX
// End of one 16-round pass; repeat until frame_SRND reaches zero.
SUBQ $1, frame_SRND(SP)
JNE loop1
// Rounds 64-79: 2 passes of loop2, 8 rounds per pass, with no further
// scheduling. The first pass consumes Y4 and Y5; Y6 and Y7 are then moved
// into Y4 and Y5 for the second pass.
MOVQ $2, frame_SRND(SP)
// In loop2 the rounds are software-pipelined: the final "ADDQ DI, <new a>"
// of a round is issued a few instructions into the following round, except
// for the fourth and eighth rounds, which complete in place.
loop2:
VPADDQ (BP), Y4, Y0
VMOVDQU Y0, frame_YFER(SP)
// Scalar round: a-h = AX BX CX R8 DX R9 R10 R11.
MOVQ R9, R15
RORXQ $41, DX, R13
RORXQ $18, DX, R14
XORQ R10, R15
XORQ R14, R13
RORXQ $14, DX, R14
ANDQ DX, R15
XORQ R14, R13
RORXQ $34, AX, R12
XORQ R10, R15
RORXQ $39, AX, R14
MOVQ AX, DI
XORQ R12, R14
RORXQ $28, AX, R12
ADDQ frame_YFER(SP), R11
ORQ CX, DI
XORQ R12, R14
MOVQ AX, R12
ANDQ BX, DI
ANDQ CX, R12
ADDQ R13, R15
ADDQ R11, R8
ORQ R12, DI
ADDQ R14, R11
ADDQ R15, R8
ADDQ R15, R11
// Scalar round: a-h = R11 AX BX CX R8 DX R9 R10.
MOVQ DX, R15
RORXQ $41, R8, R13
RORXQ $18, R8, R14
XORQ R9, R15
XORQ R14, R13
RORXQ $14, R8, R14
ANDQ R8, R15
ADDQ DI, R11
XORQ R14, R13
RORXQ $34, R11, R12
XORQ R9, R15
RORXQ $39, R11, R14
MOVQ R11, DI
XORQ R12, R14
RORXQ $28, R11, R12
ADDQ 8*1+frame_YFER(SP), R10
ORQ BX, DI
XORQ R12, R14
MOVQ R11, R12
ANDQ AX, DI
ANDQ BX, R12
ADDQ R13, R15
ADDQ R10, CX
ORQ R12, DI
ADDQ R14, R10
ADDQ R15, CX
ADDQ R15, R10
// Scalar round: a-h = R10 R11 AX BX CX R8 DX R9.
MOVQ R8, R15
RORXQ $41, CX, R13
RORXQ $18, CX, R14
XORQ DX, R15
XORQ R14, R13
RORXQ $14, CX, R14
ANDQ CX, R15
ADDQ DI, R10
XORQ R14, R13
RORXQ $34, R10, R12
XORQ DX, R15
RORXQ $39, R10, R14
MOVQ R10, DI
XORQ R12, R14
RORXQ $28, R10, R12
ADDQ 8*2+frame_YFER(SP), R9
ORQ AX, DI
XORQ R12, R14
MOVQ R10, R12
ANDQ R11, DI
ANDQ AX, R12
ADDQ R13, R15
ADDQ R9, BX
ORQ R12, DI
ADDQ R14, R9
ADDQ R15, BX
ADDQ R15, R9
// Scalar round: a-h = R9 R10 R11 AX BX CX R8 DX.
MOVQ CX, R15
RORXQ $41, BX, R13
RORXQ $18, BX, R14
XORQ R8, R15
XORQ R14, R13
RORXQ $14, BX, R14
ANDQ BX, R15
ADDQ DI, R9
XORQ R14, R13
RORXQ $34, R9, R12
XORQ R8, R15
RORXQ $39, R9, R14
MOVQ R9, DI
XORQ R12, R14
RORXQ $28, R9, R12
ADDQ 8*3+frame_YFER(SP), DX
ORQ R11, DI
XORQ R12, R14
MOVQ R9, R12
ANDQ R10, DI
ANDQ R11, R12
ADDQ R13, R15
ADDQ DX, AX
ORQ R12, DI
ADDQ R14, DX
ADDQ R15, AX
ADDQ R15, DX
ADDQ DI, DX
// Second half of the pass: stage the inputs for four more rounds from Y5
// and advance BP by 2*32 bytes, past the words used by this pass.
VPADDQ 1*32(BP), Y5, Y0
VMOVDQU Y0, frame_YFER(SP)
ADDQ $(2*32), BP
// Scalar round: a-h = DX R9 R10 R11 AX BX CX R8.
MOVQ BX, R15
RORXQ $41, AX, R13
RORXQ $18, AX, R14
XORQ CX, R15
XORQ R14, R13
RORXQ $14, AX, R14
ANDQ AX, R15
XORQ R14, R13
RORXQ $34, DX, R12
XORQ CX, R15
RORXQ $39, DX, R14
MOVQ DX, DI
XORQ R12, R14
RORXQ $28, DX, R12
ADDQ frame_YFER(SP), R8
ORQ R10, DI
XORQ R12, R14
MOVQ DX, R12
ANDQ R9, DI
ANDQ R10, R12
ADDQ R13, R15
ADDQ R8, R11
ORQ R12, DI
ADDQ R14, R8
ADDQ R15, R11
ADDQ R15, R8
// Scalar round: a-h = R8 DX R9 R10 R11 AX BX CX.
MOVQ AX, R15
RORXQ $41, R11, R13
RORXQ $18, R11, R14
XORQ BX, R15
XORQ R14, R13
RORXQ $14, R11, R14
ANDQ R11, R15
ADDQ DI, R8
XORQ R14, R13
RORXQ $34, R8, R12
XORQ BX, R15
RORXQ $39, R8, R14
MOVQ R8, DI
XORQ R12, R14
RORXQ $28, R8, R12
ADDQ 8*1+frame_YFER(SP), CX
ORQ R9, DI
XORQ R12, R14
MOVQ R8, R12
ANDQ DX, DI
ANDQ R9, R12
ADDQ R13, R15
ADDQ CX, R10
ORQ R12, DI
ADDQ R14, CX
ADDQ R15, R10
ADDQ R15, CX
// Scalar round: a-h = CX R8 DX R9 R10 R11 AX BX.
MOVQ R11, R15
RORXQ $41, R10, R13
RORXQ $18, R10, R14
XORQ AX, R15
XORQ R14, R13
RORXQ $14, R10, R14
ANDQ R10, R15
ADDQ DI, CX
XORQ R14, R13
RORXQ $34, CX, R12
XORQ AX, R15
RORXQ $39, CX, R14
MOVQ CX, DI
XORQ R12, R14
RORXQ $28, CX, R12
ADDQ 8*2+frame_YFER(SP), BX
ORQ DX, DI
XORQ R12, R14
MOVQ CX, R12
ANDQ R8, DI
ANDQ DX, R12
ADDQ R13, R15
ADDQ BX, R9
ORQ R12, DI
ADDQ R14, BX
ADDQ R15, R9
ADDQ R15, BX
// Scalar round: a-h = BX CX R8 DX R9 R10 R11 AX.
MOVQ R10, R15
RORXQ $41, R9, R13
RORXQ $18, R9, R14
XORQ R11, R15
XORQ R14, R13
RORXQ $14, R9, R14
ANDQ R9, R15
ADDQ DI, BX
XORQ R14, R13
RORXQ $34, BX, R12
XORQ R11, R15
RORXQ $39, BX, R14
MOVQ BX, DI
XORQ R12, R14
RORXQ $28, BX, R12
ADDQ 8*3+frame_YFER(SP), AX
ORQ R8, DI
XORQ R12, R14
MOVQ BX, R12
ANDQ CX, DI
ANDQ R8, R12
ADDQ R13, R15
ADDQ AX, DX
ORQ R12, DI
ADDQ R14, AX
ADDQ R15, DX
ADDQ R15, AX
ADDQ DI, AX
// Shift the remaining schedule words down for the second pass.
VMOVDQU Y6, Y4
VMOVDQU Y7, Y5
SUBQ $1, frame_SRND(SP)
JNE loop2
// Add the working variables into the digest; addm leaves each sum in both
// memory and the register, so the registers are ready for the next block.
addm(8*0(SI),AX)
addm(8*1(SI),BX)
addm(8*2(SI),CX)
addm(8*3(SI),R8)
addm(8*4(SI),DX)
addm(8*5(SI),R9)
addm(8*6(SI),R10)
addm(8*7(SI),R11)
// Restore the input pointer and advance to the next block. The end pointer
// is a whole number of 128-byte blocks past the start, so the equality test
// is exact.
MOVQ frame_INP(SP), DI
ADDQ $128, DI
CMPQ DI, frame_INPEND(SP)
JNE loop0
done_hash:
VZEROUPPER
RET