// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build amd64 && gc && !purego

#include "textflag.h"
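
// c40 and c48 are PSHUFB masks. Applied to a 128-bit register they rotate
// each 64-bit lane left by 40 and 48 bits respectively (i.e. right by 24
// and 16), the byte-aligned rotations of the BLAKE2b/BlaMka G function.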
DATA ·c40<>+0x00(SB)/8, $0x0201000706050403
DATA ·c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b
GLOBL ·c40<>(SB), (NOPTR+RODATA), $16

DATA ·c48<>+0x00(SB)/8, $0x0100070605040302
DATA ·c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a
GLOBL ·c48<>(SB), (NOPTR+RODATA), $16
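
// SHUFFLE rotates the lower three rows of the 4x4 state (held two 64-bit
// words per register in v2..v7) by one, two and three positions, so that
// the following HALF_ROUND operates on the diagonals; t1, t2 are scratch.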
#define SHUFFLE(v2, v3, v4, v5, v6, v7, t1, t2) \
	MOVO       v4, t1; \
	MOVO       v5, v4; \
	MOVO       t1, v5; \
	MOVO       v6, t1; \
	PUNPCKLQDQ v6, t2; \
	PUNPCKHQDQ v7, v6; \
	PUNPCKHQDQ t2, v6; \
	PUNPCKLQDQ v7, t2; \
	MOVO       t1, v7; \
	MOVO       v2, t1; \
	PUNPCKHQDQ t2, v7; \
	PUNPCKLQDQ v3, t2; \
	PUNPCKHQDQ t2, v2; \
	PUNPCKLQDQ t1, t2; \
	PUNPCKHQDQ t2, v3
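
// SHUFFLE_INV is the inverse of SHUFFLE: it rotates the rows back into
// column order after the diagonal half-round.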
#define SHUFFLE_INV(v2, v3, v4, v5, v6, v7, t1, t2) \
	MOVO       v4, t1; \
	MOVO       v5, v4; \
	MOVO       t1, v5; \
	MOVO       v2, t1; \
	PUNPCKLQDQ v2, t2; \
	PUNPCKHQDQ v3, v2; \
	PUNPCKHQDQ t2, v2; \
	PUNPCKLQDQ v3, t2; \
	MOVO       t1, v3; \
	MOVO       v6, t1; \
	PUNPCKHQDQ t2, v3; \
	PUNPCKLQDQ v7, t2; \
	PUNPCKHQDQ t2, v6; \
	PUNPCKLQDQ t1, t2; \
	PUNPCKHQDQ t2, v7
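
// HALF_ROUND applies the BlaMka G function twice to each of the two
// register groups (v0, v2, v4, v6) and (v1, v3, v5, v7), i.e. one half of a
// BLAKE2b-style round. With a, b, c, d naming the four registers of a group:
//
//	a += b + 2*lo32(a)*lo32(b)   // PMULULQ multiplies the low 32 bits of each lane
//	d = (d ^ a) >>> 32           // PSHUFD $0xB1 swaps the 32-bit halves of each lane
//	c += d + 2*lo32(c)*lo32(d)
//	b = (b ^ c) >>> 24           // PSHUFB c40
//
// and then once more with the final rotations >>> 16 (PSHUFB c48) and
// >>> 63 (add-shift-xor via PADDQ/PSRLQ/PXOR).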
#define HALF_ROUND(v0, v1, v2, v3, v4, v5, v6, v7, t0, c40, c48) \
	MOVO    v0, t0;        \
	PMULULQ v2, t0;        \
	PADDQ   v2, v0;        \
	PADDQ   t0, v0;        \
	PADDQ   t0, v0;        \
	PXOR    v0, v6;        \
	PSHUFD  $0xB1, v6, v6; \
	MOVO    v4, t0;        \
	PMULULQ v6, t0;        \
	PADDQ   v6, v4;        \
	PADDQ   t0, v4;        \
	PADDQ   t0, v4;        \
	PXOR    v4, v2;        \
	PSHUFB  c40, v2;       \
	MOVO    v0, t0;        \
	PMULULQ v2, t0;        \
	PADDQ   v2, v0;        \
	PADDQ   t0, v0;        \
	PADDQ   t0, v0;        \
	PXOR    v0, v6;        \
	PSHUFB  c48, v6;       \
	MOVO    v4, t0;        \
	PMULULQ v6, t0;        \
	PADDQ   v6, v4;        \
	PADDQ   t0, v4;        \
	PADDQ   t0, v4;        \
	PXOR    v4, v2;        \
	MOVO    v2, t0;        \
	PADDQ   v2, t0;        \
	PSRLQ   $63, v2;       \
	PXOR    t0, v2;        \
	MOVO    v1, t0;        \
	PMULULQ v3, t0;        \
	PADDQ   v3, v1;        \
	PADDQ   t0, v1;        \
	PADDQ   t0, v1;        \
	PXOR    v1, v7;        \
	PSHUFD  $0xB1, v7, v7; \
	MOVO    v5, t0;        \
	PMULULQ v7, t0;        \
	PADDQ   v7, v5;        \
	PADDQ   t0, v5;        \
	PADDQ   t0, v5;        \
	PXOR    v5, v3;        \
	PSHUFB  c40, v3;       \
	MOVO    v1, t0;        \
	PMULULQ v3, t0;        \
	PADDQ   v3, v1;        \
	PADDQ   t0, v1;        \
	PADDQ   t0, v1;        \
	PXOR    v1, v7;        \
	PSHUFB  c48, v7;       \
	MOVO    v5, t0;        \
	PMULULQ v7, t0;        \
	PADDQ   v7, v5;        \
	PADDQ   t0, v5;        \
	PADDQ   t0, v5;        \
	PXOR    v5, v3;        \
	MOVO    v3, t0;        \
	PADDQ   v3, t0;        \
	PSRLQ   $63, v3;       \
	PXOR    t0, v3
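
// LOAD_MSG_0 and STORE_MSG_0 move 16 consecutive uint64 words of the
// block (128 bytes starting at word offset off) between memory and X0-X7.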
#define LOAD_MSG_0(block, off) \
	MOVOU 8*(off+0)(block), X0;  \
	MOVOU 8*(off+2)(block), X1;  \
	MOVOU 8*(off+4)(block), X2;  \
	MOVOU 8*(off+6)(block), X3;  \
	MOVOU 8*(off+8)(block), X4;  \
	MOVOU 8*(off+10)(block), X5; \
	MOVOU 8*(off+12)(block), X6; \
	MOVOU 8*(off+14)(block), X7

#define STORE_MSG_0(block, off) \
	MOVOU X0, 8*(off+0)(block);  \
	MOVOU X1, 8*(off+2)(block);  \
	MOVOU X2, 8*(off+4)(block);  \
	MOVOU X3, 8*(off+6)(block);  \
	MOVOU X4, 8*(off+8)(block);  \
	MOVOU X5, 8*(off+10)(block); \
	MOVOU X6, 8*(off+12)(block); \
	MOVOU X7, 8*(off+14)(block)
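
// LOAD_MSG_1 and STORE_MSG_1 move one 16-byte register from each of the
// block's 8 rows (row stride 16 words = 128 bytes), starting at word
// offset off, between memory and X0-X7.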
#define LOAD_MSG_1(block, off) \
	MOVOU 8*off+0*8(block), X0;   \
	MOVOU 8*off+16*8(block), X1;  \
	MOVOU 8*off+32*8(block), X2;  \
	MOVOU 8*off+48*8(block), X3;  \
	MOVOU 8*off+64*8(block), X4;  \
	MOVOU 8*off+80*8(block), X5;  \
	MOVOU 8*off+96*8(block), X6;  \
	MOVOU 8*off+112*8(block), X7

#define STORE_MSG_1(block, off) \
	MOVOU X0, 8*off+0*8(block);   \
	MOVOU X1, 8*off+16*8(block);  \
	MOVOU X2, 8*off+32*8(block);  \
	MOVOU X3, 8*off+48*8(block);  \
	MOVOU X4, 8*off+64*8(block);  \
	MOVOU X5, 8*off+80*8(block);  \
	MOVOU X6, 8*off+96*8(block);  \
	MOVOU X7, 8*off+112*8(block)
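
// BLAMKA_ROUND_0 and BLAMKA_ROUND_1 apply the BlaMka permutation P to 16
// words of the block: a column half-round, SHUFFLE to the diagonals, a
// diagonal half-round, and SHUFFLE_INV back. ROUND_0 works on a contiguous
// row of the block, ROUND_1 on a column gathered by LOAD_MSG_1.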
#define BLAMKA_ROUND_0(block, off, t0, t1, c40, c48) \
	LOAD_MSG_0(block, off);                                   \
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48); \
	SHUFFLE(X2, X3, X4, X5, X6, X7, t0, t1);                  \
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48); \
	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, t0, t1);              \
	STORE_MSG_0(block, off)

#define BLAMKA_ROUND_1(block, off, t0, t1, c40, c48) \
	LOAD_MSG_1(block, off);                                   \
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48); \
	SHUFFLE(X2, X3, X4, X5, X6, X7, t0, t1);                  \
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48); \
	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, t0, t1);              \
	STORE_MSG_1(block, off)
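
// blamkaSSE4 applies the Argon2 compression permutation to a single
// 1024-byte block: the 128 uint64 words are viewed as an 8x8 matrix of
// 16-byte registers, and P is applied to every row and then to every column.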
// func blamkaSSE4(b *block)
TEXT ·blamkaSSE4(SB), NOSPLIT, $0-8
	MOVQ b+0(FP), AX

	MOVOU ·c40<>(SB), X10
	MOVOU ·c48<>(SB), X11

	// Apply P to each of the 8 rows (16 consecutive uint64 words per row).
	BLAMKA_ROUND_0(AX, 0, X8, X9, X10, X11)
	BLAMKA_ROUND_0(AX, 16, X8, X9, X10, X11)
	BLAMKA_ROUND_0(AX, 32, X8, X9, X10, X11)
	BLAMKA_ROUND_0(AX, 48, X8, X9, X10, X11)
	BLAMKA_ROUND_0(AX, 64, X8, X9, X10, X11)
	BLAMKA_ROUND_0(AX, 80, X8, X9, X10, X11)
	BLAMKA_ROUND_0(AX, 96, X8, X9, X10, X11)
	BLAMKA_ROUND_0(AX, 112, X8, X9, X10, X11)

	// Then apply P to each of the 8 columns (one 16-byte register per row).
	BLAMKA_ROUND_1(AX, 0, X8, X9, X10, X11)
	BLAMKA_ROUND_1(AX, 2, X8, X9, X10, X11)
	BLAMKA_ROUND_1(AX, 4, X8, X9, X10, X11)
	BLAMKA_ROUND_1(AX, 6, X8, X9, X10, X11)
	BLAMKA_ROUND_1(AX, 8, X8, X9, X10, X11)
	BLAMKA_ROUND_1(AX, 10, X8, X9, X10, X11)
	BLAMKA_ROUND_1(AX, 12, X8, X9, X10, X11)
	BLAMKA_ROUND_1(AX, 14, X8, X9, X10, X11)
	RET
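
// mixBlocksSSE2 computes out = a ^ b ^ c for full 1024-byte blocks,
// 16 bytes per iteration (DI counts 128 words down by two).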
// func mixBlocksSSE2(out, a, b, c *block)
TEXT ·mixBlocksSSE2(SB), NOSPLIT, $0-32
	MOVQ out+0(FP), DX
	MOVQ a+8(FP), AX
	MOVQ b+16(FP), BX
	MOVQ c+24(FP), CX
	MOVQ $128, DI

loop:
	MOVOU 0(AX), X0
	MOVOU 0(BX), X1
	MOVOU 0(CX), X2
	PXOR  X1, X0
	PXOR  X2, X0
	MOVOU X0, 0(DX)
	ADDQ  $16, AX
	ADDQ  $16, BX
	ADDQ  $16, CX
	ADDQ  $16, DX
	SUBQ  $2, DI
	JA    loop
	RET
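
// xorBlocksSSE2 computes out ^= a ^ b ^ c for full 1024-byte blocks.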
// func xorBlocksSSE2(out, a, b, c *block)
TEXT ·xorBlocksSSE2(SB), NOSPLIT, $0-32
	MOVQ out+0(FP), DX
	MOVQ a+8(FP), AX
	MOVQ b+16(FP), BX
	MOVQ c+24(FP), CX
	MOVQ $128, DI

loop:
	MOVOU 0(AX), X0
	MOVOU 0(BX), X1
	MOVOU 0(CX), X2
	MOVOU 0(DX), X3
	PXOR  X1, X0
	PXOR  X2, X0
	PXOR  X3, X0
	MOVOU X0, 0(DX)
	ADDQ  $16, AX
	ADDQ  $16, BX
	ADDQ  $16, CX
	ADDQ  $16, DX
	SUBQ  $2, DI
	JA    loop
	RET