| // Copyright 2023 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| #include "textflag.h" |
| |
| // ChaCha8 is ChaCha with 8 rounds. |
| // See https://cr.yp.to/chacha/chacha-20080128.pdf. |
| // See chacha8_generic.go for additional details. |
| |
| // ROL performs a left rotation by N bits on every uint32 lane of register R. |
| // T is scratch: it ends up holding the lanes shifted left by N, while R is |
| // shifted right by 32-N; the two results have no bits in common and are |
| // merged back into R. |
| #define ROL(N, R, T) \ |
| MOVO R, T; \ |
| PSRLL $(32-(N)), R; \ |
| PSLLL $(N), T; \ |
| PXOR T, R |
| |
| // ROL16 and ROL8 rotate the uint32s in register R left by 16 and 8 bits. |
| // When GOAMD64_v2 is defined each one is a single PSHUFB byte shuffle driven |
| // by the rol16/rol8 index tables and T is unused; otherwise they expand to |
| // the shift-based ROL, which needs the temporary T. |
| #ifdef GOAMD64_v2 |
| #define ROL16(R, T) PSHUFB ·rol16<>(SB), R |
| #define ROL8(R, T) PSHUFB ·rol8<>(SB), R |
| #else |
| #define ROL16(R, T) ROL(16, R, T) |
| #define ROL8(R, T) ROL(8, R, T) |
| #endif |
| |
| // QR applies one ChaCha quarter-round to the state words A, B, C and D, |
| // using T as scratch. Each of its four steps is an add, an xor and a left |
| // rotation; the rotation amounts are 16, 12, 8 and 7 bits in that order. |
| #define QR(A, B, C, D, T) \ |
| PADDD B, A; PXOR A, D; ROL16(D, T); \ |
| PADDD D, C; PXOR C, B; ROL(12, B, T); \ |
| PADDD B, A; PXOR A, D; ROL8(D, T); \ |
| PADDD D, C; PXOR C, B; ROL(7, B, T) |
| |
| // REPLREG broadcasts the low uint32 of general-purpose register R |
| // into all four uint32 lanes of XR. |
| #define REPLREG(R, XR) \ |
| MOVQ R, XR; PSHUFD $0, XR, XR |
| |
| // REPL fills all four uint32 lanes of XR with the constant val. |
| // The constant is staged through DX, which is overwritten. |
| #define REPL(val, XR) \ |
| MOVL $val, DX; REPLREG(DX, XR) |
| |
| // SEED loads the off'th uint32 of the seed, addressed through AX, into the |
| // general-purpose register reg and then replicates it into all four stripes |
| // of XR. The scalar copy stays in reg afterwards. |
| #define SEED(off, reg, XR) \ |
| MOVL (4*off)(AX), reg; \ |
| REPLREG(reg, XR) |
| |
| // block runs 4 ChaCha8 block transformations in the four stripes of the X registers. |
| // |
| // The 16-word ChaCha state is kept one word per register in X0-X15; lane i of |
| // every register belongs to the block whose counter is counter+i. There is no |
| // spare XMM register, so state words 4 and 15 take turns living in the output |
| // buffer while their register serves as the quarter-round temporary. |
| // |
| // Output: blocks[w] holds state word w of all four blocks. Only the seed |
| // words (state words 4-11) have their initial value added back; words 0-3 |
| // and 12-15 are stored as they come out of the rounds. |
| // |
| // Scratch: the 16-byte stack frame, AX, CX, DX, DI, SI, R8-R13 and X0-X15. |
| |
| // func block(seed *[8]uint32, blocks *[16][4]uint32, counter uint32) |
| TEXT ·block<ABIInternal>(SB), NOSPLIT, $16 |
| // seed in AX |
| // blocks in BX |
| // counter in CX |
| |
| // Load initial constants into top row. |
| REPL(0x61707865, X0) |
| REPL(0x3320646e, X1) |
| REPL(0x79622d32, X2) |
| REPL(0x6b206574, X3) |
| |
| // Load counter into bottom left cell. |
| // Each stripe gets a different counter: counter+0, counter+1, counter+2, counter+3. |
| // (PINSRD is not available in GOAMD64_v1, |
| // so just do it in memory on all systems. |
| // This is not on the critical path.) |
| MOVL CX, 0(SP) |
| INCL CX |
| MOVL CX, 4(SP) |
| INCL CX |
| MOVL CX, 8(SP) |
| INCL CX |
| MOVL CX, 12(SP) |
| MOVOU 0(SP), X12 |
| |
| // Load seed words into next two rows and into DI, SI, R8..R13 |
| // The scalar copies are kept so the seed can be added back after the rounds. |
| SEED(0, DI, X4) |
| SEED(1, SI, X5) |
| SEED(2, R8, X6) |
| SEED(3, R9, X7) |
| SEED(4, R10, X8) |
| SEED(5, R11, X9) |
| SEED(6, R12, X10) |
| SEED(7, R13, X11) |
| |
| // Zeros for remaining two matrix entries. |
| // We have just enough XMM registers to hold the state, |
| // without one for the temporary, so we flush and restore |
| // some values to and from memory to provide a temporary. |
| // The initial temporary is X15, so zero its memory instead |
| // of X15 itself. |
| // (State word 13 is zeroed in X13 and word 14 in X14; word 15 is |
| // zeroed at blocks[15] by storing the zero X14.) |
| MOVL $0, DX |
| MOVQ DX, X13 |
| MOVQ DX, X14 |
| MOVOU X14, (15*16)(BX) |
| |
| // 4 iterations. Each iteration is 8 quarter-rounds. |
| // DX is the iteration counter. |
| MOVL $4, DX |
| loop: |
| // Column round: quarter-rounds on state words (0,4,8,12), (1,5,9,13), |
| // (2,6,10,14), (3,7,11,15). X15 is the temporary for the first two |
| // because word 15 currently lives at blocks[15]. Once word 4 has been |
| // through its quarter-round it is spilled to blocks[4], so X4 can take |
| // over as the temporary when word 15 is reloaded into X15. |
| QR(X0, X4, X8, X12, X15) |
| MOVOU X4, (4*16)(BX) // save X4 |
| QR(X1, X5, X9, X13, X15) |
| MOVOU (15*16)(BX), X15 // reload X15; temp now X4 |
| QR(X2, X6, X10, X14, X4) |
| QR(X3, X7, X11, X15, X4) |
| |
| // Diagonal round: quarter-rounds on state words (0,5,10,15), (1,6,11,12), |
| // (2,7,8,13), (3,4,9,14). Word 15 is finished after the first one and |
| // goes back to blocks[15]; word 4 is reloaded before the last one, |
| // which is the only diagonal quarter-round that uses it. |
| QR(X0, X5, X10, X15, X4) |
| MOVOU X15, (15*16)(BX) // save X15 |
| QR(X1, X6, X11, X12, X4) |
| MOVOU (4*16)(BX), X4 // reload X4; temp now X15 |
| QR(X2, X7, X8, X13, X15) |
| QR(X3, X4, X9, X14, X15) |
| |
| DECL DX |
| JNZ loop |
| |
| // Store interlaced blocks back to output buffer, |
| // adding original seed along the way. |
| |
| // First the top and bottom rows. |
| // These are written out without adding their initial values. |
| MOVOU X0, (0*16)(BX) |
| MOVOU X1, (1*16)(BX) |
| MOVOU X2, (2*16)(BX) |
| MOVOU X3, (3*16)(BX) |
| MOVOU X12, (12*16)(BX) |
| MOVOU X13, (13*16)(BX) |
| MOVOU X14, (14*16)(BX) |
| // X15 has already been stored. |
| |
| // Now we have X0-X3, X12-X15 available for temporaries. |
| // Add seed rows back to output. We left seed in DI, SI, R8..R13 above. |
| REPLREG(DI, X0) |
| REPLREG(SI, X1) |
| REPLREG(R8, X2) |
| REPLREG(R9, X3) |
| REPLREG(R10, X12) |
| REPLREG(R11, X13) |
| REPLREG(R12, X14) |
| REPLREG(R13, X15) |
| PADDD X0, X4 |
| PADDD X1, X5 |
| PADDD X2, X6 |
| PADDD X3, X7 |
| PADDD X12, X8 |
| PADDD X13, X9 |
| PADDD X14, X10 |
| PADDD X15, X11 |
| MOVOU X4, (4*16)(BX) |
| MOVOU X5, (5*16)(BX) |
| MOVOU X6, (6*16)(BX) |
| MOVOU X7, (7*16)(BX) |
| MOVOU X8, (8*16)(BX) |
| MOVOU X9, (9*16)(BX) |
| MOVOU X10, (10*16)(BX) |
| MOVOU X11, (11*16)(BX) |
| |
| // X15 was overwritten above (temporary, then seed word 7), so clear it. |
| MOVL $0, AX |
| MOVQ AX, X15 // must be 0 on return |
| |
| RET |
| |
| // rotate left 16 indexes for PSHUFB |
| // Result byte i is taken from source byte table[i]. Within each uint32 the |
| // source indexes are 2,3,0,1 (lowest byte first), i.e. the two 16-bit halves swap. |
| GLOBL ·rol16<>(SB), NOPTR|RODATA, $16 |
| DATA ·rol16<>+0(SB)/8, $0x0504070601000302 |
| DATA ·rol16<>+8(SB)/8, $0x0D0C0F0E09080B0A |
| |
| // rotate left 8 indexes for PSHUFB |
| // Result byte i is taken from source byte table[i]. Within each uint32 the |
| // source indexes are 3,0,1,2 (lowest byte first), so every byte moves up one |
| // position and the top byte wraps around to the bottom. |
| GLOBL ·rol8<>(SB), NOPTR|RODATA, $16 |
| DATA ·rol8<>+0(SB)/8, $0x0605040702010003 |
| DATA ·rol8<>+8(SB)/8, $0x0E0D0C0F0A09080B |