// blob: 73a1e5bf05f659ec14020eab8180f1bbdbecc65c [file] [log] [blame]
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "go_asm.h"
#include "textflag.h"
// chachaConst holds four 32-bit words. Stored little-endian, their bytes
// spell the ASCII string "expand 32-byte k":
//	0x61707865 = "expa", 0x3320646e = "nd 3",
//	0x79622d32 = "2-by", 0x6b206574 = "te k".
// block loads each word separately (offsets 0, 4, 8, 12) into V0-V3.
// The symbol is declared with a size of 32 bytes, but only the first
// 16 bytes are given DATA values here.
DATA ·chachaConst+0x00(SB)/4, $0x61707865
DATA ·chachaConst+0x04(SB)/4, $0x3320646e
DATA ·chachaConst+0x08(SB)/4, $0x79622d32
DATA ·chachaConst+0x0c(SB)/4, $0x6b206574
GLOBL ·chachaConst(SB), NOPTR|RODATA, $32
// chachaIncRot holds the 32-bit values 0, 1, 2, 3. block loads them as one
// 16-byte vector and adds that vector to the counter register, so the
// four 32-bit lanes of the counter row become counter+0 .. counter+3.
// The symbol is declared with a size of 32 bytes, but only the first
// 16 bytes are given DATA values here.
DATA ·chachaIncRot+0x00(SB)/4, $0x00000000
DATA ·chachaIncRot+0x04(SB)/4, $0x00000001
DATA ·chachaIncRot+0x08(SB)/4, $0x00000002
DATA ·chachaIncRot+0x0c(SB)/4, $0x00000003
GLOBL ·chachaIncRot(SB), NOPTR|RODATA, $32
// QR is the ChaCha8 quarter-round on a, b, c, and d.
//
// The arguments are vector registers, so a single expansion applies the
// quarter-round to every 32-bit lane at once. The last operand of each
// instruction is its destination, giving:
//
//	a += b; d ^= a; d = rotr32(d, 16)
//	c += d; b ^= c; b = rotr32(b, 20)
//	a += b; d ^= a; d = rotr32(d, 24)
//	c += d; b ^= c; b = rotr32(b, 25)
//
// On 32-bit lanes, right rotations by 16, 20, 24 and 25 are the same as
// left rotations by 16, 12, 8 and 7.
//
// All four arguments are modified in place; no other register is touched.
#define QR(a, b, c, d) \
VADDW a, b, a; \
VXORV d, a, d; \
VROTRW $16, d; \
VADDW c, d, c; \
VXORV b, c, b; \
VROTRW $20, b; \
VADDW a, b, a; \
VXORV d, a, d; \
VROTRW $24, d; \
VADDW c, d, c; \
VXORV b, c, b; \
VROTRW $25, b
// func block(seed *[8]uint32, blocks *[4][16]uint32, counter uint32)
//
// block runs the ChaCha8 rounds on four states at once, one per 32-bit
// vector lane, and writes the result to blocks. The four lanes differ only
// in their counter word: lane j uses counter+j.
//
// If the CPU does not report LSX support, block tail-jumps to
// block_generic with its arguments untouched.
//
// Register use on the LSX path:
//	R4       seed pointer (input)
//	R5       blocks pointer (input)
//	R6       counter (input)
//	R7       LSX feature flag, then the loop counter
//	R10, R11 addresses of chachaConst and chachaIncRot
//	V0-V3    constant words, each replicated to all lanes
//	V4-V11   seed words, each replicated to all lanes
//	V12      counter word (counter + lane index)
//	V13-V15  zero words
//	V20-V27  copy of the initial V4-V11, added back after the rounds
//	V30      lane offsets loaded from chachaIncRot
//
// Output layout: the sixteen state vectors V0..V15 are stored back to back,
// 16 bytes each, so state word i of all four lanes lives at blocks+16*i.
// Only the seed words (V4-V11) have their initial value added back; the
// other twelve... state vectors V0-V3 and V12-V15 are stored as the rounds
// left them.
TEXT ·block<ABIInternal>(SB), NOSPLIT, $0
	// seed in R4
	// blocks in R5
	// counter in R6

	// Without LSX, hand off to the generic implementation. This is a tail
	// jump: block_generic returns straight to our caller, so no
	// instruction after the JMP on this path would ever execute.
	MOVBU	internal∕cpu·Loong64+const_offsetLOONG64HasLSX(SB), R7
	BNE	R7, lsx_chacha8
	JMP	·block_generic<ABIInternal>(SB)

lsx_chacha8:
	MOVV	$·chachaConst(SB), R10
	MOVV	$·chachaIncRot(SB), R11

	// load constants: one 32-bit word per register, replicated to all lanes
	VMOVQ	(R10), V0.W4
	VMOVQ	4(R10), V1.W4
	VMOVQ	8(R10), V2.W4
	VMOVQ	12(R10), V3.W4

	// load the four 32-bit lane offsets (0, 1, 2, 3) from chachaIncRot;
	// they are added to the counter below
	VMOVQ	(R11), V30

	// load seed: one 32-bit word per register, replicated to all lanes
	VMOVQ	(R4), V4.W4
	VMOVQ	4(R4), V5.W4
	VMOVQ	8(R4), V6.W4
	VMOVQ	12(R4), V7.W4
	VMOVQ	16(R4), V8.W4
	VMOVQ	20(R4), V9.W4
	VMOVQ	24(R4), V10.W4
	VMOVQ	28(R4), V11.W4

	// replicate counter to all lanes, then add the lane offsets so that
	// lane j holds counter+j
	VMOVQ	R6, V12.W4
	VADDW	V12, V30, V12

	// zeros for remaining three matrix entries
	VXORV	V13, V13, V13
	VXORV	V14, V14, V14
	VXORV	V15, V15, V15

	// save seed state for adding back later
	// (V13 is zero here, so OR with it is a plain register copy)
	VORV	V4, V13, V20
	VORV	V5, V13, V21
	VORV	V6, V13, V22
	VORV	V7, V13, V23
	VORV	V8, V13, V24
	VORV	V9, V13, V25
	VORV	V10, V13, V26
	VORV	V11, V13, V27

	// 4 iterations. Each iteration is 8 quarter-rounds:
	// four on the columns, then four on the diagonals.
	MOVV	$4, R7
loop:
	QR(V0, V4, V8, V12)
	QR(V1, V5, V9, V13)
	QR(V2, V6, V10, V14)
	QR(V3, V7, V11, V15)

	QR(V0, V5, V10, V15)
	QR(V1, V6, V11, V12)
	QR(V2, V7, V8, V13)
	QR(V3, V4, V9, V14)

	SUBV	$1, R7
	BNE	R7, R0, loop

	// add seed back
	VADDW	V4, V20, V4
	VADDW	V5, V21, V5
	VADDW	V6, V22, V6
	VADDW	V7, V23, V7
	VADDW	V8, V24, V8
	VADDW	V9, V25, V9
	VADDW	V10, V26, V10
	VADDW	V11, V27, V11

	// store blocks back to output buffer, 16 bytes per state word
	VMOVQ	V0, (R5)
	VMOVQ	V1, 16(R5)
	VMOVQ	V2, 32(R5)
	VMOVQ	V3, 48(R5)
	VMOVQ	V4, 64(R5)
	VMOVQ	V5, 80(R5)
	VMOVQ	V6, 96(R5)
	VMOVQ	V7, 112(R5)
	VMOVQ	V8, 128(R5)
	VMOVQ	V9, 144(R5)
	VMOVQ	V10, 160(R5)
	VMOVQ	V11, 176(R5)
	VMOVQ	V12, 192(R5)
	VMOVQ	V13, 208(R5)
	VMOVQ	V14, 224(R5)
	VMOVQ	V15, 240(R5)
	RET