blob: 4c95dd215274a1453c69967c0528f51928f0dbea [file] [log] [blame]
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build (ppc64 || ppc64le) && !purego
// Based on CRYPTOGAMS code with the following comment:
// # ====================================================================
// # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
// # project. The module is, however, dual licensed under OpenSSL and
// # CRYPTOGAMS licenses depending on where you obtain it. For further
// # details see http://www.openssl.org/~appro/cryptogams/.
// # ====================================================================
// Original code can be found at the link below:
// https://github.com/dot-asm/cryptogams/blob/master/ppc/aesp8-ppc.pl
// Some function names were changed to be consistent with Go function
// names. For instance, function aes_p8_set_{en,de}crypt_key become
// set{En,De}cryptKeyAsm. I also split setEncryptKeyAsm in two parts
// and a new section was created (doEncryptKeyAsm). This was necessary to
// avoid arguments overwriting when setDecryptKeyAsm calls setEncryptKeyAsm.
// There were other modifications as well but kept the same functionality.
#include "textflag.h"
// For expandKeyAsm
#define INP R3 // Pointer to the not-yet-read bytes of the input key
#define BITS R4
#define OUTENC R5 // Pointer to next expanded encrypt key
#define PTR R6 // Pointer into the ·rcon constant table
#define CNT R7 // Loop count staged here before being moved to CTR
#define ROUNDS R8 // nr argument (round count)
#define OUTDEC R9 // Pointer to next expanded decrypt key
#define TEMP R19 // Index register used to reach the MASK vector in ·rcon
#define ZERO V0 // All-zero vector (cleared with VXOR)
#define IN0 V1 // First 16 key bytes, then the running key-schedule state
#define IN1 V2 // Remaining key bytes / second schedule state for 192- and 256-bit keys
#define KEY V3 // Per-round temporary fed through VCIPHERLAST/VSBOX
#define RCON V4 // Current round constant
#define MASK V5 // Permute control loaded from ·rcon+0x30
#define TMP V6 // Shifted copy of the schedule state
#define STAGE V7 // Staging vector used by the 192-bit path to assemble 16-byte round keys
#define OUTPERM V8
#define OUTMASK V9
#define OUTHEAD V10
#define OUTTAIL V11
// For P9 instruction emulation
#define ESPERM V21 // Endian swapping permute into BE
#define TMP2 V22 // Temporary clobbered by the POWER8 emulation of P8_STXVB16X
// For {en,de}cryptBlockAsm
// NOTE(review): encryptBlockAsm and decryptBlockAsm below use raw register
// names and load dst into R3 and src into R4, so BLK_INP/BLK_OUT do not
// describe them. These aliases appear to be unused in the visible code.
#define BLK_INP R3
#define BLK_OUT R4
#define BLK_KEY R5
#define BLK_ROUNDS R6
#define BLK_IDX R7
// ·rcon is an 80-byte read-only table made of five 16-byte vectors:
//   +0x00  byte permute loaded into ESPERM by the POWER8/ppc64le emulation macros
//   +0x10  initial round constant (0x01000000 in every 32-bit word)
//   +0x20  round constant 0x1b000000 in every word; expandKeyAsm reloads RCON
//          from here for the last two rounds of the 128-bit schedule
//   +0x30  permute control loaded into MASK by expandKeyAsm
//   +0x40  all zeros; not referenced by the code visible in this file
DATA ·rcon+0x00(SB)/8, $0x0f0e0d0c0b0a0908 // Permute for vector doubleword endian swap
DATA ·rcon+0x08(SB)/8, $0x0706050403020100
DATA ·rcon+0x10(SB)/8, $0x0100000001000000 // RCON
DATA ·rcon+0x18(SB)/8, $0x0100000001000000 // RCON
DATA ·rcon+0x20(SB)/8, $0x1b0000001b000000 // RCON for the final 128-bit rounds
DATA ·rcon+0x28(SB)/8, $0x1b0000001b000000
DATA ·rcon+0x30(SB)/8, $0x0d0e0f0c0d0e0f0c // MASK
DATA ·rcon+0x38(SB)/8, $0x0d0e0f0c0d0e0f0c // MASK
DATA ·rcon+0x40(SB)/8, $0x0000000000000000
DATA ·rcon+0x48(SB)/8, $0x0000000000000000
GLOBL ·rcon(SB), RODATA, $80
// Endian helper macros. Three configurations are selected at build time:
//   ppc64le + GOPPC64_power9: use LXVB16X/STXVB16X/XXBRD directly.
//   ppc64le without power9:   emulate them with LXVD2X/STXVD2X plus a VPERM
//                             through ESPERM; NEEDS_ESPERM is defined and
//                             SETUP_ESPERM must run before the other macros.
//   otherwise (big endian):   plain LXVD2X/STXVD2X; XXBRD_ON_LE and
//                             SETUP_ESPERM expand to nothing.
//
//   P8_LXVB16X(RA,RB,VT)   load 16 bytes from (RA+RB) into VT
//   P8_STXVB16X(VS,RA,RB)  store VS to (RA+RB); the emulated form clobbers TMP2
//   XXBRD_ON_LE(VA,VT)     byte-reverse each doubleword of VA into VT on LE only
//   SETUP_ESPERM(rtmp)     load ESPERM from ·rcon, clobbering rtmp (emulation only)
#ifdef GOARCH_ppc64le
# ifdef GOPPC64_power9
// POWER9 little endian: native instructions, no permute vector required.
#define P8_LXVB16X(RA,RB,VT) LXVB16X (RA+RB), VT
#define P8_STXVB16X(VS,RA,RB) STXVB16X VS, (RA+RB)
#define XXBRD_ON_LE(VA,VT) XXBRD VA, VT
#define SETUP_ESPERM(rtmp)
# else
// On POWER8/ppc64le, emulate the POWER9 instructions by loading unaligned
// doublewords and byte-swapping each doubleword to emulate BE load/stores.
#define NEEDS_ESPERM
#define P8_LXVB16X(RA,RB,VT) \
LXVD2X (RA+RB), VT \
VPERM VT, VT, ESPERM, VT
#define P8_STXVB16X(VS,RA,RB) \
VPERM VS, VS, ESPERM, TMP2 \
STXVD2X TMP2, (RA+RB)
#define XXBRD_ON_LE(VA,VT) \
VPERM VA, VA, ESPERM, VT
// Setup byte-swapping permute value in ESPERM for POWER9 instruction
// emulation macros.
#define SETUP_ESPERM(rtmp) \
MOVD $·rcon(SB), rtmp \
LVX (rtmp), ESPERM
# endif // defined(GOPPC64_power9)
#else
// Not ppc64le: no byte swapping is applied by these macros.
#define P8_LXVB16X(RA,RB,VT) LXVD2X (RA+RB), VT
#define P8_STXVB16X(VS,RA,RB) STXVD2X VS, (RA+RB)
#define XXBRD_ON_LE(VA, VT)
#define SETUP_ESPERM(rtmp)
#endif // defined(GOARCH_ppc64le)
// func expandKeyAsm(nr int, key *byte, enc *uint32, dec *uint32)
//
// Expands the AES key at key into round keys. Every round key is stored twice:
// in ascending order starting at enc, and in descending order into dec, so dec
// ends up holding the same round keys in reverse order.
//
// nr selects the schedule: nr < 12 runs the 128-bit path (loop128), nr == 12
// the 192-bit path (l192) and any larger value the 256-bit path (l256).
// Round keys are written with STXVD2X, so their in-memory byte order depends
// on the machine's endianness.
TEXT ·expandKeyAsm(SB), NOSPLIT|NOFRAME, $0
// Load the arguments inside the registers
MOVD nr+0(FP), ROUNDS
MOVD key+8(FP), INP
MOVD enc+16(FP), OUTENC
MOVD dec+24(FP), OUTDEC
#ifdef NEEDS_ESPERM
MOVD $·rcon(SB), PTR // PTR points to rcon addr
LVX (PTR), ESPERM
ADD $0x10, PTR
#else
MOVD $·rcon+0x10(SB), PTR // PTR points to rcon addr (skipping permute vector)
#endif
// Get key from memory and write aligned into VR
P8_LXVB16X(INP, R0, IN0)
ADD $0x10, INP, INP
MOVD $0x20, TEMP
// The result of this compare is consumed by the BLT/BEQ further down; the
// instructions in between do not modify CR0.
CMPW ROUNDS, $12
LVX (PTR)(R0), RCON // lvx 4,0,6 Load first 16 bytes into RCON
LVX (PTR)(TEMP), MASK
ADD $0x10, PTR, PTR // addi 6,6,0x10 PTR to next 16 bytes of RCON
MOVD $8, CNT // li 7,8 CNT = 8
VXOR ZERO, ZERO, ZERO // vxor 0,0,0 Zero to be zero :)
MOVD CNT, CTR // mtctr 7 Set the counter to 8 (rounds)
// The expanded decrypt key is the expanded encrypt key stored in reverse order.
// Move OUTDEC to the last key location, and store in descending order.
// The offset is 160 for the 128-bit path, 192 for l192 and 224 for l256.
ADD $160, OUTDEC, OUTDEC
BLT loop128
ADD $32, OUTDEC, OUTDEC
BEQ l192
ADD $32, OUTDEC, OUTDEC
JMP l256
loop128:
// Key schedule (Round 1 to 8)
VPERM IN0, IN0, MASK, KEY // vperm 3,1,1,5 Rotate-n-splat
VSLDOI $12, ZERO, IN0, TMP // vsldoi 6,0,1,12
STXVD2X IN0, (R0+OUTENC)
STXVD2X IN0, (R0+OUTDEC)
VCIPHERLAST KEY, RCON, KEY // vcipherlast 3,3,4
ADD $16, OUTENC, OUTENC
ADD $-16, OUTDEC, OUTDEC
VXOR IN0, TMP, IN0 // vxor 1,1,6
VSLDOI $12, ZERO, TMP, TMP // vsldoi 6,0,6,12
VXOR IN0, TMP, IN0 // vxor 1,1,6
VSLDOI $12, ZERO, TMP, TMP // vsldoi 6,0,6,12
VXOR IN0, TMP, IN0 // vxor 1,1,6
VADDUWM RCON, RCON, RCON // vadduwm 4,4,4
VXOR IN0, KEY, IN0 // vxor 1,1,3
BDNZ loop128
LVX (PTR)(R0), RCON // lvx 4,0,6 Last two round keys
// Key schedule (Round 9)
VPERM IN0, IN0, MASK, KEY // vperm 3,1,1,5 Rotate-n-splat
VSLDOI $12, ZERO, IN0, TMP // vsldoi 6,0,1,12
STXVD2X IN0, (R0+OUTENC)
STXVD2X IN0, (R0+OUTDEC)
VCIPHERLAST KEY, RCON, KEY // vcipherlast 3,3,4
ADD $16, OUTENC, OUTENC
ADD $-16, OUTDEC, OUTDEC
// Key schedule (Round 10)
VXOR IN0, TMP, IN0 // vxor 1,1,6
VSLDOI $12, ZERO, TMP, TMP // vsldoi 6,0,6,12
VXOR IN0, TMP, IN0 // vxor 1,1,6
VSLDOI $12, ZERO, TMP, TMP // vsldoi 6,0,6,12
VXOR IN0, TMP, IN0 // vxor 1,1,6
VADDUWM RCON, RCON, RCON // vadduwm 4,4,4
VXOR IN0, KEY, IN0 // vxor 1,1,3
VPERM IN0, IN0, MASK, KEY // vperm 3,1,1,5 Rotate-n-splat
VSLDOI $12, ZERO, IN0, TMP // vsldoi 6,0,1,12
STXVD2X IN0, (R0+OUTENC)
STXVD2X IN0, (R0+OUTDEC)
VCIPHERLAST KEY, RCON, KEY // vcipherlast 3,3,4
ADD $16, OUTENC, OUTENC
ADD $-16, OUTDEC, OUTDEC
// Key schedule (Round 11)
VXOR IN0, TMP, IN0 // vxor 1,1,6
VSLDOI $12, ZERO, TMP, TMP // vsldoi 6,0,6,12
VXOR IN0, TMP, IN0 // vxor 1,1,6
VSLDOI $12, ZERO, TMP, TMP // vsldoi 6,0,6,12
VXOR IN0, TMP, IN0 // vxor 1,1,6
VXOR IN0, KEY, IN0 // vxor 1,1,3
STXVD2X IN0, (R0+OUTENC)
STXVD2X IN0, (R0+OUTDEC)
RET
// 192-bit key path: the first 16 key bytes are already in IN0; the remaining
// 8 bytes go into IN1. Each of the 4 loop iterations stores three round keys.
l192:
LXSDX (INP+R0), IN1 // Load next 8 bytes into upper half of VSR.
XXBRD_ON_LE(IN1, IN1) // and convert to BE ordering on LE hosts.
MOVD $4, CNT // li 7,4
STXVD2X IN0, (R0+OUTENC)
STXVD2X IN0, (R0+OUTDEC)
ADD $16, OUTENC, OUTENC
ADD $-16, OUTDEC, OUTDEC
VSPLTISB $8, KEY // vspltisb 3,8
MOVD CNT, CTR // mtctr 7
VSUBUBM MASK, KEY, MASK // vsububm 5,5,3 Subtract 8 from every byte of MASK
loop192:
VPERM IN1, IN1, MASK, KEY // vperm 3,2,2,5
VSLDOI $12, ZERO, IN0, TMP // vsldoi 6,0,1,12
VCIPHERLAST KEY, RCON, KEY // vcipherlast 3,3,4
VXOR IN0, TMP, IN0 // vxor 1,1,6
VSLDOI $12, ZERO, TMP, TMP // vsldoi 6,0,6,12
VXOR IN0, TMP, IN0 // vxor 1,1,6
VSLDOI $12, ZERO, TMP, TMP // vsldoi 6,0,6,12
VXOR IN0, TMP, IN0 // vxor 1,1,6
VSLDOI $8, ZERO, IN1, STAGE // vsldoi 7,0,2,8
VSPLTW $3, IN0, TMP // vspltw 6,1,3
VXOR TMP, IN1, TMP // vxor 6,6,2
VSLDOI $12, ZERO, IN1, IN1 // vsldoi 2,0,2,12
VADDUWM RCON, RCON, RCON // vadduwm 4,4,4
VXOR IN1, TMP, IN1 // vxor 2,2,6
VXOR IN0, KEY, IN0 // vxor 1,1,3
VXOR IN1, KEY, IN1 // vxor 2,2,3
VSLDOI $8, STAGE, IN0, STAGE // vsldoi 7,7,1,8
VPERM IN1, IN1, MASK, KEY // vperm 3,2,2,5
VSLDOI $12, ZERO, IN0, TMP // vsldoi 6,0,1,12
STXVD2X STAGE, (R0+OUTENC)
STXVD2X STAGE, (R0+OUTDEC)
VCIPHERLAST KEY, RCON, KEY // vcipherlast 3,3,4
ADD $16, OUTENC, OUTENC
ADD $-16, OUTDEC, OUTDEC
VSLDOI $8, IN0, IN1, STAGE // vsldoi 7,1,2,8
VXOR IN0, TMP, IN0 // vxor 1,1,6
VSLDOI $12, ZERO, TMP, TMP // vsldoi 6,0,6,12
STXVD2X STAGE, (R0+OUTENC)
STXVD2X STAGE, (R0+OUTDEC)
VXOR IN0, TMP, IN0 // vxor 1,1,6
VSLDOI $12, ZERO, TMP, TMP // vsldoi 6,0,6,12
VXOR IN0, TMP, IN0 // vxor 1,1,6
ADD $16, OUTENC, OUTENC
ADD $-16, OUTDEC, OUTDEC
VSPLTW $3, IN0, TMP // vspltw 6,1,3
VXOR TMP, IN1, TMP // vxor 6,6,2
VSLDOI $12, ZERO, IN1, IN1 // vsldoi 2,0,2,12
VADDUWM RCON, RCON, RCON // vadduwm 4,4,4
VXOR IN1, TMP, IN1 // vxor 2,2,6
VXOR IN0, KEY, IN0 // vxor 1,1,3
VXOR IN1, KEY, IN1 // vxor 2,2,3
STXVD2X IN0, (R0+OUTENC)
STXVD2X IN0, (R0+OUTDEC)
ADD $16, OUTENC, OUTENC
ADD $-16, OUTDEC, OUTDEC
BDNZ loop192
RET
// 256-bit key path: the second 16 key bytes are loaded into IN1. CTR is set to
// 7; each iteration stores two round keys (IN1, then the updated IN0), and the
// BDZ exits after the seventh pair has been stored.
l256:
P8_LXVB16X(INP, R0, IN1)
MOVD $7, CNT // li 7,7
STXVD2X IN0, (R0+OUTENC)
STXVD2X IN0, (R0+OUTDEC)
ADD $16, OUTENC, OUTENC
ADD $-16, OUTDEC, OUTDEC
MOVD CNT, CTR // mtctr 7
loop256:
VPERM IN1, IN1, MASK, KEY // vperm 3,2,2,5
VSLDOI $12, ZERO, IN0, TMP // vsldoi 6,0,1,12
STXVD2X IN1, (R0+OUTENC)
STXVD2X IN1, (R0+OUTDEC)
VCIPHERLAST KEY, RCON, KEY // vcipherlast 3,3,4
ADD $16, OUTENC, OUTENC
ADD $-16, OUTDEC, OUTDEC
VXOR IN0, TMP, IN0 // vxor 1,1,6
VSLDOI $12, ZERO, TMP, TMP // vsldoi 6,0,6,12
VXOR IN0, TMP, IN0 // vxor 1,1,6
VSLDOI $12, ZERO, TMP, TMP // vsldoi 6,0,6,12
VXOR IN0, TMP, IN0 // vxor 1,1,6
VADDUWM RCON, RCON, RCON // vadduwm 4,4,4
VXOR IN0, KEY, IN0 // vxor 1,1,3
STXVD2X IN0, (R0+OUTENC)
STXVD2X IN0, (R0+OUTDEC)
ADD $16, OUTENC, OUTENC
ADD $-16, OUTDEC, OUTDEC
BDZ done
VSPLTW $3, IN0, KEY // vspltw 3,1,3
VSLDOI $12, ZERO, IN1, TMP // vsldoi 6,0,2,12
VSBOX KEY, KEY // vsbox 3,3
VXOR IN1, TMP, IN1 // vxor 2,2,6
VSLDOI $12, ZERO, TMP, TMP // vsldoi 6,0,6,12
VXOR IN1, TMP, IN1 // vxor 2,2,6
VSLDOI $12, ZERO, TMP, TMP // vsldoi 6,0,6,12
VXOR IN1, TMP, IN1 // vxor 2,2,6
VXOR IN1, KEY, IN1 // vxor 2,2,3
JMP loop256 // b .Loop256
done:
RET
// func encryptBlockAsm(nr int, xk *uint32, dst, src *byte)
//
// Encrypts the single 16-byte block at src with the expanded key at xk and
// stores the result at dst. nr is the round count and must be 10, 12 or 14;
// any other value ends up at Linvalid_key_len, which deliberately faults.
//
// Register use:
//   R3      dst pointer
//   R4      src pointer
//   R5      xk pointer (advanced by 112 part-way through)
//   R6      nr on entry, then the constant 16
//   R7-R12  constant offsets 32..112 (R7 is first used by SETUP_ESPERM)
//   V0      cipher state, V1/V2 round keys, V2 final result
//   CR1/CR2/CR3 EQ bits record nr == 10 / 12 / 14
TEXT ·encryptBlockAsm(SB), NOSPLIT|NOFRAME, $0
	MOVD	nr+0(FP), R6	// Round count/Key size
	MOVD	xk+8(FP), R5	// Key pointer
	MOVD	dst+16(FP), R3	// Dest pointer
	MOVD	src+24(FP), R4	// Src pointer
	SETUP_ESPERM(R7)
	// Set CR{1,2,3}EQ to hold the key size information.
	// This must happen before R6 is reused as an offset below.
	CMPU	R6, $10, CR1
	CMPU	R6, $12, CR2
	CMPU	R6, $14, CR3
	MOVD	$16, R6
	MOVD	$32, R7
	MOVD	$48, R8
	MOVD	$64, R9
	MOVD	$80, R10
	MOVD	$96, R11
	MOVD	$112, R12
	// Load text in BE order
	P8_LXVB16X(R4, R0, V0)
	// V1, V2 will hold keys, V0 is a temp.
	// At completion, V2 will hold the ciphertext.
	// Load xk[0:3] and xor with text
	LXVD2X	(R0+R5), V1
	VXOR	V0, V1, V0
	// Load xk[4:11] and cipher
	LXVD2X	(R6+R5), V1
	LXVD2X	(R7+R5), V2
	VCIPHER	V0, V1, V0
	VCIPHER	V0, V2, V0
	// Load xk[12:19] and cipher
	LXVD2X	(R8+R5), V1
	LXVD2X	(R9+R5), V2
	VCIPHER	V0, V1, V0
	VCIPHER	V0, V2, V0
	// Load xk[20:27] and cipher
	LXVD2X	(R10+R5), V1
	LXVD2X	(R11+R5), V2
	VCIPHER	V0, V1, V0
	VCIPHER	V0, V2, V0
	// Increment xk pointer to reuse constant offsets in R6-R12.
	ADD	$112, R5
	// Load xk[28:35] and cipher
	LXVD2X	(R0+R5), V1
	LXVD2X	(R6+R5), V2
	VCIPHER	V0, V1, V0
	VCIPHER	V0, V2, V0
	// Load xk[36:43]; these are the last two round keys when nr == 10.
	LXVD2X	(R7+R5), V1
	LXVD2X	(R8+R5), V2
	BEQ	CR1, Lenc_tail	// Key size 10?
	VCIPHER	V0, V1, V0
	VCIPHER	V0, V2, V0
	// Load xk[44:51]; these are the last two round keys when nr == 12.
	LXVD2X	(R9+R5), V1
	LXVD2X	(R10+R5), V2
	BEQ	CR2, Lenc_tail	// Key size 12?
	VCIPHER	V0, V1, V0
	VCIPHER	V0, V2, V0
	// Load xk[52:59]; these are the last two round keys when nr == 14.
	LXVD2X	(R11+R5), V1
	LXVD2X	(R12+R5), V2
	BNE	CR3, Linvalid_key_len	// Not key size 14?
	// Fallthrough to final cipher
Lenc_tail:
	// Cipher last two keys such that key information is
	// cleared from V1 and V2.
	VCIPHER	V0, V1, V1
	VCIPHERLAST	V1, V2, V2
	// Store the result in BE order.
	P8_STXVB16X(V2, R3, R0)
	RET
Linvalid_key_len:
	// Segfault, this should never happen. Only 3 keys sizes are created/used.
	MOVD	R0, 0(R0)
	RET
// func decryptBlockAsm(nr int, xk *uint32, dst, src *byte)
//
// Decrypts the single 16-byte block at src with the expanded key at xk and
// stores the result at dst. The round keys are consumed in ascending address
// order, the same as in encryptBlockAsm, but with VNCIPHER/VNCIPHERLAST.
// nr must be 10, 12 or 14; any other value ends up at Linvalid_key_len,
// which deliberately faults.
//
// Register use: R3 = dst, R4 = src, R5 = xk (advanced by 112 part-way
// through), R6 = nr then the constant 16, R7-R12 = constant offsets 32..112.
// V0 is the cipher state, V1/V2 hold round keys, V2 holds the final result.
TEXT ·decryptBlockAsm(SB), NOSPLIT|NOFRAME, $0
MOVD nr+0(FP), R6 // Round count/Key size
MOVD xk+8(FP), R5 // Key pointer
MOVD dst+16(FP), R3 // Dest pointer
MOVD src+24(FP), R4 // Src pointer
SETUP_ESPERM(R7)
// Set CR{1,2,3}EQ to hold the key size information.
// This must happen before R6 is reused as an offset below.
CMPU R6, $10, CR1
CMPU R6, $12, CR2
CMPU R6, $14, CR3
MOVD $16, R6
MOVD $32, R7
MOVD $48, R8
MOVD $64, R9
MOVD $80, R10
MOVD $96, R11
MOVD $112, R12
// Load text in BE order
P8_LXVB16X(R4, R0, V0)
// V1, V2 will hold keys, V0 is a temp.
// At completion, V2 will hold the text.
// Load xk[0:3] and xor with ciphertext
LXVD2X (R0+R5), V1
VXOR V0, V1, V0
// Load xk[4:11] and cipher
LXVD2X (R6+R5), V1
LXVD2X (R7+R5), V2
VNCIPHER V0, V1, V0
VNCIPHER V0, V2, V0
// Load xk[12:19] and cipher
LXVD2X (R8+R5), V1
LXVD2X (R9+R5), V2
VNCIPHER V0, V1, V0
VNCIPHER V0, V2, V0
// Load xk[20:27] and cipher
LXVD2X (R10+R5), V1
LXVD2X (R11+R5), V2
VNCIPHER V0, V1, V0
VNCIPHER V0, V2, V0
// Increment xk pointer to reuse constant offsets in R6-R12.
ADD $112, R5
// Load xk[28:35] and cipher
LXVD2X (R0+R5), V1
LXVD2X (R6+R5), V2
VNCIPHER V0, V1, V0
VNCIPHER V0, V2, V0
// Load xk[36:43]; these are the last two round keys when nr == 10.
LXVD2X (R7+R5), V1
LXVD2X (R8+R5), V2
BEQ CR1, Ldec_tail // Key size 10?
VNCIPHER V0, V1, V0
VNCIPHER V0, V2, V0
// Load xk[44:51]; these are the last two round keys when nr == 12.
LXVD2X (R9+R5), V1
LXVD2X (R10+R5), V2
BEQ CR2, Ldec_tail // Key size 12?
VNCIPHER V0, V1, V0
VNCIPHER V0, V2, V0
// Load xk[52:59]; these are the last two round keys when nr == 14.
LXVD2X (R11+R5), V1
LXVD2X (R12+R5), V2
BNE CR3, Linvalid_key_len // Not key size 14?
// Fallthrough to final cipher
Ldec_tail:
// Cipher last two keys such that key information is
// cleared from V1 and V2.
VNCIPHER V0, V1, V1
VNCIPHERLAST V1, V2, V2
// Store the result in BE order.
P8_STXVB16X(V2, R3, R0)
RET
Linvalid_key_len:
// Segfault, this should never happen. Only 3 keys sizes are created/used.
MOVD R0, 0(R0)
RET
// Remove defines from above so they can be defined here
#undef INP
#undef OUTENC
#undef ROUNDS
#undef KEY
#undef TMP
// Register assignments for cryptBlocksChain.
#define INP R3 // src pointer, advanced by 16 per block
#define OUTP R4 // dst pointer, advanced by 16 per block
#define LEN R5 // length in bytes, shifted down to a block count
#define KEYP R6 // expanded key pointer (clobbered by LOAD_KEY)
#define ROUNDS R7 // nr argument (round count)
#define IVP R8 // iv pointer; the final chaining value is stored back here
#define ENC R9 // enc argument; zero selects decryption
#define INOUT V2 // block being processed / result to store
#define TMP V3 // ciphertext block kept as the next chaining value when decrypting
#define IVEC V4 // current chaining value
// Load the crypt key into VSRs.
//
// The expanded key is stored and loaded using
// STXVD2X/LXVD2X. The in-memory byte ordering
// depends on the endianness of the machine. The
// expanded keys are generated by expandKeyAsm above.
//
// Rkeyp holds the key pointer. It is clobbered. Once
// the expanded keys are loaded, it is not needed.
//
// The caller must have set CR1EQ and CR2EQ beforehand: CR1EQ selects the
// 10-round layout, CR2EQ the 12-round layout, and otherwise the 14-round
// layout is loaded.
//
// R12,R14-R21 are scratch registers; they are left holding the offsets
// 16, 32, ..., 144.
// For 10 rounds, V6, V11-V20 hold the expanded key.
// For 12 rounds, V6, V9-V20 hold the expanded key.
// For 14 rounds, V6, V7-V20 hold the expanded key.
//
// The macro body defines the labels L_start10 and L_start12, so it can be
// expanded at most once per function.
#define LOAD_KEY(Rkeyp) \
MOVD $16, R12 \
MOVD $32, R14 \
MOVD $48, R15 \
MOVD $64, R16 \
MOVD $80, R17 \
MOVD $96, R18 \
MOVD $112, R19 \
MOVD $128, R20 \
MOVD $144, R21 \
LXVD2X (R0+Rkeyp), V6 \
ADD $16, Rkeyp \
BEQ CR1, L_start10 \
BEQ CR2, L_start12 \
LXVD2X (R0+Rkeyp), V7 \
LXVD2X (R12+Rkeyp), V8 \
ADD $32, Rkeyp \
L_start12: \
LXVD2X (R0+Rkeyp), V9 \
LXVD2X (R12+Rkeyp), V10 \
ADD $32, Rkeyp \
L_start10: \
LXVD2X (R0+Rkeyp), V11 \
LXVD2X (R12+Rkeyp), V12 \
LXVD2X (R14+Rkeyp), V13 \
LXVD2X (R15+Rkeyp), V14 \
LXVD2X (R16+Rkeyp), V15 \
LXVD2X (R17+Rkeyp), V16 \
LXVD2X (R18+Rkeyp), V17 \
LXVD2X (R19+Rkeyp), V18 \
LXVD2X (R20+Rkeyp), V19 \
LXVD2X (R21+Rkeyp), V20
// Perform aes cipher operation for keysize 10/12/14 using the keys
// loaded by LOAD_KEY, and key size information held in CR1EQ/CR2EQ.
//
// Parameters:
//   Vin      input vector
//   Vxor     vector xor-ed with Vin before the first round
//   Vout     result vector (may be the same register as Vin)
//   vcipher  round instruction (VCIPHER or VNCIPHER)
//   vciphel  final-round instruction (VCIPHERLAST or VNCIPHERLAST)
//   label10  label defined by the macro; entry point of the 10-round tail
//   label12  label defined by the macro; entry point of the 12-round tail
// label10 and label12 must be unique for every expansion within a function.
//
// Vxor is ideally V6 (Key[0-3]), but for slightly improved encrypting
// performance V6 and IVEC can be swapped (xor is both associative and
// commutative) during encryption:
//
//	VXOR INOUT, IVEC, INOUT
//	VXOR INOUT, V6, INOUT
//
//	into
//
//	VXOR INOUT, V6, INOUT
//	VXOR INOUT, IVEC, INOUT
//
// The last line of the body must not end in a continuation backslash,
// otherwise whatever follows the macro becomes part of its body.
#define CIPHER_BLOCK(Vin, Vxor, Vout, vcipher, vciphel, label10, label12) \
	VXOR	Vin, Vxor, Vout \
	BEQ	CR1, label10 \
	BEQ	CR2, label12 \
	vcipher	Vout, V7, Vout \
	vcipher	Vout, V8, Vout \
label12: \
	vcipher	Vout, V9, Vout \
	vcipher	Vout, V10, Vout \
label10: \
	vcipher	Vout, V11, Vout \
	vcipher	Vout, V12, Vout \
	vcipher	Vout, V13, Vout \
	vcipher	Vout, V14, Vout \
	vcipher	Vout, V15, Vout \
	vcipher	Vout, V16, Vout \
	vcipher	Vout, V17, Vout \
	vcipher	Vout, V18, Vout \
	vcipher	Vout, V19, Vout \
	vciphel	Vout, V20, Vout
// Zero V6-V20, the registers LOAD_KEY fills with the expanded key, so no key
// material is left in vector registers when the caller returns.
#define CLEAR_KEYS() \
VXOR V6, V6, V6 \
VXOR V7, V7, V7 \
VXOR V8, V8, V8 \
VXOR V9, V9, V9 \
VXOR V10, V10, V10 \
VXOR V11, V11, V11 \
VXOR V12, V12, V12 \
VXOR V13, V13, V13 \
VXOR V14, V14, V14 \
VXOR V15, V15, V15 \
VXOR V16, V16, V16 \
VXOR V17, V17, V17 \
VXOR V18, V18, V18 \
VXOR V19, V19, V19 \
VXOR V20, V20, V20
//func cryptBlocksChain(src, dst *byte, length int, key *uint32, iv *byte, enc int, nr int)
//
// CBC-mode processing of length bytes from src to dst using the expanded key
// at key. enc == 0 selects decryption, any other value encryption. The
// chaining value is read from iv and the final chaining value is written back
// to iv before returning. The key registers are zeroed on exit.
//
// The block count is length >> 4 and is placed in CTR; the loops are
// bottom-tested with BDNZ, so length must be a non-zero multiple of 16.
TEXT ·cryptBlocksChain(SB), NOSPLIT|NOFRAME, $0
MOVD src+0(FP), INP
MOVD dst+8(FP), OUTP
MOVD length+16(FP), LEN
MOVD key+24(FP), KEYP
MOVD iv+32(FP), IVP
MOVD enc+40(FP), ENC
MOVD nr+48(FP), ROUNDS
SETUP_ESPERM(R11)
// Assume len > 0 && len % blockSize == 0.
// The result of this compare (CR0) is consumed by the BEQ Lcbc_dec below;
// nothing in between modifies CR0.
CMPW ENC, $0
P8_LXVB16X(IVP, R0, IVEC)
CMPU ROUNDS, $10, CR1
CMPU ROUNDS, $12, CR2 // Only sizes 10/12/14 are supported.
// Setup key in VSRs, and set loop count in CTR.
LOAD_KEY(KEYP)
SRD $4, LEN
MOVD LEN, CTR
BEQ Lcbc_dec
PCALIGN $16
Lcbc_enc:
P8_LXVB16X(INP, R0, INOUT)
ADD $16, INP
// Xor with the first round key here and with IVEC inside CIPHER_BLOCK.
VXOR INOUT, V6, INOUT
CIPHER_BLOCK(INOUT, IVEC, INOUT, VCIPHER, VCIPHERLAST, Lcbc_enc10, Lcbc_enc12)
VOR INOUT, INOUT, IVEC // ciphertext (INOUT) is IVEC for next block.
P8_STXVB16X(INOUT, OUTP, R0)
ADD $16, OUTP
BDNZ Lcbc_enc
// Write the last ciphertext block back as the updated IV.
P8_STXVB16X(INOUT, IVP, R0)
CLEAR_KEYS()
RET
PCALIGN $16
Lcbc_dec:
P8_LXVB16X(INP, R0, TMP)
ADD $16, INP
CIPHER_BLOCK(TMP, V6, INOUT, VNCIPHER, VNCIPHERLAST, Lcbc_dec10, Lcbc_dec12)
// Xor the deciphered block with the previous chaining value.
VXOR INOUT, IVEC, INOUT
VOR TMP, TMP, IVEC // TMP is IVEC for next block.
P8_STXVB16X(INOUT, OUTP, R0)
ADD $16, OUTP
BDNZ Lcbc_dec
// Write the last ciphertext block back as the updated IV.
P8_STXVB16X(IVEC, IVP, R0)
CLEAR_KEYS()
RET
// DOn_CIPHER load the next 16-byte round key from (key) into keyv, advance
// key by 16, and apply op with that round key to each of the n state vectors
// iv0..iv(n-1). op is used as a three-operand vector instruction
// (VXOR, VCIPHER or VCIPHERLAST in this file).
#define DO1_CIPHER(iv0, keyv, key, op) \
LXVD2X (key), keyv \
ADD $16, key \
op iv0, keyv, iv0
#define DO2_CIPHER(iv0, iv1, keyv, key, op) \
DO1_CIPHER(iv0, keyv, key, op) \
op iv1, keyv, iv1
#define DO4_CIPHER(iv0, iv1, iv2, iv3, keyv, key, op) \
DO2_CIPHER(iv0, iv1, keyv, key, op) \
op iv2, keyv, iv2 \
op iv3, keyv, iv3
#define DO8_CIPHER(iv0, iv1, iv2, iv3, iv4, iv5, iv6, iv7, keyv, key, op) \
DO4_CIPHER(iv0, iv1, iv2, iv3, keyv, key, op) \
op iv4, keyv, iv4 \
op iv5, keyv, iv5 \
op iv6, keyv, iv6 \
op iv7, keyv, iv7
// XOR_STORE xors src with iv into V8 and stores V8 at (dstp+dstpoff) via
// P8_STXVB16X. V8 is clobbered.
#define XOR_STORE(src, iv, dstp, dstpoff) \
XXLXOR src, iv, V8 \
P8_STXVB16X(V8,dstp,dstpoff)
//func ctrBlocks1Asm(nr int, xk *[60]uint32, dst, src *[1 * BlockSize]byte, ivlo, ivhi uint64)
//
// Enciphers the 128-bit counter formed from ivhi (high doubleword) and ivlo
// (low doubleword) with the expanded key xk, xors the result with the
// 16-byte block at src and stores it at dst.
// nr == 12 enters at key_12, nr < 12 at key_10, anything else at key_14.
TEXT ·ctrBlocks1Asm(SB), NOSPLIT|NOFRAME, $0
// CTRBLOCK_PROLOGUE is shared by all ctrBlocksNAsm functions. It loads
// R3 = nr, R4 = xk, R5 = dst, R6 = src, compares nr with 12 into CR1,
// builds the counter in V0 and runs SETUP_ESPERM (clobbering R8).
#define CTRBLOCK_PROLOGUE \
MOVD nr+0(FP), R3 \
MOVD xk+8(FP), R4 \
MOVD dst+16(FP), R5 \
MOVD src+24(FP), R6 \
MOVD ivlo+32(FP), R8 \
MOVD ivhi+40(FP), R9 \
CMP R3, $12, CR1 \
MTVSRD R8, V0 \
MTVSRD R9, V1 \
XXPERMDI V1, V0, $0, V0 \
SETUP_ESPERM(R8)
CTRBLOCK_PROLOGUE
// Initial AddRoundKey: xor the counter with the first round key.
DO1_CIPHER(V0,V8,R4,VXOR)
BEQ CR1, key_12
BLT CR1, key_10
key_14:
DO1_CIPHER(V0,V8,R4,VCIPHER)
DO1_CIPHER(V0,V8,R4,VCIPHER)
key_12:
DO1_CIPHER(V0,V8,R4,VCIPHER)
DO1_CIPHER(V0,V8,R4,VCIPHER)
key_10:
// Load the source block into V9 while the remaining rounds run.
P8_LXVB16X(R6,R0,V9)
DO1_CIPHER(V0,V8,R4,VCIPHER)
DO1_CIPHER(V0,V8,R4,VCIPHER)
DO1_CIPHER(V0,V8,R4,VCIPHER)
DO1_CIPHER(V0,V8,R4,VCIPHER)
DO1_CIPHER(V0,V8,R4,VCIPHER)
DO1_CIPHER(V0,V8,R4,VCIPHER)
DO1_CIPHER(V0,V8,R4,VCIPHER)
DO1_CIPHER(V0,V8,R4,VCIPHER)
DO1_CIPHER(V0,V8,R4,VCIPHER)
DO1_CIPHER(V0,V8,R4,VCIPHERLAST)
XOR_STORE(V9,V0,R5,R0)
RET
//func ctrBlocks2Asm(nr int, xk *[60]uint32, dst, src *[2 * BlockSize]byte, ivlo, ivhi uint64)
//
// Two-block variant of ctrBlocks1Asm: enciphers the counters IV and IV+1
// (128-bit addition), xors them with the two 16-byte blocks at src and
// stores the results at dst.
TEXT ·ctrBlocks2Asm(SB), NOSPLIT|NOFRAME, $0
CTRBLOCK_PROLOGUE
XXLEQV V8, V8, V8 // V8 is -1
VSUBUQM V0, V8, V1 // Vi = IV + i (as IV - (-1))
// Initial AddRoundKey; V8 is reused as the round-key register from here on.
DO2_CIPHER(V0,V1,V8,R4,VXOR)
BEQ CR1, key_12
BLT CR1, key_10
key_14:
DO2_CIPHER(V0,V1,V8,R4,VCIPHER)
DO2_CIPHER(V0,V1,V8,R4,VCIPHER)
key_12:
DO2_CIPHER(V0,V1,V8,R4,VCIPHER)
DO2_CIPHER(V0,V1,V8,R4,VCIPHER)
key_10:
// Source blocks are loaded into V9/V10 in between the cipher rounds.
P8_LXVB16X(R6,R0,V9)
DO2_CIPHER(V0,V1,V8,R4,VCIPHER)
MOVD $16, R8
P8_LXVB16X(R6,R8,V10)
DO2_CIPHER(V0,V1,V8,R4,VCIPHER)
DO2_CIPHER(V0,V1,V8,R4,VCIPHER)
DO2_CIPHER(V0,V1,V8,R4,VCIPHER)
DO2_CIPHER(V0,V1,V8,R4,VCIPHER)
DO2_CIPHER(V0,V1,V8,R4,VCIPHER)
DO2_CIPHER(V0,V1,V8,R4,VCIPHER)
DO2_CIPHER(V0,V1,V8,R4,VCIPHER)
DO2_CIPHER(V0,V1,V8,R4,VCIPHER)
DO2_CIPHER(V0,V1,V8,R4,VCIPHERLAST)
XOR_STORE(V9,V0,R5,R0)
XOR_STORE(V10,V1,R5,R8)
RET
//func ctrBlocks4Asm(nr int, xk *[60]uint32, dst, src *[4 * BlockSize]byte, ivlo, ivhi uint64)
//
// Four-block variant of ctrBlocks1Asm: enciphers the counters IV..IV+3
// (128-bit addition), xors them with the four 16-byte blocks at src and
// stores the results at dst.
TEXT ·ctrBlocks4Asm(SB), NOSPLIT|NOFRAME, $0
CTRBLOCK_PROLOGUE
XXLEQV V8, V8, V8 // V8 is -1
VSUBUQM V0, V8, V1 // Vi = IV + i (as IV - (-1))
VSUBUQM V1, V8, V2
VSUBUQM V2, V8, V3
// Initial AddRoundKey; V8 is reused as the round-key register from here on.
DO4_CIPHER(V0,V1,V2,V3,V8,R4,VXOR)
BEQ CR1, key_12
BLT CR1, key_10
key_14:
DO4_CIPHER(V0,V1,V2,V3,V8,R4,VCIPHER)
DO4_CIPHER(V0,V1,V2,V3,V8,R4,VCIPHER)
key_12:
DO4_CIPHER(V0,V1,V2,V3,V8,R4,VCIPHER)
DO4_CIPHER(V0,V1,V2,V3,V8,R4,VCIPHER)
key_10:
// Source blocks are loaded into V9-V12 in between the cipher rounds;
// R8-R10 keep the block offsets for the stores at the end.
P8_LXVB16X(R6,R0,V9)
DO4_CIPHER(V0,V1,V2,V3,V8,R4,VCIPHER)
MOVD $16, R8
P8_LXVB16X(R6,R8,V10)
DO4_CIPHER(V0,V1,V2,V3,V8,R4,VCIPHER)
MOVD $32, R9
P8_LXVB16X(R6,R9,V11)
DO4_CIPHER(V0,V1,V2,V3,V8,R4,VCIPHER)
MOVD $48, R10
P8_LXVB16X(R6,R10,V12)
DO4_CIPHER(V0,V1,V2,V3,V8,R4,VCIPHER)
DO4_CIPHER(V0,V1,V2,V3,V8,R4,VCIPHER)
DO4_CIPHER(V0,V1,V2,V3,V8,R4,VCIPHER)
DO4_CIPHER(V0,V1,V2,V3,V8,R4,VCIPHER)
DO4_CIPHER(V0,V1,V2,V3,V8,R4,VCIPHER)
DO4_CIPHER(V0,V1,V2,V3,V8,R4,VCIPHER)
DO4_CIPHER(V0,V1,V2,V3,V8,R4,VCIPHERLAST)
XOR_STORE(V9,V0,R5,R0)
XOR_STORE(V10,V1,R5,R8)
XOR_STORE(V11,V2,R5,R9)
XOR_STORE(V12,V3,R5,R10)
RET
//func ctrBlocks8Asm(nr int, xk *[60]uint32, dst, src *[8 * BlockSize]byte, ivlo, ivhi uint64)
//
// Eight-block variant of ctrBlocks1Asm: enciphers the counters IV..IV+7
// (128-bit addition), xors them with the eight 16-byte blocks at src and
// stores the results at dst.
TEXT ·ctrBlocks8Asm(SB), NOSPLIT|NOFRAME, $0
CTRBLOCK_PROLOGUE
XXLEQV V8, V8, V8 // V8 is -1
VSUBUQM V0, V8, V1 // Vi = IV + i (as IV - (-1))
VADDUQM V8, V8, V9 // V9 is -2
// Remaining counters are derived in steps of 2: V2 = V0+2, V3 = V1+2, ...
VSUBUQM V0, V9, V2
VSUBUQM V1, V9, V3
VSUBUQM V2, V9, V4
VSUBUQM V3, V9, V5
VSUBUQM V4, V9, V6
VSUBUQM V5, V9, V7
// Initial AddRoundKey; V8 is reused as the round-key register from here on.
DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VXOR)
BEQ CR1, key_12
BLT CR1, key_10
key_14:
DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VCIPHER)
DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VCIPHER)
key_12:
DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VCIPHER)
DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VCIPHER)
key_10:
// Source blocks are loaded into V9-V16 in between the cipher rounds (V9 is
// no longer needed as -2); R8-R12, R14 and R15 keep the block offsets for
// the stores at the end.
P8_LXVB16X(R6,R0,V9)
DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VCIPHER)
MOVD $16, R8
P8_LXVB16X(R6,R8,V10)
DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VCIPHER)
MOVD $32, R9
P8_LXVB16X(R6,R9,V11)
DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VCIPHER)
MOVD $48, R10
P8_LXVB16X(R6,R10,V12)
DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VCIPHER)
MOVD $64, R11
P8_LXVB16X(R6,R11,V13)
DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VCIPHER)
MOVD $80, R12
P8_LXVB16X(R6,R12,V14)
DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VCIPHER)
MOVD $96, R14
P8_LXVB16X(R6,R14,V15)
DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VCIPHER)
MOVD $112, R15
P8_LXVB16X(R6,R15,V16)
DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VCIPHER)
DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VCIPHER)
DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VCIPHERLAST)
XOR_STORE(V9,V0,R5,R0)
XOR_STORE(V10,V1,R5,R8)
XOR_STORE(V11,V2,R5,R9)
XOR_STORE(V12,V3,R5,R10)
XOR_STORE(V13,V4,R5,R11)
XOR_STORE(V14,V5,R5,R12)
XOR_STORE(V15,V6,R5,R14)
XOR_STORE(V16,V7,R5,R15)
RET