// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Based on CRYPTOGAMS code with the following comment:
// # ====================================================================
// # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
// # project. The module is, however, dual licensed under OpenSSL and
// # CRYPTOGAMS licenses depending on where you obtain it. For further
// # details see http://www.openssl.org/~appro/cryptogams/.
// # ====================================================================

// The Perl script that generates the ppc64 assembly can be found in the
// cryptogams repository at the link below. It is based on the original
// from OpenSSL.

// https://github.com/dot-asm/cryptogams/commit/a60f5b50ed908e91

// The differences between this and the original implementation are due to
// the calling conventions and the initialization of constants.

//go:build gc && !purego

#include "textflag.h"

#define OUT R3
#define INP R4
#define LEN R5
#define KEY R6
#define CNT R7
#define TMP R15

#define CONSTBASE R16
#define BLOCKS R17

// for VPERMXOR
#define MASK R18

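// ChaCha constants and permute masks used below: the "expand 32-byte k"
// words (at 0x00, and splatted per word starting at 0x50), counter
// increments, and the VPERMXOR rotate masks at offsets 0xa0-0xbf.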
DATA consts<>+0x00(SB)/8, $0x3320646e61707865
DATA consts<>+0x08(SB)/8, $0x6b20657479622d32
DATA consts<>+0x10(SB)/8, $0x0000000000000001
DATA consts<>+0x18(SB)/8, $0x0000000000000000
DATA consts<>+0x20(SB)/8, $0x0000000000000004
DATA consts<>+0x28(SB)/8, $0x0000000000000000
DATA consts<>+0x30(SB)/8, $0x0a0b08090e0f0c0d
DATA consts<>+0x38(SB)/8, $0x0203000106070405
DATA consts<>+0x40(SB)/8, $0x090a0b080d0e0f0c
DATA consts<>+0x48(SB)/8, $0x0102030005060704
DATA consts<>+0x50(SB)/8, $0x6170786561707865
DATA consts<>+0x58(SB)/8, $0x6170786561707865
DATA consts<>+0x60(SB)/8, $0x3320646e3320646e
DATA consts<>+0x68(SB)/8, $0x3320646e3320646e
DATA consts<>+0x70(SB)/8, $0x79622d3279622d32
DATA consts<>+0x78(SB)/8, $0x79622d3279622d32
DATA consts<>+0x80(SB)/8, $0x6b2065746b206574
DATA consts<>+0x88(SB)/8, $0x6b2065746b206574
DATA consts<>+0x90(SB)/8, $0x0000000100000000
DATA consts<>+0x98(SB)/8, $0x0000000300000002
DATA consts<>+0xa0(SB)/8, $0x5566774411223300
DATA consts<>+0xa8(SB)/8, $0xddeeffcc99aabb88
DATA consts<>+0xb0(SB)/8, $0x6677445522330011
DATA consts<>+0xb8(SB)/8, $0xeeffccddaabb8899
GLOBL consts<>(SB), RODATA, $0xc0

//func chaCha20_ctr32_vsx(out, inp *byte, len int, key *[8]uint32, counter *uint32)
TEXT ·chaCha20_ctr32_vsx(SB),NOSPLIT,$64-40
	MOVD out+0(FP), OUT
	MOVD inp+8(FP), INP
	MOVD len+16(FP), LEN
	MOVD key+24(FP), KEY
	MOVD counter+32(FP), CNT
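	// Each iteration of the outer loop generates keystream for four
	// 64-byte blocks at once: V0-V15 hold the 16 ChaCha state words,
	// one lane of each vector per block.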

	// Addressing for constants
	MOVD $consts<>+0x00(SB), CONSTBASE
	MOVD $16, R8
	MOVD $32, R9
	MOVD $48, R10
	MOVD $64, R11
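	// BLOCKS = LEN / 64, used to advance the counter when done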
	SRD $6, LEN, BLOCKS
	// for VPERMXOR
	MOVD $consts<>+0xa0(SB), MASK
	MOVD $16, R20
	// V16: the ChaCha constant words "expand 32-byte k"
	LXVW4X (CONSTBASE)(R0), VS48
	ADD $80, CONSTBASE

	// Load key into V17,V18
	LXVW4X (KEY)(R0), VS49
	LXVW4X (KEY)(R8), VS50

	// Load CNT, NONCE into V19
	LXVW4X (CNT)(R0), VS51

	// Clear V27
	VXOR V27, V27, V27

	// V28: per-block counter increments {0, 1, 2, 3}
	LXVW4X (CONSTBASE)(R11), VS60

	// Load mask constants for VPERMXOR
	LXVW4X (MASK)(R0), V20
	LXVW4X (MASK)(R20), V21

	// Splat the 32-bit block counter from V19 into V26
	VSPLTW $0, V19, V26

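	// Clear the counter word in V19; the per-block counters are
	// carried in V26 instead.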
	VSLDOI $4, V19, V27, V19
	VSLDOI $12, V27, V19, V19

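	// V26 = {counter+0, counter+1, counter+2, counter+3}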
	VADDUWM V26, V28, V26

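	// 10 iterations of the double-round loop below = 20 ChaCha rounds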
	MOVD $10, R14
	MOVD R14, CTR
	PCALIGN $16
loop_outer_vsx:
	// V0-V3: each of the four ChaCha constant words splatted across lanes
	LXVW4X (R0)(CONSTBASE), VS32
	LXVW4X (R8)(CONSTBASE), VS33
	LXVW4X (R9)(CONSTBASE), VS34
	LXVW4X (R10)(CONSTBASE), VS35

	// splat values from V17, V18 into V4-V11
	VSPLTW $0, V17, V4
	VSPLTW $1, V17, V5
	VSPLTW $2, V17, V6
	VSPLTW $3, V17, V7
	VSPLTW $0, V18, V8
	VSPLTW $1, V18, V9
	VSPLTW $2, V18, V10
	VSPLTW $3, V18, V11

	// Copy the per-block counters (V26) into V12
	VOR V26, V26, V12

	// splat values from V19 -> V13, V14, V15
	VSPLTW $1, V19, V13
	VSPLTW $2, V19, V14
	VSPLTW $3, V19, V15

	// Splat rotate amounts: V28 (12) and V30 (7) feed VRLW below; the
	// 16- and 8-bit rotates are done via the VPERMXOR masks V21 and V20.
	VSPLTISW $-16, V27
	VSPLTISW $12, V28
	VSPLTISW $8, V29
	VSPLTISW $7, V30
	PCALIGN $16
loop_vsx:
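	// Each pass does one column round then one diagonal round on the
	// four blocks in parallel; VPERMXOR fuses the XOR of the "d" row
	// with its rotate by 16 (mask V21) or by 8 (mask V20).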
	VADDUWM V0, V4, V0
	VADDUWM V1, V5, V1
	VADDUWM V2, V6, V2
	VADDUWM V3, V7, V3

	VPERMXOR V12, V0, V21, V12
	VPERMXOR V13, V1, V21, V13
	VPERMXOR V14, V2, V21, V14
	VPERMXOR V15, V3, V21, V15

	VADDUWM V8, V12, V8
	VADDUWM V9, V13, V9
	VADDUWM V10, V14, V10
	VADDUWM V11, V15, V11

	VXOR V4, V8, V4
	VXOR V5, V9, V5
	VXOR V6, V10, V6
	VXOR V7, V11, V7

	VRLW V4, V28, V4
	VRLW V5, V28, V5
	VRLW V6, V28, V6
	VRLW V7, V28, V7

	VADDUWM V0, V4, V0
	VADDUWM V1, V5, V1
	VADDUWM V2, V6, V2
	VADDUWM V3, V7, V3

	VPERMXOR V12, V0, V20, V12
	VPERMXOR V13, V1, V20, V13
	VPERMXOR V14, V2, V20, V14
	VPERMXOR V15, V3, V20, V15

	VADDUWM V8, V12, V8
	VADDUWM V9, V13, V9
	VADDUWM V10, V14, V10
	VADDUWM V11, V15, V11

	VXOR V4, V8, V4
	VXOR V5, V9, V5
	VXOR V6, V10, V6
	VXOR V7, V11, V7

	VRLW V4, V30, V4
	VRLW V5, V30, V5
	VRLW V6, V30, V6
	VRLW V7, V30, V7

	VADDUWM V0, V5, V0
	VADDUWM V1, V6, V1
	VADDUWM V2, V7, V2
	VADDUWM V3, V4, V3

	VPERMXOR V15, V0, V21, V15
	VPERMXOR V12, V1, V21, V12
	VPERMXOR V13, V2, V21, V13
	VPERMXOR V14, V3, V21, V14

	VADDUWM V10, V15, V10
	VADDUWM V11, V12, V11
	VADDUWM V8, V13, V8
	VADDUWM V9, V14, V9

	VXOR V5, V10, V5
	VXOR V6, V11, V6
	VXOR V7, V8, V7
	VXOR V4, V9, V4

	VRLW V5, V28, V5
	VRLW V6, V28, V6
	VRLW V7, V28, V7
	VRLW V4, V28, V4

	VADDUWM V0, V5, V0
	VADDUWM V1, V6, V1
	VADDUWM V2, V7, V2
	VADDUWM V3, V4, V3

	VPERMXOR V15, V0, V20, V15
	VPERMXOR V12, V1, V20, V12
	VPERMXOR V13, V2, V20, V13
	VPERMXOR V14, V3, V20, V14

	VADDUWM V10, V15, V10
	VADDUWM V11, V12, V11
	VADDUWM V8, V13, V8
	VADDUWM V9, V14, V9

	VXOR V5, V10, V5
	VXOR V6, V11, V6
	VXOR V7, V8, V7
	VXOR V4, V9, V4

	VRLW V5, V30, V5
	VRLW V6, V30, V6
	VRLW V7, V30, V7
	VRLW V4, V30, V4
	BDNZ loop_vsx

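	// Add the initial per-block counters into the counter lane, then
	// transpose so that block n's keystream ends up in V(n), V(n+4),
	// V(n+8), V(n+12).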
	VADDUWM V12, V26, V12

	VMRGEW V0, V1, V27
	VMRGEW V2, V3, V28

	VMRGOW V0, V1, V0
	VMRGOW V2, V3, V2

	VMRGEW V4, V5, V29
	VMRGEW V6, V7, V30

	XXPERMDI VS32, VS34, $0, VS33
	XXPERMDI VS32, VS34, $3, VS35
	XXPERMDI VS59, VS60, $0, VS32
	XXPERMDI VS59, VS60, $3, VS34

	VMRGOW V4, V5, V4
	VMRGOW V6, V7, V6

	VMRGEW V8, V9, V27
	VMRGEW V10, V11, V28

	XXPERMDI VS36, VS38, $0, VS37
	XXPERMDI VS36, VS38, $3, VS39
	XXPERMDI VS61, VS62, $0, VS36
	XXPERMDI VS61, VS62, $3, VS38

	VMRGOW V8, V9, V8
	VMRGOW V10, V11, V10

	VMRGEW V12, V13, V29
	VMRGEW V14, V15, V30

	XXPERMDI VS40, VS42, $0, VS41
	XXPERMDI VS40, VS42, $3, VS43
	XXPERMDI VS59, VS60, $0, VS40
	XXPERMDI VS59, VS60, $3, VS42

	VMRGOW V12, V13, V12
	VMRGOW V14, V15, V14

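	// Advance the per-block counters by 4 for the next outer iteration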
	VSPLTISW $4, V27
	VADDUWM V26, V27, V26

	XXPERMDI VS44, VS46, $0, VS45
	XXPERMDI VS44, VS46, $3, VS47
	XXPERMDI VS61, VS62, $0, VS44
	XXPERMDI VS61, VS62, $3, VS46

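	// Add the saved initial state (constants, key, nonce) to finish the
	// first block's keystream in V0, V4, V8, V12.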
	VADDUWM V0, V16, V0
	VADDUWM V4, V17, V4
	VADDUWM V8, V18, V8
	VADDUWM V12, V19, V12

	CMPU LEN, $64
	BLT tail_vsx

	// XOR the keystream with 64 bytes of input and write to the output
	LXVW4X (INP)(R0), VS59
	LXVW4X (INP)(R8), VS60
	LXVW4X (INP)(R9), VS61
	LXVW4X (INP)(R10), VS62

	VXOR V27, V0, V27
	VXOR V28, V4, V28
	VXOR V29, V8, V29
	VXOR V30, V12, V30

	STXVW4X VS59, (OUT)(R0)
	STXVW4X VS60, (OUT)(R8)
	ADD $64, INP
	STXVW4X VS61, (OUT)(R9)
	ADD $-64, LEN
	STXVW4X VS62, (OUT)(R10)
	ADD $64, OUT
	BEQ done_vsx

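	// Second block: keystream in V1, V5, V9, V13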
	VADDUWM V1, V16, V0
	VADDUWM V5, V17, V4
	VADDUWM V9, V18, V8
	VADDUWM V13, V19, V12

	CMPU LEN, $64
	BLT tail_vsx

	LXVW4X (INP)(R0), VS59
	LXVW4X (INP)(R8), VS60
	LXVW4X (INP)(R9), VS61
	LXVW4X (INP)(R10), VS62

	VXOR V27, V0, V27
	VXOR V28, V4, V28
	VXOR V29, V8, V29
	VXOR V30, V12, V30

	STXVW4X VS59, (OUT)(R0)
	STXVW4X VS60, (OUT)(R8)
	ADD $64, INP
	STXVW4X VS61, (OUT)(R9)
	ADD $-64, LEN
	STXVW4X VS62, (OUT)(R10)
	ADD $64, OUT
	BEQ done_vsx

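	// Third block: keystream in V2, V6, V10, V14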
	VADDUWM V2, V16, V0
	VADDUWM V6, V17, V4
	VADDUWM V10, V18, V8
	VADDUWM V14, V19, V12

	CMPU LEN, $64
	BLT tail_vsx

	LXVW4X (INP)(R0), VS59
	LXVW4X (INP)(R8), VS60
	LXVW4X (INP)(R9), VS61
	LXVW4X (INP)(R10), VS62

	VXOR V27, V0, V27
	VXOR V28, V4, V28
	VXOR V29, V8, V29
	VXOR V30, V12, V30

	STXVW4X VS59, (OUT)(R0)
	STXVW4X VS60, (OUT)(R8)
	ADD $64, INP
	STXVW4X VS61, (OUT)(R9)
	ADD $-64, LEN
	STXVW4X VS62, (OUT)(R10)
	ADD $64, OUT
	BEQ done_vsx

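	// Fourth block: keystream in V3, V7, V11, V15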
	VADDUWM V3, V16, V0
	VADDUWM V7, V17, V4
	VADDUWM V11, V18, V8
	VADDUWM V15, V19, V12

	CMPU LEN, $64
	BLT tail_vsx

	LXVW4X (INP)(R0), VS59
	LXVW4X (INP)(R8), VS60
	LXVW4X (INP)(R9), VS61
	LXVW4X (INP)(R10), VS62

	VXOR V27, V0, V27
	VXOR V28, V4, V28
	VXOR V29, V8, V29
	VXOR V30, V12, V30

	STXVW4X VS59, (OUT)(R0)
	STXVW4X VS60, (OUT)(R8)
	ADD $64, INP
	STXVW4X VS61, (OUT)(R9)
	ADD $-64, LEN
	STXVW4X VS62, (OUT)(R10)
	ADD $64, OUT

	MOVD $10, R14
	MOVD R14, CTR
	BNE loop_outer_vsx

done_vsx:
	// Increment the counter by the number of 64-byte blocks processed
	MOVD (CNT), R14
	ADD BLOCKS, R14
	MOVD R14, (CNT)
	RET

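	// Fewer than 64 bytes remain: store the current keystream block to
	// the scratch area on the stack and XOR it into the output one byte
	// at a time.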
tail_vsx:
	ADD $32, R1, R11
	MOVD LEN, CTR

	// Save values on stack to copy from
	STXVW4X VS32, (R11)(R0)
	STXVW4X VS36, (R11)(R8)
	STXVW4X VS40, (R11)(R9)
	STXVW4X VS44, (R11)(R10)
	ADD $-1, R11, R12
	ADD $-1, INP
	ADD $-1, OUT
	PCALIGN $16
looptail_vsx:
	// XOR the remaining input with the saved keystream, one byte at a
	// time, and write it to OUT.
	MOVBZU 1(R12), KEY
	MOVBZU 1(INP), TMP
	XOR KEY, TMP, KEY
	MOVBU KEY, 1(OUT)
	BDNZ looptail_vsx

	// Clear the keystream values from the stack
	STXVW4X VS48, (R11)(R0)
	STXVW4X VS48, (R11)(R8)
	STXVW4X VS48, (R11)(R9)
	STXVW4X VS48, (R11)(R10)
	BR done_vsx