| // Copyright 2019 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| //go:build ppc64 || ppc64le |
| |
| // Portions based on CRYPTOGAMS code with the following comment: |
| // # ==================================================================== |
| // # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL |
| // # project. The module is, however, dual licensed under OpenSSL and |
| // # CRYPTOGAMS licenses depending on where you obtain it. For further |
| // # details see http://www.openssl.org/~appro/cryptogams/. |
| // # ==================================================================== |
| |
| // The implementations for gcmHash, gcmInit and gcmMul are based on the generated asm |
| // from the script https://github.com/dot-asm/cryptogams/blob/master/ppc/ghashp8-ppc.pl |
| // from commit d47afb3c. |
| |
// Changes were made to account for differences in the ABI and
// in register usage, and some arguments were adjusted to match
// the way the Go code passes them.
| |
| // Portions that use the stitched AES-GCM approach in counterCryptASM |
| // are based on code found in |
| // https://github.com/IBM/ipcri/blob/main/aes/p10_aes_gcm.s |
| |
| #include "textflag.h" |
| |
| #define XIP R3 |
| #define HTBL R4 |
| #define INP R5 |
| #define LEN R6 |
| |
| #define XL V0 |
| #define XM V1 |
| #define XH V2 |
| #define IN V3 |
| #define ZERO V4 |
| #define T0 V5 |
| #define T1 V6 |
| #define T2 V7 |
| #define XC2 V8 |
| #define H V9 |
| #define HH V10 |
| #define HL V11 |
| #define LEMASK V12 |
| #define XL1 V13 |
| #define XM1 V14 |
| #define XH1 V15 |
| #define IN1 V16 |
| #define H2 V17 |
| #define H2H V18 |
| #define H2L V19 |
| #define XL3 V20 |
| #define XM2 V21 |
| #define IN2 V22 |
| #define H3L V23 |
| #define H3 V24 |
| #define H3H V25 |
| #define XH3 V26 |
| #define XM3 V27 |
| #define IN3 V28 |
| #define H4L V29 |
| #define H4 V30 |
| #define H4H V31 |
| |
| #define IN0 IN |
| #define H21L HL |
| #define H21H HH |
| #define LOPERM H2L |
| #define HIPERM H2H |
| |
| #define VXL VS32 |
| #define VIN VS35 |
| #define VXC2 VS40 |
| #define VH VS41 |
| #define VHH VS42 |
| #define VHL VS43 |
| #define VIN1 VS48 |
| #define VH2 VS49 |
| #define VH2H VS50 |
| #define VH2L VS51 |
| |
| #define VIN2 VS54 |
| #define VH3L VS55 |
| #define VH3 VS56 |
| #define VH3H VS57 |
| #define VIN3 VS60 |
| #define VH4L VS61 |
| #define VH4 VS62 |
| #define VH4H VS63 |
| |
| #define VIN0 VIN |
| |
| #define ESPERM V10 |
| #define TMP2 V11 |
| |
// The following macros provide appropriate
// implementations for the endianness in use,
// as well as ISA-specific variants for
// power8 and power9.
| #ifdef GOARCH_ppc64le |
| # ifdef GOPPC64_power9 |
| #define P8_LXVB16X(RA,RB,VT) LXVB16X (RA)(RB), VT |
| #define P8_STXVB16X(VS,RA,RB) STXVB16X VS, (RA)(RB) |
| # else |
| #define NEEDS_ESPERM |
| #define P8_LXVB16X(RA,RB,VT) \ |
| LXVD2X (RA+RB), VT \ |
| VPERM VT, VT, ESPERM, VT |
| |
| #define P8_STXVB16X(VS,RA,RB) \ |
| VPERM VS, VS, ESPERM, TMP2; \ |
| STXVD2X TMP2, (RA+RB) |
| |
| # endif |
| #else |
| #define P8_LXVB16X(RA,RB,VT) \ |
| LXVD2X (RA+RB), VT |
| |
| #define P8_STXVB16X(VS,RA,RB) \ |
| STXVD2X VS, (RA+RB) |
| |
| #endif |
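// A note on the power8 little endian forms above: LXVD2X loads
// the two doublewords in little endian element order, and the
// VPERM with ESPERM (loaded from ·rcon by functions that define
// NEEDS_ESPERM) reverses the bytes so the vector matches what
// POWER9's LXVB16X produces directly.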
| |
| #define MASK_PTR R8 |
| |
| #define MASKV V0 |
| #define INV V1 |
| |
| // The following macros are used for |
| // the stitched implementation within |
| // counterCryptASM. |
| |
| // Load the initial GCM counter value |
| // in V30 and set up the counter increment |
| // in V31 |
| #define SETUP_COUNTER \ |
| P8_LXVB16X(COUNTER, R0, V30); \ |
| VSPLTISB $1, V28; \ |
| VXOR V31, V31, V31; \ |
| VSLDOI $1, V31, V28, V31 |
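// VADDUWM adds each 32-bit word modulo 2^32, so with the
// increment vector {0, 0, 0, 1} built above only the rightmost
// word of the big-endian counter block changes, which matches
// GCM's inc32 operation.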
| |
| // These macros set up the initial value |
| // for a single encryption, or 4 or 8 |
| // stitched encryptions implemented |
// with interleaved vcipher instructions.
| // |
| // The input value for each encryption |
| // is generated by XORing the counter |
| // from V30 with the first key in VS0 |
| // and incrementing the counter. |
| // |
| // Single encryption in V15 |
| #define GEN_VCIPHER_INPUT \ |
XXLOR VS0, VS0, V29; \
| VXOR V30, V29, V15; \ |
| VADDUWM V30, V31, V30 |
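// In pseudocode, the single-input case is:
//   V15 = ctr ^ rk[0] // AddRoundKey with the first round key
//   ctr = inc32(ctr)
// and the 4- and 8-input variants below repeat this pattern
// for V16 - V22.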
| |
| // 4 encryptions in V15 - V18 |
| #define GEN_VCIPHER_4_INPUTS \ |
| XXLOR VS0, VS0, V29; \ |
| VXOR V30, V29, V15; \ |
| VADDUWM V30, V31, V30; \ |
| VXOR V30, V29, V16; \ |
| VADDUWM V30, V31, V30; \ |
| VXOR V30, V29, V17; \ |
| VADDUWM V30, V31, V30; \ |
| VXOR V30, V29, V18; \ |
| VADDUWM V30, V31, V30 |
| |
| // 8 encryptions in V15 - V22 |
| #define GEN_VCIPHER_8_INPUTS \ |
| XXLOR VS0, VS0, V29; \ |
| VXOR V30, V29, V15; \ |
| VADDUWM V30, V31, V30; \ |
| VXOR V30, V29, V16; \ |
| VADDUWM V30, V31, V30; \ |
| VXOR V30, V29, V17; \ |
| VADDUWM V30, V31, V30; \ |
| VXOR V30, V29, V18; \ |
| VADDUWM V30, V31, V30; \ |
| VXOR V30, V29, V19; \ |
| VADDUWM V30, V31, V30; \ |
| VXOR V30, V29, V20; \ |
| VADDUWM V30, V31, V30; \ |
| VXOR V30, V29, V21; \ |
| VADDUWM V30, V31, V30; \ |
| VXOR V30, V29, V22; \ |
| VADDUWM V30, V31, V30 |
| |
// Load the round keys to be used for
// encryption into VS0 - VS14, based on
// key_len. Valid key sizes (10, 12 or
// 14 rounds) are verified here; any
// other value crashes via the store to
// address 0 below. CR2 is set here and
// used throughout to check key_len.
| #define LOAD_KEYS(blk_key, key_len) \ |
| MOVD $16, R16; \ |
| MOVD $32, R17; \ |
| MOVD $48, R18; \ |
| MOVD $64, R19; \ |
| LXVD2X (blk_key)(R0), VS0; \ |
| LXVD2X (blk_key)(R16), VS1; \ |
| LXVD2X (blk_key)(R17), VS2; \ |
| LXVD2X (blk_key)(R18), VS3; \ |
| LXVD2X (blk_key)(R19), VS4; \ |
| ADD $64, R16; \ |
| ADD $64, R17; \ |
| ADD $64, R18; \ |
| ADD $64, R19; \ |
| LXVD2X (blk_key)(R16), VS5; \ |
| LXVD2X (blk_key)(R17), VS6; \ |
| LXVD2X (blk_key)(R18), VS7; \ |
| LXVD2X (blk_key)(R19), VS8; \ |
| ADD $64, R16; \ |
| ADD $64, R17; \ |
| ADD $64, R18; \ |
| ADD $64, R19; \ |
| LXVD2X (blk_key)(R16), VS9; \ |
| LXVD2X (blk_key)(R17), VS10; \ |
| CMP key_len, $12, CR2; \ |
| CMP key_len, $10; \ |
| BEQ keysLoaded; \ |
| LXVD2X (blk_key)(R18), VS11; \ |
| LXVD2X (blk_key)(R19), VS12; \ |
| BEQ CR2, keysLoaded; \ |
| ADD $64, R16; \ |
| ADD $64, R17; \ |
| LXVD2X (blk_key)(R16), VS13; \ |
| LXVD2X (blk_key)(R17), VS14; \ |
| CMP key_len, $14; \ |
| BEQ keysLoaded; \ |
| MOVD R0,0(R0); \ |
| keysLoaded: |
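// The expanded key is assumed to be laid out as key_len+1
// consecutive 16-byte round keys at blk_key, so key_len of
// 10, 12 or 14 (AES-128/192/256) loads VS0 - VS10, VS0 - VS12
// or VS0 - VS14 respectively.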
| |
| // Encrypt 1 (vin) with first 9 |
| // keys from VS1 - VS9. |
| #define VCIPHER_1X9_KEYS(vin) \ |
| XXLOR VS1, VS1, V23; \ |
| XXLOR VS2, VS2, V24; \ |
| XXLOR VS3, VS3, V25; \ |
| XXLOR VS4, VS4, V26; \ |
| XXLOR VS5, VS5, V27; \ |
| VCIPHER vin, V23, vin; \ |
| VCIPHER vin, V24, vin; \ |
| VCIPHER vin, V25, vin; \ |
| VCIPHER vin, V26, vin; \ |
| VCIPHER vin, V27, vin; \ |
| XXLOR VS6, VS6, V23; \ |
| XXLOR VS7, VS7, V24; \ |
| XXLOR VS8, VS8, V25; \ |
| XXLOR VS9, VS9, V26; \ |
| VCIPHER vin, V23, vin; \ |
| VCIPHER vin, V24, vin; \ |
| VCIPHER vin, V25, vin; \ |
| VCIPHER vin, V26, vin |
| |
| // Encrypt 1 value (vin) with |
| // 2 specified keys |
| #define VCIPHER_1X2_KEYS(vin, key1, key2) \ |
| XXLOR key1, key1, V25; \ |
| XXLOR key2, key2, V26; \ |
| VCIPHER vin, V25, vin; \ |
| VCIPHER vin, V26, vin |
| |
// Encrypt 4 values in V15 - V18
// with the specified key,
// assuming it is a VSreg.
| #define VCIPHER_4X1_KEY(key) \ |
| XXLOR key, key, V23; \ |
| VCIPHER V15, V23, V15; \ |
| VCIPHER V16, V23, V16; \ |
| VCIPHER V17, V23, V17; \ |
| VCIPHER V18, V23, V18 |
| |
| // Encrypt 8 values in V15 - V22 |
| // with the specified key, |
| // assuming it is a VSreg |
| #define VCIPHER_8X1_KEY(key) \ |
| XXLOR key, key, V23; \ |
| VCIPHER V15, V23, V15; \ |
| VCIPHER V16, V23, V16; \ |
| VCIPHER V17, V23, V17; \ |
| VCIPHER V18, V23, V18; \ |
| VCIPHER V19, V23, V19; \ |
| VCIPHER V20, V23, V20; \ |
| VCIPHER V21, V23, V21; \ |
| VCIPHER V22, V23, V22 |
| |
| // Load input block into V1-V4 |
| // in big endian order and |
| // update blk_inp by 64. |
| #define LOAD_INPUT_BLOCK64(blk_inp) \ |
| MOVD $16, R16; \ |
| MOVD $32, R17; \ |
| MOVD $48, R18; \ |
| P8_LXVB16X(blk_inp,R0,V1); \ |
| P8_LXVB16X(blk_inp,R16,V2); \ |
| P8_LXVB16X(blk_inp,R17,V3); \ |
| P8_LXVB16X(blk_inp,R18,V4); \ |
| ADD $64, blk_inp |
| |
// Load input block into V1-V8
// in big endian order and
// update blk_inp by 128.
| #define LOAD_INPUT_BLOCK128(blk_inp) \ |
| MOVD $16, R16; \ |
| MOVD $32, R17; \ |
| MOVD $48, R18; \ |
| MOVD $64, R19; \ |
| MOVD $80, R20; \ |
| MOVD $96, R21; \ |
| MOVD $112, R22; \ |
| P8_LXVB16X(blk_inp,R0,V1); \ |
| P8_LXVB16X(blk_inp,R16,V2); \ |
| P8_LXVB16X(blk_inp,R17,V3); \ |
| P8_LXVB16X(blk_inp,R18,V4); \ |
| P8_LXVB16X(blk_inp,R19,V5); \ |
| P8_LXVB16X(blk_inp,R20,V6); \ |
| P8_LXVB16X(blk_inp,R21,V7); \ |
| P8_LXVB16X(blk_inp,R22,V8); \ |
| ADD $128, blk_inp |
| |
| // Finish encryption on 8 streams and |
| // XOR with input block |
| #define VCIPHERLAST8_XOR_INPUT \ |
| VCIPHERLAST V15, V23, V15; \ |
| VCIPHERLAST V16, V23, V16; \ |
| VCIPHERLAST V17, V23, V17; \ |
| VCIPHERLAST V18, V23, V18; \ |
| VCIPHERLAST V19, V23, V19; \ |
| VCIPHERLAST V20, V23, V20; \ |
| VCIPHERLAST V21, V23, V21; \ |
| VCIPHERLAST V22, V23, V22; \ |
| XXLXOR V1, V15, V1; \ |
| XXLXOR V2, V16, V2; \ |
| XXLXOR V3, V17, V3; \ |
| XXLXOR V4, V18, V4; \ |
| XXLXOR V5, V19, V5; \ |
| XXLXOR V6, V20, V6; \ |
| XXLXOR V7, V21, V7; \ |
| XXLXOR V8, V22, V8 |
| |
| // Finish encryption on 4 streams and |
| // XOR with input block |
| #define VCIPHERLAST4_XOR_INPUT \ |
| VCIPHERLAST V15, V23, V15; \ |
| VCIPHERLAST V16, V23, V16; \ |
| VCIPHERLAST V17, V23, V17; \ |
| VCIPHERLAST V18, V23, V18; \ |
| XXLXOR V1, V15, V1; \ |
| XXLXOR V2, V16, V2; \ |
| XXLXOR V3, V17, V3; \ |
| XXLXOR V4, V18, V4 |
| |
// Store output block from V1-V8
// in big endian order and
// update blk_out by 128.
| #define STORE_OUTPUT_BLOCK128(blk_out) \ |
| P8_STXVB16X(V1,blk_out,R0); \ |
| P8_STXVB16X(V2,blk_out,R16); \ |
| P8_STXVB16X(V3,blk_out,R17); \ |
| P8_STXVB16X(V4,blk_out,R18); \ |
| P8_STXVB16X(V5,blk_out,R19); \ |
| P8_STXVB16X(V6,blk_out,R20); \ |
| P8_STXVB16X(V7,blk_out,R21); \ |
| P8_STXVB16X(V8,blk_out,R22); \ |
| ADD $128, blk_out |
| |
// Store output block from V1-V4
// in big endian order and
// update blk_out by 64.
| #define STORE_OUTPUT_BLOCK64(blk_out) \ |
| P8_STXVB16X(V1,blk_out,R0); \ |
| P8_STXVB16X(V2,blk_out,R16); \ |
| P8_STXVB16X(V3,blk_out,R17); \ |
| P8_STXVB16X(V4,blk_out,R18); \ |
| ADD $64, blk_out |
| |
| // func gcmInit(productTable *[256]byte, h []byte) |
| TEXT ·gcmInit(SB), NOSPLIT, $0-32 |
| MOVD productTable+0(FP), XIP |
| MOVD h+8(FP), HTBL |
| |
| MOVD $0x10, R8 |
| MOVD $0x20, R9 |
| MOVD $0x30, R10 |
| LXVD2X (HTBL)(R0), VH // Load H |
| |
| VSPLTISB $-16, XC2 // 0xf0 |
| VSPLTISB $1, T0 // one |
| VADDUBM XC2, XC2, XC2 // 0xe0 |
| VXOR ZERO, ZERO, ZERO |
| VOR XC2, T0, XC2 // 0xe1 |
| VSLDOI $15, XC2, ZERO, XC2 // 0xe1... |
| VSLDOI $1, ZERO, T0, T1 // ...1 |
| VADDUBM XC2, XC2, XC2 // 0xc2... |
| VSPLTISB $7, T2 |
| VOR XC2, T1, XC2 // 0xc2....01 |
| VSPLTB $0, H, T1 // most significant byte |
| VSL H, T0, H // H<<=1 |
| VSRAB T1, T2, T1 // broadcast carry bit |
| VAND T1, XC2, T1 |
| VXOR H, T1, IN // twisted H |
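// The sequence above computes the "twisted" H used by the
// CRYPTOGAMS code: H is shifted left one bit and, if the
// shifted-out bit was set, reduced by XORing the 0xc2...01
// field constant. This pre-multiplication lets the VPMSUMD
// carry-less products below line up without a per-block shift.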
| |
| VSLDOI $8, IN, IN, H // twist even more ... |
| VSLDOI $8, ZERO, XC2, XC2 // 0xc2.0 |
| VSLDOI $8, ZERO, H, HL // ... and split |
| VSLDOI $8, H, ZERO, HH |
| |
| STXVD2X VXC2, (XIP+R0) // save pre-computed table |
| STXVD2X VHL, (XIP+R8) |
| MOVD $0x40, R8 |
| STXVD2X VH, (XIP+R9) |
| MOVD $0x50, R9 |
| STXVD2X VHH, (XIP+R10) |
| MOVD $0x60, R10 |
| |
| VPMSUMD IN, HL, XL // H.lo·H.lo |
| VPMSUMD IN, H, XM // H.hi·H.lo+H.lo·H.hi |
| VPMSUMD IN, HH, XH // H.hi·H.hi |
| |
| VPMSUMD XL, XC2, T2 // 1st reduction phase |
| |
| VSLDOI $8, XM, ZERO, T0 |
| VSLDOI $8, ZERO, XM, T1 |
| VXOR XL, T0, XL |
| VXOR XH, T1, XH |
| |
| VSLDOI $8, XL, XL, XL |
| VXOR XL, T2, XL |
| |
| VSLDOI $8, XL, XL, T1 // 2nd reduction phase |
| VPMSUMD XL, XC2, XL |
| VXOR T1, XH, T1 |
| VXOR XL, T1, IN1 |
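// The block above is the core GHASH multiply-reduce pattern:
//   XL = lo·lo, XM = hi·lo ^ lo·hi, XH = hi·hi
// XM is split and folded into XL and XH, then the 256-bit
// product is reduced modulo the field polynomial in two
// VPMSUMD phases with the 0xc2... constant, leaving IN1 = H^2.
// The same pattern computes H^3 and H^4 below.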
| |
| VSLDOI $8, IN1, IN1, H2 |
| VSLDOI $8, ZERO, H2, H2L |
| VSLDOI $8, H2, ZERO, H2H |
| |
| STXVD2X VH2L, (XIP+R8) // save H^2 |
| MOVD $0x70, R8 |
| STXVD2X VH2, (XIP+R9) |
| MOVD $0x80, R9 |
| STXVD2X VH2H, (XIP+R10) |
| MOVD $0x90, R10 |
| |
| VPMSUMD IN, H2L, XL // H.lo·H^2.lo |
| VPMSUMD IN1, H2L, XL1 // H^2.lo·H^2.lo |
| VPMSUMD IN, H2, XM // H.hi·H^2.lo+H.lo·H^2.hi |
| VPMSUMD IN1, H2, XM1 // H^2.hi·H^2.lo+H^2.lo·H^2.hi |
| VPMSUMD IN, H2H, XH // H.hi·H^2.hi |
| VPMSUMD IN1, H2H, XH1 // H^2.hi·H^2.hi |
| |
| VPMSUMD XL, XC2, T2 // 1st reduction phase |
| VPMSUMD XL1, XC2, HH // 1st reduction phase |
| |
| VSLDOI $8, XM, ZERO, T0 |
| VSLDOI $8, ZERO, XM, T1 |
| VSLDOI $8, XM1, ZERO, HL |
| VSLDOI $8, ZERO, XM1, H |
| VXOR XL, T0, XL |
| VXOR XH, T1, XH |
| VXOR XL1, HL, XL1 |
| VXOR XH1, H, XH1 |
| |
| VSLDOI $8, XL, XL, XL |
| VSLDOI $8, XL1, XL1, XL1 |
| VXOR XL, T2, XL |
| VXOR XL1, HH, XL1 |
| |
| VSLDOI $8, XL, XL, T1 // 2nd reduction phase |
| VSLDOI $8, XL1, XL1, H // 2nd reduction phase |
| VPMSUMD XL, XC2, XL |
| VPMSUMD XL1, XC2, XL1 |
| VXOR T1, XH, T1 |
| VXOR H, XH1, H |
| VXOR XL, T1, XL |
| VXOR XL1, H, XL1 |
| |
| VSLDOI $8, XL, XL, H |
| VSLDOI $8, XL1, XL1, H2 |
| VSLDOI $8, ZERO, H, HL |
| VSLDOI $8, H, ZERO, HH |
| VSLDOI $8, ZERO, H2, H2L |
| VSLDOI $8, H2, ZERO, H2H |
| |
| STXVD2X VHL, (XIP+R8) // save H^3 |
| MOVD $0xa0, R8 |
| STXVD2X VH, (XIP+R9) |
| MOVD $0xb0, R9 |
| STXVD2X VHH, (XIP+R10) |
| MOVD $0xc0, R10 |
| STXVD2X VH2L, (XIP+R8) // save H^4 |
| STXVD2X VH2, (XIP+R9) |
| STXVD2X VH2H, (XIP+R10) |
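// The productTable now holds (16-byte entries from XIP):
//   0x00: 0xc2...01 reduction constant
//   0x10: H.lo    0x20: H    0x30: H.hi
//   0x40: H^2.lo  0x50: H^2  0x60: H^2.hi
//   0x70: H^3.lo  0x80: H^3  0x90: H^3.hi
//   0xa0: H^4.lo  0xb0: H^4  0xc0: H^4.hi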
| |
| RET |
| |
| // func gcmHash(output []byte, productTable *[256]byte, inp []byte, len int) |
| TEXT ·gcmHash(SB), NOSPLIT, $0-64 |
| MOVD output+0(FP), XIP |
| MOVD productTable+24(FP), HTBL |
| MOVD inp+32(FP), INP |
| MOVD len+56(FP), LEN |
| |
| MOVD $0x10, R8 |
| MOVD $0x20, R9 |
| MOVD $0x30, R10 |
| LXVD2X (XIP)(R0), VXL // load Xi |
| |
| LXVD2X (HTBL)(R8), VHL // load pre-computed table |
| MOVD $0x40, R8 |
| LXVD2X (HTBL)(R9), VH |
| MOVD $0x50, R9 |
| LXVD2X (HTBL)(R10), VHH |
| MOVD $0x60, R10 |
| LXVD2X (HTBL)(R0), VXC2 |
| #ifdef GOARCH_ppc64le |
| LVSL (R0)(R0), LEMASK |
| VSPLTISB $0x07, T0 |
| VXOR LEMASK, T0, LEMASK |
| VPERM XL, XL, LEMASK, XL |
| #endif |
| VXOR ZERO, ZERO, ZERO |
| |
| CMPU LEN, $64 |
| BGE gcm_ghash_p8_4x |
| |
| LXVD2X (INP)(R0), VIN |
| ADD $16, INP, INP |
| SUBCCC $16, LEN, LEN |
| #ifdef GOARCH_ppc64le |
| VPERM IN, IN, LEMASK, IN |
| #endif |
| VXOR IN, XL, IN |
| BEQ short |
| |
| LXVD2X (HTBL)(R8), VH2L // load H^2 |
| MOVD $16, R8 |
| LXVD2X (HTBL)(R9), VH2 |
| ADD LEN, INP, R9 // end of input |
| LXVD2X (HTBL)(R10), VH2H |
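// Each iteration of loop_2x folds two input blocks B0, B1
// into Xi with a single reduction, computing roughly
//   Xi = ((Xi ^ B0)·H^2 ^ B1·H) mod P
// where P is the GHASH field polynomial.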
| |
| loop_2x: |
| LXVD2X (INP)(R0), VIN1 |
| #ifdef GOARCH_ppc64le |
| VPERM IN1, IN1, LEMASK, IN1 |
| #endif |
| |
| SUBC $32, LEN, LEN |
| VPMSUMD IN, H2L, XL // H^2.lo·Xi.lo |
| VPMSUMD IN1, HL, XL1 // H.lo·Xi+1.lo |
SUBE R11, R11, R11 // borrow ? -1 : 0
| VPMSUMD IN, H2, XM // H^2.hi·Xi.lo+H^2.lo·Xi.hi |
| VPMSUMD IN1, H, XM1 // H.hi·Xi+1.lo+H.lo·Xi+1.hi |
| AND LEN, R11, R11 |
| VPMSUMD IN, H2H, XH // H^2.hi·Xi.hi |
| VPMSUMD IN1, HH, XH1 // H.hi·Xi+1.hi |
| ADD R11, INP, INP |
| |
| VXOR XL, XL1, XL |
| VXOR XM, XM1, XM |
| |
| VPMSUMD XL, XC2, T2 // 1st reduction phase |
| |
| VSLDOI $8, XM, ZERO, T0 |
| VSLDOI $8, ZERO, XM, T1 |
| VXOR XH, XH1, XH |
| VXOR XL, T0, XL |
| VXOR XH, T1, XH |
| |
| VSLDOI $8, XL, XL, XL |
| VXOR XL, T2, XL |
| LXVD2X (INP)(R8), VIN |
| ADD $32, INP, INP |
| |
| VSLDOI $8, XL, XL, T1 // 2nd reduction phase |
| VPMSUMD XL, XC2, XL |
| #ifdef GOARCH_ppc64le |
| VPERM IN, IN, LEMASK, IN |
| #endif |
| VXOR T1, XH, T1 |
| VXOR IN, T1, IN |
| VXOR IN, XL, IN |
| CMP R9, INP |
| BGT loop_2x // done yet? |
| |
| CMPWU LEN, $0 |
| BNE even |
| |
| short: |
| VPMSUMD IN, HL, XL // H.lo·Xi.lo |
| VPMSUMD IN, H, XM // H.hi·Xi.lo+H.lo·Xi.hi |
| VPMSUMD IN, HH, XH // H.hi·Xi.hi |
| |
| VPMSUMD XL, XC2, T2 // 1st reduction phase |
| |
| VSLDOI $8, XM, ZERO, T0 |
| VSLDOI $8, ZERO, XM, T1 |
| VXOR XL, T0, XL |
| VXOR XH, T1, XH |
| |
| VSLDOI $8, XL, XL, XL |
| VXOR XL, T2, XL |
| |
| VSLDOI $8, XL, XL, T1 // 2nd reduction phase |
| VPMSUMD XL, XC2, XL |
| VXOR T1, XH, T1 |
| |
| even: |
| VXOR XL, T1, XL |
| #ifdef GOARCH_ppc64le |
| VPERM XL, XL, LEMASK, XL |
| #endif |
STXVD2X VXL, (XIP+R0) // write out Xi

OR R12, R12, R12 // no-op
| RET |
| |
| gcm_ghash_p8_4x: |
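// This path aggregates four blocks per iteration using the
// precomputed powers of H, so only one reduction is needed
// per four blocks:
//   Xi = ((Xi ^ B0)·H^4 ^ B1·H^3 ^ B2·H^2 ^ B3·H) mod P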
| LVSL (R8)(R0), T0 // 0x0001..0e0f |
| MOVD $0x70, R8 |
| LXVD2X (HTBL)(R9), VH2 |
| MOVD $0x80, R9 |
| VSPLTISB $8, T1 // 0x0808..0808 |
| MOVD $0x90, R10 |
| LXVD2X (HTBL)(R8), VH3L // load H^3 |
| MOVD $0xa0, R8 |
| LXVD2X (HTBL)(R9), VH3 |
| MOVD $0xb0, R9 |
| LXVD2X (HTBL)(R10), VH3H |
| MOVD $0xc0, R10 |
| LXVD2X (HTBL)(R8), VH4L // load H^4 |
| MOVD $0x10, R8 |
| LXVD2X (HTBL)(R9), VH4 |
| MOVD $0x20, R9 |
| LXVD2X (HTBL)(R10), VH4H |
| MOVD $0x30, R10 |
| |
| VSLDOI $8, ZERO, T1, T2 // 0x0000..0808 |
| VADDUBM T0, T2, HIPERM // 0x0001..1617 |
| VADDUBM T1, HIPERM, LOPERM // 0x0809..1e1f |
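// HIPERM/LOPERM gather the matching halves of H^2 and H into
// single registers (H21H/H21L), so one VPMSUMD over a similarly
// permuted pair of input blocks produces the partial products
// for both Xi+2·H^2 and Xi+3·H at once.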
| |
SRD $4, LEN, LEN // convert byte count to block count; allows using the sign bit as carry
| |
| LXVD2X (INP)(R0), VIN0 // load input |
| LXVD2X (INP)(R8), VIN1 |
| SUBCCC $8, LEN, LEN |
| LXVD2X (INP)(R9), VIN2 |
| LXVD2X (INP)(R10), VIN3 |
| ADD $0x40, INP, INP |
| #ifdef GOARCH_ppc64le |
| VPERM IN0, IN0, LEMASK, IN0 |
| VPERM IN1, IN1, LEMASK, IN1 |
| VPERM IN2, IN2, LEMASK, IN2 |
| VPERM IN3, IN3, LEMASK, IN3 |
| #endif |
| |
| VXOR IN0, XL, XH |
| |
| VPMSUMD IN1, H3L, XL1 |
| VPMSUMD IN1, H3, XM1 |
| VPMSUMD IN1, H3H, XH1 |
| |
| VPERM H2, H, HIPERM, H21L |
| VPERM IN2, IN3, LOPERM, T0 |
| VPERM H2, H, LOPERM, H21H |
| VPERM IN2, IN3, HIPERM, T1 |
| VPMSUMD IN2, H2, XM2 // H^2.lo·Xi+2.hi+H^2.hi·Xi+2.lo |
| VPMSUMD T0, H21L, XL3 // H^2.lo·Xi+2.lo+H.lo·Xi+3.lo |
| VPMSUMD IN3, H, XM3 // H.hi·Xi+3.lo +H.lo·Xi+3.hi |
| VPMSUMD T1, H21H, XH3 // H^2.hi·Xi+2.hi+H.hi·Xi+3.hi |
| |
| VXOR XM2, XM1, XM2 |
| VXOR XL3, XL1, XL3 |
| VXOR XM3, XM2, XM3 |
| VXOR XH3, XH1, XH3 |
| |
| BLT tail_4x |
| |
| loop_4x: |
| LXVD2X (INP)(R0), VIN0 |
| LXVD2X (INP)(R8), VIN1 |
| SUBCCC $4, LEN, LEN |
| LXVD2X (INP)(R9), VIN2 |
| LXVD2X (INP)(R10), VIN3 |
| ADD $0x40, INP, INP |
| #ifdef GOARCH_ppc64le |
| VPERM IN1, IN1, LEMASK, IN1 |
| VPERM IN2, IN2, LEMASK, IN2 |
| VPERM IN3, IN3, LEMASK, IN3 |
| VPERM IN0, IN0, LEMASK, IN0 |
| #endif |
| |
| VPMSUMD XH, H4L, XL // H^4.lo·Xi.lo |
| VPMSUMD XH, H4, XM // H^4.hi·Xi.lo+H^4.lo·Xi.hi |
| VPMSUMD XH, H4H, XH // H^4.hi·Xi.hi |
| VPMSUMD IN1, H3L, XL1 |
| VPMSUMD IN1, H3, XM1 |
| VPMSUMD IN1, H3H, XH1 |
| |
| VXOR XL, XL3, XL |
| VXOR XM, XM3, XM |
| VXOR XH, XH3, XH |
| VPERM IN2, IN3, LOPERM, T0 |
| VPERM IN2, IN3, HIPERM, T1 |
| |
| VPMSUMD XL, XC2, T2 // 1st reduction phase |
| VPMSUMD T0, H21L, XL3 // H.lo·Xi+3.lo +H^2.lo·Xi+2.lo |
| VPMSUMD T1, H21H, XH3 // H.hi·Xi+3.hi +H^2.hi·Xi+2.hi |
| |
| VSLDOI $8, XM, ZERO, T0 |
| VSLDOI $8, ZERO, XM, T1 |
| VXOR XL, T0, XL |
| VXOR XH, T1, XH |
| |
| VSLDOI $8, XL, XL, XL |
| VXOR XL, T2, XL |
| |
| VSLDOI $8, XL, XL, T1 // 2nd reduction phase |
| VPMSUMD IN2, H2, XM2 // H^2.hi·Xi+2.lo+H^2.lo·Xi+2.hi |
| VPMSUMD IN3, H, XM3 // H.hi·Xi+3.lo +H.lo·Xi+3.hi |
| VPMSUMD XL, XC2, XL |
| |
| VXOR XL3, XL1, XL3 |
| VXOR XH3, XH1, XH3 |
| VXOR XH, IN0, XH |
| VXOR XM2, XM1, XM2 |
| VXOR XH, T1, XH |
| VXOR XM3, XM2, XM3 |
| VXOR XH, XL, XH |
| BGE loop_4x |
| |
| tail_4x: |
| VPMSUMD XH, H4L, XL // H^4.lo·Xi.lo |
| VPMSUMD XH, H4, XM // H^4.hi·Xi.lo+H^4.lo·Xi.hi |
| VPMSUMD XH, H4H, XH // H^4.hi·Xi.hi |
| |
| VXOR XL, XL3, XL |
| VXOR XM, XM3, XM |
| |
| VPMSUMD XL, XC2, T2 // 1st reduction phase |
| |
| VSLDOI $8, XM, ZERO, T0 |
| VSLDOI $8, ZERO, XM, T1 |
| VXOR XH, XH3, XH |
| VXOR XL, T0, XL |
| VXOR XH, T1, XH |
| |
| VSLDOI $8, XL, XL, XL |
| VXOR XL, T2, XL |
| |
| VSLDOI $8, XL, XL, T1 // 2nd reduction phase |
| VPMSUMD XL, XC2, XL |
| VXOR T1, XH, T1 |
| VXOR XL, T1, XL |
| |
| ADDCCC $4, LEN, LEN |
| BEQ done_4x |
| |
| LXVD2X (INP)(R0), VIN0 |
| CMPU LEN, $2 |
| MOVD $-4, LEN |
| BLT one |
| LXVD2X (INP)(R8), VIN1 |
| BEQ two |
| |
| three: |
| LXVD2X (INP)(R9), VIN2 |
| #ifdef GOARCH_ppc64le |
| VPERM IN0, IN0, LEMASK, IN0 |
| VPERM IN1, IN1, LEMASK, IN1 |
| VPERM IN2, IN2, LEMASK, IN2 |
| #endif |
| |
| VXOR IN0, XL, XH |
| VOR H3L, H3L, H4L |
| VOR H3, H3, H4 |
| VOR H3H, H3H, H4H |
| |
| VPERM IN1, IN2, LOPERM, T0 |
| VPERM IN1, IN2, HIPERM, T1 |
| VPMSUMD IN1, H2, XM2 // H^2.lo·Xi+1.hi+H^2.hi·Xi+1.lo |
| VPMSUMD IN2, H, XM3 // H.hi·Xi+2.lo +H.lo·Xi+2.hi |
| VPMSUMD T0, H21L, XL3 // H^2.lo·Xi+1.lo+H.lo·Xi+2.lo |
| VPMSUMD T1, H21H, XH3 // H^2.hi·Xi+1.hi+H.hi·Xi+2.hi |
| |
| VXOR XM3, XM2, XM3 |
| JMP tail_4x |
| |
| two: |
| #ifdef GOARCH_ppc64le |
| VPERM IN0, IN0, LEMASK, IN0 |
| VPERM IN1, IN1, LEMASK, IN1 |
| #endif |
| |
VXOR IN0, XL, XH
| VPERM ZERO, IN1, LOPERM, T0 |
| VPERM ZERO, IN1, HIPERM, T1 |
| |
| VSLDOI $8, ZERO, H2, H4L |
| VOR H2, H2, H4 |
| VSLDOI $8, H2, ZERO, H4H |
| |
| VPMSUMD T0, H21L, XL3 // H.lo·Xi+1.lo |
VPMSUMD IN1, H, XM3 // H.hi·Xi+1.lo+H.lo·Xi+1.hi
| VPMSUMD T1, H21H, XH3 // H.hi·Xi+1.hi |
| |
| JMP tail_4x |
| |
| one: |
| #ifdef GOARCH_ppc64le |
| VPERM IN0, IN0, LEMASK, IN0 |
| #endif |
| |
| VSLDOI $8, ZERO, H, H4L |
| VOR H, H, H4 |
| VSLDOI $8, H, ZERO, H4H |
| |
| VXOR IN0, XL, XH |
| VXOR XL3, XL3, XL3 |
| VXOR XM3, XM3, XM3 |
| VXOR XH3, XH3, XH3 |
| |
| JMP tail_4x |
| |
| done_4x: |
| #ifdef GOARCH_ppc64le |
| VPERM XL, XL, LEMASK, XL |
| #endif |
| STXVD2X VXL, (XIP+R0) // write out Xi |
| RET |
| |
| // func gcmMul(output []byte, productTable *[256]byte) |
| TEXT ·gcmMul(SB), NOSPLIT, $0-32 |
| MOVD output+0(FP), XIP |
| MOVD productTable+24(FP), HTBL |
| |
| MOVD $0x10, R8 |
| MOVD $0x20, R9 |
| MOVD $0x30, R10 |
| LXVD2X (XIP)(R0), VIN // load Xi |
| |
| LXVD2X (HTBL)(R8), VHL // Load pre-computed table |
| LXVD2X (HTBL)(R9), VH |
| LXVD2X (HTBL)(R10), VHH |
| LXVD2X (HTBL)(R0), VXC2 |
| #ifdef GOARCH_ppc64le |
| VSPLTISB $0x07, T0 |
| VXOR LEMASK, T0, LEMASK |
| VPERM IN, IN, LEMASK, IN |
| #endif |
| VXOR ZERO, ZERO, ZERO |
| |
| VPMSUMD IN, HL, XL // H.lo·Xi.lo |
| VPMSUMD IN, H, XM // H.hi·Xi.lo+H.lo·Xi.hi |
| VPMSUMD IN, HH, XH // H.hi·Xi.hi |
| |
| VPMSUMD XL, XC2, T2 // 1st reduction phase |
| |
| VSLDOI $8, XM, ZERO, T0 |
| VSLDOI $8, ZERO, XM, T1 |
| VXOR XL, T0, XL |
| VXOR XH, T1, XH |
| |
| VSLDOI $8, XL, XL, XL |
| VXOR XL, T2, XL |
| |
| VSLDOI $8, XL, XL, T1 // 2nd reduction phase |
| VPMSUMD XL, XC2, XL |
| VXOR T1, XH, T1 |
| VXOR XL, T1, XL |
| |
| #ifdef GOARCH_ppc64le |
| VPERM XL, XL, LEMASK, XL |
| #endif |
| STXVD2X VXL, (XIP+R0) // write out Xi |
| RET |
| |
| #define BLK_INP R3 |
| #define BLK_OUT R4 |
| #define BLK_KEY R5 |
| #define KEY_LEN R6 |
| #define BLK_IDX R7 |
| #define IDX R8 |
| #define IN_LEN R9 |
| #define COUNTER R10 |
| #define CONPTR R14 |
| #define MASK V5 |
| |
// Implementation of the counterCrypt function in assembly.
// The original loop is unrolled so that multiple encryption
// streams can run in parallel, achieved by interleaving the
// vcipher instructions from each stream. This is also referred
// to as stitching, and provides significant performance
// improvements. Some macros are defined which enable execution
// for big or little endian as well as different ISA targets.
// func (g *gcmAsm) counterCrypt(out, in []byte, counter *[gcmBlockSize]byte, key [gcmBlockSize]uint32)
// func counterCryptASM(xr, out, in, counter, key)
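// As a rough Go-flavored sketch (names are illustrative), the
// 8-way stitched loop in block128_loop below behaves like:
//
//   for len(in) >= 128 {
//       for s := 0; s < 8; s++ { st[s] = ctr ^ rk[0]; ctr = inc32(ctr) }
//       for r := 1; r < nr; r++ {
//           for s := 0; s < 8; s++ { st[s] = vcipher(st[s], rk[r]) }
//       }
//       for s := 0; s < 8; s++ { out[s] = in[s] ^ vcipherlast(st[s], rk[nr]) }
//   }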
| TEXT ·counterCryptASM(SB), NOSPLIT, $16-72 |
MOVD xr+0(FP), KEY_LEN
| MOVD out+8(FP), BLK_OUT |
| MOVD out_len+16(FP), R8 |
| MOVD in+32(FP), BLK_INP |
| MOVD in_len+40(FP), IN_LEN |
| MOVD counter+56(FP), COUNTER |
| MOVD key+64(FP), BLK_KEY |
| |
| // Set up permute string when needed. |
| #ifdef NEEDS_ESPERM |
| MOVD $·rcon(SB), R14 |
| LVX (R14), ESPERM // Permute value for P8_ macros. |
| #endif |
| SETUP_COUNTER // V30 Counter V31 BE {0, 0, 0, 1} |
LOAD_KEYS(BLK_KEY, KEY_LEN) // VS0 - VS10/12/14 based on key size
| CMP IN_LEN, $128 |
| BLT block64 |
| block128_loop: |
| // Do 8 encryptions in parallel by setting |
| // input values in V15-V22 and executing |
| // vcipher on the updated value and the keys. |
| GEN_VCIPHER_8_INPUTS |
| VCIPHER_8X1_KEY(VS1) |
| VCIPHER_8X1_KEY(VS2) |
| VCIPHER_8X1_KEY(VS3) |
| VCIPHER_8X1_KEY(VS4) |
| VCIPHER_8X1_KEY(VS5) |
| VCIPHER_8X1_KEY(VS6) |
| VCIPHER_8X1_KEY(VS7) |
| VCIPHER_8X1_KEY(VS8) |
| VCIPHER_8X1_KEY(VS9) |
| // Additional encryptions are done based on |
| // the key length, with the last key moved |
| // to V23 for use with VCIPHERLAST. |
| // CR2 = CMP key_len, $12 |
| XXLOR VS10, VS10, V23 |
| BLT CR2, block128_last // key_len = 10 |
| VCIPHER_8X1_KEY(VS10) |
| VCIPHER_8X1_KEY(VS11) |
XXLOR VS12, VS12, V23
BEQ CR2, block128_last // key_len = 12
VCIPHER_8X1_KEY(VS12)
VCIPHER_8X1_KEY(VS13)
XXLOR VS14, VS14, V23 // key_len = 14
| block128_last: |
| // vcipher encryptions are in V15-V22 at this |
| // point with vcipherlast remaining to be done. |
| // Load input block into V1-V8, setting index offsets |
| // in R16-R22 to use with the STORE. |
| LOAD_INPUT_BLOCK128(BLK_INP) |
| // Do VCIPHERLAST on the last key for each encryption |
| // stream and XOR the result with the corresponding |
| // value from the input block. |
| VCIPHERLAST8_XOR_INPUT |
| // Store the results (8*16) and update BLK_OUT by 128. |
| STORE_OUTPUT_BLOCK128(BLK_OUT) |
ADD $-128, IN_LEN // decrement input size by 128
CMP IN_LEN, $128 // check if at least 128 bytes remain
| BGE block128_loop // next input block |
| CMP IN_LEN, $0 |
| BEQ done |
| block64: |
| CMP IN_LEN, $64 // Check if >= 64 |
| BLT block16_loop |
| // Do 4 encryptions in parallel by setting |
| // input values in V15-V18 and executing |
| // vcipher on the updated value and the keys. |
| GEN_VCIPHER_4_INPUTS |
| VCIPHER_4X1_KEY(VS1) |
| VCIPHER_4X1_KEY(VS2) |
| VCIPHER_4X1_KEY(VS3) |
| VCIPHER_4X1_KEY(VS4) |
| VCIPHER_4X1_KEY(VS5) |
| VCIPHER_4X1_KEY(VS6) |
| VCIPHER_4X1_KEY(VS7) |
| VCIPHER_4X1_KEY(VS8) |
| VCIPHER_4X1_KEY(VS9) |
| // Check key length based on CR2 |
| // Move last key to V23 for use with later vcipherlast |
| XXLOR VS10, VS10, V23 |
BLT CR2, block64_last // key_len = 10
VCIPHER_4X1_KEY(VS10) // Encrypt with next 2 keys
VCIPHER_4X1_KEY(VS11)
XXLOR VS12, VS12, V23
BEQ CR2, block64_last // key_len = 12
VCIPHER_4X1_KEY(VS12) // Encrypt with last 2 keys
VCIPHER_4X1_KEY(VS13)
XXLOR VS14, VS14, V23 // key_len = 14
| block64_last: |
| LOAD_INPUT_BLOCK64(BLK_INP) // Load 64 bytes of input |
// Do VCIPHERLAST with the last key for each encryption
| // stream and XOR the result with the corresponding |
| // value from the input block. |
| VCIPHERLAST4_XOR_INPUT |
| // Store the results (4*16) and update BLK_OUT by 64. |
| STORE_OUTPUT_BLOCK64(BLK_OUT) |
| ADD $-64, IN_LEN // decrement input block length |
| CMP IN_LEN, $0 // check for remaining length |
| BEQ done |
| block16_loop: |
| CMP IN_LEN, $16 // More input |
| BLT final_block // If not, then handle partial block |
| // Single encryption, no stitching |
| GEN_VCIPHER_INPUT // Generate input value for single encryption |
| VCIPHER_1X9_KEYS(V15) // Encrypt V15 value with 9 keys |
XXLOR VS10, VS10, V23 // Last key -> V23 for later vcipherlast
| // Key length based on CR2. (LT=10, EQ=12, GT=14) |
| BLT CR2, block16_last // Finish for key size 10 |
| VCIPHER_1X2_KEYS(V15, VS10, VS11) // Encrypt V15 with 2 more keys |
| XXLOR VS12, VS12, V23 // Last key -> V23 for later vcipherlast |
| BEQ CR2, block16_last // Finish for key size 12 |
| VCIPHER_1X2_KEYS(V15, VS12, VS13) // Encrypt V15 with last 2 keys |
| XXLOR VS14, VS14, V23 // Last key -> V23 for vcipherlast with key size 14 |
| block16_last: |
| P8_LXVB16X(BLK_INP, R0, V1) // Load input |
VCIPHERLAST V15, V23, V15 // Final round with last key in V23
| XXLXOR V15, V1, V1 // XOR with input |
| P8_STXVB16X(V1,R0,BLK_OUT) // Store final encryption value to output |
| ADD $16, BLK_INP // Increment input pointer |
| ADD $16, BLK_OUT // Increment output pointer |
| ADD $-16, IN_LEN // Decrement input length |
| BR block16_loop // Check for next |
| final_block: |
| CMP IN_LEN, $0 |
| BEQ done |
| GEN_VCIPHER_INPUT // Generate input value for partial encryption |
| VCIPHER_1X9_KEYS(V15) // Encrypt V15 with 9 keys |
| XXLOR VS10, VS10, V23 // Save possible last key |
| BLT CR2, final_block_last |
| VCIPHER_1X2_KEYS(V15, VS10, VS11) // Encrypt V15 with next 2 keys |
| XXLOR VS12, VS12, V23 // Save possible last key |
| BEQ CR2, final_block_last |
| VCIPHER_1X2_KEYS(V15, VS12, VS13) // Encrypt V15 with last 2 keys |
| XXLOR VS14, VS14, V23 // Save last key |
| final_block_last: |
| VCIPHERLAST V15, V23, V15 // Finish encryption |
| #ifdef GOPPC64_power10 |
// Put the remaining byte count into the top byte of R17;
// LXVLL/STXVLL take the length there.
| SLD $56, IN_LEN, R17 |
| LXVLL BLK_INP, R17, V25 |
| VXOR V25, V15, V25 |
| STXVLL V25, BLK_OUT, R17 |
| #else |
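// Without POWER10's length-controlled vector loads and stores,
// the encrypted counter block is staged on the stack and XORed
// with the remaining input 8, 4, 2, then 1 bytes at a time.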
| ADD $32, R1, MASK_PTR |
| MOVD $0, R16 |
| P8_STXVB16X(V15, MASK_PTR, R0) |
| CMP IN_LEN, $8 |
| BLT next4 |
| MOVD 0(MASK_PTR), R14 |
| MOVD 0(BLK_INP), R15 |
| XOR R14, R15, R14 |
| MOVD R14, 0(BLK_OUT) |
| ADD $8, R16 |
| ADD $-8, IN_LEN |
| next4: |
| CMP IN_LEN, $4 |
| BLT next2 |
| MOVWZ (BLK_INP)(R16), R15 |
| MOVWZ (MASK_PTR)(R16), R14 |
| XOR R14, R15, R14 |
| MOVW R14, (R16)(BLK_OUT) |
| ADD $4, R16 |
| ADD $-4, IN_LEN |
| next2: |
| CMP IN_LEN, $2 |
| BLT next1 |
| MOVHZ (BLK_INP)(R16), R15 |
| MOVHZ (MASK_PTR)(R16), R14 |
| XOR R14, R15, R14 |
| MOVH R14, (R16)(BLK_OUT) |
| ADD $2, R16 |
| ADD $-2, IN_LEN |
| next1: |
| CMP IN_LEN, $1 |
| BLT done |
| MOVBZ (MASK_PTR)(R16), R14 |
| MOVBZ (BLK_INP)(R16), R15 |
| XOR R14, R15, R14 |
| MOVB R14, (R16)(BLK_OUT) |
| #endif |
| done: |
| // Save the updated counter value |
| P8_STXVB16X(V30, COUNTER, R0) |
| // Clear the keys |
| XXLXOR VS0, VS0, VS0 |
| XXLXOR VS1, VS1, VS1 |
| XXLXOR VS2, VS2, VS2 |
| XXLXOR VS3, VS3, VS3 |
| XXLXOR VS4, VS4, VS4 |
| XXLXOR VS5, VS5, VS5 |
| XXLXOR VS6, VS6, VS6 |
| XXLXOR VS7, VS7, VS7 |
| XXLXOR VS8, VS8, VS8 |
| XXLXOR VS9, VS9, VS9 |
| XXLXOR VS10, VS10, VS10 |
| XXLXOR VS11, VS11, VS11 |
| XXLXOR VS12, VS12, VS12 |
| XXLXOR VS13, VS13, VS13 |
| XXLXOR VS14, VS14, VS14 |
| RET |
| |