| // Copyright 2019 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| // Based on CRYPTOGAMS code with the following comment: |
| // # ==================================================================== |
| // # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL |
| // # project. The module is, however, dual licensed under OpenSSL and |
| // # CRYPTOGAMS licenses depending on where you obtain it. For further |
| // # details see http://www.openssl.org/~appro/cryptogams/. |
| // # ==================================================================== |
| |
| // This implementation is based on the ppc64 asm generated by the |
| // script https://github.com/dot-asm/cryptogams/blob/master/ppc/ghashp8-ppc.pl |
| // from commit d47afb3c. |
| |
| // Changes were made due to differences in the ABI and some register usage. |
| // Some arguments were changed due to the way the Go code passes them. |
| |
| #include "textflag.h" |
| |
| #define XIP R3 |
| #define HTBL R4 |
| #define INP R5 |
| #define LEN R6 |
| |
| #define XL V0 |
| #define XM V1 |
| #define XH V2 |
| #define IN V3 |
| #define ZERO V4 |
| #define T0 V5 |
| #define T1 V6 |
| #define T2 V7 |
| #define XC2 V8 |
| #define H V9 |
| #define HH V10 |
| #define HL V11 |
| #define LEMASK V12 |
| #define XL1 V13 |
| #define XM1 V14 |
| #define XH1 V15 |
| #define IN1 V16 |
| #define H2 V17 |
| #define H2H V18 |
| #define H2L V19 |
| #define XL3 V20 |
| #define XM2 V21 |
| #define IN2 V22 |
| #define H3L V23 |
| #define H3 V24 |
| #define H3H V25 |
| #define XH3 V26 |
| #define XM3 V27 |
| #define IN3 V28 |
| #define H4L V29 |
| #define H4 V30 |
| #define H4H V31 |
| |
| #define IN0 IN |
| #define H21L HL |
| #define H21H HH |
| #define LOPERM H2L |
| #define HIPERM H2H |
| |
| #define VXL VS32 |
| #define VIN VS35 |
| #define VXC2 VS40 |
| #define VH VS41 |
| #define VHH VS42 |
| #define VHL VS43 |
| #define VIN1 VS48 |
| #define VH2 VS49 |
| #define VH2H VS50 |
| #define VH2L VS51 |
| |
| #define VIN2 VS54 |
| #define VH3L VS55 |
| #define VH3 VS56 |
| #define VH3H VS57 |
| #define VIN3 VS60 |
| #define VH4L VS61 |
| #define VH4 VS62 |
| #define VH4H VS63 |
| |
| #define VIN0 VIN |
| |
// func gcmInit(productTable *[256]byte, h []byte)
//
// Precomputes the lookup table used by gcmHash and gcmMul from the
// hash key h (H).  H is first "twisted" (shifted left one bit with the
// polynomial carry folded in), then H^2, H^3 and H^4 are derived by
// repeated carry-less multiplication.  Each power is stored as three
// 16-byte entries (lo half, full, hi half), plus the reduction
// constant at offset 0.  Table layout, as written by the STXVD2X
// stores below (byte offsets into productTable):
//
//	0x00 0xc2...01 reduction constant
//	0x10 H.lo    0x20 H      0x30 H.hi
//	0x40 H^2.lo  0x50 H^2    0x60 H^2.hi
//	0x70 H^3.lo  0x80 H^3    0x90 H^3.hi
//	0xa0 H^4.lo  0xb0 H^4    0xc0 H^4.hi
TEXT ·gcmInit(SB), NOSPLIT, $0-32
	MOVD productTable+0(FP), XIP
	MOVD h+8(FP), HTBL

	// R8-R10 hold rolling store offsets; they are re-loaded with the
	// next offsets as each table row is written out.
	MOVD $0x10, R8
	MOVD $0x20, R9
	MOVD $0x30, R10
	LXVD2X (HTBL)(R0), VH // Load H

	// Build the reduction constant 0xc2....01 in XC2 from splat
	// immediates, and compute the twisted H in IN.
	VSPLTISB $-16, XC2         // 0xf0
	VSPLTISB $1, T0            // one
	VADDUBM  XC2, XC2, XC2     // 0xe0
	VXOR     ZERO, ZERO, ZERO
	VOR      XC2, T0, XC2      // 0xe1
	VSLDOI   $15, XC2, ZERO, XC2 // 0xe1...
	VSLDOI   $1, ZERO, T0, T1  // ...1
	VADDUBM  XC2, XC2, XC2     // 0xc2...
	VSPLTISB $7, T2
	VOR      XC2, T1, XC2      // 0xc2....01
	VSPLTB   $0, H, T1         // most significant byte
	VSL      H, T0, H          // H<<=1
	VSRAB    T1, T2, T1        // broadcast carry bit
	VAND     T1, XC2, T1
	VXOR     H, T1, IN         // twisted H

	VSLDOI $8, IN, IN, H      // twist even more ...
	VSLDOI $8, ZERO, XC2, XC2 // 0xc2.0
	VSLDOI $8, ZERO, H, HL    // ... and split
	VSLDOI $8, H, ZERO, HH

	STXVD2X VXC2, (XIP+R0) // save pre-computed table
	STXVD2X VHL, (XIP+R8)
	MOVD    $0x40, R8
	STXVD2X VH, (XIP+R9)
	MOVD    $0x50, R9
	STXVD2X VHH, (XIP+R10)
	MOVD    $0x60, R10

	// H^2 = H * H (carry-less Karatsuba-style multiply: lo, mid, hi).
	VPMSUMD IN, HL, XL // H.lo·H.lo
	VPMSUMD IN, H, XM  // H.hi·H.lo+H.lo·H.hi
	VPMSUMD IN, HH, XH // H.hi·H.hi

	VPMSUMD XL, XC2, T2 // 1st reduction phase

	// Fold the middle term into the low/high halves.
	VSLDOI $8, XM, ZERO, T0
	VSLDOI $8, ZERO, XM, T1
	VXOR   XL, T0, XL
	VXOR   XH, T1, XH

	VSLDOI $8, XL, XL, XL
	VXOR   XL, T2, XL

	VSLDOI  $8, XL, XL, T1 // 2nd reduction phase
	VPMSUMD XL, XC2, XL
	VXOR    T1, XH, T1
	VXOR    XL, T1, IN1 // IN1 = twisted H^2

	VSLDOI $8, IN1, IN1, H2
	VSLDOI $8, ZERO, H2, H2L
	VSLDOI $8, H2, ZERO, H2H

	STXVD2X VH2L, (XIP+R8) // save H^2
	MOVD    $0x70, R8
	STXVD2X VH2, (XIP+R9)
	MOVD    $0x80, R9
	STXVD2X VH2H, (XIP+R10)
	MOVD    $0x90, R10

	// Compute H^3 = H·H^2 and H^4 = H^2·H^2 in parallel:
	// the XL/XM/XH lane carries H^3, the XL1/XM1/XH1 lane H^4.
	VPMSUMD IN, H2L, XL   // H.lo·H^2.lo
	VPMSUMD IN1, H2L, XL1 // H^2.lo·H^2.lo
	VPMSUMD IN, H2, XM    // H.hi·H^2.lo+H.lo·H^2.hi
	VPMSUMD IN1, H2, XM1  // H^2.hi·H^2.lo+H^2.lo·H^2.hi
	VPMSUMD IN, H2H, XH   // H.hi·H^2.hi
	VPMSUMD IN1, H2H, XH1 // H^2.hi·H^2.hi

	VPMSUMD XL, XC2, T2  // 1st reduction phase
	VPMSUMD XL1, XC2, HH // 1st reduction phase

	VSLDOI $8, XM, ZERO, T0
	VSLDOI $8, ZERO, XM, T1
	VSLDOI $8, XM1, ZERO, HL
	VSLDOI $8, ZERO, XM1, H
	VXOR   XL, T0, XL
	VXOR   XH, T1, XH
	VXOR   XL1, HL, XL1
	VXOR   XH1, H, XH1

	VSLDOI $8, XL, XL, XL
	VSLDOI $8, XL1, XL1, XL1
	VXOR   XL, T2, XL
	VXOR   XL1, HH, XL1

	VSLDOI  $8, XL, XL, T1   // 2nd reduction phase
	VSLDOI  $8, XL1, XL1, H  // 2nd reduction phase
	VPMSUMD XL, XC2, XL
	VPMSUMD XL1, XC2, XL1
	VXOR    T1, XH, T1
	VXOR    H, XH1, H
	VXOR    XL, T1, XL
	VXOR    XL1, H, XL1

	// Split H^3 (in H/HL/HH) and H^4 (in H2/H2L/H2H) into halves
	// and write out the remaining six table rows.
	VSLDOI $8, XL, XL, H
	VSLDOI $8, XL1, XL1, H2
	VSLDOI $8, ZERO, H, HL
	VSLDOI $8, H, ZERO, HH
	VSLDOI $8, ZERO, H2, H2L
	VSLDOI $8, H2, ZERO, H2H

	STXVD2X VHL, (XIP+R8) // save H^3
	MOVD    $0xa0, R8
	STXVD2X VH, (XIP+R9)
	MOVD    $0xb0, R9
	STXVD2X VHH, (XIP+R10)
	MOVD    $0xc0, R10
	STXVD2X VH2L, (XIP+R8) // save H^4
	STXVD2X VH2, (XIP+R9)
	STXVD2X VH2H, (XIP+R10)

	RET
| |
// func gcmHash(output []byte, productTable *[256]byte, inp []byte, len int)
//
// Updates the GHASH accumulator Xi (the first 16 bytes of output) over
// len bytes of inp, using the table precomputed by gcmInit:
// Xi = (...((Xi ⊕ inp[0:16])·H ⊕ inp[16:32])·H ...).
//
// Three code paths, chosen by length:
//   - len >= 64: gcm_ghash_p8_4x, four blocks per iteration using
//     H^4..H, with a 1/2/3-block tail.
//   - otherwise: loop_2x processes two blocks per iteration using
//     H^2 and H, and "short" finishes a final single block.
// LEMASK (built from LVSL xor 0x07) is the byte-swap permute applied
// on every load/store so the math is done in big-endian lane order.
TEXT ·gcmHash(SB), NOSPLIT, $0-64
	MOVD output+0(FP), XIP
	MOVD productTable+24(FP), HTBL
	MOVD inp+32(FP), INP
	MOVD len+56(FP), LEN

	MOVD   $0x10, R8
	MOVD   $0x20, R9
	MOVD   $0x30, R10
	LXVD2X (XIP)(R0), VXL // load Xi

	// Load H (lo/full/hi) and the reduction constant, interleaved
	// with building the byte-swap mask LEMASK.
	LXVD2X   (HTBL)(R8), VHL // load pre-computed table
	MOVD     $0x40, R8
	LVSL     (R0)(R0), LEMASK
	LXVD2X   (HTBL)(R9), VH
	MOVD     $0x50, R9
	VSPLTISB $0x07, T0
	LXVD2X   (HTBL)(R10), VHH
	MOVD     $0x60, R10
	VXOR     LEMASK, T0, LEMASK
	LXVD2X   (HTBL)(R0), VXC2
	VPERM    XL, XL, LEMASK, XL
	VXOR     ZERO, ZERO, ZERO

	CMPU LEN, $64
	BGE  gcm_ghash_p8_4x

	// Fewer than 4 blocks: absorb the first block into Xi now.
	LXVD2X (INP)(R0), VIN
	ADD    $16, INP, INP
	SUBCCC $16, LEN, LEN
	VPERM  IN, IN, LEMASK, IN
	VXOR   IN, XL, IN
	BEQ    short // exactly one block

	LXVD2X (HTBL)(R8), VH2L // load H^2
	MOVD   $16, R8
	LXVD2X (HTBL)(R9), VH2
	ADD    LEN, INP, R9 // end of input
	LXVD2X (HTBL)(R10), VH2H

loop_2x:
	// Two blocks per iteration: Xi' = (Xi⊕b0)·H^2 ⊕ b1·H, computed
	// as one combined Karatsuba multiply + reduction.
	LXVD2X (INP)(R0), VIN1
	VPERM  IN1, IN1, LEMASK, IN1

	// If fewer than 32 bytes remain, R11 becomes the (negative)
	// remaining length so INP is stepped back to the last full pair.
	SUBC    $32, LEN, LEN
	VPMSUMD IN, H2L, XL  // H^2.lo·Xi.lo
	VPMSUMD IN1, HL, XL1 // H.lo·Xi+1.lo
	SUBE    R11, R11, R11 // borrow?-1:0
	VPMSUMD IN, H2, XM   // H^2.hi·Xi.lo+H^2.lo·Xi.hi
	VPMSUMD IN1, H, XM1  // H.hi·Xi+1.lo+H.lo·Xi+1.hi
	AND     LEN, R11, R11
	VPMSUMD IN, H2H, XH  // H^2.hi·Xi.hi
	VPMSUMD IN1, HH, XH1 // H.hi·Xi+1.hi
	ADD     R11, INP, INP

	VXOR XL, XL1, XL
	VXOR XM, XM1, XM

	VPMSUMD XL, XC2, T2 // 1st reduction phase

	VSLDOI $8, XM, ZERO, T0
	VSLDOI $8, ZERO, XM, T1
	VXOR   XH, XH1, XH
	VXOR   XL, T0, XL
	VXOR   XH, T1, XH

	VSLDOI $8, XL, XL, XL
	VXOR   XL, T2, XL
	LXVD2X (INP)(R8), VIN // prefetch next block
	ADD    $32, INP, INP

	VSLDOI  $8, XL, XL, T1 // 2nd reduction phase
	VPMSUMD XL, XC2, XL
	VPERM   IN, IN, LEMASK, IN
	VXOR    T1, XH, T1
	VXOR    IN, T1, IN
	VXOR    IN, XL, IN // IN = next block already xored with Xi
	CMP     R9, INP
	BGT     loop_2x // done yet?

	CMPWU LEN, $0
	BNE   even // even number of blocks: IN holds final Xi pieces

short:
	// Final single-block multiply: Xi = IN·H with full reduction.
	VPMSUMD IN, HL, XL // H.lo·Xi.lo
	VPMSUMD IN, H, XM  // H.hi·Xi.lo+H.lo·Xi.hi
	VPMSUMD IN, HH, XH // H.hi·Xi.hi

	VPMSUMD XL, XC2, T2 // 1st reduction phase

	VSLDOI $8, XM, ZERO, T0
	VSLDOI $8, ZERO, XM, T1
	VXOR   XL, T0, XL
	VXOR   XH, T1, XH

	VSLDOI $8, XL, XL, XL
	VXOR   XL, T2, XL

	VSLDOI  $8, XL, XL, T1 // 2nd reduction phase
	VPMSUMD XL, XC2, XL
	VXOR    T1, XH, T1

even:
	VXOR    XL, T1, XL
	VPERM   XL, XL, LEMASK, XL
	STXVD2X VXL, (XIP+R0)

	// NOTE(review): OR Rx,Rx,Rx is a no-op; presumably kept from the
	// CRYPTOGAMS original (priority/scheduling hint) — confirm.
	OR R12, R12, R12 // write out Xi
	RET

gcm_ghash_p8_4x:
	// Four-block path: load H^2..H^4 and build HIPERM/LOPERM, the
	// permutes that pack the hi/lo doublewords of two keys (or two
	// input blocks) into one register for combined VPMSUMD multiplies.
	LVSL     (R8)(R0), T0 // 0x0001..0e0f
	MOVD     $0x70, R8
	LXVD2X   (HTBL)(R9), VH2
	MOVD     $0x80, R9
	VSPLTISB $8, T1 // 0x0808..0808
	MOVD     $0x90, R10
	LXVD2X   (HTBL)(R8), VH3L // load H^3
	MOVD     $0xa0, R8
	LXVD2X   (HTBL)(R9), VH3
	MOVD     $0xb0, R9
	LXVD2X   (HTBL)(R10), VH3H
	MOVD     $0xc0, R10
	LXVD2X   (HTBL)(R8), VH4L // load H^4
	MOVD     $0x10, R8
	LXVD2X   (HTBL)(R9), VH4
	MOVD     $0x20, R9
	LXVD2X   (HTBL)(R10), VH4H
	MOVD     $0x30, R10

	VSLDOI  $8, ZERO, T1, T2   // 0x0000..0808
	VADDUBM T0, T2, HIPERM     // 0x0001..1617
	VADDUBM T1, HIPERM, LOPERM // 0x0809..1e1f

	SRD $4, LEN, LEN // this allows to use sign bit as carry

	// Load and byte-swap the first group of four blocks; start the
	// multiplies for blocks 1-3 (block 0 is folded into Xi as XH).
	LXVD2X (INP)(R0), VIN0 // load input
	LXVD2X (INP)(R8), VIN1
	SUBCCC $8, LEN, LEN
	LXVD2X (INP)(R9), VIN2
	LXVD2X (INP)(R10), VIN3
	ADD    $0x40, INP, INP
	VPERM  IN0, IN0, LEMASK, IN0
	VPERM  IN1, IN1, LEMASK, IN1
	VPERM  IN2, IN2, LEMASK, IN2
	VPERM  IN3, IN3, LEMASK, IN3

	VXOR IN0, XL, XH

	VPMSUMD IN1, H3L, XL1
	VPMSUMD IN1, H3, XM1
	VPMSUMD IN1, H3H, XH1

	// H21L/H21H pack H^2 with H so blocks 2 and 3 multiply together.
	VPERM   H2, H, HIPERM, H21L
	VPERM   IN2, IN3, LOPERM, T0
	VPERM   H2, H, LOPERM, H21H
	VPERM   IN2, IN3, HIPERM, T1
	VPMSUMD IN2, H2, XM2  // H^2.lo·Xi+2.hi+H^2.hi·Xi+2.lo
	VPMSUMD T0, H21L, XL3 // H^2.lo·Xi+2.lo+H.lo·Xi+3.lo
	VPMSUMD IN3, H, XM3   // H.hi·Xi+3.lo +H.lo·Xi+3.hi
	VPMSUMD T1, H21H, XH3 // H^2.hi·Xi+2.hi+H.hi·Xi+3.hi

	VXOR XM2, XM1, XM2
	VXOR XL3, XL1, XL3
	VXOR XM3, XM2, XM3
	VXOR XH3, XH1, XH3

	BLT tail_4x // fewer than 8 blocks total

loop_4x:
	// Steady state: combine the previous group's partial products
	// ((Xi⊕b0)·H^4 ⊕ b1·H^3 ⊕ b2·H^2 ⊕ b3·H), reduce, and overlap
	// with loading + starting the next group of four blocks.
	LXVD2X (INP)(R0), VIN0
	LXVD2X (INP)(R8), VIN1
	SUBCCC $4, LEN, LEN
	LXVD2X (INP)(R9), VIN2
	LXVD2X (INP)(R10), VIN3
	ADD    $0x40, INP, INP
	VPERM  IN1, IN1, LEMASK, IN1
	VPERM  IN2, IN2, LEMASK, IN2
	VPERM  IN3, IN3, LEMASK, IN3
	VPERM  IN0, IN0, LEMASK, IN0

	VPMSUMD XH, H4L, XL // H^4.lo·Xi.lo
	VPMSUMD XH, H4, XM  // H^4.hi·Xi.lo+H^4.lo·Xi.hi
	VPMSUMD XH, H4H, XH // H^4.hi·Xi.hi
	VPMSUMD IN1, H3L, XL1
	VPMSUMD IN1, H3, XM1
	VPMSUMD IN1, H3H, XH1

	VXOR  XL, XL3, XL
	VXOR  XM, XM3, XM
	VXOR  XH, XH3, XH
	VPERM IN2, IN3, LOPERM, T0
	VPERM IN2, IN3, HIPERM, T1

	VPMSUMD XL, XC2, T2   // 1st reduction phase
	VPMSUMD T0, H21L, XL3 // H.lo·Xi+3.lo +H^2.lo·Xi+2.lo
	VPMSUMD T1, H21H, XH3 // H.hi·Xi+3.hi +H^2.hi·Xi+2.hi

	VSLDOI $8, XM, ZERO, T0
	VSLDOI $8, ZERO, XM, T1
	VXOR   XL, T0, XL
	VXOR   XH, T1, XH

	VSLDOI $8, XL, XL, XL
	VXOR   XL, T2, XL

	VSLDOI  $8, XL, XL, T1 // 2nd reduction phase
	VPMSUMD IN2, H2, XM2   // H^2.hi·Xi+2.lo+H^2.lo·Xi+2.hi
	VPMSUMD IN3, H, XM3    // H.hi·Xi+3.lo +H.lo·Xi+3.hi
	VPMSUMD XL, XC2, XL

	VXOR XL3, XL1, XL3
	VXOR XH3, XH1, XH3
	VXOR XH, IN0, XH // fold new block 0 into the accumulator
	VXOR XM2, XM1, XM2
	VXOR XH, T1, XH
	VXOR XM3, XM2, XM3
	VXOR XH, XL, XH
	BGE  loop_4x

tail_4x:
	// Finish the last in-flight group: multiply the accumulator by
	// H^4 (or the lower power substituted by one/two/three below),
	// combine with the pending partials and do the final reduction.
	VPMSUMD XH, H4L, XL // H^4.lo·Xi.lo
	VPMSUMD XH, H4, XM  // H^4.hi·Xi.lo+H^4.lo·Xi.hi
	VPMSUMD XH, H4H, XH // H^4.hi·Xi.hi

	VXOR XL, XL3, XL
	VXOR XM, XM3, XM

	VPMSUMD XL, XC2, T2 // 1st reduction phase

	VSLDOI $8, XM, ZERO, T0
	VSLDOI $8, ZERO, XM, T1
	VXOR   XH, XH3, XH
	VXOR   XL, T0, XL
	VXOR   XH, T1, XH

	VSLDOI $8, XL, XL, XL
	VXOR   XL, T2, XL

	VSLDOI  $8, XL, XL, T1 // 2nd reduction phase
	VPMSUMD XL, XC2, XL
	VXOR    T1, XH, T1
	VXOR    XL, T1, XL

	ADDCCC $4, LEN, LEN
	BEQ    done_4x

	// 1-3 blocks remain; LEN = -4 marks the next tail_4x pass final.
	LXVD2X (INP)(R0), VIN0
	CMPU   LEN, $2
	MOVD   $-4, LEN
	BLT    one
	LXVD2X (INP)(R8), VIN1
	BEQ    two

three:
	// Three blocks left: use H^3..H by aliasing H^4 registers to H^3.
	LXVD2X (INP)(R9), VIN2
	VPERM  IN0, IN0, LEMASK, IN0
	VPERM  IN1, IN1, LEMASK, IN1
	VPERM  IN2, IN2, LEMASK, IN2

	VXOR IN0, XL, XH
	VOR  H3L, H3L, H4L
	VOR  H3, H3, H4
	VOR  H3H, H3H, H4H

	VPERM   IN1, IN2, LOPERM, T0
	VPERM   IN1, IN2, HIPERM, T1
	VPMSUMD IN1, H2, XM2  // H^2.lo·Xi+1.hi+H^2.hi·Xi+1.lo
	VPMSUMD IN2, H, XM3   // H.hi·Xi+2.lo +H.lo·Xi+2.hi
	VPMSUMD T0, H21L, XL3 // H^2.lo·Xi+1.lo+H.lo·Xi+2.lo
	VPMSUMD T1, H21H, XH3 // H^2.hi·Xi+1.hi+H.hi·Xi+2.hi

	VXOR XM3, XM2, XM3
	JMP  tail_4x

two:
	// Two blocks left: substitute H^2 for H^4; block 1 pairs with
	// ZERO through the pack permutes so only the H lane contributes.
	VPERM IN0, IN0, LEMASK, IN0
	VPERM IN1, IN1, LEMASK, IN1

	VXOR  IN, XL, XH // IN aliases IN0
	VPERM ZERO, IN1, LOPERM, T0
	VPERM ZERO, IN1, HIPERM, T1

	VSLDOI $8, ZERO, H2, H4L
	VOR    H2, H2, H4
	VSLDOI $8, H2, ZERO, H4H

	VPMSUMD T0, H21L, XL3 // H.lo·Xi+1.lo
	VPMSUMD IN1, H, XM3   // H.hi·Xi+1.lo+H.lo·Xi+2.hi
	VPMSUMD T1, H21H, XH3 // H.hi·Xi+1.hi

	JMP tail_4x

one:
	// One block left: substitute H for H^4 and zero the partials.
	VPERM IN0, IN0, LEMASK, IN0

	VSLDOI $8, ZERO, H, H4L
	VOR    H, H, H4
	VSLDOI $8, H, ZERO, H4H

	VXOR IN0, XL, XH
	VXOR XL3, XL3, XL3
	VXOR XM3, XM3, XM3
	VXOR XH3, XH3, XH3

	JMP tail_4x

done_4x:
	VPERM   XL, XL, LEMASK, XL
	STXVD2X VXL, (XIP+R0) // write out Xi
	RET
| |
// func gcmMul(output []byte, productTable *[256]byte)
//
// Performs a single GHASH multiplication in place:
// Xi = Xi · H, where Xi is the first 16 bytes of output and H is
// read (as lo/full/hi halves) from the table built by gcmInit.
// Same single-block Karatsuba multiply + two-phase reduction as the
// "short" path of gcmHash, without absorbing any input data.
TEXT ·gcmMul(SB), NOSPLIT, $0-32
	MOVD output+0(FP), XIP
	MOVD productTable+24(FP), HTBL

	MOVD   $0x10, R8
	MOVD   $0x20, R9
	MOVD   $0x30, R10
	LXVD2X (XIP)(R0), VIN // load Xi

	// Load H halves and the reduction constant; build the LEMASK
	// byte-swap permute and bring Xi into big-endian lane order.
	LXVD2X   (HTBL)(R8), VHL // Load pre-computed table
	LVSL     (R0)(R0), LEMASK
	LXVD2X   (HTBL)(R9), VH
	VSPLTISB $0x07, T0
	LXVD2X   (HTBL)(R10), VHH
	VXOR     LEMASK, T0, LEMASK
	LXVD2X   (HTBL)(R0), VXC2
	VPERM    IN, IN, LEMASK, IN
	VXOR     ZERO, ZERO, ZERO

	// 128-bit carry-less multiply, split into lo/mid/hi products.
	VPMSUMD IN, HL, XL // H.lo·Xi.lo
	VPMSUMD IN, H, XM  // H.hi·Xi.lo+H.lo·Xi.hi
	VPMSUMD IN, HH, XH // H.hi·Xi.hi

	VPMSUMD XL, XC2, T2 // 1st reduction phase

	// Fold the middle product into the low and high halves.
	VSLDOI $8, XM, ZERO, T0
	VSLDOI $8, ZERO, XM, T1
	VXOR   XL, T0, XL
	VXOR   XH, T1, XH

	VSLDOI $8, XL, XL, XL
	VXOR   XL, T2, XL

	VSLDOI  $8, XL, XL, T1 // 2nd reduction phase
	VPMSUMD XL, XC2, XL
	VXOR    T1, XH, T1
	VXOR    XL, T1, XL

	VPERM   XL, XL, LEMASK, XL
	STXVD2X VXL, (XIP+R0) // write out Xi
	RET