| // Copyright 2016 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| #include "textflag.h" |
| #include "go_asm.h" |
| |
| |
| // Read-only constant pools shared by the routines in this file. |
| // |
| // p256ordK0: 32-bit constant that p256OrdMul loads into K0 and multiplies with |
| // the low product words to derive the per-round factor MK0. |
| // NOTE(review): presumably -ord^-1 mod 2^32 -- confirm against the Go side. |
| DATA p256ordK0<>+0x00(SB)/4, $0xee00bc4f |
| // p256ord: 256-bit modulus used by p256OrdMul (bytes 0-15 -> M1, bytes 16-31 -> M0). |
| DATA p256ord<>+0x00(SB)/8, $0xffffffff00000000 |
| DATA p256ord<>+0x08(SB)/8, $0xffffffffffffffff |
| DATA p256ord<>+0x10(SB)/8, $0xbce6faada7179e84 |
| DATA p256ord<>+0x18(SB)/8, $0xf3b9cac2fc632551 |
| // p256: the P256 prime (bytes 0-31) followed by three 16-byte VPERM selectors. |
| // p256FromMont loads offsets 0, 16, 48 and 64 from this pool. |
| DATA p256<>+0x00(SB)/8, $0xffffffff00000001 // P256 |
| DATA p256<>+0x08(SB)/8, $0x0000000000000000 // P256 |
| DATA p256<>+0x10(SB)/8, $0x00000000ffffffff // P256 |
| DATA p256<>+0x18(SB)/8, $0xffffffffffffffff // P256 |
| DATA p256<>+0x20(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0 |
| DATA p256<>+0x28(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0 |
| DATA p256<>+0x30(SB)/8, $0x0000000010111213 // SEL 0 d1 d0 0 |
| DATA p256<>+0x38(SB)/8, $0x1415161700000000 // SEL 0 d1 d0 0 |
| DATA p256<>+0x40(SB)/8, $0x18191a1b1c1d1e1f // SEL d1 d0 d1 d0 |
| DATA p256<>+0x48(SB)/8, $0x18191a1b1c1d1e1f // SEL d1 d0 d1 d0 |
| // p256mul: the P256 prime (bytes 0-31), six 16-byte VPERM selectors |
| // (bytes 32-127) and the constant (1*2^256)%P256 (bytes 128-159). |
| // p256NegCond loads the prime from offsets 0 and 16 of this pool. |
| // NOTE(review): the selector offsets 32..112 match the CPOOL-relative loads in |
| // p256MulInternalVX; the code that points CPOOL here is outside this view. |
| DATA p256mul<>+0x00(SB)/8, $0xffffffff00000001 // P256 |
| DATA p256mul<>+0x08(SB)/8, $0x0000000000000000 // P256 |
| DATA p256mul<>+0x10(SB)/8, $0x00000000ffffffff // P256 |
| DATA p256mul<>+0x18(SB)/8, $0xffffffffffffffff // P256 |
| DATA p256mul<>+0x20(SB)/8, $0x1c1d1e1f00000000 // SEL d0 0 0 d0 |
| DATA p256mul<>+0x28(SB)/8, $0x000000001c1d1e1f // SEL d0 0 0 d0 |
| DATA p256mul<>+0x30(SB)/8, $0x0001020304050607 // SEL d0 0 d1 d0 |
| DATA p256mul<>+0x38(SB)/8, $0x1c1d1e1f0c0d0e0f // SEL d0 0 d1 d0 |
| DATA p256mul<>+0x40(SB)/8, $0x040506071c1d1e1f // SEL 0 d1 d0 d1 |
| DATA p256mul<>+0x48(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL 0 d1 d0 d1 |
| DATA p256mul<>+0x50(SB)/8, $0x0405060704050607 // SEL 0 0 d1 d0 |
| DATA p256mul<>+0x58(SB)/8, $0x1c1d1e1f0c0d0e0f // SEL 0 0 d1 d0 |
| DATA p256mul<>+0x60(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0 |
| DATA p256mul<>+0x68(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0 |
| DATA p256mul<>+0x70(SB)/8, $0x141516170c0d0e0f // SEL 0 d1 d0 0 |
| DATA p256mul<>+0x78(SB)/8, $0x1c1d1e1f14151617 // SEL 0 d1 d0 0 |
| DATA p256mul<>+0x80(SB)/8, $0x00000000fffffffe // (1*2^256)%P256 |
| DATA p256mul<>+0x88(SB)/8, $0xffffffffffffffff // (1*2^256)%P256 |
| DATA p256mul<>+0x90(SB)/8, $0xffffffff00000000 // (1*2^256)%P256 |
| DATA p256mul<>+0x98(SB)/8, $0x0000000000000001 // (1*2^256)%P256 |
| // RODATA comes from textflag.h and replaces the bare flag value 8. |
| GLOBL p256ordK0<>(SB), RODATA, $4 |
| GLOBL p256ord<>(SB), RODATA, $32 |
| GLOBL p256<>(SB), RODATA, $80 |
| GLOBL p256mul<>(SB), RODATA, $160 |
| |
| // p256vmsl: eight 16-byte byte-index vectors (128 bytes). p256MulInternalVMSL |
| // loads them from offsets 0, 16, 32, 48 (VL) and 64 onwards (VLM into V16-V19) |
| // and passes them as the selector operand of VPERM. |
| DATA p256vmsl<>+0x0(SB)/8, $0x0012131415161718 |
| DATA p256vmsl<>+0x8(SB)/8, $0x00191a1b1c1d1e1f |
| DATA p256vmsl<>+0x10(SB)/8, $0x0012131415161718 |
| DATA p256vmsl<>+0x18(SB)/8, $0x000b0c0d0e0f1011 |
| DATA p256vmsl<>+0x20(SB)/8, $0x00191a1b1c1d1e1f |
| DATA p256vmsl<>+0x28(SB)/8, $0x0012131415161718 |
| DATA p256vmsl<>+0x30(SB)/8, $0x000b0c0d0e0f1011 |
| DATA p256vmsl<>+0x38(SB)/8, $0x0012131415161718 |
| DATA p256vmsl<>+0x40(SB)/8, $0x000405060708090a |
| DATA p256vmsl<>+0x48(SB)/8, $0x000b0c0d0e0f1011 |
| DATA p256vmsl<>+0x50(SB)/8, $0x000b0c0d0e0f1011 |
| DATA p256vmsl<>+0x58(SB)/8, $0x000405060708090a |
| DATA p256vmsl<>+0x60(SB)/8, $0x1010101000010203 |
| DATA p256vmsl<>+0x68(SB)/8, $0x100405060708090a |
| DATA p256vmsl<>+0x70(SB)/8, $0x100405060708090a |
| DATA p256vmsl<>+0x78(SB)/8, $0x1010101000010203 |
| // RODATA comes from textflag.h and replaces the bare flag value 8. |
| GLOBL p256vmsl<>(SB), RODATA, $128 |
| |
| // --------------------------------------- |
| // func p256NegCond(val *p256Point, cond int) |
| // |
| // Conditional negation of the Y coordinate of val (bytes 32-63): |
| // cond == 0 -> val is written back unchanged |
| // cond != 0 -> Y is replaced by P - Y (val <- -val) |
| // P - Y is always computed and the result is merged with VSEL, so the |
| // routine contains no branch on cond. |
| #define VALptr R1 |
| #define CONSTS R4 |
| |
| #define YLO V0 |
| #define YHI V1 |
| #define NEGLO V2 |
| #define NEGHI V3 |
| |
| #define PLO V30 |
| #define PHI V31 |
| |
| #define ZERO V4 |
| #define KEEP V5 |
| #define BORROW V6 |
| TEXT ·p256NegCond(SB), NOSPLIT, $0 |
| MOVD val+0(FP), VALptr |
| MOVD $p256mul<>+0x00(SB), CONSTS |
| |
| // KEEP = all ones when cond == 0, all zeros otherwise. |
| VZERO ZERO |
| VLREPG cond+8(FP), KEEP |
| VCEQG KEEP, ZERO, KEEP |
| |
| // The prime sits at the start of the pool: high half first, low half second. |
| VL 0(CONSTS), PHI |
| VL 16(CONSTS), PLO |
| |
| VL 48(VALptr), YLO |
| VL 32(VALptr), YHI |
| |
| // NEGHI||NEGLO = P - Y, borrow propagated from the low to the high half. |
| VSCBIQ YLO, PLO, BORROW |
| VSQ YLO, PLO, NEGLO |
| VSBIQ PHI, YHI, BORROW, NEGHI |
| |
| // Keep Y where KEEP is set, otherwise take the negated value. |
| VSEL YHI, NEGHI, KEEP, YHI |
| VSEL YLO, NEGLO, KEEP, YLO |
| |
| VST YLO, 48(VALptr) |
| VST YHI, 32(VALptr) |
| RET |
| |
| #undef VALptr |
| #undef CONSTS |
| #undef YLO |
| #undef YHI |
| #undef NEGLO |
| #undef NEGHI |
| #undef PLO |
| #undef PHI |
| #undef ZERO |
| #undef KEEP |
| #undef BORROW |
| |
| // --------------------------------------- |
| // func p256MovCond(res, a, b *p256Point, cond int) |
| // |
| // Conditional 96-byte copy without a branch on cond: |
| // cond == 0 -> res <- b |
| // cond != 0 -> res <- a |
| // Both source points are fully loaded before anything is stored to res. |
| #define DSTptr R1 |
| #define SRCAptr R2 |
| #define SRCBptr R3 |
| |
| #define AXL V0 |
| #define AXH V1 |
| #define AYL V2 |
| #define AYH V3 |
| #define AZL V4 |
| #define AZH V5 |
| #define BXL V6 |
| #define BXH V7 |
| #define BYL V8 |
| #define BYH V9 |
| #define BZL V10 |
| #define BZH V11 |
| |
| #define ZERO V18 |
| #define TAKEB V19 |
| TEXT ·p256MovCond(SB), NOSPLIT, $0 |
| MOVD res+0(FP), DSTptr |
| MOVD a+8(FP), SRCAptr |
| MOVD b+16(FP), SRCBptr |
| |
| // TAKEB = all ones when cond == 0, all zeros otherwise. |
| VZERO ZERO |
| VLREPG cond+24(FP), TAKEB |
| VCEQG TAKEB, ZERO, TAKEB |
| |
| // Load point b. |
| VL 0(SRCBptr), BXH |
| VL 16(SRCBptr), BXL |
| VL 32(SRCBptr), BYH |
| VL 48(SRCBptr), BYL |
| VL 64(SRCBptr), BZH |
| VL 80(SRCBptr), BZL |
| |
| // Load point a. |
| VL 0(SRCAptr), AXH |
| VL 16(SRCAptr), AXL |
| VL 32(SRCAptr), AYH |
| VL 48(SRCAptr), AYL |
| VL 64(SRCAptr), AZH |
| VL 80(SRCAptr), AZL |
| |
| // Where TAKEB is set pick b, elsewhere keep a; the result lands in the A registers. |
| VSEL BXH, AXH, TAKEB, AXH |
| VSEL BXL, AXL, TAKEB, AXL |
| VSEL BYH, AYH, TAKEB, AYH |
| VSEL BYL, AYL, TAKEB, AYL |
| VSEL BZH, AZH, TAKEB, AZH |
| VSEL BZL, AZL, TAKEB, AZL |
| |
| VST AXH, 0(DSTptr) |
| VST AXL, 16(DSTptr) |
| VST AYH, 32(DSTptr) |
| VST AYL, 48(DSTptr) |
| VST AZH, 64(DSTptr) |
| VST AZL, 80(DSTptr) |
| |
| RET |
| |
| #undef DSTptr |
| #undef SRCAptr |
| #undef SRCBptr |
| #undef AXL |
| #undef AXH |
| #undef AYL |
| #undef AYH |
| #undef AZL |
| #undef AZH |
| #undef BXL |
| #undef BXH |
| #undef BYL |
| #undef BYH |
| #undef BZL |
| #undef BZH |
| #undef ZERO |
| #undef TAKEB |
| |
| // --------------------------------------- |
| // func p256Select(point *p256Point, table []p256Point, idx int) |
| // |
| // Constant time table access. |
| // Every call reads all 16 table entries (96 bytes each). The entry at |
| // 1-based position k (k = 1..16, i.e. table offset (k-1)*96) is kept when |
| // idx == k. If no position matches, point receives the all-zero initial |
| // value (index 0 is implicitly point at infinity). |
| // Only the last byte of the 8-byte idx argument is loaded; it is replicated |
| // into every byte of WANT and compared against a byte-replicated counter. |
| #define OUTptr R1 |
| #define TABptr R2 |
| #define ITER R4 |
| |
| #define ACCXL V0 |
| #define ACCXH V1 |
| #define ACCYL V2 |
| #define ACCYH V3 |
| #define ACCZL V4 |
| #define ACCZH V5 |
| #define ENTXL V6 |
| #define ENTXH V7 |
| #define ENTYL V8 |
| #define ENTYH V9 |
| #define ENTZL V10 |
| #define ENTZH V11 |
| |
| #define STEP V18 |
| #define WANT V19 |
| #define HIT V20 |
| #define CUR V21 |
| TEXT ·p256Select(SB), NOSPLIT, $0 |
| MOVD point+0(FP), OUTptr |
| MOVD table+8(FP), TABptr |
| MOVD $1, ITER |
| VREPIB $1, CUR |
| VREPIB $1, STEP |
| VLREPB idx+(32+7)(FP), WANT |
| |
| // Start from the all-zero point. |
| VZERO ACCXL |
| VZERO ACCXH |
| VZERO ACCYL |
| VZERO ACCYH |
| VZERO ACCZL |
| VZERO ACCZH |
| |
| select_next: |
| // HIT = all ones when the current position equals the requested index. |
| VCEQG CUR, WANT, HIT |
| VAB CUR, STEP, CUR |
| |
| VL 0(TABptr), ENTXH |
| VL 16(TABptr), ENTXL |
| VL 32(TABptr), ENTYH |
| VL 48(TABptr), ENTYL |
| VL 64(TABptr), ENTZH |
| VL 80(TABptr), ENTZL |
| ADD $96, TABptr |
| |
| // Take the entry on a hit, otherwise keep what has been accumulated. |
| VSEL ENTXH, ACCXH, HIT, ACCXH |
| VSEL ENTXL, ACCXL, HIT, ACCXL |
| VSEL ENTYH, ACCYH, HIT, ACCYH |
| VSEL ENTYL, ACCYL, HIT, ACCYL |
| VSEL ENTZH, ACCZH, HIT, ACCZH |
| VSEL ENTZL, ACCZL, HIT, ACCZL |
| |
| ADDW $1, ITER |
| CMPW ITER, $17 |
| BLT select_next |
| |
| VST ACCXH, 0(OUTptr) |
| VST ACCXL, 16(OUTptr) |
| VST ACCYH, 32(OUTptr) |
| VST ACCYL, 48(OUTptr) |
| VST ACCZH, 64(OUTptr) |
| VST ACCZL, 80(OUTptr) |
| RET |
| |
| #undef OUTptr |
| #undef TABptr |
| #undef ITER |
| #undef ACCXL |
| #undef ACCXH |
| #undef ACCYL |
| #undef ACCYH |
| #undef ACCZL |
| #undef ACCZH |
| #undef ENTXL |
| #undef ENTXH |
| #undef ENTYL |
| #undef ENTYH |
| #undef ENTZL |
| #undef ENTZH |
| #undef STEP |
| #undef WANT |
| #undef HIT |
| #undef CUR |
| |
| // --------------------------------------- |
| // func p256SelectBase(point *p256Point, table []p256Point, idx int) |
| // |
| // Constant time table access. |
| // Every call reads all 64 table entries (96 bytes each). The entry at |
| // 1-based position k (k = 1..64, i.e. table offset (k-1)*96) is kept when |
| // idx == k. If no position matches, point receives the all-zero initial |
| // value (index 0 is implicitly point at infinity). |
| // Only the last byte of the 8-byte idx argument is loaded; it is replicated |
| // into every byte of WANT and compared against a byte-replicated counter. |
| #define OUTptr R1 |
| #define TABptr R2 |
| #define ITER R4 |
| |
| #define ACCXL V0 |
| #define ACCXH V1 |
| #define ACCYL V2 |
| #define ACCYH V3 |
| #define ACCZL V4 |
| #define ACCZH V5 |
| #define ENTXL V6 |
| #define ENTXH V7 |
| #define ENTYL V8 |
| #define ENTYH V9 |
| #define ENTZL V10 |
| #define ENTZH V11 |
| |
| #define STEP V18 |
| #define WANT V19 |
| #define HIT V20 |
| #define CUR V21 |
| TEXT ·p256SelectBase(SB), NOSPLIT, $0 |
| MOVD point+0(FP), OUTptr |
| MOVD table+8(FP), TABptr |
| MOVD $1, ITER |
| VREPIB $1, CUR |
| VREPIB $1, STEP |
| VLREPB idx+(32+7)(FP), WANT |
| |
| // Start from the all-zero point. |
| VZERO ACCXL |
| VZERO ACCXH |
| VZERO ACCYL |
| VZERO ACCYH |
| VZERO ACCZL |
| VZERO ACCZH |
| |
| select_next: |
| // HIT = all ones when the current position equals the requested index. |
| VCEQG CUR, WANT, HIT |
| VAB CUR, STEP, CUR |
| |
| VL 0(TABptr), ENTXH |
| VL 16(TABptr), ENTXL |
| VL 32(TABptr), ENTYH |
| VL 48(TABptr), ENTYL |
| VL 64(TABptr), ENTZH |
| VL 80(TABptr), ENTZL |
| ADD $96, TABptr |
| |
| // Take the entry on a hit, otherwise keep what has been accumulated. |
| VSEL ENTXH, ACCXH, HIT, ACCXH |
| VSEL ENTXL, ACCXL, HIT, ACCXL |
| VSEL ENTYH, ACCYH, HIT, ACCYH |
| VSEL ENTYL, ACCYL, HIT, ACCYL |
| VSEL ENTZH, ACCZH, HIT, ACCZH |
| VSEL ENTZL, ACCZL, HIT, ACCZL |
| |
| ADDW $1, ITER |
| CMPW ITER, $65 |
| BLT select_next |
| |
| VST ACCXH, 0(OUTptr) |
| VST ACCXL, 16(OUTptr) |
| VST ACCYH, 32(OUTptr) |
| VST ACCYL, 48(OUTptr) |
| VST ACCZH, 64(OUTptr) |
| VST ACCZL, 80(OUTptr) |
| RET |
| |
| #undef OUTptr |
| #undef TABptr |
| #undef ITER |
| #undef ACCXL |
| #undef ACCXH |
| #undef ACCYL |
| #undef ACCYH |
| #undef ACCZL |
| #undef ACCZH |
| #undef ENTXL |
| #undef ENTXH |
| #undef ENTYL |
| #undef ENTYH |
| #undef ENTZL |
| #undef ENTZH |
| #undef STEP |
| #undef WANT |
| #undef HIT |
| #undef CUR |
| |
| // --------------------------------------- |
| // func p256FromMont(res, in []byte) |
| // |
| // Reads 32 bytes from in, runs four identical reduction rounds over them, |
| // performs one conditional subtraction of the P256 prime and stores 32 bytes |
| // to res. |
| // NOTE(review): by its name this converts a field element out of Montgomery |
| // form -- confirm against the Go caller. |
| // |
| // Each []byte argument occupies 24 bytes of the argument frame, hence the |
| // data pointer of in is read from in+24(FP). Bytes 0-15 of a buffer are |
| // treated as the high half (T1) and bytes 16-31 as the low half (T0); T2 |
| // collects the carries out of T1. |
| #define res_ptr R1 |
| #define x_ptr R2 |
| #define CPOOL R4 |
| |
| #define T0 V0 |
| #define T1 V1 |
| #define T2 V2 |
| #define TT0 V3 |
| #define TT1 V4 |
| |
| #define ZER V6 |
| #define SEL1 V7 |
| #define SEL2 V8 |
| #define CAR1 V9 |
| #define CAR2 V10 |
| #define RED1 V11 |
| #define RED2 V12 |
| #define PL V13 |
| #define PH V14 |
| |
| TEXT ·p256FromMont(SB), NOSPLIT, $0 |
| MOVD res+0(FP), res_ptr |
| MOVD in+24(FP), x_ptr |
| |
| VZERO T2 |
| VZERO ZER |
| // Constants from p256<>: the prime (PH = bytes 0-15, PL = bytes 16-31) and |
| // the two VPERM selectors at offsets 48 and 64. |
| MOVD $p256<>+0x00(SB), CPOOL |
| VL 16(CPOOL), PL |
| VL 0(CPOOL), PH |
| VL 48(CPOOL), SEL2 |
| VL 64(CPOOL), SEL1 |
| |
| VL (1*16)(x_ptr), T0 |
| VL (0*16)(x_ptr), T1 |
| |
| // Every round below is the same sequence: |
| // 1. RED2 replicates the lowest 8 bytes of T0 (words d1 d0), RED1 places |
| // them in the middle of a zero vector, and RED2 <- RED2 - RED1. |
| // 2. T2||T1||T0 is moved down by 8 bytes with VSLDB. |
| // 3. RED2||RED1 is added to T1||T0; the carry out of T1 is added to T2. |
| // First round |
| VPERM T1, T0, SEL1, RED2 // d1 d0 d1 d0 |
| VPERM ZER, RED2, SEL2, RED1 // 0 d1 d0 0 |
| VSQ RED1, RED2, RED2 // Guaranteed not to underflow |
| |
| VSLDB $8, T1, T0, T0 |
| VSLDB $8, T2, T1, T1 |
| |
| VACCQ T0, RED1, CAR1 |
| VAQ T0, RED1, T0 |
| VACCCQ T1, RED2, CAR1, CAR2 |
| VACQ T1, RED2, CAR1, T1 |
| VAQ T2, CAR2, T2 |
| |
| // Second round |
| VPERM T1, T0, SEL1, RED2 // d1 d0 d1 d0 |
| VPERM ZER, RED2, SEL2, RED1 // 0 d1 d0 0 |
| VSQ RED1, RED2, RED2 // Guaranteed not to underflow |
| |
| VSLDB $8, T1, T0, T0 |
| VSLDB $8, T2, T1, T1 |
| |
| VACCQ T0, RED1, CAR1 |
| VAQ T0, RED1, T0 |
| VACCCQ T1, RED2, CAR1, CAR2 |
| VACQ T1, RED2, CAR1, T1 |
| VAQ T2, CAR2, T2 |
| |
| // Third round |
| VPERM T1, T0, SEL1, RED2 // d1 d0 d1 d0 |
| VPERM ZER, RED2, SEL2, RED1 // 0 d1 d0 0 |
| VSQ RED1, RED2, RED2 // Guaranteed not to underflow |
| |
| VSLDB $8, T1, T0, T0 |
| VSLDB $8, T2, T1, T1 |
| |
| VACCQ T0, RED1, CAR1 |
| VAQ T0, RED1, T0 |
| VACCCQ T1, RED2, CAR1, CAR2 |
| VACQ T1, RED2, CAR1, T1 |
| VAQ T2, CAR2, T2 |
| |
| // Last round |
| VPERM T1, T0, SEL1, RED2 // d1 d0 d1 d0 |
| VPERM ZER, RED2, SEL2, RED1 // 0 d1 d0 0 |
| VSQ RED1, RED2, RED2 // Guaranteed not to underflow |
| |
| VSLDB $8, T1, T0, T0 |
| VSLDB $8, T2, T1, T1 |
| |
| VACCQ T0, RED1, CAR1 |
| VAQ T0, RED1, T0 |
| VACCCQ T1, RED2, CAR1, CAR2 |
| VACQ T1, RED2, CAR1, T1 |
| VAQ T2, CAR2, T2 |
| |
| // --------------------------------------------------- |
| |
| // Final correction: TT1||TT0 = T1||T0 - P, with the borrow chained through |
| // T2 so that T2 becomes the selection mask used below. |
| VSCBIQ PL, T0, CAR1 |
| VSQ PL, T0, TT0 |
| VSBCBIQ T1, PH, CAR1, CAR2 |
| VSBIQ T1, PH, CAR1, TT1 |
| VSBIQ T2, ZER, CAR2, T2 |
| |
| // what output to use, TT1||TT0 or T1||T0? |
| VSEL T0, TT0, T2, T0 |
| VSEL T1, TT1, T2, T1 |
| |
| VST T0, (1*16)(res_ptr) |
| VST T1, (0*16)(res_ptr) |
| RET |
| |
| #undef res_ptr |
| #undef x_ptr |
| #undef CPOOL |
| #undef T0 |
| #undef T1 |
| #undef T2 |
| #undef TT0 |
| #undef TT1 |
| #undef ZER |
| #undef SEL1 |
| #undef SEL2 |
| #undef CAR1 |
| #undef CAR2 |
| #undef RED1 |
| #undef RED2 |
| #undef PL |
| #undef PH |
| |
| // --------------------------------------- |
| // func p256OrdMul(res, in1, in2 []byte) |
| // |
| // Multiplies the 256-bit values read from in1 (X1||X0) and in2 (Y1||Y0) with |
| // a reduction step by the modulus p256ord<> (M1||M0) interleaved after every |
| // 32-bit word of in2, then performs one conditional subtraction of the |
| // modulus and stores 32 bytes to res. |
| // NOTE(review): by construction this looks like a word-by-word Montgomery |
| // multiplication modulo the group order -- confirm against the Go caller. |
| // |
| // Each []byte argument occupies 24 bytes of the argument frame, so the data |
| // pointers are read from res+0, in1+24 and in2+48. Bytes 0-15 of a buffer |
| // form the high half, bytes 16-31 the low half. |
| // |
| // The eight rounds consume the words of in2 in the order Y0[3], Y0[2], |
| // Y0[1], Y0[0], Y1[3], Y1[2], Y1[1], Y1[0] (VREPF element numbers). |
| #define res_ptr R1 |
| #define x_ptr R2 |
| #define y_ptr R3 |
| #define X0 V0 |
| #define X1 V1 |
| #define Y0 V2 |
| #define Y1 V3 |
| #define M0 V4 |
| #define M1 V5 |
| #define T0 V6 |
| #define T1 V7 |
| #define T2 V8 |
| #define YDIG V9 |
| |
| #define ADD1 V16 |
| #define ADD1H V17 |
| #define ADD2 V18 |
| #define ADD2H V19 |
| #define RED1 V20 |
| #define RED1H V21 |
| #define RED2 V22 |
| #define RED2H V23 |
| #define CAR1 V24 |
| #define CAR1M V25 |
| |
| #define MK0 V30 |
| #define K0 V31 |
| TEXT ·p256OrdMul(SB), NOSPLIT, $0 |
| MOVD res+0(FP), res_ptr |
| MOVD in1+24(FP), x_ptr |
| MOVD in2+48(FP), y_ptr |
| |
| VZERO T2 |
| MOVD $p256ordK0<>+0x00(SB), R4 |
| |
| // The next three directives hand-encode the instruction named in the |
| // comment below: the 4-byte constant at 0(R4) is loaded into word 3 of K0. |
| // VLEF $3, 0(R4), K0 |
| WORD $0xE7F40000 |
| BYTE $0x38 |
| BYTE $0x03 |
| MOVD $p256ord<>+0x00(SB), R4 |
| VL 16(R4), M0 |
| VL 0(R4), M1 |
| |
| VL (1*16)(x_ptr), X0 |
| VL (0*16)(x_ptr), X1 |
| VL (1*16)(y_ptr), Y0 |
| VL (0*16)(y_ptr), Y1 |
| |
| // Round structure (identical in all eight rounds; the first one uses plain |
| // multiplies because there is no accumulator yet): |
| // YDIG <- current word of in2, replicated |
| // ADD* <- X * YDIG (+ T), low words in ADD1/ADD2, high words in ADD1H/ADD2H |
| // MK0 <- word 3 of (ADD1 * K0), replicated |
| // RED* <- M * MK0 + ADD, low words in RED1/RED2, high words in RED1H/RED2H |
| // the low-word vectors are moved down by 12 bytes (VSLDB $12) and summed |
| // with the high-word vectors into T2||T1||T0, carries going through |
| // CAR1/CAR1M. |
| // ---------------------------------------------------------------------------/ |
| // Round 1: word 3 of Y0. |
| VREPF $3, Y0, YDIG |
| VMLF X0, YDIG, ADD1 |
| VMLF ADD1, K0, MK0 |
| VREPF $3, MK0, MK0 |
| |
| VMLF X1, YDIG, ADD2 |
| VMLHF X0, YDIG, ADD1H |
| VMLHF X1, YDIG, ADD2H |
| |
| VMALF M0, MK0, ADD1, RED1 |
| VMALHF M0, MK0, ADD1, RED1H |
| VMALF M1, MK0, ADD2, RED2 |
| VMALHF M1, MK0, ADD2, RED2H |
| |
| VSLDB $12, RED2, RED1, RED1 |
| VSLDB $12, T2, RED2, RED2 |
| |
| VACCQ RED1, ADD1H, CAR1 |
| VAQ RED1, ADD1H, T0 |
| VACCQ RED1H, T0, CAR1M |
| VAQ RED1H, T0, T0 |
| |
| // << ready for next MK0 |
| |
| VACQ RED2, ADD2H, CAR1, T1 |
| VACCCQ RED2, ADD2H, CAR1, CAR1 |
| VACCCQ RED2H, T1, CAR1M, T2 |
| VACQ RED2H, T1, CAR1M, T1 |
| VAQ CAR1, T2, T2 |
| |
| // --------------------------------------------------- |
| /* * |
| * ---+--------+--------+ |
| * T2| T1 | T0 | |
| * ---+--------+--------+ |
| * *(add)* |
| * +--------+--------+ |
| * | X1 | X0 | |
| * +--------+--------+ |
| * *(mul)* |
| * +--------+--------+ |
| * | YDIG | YDIG | |
| * +--------+--------+ |
| * *(add)* |
| * +--------+--------+ |
| * | M1 | M0 | |
| * +--------+--------+ |
| * *(mul)* |
| * +--------+--------+ |
| * | MK0 | MK0 | |
| * +--------+--------+ |
| * |
| * --------------------- |
| * |
| * +--------+--------+ |
| * | ADD2 | ADD1 | |
| * +--------+--------+ |
| * +--------+--------+ |
| * | ADD2H | ADD1H | |
| * +--------+--------+ |
| * +--------+--------+ |
| * | RED2 | RED1 | |
| * +--------+--------+ |
| * +--------+--------+ |
| * | RED2H | RED1H | |
| * +--------+--------+ |
| */ |
| // Round 2: word 2 of Y0. |
| VREPF $2, Y0, YDIG |
| VMALF X0, YDIG, T0, ADD1 |
| VMLF ADD1, K0, MK0 |
| VREPF $3, MK0, MK0 |
| |
| VMALF X1, YDIG, T1, ADD2 |
| VMALHF X0, YDIG, T0, ADD1H |
| VMALHF X1, YDIG, T1, ADD2H |
| |
| VMALF M0, MK0, ADD1, RED1 |
| VMALHF M0, MK0, ADD1, RED1H |
| VMALF M1, MK0, ADD2, RED2 |
| VMALHF M1, MK0, ADD2, RED2H |
| |
| VSLDB $12, RED2, RED1, RED1 |
| VSLDB $12, T2, RED2, RED2 |
| |
| VACCQ RED1, ADD1H, CAR1 |
| VAQ RED1, ADD1H, T0 |
| VACCQ RED1H, T0, CAR1M |
| VAQ RED1H, T0, T0 |
| |
| // << ready for next MK0 |
| |
| VACQ RED2, ADD2H, CAR1, T1 |
| VACCCQ RED2, ADD2H, CAR1, CAR1 |
| VACCCQ RED2H, T1, CAR1M, T2 |
| VACQ RED2H, T1, CAR1M, T1 |
| VAQ CAR1, T2, T2 |
| |
| // --------------------------------------------------- |
| // Round 3: word 1 of Y0. |
| VREPF $1, Y0, YDIG |
| VMALF X0, YDIG, T0, ADD1 |
| VMLF ADD1, K0, MK0 |
| VREPF $3, MK0, MK0 |
| |
| VMALF X1, YDIG, T1, ADD2 |
| VMALHF X0, YDIG, T0, ADD1H |
| VMALHF X1, YDIG, T1, ADD2H |
| |
| VMALF M0, MK0, ADD1, RED1 |
| VMALHF M0, MK0, ADD1, RED1H |
| VMALF M1, MK0, ADD2, RED2 |
| VMALHF M1, MK0, ADD2, RED2H |
| |
| VSLDB $12, RED2, RED1, RED1 |
| VSLDB $12, T2, RED2, RED2 |
| |
| VACCQ RED1, ADD1H, CAR1 |
| VAQ RED1, ADD1H, T0 |
| VACCQ RED1H, T0, CAR1M |
| VAQ RED1H, T0, T0 |
| |
| // << ready for next MK0 |
| |
| VACQ RED2, ADD2H, CAR1, T1 |
| VACCCQ RED2, ADD2H, CAR1, CAR1 |
| VACCCQ RED2H, T1, CAR1M, T2 |
| VACQ RED2H, T1, CAR1M, T1 |
| VAQ CAR1, T2, T2 |
| |
| // --------------------------------------------------- |
| // Round 4: word 0 of Y0. |
| VREPF $0, Y0, YDIG |
| VMALF X0, YDIG, T0, ADD1 |
| VMLF ADD1, K0, MK0 |
| VREPF $3, MK0, MK0 |
| |
| VMALF X1, YDIG, T1, ADD2 |
| VMALHF X0, YDIG, T0, ADD1H |
| VMALHF X1, YDIG, T1, ADD2H |
| |
| VMALF M0, MK0, ADD1, RED1 |
| VMALHF M0, MK0, ADD1, RED1H |
| VMALF M1, MK0, ADD2, RED2 |
| VMALHF M1, MK0, ADD2, RED2H |
| |
| VSLDB $12, RED2, RED1, RED1 |
| VSLDB $12, T2, RED2, RED2 |
| |
| VACCQ RED1, ADD1H, CAR1 |
| VAQ RED1, ADD1H, T0 |
| VACCQ RED1H, T0, CAR1M |
| VAQ RED1H, T0, T0 |
| |
| // << ready for next MK0 |
| |
| VACQ RED2, ADD2H, CAR1, T1 |
| VACCCQ RED2, ADD2H, CAR1, CAR1 |
| VACCCQ RED2H, T1, CAR1M, T2 |
| VACQ RED2H, T1, CAR1M, T1 |
| VAQ CAR1, T2, T2 |
| |
| // --------------------------------------------------- |
| // Round 5: word 3 of Y1. |
| VREPF $3, Y1, YDIG |
| VMALF X0, YDIG, T0, ADD1 |
| VMLF ADD1, K0, MK0 |
| VREPF $3, MK0, MK0 |
| |
| VMALF X1, YDIG, T1, ADD2 |
| VMALHF X0, YDIG, T0, ADD1H |
| VMALHF X1, YDIG, T1, ADD2H |
| |
| VMALF M0, MK0, ADD1, RED1 |
| VMALHF M0, MK0, ADD1, RED1H |
| VMALF M1, MK0, ADD2, RED2 |
| VMALHF M1, MK0, ADD2, RED2H |
| |
| VSLDB $12, RED2, RED1, RED1 |
| VSLDB $12, T2, RED2, RED2 |
| |
| VACCQ RED1, ADD1H, CAR1 |
| VAQ RED1, ADD1H, T0 |
| VACCQ RED1H, T0, CAR1M |
| VAQ RED1H, T0, T0 |
| |
| // << ready for next MK0 |
| |
| VACQ RED2, ADD2H, CAR1, T1 |
| VACCCQ RED2, ADD2H, CAR1, CAR1 |
| VACCCQ RED2H, T1, CAR1M, T2 |
| VACQ RED2H, T1, CAR1M, T1 |
| VAQ CAR1, T2, T2 |
| |
| // --------------------------------------------------- |
| // Round 6: word 2 of Y1. |
| VREPF $2, Y1, YDIG |
| VMALF X0, YDIG, T0, ADD1 |
| VMLF ADD1, K0, MK0 |
| VREPF $3, MK0, MK0 |
| |
| VMALF X1, YDIG, T1, ADD2 |
| VMALHF X0, YDIG, T0, ADD1H |
| VMALHF X1, YDIG, T1, ADD2H |
| |
| VMALF M0, MK0, ADD1, RED1 |
| VMALHF M0, MK0, ADD1, RED1H |
| VMALF M1, MK0, ADD2, RED2 |
| VMALHF M1, MK0, ADD2, RED2H |
| |
| VSLDB $12, RED2, RED1, RED1 |
| VSLDB $12, T2, RED2, RED2 |
| |
| VACCQ RED1, ADD1H, CAR1 |
| VAQ RED1, ADD1H, T0 |
| VACCQ RED1H, T0, CAR1M |
| VAQ RED1H, T0, T0 |
| |
| // << ready for next MK0 |
| |
| VACQ RED2, ADD2H, CAR1, T1 |
| VACCCQ RED2, ADD2H, CAR1, CAR1 |
| VACCCQ RED2H, T1, CAR1M, T2 |
| VACQ RED2H, T1, CAR1M, T1 |
| VAQ CAR1, T2, T2 |
| |
| // --------------------------------------------------- |
| // Round 7: word 1 of Y1. |
| VREPF $1, Y1, YDIG |
| VMALF X0, YDIG, T0, ADD1 |
| VMLF ADD1, K0, MK0 |
| VREPF $3, MK0, MK0 |
| |
| VMALF X1, YDIG, T1, ADD2 |
| VMALHF X0, YDIG, T0, ADD1H |
| VMALHF X1, YDIG, T1, ADD2H |
| |
| VMALF M0, MK0, ADD1, RED1 |
| VMALHF M0, MK0, ADD1, RED1H |
| VMALF M1, MK0, ADD2, RED2 |
| VMALHF M1, MK0, ADD2, RED2H |
| |
| VSLDB $12, RED2, RED1, RED1 |
| VSLDB $12, T2, RED2, RED2 |
| |
| VACCQ RED1, ADD1H, CAR1 |
| VAQ RED1, ADD1H, T0 |
| VACCQ RED1H, T0, CAR1M |
| VAQ RED1H, T0, T0 |
| |
| // << ready for next MK0 |
| |
| VACQ RED2, ADD2H, CAR1, T1 |
| VACCCQ RED2, ADD2H, CAR1, CAR1 |
| VACCCQ RED2H, T1, CAR1M, T2 |
| VACQ RED2H, T1, CAR1M, T1 |
| VAQ CAR1, T2, T2 |
| |
| // --------------------------------------------------- |
| // Round 8: word 0 of Y1. |
| VREPF $0, Y1, YDIG |
| VMALF X0, YDIG, T0, ADD1 |
| VMLF ADD1, K0, MK0 |
| VREPF $3, MK0, MK0 |
| |
| VMALF X1, YDIG, T1, ADD2 |
| VMALHF X0, YDIG, T0, ADD1H |
| VMALHF X1, YDIG, T1, ADD2H |
| |
| VMALF M0, MK0, ADD1, RED1 |
| VMALHF M0, MK0, ADD1, RED1H |
| VMALF M1, MK0, ADD2, RED2 |
| VMALHF M1, MK0, ADD2, RED2H |
| |
| VSLDB $12, RED2, RED1, RED1 |
| VSLDB $12, T2, RED2, RED2 |
| |
| VACCQ RED1, ADD1H, CAR1 |
| VAQ RED1, ADD1H, T0 |
| VACCQ RED1H, T0, CAR1M |
| VAQ RED1H, T0, T0 |
| |
| // << ready for next MK0 |
| |
| VACQ RED2, ADD2H, CAR1, T1 |
| VACCCQ RED2, ADD2H, CAR1, CAR1 |
| VACCCQ RED2H, T1, CAR1M, T2 |
| VACQ RED2H, T1, CAR1M, T1 |
| VAQ CAR1, T2, T2 |
| |
| // --------------------------------------------------- |
| |
| // Final correction: ADD2||ADD1 = T1||T0 - M, with the borrow chained |
| // through T2 (RED1 is reused as a zero vector) so that T2 becomes the |
| // selection mask used below. |
| VZERO RED1 |
| VSCBIQ M0, T0, CAR1 |
| VSQ M0, T0, ADD1 |
| VSBCBIQ T1, M1, CAR1, CAR1M |
| VSBIQ T1, M1, CAR1, ADD2 |
| VSBIQ T2, RED1, CAR1M, T2 |
| |
| // what output to use, ADD2||ADD1 or T1||T0? |
| VSEL T0, ADD1, T2, T0 |
| VSEL T1, ADD2, T2, T1 |
| |
| VST T0, (1*16)(res_ptr) |
| VST T1, (0*16)(res_ptr) |
| RET |
| |
| #undef res_ptr |
| #undef x_ptr |
| #undef y_ptr |
| #undef X0 |
| #undef X1 |
| #undef Y0 |
| #undef Y1 |
| #undef M0 |
| #undef M1 |
| #undef T0 |
| #undef T1 |
| #undef T2 |
| #undef YDIG |
| |
| #undef ADD1 |
| #undef ADD1H |
| #undef ADD2 |
| #undef ADD2H |
| #undef RED1 |
| #undef RED1H |
| #undef RED2 |
| #undef RED2H |
| #undef CAR1 |
| #undef CAR1M |
| |
| #undef MK0 |
| #undef K0 |
| |
| // --------------------------------------- |
| // p256MulInternalVX |
| // |
| // Internal multiply-and-reduce helper with a register interface (it takes no |
| // Go arguments and has no frame): |
| // in: X1||X0 = V1||V0, Y1||Y0 = V3||V2, P1||P0 = V31||V30, CPOOL = R4 |
| // out: T1||T0 = V5||V4 |
| // CPOOL must address a pool holding VPERM selectors at offsets 32..127. |
| // NOTE(review): those offsets match the layout of p256mul<>; the callers that |
| // set up CPOOL and P are outside this view -- confirm there. |
| // |
| // The words of Y are consumed two per group (Y0[3],Y0[2] / Y0[1],Y0[0] / |
| // Y1[3],Y1[2] / Y1[1],Y1[0]); each group is followed by a reduction step and |
| // the routine ends with one conditional subtraction of P. |
| // |
| // V0-V3,V30,V31 - Not Modified |
| // V4-V15 - Volatile |
| |
| #define CPOOL R4 |
| |
| // Parameters |
| #define X0 V0 // Not modified |
| #define X1 V1 // Not modified |
| #define Y0 V2 // Not modified |
| #define Y1 V3 // Not modified |
| #define T0 V4 |
| #define T1 V5 |
| #define P0 V30 // Not modified |
| #define P1 V31 // Not modified |
| |
| // Temporaries |
| #define YDIG V6 // Overloaded with CAR1, ZER |
| #define ADD1H V7 // Overloaded with ADD3H |
| #define ADD2H V8 // Overloaded with ADD4H |
| #define ADD3 V9 // Overloaded with SEL2,SEL5 |
| #define ADD4 V10 // Overloaded with SEL3,SEL6 |
| #define RED1 V11 // Overloaded with CAR2 |
| #define RED2 V12 |
| #define RED3 V13 // Overloaded with SEL1 |
| #define T2 V14 |
| // Overloaded temporaries |
| #define ADD1 V4 // Overloaded with T0 |
| #define ADD2 V5 // Overloaded with T1 |
| #define ADD3H V7 // Overloaded with ADD1H |
| #define ADD4H V8 // Overloaded with ADD2H |
| #define ZER V6 // Overloaded with YDIG, CAR1 |
| #define CAR1 V6 // Overloaded with YDIG, ZER |
| #define CAR2 V11 // Overloaded with RED1 |
| // Constant Selects |
| #define SEL1 V13 // Overloaded with RED3 |
| #define SEL2 V9 // Overloaded with ADD3,SEL5 |
| #define SEL3 V10 // Overloaded with ADD4,SEL6 |
| #define SEL4 V6 // Overloaded with YDIG,CAR1,ZER |
| #define SEL5 V9 // Overloaded with ADD3,SEL2 |
| #define SEL6 V10 // Overloaded with ADD4,SEL3 |
| |
| /* * |
| * To follow the flow of bits, for your own sanity a stiff drink, need you shall. |
| * Of a single round, a 'helpful' picture, here is. Meaning, column position has. |
| * With you, SIMD be... |
| * |
| * +--------+--------+ |
| * +--------| RED2 | RED1 | |
| * | +--------+--------+ |
| * | ---+--------+--------+ |
| * | +---- T2| T1 | T0 |--+ |
| * | | ---+--------+--------+ | |
| * | | | |
| * | | ======================= | |
| * | | | |
| * | | +--------+--------+<-+ |
| * | +-------| ADD2 | ADD1 |--|-----+ |
| * | | +--------+--------+ | | |
| * | | +--------+--------+<---+ | |
| * | | | ADD2H | ADD1H |--+ | |
| * | | +--------+--------+ | | |
| * | | +--------+--------+<-+ | |
| * | | | ADD4 | ADD3 |--|-+ | |
| * | | +--------+--------+ | | | |
| * | | +--------+--------+<---+ | | |
| * | | | ADD4H | ADD3H |------|-+ |(+vzero) |
| * | | +--------+--------+ | | V |
| * | | ------------------------ | | +--------+ |
| * | | | | | RED3 | [d0 0 0 d0] |
| * | | | | +--------+ |
| * | +---->+--------+--------+ | | | |
| * (T2[1w]||ADD2[4w]||ADD1[3w]) +--------| T1 | T0 | | | | |
| * | +--------+--------+ | | | |
| * +---->---+--------+--------+ | | | |
| * T2| T1 | T0 |----+ | | |
| * ---+--------+--------+ | | | |
| * ---+--------+--------+<---+ | | |
| * +--- T2| T1 | T0 |----------+ |
| * | ---+--------+--------+ | | |
| * | +--------+--------+<-------------+ |
| * | | RED2 | RED1 |-----+ | | [0 d1 d0 d1] [d0 0 d1 d0] |
| * | +--------+--------+ | | | |
| * | +--------+<----------------------+ |
| * | | RED3 |--------------+ | [0 0 d1 d0] |
| * | +--------+ | | |
| * +--->+--------+--------+ | | |
| * | T1 | T0 |--------+ |
| * +--------+--------+ | | |
| * --------------------------- | | |
| * | | |
| * +--------+--------+<----+ | |
| * | RED2 | RED1 | | |
| * +--------+--------+ | |
| * ---+--------+--------+<-------+ |
| * T2| T1 | T0 | (H1P-H1P-H00RRAY!) |
| * ---+--------+--------+ |
| * |
| * *Mi obra de arte de siglo XXI @vpaprots |
| * |
| * |
| * First group is special, doesn't get the two inputs: |
| * +--------+--------+<-+ |
| * +-------| ADD2 | ADD1 |--|-----+ |
| * | +--------+--------+ | | |
| * | +--------+--------+<---+ | |
| * | | ADD2H | ADD1H |--+ | |
| * | +--------+--------+ | | |
| * | +--------+--------+<-+ | |
| * | | ADD4 | ADD3 |--|-+ | |
| * | +--------+--------+ | | | |
| * | +--------+--------+<---+ | | |
| * | | ADD4H | ADD3H |------|-+ |(+vzero) |
| * | +--------+--------+ | | V |
| * | ------------------------ | | +--------+ |
| * | | | | RED3 | [d0 0 0 d0] |
| * | | | +--------+ |
| * +---->+--------+--------+ | | | |
| * (T2[1w]||ADD2[4w]||ADD1[3w]) | T1 | T0 |----+ | | |
| * +--------+--------+ | | | |
| * ---+--------+--------+<---+ | | |
| * +--- T2| T1 | T0 |----------+ |
| * | ---+--------+--------+ | | |
| * | +--------+--------+<-------------+ |
| * | | RED2 | RED1 |-----+ | | [0 d1 d0 d1] [d0 0 d1 d0] |
| * | +--------+--------+ | | | |
| * | +--------+<----------------------+ |
| * | | RED3 |--------------+ | [0 0 d1 d0] |
| * | +--------+ | | |
| * +--->+--------+--------+ | | |
| * | T1 | T0 |--------+ |
| * +--------+--------+ | | |
| * --------------------------- | | |
| * | | |
| * +--------+--------+<----+ | |
| * | RED2 | RED1 | | |
| * +--------+--------+ | |
| * ---+--------+--------+<-------+ |
| * T2| T1 | T0 | (H1P-H1P-H00RRAY!) |
| * ---+--------+--------+ |
| * |
| * Last 'group' needs to RED2||RED1 shifted less |
| */ |
| TEXT ·p256MulInternalVX(SB), NOSPLIT, $0-0 |
| // No selector is preloaded here: SEL1..SEL6 share registers with |
| // temporaries (see the #defines above), so each selector is loaded from |
| // CPOOL immediately before the VPERMs that consume it. |
| |
| // --------------------------------------------------- |
| // Group 1: words 3 and 2 of Y0. No accumulator exists yet, so the first |
| // products use plain multiplies and T1 is extended with ZER instead of T2. |
| |
| VREPF $3, Y0, YDIG |
| VMLHF X0, YDIG, ADD1H |
| VMLHF X1, YDIG, ADD2H |
| VMLF X0, YDIG, ADD1 |
| VMLF X1, YDIG, ADD2 |
| |
| VREPF $2, Y0, YDIG |
| VMALF X0, YDIG, ADD1H, ADD3 |
| VMALF X1, YDIG, ADD2H, ADD4 |
| VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free |
| VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free |
| |
| VZERO ZER |
| VL 32(CPOOL), SEL1 |
| VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0] |
| |
| VSLDB $12, ADD2, ADD1, T0 // ADD1 Free |
| VSLDB $12, ZER, ADD2, T1 // ADD2 Free |
| |
| VACCQ T0, ADD3, CAR1 |
| VAQ T0, ADD3, T0 // ADD3 Free |
| VACCCQ T1, ADD4, CAR1, T2 |
| VACQ T1, ADD4, CAR1, T1 // ADD4 Free |
| |
| VL 48(CPOOL), SEL2 |
| VL 64(CPOOL), SEL3 |
| VL 80(CPOOL), SEL4 |
| VPERM RED3, T0, SEL2, RED1 // [d0 0 d1 d0] |
| VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1] |
| VPERM RED3, T0, SEL4, RED3 // [ 0 0 d1 d0] |
| VSQ RED3, RED2, RED2 // Guaranteed not to underflow |
| |
| VSLDB $12, T1, T0, T0 |
| VSLDB $12, T2, T1, T1 |
| |
| VACCQ T0, ADD3H, CAR1 |
| VAQ T0, ADD3H, T0 |
| VACCCQ T1, ADD4H, CAR1, T2 |
| VACQ T1, ADD4H, CAR1, T1 |
| |
| // --------------------------------------------------- |
| // Group 2: words 1 and 0 of Y0; RED2||RED1 from group 1 is folded in. |
| |
| VREPF $1, Y0, YDIG |
| VMALHF X0, YDIG, T0, ADD1H |
| VMALHF X1, YDIG, T1, ADD2H |
| VMALF X0, YDIG, T0, ADD1 // T0 Free->ADD1 |
| VMALF X1, YDIG, T1, ADD2 // T1 Free->ADD2 |
| |
| VREPF $0, Y0, YDIG |
| VMALF X0, YDIG, ADD1H, ADD3 |
| VMALF X1, YDIG, ADD2H, ADD4 |
| VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free->ADD3H |
| VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free->ADD4H , YDIG Free->ZER |
| |
| VZERO ZER |
| VL 32(CPOOL), SEL1 |
| VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0] |
| |
| VSLDB $12, ADD2, ADD1, T0 // ADD1 Free->T0 |
| VSLDB $12, T2, ADD2, T1 // ADD2 Free->T1, T2 Free |
| |
| VACCQ T0, RED1, CAR1 |
| VAQ T0, RED1, T0 |
| VACCCQ T1, RED2, CAR1, T2 |
| VACQ T1, RED2, CAR1, T1 |
| |
| VACCQ T0, ADD3, CAR1 |
| VAQ T0, ADD3, T0 |
| VACCCQ T1, ADD4, CAR1, CAR2 |
| VACQ T1, ADD4, CAR1, T1 |
| VAQ T2, CAR2, T2 |
| |
| VL 48(CPOOL), SEL2 |
| VL 64(CPOOL), SEL3 |
| VL 80(CPOOL), SEL4 |
| VPERM RED3, T0, SEL2, RED1 // [d0 0 d1 d0] |
| VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1] |
| VPERM RED3, T0, SEL4, RED3 // [ 0 0 d1 d0] |
| VSQ RED3, RED2, RED2 // Guaranteed not to underflow |
| |
| VSLDB $12, T1, T0, T0 |
| VSLDB $12, T2, T1, T1 |
| |
| VACCQ T0, ADD3H, CAR1 |
| VAQ T0, ADD3H, T0 |
| VACCCQ T1, ADD4H, CAR1, T2 |
| VACQ T1, ADD4H, CAR1, T1 |
| |
| // --------------------------------------------------- |
| // Group 3: words 3 and 2 of Y1. |
| |
| VREPF $3, Y1, YDIG |
| VMALHF X0, YDIG, T0, ADD1H |
| VMALHF X1, YDIG, T1, ADD2H |
| VMALF X0, YDIG, T0, ADD1 |
| VMALF X1, YDIG, T1, ADD2 |
| |
| VREPF $2, Y1, YDIG |
| VMALF X0, YDIG, ADD1H, ADD3 |
| VMALF X1, YDIG, ADD2H, ADD4 |
| VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free |
| VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free |
| |
| VZERO ZER |
| VL 32(CPOOL), SEL1 |
| VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0] |
| |
| VSLDB $12, ADD2, ADD1, T0 // ADD1 Free |
| VSLDB $12, T2, ADD2, T1 // ADD2 Free |
| |
| VACCQ T0, RED1, CAR1 |
| VAQ T0, RED1, T0 |
| VACCCQ T1, RED2, CAR1, T2 |
| VACQ T1, RED2, CAR1, T1 |
| |
| VACCQ T0, ADD3, CAR1 |
| VAQ T0, ADD3, T0 |
| VACCCQ T1, ADD4, CAR1, CAR2 |
| VACQ T1, ADD4, CAR1, T1 |
| VAQ T2, CAR2, T2 |
| |
| VL 48(CPOOL), SEL2 |
| VL 64(CPOOL), SEL3 |
| VL 80(CPOOL), SEL4 |
| VPERM RED3, T0, SEL2, RED1 // [d0 0 d1 d0] |
| VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1] |
| VPERM RED3, T0, SEL4, RED3 // [ 0 0 d1 d0] |
| VSQ RED3, RED2, RED2 // Guaranteed not to underflow |
| |
| VSLDB $12, T1, T0, T0 |
| VSLDB $12, T2, T1, T1 |
| |
| VACCQ T0, ADD3H, CAR1 |
| VAQ T0, ADD3H, T0 |
| VACCCQ T1, ADD4H, CAR1, T2 |
| VACQ T1, ADD4H, CAR1, T1 |
| |
| // --------------------------------------------------- |
| // Group 4: words 1 and 0 of Y1. The last reduction uses SEL5/SEL6 and is |
| // added after the final shift (see "Last 'group'" in the diagram above). |
| |
| VREPF $1, Y1, YDIG |
| VMALHF X0, YDIG, T0, ADD1H |
| VMALHF X1, YDIG, T1, ADD2H |
| VMALF X0, YDIG, T0, ADD1 |
| VMALF X1, YDIG, T1, ADD2 |
| |
| VREPF $0, Y1, YDIG |
| VMALF X0, YDIG, ADD1H, ADD3 |
| VMALF X1, YDIG, ADD2H, ADD4 |
| VMALHF X0, YDIG, ADD1H, ADD3H |
| VMALHF X1, YDIG, ADD2H, ADD4H |
| |
| VZERO ZER |
| VL 32(CPOOL), SEL1 |
| VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0] |
| |
| VSLDB $12, ADD2, ADD1, T0 |
| VSLDB $12, T2, ADD2, T1 |
| |
| VACCQ T0, RED1, CAR1 |
| VAQ T0, RED1, T0 |
| VACCCQ T1, RED2, CAR1, T2 |
| VACQ T1, RED2, CAR1, T1 |
| |
| VACCQ T0, ADD3, CAR1 |
| VAQ T0, ADD3, T0 |
| VACCCQ T1, ADD4, CAR1, CAR2 |
| VACQ T1, ADD4, CAR1, T1 |
| VAQ T2, CAR2, T2 |
| |
| VL 96(CPOOL), SEL5 |
| VL 112(CPOOL), SEL6 |
| VPERM T0, RED3, SEL5, RED2 // [d1 d0 d1 d0] |
| VPERM T0, RED3, SEL6, RED1 // [ 0 d1 d0 0] |
| VSQ RED1, RED2, RED2 // Guaranteed not to underflow |
| |
| VSLDB $12, T1, T0, T0 |
| VSLDB $12, T2, T1, T1 |
| |
| VACCQ T0, ADD3H, CAR1 |
| VAQ T0, ADD3H, T0 |
| VACCCQ T1, ADD4H, CAR1, T2 |
| VACQ T1, ADD4H, CAR1, T1 |
| |
| VACCQ T0, RED1, CAR1 |
| VAQ T0, RED1, T0 |
| VACCCQ T1, RED2, CAR1, CAR2 |
| VACQ T1, RED2, CAR1, T1 |
| VAQ T2, CAR2, T2 |
| |
| // --------------------------------------------------- |
| // Final correction: ADD2H||ADD1H = T1||T0 - P, with the borrow chained |
| // through T2 (RED3 is reused as a zero vector) so that T2 becomes the |
| // selection mask used below. |
| |
| VZERO RED3 |
| VSCBIQ P0, T0, CAR1 |
| VSQ P0, T0, ADD1H |
| VSBCBIQ T1, P1, CAR1, CAR2 |
| VSBIQ T1, P1, CAR1, ADD2H |
| VSBIQ T2, RED3, CAR2, T2 |
| |
| // what output to use, ADD2H||ADD1H or T1||T0? |
| VSEL T0, ADD1H, T2, T0 |
| VSEL T1, ADD2H, T2, T1 |
| RET |
| |
| #undef CPOOL |
| |
| #undef X0 |
| #undef X1 |
| #undef Y0 |
| #undef Y1 |
| #undef T0 |
| #undef T1 |
| #undef P0 |
| #undef P1 |
| |
| #undef SEL1 |
| #undef SEL2 |
| #undef SEL3 |
| #undef SEL4 |
| #undef SEL5 |
| #undef SEL6 |
| |
| #undef YDIG |
| #undef ADD1H |
| #undef ADD2H |
| #undef ADD3 |
| #undef ADD4 |
| #undef RED1 |
| #undef RED2 |
| #undef RED3 |
| #undef T2 |
| #undef ADD1 |
| #undef ADD2 |
| #undef ADD3H |
| #undef ADD4H |
| #undef ZER |
| #undef CAR1 |
| #undef CAR2 |
| |
| // --------------------------------------- |
| // p256MulInternalVMSL |
| // V0-V3,V30,V31 - Not Modified |
| // V4-V14 - Volatile |
| |
| #define CPOOL R4 |
| #define SCRATCH R9 |
| |
| // Parameters |
| #define X0 V0 // Not modified |
| #define X1 V1 // Not modified |
| #define Y0 V2 // Not modified |
| #define Y1 V3 // Not modified |
| #define T0 V4 |
| #define T1 V5 |
| #define T2 V6 |
| #define P0 V30 // Not modified |
| #define P1 V31 // Not modified |
| |
| // OBSERVATION3(SRC, HI, LO, TMP, ZEROV, SHIFTED) |
| // in: SRC (not written) |
| // out: HI, LO - the pair the 96-bit reduction steps add into the product |
| // scratch: TMP, ZEROV, SHIFTED (all overwritten) |
| // Built purely from byte shifts of SRC against a zero vector plus one |
| // 128-bit subtract and one 128-bit add. |
| #define OBSERVATION3(SRC, HI, LO, TMP, ZEROV, SHIFTED) \ |
| VZERO ZEROV \ |
| VSLDB $4, SRC, ZEROV, HI \ |
| VLR HI, SHIFTED \ |
| VSLDB $12, ZEROV, HI, TMP \ |
| VSQ TMP, HI, HI \ |
| VSLDB $12, SRC, SHIFTED, LO \ |
| VSLDB $8, ZEROV, SHIFTED, TMP \ |
| VAQ TMP, HI, HI \ |
| |
| // OBSERVATION3A(SRC, HI, LO, TMP, ZEROV) |
| // in: SRC (not written) |
| // out: HI, LO - the pair the 64-bit reduction step adds into the product |
| // scratch: TMP, ZEROV (both overwritten) |
| #define OBSERVATION3A(SRC, HI, LO, TMP, ZEROV) \ |
| VZERO ZEROV \ |
| VSLDB $8, SRC, ZEROV, TMP \ |
| VSLDB $8, SRC, TMP, HI \ |
| VSLDB $12, ZEROV, TMP, LO \ |
| VSQ LO, HI, HI \ |
| |
| // p256MulInternalVMSL multiplies the 256-bit value X1||X0 by Y1||Y0 using the |
| // VMSLG multiply-sum instruction and reduces the product with respect to the |
| // prime held in P1||P0. The result is left in T1||T0. |
| // V16-V19 are spilled to the memory addressed by SCRATCH on entry and reloaded |
| // before returning, so SCRATCH must address 64 writable bytes. |
| // On return CPOOL addresses p256mul<>. |
| // NOTE(review): the three reduction steps consume 96 + 96 + 64 = 256 low bits, |
| // which looks like a Montgomery-style reduction - confirm against the callers. |
| TEXT ·p256MulInternalVMSL(SB), NOFRAME|NOSPLIT, $0-0 |
| VSTM V16, V19, (SCRATCH) |
| |
| MOVD $p256vmsl<>+0x00(SB), CPOOL |
| |
| // Divide both inputs into limbs. V14 masks the low 7 bytes of a vector, |
| // V12 stays zero until the third reduction below. |
| VGBM $0x007f, V14 |
| VZERO V12 |
| VSLDB $2, X1, X0, V13 |
| VSLDB $2, Y1, Y0, V8 |
| VSLDB $4, V12, X1, V11 // V11(X1): 4 bytes limb |
| VSLDB $4, V12, Y1, V6 // V6: 4 bytes limb |
| |
| VN V14, X0, V5 // V5: first 7 bytes limb |
| VN V14, Y0, V10 // V10: first 7 bytes limb |
| VN V14, V13, V13 // v13: third 7 bytes limb |
| VN V14, V8, V8 // V8: third 7 bytes limb |
| |
| VMSLG V10, V5, V12, V10 // v10: l10 x l5 (column 1) |
| VMSLG V8, V5, V12, V8 // v8: l8 x l5 |
| VMSLG V6, V13, V12, V13 // v13: l6 x l3 |
| VMSLG V6, V11, V12, V11 // v11: l6 x l1 (column 9) |
| VMSLG V6, V5, V12, V6 // v6: l6 x l5 |
| |
| // NOTE(review): CPOOL already holds this address (set above, not changed |
| // since); this reload looks redundant. |
| MOVD $p256vmsl<>+0x00(SB), CPOOL |
| // V14 now masks the low 7 bytes of each doubleword |
| VGBM $0x7f7f, V14 |
| |
| // Load the VPERM selectors that pair limbs up two per vector |
| VL 0(CPOOL), V4 |
| VL 16(CPOOL), V7 |
| VL 32(CPOOL), V9 |
| VL 48(CPOOL), V5 |
| VLM 64(CPOOL), V16, V19 |
| |
| VPERM V12, X0, V4, V4 // v4: limb4 | limb5 |
| VPERM Y1, Y0, V7, V7 |
| VPERM V12, Y0, V9, V9 // v9: limb10 | limb9 |
| VPERM X1, X0, V5, V5 |
| VPERM X1, X0, V16, V16 |
| VPERM Y1, Y0, V17, V17 |
| VPERM X1, V12, V18, V18 // v18: limb1 | limb2 |
| VPERM Y1, V12, V19, V19 // v19: limb7 | limb6 |
| VN V14, V7, V7 // v7: limb9 | limb8 |
| VN V14, V5, V5 // v5: limb3 | limb4 |
| VN V14, V16, V16 // v16: limb2 | limb3 |
| VN V14, V17, V17 // v17: limb8 | limb7 |
| |
| VMSLG V9, V4, V12, V14 // v14: l10 x l4 + l9 x l5 (column 2) |
| VMSLG V9, V5, V8, V8 // v8: l10 x l3 + l9 x l4 + l8 x l5 (column 3) |
| VMSLG V9, V16, V12, V16 // v16: l10 x l2 + l9 x l3 |
| VMSLG V9, V18, V12, V9 // v9: l10 x l1 + l9 x l2 |
| VMSLG V7, V18, V12, V7 // v7: l9 x l1 + l8 x l2 |
| VMSLG V17, V4, V16, V16 // v16: l8 x l4 + l7 x l5 + l10 x l2 + l9 x l3 (column 4) |
| VMSLG V17, V5, V9, V9 // v9: l10 x l1 + l9 x l2 + l8 x l3 + l7 x l4 |
| VMSLG V17, V18, V12, V17 // v17: l8 x l1 + l7 x l2 |
| VMSLG V19, V5, V7, V7 // v7: l9 x l1 + l8 x l2 + l7 x l3 + l6 x l4 (column 6) |
| VMSLG V19, V18, V12, V19 // v19: l7 x l1 + l6 x l2 (column 8) |
| VAQ V9, V6, V9 // v9: l10 x l1 + l9 x l2 + l8 x l3 + l7 x l4 + l6 x l5 (column 5) |
| VAQ V17, V13, V13 // v13: l8 x l1 + l7 x l2 + l6 x l3 (column 7) |
| |
| // Carry propagation: VSLDB $9 against the zero vector V12 shifts a column |
| // right by 7 bytes; that overflow is added into the next column. |
| // Two chains run side by side: columns 1..5 and columns 6..9. |
| VSLDB $9, V12, V10, V4 |
| VSLDB $9, V12, V7, V5 |
| VAQ V4, V14, V14 |
| VAQ V5, V13, V13 |
| |
| VSLDB $9, V12, V14, V4 |
| VSLDB $9, V12, V13, V5 |
| VAQ V4, V8, V8 |
| VAQ V5, V19, V19 |
| |
| VSLDB $9, V12, V8, V4 |
| VSLDB $9, V12, V19, V5 |
| VAQ V4, V16, V16 |
| VAQ V5, V11, V11 |
| |
| VSLDB $9, V12, V16, V4 |
| VAQ V4, V9, V17 |
| |
| // Keep only the low 7 bytes of every column (low 8 bytes for V11) |
| VGBM $0x007f, V4 |
| VGBM $0x00ff, V5 |
| |
| VN V10, V4, V10 |
| VN V14, V4, V14 |
| VN V8, V4, V8 |
| VN V16, V4, V16 |
| VN V17, V4, V9 |
| VN V7, V4, V7 |
| VN V13, V4, V13 |
| VN V19, V4, V19 |
| VN V11, V5, V11 |
| |
| // Pack the masked columns 1..5 into the two low 128-bit words |
| VSLDB $7, V14, V14, V14 |
| VSLDB $14, V8, V12, V4 |
| VSLDB $14, V12, V8, V8 |
| VSLDB $5, V16, V16, V16 |
| VSLDB $12, V9, V12, V5 |
| |
| VO V14, V10, V10 |
| VO V8, V16, V16 |
| VO V4, V10, V10 // first rightmost 128bits of the multiplication result |
| VO V5, V16, V16 // second rightmost 128bits of the multiplication result |
| |
| // adjust v7, v13, v19, v11 |
| VSLDB $7, V13, V13, V13 |
| VSLDB $14, V19, V12, V4 |
| VSLDB $14, V12, V19, V19 |
| VSLDB $5, V11, V12, V5 |
| VO V13, V7, V7 |
| VO V4, V7, V7 |
| VO V19, V5, V11 |
| |
| // Add the overflow of column 5 (V17, taken before masking) into the upper |
| // half and carry from V7 into V11 |
| VSLDB $9, V12, V17, V14 |
| VSLDB $12, V12, V9, V9 |
| VACCQ V7, V14, V13 |
| VAQ V7, V14, V7 |
| VAQ V11, V13, V11 |
| |
| // First reduction, 96 bits |
| VSLDB $4, V16, V10, T0 |
| VSLDB $4, V12, V16, T1 |
| VSLDB $3, V11, V7, V11 // fourth rightmost 128bits of the multiplication result |
| VSLDB $3, V7, V12, V7 |
| OBSERVATION3(V10, V8, T2, V17, V18, V19)// results V8 | T2 |
| VO V7, V9, V7 // third rightmost 128bits of the multiplication result |
| VACCQ T0, T2, V9 |
| VAQ T0, T2, T2 |
| VACQ T1, V8, V9, V8 |
| |
| // Second reduction 96 bits |
| VSLDB $4, V8, T2, T0 |
| VSLDB $4, V12, V8, T1 |
| OBSERVATION3(T2, V9, V8, V17, V18, V19)// results V9 | V8 |
| VACCQ T0, V8, T2 |
| VAQ T0, V8, V8 |
| VACQ T1, V9, T2, V9 |
| |
| // Third reduction 64 bits |
| // (V12 stops being the zero vector here: it is reused for carries) |
| VSLDB $8, V9, V8, T0 |
| VSLDB $8, V12, V9, T1 |
| OBSERVATION3A(V8, V14, V13, V17, V18)// results V14 | V13 |
| VACCQ T0, V13, V12 |
| VAQ T0, V13, V13 |
| VACQ T1, V14, V12, V14 |
| VACCQ V13, V7, V12 |
| VAQ V13, V7, T0 |
| VACCCQ V14, V11, V12, T2 |
| VACQ V14, V11, V12, T1 // results T2 | T1 | T0 |
| |
| // --------------------------------------------------- |
| // Nothing below reads through CPOOL; it is left addressing p256mul<> for |
| // the caller (p256PointAddAffineAsm loads from CPOOL after its calls). |
| MOVD $p256mul<>+0x00(SB), CPOOL |
| |
| // Final conditional subtraction: V9||V7 = T1||T0 - P1||P0, with the borrow |
| // folded into T2 so that T2 becomes the VSEL mask. |
| VZERO V12 |
| VSCBIQ P0, T0, V8 |
| VSQ P0, T0, V7 |
| VSBCBIQ T1, P1, V8, V10 |
| VSBIQ T1, P1, V8, V9 |
| VSBIQ T2, V12, V10, T2 |
| |
| // what output to use, V9||V7 or T1||T0? |
| VSEL T0, V7, T2, T0 |
| VSEL T1, V9, T2, T1 |
| |
| // restore the vector registers spilled on entry |
| VLM (SCRATCH), V16, V19 |
| |
| RET |
| |
| // --------------------------------------- |
| // p256SqrInternalVMSL |
| // V0-V1,V30,V31 - Not Modified |
| // V4-V14 - Volatile |
| |
| // p256SqrInternalVMSL squares the 256-bit value X1||X0 and reduces the product |
| // with respect to the prime held in P1||P0, leaving the result in T1||T0. |
| // It follows the same column layout as p256MulInternalVMSL with both factors |
| // taken from X; the limb numbers in the comments are inherited from that |
| // routine (so l6..l10 here denote the same limbs as l1..l5). |
| // V16-V18 are spilled to the memory addressed by SCRATCH on entry and reloaded |
| // before returning, so SCRATCH must address 48 writable bytes. |
| // On return CPOOL addresses p256mul<>. |
| TEXT ·p256SqrInternalVMSL(SB), NOFRAME|NOSPLIT, $0-0 |
| VSTM V16, V18, (SCRATCH) |
| |
| MOVD $p256vmsl<>+0x00(SB), CPOOL |
| // Divide input into limbs |
| // V14 masks the low 7 bytes of a vector; V12 stays zero until the third |
| // reduction below. |
| VGBM $0x007f, V14 |
| VZERO V12 |
| VSLDB $2, X1, X0, V13 |
| VSLDB $4, V12, X1, V11 // V11(X1): 4 bytes limb |
| |
| VN V14, X0, V10 // V10: first 7 bytes limb |
| VN V14, V13, V13 // v13: third 7 bytes limb |
| |
| VMSLG V10, V10, V12, V10 // v10: l10 x l5 (column 1) |
| VMSLG V13, V13, V12, V13 // v13: l8 x l3 |
| VMSLG V11, V11, V12, V11 // v11: l6 x l1 (column 9) |
| |
| // NOTE(review): CPOOL already holds this address (set above, not changed |
| // since); this reload looks redundant. |
| MOVD $p256vmsl<>+0x00(SB), CPOOL |
| // V14 now masks the low 7 bytes of each doubleword |
| VGBM $0x7f7f, V14 |
| |
| // Load the VPERM selectors that pair limbs up two per vector |
| VL 0(CPOOL), V4 |
| VL 16(CPOOL), V7 |
| VL 32(CPOOL), V9 |
| VL 48(CPOOL), V5 |
| VLM 64(CPOOL), V16, V18 |
| VL 112(CPOOL), V8 |
| |
| VPERM V12, X0, V4, V4 // v4: limb4 | limb5 |
| VPERM X1, X0, V7, V7 |
| VPERM V12, X0, V9, V9 // v9: limb10 | limb9 |
| VPERM X1, X0, V5, V5 |
| VPERM X1, X0, V16, V16 |
| VPERM X1, X0, V17, V17 |
| VPERM X1, V12, V18, V18 // v18: limb1 | limb2 |
| VPERM X1, V12, V8, V8 // v8: limb7 | limb6 |
| VN V14, V7, V7 // v7: limb9 | limb8 |
| VN V14, V5, V5 // v5: limb3 | limb4 |
| VN V14, V16, V16 // v16: limb2 | limb3 |
| VN V14, V17, V17 // v17: limb8 | limb7 |
| |
| // NOTE(review): VMSLEG/VMSLEOG are VMSLG variants; presumably they double |
| // the even / even+odd partial products so that symmetric cross terms are |
| // counted twice - confirm against the ISA description of VMSL. |
| VMSLEOG V9, V18, V13, V6 // v6: l10 x l1 + l9 x l2 + l8 x l3 + l7 x l4 + l6 x l5 (column 5) |
| VMSLG V9, V4, V12, V14 // v14: l10 x l4 + l9 x l5 (column 2) |
| VMSLEOG V9, V16, V12, V16 // v16: l10 x l2 + l9 x l3 + l8 x l4 + l7 x l5 (column 4) |
| VMSLEOG V7, V18, V12, V7 // v7: l9 x l1 + l8 x l2 (column 6) |
| VMSLEG V17, V18, V12, V13 // v13: l8 x l1 + l7 x l2 + l6 x l3 (column 7) |
| VMSLG V8, V18, V12, V8 // v8: l7 x l1 + l6 x l2 (column 8) |
| VMSLEG V9, V5, V12, V18 // v18: l10 x l3 + l9 x l4 + l8 x l5 (column 3) |
| |
| // Carry propagation: VSLDB $9 against the zero vector V12 shifts a column |
| // right by 7 bytes; that overflow is added into the next column. |
| // Two chains run side by side: columns 1..5 and columns 6..9. |
| VSLDB $9, V12, V10, V4 |
| VSLDB $9, V12, V7, V5 |
| VAQ V4, V14, V14 |
| VAQ V5, V13, V13 |
| |
| VSLDB $9, V12, V14, V4 |
| VSLDB $9, V12, V13, V5 |
| VAQ V4, V18, V18 |
| VAQ V5, V8, V8 |
| |
| VSLDB $9, V12, V18, V4 |
| VSLDB $9, V12, V8, V5 |
| VAQ V4, V16, V16 |
| VAQ V5, V11, V11 |
| |
| VSLDB $9, V12, V16, V4 |
| VAQ V4, V6, V17 |
| |
| // Keep only the low 7 bytes of every column (low 8 bytes for V11) |
| VGBM $0x007f, V4 |
| VGBM $0x00ff, V5 |
| |
| VN V10, V4, V10 |
| VN V14, V4, V14 |
| VN V18, V4, V18 |
| VN V16, V4, V16 |
| VN V17, V4, V9 |
| VN V7, V4, V7 |
| VN V13, V4, V13 |
| VN V8, V4, V8 |
| VN V11, V5, V11 |
| |
| // Pack the masked columns 1..5 into the two low 128-bit words |
| VSLDB $7, V14, V14, V14 |
| VSLDB $14, V18, V12, V4 |
| VSLDB $14, V12, V18, V18 |
| VSLDB $5, V16, V16, V16 |
| VSLDB $12, V9, V12, V5 |
| |
| VO V14, V10, V10 |
| VO V18, V16, V16 |
| VO V4, V10, V10 // first rightmost 128bits of the multiplication result |
| VO V5, V16, V16 // second rightmost 128bits of the multiplication result |
| |
| // adjust v7, v13, v8, v11 |
| VSLDB $7, V13, V13, V13 |
| VSLDB $14, V8, V12, V4 |
| VSLDB $14, V12, V8, V8 |
| VSLDB $5, V11, V12, V5 |
| VO V13, V7, V7 |
| VO V4, V7, V7 |
| VO V8, V5, V11 |
| |
| // Add the overflow of column 5 (V17, taken before masking) into the upper |
| // half and carry from V7 into V11 |
| VSLDB $9, V12, V17, V14 |
| VSLDB $12, V12, V9, V9 |
| VACCQ V7, V14, V13 |
| VAQ V7, V14, V7 |
| VAQ V11, V13, V11 |
| |
| // First reduction, 96 bits |
| VSLDB $4, V16, V10, T0 |
| VSLDB $4, V12, V16, T1 |
| VSLDB $3, V11, V7, V11 // fourth rightmost 128bits of the multiplication result |
| VSLDB $3, V7, V12, V7 |
| OBSERVATION3(V10, V8, T2, V16, V17, V18)// results V8 | T2 |
| VO V7, V9, V7 // third rightmost 128bits of the multiplication result |
| VACCQ T0, T2, V9 |
| VAQ T0, T2, T2 |
| VACQ T1, V8, V9, V8 |
| |
| // Second reduction 96 bits |
| VSLDB $4, V8, T2, T0 |
| VSLDB $4, V12, V8, T1 |
| OBSERVATION3(T2, V9, V8, V16, V17, V18)// results V9 | V8 |
| VACCQ T0, V8, T2 |
| VAQ T0, V8, V8 |
| VACQ T1, V9, T2, V9 |
| |
| // Third reduction 64 bits |
| // (V12 stops being the zero vector here: it is reused for carries) |
| VSLDB $8, V9, V8, T0 |
| VSLDB $8, V12, V9, T1 |
| OBSERVATION3A(V8, V14, V13, V17, V18)// results V14 | V13 |
| VACCQ T0, V13, V12 |
| VAQ T0, V13, V13 |
| VACQ T1, V14, V12, V14 |
| VACCQ V13, V7, V12 |
| VAQ V13, V7, T0 |
| VACCCQ V14, V11, V12, T2 |
| VACQ V14, V11, V12, T1 // results T2 | T1 | T0 |
| |
| // --------------------------------------------------- |
| // Nothing below reads through CPOOL; it is left addressing p256mul<> for |
| // the caller. |
| MOVD $p256mul<>+0x00(SB), CPOOL |
| |
| // Final conditional subtraction: V9||V7 = T1||T0 - P1||P0, with the borrow |
| // folded into T2 so that T2 becomes the VSEL mask. |
| VZERO V12 |
| VSCBIQ P0, T0, V8 |
| VSQ P0, T0, V7 |
| VSBCBIQ T1, P1, V8, V10 |
| VSBIQ T1, P1, V8, V9 |
| VSBIQ T2, V12, V10, T2 |
| |
| // what output to use, V9||V7 or T1||T0? |
| VSEL T0, V7, T2, T0 |
| VSEL T1, V9, T2, T1 |
| |
| // restore the vector registers spilled on entry |
| VLM (SCRATCH), V16, V18 |
| RET |
| |
| |
| |
| #undef CPOOL |
| #undef SCRATCH |
| #undef X0 |
| #undef X1 |
| #undef Y0 |
| #undef Y1 |
| #undef T0 |
| #undef T1 |
| #undef T2 |
| #undef P0 |
| #undef P1 |
| |
| #define SCRATCH R9 |
| |
| // p256MulInternal<> is the file-local entry point for field multiplication. |
| // It provides a 64-byte stack scratch area through SCRATCH and calls whatever |
| // routine is currently stored in p256MulInternalFacility. |
| TEXT p256MulInternal<>(SB),NOSPLIT,$64-0 |
| MOVD ·p256MulInternalFacility+0x00(SB),R7 // R7 = current implementation |
| MOVD $scratch-64(SP), SCRATCH // SCRATCH = &scratch[0] |
| CALL (R7) |
| RET |
| |
| // p256MulInternalTrampolineSetup is the initial value of |
| // p256MulInternalFacility. It chooses p256MulInternalVMSL when the HasVE1 byte |
| // of internal/cpu.S390X is non-zero and p256MulInternalVX otherwise, records |
| // the choice in the facility slot and branches to it. |
| // Writes R0, R7 and R8. |
| TEXT ·p256MulInternalTrampolineSetup(SB),NOSPLIT|NOFRAME, $0 |
| MOVD $·p256MulInternalFacility+0x00(SB), R7 // R7 = &facility slot |
| MOVD $·p256MulInternalVX(SB), R8 // default choice: VX |
| MOVBZ internal∕cpu·S390X+const_offsetS390xHasVE1(SB), R0 |
| CMPBEQ R0, $0, chosen // HasVE1 == 0: VMSL unavailable, keep VX |
| MOVD $·p256MulInternalVMSL(SB), R8 |
| chosen: |
| MOVD R8, 0(R7) // later calls bypass this setup |
| BR (R8) |
| |
| GLOBL ·p256MulInternalFacility+0x00(SB), NOPTR, $8 |
| DATA ·p256MulInternalFacility+0x00(SB)/8, $·p256MulInternalTrampolineSetup(SB) |
| |
| // Parameters |
| #define X0 V0 |
| #define X1 V1 |
| #define Y0 V2 |
| #define Y1 V3 |
| |
| // p256SqrInternalVX squares X1||X0 by copying it into Y1||Y0 and tail-branching |
| // to p256MulInternalVX. Overwrites Y0 and Y1. |
| TEXT ·p256SqrInternalVX(SB), NOFRAME|NOSPLIT, $0 |
| VLR X1, Y1 |
| VLR X0, Y0 |
| BR ·p256MulInternalVX(SB) |
| |
| #undef X0 |
| #undef X1 |
| #undef Y0 |
| #undef Y1 |
| |
| |
| // p256SqrInternal<> is the file-local entry point for field squaring. |
| // It provides a 48-byte stack scratch area through SCRATCH and calls whatever |
| // routine is currently stored in p256SqrInternalFacility. |
| TEXT p256SqrInternal<>(SB),NOSPLIT,$48-0 |
| MOVD ·p256SqrInternalFacility+0x00(SB),R7 // R7 = current implementation |
| MOVD $scratch-48(SP), SCRATCH // SCRATCH = &scratch[0] |
| CALL (R7) |
| RET |
| |
| // p256SqrInternalTrampolineSetup is the initial value of |
| // p256SqrInternalFacility. It chooses p256SqrInternalVMSL when the HasVE1 byte |
| // of internal/cpu.S390X is non-zero and p256SqrInternalVX otherwise, records |
| // the choice in the facility slot and branches to it. |
| // Writes R0, R7 and R8. |
| TEXT ·p256SqrInternalTrampolineSetup(SB),NOSPLIT|NOFRAME, $0 |
| MOVD $·p256SqrInternalFacility+0x00(SB), R7 // R7 = &facility slot |
| MOVD $·p256SqrInternalVX(SB), R8 // default choice: VX |
| MOVBZ internal∕cpu·S390X+const_offsetS390xHasVE1(SB), R0 |
| CMPBEQ R0, $0, chosen // HasVE1 == 0: VMSL unavailable, keep VX |
| MOVD $·p256SqrInternalVMSL(SB), R8 |
| chosen: |
| MOVD R8, 0(R7) // later calls bypass this setup |
| BR (R8) |
| |
| |
| GLOBL ·p256SqrInternalFacility+0x00(SB), NOPTR, $8 |
| DATA ·p256SqrInternalFacility+0x00(SB)/8, $·p256SqrInternalTrampolineSetup(SB) |
| |
| #undef SCRATCH |
| |
| |
| // p256SubInternal(outH, outL, aH, aL, bH, bL) |
| // outH||outL = aH||aL - bH||bL; when the 256-bit subtraction borrows, |
| // PH||PL is added back to the difference. |
| // Uses ZER, SEL1, CAR1, TT0, TT1, PL and PH from the enclosing function and |
| // overwrites ZER, SEL1, CAR1, TT0 and TT1. |
| #define p256SubInternal(outH, outL, aH, aL, bH, bL) \ |
| VZERO ZER \ |
| VSCBIQ bL, aL, CAR1 \ |
| VSQ bL, aL, outL \ |
| VSBCBIQ aH, bH, CAR1, SEL1 \ |
| VSBIQ aH, bH, CAR1, outH \ |
| VSQ SEL1, ZER, SEL1 \ |
| \ |
| VACCQ outL, PL, CAR1 \ |
| VAQ outL, PL, TT0 \ |
| VACQ outH, PH, CAR1, TT1 \ |
| \ |
| VSEL outL, TT0, SEL1, outL \ |
| VSEL outH, TT1, SEL1, outH \ |
| |
| // p256AddInternal(outH, outL, aH, aL, bH, bL) |
| // outH||outL = aH||aL + bH||bL, followed by one conditional subtraction of |
| // PH||PL: the difference replaces the sum unless subtracting borrowed. |
| // Uses ZER, SEL1, CAR1, CAR2, T2, TT0, TT1, PL and PH from the enclosing |
| // function and overwrites ZER, SEL1, CAR1, CAR2, T2, TT0 and TT1. |
| #define p256AddInternal(outH, outL, aH, aL, bH, bL) \ |
| VACCQ aL, bL, CAR1 \ |
| VAQ aL, bL, outL \ |
| VACCCQ aH, bH, CAR1, T2 \ |
| VACQ aH, bH, CAR1, outH \ |
| \ |
| VZERO ZER \ |
| VSCBIQ PL, outL, CAR1 \ |
| VSQ PL, outL, TT0 \ |
| VSBCBIQ outH, PH, CAR1, CAR2 \ |
| VSBIQ outH, PH, CAR1, TT1 \ |
| VSBIQ T2, ZER, CAR2, SEL1 \ |
| \ |
| VSEL outL, TT0, SEL1, outL \ |
| VSEL outH, TT1, SEL1, outH |
| |
| // p256HalfInternal(outH, outL, inH, inL) |
| // Halves inH||inL: SEL1 is derived from the lowest bit of inL and selects |
| // either the input itself or input + PH||PL (carry kept in T2); the selected |
| // value is then shifted right by one bit across T2, outH and outL. |
| // Uses ZER, SEL1, CAR1, T2, TT0, TT1, PL and PH from the enclosing function |
| // and overwrites ZER, SEL1, CAR1, T2, TT0 and TT1. |
| #define p256HalfInternal(outH, outL, inH, inL) \ |
| VZERO ZER \ |
| VSBIQ ZER, ZER, inL, SEL1 \ |
| \ |
| VACCQ inL, PL, CAR1 \ |
| VAQ inL, PL, outL \ |
| VACCCQ inH, PH, CAR1, T2 \ |
| VACQ inH, PH, CAR1, outH \ |
| \ |
| VSEL inL, outL, SEL1, outL \ |
| VSEL inH, outH, SEL1, outH \ |
| VSEL ZER, T2, SEL1, T2 \ |
| \ |
| VSLDB $15, T2, ZER, TT1 \ |
| VSLDB $15, outH, ZER, TT0 \ |
| VREPIB $1, SEL1 \ |
| VSRL SEL1, outL, outL \ |
| VSRL SEL1, outH, outH \ |
| VREPIB $7, SEL1 \ |
| VSL SEL1, TT0, TT0 \ |
| VSL SEL1, TT1, TT1 \ |
| VO outL, TT0, outL \ |
| VO outH, TT1, outH |
| |
| // --------------------------------------- |
| // func p256MulAsm(res, in1, in2 []byte) |
| #define res_ptr R1 |
| #define x_ptr R2 |
| #define y_ptr R3 |
| #define CPOOL R4 |
| |
| // Parameters |
| #define X0 V0 |
| #define X1 V1 |
| #define Y0 V2 |
| #define Y1 V3 |
| #define T0 V4 |
| #define T1 V5 |
| |
| // Constants |
| #define P0 V30 |
| #define P1 V31 |
| // p256MulAsm loads the two 32-byte operands behind in1 and in2 (first 16 bytes |
| // into the high register, next 16 into the low one), multiplies them with |
| // p256MulInternal<> and writes the 32-byte result behind res in the same layout. |
| TEXT ·p256MulAsm(SB), NOSPLIT, $0 |
| // prime from the constant pool |
| MOVD $p256mul<>+0x00(SB), CPOOL |
| VL 0(CPOOL), P1 |
| VL 16(CPOOL), P0 |
| |
| // second operand |
| MOVD in2+48(FP), y_ptr |
| VL (0*16)(y_ptr), Y1 |
| VL (1*16)(y_ptr), Y0 |
| |
| // first operand |
| MOVD in1+24(FP), x_ptr |
| VL (0*16)(x_ptr), X1 |
| VL (1*16)(x_ptr), X0 |
| |
| MOVD res+0(FP), res_ptr |
| |
| CALL p256MulInternal<>(SB) |
| |
| VST T1, (0*16)(res_ptr) |
| VST T0, (1*16)(res_ptr) |
| RET |
| |
| #undef res_ptr |
| #undef x_ptr |
| #undef y_ptr |
| #undef CPOOL |
| |
| #undef X0 |
| #undef X1 |
| #undef Y0 |
| #undef Y1 |
| #undef T0 |
| #undef T1 |
| #undef P0 |
| #undef P1 |
| |
| // --------------------------------------- |
| // func p256SqrAsm(res, in1 []byte) |
| #define res_ptr R1 |
| #define x_ptr R2 |
| #define y_ptr R3 |
| #define CPOOL R4 |
| |
| // Parameters |
| #define X0 V0 |
| #define X1 V1 |
| #define T0 V4 |
| #define T1 V5 |
| |
| // Constants |
| #define P0 V30 |
| #define P1 V31 |
| // p256SqrAsm loads the 32-byte operand behind in1 (first 16 bytes into the high |
| // register, next 16 into the low one), squares it with p256SqrInternal<> and |
| // writes the 32-byte result behind res in the same layout. |
| TEXT ·p256SqrAsm(SB), NOSPLIT, $0 |
| // prime from the constant pool |
| MOVD $p256mul<>+0x00(SB), CPOOL |
| VL 0(CPOOL), P1 |
| VL 16(CPOOL), P0 |
| |
| // operand |
| MOVD in1+24(FP), x_ptr |
| VL (0*16)(x_ptr), X1 |
| VL (1*16)(x_ptr), X0 |
| |
| MOVD res+0(FP), res_ptr |
| |
| CALL p256SqrInternal<>(SB) |
| |
| VST T1, (0*16)(res_ptr) |
| VST T0, (1*16)(res_ptr) |
| RET |
| |
| #undef res_ptr |
| #undef x_ptr |
| #undef y_ptr |
| #undef CPOOL |
| |
| #undef X0 |
| #undef X1 |
| #undef T0 |
| #undef T1 |
| #undef P0 |
| #undef P1 |
| |
| |
| // Point add with P2 being affine point |
| // If sign == 1 -> P2 = -P2 |
| // If sel == 0 -> P3 = P1 |
| // if zero == 0 -> P3 = P2 |
| // p256PointAddAffineAsm(P3, P1, P2 *p256Point, sign, sel, zero int) |
| #define P3ptr R1 |
| #define P1ptr R2 |
| #define P2ptr R3 |
| #define CPOOL R4 |
| |
| // Temporaries in REGs |
| #define Y2L V15 |
| #define Y2H V16 |
| #define T1L V17 |
| #define T1H V18 |
| #define T2L V19 |
| #define T2H V20 |
| #define T3L V21 |
| #define T3H V22 |
| #define T4L V23 |
| #define T4H V24 |
| |
| // Temps for Sub and Add |
| #define TT0 V11 |
| #define TT1 V12 |
| #define T2 V13 |
| |
| // p256MulAsm Parameters |
| #define X0 V0 |
| #define X1 V1 |
| #define Y0 V2 |
| #define Y1 V3 |
| #define T0 V4 |
| #define T1 V5 |
| |
| #define PL V30 |
| #define PH V31 |
| |
| // Names for zero/sel selects |
| #define X1L V0 |
| #define X1H V1 |
| #define Y1L V2 // p256MulAsmParmY |
| #define Y1H V3 // p256MulAsmParmY |
| #define Z1L V4 |
| #define Z1H V5 |
| #define X2L V0 |
| #define X2H V1 |
| #define Z2L V4 |
| #define Z2H V5 |
| #define X3L V17 // T1L |
| #define X3H V18 // T1H |
| #define Y3L V21 // T3L |
| #define Y3H V22 // T3H |
| #define Z3L V28 |
| #define Z3H V29 |
| |
| #define ZER V6 |
| #define SEL1 V7 |
| #define CAR1 V8 |
| #define CAR2 V9 |
| /* * |
| * Three operand formula: |
| * Source: 2004 Hankerson–Menezes–Vanstone, page 91. |
| * T1 = Z1² |
| * T2 = T1*Z1 |
| * T1 = T1*X2 |
| * T2 = T2*Y2 |
| * T1 = T1-X1 |
| * T2 = T2-Y1 |
| * Z3 = Z1*T1 |
| * T3 = T1² |
| * T4 = T3*T1 |
| * T3 = T3*X1 |
| * T1 = 2*T3 |
| * X3 = T2² |
| * X3 = X3-T1 |
| * X3 = X3-T4 |
| * T3 = T3-X3 |
| * T3 = T3*T2 |
| * T4 = T4*Y1 |
| * Y3 = T3-T4 |
| |
| * Three operand formulas, but with MulInternal X,Y used to store temps |
| X=Z1; Y=Z1; MUL;T- // T1 = Z1² T1 |
| X=T ; Y- ; MUL;T2=T // T2 = T1*Z1 T1 T2 |
| X- ; Y=X2; MUL;T1=T // T1 = T1*X2 T1 T2 |
| X=T2; Y=Y2; MUL;T- // T2 = T2*Y2 T1 T2 |
| SUB(T2<T-Y1) // T2 = T2-Y1 T1 T2 |
| SUB(Y<T1-X1) // T1 = T1-X1 T1 T2 |
| X=Z1; Y- ; MUL;Z3:=T// Z3 = Z1*T1 T2 |
| X=Y; Y- ; MUL;X=T // T3 = T1*T1 T2 |
| X- ; Y- ; MUL;T4=T // T4 = T3*T1 T2 T4 |
| X- ; Y=X1; MUL;T3=T // T3 = T3*X1 T2 T3 T4 |
| ADD(T1<T+T) // T1 = T3+T3 T1 T2 T3 T4 |
| X=T2; Y=T2; MUL;T- // X3 = T2*T2 T1 T2 T3 T4 |
| SUB(T<T-T1) // X3 = X3-T1 T1 T2 T3 T4 |
| SUB(T<T-T4) X3:=T // X3 = X3-T4 T2 T3 T4 |
| SUB(X<T3-T) // T3 = T3-X3 T2 T3 T4 |
| X- ; Y- ; MUL;T3=T // T3 = T3*T2 T2 T3 T4 |
| X=T4; Y=Y1; MUL;T- // T4 = T4*Y1 T3 T4 |
| SUB(T<T3-T) Y3:=T // Y3 = T3-T4 T3 T4 |
| |
| */ |
| // p256PointAddAffineAsm adds the affine point P2 to P1 and writes the result |
| // through P3, then applies the sel and zero overrides. |
| // Every coordinate is accessed as two 16-byte halves, high half at the lower |
| // address: X at offsets 0/16, Y at 32/48 and Z at 64/80 of each point. |
| // PL/PH hold the prime loaded from p256mul<>. Nothing is stored through P3 |
| // until the very end, after all loads from P1 and P2. |
| TEXT ·p256PointAddAffineAsm(SB), NOSPLIT, $0 |
| MOVD P3+0(FP), P3ptr |
| MOVD P1+8(FP), P1ptr |
| MOVD P2+16(FP), P2ptr |
| |
| MOVD $p256mul<>+0x00(SB), CPOOL |
| VL 16(CPOOL), PL |
| VL 0(CPOOL), PH |
| |
| // if (sign == 1) { |
| // Y2 = fromBig(new(big.Int).Mod(new(big.Int).Sub(p256.P, new(big.Int).SetBytes(Y2)), p256.P)) // Y2 = P-Y2 |
| // } |
| |
| VL 32(P2ptr), Y2H |
| VL 48(P2ptr), Y2L |
| |
| // SEL1 = all ones when sign == 0, all zeros otherwise |
| VLREPG sign+24(FP), SEL1 |
| VZERO ZER |
| VCEQG SEL1, ZER, SEL1 |
| |
| // T1H||T1L = P - Y2 |
| VSCBIQ Y2L, PL, CAR1 |
| VSQ Y2L, PL, T1L |
| VSBIQ PH, Y2H, CAR1, T1H |
| |
| // keep Y2 where SEL1 is set, otherwise take P - Y2 |
| VSEL Y2L, T1L, SEL1, Y2L |
| VSEL Y2H, T1H, SEL1, Y2H |
| |
| /* * |
| * Three operand formula: |
| * Source: 2004 Hankerson–Menezes–Vanstone, page 91. |
| */ |
| // X=Z1; Y=Z1; MUL; T- // T1 = Z1² T1 |
| VL 64(P1ptr), X1 // Z1H |
| VL 80(P1ptr), X0 // Z1L |
| VLR X0, Y0 |
| VLR X1, Y1 |
| CALL p256SqrInternal<>(SB) |
| |
| // X=T ; Y- ; MUL; T2=T // T2 = T1*Z1 T1 T2 |
| VLR T0, X0 |
| VLR T1, X1 |
| CALL p256MulInternal<>(SB) |
| VLR T0, T2L |
| VLR T1, T2H |
| |
| // X- ; Y=X2; MUL; T1=T // T1 = T1*X2 T1 T2 |
| VL 0(P2ptr), Y1 // X2H |
| VL 16(P2ptr), Y0 // X2L |
| CALL p256MulInternal<>(SB) |
| VLR T0, T1L |
| VLR T1, T1H |
| |
| // X=T2; Y=Y2; MUL; T- // T2 = T2*Y2 T1 T2 |
| VLR T2L, X0 |
| VLR T2H, X1 |
| VLR Y2L, Y0 |
| VLR Y2H, Y1 |
| CALL p256MulInternal<>(SB) |
| |
| // SUB(T2<T-Y1) // T2 = T2-Y1 T1 T2 |
| VL 32(P1ptr), Y1H |
| VL 48(P1ptr), Y1L |
| p256SubInternal(T2H,T2L,T1,T0,Y1H,Y1L) |
| |
| // SUB(Y<T1-X1) // T1 = T1-X1 T1 T2 |
| VL 0(P1ptr), X1H |
| VL 16(P1ptr), X1L |
| p256SubInternal(Y1,Y0,T1H,T1L,X1H,X1L) |
| |
| // X=Z1; Y- ; MUL; Z3:=T// Z3 = Z1*T1 T2 |
| VL 64(P1ptr), X1 // Z1H |
| VL 80(P1ptr), X0 // Z1L |
| CALL p256MulInternal<>(SB) |
| |
| // Z3 is kept in registers and only stored after the sel/zero selection |
| // VST T1, 64(P3ptr) |
| // VST T0, 80(P3ptr) |
| VLR T0, Z3L |
| VLR T1, Z3H |
| |
| // X=Y; Y- ; MUL; X=T // T3 = T1*T1 T2 |
| VLR Y0, X0 |
| VLR Y1, X1 |
| CALL p256SqrInternal<>(SB) |
| VLR T0, X0 |
| VLR T1, X1 |
| |
| // X- ; Y- ; MUL; T4=T // T4 = T3*T1 T2 T4 |
| CALL p256MulInternal<>(SB) |
| VLR T0, T4L |
| VLR T1, T4H |
| |
| // X- ; Y=X1; MUL; T3=T // T3 = T3*X1 T2 T3 T4 |
| VL 0(P1ptr), Y1 // X1H |
| VL 16(P1ptr), Y0 // X1L |
| CALL p256MulInternal<>(SB) |
| VLR T0, T3L |
| VLR T1, T3H |
| |
| // ADD(T1<T+T) // T1 = T3+T3 T1 T2 T3 T4 |
| p256AddInternal(T1H,T1L, T1,T0,T1,T0) |
| |
| // X=T2; Y=T2; MUL; T- // X3 = T2*T2 T1 T2 T3 T4 |
| VLR T2L, X0 |
| VLR T2H, X1 |
| VLR T2L, Y0 |
| VLR T2H, Y1 |
| CALL p256SqrInternal<>(SB) |
| |
| // SUB(T<T-T1) // X3 = X3-T1 T1 T2 T3 T4 (T1 = X3) |
| p256SubInternal(T1,T0,T1,T0,T1H,T1L) |
| |
| // SUB(T<T-T4) X3:=T // X3 = X3-T4 T2 T3 T4 |
| p256SubInternal(T1,T0,T1,T0,T4H,T4L) |
| VLR T0, X3L |
| VLR T1, X3H |
| |
| // SUB(X<T3-T) // T3 = T3-X3 T2 T3 T4 |
| p256SubInternal(X1,X0,T3H,T3L,T1,T0) |
| |
| // X- ; Y- ; MUL; T3=T // T3 = T3*T2 T2 T3 T4 |
| CALL p256MulInternal<>(SB) |
| VLR T0, T3L |
| VLR T1, T3H |
| |
| // X=T4; Y=Y1; MUL; T- // T4 = T4*Y1 T3 T4 |
| VLR T4L, X0 |
| VLR T4H, X1 |
| VL 32(P1ptr), Y1 // Y1H |
| VL 48(P1ptr), Y0 // Y1L |
| CALL p256MulInternal<>(SB) |
| |
| // SUB(T<T3-T) Y3:=T // Y3 = T3-T4 T3 T4 (T3 = Y3) |
| p256SubInternal(Y3H,Y3L,T3H,T3L,T1,T0) |
| |
| // if (sel == 0) { |
| // copy(P3.x[:], X1) |
| // copy(P3.y[:], Y1) |
| // copy(P3.z[:], Z1) |
| // } |
| |
| VL 0(P1ptr), X1H |
| VL 16(P1ptr), X1L |
| |
| // Y1 already loaded, left over from addition |
| // (Y1L/Y1H are the same registers as Y0/Y1, loaded for the last multiply) |
| VL 64(P1ptr), Z1H |
| VL 80(P1ptr), Z1L |
| |
| // SEL1 = all ones when sel == 0: the P1 coordinates replace the sum |
| VLREPG sel+32(FP), SEL1 |
| VZERO ZER |
| VCEQG SEL1, ZER, SEL1 |
| |
| VSEL X1L, X3L, SEL1, X3L |
| VSEL X1H, X3H, SEL1, X3H |
| VSEL Y1L, Y3L, SEL1, Y3L |
| VSEL Y1H, Y3H, SEL1, Y3H |
| VSEL Z1L, Z3L, SEL1, Z3L |
| VSEL Z1H, Z3H, SEL1, Z3H |
| |
| // if (zero == 0) { |
| // copy(P3.x[:], X2) |
| // copy(P3.y[:], Y2) |
| // copy(P3.z[:], []byte{0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xfe, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, |
| // 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01}) //(p256.z*2^256)%p |
| // } |
| VL 0(P2ptr), X2H |
| VL 16(P2ptr), X2L |
| |
| // Y2 already loaded |
| // These loads rely on CPOOL still addressing p256mul<> after the internal |
| // calls; the VMSL routines reload it before returning. |
| // NOTE(review): presumably the VX routines leave it there too, and offsets |
| // 128/144 hold the Z constant quoted above - confirm against the data table. |
| VL 128(CPOOL), Z2H |
| VL 144(CPOOL), Z2L |
| |
| // SEL1 = all ones when zero == 0: the P2 coordinates replace the result |
| VLREPG zero+40(FP), SEL1 |
| VZERO ZER |
| VCEQG SEL1, ZER, SEL1 |
| |
| VSEL X2L, X3L, SEL1, X3L |
| VSEL X2H, X3H, SEL1, X3H |
| VSEL Y2L, Y3L, SEL1, Y3L |
| VSEL Y2H, Y3H, SEL1, Y3H |
| VSEL Z2L, Z3L, SEL1, Z3L |
| VSEL Z2H, Z3H, SEL1, Z3H |
| |
| // All done, store out the result!!! |
| VST X3H, 0(P3ptr) |
| VST X3L, 16(P3ptr) |
| VST Y3H, 32(P3ptr) |
| VST Y3L, 48(P3ptr) |
| VST Z3H, 64(P3ptr) |
| VST Z3L, 80(P3ptr) |
| |
| RET |
| |
| #undef P3ptr |
| #undef P1ptr |
| #undef P2ptr |
| #undef CPOOL |
| |
| #undef Y2L |
| #undef Y2H |
| #undef T1L |
| #undef T1H |
| #undef T2L |
| #undef T2H |
| #undef T3L |
| #undef T3H |
| #undef T4L |
| #undef T4H |
| |
| #undef TT0 |
| #undef TT1 |
| #undef T2 |
| |
| #undef X0 |
| #undef X1 |
| #undef Y0 |
| #undef Y1 |
| #undef T0 |
| #undef T1 |
| |
| #undef PL |
| #undef PH |
| |
| #undef X1L |
| #undef X1H |
| #undef Y1L |
| #undef Y1H |
| #undef Z1L |
| #undef Z1H |
| #undef X2L |
| #undef X2H |
| #undef Z2L |
| #undef Z2H |
| #undef X3L |
| #undef X3H |
| #undef Y3L |
| #undef Y3H |
| #undef Z3L |
| #undef Z3H |
| |
| #undef ZER |
| #undef SEL1 |
| #undef CAR1 |
| #undef CAR2 |
| |
| // p256PointDoubleAsm(P3, P1 *p256Point) |
| // https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian.html#doubling-dbl-2007-bl |
| // https://www.hyperelliptic.org/EFD/g1p/auto-shortw.html |
| // https://www.hyperelliptic.org/EFD/g1p/auto-shortw-projective-3.html |
| #define P3ptr R1 |
| #define P1ptr R2 |
| #define CPOOL R4 |
| |
| // Temporaries in REGs |
| #define X3L V15 |
| #define X3H V16 |
| #define Y3L V17 |
| #define Y3H V18 |
| #define T1L V19 |
| #define T1H V20 |
| #define T2L V21 |
| #define T2H V22 |
| #define T3L V23 |
| #define T3H V24 |
| |
| #define X1L V6 |
| #define X1H V7 |
| #define Y1L V8 |
| #define Y1H V9 |
| #define Z1L V10 |
| #define Z1H V11 |
| |
| // Temps for Sub and Add |
| #define TT0 V11 |
| #define TT1 V12 |
| #define T2 V13 |
| |
| // p256MulAsm Parameters |
| #define X0 V0 |
| #define X1 V1 |
| #define Y0 V2 |
| #define Y1 V3 |
| #define T0 V4 |
| #define T1 V5 |
| |
| #define PL V30 |
| #define PH V31 |
| |
| #define Z3L V23 |
| #define Z3H V24 |
| |
| #define ZER V26 |
| #define SEL1 V27 |
| #define CAR1 V28 |
| #define CAR2 V29 |
| /* |
| * https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2004-hmv |
| * Cost: 4M + 4S + 1*half + 5add + 2*2 + 1*3. |
| * Source: 2004 Hankerson–Menezes–Vanstone, page 91. |
| * A = 3(X₁-Z₁²)×(X₁+Z₁²) |
| * B = 2Y₁ |
| * Z₃ = B×Z₁ |
| * C = B² |
| * D = C×X₁ |
| * X₃ = A²-2D |
| * Y₃ = (D-X₃)×A-C²/2 |
| * |
| * Three-operand formula: |
| * T1 = Z1² |
| * T2 = X1-T1 |
| * T1 = X1+T1 |
| * T2 = T2*T1 |
| * T2 = 3*T2 |
| * Y3 = 2*Y1 |
| * Z3 = Y3*Z1 |
| * Y3 = Y3² |
| * T3 = Y3*X1 |
| * Y3 = Y3² |
| * Y3 = half*Y3 |
| * X3 = T2² |
| * T1 = 2*T3 |
| * X3 = X3-T1 |
| * T1 = T3-X3 |
| * T1 = T1*T2 |
| * Y3 = T1-Y3 |
| */ |
| |
| // p256PointDoubleAsm doubles the point behind P1 and writes the result |
| // through P3, following the three-operand formula listed above. |
| // Every coordinate is accessed as two 16-byte halves, high half at the lower |
| // address: X at offsets 0/16, Y at 32/48 and Z at 64/80. |
| // PL/PH hold the prime loaded from p256mul<>. |
| // Z3 and X3 are stored as soon as they are final; the matching P1 coordinate |
| // is not loaded again after each of those stores. |
| TEXT ·p256PointDoubleAsm(SB), NOSPLIT, $0 |
| MOVD P3+0(FP), P3ptr |
| MOVD P1+8(FP), P1ptr |
| |
| MOVD $p256mul<>+0x00(SB), CPOOL |
| VL 16(CPOOL), PL |
| VL 0(CPOOL), PH |
| |
| // X=Z1; Y=Z1; MUL; T- // T1 = Z1² |
| VL 64(P1ptr), X1 // Z1H |
| VL 80(P1ptr), X0 // Z1L |
| VLR X0, Y0 |
| VLR X1, Y1 |
| CALL p256SqrInternal<>(SB) |
| |
| // SUB(X<X1-T) // T2 = X1-T1 |
| VL 0(P1ptr), X1H |
| VL 16(P1ptr), X1L |
| p256SubInternal(X1,X0,X1H,X1L,T1,T0) |
| |
| // ADD(Y<X1+T) // T1 = X1+T1 |
| p256AddInternal(Y1,Y0,X1H,X1L,T1,T0) |
| |
| // X- ; Y- ; MUL; T- // T2 = T2*T1 |
| CALL p256MulInternal<>(SB) |
| |
| // ADD(T2<T+T); ADD(T2<T2+T) // T2 = 3*T2 |
| p256AddInternal(T2H,T2L,T1,T0,T1,T0) |
| p256AddInternal(T2H,T2L,T2H,T2L,T1,T0) |
| |
| // ADD(X<Y1+Y1) // Y3 = 2*Y1 |
| VL 32(P1ptr), Y1H |
| VL 48(P1ptr), Y1L |
| p256AddInternal(X1,X0,Y1H,Y1L,Y1H,Y1L) |
| |
| // X- ; Y=Z1; MUL; Z3:=T // Z3 = Y3*Z1 |
| VL 64(P1ptr), Y1 // Z1H |
| VL 80(P1ptr), Y0 // Z1L |
| CALL p256MulInternal<>(SB) |
| // Z3 is final; offsets 64/80 of P1 are not loaded again below |
| VST T1, 64(P3ptr) |
| VST T0, 80(P3ptr) |
| |
| // X- ; Y=X ; MUL; T- // Y3 = Y3² |
| VLR X0, Y0 |
| VLR X1, Y1 |
| CALL p256SqrInternal<>(SB) |
| |
| // X=T ; Y=X1; MUL; T3=T // T3 = Y3*X1 |
| VLR T0, X0 |
| VLR T1, X1 |
| VL 0(P1ptr), Y1 |
| VL 16(P1ptr), Y0 |
| CALL p256MulInternal<>(SB) |
| VLR T0, T3L |
| VLR T1, T3H |
| |
| // X- ; Y=X ; MUL; T- // Y3 = Y3² |
| VLR X0, Y0 |
| VLR X1, Y1 |
| CALL p256SqrInternal<>(SB) |
| |
| // HAL(Y3<T) // Y3 = half*Y3 |
| p256HalfInternal(Y3H,Y3L, T1,T0) |
| |
| // X=T2; Y=T2; MUL; T- // X3 = T2² |
| VLR T2L, X0 |
| VLR T2H, X1 |
| VLR T2L, Y0 |
| VLR T2H, Y1 |
| CALL p256SqrInternal<>(SB) |
| |
| // ADD(T1<T3+T3) // T1 = 2*T3 |
| p256AddInternal(T1H,T1L,T3H,T3L,T3H,T3L) |
| |
| // SUB(X3<T-T1) X3:=X3 // X3 = X3-T1 |
| p256SubInternal(X3H,X3L,T1,T0,T1H,T1L) |
| // X3 is final; P1 is not loaded again below |
| VST X3H, 0(P3ptr) |
| VST X3L, 16(P3ptr) |
| |
| // SUB(X<T3-X3) // T1 = T3-X3 |
| p256SubInternal(X1,X0,T3H,T3L,X3H,X3L) |
| |
| // X- ; Y- ; MUL; T- // T1 = T1*T2 |
| // (Y1||Y0 still hold T2 from the squaring above) |
| CALL p256MulInternal<>(SB) |
| |
| // SUB(Y3<T-Y3) // Y3 = T1-Y3 |
| p256SubInternal(Y3H,Y3L,T1,T0,Y3H,Y3L) |
| |
| VST Y3H, 32(P3ptr) |
| VST Y3L, 48(P3ptr) |
| RET |
| |
| #undef P3ptr |
| #undef P1ptr |
| #undef CPOOL |
| #undef X3L |
| #undef X3H |
| #undef Y3L |
| #undef Y3H |
| #undef T1L |
| #undef T1H |
| #undef T2L |
| #undef T2H |
| #undef T3L |
| #undef T3H |
| #undef X1L |
| #undef X1H |
| #undef Y1L |
| #undef Y1H |
| #undef Z1L |
| #undef Z1H |
| #undef TT0 |
| #undef TT1 |
| #undef T2 |
| #undef X0 |
| #undef X1 |
| #undef Y0 |
| #undef Y1 |
| #undef T0 |
| #undef T1 |
| #undef PL |
| #undef PH |
| #undef Z3L |
| #undef Z3H |
| #undef ZER |
| #undef SEL1 |
| #undef CAR1 |
| #undef CAR2 |
| |
| // p256PointAddAsm(P3, P1, P2 *p256Point) |
| #define P3ptr R1 |
| #define P1ptr R2 |
| #define P2ptr R3 |
| #define CPOOL R4 |
| #define ISZERO R5 |
| #define TRUE R6 |
| |
| // Temporaries in REGs |
| #define T1L V16 |
| #define T1H V17 |
| #define T2L V18 |
| #define T2H V19 |
| #define U1L V20 |
| #define U1H V21 |
| #define S1L V22 |
| #define S1H V23 |
| #define HL V24 |
| #define HH V25 |
| #define RL V26 |
| #define RH V27 |
| |
| // Temps for Sub and Add |
| #define ZER V6 |
| #define SEL1 V7 |
| #define CAR1 V8 |
| #define CAR2 V9 |
| #define TT0 V11 |
| #define TT1 V12 |
| #define T2 V13 |
| |
| // p256MulAsm Parameters |
| #define X0 V0 |
| #define X1 V1 |
| #define Y0 V2 |
| #define Y1 V3 |
| #define T0 V4 |
| #define T1 V5 |
| |
| #define PL V30 |
| #define PH V31 |
| /* |
| * https://delta.cs.cinvestav.mx/~francisco/arith/julio.pdf "Software Implementation of the NIST Elliptic Curves Over Prime Fields" |
| * |
| * A = X₁×Z₂² |
| * B = Y₁×Z₂³ |
| * C = X₂×Z₁²-A |
| * D = Y₂×Z₁³-B |
| * X₃ = D² - 2A×C² - C³ |
| * Y₃ = D×(A×C² - X₃) - B×C³ |
| * Z₃ = Z₁×Z₂×C |
| * |
| * Three-operand formula (adopted): https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-1998-cmo-2 |
| * Temp storage: T1,T2,U1,H,Z3=X3=Y3,S1,R |
| * |
| * T1 = Z1*Z1 |
| * T2 = Z2*Z2 |
| * U1 = X1*T2 |
| * H = X2*T1 |
| * H = H-U1 |
| * Z3 = Z1*Z2 |
| * Z3 = Z3*H << store-out Z3 result reg.. could override Z1, if slices have same backing array |
| * |
| * S1 = Z2*T2 |
| * S1 = Y1*S1 |
| * R = Z1*T1 |
| * R = Y2*R |
| * R = R-S1 |
| * |
| * T1 = H*H |
| * T2 = H*T1 |
| * U1 = U1*T1 |
| * |
| * X3 = R*R |
| * X3 = X3-T2 |
| * T1 = 2*U1 |
| * X3 = X3-T1 << store-out X3 result reg |
| * |
| * T2 = S1*T2 |
| * Y3 = U1-X3 |
| * Y3 = R*Y3 |
| * Y3 = Y3-T2 << store-out Y3 result reg |
| |
| // X=Z1; Y=Z1; MUL; T- // T1 = Z1*Z1 |
| // X- ; Y=T ; MUL; R=T // R = Z1*T1 |
| // X=X2; Y- ; MUL; H=T // H = X2*T1 |
| // X=Z2; Y=Z2; MUL; T- // T2 = Z2*Z2 |
| // X- ; Y=T ; MUL; S1=T // S1 = Z2*T2 |
| // X=X1; Y- ; MUL; U1=T // U1 = X1*T2 |
| // SUB(H<H-T) // H = H-U1 |
| // X=Z1; Y=Z2; MUL; T- // Z3 = Z1*Z2 |
| // X=T ; Y=H ; MUL; Z3:=T// Z3 = Z3*H << store-out Z3 result reg.. could override Z1, if slices have same backing array |
| // X=Y1; Y=S1; MUL; S1=T // S1 = Y1*S1 |
| // X=Y2; Y=R ; MUL; T- // R = Y2*R |
| // SUB(R<T-S1) // R = R-S1 |
| // X=H ; Y=H ; MUL; T- // T1 = H*H |
| // X- ; Y=T ; MUL; T2=T // T2 = H*T1 |
| // X=U1; Y- ; MUL; U1=T // U1 = U1*T1 |
| // X=R ; Y=R ; MUL; T- // X3 = R*R |
| // SUB(T<T-T2) // X3 = X3-T2 |
| // ADD(X<U1+U1) // T1 = 2*U1 |
| // SUB(T<T-X) X3:=T // X3 = X3-T1 << store-out X3 result reg |
| // SUB(Y<U1-T) // Y3 = U1-X3 |
| // X=R ; Y- ; MUL; U1=T // Y3 = R*Y3 |
| // X=S1; Y=T2; MUL; T- // T2 = S1*T2 |
| // SUB(T<U1-T); Y3:=T // Y3 = Y3-T2 << store-out Y3 result reg |
| */ |
| TEXT ·p256PointAddAsm(SB), NOSPLIT, $0 |
| MOVD P3+0(FP), P3ptr |
| MOVD P1+8(FP), P1ptr |
| MOVD P2+16(FP), P2ptr |
| |
| MOVD $p256mul<>+0x00(SB), CPOOL |
| VL 16(CPOOL), PL |
| VL 0(CPOOL), PH |
| |
| // X=Z1; Y=Z1; MUL; T- // T1 = Z1*Z1 (both operands are P1's Z; the square is left in T1:T0) |
| VL 64(P1ptr), X1 // Z1H: high half of Z1, at byte offset 64 of P1 |
| VL 80(P1ptr), X0 // Z1L: low half of Z1, at byte offset 80 of P1 |
| VLR X0, Y0 |
| VLR X1, Y1 |
| CALL p256SqrInternal<>(SB) |
| |
| // X- ; Y=T ; MUL; R=T // R = Z1*T1 (X is not reloaded: this relies on X1:X0 still holding Z1 after the call above) |
| VLR T0, Y0 |
| VLR T1, Y1 |
| CALL p256MulInternal<>(SB) |
| VLR T0, RL |
| VLR T1, RH |
| |
| // X=X2; Y- ; MUL; H=T // H = X2*T1 (Y is not reloaded: it is expected to still hold T1 = Z1*Z1) |
| VL 0(P2ptr), X1 // X2H: high half of X2, at byte offset 0 of P2 |
| VL 16(P2ptr), X0 // X2L: low half of X2, at byte offset 16 of P2 |
| CALL p256MulInternal<>(SB) |
| VLR T0, HL |
| VLR T1, HH |
| |
| // X=Z2; Y=Z2; MUL; T- // T2 = Z2*Z2 (same squaring step as for Z1, now on P2's Z) |
| VL 64(P2ptr), X1 // Z2H: high half of Z2, at byte offset 64 of P2 |
| VL 80(P2ptr), X0 // Z2L: low half of Z2, at byte offset 80 of P2 |
| VLR X0, Y0 |
| VLR X1, Y1 |
| CALL p256SqrInternal<>(SB) |
| |
| // X- ; Y=T ; MUL; S1=T // S1 = Z2*T2 (X is not reloaded: it is expected to still hold Z2) |
| VLR T0, Y0 |
| VLR T1, Y1 |
| CALL p256MulInternal<>(SB) |
| VLR T0, S1L |
| VLR T1, S1H |
| |
| // X=X1; Y- ; MUL; U1=T // U1 = X1*T2 (Y is expected to still hold T2 = Z2*Z2); the product is copied to U1 and also consumed from T1:T0 by the subtraction below |
| VL 0(P1ptr), X1 // X1H: high half of X1, at byte offset 0 of P1 |
| VL 16(P1ptr), X0 // X1L: low half of X1, at byte offset 16 of P1 |
| CALL p256MulInternal<>(SB) |
| VLR T0, U1L |
| VLR T1, U1H |
| |
| // SUB(H<H-T) // H = H-U1 (the subtrahend is read from T1:T0, which still holds the U1 product) |
| p256SubInternal(HH,HL,HH,HL,T1,T0) |
| |
| // if H == 0 or H^P == 0 then ret=1 else ret=0 (H is tested against zero and against the prime in PH:PL, which was loaded from p256mul<> at entry) |
| // clobbers T1H and T1L (scratch for the OR/XOR results; MOVDEQ copies TRUE into ISZERO when the preceding VCEQGS reports equality; the flag is stored to ret+24(FP) and revisited by the R test below) |
| MOVD $0, ISZERO |
| MOVD $1, TRUE |
| VZERO ZER |
| VO HL, HH, T1H |
| VCEQGS ZER, T1H, T1H |
| MOVDEQ TRUE, ISZERO |
| VX HL, PL, T1L |
| VX HH, PH, T1H |
| VO T1L, T1H, T1H |
| VCEQGS ZER, T1H, T1H |
| MOVDEQ TRUE, ISZERO |
| MOVD ISZERO, ret+24(FP) |
| |
| // X=Z1; Y=Z2; MUL; T- // Z3 = Z1*Z2 (Z1 and Z2 are both reloaded from memory here) |
| VL 64(P1ptr), X1 // Z1H: high half of Z1, at byte offset 64 of P1 |
| VL 80(P1ptr), X0 // Z1L: low half of Z1, at byte offset 80 of P1 |
| VL 64(P2ptr), Y1 // Z2H: high half of Z2, at byte offset 64 of P2 |
| VL 80(P2ptr), Y0 // Z2L: low half of Z2, at byte offset 80 of P2 |
| CALL p256MulInternal<>(SB) |
| |
| // X=T ; Y=H ; MUL; Z3:=T // Z3 = Z3*H; the result is stored to P3 at byte offsets 64 (high half) and 80 (low half) |
| VLR T0, X0 |
| VLR T1, X1 |
| VLR HL, Y0 |
| VLR HH, Y1 |
| CALL p256MulInternal<>(SB) |
| VST T1, 64(P3ptr) |
| VST T0, 80(P3ptr) |
| |
| // X=Y1; Y=S1; MUL; S1=T // S1 = Y1*S1 (Y1 is read from P1 at byte offsets 32 high / 48 low) |
| VL 32(P1ptr), X1 |
| VL 48(P1ptr), X0 |
| VLR S1L, Y0 |
| VLR S1H, Y1 |
| CALL p256MulInternal<>(SB) |
| VLR T0, S1L |
| VLR T1, S1H |
| |
| // X=Y2; Y=R ; MUL; T- // R = Y2*R (Y2 is read from P2 at byte offsets 32 high / 48 low; the product stays in T1:T0) |
| VL 32(P2ptr), X1 |
| VL 48(P2ptr), X0 |
| VLR RL, Y0 |
| VLR RH, Y1 |
| CALL p256MulInternal<>(SB) |
| |
| // SUB(R<T-S1) // R = T-S1 (the Y2*R product in T1:T0 minus S1, written into RH:RL) |
| p256SubInternal(RH,RL,T1,T0,S1H,S1L) |
| |
| // if R == 0 or R^P == 0 then ret=ret else ret=0 (same zero/prime test as for H, ANDed with the flag already stored in ret+24(FP)) |
| // clobbers T1H and T1L (the final flag is 1 only when both the H test and the R test succeeded; NOTE(review): presumably this tells the caller both inputs were the same point - confirm against the caller) |
| MOVD $0, ISZERO |
| MOVD $1, TRUE |
| VZERO ZER |
| VO RL, RH, T1H |
| VCEQGS ZER, T1H, T1H |
| MOVDEQ TRUE, ISZERO |
| VX RL, PL, T1L |
| VX RH, PH, T1H |
| VO T1L, T1H, T1H |
| VCEQGS ZER, T1H, T1H |
| MOVDEQ TRUE, ISZERO |
| AND ret+24(FP), ISZERO |
| MOVD ISZERO, ret+24(FP) |
| |
| // X=H ; Y=H ; MUL; T- // T1 = H*H (the square is left in T1:T0) |
| VLR HL, X0 |
| VLR HH, X1 |
| VLR HL, Y0 |
| VLR HH, Y1 |
| CALL p256SqrInternal<>(SB) |
| |
| // X- ; Y=T ; MUL; T2=T // T2 = H*T1, i.e. H*H*H (X is not reloaded: it is expected to still hold H) |
| VLR T0, Y0 |
| VLR T1, Y1 |
| CALL p256MulInternal<>(SB) |
| VLR T0, T2L |
| VLR T1, T2H |
| |
| // X=U1; Y- ; MUL; U1=T // U1 = U1*T1 (Y is not reloaded: it is expected to still hold T1 = H*H) |
| VLR U1L, X0 |
| VLR U1H, X1 |
| CALL p256MulInternal<>(SB) |
| VLR T0, U1L |
| VLR T1, U1H |
| |
| // X=R ; Y=R ; MUL; T- // X3 = R*R (the square is left in T1:T0) |
| VLR RL, X0 |
| VLR RH, X1 |
| VLR RL, Y0 |
| VLR RH, Y1 |
| CALL p256SqrInternal<>(SB) |
| |
| // SUB(T<T-T2) // X3 = X3-T2 (done in place in T1:T0) |
| p256SubInternal(T1,T0,T1,T0,T2H,T2L) |
| |
| // ADD(X<U1+U1) // T1 = 2*U1 (the doubled value is placed in the X1:X0 registers) |
| p256AddInternal(X1,X0,U1H,U1L,U1H,U1L) |
| |
| // SUB(T<T-X) X3:=T // X3 = X3-T1 << store-out X3 result reg (written to P3 at byte offsets 0 high / 16 low) |
| p256SubInternal(T1,T0,T1,T0,X1,X0) |
| VST T1, 0(P3ptr) |
| VST T0, 16(P3ptr) |
| |
| // SUB(Y<U1-T) // Y3 = U1-X3 (T1:T0 still holds X3; the difference goes into Y1:Y0 as the next multiplicand) |
| p256SubInternal(Y1,Y0,U1H,U1L,T1,T0) |
| |
| // X=R ; Y- ; MUL; U1=T // Y3 = R*Y3 (Y comes from the subtraction above; the product is parked in U1H:U1L) |
| VLR RL, X0 |
| VLR RH, X1 |
| CALL p256MulInternal<>(SB) |
| VLR T0, U1L |
| VLR T1, U1H |
| |
| // X=S1; Y=T2; MUL; T- // T2 = S1*T2 (the product is left in T1:T0) |
| VLR S1L, X0 |
| VLR S1H, X1 |
| VLR T2L, Y0 |
| VLR T2H, Y1 |
| CALL p256MulInternal<>(SB) |
| |
| // SUB(T<U1-T); Y3:=T // Y3 = Y3-T2 << store-out Y3 result reg (written to P3 at byte offsets 32 high / 48 low) |
| p256SubInternal(T1,T0,U1H,U1L,T1,T0) |
| VST T1, 32(P3ptr) |
| VST T0, 48(P3ptr) |
| |
| RET |