| // Copyright 2016 The Go Authors. All rights reserved. | 
 | // Use of this source code is governed by a BSD-style | 
 | // license that can be found in the LICENSE file. | 
 |  | 
 | #include "textflag.h" | 
 | #include "go_asm.h" | 
 |  | 
 |  | 
// p256ordK0: Montgomery constant k0 = -ord(G)^-1 mod 2^32 for the P-256
// group order (consumed by p256OrdMul below) — TODO confirm derivation.
DATA p256ordK0<>+0x00(SB)/4, $0xee00bc4f
// p256ord: the P-256 group order n, four 64-bit limbs, most significant first.
DATA p256ord<>+0x00(SB)/8, $0xffffffff00000000
DATA p256ord<>+0x08(SB)/8, $0xffffffffffffffff
DATA p256ord<>+0x10(SB)/8, $0xbce6faada7179e84
DATA p256ord<>+0x18(SB)/8, $0xf3b9cac2fc632551
// p256: the P-256 field prime (first 32 bytes, most significant limb first),
// followed by VPERM byte-selector masks used by the Montgomery reduction
// in p256FromMont (the "SEL" comments name which words each mask extracts).
DATA p256<>+0x00(SB)/8, $0xffffffff00000001 // P256
DATA p256<>+0x08(SB)/8, $0x0000000000000000 // P256
DATA p256<>+0x10(SB)/8, $0x00000000ffffffff // P256
DATA p256<>+0x18(SB)/8, $0xffffffffffffffff // P256
DATA p256<>+0x20(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
DATA p256<>+0x28(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
DATA p256<>+0x30(SB)/8, $0x0000000010111213 // SEL 0  d1 d0  0
DATA p256<>+0x38(SB)/8, $0x1415161700000000 // SEL 0  d1 d0  0
DATA p256<>+0x40(SB)/8, $0x18191a1b1c1d1e1f // SEL d1 d0 d1 d0
DATA p256<>+0x48(SB)/8, $0x18191a1b1c1d1e1f // SEL d1 d0 d1 d0
// p256mul: the P-256 prime again, the VPERM selectors for the multiplier's
// reduction, and finally 2^256 mod P256 (the Montgomery representation of 1).
DATA p256mul<>+0x00(SB)/8, $0xffffffff00000001 // P256
DATA p256mul<>+0x08(SB)/8, $0x0000000000000000 // P256
DATA p256mul<>+0x10(SB)/8, $0x00000000ffffffff // P256
DATA p256mul<>+0x18(SB)/8, $0xffffffffffffffff // P256
DATA p256mul<>+0x20(SB)/8, $0x1c1d1e1f00000000 // SEL d0  0  0 d0
DATA p256mul<>+0x28(SB)/8, $0x000000001c1d1e1f // SEL d0  0  0 d0
DATA p256mul<>+0x30(SB)/8, $0x0001020304050607 // SEL d0  0 d1 d0
DATA p256mul<>+0x38(SB)/8, $0x1c1d1e1f0c0d0e0f // SEL d0  0 d1 d0
DATA p256mul<>+0x40(SB)/8, $0x040506071c1d1e1f // SEL  0 d1 d0 d1
DATA p256mul<>+0x48(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL  0 d1 d0 d1
DATA p256mul<>+0x50(SB)/8, $0x0405060704050607 // SEL  0  0 d1 d0
DATA p256mul<>+0x58(SB)/8, $0x1c1d1e1f0c0d0e0f // SEL  0  0 d1 d0
DATA p256mul<>+0x60(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
DATA p256mul<>+0x68(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
DATA p256mul<>+0x70(SB)/8, $0x141516170c0d0e0f // SEL 0  d1 d0  0
DATA p256mul<>+0x78(SB)/8, $0x1c1d1e1f14151617 // SEL 0  d1 d0  0
DATA p256mul<>+0x80(SB)/8, $0x00000000fffffffe // (1*2^256)%P256
DATA p256mul<>+0x88(SB)/8, $0xffffffffffffffff // (1*2^256)%P256
DATA p256mul<>+0x90(SB)/8, $0xffffffff00000000 // (1*2^256)%P256
DATA p256mul<>+0x98(SB)/8, $0x0000000000000001 // (1*2^256)%P256
GLOBL p256ordK0<>(SB), 8, $4   // 8 = RODATA (textflag.h)
GLOBL p256ord<>(SB), 8, $32    // 8 = RODATA
GLOBL p256<>(SB), 8, $80       // 8 = RODATA
GLOBL p256mul<>(SB), 8, $160   // 8 = RODATA

// p256vmsl: VPERM byte-permutation masks, presumably for a VMSL-based
// multiplier code path — not referenced in the visible portion of this file;
// verify against the rest of the file before changing.
DATA p256vmsl<>+0x0(SB)/8, $0x0012131415161718
DATA p256vmsl<>+0x8(SB)/8, $0x00191a1b1c1d1e1f
DATA p256vmsl<>+0x10(SB)/8, $0x0012131415161718
DATA p256vmsl<>+0x18(SB)/8, $0x000b0c0d0e0f1011
DATA p256vmsl<>+0x20(SB)/8, $0x00191a1b1c1d1e1f
DATA p256vmsl<>+0x28(SB)/8, $0x0012131415161718
DATA p256vmsl<>+0x30(SB)/8, $0x000b0c0d0e0f1011
DATA p256vmsl<>+0x38(SB)/8, $0x0012131415161718
DATA p256vmsl<>+0x40(SB)/8, $0x000405060708090a
DATA p256vmsl<>+0x48(SB)/8, $0x000b0c0d0e0f1011
DATA p256vmsl<>+0x50(SB)/8, $0x000b0c0d0e0f1011
DATA p256vmsl<>+0x58(SB)/8, $0x000405060708090a
DATA p256vmsl<>+0x60(SB)/8, $0x1010101000010203
DATA p256vmsl<>+0x68(SB)/8, $0x100405060708090a
DATA p256vmsl<>+0x70(SB)/8, $0x100405060708090a
DATA p256vmsl<>+0x78(SB)/8, $0x1010101000010203
GLOBL p256vmsl<>(SB), 8, $128  // 8 = RODATA
 |  | 
// ---------------------------------------
// iff cond == 1  val <- -val
// func p256NegCond(val *p256Point, cond int)
//
// Constant-time conditional negation of the point's y coordinate:
// negating y (mod P) negates the point on the curve. There is no branch
// on cond; a full-width vector mask selects the result instead.
#define P1ptr   R1
#define CPOOL   R4

#define Y1L   V0
#define Y1H   V1
#define T1L   V2
#define T1H   V3

#define PL    V30
#define PH    V31

#define ZER   V4
#define SEL1  V5
#define CAR1  V6
TEXT ·p256NegCond(SB), NOSPLIT, $0
	MOVD val+0(FP), P1ptr

	// Load the P-256 prime from the constant pool: PH = high 128 bits,
	// PL = low 128 bits (pool layout documented at p256mul above).
	MOVD $p256mul<>+0x00(SB), CPOOL
	VL   16(CPOOL), PL
	VL   0(CPOOL), PH

	// y occupies bytes 32..63 of the p256Point; high half first.
	VL 32(P1ptr), Y1H
	VL 48(P1ptr), Y1L

	// Build the selection mask: SEL1 = all-ones iff cond == 0.
	VLREPG cond+8(FP), SEL1
	VZERO  ZER
	VCEQG  SEL1, ZER, SEL1

	// 256-bit T1 = P - Y1, computed with a borrow-chained pair of
	// 128-bit vector subtracts (VSCBIQ produces the borrow for the
	// high half).
	VSCBIQ Y1L, PL, CAR1
	VSQ    Y1L, PL, T1L
	VSBIQ  PH, Y1H, CAR1, T1H

	// Branch-free select: keep Y1 when cond == 0, else take P - Y1.
	VSEL Y1L, T1L, SEL1, Y1L
	VSEL Y1H, T1H, SEL1, Y1H

	VST Y1H, 32(P1ptr)
	VST Y1L, 48(P1ptr)
	RET

#undef P1ptr
#undef CPOOL
#undef Y1L
#undef Y1H
#undef T1L
#undef T1H
#undef PL
#undef PH
#undef ZER
#undef SEL1
#undef CAR1
 |  | 
// ---------------------------------------
// if cond == 0 res <- b; else res <- a
// func p256MovCond(res, a, b *p256Point, cond int)
//
// Constant-time conditional copy of a whole 96-byte Jacobian point
// (X, Y, Z, each two 16-byte vector registers). Both inputs are always
// loaded and merged with VSEL, so memory access and timing do not
// depend on cond.
#define P3ptr   R1
#define P1ptr   R2
#define P2ptr   R3

#define X1L    V0
#define X1H    V1
#define Y1L    V2
#define Y1H    V3
#define Z1L    V4
#define Z1H    V5
#define X2L    V6
#define X2H    V7
#define Y2L    V8
#define Y2H    V9
#define Z2L    V10
#define Z2H    V11

#define ZER   V18
#define SEL1  V19
TEXT ·p256MovCond(SB), NOSPLIT, $0
	MOVD   res+0(FP), P3ptr
	MOVD   a+8(FP), P1ptr
	MOVD   b+16(FP), P2ptr
	// SEL1 = all-ones iff cond == 0 (selects the b operands below).
	VLREPG cond+24(FP), SEL1
	VZERO  ZER
	VCEQG  SEL1, ZER, SEL1

	// Point a: X at 0, Y at 32, Z at 64; high half of each coordinate first.
	VL 0(P1ptr), X1H
	VL 16(P1ptr), X1L
	VL 32(P1ptr), Y1H
	VL 48(P1ptr), Y1L
	VL 64(P1ptr), Z1H
	VL 80(P1ptr), Z1L

	// Point b, same layout.
	VL 0(P2ptr), X2H
	VL 16(P2ptr), X2L
	VL 32(P2ptr), Y2H
	VL 48(P2ptr), Y2L
	VL 64(P2ptr), Z2H
	VL 80(P2ptr), Z2L

	// Merge: result = (cond == 0) ? b : a, vector by vector.
	VSEL X2L, X1L, SEL1, X1L
	VSEL X2H, X1H, SEL1, X1H
	VSEL Y2L, Y1L, SEL1, Y1L
	VSEL Y2H, Y1H, SEL1, Y1H
	VSEL Z2L, Z1L, SEL1, Z1L
	VSEL Z2H, Z1H, SEL1, Z1H

	VST X1H, 0(P3ptr)
	VST X1L, 16(P3ptr)
	VST Y1H, 32(P3ptr)
	VST Y1L, 48(P3ptr)
	VST Z1H, 64(P3ptr)
	VST Z1L, 80(P3ptr)

	RET

#undef P3ptr
#undef P1ptr
#undef P2ptr
#undef X1L
#undef X1H
#undef Y1L
#undef Y1H
#undef Z1L
#undef Z1H
#undef X2L
#undef X2H
#undef Y2L
#undef Y2H
#undef Z2L
#undef Z2H
#undef ZER
#undef SEL1
 |  | 
// ---------------------------------------
// Constant time table access
// Indexed from 1 to 15, with -1 offset
// (index 0 is implicitly point at infinity)
// func p256Select(point *p256Point, table []p256Point, idx int)
//
// Reads EVERY table entry and keeps only the one whose position equals
// idx, so the access pattern is independent of idx. If idx matches no
// position (e.g. idx == 0) the output stays all-zero (point at infinity).
// NOTE(review): the loop below scans positions 1..16 (COUNT < 17), i.e.
// 16 entries of 96 bytes — the "1 to 15" in the header looks stale; confirm.
#define P3ptr   R1
#define P1ptr   R2
#define COUNT   R4

#define X1L    V0
#define X1H    V1
#define Y1L    V2
#define Y1H    V3
#define Z1L    V4
#define Z1H    V5
#define X2L    V6
#define X2H    V7
#define Y2L    V8
#define Y2H    V9
#define Z2L    V10
#define Z2H    V11

#define ONE   V18
#define IDX   V19
#define SEL1  V20
#define SEL2  V21
TEXT ·p256Select(SB), NOSPLIT, $0
	MOVD   point+0(FP), P3ptr
	MOVD   table+8(FP), P1ptr
	// Broadcast the low byte of idx into every lane (idx lives at
	// offset 32 = 8-byte pointer + 24-byte slice header; +7 picks the
	// least significant byte on big-endian s390x).
	VLREPB idx+(32+7)(FP), IDX
	VREPIB $1, ONE
	VREPIB $1, SEL2 // SEL2 counts the current table position, starting at 1
	MOVD   $1, COUNT

	// Accumulator starts at zero = point at infinity.
	VZERO X1H
	VZERO X1L
	VZERO Y1H
	VZERO Y1L
	VZERO Z1H
	VZERO Z1L

loop_select:
	VL 0(P1ptr), X2H
	VL 16(P1ptr), X2L
	VL 32(P1ptr), Y2H
	VL 48(P1ptr), Y2L
	VL 64(P1ptr), Z2H
	VL 80(P1ptr), Z2L

	// SEL1 = all-ones iff this entry's position equals idx.
	VCEQG SEL2, IDX, SEL1

	// Constant-time keep/replace of the accumulator.
	VSEL X2L, X1L, SEL1, X1L
	VSEL X2H, X1H, SEL1, X1H
	VSEL Y2L, Y1L, SEL1, Y1L
	VSEL Y2H, Y1H, SEL1, Y1H
	VSEL Z2L, Z1L, SEL1, Z1L
	VSEL Z2H, Z1H, SEL1, Z1H

	// Advance position counter, loop counter, and table pointer (96 B/entry).
	VAB  SEL2, ONE, SEL2
	ADDW $1, COUNT
	ADD  $96, P1ptr
	CMPW COUNT, $17
	BLT  loop_select

	VST X1H, 0(P3ptr)
	VST X1L, 16(P3ptr)
	VST Y1H, 32(P3ptr)
	VST Y1L, 48(P3ptr)
	VST Z1H, 64(P3ptr)
	VST Z1L, 80(P3ptr)
	RET

#undef P3ptr
#undef P1ptr
#undef COUNT
#undef X1L
#undef X1H
#undef Y1L
#undef Y1H
#undef Z1L
#undef Z1H
#undef X2L
#undef X2H
#undef Y2L
#undef Y2H
#undef Z2L
#undef Z2H
#undef ONE
#undef IDX
#undef SEL1
#undef SEL2
 |  | 
// ---------------------------------------
// Constant time table access
// Indexed from 1 to 15, with -1 offset
// (index 0 is implicitly point at infinity)
// func p256SelectBase(point *p256Point, table []p256Point, idx int)
//
// Same constant-time scan as p256Select, but over the (larger) base-point
// precomputation table.
// NOTE(review): this loop scans positions 1..64 (COUNT < 65), i.e. 64
// entries — the "1 to 15" header comment appears copied from p256Select
// and does not match; confirm intended table size.
#define P3ptr   R1
#define P1ptr   R2
#define COUNT   R4

#define X1L    V0
#define X1H    V1
#define Y1L    V2
#define Y1H    V3
#define Z1L    V4
#define Z1H    V5
#define X2L    V6
#define X2H    V7
#define Y2L    V8
#define Y2H    V9
#define Z2L    V10
#define Z2H    V11

#define ONE   V18
#define IDX   V19
#define SEL1  V20
#define SEL2  V21
TEXT ·p256SelectBase(SB), NOSPLIT, $0
	MOVD   point+0(FP), P3ptr
	MOVD   table+8(FP), P1ptr
	// Low byte of idx broadcast to all lanes (big-endian, hence +7).
	VLREPB idx+(32+7)(FP), IDX
	VREPIB $1, ONE
	VREPIB $1, SEL2 // current table position, starting at 1
	MOVD   $1, COUNT

	// Accumulator starts at zero = point at infinity.
	VZERO X1H
	VZERO X1L
	VZERO Y1H
	VZERO Y1L
	VZERO Z1H
	VZERO Z1L

loop_select:
	VL 0(P1ptr), X2H
	VL 16(P1ptr), X2L
	VL 32(P1ptr), Y2H
	VL 48(P1ptr), Y2L
	VL 64(P1ptr), Z2H
	VL 80(P1ptr), Z2L

	// SEL1 = all-ones iff this entry's position equals idx.
	VCEQG SEL2, IDX, SEL1

	VSEL X2L, X1L, SEL1, X1L
	VSEL X2H, X1H, SEL1, X1H
	VSEL Y2L, Y1L, SEL1, Y1L
	VSEL Y2H, Y1H, SEL1, Y1H
	VSEL Z2L, Z1L, SEL1, Z1L
	VSEL Z2H, Z1H, SEL1, Z1H

	// Next entry (96 bytes per p256Point).
	VAB  SEL2, ONE, SEL2
	ADDW $1, COUNT
	ADD  $96, P1ptr
	CMPW COUNT, $65
	BLT  loop_select

	VST X1H, 0(P3ptr)
	VST X1L, 16(P3ptr)
	VST Y1H, 32(P3ptr)
	VST Y1L, 48(P3ptr)
	VST Z1H, 64(P3ptr)
	VST Z1L, 80(P3ptr)
	RET

#undef P3ptr
#undef P1ptr
#undef COUNT
#undef X1L
#undef X1H
#undef Y1L
#undef Y1H
#undef Z1L
#undef Z1H
#undef X2L
#undef X2H
#undef Y2L
#undef Y2H
#undef Z2L
#undef Z2H
#undef ONE
#undef IDX
#undef SEL1
#undef SEL2
 |  | 
// ---------------------------------------
// func p256FromMont(res, in []byte)
//
// Converts a 256-bit value out of Montgomery form: res = in * 2^-256 mod P256.
// Four identical rounds each fold away the lowest 64-bit limb (a 2^-64 step),
// using VPERM-built reduction values that exploit the sparse structure of
// P256; a final conditional subtract brings the result into [0, P).
#define res_ptr R1
#define x_ptr   R2
#define CPOOL   R4

#define T0   V0
#define T1   V1
#define T2   V2
#define TT0  V3
#define TT1  V4

#define ZER   V6
#define SEL1  V7
#define SEL2  V8
#define CAR1  V9
#define CAR2  V10
#define RED1  V11
#define RED2  V12
#define PL    V13
#define PH    V14

TEXT ·p256FromMont(SB), NOSPLIT, $0
	// res and in are []byte; slice headers are 24 bytes, so in's data
	// pointer is at offset 24.
	MOVD res+0(FP), res_ptr
	MOVD in+24(FP), x_ptr

	VZERO T2  // T2 = carry limb above the 256-bit accumulator T1||T0
	VZERO ZER
	// PH||PL = P256; SEL1/SEL2 are the VPERM word selectors from the pool.
	MOVD  $p256<>+0x00(SB), CPOOL
	VL    16(CPOOL), PL
	VL    0(CPOOL), PH
	VL    48(CPOOL), SEL2
	VL    64(CPOOL), SEL1

	// Load the input; low 128 bits into T0, high into T1.
	VL (1*16)(x_ptr), T0
	VL (0*16)(x_ptr), T1

	// First round
	// Build the multiple of P to add from the lowest limb (d0/d1 words),
	// then shift the accumulator down one 64-bit limb (VSLDB $8) and add
	// with a full carry chain into T2||T1||T0.
	VPERM T1, T0, SEL1, RED2    // d1 d0 d1 d0
	VPERM ZER, RED2, SEL2, RED1 // 0  d1 d0  0
	VSQ   RED1, RED2, RED2      // Guaranteed not to underflow

	VSLDB $8, T1, T0, T0
	VSLDB $8, T2, T1, T1

	VACCQ  T0, RED1, CAR1
	VAQ    T0, RED1, T0
	VACCCQ T1, RED2, CAR1, CAR2
	VACQ   T1, RED2, CAR1, T1
	VAQ    T2, CAR2, T2

	// Second round
	VPERM T1, T0, SEL1, RED2    // d1 d0 d1 d0
	VPERM ZER, RED2, SEL2, RED1 // 0  d1 d0  0
	VSQ   RED1, RED2, RED2      // Guaranteed not to underflow

	VSLDB $8, T1, T0, T0
	VSLDB $8, T2, T1, T1

	VACCQ  T0, RED1, CAR1
	VAQ    T0, RED1, T0
	VACCCQ T1, RED2, CAR1, CAR2
	VACQ   T1, RED2, CAR1, T1
	VAQ    T2, CAR2, T2

	// Third round
	VPERM T1, T0, SEL1, RED2    // d1 d0 d1 d0
	VPERM ZER, RED2, SEL2, RED1 // 0  d1 d0  0
	VSQ   RED1, RED2, RED2      // Guaranteed not to underflow

	VSLDB $8, T1, T0, T0
	VSLDB $8, T2, T1, T1

	VACCQ  T0, RED1, CAR1
	VAQ    T0, RED1, T0
	VACCCQ T1, RED2, CAR1, CAR2
	VACQ   T1, RED2, CAR1, T1
	VAQ    T2, CAR2, T2

	// Last round
	VPERM T1, T0, SEL1, RED2    // d1 d0 d1 d0
	VPERM ZER, RED2, SEL2, RED1 // 0  d1 d0  0
	VSQ   RED1, RED2, RED2      // Guaranteed not to underflow

	VSLDB $8, T1, T0, T0
	VSLDB $8, T2, T1, T1

	VACCQ  T0, RED1, CAR1
	VAQ    T0, RED1, T0
	VACCCQ T1, RED2, CAR1, CAR2
	VACQ   T1, RED2, CAR1, T1
	VAQ    T2, CAR2, T2

	// ---------------------------------------------------

	// Compute TT1||TT0 = (T1||T0) - P with full borrow chain; T2 ends up
	// all-ones iff the subtraction did NOT underflow (result >= P).
	VSCBIQ  PL, T0, CAR1
	VSQ     PL, T0, TT0
	VSBCBIQ T1, PH, CAR1, CAR2
	VSBIQ   T1, PH, CAR1, TT1
	VSBIQ   T2, ZER, CAR2, T2

	// what output to use, TT1||TT0 or T1||T0?
	// Constant-time choice keyed on the final borrow state in T2.
	VSEL T0, TT0, T2, T0
	VSEL T1, TT1, T2, T1

	VST T0, (1*16)(res_ptr)
	VST T1, (0*16)(res_ptr)
	RET

#undef res_ptr
#undef x_ptr
#undef CPOOL
#undef T0
#undef T1
#undef T2
#undef TT0
#undef TT1
#undef ZER
#undef SEL1
#undef SEL2
#undef CAR1
#undef CAR2
#undef RED1
#undef RED2
#undef PL
#undef PH
 |  | 
// ---------------------------------------
// func p256OrdMul(res, in1, in2 []byte)
//
// Montgomery multiplication modulo the P-256 group order:
// res = in1 * in2 * 2^-256 mod ord(G).
// Eight identical rounds, one per 32-bit digit of the multiplier Y,
// interleave the multiply-accumulate with Montgomery reduction (MK0 is the
// per-round reduction multiplier derived from k0). A final conditional
// subtract of the order produces a fully reduced result. Everything is
// branch-free and therefore constant time.
#define res_ptr R1
#define x_ptr R2
#define y_ptr R3
#define X0    V0
#define X1    V1
#define Y0    V2
#define Y1    V3
#define M0    V4
#define M1    V5
#define T0    V6
#define T1    V7
#define T2    V8
#define YDIG  V9

#define ADD1  V16
#define ADD1H V17
#define ADD2  V18
#define ADD2H V19
#define RED1  V20
#define RED1H V21
#define RED2  V22
#define RED2H V23
#define CAR1  V24
#define CAR1M V25

#define MK0   V30
#define K0    V31
TEXT ·p256OrdMul(SB), NOSPLIT, $0
	// res, in1, in2 are []byte; each slice header is 24 bytes.
	MOVD res+0(FP), res_ptr
	MOVD in1+24(FP), x_ptr
	MOVD in2+48(FP), y_ptr

	VZERO T2
	MOVD  $p256ordK0<>+0x00(SB), R4

	// VLEF    $3, 0(R4), K0
	// (hand-encoded because the assembler lacks/lacked VLEF; loads the
	// 32-bit k0 constant into element 3 of K0)
	WORD $0xE7F40000
	BYTE $0x38
	BYTE $0x03
	// M1||M0 = the group order n from the constant pool.
	MOVD $p256ord<>+0x00(SB), R4
	VL   16(R4), M0
	VL   0(R4), M1

	// X1||X0 = in1, Y1||Y0 = in2; low 128 bits in the *0 register.
	VL (1*16)(x_ptr), X0
	VL (0*16)(x_ptr), X1
	VL (1*16)(y_ptr), Y0
	VL (0*16)(y_ptr), Y1

	// ---------------------------------------------------------------------------/
	// Round 1: digit = word 3 of Y0 (lowest 32 bits on big-endian s390x).
	// First round has no prior accumulator to add (VMLF, not VMALF).
	VREPF $3, Y0, YDIG
	VMLF  X0, YDIG, ADD1
	VMLF  ADD1, K0, MK0 // MK0 = low(ADD1) * k0, the Montgomery quotient digit
	VREPF $3, MK0, MK0

	VMLF  X1, YDIG, ADD2
	VMLHF X0, YDIG, ADD1H
	VMLHF X1, YDIG, ADD2H

	// Add MK0 * order so the low 32 bits cancel.
	VMALF  M0, MK0, ADD1, RED1
	VMALHF M0, MK0, ADD1, RED1H
	VMALF  M1, MK0, ADD2, RED2
	VMALHF M1, MK0, ADD2, RED2H

	// Shift the partial sums right one 32-bit digit (VSLDB $12).
	VSLDB $12, RED2, RED1, RED1
	VSLDB $12, T2, RED2, RED2

	// Fold low/high partial products into T2||T1||T0 with carry chains.
	VACCQ RED1, ADD1H, CAR1
	VAQ   RED1, ADD1H, T0
	VACCQ RED1H, T0, CAR1M
	VAQ   RED1H, T0, T0

	// << ready for next MK0

	VACQ   RED2, ADD2H, CAR1, T1
	VACCCQ RED2, ADD2H, CAR1, CAR1
	VACCCQ RED2H, T1, CAR1M, T2
	VACQ   RED2H, T1, CAR1M, T1
	VAQ    CAR1, T2, T2

	// ---------------------------------------------------
/* *
 * ---+--------+--------+
 *  T2|   T1   |   T0   |
 * ---+--------+--------+
 *           *(add)*
 *    +--------+--------+
 *    |   X1   |   X0   |
 *    +--------+--------+
 *           *(mul)*
 *    +--------+--------+
 *    |  YDIG  |  YDIG  |
 *    +--------+--------+
 *           *(add)*
 *    +--------+--------+
 *    |   M1   |   M0   |
 *    +--------+--------+
 *           *(mul)*
 *    +--------+--------+
 *    |   MK0  |   MK0  |
 *    +--------+--------+
 *
 *   ---------------------
 *
 *    +--------+--------+
 *    |  ADD2  |  ADD1  |
 *    +--------+--------+
 *  +--------+--------+
 *  | ADD2H  | ADD1H  |
 *  +--------+--------+
 *    +--------+--------+
 *    |  RED2  |  RED1  |
 *    +--------+--------+
 *  +--------+--------+
 *  | RED2H  | RED1H  |
 *  +--------+--------+
 */
	// Round 2: digit = word 2 of Y0; from here on VMALF folds in T0/T1.
	VREPF $2, Y0, YDIG
	VMALF X0, YDIG, T0, ADD1
	VMLF  ADD1, K0, MK0
	VREPF $3, MK0, MK0

	VMALF  X1, YDIG, T1, ADD2
	VMALHF X0, YDIG, T0, ADD1H
	VMALHF X1, YDIG, T1, ADD2H

	VMALF  M0, MK0, ADD1, RED1
	VMALHF M0, MK0, ADD1, RED1H
	VMALF  M1, MK0, ADD2, RED2
	VMALHF M1, MK0, ADD2, RED2H

	VSLDB $12, RED2, RED1, RED1
	VSLDB $12, T2, RED2, RED2

	VACCQ RED1, ADD1H, CAR1
	VAQ   RED1, ADD1H, T0
	VACCQ RED1H, T0, CAR1M
	VAQ   RED1H, T0, T0

	// << ready for next MK0

	VACQ   RED2, ADD2H, CAR1, T1
	VACCCQ RED2, ADD2H, CAR1, CAR1
	VACCCQ RED2H, T1, CAR1M, T2
	VACQ   RED2H, T1, CAR1M, T1
	VAQ    CAR1, T2, T2

	// ---------------------------------------------------
	// Round 3: digit = word 1 of Y0.
	VREPF $1, Y0, YDIG
	VMALF X0, YDIG, T0, ADD1
	VMLF  ADD1, K0, MK0
	VREPF $3, MK0, MK0

	VMALF  X1, YDIG, T1, ADD2
	VMALHF X0, YDIG, T0, ADD1H
	VMALHF X1, YDIG, T1, ADD2H

	VMALF  M0, MK0, ADD1, RED1
	VMALHF M0, MK0, ADD1, RED1H
	VMALF  M1, MK0, ADD2, RED2
	VMALHF M1, MK0, ADD2, RED2H

	VSLDB $12, RED2, RED1, RED1
	VSLDB $12, T2, RED2, RED2

	VACCQ RED1, ADD1H, CAR1
	VAQ   RED1, ADD1H, T0
	VACCQ RED1H, T0, CAR1M
	VAQ   RED1H, T0, T0

	// << ready for next MK0

	VACQ   RED2, ADD2H, CAR1, T1
	VACCCQ RED2, ADD2H, CAR1, CAR1
	VACCCQ RED2H, T1, CAR1M, T2
	VACQ   RED2H, T1, CAR1M, T1
	VAQ    CAR1, T2, T2

	// ---------------------------------------------------
	// Round 4: digit = word 0 of Y0.
	VREPF $0, Y0, YDIG
	VMALF X0, YDIG, T0, ADD1
	VMLF  ADD1, K0, MK0
	VREPF $3, MK0, MK0

	VMALF  X1, YDIG, T1, ADD2
	VMALHF X0, YDIG, T0, ADD1H
	VMALHF X1, YDIG, T1, ADD2H

	VMALF  M0, MK0, ADD1, RED1
	VMALHF M0, MK0, ADD1, RED1H
	VMALF  M1, MK0, ADD2, RED2
	VMALHF M1, MK0, ADD2, RED2H

	VSLDB $12, RED2, RED1, RED1
	VSLDB $12, T2, RED2, RED2

	VACCQ RED1, ADD1H, CAR1
	VAQ   RED1, ADD1H, T0
	VACCQ RED1H, T0, CAR1M
	VAQ   RED1H, T0, T0

	// << ready for next MK0

	VACQ   RED2, ADD2H, CAR1, T1
	VACCCQ RED2, ADD2H, CAR1, CAR1
	VACCCQ RED2H, T1, CAR1M, T2
	VACQ   RED2H, T1, CAR1M, T1
	VAQ    CAR1, T2, T2

	// ---------------------------------------------------
	// Round 5: digit = word 3 of Y1 (moving into the high half of Y).
	VREPF $3, Y1, YDIG
	VMALF X0, YDIG, T0, ADD1
	VMLF  ADD1, K0, MK0
	VREPF $3, MK0, MK0

	VMALF  X1, YDIG, T1, ADD2
	VMALHF X0, YDIG, T0, ADD1H
	VMALHF X1, YDIG, T1, ADD2H

	VMALF  M0, MK0, ADD1, RED1
	VMALHF M0, MK0, ADD1, RED1H
	VMALF  M1, MK0, ADD2, RED2
	VMALHF M1, MK0, ADD2, RED2H

	VSLDB $12, RED2, RED1, RED1
	VSLDB $12, T2, RED2, RED2

	VACCQ RED1, ADD1H, CAR1
	VAQ   RED1, ADD1H, T0
	VACCQ RED1H, T0, CAR1M
	VAQ   RED1H, T0, T0

	// << ready for next MK0

	VACQ   RED2, ADD2H, CAR1, T1
	VACCCQ RED2, ADD2H, CAR1, CAR1
	VACCCQ RED2H, T1, CAR1M, T2
	VACQ   RED2H, T1, CAR1M, T1
	VAQ    CAR1, T2, T2

	// ---------------------------------------------------
	// Round 6: digit = word 2 of Y1.
	VREPF $2, Y1, YDIG
	VMALF X0, YDIG, T0, ADD1
	VMLF  ADD1, K0, MK0
	VREPF $3, MK0, MK0

	VMALF  X1, YDIG, T1, ADD2
	VMALHF X0, YDIG, T0, ADD1H
	VMALHF X1, YDIG, T1, ADD2H

	VMALF  M0, MK0, ADD1, RED1
	VMALHF M0, MK0, ADD1, RED1H
	VMALF  M1, MK0, ADD2, RED2
	VMALHF M1, MK0, ADD2, RED2H

	VSLDB $12, RED2, RED1, RED1
	VSLDB $12, T2, RED2, RED2

	VACCQ RED1, ADD1H, CAR1
	VAQ   RED1, ADD1H, T0
	VACCQ RED1H, T0, CAR1M
	VAQ   RED1H, T0, T0

	// << ready for next MK0

	VACQ   RED2, ADD2H, CAR1, T1
	VACCCQ RED2, ADD2H, CAR1, CAR1
	VACCCQ RED2H, T1, CAR1M, T2
	VACQ   RED2H, T1, CAR1M, T1
	VAQ    CAR1, T2, T2

	// ---------------------------------------------------
	// Round 7: digit = word 1 of Y1.
	VREPF $1, Y1, YDIG
	VMALF X0, YDIG, T0, ADD1
	VMLF  ADD1, K0, MK0
	VREPF $3, MK0, MK0

	VMALF  X1, YDIG, T1, ADD2
	VMALHF X0, YDIG, T0, ADD1H
	VMALHF X1, YDIG, T1, ADD2H

	VMALF  M0, MK0, ADD1, RED1
	VMALHF M0, MK0, ADD1, RED1H
	VMALF  M1, MK0, ADD2, RED2
	VMALHF M1, MK0, ADD2, RED2H

	VSLDB $12, RED2, RED1, RED1
	VSLDB $12, T2, RED2, RED2

	VACCQ RED1, ADD1H, CAR1
	VAQ   RED1, ADD1H, T0
	VACCQ RED1H, T0, CAR1M
	VAQ   RED1H, T0, T0

	// << ready for next MK0

	VACQ   RED2, ADD2H, CAR1, T1
	VACCCQ RED2, ADD2H, CAR1, CAR1
	VACCCQ RED2H, T1, CAR1M, T2
	VACQ   RED2H, T1, CAR1M, T1
	VAQ    CAR1, T2, T2

	// ---------------------------------------------------
	// Round 8: digit = word 0 of Y1 (most significant digit).
	VREPF $0, Y1, YDIG
	VMALF X0, YDIG, T0, ADD1
	VMLF  ADD1, K0, MK0
	VREPF $3, MK0, MK0

	VMALF  X1, YDIG, T1, ADD2
	VMALHF X0, YDIG, T0, ADD1H
	VMALHF X1, YDIG, T1, ADD2H

	VMALF  M0, MK0, ADD1, RED1
	VMALHF M0, MK0, ADD1, RED1H
	VMALF  M1, MK0, ADD2, RED2
	VMALHF M1, MK0, ADD2, RED2H

	VSLDB $12, RED2, RED1, RED1
	VSLDB $12, T2, RED2, RED2

	VACCQ RED1, ADD1H, CAR1
	VAQ   RED1, ADD1H, T0
	VACCQ RED1H, T0, CAR1M
	VAQ   RED1H, T0, T0

	// << ready for next MK0

	VACQ   RED2, ADD2H, CAR1, T1
	VACCCQ RED2, ADD2H, CAR1, CAR1
	VACCCQ RED2H, T1, CAR1M, T2
	VACQ   RED2H, T1, CAR1M, T1
	VAQ    CAR1, T2, T2

	// ---------------------------------------------------

	// Final reduction: compute (T1||T0) - order with a borrow chain
	// (ADD1/ADD2 reused as scratch), then select the reduced value
	// in constant time based on the final borrow in T2.
	VZERO   RED1
	VSCBIQ  M0, T0, CAR1
	VSQ     M0, T0, ADD1
	VSBCBIQ T1, M1, CAR1, CAR1M
	VSBIQ   T1, M1, CAR1, ADD2
	VSBIQ   T2, RED1, CAR1M, T2

	// what output to use, ADD2||ADD1 or T1||T0?
	VSEL T0, ADD1, T2, T0
	VSEL T1, ADD2, T2, T1

	VST T0, (1*16)(res_ptr)
	VST T1, (0*16)(res_ptr)
	RET

#undef res_ptr
#undef x_ptr
#undef y_ptr
#undef X0
#undef X1
#undef Y0
#undef Y1
#undef M0
#undef M1
#undef T0
#undef T1
#undef T2
#undef YDIG

#undef ADD1
#undef ADD1H
#undef ADD2
#undef ADD2H
#undef RED1
#undef RED1H
#undef RED2
#undef RED2H
#undef CAR1
#undef CAR1M

#undef MK0
#undef K0
 |  | 
 | // --------------------------------------- | 
 | // p256MulInternalVX | 
 | // V0-V3,V30,V31 - Not Modified | 
 | // V4-V15 - Volatile | 
 |  | 
 | #define CPOOL   R4 | 
 |  | 
 | // Parameters | 
 | #define X0    V0 // Not modified | 
 | #define X1    V1 // Not modified | 
 | #define Y0    V2 // Not modified | 
 | #define Y1    V3 // Not modified | 
 | #define T0    V4 | 
 | #define T1    V5 | 
 | #define P0    V30 // Not modified | 
 | #define P1    V31 // Not modified | 
 |  | 
 | // Temporaries | 
 | #define YDIG  V6 // Overloaded with CAR2, ZER | 
 | #define ADD1H V7 // Overloaded with ADD3H | 
 | #define ADD2H V8 // Overloaded with ADD4H | 
 | #define ADD3  V9 // Overloaded with SEL2,SEL5 | 
 | #define ADD4  V10 // Overloaded with SEL3,SEL6 | 
 | #define RED1  V11 // Overloaded with CAR2 | 
 | #define RED2  V12 | 
 | #define RED3  V13 // Overloaded with SEL1 | 
 | #define T2    V14 | 
 | // Overloaded temporaries | 
 | #define ADD1  V4 // Overloaded with T0 | 
 | #define ADD2  V5 // Overloaded with T1 | 
 | #define ADD3H V7 // Overloaded with ADD1H | 
 | #define ADD4H V8 // Overloaded with ADD2H | 
 | #define ZER   V6 // Overloaded with YDIG, CAR2 | 
 | #define CAR1  V6 // Overloaded with YDIG, ZER | 
 | #define CAR2  V11 // Overloaded with RED1 | 
 | // Constant Selects | 
 | #define SEL1  V13 // Overloaded with RED3 | 
 | #define SEL2  V9 // Overloaded with ADD3,SEL5 | 
 | #define SEL3  V10 // Overloaded with ADD4,SEL6 | 
 | #define SEL4  V6 // Overloaded with YDIG,CAR2,ZER | 
 | #define SEL5  V9 // Overloaded with ADD3,SEL2 | 
 | #define SEL6  V10 // Overloaded with ADD4,SEL3 | 
 |  | 
 | /* * | 
 |  * To follow the flow of bits, for your own sanity a stiff drink, need you shall. | 
 |  * Of a single round, a 'helpful' picture, here is. Meaning, column position has. | 
 |  * With you, SIMD be... | 
 |  * | 
 |  *                                           +--------+--------+ | 
 |  *                                  +--------|  RED2  |  RED1  | | 
 |  *                                  |        +--------+--------+ | 
 |  *                                  |       ---+--------+--------+ | 
 |  *                                  |  +---- T2|   T1   |   T0   |--+ | 
 |  *                                  |  |    ---+--------+--------+  | | 
 |  *                                  |  |                            | | 
 |  *                                  |  |    ======================= | | 
 |  *                                  |  |                            | | 
 |  *                                  |  |       +--------+--------+<-+ | 
 |  *                                  |  +-------|  ADD2  |  ADD1  |--|-----+ | 
 |  *                                  |  |       +--------+--------+  |     | | 
 |  *                                  |  |     +--------+--------+<---+     | | 
 |  *                                  |  |     | ADD2H  | ADD1H  |--+       | | 
 |  *                                  |  |     +--------+--------+  |       | | 
 |  *                                  |  |     +--------+--------+<-+       | | 
 |  *                                  |  |     |  ADD4  |  ADD3  |--|-+     | | 
 |  *                                  |  |     +--------+--------+  | |     | | 
 |  *                                  |  |   +--------+--------+<---+ |     | | 
 |  *                                  |  |   | ADD4H  | ADD3H  |------|-+   |(+vzero) | 
 |  *                                  |  |   +--------+--------+      | |   V | 
 |  *                                  |  | ------------------------   | | +--------+ | 
 |  *                                  |  |                            | | |  RED3  |  [d0 0 0 d0] | 
 |  *                                  |  |                            | | +--------+ | 
 |  *                                  |  +---->+--------+--------+    | |   | | 
 |  *   (T2[1w]||ADD2[4w]||ADD1[3w])   +--------|   T1   |   T0   |    | |   | | 
 |  *                                  |        +--------+--------+    | |   | | 
 |  *                                  +---->---+--------+--------+    | |   | | 
 |  *                                         T2|   T1   |   T0   |----+ |   | | 
 |  *                                        ---+--------+--------+    | |   | | 
 |  *                                        ---+--------+--------+<---+ |   | | 
 |  *                                    +--- T2|   T1   |   T0   |----------+ | 
 |  *                                    |   ---+--------+--------+      |   | | 
 |  *                                    |  +--------+--------+<-------------+ | 
 |  *                                    |  |  RED2  |  RED1  |-----+    |   | [0 d1 d0 d1] [d0 0 d1 d0] | 
 |  *                                    |  +--------+--------+     |    |   | | 
 |  *                                    |  +--------+<----------------------+ | 
 |  *                                    |  |  RED3  |--------------+    |     [0 0 d1 d0] | 
 |  *                                    |  +--------+              |    | | 
 |  *                                    +--->+--------+--------+   |    | | 
 |  *                                         |   T1   |   T0   |--------+ | 
 |  *                                         +--------+--------+   |    | | 
 |  *                                   --------------------------- |    | | 
 |  *                                                               |    | | 
 |  *                                       +--------+--------+<----+    | | 
 |  *                                       |  RED2  |  RED1  |          | | 
 |  *                                       +--------+--------+          | | 
 |  *                                      ---+--------+--------+<-------+ | 
 |  *                                       T2|   T1   |   T0   |            (H1P-H1P-H00RRAY!) | 
 |  *                                      ---+--------+--------+ | 
 |  * | 
 |  *                                                                *Mi obra de arte de siglo XXI @vpaprots | 
 |  * | 
 |  * | 
 |  * First group is special, doesn't get the two inputs: | 
 |  *                                             +--------+--------+<-+ | 
 |  *                                     +-------|  ADD2  |  ADD1  |--|-----+ | 
 |  *                                     |       +--------+--------+  |     | | 
 |  *                                     |     +--------+--------+<---+     | | 
 |  *                                     |     | ADD2H  | ADD1H  |--+       | | 
 |  *                                     |     +--------+--------+  |       | | 
 |  *                                     |     +--------+--------+<-+       | | 
 |  *                                     |     |  ADD4  |  ADD3  |--|-+     | | 
 |  *                                     |     +--------+--------+  | |     | | 
 |  *                                     |   +--------+--------+<---+ |     | | 
 |  *                                     |   | ADD4H  | ADD3H  |------|-+   |(+vzero) | 
 |  *                                     |   +--------+--------+      | |   V | 
 |  *                                     | ------------------------   | | +--------+ | 
 |  *                                     |                            | | |  RED3  |  [d0 0 0 d0] | 
 |  *                                     |                            | | +--------+ | 
 |  *                                     +---->+--------+--------+    | |   | | 
 |  *   (T2[1w]||ADD2[4w]||ADD1[3w])            |   T1   |   T0   |----+ |   | | 
 |  *                                           +--------+--------+    | |   | | 
 |  *                                        ---+--------+--------+<---+ |   | | 
 |  *                                    +--- T2|   T1   |   T0   |----------+ | 
 |  *                                    |   ---+--------+--------+      |   | | 
 |  *                                    |  +--------+--------+<-------------+ | 
 |  *                                    |  |  RED2  |  RED1  |-----+    |   | [0 d1 d0 d1] [d0 0 d1 d0] | 
 |  *                                    |  +--------+--------+     |    |   | | 
 |  *                                    |  +--------+<----------------------+ | 
 |  *                                    |  |  RED3  |--------------+    |     [0 0 d1 d0] | 
 |  *                                    |  +--------+              |    | | 
 |  *                                    +--->+--------+--------+   |    | | 
 |  *                                         |   T1   |   T0   |--------+ | 
 |  *                                         +--------+--------+   |    | | 
 |  *                                   --------------------------- |    | | 
 |  *                                                               |    | | 
 |  *                                       +--------+--------+<----+    | | 
 |  *                                       |  RED2  |  RED1  |          | | 
 |  *                                       +--------+--------+          | | 
 |  *                                      ---+--------+--------+<-------+ | 
 |  *                                       T2|   T1   |   T0   |            (H1P-H1P-H00RRAY!) | 
 |  *                                      ---+--------+--------+ | 
 |  * | 
 |  * Last 'group' needs to RED2||RED1 shifted less | 
 |  */ | 
// p256MulInternalVX: vector-facility (VX) implementation of the 256-bit
// modular multiplication used for P-256 field arithmetic.
//
// In:  X1|X0 = first operand, Y1|Y0 = second operand (high halves in
//      the *1 registers); P1|P0 = the P-256 prime; CPOOL -> p256mul<>
//      constant pool (VPERM selector masks at offsets 32..112).
// Out: T1|T0 = product, after a final conditional subtraction of P.
// The ADD*/RED*/SEL*/CAR*/T2/ZER/YDIG scratch aliases are clobbered.
//
// y is consumed one 32-bit digit at a time (VREPF broadcasts one word
// of Y0/Y1); each group below multiplies x by two digits of y and
// interleaves one partial-reduction step (RED1/RED2/RED3), following
// the dataflow diagram above.
TEXT ·p256MulInternalVX(SB), NOSPLIT, $0-0
	VL 32(CPOOL), SEL1
	VL 48(CPOOL), SEL2
	VL 64(CPOOL), SEL3
	VL 80(CPOOL), SEL4

	// ---------------------------------------------------
	// Group 1 (words 3 and 2 of Y0): first group has no accumulated
	// input, so plain multiply (VMLF/VMLHF) instead of multiply-add.

	VREPF $3, Y0, YDIG
	VMLHF X0, YDIG, ADD1H
	VMLHF X1, YDIG, ADD2H
	VMLF  X0, YDIG, ADD1
	VMLF  X1, YDIG, ADD2

	VREPF  $2, Y0, YDIG
	VMALF  X0, YDIG, ADD1H, ADD3
	VMALF  X1, YDIG, ADD2H, ADD4
	VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free
	VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free

	VZERO ZER
	VL    32(CPOOL), SEL1
	VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]

	VSLDB $12, ADD2, ADD1, T0 // ADD1 Free
	VSLDB $12, ZER, ADD2, T1  // ADD2 Free

	VACCQ  T0, ADD3, CAR1
	VAQ    T0, ADD3, T0       // ADD3 Free
	VACCCQ T1, ADD4, CAR1, T2
	VACQ   T1, ADD4, CAR1, T1 // ADD4 Free

	// Build this round's reduction terms from RED3 and T0.
	VL    48(CPOOL), SEL2
	VL    64(CPOOL), SEL3
	VL    80(CPOOL), SEL4
	VPERM RED3, T0, SEL2, RED1 // [d0  0 d1 d0]
	VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
	VPERM RED3, T0, SEL4, RED3 // [ 0  0 d1 d0]
	VSQ   RED3, RED2, RED2     // Guaranteed not to underflow

	VSLDB $12, T1, T0, T0
	VSLDB $12, T2, T1, T1

	VACCQ  T0, ADD3H, CAR1
	VAQ    T0, ADD3H, T0
	VACCCQ T1, ADD4H, CAR1, T2
	VACQ   T1, ADD4H, CAR1, T1

	// ---------------------------------------------------
	// Group 2 (words 1 and 0 of Y0): the pending RED1/RED2 terms from
	// the previous group are folded into the accumulator below.

	VREPF  $1, Y0, YDIG
	VMALHF X0, YDIG, T0, ADD1H
	VMALHF X1, YDIG, T1, ADD2H
	VMALF  X0, YDIG, T0, ADD1  // T0 Free->ADD1
	VMALF  X1, YDIG, T1, ADD2  // T1 Free->ADD2

	VREPF  $0, Y0, YDIG
	VMALF  X0, YDIG, ADD1H, ADD3
	VMALF  X1, YDIG, ADD2H, ADD4
	VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free->ADD3H
	VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free->ADD4H , YDIG Free->ZER

	VZERO ZER
	VL    32(CPOOL), SEL1
	VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]

	VSLDB $12, ADD2, ADD1, T0 // ADD1 Free->T0
	VSLDB $12, T2, ADD2, T1   // ADD2 Free->T1, T2 Free

	VACCQ  T0, RED1, CAR1
	VAQ    T0, RED1, T0
	VACCCQ T1, RED2, CAR1, T2
	VACQ   T1, RED2, CAR1, T1

	VACCQ  T0, ADD3, CAR1
	VAQ    T0, ADD3, T0
	VACCCQ T1, ADD4, CAR1, CAR2
	VACQ   T1, ADD4, CAR1, T1
	VAQ    T2, CAR2, T2

	VL    48(CPOOL), SEL2
	VL    64(CPOOL), SEL3
	VL    80(CPOOL), SEL4
	VPERM RED3, T0, SEL2, RED1 // [d0  0 d1 d0]
	VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
	VPERM RED3, T0, SEL4, RED3 // [ 0  0 d1 d0]
	VSQ   RED3, RED2, RED2     // Guaranteed not to underflow

	VSLDB $12, T1, T0, T0
	VSLDB $12, T2, T1, T1

	VACCQ  T0, ADD3H, CAR1
	VAQ    T0, ADD3H, T0
	VACCCQ T1, ADD4H, CAR1, T2
	VACQ   T1, ADD4H, CAR1, T1

	// ---------------------------------------------------
	// Group 3 (words 3 and 2 of Y1): same pattern as group 2.

	VREPF  $3, Y1, YDIG
	VMALHF X0, YDIG, T0, ADD1H
	VMALHF X1, YDIG, T1, ADD2H
	VMALF  X0, YDIG, T0, ADD1
	VMALF  X1, YDIG, T1, ADD2

	VREPF  $2, Y1, YDIG
	VMALF  X0, YDIG, ADD1H, ADD3
	VMALF  X1, YDIG, ADD2H, ADD4
	VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free
	VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free

	VZERO ZER
	VL    32(CPOOL), SEL1
	VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]

	VSLDB $12, ADD2, ADD1, T0 // ADD1 Free
	VSLDB $12, T2, ADD2, T1   // ADD2 Free

	VACCQ  T0, RED1, CAR1
	VAQ    T0, RED1, T0
	VACCCQ T1, RED2, CAR1, T2
	VACQ   T1, RED2, CAR1, T1

	VACCQ  T0, ADD3, CAR1
	VAQ    T0, ADD3, T0
	VACCCQ T1, ADD4, CAR1, CAR2
	VACQ   T1, ADD4, CAR1, T1
	VAQ    T2, CAR2, T2

	VL    48(CPOOL), SEL2
	VL    64(CPOOL), SEL3
	VL    80(CPOOL), SEL4
	VPERM RED3, T0, SEL2, RED1 // [d0  0 d1 d0]
	VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
	VPERM RED3, T0, SEL4, RED3 // [ 0  0 d1 d0]
	VSQ   RED3, RED2, RED2     // Guaranteed not to underflow

	VSLDB $12, T1, T0, T0
	VSLDB $12, T2, T1, T1

	VACCQ  T0, ADD3H, CAR1
	VAQ    T0, ADD3H, T0
	VACCCQ T1, ADD4H, CAR1, T2
	VACQ   T1, ADD4H, CAR1, T1

	// ---------------------------------------------------
	// Group 4 (last; words 1 and 0 of Y1): the final reduction uses
	// SEL5 [d1 d0 d1 d0] and SEL6 [0 d1 d0 0] because the last group
	// needs RED2||RED1 shifted less (see note above the routine).

	VREPF  $1, Y1, YDIG
	VMALHF X0, YDIG, T0, ADD1H
	VMALHF X1, YDIG, T1, ADD2H
	VMALF  X0, YDIG, T0, ADD1
	VMALF  X1, YDIG, T1, ADD2

	VREPF  $0, Y1, YDIG
	VMALF  X0, YDIG, ADD1H, ADD3
	VMALF  X1, YDIG, ADD2H, ADD4
	VMALHF X0, YDIG, ADD1H, ADD3H
	VMALHF X1, YDIG, ADD2H, ADD4H

	VZERO ZER
	VL    32(CPOOL), SEL1
	VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]

	VSLDB $12, ADD2, ADD1, T0
	VSLDB $12, T2, ADD2, T1

	VACCQ  T0, RED1, CAR1
	VAQ    T0, RED1, T0
	VACCCQ T1, RED2, CAR1, T2
	VACQ   T1, RED2, CAR1, T1

	VACCQ  T0, ADD3, CAR1
	VAQ    T0, ADD3, T0
	VACCCQ T1, ADD4, CAR1, CAR2
	VACQ   T1, ADD4, CAR1, T1
	VAQ    T2, CAR2, T2

	VL    96(CPOOL), SEL5
	VL    112(CPOOL), SEL6
	VPERM T0, RED3, SEL5, RED2 // [d1 d0 d1 d0]
	VPERM T0, RED3, SEL6, RED1 // [ 0 d1 d0  0]
	VSQ   RED1, RED2, RED2     // Guaranteed not to underflow

	VSLDB $12, T1, T0, T0
	VSLDB $12, T2, T1, T1

	VACCQ  T0, ADD3H, CAR1
	VAQ    T0, ADD3H, T0
	VACCCQ T1, ADD4H, CAR1, T2
	VACQ   T1, ADD4H, CAR1, T1

	VACCQ  T0, RED1, CAR1
	VAQ    T0, RED1, T0
	VACCCQ T1, RED2, CAR1, CAR2
	VACQ   T1, RED2, CAR1, T1
	VAQ    T2, CAR2, T2

	// ---------------------------------------------------
	// Final conditional subtraction: compute T - P with a borrow chain
	// into T2, then VSEL the in-range representative.

	VZERO   RED3
	VSCBIQ  P0, T0, CAR1
	VSQ     P0, T0, ADD1H
	VSBCBIQ T1, P1, CAR1, CAR2
	VSBIQ   T1, P1, CAR1, ADD2H
	VSBIQ   T2, RED3, CAR2, T2

	// what output to use, ADD2H||ADD1H or T1||T0?
	VSEL T0, ADD1H, T2, T0
	VSEL T1, ADD2H, T2, T1
	RET
 |  | 
// Release the register aliases used by the VX implementation above so
// the names can be rebound for the VMSL implementation below.
#undef CPOOL

#undef X0
#undef X1
#undef Y0
#undef Y1
#undef T0
#undef T1
#undef P0
#undef P1

#undef SEL1
#undef SEL2
#undef SEL3
#undef SEL4
#undef SEL5
#undef SEL6

#undef YDIG
#undef ADD1H
#undef ADD2H
#undef ADD3
#undef ADD4
#undef RED1
#undef RED2
#undef RED3
#undef T2
#undef ADD1
#undef ADD2
#undef ADD3H
#undef ADD4H
#undef ZER
#undef CAR1
#undef CAR2
 |  | 
 | // --------------------------------------- | 
 | // p256MulInternalVMSL | 
 | // V0-V3,V30,V31 - Not Modified | 
 | // V4-V14 - Volatile | 
 |  | 
#define CPOOL   R4 // constant-pool base pointer
#define SCRATCH R9 // caller-provided spill area for V16..V19

// Parameters
#define X0    V0 // Not modified
#define X1    V1 // Not modified
#define Y0    V2 // Not modified
#define Y1    V3 // Not modified
#define T0    V4
#define T1    V5
#define T2    V6
#define P0    V30 // Not modified
#define P1    V31 // Not modified

// OBSERVATION3 assembles two 128-bit values h1|h0 purely from shifted
// copies of the chunk d0 (byte shifts via VSLDB plus one subtract and
// one add); it is the per-chunk helper of the reduction passes below,
// relying on the sparse bit pattern of the P-256 prime.
// input: d0
// output: h0, h1
// temp: TEMP, ZERO, BORROW
#define OBSERVATION3(d0, h0, h1, TEMP, ZERO, BORROW) \
	VZERO ZERO                   \
	VSLDB $4, d0, ZERO, h0       \
	VLR   h0, BORROW             \
	VSLDB $12, ZERO, h0, TEMP    \
	VSQ   TEMP, h0, h0           \
	VSLDB $12, d0, BORROW, h1    \
	VSLDB $8, ZERO, BORROW, TEMP \
	VAQ   TEMP, h0, h0           \

// OBSERVATION3A: variant of OBSERVATION3 used for the final (64-bit)
// reduction step, again built only from byte shifts of d2.
// input: d2
// output: h0, h1
// temp: TEMP, ZERO
#define OBSERVATION3A(d2, h0, h1, TEMP, ZERO) \
	VZERO ZERO                \
	VSLDB $8, d2, ZERO, TEMP  \
	VSLDB $8, d2, TEMP, h0    \
	VSLDB $12, ZERO, TEMP, h1 \
	VSQ   h1, h0, h0          \
 |  | 
// p256MulInternalVMSL: multiplication using the vector-multiply-sum-
// logical facility (VMSL, available with vector enhancements 1).
//
// In:  X1|X0, Y1|Y0 = operands; P1|P0 = the P-256 prime; SCRATCH
//      points at a spill area where V16..V19 are saved/restored.
// Out: T1|T0 = product after three OBSERVATION3* reduction passes and
//      a final conditional subtraction of P.
//
// Each 256-bit operand is split into five limbs (7-byte limbs plus a
// short 4-byte top limb) using VN masks and VPERM patterns from the
// p256vmsl<> pool; VMSLG then produces the nine diagonal "column" sums
// of limb products, which are realigned into four 128-bit chunks of
// the full 512-bit product before reduction.
TEXT ·p256MulInternalVMSL(SB), NOFRAME|NOSPLIT, $0-0
	VSTM V16, V19, (SCRATCH) // spill the vector registers we clobber

	MOVD $p256vmsl<>+0x00(SB), CPOOL

	// Divide input1 into 5 limbs
	VGBM  $0x007f, V14
	VZERO V12
	VSLDB $2, X1, X0, V13
	VSLDB $2, Y1, Y0, V8
	VSLDB $4, V12, X1, V11 // V11(X1): 4 bytes limb
	VSLDB $4, V12, Y1, V6  // V6: 4 bytes limb

	VN V14, X0, V5   // V5: first 7 bytes limb
	VN V14, Y0, V10  // V10: first 7 bytes limb
	VN V14, V13, V13 // v13: third 7 bytes limb
	VN V14, V8, V8   // V8: third 7 bytes limb

	VMSLG V10, V5, V12, V10 // v10: l10 x l5 (column 1)
	VMSLG V8, V5, V12, V8   // v8: l8 x l5
	VMSLG V6, V13, V12, V13 // v13: l6 x l3
	VMSLG V6, V11, V12, V11 // v11: l6 x l1 (column 9)
	VMSLG V6, V5, V12, V6   // v6: l6 x l5

	MOVD $p256vmsl<>+0x00(SB), CPOOL
	VGBM $0x7f7f, V14

	// Extract the remaining limb pairs with VPERM patterns from the pool.
	VL 0(CPOOL), V4
	VL 16(CPOOL), V7
	VL 32(CPOOL), V9
	VL 48(CPOOL), V5
	VLM 64(CPOOL), V16, V19

	VPERM V12, X0, V4, V4   // v4: limb4 | limb5
	VPERM Y1, Y0, V7, V7
	VPERM V12, Y0, V9, V9   // v9: limb10 | limb9
	VPERM X1, X0, V5, V5
	VPERM X1, X0, V16, V16
	VPERM Y1, Y0, V17, V17
	VPERM X1, V12, V18, V18 // v18: limb1 | limb2
	VPERM Y1, V12, V19, V19 // v19: limb7 | limb6
	VN    V14, V7, V7       // v7:  limb9 | limb8
	VN    V14, V5, V5       // v5:  limb3 | limb4
	VN    V14, V16, V16     // v16: limb2 | limb3
	VN    V14, V17, V17     // v17: limb8 | limb7

	// Accumulate the remaining column sums (columns 2..8).
	VMSLG V9, V4, V12, V14   // v14: l10 x l4 + l9 x l5 (column 2)
	VMSLG V9, V5, V8, V8     // v8: l10 x l9 + l3 x l4 + l8 x l5 (column 3)
	VMSLG V9, V16, V12, V16  // v16: l10 x l9 + l2 x l3
	VMSLG V9, V18, V12, V9   // v9: l10 x l1 + l9 x l2
	VMSLG V7, V18, V12, V7   // v7: l9 x l1 + l8 x l2
	VMSLG V17, V4, V16, V16  // v16: l8 x l4 + l7 x l5 + l10 x l9 + l2 x l3 (column 4)
	VMSLG V17, V5, V9, V9    // v9: l10 x l1 + l9 x l2 + l8 x l3 + l7 x l4
	VMSLG V17, V18, V12, V17 // v18: l8 x l1 + l7 x l2
	VMSLG V19, V5, V7, V7    // v7: l9 x l1 + l8 x l2 + l7 x l3 + l6 x l4 (column 6)
	VMSLG V19, V18, V12, V19 // v19: l7 x l1 + l6 x l2 (column 8)
	VAQ   V9, V6, V9         // v9: l10 x l1 + l9 x l2 + l8 x l3 + l7 x l4 + l6 x l5 (column 5)
	VAQ   V17, V13, V13      // v13: l8 x l1 + l7 x l2 + l6 x l3 (column 7)

	// Fold each column's overflow into its neighboring column.
	VSLDB $9, V12, V10, V4
	VSLDB $9, V12, V7, V5
	VAQ   V4, V14, V14
	VAQ   V5, V13, V13

	VSLDB $9, V12, V14, V4
	VSLDB $9, V12, V13, V5
	VAQ   V4, V8, V8
	VAQ   V5, V19, V19

	VSLDB $9, V12, V8, V4
	VSLDB $9, V12, V19, V5
	VAQ   V4, V16, V16
	VAQ   V5, V11, V11

	VSLDB $9, V12, V16, V4
	VAQ   V4, V9, V17

	VGBM $0x007f, V4
	VGBM $0x00ff, V5

	// Mask each column down (0x007f keeps the low 7 bytes; 0x00ff 8).
	VN V10, V4, V10
	VN V14, V4, V14
	VN V8, V4, V8
	VN V16, V4, V16
	VN V17, V4, V9
	VN V7, V4, V7
	VN V13, V4, V13
	VN V19, V4, V19
	VN V11, V5, V11

	// Splice the masked columns into contiguous 128-bit chunks.
	VSLDB $7, V14, V14, V14
	VSLDB $14, V8, V12, V4
	VSLDB $14, V12, V8, V8
	VSLDB $5, V16, V16, V16
	VSLDB $12, V9, V12, V5

	VO V14, V10, V10
	VO V8, V16, V16
	VO V4, V10, V10  // first rightmost 128bits of the multiplication result
	VO V5, V16, V16  // second rightmost 128bits of the multiplication result

	// adjust v7, v13, v19, v11
	VSLDB $7, V13, V13, V13
	VSLDB $14, V19, V12, V4
	VSLDB $14, V12, V19, V19
	VSLDB $5, V11, V12, V5
	VO    V13, V7, V7
	VO    V4, V7, V7
	VO    V19, V5, V11

	VSLDB $9, V12, V17, V14
	VSLDB $12, V12, V9, V9
	VACCQ V7, V14, V13
	VAQ   V7, V14, V7
	VAQ   V11, V13, V11

	// First reduction, 96 bits
	VSLDB $4, V16, V10, T0
	VSLDB $4, V12, V16, T1
	VSLDB $3, V11, V7, V11 // fourth rightmost 128bits of the multiplication result
	VSLDB $3, V7, V12, V7
	OBSERVATION3(V10, V8, T2, V17, V18, V19)// results V8 | T2
	VO    V7, V9, V7       // third rightmost 128bits of the multiplication result
	VACCQ T0, T2, V9
	VAQ   T0, T2, T2
	VACQ  T1, V8, V9, V8

	// Second reduction 96 bits
	VSLDB $4, V8, T2, T0
	VSLDB $4, V12, V8, T1
	OBSERVATION3(T2, V9, V8, V17, V18, V19)// results V9 | V8
	VACCQ T0, V8, T2
	VAQ   T0, V8, V8
	VACQ  T1, V9, T2, V9

	// Third reduction 64 bits
	VSLDB  $8, V9, V8, T0
	VSLDB  $8, V12, V9, T1
	OBSERVATION3A(V8, V14, V13, V17, V18)// results V14 | V13
	VACCQ  T0, V13, V12
	VAQ    T0, V13, V13
	VACQ   T1, V14, V12, V14
	VACCQ  V13, V7, V12
	VAQ    V13, V7, T0
	VACCCQ V14, V11, V12, T2
	VACQ   V14, V11, V12, T1 // results T2 | T1 | T0

	// ---------------------------------------------------
	// Final conditional subtraction of P (same pattern as the VX path).
	MOVD $p256mul<>+0x00(SB), CPOOL

	VZERO   V12
	VSCBIQ  P0, T0, V8
	VSQ     P0, T0, V7
	VSBCBIQ T1, P1, V8, V10
	VSBIQ   T1, P1, V8, V9
	VSBIQ   T2, V12, V10, T2

	// what output to use, V9||V7 or T1||T0?
	VSEL T0, V7, T2, T0
	VSEL T1, V9, T2, T1

	VLM (SCRATCH), V16, V19 // restore spilled vector registers

	RET
 |  | 
 | // --------------------------------------- | 
 | // p256SqrInternalVMSL | 
 | // V0-V1,V30,V31 - Not Modified | 
 | // V4-V14 - Volatile | 
 |  | 
// p256SqrInternalVMSL: squaring via VMSL. Same overall structure as
// p256MulInternalVMSL, but with a single input X1|X0, only three
// spilled registers (V16..V18), and the VMSLEG/VMSLEOG instruction
// forms for the symmetric cross products (presumably the even/odd
// doubling variants of VMSL — confirm against the z/Architecture
// Principles of Operation).
//
// In:  X1|X0 = operand; P1|P0 = the P-256 prime; SCRATCH -> spill area.
// Out: T1|T0 = square after three reduction passes and a final
//      conditional subtraction of P.
TEXT ·p256SqrInternalVMSL(SB), NOFRAME|NOSPLIT, $0-0
	VSTM V16, V18, (SCRATCH) // spill the vector registers we clobber

	MOVD $p256vmsl<>+0x00(SB), CPOOL
	// Divide input into limbs
	VGBM  $0x007f, V14
	VZERO V12
	VSLDB $2, X1, X0, V13
	VSLDB $4, V12, X1, V11 // V11(X1): 4 bytes limb

	VN V14, X0, V10  // V10: first 7 bytes limb
	VN V14, V13, V13 // v13: third 7 bytes limb

	// Limb naming follows the Mul version; both operands are X here,
	// so l10..l6 are the same limbs as l5..l1.
	VMSLG V10, V10, V12, V10 // v10: l10 x l5 (column 1)
	VMSLG V13, V13, V12, V13 // v13: l8 x l3
	VMSLG V11, V11, V12, V11 // v11: l6 x l1 (column 9)

	MOVD $p256vmsl<>+0x00(SB), CPOOL
	VGBM $0x7f7f, V14

	// Extract the remaining limb pairs with VPERM patterns from the pool.
	VL 0(CPOOL), V4
	VL 16(CPOOL), V7
	VL 32(CPOOL), V9
	VL 48(CPOOL), V5
	VLM 64(CPOOL), V16, V18
	VL 112(CPOOL), V8

	VPERM V12, X0, V4, V4   // v4: limb4 | limb5
	VPERM X1, X0, V7, V7
	VPERM V12, X0, V9, V9   // v9: limb10 | limb9
	VPERM X1, X0, V5, V5
	VPERM X1, X0, V16, V16
	VPERM X1, X0, V17, V17
	VPERM X1, V12, V18, V18 // v18: limb1 | limb2
	VPERM X1, V12, V8, V8   // v8:  limb7 | limb6
	VN    V14, V7, V7       // v7:  limb9 | limb8
	VN    V14, V5, V5       // v5:  limb3 | limb4
	VN    V14, V16, V16     // v16: limb2 | limb3
	VN    V14, V17, V17     // v17: limb8 | limb7

	// Column sums; the E/EO forms account for the doubled cross terms.
	VMSLEOG V9, V18, V13, V6   // v6: l10 x l1 + l9 x l2 + l8 x l3 + l7 x l4 + l6 x l5 (column 5)
	VMSLG   V9, V4, V12, V14   // v14: l10 x l4 + l9 x l5 (column 2)
	VMSLEOG V9, V16, V12, V16  // v16: l10 x l2 + l9 x l3 + l8 x l4 + l7 x l5 (column 4)
	VMSLEOG V7, V18, V12, V7   // v7: l9 x l1 + l8 x l2 (column 6)
	VMSLEG  V17, V18, V12, V13 // v13: l8 x l1 + l7 x l2 + l6 x l3 (column 7)
	VMSLG   V8, V18, V12, V8   // v8: l7 x l1 + l6 x l2 (column 8)
	VMSLEG  V9, V5, V12, V18   // v18: l10 x l3 + l9 x l4 + l8 x l5 (column 3)

	// Fold each column's overflow into its neighboring column.
	VSLDB $9, V12, V10, V4
	VSLDB $9, V12, V7, V5
	VAQ   V4, V14, V14
	VAQ   V5, V13, V13

	VSLDB $9, V12, V14, V4
	VSLDB $9, V12, V13, V5
	VAQ   V4, V18, V18
	VAQ   V5, V8, V8

	VSLDB $9, V12, V18, V4
	VSLDB $9, V12, V8, V5
	VAQ   V4, V16, V16
	VAQ   V5, V11, V11

	VSLDB $9, V12, V16, V4
	VAQ   V4, V6, V17

	VGBM $0x007f, V4
	VGBM $0x00ff, V5

	// Mask each column down (0x007f keeps the low 7 bytes; 0x00ff 8).
	VN V10, V4, V10
	VN V14, V4, V14
	VN V18, V4, V18
	VN V16, V4, V16
	VN V17, V4, V9
	VN V7, V4, V7
	VN V13, V4, V13
	VN V8, V4, V8
	VN V11, V5, V11

	// Splice the masked columns into contiguous 128-bit chunks.
	VSLDB $7, V14, V14, V14
	VSLDB $14, V18, V12, V4
	VSLDB $14, V12, V18, V18
	VSLDB $5, V16, V16, V16
	VSLDB $12, V9, V12, V5

	VO V14, V10, V10
	VO V18, V16, V16
	VO V4, V10, V10  // first rightmost 128bits of the multiplication result
	VO V5, V16, V16  // second rightmost 128bits of the multiplication result

	// adjust v7, v13, v8, v11
	VSLDB $7, V13, V13, V13
	VSLDB $14, V8, V12, V4
	VSLDB $14, V12, V8, V8
	VSLDB $5, V11, V12, V5
	VO    V13, V7, V7
	VO    V4, V7, V7
	VO    V8, V5, V11

	VSLDB $9, V12, V17, V14
	VSLDB $12, V12, V9, V9
	VACCQ V7, V14, V13
	VAQ   V7, V14, V7
	VAQ   V11, V13, V11

	// First reduction, 96 bits
	VSLDB $4, V16, V10, T0
	VSLDB $4, V12, V16, T1
	VSLDB $3, V11, V7, V11 // fourth rightmost 128bits of the multiplication result
	VSLDB $3, V7, V12, V7
	OBSERVATION3(V10, V8, T2, V16, V17, V18)// results V8 | T2
	VO    V7, V9, V7       // third rightmost 128bits of the multiplication result
	VACCQ T0, T2, V9
	VAQ   T0, T2, T2
	VACQ  T1, V8, V9, V8

	// Second reduction 96 bits
	VSLDB $4, V8, T2, T0
	VSLDB $4, V12, V8, T1
	OBSERVATION3(T2, V9, V8, V16, V17, V18)// results V9 | V8
	VACCQ T0, V8, T2
	VAQ   T0, V8, V8
	VACQ  T1, V9, T2, V9

	// Third reduction 64 bits
	VSLDB  $8, V9, V8, T0
	VSLDB  $8, V12, V9, T1
	OBSERVATION3A(V8, V14, V13, V17, V18)// results V14 | V13
	VACCQ  T0, V13, V12
	VAQ    T0, V13, V13
	VACQ   T1, V14, V12, V14
	VACCQ  V13, V7, V12
	VAQ    V13, V7, T0
	VACCCQ V14, V11, V12, T2
	VACQ   V14, V11, V12, T1 // results T2 | T1 | T0

	// ---------------------------------------------------
	// Final conditional subtraction of P.
	MOVD $p256mul<>+0x00(SB), CPOOL

	VZERO   V12
	VSCBIQ  P0, T0, V8
	VSQ     P0, T0, V7
	VSBCBIQ T1, P1, V8, V10
	VSBIQ   T1, P1, V8, V9
	VSBIQ   T2, V12, V10, T2

	// what output to use, V9||V7 or T1||T0?
	VSEL T0, V7, T2, T0
	VSEL T1, V9, T2, T1

	VLM (SCRATCH), V16, V18 // restore spilled vector registers
	RET
 |  | 
 |  | 
 |  | 
// Release the VMSL implementation's aliases; SCRATCH is re-declared
// immediately because the dispatch wrappers below still use it.
#undef CPOOL
#undef SCRATCH
#undef X0
#undef X1
#undef Y0
#undef Y1
#undef T0
#undef T1
#undef T2
#undef P0
#undef P1

#define SCRATCH R9 // spill-area pointer passed to the VMSL routines
 |  | 
// p256MulInternal<>: dispatch shim. Allocates a 64-byte scratch frame
// (the VMSL path spills V16..V19 there via SCRATCH) and calls whatever
// implementation ·p256MulInternalFacility currently points at — the
// trampoline on the first call, then the selected VX/VMSL routine.
TEXT p256MulInternal<>(SB),NOSPLIT,$64-0
	MOVD    $scratch-64(SP), SCRATCH
	MOVD    ·p256MulInternalFacility+0x00(SB),R7
	CALL    (R7)
	RET
 |  | 
 | TEXT ·p256MulInternalTrampolineSetup(SB),NOSPLIT|NOFRAME, $0 | 
 | 	MOVBZ  internal∕cpu·S390X+const_offsetS390xHasVE1(SB), R0 | 
 | 	MOVD    $·p256MulInternalFacility+0x00(SB), R7 | 
 | 	MOVD    $·p256MulInternalVX(SB), R8 | 
 | 	CMPBEQ  R0, $0, novmsl      // VE1 facility = 1, VMSL supported | 
 | 	MOVD    $·p256MulInternalVMSL(SB), R8 | 
 | novmsl: | 
 | 	MOVD    R8, 0(R7) | 
 | 	BR      (R8) | 
 |  | 
// Memoized function pointer for the Mul implementation; statically
// initialized to the trampoline, which overwrites it on first use.
GLOBL ·p256MulInternalFacility+0x00(SB), NOPTR, $8
DATA ·p256MulInternalFacility+0x00(SB)/8, $·p256MulInternalTrampolineSetup(SB)
 |  | 
 | // Parameters | 
#define X0    V0 // input x, low 128 bits
#define X1    V1 // input x, high 128 bits
#define Y0    V2 // second multiplier operand, low 128 bits
#define Y1    V3 // second multiplier operand, high 128 bits
 |  | 
 | TEXT ·p256SqrInternalVX(SB), NOFRAME|NOSPLIT, $0 | 
 | 	VLR X0, Y0 | 
 | 	VLR X1, Y1 | 
 | 	BR  ·p256MulInternalVX(SB) | 
 |  | 
// Done with the VX squaring shim's aliases.
#undef X0
#undef X1
#undef Y0
#undef Y1
 |  | 
 |  | 
 | TEXT p256SqrInternal<>(SB),NOSPLIT,$48-0 | 
 | 	MOVD    $scratch-48(SP), SCRATCH | 
 |         MOVD    ·p256SqrInternalFacility+0x00(SB),R7 | 
 |         CALL    (R7) | 
 | 	RET | 
 |  | 
// First-call dispatcher: choose VMSL when vector-enhancements 1 is
// available, else plain VX; memoize the choice and tail-call it.
TEXT ·p256SqrInternalTrampolineSetup(SB),NOSPLIT|NOFRAME, $0
	MOVBZ  internal∕cpu·S390X+const_offsetS390xHasVE1(SB), R0
	MOVD    $·p256SqrInternalFacility+0x00(SB), R7
	MOVD    $·p256SqrInternalVX(SB), R8
	CMPBEQ  R0, $0, novmsl      // no VE1 facility: keep VX fallback in R8
	MOVD    $·p256SqrInternalVMSL(SB), R8
novmsl:
	MOVD    R8, 0(R7)
	BR      (R8)
 |  | 
 |  | 
// Memoized function pointer for the Sqr implementation; statically
// initialized to the trampoline above.
GLOBL ·p256SqrInternalFacility+0x00(SB), NOPTR, $8
DATA ·p256SqrInternalFacility+0x00(SB)/8, $·p256SqrInternalTrampolineSetup(SB)

#undef SCRATCH
 |  | 
 |  | 
// p256SubInternal computes T1|T0 = (X1|X0 - Y1|Y0) mod P.
// The raw 256-bit difference is formed with a borrow chain
// (VSCBIQ/VSBIQ); SEL1 is folded into an all-ones/zero mask, P
// (PH|PL) is added back unconditionally into TT1|TT0, and VSEL picks
// between the raw difference and the P-corrected one based on whether
// the subtraction underflowed.
// Clobbers ZER, CAR1, SEL1, TT0, TT1.
#define p256SubInternal(T1, T0, X1, X0, Y1, Y0) \
	VZERO   ZER                \
	VSCBIQ  Y0, X0, CAR1       \
	VSQ     Y0, X0, T0         \
	VSBCBIQ X1, Y1, CAR1, SEL1 \
	VSBIQ   X1, Y1, CAR1, T1   \
	VSQ     SEL1, ZER, SEL1    \
	                           \
	VACCQ   T0, PL, CAR1       \
	VAQ     T0, PL, TT0        \
	VACQ    T1, PH, CAR1, TT1  \
	                           \
	VSEL    T0, TT0, SEL1, T0  \
	VSEL    T1, TT1, SEL1, T1  \
 |  | 
// p256AddInternal computes T1|T0 = (X1|X0 + Y1|Y0) mod P.
// 256-bit add with the carry propagated into T2, then a trial
// subtraction of P (PH|PL); the borrow chain through T2 yields the
// SEL1 mask with which VSEL keeps the properly reduced result.
// Clobbers ZER, CAR1, CAR2, T2, TT0, TT1, SEL1.
#define p256AddInternal(T1, T0, X1, X0, Y1, Y0) \
	VACCQ   X0, Y0, CAR1        \
	VAQ     X0, Y0, T0          \
	VACCCQ  X1, Y1, CAR1, T2    \
	VACQ    X1, Y1, CAR1, T1    \
	                            \
	VZERO   ZER                 \
	VSCBIQ  PL, T0, CAR1        \
	VSQ     PL, T0, TT0         \
	VSBCBIQ T1, PH, CAR1, CAR2  \
	VSBIQ   T1, PH, CAR1, TT1   \
	VSBIQ   T2, ZER, CAR2, SEL1 \
	                            \
	VSEL    T0, TT0, SEL1, T0   \
	VSEL    T1, TT1, SEL1, T1
 |  | 
// p256HalfInternal computes T1|T0 = (X1|X0)/2 mod P.
// SEL1 is derived from X0 via a borrow trick (presumably selecting on
// the operand's low bit, odd vs. even — confirm against VSBIQ carry
// semantics): the value is conditionally replaced by X+P, making it
// even, and the resulting 257-bit quantity T2|T1|T0 is shifted right
// one bit (VSRL on each half, with the bits crossing the 128-bit
// boundaries reinserted via VSLDB/VSL/VO).
// Clobbers ZER, CAR1, T2, TT0, TT1, SEL1.
#define p256HalfInternal(T1, T0, X1, X0) \
	VZERO  ZER                \
	VSBIQ  ZER, ZER, X0, SEL1 \
	                          \
	VACCQ  X0, PL, CAR1       \
	VAQ    X0, PL, T0         \
	VACCCQ X1, PH, CAR1, T2   \
	VACQ   X1, PH, CAR1, T1   \
	                          \
	VSEL   X0, T0, SEL1, T0   \
	VSEL   X1, T1, SEL1, T1   \
	VSEL   ZER, T2, SEL1, T2  \
	                          \
	VSLDB  $15, T2, ZER, TT1  \
	VSLDB  $15, T1, ZER, TT0  \
	VREPIB $1, SEL1           \
	VSRL   SEL1, T0, T0       \
	VSRL   SEL1, T1, T1       \
	VREPIB $7, SEL1           \
	VSL    SEL1, TT0, TT0     \
	VSL    SEL1, TT1, TT1     \
	VO     T0, TT0, T0        \
	VO     T1, TT1, T1
 |  | 
 | // --------------------------------------- | 
 | // func p256MulAsm(res, in1, in2 []byte) | 
#define res_ptr R1
#define x_ptr   R2
#define y_ptr   R3
#define CPOOL   R4

// Parameters
#define X0    V0
#define X1    V1
#define Y0    V2
#define Y1    V3
#define T0    V4
#define T1    V5

// Constants
#define P0    V30
#define P1    V31

// p256MulAsm(res, in1, in2): 256-bit P-256 field multiplication on the
// 32-byte operand encodings (in the internal representation used by
// this file — presumably the Montgomery domain; confirm against the Go
// wrapper). The low 128 bits of each value sit at byte offset 16, the
// high 128 bits at offset 0. P1|P0 is loaded from the p256mul<> pool
// (high half at +0, low half at +16) and the work is dispatched
// through p256MulInternal<>.
TEXT ·p256MulAsm(SB), NOSPLIT, $0
	MOVD res+0(FP), res_ptr
	MOVD in1+24(FP), x_ptr
	MOVD in2+48(FP), y_ptr

	VL (1*16)(x_ptr), X0
	VL (0*16)(x_ptr), X1
	VL (1*16)(y_ptr), Y0
	VL (0*16)(y_ptr), Y1

	MOVD $p256mul<>+0x00(SB), CPOOL
	VL   16(CPOOL), P0
	VL   0(CPOOL), P1

	CALL p256MulInternal<>(SB)

	VST T0, (1*16)(res_ptr)
	VST T1, (0*16)(res_ptr)
	RET

#undef res_ptr
#undef x_ptr
#undef y_ptr
#undef CPOOL

#undef X0
#undef X1
#undef Y0
#undef Y1
#undef T0
#undef T1
#undef P0
#undef P1
 |  | 
 | // --------------------------------------- | 
 | // func p256SqrAsm(res, in1 []byte) | 
#define res_ptr R1
#define x_ptr   R2
#define y_ptr   R3
#define CPOOL   R4

// Parameters
#define X0    V0
#define X1    V1
#define T0    V4
#define T1    V5

// Constants
#define P0    V30
#define P1    V31

// p256SqrAsm(res, in1): P-256 field squaring of the 32-byte operand
// (same representation and memory layout as p256MulAsm). Only X1|X0 is
// loaded; the squaring routines duplicate it as needed internally.
TEXT ·p256SqrAsm(SB), NOSPLIT, $0
	MOVD res+0(FP), res_ptr
	MOVD in1+24(FP), x_ptr

	VL (1*16)(x_ptr), X0
	VL (0*16)(x_ptr), X1

	MOVD $p256mul<>+0x00(SB), CPOOL
	VL   16(CPOOL), P0
	VL   0(CPOOL), P1

	CALL p256SqrInternal<>(SB)

	VST T0, (1*16)(res_ptr)
	VST T1, (0*16)(res_ptr)
	RET

#undef res_ptr
#undef x_ptr
#undef y_ptr
#undef CPOOL

#undef X0
#undef X1
#undef T0
#undef T1
#undef P0
#undef P1
 |  | 
 |  | 
 | // Point add with P2 being affine point | 
 | // If sign == 1 -> P2 = -P2 | 
 | // If sel == 0 -> P3 = P1 | 
 | // if zero == 0 -> P3 = P2 | 
 | // p256PointAddAffineAsm(P3, P1, P2 *p256Point, sign, sel, zero int) | 
// Register allocation for p256PointAddAffineAsm (P2 is an affine
// point; see the comments above and the formula block below).
#define P3ptr   R1
#define P1ptr   R2
#define P2ptr   R3
#define CPOOL   R4

// Temporaries in REGs
#define Y2L    V15
#define Y2H    V16
#define T1L    V17
#define T1H    V18
#define T2L    V19
#define T2H    V20
#define T3L    V21
#define T3H    V22
#define T4L    V23
#define T4H    V24

// Temps for Sub and Add
#define TT0  V11
#define TT1  V12
#define T2   V13

// p256MulAsm Parameters
#define X0    V0
#define X1    V1
#define Y0    V2
#define Y1    V3
#define T0    V4
#define T1    V5

#define PL    V30
#define PH    V31

// Names for zero/sel selects. These alias the registers above: the
// same physical register carries different logical roles at different
// points of the routine.
#define X1L    V0
#define X1H    V1
#define Y1L    V2 // p256MulAsmParmY
#define Y1H    V3 // p256MulAsmParmY
#define Z1L    V4
#define Z1H    V5
#define X2L    V0
#define X2H    V1
#define Z2L    V4
#define Z2H    V5
#define X3L    V17 // T1L
#define X3H    V18 // T1H
#define Y3L    V21 // T3L
#define Y3H    V22 // T3H
#define Z3L    V28
#define Z3H    V29

#define ZER   V6
#define SEL1  V7
#define CAR1  V8
#define CAR2  V9
 | /* * | 
 |  * Three operand formula: | 
 |  * Source: 2004 Hankerson–Menezes–Vanstone, page 91. | 
 |  * T1 = Z1² | 
 |  * T2 = T1*Z1 | 
 |  * T1 = T1*X2 | 
 |  * T2 = T2*Y2 | 
 |  * T1 = T1-X1 | 
 |  * T2 = T2-Y1 | 
 |  * Z3 = Z1*T1 | 
 |  * T3 = T1² | 
 |  * T4 = T3*T1 | 
 |  * T3 = T3*X1 | 
 |  * T1 = 2*T3 | 
 |  * X3 = T2² | 
 |  * X3 = X3-T1 | 
 |  * X3 = X3-T4 | 
 |  * T3 = T3-X3 | 
 |  * T3 = T3*T2 | 
 |  * T4 = T4*Y1 | 
 |  * Y3 = T3-T4 | 
 |  | 
 |  * Three operand formulas, but with MulInternal X,Y used to store temps | 
 | X=Z1; Y=Z1; MUL;T-   // T1 = Z1²      T1 | 
 | X=T ; Y-  ; MUL;T2=T // T2 = T1*Z1    T1   T2 | 
 | X-  ; Y=X2; MUL;T1=T // T1 = T1*X2    T1   T2 | 
 | X=T2; Y=Y2; MUL;T-   // T2 = T2*Y2    T1   T2 | 
 | SUB(T2<T-Y1)         // T2 = T2-Y1    T1   T2 | 
 | SUB(Y<T1-X1)         // T1 = T1-X1    T1   T2 | 
 | X=Z1; Y- ;  MUL;Z3:=T// Z3 = Z1*T1         T2 | 
 | X=Y;  Y- ;  MUL;X=T  // T3 = T1*T1         T2 | 
 | X- ;  Y- ;  MUL;T4=T // T4 = T3*T1         T2        T4 | 
 | X- ;  Y=X1; MUL;T3=T // T3 = T3*X1         T2   T3   T4 | 
 | ADD(T1<T+T)          // T1 = T3+T3    T1   T2   T3   T4 | 
 | X=T2; Y=T2; MUL;T-   // X3 = T2*T2    T1   T2   T3   T4 | 
 | SUB(T<T-T1)          // X3 = X3-T1    T1   T2   T3   T4 | 
 | SUB(T<T-T4) X3:=T    // X3 = X3-T4         T2   T3   T4 | 
 | SUB(X<T3-T)          // T3 = T3-X3         T2   T3   T4 | 
 | X- ;  Y- ;  MUL;T3=T // T3 = T3*T2         T2   T3   T4 | 
 | X=T4; Y=Y1; MUL;T-   // T4 = T4*Y1              T3   T4 | 
 | SUB(T<T3-T) Y3:=T    // Y3 = T3-T4              T3   T4 | 
 |  | 
 | 	*/ | 
 | TEXT ·p256PointAddAffineAsm(SB), NOSPLIT, $0 | 
 | 	MOVD P3+0(FP), P3ptr | 
 | 	MOVD P1+8(FP), P1ptr | 
 | 	MOVD P2+16(FP), P2ptr | 
 |  | 
 | 	MOVD $p256mul<>+0x00(SB), CPOOL | 
 | 	VL   16(CPOOL), PL | 
 | 	VL   0(CPOOL), PH | 
 |  | 
 | 	//	if (sign == 1) { | 
 | 	//		Y2 = fromBig(new(big.Int).Mod(new(big.Int).Sub(p256.P, new(big.Int).SetBytes(Y2)), p256.P)) // Y2  = P-Y2 | 
 | 	//	} | 
 |  | 
 | 	VL 32(P2ptr), Y2H | 
 | 	VL 48(P2ptr), Y2L | 
 |  | 
 | 	VLREPG sign+24(FP), SEL1 | 
 | 	VZERO  ZER | 
 | 	VCEQG  SEL1, ZER, SEL1 | 
 |  | 
 | 	VSCBIQ Y2L, PL, CAR1 | 
 | 	VSQ    Y2L, PL, T1L | 
 | 	VSBIQ  PH, Y2H, CAR1, T1H | 
 |  | 
 | 	VSEL Y2L, T1L, SEL1, Y2L | 
 | 	VSEL Y2H, T1H, SEL1, Y2H | 
 |  | 
 | /* * | 
 |  * Three operand formula: | 
 |  * Source: 2004 Hankerson–Menezes–Vanstone, page 91. | 
 |  */ | 
 | 	// X=Z1; Y=Z1; MUL; T-   // T1 = Z1²      T1 | 
 | 	VL   64(P1ptr), X1       // Z1H | 
 | 	VL   80(P1ptr), X0       // Z1L | 
 | 	VLR  X0, Y0 | 
 | 	VLR  X1, Y1 | 
 | 	CALL p256SqrInternal<>(SB) | 
 |  | 
 | 	// X=T ; Y-  ; MUL; T2=T // T2 = T1*Z1    T1   T2 | 
 | 	VLR  T0, X0 | 
 | 	VLR  T1, X1 | 
 | 	CALL p256MulInternal<>(SB) | 
 | 	VLR  T0, T2L | 
 | 	VLR  T1, T2H | 
 |  | 
 | 	// X-  ; Y=X2; MUL; T1=T // T1 = T1*X2    T1   T2 | 
 | 	VL   0(P2ptr), Y1        // X2H | 
 | 	VL   16(P2ptr), Y0       // X2L | 
 | 	CALL p256MulInternal<>(SB) | 
 | 	VLR  T0, T1L | 
 | 	VLR  T1, T1H | 
 |  | 
 | 	// X=T2; Y=Y2; MUL; T-   // T2 = T2*Y2    T1   T2 | 
 | 	VLR  T2L, X0 | 
 | 	VLR  T2H, X1 | 
 | 	VLR  Y2L, Y0 | 
 | 	VLR  Y2H, Y1 | 
 | 	CALL p256MulInternal<>(SB) | 
 |  | 
 | 	// SUB(T2<T-Y1)          // T2 = T2-Y1    T1   T2 | 
 | 	VL 32(P1ptr), Y1H | 
 | 	VL 48(P1ptr), Y1L | 
 | 	p256SubInternal(T2H,T2L,T1,T0,Y1H,Y1L) | 
 |  | 
 | 	// SUB(Y<T1-X1)          // T1 = T1-X1    T1   T2 | 
 | 	VL 0(P1ptr), X1H | 
 | 	VL 16(P1ptr), X1L | 
 | 	p256SubInternal(Y1,Y0,T1H,T1L,X1H,X1L) | 
 |  | 
 | 	// X=Z1; Y- ;  MUL; Z3:=T// Z3 = Z1*T1         T2 | 
 | 	VL   64(P1ptr), X1       // Z1H | 
 | 	VL   80(P1ptr), X0       // Z1L | 
 | 	CALL p256MulInternal<>(SB) | 
 |  | 
 | 	// VST T1, 64(P3ptr) | 
 | 	// VST T0, 80(P3ptr) | 
 | 	VLR T0, Z3L | 
 | 	VLR T1, Z3H | 
 |  | 
 | 	// X=Y;  Y- ;  MUL; X=T  // T3 = T1*T1         T2 | 
 | 	VLR  Y0, X0 | 
 | 	VLR  Y1, X1 | 
 | 	CALL p256SqrInternal<>(SB) | 
 | 	VLR  T0, X0 | 
 | 	VLR  T1, X1 | 
 |  | 
 | 	// X- ;  Y- ;  MUL; T4=T // T4 = T3*T1         T2        T4 | 
 | 	CALL p256MulInternal<>(SB) | 
 | 	VLR  T0, T4L | 
 | 	VLR  T1, T4H | 
 |  | 
 | 	// X- ;  Y=X1; MUL; T3=T // T3 = T3*X1         T2   T3   T4 | 
 | 	VL   0(P1ptr), Y1        // X1H | 
 | 	VL   16(P1ptr), Y0       // X1L | 
 | 	CALL p256MulInternal<>(SB) | 
 | 	VLR  T0, T3L | 
 | 	VLR  T1, T3H | 
 |  | 
 | 	// ADD(T1<T+T)           // T1 = T3+T3    T1   T2   T3   T4 | 
 | 	p256AddInternal(T1H,T1L, T1,T0,T1,T0) | 
 |  | 
 | 	// X=T2; Y=T2; MUL; T-   // X3 = T2*T2    T1   T2   T3   T4 | 
 | 	VLR  T2L, X0 | 
 | 	VLR  T2H, X1 | 
 | 	VLR  T2L, Y0 | 
 | 	VLR  T2H, Y1 | 
 | 	CALL p256SqrInternal<>(SB) | 
 |  | 
 | 	// SUB(T<T-T1)           // X3 = X3-T1    T1   T2   T3   T4  (T1 = X3) | 
 | 	p256SubInternal(T1,T0,T1,T0,T1H,T1L) | 
 |  | 
 | 	// SUB(T<T-T4) X3:=T     // X3 = X3-T4         T2   T3   T4 | 
 | 	p256SubInternal(T1,T0,T1,T0,T4H,T4L) | 
 | 	VLR T0, X3L | 
 | 	VLR T1, X3H | 
 |  | 
 | 	// SUB(X<T3-T)           // T3 = T3-X3         T2   T3   T4 | 
 | 	p256SubInternal(X1,X0,T3H,T3L,T1,T0) | 
 |  | 
 | 	// X- ;  Y- ;  MUL; T3=T // T3 = T3*T2         T2   T3   T4 | 
 | 	CALL p256MulInternal<>(SB) | 
 | 	VLR  T0, T3L | 
 | 	VLR  T1, T3H | 
 |  | 
 | 	// X=T4; Y=Y1; MUL; T-   // T4 = T4*Y1              T3   T4 | 
 | 	VLR  T4L, X0 | 
 | 	VLR  T4H, X1 | 
 | 	VL   32(P1ptr), Y1       // Y1H | 
 | 	VL   48(P1ptr), Y0       // Y1L | 
 | 	CALL p256MulInternal<>(SB) | 
 |  | 
 | 	// SUB(T<T3-T) Y3:=T     // Y3 = T3-T4              T3   T4  (T3 = Y3) | 
 | 	p256SubInternal(Y3H,Y3L,T3H,T3L,T1,T0) | 
 |  | 
 | 	//	if (sel == 0) { | 
 | 	//		copy(P3.x[:], X1) | 
 | 	//		copy(P3.y[:], Y1) | 
 | 	//		copy(P3.z[:], Z1) | 
 | 	//	} | 
 |  | 
 | 	VL 0(P1ptr), X1H | 
 | 	VL 16(P1ptr), X1L | 
 |  | 
 | 	// Y1 already loaded, left over from addition | 
 | 	VL 64(P1ptr), Z1H | 
 | 	VL 80(P1ptr), Z1L | 
 |  | 
 | 	VLREPG sel+32(FP), SEL1 | 
 | 	VZERO  ZER | 
 | 	VCEQG  SEL1, ZER, SEL1 | 
 |  | 
 | 	VSEL X1L, X3L, SEL1, X3L | 
 | 	VSEL X1H, X3H, SEL1, X3H | 
 | 	VSEL Y1L, Y3L, SEL1, Y3L | 
 | 	VSEL Y1H, Y3H, SEL1, Y3H | 
 | 	VSEL Z1L, Z3L, SEL1, Z3L | 
 | 	VSEL Z1H, Z3H, SEL1, Z3H | 
 |  | 
 | 	//	if (zero == 0) { | 
 | 	//		copy(P3.x[:], X2) | 
 | 	//		copy(P3.y[:], Y2) | 
 | 	//		copy(P3.z[:], []byte{0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xfe, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, | 
 | 	//			0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01})  //(p256.z*2^256)%p | 
 | 	//	} | 
 | 	VL 0(P2ptr), X2H | 
 | 	VL 16(P2ptr), X2L | 
 |  | 
 | 	// Y2 already loaded | 
 | 	VL 128(CPOOL), Z2H | 
 | 	VL 144(CPOOL), Z2L | 
 |  | 
 | 	VLREPG zero+40(FP), SEL1 | 
 | 	VZERO  ZER | 
 | 	VCEQG  SEL1, ZER, SEL1 | 
 |  | 
 | 	VSEL X2L, X3L, SEL1, X3L | 
 | 	VSEL X2H, X3H, SEL1, X3H | 
 | 	VSEL Y2L, Y3L, SEL1, Y3L | 
 | 	VSEL Y2H, Y3H, SEL1, Y3H | 
 | 	VSEL Z2L, Z3L, SEL1, Z3L | 
 | 	VSEL Z2H, Z3H, SEL1, Z3H | 
 |  | 
 | 	// All done, store out the result!!! | 
 | 	VST X3H, 0(P3ptr) | 
 | 	VST X3L, 16(P3ptr) | 
 | 	VST Y3H, 32(P3ptr) | 
 | 	VST Y3L, 48(P3ptr) | 
 | 	VST Z3H, 64(P3ptr) | 
 | 	VST Z3L, 80(P3ptr) | 
 |  | 
 | 	RET | 
 |  | 
// Release every register-name alias bound by the preceding
// point-addition routine (its TEXT header is above this chunk),
// so the same names can be rebound for the routines that follow.
#undef P3ptr
#undef P1ptr
#undef P2ptr
#undef CPOOL

#undef Y2L
#undef Y2H
#undef T1L
#undef T1H
#undef T2L
#undef T2H
#undef T3L
#undef T3H
#undef T4L
#undef T4H

#undef TT0
#undef TT1
#undef T2

#undef X0
#undef X1
#undef Y0
#undef Y1
#undef T0
#undef T1

#undef PL
#undef PH

#undef X1L
#undef X1H
#undef Y1L
#undef Y1H
#undef Z1L
#undef Z1H
#undef X2L
#undef X2H
#undef Z2L
#undef Z2H
#undef X3L
#undef X3H
#undef Y3L
#undef Y3H
#undef Z3L
#undef Z3H

#undef ZER
#undef SEL1
#undef CAR1
#undef CAR2
 | // p256PointDoubleAsm(P3, P1 *p256Point) | 
// https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2004-hmv
 | // https://www.hyperelliptic.org/EFD/g1p/auto-shortw.html | 
 | // https://www.hyperelliptic.org/EFD/g1p/auto-shortw-projective-3.html | 
// Pointer arguments (result P3, input P1) and the constant-pool base.
#define P3ptr   R1
#define P1ptr   R2
#define CPOOL   R4

// Temporaries in REGs
// Each 256-bit field element is held as a High/Low pair of 128-bit
// vector registers.
#define X3L    V15
#define X3H    V16
#define Y3L    V17
#define Y3H    V18
#define T1L    V19
#define T1H    V20
#define T2L    V21
#define T2H    V22
#define T3L    V23
#define T3H    V24

#define X1L    V6
#define X1H    V7
#define Y1L    V8
#define Y1H    V9
#define Z1L    V10
#define Z1H    V11

// Temps for Sub and Add
// NOTE(review): TT0 shares V11 with Z1H; the doubling body never keeps
// Z1H live across a Sub/Add macro, so the overlap looks intentional —
// confirm against the macro definitions before rebinding either name.
#define TT0  V11
#define TT1  V12
#define T2   V13

// p256MulAsm Parameters
// p256Sqr/MulInternal read operands from X1:X0 and Y1:Y0 and return the
// reduced result in T1:T0.
#define X0    V0
#define X1    V1
#define Y0    V2
#define Y1    V3
#define T0    V4
#define T1    V5

// PH:PL cache the P-256 prime for the Sub/Add/Half helpers.
#define PL    V30
#define PH    V31

// Z3 deliberately aliases T3 (V23/V24): the doubling body stores Z3
// straight from T1:T0 (VST to 64/80(P3ptr)) and never uses these names,
// so no live range overlaps.
#define Z3L    V23
#define Z3H    V24

#define ZER   V26
#define SEL1  V27
#define CAR1  V28
#define CAR2  V29
 | /* | 
 |  * https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2004-hmv | 
 |  * Cost: 4M + 4S + 1*half + 5add + 2*2 + 1*3. | 
 |  * Source: 2004 Hankerson–Menezes–Vanstone, page 91. | 
 |  * 	A  = 3(X₁-Z₁²)×(X₁+Z₁²) | 
 |  * 	B  = 2Y₁ | 
 |  * 	Z₃ = B×Z₁ | 
 |  * 	C  = B² | 
 |  * 	D  = C×X₁ | 
 |  * 	X₃ = A²-2D | 
 |  * 	Y₃ = (D-X₃)×A-C²/2 | 
 |  * | 
 |  * Three-operand formula: | 
 |  *       T1 = Z1² | 
 |  *       T2 = X1-T1 | 
 |  *       T1 = X1+T1 | 
 |  *       T2 = T2*T1 | 
 |  *       T2 = 3*T2 | 
 |  *       Y3 = 2*Y1 | 
 |  *       Z3 = Y3*Z1 | 
 |  *       Y3 = Y3² | 
 |  *       T3 = Y3*X1 | 
 |  *       Y3 = Y3² | 
 |  *       Y3 = half*Y3 | 
 |  *       X3 = T2² | 
 |  *       T1 = 2*T3 | 
 |  *       X3 = X3-T1 | 
 |  *       T1 = T3-X3 | 
 |  *       T1 = T1*T2 | 
 |  *       Y3 = T1-Y3 | 
 |  */ | 
 |  | 
// func p256PointDoubleAsm(P3, P1 *p256Point)
//
// Doubles the Jacobian point at P1 and writes the result to P3,
// following the dbl-2004-hmv three-operand schedule listed in the
// comment block above.  Within a p256Point the coordinates live at byte
// offsets X: 0(H)/16(L), Y: 32(H)/48(L), Z: 64(H)/80(L) — the High half
// of each 256-bit element is stored first.
// p256SqrInternal/p256MulInternal consume X1:X0 and Y1:Y0 and return
// the reduced product in T1:T0; PH:PL hold the prime for the
// Sub/Add/Half macros.  Straight-line code, no branches: the schedule
// runs at a fixed sequence of operations regardless of input values.
TEXT ·p256PointDoubleAsm(SB), NOSPLIT, $0
	MOVD P3+0(FP), P3ptr
	MOVD P1+8(FP), P1ptr

	MOVD $p256mul<>+0x00(SB), CPOOL
	VL   16(CPOOL), PL       // low half of P256
	VL   0(CPOOL), PH        // high half of P256

	// X=Z1; Y=Z1; MUL; T-    // T1 = Z1²
	VL   64(P1ptr), X1       // Z1H
	VL   80(P1ptr), X0       // Z1L
	VLR  X0, Y0
	VLR  X1, Y1
	CALL p256SqrInternal<>(SB)

	// SUB(X<X1-T)            // T2 = X1-T1
	VL 0(P1ptr), X1H
	VL 16(P1ptr), X1L
	p256SubInternal(X1,X0,X1H,X1L,T1,T0)

	// ADD(Y<X1+T)            // T1 = X1+T1
	p256AddInternal(Y1,Y0,X1H,X1L,T1,T0)

	// X-  ; Y-  ; MUL; T-    // T2 = T2*T1
	CALL p256MulInternal<>(SB)

	// ADD(T2<T+T); ADD(T2<T2+T)  // T2 = 3*T2
	p256AddInternal(T2H,T2L,T1,T0,T1,T0)
	p256AddInternal(T2H,T2L,T2H,T2L,T1,T0)

	// ADD(X<Y1+Y1)           // Y3 = 2*Y1
	VL 32(P1ptr), Y1H
	VL 48(P1ptr), Y1L
	p256AddInternal(X1,X0,Y1H,Y1L,Y1H,Y1L)

	// X-  ; Y=Z1; MUL; Z3:=T // Z3 = Y3*Z1
	VL   64(P1ptr), Y1       // Z1H
	VL   80(P1ptr), Y0       // Z1L
	CALL p256MulInternal<>(SB)
	// Z3 is final here — store it out immediately (2*Y1*Z1).
	VST  T1, 64(P3ptr)
	VST  T0, 80(P3ptr)

	// X-  ; Y=X ; MUL; T-    // Y3 = Y3²
	// X1:X0 still holds 2*Y1 from the addition above.
	VLR  X0, Y0
	VLR  X1, Y1
	CALL p256SqrInternal<>(SB)

	// X=T ; Y=X1; MUL; T3=T  // T3 = Y3*X1
	VLR  T0, X0
	VLR  T1, X1
	VL   0(P1ptr), Y1        // X1H
	VL   16(P1ptr), Y0       // X1L
	CALL p256MulInternal<>(SB)
	VLR  T0, T3L
	VLR  T1, T3H

	// X-  ; Y=X ; MUL; T-    // Y3 = Y3²
	VLR  X0, Y0
	VLR  X1, Y1
	CALL p256SqrInternal<>(SB)

	// HAL(Y3<T)              // Y3 = half*Y3
	p256HalfInternal(Y3H,Y3L, T1,T0)

	// X=T2; Y=T2; MUL; T-    // X3 = T2²
	VLR  T2L, X0
	VLR  T2H, X1
	VLR  T2L, Y0
	VLR  T2H, Y1
	CALL p256SqrInternal<>(SB)

	// ADD(T1<T3+T3)          // T1 = 2*T3
	p256AddInternal(T1H,T1L,T3H,T3L,T3H,T3L)

	// SUB(X3<T-T1) X3:=X3    // X3 = X3-T1
	p256SubInternal(X3H,X3L,T1,T0,T1H,T1L)
	VST X3H, 0(P3ptr)
	VST X3L, 16(P3ptr)

	// SUB(X<T3-X3)           // T1 = T3-X3
	p256SubInternal(X1,X0,T3H,T3L,X3H,X3L)

	// X-  ; Y-  ; MUL; T-    // T1 = T1*T2
	// Y1:Y0 still holds T2 (3*T2 schedule value) from the squaring setup.
	CALL p256MulInternal<>(SB)

	// SUB(Y3<T-Y3)           // Y3 = T1-Y3
	p256SubInternal(Y3H,Y3L,T1,T0,Y3H,Y3L)

	VST Y3H, 32(P3ptr)
	VST Y3L, 48(P3ptr)
	RET
 |  | 
// Release the register aliases bound for p256PointDoubleAsm so the
// names can be rebound by p256PointAddAsm below.
#undef P3ptr
#undef P1ptr
#undef CPOOL
#undef X3L
#undef X3H
#undef Y3L
#undef Y3H
#undef T1L
#undef T1H
#undef T2L
#undef T2H
#undef T3L
#undef T3H
#undef X1L
#undef X1H
#undef Y1L
#undef Y1H
#undef Z1L
#undef Z1H
#undef TT0
#undef TT1
#undef T2
#undef X0
#undef X1
#undef Y0
#undef Y1
#undef T0
#undef T1
#undef PL
#undef PH
#undef Z3L
#undef Z3H
#undef ZER
#undef SEL1
#undef CAR1
#undef CAR2
 |  | 
 | // p256PointAddAsm(P3, P1, P2 *p256Point) | 
// Pointer arguments (result P3, inputs P1 and P2), constant-pool base,
// and two scalar scratch registers used to build the degenerate-case
// return flag (ISZERO accumulates the result, TRUE holds constant 1).
#define P3ptr  R1
#define P1ptr  R2
#define P2ptr  R3
#define CPOOL  R4
#define ISZERO R5
#define TRUE   R6

// Temporaries in REGs
// One High/Low vector-register pair per 256-bit field element, named
// after the temporaries of the add-1998-cmo-2 schedule below.
#define T1L   V16
#define T1H   V17
#define T2L   V18
#define T2H   V19
#define U1L   V20
#define U1H   V21
#define S1L   V22
#define S1H   V23
#define HL    V24
#define HH    V25
#define RL    V26
#define RH    V27

// Temps for Sub and Add
#define ZER   V6
#define SEL1  V7
#define CAR1  V8
#define CAR2  V9
#define TT0  V11
#define TT1  V12
#define T2   V13

// p256MulAsm Parameters
// p256Sqr/MulInternal read X1:X0 and Y1:Y0, return the result in T1:T0.
#define X0    V0
#define X1    V1
#define Y0    V2
#define Y1    V3
#define T0    V4
#define T1    V5

// PH:PL cache the P-256 prime for the Sub/Add macros.
#define PL    V30
#define PH    V31
 | /* | 
 |  * https://delta.cs.cinvestav.mx/~francisco/arith/julio.pdf "Software Implementation of the NIST Elliptic Curves Over Prime Fields" | 
 |  * | 
 |  * A = X₁×Z₂² | 
 |  * B = Y₁×Z₂³ | 
 |  * C = X₂×Z₁²-A | 
 |  * D = Y₂×Z₁³-B | 
 |  * X₃ = D² - 2A×C² - C³ | 
 |  * Y₃ = D×(A×C² - X₃) - B×C³ | 
 |  * Z₃ = Z₁×Z₂×C | 
 |  * | 
 |  * Three-operand formula (adopted): https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-1998-cmo-2 | 
 |  * Temp storage: T1,T2,U1,H,Z3=X3=Y3,S1,R | 
 |  * | 
 |  * T1 = Z1*Z1 | 
 |  * T2 = Z2*Z2 | 
 |  * U1 = X1*T2 | 
 |  * H  = X2*T1 | 
 |  * H  = H-U1 | 
 |  * Z3 = Z1*Z2 | 
 |  * Z3 = Z3*H << store-out Z3 result reg.. could override Z1, if slices have same backing array | 
 |  * | 
 |  * S1 = Z2*T2 | 
 |  * S1 = Y1*S1 | 
 |  * R  = Z1*T1 | 
 |  * R  = Y2*R | 
 |  * R  = R-S1 | 
 |  * | 
 |  * T1 = H*H | 
 |  * T2 = H*T1 | 
 |  * U1 = U1*T1 | 
 |  * | 
 |  * X3 = R*R | 
 |  * X3 = X3-T2 | 
 |  * T1 = 2*U1 | 
 |  * X3 = X3-T1 << store-out X3 result reg | 
 |  * | 
 |  * T2 = S1*T2 | 
 |  * Y3 = U1-X3 | 
 |  * Y3 = R*Y3 | 
 |  * Y3 = Y3-T2 << store-out Y3 result reg | 
 |  | 
 |  	// X=Z1; Y=Z1; MUL; T-   // T1 = Z1*Z1 | 
 | 	// X-  ; Y=T ; MUL; R=T  // R  = Z1*T1 | 
 | 	// X=X2; Y-  ; MUL; H=T  // H  = X2*T1 | 
 | 	// X=Z2; Y=Z2; MUL; T-   // T2 = Z2*Z2 | 
 | 	// X-  ; Y=T ; MUL; S1=T // S1 = Z2*T2 | 
 | 	// X=X1; Y-  ; MUL; U1=T // U1 = X1*T2 | 
 | 	// SUB(H<H-T)            // H  = H-U1 | 
 | 	// X=Z1; Y=Z2; MUL; T-   // Z3 = Z1*Z2 | 
 | 	// X=T ; Y=H ; MUL; Z3:=T// Z3 = Z3*H << store-out Z3 result reg.. could override Z1, if slices have same backing array | 
 | 	// X=Y1; Y=S1; MUL; S1=T // S1 = Y1*S1 | 
 | 	// X=Y2; Y=R ; MUL; T-   // R  = Y2*R | 
 | 	// SUB(R<T-S1)           // R  = R-S1 | 
 | 	// X=H ; Y=H ; MUL; T-   // T1 = H*H | 
 | 	// X-  ; Y=T ; MUL; T2=T // T2 = H*T1 | 
 | 	// X=U1; Y-  ; MUL; U1=T // U1 = U1*T1 | 
 | 	// X=R ; Y=R ; MUL; T-   // X3 = R*R | 
 | 	// SUB(T<T-T2)           // X3 = X3-T2 | 
 | 	// ADD(X<U1+U1)          // T1 = 2*U1 | 
 | 	// SUB(T<T-X) X3:=T      // X3 = X3-T1 << store-out X3 result reg | 
 | 	// SUB(Y<U1-T)           // Y3 = U1-X3 | 
 | 	// X=R ; Y-  ; MUL; U1=T // Y3 = R*Y3 | 
 | 	// X=S1; Y=T2; MUL; T-   // T2 = S1*T2 | 
 | 	// SUB(T<U1-T); Y3:=T    // Y3 = Y3-T2 << store-out Y3 result reg | 
 | 	*/ | 
// func p256PointAddAsm(P3, P1, P2 *p256Point) int
//
// General Jacobian point addition P3 = P1 + P2 following the
// add-1998-cmo-2 schedule in the comment block above.  Coordinate
// layout per p256Point: X at 0(H)/16(L), Y at 32(H)/48(L),
// Z at 64(H)/80(L).
//
// Writes 1 to ret+24(FP) when BOTH H and R are ≡ 0 mod P (each tested
// as "== 0 or == P"), i.e. the two inputs share an affine x and y and
// the addition formula degenerates; otherwise writes 0.
// NOTE(review): the caller presumably falls back to doubling when
// ret == 1 — confirm at the call site.
TEXT ·p256PointAddAsm(SB), NOSPLIT, $0
	MOVD P3+0(FP), P3ptr
	MOVD P1+8(FP), P1ptr
	MOVD P2+16(FP), P2ptr

	MOVD $p256mul<>+0x00(SB), CPOOL
	VL   16(CPOOL), PL       // low half of P256
	VL   0(CPOOL), PH        // high half of P256

	// X=Z1; Y=Z1; MUL; T-   // T1 = Z1*Z1
	VL   64(P1ptr), X1       // Z1H
	VL   80(P1ptr), X0       // Z1L
	VLR  X0, Y0
	VLR  X1, Y1
	CALL p256SqrInternal<>(SB)

	// X-  ; Y=T ; MUL; R=T  // R  = Z1*T1
	// X1:X0 still holds Z1 from the load above.
	VLR  T0, Y0
	VLR  T1, Y1
	CALL p256MulInternal<>(SB)
	VLR  T0, RL
	VLR  T1, RH

	// X=X2; Y-  ; MUL; H=T  // H  = X2*T1
	// Y1:Y0 still holds T1 = Z1².
	VL   0(P2ptr), X1        // X2H
	VL   16(P2ptr), X0       // X2L
	CALL p256MulInternal<>(SB)
	VLR  T0, HL
	VLR  T1, HH

	// X=Z2; Y=Z2; MUL; T-   // T2 = Z2*Z2
	VL   64(P2ptr), X1       // Z2H
	VL   80(P2ptr), X0       // Z2L
	VLR  X0, Y0
	VLR  X1, Y1
	CALL p256SqrInternal<>(SB)

	// X-  ; Y=T ; MUL; S1=T // S1 = Z2*T2
	VLR  T0, Y0
	VLR  T1, Y1
	CALL p256MulInternal<>(SB)
	VLR  T0, S1L
	VLR  T1, S1H

	// X=X1; Y-  ; MUL; U1=T // U1 = X1*T2
	// Y1:Y0 still holds T2 = Z2².
	VL   0(P1ptr), X1        // X1H
	VL   16(P1ptr), X0       // X1L
	CALL p256MulInternal<>(SB)
	VLR  T0, U1L
	VLR  T1, U1H

	// SUB(H<H-T)            // H  = H-U1
	p256SubInternal(HH,HL,HH,HL,T1,T0)

	// if H == 0 or H^P == 0 then ret=1 else ret=0
	// clobbers T1H and T1L
	// VCEQGS sets the condition code; MOVDEQ conditionally latches TRUE.
	MOVD   $0, ISZERO
	MOVD   $1, TRUE
	VZERO  ZER
	VO     HL, HH, T1H
	VCEQGS ZER, T1H, T1H
	MOVDEQ TRUE, ISZERO
	VX     HL, PL, T1L
	VX     HH, PH, T1H
	VO     T1L, T1H, T1H
	VCEQGS ZER, T1H, T1H
	MOVDEQ TRUE, ISZERO
	MOVD   ISZERO, ret+24(FP)

	// X=Z1; Y=Z2; MUL; T-   // Z3 = Z1*Z2
	VL   64(P1ptr), X1       // Z1H
	VL   80(P1ptr), X0       // Z1L
	VL   64(P2ptr), Y1       // Z2H
	VL   80(P2ptr), Y0       // Z2L
	CALL p256MulInternal<>(SB)

	// X=T ; Y=H ; MUL; Z3:=T// Z3 = Z3*H
	VLR  T0, X0
	VLR  T1, X1
	VLR  HL, Y0
	VLR  HH, Y1
	CALL p256MulInternal<>(SB)
	// Z3 is final — store it out now.
	VST  T1, 64(P3ptr)
	VST  T0, 80(P3ptr)

	// X=Y1; Y=S1; MUL; S1=T // S1 = Y1*S1
	VL   32(P1ptr), X1       // Y1H
	VL   48(P1ptr), X0       // Y1L
	VLR  S1L, Y0
	VLR  S1H, Y1
	CALL p256MulInternal<>(SB)
	VLR  T0, S1L
	VLR  T1, S1H

	// X=Y2; Y=R ; MUL; T-   // R  = Y2*R
	VL   32(P2ptr), X1       // Y2H
	VL   48(P2ptr), X0       // Y2L
	VLR  RL, Y0
	VLR  RH, Y1
	CALL p256MulInternal<>(SB)

	// SUB(R<T-S1)           // R  = T-S1
	p256SubInternal(RH,RL,T1,T0,S1H,S1L)

	// if R == 0 or R^P == 0 then ret=ret else ret=0
	// clobbers T1H and T1L
	// The AND below folds this test into the flag already stored above:
	// ret stays 1 only if both H and R were ≡ 0 mod P.
	MOVD   $0, ISZERO
	MOVD   $1, TRUE
	VZERO  ZER
	VO     RL, RH, T1H
	VCEQGS ZER, T1H, T1H
	MOVDEQ TRUE, ISZERO
	VX     RL, PL, T1L
	VX     RH, PH, T1H
	VO     T1L, T1H, T1H
	VCEQGS ZER, T1H, T1H
	MOVDEQ TRUE, ISZERO
	AND    ret+24(FP), ISZERO
	MOVD   ISZERO, ret+24(FP)

	// X=H ; Y=H ; MUL; T-   // T1 = H*H
	VLR  HL, X0
	VLR  HH, X1
	VLR  HL, Y0
	VLR  HH, Y1
	CALL p256SqrInternal<>(SB)

	// X-  ; Y=T ; MUL; T2=T // T2 = H*T1
	VLR  T0, Y0
	VLR  T1, Y1
	CALL p256MulInternal<>(SB)
	VLR  T0, T2L
	VLR  T1, T2H

	// X=U1; Y-  ; MUL; U1=T // U1 = U1*T1
	// Y1:Y0 still holds T1 = H².
	VLR  U1L, X0
	VLR  U1H, X1
	CALL p256MulInternal<>(SB)
	VLR  T0, U1L
	VLR  T1, U1H

	// X=R ; Y=R ; MUL; T-   // X3 = R*R
	VLR  RL, X0
	VLR  RH, X1
	VLR  RL, Y0
	VLR  RH, Y1
	CALL p256SqrInternal<>(SB)

	// SUB(T<T-T2)           // X3 = X3-T2
	p256SubInternal(T1,T0,T1,T0,T2H,T2L)

	// ADD(X<U1+U1)          // T1 = 2*U1
	p256AddInternal(X1,X0,U1H,U1L,U1H,U1L)

	// SUB(T<T-X) X3:=T      // X3 = X3-T1 << store-out X3 result reg
	p256SubInternal(T1,T0,T1,T0,X1,X0)
	VST T1, 0(P3ptr)
	VST T0, 16(P3ptr)

	// SUB(Y<U1-T)           // Y3 = U1-X3
	p256SubInternal(Y1,Y0,U1H,U1L,T1,T0)

	// X=R ; Y-  ; MUL; U1=T // Y3 = R*Y3
	VLR  RL, X0
	VLR  RH, X1
	CALL p256MulInternal<>(SB)
	VLR  T0, U1L
	VLR  T1, U1H

	// X=S1; Y=T2; MUL; T-   // T2 = S1*T2
	VLR  S1L, X0
	VLR  S1H, X1
	VLR  T2L, Y0
	VLR  T2H, Y1
	CALL p256MulInternal<>(SB)

	// SUB(T<U1-T); Y3:=T    // Y3 = Y3-T2 << store-out Y3 result reg
	p256SubInternal(T1,T0,U1H,U1L,T1,T0)
	VST T1, 32(P3ptr)
	VST T0, 48(P3ptr)

	RET