| // Copyright 2015 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
// This file contains a constant-time, 64-bit assembly implementation of
// P-256. The optimizations performed here are described in detail in:
// S. Gueron and V. Krasnov, "Fast prime field elliptic-curve cryptography with
// 256-bit primes"
| // http://link.springer.com/article/10.1007%2Fs13389-014-0090-x |
| // https://eprint.iacr.org/2013/816.pdf |
| |
| #include "textflag.h" |
| |
| #define res_ptr DI |
| #define x_ptr SI |
| #define y_ptr CX |
| |
| #define acc0 R8 |
| #define acc1 R9 |
| #define acc2 R10 |
| #define acc3 R11 |
| #define acc4 R12 |
| #define acc5 R13 |
| #define t0 R14 |
| #define t1 R15 |
| |
| DATA p256const0<>+0x00(SB)/8, $0x00000000ffffffff |
| DATA p256const1<>+0x00(SB)/8, $0xffffffff00000001 |
| DATA p256ordK0<>+0x00(SB)/8, $0xccd1c8aaee00bc4f |
| DATA p256ord<>+0x00(SB)/8, $0xf3b9cac2fc632551 |
| DATA p256ord<>+0x08(SB)/8, $0xbce6faada7179e84 |
| DATA p256ord<>+0x10(SB)/8, $0xffffffffffffffff |
| DATA p256ord<>+0x18(SB)/8, $0xffffffff00000000 |
| DATA p256one<>+0x00(SB)/8, $0x0000000000000001 |
| DATA p256one<>+0x08(SB)/8, $0xffffffff00000000 |
| DATA p256one<>+0x10(SB)/8, $0xffffffffffffffff |
| DATA p256one<>+0x18(SB)/8, $0x00000000fffffffe |
GLOBL p256const0<>(SB), RODATA, $8
GLOBL p256const1<>(SB), RODATA, $8
GLOBL p256ordK0<>(SB), RODATA, $8
GLOBL p256ord<>(SB), RODATA, $32
GLOBL p256one<>(SB), RODATA, $32
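
// For reference (derived from the values above): p256const0 and p256const1 are the two
// non-trivial 64-bit limbs of the field prime p = 2^256 - 2^224 + 2^192 + 2^96 - 1
// (little-endian limbs: 2^64-1, p256const0, 0, p256const1). p256ord holds the group
// order n in little-endian limbs, p256ordK0 is the Montgomery factor -n^(-1) mod 2^64,
// and p256one is 1 in the Montgomery domain, i.e. 2^256 mod p.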
| |
| /* ---------------------------------------*/ |
| // func p256LittleToBig(res []byte, in []uint64) |
| TEXT ·p256LittleToBig(SB),NOSPLIT,$0 |
| JMP ·p256BigToLittle(SB) |
| /* ---------------------------------------*/ |
| // func p256BigToLittle(res []uint64, in []byte) |
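// Both conversions are the same transformation: each 64-bit limb is byte-swapped and
// the limb order is reversed, so p256LittleToBig simply jumps here.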
| TEXT ·p256BigToLittle(SB),NOSPLIT,$0 |
| MOVQ res+0(FP), res_ptr |
| MOVQ in+24(FP), x_ptr |
| |
| MOVQ (8*0)(x_ptr), acc0 |
| MOVQ (8*1)(x_ptr), acc1 |
| MOVQ (8*2)(x_ptr), acc2 |
| MOVQ (8*3)(x_ptr), acc3 |
| |
| BSWAPQ acc0 |
| BSWAPQ acc1 |
| BSWAPQ acc2 |
| BSWAPQ acc3 |
| |
| MOVQ acc3, (8*0)(res_ptr) |
| MOVQ acc2, (8*1)(res_ptr) |
| MOVQ acc1, (8*2)(res_ptr) |
| MOVQ acc0, (8*3)(res_ptr) |
| |
| RET |
| /* ---------------------------------------*/ |
| // func p256MovCond(res, a, b []uint64, cond int) |
// If cond == 0, res = b; otherwise res = a.
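// Sketch of the selection: X12 becomes all ones when cond == 0 and all zeros otherwise,
// so every 128-bit lane computes (a AND NOT mask) XOR (b AND mask), a branch-free,
// constant-time select.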
| TEXT ·p256MovCond(SB),NOSPLIT,$0 |
| MOVQ res+0(FP), res_ptr |
| MOVQ a+24(FP), x_ptr |
| MOVQ b+48(FP), y_ptr |
| MOVQ cond+72(FP), X12 |
| |
| PXOR X13, X13 |
| PSHUFD $0, X12, X12 |
| PCMPEQL X13, X12 |
| |
| MOVOU X12, X0 |
| PANDN (16*0)(x_ptr), X0 |
| MOVOU X12, X1 |
| PANDN (16*1)(x_ptr), X1 |
| MOVOU X12, X2 |
| PANDN (16*2)(x_ptr), X2 |
| MOVOU X12, X3 |
| PANDN (16*3)(x_ptr), X3 |
| MOVOU X12, X4 |
| PANDN (16*4)(x_ptr), X4 |
| MOVOU X12, X5 |
| PANDN (16*5)(x_ptr), X5 |
| |
| MOVOU (16*0)(y_ptr), X6 |
| MOVOU (16*1)(y_ptr), X7 |
| MOVOU (16*2)(y_ptr), X8 |
| MOVOU (16*3)(y_ptr), X9 |
| MOVOU (16*4)(y_ptr), X10 |
| MOVOU (16*5)(y_ptr), X11 |
| |
| PAND X12, X6 |
| PAND X12, X7 |
| PAND X12, X8 |
| PAND X12, X9 |
| PAND X12, X10 |
| PAND X12, X11 |
| |
| PXOR X6, X0 |
| PXOR X7, X1 |
| PXOR X8, X2 |
| PXOR X9, X3 |
| PXOR X10, X4 |
| PXOR X11, X5 |
| |
| MOVOU X0, (16*0)(res_ptr) |
| MOVOU X1, (16*1)(res_ptr) |
| MOVOU X2, (16*2)(res_ptr) |
| MOVOU X3, (16*3)(res_ptr) |
| MOVOU X4, (16*4)(res_ptr) |
| MOVOU X5, (16*5)(res_ptr) |
| |
| RET |
| /* ---------------------------------------*/ |
| // func p256NegCond(val []uint64, cond int) |
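// If cond != 0, val is replaced by p - val (the negation of a reduced, nonzero value
// modulo p); if cond == 0, val is left unchanged. The subtraction is always performed
// and the result is chosen with CMOVs, so the timing does not depend on cond.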
| TEXT ·p256NegCond(SB),NOSPLIT,$0 |
| MOVQ val+0(FP), res_ptr |
| MOVQ cond+24(FP), t0 |
// acc = p (the field prime)
| MOVQ $-1, acc0 |
| MOVQ p256const0<>(SB), acc1 |
| MOVQ $0, acc2 |
| MOVQ p256const1<>(SB), acc3 |
| // Load the original value |
| MOVQ (8*0)(res_ptr), acc5 |
| MOVQ (8*1)(res_ptr), x_ptr |
| MOVQ (8*2)(res_ptr), y_ptr |
| MOVQ (8*3)(res_ptr), t1 |
| // Speculatively subtract |
| SUBQ acc5, acc0 |
| SBBQ x_ptr, acc1 |
| SBBQ y_ptr, acc2 |
| SBBQ t1, acc3 |
| // If condition is 0, keep original value |
| TESTQ t0, t0 |
| CMOVQEQ acc5, acc0 |
| CMOVQEQ x_ptr, acc1 |
| CMOVQEQ y_ptr, acc2 |
| CMOVQEQ t1, acc3 |
| // Store result |
| MOVQ acc0, (8*0)(res_ptr) |
| MOVQ acc1, (8*1)(res_ptr) |
| MOVQ acc2, (8*2)(res_ptr) |
| MOVQ acc3, (8*3)(res_ptr) |
| |
| RET |
| /* ---------------------------------------*/ |
| // func p256Sqr(res, in []uint64) |
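// Montgomery squaring: res = in * in * 2^-256 mod p. In outline, a schoolbook square
// is followed by four reduction rounds; since p[0] = 2^64 - 1 the per-round Montgomery
// factor is simply the current low limb, and since p[1] = 2^32 - 1 and p[2] = 0 the
// product limb*p is accumulated with shifts plus a single MULQ by p256const1.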
| TEXT ·p256Sqr(SB),NOSPLIT,$0 |
| MOVQ res+0(FP), res_ptr |
| MOVQ in+24(FP), x_ptr |
| // y[1:] * y[0] |
| MOVQ (8*0)(x_ptr), t0 |
| |
| MOVQ (8*1)(x_ptr), AX |
| MULQ t0 |
| MOVQ AX, acc1 |
| MOVQ DX, acc2 |
| |
| MOVQ (8*2)(x_ptr), AX |
| MULQ t0 |
| ADDQ AX, acc2 |
| ADCQ $0, DX |
| MOVQ DX, acc3 |
| |
| MOVQ (8*3)(x_ptr), AX |
| MULQ t0 |
| ADDQ AX, acc3 |
| ADCQ $0, DX |
| MOVQ DX, acc4 |
| // y[2:] * y[1] |
| MOVQ (8*1)(x_ptr), t0 |
| |
| MOVQ (8*2)(x_ptr), AX |
| MULQ t0 |
| ADDQ AX, acc3 |
| ADCQ $0, DX |
| MOVQ DX, t1 |
| |
| MOVQ (8*3)(x_ptr), AX |
| MULQ t0 |
| ADDQ t1, acc4 |
| ADCQ $0, DX |
| ADDQ AX, acc4 |
| ADCQ $0, DX |
| MOVQ DX, acc5 |
| // y[3] * y[2] |
| MOVQ (8*2)(x_ptr), t0 |
| |
| MOVQ (8*3)(x_ptr), AX |
| MULQ t0 |
| ADDQ AX, acc5 |
| ADCQ $0, DX |
| MOVQ DX, y_ptr |
| XORQ t1, t1 |
| // *2 |
| ADDQ acc1, acc1 |
| ADCQ acc2, acc2 |
| ADCQ acc3, acc3 |
| ADCQ acc4, acc4 |
| ADCQ acc5, acc5 |
| ADCQ y_ptr, y_ptr |
| ADCQ $0, t1 |
| // Missing products |
| MOVQ (8*0)(x_ptr), AX |
| MULQ AX |
| MOVQ AX, acc0 |
| MOVQ DX, t0 |
| |
| MOVQ (8*1)(x_ptr), AX |
| MULQ AX |
| ADDQ t0, acc1 |
| ADCQ AX, acc2 |
| ADCQ $0, DX |
| MOVQ DX, t0 |
| |
| MOVQ (8*2)(x_ptr), AX |
| MULQ AX |
| ADDQ t0, acc3 |
| ADCQ AX, acc4 |
| ADCQ $0, DX |
| MOVQ DX, t0 |
| |
| MOVQ (8*3)(x_ptr), AX |
| MULQ AX |
| ADDQ t0, acc5 |
| ADCQ AX, y_ptr |
| ADCQ DX, t1 |
| MOVQ t1, x_ptr |
| // First reduction step |
| MOVQ acc0, AX |
| MOVQ acc0, t1 |
| SHLQ $32, acc0 |
| MULQ p256const1<>(SB) |
| SHRQ $32, t1 |
| ADDQ acc0, acc1 |
| ADCQ t1, acc2 |
| ADCQ AX, acc3 |
| ADCQ $0, DX |
| MOVQ DX, acc0 |
| // Second reduction step |
| MOVQ acc1, AX |
| MOVQ acc1, t1 |
| SHLQ $32, acc1 |
| MULQ p256const1<>(SB) |
| SHRQ $32, t1 |
| ADDQ acc1, acc2 |
| ADCQ t1, acc3 |
| ADCQ AX, acc0 |
| ADCQ $0, DX |
| MOVQ DX, acc1 |
| // Third reduction step |
| MOVQ acc2, AX |
| MOVQ acc2, t1 |
| SHLQ $32, acc2 |
| MULQ p256const1<>(SB) |
| SHRQ $32, t1 |
| ADDQ acc2, acc3 |
| ADCQ t1, acc0 |
| ADCQ AX, acc1 |
| ADCQ $0, DX |
| MOVQ DX, acc2 |
| // Last reduction step |
| XORQ t0, t0 |
| MOVQ acc3, AX |
| MOVQ acc3, t1 |
| SHLQ $32, acc3 |
| MULQ p256const1<>(SB) |
| SHRQ $32, t1 |
| ADDQ acc3, acc0 |
| ADCQ t1, acc1 |
| ADCQ AX, acc2 |
| ADCQ $0, DX |
| MOVQ DX, acc3 |
| // Add bits [511:256] of the sqr result |
| ADCQ acc4, acc0 |
| ADCQ acc5, acc1 |
| ADCQ y_ptr, acc2 |
| ADCQ x_ptr, acc3 |
| ADCQ $0, t0 |
| |
| MOVQ acc0, acc4 |
| MOVQ acc1, acc5 |
| MOVQ acc2, y_ptr |
| MOVQ acc3, t1 |
| // Subtract p256 |
| SUBQ $-1, acc0 |
SBBQ p256const0<>(SB), acc1
| SBBQ $0, acc2 |
| SBBQ p256const1<>(SB), acc3 |
| SBBQ $0, t0 |
| |
| CMOVQCS acc4, acc0 |
| CMOVQCS acc5, acc1 |
| CMOVQCS y_ptr, acc2 |
| CMOVQCS t1, acc3 |
| |
| MOVQ acc0, (8*0)(res_ptr) |
| MOVQ acc1, (8*1)(res_ptr) |
| MOVQ acc2, (8*2)(res_ptr) |
| MOVQ acc3, (8*3)(res_ptr) |
| |
| RET |
| /* ---------------------------------------*/ |
| // func p256Mul(res, in1, in2 []uint64) |
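// Montgomery multiplication: res = in1 * in2 * 2^-256 mod p, interleaving the partial
// products with the same shift-based reduction rounds used in p256Sqr above.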
| TEXT ·p256Mul(SB),NOSPLIT,$0 |
| MOVQ res+0(FP), res_ptr |
| MOVQ in1+24(FP), x_ptr |
| MOVQ in2+48(FP), y_ptr |
| // x * y[0] |
| MOVQ (8*0)(y_ptr), t0 |
| |
| MOVQ (8*0)(x_ptr), AX |
| MULQ t0 |
| MOVQ AX, acc0 |
| MOVQ DX, acc1 |
| |
| MOVQ (8*1)(x_ptr), AX |
| MULQ t0 |
| ADDQ AX, acc1 |
| ADCQ $0, DX |
| MOVQ DX, acc2 |
| |
| MOVQ (8*2)(x_ptr), AX |
| MULQ t0 |
| ADDQ AX, acc2 |
| ADCQ $0, DX |
| MOVQ DX, acc3 |
| |
| MOVQ (8*3)(x_ptr), AX |
| MULQ t0 |
| ADDQ AX, acc3 |
| ADCQ $0, DX |
| MOVQ DX, acc4 |
| XORQ acc5, acc5 |
| // First reduction step |
| MOVQ acc0, AX |
| MOVQ acc0, t1 |
| SHLQ $32, acc0 |
| MULQ p256const1<>(SB) |
| SHRQ $32, t1 |
| ADDQ acc0, acc1 |
| ADCQ t1, acc2 |
| ADCQ AX, acc3 |
| ADCQ DX, acc4 |
| ADCQ $0, acc5 |
| XORQ acc0, acc0 |
| // x * y[1] |
| MOVQ (8*1)(y_ptr), t0 |
| |
| MOVQ (8*0)(x_ptr), AX |
| MULQ t0 |
| ADDQ AX, acc1 |
| ADCQ $0, DX |
| MOVQ DX, t1 |
| |
| MOVQ (8*1)(x_ptr), AX |
| MULQ t0 |
| ADDQ t1, acc2 |
| ADCQ $0, DX |
| ADDQ AX, acc2 |
| ADCQ $0, DX |
| MOVQ DX, t1 |
| |
| MOVQ (8*2)(x_ptr), AX |
| MULQ t0 |
| ADDQ t1, acc3 |
| ADCQ $0, DX |
| ADDQ AX, acc3 |
| ADCQ $0, DX |
| MOVQ DX, t1 |
| |
| MOVQ (8*3)(x_ptr), AX |
| MULQ t0 |
| ADDQ t1, acc4 |
| ADCQ $0, DX |
| ADDQ AX, acc4 |
| ADCQ DX, acc5 |
| ADCQ $0, acc0 |
| // Second reduction step |
| MOVQ acc1, AX |
| MOVQ acc1, t1 |
| SHLQ $32, acc1 |
| MULQ p256const1<>(SB) |
| SHRQ $32, t1 |
| ADDQ acc1, acc2 |
| ADCQ t1, acc3 |
| ADCQ AX, acc4 |
| ADCQ DX, acc5 |
| ADCQ $0, acc0 |
| XORQ acc1, acc1 |
| // x * y[2] |
| MOVQ (8*2)(y_ptr), t0 |
| |
| MOVQ (8*0)(x_ptr), AX |
| MULQ t0 |
| ADDQ AX, acc2 |
| ADCQ $0, DX |
| MOVQ DX, t1 |
| |
| MOVQ (8*1)(x_ptr), AX |
| MULQ t0 |
| ADDQ t1, acc3 |
| ADCQ $0, DX |
| ADDQ AX, acc3 |
| ADCQ $0, DX |
| MOVQ DX, t1 |
| |
| MOVQ (8*2)(x_ptr), AX |
| MULQ t0 |
| ADDQ t1, acc4 |
| ADCQ $0, DX |
| ADDQ AX, acc4 |
| ADCQ $0, DX |
| MOVQ DX, t1 |
| |
| MOVQ (8*3)(x_ptr), AX |
| MULQ t0 |
| ADDQ t1, acc5 |
| ADCQ $0, DX |
| ADDQ AX, acc5 |
| ADCQ DX, acc0 |
| ADCQ $0, acc1 |
| // Third reduction step |
| MOVQ acc2, AX |
| MOVQ acc2, t1 |
| SHLQ $32, acc2 |
| MULQ p256const1<>(SB) |
| SHRQ $32, t1 |
| ADDQ acc2, acc3 |
| ADCQ t1, acc4 |
| ADCQ AX, acc5 |
| ADCQ DX, acc0 |
| ADCQ $0, acc1 |
| XORQ acc2, acc2 |
| // x * y[3] |
| MOVQ (8*3)(y_ptr), t0 |
| |
| MOVQ (8*0)(x_ptr), AX |
| MULQ t0 |
| ADDQ AX, acc3 |
| ADCQ $0, DX |
| MOVQ DX, t1 |
| |
| MOVQ (8*1)(x_ptr), AX |
| MULQ t0 |
| ADDQ t1, acc4 |
| ADCQ $0, DX |
| ADDQ AX, acc4 |
| ADCQ $0, DX |
| MOVQ DX, t1 |
| |
| MOVQ (8*2)(x_ptr), AX |
| MULQ t0 |
| ADDQ t1, acc5 |
| ADCQ $0, DX |
| ADDQ AX, acc5 |
| ADCQ $0, DX |
| MOVQ DX, t1 |
| |
| MOVQ (8*3)(x_ptr), AX |
| MULQ t0 |
| ADDQ t1, acc0 |
| ADCQ $0, DX |
| ADDQ AX, acc0 |
| ADCQ DX, acc1 |
| ADCQ $0, acc2 |
| // Last reduction step |
| MOVQ acc3, AX |
| MOVQ acc3, t1 |
| SHLQ $32, acc3 |
| MULQ p256const1<>(SB) |
| SHRQ $32, t1 |
| ADDQ acc3, acc4 |
| ADCQ t1, acc5 |
| ADCQ AX, acc0 |
| ADCQ DX, acc1 |
| ADCQ $0, acc2 |
| // Copy result [255:0] |
| MOVQ acc4, x_ptr |
| MOVQ acc5, acc3 |
| MOVQ acc0, t0 |
| MOVQ acc1, t1 |
| // Subtract p256 |
| SUBQ $-1, acc4 |
SBBQ p256const0<>(SB), acc5
| SBBQ $0, acc0 |
| SBBQ p256const1<>(SB), acc1 |
| SBBQ $0, acc2 |
| |
| CMOVQCS x_ptr, acc4 |
| CMOVQCS acc3, acc5 |
| CMOVQCS t0, acc0 |
| CMOVQCS t1, acc1 |
| |
| MOVQ acc4, (8*0)(res_ptr) |
| MOVQ acc5, (8*1)(res_ptr) |
| MOVQ acc0, (8*2)(res_ptr) |
| MOVQ acc1, (8*3)(res_ptr) |
| |
| RET |
| /* ---------------------------------------*/ |
| // func p256FromMont(res, in []uint64) |
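// Conversion out of the Montgomery domain: res = in * 2^-256 mod p, i.e. a full
// Montgomery reduction of the input with no second operand to multiply by.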
| TEXT ·p256FromMont(SB),NOSPLIT,$0 |
| MOVQ res+0(FP), res_ptr |
| MOVQ in+24(FP), x_ptr |
| |
| MOVQ (8*0)(x_ptr), acc0 |
| MOVQ (8*1)(x_ptr), acc1 |
| MOVQ (8*2)(x_ptr), acc2 |
| MOVQ (8*3)(x_ptr), acc3 |
| XORQ acc4, acc4 |
| |
// Only the Montgomery reduction is performed; no multiplications are needed
| // First stage |
| MOVQ acc0, AX |
| MOVQ acc0, t1 |
| SHLQ $32, acc0 |
| MULQ p256const1<>(SB) |
| SHRQ $32, t1 |
| ADDQ acc0, acc1 |
| ADCQ t1, acc2 |
| ADCQ AX, acc3 |
| ADCQ DX, acc4 |
| XORQ acc5, acc5 |
| // Second stage |
| MOVQ acc1, AX |
| MOVQ acc1, t1 |
| SHLQ $32, acc1 |
| MULQ p256const1<>(SB) |
| SHRQ $32, t1 |
| ADDQ acc1, acc2 |
| ADCQ t1, acc3 |
| ADCQ AX, acc4 |
| ADCQ DX, acc5 |
| XORQ acc0, acc0 |
| // Third stage |
| MOVQ acc2, AX |
| MOVQ acc2, t1 |
| SHLQ $32, acc2 |
| MULQ p256const1<>(SB) |
| SHRQ $32, t1 |
| ADDQ acc2, acc3 |
| ADCQ t1, acc4 |
| ADCQ AX, acc5 |
| ADCQ DX, acc0 |
| XORQ acc1, acc1 |
| // Last stage |
| MOVQ acc3, AX |
| MOVQ acc3, t1 |
| SHLQ $32, acc3 |
| MULQ p256const1<>(SB) |
| SHRQ $32, t1 |
| ADDQ acc3, acc4 |
| ADCQ t1, acc5 |
| ADCQ AX, acc0 |
| ADCQ DX, acc1 |
| |
| MOVQ acc4, x_ptr |
| MOVQ acc5, acc3 |
| MOVQ acc0, t0 |
| MOVQ acc1, t1 |
| |
| SUBQ $-1, acc4 |
| SBBQ p256const0<>(SB), acc5 |
| SBBQ $0, acc0 |
| SBBQ p256const1<>(SB), acc1 |
| |
| CMOVQCS x_ptr, acc4 |
| CMOVQCS acc3, acc5 |
| CMOVQCS t0, acc0 |
| CMOVQCS t1, acc1 |
| |
| MOVQ acc4, (8*0)(res_ptr) |
| MOVQ acc5, (8*1)(res_ptr) |
| MOVQ acc0, (8*2)(res_ptr) |
| MOVQ acc1, (8*3)(res_ptr) |
| |
| RET |
| /* ---------------------------------------*/ |
// Constant-time point access to an arbitrary point table.
| // Indexed from 1 to 15, with -1 offset |
| // (index 0 is implicitly point at infinity) |
| // func p256Select(point, table []uint64, idx int) |
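// Sketch of the scan: every entry in the table is read unconditionally (the loop runs
// 16 times, 96 bytes per entry); an entry is ANDed with an all-ones mask only when its
// 1-based position equals idx and XORed into the accumulator, so the memory access
// pattern is independent of idx.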
| TEXT ·p256Select(SB),NOSPLIT,$0 |
| MOVQ idx+48(FP),AX |
| MOVQ table+24(FP),DI |
| MOVQ point+0(FP),DX |
| |
| PXOR X15, X15 // X15 = 0 |
| PCMPEQL X14, X14 // X14 = -1 |
| PSUBL X14, X15 // X15 = 1 |
| MOVL AX, X14 |
| PSHUFD $0, X14, X14 |
| |
| PXOR X0, X0 |
| PXOR X1, X1 |
| PXOR X2, X2 |
| PXOR X3, X3 |
| PXOR X4, X4 |
| PXOR X5, X5 |
| MOVQ $16, AX |
| |
| MOVOU X15, X13 |
| |
| loop_select: |
| |
| MOVOU X13, X12 |
| PADDL X15, X13 |
| PCMPEQL X14, X12 |
| |
| MOVOU (16*0)(DI), X6 |
| MOVOU (16*1)(DI), X7 |
| MOVOU (16*2)(DI), X8 |
| MOVOU (16*3)(DI), X9 |
| MOVOU (16*4)(DI), X10 |
| MOVOU (16*5)(DI), X11 |
| ADDQ $(16*6), DI |
| |
| PAND X12, X6 |
| PAND X12, X7 |
| PAND X12, X8 |
| PAND X12, X9 |
| PAND X12, X10 |
| PAND X12, X11 |
| |
| PXOR X6, X0 |
| PXOR X7, X1 |
| PXOR X8, X2 |
| PXOR X9, X3 |
| PXOR X10, X4 |
| PXOR X11, X5 |
| |
| DECQ AX |
| JNE loop_select |
| |
| MOVOU X0, (16*0)(DX) |
| MOVOU X1, (16*1)(DX) |
| MOVOU X2, (16*2)(DX) |
| MOVOU X3, (16*3)(DX) |
| MOVOU X4, (16*4)(DX) |
| MOVOU X5, (16*5)(DX) |
| |
| RET |
| /* ---------------------------------------*/ |
// Constant-time point access to the base point table.
| // func p256SelectBase(point, table []uint64, idx int) |
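// Same constant-time scan as p256Select, but over 64 table entries of 64 bytes
// (two field elements) each; the loop body processes two entries per iteration.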
| TEXT ·p256SelectBase(SB),NOSPLIT,$0 |
| MOVQ idx+48(FP),AX |
| MOVQ table+24(FP),DI |
| MOVQ point+0(FP),DX |
| |
| PXOR X15, X15 // X15 = 0 |
| PCMPEQL X14, X14 // X14 = -1 |
| PSUBL X14, X15 // X15 = 1 |
| MOVL AX, X14 |
| PSHUFD $0, X14, X14 |
| |
| PXOR X0, X0 |
| PXOR X1, X1 |
| PXOR X2, X2 |
| PXOR X3, X3 |
| MOVQ $32, AX |
| |
| MOVOU X15, X13 |
| |
| loop_select_base: |
| |
| MOVOU X13, X12 |
| PADDL X15, X13 |
| PCMPEQL X14, X12 |
| |
| MOVOU (16*0)(DI), X4 |
| MOVOU (16*1)(DI), X5 |
| MOVOU (16*2)(DI), X6 |
| MOVOU (16*3)(DI), X7 |
| |
| MOVOU (16*4)(DI), X8 |
| MOVOU (16*5)(DI), X9 |
| MOVOU (16*6)(DI), X10 |
| MOVOU (16*7)(DI), X11 |
| |
| ADDQ $(16*8), DI |
| |
| PAND X12, X4 |
| PAND X12, X5 |
| PAND X12, X6 |
| PAND X12, X7 |
| |
| MOVOU X13, X12 |
| PADDL X15, X13 |
| PCMPEQL X14, X12 |
| |
| PAND X12, X8 |
| PAND X12, X9 |
| PAND X12, X10 |
| PAND X12, X11 |
| |
| PXOR X4, X0 |
| PXOR X5, X1 |
| PXOR X6, X2 |
| PXOR X7, X3 |
| |
| PXOR X8, X0 |
| PXOR X9, X1 |
| PXOR X10, X2 |
| PXOR X11, X3 |
| |
| DECQ AX |
| JNE loop_select_base |
| |
| MOVOU X0, (16*0)(DX) |
| MOVOU X1, (16*1)(DX) |
| MOVOU X2, (16*2)(DX) |
| MOVOU X3, (16*3)(DX) |
| |
| RET |
| /* ---------------------------------------*/ |
| // func p256OrdMul(res, in1, in2 []uint64) |
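// Montgomery multiplication modulo the group order n: res = in1 * in2 * 2^-256 mod n.
// Each reduction round derives its factor as (low limb) * p256ordK0 mod 2^64 and then
// accumulates factor * n, the word-by-word Montgomery reduction.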
| TEXT ·p256OrdMul(SB),NOSPLIT,$0 |
| MOVQ res+0(FP), res_ptr |
| MOVQ in1+24(FP), x_ptr |
| MOVQ in2+48(FP), y_ptr |
| // x * y[0] |
| MOVQ (8*0)(y_ptr), t0 |
| |
| MOVQ (8*0)(x_ptr), AX |
| MULQ t0 |
| MOVQ AX, acc0 |
| MOVQ DX, acc1 |
| |
| MOVQ (8*1)(x_ptr), AX |
| MULQ t0 |
| ADDQ AX, acc1 |
| ADCQ $0, DX |
| MOVQ DX, acc2 |
| |
| MOVQ (8*2)(x_ptr), AX |
| MULQ t0 |
| ADDQ AX, acc2 |
| ADCQ $0, DX |
| MOVQ DX, acc3 |
| |
| MOVQ (8*3)(x_ptr), AX |
| MULQ t0 |
| ADDQ AX, acc3 |
| ADCQ $0, DX |
| MOVQ DX, acc4 |
| XORQ acc5, acc5 |
| // First reduction step |
| MOVQ acc0, AX |
| MULQ p256ordK0<>(SB) |
| MOVQ AX, t0 |
| |
| MOVQ p256ord<>+0x00(SB), AX |
| MULQ t0 |
| ADDQ AX, acc0 |
| ADCQ $0, DX |
| MOVQ DX, t1 |
| |
| MOVQ p256ord<>+0x08(SB), AX |
| MULQ t0 |
| ADDQ t1, acc1 |
| ADCQ $0, DX |
| ADDQ AX, acc1 |
| ADCQ $0, DX |
| MOVQ DX, t1 |
| |
| MOVQ p256ord<>+0x10(SB), AX |
| MULQ t0 |
| ADDQ t1, acc2 |
| ADCQ $0, DX |
| ADDQ AX, acc2 |
| ADCQ $0, DX |
| MOVQ DX, t1 |
| |
| MOVQ p256ord<>+0x18(SB), AX |
| MULQ t0 |
| ADDQ t1, acc3 |
| ADCQ $0, DX |
| ADDQ AX, acc3 |
| ADCQ DX, acc4 |
| ADCQ $0, acc5 |
| // x * y[1] |
| MOVQ (8*1)(y_ptr), t0 |
| |
| MOVQ (8*0)(x_ptr), AX |
| MULQ t0 |
| ADDQ AX, acc1 |
| ADCQ $0, DX |
| MOVQ DX, t1 |
| |
| MOVQ (8*1)(x_ptr), AX |
| MULQ t0 |
| ADDQ t1, acc2 |
| ADCQ $0, DX |
| ADDQ AX, acc2 |
| ADCQ $0, DX |
| MOVQ DX, t1 |
| |
| MOVQ (8*2)(x_ptr), AX |
| MULQ t0 |
| ADDQ t1, acc3 |
| ADCQ $0, DX |
| ADDQ AX, acc3 |
| ADCQ $0, DX |
| MOVQ DX, t1 |
| |
| MOVQ (8*3)(x_ptr), AX |
| MULQ t0 |
| ADDQ t1, acc4 |
| ADCQ $0, DX |
| ADDQ AX, acc4 |
| ADCQ DX, acc5 |
| ADCQ $0, acc0 |
| // Second reduction step |
| MOVQ acc1, AX |
| MULQ p256ordK0<>(SB) |
| MOVQ AX, t0 |
| |
| MOVQ p256ord<>+0x00(SB), AX |
| MULQ t0 |
| ADDQ AX, acc1 |
| ADCQ $0, DX |
| MOVQ DX, t1 |
| |
| MOVQ p256ord<>+0x08(SB), AX |
| MULQ t0 |
| ADDQ t1, acc2 |
| ADCQ $0, DX |
| ADDQ AX, acc2 |
| ADCQ $0, DX |
| MOVQ DX, t1 |
| |
| MOVQ p256ord<>+0x10(SB), AX |
| MULQ t0 |
| ADDQ t1, acc3 |
| ADCQ $0, DX |
| ADDQ AX, acc3 |
| ADCQ $0, DX |
| MOVQ DX, t1 |
| |
| MOVQ p256ord<>+0x18(SB), AX |
| MULQ t0 |
| ADDQ t1, acc4 |
| ADCQ $0, DX |
| ADDQ AX, acc4 |
| ADCQ DX, acc5 |
| ADCQ $0, acc0 |
| // x * y[2] |
| MOVQ (8*2)(y_ptr), t0 |
| |
| MOVQ (8*0)(x_ptr), AX |
| MULQ t0 |
| ADDQ AX, acc2 |
| ADCQ $0, DX |
| MOVQ DX, t1 |
| |
| MOVQ (8*1)(x_ptr), AX |
| MULQ t0 |
| ADDQ t1, acc3 |
| ADCQ $0, DX |
| ADDQ AX, acc3 |
| ADCQ $0, DX |
| MOVQ DX, t1 |
| |
| MOVQ (8*2)(x_ptr), AX |
| MULQ t0 |
| ADDQ t1, acc4 |
| ADCQ $0, DX |
| ADDQ AX, acc4 |
| ADCQ $0, DX |
| MOVQ DX, t1 |
| |
| MOVQ (8*3)(x_ptr), AX |
| MULQ t0 |
| ADDQ t1, acc5 |
| ADCQ $0, DX |
| ADDQ AX, acc5 |
| ADCQ DX, acc0 |
| ADCQ $0, acc1 |
| // Third reduction step |
| MOVQ acc2, AX |
| MULQ p256ordK0<>(SB) |
| MOVQ AX, t0 |
| |
| MOVQ p256ord<>+0x00(SB), AX |
| MULQ t0 |
| ADDQ AX, acc2 |
| ADCQ $0, DX |
| MOVQ DX, t1 |
| |
| MOVQ p256ord<>+0x08(SB), AX |
| MULQ t0 |
| ADDQ t1, acc3 |
| ADCQ $0, DX |
| ADDQ AX, acc3 |
| ADCQ $0, DX |
| MOVQ DX, t1 |
| |
| MOVQ p256ord<>+0x10(SB), AX |
| MULQ t0 |
| ADDQ t1, acc4 |
| ADCQ $0, DX |
| ADDQ AX, acc4 |
| ADCQ $0, DX |
| MOVQ DX, t1 |
| |
| MOVQ p256ord<>+0x18(SB), AX |
| MULQ t0 |
| ADDQ t1, acc5 |
| ADCQ $0, DX |
| ADDQ AX, acc5 |
| ADCQ DX, acc0 |
| ADCQ $0, acc1 |
| // x * y[3] |
| MOVQ (8*3)(y_ptr), t0 |
| |
| MOVQ (8*0)(x_ptr), AX |
| MULQ t0 |
| ADDQ AX, acc3 |
| ADCQ $0, DX |
| MOVQ DX, t1 |
| |
| MOVQ (8*1)(x_ptr), AX |
| MULQ t0 |
| ADDQ t1, acc4 |
| ADCQ $0, DX |
| ADDQ AX, acc4 |
| ADCQ $0, DX |
| MOVQ DX, t1 |
| |
| MOVQ (8*2)(x_ptr), AX |
| MULQ t0 |
| ADDQ t1, acc5 |
| ADCQ $0, DX |
| ADDQ AX, acc5 |
| ADCQ $0, DX |
| MOVQ DX, t1 |
| |
| MOVQ (8*3)(x_ptr), AX |
| MULQ t0 |
| ADDQ t1, acc0 |
| ADCQ $0, DX |
| ADDQ AX, acc0 |
| ADCQ DX, acc1 |
| ADCQ $0, acc2 |
| // Last reduction step |
| MOVQ acc3, AX |
| MULQ p256ordK0<>(SB) |
| MOVQ AX, t0 |
| |
| MOVQ p256ord<>+0x00(SB), AX |
| MULQ t0 |
| ADDQ AX, acc3 |
| ADCQ $0, DX |
| MOVQ DX, t1 |
| |
| MOVQ p256ord<>+0x08(SB), AX |
| MULQ t0 |
| ADDQ t1, acc4 |
| ADCQ $0, DX |
| ADDQ AX, acc4 |
| ADCQ $0, DX |
| MOVQ DX, t1 |
| |
| MOVQ p256ord<>+0x10(SB), AX |
| MULQ t0 |
| ADDQ t1, acc5 |
| ADCQ $0, DX |
| ADDQ AX, acc5 |
| ADCQ $0, DX |
| MOVQ DX, t1 |
| |
| MOVQ p256ord<>+0x18(SB), AX |
| MULQ t0 |
| ADDQ t1, acc0 |
| ADCQ $0, DX |
| ADDQ AX, acc0 |
| ADCQ DX, acc1 |
| ADCQ $0, acc2 |
| // Copy result [255:0] |
| MOVQ acc4, x_ptr |
| MOVQ acc5, acc3 |
| MOVQ acc0, t0 |
| MOVQ acc1, t1 |
| // Subtract p256 |
| SUBQ p256ord<>+0x00(SB), acc4 |
SBBQ p256ord<>+0x08(SB), acc5
| SBBQ p256ord<>+0x10(SB), acc0 |
| SBBQ p256ord<>+0x18(SB), acc1 |
| SBBQ $0, acc2 |
| |
| CMOVQCS x_ptr, acc4 |
| CMOVQCS acc3, acc5 |
| CMOVQCS t0, acc0 |
| CMOVQCS t1, acc1 |
| |
| MOVQ acc4, (8*0)(res_ptr) |
| MOVQ acc5, (8*1)(res_ptr) |
| MOVQ acc0, (8*2)(res_ptr) |
| MOVQ acc1, (8*3)(res_ptr) |
| |
| RET |
| /* ---------------------------------------*/ |
| // func p256OrdSqr(res, in []uint64, n int) |
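// Performs n consecutive Montgomery squarings modulo the group order, feeding each
// result back in as the next input (note the MOVQ res_ptr, x_ptr at the end of the loop).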
| TEXT ·p256OrdSqr(SB),NOSPLIT,$0 |
| MOVQ res+0(FP), res_ptr |
| MOVQ in+24(FP), x_ptr |
| MOVQ n+48(FP), BX |
| |
| ordSqrLoop: |
| |
| // y[1:] * y[0] |
| MOVQ (8*0)(x_ptr), t0 |
| |
| MOVQ (8*1)(x_ptr), AX |
| MULQ t0 |
| MOVQ AX, acc1 |
| MOVQ DX, acc2 |
| |
| MOVQ (8*2)(x_ptr), AX |
| MULQ t0 |
| ADDQ AX, acc2 |
| ADCQ $0, DX |
| MOVQ DX, acc3 |
| |
| MOVQ (8*3)(x_ptr), AX |
| MULQ t0 |
| ADDQ AX, acc3 |
| ADCQ $0, DX |
| MOVQ DX, acc4 |
| // y[2:] * y[1] |
| MOVQ (8*1)(x_ptr), t0 |
| |
| MOVQ (8*2)(x_ptr), AX |
| MULQ t0 |
| ADDQ AX, acc3 |
| ADCQ $0, DX |
| MOVQ DX, t1 |
| |
| MOVQ (8*3)(x_ptr), AX |
| MULQ t0 |
| ADDQ t1, acc4 |
| ADCQ $0, DX |
| ADDQ AX, acc4 |
| ADCQ $0, DX |
| MOVQ DX, acc5 |
| // y[3] * y[2] |
| MOVQ (8*2)(x_ptr), t0 |
| |
| MOVQ (8*3)(x_ptr), AX |
| MULQ t0 |
| ADDQ AX, acc5 |
| ADCQ $0, DX |
| MOVQ DX, y_ptr |
| XORQ t1, t1 |
| // *2 |
| ADDQ acc1, acc1 |
| ADCQ acc2, acc2 |
| ADCQ acc3, acc3 |
| ADCQ acc4, acc4 |
| ADCQ acc5, acc5 |
| ADCQ y_ptr, y_ptr |
| ADCQ $0, t1 |
| // Missing products |
| MOVQ (8*0)(x_ptr), AX |
| MULQ AX |
| MOVQ AX, acc0 |
| MOVQ DX, t0 |
| |
| MOVQ (8*1)(x_ptr), AX |
| MULQ AX |
| ADDQ t0, acc1 |
| ADCQ AX, acc2 |
| ADCQ $0, DX |
| MOVQ DX, t0 |
| |
| MOVQ (8*2)(x_ptr), AX |
| MULQ AX |
| ADDQ t0, acc3 |
| ADCQ AX, acc4 |
| ADCQ $0, DX |
| MOVQ DX, t0 |
| |
| MOVQ (8*3)(x_ptr), AX |
| MULQ AX |
| ADDQ t0, acc5 |
| ADCQ AX, y_ptr |
| ADCQ DX, t1 |
| MOVQ t1, x_ptr |
| // First reduction step |
| MOVQ acc0, AX |
| MULQ p256ordK0<>(SB) |
| MOVQ AX, t0 |
| |
| MOVQ p256ord<>+0x00(SB), AX |
| MULQ t0 |
| ADDQ AX, acc0 |
| ADCQ $0, DX |
| MOVQ DX, t1 |
| |
| MOVQ p256ord<>+0x08(SB), AX |
| MULQ t0 |
| ADDQ t1, acc1 |
| ADCQ $0, DX |
| ADDQ AX, acc1 |
| |
| MOVQ t0, t1 |
| ADCQ DX, acc2 |
| ADCQ $0, t1 |
| SUBQ t0, acc2 |
| SBBQ $0, t1 |
| |
| MOVQ t0, AX |
| MOVQ t0, DX |
| MOVQ t0, acc0 |
| SHLQ $32, AX |
| SHRQ $32, DX |
| |
| ADDQ t1, acc3 |
| ADCQ $0, acc0 |
| SUBQ AX, acc3 |
| SBBQ DX, acc0 |
| // Second reduction step |
| MOVQ acc1, AX |
| MULQ p256ordK0<>(SB) |
| MOVQ AX, t0 |
| |
| MOVQ p256ord<>+0x00(SB), AX |
| MULQ t0 |
| ADDQ AX, acc1 |
| ADCQ $0, DX |
| MOVQ DX, t1 |
| |
| MOVQ p256ord<>+0x08(SB), AX |
| MULQ t0 |
| ADDQ t1, acc2 |
| ADCQ $0, DX |
| ADDQ AX, acc2 |
| |
| MOVQ t0, t1 |
| ADCQ DX, acc3 |
| ADCQ $0, t1 |
| SUBQ t0, acc3 |
| SBBQ $0, t1 |
| |
| MOVQ t0, AX |
| MOVQ t0, DX |
| MOVQ t0, acc1 |
| SHLQ $32, AX |
| SHRQ $32, DX |
| |
| ADDQ t1, acc0 |
| ADCQ $0, acc1 |
| SUBQ AX, acc0 |
| SBBQ DX, acc1 |
| // Third reduction step |
| MOVQ acc2, AX |
| MULQ p256ordK0<>(SB) |
| MOVQ AX, t0 |
| |
| MOVQ p256ord<>+0x00(SB), AX |
| MULQ t0 |
| ADDQ AX, acc2 |
| ADCQ $0, DX |
| MOVQ DX, t1 |
| |
| MOVQ p256ord<>+0x08(SB), AX |
| MULQ t0 |
| ADDQ t1, acc3 |
| ADCQ $0, DX |
| ADDQ AX, acc3 |
| |
| MOVQ t0, t1 |
| ADCQ DX, acc0 |
| ADCQ $0, t1 |
| SUBQ t0, acc0 |
| SBBQ $0, t1 |
| |
| MOVQ t0, AX |
| MOVQ t0, DX |
| MOVQ t0, acc2 |
| SHLQ $32, AX |
| SHRQ $32, DX |
| |
| ADDQ t1, acc1 |
| ADCQ $0, acc2 |
| SUBQ AX, acc1 |
| SBBQ DX, acc2 |
| // Last reduction step |
| MOVQ acc3, AX |
| MULQ p256ordK0<>(SB) |
| MOVQ AX, t0 |
| |
| MOVQ p256ord<>+0x00(SB), AX |
| MULQ t0 |
| ADDQ AX, acc3 |
| ADCQ $0, DX |
| MOVQ DX, t1 |
| |
| MOVQ p256ord<>+0x08(SB), AX |
| MULQ t0 |
| ADDQ t1, acc0 |
| ADCQ $0, DX |
| ADDQ AX, acc0 |
| ADCQ $0, DX |
| MOVQ DX, t1 |
| |
| MOVQ t0, t1 |
| ADCQ DX, acc1 |
| ADCQ $0, t1 |
| SUBQ t0, acc1 |
| SBBQ $0, t1 |
| |
| MOVQ t0, AX |
| MOVQ t0, DX |
| MOVQ t0, acc3 |
| SHLQ $32, AX |
| SHRQ $32, DX |
| |
| ADDQ t1, acc2 |
| ADCQ $0, acc3 |
| SUBQ AX, acc2 |
| SBBQ DX, acc3 |
| XORQ t0, t0 |
| // Add bits [511:256] of the sqr result |
| ADCQ acc4, acc0 |
| ADCQ acc5, acc1 |
| ADCQ y_ptr, acc2 |
| ADCQ x_ptr, acc3 |
| ADCQ $0, t0 |
| |
| MOVQ acc0, acc4 |
| MOVQ acc1, acc5 |
| MOVQ acc2, y_ptr |
| MOVQ acc3, t1 |
| // Subtract p256 |
| SUBQ p256ord<>+0x00(SB), acc0 |
SBBQ p256ord<>+0x08(SB), acc1
| SBBQ p256ord<>+0x10(SB), acc2 |
| SBBQ p256ord<>+0x18(SB), acc3 |
| SBBQ $0, t0 |
| |
| CMOVQCS acc4, acc0 |
| CMOVQCS acc5, acc1 |
| CMOVQCS y_ptr, acc2 |
| CMOVQCS t1, acc3 |
| |
| MOVQ acc0, (8*0)(res_ptr) |
| MOVQ acc1, (8*1)(res_ptr) |
| MOVQ acc2, (8*2)(res_ptr) |
| MOVQ acc3, (8*3)(res_ptr) |
| MOVQ res_ptr, x_ptr |
| DECQ BX |
| JNE ordSqrLoop |
| |
| RET |
| /* ---------------------------------------*/ |
| #undef res_ptr |
| #undef x_ptr |
| #undef y_ptr |
| |
| #undef acc0 |
| #undef acc1 |
| #undef acc2 |
| #undef acc3 |
| #undef acc4 |
| #undef acc5 |
| #undef t0 |
| #undef t1 |
| /* ---------------------------------------*/ |
| #define mul0 AX |
| #define mul1 DX |
| #define acc0 BX |
| #define acc1 CX |
| #define acc2 R8 |
| #define acc3 R9 |
| #define acc4 R10 |
| #define acc5 R11 |
| #define acc6 R12 |
| #define acc7 R13 |
| #define t0 R14 |
| #define t1 R15 |
| #define t2 DI |
| #define t3 SI |
| #define hlp BP |
| /* ---------------------------------------*/ |
| TEXT p256SubInternal(SB),NOSPLIT,$0 |
| XORQ mul0, mul0 |
| SUBQ t0, acc4 |
| SBBQ t1, acc5 |
| SBBQ t2, acc6 |
| SBBQ t3, acc7 |
| SBBQ $0, mul0 |
| |
| MOVQ acc4, acc0 |
| MOVQ acc5, acc1 |
| MOVQ acc6, acc2 |
| MOVQ acc7, acc3 |
| |
| ADDQ $-1, acc4 |
| ADCQ p256const0<>(SB), acc5 |
| ADCQ $0, acc6 |
| ADCQ p256const1<>(SB), acc7 |
| ADCQ $0, mul0 |
| |
| CMOVQNE acc0, acc4 |
| CMOVQNE acc1, acc5 |
| CMOVQNE acc2, acc6 |
| CMOVQNE acc3, acc7 |
| |
| RET |
| /* ---------------------------------------*/ |
| TEXT p256MulInternal(SB),NOSPLIT,$0 |
| MOVQ acc4, mul0 |
| MULQ t0 |
| MOVQ mul0, acc0 |
| MOVQ mul1, acc1 |
| |
| MOVQ acc4, mul0 |
| MULQ t1 |
| ADDQ mul0, acc1 |
| ADCQ $0, mul1 |
| MOVQ mul1, acc2 |
| |
| MOVQ acc4, mul0 |
| MULQ t2 |
| ADDQ mul0, acc2 |
| ADCQ $0, mul1 |
| MOVQ mul1, acc3 |
| |
| MOVQ acc4, mul0 |
| MULQ t3 |
| ADDQ mul0, acc3 |
| ADCQ $0, mul1 |
| MOVQ mul1, acc4 |
| |
| MOVQ acc5, mul0 |
| MULQ t0 |
| ADDQ mul0, acc1 |
| ADCQ $0, mul1 |
| MOVQ mul1, hlp |
| |
| MOVQ acc5, mul0 |
| MULQ t1 |
| ADDQ hlp, acc2 |
| ADCQ $0, mul1 |
| ADDQ mul0, acc2 |
| ADCQ $0, mul1 |
| MOVQ mul1, hlp |
| |
| MOVQ acc5, mul0 |
| MULQ t2 |
| ADDQ hlp, acc3 |
| ADCQ $0, mul1 |
| ADDQ mul0, acc3 |
| ADCQ $0, mul1 |
| MOVQ mul1, hlp |
| |
| MOVQ acc5, mul0 |
| MULQ t3 |
| ADDQ hlp, acc4 |
| ADCQ $0, mul1 |
| ADDQ mul0, acc4 |
| ADCQ $0, mul1 |
| MOVQ mul1, acc5 |
| |
| MOVQ acc6, mul0 |
| MULQ t0 |
| ADDQ mul0, acc2 |
| ADCQ $0, mul1 |
| MOVQ mul1, hlp |
| |
| MOVQ acc6, mul0 |
| MULQ t1 |
| ADDQ hlp, acc3 |
| ADCQ $0, mul1 |
| ADDQ mul0, acc3 |
| ADCQ $0, mul1 |
| MOVQ mul1, hlp |
| |
| MOVQ acc6, mul0 |
| MULQ t2 |
| ADDQ hlp, acc4 |
| ADCQ $0, mul1 |
| ADDQ mul0, acc4 |
| ADCQ $0, mul1 |
| MOVQ mul1, hlp |
| |
| MOVQ acc6, mul0 |
| MULQ t3 |
| ADDQ hlp, acc5 |
| ADCQ $0, mul1 |
| ADDQ mul0, acc5 |
| ADCQ $0, mul1 |
| MOVQ mul1, acc6 |
| |
| MOVQ acc7, mul0 |
| MULQ t0 |
| ADDQ mul0, acc3 |
| ADCQ $0, mul1 |
| MOVQ mul1, hlp |
| |
| MOVQ acc7, mul0 |
| MULQ t1 |
| ADDQ hlp, acc4 |
| ADCQ $0, mul1 |
| ADDQ mul0, acc4 |
| ADCQ $0, mul1 |
| MOVQ mul1, hlp |
| |
| MOVQ acc7, mul0 |
| MULQ t2 |
| ADDQ hlp, acc5 |
| ADCQ $0, mul1 |
| ADDQ mul0, acc5 |
| ADCQ $0, mul1 |
| MOVQ mul1, hlp |
| |
| MOVQ acc7, mul0 |
| MULQ t3 |
| ADDQ hlp, acc6 |
| ADCQ $0, mul1 |
| ADDQ mul0, acc6 |
| ADCQ $0, mul1 |
| MOVQ mul1, acc7 |
| // First reduction step |
| MOVQ acc0, mul0 |
| MOVQ acc0, hlp |
| SHLQ $32, acc0 |
| MULQ p256const1<>(SB) |
| SHRQ $32, hlp |
| ADDQ acc0, acc1 |
| ADCQ hlp, acc2 |
| ADCQ mul0, acc3 |
| ADCQ $0, mul1 |
| MOVQ mul1, acc0 |
| // Second reduction step |
| MOVQ acc1, mul0 |
| MOVQ acc1, hlp |
| SHLQ $32, acc1 |
| MULQ p256const1<>(SB) |
| SHRQ $32, hlp |
| ADDQ acc1, acc2 |
| ADCQ hlp, acc3 |
| ADCQ mul0, acc0 |
| ADCQ $0, mul1 |
| MOVQ mul1, acc1 |
| // Third reduction step |
| MOVQ acc2, mul0 |
| MOVQ acc2, hlp |
| SHLQ $32, acc2 |
| MULQ p256const1<>(SB) |
| SHRQ $32, hlp |
| ADDQ acc2, acc3 |
| ADCQ hlp, acc0 |
| ADCQ mul0, acc1 |
| ADCQ $0, mul1 |
| MOVQ mul1, acc2 |
| // Last reduction step |
| MOVQ acc3, mul0 |
| MOVQ acc3, hlp |
| SHLQ $32, acc3 |
| MULQ p256const1<>(SB) |
| SHRQ $32, hlp |
| ADDQ acc3, acc0 |
| ADCQ hlp, acc1 |
| ADCQ mul0, acc2 |
| ADCQ $0, mul1 |
| MOVQ mul1, acc3 |
| BYTE $0x48; BYTE $0xc7; BYTE $0xc5; BYTE $0x00; BYTE $0x00; BYTE $0x00; BYTE $0x00 // MOVQ $0, BP |
| // Add bits [511:256] of the result |
| ADCQ acc0, acc4 |
| ADCQ acc1, acc5 |
| ADCQ acc2, acc6 |
| ADCQ acc3, acc7 |
| ADCQ $0, hlp |
| // Copy result |
| MOVQ acc4, acc0 |
| MOVQ acc5, acc1 |
| MOVQ acc6, acc2 |
| MOVQ acc7, acc3 |
| // Subtract p256 |
| SUBQ $-1, acc4 |
SBBQ p256const0<>(SB), acc5
| SBBQ $0, acc6 |
| SBBQ p256const1<>(SB), acc7 |
| SBBQ $0, hlp |
| // If the result of the subtraction is negative, restore the previous result |
| CMOVQCS acc0, acc4 |
| CMOVQCS acc1, acc5 |
| CMOVQCS acc2, acc6 |
| CMOVQCS acc3, acc7 |
| |
| RET |
| /* ---------------------------------------*/ |
| TEXT p256SqrInternal(SB),NOSPLIT,$0 |
| |
| MOVQ acc4, mul0 |
| MULQ acc5 |
| MOVQ mul0, acc1 |
| MOVQ mul1, acc2 |
| |
| MOVQ acc4, mul0 |
| MULQ acc6 |
| ADDQ mul0, acc2 |
| ADCQ $0, mul1 |
| MOVQ mul1, acc3 |
| |
| MOVQ acc4, mul0 |
| MULQ acc7 |
| ADDQ mul0, acc3 |
| ADCQ $0, mul1 |
| MOVQ mul1, t0 |
| |
| MOVQ acc5, mul0 |
| MULQ acc6 |
| ADDQ mul0, acc3 |
| ADCQ $0, mul1 |
| MOVQ mul1, hlp |
| |
| MOVQ acc5, mul0 |
| MULQ acc7 |
| ADDQ hlp, t0 |
| ADCQ $0, mul1 |
| ADDQ mul0, t0 |
| ADCQ $0, mul1 |
| MOVQ mul1, t1 |
| |
| MOVQ acc6, mul0 |
| MULQ acc7 |
| ADDQ mul0, t1 |
| ADCQ $0, mul1 |
| MOVQ mul1, t2 |
| XORQ t3, t3 |
| // *2 |
| ADDQ acc1, acc1 |
| ADCQ acc2, acc2 |
| ADCQ acc3, acc3 |
| ADCQ t0, t0 |
| ADCQ t1, t1 |
| ADCQ t2, t2 |
| ADCQ $0, t3 |
| // Missing products |
| MOVQ acc4, mul0 |
| MULQ mul0 |
| MOVQ mul0, acc0 |
| MOVQ DX, acc4 |
| |
| MOVQ acc5, mul0 |
| MULQ mul0 |
| ADDQ acc4, acc1 |
| ADCQ mul0, acc2 |
| ADCQ $0, DX |
| MOVQ DX, acc4 |
| |
| MOVQ acc6, mul0 |
| MULQ mul0 |
| ADDQ acc4, acc3 |
| ADCQ mul0, t0 |
| ADCQ $0, DX |
| MOVQ DX, acc4 |
| |
| MOVQ acc7, mul0 |
| MULQ mul0 |
| ADDQ acc4, t1 |
| ADCQ mul0, t2 |
| ADCQ DX, t3 |
| // First reduction step |
| MOVQ acc0, mul0 |
| MOVQ acc0, hlp |
| SHLQ $32, acc0 |
| MULQ p256const1<>(SB) |
| SHRQ $32, hlp |
| ADDQ acc0, acc1 |
| ADCQ hlp, acc2 |
| ADCQ mul0, acc3 |
| ADCQ $0, mul1 |
| MOVQ mul1, acc0 |
| // Second reduction step |
| MOVQ acc1, mul0 |
| MOVQ acc1, hlp |
| SHLQ $32, acc1 |
| MULQ p256const1<>(SB) |
| SHRQ $32, hlp |
| ADDQ acc1, acc2 |
| ADCQ hlp, acc3 |
| ADCQ mul0, acc0 |
| ADCQ $0, mul1 |
| MOVQ mul1, acc1 |
| // Third reduction step |
| MOVQ acc2, mul0 |
| MOVQ acc2, hlp |
| SHLQ $32, acc2 |
| MULQ p256const1<>(SB) |
| SHRQ $32, hlp |
| ADDQ acc2, acc3 |
| ADCQ hlp, acc0 |
| ADCQ mul0, acc1 |
| ADCQ $0, mul1 |
| MOVQ mul1, acc2 |
| // Last reduction step |
| MOVQ acc3, mul0 |
| MOVQ acc3, hlp |
| SHLQ $32, acc3 |
| MULQ p256const1<>(SB) |
| SHRQ $32, hlp |
| ADDQ acc3, acc0 |
| ADCQ hlp, acc1 |
| ADCQ mul0, acc2 |
| ADCQ $0, mul1 |
| MOVQ mul1, acc3 |
| BYTE $0x48; BYTE $0xc7; BYTE $0xc5; BYTE $0x00; BYTE $0x00; BYTE $0x00; BYTE $0x00 // MOVQ $0, BP |
| // Add bits [511:256] of the result |
| ADCQ acc0, t0 |
| ADCQ acc1, t1 |
| ADCQ acc2, t2 |
| ADCQ acc3, t3 |
| ADCQ $0, hlp |
| // Copy result |
| MOVQ t0, acc4 |
| MOVQ t1, acc5 |
| MOVQ t2, acc6 |
| MOVQ t3, acc7 |
| // Subtract p256 |
| SUBQ $-1, acc4 |
SBBQ p256const0<>(SB), acc5
| SBBQ $0, acc6 |
| SBBQ p256const1<>(SB), acc7 |
| SBBQ $0, hlp |
| // If the result of the subtraction is negative, restore the previous result |
| CMOVQCS t0, acc4 |
| CMOVQCS t1, acc5 |
| CMOVQCS t2, acc6 |
| CMOVQCS t3, acc7 |
| |
| RET |
| /* ---------------------------------------*/ |
| #define p256MulBy2Inline\ |
| XORQ mul0, mul0;\ |
| ADDQ acc4, acc4;\ |
| ADCQ acc5, acc5;\ |
| ADCQ acc6, acc6;\ |
| ADCQ acc7, acc7;\ |
| ADCQ $0, mul0;\ |
| MOVQ acc4, t0;\ |
| MOVQ acc5, t1;\ |
| MOVQ acc6, t2;\ |
| MOVQ acc7, t3;\ |
| SUBQ $-1, t0;\ |
| SBBQ p256const0<>(SB), t1;\ |
| SBBQ $0, t2;\ |
| SBBQ p256const1<>(SB), t3;\ |
| SBBQ $0, mul0;\ |
| CMOVQCS acc4, t0;\ |
| CMOVQCS acc5, t1;\ |
| CMOVQCS acc6, t2;\ |
| CMOVQCS acc7, t3; |
| /* ---------------------------------------*/ |
| #define p256AddInline \ |
| XORQ mul0, mul0;\ |
| ADDQ t0, acc4;\ |
| ADCQ t1, acc5;\ |
| ADCQ t2, acc6;\ |
| ADCQ t3, acc7;\ |
| ADCQ $0, mul0;\ |
| MOVQ acc4, t0;\ |
| MOVQ acc5, t1;\ |
| MOVQ acc6, t2;\ |
| MOVQ acc7, t3;\ |
| SUBQ $-1, t0;\ |
| SBBQ p256const0<>(SB), t1;\ |
| SBBQ $0, t2;\ |
| SBBQ p256const1<>(SB), t3;\ |
| SBBQ $0, mul0;\ |
| CMOVQCS acc4, t0;\ |
| CMOVQCS acc5, t1;\ |
| CMOVQCS acc6, t2;\ |
| CMOVQCS acc7, t3; |
| /* ---------------------------------------*/ |
| #define LDacc(src) MOVQ src(8*0), acc4; MOVQ src(8*1), acc5; MOVQ src(8*2), acc6; MOVQ src(8*3), acc7 |
| #define LDt(src) MOVQ src(8*0), t0; MOVQ src(8*1), t1; MOVQ src(8*2), t2; MOVQ src(8*3), t3 |
| #define ST(dst) MOVQ acc4, dst(8*0); MOVQ acc5, dst(8*1); MOVQ acc6, dst(8*2); MOVQ acc7, dst(8*3) |
| #define STt(dst) MOVQ t0, dst(8*0); MOVQ t1, dst(8*1); MOVQ t2, dst(8*2); MOVQ t3, dst(8*3) |
| #define acc2t MOVQ acc4, t0; MOVQ acc5, t1; MOVQ acc6, t2; MOVQ acc7, t3 |
| #define t2acc MOVQ t0, acc4; MOVQ t1, acc5; MOVQ t2, acc6; MOVQ t3, acc7 |
| /* ---------------------------------------*/ |
| #define x1in(off) (32*0 + off)(SP) |
| #define y1in(off) (32*1 + off)(SP) |
| #define z1in(off) (32*2 + off)(SP) |
| #define x2in(off) (32*3 + off)(SP) |
| #define y2in(off) (32*4 + off)(SP) |
| #define xout(off) (32*5 + off)(SP) |
| #define yout(off) (32*6 + off)(SP) |
| #define zout(off) (32*7 + off)(SP) |
| #define s2(off) (32*8 + off)(SP) |
| #define z1sqr(off) (32*9 + off)(SP) |
| #define h(off) (32*10 + off)(SP) |
| #define r(off) (32*11 + off)(SP) |
| #define hsqr(off) (32*12 + off)(SP) |
| #define rsqr(off) (32*13 + off)(SP) |
| #define hcub(off) (32*14 + off)(SP) |
| #define rptr (32*15)(SP) |
| #define sel_save (32*15 + 8)(SP) |
| #define zero_save (32*15 + 8 + 4)(SP) |
| |
| // func p256PointAddAffineAsm(res, in1, in2 []uint64, sign, sel, zero int) |
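// Mixed Jacobian/affine addition: in1 is a Jacobian point (x1, y1, z1) and in2 an affine
// point (x2, y2), all in Montgomery form. sign != 0 negates y2 before the addition. At
// the end the computed sum is conditionally overridden: if sel == 0 the result is in1,
// and if zero == 0 the result is (x2, y2, 1), presumably so the caller can encode the
// point-at-infinity cases without branching.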
| TEXT ·p256PointAddAffineAsm(SB),0,$512-96 |
| // Move input to stack in order to free registers |
| MOVQ res+0(FP), AX |
| MOVQ in1+24(FP), BX |
| MOVQ in2+48(FP), CX |
| MOVQ sign+72(FP), DX |
| MOVQ sel+80(FP), t1 |
| MOVQ zero+88(FP), t2 |
| |
| MOVOU (16*0)(BX), X0 |
| MOVOU (16*1)(BX), X1 |
| MOVOU (16*2)(BX), X2 |
| MOVOU (16*3)(BX), X3 |
| MOVOU (16*4)(BX), X4 |
| MOVOU (16*5)(BX), X5 |
| |
| MOVOU X0, x1in(16*0) |
| MOVOU X1, x1in(16*1) |
| MOVOU X2, y1in(16*0) |
| MOVOU X3, y1in(16*1) |
| MOVOU X4, z1in(16*0) |
| MOVOU X5, z1in(16*1) |
| |
| MOVOU (16*0)(CX), X0 |
| MOVOU (16*1)(CX), X1 |
| |
| MOVOU X0, x2in(16*0) |
| MOVOU X1, x2in(16*1) |
| // Store pointer to result |
| MOVQ mul0, rptr |
| MOVL t1, sel_save |
| MOVL t2, zero_save |
| // Negate y2in based on sign |
| MOVQ (16*2 + 8*0)(CX), acc4 |
| MOVQ (16*2 + 8*1)(CX), acc5 |
| MOVQ (16*2 + 8*2)(CX), acc6 |
| MOVQ (16*2 + 8*3)(CX), acc7 |
| MOVQ $-1, acc0 |
| MOVQ p256const0<>(SB), acc1 |
| MOVQ $0, acc2 |
| MOVQ p256const1<>(SB), acc3 |
| XORQ mul0, mul0 |
| // Speculatively subtract |
| SUBQ acc4, acc0 |
| SBBQ acc5, acc1 |
| SBBQ acc6, acc2 |
| SBBQ acc7, acc3 |
| SBBQ $0, mul0 |
| MOVQ acc0, t0 |
| MOVQ acc1, t1 |
| MOVQ acc2, t2 |
| MOVQ acc3, t3 |
| // Add in case the operand was > p256 |
| ADDQ $-1, acc0 |
| ADCQ p256const0<>(SB), acc1 |
| ADCQ $0, acc2 |
| ADCQ p256const1<>(SB), acc3 |
| ADCQ $0, mul0 |
| CMOVQNE t0, acc0 |
| CMOVQNE t1, acc1 |
| CMOVQNE t2, acc2 |
| CMOVQNE t3, acc3 |
| // If condition is 0, keep original value |
| TESTQ DX, DX |
| CMOVQEQ acc4, acc0 |
| CMOVQEQ acc5, acc1 |
| CMOVQEQ acc6, acc2 |
| CMOVQEQ acc7, acc3 |
| // Store result |
| MOVQ acc0, y2in(8*0) |
| MOVQ acc1, y2in(8*1) |
| MOVQ acc2, y2in(8*2) |
| MOVQ acc3, y2in(8*3) |
| // Begin point add |
| LDacc (z1in) |
| CALL p256SqrInternal(SB) // z1ˆ2 |
| ST (z1sqr) |
| |
| LDt (x2in) |
| CALL p256MulInternal(SB) // x2 * z1ˆ2 |
| |
| LDt (x1in) |
| CALL p256SubInternal(SB) // h = u2 - u1 |
| ST (h) |
| |
| LDt (z1in) |
| CALL p256MulInternal(SB) // z3 = h * z1 |
| ST (zout) |
| |
| LDacc (z1sqr) |
| CALL p256MulInternal(SB) // z1ˆ3 |
| |
| LDt (y2in) |
| CALL p256MulInternal(SB) // s2 = y2 * z1ˆ3 |
| ST (s2) |
| |
| LDt (y1in) |
| CALL p256SubInternal(SB) // r = s2 - s1 |
| ST (r) |
| |
| CALL p256SqrInternal(SB) // rsqr = rˆ2 |
| ST (rsqr) |
| |
| LDacc (h) |
| CALL p256SqrInternal(SB) // hsqr = hˆ2 |
| ST (hsqr) |
| |
| LDt (h) |
| CALL p256MulInternal(SB) // hcub = hˆ3 |
| ST (hcub) |
| |
| LDt (y1in) |
| CALL p256MulInternal(SB) // y1 * hˆ3 |
| ST (s2) |
| |
| LDacc (x1in) |
| LDt (hsqr) |
| CALL p256MulInternal(SB) // u1 * hˆ2 |
| ST (h) |
| |
| p256MulBy2Inline // u1 * hˆ2 * 2, inline |
| LDacc (rsqr) |
| CALL p256SubInternal(SB) // rˆ2 - u1 * hˆ2 * 2 |
| |
| LDt (hcub) |
| CALL p256SubInternal(SB) |
| ST (xout) |
| |
| MOVQ acc4, t0 |
| MOVQ acc5, t1 |
| MOVQ acc6, t2 |
| MOVQ acc7, t3 |
| LDacc (h) |
| CALL p256SubInternal(SB) |
| |
| LDt (r) |
| CALL p256MulInternal(SB) |
| |
| LDt (s2) |
| CALL p256SubInternal(SB) |
| ST (yout) |
| // Load stored values from stack |
| MOVQ rptr, AX |
| MOVL sel_save, BX |
| MOVL zero_save, CX |
// The computed result is not valid if sel == 0; conditionally choose in1 instead
| MOVOU xout(16*0), X0 |
| MOVOU xout(16*1), X1 |
| MOVOU yout(16*0), X2 |
| MOVOU yout(16*1), X3 |
| MOVOU zout(16*0), X4 |
| MOVOU zout(16*1), X5 |
| |
| MOVL BX, X6 |
| MOVL CX, X7 |
| |
| PXOR X8, X8 |
| PCMPEQL X9, X9 |
| |
| PSHUFD $0, X6, X6 |
| PSHUFD $0, X7, X7 |
| |
| PCMPEQL X8, X6 |
| PCMPEQL X8, X7 |
| |
| MOVOU X6, X15 |
| PANDN X9, X15 |
| |
| MOVOU x1in(16*0), X9 |
| MOVOU x1in(16*1), X10 |
| MOVOU y1in(16*0), X11 |
| MOVOU y1in(16*1), X12 |
| MOVOU z1in(16*0), X13 |
| MOVOU z1in(16*1), X14 |
| |
| PAND X15, X0 |
| PAND X15, X1 |
| PAND X15, X2 |
| PAND X15, X3 |
| PAND X15, X4 |
| PAND X15, X5 |
| |
| PAND X6, X9 |
| PAND X6, X10 |
| PAND X6, X11 |
| PAND X6, X12 |
| PAND X6, X13 |
| PAND X6, X14 |
| |
| PXOR X9, X0 |
| PXOR X10, X1 |
| PXOR X11, X2 |
| PXOR X12, X3 |
| PXOR X13, X4 |
| PXOR X14, X5 |
// Similarly, if zero == 0, choose in2 (with z = 1 in Montgomery form) instead
| PCMPEQL X9, X9 |
| MOVOU X7, X15 |
| PANDN X9, X15 |
| |
| MOVOU x2in(16*0), X9 |
| MOVOU x2in(16*1), X10 |
| MOVOU y2in(16*0), X11 |
| MOVOU y2in(16*1), X12 |
| MOVOU p256one<>+0x00(SB), X13 |
| MOVOU p256one<>+0x10(SB), X14 |
| |
| PAND X15, X0 |
| PAND X15, X1 |
| PAND X15, X2 |
| PAND X15, X3 |
| PAND X15, X4 |
| PAND X15, X5 |
| |
| PAND X7, X9 |
| PAND X7, X10 |
| PAND X7, X11 |
| PAND X7, X12 |
| PAND X7, X13 |
| PAND X7, X14 |
| |
| PXOR X9, X0 |
| PXOR X10, X1 |
| PXOR X11, X2 |
| PXOR X12, X3 |
| PXOR X13, X4 |
| PXOR X14, X5 |
| // Finally output the result |
| MOVOU X0, (16*0)(AX) |
| MOVOU X1, (16*1)(AX) |
| MOVOU X2, (16*2)(AX) |
| MOVOU X3, (16*3)(AX) |
| MOVOU X4, (16*4)(AX) |
| MOVOU X5, (16*5)(AX) |
| MOVQ $0, rptr |
| |
| RET |
| #undef x1in |
| #undef y1in |
| #undef z1in |
| #undef x2in |
| #undef y2in |
| #undef xout |
| #undef yout |
| #undef zout |
| #undef s2 |
| #undef z1sqr |
| #undef h |
| #undef r |
| #undef hsqr |
| #undef rsqr |
| #undef hcub |
| #undef rptr |
| #undef sel_save |
| #undef zero_save |
| /* ---------------------------------------*/ |
| #define x1in(off) (32*0 + off)(SP) |
| #define y1in(off) (32*1 + off)(SP) |
| #define z1in(off) (32*2 + off)(SP) |
| #define x2in(off) (32*3 + off)(SP) |
| #define y2in(off) (32*4 + off)(SP) |
| #define z2in(off) (32*5 + off)(SP) |
| |
| #define xout(off) (32*6 + off)(SP) |
| #define yout(off) (32*7 + off)(SP) |
| #define zout(off) (32*8 + off)(SP) |
| |
| #define u1(off) (32*9 + off)(SP) |
| #define u2(off) (32*10 + off)(SP) |
| #define s1(off) (32*11 + off)(SP) |
| #define s2(off) (32*12 + off)(SP) |
| #define z1sqr(off) (32*13 + off)(SP) |
| #define z2sqr(off) (32*14 + off)(SP) |
| #define h(off) (32*15 + off)(SP) |
| #define r(off) (32*16 + off)(SP) |
| #define hsqr(off) (32*17 + off)(SP) |
| #define rsqr(off) (32*18 + off)(SP) |
| #define hcub(off) (32*19 + off)(SP) |
| #define rptr (32*20)(SP) |
| |
// func p256PointAddAsm(res, in1, in2 []uint64)
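// Full Jacobian point addition using the textbook formulas spelled out in the inline
// comments: u1 = x1*z2^2, u2 = x2*z1^2, s1 = y1*z2^3, s2 = y2*z1^3, h = u2 - u1,
// r = s2 - s1, x3 = r^2 - h^3 - 2*u1*h^2, y3 = r*(u1*h^2 - x3) - s1*h^3, z3 = z1*z2*h.
// All arithmetic stays in the Montgomery domain.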
| TEXT ·p256PointAddAsm(SB),0,$672-72 |
| // Move input to stack in order to free registers |
| MOVQ res+0(FP), AX |
| MOVQ in1+24(FP), BX |
| MOVQ in2+48(FP), CX |
| |
| MOVOU (16*0)(BX), X0 |
| MOVOU (16*1)(BX), X1 |
| MOVOU (16*2)(BX), X2 |
| MOVOU (16*3)(BX), X3 |
| MOVOU (16*4)(BX), X4 |
| MOVOU (16*5)(BX), X5 |
| |
| MOVOU X0, x1in(16*0) |
| MOVOU X1, x1in(16*1) |
| MOVOU X2, y1in(16*0) |
| MOVOU X3, y1in(16*1) |
| MOVOU X4, z1in(16*0) |
| MOVOU X5, z1in(16*1) |
| |
| MOVOU (16*0)(CX), X0 |
| MOVOU (16*1)(CX), X1 |
| MOVOU (16*2)(CX), X2 |
| MOVOU (16*3)(CX), X3 |
| MOVOU (16*4)(CX), X4 |
| MOVOU (16*5)(CX), X5 |
| |
| MOVOU X0, x2in(16*0) |
| MOVOU X1, x2in(16*1) |
| MOVOU X2, y2in(16*0) |
| MOVOU X3, y2in(16*1) |
| MOVOU X4, z2in(16*0) |
| MOVOU X5, z2in(16*1) |
| // Store pointer to result |
| MOVQ AX, rptr |
| // Begin point add |
| LDacc (z2in) |
| CALL p256SqrInternal(SB) // z2ˆ2 |
| ST (z2sqr) |
| LDt (z2in) |
| CALL p256MulInternal(SB) // z2ˆ3 |
| LDt (y1in) |
| CALL p256MulInternal(SB) // s1 = z2ˆ3*y1 |
| ST (s1) |
| |
| LDacc (z1in) |
| CALL p256SqrInternal(SB) // z1ˆ2 |
| ST (z1sqr) |
| LDt (z1in) |
| CALL p256MulInternal(SB) // z1ˆ3 |
| LDt (y2in) |
| CALL p256MulInternal(SB) // s2 = z1ˆ3*y2 |
| ST (s2) |
| |
| LDt (s1) |
| CALL p256SubInternal(SB) // r = s2 - s1 |
| ST (r) |
| |
| LDacc (z2sqr) |
| LDt (x1in) |
| CALL p256MulInternal(SB) // u1 = x1 * z2ˆ2 |
| ST (u1) |
| LDacc (z1sqr) |
| LDt (x2in) |
| CALL p256MulInternal(SB) // u2 = x2 * z1ˆ2 |
| ST (u2) |
| |
| LDt (u1) |
| CALL p256SubInternal(SB) // h = u2 - u1 |
| ST (h) |
| |
| LDacc (r) |
| CALL p256SqrInternal(SB) // rsqr = rˆ2 |
| ST (rsqr) |
| |
| LDacc (h) |
| CALL p256SqrInternal(SB) // hsqr = hˆ2 |
| ST (hsqr) |
| |
| LDt (h) |
| CALL p256MulInternal(SB) // hcub = hˆ3 |
| ST (hcub) |
| |
| LDt (s1) |
| CALL p256MulInternal(SB) |
| ST (s2) |
| |
| LDacc (z1in) |
| LDt (z2in) |
| CALL p256MulInternal(SB) // z1 * z2 |
| LDt (h) |
| CALL p256MulInternal(SB) // z1 * z2 * h |
| ST (zout) |
| |
| LDacc (hsqr) |
| LDt (u1) |
| CALL p256MulInternal(SB) // hˆ2 * u1 |
| ST (u2) |
| |
| p256MulBy2Inline // u1 * hˆ2 * 2, inline |
| LDacc (rsqr) |
| CALL p256SubInternal(SB) // rˆ2 - u1 * hˆ2 * 2 |
| |
| LDt (hcub) |
| CALL p256SubInternal(SB) |
| ST (xout) |
| |
| MOVQ acc4, t0 |
| MOVQ acc5, t1 |
| MOVQ acc6, t2 |
| MOVQ acc7, t3 |
| LDacc (u2) |
| CALL p256SubInternal(SB) |
| |
| LDt (r) |
| CALL p256MulInternal(SB) |
| |
| LDt (s2) |
| CALL p256SubInternal(SB) |
| ST (yout) |
| |
| MOVOU xout(16*0), X0 |
| MOVOU xout(16*1), X1 |
| MOVOU yout(16*0), X2 |
| MOVOU yout(16*1), X3 |
| MOVOU zout(16*0), X4 |
| MOVOU zout(16*1), X5 |
| // Finally output the result |
| MOVQ rptr, AX |
| MOVQ $0, rptr |
| MOVOU X0, (16*0)(AX) |
| MOVOU X1, (16*1)(AX) |
| MOVOU X2, (16*2)(AX) |
| MOVOU X3, (16*3)(AX) |
| MOVOU X4, (16*4)(AX) |
| MOVOU X5, (16*5)(AX) |
| |
| RET |
| #undef x1in |
| #undef y1in |
| #undef z1in |
| #undef x2in |
| #undef y2in |
| #undef z2in |
| #undef xout |
| #undef yout |
| #undef zout |
| #undef s1 |
| #undef s2 |
| #undef u1 |
| #undef u2 |
| #undef z1sqr |
| #undef z2sqr |
| #undef h |
| #undef r |
| #undef hsqr |
| #undef rsqr |
| #undef hcub |
| #undef rptr |
| /* ---------------------------------------*/ |
| #define x(off) (32*0 + off)(SP) |
| #define y(off) (32*1 + off)(SP) |
| #define z(off) (32*2 + off)(SP) |
| |
| #define s(off) (32*3 + off)(SP) |
| #define m(off) (32*4 + off)(SP) |
| #define zsqr(off) (32*5 + off)(SP) |
| #define tmp(off) (32*6 + off)(SP) |
| #define rptr (32*7)(SP) |
| |
// func p256PointDoubleAsm(res, in []uint64)
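// Jacobian point doubling with the standard formulas: m = 3*(x - z^2)*(x + z^2),
// z3 = 2*y*z, s = 4*x*y^2, x3 = m^2 - 2*s, y3 = m*(s - x3) - 8*y^4, in the Montgomery
// domain. The "divide by 2" below conditionally adds p first so that an odd value
// remains exact under the one-bit right shift.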
| TEXT ·p256PointDoubleAsm(SB),NOSPLIT,$256-48 |
| // Move input to stack in order to free registers |
| MOVQ res+0(FP), AX |
| MOVQ in+24(FP), BX |
| |
| MOVOU (16*0)(BX), X0 |
| MOVOU (16*1)(BX), X1 |
| MOVOU (16*2)(BX), X2 |
| MOVOU (16*3)(BX), X3 |
| MOVOU (16*4)(BX), X4 |
| MOVOU (16*5)(BX), X5 |
| |
| MOVOU X0, x(16*0) |
| MOVOU X1, x(16*1) |
| MOVOU X2, y(16*0) |
| MOVOU X3, y(16*1) |
| MOVOU X4, z(16*0) |
| MOVOU X5, z(16*1) |
| // Store pointer to result |
| MOVQ AX, rptr |
| // Begin point double |
| LDacc (z) |
| CALL p256SqrInternal(SB) |
| ST (zsqr) |
| |
| LDt (x) |
| p256AddInline |
| STt (m) |
| |
| LDacc (z) |
| LDt (y) |
| CALL p256MulInternal(SB) |
| p256MulBy2Inline |
| MOVQ rptr, AX |
| // Store z |
| MOVQ t0, (16*4 + 8*0)(AX) |
| MOVQ t1, (16*4 + 8*1)(AX) |
| MOVQ t2, (16*4 + 8*2)(AX) |
| MOVQ t3, (16*4 + 8*3)(AX) |
| |
| LDacc (x) |
| LDt (zsqr) |
| CALL p256SubInternal(SB) |
| LDt (m) |
| CALL p256MulInternal(SB) |
| ST (m) |
| // Multiply by 3 |
| p256MulBy2Inline |
| LDacc (m) |
| p256AddInline |
| STt (m) |
| //////////////////////// |
| LDacc (y) |
| p256MulBy2Inline |
| t2acc |
| CALL p256SqrInternal(SB) |
| ST (s) |
| CALL p256SqrInternal(SB) |
| // Divide by 2 |
| XORQ mul0, mul0 |
| MOVQ acc4, t0 |
| MOVQ acc5, t1 |
| MOVQ acc6, t2 |
| MOVQ acc7, t3 |
| |
| ADDQ $-1, acc4 |
| ADCQ p256const0<>(SB), acc5 |
| ADCQ $0, acc6 |
| ADCQ p256const1<>(SB), acc7 |
| ADCQ $0, mul0 |
| TESTQ $1, t0 |
| |
| CMOVQEQ t0, acc4 |
| CMOVQEQ t1, acc5 |
| CMOVQEQ t2, acc6 |
| CMOVQEQ t3, acc7 |
| ANDQ t0, mul0 |
| |
| SHRQ $1, acc4:acc5 |
| SHRQ $1, acc5:acc6 |
| SHRQ $1, acc6:acc7 |
| SHRQ $1, acc7:mul0 |
| ST (y) |
| ///////////////////////// |
| LDacc (x) |
| LDt (s) |
| CALL p256MulInternal(SB) |
| ST (s) |
| p256MulBy2Inline |
| STt (tmp) |
| |
| LDacc (m) |
| CALL p256SqrInternal(SB) |
| LDt (tmp) |
| CALL p256SubInternal(SB) |
| |
| MOVQ rptr, AX |
| // Store x |
| MOVQ acc4, (16*0 + 8*0)(AX) |
| MOVQ acc5, (16*0 + 8*1)(AX) |
| MOVQ acc6, (16*0 + 8*2)(AX) |
| MOVQ acc7, (16*0 + 8*3)(AX) |
| |
| acc2t |
| LDacc (s) |
| CALL p256SubInternal(SB) |
| |
| LDt (m) |
| CALL p256MulInternal(SB) |
| |
| LDt (y) |
| CALL p256SubInternal(SB) |
| MOVQ rptr, AX |
| // Store y |
| MOVQ acc4, (16*2 + 8*0)(AX) |
| MOVQ acc5, (16*2 + 8*1)(AX) |
| MOVQ acc6, (16*2 + 8*2)(AX) |
| MOVQ acc7, (16*2 + 8*3)(AX) |
| /////////////////////// |
| MOVQ $0, rptr |
| |
| RET |
| /* ---------------------------------------*/ |
| |