| // Code generated by command: go run p256_asm.go -out ../p256_asm_amd64.s. DO NOT EDIT. |
| |
| //go:build !purego |
| |
| #include "textflag.h" |
| |
| // func p256MovCond(res *P256Point, a *P256Point, b *P256Point, cond int) |
| // Requires: SSE2 |
| TEXT ·p256MovCond(SB), NOSPLIT, $0-32 |
| // Selection mask: broadcast the low 32 bits of cond and compare with zero, |
| // so X12 is all ones when they are zero and all zeros otherwise. |
| MOVQ cond+24(FP), X12 |
| PSHUFD $0x00, X12, X12 |
| PXOR X13, X13 |
| PCMPEQL X13, X12 |
| |
| MOVQ a+8(FP), SI |
| MOVQ b+16(FP), CX |
| MOVQ res+0(FP), DI |
| |
| // Each 16-byte chunk becomes (a AND NOT mask) XOR (b AND mask). |
| // All six chunks are read before anything is written to res. |
| // Bytes 0..15 |
| MOVOU (SI), X6 |
| MOVOU (CX), X7 |
| MOVOU X12, X0 |
| PANDN X6, X0 |
| PAND X12, X7 |
| PXOR X7, X0 |
| |
| // Bytes 16..31 |
| MOVOU 16(SI), X6 |
| MOVOU 16(CX), X7 |
| MOVOU X12, X1 |
| PANDN X6, X1 |
| PAND X12, X7 |
| PXOR X7, X1 |
| |
| // Bytes 32..47 |
| MOVOU 32(SI), X6 |
| MOVOU 32(CX), X7 |
| MOVOU X12, X2 |
| PANDN X6, X2 |
| PAND X12, X7 |
| PXOR X7, X2 |
| |
| // Bytes 48..63 |
| MOVOU 48(SI), X6 |
| MOVOU 48(CX), X7 |
| MOVOU X12, X3 |
| PANDN X6, X3 |
| PAND X12, X7 |
| PXOR X7, X3 |
| |
| // Bytes 64..79 |
| MOVOU 64(SI), X6 |
| MOVOU 64(CX), X7 |
| MOVOU X12, X4 |
| PANDN X6, X4 |
| PAND X12, X7 |
| PXOR X7, X4 |
| |
| // Bytes 80..95 |
| MOVOU 80(SI), X6 |
| MOVOU 80(CX), X7 |
| MOVOU X12, X5 |
| PANDN X6, X5 |
| PAND X12, X7 |
| PXOR X7, X5 |
| |
| // Write the 96 selected bytes to res. |
| MOVOU X0, (DI) |
| MOVOU X1, 16(DI) |
| MOVOU X2, 32(DI) |
| MOVOU X3, 48(DI) |
| MOVOU X4, 64(DI) |
| MOVOU X5, 80(DI) |
| RET |
| |
| // func p256NegCond(val *p256Element, cond int) |
| // Requires: CMOV |
| TEXT ·p256NegCond(SB), NOSPLIT, $0-16 |
| MOVQ cond+8(FP), R14 |
| MOVQ val+0(FP), DI |
| |
| // Original limbs of val, kept for the cond == 0 case |
| MOVQ (DI), R13 |
| MOVQ 8(DI), SI |
| MOVQ 16(DI), CX |
| MOVQ 24(DI), R15 |
| |
| // R8..R11 = poly, least-significant limb first |
| MOVQ $-1, R8 |
| MOVQ p256const0<>+0(SB), R9 |
| MOVQ $+0, R10 |
| MOVQ p256const1<>+0(SB), R11 |
| |
| // R8..R11 = poly - val, computed whether or not it will be used |
| SUBQ R13, R8 |
| SBBQ SI, R9 |
| SBBQ CX, R10 |
| SBBQ R15, R11 |
| |
| // cond == 0 puts the original limb back; each limb is written to val |
| // as soon as it has been selected (the stores leave the flags alone). |
| TESTQ R14, R14 |
| CMOVQEQ R13, R8 |
| MOVQ R8, (DI) |
| CMOVQEQ SI, R9 |
| MOVQ R9, 8(DI) |
| CMOVQEQ CX, R10 |
| MOVQ R10, 16(DI) |
| CMOVQEQ R15, R11 |
| MOVQ R11, 24(DI) |
| RET |
| |
| // p256const0 is the second limb of poly as loaded above. |
| DATA p256const0<>+0(SB)/8, $0x00000000ffffffff |
| GLOBL p256const0<>(SB), RODATA, $8 |
| |
| // p256const1 is the fourth (most significant) limb of poly as loaded above. |
| DATA p256const1<>+0(SB)/8, $0xffffffff00000001 |
| GLOBL p256const1<>(SB), RODATA, $8 |
| |
| // func p256Sqr(res *p256Element, in *p256Element, n int) |
| // Requires: CMOV. Squares in (with the reduction steps below) and stores it to res, then repeats on res until n passes are done; the counter is only tested after a pass, so the first pass always runs. |
| TEXT ·p256Sqr(SB), NOSPLIT, $0-24 |
| MOVQ res+0(FP), DI |
| MOVQ in+8(FP), SI |
| MOVQ n+16(FP), BX |
| |
| sqrLoop: |
| // y[1:] * y[0]: products of limb 0 with limbs 1..3, laid out in R9..R12 |
| MOVQ (SI), R14 |
| MOVQ 8(SI), AX |
| MULQ R14 |
| MOVQ AX, R9 |
| MOVQ DX, R10 |
| MOVQ 16(SI), AX |
| MULQ R14 |
| ADDQ AX, R10 |
| ADCQ $0x00, DX |
| MOVQ DX, R11 |
| MOVQ 24(SI), AX |
| MULQ R14 |
| ADDQ AX, R11 |
| ADCQ $0x00, DX |
| MOVQ DX, R12 |
| |
| // y[2:] * y[1]: products of limb 1 with limbs 2..3, added into R11..R13 |
| MOVQ 8(SI), R14 |
| MOVQ 16(SI), AX |
| MULQ R14 |
| ADDQ AX, R11 |
| ADCQ $0x00, DX |
| MOVQ DX, R15 |
| MOVQ 24(SI), AX |
| MULQ R14 |
| ADDQ R15, R12 |
| ADCQ $0x00, DX |
| ADDQ AX, R12 |
| ADCQ $0x00, DX |
| MOVQ DX, R13 |
| |
| // y[3] * y[2]: added into R13 with the high word in CX; R15 is cleared to receive the carry of the doubling |
| MOVQ 16(SI), R14 |
| MOVQ 24(SI), AX |
| MULQ R14 |
| ADDQ AX, R13 |
| ADCQ $0x00, DX |
| MOVQ DX, CX |
| XORQ R15, R15 |
| |
| // *2: double the off-diagonal sum held in R9..R13, CX; the carry out goes to R15 |
| ADDQ R9, R9 |
| ADCQ R10, R10 |
| ADCQ R11, R11 |
| ADCQ R12, R12 |
| ADCQ R13, R13 |
| ADCQ CX, CX |
| ADCQ $0x00, R15 |
| |
| // Missing products: add the squares y[i]*y[i]; the 512-bit result is then R8..R13, CX, R15, and R15 is parked in SI (SI is not used as a pointer again in this pass) |
| MOVQ (SI), AX |
| MULQ AX |
| MOVQ AX, R8 |
| MOVQ DX, R14 |
| MOVQ 8(SI), AX |
| MULQ AX |
| ADDQ R14, R9 |
| ADCQ AX, R10 |
| ADCQ $0x00, DX |
| MOVQ DX, R14 |
| MOVQ 16(SI), AX |
| MULQ AX |
| ADDQ R14, R11 |
| ADCQ AX, R12 |
| ADCQ $0x00, DX |
| MOVQ DX, R14 |
| MOVQ 24(SI), AX |
| MULQ AX |
| ADDQ R14, R13 |
| ADCQ AX, CX |
| ADCQ DX, R15 |
| MOVQ R15, SI |
| |
| // First reduction step: eliminate limb R8 by adding R8<<32, R8>>32 and R8*p256const1 to the limbs above it; R8 is reused for the new top limb |
| MOVQ R8, AX |
| MOVQ R8, R15 |
| SHLQ $0x20, R8 |
| MULQ p256const1<>+0(SB) |
| SHRQ $0x20, R15 |
| ADDQ R8, R9 |
| ADCQ R15, R10 |
| ADCQ AX, R11 |
| ADCQ $0x00, DX |
| MOVQ DX, R8 |
| |
| // Second reduction step: same pattern, eliminating R9 and reusing it for the new top limb |
| MOVQ R9, AX |
| MOVQ R9, R15 |
| SHLQ $0x20, R9 |
| MULQ p256const1<>+0(SB) |
| SHRQ $0x20, R15 |
| ADDQ R9, R10 |
| ADCQ R15, R11 |
| ADCQ AX, R8 |
| ADCQ $0x00, DX |
| MOVQ DX, R9 |
| |
| // Third reduction step: same pattern, eliminating R10 and reusing it for the new top limb |
| MOVQ R10, AX |
| MOVQ R10, R15 |
| SHLQ $0x20, R10 |
| MULQ p256const1<>+0(SB) |
| SHRQ $0x20, R15 |
| ADDQ R10, R11 |
| ADCQ R15, R8 |
| ADCQ AX, R9 |
| ADCQ $0x00, DX |
| MOVQ DX, R10 |
| |
| // Last reduction step: eliminates R11; R14 is cleared first so it can collect the carry of the addition that follows |
| XORQ R14, R14 |
| MOVQ R11, AX |
| MOVQ R11, R15 |
| SHLQ $0x20, R11 |
| MULQ p256const1<>+0(SB) |
| SHRQ $0x20, R15 |
| ADDQ R11, R8 |
| ADCQ R15, R9 |
| ADCQ AX, R10 |
| ADCQ $0x00, DX |
| MOVQ DX, R11 |
| |
| // Add bits [511:256] of the sqr result (R12, R13, CX, SI); the sum stays in R8..R11 with its carry in R14, and is copied to R12, R13, CX, R15 |
| ADCQ R12, R8 |
| ADCQ R13, R9 |
| ADCQ CX, R10 |
| ADCQ SI, R11 |
| ADCQ $0x00, R14 |
| MOVQ R8, R12 |
| MOVQ R9, R13 |
| MOVQ R10, CX |
| MOVQ R11, R15 |
| |
| // Subtract p256 (limbs -1, p256const0, 0, p256const1); on borrow the CMOVQCS instructions restore the copy, then the result is stored to res and becomes the input of the next pass |
| SUBQ $-1, R8 |
| SBBQ p256const0<>+0(SB), R9 |
| SBBQ $0x00, R10 |
| SBBQ p256const1<>+0(SB), R11 |
| SBBQ $0x00, R14 |
| CMOVQCS R12, R8 |
| CMOVQCS R13, R9 |
| CMOVQCS CX, R10 |
| CMOVQCS R15, R11 |
| MOVQ R8, (DI) |
| MOVQ R9, 8(DI) |
| MOVQ R10, 16(DI) |
| MOVQ R11, 24(DI) |
| MOVQ DI, SI |
| DECQ BX |
| JNE sqrLoop |
| RET |
| |
| // func p256Mul(res *p256Element, in1 *p256Element, in2 *p256Element) |
| // Requires: CMOV. Multiplies in1 (x, read through SI) by in2 (y, read through CX) one limb of y at a time, with one reduction step after each row and a final conditional subtraction of p256; the result is stored to res. |
| TEXT ·p256Mul(SB), NOSPLIT, $0-24 |
| MOVQ res+0(FP), DI |
| MOVQ in1+8(FP), SI |
| MOVQ in2+16(FP), CX |
| |
| // x * y[0]: first row of the product, in R8..R12; R13 is cleared to take carries |
| MOVQ (CX), R14 |
| MOVQ (SI), AX |
| MULQ R14 |
| MOVQ AX, R8 |
| MOVQ DX, R9 |
| MOVQ 8(SI), AX |
| MULQ R14 |
| ADDQ AX, R9 |
| ADCQ $0x00, DX |
| MOVQ DX, R10 |
| MOVQ 16(SI), AX |
| MULQ R14 |
| ADDQ AX, R10 |
| ADCQ $0x00, DX |
| MOVQ DX, R11 |
| MOVQ 24(SI), AX |
| MULQ R14 |
| ADDQ AX, R11 |
| ADCQ $0x00, DX |
| MOVQ DX, R12 |
| XORQ R13, R13 |
| |
| // First reduction step: eliminate R8 by adding R8<<32, R8>>32 and R8*p256const1 to R9..R12 (carry into R13); R8 is then cleared for reuse as the top limb |
| MOVQ R8, AX |
| MOVQ R8, R15 |
| SHLQ $0x20, R8 |
| MULQ p256const1<>+0(SB) |
| SHRQ $0x20, R15 |
| ADDQ R8, R9 |
| ADCQ R15, R10 |
| ADCQ AX, R11 |
| ADCQ DX, R12 |
| ADCQ $0x00, R13 |
| XORQ R8, R8 |
| |
| // x * y[1]: row added into R9..R13, carry into R8 |
| MOVQ 8(CX), R14 |
| MOVQ (SI), AX |
| MULQ R14 |
| ADDQ AX, R9 |
| ADCQ $0x00, DX |
| MOVQ DX, R15 |
| MOVQ 8(SI), AX |
| MULQ R14 |
| ADDQ R15, R10 |
| ADCQ $0x00, DX |
| ADDQ AX, R10 |
| ADCQ $0x00, DX |
| MOVQ DX, R15 |
| MOVQ 16(SI), AX |
| MULQ R14 |
| ADDQ R15, R11 |
| ADCQ $0x00, DX |
| ADDQ AX, R11 |
| ADCQ $0x00, DX |
| MOVQ DX, R15 |
| MOVQ 24(SI), AX |
| MULQ R14 |
| ADDQ R15, R12 |
| ADCQ $0x00, DX |
| ADDQ AX, R12 |
| ADCQ DX, R13 |
| ADCQ $0x00, R8 |
| |
| // Second reduction step: same pattern, eliminating R9, which is then cleared for reuse as the top limb |
| MOVQ R9, AX |
| MOVQ R9, R15 |
| SHLQ $0x20, R9 |
| MULQ p256const1<>+0(SB) |
| SHRQ $0x20, R15 |
| ADDQ R9, R10 |
| ADCQ R15, R11 |
| ADCQ AX, R12 |
| ADCQ DX, R13 |
| ADCQ $0x00, R8 |
| XORQ R9, R9 |
| |
| // x * y[2]: row added into R10..R13 and R8, carry into R9 |
| MOVQ 16(CX), R14 |
| MOVQ (SI), AX |
| MULQ R14 |
| ADDQ AX, R10 |
| ADCQ $0x00, DX |
| MOVQ DX, R15 |
| MOVQ 8(SI), AX |
| MULQ R14 |
| ADDQ R15, R11 |
| ADCQ $0x00, DX |
| ADDQ AX, R11 |
| ADCQ $0x00, DX |
| MOVQ DX, R15 |
| MOVQ 16(SI), AX |
| MULQ R14 |
| ADDQ R15, R12 |
| ADCQ $0x00, DX |
| ADDQ AX, R12 |
| ADCQ $0x00, DX |
| MOVQ DX, R15 |
| MOVQ 24(SI), AX |
| MULQ R14 |
| ADDQ R15, R13 |
| ADCQ $0x00, DX |
| ADDQ AX, R13 |
| ADCQ DX, R8 |
| ADCQ $0x00, R9 |
| |
| // Third reduction step: same pattern, eliminating R10, which is then cleared for reuse as the top limb |
| MOVQ R10, AX |
| MOVQ R10, R15 |
| SHLQ $0x20, R10 |
| MULQ p256const1<>+0(SB) |
| SHRQ $0x20, R15 |
| ADDQ R10, R11 |
| ADCQ R15, R12 |
| ADCQ AX, R13 |
| ADCQ DX, R8 |
| ADCQ $0x00, R9 |
| XORQ R10, R10 |
| |
| // x * y[3]: row added into R11..R13, R8 and R9, carry into R10 |
| MOVQ 24(CX), R14 |
| MOVQ (SI), AX |
| MULQ R14 |
| ADDQ AX, R11 |
| ADCQ $0x00, DX |
| MOVQ DX, R15 |
| MOVQ 8(SI), AX |
| MULQ R14 |
| ADDQ R15, R12 |
| ADCQ $0x00, DX |
| ADDQ AX, R12 |
| ADCQ $0x00, DX |
| MOVQ DX, R15 |
| MOVQ 16(SI), AX |
| MULQ R14 |
| ADDQ R15, R13 |
| ADCQ $0x00, DX |
| ADDQ AX, R13 |
| ADCQ $0x00, DX |
| MOVQ DX, R15 |
| MOVQ 24(SI), AX |
| MULQ R14 |
| ADDQ R15, R8 |
| ADCQ $0x00, DX |
| ADDQ AX, R8 |
| ADCQ DX, R9 |
| ADCQ $0x00, R10 |
| |
| // Last reduction step: eliminates R11; the value is now R12, R13, R8, R9 with its carry in R10 |
| MOVQ R11, AX |
| MOVQ R11, R15 |
| SHLQ $0x20, R11 |
| MULQ p256const1<>+0(SB) |
| SHRQ $0x20, R15 |
| ADDQ R11, R12 |
| ADCQ R15, R13 |
| ADCQ AX, R8 |
| ADCQ DX, R9 |
| ADCQ $0x00, R10 |
| |
| // Copy result [255:0] to SI, R11, R14, R15 so it can be restored below |
| MOVQ R12, SI |
| MOVQ R13, R11 |
| MOVQ R8, R14 |
| MOVQ R9, R15 |
| |
| // Subtract p256 (limbs -1, p256const0, 0, p256const1); if the subtraction, including the carry limb R10, borrows, the copy is restored before the store to res |
| SUBQ $-1, R12 |
| SBBQ p256const0<>+0(SB), R13 |
| SBBQ $0x00, R8 |
| SBBQ p256const1<>+0(SB), R9 |
| SBBQ $0x00, R10 |
| CMOVQCS SI, R12 |
| CMOVQCS R11, R13 |
| CMOVQCS R14, R8 |
| CMOVQCS R15, R9 |
| MOVQ R12, (DI) |
| MOVQ R13, 8(DI) |
| MOVQ R8, 16(DI) |
| MOVQ R9, 24(DI) |
| RET |
| |
| // func p256FromMont(res *p256Element, in *p256Element) |
| // Requires: CMOV. Loads in into R8..R11, applies four reduction stages followed by a conditional subtraction of p256, and stores the result to res. |
| TEXT ·p256FromMont(SB), NOSPLIT, $0-16 |
| MOVQ res+0(FP), DI |
| MOVQ in+8(FP), SI |
| MOVQ (SI), R8 |
| MOVQ 8(SI), R9 |
| MOVQ 16(SI), R10 |
| MOVQ 24(SI), R11 |
| XORQ R12, R12 |
| |
| // Only reduce, no multiplications are needed (the input limbs are used directly as the accumulator; R12 was cleared above to take the first top limb) |
| // First stage: eliminate R8 by adding R8<<32, R8>>32 and R8*p256const1 to R9..R12; R13 is cleared for the next stage |
| MOVQ R8, AX |
| MOVQ R8, R15 |
| SHLQ $0x20, R8 |
| MULQ p256const1<>+0(SB) |
| SHRQ $0x20, R15 |
| ADDQ R8, R9 |
| ADCQ R15, R10 |
| ADCQ AX, R11 |
| ADCQ DX, R12 |
| XORQ R13, R13 |
| |
| // Second stage: same pattern, eliminating R9; R8 is cleared for the next stage |
| MOVQ R9, AX |
| MOVQ R9, R15 |
| SHLQ $0x20, R9 |
| MULQ p256const1<>+0(SB) |
| SHRQ $0x20, R15 |
| ADDQ R9, R10 |
| ADCQ R15, R11 |
| ADCQ AX, R12 |
| ADCQ DX, R13 |
| XORQ R8, R8 |
| |
| // Third stage: same pattern, eliminating R10; R9 is cleared for the next stage |
| MOVQ R10, AX |
| MOVQ R10, R15 |
| SHLQ $0x20, R10 |
| MULQ p256const1<>+0(SB) |
| SHRQ $0x20, R15 |
| ADDQ R10, R11 |
| ADCQ R15, R12 |
| ADCQ AX, R13 |
| ADCQ DX, R8 |
| XORQ R9, R9 |
| |
| // Last stage: eliminates R11, leaving the value in R12, R13, R8, R9; it is copied to SI, R11, R14, R15, p256 is subtracted, and the copy is restored if that subtraction borrows |
| MOVQ R11, AX |
| MOVQ R11, R15 |
| SHLQ $0x20, R11 |
| MULQ p256const1<>+0(SB) |
| SHRQ $0x20, R15 |
| ADDQ R11, R12 |
| ADCQ R15, R13 |
| ADCQ AX, R8 |
| ADCQ DX, R9 |
| MOVQ R12, SI |
| MOVQ R13, R11 |
| MOVQ R8, R14 |
| MOVQ R9, R15 |
| SUBQ $-1, R12 |
| SBBQ p256const0<>+0(SB), R13 |
| SBBQ $0x00, R8 |
| SBBQ p256const1<>+0(SB), R9 |
| CMOVQCS SI, R12 |
| CMOVQCS R11, R13 |
| CMOVQCS R14, R8 |
| CMOVQCS R15, R9 |
| MOVQ R12, (DI) |
| MOVQ R13, 8(DI) |
| MOVQ R8, 16(DI) |
| MOVQ R9, 24(DI) |
| RET |
| |
| // func p256Select(res *P256Point, table *p256Table, idx int) |
| // Requires: SSE2 |
| TEXT ·p256Select(SB), NOSPLIT, $0-24 |
| MOVQ res+0(FP), DX |
| MOVQ table+8(FP), DI |
| MOVQ idx+16(FP), AX |
| |
| // X14 = low 32 bits of idx in every 32-bit lane. |
| MOVL AX, X14 |
| PSHUFD $0x00, X14, X14 |
| |
| // X15 = 1 in every lane (zero minus all-ones); X13 is the running |
| // entry number and starts at 1. |
| PXOR X15, X15 |
| PCMPEQL X12, X12 |
| PSUBL X12, X15 |
| MOVOU X15, X13 |
| |
| // Clear the 96-byte accumulator. |
| PXOR X0, X0 |
| PXOR X1, X1 |
| PXOR X2, X2 |
| PXOR X3, X3 |
| PXOR X4, X4 |
| PXOR X5, X5 |
| |
| // Every one of the 16 entries is read, whatever idx is. |
| MOVQ $0x00000010, AX |
| |
| loop_select: |
| // X12 = all ones if this entry's number equals idx, else zero; |
| // then advance the entry number. |
| MOVOU X13, X12 |
| PCMPEQL X14, X12 |
| PADDL X15, X13 |
| |
| // Fold the masked entry into the accumulator, one chunk at a time. |
| MOVOU (DI), X6 |
| PAND X12, X6 |
| PXOR X6, X0 |
| MOVOU 16(DI), X7 |
| PAND X12, X7 |
| PXOR X7, X1 |
| MOVOU 32(DI), X8 |
| PAND X12, X8 |
| PXOR X8, X2 |
| MOVOU 48(DI), X9 |
| PAND X12, X9 |
| PXOR X9, X3 |
| MOVOU 64(DI), X10 |
| PAND X12, X10 |
| PXOR X10, X4 |
| MOVOU 80(DI), X11 |
| PAND X12, X11 |
| PXOR X11, X5 |
| |
| // Step to the next 96-byte entry and count down. |
| ADDQ $0x60, DI |
| DECQ AX |
| JNE loop_select |
| |
| // Write the accumulated entry (all zeros if no number matched) to res. |
| MOVOU X0, (DX) |
| MOVOU X1, 16(DX) |
| MOVOU X2, 32(DX) |
| MOVOU X3, 48(DX) |
| MOVOU X4, 64(DX) |
| MOVOU X5, 80(DX) |
| RET |
| |
| // func p256SelectAffine(res *p256AffinePoint, table *p256AffineTable, idx int) |
| // Requires: SSE2 |
| TEXT ·p256SelectAffine(SB), NOSPLIT, $0-24 |
| MOVQ res+0(FP), DX |
| MOVQ table+8(FP), DI |
| MOVQ idx+16(FP), AX |
| |
| // X14 = low 32 bits of idx in every 32-bit lane. |
| MOVL AX, X14 |
| PSHUFD $0x00, X14, X14 |
| |
| // X15 = 1 in every lane (zero minus all-ones); X13 is the running |
| // entry number and starts at 1. |
| PXOR X15, X15 |
| PCMPEQL X12, X12 |
| PSUBL X12, X15 |
| MOVOU X15, X13 |
| |
| // Clear the 64-byte accumulator. |
| PXOR X0, X0 |
| PXOR X1, X1 |
| PXOR X2, X2 |
| PXOR X3, X3 |
| |
| // 16 passes, each covering two consecutive 64-byte entries. |
| MOVQ $0x00000010, AX |
| |
| loop_select_base: |
| // First entry of the pair: mask is all ones only if its number equals idx. |
| MOVOU X13, X12 |
| PCMPEQL X14, X12 |
| PADDL X15, X13 |
| MOVOU (DI), X4 |
| PAND X12, X4 |
| PXOR X4, X0 |
| MOVOU 16(DI), X5 |
| PAND X12, X5 |
| PXOR X5, X1 |
| MOVOU 32(DI), X6 |
| PAND X12, X6 |
| PXOR X6, X2 |
| MOVOU 48(DI), X7 |
| PAND X12, X7 |
| PXOR X7, X3 |
| |
| // Second entry of the pair, tested against the next entry number. |
| MOVOU X13, X12 |
| PCMPEQL X14, X12 |
| PADDL X15, X13 |
| MOVOU 64(DI), X8 |
| PAND X12, X8 |
| PXOR X8, X0 |
| MOVOU 80(DI), X9 |
| PAND X12, X9 |
| PXOR X9, X1 |
| MOVOU 96(DI), X10 |
| PAND X12, X10 |
| PXOR X10, X2 |
| MOVOU 112(DI), X11 |
| PAND X12, X11 |
| PXOR X11, X3 |
| |
| // Step over both entries and count down. |
| ADDQ $0x80, DI |
| DECQ AX |
| JNE loop_select_base |
| |
| // Write the accumulated entry (all zeros if no number matched) to res. |
| MOVOU X0, (DX) |
| MOVOU X1, 16(DX) |
| MOVOU X2, 32(DX) |
| MOVOU X3, 48(DX) |
| RET |
| |
| // func p256OrdMul(res *p256OrdElement, in1 *p256OrdElement, in2 *p256OrdElement) |
| // Requires: CMOV. Multiplies in1 (x, read through SI) by in2 (y, read through CX) one limb of y at a time, reducing after each row with p256ord<> and p256ordK0<> (both defined after the RET below), and stores the conditionally reduced result to res. |
| TEXT ·p256OrdMul(SB), NOSPLIT, $0-24 |
| MOVQ res+0(FP), DI |
| MOVQ in1+8(FP), SI |
| MOVQ in2+16(FP), CX |
| |
| // x * y[0]: first row of the product, in R8..R12; R13 is cleared to take carries |
| MOVQ (CX), R14 |
| MOVQ (SI), AX |
| MULQ R14 |
| MOVQ AX, R8 |
| MOVQ DX, R9 |
| MOVQ 8(SI), AX |
| MULQ R14 |
| ADDQ AX, R9 |
| ADCQ $0x00, DX |
| MOVQ DX, R10 |
| MOVQ 16(SI), AX |
| MULQ R14 |
| ADDQ AX, R10 |
| ADCQ $0x00, DX |
| MOVQ DX, R11 |
| MOVQ 24(SI), AX |
| MULQ R14 |
| ADDQ AX, R11 |
| ADCQ $0x00, DX |
| MOVQ DX, R12 |
| XORQ R13, R13 |
| |
| // First reduction step: R14 = low 64 bits of R8*p256ordK0, then R14*p256ord is added to R8..R11 with carries into R12 and R13 (R8 is reused as the top limb afterwards without being cleared; presumably p256ordK0 is chosen so that this addition zeroes it; confirm against the generator) |
| MOVQ R8, AX |
| MULQ p256ordK0<>+0(SB) |
| MOVQ AX, R14 |
| MOVQ p256ord<>+0(SB), AX |
| MULQ R14 |
| ADDQ AX, R8 |
| ADCQ $0x00, DX |
| MOVQ DX, R15 |
| MOVQ p256ord<>+8(SB), AX |
| MULQ R14 |
| ADDQ R15, R9 |
| ADCQ $0x00, DX |
| ADDQ AX, R9 |
| ADCQ $0x00, DX |
| MOVQ DX, R15 |
| MOVQ p256ord<>+16(SB), AX |
| MULQ R14 |
| ADDQ R15, R10 |
| ADCQ $0x00, DX |
| ADDQ AX, R10 |
| ADCQ $0x00, DX |
| MOVQ DX, R15 |
| MOVQ p256ord<>+24(SB), AX |
| MULQ R14 |
| ADDQ R15, R11 |
| ADCQ $0x00, DX |
| ADDQ AX, R11 |
| ADCQ DX, R12 |
| ADCQ $0x00, R13 |
| |
| // x * y[1]: row added into R9..R13, carry into R8 |
| MOVQ 8(CX), R14 |
| MOVQ (SI), AX |
| MULQ R14 |
| ADDQ AX, R9 |
| ADCQ $0x00, DX |
| MOVQ DX, R15 |
| MOVQ 8(SI), AX |
| MULQ R14 |
| ADDQ R15, R10 |
| ADCQ $0x00, DX |
| ADDQ AX, R10 |
| ADCQ $0x00, DX |
| MOVQ DX, R15 |
| MOVQ 16(SI), AX |
| MULQ R14 |
| ADDQ R15, R11 |
| ADCQ $0x00, DX |
| ADDQ AX, R11 |
| ADCQ $0x00, DX |
| MOVQ DX, R15 |
| MOVQ 24(SI), AX |
| MULQ R14 |
| ADDQ R15, R12 |
| ADCQ $0x00, DX |
| ADDQ AX, R12 |
| ADCQ DX, R13 |
| ADCQ $0x00, R8 |
| |
| // Second reduction step: same pattern with R9 as the limb being eliminated; carries go into R13 and R8 |
| MOVQ R9, AX |
| MULQ p256ordK0<>+0(SB) |
| MOVQ AX, R14 |
| MOVQ p256ord<>+0(SB), AX |
| MULQ R14 |
| ADDQ AX, R9 |
| ADCQ $0x00, DX |
| MOVQ DX, R15 |
| MOVQ p256ord<>+8(SB), AX |
| MULQ R14 |
| ADDQ R15, R10 |
| ADCQ $0x00, DX |
| ADDQ AX, R10 |
| ADCQ $0x00, DX |
| MOVQ DX, R15 |
| MOVQ p256ord<>+16(SB), AX |
| MULQ R14 |
| ADDQ R15, R11 |
| ADCQ $0x00, DX |
| ADDQ AX, R11 |
| ADCQ $0x00, DX |
| MOVQ DX, R15 |
| MOVQ p256ord<>+24(SB), AX |
| MULQ R14 |
| ADDQ R15, R12 |
| ADCQ $0x00, DX |
| ADDQ AX, R12 |
| ADCQ DX, R13 |
| ADCQ $0x00, R8 |
| |
| // x * y[2]: row added into R10..R13 and R8, carry into R9 |
| MOVQ 16(CX), R14 |
| MOVQ (SI), AX |
| MULQ R14 |
| ADDQ AX, R10 |
| ADCQ $0x00, DX |
| MOVQ DX, R15 |
| MOVQ 8(SI), AX |
| MULQ R14 |
| ADDQ R15, R11 |
| ADCQ $0x00, DX |
| ADDQ AX, R11 |
| ADCQ $0x00, DX |
| MOVQ DX, R15 |
| MOVQ 16(SI), AX |
| MULQ R14 |
| ADDQ R15, R12 |
| ADCQ $0x00, DX |
| ADDQ AX, R12 |
| ADCQ $0x00, DX |
| MOVQ DX, R15 |
| MOVQ 24(SI), AX |
| MULQ R14 |
| ADDQ R15, R13 |
| ADCQ $0x00, DX |
| ADDQ AX, R13 |
| ADCQ DX, R8 |
| ADCQ $0x00, R9 |
| |
| // Third reduction step: same pattern with R10 as the limb being eliminated; carries go into R8 and R9 |
| MOVQ R10, AX |
| MULQ p256ordK0<>+0(SB) |
| MOVQ AX, R14 |
| MOVQ p256ord<>+0(SB), AX |
| MULQ R14 |
| ADDQ AX, R10 |
| ADCQ $0x00, DX |
| MOVQ DX, R15 |
| MOVQ p256ord<>+8(SB), AX |
| MULQ R14 |
| ADDQ R15, R11 |
| ADCQ $0x00, DX |
| ADDQ AX, R11 |
| ADCQ $0x00, DX |
| MOVQ DX, R15 |
| MOVQ p256ord<>+16(SB), AX |
| MULQ R14 |
| ADDQ R15, R12 |
| ADCQ $0x00, DX |
| ADDQ AX, R12 |
| ADCQ $0x00, DX |
| MOVQ DX, R15 |
| MOVQ p256ord<>+24(SB), AX |
| MULQ R14 |
| ADDQ R15, R13 |
| ADCQ $0x00, DX |
| ADDQ AX, R13 |
| ADCQ DX, R8 |
| ADCQ $0x00, R9 |
| |
| // x * y[3]: row added into R11..R13, R8 and R9, carry into R10 |
| MOVQ 24(CX), R14 |
| MOVQ (SI), AX |
| MULQ R14 |
| ADDQ AX, R11 |
| ADCQ $0x00, DX |
| MOVQ DX, R15 |
| MOVQ 8(SI), AX |
| MULQ R14 |
| ADDQ R15, R12 |
| ADCQ $0x00, DX |
| ADDQ AX, R12 |
| ADCQ $0x00, DX |
| MOVQ DX, R15 |
| MOVQ 16(SI), AX |
| MULQ R14 |
| ADDQ R15, R13 |
| ADCQ $0x00, DX |
| ADDQ AX, R13 |
| ADCQ $0x00, DX |
| MOVQ DX, R15 |
| MOVQ 24(SI), AX |
| MULQ R14 |
| ADDQ R15, R8 |
| ADCQ $0x00, DX |
| ADDQ AX, R8 |
| ADCQ DX, R9 |
| ADCQ $0x00, R10 |
| |
| // Last reduction step: same pattern with R11 as the limb being eliminated; the value is now R12, R13, R8, R9 with its carry in R10 |
| MOVQ R11, AX |
| MULQ p256ordK0<>+0(SB) |
| MOVQ AX, R14 |
| MOVQ p256ord<>+0(SB), AX |
| MULQ R14 |
| ADDQ AX, R11 |
| ADCQ $0x00, DX |
| MOVQ DX, R15 |
| MOVQ p256ord<>+8(SB), AX |
| MULQ R14 |
| ADDQ R15, R12 |
| ADCQ $0x00, DX |
| ADDQ AX, R12 |
| ADCQ $0x00, DX |
| MOVQ DX, R15 |
| MOVQ p256ord<>+16(SB), AX |
| MULQ R14 |
| ADDQ R15, R13 |
| ADCQ $0x00, DX |
| ADDQ AX, R13 |
| ADCQ $0x00, DX |
| MOVQ DX, R15 |
| MOVQ p256ord<>+24(SB), AX |
| MULQ R14 |
| ADDQ R15, R8 |
| ADCQ $0x00, DX |
| ADDQ AX, R8 |
| ADCQ DX, R9 |
| ADCQ $0x00, R10 |
| |
| // Copy result [255:0] to SI, R11, R14, R15 so it can be restored below |
| MOVQ R12, SI |
| MOVQ R13, R11 |
| MOVQ R8, R14 |
| MOVQ R9, R15 |
| |
| // Subtract p256ord (the four p256ord<> limbs, not the field prime p256); if the subtraction, including the carry limb R10, borrows, the copy is restored before the store to res |
| SUBQ p256ord<>+0(SB), R12 |
| SBBQ p256ord<>+8(SB), R13 |
| SBBQ p256ord<>+16(SB), R8 |
| SBBQ p256ord<>+24(SB), R9 |
| SBBQ $0x00, R10 |
| CMOVQCS SI, R12 |
| CMOVQCS R11, R13 |
| CMOVQCS R14, R8 |
| CMOVQCS R15, R9 |
| MOVQ R12, (DI) |
| MOVQ R13, 8(DI) |
| MOVQ R8, 16(DI) |
| MOVQ R9, 24(DI) |
| RET |
| |
| DATA p256ordK0<>+0(SB)/8, $0xccd1c8aaee00bc4f |
| GLOBL p256ordK0<>(SB), RODATA, $8 |
| |
| DATA p256ord<>+0(SB)/8, $0xf3b9cac2fc632551 |
| DATA p256ord<>+8(SB)/8, $0xbce6faada7179e84 |
| DATA p256ord<>+16(SB)/8, $0xffffffffffffffff |
| DATA p256ord<>+24(SB)/8, $0xffffffff00000000 |
| GLOBL p256ord<>(SB), RODATA, $32 |
| |
| // func p256OrdSqr(res *p256OrdElement, in *p256OrdElement, n int) |
| // Requires: CMOV. Squares in, reduces with p256ord<> and p256ordK0<>, and stores to res, then repeats on res until n passes are done; the counter is only tested after a pass, so the first pass always runs. |
| TEXT ·p256OrdSqr(SB), NOSPLIT, $0-24 |
| MOVQ res+0(FP), DI |
| MOVQ in+8(FP), SI |
| MOVQ n+16(FP), BX |
| |
| ordSqrLoop: |
| // y[1:] * y[0]: products of limb 0 with limbs 1..3, laid out in R9..R12 |
| MOVQ (SI), R14 |
| MOVQ 8(SI), AX |
| MULQ R14 |
| MOVQ AX, R9 |
| MOVQ DX, R10 |
| MOVQ 16(SI), AX |
| MULQ R14 |
| ADDQ AX, R10 |
| ADCQ $0x00, DX |
| MOVQ DX, R11 |
| MOVQ 24(SI), AX |
| MULQ R14 |
| ADDQ AX, R11 |
| ADCQ $0x00, DX |
| MOVQ DX, R12 |
| |
| // y[2:] * y[1]: products of limb 1 with limbs 2..3, added into R11..R13 |
| MOVQ 8(SI), R14 |
| MOVQ 16(SI), AX |
| MULQ R14 |
| ADDQ AX, R11 |
| ADCQ $0x00, DX |
| MOVQ DX, R15 |
| MOVQ 24(SI), AX |
| MULQ R14 |
| ADDQ R15, R12 |
| ADCQ $0x00, DX |
| ADDQ AX, R12 |
| ADCQ $0x00, DX |
| MOVQ DX, R13 |
| |
| // y[3] * y[2]: added into R13 with the high word in CX; R15 is cleared to receive the carry of the doubling |
| MOVQ 16(SI), R14 |
| MOVQ 24(SI), AX |
| MULQ R14 |
| ADDQ AX, R13 |
| ADCQ $0x00, DX |
| MOVQ DX, CX |
| XORQ R15, R15 |
| |
| // *2: double the off-diagonal sum held in R9..R13, CX; the carry out goes to R15 |
| ADDQ R9, R9 |
| ADCQ R10, R10 |
| ADCQ R11, R11 |
| ADCQ R12, R12 |
| ADCQ R13, R13 |
| ADCQ CX, CX |
| ADCQ $0x00, R15 |
| |
| // Missing products: add the squares y[i]*y[i]; the 512-bit result is then R8..R13, CX, R15, and R15 is parked in SI (SI is not used as a pointer again in this pass) |
| MOVQ (SI), AX |
| MULQ AX |
| MOVQ AX, R8 |
| MOVQ DX, R14 |
| MOVQ 8(SI), AX |
| MULQ AX |
| ADDQ R14, R9 |
| ADCQ AX, R10 |
| ADCQ $0x00, DX |
| MOVQ DX, R14 |
| MOVQ 16(SI), AX |
| MULQ AX |
| ADDQ R14, R11 |
| ADCQ AX, R12 |
| ADCQ $0x00, DX |
| MOVQ DX, R14 |
| MOVQ 24(SI), AX |
| MULQ AX |
| ADDQ R14, R13 |
| ADCQ AX, CX |
| ADCQ DX, R15 |
| MOVQ R15, SI |
| |
| // First reduction step: R14 = low 64 bits of R8*p256ordK0; limbs 0 and 1 of p256ord are multiplied by R14 with MULQ, while the contribution of limbs 2 and 3 (0xffffffffffffffff and 0xffffffff00000000) is formed from R14 with shifts, adds and subtracts; R8 is reused for the new top limb |
| MOVQ R8, AX |
| MULQ p256ordK0<>+0(SB) |
| MOVQ AX, R14 |
| MOVQ p256ord<>+0(SB), AX |
| MULQ R14 |
| ADDQ AX, R8 |
| ADCQ $0x00, DX |
| MOVQ DX, R15 |
| MOVQ p256ord<>+8(SB), AX |
| MULQ R14 |
| ADDQ R15, R9 |
| ADCQ $0x00, DX |
| ADDQ AX, R9 |
| MOVQ R14, R15 |
| ADCQ DX, R10 |
| ADCQ $0x00, R15 |
| SUBQ R14, R10 |
| SBBQ $0x00, R15 |
| MOVQ R14, AX |
| MOVQ R14, DX |
| MOVQ R14, R8 |
| SHLQ $0x20, AX |
| SHRQ $0x20, DX |
| ADDQ R15, R11 |
| ADCQ $0x00, R8 |
| SUBQ AX, R11 |
| SBBQ DX, R8 |
| |
| // Second reduction step: same pattern with R9 as the limb being eliminated; R9 is reused for the new top limb |
| MOVQ R9, AX |
| MULQ p256ordK0<>+0(SB) |
| MOVQ AX, R14 |
| MOVQ p256ord<>+0(SB), AX |
| MULQ R14 |
| ADDQ AX, R9 |
| ADCQ $0x00, DX |
| MOVQ DX, R15 |
| MOVQ p256ord<>+8(SB), AX |
| MULQ R14 |
| ADDQ R15, R10 |
| ADCQ $0x00, DX |
| ADDQ AX, R10 |
| MOVQ R14, R15 |
| ADCQ DX, R11 |
| ADCQ $0x00, R15 |
| SUBQ R14, R11 |
| SBBQ $0x00, R15 |
| MOVQ R14, AX |
| MOVQ R14, DX |
| MOVQ R14, R9 |
| SHLQ $0x20, AX |
| SHRQ $0x20, DX |
| ADDQ R15, R8 |
| ADCQ $0x00, R9 |
| SUBQ AX, R8 |
| SBBQ DX, R9 |
| |
| // Third reduction step: same pattern with R10 as the limb being eliminated; R10 is reused for the new top limb |
| MOVQ R10, AX |
| MULQ p256ordK0<>+0(SB) |
| MOVQ AX, R14 |
| MOVQ p256ord<>+0(SB), AX |
| MULQ R14 |
| ADDQ AX, R10 |
| ADCQ $0x00, DX |
| MOVQ DX, R15 |
| MOVQ p256ord<>+8(SB), AX |
| MULQ R14 |
| ADDQ R15, R11 |
| ADCQ $0x00, DX |
| ADDQ AX, R11 |
| MOVQ R14, R15 |
| ADCQ DX, R8 |
| ADCQ $0x00, R15 |
| SUBQ R14, R8 |
| SBBQ $0x00, R15 |
| MOVQ R14, AX |
| MOVQ R14, DX |
| MOVQ R14, R10 |
| SHLQ $0x20, AX |
| SHRQ $0x20, DX |
| ADDQ R15, R9 |
| ADCQ $0x00, R10 |
| SUBQ AX, R9 |
| SBBQ DX, R10 |
| |
| // Last reduction step: same pattern with R11 as the limb being eliminated; unlike the earlier steps the carry is first folded into DX by ADCQ $0x00, DX, and the MOVQ DX, R15 that follows is immediately overwritten by MOVQ R14, R15; R14 is cleared at the end to collect the carry of the addition below |
| MOVQ R11, AX |
| MULQ p256ordK0<>+0(SB) |
| MOVQ AX, R14 |
| MOVQ p256ord<>+0(SB), AX |
| MULQ R14 |
| ADDQ AX, R11 |
| ADCQ $0x00, DX |
| MOVQ DX, R15 |
| MOVQ p256ord<>+8(SB), AX |
| MULQ R14 |
| ADDQ R15, R8 |
| ADCQ $0x00, DX |
| ADDQ AX, R8 |
| ADCQ $0x00, DX |
| MOVQ DX, R15 |
| MOVQ R14, R15 |
| ADCQ DX, R9 |
| ADCQ $0x00, R15 |
| SUBQ R14, R9 |
| SBBQ $0x00, R15 |
| MOVQ R14, AX |
| MOVQ R14, DX |
| MOVQ R14, R11 |
| SHLQ $0x20, AX |
| SHRQ $0x20, DX |
| ADDQ R15, R10 |
| ADCQ $0x00, R11 |
| SUBQ AX, R10 |
| SBBQ DX, R11 |
| XORQ R14, R14 |
| |
| // Add bits [511:256] of the sqr result (R12, R13, CX, SI); the sum stays in R8..R11 with its carry in R14, and is copied to R12, R13, CX, R15 |
| ADCQ R12, R8 |
| ADCQ R13, R9 |
| ADCQ CX, R10 |
| ADCQ SI, R11 |
| ADCQ $0x00, R14 |
| MOVQ R8, R12 |
| MOVQ R9, R13 |
| MOVQ R10, CX |
| MOVQ R11, R15 |
| |
| // Subtract p256ord (the four p256ord<> limbs, not the field prime p256); on borrow the CMOVQCS instructions restore the copy, then the result is stored to res and becomes the input of the next pass |
| SUBQ p256ord<>+0(SB), R8 |
| SBBQ p256ord<>+8(SB), R9 |
| SBBQ p256ord<>+16(SB), R10 |
| SBBQ p256ord<>+24(SB), R11 |
| SBBQ $0x00, R14 |
| CMOVQCS R12, R8 |
| CMOVQCS R13, R9 |
| CMOVQCS CX, R10 |
| CMOVQCS R15, R11 |
| MOVQ R8, (DI) |
| MOVQ R9, 8(DI) |
| MOVQ R10, 16(DI) |
| MOVQ R11, 24(DI) |
| MOVQ DI, SI |
| DECQ BX |
| JNE ordSqrLoop |
| RET |
| |
| // func p256SubInternal() |
| // Requires: CMOV. Register-only helper: subtracts the value in R14, R15, DI, SI from the value in R10..R13 (least-significant limb first); if that borrows, the sum with p256 (limbs -1, p256const0, 0, p256const1) is kept, otherwise the plain difference saved in BX, CX, R8, R9 is restored. Result in R10..R13; AX, BX, CX, R8, R9 are overwritten and R14, R15, DI, SI are only read. |
| TEXT p256SubInternal(SB), NOSPLIT, $0 |
| XORQ AX, AX |
| SUBQ R14, R10 |
| SBBQ R15, R11 |
| SBBQ DI, R12 |
| SBBQ SI, R13 |
| SBBQ $0x00, AX |
| MOVQ R10, BX |
| MOVQ R11, CX |
| MOVQ R12, R8 |
| MOVQ R13, R9 |
| ADDQ $-1, R10 |
| ADCQ p256const0<>+0(SB), R11 |
| ADCQ $0x00, R12 |
| ADCQ p256const1<>+0(SB), R13 |
| ANDQ $0x01, AX |
| CMOVQEQ BX, R10 |
| CMOVQEQ CX, R11 |
| CMOVQEQ R8, R12 |
| CMOVQEQ R9, R13 |
| RET |
| |
| // func p256MulInternal() |
| // Requires: CMOV. Register-only helper: multiplies the value in R10..R13 by the value in R14, R15, DI, SI (which are only read) and leaves the reduced result in R10..R13; AX, DX, BX, CX, R8, R9 and BP are overwritten. The 512-bit product is built limb by limb in BX, CX, R8, R9, R10..R13 before the first reduction step. |
| TEXT p256MulInternal(SB), NOSPLIT, $8 |
| MOVQ R10, AX |
| MULQ R14 |
| MOVQ AX, BX |
| MOVQ DX, CX |
| MOVQ R10, AX |
| MULQ R15 |
| ADDQ AX, CX |
| ADCQ $0x00, DX |
| MOVQ DX, R8 |
| MOVQ R10, AX |
| MULQ DI |
| ADDQ AX, R8 |
| ADCQ $0x00, DX |
| MOVQ DX, R9 |
| MOVQ R10, AX |
| MULQ SI |
| ADDQ AX, R9 |
| ADCQ $0x00, DX |
| MOVQ DX, R10 |
| MOVQ R11, AX |
| MULQ R14 |
| ADDQ AX, CX |
| ADCQ $0x00, DX |
| MOVQ DX, BP |
| MOVQ R11, AX |
| MULQ R15 |
| ADDQ BP, R8 |
| ADCQ $0x00, DX |
| ADDQ AX, R8 |
| ADCQ $0x00, DX |
| MOVQ DX, BP |
| MOVQ R11, AX |
| MULQ DI |
| ADDQ BP, R9 |
| ADCQ $0x00, DX |
| ADDQ AX, R9 |
| ADCQ $0x00, DX |
| MOVQ DX, BP |
| MOVQ R11, AX |
| MULQ SI |
| ADDQ BP, R10 |
| ADCQ $0x00, DX |
| ADDQ AX, R10 |
| ADCQ $0x00, DX |
| MOVQ DX, R11 |
| MOVQ R12, AX |
| MULQ R14 |
| ADDQ AX, R8 |
| ADCQ $0x00, DX |
| MOVQ DX, BP |
| MOVQ R12, AX |
| MULQ R15 |
| ADDQ BP, R9 |
| ADCQ $0x00, DX |
| ADDQ AX, R9 |
| ADCQ $0x00, DX |
| MOVQ DX, BP |
| MOVQ R12, AX |
| MULQ DI |
| ADDQ BP, R10 |
| ADCQ $0x00, DX |
| ADDQ AX, R10 |
| ADCQ $0x00, DX |
| MOVQ DX, BP |
| MOVQ R12, AX |
| MULQ SI |
| ADDQ BP, R11 |
| ADCQ $0x00, DX |
| ADDQ AX, R11 |
| ADCQ $0x00, DX |
| MOVQ DX, R12 |
| MOVQ R13, AX |
| MULQ R14 |
| ADDQ AX, R9 |
| ADCQ $0x00, DX |
| MOVQ DX, BP |
| MOVQ R13, AX |
| MULQ R15 |
| ADDQ BP, R10 |
| ADCQ $0x00, DX |
| ADDQ AX, R10 |
| ADCQ $0x00, DX |
| MOVQ DX, BP |
| MOVQ R13, AX |
| MULQ DI |
| ADDQ BP, R11 |
| ADCQ $0x00, DX |
| ADDQ AX, R11 |
| ADCQ $0x00, DX |
| MOVQ DX, BP |
| MOVQ R13, AX |
| MULQ SI |
| ADDQ BP, R12 |
| ADCQ $0x00, DX |
| ADDQ AX, R12 |
| ADCQ $0x00, DX |
| MOVQ DX, R13 |
| |
| // First reduction step: eliminate limb BX by adding BX<<32, BX>>32 and BX*p256const1 to CX, R8, R9; BX is reused for the new top limb |
| MOVQ BX, AX |
| MOVQ BX, BP |
| SHLQ $0x20, BX |
| MULQ p256const1<>+0(SB) |
| SHRQ $0x20, BP |
| ADDQ BX, CX |
| ADCQ BP, R8 |
| ADCQ AX, R9 |
| ADCQ $0x00, DX |
| MOVQ DX, BX |
| |
| // Second reduction step: same pattern, eliminating CX and reusing it for the new top limb |
| MOVQ CX, AX |
| MOVQ CX, BP |
| SHLQ $0x20, CX |
| MULQ p256const1<>+0(SB) |
| SHRQ $0x20, BP |
| ADDQ CX, R8 |
| ADCQ BP, R9 |
| ADCQ AX, BX |
| ADCQ $0x00, DX |
| MOVQ DX, CX |
| |
| // Third reduction step: same pattern, eliminating R8 and reusing it for the new top limb |
| MOVQ R8, AX |
| MOVQ R8, BP |
| SHLQ $0x20, R8 |
| MULQ p256const1<>+0(SB) |
| SHRQ $0x20, BP |
| ADDQ R8, R9 |
| ADCQ BP, BX |
| ADCQ AX, CX |
| ADCQ $0x00, DX |
| MOVQ DX, R8 |
| |
| // Last reduction step: eliminates R9; BP is then set to zero with MOVQ so it can collect the carry of the addition below |
| MOVQ R9, AX |
| MOVQ R9, BP |
| SHLQ $0x20, R9 |
| MULQ p256const1<>+0(SB) |
| SHRQ $0x20, BP |
| ADDQ R9, BX |
| ADCQ BP, CX |
| ADCQ AX, R8 |
| ADCQ $0x00, DX |
| MOVQ DX, R9 |
| MOVQ $0x00000000, BP |
| |
| // Add bits [511:256] of the result (R10..R13) to the reduced low half (BX, CX, R8, R9); the sum stays in R10..R13 with its carry in BP |
| ADCQ BX, R10 |
| ADCQ CX, R11 |
| ADCQ R8, R12 |
| ADCQ R9, R13 |
| ADCQ $0x00, BP |
| |
| // Copy result to BX, CX, R8, R9 so it can be restored below |
| MOVQ R10, BX |
| MOVQ R11, CX |
| MOVQ R12, R8 |
| MOVQ R13, R9 |
| |
| // Subtract p256 (limbs -1, p256const0, 0, p256const1), with the borrow propagated through the carry limb BP |
| SUBQ $-1, R10 |
| SBBQ p256const0<>+0(SB), R11 |
| SBBQ $0x00, R12 |
| SBBQ p256const1<>+0(SB), R13 |
| SBBQ $0x00, BP |
| |
| // If the result of the subtraction is negative (carry flag set by the SBBQ chain), restore the previous result |
| CMOVQCS BX, R10 |
| CMOVQCS CX, R11 |
| CMOVQCS R8, R12 |
| CMOVQCS R9, R13 |
| RET |
| |
| // func p256SqrInternal() |
| // Requires: CMOV. Register-only helper: squares the value in R10..R13 and leaves the reduced result in R10..R13; R14, R15, DI, SI, AX, DX, BX, CX, R8, R9 and BP are overwritten. The products of distinct limbs are accumulated first, in CX, R8, R9, R14, R15, DI, with SI cleared to take the carry of the doubling. |
| TEXT p256SqrInternal(SB), NOSPLIT, $8 |
| MOVQ R10, AX |
| MULQ R11 |
| MOVQ AX, CX |
| MOVQ DX, R8 |
| MOVQ R10, AX |
| MULQ R12 |
| ADDQ AX, R8 |
| ADCQ $0x00, DX |
| MOVQ DX, R9 |
| MOVQ R10, AX |
| MULQ R13 |
| ADDQ AX, R9 |
| ADCQ $0x00, DX |
| MOVQ DX, R14 |
| MOVQ R11, AX |
| MULQ R12 |
| ADDQ AX, R9 |
| ADCQ $0x00, DX |
| MOVQ DX, BP |
| MOVQ R11, AX |
| MULQ R13 |
| ADDQ BP, R14 |
| ADCQ $0x00, DX |
| ADDQ AX, R14 |
| ADCQ $0x00, DX |
| MOVQ DX, R15 |
| MOVQ R12, AX |
| MULQ R13 |
| ADDQ AX, R15 |
| ADCQ $0x00, DX |
| MOVQ DX, DI |
| XORQ SI, SI |
| |
| // *2: double the off-diagonal sum held in CX, R8, R9, R14, R15, DI; the carry out goes to SI |
| ADDQ CX, CX |
| ADCQ R8, R8 |
| ADCQ R9, R9 |
| ADCQ R14, R14 |
| ADCQ R15, R15 |
| ADCQ DI, DI |
| ADCQ $0x00, SI |
| |
| // Missing products: add the squares of R10, R11, R12, R13; the 512-bit result is then BX, CX, R8, R9, R14, R15, DI, SI (R10 is reused as a temporary for the high words) |
| MOVQ R10, AX |
| MULQ AX |
| MOVQ AX, BX |
| MOVQ DX, R10 |
| MOVQ R11, AX |
| MULQ AX |
| ADDQ R10, CX |
| ADCQ AX, R8 |
| ADCQ $0x00, DX |
| MOVQ DX, R10 |
| MOVQ R12, AX |
| MULQ AX |
| ADDQ R10, R9 |
| ADCQ AX, R14 |
| ADCQ $0x00, DX |
| MOVQ DX, R10 |
| MOVQ R13, AX |
| MULQ AX |
| ADDQ R10, R15 |
| ADCQ AX, DI |
| ADCQ DX, SI |
| |
| // First reduction step: eliminate limb BX by adding BX<<32, BX>>32 and BX*p256const1 to CX, R8, R9; BX is reused for the new top limb |
| MOVQ BX, AX |
| MOVQ BX, BP |
| SHLQ $0x20, BX |
| MULQ p256const1<>+0(SB) |
| SHRQ $0x20, BP |
| ADDQ BX, CX |
| ADCQ BP, R8 |
| ADCQ AX, R9 |
| ADCQ $0x00, DX |
| MOVQ DX, BX |
| |
| // Second reduction step: same pattern, eliminating CX and reusing it for the new top limb |
| MOVQ CX, AX |
| MOVQ CX, BP |
| SHLQ $0x20, CX |
| MULQ p256const1<>+0(SB) |
| SHRQ $0x20, BP |
| ADDQ CX, R8 |
| ADCQ BP, R9 |
| ADCQ AX, BX |
| ADCQ $0x00, DX |
| MOVQ DX, CX |
| |
| // Third reduction step: same pattern, eliminating R8 and reusing it for the new top limb |
| MOVQ R8, AX |
| MOVQ R8, BP |
| SHLQ $0x20, R8 |
| MULQ p256const1<>+0(SB) |
| SHRQ $0x20, BP |
| ADDQ R8, R9 |
| ADCQ BP, BX |
| ADCQ AX, CX |
| ADCQ $0x00, DX |
| MOVQ DX, R8 |
| |
| // Last reduction step: eliminates R9; BP is then set to zero with MOVQ so it can collect the carry of the addition below |
| MOVQ R9, AX |
| MOVQ R9, BP |
| SHLQ $0x20, R9 |
| MULQ p256const1<>+0(SB) |
| SHRQ $0x20, BP |
| ADDQ R9, BX |
| ADCQ BP, CX |
| ADCQ AX, R8 |
| ADCQ $0x00, DX |
| MOVQ DX, R9 |
| MOVQ $0x00000000, BP |
| |
| // Add bits [511:256] of the result (R14, R15, DI, SI) to the reduced low half (BX, CX, R8, R9); the sum stays in R14, R15, DI, SI with its carry in BP |
| ADCQ BX, R14 |
| ADCQ CX, R15 |
| ADCQ R8, DI |
| ADCQ R9, SI |
| ADCQ $0x00, BP |
| |
| // Copy result to R10..R13; R14, R15, DI, SI keep the unsubtracted value for the restore below |
| MOVQ R14, R10 |
| MOVQ R15, R11 |
| MOVQ DI, R12 |
| MOVQ SI, R13 |
| |
| // Subtract p256 (limbs -1, p256const0, 0, p256const1), with the borrow propagated through the carry limb BP |
| SUBQ $-1, R10 |
| SBBQ p256const0<>+0(SB), R11 |
| SBBQ $0x00, R12 |
| SBBQ p256const1<>+0(SB), R13 |
| SBBQ $0x00, BP |
| |
| // If the result of the subtraction is negative (carry flag set by the SBBQ chain), restore the previous result |
| CMOVQCS R14, R10 |
| CMOVQCS R15, R11 |
| CMOVQCS DI, R12 |
| CMOVQCS SI, R13 |
| RET |
| |
| // func p256PointAddAffineAsm(res *P256Point, in1 *P256Point, in2 *p256AffinePoint, sign int, sel int, zero int) |
| // Requires: CMOV, SSE2 |
| TEXT ·p256PointAddAffineAsm(SB), $512-48 |
| MOVQ res+0(FP), AX |
| MOVQ in1+8(FP), BX |
| MOVQ in2+16(FP), CX |
| MOVQ sign+24(FP), DX |
| MOVQ sel+32(FP), R15 |
| MOVQ zero+40(FP), DI |
| MOVOU (BX), X0 |
| MOVOU 16(BX), X1 |
| MOVOU 32(BX), X2 |
| MOVOU 48(BX), X3 |
| MOVOU 64(BX), X4 |
| MOVOU 80(BX), X5 |
| MOVOU X0, (SP) |
| MOVOU X1, 16(SP) |
| MOVOU X2, 32(SP) |
| MOVOU X3, 48(SP) |
| MOVOU X4, 64(SP) |
| MOVOU X5, 80(SP) |
| MOVOU (CX), X0 |
| MOVOU 16(CX), X1 |
| MOVOU X0, 96(SP) |
| MOVOU X1, 112(SP) |
| |
| // Store pointer to result |
| MOVQ AX, 480(SP) |
| MOVL R15, 488(SP) |
| MOVL DI, 492(SP) |
| |
| // Negate y2in based on sign |
| MOVQ 32(CX), R10 |
| MOVQ 40(CX), R11 |
| MOVQ 48(CX), R12 |
| MOVQ 56(CX), R13 |
| MOVQ $-1, BX |
| MOVQ p256const0<>+0(SB), CX |
| MOVQ $0x00000000, R8 |
| MOVQ p256const1<>+0(SB), R9 |
| XORQ AX, AX |
| |
| // Speculatively subtract |
| SUBQ R10, BX |
| SBBQ R11, CX |
| SBBQ R12, R8 |
| SBBQ R13, R9 |
| SBBQ $0x00, AX |
| MOVQ BX, R14 |
| MOVQ CX, R15 |
| MOVQ R8, DI |
| MOVQ R9, SI |
| |
| // Add in case the operand was > p256 |
| ADDQ $-1, BX |
| ADCQ p256const0<>+0(SB), CX |
| ADCQ $0x00, R8 |
| ADCQ p256const1<>+0(SB), R9 |
| ADCQ $0x00, AX |
| CMOVQNE R14, BX |
| CMOVQNE R15, CX |
| CMOVQNE DI, R8 |
| CMOVQNE SI, R9 |
| |
| // If condition is 0, keep original value |
| TESTQ DX, DX |
| CMOVQEQ R10, BX |
| CMOVQEQ R11, CX |
| CMOVQEQ R12, R8 |
| CMOVQEQ R13, R9 |
| |
| // Store result |
| MOVQ BX, 128(SP) |
| MOVQ CX, 136(SP) |
| MOVQ R8, 144(SP) |
| MOVQ R9, 152(SP) |
| |
| // Begin point add |
| MOVQ 64(SP), R10 |
| MOVQ 72(SP), R11 |
| MOVQ 80(SP), R12 |
| MOVQ 88(SP), R13 |
| CALL p256SqrInternal(SB) |
| MOVQ R10, 288(SP) |
| MOVQ R11, 296(SP) |
| MOVQ R12, 304(SP) |
| MOVQ R13, 312(SP) |
| MOVQ 96(SP), R14 |
| MOVQ 104(SP), R15 |
| MOVQ 112(SP), DI |
| MOVQ 120(SP), SI |
| CALL p256MulInternal(SB) |
| MOVQ (SP), R14 |
| MOVQ 8(SP), R15 |
| MOVQ 16(SP), DI |
| MOVQ 24(SP), SI |
| CALL p256SubInternal(SB) |
| MOVQ R10, 320(SP) |
| MOVQ R11, 328(SP) |
| MOVQ R12, 336(SP) |
| MOVQ R13, 344(SP) |
| MOVQ 64(SP), R14 |
| MOVQ 72(SP), R15 |
| MOVQ 80(SP), DI |
| MOVQ 88(SP), SI |
| CALL p256MulInternal(SB) |
| MOVQ R10, 224(SP) |
| MOVQ R11, 232(SP) |
| MOVQ R12, 240(SP) |
| MOVQ R13, 248(SP) |
| MOVQ 288(SP), R10 |
| MOVQ 296(SP), R11 |
| MOVQ 304(SP), R12 |
| MOVQ 312(SP), R13 |
| CALL p256MulInternal(SB) |
| MOVQ 128(SP), R14 |
| MOVQ 136(SP), R15 |
| MOVQ 144(SP), DI |
| MOVQ 152(SP), SI |
| CALL p256MulInternal(SB) |
| MOVQ R10, 256(SP) |
| MOVQ R11, 264(SP) |
| MOVQ R12, 272(SP) |
| MOVQ R13, 280(SP) |
| MOVQ 32(SP), R14 |
| MOVQ 40(SP), R15 |
| MOVQ 48(SP), DI |
| MOVQ 56(SP), SI |
| CALL p256SubInternal(SB) |
| MOVQ R10, 352(SP) |
| MOVQ R11, 360(SP) |
| MOVQ R12, 368(SP) |
| MOVQ R13, 376(SP) |
| CALL p256SqrInternal(SB) |
| MOVQ R10, 416(SP) |
| MOVQ R11, 424(SP) |
| MOVQ R12, 432(SP) |
| MOVQ R13, 440(SP) |
| MOVQ 320(SP), R10 |
| MOVQ 328(SP), R11 |
| MOVQ 336(SP), R12 |
| MOVQ 344(SP), R13 |
| CALL p256SqrInternal(SB) |
| MOVQ R10, 384(SP) |
| MOVQ R11, 392(SP) |
| MOVQ R12, 400(SP) |
| MOVQ R13, 408(SP) |
| MOVQ 320(SP), R14 |
| MOVQ 328(SP), R15 |
| MOVQ 336(SP), DI |
| MOVQ 344(SP), SI |
| CALL p256MulInternal(SB) |
| MOVQ R10, 448(SP) |
| MOVQ R11, 456(SP) |
| MOVQ R12, 464(SP) |
| MOVQ R13, 472(SP) |
| MOVQ 32(SP), R14 |
| MOVQ 40(SP), R15 |
| MOVQ 48(SP), DI |
| MOVQ 56(SP), SI |
| CALL p256MulInternal(SB) |
| MOVQ R10, 256(SP) |
| MOVQ R11, 264(SP) |
| MOVQ R12, 272(SP) |
| MOVQ R13, 280(SP) |
| MOVQ (SP), R10 |
| MOVQ 8(SP), R11 |
| MOVQ 16(SP), R12 |
| MOVQ 24(SP), R13 |
| MOVQ 384(SP), R14 |
| MOVQ 392(SP), R15 |
| MOVQ 400(SP), DI |
| MOVQ 408(SP), SI |
| CALL p256MulInternal(SB) |
| MOVQ R10, 320(SP) |
| MOVQ R11, 328(SP) |
| MOVQ R12, 336(SP) |
| MOVQ R13, 344(SP) |
| XORQ AX, AX |
| ADDQ R10, R10 |
| ADCQ R11, R11 |
| ADCQ R12, R12 |
| ADCQ R13, R13 |
| ADCQ $+0, AX |
| MOVQ R10, R14 |
| MOVQ R11, R15 |
| MOVQ R12, DI |
| MOVQ R13, SI |
| SUBQ $-1, R14 |
| SBBQ p256const0<>+0(SB), R15 |
| SBBQ $+0, DI |
| SBBQ p256const1<>+0(SB), SI |
| SBBQ $+0, AX |
| CMOVQCS R10, R14 |
| CMOVQCS R11, R15 |
| CMOVQCS R12, DI |
| CMOVQCS R13, SI |
| MOVQ 416(SP), R10 |
| MOVQ 424(SP), R11 |
| MOVQ 432(SP), R12 |
| MOVQ 440(SP), R13 |
| CALL p256SubInternal(SB) |
| MOVQ 448(SP), R14 |
| MOVQ 456(SP), R15 |
| MOVQ 464(SP), DI |
| MOVQ 472(SP), SI |
| CALL p256SubInternal(SB) |
| MOVQ R10, 160(SP) |
| MOVQ R11, 168(SP) |
| MOVQ R12, 176(SP) |
| MOVQ R13, 184(SP) |
| MOVQ R10, R14 |
| MOVQ R11, R15 |
| MOVQ R12, DI |
| MOVQ R13, SI |
| MOVQ 320(SP), R10 |
| MOVQ 328(SP), R11 |
| MOVQ 336(SP), R12 |
| MOVQ 344(SP), R13 |
| CALL p256SubInternal(SB) |
| MOVQ 352(SP), R14 |
| MOVQ 360(SP), R15 |
| MOVQ 368(SP), DI |
| MOVQ 376(SP), SI |
| CALL p256MulInternal(SB) |
| MOVQ 256(SP), R14 |
| MOVQ 264(SP), R15 |
| MOVQ 272(SP), DI |
| MOVQ 280(SP), SI |
| CALL p256SubInternal(SB) |
| MOVQ R10, 192(SP) |
| MOVQ R11, 200(SP) |
| MOVQ R12, 208(SP) |
| MOVQ R13, 216(SP) |
| |
| // Load stored values from stack |
| MOVQ 480(SP), AX |
| MOVL 488(SP), BX |
| MOVL 492(SP), CX |
| |
| // The result is not valid if (sel == 0), conditional choose |
| MOVOU 160(SP), X0 |
| MOVOU 176(SP), X1 |
| MOVOU 192(SP), X2 |
| MOVOU 208(SP), X3 |
| MOVOU 224(SP), X4 |
| MOVOU 240(SP), X5 |
| MOVL BX, X6 |
| MOVL CX, X7 |
| PXOR X8, X8 |
| PCMPEQL X9, X9 |
| PSHUFD $0x00, X6, X6 |
| PSHUFD $0x00, X7, X7 |
| PCMPEQL X8, X6 |
| PCMPEQL X8, X7 |
| MOVOU X6, X15 |
| PANDN X9, X15 |
| MOVOU (SP), X9 |
| MOVOU 16(SP), X10 |
| MOVOU 32(SP), X11 |
| MOVOU 48(SP), X12 |
| MOVOU 64(SP), X13 |
| MOVOU 80(SP), X14 |
| PAND X15, X0 |
| PAND X15, X1 |
| PAND X15, X2 |
| PAND X15, X3 |
| PAND X15, X4 |
| PAND X15, X5 |
| PAND X6, X9 |
| PAND X6, X10 |
| PAND X6, X11 |
| PAND X6, X12 |
| PAND X6, X13 |
| PAND X6, X14 |
| PXOR X9, X0 |
| PXOR X10, X1 |
| PXOR X11, X2 |
| PXOR X12, X3 |
| PXOR X13, X4 |
| PXOR X14, X5 |
| |
| // Similarly if zero == 0 |
| PCMPEQL X9, X9 |
| MOVOU X7, X15 |
| PANDN X9, X15 |
| MOVOU 96(SP), X9 |
| MOVOU 112(SP), X10 |
| MOVOU 128(SP), X11 |
| MOVOU 144(SP), X12 |
| MOVOU p256one<>+0(SB), X13 |
| MOVOU p256one<>+16(SB), X14 |
| PAND X15, X0 |
| PAND X15, X1 |
| PAND X15, X2 |
| PAND X15, X3 |
| PAND X15, X4 |
| PAND X15, X5 |
| PAND X7, X9 |
| PAND X7, X10 |
| PAND X7, X11 |
| PAND X7, X12 |
| PAND X7, X13 |
| PAND X7, X14 |
| PXOR X9, X0 |
| PXOR X10, X1 |
| PXOR X11, X2 |
| PXOR X12, X3 |
| PXOR X13, X4 |
| PXOR X14, X5 |
| |
| // Finally output the result |
| MOVOU X0, (AX) |
| MOVOU X1, 16(AX) |
| MOVOU X2, 32(AX) |
| MOVOU X3, 48(AX) |
| MOVOU X4, 64(AX) |
| MOVOU X5, 80(AX) |
| MOVQ $0x00000000, 480(SP) |
| RET |
| |
| // p256one is a 32-byte read-only constant stored as four 64-bit limbs, |
| // lowest limb first. The code above loads it as two 16-byte halves in place |
| // of the last 32 bytes of a point when that point is replaced wholesale. |
| // NOTE(review): presumably this is the field element 1 in Montgomery form |
| // (2^256 mod p) — confirm against the generator source. |
| DATA p256one<>+0(SB)/8, $0x0000000000000001 |
| DATA p256one<>+8(SB)/8, $0xffffffff00000000 |
| DATA p256one<>+16(SB)/8, $0xffffffffffffffff |
| DATA p256one<>+24(SB)/8, $0x00000000fffffffe |
| GLOBL p256one<>(SB), RODATA, $32 |
| |
| // func p256IsZero() |
| // Requires: CMOV |
| // |
| // Internal helper that takes its operand in registers rather than Go |
| // arguments. |
| // Input:    a four-limb value in R10, R11, R12, R13. |
| // Output:   AX = 1 if the value is all zero or equal to the four-limb |
| //           constant P = (-1, p256const0, 0, p256const1); otherwise AX = 0. |
| // Clobbers: R10, R11, R13, R14, R15 and the flags. R12 is only read. |
| TEXT p256IsZero(SB), NOSPLIT, $0 |
| // AX contains a flag that is set if the input is zero. |
| XORQ AX, AX |
| MOVQ $0x00000001, R15 |
| |
| // Check whether R10..R13 are all zero by OR-ing them into R14; the last |
| // ORQ leaves ZF set exactly when every limb is zero. |
| MOVQ R10, R14 |
| ORQ R11, R14 |
| ORQ R12, R14 |
| ORQ R13, R14 |
| |
| // Set the zero flag if so. (CMOV of a constant to a register doesn't |
| // appear to be supported in Go. Thus R15 holds the constant 1.) |
| CMOVQEQ R15, AX |
| |
| // XOR R10..R13 with P and compare with zero again. R12 is left untouched |
| // because the matching limb of P is zero. |
| XORQ $-1, R10 |
| XORQ p256const0<>+0(SB), R11 |
| XORQ p256const1<>+0(SB), R13 |
| ORQ R11, R10 |
| ORQ R12, R10 |
| ORQ R13, R10 |
| |
| // Set the zero flag if so. |
| CMOVQEQ R15, AX |
| RET |
| |
| // func p256PointAddAsm(res *P256Point, in1 *P256Point, in2 *P256Point) int |
| // Requires: CMOV, SSE2 |
| // |
| // Adds the points in1 and in2 and writes the 96-byte result to res. Both |
| // inputs are copied to the stack before anything is written through res. |
| // The return value is 1 only when p256IsZero reported 1 for both differences |
| // computed below (r and h); otherwise it is 0. |
| // |
| // Stack frame (each field element is 32 bytes, four 64-bit limbs): |
| //   0(SP)   x1     32(SP)  y1     64(SP)  z1      (copy of in1) |
| //   96(SP)  x2     128(SP) y2     160(SP) z2      (copy of in2) |
| //   192(SP) x out  224(SP) y out  256(SP) z out |
| //   288(SP) u1     320(SP) u2     352(SP) s1      384(SP) s2 |
| //   416(SP) z1^2   448(SP) z2^2   480(SP) h       512(SP) r |
| //   544(SP) h^2    576(SP) r^2    608(SP) h^3 |
| //   640(SP) saved res pointer     648(SP) flag returned to the caller |
| // The x/y/z naming assumes P256Point stores x, y, z at offsets 0, 32, 64, |
| // matching the "Store x/y/z" offsets in p256PointDoubleAsm. |
| // |
| // NOTE(review): the step comments assume p256SqrInternal squares R10-R13, |
| // p256MulInternal multiplies R10-R13 by (R14, R15, DI, SI) and |
| // p256SubInternal subtracts (R14, R15, DI, SI) from R10-R13, each leaving |
| // its result in R10-R13. Those helpers are defined outside this chunk — |
| // confirm against their definitions. |
| TEXT ·p256PointAddAsm(SB), $680-32 |
| // Move input to stack in order to free registers |
| MOVQ res+0(FP), AX |
| MOVQ in1+8(FP), BX |
| MOVQ in2+16(FP), CX |
| MOVOU (BX), X0 |
| MOVOU 16(BX), X1 |
| MOVOU 32(BX), X2 |
| MOVOU 48(BX), X3 |
| MOVOU 64(BX), X4 |
| MOVOU 80(BX), X5 |
| MOVOU X0, (SP) |
| MOVOU X1, 16(SP) |
| MOVOU X2, 32(SP) |
| MOVOU X3, 48(SP) |
| MOVOU X4, 64(SP) |
| MOVOU X5, 80(SP) |
| MOVOU (CX), X0 |
| MOVOU 16(CX), X1 |
| MOVOU 32(CX), X2 |
| MOVOU 48(CX), X3 |
| MOVOU 64(CX), X4 |
| MOVOU 80(CX), X5 |
| MOVOU X0, 96(SP) |
| MOVOU X1, 112(SP) |
| MOVOU X2, 128(SP) |
| MOVOU X3, 144(SP) |
| MOVOU X4, 160(SP) |
| MOVOU X5, 176(SP) |
| |
| // Store pointer to result |
| MOVQ AX, 640(SP) |
| |
| // Begin point add |
| // z2^2 -> 448(SP) |
| MOVQ 160(SP), R10 |
| MOVQ 168(SP), R11 |
| MOVQ 176(SP), R12 |
| MOVQ 184(SP), R13 |
| CALL p256SqrInternal(SB) |
| MOVQ R10, 448(SP) |
| MOVQ R11, 456(SP) |
| MOVQ R12, 464(SP) |
| MOVQ R13, 472(SP) |
| // z2^3 = z2^2 * z2 |
| MOVQ 160(SP), R14 |
| MOVQ 168(SP), R15 |
| MOVQ 176(SP), DI |
| MOVQ 184(SP), SI |
| CALL p256MulInternal(SB) |
| // s1 = z2^3 * y1 -> 352(SP) |
| MOVQ 32(SP), R14 |
| MOVQ 40(SP), R15 |
| MOVQ 48(SP), DI |
| MOVQ 56(SP), SI |
| CALL p256MulInternal(SB) |
| MOVQ R10, 352(SP) |
| MOVQ R11, 360(SP) |
| MOVQ R12, 368(SP) |
| MOVQ R13, 376(SP) |
| // z1^2 -> 416(SP) |
| MOVQ 64(SP), R10 |
| MOVQ 72(SP), R11 |
| MOVQ 80(SP), R12 |
| MOVQ 88(SP), R13 |
| CALL p256SqrInternal(SB) |
| MOVQ R10, 416(SP) |
| MOVQ R11, 424(SP) |
| MOVQ R12, 432(SP) |
| MOVQ R13, 440(SP) |
| // z1^3 = z1^2 * z1 |
| MOVQ 64(SP), R14 |
| MOVQ 72(SP), R15 |
| MOVQ 80(SP), DI |
| MOVQ 88(SP), SI |
| CALL p256MulInternal(SB) |
| // s2 = z1^3 * y2 -> 384(SP) |
| MOVQ 128(SP), R14 |
| MOVQ 136(SP), R15 |
| MOVQ 144(SP), DI |
| MOVQ 152(SP), SI |
| CALL p256MulInternal(SB) |
| MOVQ R10, 384(SP) |
| MOVQ R11, 392(SP) |
| MOVQ R12, 400(SP) |
| MOVQ R13, 408(SP) |
| // r = s2 - s1 -> 512(SP) |
| MOVQ 352(SP), R14 |
| MOVQ 360(SP), R15 |
| MOVQ 368(SP), DI |
| MOVQ 376(SP), SI |
| CALL p256SubInternal(SB) |
| MOVQ R10, 512(SP) |
| MOVQ R11, 520(SP) |
| MOVQ R12, 528(SP) |
| MOVQ R13, 536(SP) |
| // flag = p256IsZero(r), kept at 648(SP) |
| CALL p256IsZero(SB) |
| MOVQ AX, 648(SP) |
| // u1 = z2^2 * x1 -> 288(SP) |
| MOVQ 448(SP), R10 |
| MOVQ 456(SP), R11 |
| MOVQ 464(SP), R12 |
| MOVQ 472(SP), R13 |
| MOVQ (SP), R14 |
| MOVQ 8(SP), R15 |
| MOVQ 16(SP), DI |
| MOVQ 24(SP), SI |
| CALL p256MulInternal(SB) |
| MOVQ R10, 288(SP) |
| MOVQ R11, 296(SP) |
| MOVQ R12, 304(SP) |
| MOVQ R13, 312(SP) |
| // u2 = z1^2 * x2 -> 320(SP) |
| MOVQ 416(SP), R10 |
| MOVQ 424(SP), R11 |
| MOVQ 432(SP), R12 |
| MOVQ 440(SP), R13 |
| MOVQ 96(SP), R14 |
| MOVQ 104(SP), R15 |
| MOVQ 112(SP), DI |
| MOVQ 120(SP), SI |
| CALL p256MulInternal(SB) |
| MOVQ R10, 320(SP) |
| MOVQ R11, 328(SP) |
| MOVQ R12, 336(SP) |
| MOVQ R13, 344(SP) |
| // h = u2 - u1 -> 480(SP) |
| MOVQ 288(SP), R14 |
| MOVQ 296(SP), R15 |
| MOVQ 304(SP), DI |
| MOVQ 312(SP), SI |
| CALL p256SubInternal(SB) |
| MOVQ R10, 480(SP) |
| MOVQ R11, 488(SP) |
| MOVQ R12, 496(SP) |
| MOVQ R13, 504(SP) |
| // flag &= p256IsZero(h) |
| CALL p256IsZero(SB) |
| ANDQ 648(SP), AX |
| MOVQ AX, 648(SP) |
| // r^2 -> 576(SP) |
| MOVQ 512(SP), R10 |
| MOVQ 520(SP), R11 |
| MOVQ 528(SP), R12 |
| MOVQ 536(SP), R13 |
| CALL p256SqrInternal(SB) |
| MOVQ R10, 576(SP) |
| MOVQ R11, 584(SP) |
| MOVQ R12, 592(SP) |
| MOVQ R13, 600(SP) |
| // h^2 -> 544(SP) |
| MOVQ 480(SP), R10 |
| MOVQ 488(SP), R11 |
| MOVQ 496(SP), R12 |
| MOVQ 504(SP), R13 |
| CALL p256SqrInternal(SB) |
| MOVQ R10, 544(SP) |
| MOVQ R11, 552(SP) |
| MOVQ R12, 560(SP) |
| MOVQ R13, 568(SP) |
| // h^3 = h^2 * h -> 608(SP) |
| MOVQ 480(SP), R14 |
| MOVQ 488(SP), R15 |
| MOVQ 496(SP), DI |
| MOVQ 504(SP), SI |
| CALL p256MulInternal(SB) |
| MOVQ R10, 608(SP) |
| MOVQ R11, 616(SP) |
| MOVQ R12, 624(SP) |
| MOVQ R13, 632(SP) |
| // h^3 * s1 -> 384(SP), reusing the slot that held s2 |
| MOVQ 352(SP), R14 |
| MOVQ 360(SP), R15 |
| MOVQ 368(SP), DI |
| MOVQ 376(SP), SI |
| CALL p256MulInternal(SB) |
| MOVQ R10, 384(SP) |
| MOVQ R11, 392(SP) |
| MOVQ R12, 400(SP) |
| MOVQ R13, 408(SP) |
| // z out = z1 * z2 * h -> 256(SP) |
| MOVQ 64(SP), R10 |
| MOVQ 72(SP), R11 |
| MOVQ 80(SP), R12 |
| MOVQ 88(SP), R13 |
| MOVQ 160(SP), R14 |
| MOVQ 168(SP), R15 |
| MOVQ 176(SP), DI |
| MOVQ 184(SP), SI |
| CALL p256MulInternal(SB) |
| MOVQ 480(SP), R14 |
| MOVQ 488(SP), R15 |
| MOVQ 496(SP), DI |
| MOVQ 504(SP), SI |
| CALL p256MulInternal(SB) |
| MOVQ R10, 256(SP) |
| MOVQ R11, 264(SP) |
| MOVQ R12, 272(SP) |
| MOVQ R13, 280(SP) |
| // h^2 * u1 -> 320(SP), reusing the slot that held u2 |
| MOVQ 544(SP), R10 |
| MOVQ 552(SP), R11 |
| MOVQ 560(SP), R12 |
| MOVQ 568(SP), R13 |
| MOVQ 288(SP), R14 |
| MOVQ 296(SP), R15 |
| MOVQ 304(SP), DI |
| MOVQ 312(SP), SI |
| CALL p256MulInternal(SB) |
| MOVQ R10, 320(SP) |
| MOVQ R11, 328(SP) |
| MOVQ R12, 336(SP) |
| MOVQ R13, 344(SP) |
| // Inline doubling of R10-R13 (h^2 * u1): the carry out of the addition is |
| // collected in AX, the constant (-1, p256const0, 0, p256const1) is |
| // subtracted from a copy, and the unreduced value is restored if that |
| // subtraction borrowed. The result is left in R14, R15, DI, SI. |
| XORQ AX, AX |
| ADDQ R10, R10 |
| ADCQ R11, R11 |
| ADCQ R12, R12 |
| ADCQ R13, R13 |
| ADCQ $+0, AX |
| MOVQ R10, R14 |
| MOVQ R11, R15 |
| MOVQ R12, DI |
| MOVQ R13, SI |
| SUBQ $-1, R14 |
| SBBQ p256const0<>+0(SB), R15 |
| SBBQ $+0, DI |
| SBBQ p256const1<>+0(SB), SI |
| SBBQ $+0, AX |
| CMOVQCS R10, R14 |
| CMOVQCS R11, R15 |
| CMOVQCS R12, DI |
| CMOVQCS R13, SI |
| // r^2 - 2 * h^2 * u1 |
| MOVQ 576(SP), R10 |
| MOVQ 584(SP), R11 |
| MOVQ 592(SP), R12 |
| MOVQ 600(SP), R13 |
| CALL p256SubInternal(SB) |
| // x out = r^2 - 2 * h^2 * u1 - h^3 -> 192(SP) |
| MOVQ 608(SP), R14 |
| MOVQ 616(SP), R15 |
| MOVQ 624(SP), DI |
| MOVQ 632(SP), SI |
| CALL p256SubInternal(SB) |
| MOVQ R10, 192(SP) |
| MOVQ R11, 200(SP) |
| MOVQ R12, 208(SP) |
| MOVQ R13, 216(SP) |
| // h^2 * u1 - x out (x out is moved into the second operand registers) |
| MOVQ R10, R14 |
| MOVQ R11, R15 |
| MOVQ R12, DI |
| MOVQ R13, SI |
| MOVQ 320(SP), R10 |
| MOVQ 328(SP), R11 |
| MOVQ 336(SP), R12 |
| MOVQ 344(SP), R13 |
| CALL p256SubInternal(SB) |
| // ... * r |
| MOVQ 512(SP), R14 |
| MOVQ 520(SP), R15 |
| MOVQ 528(SP), DI |
| MOVQ 536(SP), SI |
| CALL p256MulInternal(SB) |
| // y out = r * (h^2 * u1 - x out) - h^3 * s1 -> 224(SP) |
| MOVQ 384(SP), R14 |
| MOVQ 392(SP), R15 |
| MOVQ 400(SP), DI |
| MOVQ 408(SP), SI |
| CALL p256SubInternal(SB) |
| MOVQ R10, 224(SP) |
| MOVQ R11, 232(SP) |
| MOVQ R12, 240(SP) |
| MOVQ R13, 248(SP) |
| // Load the 96-byte result (192..287(SP)) into X0-X5 |
| MOVOU 192(SP), X0 |
| MOVOU 208(SP), X1 |
| MOVOU 224(SP), X2 |
| MOVOU 240(SP), X3 |
| MOVOU 256(SP), X4 |
| MOVOU 272(SP), X5 |
| |
| // Finally output the result |
| // The saved res pointer is reloaded and its stack slot is overwritten |
| // with zero before the result is written out. |
| MOVQ 640(SP), AX |
| MOVQ $0x00000000, 640(SP) |
| MOVOU X0, (AX) |
| MOVOU X1, 16(AX) |
| MOVOU X2, 32(AX) |
| MOVOU X3, 48(AX) |
| MOVOU X4, 64(AX) |
| MOVOU X5, 80(AX) |
| // Return the combined p256IsZero flag |
| MOVQ 648(SP), AX |
| MOVQ AX, ret+24(FP) |
| RET |
| |
| // func p256PointDoubleAsm(res *P256Point, in *P256Point) |
| // Requires: CMOV, SSE2 |
| // |
| // Doubles the point in and writes x, y, z of the result to offsets 0, 32 |
| // and 64 of res. The input is copied to the stack before res is written. |
| // |
| // Stack frame (each field element is 32 bytes, four 64-bit limbs): |
| //   0(SP)   x      32(SP)  y (later reused for y^4 * 8)   64(SP) z |
| //   96(SP)  s      128(SP) m      160(SP) z^2     192(SP) 2 * s |
| //   224(SP) saved res pointer |
| // |
| // P below denotes the four-limb constant (-1, p256const0, 0, p256const1). |
| // |
| // NOTE(review): the step comments assume p256SqrInternal squares R10-R13, |
| // p256MulInternal multiplies R10-R13 by (R14, R15, DI, SI) and |
| // p256SubInternal subtracts (R14, R15, DI, SI) from R10-R13, each leaving |
| // its result in R10-R13. Those helpers are defined outside this chunk — |
| // confirm against their definitions. |
| TEXT ·p256PointDoubleAsm(SB), NOSPLIT, $256-16 |
| MOVQ res+0(FP), AX |
| MOVQ in+8(FP), BX |
| MOVOU (BX), X0 |
| MOVOU 16(BX), X1 |
| MOVOU 32(BX), X2 |
| MOVOU 48(BX), X3 |
| MOVOU 64(BX), X4 |
| MOVOU 80(BX), X5 |
| MOVOU X0, (SP) |
| MOVOU X1, 16(SP) |
| MOVOU X2, 32(SP) |
| MOVOU X3, 48(SP) |
| MOVOU X4, 64(SP) |
| MOVOU X5, 80(SP) |
| |
| // Store pointer to result |
| MOVQ AX, 224(SP) |
| |
| // Begin point double |
| // z^2 -> 160(SP) |
| MOVQ 64(SP), R10 |
| MOVQ 72(SP), R11 |
| MOVQ 80(SP), R12 |
| MOVQ 88(SP), R13 |
| CALL p256SqrInternal(SB) |
| MOVQ R10, 160(SP) |
| MOVQ R11, 168(SP) |
| MOVQ R12, 176(SP) |
| MOVQ R13, 184(SP) |
| // Inline addition z^2 + x: the carry is collected in AX, P is subtracted |
| // from a copy, and the unreduced sum is restored if that subtraction |
| // borrowed. The result (in R14, R15, DI, SI) is saved at 128(SP). |
| MOVQ (SP), R14 |
| MOVQ 8(SP), R15 |
| MOVQ 16(SP), DI |
| MOVQ 24(SP), SI |
| XORQ AX, AX |
| ADDQ R14, R10 |
| ADCQ R15, R11 |
| ADCQ DI, R12 |
| ADCQ SI, R13 |
| ADCQ $+0, AX |
| MOVQ R10, R14 |
| MOVQ R11, R15 |
| MOVQ R12, DI |
| MOVQ R13, SI |
| SUBQ $-1, R14 |
| SBBQ p256const0<>+0(SB), R15 |
| SBBQ $+0, DI |
| SBBQ p256const1<>+0(SB), SI |
| SBBQ $+0, AX |
| CMOVQCS R10, R14 |
| CMOVQCS R11, R15 |
| CMOVQCS R12, DI |
| CMOVQCS R13, SI |
| MOVQ R14, 128(SP) |
| MOVQ R15, 136(SP) |
| MOVQ DI, 144(SP) |
| MOVQ SI, 152(SP) |
| // z * y |
| MOVQ 64(SP), R10 |
| MOVQ 72(SP), R11 |
| MOVQ 80(SP), R12 |
| MOVQ 88(SP), R13 |
| MOVQ 32(SP), R14 |
| MOVQ 40(SP), R15 |
| MOVQ 48(SP), DI |
| MOVQ 56(SP), SI |
| CALL p256MulInternal(SB) |
| // Inline doubling with the same conditional subtraction of P; the result |
| // 2 * z * y is left in R14, R15, DI, SI. |
| XORQ AX, AX |
| ADDQ R10, R10 |
| ADCQ R11, R11 |
| ADCQ R12, R12 |
| ADCQ R13, R13 |
| ADCQ $+0, AX |
| MOVQ R10, R14 |
| MOVQ R11, R15 |
| MOVQ R12, DI |
| MOVQ R13, SI |
| SUBQ $-1, R14 |
| SBBQ p256const0<>+0(SB), R15 |
| SBBQ $+0, DI |
| SBBQ p256const1<>+0(SB), SI |
| SBBQ $+0, AX |
| CMOVQCS R10, R14 |
| CMOVQCS R11, R15 |
| CMOVQCS R12, DI |
| CMOVQCS R13, SI |
| MOVQ 224(SP), AX |
| |
| // Store z |
| MOVQ R14, 64(AX) |
| MOVQ R15, 72(AX) |
| MOVQ DI, 80(AX) |
| MOVQ SI, 88(AX) |
| // x - z^2 |
| MOVQ (SP), R10 |
| MOVQ 8(SP), R11 |
| MOVQ 16(SP), R12 |
| MOVQ 24(SP), R13 |
| MOVQ 160(SP), R14 |
| MOVQ 168(SP), R15 |
| MOVQ 176(SP), DI |
| MOVQ 184(SP), SI |
| CALL p256SubInternal(SB) |
| // m = (x - z^2) * (x + z^2) -> 128(SP) |
| MOVQ 128(SP), R14 |
| MOVQ 136(SP), R15 |
| MOVQ 144(SP), DI |
| MOVQ 152(SP), SI |
| CALL p256MulInternal(SB) |
| MOVQ R10, 128(SP) |
| MOVQ R11, 136(SP) |
| MOVQ R12, 144(SP) |
| MOVQ R13, 152(SP) |
| |
| // Multiply by 3 |
| // First an inline doubling (result in R14, R15, DI, SI), then an inline |
| // addition of the saved m; the tripled value replaces m at 128(SP). |
| XORQ AX, AX |
| ADDQ R10, R10 |
| ADCQ R11, R11 |
| ADCQ R12, R12 |
| ADCQ R13, R13 |
| ADCQ $+0, AX |
| MOVQ R10, R14 |
| MOVQ R11, R15 |
| MOVQ R12, DI |
| MOVQ R13, SI |
| SUBQ $-1, R14 |
| SBBQ p256const0<>+0(SB), R15 |
| SBBQ $+0, DI |
| SBBQ p256const1<>+0(SB), SI |
| SBBQ $+0, AX |
| CMOVQCS R10, R14 |
| CMOVQCS R11, R15 |
| CMOVQCS R12, DI |
| CMOVQCS R13, SI |
| MOVQ 128(SP), R10 |
| MOVQ 136(SP), R11 |
| MOVQ 144(SP), R12 |
| MOVQ 152(SP), R13 |
| XORQ AX, AX |
| ADDQ R14, R10 |
| ADCQ R15, R11 |
| ADCQ DI, R12 |
| ADCQ SI, R13 |
| ADCQ $+0, AX |
| MOVQ R10, R14 |
| MOVQ R11, R15 |
| MOVQ R12, DI |
| MOVQ R13, SI |
| SUBQ $-1, R14 |
| SBBQ p256const0<>+0(SB), R15 |
| SBBQ $+0, DI |
| SBBQ p256const1<>+0(SB), SI |
| SBBQ $+0, AX |
| CMOVQCS R10, R14 |
| CMOVQCS R11, R15 |
| CMOVQCS R12, DI |
| CMOVQCS R13, SI |
| MOVQ R14, 128(SP) |
| MOVQ R15, 136(SP) |
| MOVQ DI, 144(SP) |
| MOVQ SI, 152(SP) |
| |
| // //////////////////////// |
| // Inline doubling of y; the result is moved back into R10-R13. |
| MOVQ 32(SP), R10 |
| MOVQ 40(SP), R11 |
| MOVQ 48(SP), R12 |
| MOVQ 56(SP), R13 |
| XORQ AX, AX |
| ADDQ R10, R10 |
| ADCQ R11, R11 |
| ADCQ R12, R12 |
| ADCQ R13, R13 |
| ADCQ $+0, AX |
| MOVQ R10, R14 |
| MOVQ R11, R15 |
| MOVQ R12, DI |
| MOVQ R13, SI |
| SUBQ $-1, R14 |
| SBBQ p256const0<>+0(SB), R15 |
| SBBQ $+0, DI |
| SBBQ p256const1<>+0(SB), SI |
| SBBQ $+0, AX |
| CMOVQCS R10, R14 |
| CMOVQCS R11, R15 |
| CMOVQCS R12, DI |
| CMOVQCS R13, SI |
| MOVQ R14, R10 |
| MOVQ R15, R11 |
| MOVQ DI, R12 |
| MOVQ SI, R13 |
| // (2 * y)^2 -> 96(SP) |
| CALL p256SqrInternal(SB) |
| MOVQ R10, 96(SP) |
| MOVQ R11, 104(SP) |
| MOVQ R12, 112(SP) |
| MOVQ R13, 120(SP) |
| // Square again: (2 * y)^4 |
| CALL p256SqrInternal(SB) |
| |
| // Divide by 2 |
| // P is added to the value with the carry collected in AX. If the original |
| // low bit (tested in R14) is clear, the original limbs are restored and |
| // the ANDQ clears AX. The 257-bit quantity AX:R13:R12:R11:R10 is then |
| // shifted right by one bit, each limb taking its new top bit from the |
| // next-higher register. The halved value replaces y at 32(SP). |
| XORQ AX, AX |
| MOVQ R10, R14 |
| MOVQ R11, R15 |
| MOVQ R12, DI |
| MOVQ R13, SI |
| ADDQ $-1, R10 |
| ADCQ p256const0<>+0(SB), R11 |
| ADCQ $0x00, R12 |
| ADCQ p256const1<>+0(SB), R13 |
| ADCQ $0x00, AX |
| TESTQ $0x00000001, R14 |
| CMOVQEQ R14, R10 |
| CMOVQEQ R15, R11 |
| CMOVQEQ DI, R12 |
| CMOVQEQ SI, R13 |
| ANDQ R14, AX |
| SHRQ $0x01, R11, R10 |
| SHRQ $0x01, R12, R11 |
| SHRQ $0x01, R13, R12 |
| SHRQ $0x01, AX, R13 |
| MOVQ R10, 32(SP) |
| MOVQ R11, 40(SP) |
| MOVQ R12, 48(SP) |
| MOVQ R13, 56(SP) |
| |
| // ///////////////////////// |
| // s = x * (2 * y)^2 -> 96(SP) |
| MOVQ (SP), R10 |
| MOVQ 8(SP), R11 |
| MOVQ 16(SP), R12 |
| MOVQ 24(SP), R13 |
| MOVQ 96(SP), R14 |
| MOVQ 104(SP), R15 |
| MOVQ 112(SP), DI |
| MOVQ 120(SP), SI |
| CALL p256MulInternal(SB) |
| MOVQ R10, 96(SP) |
| MOVQ R11, 104(SP) |
| MOVQ R12, 112(SP) |
| MOVQ R13, 120(SP) |
| // Inline doubling: 2 * s -> 192(SP) |
| XORQ AX, AX |
| ADDQ R10, R10 |
| ADCQ R11, R11 |
| ADCQ R12, R12 |
| ADCQ R13, R13 |
| ADCQ $+0, AX |
| MOVQ R10, R14 |
| MOVQ R11, R15 |
| MOVQ R12, DI |
| MOVQ R13, SI |
| SUBQ $-1, R14 |
| SBBQ p256const0<>+0(SB), R15 |
| SBBQ $+0, DI |
| SBBQ p256const1<>+0(SB), SI |
| SBBQ $+0, AX |
| CMOVQCS R10, R14 |
| CMOVQCS R11, R15 |
| CMOVQCS R12, DI |
| CMOVQCS R13, SI |
| MOVQ R14, 192(SP) |
| MOVQ R15, 200(SP) |
| MOVQ DI, 208(SP) |
| MOVQ SI, 216(SP) |
| // m^2 (m is the tripled value saved at 128(SP)) |
| MOVQ 128(SP), R10 |
| MOVQ 136(SP), R11 |
| MOVQ 144(SP), R12 |
| MOVQ 152(SP), R13 |
| CALL p256SqrInternal(SB) |
| // x out = m^2 - 2 * s |
| MOVQ 192(SP), R14 |
| MOVQ 200(SP), R15 |
| MOVQ 208(SP), DI |
| MOVQ 216(SP), SI |
| CALL p256SubInternal(SB) |
| MOVQ 224(SP), AX |
| |
| // Store x |
| MOVQ R10, (AX) |
| MOVQ R11, 8(AX) |
| MOVQ R12, 16(AX) |
| MOVQ R13, 24(AX) |
| // s - x out (x out is moved into the second operand registers) |
| MOVQ R10, R14 |
| MOVQ R11, R15 |
| MOVQ R12, DI |
| MOVQ R13, SI |
| MOVQ 96(SP), R10 |
| MOVQ 104(SP), R11 |
| MOVQ 112(SP), R12 |
| MOVQ 120(SP), R13 |
| CALL p256SubInternal(SB) |
| // ... * m |
| MOVQ 128(SP), R14 |
| MOVQ 136(SP), R15 |
| MOVQ 144(SP), DI |
| MOVQ 152(SP), SI |
| CALL p256MulInternal(SB) |
| // y out = m * (s - x out) - (2 * y)^4 / 2 |
| MOVQ 32(SP), R14 |
| MOVQ 40(SP), R15 |
| MOVQ 48(SP), DI |
| MOVQ 56(SP), SI |
| CALL p256SubInternal(SB) |
| MOVQ 224(SP), AX |
| |
| // Store y |
| MOVQ R10, 32(AX) |
| MOVQ R11, 40(AX) |
| MOVQ R12, 48(AX) |
| MOVQ R13, 56(AX) |
| |
| // /////////////////////// |
| // Overwrite the saved res pointer slot with zero before returning. |
| MOVQ $0x00000000, 224(SP) |
| RET |