// Code generated by command: go run p256_asm_amd64.go -out ../p256_asm_amd64.s -pkg nistec. DO NOT EDIT.
//go:build !purego
#include "textflag.h"
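
// This file provides constant-time, 64-bit assembly implementations of the
// NIST P-256 field and point arithmetic used by package nistec. Field
// elements (p256Element) are four little-endian 64-bit limbs kept in the
// Montgomery domain, i.e. multiplied by R = 2^256 mod p.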
// func p256OrdLittleToBig(res *[32]byte, in *p256OrdElement)
TEXT ·p256OrdLittleToBig(SB), NOSPLIT, $0-16
JMP ·p256BigToLittle(SB)
// func p256OrdBigToLittle(res *p256OrdElement, in *[32]byte)
TEXT ·p256OrdBigToLittle(SB), NOSPLIT, $0-16
JMP ·p256BigToLittle(SB)
// func p256LittleToBig(res *[32]byte, in *p256Element)
TEXT ·p256LittleToBig(SB), NOSPLIT, $0-16
JMP ·p256BigToLittle(SB)
// func p256BigToLittle(res *p256Element, in *[32]byte)
TEXT ·p256BigToLittle(SB), NOSPLIT, $0-16
MOVQ res+0(FP), DI
MOVQ in+8(FP), SI
MOVQ (SI), R8
MOVQ 8(SI), R9
MOVQ 16(SI), R10
MOVQ 24(SI), R11
BSWAPQ R8
BSWAPQ R9
BSWAPQ R10
BSWAPQ R11
MOVQ R11, (DI)
MOVQ R10, 8(DI)
MOVQ R9, 16(DI)
MOVQ R8, 24(DI)
RET
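
// The three JMP entry points above share this body: converting between the
// 32-byte big-endian encoding and four little-endian limbs is a pure byte
// swap and therefore its own inverse. A rough Go equivalent (a sketch, with
// a hypothetical helper name, using encoding/binary):
//
//	func bigToLittle(res *[4]uint64, in *[32]byte) {
//		for i := 0; i < 4; i++ {
//			// limb i holds bits [64*i, 64*i+64) of the value
//			res[i] = binary.BigEndian.Uint64(in[24-8*i : 32-8*i])
//		}
//	}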
// func p256MovCond(res *P256Point, a *P256Point, b *P256Point, cond int)
// Requires: SSE2
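//
// p256MovCond sets res = a if cond != 0 and res = b if cond == 0, without
// branching on cond: PCMPEQL turns cond into an all-ones mask when it is
// zero, and each 128-bit chunk is assembled as (a AND NOT mask) XOR
// (b AND mask) using PANDN/PAND/PXOR.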
TEXT ·p256MovCond(SB), NOSPLIT, $0-32
MOVQ res+0(FP), DI
MOVQ a+8(FP), SI
MOVQ b+16(FP), CX
MOVQ cond+24(FP), X12
PXOR X13, X13
PSHUFD $0x00, X12, X12
PCMPEQL X13, X12
MOVOU X12, X0
MOVOU (SI), X6
PANDN X6, X0
MOVOU X12, X1
MOVOU 16(SI), X7
PANDN X7, X1
MOVOU X12, X2
MOVOU 32(SI), X8
PANDN X8, X2
MOVOU X12, X3
MOVOU 48(SI), X9
PANDN X9, X3
MOVOU X12, X4
MOVOU 64(SI), X10
PANDN X10, X4
MOVOU X12, X5
MOVOU 80(SI), X11
PANDN X11, X5
MOVOU (CX), X6
MOVOU 16(CX), X7
MOVOU 32(CX), X8
MOVOU 48(CX), X9
MOVOU 64(CX), X10
MOVOU 80(CX), X11
PAND X12, X6
PAND X12, X7
PAND X12, X8
PAND X12, X9
PAND X12, X10
PAND X12, X11
PXOR X6, X0
PXOR X7, X1
PXOR X8, X2
PXOR X9, X3
PXOR X10, X4
PXOR X11, X5
MOVOU X0, (DI)
MOVOU X1, 16(DI)
MOVOU X2, 32(DI)
MOVOU X3, 48(DI)
MOVOU X4, 64(DI)
MOVOU X5, 80(DI)
RET
// func p256NegCond(val *p256Element, cond int)
// Requires: CMOV
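//
// p256NegCond replaces val with p - val if cond != 0 and leaves it alone
// otherwise, in constant time: the subtraction from p is always computed,
// and CMOVQEQ selects the original limbs back when cond == 0.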
TEXT ·p256NegCond(SB), NOSPLIT, $0-16
MOVQ val+0(FP), DI
MOVQ cond+8(FP), R14
// acc = p256 (the field prime)
MOVQ $-1, R8
MOVQ p256const0<>+0(SB), R9
MOVQ $+0, R10
MOVQ p256const1<>+0(SB), R11
// Load the original value
MOVQ (DI), R13
MOVQ 8(DI), SI
MOVQ 16(DI), CX
MOVQ 24(DI), R15
// Speculatively subtract
SUBQ R13, R8
SBBQ SI, R9
SBBQ CX, R10
SBBQ R15, R11
// If condition is 0, keep original value
TESTQ R14, R14
CMOVQEQ R13, R8
CMOVQEQ SI, R9
CMOVQEQ CX, R10
CMOVQEQ R15, R11
// Store result
MOVQ R8, (DI)
MOVQ R9, 8(DI)
MOVQ R10, 16(DI)
MOVQ R11, 24(DI)
RET
DATA p256const0<>+0(SB)/8, $0x00000000ffffffff
GLOBL p256const0<>(SB), RODATA, $8
DATA p256const1<>+0(SB)/8, $0xffffffff00000001
GLOBL p256const1<>(SB), RODATA, $8
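
// p256const0 and p256const1 are the second and fourth limbs of the field
// prime p = 2^256 - 2^224 + 2^192 + 2^96 - 1, whose little-endian limbs are
// [0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000,
// 0xffffffff00000001]. The first and third limbs are cheap immediates
// (-1 and 0), so only these two are kept in memory.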
// func p256Sqr(res *p256Element, in *p256Element, n int)
// Requires: CMOV
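//
// p256Sqr performs n sequential Montgomery squarings, res = in * in * R^-1
// mod p each time. Each of the four reduction steps below relies on
// p ≡ -1 (mod 2^64): the Montgomery multiplier -p^-1 mod 2^64 is 1, so the
// low limb a0 is itself the factor to cancel. Adding a0*p zeroes the low
// limb, and since p + 1 = 2^96 + 0xffffffff00000001 * 2^192, one step is
// (a sketch in Go with math/bits, assuming a 5-limb little-endian
// accumulator acc):
//
//	a0 := acc[0]
//	hi, lo := bits.Mul64(a0, 0xffffffff00000001)
//	var c uint64
//	acc[1], c = bits.Add64(acc[1], a0<<32, 0)
//	acc[2], c = bits.Add64(acc[2], a0>>32, c)
//	acc[3], c = bits.Add64(acc[3], lo, c)
//	acc[4] = hi + c
//	// acc[0] is now zero; the limbs shift down one position.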
TEXT ·p256Sqr(SB), NOSPLIT, $0-24
MOVQ res+0(FP), DI
MOVQ in+8(FP), SI
MOVQ n+16(FP), BX
sqrLoop:
// y[1:] * y[0]
MOVQ (SI), R14
MOVQ 8(SI), AX
MULQ R14
MOVQ AX, R9
MOVQ DX, R10
MOVQ 16(SI), AX
MULQ R14
ADDQ AX, R10
ADCQ $0x00, DX
MOVQ DX, R11
MOVQ 24(SI), AX
MULQ R14
ADDQ AX, R11
ADCQ $0x00, DX
MOVQ DX, R12
// y[2:] * y[1]
MOVQ 8(SI), R14
MOVQ 16(SI), AX
MULQ R14
ADDQ AX, R11
ADCQ $0x00, DX
MOVQ DX, R15
MOVQ 24(SI), AX
MULQ R14
ADDQ R15, R12
ADCQ $0x00, DX
ADDQ AX, R12
ADCQ $0x00, DX
MOVQ DX, R13
// y[3] * y[2]
MOVQ 16(SI), R14
MOVQ 24(SI), AX
MULQ R14
ADDQ AX, R13
ADCQ $0x00, DX
MOVQ DX, CX
XORQ R15, R15
// *2
ADDQ R9, R9
ADCQ R10, R10
ADCQ R11, R11
ADCQ R12, R12
ADCQ R13, R13
ADCQ CX, CX
ADCQ $0x00, R15
// Missing products
MOVQ (SI), AX
MULQ AX
MOVQ AX, R8
MOVQ DX, R14
MOVQ 8(SI), AX
MULQ AX
ADDQ R14, R9
ADCQ AX, R10
ADCQ $0x00, DX
MOVQ DX, R14
MOVQ 16(SI), AX
MULQ AX
ADDQ R14, R11
ADCQ AX, R12
ADCQ $0x00, DX
MOVQ DX, R14
MOVQ 24(SI), AX
MULQ AX
ADDQ R14, R13
ADCQ AX, CX
ADCQ DX, R15
MOVQ R15, SI
// First reduction step
MOVQ R8, AX
MOVQ R8, R15
SHLQ $0x20, R8
MULQ p256const1<>+0(SB)
SHRQ $0x20, R15
ADDQ R8, R9
ADCQ R15, R10
ADCQ AX, R11
ADCQ $0x00, DX
MOVQ DX, R8
// Second reduction step
MOVQ R9, AX
MOVQ R9, R15
SHLQ $0x20, R9
MULQ p256const1<>+0(SB)
SHRQ $0x20, R15
ADDQ R9, R10
ADCQ R15, R11
ADCQ AX, R8
ADCQ $0x00, DX
MOVQ DX, R9
// Third reduction step
MOVQ R10, AX
MOVQ R10, R15
SHLQ $0x20, R10
MULQ p256const1<>+0(SB)
SHRQ $0x20, R15
ADDQ R10, R11
ADCQ R15, R8
ADCQ AX, R9
ADCQ $0x00, DX
MOVQ DX, R10
// Last reduction step
XORQ R14, R14
MOVQ R11, AX
MOVQ R11, R15
SHLQ $0x20, R11
MULQ p256const1<>+0(SB)
SHRQ $0x20, R15
ADDQ R11, R8
ADCQ R15, R9
ADCQ AX, R10
ADCQ $0x00, DX
MOVQ DX, R11
// Add bits [511:256] of the sqr result
ADCQ R12, R8
ADCQ R13, R9
ADCQ CX, R10
ADCQ SI, R11
ADCQ $0x00, R14
MOVQ R8, R12
MOVQ R9, R13
MOVQ R10, CX
MOVQ R11, R15
// Subtract p256
SUBQ $-1, R8
SBBQ p256const0<>+0(SB), R9
SBBQ $0x00, R10
SBBQ p256const1<>+0(SB), R11
SBBQ $0x00, R14
CMOVQCS R12, R8
CMOVQCS R13, R9
CMOVQCS CX, R10
CMOVQCS R15, R11
MOVQ R8, (DI)
MOVQ R9, 8(DI)
MOVQ R10, 16(DI)
MOVQ R11, 24(DI)
MOVQ DI, SI
DECQ BX
JNE sqrLoop
RET
// func p256Mul(res *p256Element, in1 *p256Element, in2 *p256Element)
// Requires: CMOV
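//
// p256Mul computes the Montgomery product res = in1 * in2 * R^-1 mod p with
// R = 2^256. The four reduction steps are interleaved with the four rows of
// partial products, so the 512-bit intermediate is never materialized and
// the accumulator stays within six registers; a final conditional
// subtraction brings the result below p.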
TEXT ·p256Mul(SB), NOSPLIT, $0-24
MOVQ res+0(FP), DI
MOVQ in1+8(FP), SI
MOVQ in2+16(FP), CX
// x * y[0]
MOVQ (CX), R14
MOVQ (SI), AX
MULQ R14
MOVQ AX, R8
MOVQ DX, R9
MOVQ 8(SI), AX
MULQ R14
ADDQ AX, R9
ADCQ $0x00, DX
MOVQ DX, R10
MOVQ 16(SI), AX
MULQ R14
ADDQ AX, R10
ADCQ $0x00, DX
MOVQ DX, R11
MOVQ 24(SI), AX
MULQ R14
ADDQ AX, R11
ADCQ $0x00, DX
MOVQ DX, R12
XORQ R13, R13
// First reduction step
MOVQ R8, AX
MOVQ R8, R15
SHLQ $0x20, R8
MULQ p256const1<>+0(SB)
SHRQ $0x20, R15
ADDQ R8, R9
ADCQ R15, R10
ADCQ AX, R11
ADCQ DX, R12
ADCQ $0x00, R13
XORQ R8, R8
// x * y[1]
MOVQ 8(CX), R14
MOVQ (SI), AX
MULQ R14
ADDQ AX, R9
ADCQ $0x00, DX
MOVQ DX, R15
MOVQ 8(SI), AX
MULQ R14
ADDQ R15, R10
ADCQ $0x00, DX
ADDQ AX, R10
ADCQ $0x00, DX
MOVQ DX, R15
MOVQ 16(SI), AX
MULQ R14
ADDQ R15, R11
ADCQ $0x00, DX
ADDQ AX, R11
ADCQ $0x00, DX
MOVQ DX, R15
MOVQ 24(SI), AX
MULQ R14
ADDQ R15, R12
ADCQ $0x00, DX
ADDQ AX, R12
ADCQ DX, R13
ADCQ $0x00, R8
// Second reduction step
MOVQ R9, AX
MOVQ R9, R15
SHLQ $0x20, R9
MULQ p256const1<>+0(SB)
SHRQ $0x20, R15
ADDQ R9, R10
ADCQ R15, R11
ADCQ AX, R12
ADCQ DX, R13
ADCQ $0x00, R8
XORQ R9, R9
// x * y[2]
MOVQ 16(CX), R14
MOVQ (SI), AX
MULQ R14
ADDQ AX, R10
ADCQ $0x00, DX
MOVQ DX, R15
MOVQ 8(SI), AX
MULQ R14
ADDQ R15, R11
ADCQ $0x00, DX
ADDQ AX, R11
ADCQ $0x00, DX
MOVQ DX, R15
MOVQ 16(SI), AX
MULQ R14
ADDQ R15, R12
ADCQ $0x00, DX
ADDQ AX, R12
ADCQ $0x00, DX
MOVQ DX, R15
MOVQ 24(SI), AX
MULQ R14
ADDQ R15, R13
ADCQ $0x00, DX
ADDQ AX, R13
ADCQ DX, R8
ADCQ $0x00, R9
// Third reduction step
MOVQ R10, AX
MOVQ R10, R15
SHLQ $0x20, R10
MULQ p256const1<>+0(SB)
SHRQ $0x20, R15
ADDQ R10, R11
ADCQ R15, R12
ADCQ AX, R13
ADCQ DX, R8
ADCQ $0x00, R9
XORQ R10, R10
// x * y[3]
MOVQ 24(CX), R14
MOVQ (SI), AX
MULQ R14
ADDQ AX, R11
ADCQ $0x00, DX
MOVQ DX, R15
MOVQ 8(SI), AX
MULQ R14
ADDQ R15, R12
ADCQ $0x00, DX
ADDQ AX, R12
ADCQ $0x00, DX
MOVQ DX, R15
MOVQ 16(SI), AX
MULQ R14
ADDQ R15, R13
ADCQ $0x00, DX
ADDQ AX, R13
ADCQ $0x00, DX
MOVQ DX, R15
MOVQ 24(SI), AX
MULQ R14
ADDQ R15, R8
ADCQ $0x00, DX
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0x00, R10
// Last reduction step
MOVQ R11, AX
MOVQ R11, R15
SHLQ $0x20, R11
MULQ p256const1<>+0(SB)
SHRQ $0x20, R15
ADDQ R11, R12
ADCQ R15, R13
ADCQ AX, R8
ADCQ DX, R9
ADCQ $0x00, R10
// Copy result [255:0]
MOVQ R12, SI
MOVQ R13, R11
MOVQ R8, R14
MOVQ R9, R15
// Subtract p256
SUBQ $-1, R12
SBBQ p256const0<>+0(SB), R13
SBBQ $0x00, R8
SBBQ p256const1<>+0(SB), R9
SBBQ $0x00, R10
CMOVQCS SI, R12
CMOVQCS R11, R13
CMOVQCS R14, R8
CMOVQCS R15, R9
MOVQ R12, (DI)
MOVQ R13, 8(DI)
MOVQ R8, 16(DI)
MOVQ R9, 24(DI)
RET
// func p256FromMont(res *p256Element, in *p256Element)
// Requires: CMOV
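//
// p256FromMont converts out of the Montgomery domain: res = in * R^-1 mod p
// with R = 2^256. It is a Montgomery multiplication by 1, so there are no
// partial products, only the four reduction steps and the final conditional
// subtraction.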
TEXT ·p256FromMont(SB), NOSPLIT, $0-16
MOVQ res+0(FP), DI
MOVQ in+8(FP), SI
MOVQ (SI), R8
MOVQ 8(SI), R9
MOVQ 16(SI), R10
MOVQ 24(SI), R11
XORQ R12, R12
// Only reduce, no multiplications are needed
// First stage
MOVQ R8, AX
MOVQ R8, R15
SHLQ $0x20, R8
MULQ p256const1<>+0(SB)
SHRQ $0x20, R15
ADDQ R8, R9
ADCQ R15, R10
ADCQ AX, R11
ADCQ DX, R12
XORQ R13, R13
// Second stage
MOVQ R9, AX
MOVQ R9, R15
SHLQ $0x20, R9
MULQ p256const1<>+0(SB)
SHRQ $0x20, R15
ADDQ R9, R10
ADCQ R15, R11
ADCQ AX, R12
ADCQ DX, R13
XORQ R8, R8
// Third stage
MOVQ R10, AX
MOVQ R10, R15
SHLQ $0x20, R10
MULQ p256const1<>+0(SB)
SHRQ $0x20, R15
ADDQ R10, R11
ADCQ R15, R12
ADCQ AX, R13
ADCQ DX, R8
XORQ R9, R9
// Last stage
MOVQ R11, AX
MOVQ R11, R15
SHLQ $0x20, R11
MULQ p256const1<>+0(SB)
SHRQ $0x20, R15
ADDQ R11, R12
ADCQ R15, R13
ADCQ AX, R8
ADCQ DX, R9
MOVQ R12, SI
MOVQ R13, R11
MOVQ R8, R14
MOVQ R9, R15
SUBQ $-1, R12
SBBQ p256const0<>+0(SB), R13
SBBQ $0x00, R8
SBBQ p256const1<>+0(SB), R9
CMOVQCS SI, R12
CMOVQCS R11, R13
CMOVQCS R14, R8
CMOVQCS R15, R9
MOVQ R12, (DI)
MOVQ R13, 8(DI)
MOVQ R8, 16(DI)
MOVQ R9, 24(DI)
RET
// func p256Select(res *P256Point, table *p256Table, idx int)
// Requires: SSE2
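//
// p256Select copies table entry number idx into res in constant time: it
// reads all 16 entries, ANDs each with a mask that is all ones only when
// the (1-based) loop counter equals idx, and XOR-accumulates the result.
// If idx == 0, res is set to all zeros.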
TEXT ·p256Select(SB), NOSPLIT, $0-24
MOVQ idx+16(FP), AX
MOVQ table+8(FP), DI
MOVQ res+0(FP), DX
PXOR X15, X15
PCMPEQL X14, X14
PSUBL X14, X15
MOVL AX, X14
PSHUFD $0x00, X14, X14
PXOR X0, X0
PXOR X1, X1
PXOR X2, X2
PXOR X3, X3
PXOR X4, X4
PXOR X5, X5
MOVQ $0x00000010, AX
MOVOU X15, X13
loop_select:
MOVOU X13, X12
PADDL X15, X13
PCMPEQL X14, X12
MOVOU (DI), X6
MOVOU 16(DI), X7
MOVOU 32(DI), X8
MOVOU 48(DI), X9
MOVOU 64(DI), X10
MOVOU 80(DI), X11
ADDQ $0x60, DI
PAND X12, X6
PAND X12, X7
PAND X12, X8
PAND X12, X9
PAND X12, X10
PAND X12, X11
PXOR X6, X0
PXOR X7, X1
PXOR X8, X2
PXOR X9, X3
PXOR X10, X4
PXOR X11, X5
DECQ AX
JNE loop_select
MOVOU X0, (DX)
MOVOU X1, 16(DX)
MOVOU X2, 32(DX)
MOVOU X3, 48(DX)
MOVOU X4, 64(DX)
MOVOU X5, 80(DX)
RET
// func p256SelectAffine(res *p256AffinePoint, table *p256AffineTable, idx int)
// Requires: SSE2
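//
// p256SelectAffine is the same constant-time scan over a table of 32 affine
// points (64 bytes each). Each of the 16 iterations loads 128 bytes, i.e.
// two consecutive entries, and masks them with two consecutive counter
// values.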
TEXT ·p256SelectAffine(SB), NOSPLIT, $0-24
MOVQ idx+16(FP), AX
MOVQ table+8(FP), DI
MOVQ res+0(FP), DX
PXOR X15, X15
PCMPEQL X14, X14
PSUBL X14, X15
MOVL AX, X14
PSHUFD $0x00, X14, X14
PXOR X0, X0
PXOR X1, X1
PXOR X2, X2
PXOR X3, X3
MOVQ $0x00000010, AX
MOVOU X15, X13
loop_select_base:
MOVOU X13, X12
PADDL X15, X13
PCMPEQL X14, X12
MOVOU (DI), X4
MOVOU 16(DI), X5
MOVOU 32(DI), X6
MOVOU 48(DI), X7
MOVOU 64(DI), X8
MOVOU 80(DI), X9
MOVOU 96(DI), X10
MOVOU 112(DI), X11
ADDQ $0x80, DI
PAND X12, X4
PAND X12, X5
PAND X12, X6
PAND X12, X7
MOVOU X13, X12
PADDL X15, X13
PCMPEQL X14, X12
PAND X12, X8
PAND X12, X9
PAND X12, X10
PAND X12, X11
PXOR X4, X0
PXOR X5, X1
PXOR X6, X2
PXOR X7, X3
PXOR X8, X0
PXOR X9, X1
PXOR X10, X2
PXOR X11, X3
DECQ AX
JNE loop_select_base
MOVOU X0, (DX)
MOVOU X1, 16(DX)
MOVOU X2, 32(DX)
MOVOU X3, 48(DX)
RET
// func p256OrdMul(res *p256OrdElement, in1 *p256OrdElement, in2 *p256OrdElement)
// Requires: CMOV
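//
// p256OrdMul computes the Montgomery product res = in1 * in2 * R^-1 mod n,
// where n is the group order. Unlike the field prime, n has no special form
// mod 2^64, so each reduction step multiplies the low limb by
// p256ordK0 = -n^-1 mod 2^64 to get the factor m, then adds m * n limb by
// limb to cancel the low limb.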
TEXT ·p256OrdMul(SB), NOSPLIT, $0-24
MOVQ res+0(FP), DI
MOVQ in1+8(FP), SI
MOVQ in2+16(FP), CX
// x * y[0]
MOVQ (CX), R14
MOVQ (SI), AX
MULQ R14
MOVQ AX, R8
MOVQ DX, R9
MOVQ 8(SI), AX
MULQ R14
ADDQ AX, R9
ADCQ $0x00, DX
MOVQ DX, R10
MOVQ 16(SI), AX
MULQ R14
ADDQ AX, R10
ADCQ $0x00, DX
MOVQ DX, R11
MOVQ 24(SI), AX
MULQ R14
ADDQ AX, R11
ADCQ $0x00, DX
MOVQ DX, R12
XORQ R13, R13
// First reduction step
MOVQ R8, AX
MULQ p256ordK0<>+0(SB)
MOVQ AX, R14
MOVQ p256ord<>+0(SB), AX
MULQ R14
ADDQ AX, R8
ADCQ $0x00, DX
MOVQ DX, R15
MOVQ p256ord<>+8(SB), AX
MULQ R14
ADDQ R15, R9
ADCQ $0x00, DX
ADDQ AX, R9
ADCQ $0x00, DX
MOVQ DX, R15
MOVQ p256ord<>+16(SB), AX
MULQ R14
ADDQ R15, R10
ADCQ $0x00, DX
ADDQ AX, R10
ADCQ $0x00, DX
MOVQ DX, R15
MOVQ p256ord<>+24(SB), AX
MULQ R14
ADDQ R15, R11
ADCQ $0x00, DX
ADDQ AX, R11
ADCQ DX, R12
ADCQ $0x00, R13
// x * y[1]
MOVQ 8(CX), R14
MOVQ (SI), AX
MULQ R14
ADDQ AX, R9
ADCQ $0x00, DX
MOVQ DX, R15
MOVQ 8(SI), AX
MULQ R14
ADDQ R15, R10
ADCQ $0x00, DX
ADDQ AX, R10
ADCQ $0x00, DX
MOVQ DX, R15
MOVQ 16(SI), AX
MULQ R14
ADDQ R15, R11
ADCQ $0x00, DX
ADDQ AX, R11
ADCQ $0x00, DX
MOVQ DX, R15
MOVQ 24(SI), AX
MULQ R14
ADDQ R15, R12
ADCQ $0x00, DX
ADDQ AX, R12
ADCQ DX, R13
ADCQ $0x00, R8
// Second reduction step
MOVQ R9, AX
MULQ p256ordK0<>+0(SB)
MOVQ AX, R14
MOVQ p256ord<>+0(SB), AX
MULQ R14
ADDQ AX, R9
ADCQ $0x00, DX
MOVQ DX, R15
MOVQ p256ord<>+8(SB), AX
MULQ R14
ADDQ R15, R10
ADCQ $0x00, DX
ADDQ AX, R10
ADCQ $0x00, DX
MOVQ DX, R15
MOVQ p256ord<>+16(SB), AX
MULQ R14
ADDQ R15, R11
ADCQ $0x00, DX
ADDQ AX, R11
ADCQ $0x00, DX
MOVQ DX, R15
MOVQ p256ord<>+24(SB), AX
MULQ R14
ADDQ R15, R12
ADCQ $0x00, DX
ADDQ AX, R12
ADCQ DX, R13
ADCQ $0x00, R8
// x * y[2]
MOVQ 16(CX), R14
MOVQ (SI), AX
MULQ R14
ADDQ AX, R10
ADCQ $0x00, DX
MOVQ DX, R15
MOVQ 8(SI), AX
MULQ R14
ADDQ R15, R11
ADCQ $0x00, DX
ADDQ AX, R11
ADCQ $0x00, DX
MOVQ DX, R15
MOVQ 16(SI), AX
MULQ R14
ADDQ R15, R12
ADCQ $0x00, DX
ADDQ AX, R12
ADCQ $0x00, DX
MOVQ DX, R15
MOVQ 24(SI), AX
MULQ R14
ADDQ R15, R13
ADCQ $0x00, DX
ADDQ AX, R13
ADCQ DX, R8
ADCQ $0x00, R9
// Third reduction step
MOVQ R10, AX
MULQ p256ordK0<>+0(SB)
MOVQ AX, R14
MOVQ p256ord<>+0(SB), AX
MULQ R14
ADDQ AX, R10
ADCQ $0x00, DX
MOVQ DX, R15
MOVQ p256ord<>+8(SB), AX
MULQ R14
ADDQ R15, R11
ADCQ $0x00, DX
ADDQ AX, R11
ADCQ $0x00, DX
MOVQ DX, R15
MOVQ p256ord<>+16(SB), AX
MULQ R14
ADDQ R15, R12
ADCQ $0x00, DX
ADDQ AX, R12
ADCQ $0x00, DX
MOVQ DX, R15
MOVQ p256ord<>+24(SB), AX
MULQ R14
ADDQ R15, R13
ADCQ $0x00, DX
ADDQ AX, R13
ADCQ DX, R8
ADCQ $0x00, R9
// x * y[3]
MOVQ 24(CX), R14
MOVQ (SI), AX
MULQ R14
ADDQ AX, R11
ADCQ $0x00, DX
MOVQ DX, R15
MOVQ 8(SI), AX
MULQ R14
ADDQ R15, R12
ADCQ $0x00, DX
ADDQ AX, R12
ADCQ $0x00, DX
MOVQ DX, R15
MOVQ 16(SI), AX
MULQ R14
ADDQ R15, R13
ADCQ $0x00, DX
ADDQ AX, R13
ADCQ $0x00, DX
MOVQ DX, R15
MOVQ 24(SI), AX
MULQ R14
ADDQ R15, R8
ADCQ $0x00, DX
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0x00, R10
// Last reduction step
MOVQ R11, AX
MULQ p256ordK0<>+0(SB)
MOVQ AX, R14
MOVQ p256ord<>+0(SB), AX
MULQ R14
ADDQ AX, R11
ADCQ $0x00, DX
MOVQ DX, R15
MOVQ p256ord<>+8(SB), AX
MULQ R14
ADDQ R15, R12
ADCQ $0x00, DX
ADDQ AX, R12
ADCQ $0x00, DX
MOVQ DX, R15
MOVQ p256ord<>+16(SB), AX
MULQ R14
ADDQ R15, R13
ADCQ $0x00, DX
ADDQ AX, R13
ADCQ $0x00, DX
MOVQ DX, R15
MOVQ p256ord<>+24(SB), AX
MULQ R14
ADDQ R15, R8
ADCQ $0x00, DX
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0x00, R10
// Copy result [255:0]
MOVQ R12, SI
MOVQ R13, R11
MOVQ R8, R14
MOVQ R9, R15
// Subtract p256
SUBQ p256ord<>+0(SB), R12
SBBQ p256ord<>+8(SB), R13
SBBQ p256ord<>+16(SB), R8
SBBQ p256ord<>+24(SB), R9
SBBQ $0x00, R10
CMOVQCS SI, R12
CMOVQCS R11, R13
CMOVQCS R14, R8
CMOVQCS R15, R9
MOVQ R12, (DI)
MOVQ R13, 8(DI)
MOVQ R8, 16(DI)
MOVQ R9, 24(DI)
RET
DATA p256ordK0<>+0(SB)/8, $0xccd1c8aaee00bc4f
GLOBL p256ordK0<>(SB), RODATA, $8
DATA p256ord<>+0(SB)/8, $0xf3b9cac2fc632551
DATA p256ord<>+8(SB)/8, $0xbce6faada7179e84
DATA p256ord<>+16(SB)/8, $0xffffffffffffffff
DATA p256ord<>+24(SB)/8, $0xffffffff00000000
GLOBL p256ord<>(SB), RODATA, $32
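
// p256ord holds the little-endian limbs of the group order
// n = 0xffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc632551,
// and p256ordK0 = -n^-1 mod 2^64 is the Montgomery reduction factor.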
// func p256OrdSqr(res *p256OrdElement, in *p256OrdElement, n int)
// Requires: CMOV
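//
// p256OrdSqr Montgomery-squares in modulo the group order, n times in
// sequence, with the same structure as p256OrdMul. The upper two limbs of
// the order are 0xffffffffffffffff and 0xffffffff00000000, so their
// products are formed with shifts, additions and subtractions (the
// SHLQ/SHRQ/SUBQ/SBBQ runs) instead of MULQ.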
TEXT ·p256OrdSqr(SB), NOSPLIT, $0-24
MOVQ res+0(FP), DI
MOVQ in+8(FP), SI
MOVQ n+16(FP), BX
ordSqrLoop:
// y[1:] * y[0]
MOVQ (SI), R14
MOVQ 8(SI), AX
MULQ R14
MOVQ AX, R9
MOVQ DX, R10
MOVQ 16(SI), AX
MULQ R14
ADDQ AX, R10
ADCQ $0x00, DX
MOVQ DX, R11
MOVQ 24(SI), AX
MULQ R14
ADDQ AX, R11
ADCQ $0x00, DX
MOVQ DX, R12
// y[2:] * y[1]
MOVQ 8(SI), R14
MOVQ 16(SI), AX
MULQ R14
ADDQ AX, R11
ADCQ $0x00, DX
MOVQ DX, R15
MOVQ 24(SI), AX
MULQ R14
ADDQ R15, R12
ADCQ $0x00, DX
ADDQ AX, R12
ADCQ $0x00, DX
MOVQ DX, R13
// y[3] * y[2]
MOVQ 16(SI), R14
MOVQ 24(SI), AX
MULQ R14
ADDQ AX, R13
ADCQ $0x00, DX
MOVQ DX, CX
XORQ R15, R15
// *2
ADDQ R9, R9
ADCQ R10, R10
ADCQ R11, R11
ADCQ R12, R12
ADCQ R13, R13
ADCQ CX, CX
ADCQ $0x00, R15
// Missing products
MOVQ (SI), AX
MULQ AX
MOVQ AX, R8
MOVQ DX, R14
MOVQ 8(SI), AX
MULQ AX
ADDQ R14, R9
ADCQ AX, R10
ADCQ $0x00, DX
MOVQ DX, R14
MOVQ 16(SI), AX
MULQ AX
ADDQ R14, R11
ADCQ AX, R12
ADCQ $0x00, DX
MOVQ DX, R14
MOVQ 24(SI), AX
MULQ AX
ADDQ R14, R13
ADCQ AX, CX
ADCQ DX, R15
MOVQ R15, SI
// First reduction step
MOVQ R8, AX
MULQ p256ordK0<>+0(SB)
MOVQ AX, R14
MOVQ p256ord<>+0(SB), AX
MULQ R14
ADDQ AX, R8
ADCQ $0x00, DX
MOVQ DX, R15
MOVQ p256ord<>+8(SB), AX
MULQ R14
ADDQ R15, R9
ADCQ $0x00, DX
ADDQ AX, R9
MOVQ R14, R15
ADCQ DX, R10
ADCQ $0x00, R15
SUBQ R14, R10
SBBQ $0x00, R15
MOVQ R14, AX
MOVQ R14, DX
MOVQ R14, R8
SHLQ $0x20, AX
SHRQ $0x20, DX
ADDQ R15, R11
ADCQ $0x00, R8
SUBQ AX, R11
SBBQ DX, R8
// Second reduction step
MOVQ R9, AX
MULQ p256ordK0<>+0(SB)
MOVQ AX, R14
MOVQ p256ord<>+0(SB), AX
MULQ R14
ADDQ AX, R9
ADCQ $0x00, DX
MOVQ DX, R15
MOVQ p256ord<>+8(SB), AX
MULQ R14
ADDQ R15, R10
ADCQ $0x00, DX
ADDQ AX, R10
MOVQ R14, R15
ADCQ DX, R11
ADCQ $0x00, R15
SUBQ R14, R11
SBBQ $0x00, R15
MOVQ R14, AX
MOVQ R14, DX
MOVQ R14, R9
SHLQ $0x20, AX
SHRQ $0x20, DX
ADDQ R15, R8
ADCQ $0x00, R9
SUBQ AX, R8
SBBQ DX, R9
// Third reduction step
MOVQ R10, AX
MULQ p256ordK0<>+0(SB)
MOVQ AX, R14
MOVQ p256ord<>+0(SB), AX
MULQ R14
ADDQ AX, R10
ADCQ $0x00, DX
MOVQ DX, R15
MOVQ p256ord<>+8(SB), AX
MULQ R14
ADDQ R15, R11
ADCQ $0x00, DX
ADDQ AX, R11
MOVQ R14, R15
ADCQ DX, R8
ADCQ $0x00, R15
SUBQ R14, R8
SBBQ $0x00, R15
MOVQ R14, AX
MOVQ R14, DX
MOVQ R14, R10
SHLQ $0x20, AX
SHRQ $0x20, DX
ADDQ R15, R9
ADCQ $0x00, R10
SUBQ AX, R9
SBBQ DX, R10
// Last reduction step
MOVQ R11, AX
MULQ p256ordK0<>+0(SB)
MOVQ AX, R14
MOVQ p256ord<>+0(SB), AX
MULQ R14
ADDQ AX, R11
ADCQ $0x00, DX
MOVQ DX, R15
MOVQ p256ord<>+8(SB), AX
MULQ R14
ADDQ R15, R8
ADCQ $0x00, DX
ADDQ AX, R8
ADCQ $0x00, DX
MOVQ DX, R15
MOVQ R14, R15
ADCQ DX, R9
ADCQ $0x00, R15
SUBQ R14, R9
SBBQ $0x00, R15
MOVQ R14, AX
MOVQ R14, DX
MOVQ R14, R11
SHLQ $0x20, AX
SHRQ $0x20, DX
ADDQ R15, R10
ADCQ $0x00, R11
SUBQ AX, R10
SBBQ DX, R11
XORQ R14, R14
// Add bits [511:256] of the sqr result
ADCQ R12, R8
ADCQ R13, R9
ADCQ CX, R10
ADCQ SI, R11
ADCQ $0x00, R14
MOVQ R8, R12
MOVQ R9, R13
MOVQ R10, CX
MOVQ R11, R15
// Subtract p256
SUBQ p256ord<>+0(SB), R8
SBBQ p256ord<>+8(SB), R9
SBBQ p256ord<>+16(SB), R10
SBBQ p256ord<>+24(SB), R11
SBBQ $0x00, R14
CMOVQCS R12, R8
CMOVQCS R13, R9
CMOVQCS CX, R10
CMOVQCS R15, R11
MOVQ R8, (DI)
MOVQ R9, 8(DI)
MOVQ R10, 16(DI)
MOVQ R11, 24(DI)
MOVQ DI, SI
DECQ BX
JNE ordSqrLoop
RET
// func p256SubInternal()
// Requires: CMOV
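//
// p256SubInternal computes x - y mod p, using the internal register
// convention shared by the *Internal helpers: x in (R10, R11, R12, R13),
// y in (R14, R15, DI, SI), result in (R10, R11, R12, R13). If the
// subtraction borrows, p is added back; otherwise CMOVQEQ restores the
// unadjusted value.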
TEXT p256SubInternal(SB), NOSPLIT, $0
XORQ AX, AX
SUBQ R14, R10
SBBQ R15, R11
SBBQ DI, R12
SBBQ SI, R13
SBBQ $0x00, AX
MOVQ R10, BX
MOVQ R11, CX
MOVQ R12, R8
MOVQ R13, R9
ADDQ $-1, R10
ADCQ p256const0<>+0(SB), R11
ADCQ $0x00, R12
ADCQ p256const1<>+0(SB), R13
ANDQ $0x01, AX
CMOVQEQ BX, R10
CMOVQEQ CX, R11
CMOVQEQ R8, R12
CMOVQEQ R9, R13
RET
// func p256MulInternal()
// Requires: CMOV
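//
// p256MulInternal computes the Montgomery product x * y * R^-1 mod p with
// x in (R10, R11, R12, R13) and y in (R14, R15, DI, SI). The result
// replaces x; y is left untouched, which lets callers chain multiplications
// against the same operand.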
TEXT p256MulInternal(SB), NOSPLIT, $8
MOVQ R10, AX
MULQ R14
MOVQ AX, BX
MOVQ DX, CX
MOVQ R10, AX
MULQ R15
ADDQ AX, CX
ADCQ $0x00, DX
MOVQ DX, R8
MOVQ R10, AX
MULQ DI
ADDQ AX, R8
ADCQ $0x00, DX
MOVQ DX, R9
MOVQ R10, AX
MULQ SI
ADDQ AX, R9
ADCQ $0x00, DX
MOVQ DX, R10
MOVQ R11, AX
MULQ R14
ADDQ AX, CX
ADCQ $0x00, DX
MOVQ DX, BP
MOVQ R11, AX
MULQ R15
ADDQ BP, R8
ADCQ $0x00, DX
ADDQ AX, R8
ADCQ $0x00, DX
MOVQ DX, BP
MOVQ R11, AX
MULQ DI
ADDQ BP, R9
ADCQ $0x00, DX
ADDQ AX, R9
ADCQ $0x00, DX
MOVQ DX, BP
MOVQ R11, AX
MULQ SI
ADDQ BP, R10
ADCQ $0x00, DX
ADDQ AX, R10
ADCQ $0x00, DX
MOVQ DX, R11
MOVQ R12, AX
MULQ R14
ADDQ AX, R8
ADCQ $0x00, DX
MOVQ DX, BP
MOVQ R12, AX
MULQ R15
ADDQ BP, R9
ADCQ $0x00, DX
ADDQ AX, R9
ADCQ $0x00, DX
MOVQ DX, BP
MOVQ R12, AX
MULQ DI
ADDQ BP, R10
ADCQ $0x00, DX
ADDQ AX, R10
ADCQ $0x00, DX
MOVQ DX, BP
MOVQ R12, AX
MULQ SI
ADDQ BP, R11
ADCQ $0x00, DX
ADDQ AX, R11
ADCQ $0x00, DX
MOVQ DX, R12
MOVQ R13, AX
MULQ R14
ADDQ AX, R9
ADCQ $0x00, DX
MOVQ DX, BP
MOVQ R13, AX
MULQ R15
ADDQ BP, R10
ADCQ $0x00, DX
ADDQ AX, R10
ADCQ $0x00, DX
MOVQ DX, BP
MOVQ R13, AX
MULQ DI
ADDQ BP, R11
ADCQ $0x00, DX
ADDQ AX, R11
ADCQ $0x00, DX
MOVQ DX, BP
MOVQ R13, AX
MULQ SI
ADDQ BP, R12
ADCQ $0x00, DX
ADDQ AX, R12
ADCQ $0x00, DX
MOVQ DX, R13
// First reduction step
MOVQ BX, AX
MOVQ BX, BP
SHLQ $0x20, BX
MULQ p256const1<>+0(SB)
SHRQ $0x20, BP
ADDQ BX, CX
ADCQ BP, R8
ADCQ AX, R9
ADCQ $0x00, DX
MOVQ DX, BX
// Second reduction step
MOVQ CX, AX
MOVQ CX, BP
SHLQ $0x20, CX
MULQ p256const1<>+0(SB)
SHRQ $0x20, BP
ADDQ CX, R8
ADCQ BP, R9
ADCQ AX, BX
ADCQ $0x00, DX
MOVQ DX, CX
// Third reduction step
MOVQ R8, AX
MOVQ R8, BP
SHLQ $0x20, R8
MULQ p256const1<>+0(SB)
SHRQ $0x20, BP
ADDQ R8, R9
ADCQ BP, BX
ADCQ AX, CX
ADCQ $0x00, DX
MOVQ DX, R8
// Last reduction step
MOVQ R9, AX
MOVQ R9, BP
SHLQ $0x20, R9
MULQ p256const1<>+0(SB)
SHRQ $0x20, BP
ADDQ R9, BX
ADCQ BP, CX
ADCQ AX, R8
ADCQ $0x00, DX
MOVQ DX, R9
MOVQ $0x00000000, BP
// Add bits [511:256] of the result
ADCQ BX, R10
ADCQ CX, R11
ADCQ R8, R12
ADCQ R9, R13
ADCQ $0x00, BP
// Copy result
MOVQ R10, BX
MOVQ R11, CX
MOVQ R12, R8
MOVQ R13, R9
// Subtract p256
SUBQ $-1, R10
SBBQ p256const0<>+0(SB), R11
SBBQ $0x00, R12
SBBQ p256const1<>+0(SB), R13
SBBQ $0x00, BP
// If the result of the subtraction is negative, restore the previous result
CMOVQCS BX, R10
CMOVQCS CX, R11
CMOVQCS R8, R12
CMOVQCS R9, R13
RET
// func p256SqrInternal()
// Requires: CMOV
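//
// p256SqrInternal computes the Montgomery square x * x * R^-1 mod p in
// place, with x in (R10, R11, R12, R13). Like p256Sqr, it doubles the cross
// products and then adds the squared limbs before reducing.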
TEXT p256SqrInternal(SB), NOSPLIT, $8
MOVQ R10, AX
MULQ R11
MOVQ AX, CX
MOVQ DX, R8
MOVQ R10, AX
MULQ R12
ADDQ AX, R8
ADCQ $0x00, DX
MOVQ DX, R9
MOVQ R10, AX
MULQ R13
ADDQ AX, R9
ADCQ $0x00, DX
MOVQ DX, R14
MOVQ R11, AX
MULQ R12
ADDQ AX, R9
ADCQ $0x00, DX
MOVQ DX, BP
MOVQ R11, AX
MULQ R13
ADDQ BP, R14
ADCQ $0x00, DX
ADDQ AX, R14
ADCQ $0x00, DX
MOVQ DX, R15
MOVQ R12, AX
MULQ R13
ADDQ AX, R15
ADCQ $0x00, DX
MOVQ DX, DI
XORQ SI, SI
// *2
ADDQ CX, CX
ADCQ R8, R8
ADCQ R9, R9
ADCQ R14, R14
ADCQ R15, R15
ADCQ DI, DI
ADCQ $0x00, SI
// Missing products
MOVQ R10, AX
MULQ AX
MOVQ AX, BX
MOVQ DX, R10
MOVQ R11, AX
MULQ AX
ADDQ R10, CX
ADCQ AX, R8
ADCQ $0x00, DX
MOVQ DX, R10
MOVQ R12, AX
MULQ AX
ADDQ R10, R9
ADCQ AX, R14
ADCQ $0x00, DX
MOVQ DX, R10
MOVQ R13, AX
MULQ AX
ADDQ R10, R15
ADCQ AX, DI
ADCQ DX, SI
// First reduction step
MOVQ BX, AX
MOVQ BX, BP
SHLQ $0x20, BX
MULQ p256const1<>+0(SB)
SHRQ $0x20, BP
ADDQ BX, CX
ADCQ BP, R8
ADCQ AX, R9
ADCQ $0x00, DX
MOVQ DX, BX
// Second reduction step
MOVQ CX, AX
MOVQ CX, BP
SHLQ $0x20, CX
MULQ p256const1<>+0(SB)
SHRQ $0x20, BP
ADDQ CX, R8
ADCQ BP, R9
ADCQ AX, BX
ADCQ $0x00, DX
MOVQ DX, CX
// Third reduction step
MOVQ R8, AX
MOVQ R8, BP
SHLQ $0x20, R8
MULQ p256const1<>+0(SB)
SHRQ $0x20, BP
ADDQ R8, R9
ADCQ BP, BX
ADCQ AX, CX
ADCQ $0x00, DX
MOVQ DX, R8
// Last reduction step
MOVQ R9, AX
MOVQ R9, BP
SHLQ $0x20, R9
MULQ p256const1<>+0(SB)
SHRQ $0x20, BP
ADDQ R9, BX
ADCQ BP, CX
ADCQ AX, R8
ADCQ $0x00, DX
MOVQ DX, R9
MOVQ $0x00000000, BP
// Add bits [511:256] of the result
ADCQ BX, R14
ADCQ CX, R15
ADCQ R8, DI
ADCQ R9, SI
ADCQ $0x00, BP
// Copy result
MOVQ R14, R10
MOVQ R15, R11
MOVQ DI, R12
MOVQ SI, R13
// Subtract p256
SUBQ $-1, R10
SBBQ p256const0<>+0(SB), R11
SBBQ $0x00, R12
SBBQ p256const1<>+0(SB), R13
SBBQ $0x00, BP
// If the result of the subtraction is negative, restore the previous result
CMOVQCS R14, R10
CMOVQCS R15, R11
CMOVQCS DI, R12
CMOVQCS SI, R13
RET
// func p256PointAddAffineAsm(res *P256Point, in1 *P256Point, in2 *p256AffinePoint, sign int, sel int, zero int)
// Requires: CMOV, SSE2
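//
// p256PointAddAffineAsm computes res = in1 + in2, where in1 is a Jacobian
// point and in2 an affine one. If sign != 0, in2 is negated first. The
// formulas do not handle exceptional inputs, so the output is patched up at
// the end: if sel == 0 the output is in1, and if zero == 0 the output is
// in2 lifted to Jacobian form with z = p256one (1 in the Montgomery
// domain).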
TEXT ·p256PointAddAffineAsm(SB), $512-48
MOVQ res+0(FP), AX
MOVQ in1+8(FP), BX
MOVQ in2+16(FP), CX
MOVQ sign+24(FP), DX
MOVQ sel+32(FP), R15
MOVQ zero+40(FP), DI
MOVOU (BX), X0
MOVOU 16(BX), X1
MOVOU 32(BX), X2
MOVOU 48(BX), X3
MOVOU 64(BX), X4
MOVOU 80(BX), X5
MOVOU X0, (SP)
MOVOU X1, 16(SP)
MOVOU X2, 32(SP)
MOVOU X3, 48(SP)
MOVOU X4, 64(SP)
MOVOU X5, 80(SP)
MOVOU (CX), X0
MOVOU 16(CX), X1
MOVOU X0, 96(SP)
MOVOU X1, 112(SP)
// Store pointer to result
MOVQ AX, 480(SP)
MOVL R15, 488(SP)
MOVL DI, 492(SP)
// Negate y2in based on sign
MOVQ 32(CX), R10
MOVQ 40(CX), R11
MOVQ 48(CX), R12
MOVQ 56(CX), R13
MOVQ $-1, BX
MOVQ p256const0<>+0(SB), CX
MOVQ $0x00000000, R8
MOVQ p256const1<>+0(SB), R9
XORQ AX, AX
// Speculatively subtract
SUBQ R10, BX
SBBQ R11, CX
SBBQ R12, R8
SBBQ R13, R9
SBBQ $0x00, AX
MOVQ BX, R14
MOVQ CX, R15
MOVQ R8, DI
MOVQ R9, SI
// Add in case the operand was > p256
ADDQ $-1, BX
ADCQ p256const0<>+0(SB), CX
ADCQ $0x00, R8
ADCQ p256const1<>+0(SB), R9
ADCQ $0x00, AX
CMOVQNE R14, BX
CMOVQNE R15, CX
CMOVQNE DI, R8
CMOVQNE SI, R9
// If condition is 0, keep original value
TESTQ DX, DX
CMOVQEQ R10, BX
CMOVQEQ R11, CX
CMOVQEQ R12, R8
CMOVQEQ R13, R9
// Store result
MOVQ BX, 128(SP)
MOVQ CX, 136(SP)
MOVQ R8, 144(SP)
MOVQ R9, 152(SP)
// Begin point add
MOVQ 64(SP), R10
MOVQ 72(SP), R11
MOVQ 80(SP), R12
MOVQ 88(SP), R13
CALL p256SqrInternal(SB)
MOVQ R10, 288(SP)
MOVQ R11, 296(SP)
MOVQ R12, 304(SP)
MOVQ R13, 312(SP)
MOVQ 96(SP), R14
MOVQ 104(SP), R15
MOVQ 112(SP), DI
MOVQ 120(SP), SI
CALL p256MulInternal(SB)
MOVQ (SP), R14
MOVQ 8(SP), R15
MOVQ 16(SP), DI
MOVQ 24(SP), SI
CALL p256SubInternal(SB)
MOVQ R10, 320(SP)
MOVQ R11, 328(SP)
MOVQ R12, 336(SP)
MOVQ R13, 344(SP)
MOVQ 64(SP), R14
MOVQ 72(SP), R15
MOVQ 80(SP), DI
MOVQ 88(SP), SI
CALL p256MulInternal(SB)
MOVQ R10, 224(SP)
MOVQ R11, 232(SP)
MOVQ R12, 240(SP)
MOVQ R13, 248(SP)
MOVQ 288(SP), R10
MOVQ 296(SP), R11
MOVQ 304(SP), R12
MOVQ 312(SP), R13
CALL p256MulInternal(SB)
MOVQ 128(SP), R14
MOVQ 136(SP), R15
MOVQ 144(SP), DI
MOVQ 152(SP), SI
CALL p256MulInternal(SB)
MOVQ R10, 256(SP)
MOVQ R11, 264(SP)
MOVQ R12, 272(SP)
MOVQ R13, 280(SP)
MOVQ 32(SP), R14
MOVQ 40(SP), R15
MOVQ 48(SP), DI
MOVQ 56(SP), SI
CALL p256SubInternal(SB)
MOVQ R10, 352(SP)
MOVQ R11, 360(SP)
MOVQ R12, 368(SP)
MOVQ R13, 376(SP)
CALL p256SqrInternal(SB)
MOVQ R10, 416(SP)
MOVQ R11, 424(SP)
MOVQ R12, 432(SP)
MOVQ R13, 440(SP)
MOVQ 320(SP), R10
MOVQ 328(SP), R11
MOVQ 336(SP), R12
MOVQ 344(SP), R13
CALL p256SqrInternal(SB)
MOVQ R10, 384(SP)
MOVQ R11, 392(SP)
MOVQ R12, 400(SP)
MOVQ R13, 408(SP)
MOVQ 320(SP), R14
MOVQ 328(SP), R15
MOVQ 336(SP), DI
MOVQ 344(SP), SI
CALL p256MulInternal(SB)
MOVQ R10, 448(SP)
MOVQ R11, 456(SP)
MOVQ R12, 464(SP)
MOVQ R13, 472(SP)
MOVQ 32(SP), R14
MOVQ 40(SP), R15
MOVQ 48(SP), DI
MOVQ 56(SP), SI
CALL p256MulInternal(SB)
MOVQ R10, 256(SP)
MOVQ R11, 264(SP)
MOVQ R12, 272(SP)
MOVQ R13, 280(SP)
MOVQ (SP), R10
MOVQ 8(SP), R11
MOVQ 16(SP), R12
MOVQ 24(SP), R13
MOVQ 384(SP), R14
MOVQ 392(SP), R15
MOVQ 400(SP), DI
MOVQ 408(SP), SI
CALL p256MulInternal(SB)
MOVQ R10, 320(SP)
MOVQ R11, 328(SP)
MOVQ R12, 336(SP)
MOVQ R13, 344(SP)
XORQ AX, AX
ADDQ R10, R10
ADCQ R11, R11
ADCQ R12, R12
ADCQ R13, R13
ADCQ $+0, AX
MOVQ R10, R14
MOVQ R11, R15
MOVQ R12, DI
MOVQ R13, SI
SUBQ $-1, R14
SBBQ p256const0<>+0(SB), R15
SBBQ $+0, DI
SBBQ p256const1<>+0(SB), SI
SBBQ $+0, AX
CMOVQCS R10, R14
CMOVQCS R11, R15
CMOVQCS R12, DI
CMOVQCS R13, SI
MOVQ 416(SP), R10
MOVQ 424(SP), R11
MOVQ 432(SP), R12
MOVQ 440(SP), R13
CALL p256SubInternal(SB)
MOVQ 448(SP), R14
MOVQ 456(SP), R15
MOVQ 464(SP), DI
MOVQ 472(SP), SI
CALL p256SubInternal(SB)
MOVQ R10, 160(SP)
MOVQ R11, 168(SP)
MOVQ R12, 176(SP)
MOVQ R13, 184(SP)
MOVQ R10, R14
MOVQ R11, R15
MOVQ R12, DI
MOVQ R13, SI
MOVQ 320(SP), R10
MOVQ 328(SP), R11
MOVQ 336(SP), R12
MOVQ 344(SP), R13
CALL p256SubInternal(SB)
MOVQ 352(SP), R14
MOVQ 360(SP), R15
MOVQ 368(SP), DI
MOVQ 376(SP), SI
CALL p256MulInternal(SB)
MOVQ 256(SP), R14
MOVQ 264(SP), R15
MOVQ 272(SP), DI
MOVQ 280(SP), SI
CALL p256SubInternal(SB)
MOVQ R10, 192(SP)
MOVQ R11, 200(SP)
MOVQ R12, 208(SP)
MOVQ R13, 216(SP)
// Load stored values from stack
MOVQ 480(SP), AX
MOVL 488(SP), BX
MOVL 492(SP), CX
// The result is not valid if (sel == 0), conditional choose
MOVOU 160(SP), X0
MOVOU 176(SP), X1
MOVOU 192(SP), X2
MOVOU 208(SP), X3
MOVOU 224(SP), X4
MOVOU 240(SP), X5
MOVL BX, X6
MOVL CX, X7
PXOR X8, X8
PCMPEQL X9, X9
PSHUFD $0x00, X6, X6
PSHUFD $0x00, X7, X7
PCMPEQL X8, X6
PCMPEQL X8, X7
MOVOU X6, X15
PANDN X9, X15
MOVOU (SP), X9
MOVOU 16(SP), X10
MOVOU 32(SP), X11
MOVOU 48(SP), X12
MOVOU 64(SP), X13
MOVOU 80(SP), X14
PAND X15, X0
PAND X15, X1
PAND X15, X2
PAND X15, X3
PAND X15, X4
PAND X15, X5
PAND X6, X9
PAND X6, X10
PAND X6, X11
PAND X6, X12
PAND X6, X13
PAND X6, X14
PXOR X9, X0
PXOR X10, X1
PXOR X11, X2
PXOR X12, X3
PXOR X13, X4
PXOR X14, X5
// Similarly if zero == 0
PCMPEQL X9, X9
MOVOU X7, X15
PANDN X9, X15
MOVOU 96(SP), X9
MOVOU 112(SP), X10
MOVOU 128(SP), X11
MOVOU 144(SP), X12
MOVOU p256one<>+0(SB), X13
MOVOU p256one<>+16(SB), X14
PAND X15, X0
PAND X15, X1
PAND X15, X2
PAND X15, X3
PAND X15, X4
PAND X15, X5
PAND X7, X9
PAND X7, X10
PAND X7, X11
PAND X7, X12
PAND X7, X13
PAND X7, X14
PXOR X9, X0
PXOR X10, X1
PXOR X11, X2
PXOR X12, X3
PXOR X13, X4
PXOR X14, X5
// Finally output the result
MOVOU X0, (AX)
MOVOU X1, 16(AX)
MOVOU X2, 32(AX)
MOVOU X3, 48(AX)
MOVOU X4, 64(AX)
MOVOU X5, 80(AX)
MOVQ $0x00000000, 480(SP)
RET
DATA p256one<>+0(SB)/8, $0x0000000000000001
DATA p256one<>+8(SB)/8, $0xffffffff00000000
DATA p256one<>+16(SB)/8, $0xffffffffffffffff
DATA p256one<>+24(SB)/8, $0x00000000fffffffe
GLOBL p256one<>(SB), RODATA, $32
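
// p256one is the value 1 in the Montgomery domain, R mod p = 2^256 mod p,
// used as the z coordinate when lifting an affine point to Jacobian form.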
// func p256IsZero()
// Requires: CMOV
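//
// p256IsZero sets AX to 1 if the value in (R10, R11, R12, R13) is zero
// modulo p, that is, equal to 0 or to p itself, and to 0 otherwise.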
TEXT p256IsZero(SB), NOSPLIT, $0
// AX contains a flag that is set if the input is zero.
XORQ AX, AX
MOVQ $0x00000001, R15
// Check whether [R10..R13] are all zero.
MOVQ R10, R14
ORQ R11, R14
ORQ R12, R14
ORQ R13, R14
// If so, set AX to 1. (CMOV of a constant to a register isn't
// supported by the Go assembler, hence the 1 preloaded in R15.)
CMOVQEQ R15, AX
// XOR [R10..R13] with p and compare with zero again. (The third limb
// of p is 0, so R12 is left as is.)
XORQ $-1, R10
XORQ p256const0<>+0(SB), R11
XORQ p256const1<>+0(SB), R13
ORQ R11, R10
ORQ R12, R10
ORQ R13, R10
// Set the zero flag if so.
CMOVQEQ R15, AX
RET
// func p256PointAddAsm(res *P256Point, in1 *P256Point, in2 *P256Point) int
// Requires: CMOV, SSE2
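//
// p256PointAddAsm computes res = in1 + in2 for Jacobian points and returns
// 1 if the two inputs were equal, 0 otherwise (the two p256IsZero calls
// check whether both the x and y differences vanish). The addition formulas
// are invalid in that case, so callers fall back to p256PointDoubleAsm.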
TEXT ·p256PointAddAsm(SB), $680-32
// Move input to stack in order to free registers
MOVQ res+0(FP), AX
MOVQ in1+8(FP), BX
MOVQ in2+16(FP), CX
MOVOU (BX), X0
MOVOU 16(BX), X1
MOVOU 32(BX), X2
MOVOU 48(BX), X3
MOVOU 64(BX), X4
MOVOU 80(BX), X5
MOVOU X0, (SP)
MOVOU X1, 16(SP)
MOVOU X2, 32(SP)
MOVOU X3, 48(SP)
MOVOU X4, 64(SP)
MOVOU X5, 80(SP)
MOVOU (CX), X0
MOVOU 16(CX), X1
MOVOU 32(CX), X2
MOVOU 48(CX), X3
MOVOU 64(CX), X4
MOVOU 80(CX), X5
MOVOU X0, 96(SP)
MOVOU X1, 112(SP)
MOVOU X2, 128(SP)
MOVOU X3, 144(SP)
MOVOU X4, 160(SP)
MOVOU X5, 176(SP)
// Store pointer to result
MOVQ AX, 640(SP)
// Begin point add
MOVQ 160(SP), R10
MOVQ 168(SP), R11
MOVQ 176(SP), R12
MOVQ 184(SP), R13
CALL p256SqrInternal(SB)
MOVQ R10, 448(SP)
MOVQ R11, 456(SP)
MOVQ R12, 464(SP)
MOVQ R13, 472(SP)
MOVQ 160(SP), R14
MOVQ 168(SP), R15
MOVQ 176(SP), DI
MOVQ 184(SP), SI
CALL p256MulInternal(SB)
MOVQ 32(SP), R14
MOVQ 40(SP), R15
MOVQ 48(SP), DI
MOVQ 56(SP), SI
CALL p256MulInternal(SB)
MOVQ R10, 352(SP)
MOVQ R11, 360(SP)
MOVQ R12, 368(SP)
MOVQ R13, 376(SP)
MOVQ 64(SP), R10
MOVQ 72(SP), R11
MOVQ 80(SP), R12
MOVQ 88(SP), R13
CALL p256SqrInternal(SB)
MOVQ R10, 416(SP)
MOVQ R11, 424(SP)
MOVQ R12, 432(SP)
MOVQ R13, 440(SP)
MOVQ 64(SP), R14
MOVQ 72(SP), R15
MOVQ 80(SP), DI
MOVQ 88(SP), SI
CALL p256MulInternal(SB)
MOVQ 128(SP), R14
MOVQ 136(SP), R15
MOVQ 144(SP), DI
MOVQ 152(SP), SI
CALL p256MulInternal(SB)
MOVQ R10, 384(SP)
MOVQ R11, 392(SP)
MOVQ R12, 400(SP)
MOVQ R13, 408(SP)
MOVQ 352(SP), R14
MOVQ 360(SP), R15
MOVQ 368(SP), DI
MOVQ 376(SP), SI
CALL p256SubInternal(SB)
MOVQ R10, 512(SP)
MOVQ R11, 520(SP)
MOVQ R12, 528(SP)
MOVQ R13, 536(SP)
CALL p256IsZero(SB)
MOVQ AX, 648(SP)
MOVQ 448(SP), R10
MOVQ 456(SP), R11
MOVQ 464(SP), R12
MOVQ 472(SP), R13
MOVQ (SP), R14
MOVQ 8(SP), R15
MOVQ 16(SP), DI
MOVQ 24(SP), SI
CALL p256MulInternal(SB)
MOVQ R10, 288(SP)
MOVQ R11, 296(SP)
MOVQ R12, 304(SP)
MOVQ R13, 312(SP)
MOVQ 416(SP), R10
MOVQ 424(SP), R11
MOVQ 432(SP), R12
MOVQ 440(SP), R13
MOVQ 96(SP), R14
MOVQ 104(SP), R15
MOVQ 112(SP), DI
MOVQ 120(SP), SI
CALL p256MulInternal(SB)
MOVQ R10, 320(SP)
MOVQ R11, 328(SP)
MOVQ R12, 336(SP)
MOVQ R13, 344(SP)
MOVQ 288(SP), R14
MOVQ 296(SP), R15
MOVQ 304(SP), DI
MOVQ 312(SP), SI
CALL p256SubInternal(SB)
MOVQ R10, 480(SP)
MOVQ R11, 488(SP)
MOVQ R12, 496(SP)
MOVQ R13, 504(SP)
CALL p256IsZero(SB)
ANDQ 648(SP), AX
MOVQ AX, 648(SP)
MOVQ 512(SP), R10
MOVQ 520(SP), R11
MOVQ 528(SP), R12
MOVQ 536(SP), R13
CALL p256SqrInternal(SB)
MOVQ R10, 576(SP)
MOVQ R11, 584(SP)
MOVQ R12, 592(SP)
MOVQ R13, 600(SP)
MOVQ 480(SP), R10
MOVQ 488(SP), R11
MOVQ 496(SP), R12
MOVQ 504(SP), R13
CALL p256SqrInternal(SB)
MOVQ R10, 544(SP)
MOVQ R11, 552(SP)
MOVQ R12, 560(SP)
MOVQ R13, 568(SP)
MOVQ 480(SP), R14
MOVQ 488(SP), R15
MOVQ 496(SP), DI
MOVQ 504(SP), SI
CALL p256MulInternal(SB)
MOVQ R10, 608(SP)
MOVQ R11, 616(SP)
MOVQ R12, 624(SP)
MOVQ R13, 632(SP)
MOVQ 352(SP), R14
MOVQ 360(SP), R15
MOVQ 368(SP), DI
MOVQ 376(SP), SI
CALL p256MulInternal(SB)
MOVQ R10, 384(SP)
MOVQ R11, 392(SP)
MOVQ R12, 400(SP)
MOVQ R13, 408(SP)
MOVQ 64(SP), R10
MOVQ 72(SP), R11
MOVQ 80(SP), R12
MOVQ 88(SP), R13
MOVQ 160(SP), R14
MOVQ 168(SP), R15
MOVQ 176(SP), DI
MOVQ 184(SP), SI
CALL p256MulInternal(SB)
MOVQ 480(SP), R14
MOVQ 488(SP), R15
MOVQ 496(SP), DI
MOVQ 504(SP), SI
CALL p256MulInternal(SB)
MOVQ R10, 256(SP)
MOVQ R11, 264(SP)
MOVQ R12, 272(SP)
MOVQ R13, 280(SP)
MOVQ 544(SP), R10
MOVQ 552(SP), R11
MOVQ 560(SP), R12
MOVQ 568(SP), R13
MOVQ 288(SP), R14
MOVQ 296(SP), R15
MOVQ 304(SP), DI
MOVQ 312(SP), SI
CALL p256MulInternal(SB)
MOVQ R10, 320(SP)
MOVQ R11, 328(SP)
MOVQ R12, 336(SP)
MOVQ R13, 344(SP)
XORQ AX, AX
ADDQ R10, R10
ADCQ R11, R11
ADCQ R12, R12
ADCQ R13, R13
ADCQ $+0, AX
MOVQ R10, R14
MOVQ R11, R15
MOVQ R12, DI
MOVQ R13, SI
SUBQ $-1, R14
SBBQ p256const0<>+0(SB), R15
SBBQ $+0, DI
SBBQ p256const1<>+0(SB), SI
SBBQ $+0, AX
CMOVQCS R10, R14
CMOVQCS R11, R15
CMOVQCS R12, DI
CMOVQCS R13, SI
MOVQ 576(SP), R10
MOVQ 584(SP), R11
MOVQ 592(SP), R12
MOVQ 600(SP), R13
CALL p256SubInternal(SB)
MOVQ 608(SP), R14
MOVQ 616(SP), R15
MOVQ 624(SP), DI
MOVQ 632(SP), SI
CALL p256SubInternal(SB)
MOVQ R10, 192(SP)
MOVQ R11, 200(SP)
MOVQ R12, 208(SP)
MOVQ R13, 216(SP)
MOVQ R10, R14
MOVQ R11, R15
MOVQ R12, DI
MOVQ R13, SI
MOVQ 320(SP), R10
MOVQ 328(SP), R11
MOVQ 336(SP), R12
MOVQ 344(SP), R13
CALL p256SubInternal(SB)
MOVQ 512(SP), R14
MOVQ 520(SP), R15
MOVQ 528(SP), DI
MOVQ 536(SP), SI
CALL p256MulInternal(SB)
MOVQ 384(SP), R14
MOVQ 392(SP), R15
MOVQ 400(SP), DI
MOVQ 408(SP), SI
CALL p256SubInternal(SB)
MOVQ R10, 224(SP)
MOVQ R11, 232(SP)
MOVQ R12, 240(SP)
MOVQ R13, 248(SP)
MOVOU 192(SP), X0
MOVOU 208(SP), X1
MOVOU 224(SP), X2
MOVOU 240(SP), X3
MOVOU 256(SP), X4
MOVOU 272(SP), X5
// Finally output the result
MOVQ 640(SP), AX
MOVQ $0x00000000, 640(SP)
MOVOU X0, (AX)
MOVOU X1, 16(AX)
MOVOU X2, 32(AX)
MOVOU X3, 48(AX)
MOVOU X4, 64(AX)
MOVOU X5, 80(AX)
MOVQ 648(SP), AX
MOVQ AX, ret+24(FP)
RET
// func p256PointDoubleAsm(res *P256Point, in *P256Point)
// Requires: CMOV, SSE2
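//
// p256PointDoubleAsm computes res = 2 * in for a Jacobian point. It uses
// the a = -3 shortcut, computing the slope numerator as
// m = 3 * (x - z^2) * (x + z^2), then x3 = m^2 - 2*s and
// y3 = m * (s - x3) - 8*y^4 with s = 4*x*y^2, and z3 = 2*y*z.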
TEXT ·p256PointDoubleAsm(SB), NOSPLIT, $256-16
MOVQ res+0(FP), AX
MOVQ in+8(FP), BX
MOVOU (BX), X0
MOVOU 16(BX), X1
MOVOU 32(BX), X2
MOVOU 48(BX), X3
MOVOU 64(BX), X4
MOVOU 80(BX), X5
MOVOU X0, (SP)
MOVOU X1, 16(SP)
MOVOU X2, 32(SP)
MOVOU X3, 48(SP)
MOVOU X4, 64(SP)
MOVOU X5, 80(SP)
// Store pointer to result
MOVQ AX, 224(SP)
// Begin point double
MOVQ 64(SP), R10
MOVQ 72(SP), R11
MOVQ 80(SP), R12
MOVQ 88(SP), R13
CALL p256SqrInternal(SB)
MOVQ R10, 160(SP)
MOVQ R11, 168(SP)
MOVQ R12, 176(SP)
MOVQ R13, 184(SP)
MOVQ (SP), R14
MOVQ 8(SP), R15
MOVQ 16(SP), DI
MOVQ 24(SP), SI
XORQ AX, AX
ADDQ R14, R10
ADCQ R15, R11
ADCQ DI, R12
ADCQ SI, R13
ADCQ $+0, AX
MOVQ R10, R14
MOVQ R11, R15
MOVQ R12, DI
MOVQ R13, SI
SUBQ $-1, R14
SBBQ p256const0<>+0(SB), R15
SBBQ $+0, DI
SBBQ p256const1<>+0(SB), SI
SBBQ $+0, AX
CMOVQCS R10, R14
CMOVQCS R11, R15
CMOVQCS R12, DI
CMOVQCS R13, SI
MOVQ R14, 128(SP)
MOVQ R15, 136(SP)
MOVQ DI, 144(SP)
MOVQ SI, 152(SP)
MOVQ 64(SP), R10
MOVQ 72(SP), R11
MOVQ 80(SP), R12
MOVQ 88(SP), R13
MOVQ 32(SP), R14
MOVQ 40(SP), R15
MOVQ 48(SP), DI
MOVQ 56(SP), SI
CALL p256MulInternal(SB)
XORQ AX, AX
ADDQ R10, R10
ADCQ R11, R11
ADCQ R12, R12
ADCQ R13, R13
ADCQ $+0, AX
MOVQ R10, R14
MOVQ R11, R15
MOVQ R12, DI
MOVQ R13, SI
SUBQ $-1, R14
SBBQ p256const0<>+0(SB), R15
SBBQ $+0, DI
SBBQ p256const1<>+0(SB), SI
SBBQ $+0, AX
CMOVQCS R10, R14
CMOVQCS R11, R15
CMOVQCS R12, DI
CMOVQCS R13, SI
MOVQ 224(SP), AX
// Store z
MOVQ R14, 64(AX)
MOVQ R15, 72(AX)
MOVQ DI, 80(AX)
MOVQ SI, 88(AX)
MOVQ (SP), R10
MOVQ 8(SP), R11
MOVQ 16(SP), R12
MOVQ 24(SP), R13
MOVQ 160(SP), R14
MOVQ 168(SP), R15
MOVQ 176(SP), DI
MOVQ 184(SP), SI
CALL p256SubInternal(SB)
MOVQ 128(SP), R14
MOVQ 136(SP), R15
MOVQ 144(SP), DI
MOVQ 152(SP), SI
CALL p256MulInternal(SB)
MOVQ R10, 128(SP)
MOVQ R11, 136(SP)
MOVQ R12, 144(SP)
MOVQ R13, 152(SP)
// Multiply by 3
XORQ AX, AX
ADDQ R10, R10
ADCQ R11, R11
ADCQ R12, R12
ADCQ R13, R13
ADCQ $+0, AX
MOVQ R10, R14
MOVQ R11, R15
MOVQ R12, DI
MOVQ R13, SI
SUBQ $-1, R14
SBBQ p256const0<>+0(SB), R15
SBBQ $+0, DI
SBBQ p256const1<>+0(SB), SI
SBBQ $+0, AX
CMOVQCS R10, R14
CMOVQCS R11, R15
CMOVQCS R12, DI
CMOVQCS R13, SI
MOVQ 128(SP), R10
MOVQ 136(SP), R11
MOVQ 144(SP), R12
MOVQ 152(SP), R13
XORQ AX, AX
ADDQ R14, R10
ADCQ R15, R11
ADCQ DI, R12
ADCQ SI, R13
ADCQ $+0, AX
MOVQ R10, R14
MOVQ R11, R15
MOVQ R12, DI
MOVQ R13, SI
SUBQ $-1, R14
SBBQ p256const0<>+0(SB), R15
SBBQ $+0, DI
SBBQ p256const1<>+0(SB), SI
SBBQ $+0, AX
CMOVQCS R10, R14
CMOVQCS R11, R15
CMOVQCS R12, DI
CMOVQCS R13, SI
MOVQ R14, 128(SP)
MOVQ R15, 136(SP)
MOVQ DI, 144(SP)
MOVQ SI, 152(SP)
// Compute 4*y^2 and 8*y^4 = ((2*y)^2)^2 / 2
MOVQ 32(SP), R10
MOVQ 40(SP), R11
MOVQ 48(SP), R12
MOVQ 56(SP), R13
XORQ AX, AX
ADDQ R10, R10
ADCQ R11, R11
ADCQ R12, R12
ADCQ R13, R13
ADCQ $+0, AX
MOVQ R10, R14
MOVQ R11, R15
MOVQ R12, DI
MOVQ R13, SI
SUBQ $-1, R14
SBBQ p256const0<>+0(SB), R15
SBBQ $+0, DI
SBBQ p256const1<>+0(SB), SI
SBBQ $+0, AX
CMOVQCS R10, R14
CMOVQCS R11, R15
CMOVQCS R12, DI
CMOVQCS R13, SI
MOVQ R14, R10
MOVQ R15, R11
MOVQ DI, R12
MOVQ SI, R13
CALL p256SqrInternal(SB)
MOVQ R10, 96(SP)
MOVQ R11, 104(SP)
MOVQ R12, 112(SP)
MOVQ R13, 120(SP)
CALL p256SqrInternal(SB)
// Divide by 2
XORQ AX, AX
MOVQ R10, R14
MOVQ R11, R15
MOVQ R12, DI
MOVQ R13, SI
ADDQ $-1, R10
ADCQ p256const0<>+0(SB), R11
ADCQ $0x00, R12
ADCQ p256const1<>+0(SB), R13
ADCQ $0x00, AX
TESTQ $0x00000001, R14
CMOVQEQ R14, R10
CMOVQEQ R15, R11
CMOVQEQ DI, R12
CMOVQEQ SI, R13
ANDQ R14, AX
SHRQ $0x01, R11, R10
SHRQ $0x01, R12, R11
SHRQ $0x01, R13, R12
SHRQ $0x01, AX, R13
MOVQ R10, 32(SP)
MOVQ R11, 40(SP)
MOVQ R12, 48(SP)
MOVQ R13, 56(SP)
// Compute s = 4*x*y^2, then x3 = m^2 - 2*s
MOVQ (SP), R10
MOVQ 8(SP), R11
MOVQ 16(SP), R12
MOVQ 24(SP), R13
MOVQ 96(SP), R14
MOVQ 104(SP), R15
MOVQ 112(SP), DI
MOVQ 120(SP), SI
CALL p256MulInternal(SB)
MOVQ R10, 96(SP)
MOVQ R11, 104(SP)
MOVQ R12, 112(SP)
MOVQ R13, 120(SP)
XORQ AX, AX
ADDQ R10, R10
ADCQ R11, R11
ADCQ R12, R12
ADCQ R13, R13
ADCQ $+0, AX
MOVQ R10, R14
MOVQ R11, R15
MOVQ R12, DI
MOVQ R13, SI
SUBQ $-1, R14
SBBQ p256const0<>+0(SB), R15
SBBQ $+0, DI
SBBQ p256const1<>+0(SB), SI
SBBQ $+0, AX
CMOVQCS R10, R14
CMOVQCS R11, R15
CMOVQCS R12, DI
CMOVQCS R13, SI
MOVQ R14, 192(SP)
MOVQ R15, 200(SP)
MOVQ DI, 208(SP)
MOVQ SI, 216(SP)
MOVQ 128(SP), R10
MOVQ 136(SP), R11
MOVQ 144(SP), R12
MOVQ 152(SP), R13
CALL p256SqrInternal(SB)
MOVQ 192(SP), R14
MOVQ 200(SP), R15
MOVQ 208(SP), DI
MOVQ 216(SP), SI
CALL p256SubInternal(SB)
MOVQ 224(SP), AX
// Store x
MOVQ R10, (AX)
MOVQ R11, 8(AX)
MOVQ R12, 16(AX)
MOVQ R13, 24(AX)
MOVQ R10, R14
MOVQ R11, R15
MOVQ R12, DI
MOVQ R13, SI
MOVQ 96(SP), R10
MOVQ 104(SP), R11
MOVQ 112(SP), R12
MOVQ 120(SP), R13
CALL p256SubInternal(SB)
MOVQ 128(SP), R14
MOVQ 136(SP), R15
MOVQ 144(SP), DI
MOVQ 152(SP), SI
CALL p256MulInternal(SB)
MOVQ 32(SP), R14
MOVQ 40(SP), R15
MOVQ 48(SP), DI
MOVQ 56(SP), SI
CALL p256SubInternal(SB)
MOVQ 224(SP), AX
// Store y
MOVQ R10, 32(AX)
MOVQ R11, 40(AX)
MOVQ R12, 48(AX)
MOVQ R13, 56(AX)
// Clear the stored result pointer from the stack
MOVQ $0x00000000, 224(SP)
RET