| // Code generated by command: go run fe_amd64_asm.go -out ../fe_amd64.s -stubs ../fe_amd64.go -pkg field. DO NOT EDIT. |
| |
| //go:build amd64 && gc && !purego |
| // +build amd64,gc,!purego |
| |
| #include "textflag.h" |
| |
| // func feMul(out *Element, a *Element, b *Element) — stores in out the product of a and b. Each Element is read/written as five 64-bit limbs at byte offsets 0, 8, 16, 24, 32; limb i has weight 2^(51·i) (see the 51-bit mask and shifts in the reduction below), and anything past limb 4 is folded back into limb 0 times 19, i.e. 2^255 is treated as 19 (arithmetic modulo 2^255 − 19). |
TEXT ·feMul(SB), NOSPLIT, $0-24 |
| MOVQ a+8(FP), CX |
| MOVQ b+16(FP), BX |
| |
| // r0 = a0×b0. MULQ multiplies AX by its operand and leaves the 128-bit product in DX:AX; r0 is accumulated in SI:DI (high:low). |
| MOVQ (CX), AX |
| MULQ (BX) |
| MOVQ AX, DI |
| MOVQ DX, SI |
| |
| // r0 += 19×a1×b4. Limb indices summing to 5 or more wrap around and pick up the factor 19. a1 is pre-multiplied by 19 with a 64-bit IMUL3Q (assumes 19×a1 fits in 64 bits — TODO confirm the limb bound guaranteed by callers); ADDQ/ADCQ then add the 128-bit product into SI:DI. |
| MOVQ 8(CX), AX |
| IMUL3Q $0x13, AX, AX |
| MULQ 32(BX) |
| ADDQ AX, DI |
| ADCQ DX, SI |
| |
| // r0 += 19×a2×b3 (same wrap-around pattern) |
| MOVQ 16(CX), AX |
| IMUL3Q $0x13, AX, AX |
| MULQ 24(BX) |
| ADDQ AX, DI |
| ADCQ DX, SI |
| |
| // r0 += 19×a3×b2 |
| MOVQ 24(CX), AX |
| IMUL3Q $0x13, AX, AX |
| MULQ 16(BX) |
| ADDQ AX, DI |
| ADCQ DX, SI |
| |
| // r0 += 19×a4×b1 |
| MOVQ 32(CX), AX |
| IMUL3Q $0x13, AX, AX |
| MULQ 8(BX) |
| ADDQ AX, DI |
| ADCQ DX, SI |
| |
| // r1 = a0×b1, accumulated in R8:R9 (high:low) |
| MOVQ (CX), AX |
| MULQ 8(BX) |
| MOVQ AX, R9 |
| MOVQ DX, R8 |
| |
| // r1 += a1×b0 |
| MOVQ 8(CX), AX |
| MULQ (BX) |
| ADDQ AX, R9 |
| ADCQ DX, R8 |
| |
| // r1 += 19×a2×b4 (indices sum to 6, so the term wraps into r1 with factor 19) |
| MOVQ 16(CX), AX |
| IMUL3Q $0x13, AX, AX |
| MULQ 32(BX) |
| ADDQ AX, R9 |
| ADCQ DX, R8 |
| |
| // r1 += 19×a3×b3 |
| MOVQ 24(CX), AX |
| IMUL3Q $0x13, AX, AX |
| MULQ 24(BX) |
| ADDQ AX, R9 |
| ADCQ DX, R8 |
| |
| // r1 += 19×a4×b2 |
| MOVQ 32(CX), AX |
| IMUL3Q $0x13, AX, AX |
| MULQ 16(BX) |
| ADDQ AX, R9 |
| ADCQ DX, R8 |
| |
| // r2 = a0×b2, accumulated in R10:R11 (high:low) |
| MOVQ (CX), AX |
| MULQ 16(BX) |
| MOVQ AX, R11 |
| MOVQ DX, R10 |
| |
| // r2 += a1×b1 |
| MOVQ 8(CX), AX |
| MULQ 8(BX) |
| ADDQ AX, R11 |
| ADCQ DX, R10 |
| |
| // r2 += a2×b0 |
| MOVQ 16(CX), AX |
| MULQ (BX) |
| ADDQ AX, R11 |
| ADCQ DX, R10 |
| |
| // r2 += 19×a3×b4 |
| MOVQ 24(CX), AX |
| IMUL3Q $0x13, AX, AX |
| MULQ 32(BX) |
| ADDQ AX, R11 |
| ADCQ DX, R10 |
| |
| // r2 += 19×a4×b3 |
| MOVQ 32(CX), AX |
| IMUL3Q $0x13, AX, AX |
| MULQ 24(BX) |
| ADDQ AX, R11 |
| ADCQ DX, R10 |
| |
| // r3 = a0×b3, accumulated in R12:R13 (high:low) |
| MOVQ (CX), AX |
| MULQ 24(BX) |
| MOVQ AX, R13 |
| MOVQ DX, R12 |
| |
| // r3 += a1×b2 |
| MOVQ 8(CX), AX |
| MULQ 16(BX) |
| ADDQ AX, R13 |
| ADCQ DX, R12 |
| |
| // r3 += a2×b1 |
| MOVQ 16(CX), AX |
| MULQ 8(BX) |
| ADDQ AX, R13 |
| ADCQ DX, R12 |
| |
| // r3 += a3×b0 |
| MOVQ 24(CX), AX |
| MULQ (BX) |
| ADDQ AX, R13 |
| ADCQ DX, R12 |
| |
| // r3 += 19×a4×b4 |
| MOVQ 32(CX), AX |
| IMUL3Q $0x13, AX, AX |
| MULQ 32(BX) |
| ADDQ AX, R13 |
| ADCQ DX, R12 |
| |
| // r4 = a0×b4, accumulated in R14:R15 (high:low); no term of r4 wraps, so none carries the factor 19 |
| MOVQ (CX), AX |
| MULQ 32(BX) |
| MOVQ AX, R15 |
| MOVQ DX, R14 |
| |
| // r4 += a1×b3 |
| MOVQ 8(CX), AX |
| MULQ 24(BX) |
| ADDQ AX, R15 |
| ADCQ DX, R14 |
| |
| // r4 += a2×b2 |
| MOVQ 16(CX), AX |
| MULQ 16(BX) |
| ADDQ AX, R15 |
| ADCQ DX, R14 |
| |
| // r4 += a3×b1 |
| MOVQ 24(CX), AX |
| MULQ 8(BX) |
| ADDQ AX, R15 |
| ADCQ DX, R14 |
| |
| // r4 += a4×b0 |
| MOVQ 32(CX), AX |
| MULQ (BX) |
| ADDQ AX, R15 |
| ADCQ DX, R14 |
| |
| // First reduction chain. AX = 2^51 − 1 is the limb mask. Each three-operand SHLQ $13 shifts the high word left by 13 and fills it from the top 13 bits of the low word, so the high register ends up holding bits 51 and up of its 128-bit accumulator (assumes every accumulator is below 2^115 so nothing is shifted out — TODO confirm). Each low word is then masked to 51 bits and the carry of limb i is added to limb i+1; the carry out of r4 is multiplied by 19 and added to r0. |
| MOVQ $0x0007ffffffffffff, AX |
| SHLQ $0x0d, DI, SI |
| SHLQ $0x0d, R9, R8 |
| SHLQ $0x0d, R11, R10 |
| SHLQ $0x0d, R13, R12 |
| SHLQ $0x0d, R15, R14 |
| ANDQ AX, DI |
| IMUL3Q $0x13, R14, R14 |
| ADDQ R14, DI |
| ANDQ AX, R9 |
| ADDQ SI, R9 |
| ANDQ AX, R11 |
| ADDQ R8, R11 |
| ANDQ AX, R13 |
| ADDQ R10, R13 |
| ANDQ AX, R15 |
| ADDQ R12, R15 |
| |
| // Second reduction chain (carryPropagate). The sums left by the first chain can exceed 51 bits, so each limb is split once more: bits 51 and up (SHRQ $51) become the carry into the next limb, and the carry out of r4 is again folded into r0 times 19. Carries added in this pass are not propagated any further. |
| MOVQ DI, SI |
| SHRQ $0x33, SI |
| MOVQ R9, R8 |
| SHRQ $0x33, R8 |
| MOVQ R11, R10 |
| SHRQ $0x33, R10 |
| MOVQ R13, R12 |
| SHRQ $0x33, R12 |
| MOVQ R15, R14 |
| SHRQ $0x33, R14 |
| ANDQ AX, DI |
| IMUL3Q $0x13, R14, R14 |
| ADDQ R14, DI |
| ANDQ AX, R9 |
| ADDQ SI, R9 |
| ANDQ AX, R11 |
| ADDQ R8, R11 |
| ANDQ AX, R13 |
| ADDQ R10, R13 |
| ANDQ AX, R15 |
| ADDQ R12, R15 |
| |
| // Store output: limbs r0..r4 (DI, R9, R11, R13, R15) are written to out at byte offsets 0, 8, 16, 24, 32. |
| MOVQ out+0(FP), AX |
| MOVQ DI, (AX) |
| MOVQ R9, 8(AX) |
| MOVQ R11, 16(AX) |
| MOVQ R13, 24(AX) |
| MOVQ R15, 32(AX) |
| RET |
| |
| // func feSquare(out *Element, a *Element) — stores in out the square of a. The Element is read/written as five 64-bit limbs l0..l4 at byte offsets 0, 8, 16, 24, 32; limb i has weight 2^(51·i) (see the 51-bit mask and shifts in the reduction below), and anything past limb 4 is folded back into limb 0 times 19, i.e. 2^255 is treated as 19 (arithmetic modulo 2^255 − 19). Each cross product li×lj with i≠j is computed once and doubled. |
TEXT ·feSquare(SB), NOSPLIT, $0-16 |
| MOVQ a+8(FP), CX |
| |
| // r0 = l0×l0. MULQ multiplies AX by its operand and leaves the 128-bit product in DX:AX; r0 is accumulated in BX:SI (high:low). |
| MOVQ (CX), AX |
| MULQ (CX) |
| MOVQ AX, SI |
| MOVQ DX, BX |
| |
| // r0 += 38×l1×l4. 38 = 2×19: the cross term occurs twice and its indices sum to 5, so it wraps with factor 19. l1 is pre-multiplied with a 64-bit IMUL3Q (assumes 38×l1 fits in 64 bits — TODO confirm the limb bound guaranteed by callers); ADDQ/ADCQ then add the 128-bit product into BX:SI. |
| MOVQ 8(CX), AX |
| IMUL3Q $0x26, AX, AX |
| MULQ 32(CX) |
| ADDQ AX, SI |
| ADCQ DX, BX |
| |
| // r0 += 38×l2×l3 |
| MOVQ 16(CX), AX |
| IMUL3Q $0x26, AX, AX |
| MULQ 24(CX) |
| ADDQ AX, SI |
| ADCQ DX, BX |
| |
| // r1 = 2×l0×l1, accumulated in DI:R8 (high:low). The doubling is a 1-bit left shift of l0 (assumes l0 < 2^63 — TODO confirm). |
| MOVQ (CX), AX |
| SHLQ $0x01, AX |
| MULQ 8(CX) |
| MOVQ AX, R8 |
| MOVQ DX, DI |
| |
| // r1 += 38×l2×l4 |
| MOVQ 16(CX), AX |
| IMUL3Q $0x26, AX, AX |
| MULQ 32(CX) |
| ADDQ AX, R8 |
| ADCQ DX, DI |
| |
| // r1 += 19×l3×l3 (a square term occurs only once, hence 19 rather than 38) |
| MOVQ 24(CX), AX |
| IMUL3Q $0x13, AX, AX |
| MULQ 24(CX) |
| ADDQ AX, R8 |
| ADCQ DX, DI |
| |
| // r2 = 2×l0×l2, accumulated in R9:R10 (high:low) |
| MOVQ (CX), AX |
| SHLQ $0x01, AX |
| MULQ 16(CX) |
| MOVQ AX, R10 |
| MOVQ DX, R9 |
| |
| // r2 += l1×l1 |
| MOVQ 8(CX), AX |
| MULQ 8(CX) |
| ADDQ AX, R10 |
| ADCQ DX, R9 |
| |
| // r2 += 38×l3×l4 |
| MOVQ 24(CX), AX |
| IMUL3Q $0x26, AX, AX |
| MULQ 32(CX) |
| ADDQ AX, R10 |
| ADCQ DX, R9 |
| |
| // r3 = 2×l0×l3, accumulated in R11:R12 (high:low) |
| MOVQ (CX), AX |
| SHLQ $0x01, AX |
| MULQ 24(CX) |
| MOVQ AX, R12 |
| MOVQ DX, R11 |
| |
| // r3 += 2×l1×l2 (here the doubling is an IMUL3Q by 2 rather than a shift) |
| MOVQ 8(CX), AX |
| IMUL3Q $0x02, AX, AX |
| MULQ 16(CX) |
| ADDQ AX, R12 |
| ADCQ DX, R11 |
| |
| // r3 += 19×l4×l4 |
| MOVQ 32(CX), AX |
| IMUL3Q $0x13, AX, AX |
| MULQ 32(CX) |
| ADDQ AX, R12 |
| ADCQ DX, R11 |
| |
| // r4 = 2×l0×l4, accumulated in R13:R14 (high:low); no term of r4 wraps, so none carries the factor 19 |
| MOVQ (CX), AX |
| SHLQ $0x01, AX |
| MULQ 32(CX) |
| MOVQ AX, R14 |
| MOVQ DX, R13 |
| |
| // r4 += 2×l1×l3 |
| MOVQ 8(CX), AX |
| IMUL3Q $0x02, AX, AX |
| MULQ 24(CX) |
| ADDQ AX, R14 |
| ADCQ DX, R13 |
| |
| // r4 += l2×l2 |
| MOVQ 16(CX), AX |
| MULQ 16(CX) |
| ADDQ AX, R14 |
| ADCQ DX, R13 |
| |
| // First reduction chain. AX = 2^51 − 1 is the limb mask. Each three-operand SHLQ $13 shifts the high word left by 13 and fills it from the top 13 bits of the low word, so the high register ends up holding bits 51 and up of its 128-bit accumulator (assumes every accumulator is below 2^115 so nothing is shifted out — TODO confirm). Each low word is then masked to 51 bits and the carry of limb i is added to limb i+1; the carry out of r4 is multiplied by 19 and added to r0. |
| MOVQ $0x0007ffffffffffff, AX |
| SHLQ $0x0d, SI, BX |
| SHLQ $0x0d, R8, DI |
| SHLQ $0x0d, R10, R9 |
| SHLQ $0x0d, R12, R11 |
| SHLQ $0x0d, R14, R13 |
| ANDQ AX, SI |
| IMUL3Q $0x13, R13, R13 |
| ADDQ R13, SI |
| ANDQ AX, R8 |
| ADDQ BX, R8 |
| ANDQ AX, R10 |
| ADDQ DI, R10 |
| ANDQ AX, R12 |
| ADDQ R9, R12 |
| ANDQ AX, R14 |
| ADDQ R11, R14 |
| |
| // Second reduction chain (carryPropagate). The sums left by the first chain can exceed 51 bits, so each limb is split once more: bits 51 and up (SHRQ $51) become the carry into the next limb, and the carry out of r4 is again folded into r0 times 19. Carries added in this pass are not propagated any further. |
| MOVQ SI, BX |
| SHRQ $0x33, BX |
| MOVQ R8, DI |
| SHRQ $0x33, DI |
| MOVQ R10, R9 |
| SHRQ $0x33, R9 |
| MOVQ R12, R11 |
| SHRQ $0x33, R11 |
| MOVQ R14, R13 |
| SHRQ $0x33, R13 |
| ANDQ AX, SI |
| IMUL3Q $0x13, R13, R13 |
| ADDQ R13, SI |
| ANDQ AX, R8 |
| ADDQ BX, R8 |
| ANDQ AX, R10 |
| ADDQ DI, R10 |
| ANDQ AX, R12 |
| ADDQ R9, R12 |
| ANDQ AX, R14 |
| ADDQ R11, R14 |
| |
| // Store output: limbs r0..r4 (SI, R8, R10, R12, R14) are written to out at byte offsets 0, 8, 16, 24, 32. |
| MOVQ out+0(FP), AX |
| MOVQ SI, (AX) |
| MOVQ R8, 8(AX) |
| MOVQ R10, 16(AX) |
| MOVQ R12, 24(AX) |
| MOVQ R14, 32(AX) |
| RET |