| // Copyright 2025 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| // Code generated by 'go generate' (with ./internal/asmgen). DO NOT EDIT. |
| |
| //go:build !math_big_pure_go |
| |
| #include "textflag.h" |
| |
| // func addVV(z, x, y []Word) (c Word): sets z[i] = x[i] + y[i] + carry-in for each of the len(z) words, propagating the carry from word to word, and returns the final carry-out in c as 0 or 1. Only len(z) is read here; presumably the caller guarantees x and y hold at least len(z) words (confirm against callers). |
| TEXT ·addVV(SB), NOSPLIT, $0 |
| MOVQ z_len+8(FP), BX |
| MOVQ x_base+24(FP), SI |
| MOVQ y_base+48(FP), DI |
| MOVQ z_base+0(FP), R8 |
| // compute unrolled loop lengths: R9 = len(z) & 3 iterations of the 1X loop, BX = len(z) >> 2 iterations of the 4X loop |
| MOVQ BX, R9 |
| ANDQ $3, R9 |
| SHRQ $2, BX |
| MOVQ $0, R10 // clear saved carry (between iterations R10 holds the carry as 0 or -1) |
| loop1: |
| TESTQ R9, R9; JZ loop1done |
| loop1cont: |
| // unroll 1X: one word per iteration; this loop runs first and consumes the len(z) & 3 leftover words |
| ADDQ R10, R10 // restore carry (-1 + -1 sets CF, 0 + 0 clears it); R10 is then free to be reused for data |
| MOVQ 0(SI), R10 |
| ADCQ 0(DI), R10 |
| MOVQ R10, 0(R8) |
| SBBQ R10, R10 // save carry as R10 = -CF, because the SUBQ on the loop counter below overwrites the flags |
| LEAQ 8(SI), SI // ADD $8, SI |
| LEAQ 8(DI), DI // ADD $8, DI |
| LEAQ 8(R8), R8 // ADD $8, R8 |
| SUBQ $1, R9; JNZ loop1cont |
| loop1done: |
| loop4: |
| TESTQ BX, BX; JZ loop4done |
| loop4cont: |
| // unroll 4X: four words per iteration; the carry is chained through the four consecutive ADCQs, and R10 doubles as a data register until the carry is saved back into it |
| ADDQ R10, R10 // restore carry |
| MOVQ 0(SI), R9 |
| MOVQ 8(SI), R10 |
| MOVQ 16(SI), R11 |
| MOVQ 24(SI), R12 |
| ADCQ 0(DI), R9 |
| ADCQ 8(DI), R10 |
| ADCQ 16(DI), R11 |
| ADCQ 24(DI), R12 |
| MOVQ R9, 0(R8) |
| MOVQ R10, 8(R8) |
| MOVQ R11, 16(R8) |
| MOVQ R12, 24(R8) |
| SBBQ R10, R10 // save carry (the MOVQ stores above leave the flags from the last ADCQ intact) |
| LEAQ 32(SI), SI // ADD $32, SI |
| LEAQ 32(DI), DI // ADD $32, DI |
| LEAQ 32(R8), R8 // ADD $32, R8 |
| SUBQ $1, BX; JNZ loop4cont |
| loop4done: |
| NEGQ R10 // convert add carry: the saved 0 or -1 becomes the returned 0 or 1 |
| MOVQ R10, c+72(FP) |
| RET |
| |
| // func subVV(z, x, y []Word) (c Word): sets z[i] = x[i] - y[i] - borrow-in for each of the len(z) words, propagating the borrow from word to word, and returns the final borrow in c as 0 or 1. Only len(z) is read here; presumably the caller guarantees x and y hold at least len(z) words (confirm against callers). |
| TEXT ·subVV(SB), NOSPLIT, $0 |
| MOVQ z_len+8(FP), BX |
| MOVQ x_base+24(FP), SI |
| MOVQ y_base+48(FP), DI |
| MOVQ z_base+0(FP), R8 |
| // compute unrolled loop lengths: R9 = len(z) & 3 iterations of the 1X loop, BX = len(z) >> 2 iterations of the 4X loop |
| MOVQ BX, R9 |
| ANDQ $3, R9 |
| SHRQ $2, BX |
| MOVQ $0, R10 // clear saved carry (between iterations R10 holds the borrow as 0 or -1) |
| loop1: |
| TESTQ R9, R9; JZ loop1done |
| loop1cont: |
| // unroll 1X: one word per iteration; this loop runs first and consumes the len(z) & 3 leftover words |
| ADDQ R10, R10 // restore carry (-1 + -1 sets CF, 0 + 0 clears it); CF then acts as the borrow-in of the SBBQ below |
| MOVQ 0(SI), R10 |
| SBBQ 0(DI), R10 |
| MOVQ R10, 0(R8) |
| SBBQ R10, R10 // save carry as R10 = -CF, because the SUBQ on the loop counter below overwrites the flags |
| LEAQ 8(SI), SI // ADD $8, SI |
| LEAQ 8(DI), DI // ADD $8, DI |
| LEAQ 8(R8), R8 // ADD $8, R8 |
| SUBQ $1, R9; JNZ loop1cont |
| loop1done: |
| loop4: |
| TESTQ BX, BX; JZ loop4done |
| loop4cont: |
| // unroll 4X: four words per iteration; the borrow is chained through the four consecutive SBBQs, and R10 doubles as a data register until the borrow is saved back into it |
| ADDQ R10, R10 // restore carry |
| MOVQ 0(SI), R9 |
| MOVQ 8(SI), R10 |
| MOVQ 16(SI), R11 |
| MOVQ 24(SI), R12 |
| SBBQ 0(DI), R9 |
| SBBQ 8(DI), R10 |
| SBBQ 16(DI), R11 |
| SBBQ 24(DI), R12 |
| MOVQ R9, 0(R8) |
| MOVQ R10, 8(R8) |
| MOVQ R11, 16(R8) |
| MOVQ R12, 24(R8) |
| SBBQ R10, R10 // save carry (the MOVQ stores above leave the flags from the last data SBBQ intact) |
| LEAQ 32(SI), SI // ADD $32, SI |
| LEAQ 32(DI), DI // ADD $32, DI |
| LEAQ 32(R8), R8 // ADD $32, R8 |
| SUBQ $1, BX; JNZ loop4cont |
| loop4done: |
| NEGQ R10 // convert sub carry: the saved 0 or -1 becomes the returned 0 or 1 |
| MOVQ R10, c+72(FP) |
| RET |
| |
| // func lshVU(z, x []Word, s uint) (c Word): shifts the len(z)-word vector x left by s bits into z, working from the highest word down to the lowest, and returns in the low bits of c the s bits shifted out of the top word. When len(z) is 0 it jumps to ret0, which returns c = 0 without touching z or x. NOTE(review): the count is taken from CX by the hardware shift instructions, so s is presumably expected to be below 64; confirm against callers. |
| TEXT ·lshVU(SB), NOSPLIT, $0 |
| MOVQ z_len+8(FP), BX |
| TESTQ BX, BX; JZ ret0 |
| MOVQ s+48(FP), CX |
| MOVQ x_base+24(FP), SI |
| MOVQ z_base+0(FP), DI |
| // run loop backward: SI and DI are advanced to just past the last of the len(z) words of x and z, so negative offsets address the current words |
| LEAQ (SI)(BX*8), SI |
| LEAQ (DI)(BX*8), DI |
| // shift first word into carry: R8 = top word of x; the double-width shift moves its top s bits into the zeroed R9, which is stored as c |
| MOVQ -8(SI), R8 |
| MOVQ $0, R9 |
| SHLQ CX, R8, R9 |
| MOVQ R9, c+56(FP) |
| // shift remaining words: len(z)-1 word pairs are left; R8 always holds the most recently loaded source word |
| SUBQ $1, BX |
| // compute unrolled loop lengths: R9 = (len(z)-1) & 3 iterations of the 1X loop, BX = (len(z)-1) >> 2 iterations of the 4X loop |
| MOVQ BX, R9 |
| ANDQ $3, R9 |
| SHRQ $2, BX |
| loop1: |
| TESTQ R9, R9; JZ loop1done |
| loop1cont: |
| // unroll 1X: R10 = next lower word of x; R8 is shifted left by s with the top s bits of R10 filling in from the right, stored to z, and then R10 becomes the new R8 |
| MOVQ -16(SI), R10 |
| SHLQ CX, R10, R8 |
| MOVQ R8, -8(DI) |
| MOVQ R10, R8 |
| LEAQ -8(SI), SI // ADD $-8, SI |
| LEAQ -8(DI), DI // ADD $-8, DI |
| SUBQ $1, R9; JNZ loop1cont |
| loop1done: |
| loop4: |
| TESTQ BX, BX; JZ loop4done |
| loop4cont: |
| // unroll 4X: loads the next four lower words, then each double-width shift combines a word with its lower neighbour; the lowest loaded word (R12) is carried into the next iteration as R8 |
| MOVQ -16(SI), R9 |
| MOVQ -24(SI), R10 |
| MOVQ -32(SI), R11 |
| MOVQ -40(SI), R12 |
| SHLQ CX, R9, R8 |
| SHLQ CX, R10, R9 |
| SHLQ CX, R11, R10 |
| SHLQ CX, R12, R11 |
| MOVQ R8, -8(DI) |
| MOVQ R9, -16(DI) |
| MOVQ R10, -24(DI) |
| MOVQ R11, -32(DI) |
| MOVQ R12, R8 |
| LEAQ -32(SI), SI // ADD $-32, SI |
| LEAQ -32(DI), DI // ADD $-32, DI |
| SUBQ $1, BX; JNZ loop4cont |
| loop4done: |
| // store final shifted bits: R8 now holds the lowest word of x, which has no lower neighbour, so a plain shift fills with zeros; the result goes to the lowest word of z |
| SHLQ CX, R8 |
| MOVQ R8, -8(DI) |
| RET |
| ret0: |
| MOVQ $0, c+56(FP) |
| RET |
| |
| // func rshVU(z, x []Word, s uint) (c Word): shifts the len(z)-word vector x right by s bits into z, working from the lowest word up to the highest, and returns in the high bits of c the s bits shifted out of the bottom word. When len(z) is 0 it jumps to ret0, which returns c = 0 without touching z or x. NOTE(review): the count is taken from CX by the hardware shift instructions, so s is presumably expected to be below 64; confirm against callers. |
| TEXT ·rshVU(SB), NOSPLIT, $0 |
| MOVQ z_len+8(FP), BX |
| TESTQ BX, BX; JZ ret0 |
| MOVQ s+48(FP), CX |
| MOVQ x_base+24(FP), SI |
| MOVQ z_base+0(FP), DI |
| // shift first word into carry: R8 = lowest word of x; the double-width shift moves its low s bits into the top of the zeroed R9, which is stored as c |
| MOVQ 0(SI), R8 |
| MOVQ $0, R9 |
| SHRQ CX, R8, R9 |
| MOVQ R9, c+56(FP) |
| // shift remaining words: len(z)-1 word pairs are left; R8 always holds the most recently loaded source word |
| SUBQ $1, BX |
| // compute unrolled loop lengths: R9 = (len(z)-1) & 3 iterations of the 1X loop, BX = (len(z)-1) >> 2 iterations of the 4X loop |
| MOVQ BX, R9 |
| ANDQ $3, R9 |
| SHRQ $2, BX |
| loop1: |
| TESTQ R9, R9; JZ loop1done |
| loop1cont: |
| // unroll 1X: R10 = next higher word of x; R8 is shifted right by s with the low s bits of R10 filling in from the left, stored to z, and then R10 becomes the new R8 |
| MOVQ 8(SI), R10 |
| SHRQ CX, R10, R8 |
| MOVQ R8, 0(DI) |
| MOVQ R10, R8 |
| LEAQ 8(SI), SI // ADD $8, SI |
| LEAQ 8(DI), DI // ADD $8, DI |
| SUBQ $1, R9; JNZ loop1cont |
| loop1done: |
| loop4: |
| TESTQ BX, BX; JZ loop4done |
| loop4cont: |
| // unroll 4X: loads the next four higher words, then each double-width shift combines a word with its higher neighbour; the highest loaded word (R12) is carried into the next iteration as R8 |
| MOVQ 8(SI), R9 |
| MOVQ 16(SI), R10 |
| MOVQ 24(SI), R11 |
| MOVQ 32(SI), R12 |
| SHRQ CX, R9, R8 |
| SHRQ CX, R10, R9 |
| SHRQ CX, R11, R10 |
| SHRQ CX, R12, R11 |
| MOVQ R8, 0(DI) |
| MOVQ R9, 8(DI) |
| MOVQ R10, 16(DI) |
| MOVQ R11, 24(DI) |
| MOVQ R12, R8 |
| LEAQ 32(SI), SI // ADD $32, SI |
| LEAQ 32(DI), DI // ADD $32, DI |
| SUBQ $1, BX; JNZ loop4cont |
| loop4done: |
| // store final shifted bits: R8 now holds the highest word of x, which has no higher neighbour, so a plain shift fills with zeros; the result goes to the highest word of z |
| SHRQ CX, R8 |
| MOVQ R8, 0(DI) |
| RET |
| ret0: |
| MOVQ $0, c+56(FP) |
| RET |
| |
| // func mulAddVWW(z, x []Word, m, a Word) (c Word): computes z = x*m + a over len(z) words. For each word, z[i] receives the low word of x[i]*m + carry and the high word becomes the next carry; the carry starts as a and its final value is returned in c. Only len(z) is read here; presumably the caller guarantees x holds at least len(z) words (confirm against callers). |
| TEXT ·mulAddVWW(SB), NOSPLIT, $0 |
| MOVQ m+48(FP), BX |
| MOVQ a+56(FP), SI |
| MOVQ z_len+8(FP), DI |
| MOVQ x_base+24(FP), R8 |
| MOVQ z_base+0(FP), R9 |
| // compute unrolled loop lengths: R10 = len(z) & 3 iterations of the 1X loop, DI = len(z) >> 2 iterations of the 4X loop |
| MOVQ DI, R10 |
| ANDQ $3, R10 |
| SHRQ $2, DI |
| loop1: |
| TESTQ R10, R10; JZ loop1done |
| loop1cont: |
| // unroll 1X in batches of 1: BX = m, SI = running carry word, R8 and R9 walk x and z |
| MOVQ 0(R8), AX |
| // multiply: MULQ leaves the 128-bit product x[i]*m in DX:AX; the carry SI is added to the low word, and the high word plus the resulting carry flag becomes the new SI |
| MULQ BX |
| ADDQ SI, AX |
| MOVQ DX, SI |
| ADCQ $0, SI |
| MOVQ AX, 0(R9) |
| LEAQ 8(R8), R8 // ADD $8, R8 |
| LEAQ 8(R9), R9 // ADD $8, R9 |
| SUBQ $1, R10; JNZ loop1cont |
| loop1done: |
| loop4: |
| TESTQ DI, DI; JZ loop4done |
| loop4cont: |
| // unroll 4X in batches of 1: the same multiply-and-carry step as the 1X loop, written out for four consecutive words at offsets 0, 8, 16 and 24 |
| MOVQ 0(R8), AX |
| // multiply |
| MULQ BX |
| ADDQ SI, AX |
| MOVQ DX, SI |
| ADCQ $0, SI |
| MOVQ AX, 0(R9) |
| MOVQ 8(R8), AX |
| // multiply |
| MULQ BX |
| ADDQ SI, AX |
| MOVQ DX, SI |
| ADCQ $0, SI |
| MOVQ AX, 8(R9) |
| MOVQ 16(R8), AX |
| // multiply |
| MULQ BX |
| ADDQ SI, AX |
| MOVQ DX, SI |
| ADCQ $0, SI |
| MOVQ AX, 16(R9) |
| MOVQ 24(R8), AX |
| // multiply |
| MULQ BX |
| ADDQ SI, AX |
| MOVQ DX, SI |
| ADCQ $0, SI |
| MOVQ AX, 24(R9) |
| LEAQ 32(R8), R8 // ADD $32, R8 |
| LEAQ 32(R9), R9 // ADD $32, R9 |
| SUBQ $1, DI; JNZ loop4cont |
| loop4done: |
| MOVQ SI, c+64(FP) |
| RET |
| |
| // func addMulVVWW(z, x, y []Word, m, a Word) (c Word): computes z = x + y*m + a over len(z) words and returns the final carry word in c. There are two implementations: the default path below uses MULQ with ADCQ, and the altcarry path, taken when the byte ·hasADX is nonzero, uses MULXQ with the ADCXQ/ADOXQ dual carry chains. NOTE(review): ·hasADX is defined outside this file; presumably it reports CPU support for those instructions (confirm at its definition). Only len(z) is read here; presumably the caller guarantees x and y hold at least len(z) words. |
| TEXT ·addMulVVWW(SB), NOSPLIT, $0 |
| CMPB ·hasADX(SB), $0; JNZ altcarry |
| MOVQ m+72(FP), BX |
| MOVQ a+80(FP), SI |
| MOVQ z_len+8(FP), DI |
| MOVQ x_base+24(FP), R8 |
| MOVQ y_base+48(FP), R9 |
| MOVQ z_base+0(FP), R10 |
| // compute unrolled loop lengths: R11 = len(z) & 3 iterations of the 1X loop, DI = len(z) >> 2 iterations of the 4X loop |
| MOVQ DI, R11 |
| ANDQ $3, R11 |
| SHRQ $2, DI |
| loop1: |
| TESTQ R11, R11; JZ loop1done |
| loop1cont: |
| // unroll 1X in batches of 1: BX = m, SI = running carry word, R8, R9 and R10 walk x, y and z |
| MOVQ 0(R9), AX |
| // multiply: MULQ leaves the 128-bit product y[i]*m in DX:AX; the carry SI is added to the low word, and the high word plus the resulting carry flag becomes the new SI |
| MULQ BX |
| ADDQ SI, AX |
| MOVQ DX, SI |
| ADCQ $0, SI |
| // add: x[i] is added to the low word and any carry-out is folded into SI before the low word is stored to z[i] |
| ADDQ 0(R8), AX |
| ADCQ $0, SI |
| MOVQ AX, 0(R10) |
| LEAQ 8(R8), R8 // ADD $8, R8 |
| LEAQ 8(R9), R9 // ADD $8, R9 |
| LEAQ 8(R10), R10 // ADD $8, R10 |
| SUBQ $1, R11; JNZ loop1cont |
| loop1done: |
| loop4: |
| TESTQ DI, DI; JZ loop4done |
| loop4cont: |
| // unroll 4X in batches of 1: the same multiply, carry and add steps as the 1X loop, written out for four consecutive words at offsets 0, 8, 16 and 24 |
| MOVQ 0(R9), AX |
| // multiply |
| MULQ BX |
| ADDQ SI, AX |
| MOVQ DX, SI |
| ADCQ $0, SI |
| // add |
| ADDQ 0(R8), AX |
| ADCQ $0, SI |
| MOVQ AX, 0(R10) |
| MOVQ 8(R9), AX |
| // multiply |
| MULQ BX |
| ADDQ SI, AX |
| MOVQ DX, SI |
| ADCQ $0, SI |
| // add |
| ADDQ 8(R8), AX |
| ADCQ $0, SI |
| MOVQ AX, 8(R10) |
| MOVQ 16(R9), AX |
| // multiply |
| MULQ BX |
| ADDQ SI, AX |
| MOVQ DX, SI |
| ADCQ $0, SI |
| // add |
| ADDQ 16(R8), AX |
| ADCQ $0, SI |
| MOVQ AX, 16(R10) |
| MOVQ 24(R9), AX |
| // multiply |
| MULQ BX |
| ADDQ SI, AX |
| MOVQ DX, SI |
| ADCQ $0, SI |
| // add |
| ADDQ 24(R8), AX |
| ADCQ $0, SI |
| MOVQ AX, 24(R10) |
| LEAQ 32(R8), R8 // ADD $32, R8 |
| LEAQ 32(R9), R9 // ADD $32, R9 |
| LEAQ 32(R10), R10 // ADD $32, R10 |
| SUBQ $1, DI; JNZ loop4cont |
| loop4done: |
| MOVQ SI, c+88(FP) |
| RET |
| altcarry: |
| MOVQ m+72(FP), DX |
| MOVQ a+80(FP), BX |
| MOVQ z_len+8(FP), SI |
| MOVQ $0, DI |
| MOVQ x_base+24(FP), R8 |
| MOVQ y_base+48(FP), R9 |
| MOVQ z_base+0(FP), R10 |
| // compute unrolled loop lengths: R11 = len(z) & 7 iterations of the 1X loop, SI = len(z) >> 3 iterations of the 8X loop |
| MOVQ SI, R11 |
| ANDQ $7, R11 |
| SHRQ $3, SI |
| alt1: |
| TESTQ R11, R11; JZ alt1done |
| alt1cont: |
| // unroll 1X: one word per iteration; DX = m is the implicit MULXQ multiplicand, BX = running carry word, DI = constant 0 |
| // multiply and add: MULXQ puts the low product word in R13 and the high word in R12 without touching the flags; ADCXQ adds the carry word through CF and ADOXQ adds x[i] through OF, so the two additions do not disturb each other; after the store, the high word plus both pending flags (added via the zero register DI) becomes the next BX |
| TESTQ AX, AX // clear carry (TESTQ clears both CF and OF) |
| TESTQ AX, AX // clear carry (repeated; presumably emitted once for each of the two carry chains) |
| MULXQ 0(R9), R13, R12 |
| ADCXQ BX, R13 |
| ADOXQ 0(R8), R13 |
| MOVQ R13, 0(R10) |
| MOVQ R12, BX |
| ADCXQ DI, BX |
| ADOXQ DI, BX |
| LEAQ 8(R8), R8 // ADD $8, R8 |
| LEAQ 8(R9), R9 // ADD $8, R9 |
| LEAQ 8(R10), R10 // ADD $8, R10 |
| SUBQ $1, R11; JNZ alt1cont |
| alt1done: |
| alt8: |
| TESTQ SI, SI; JZ alt8done |
| alt8cont: |
| // unroll 8X in batches of 2: eight words per iteration; each batch multiplies two consecutive words and stores both results together, with R11 (free once the 1X loop has finished) and BX alternating as the holder of the previous high product word |
| // multiply and add: both flag chains stay live across all eight words; the two ADCXQ/ADOXQ with the zero register DI at the end fold the pending CF and OF into BX before SUBQ overwrites the flags |
| TESTQ AX, AX // clear carry (TESTQ clears both CF and OF) |
| TESTQ AX, AX // clear carry (repeated; presumably emitted once for each of the two carry chains) |
| MULXQ 0(R9), R13, R11 |
| ADCXQ BX, R13 |
| ADOXQ 0(R8), R13 |
| MULXQ 8(R9), R14, BX |
| ADCXQ R11, R14 |
| ADOXQ 8(R8), R14 |
| MOVQ R13, 0(R10) |
| MOVQ R14, 8(R10) |
| MULXQ 16(R9), R13, R11 |
| ADCXQ BX, R13 |
| ADOXQ 16(R8), R13 |
| MULXQ 24(R9), R14, BX |
| ADCXQ R11, R14 |
| ADOXQ 24(R8), R14 |
| MOVQ R13, 16(R10) |
| MOVQ R14, 24(R10) |
| MULXQ 32(R9), R13, R11 |
| ADCXQ BX, R13 |
| ADOXQ 32(R8), R13 |
| MULXQ 40(R9), R14, BX |
| ADCXQ R11, R14 |
| ADOXQ 40(R8), R14 |
| MOVQ R13, 32(R10) |
| MOVQ R14, 40(R10) |
| MULXQ 48(R9), R13, R11 |
| ADCXQ BX, R13 |
| ADOXQ 48(R8), R13 |
| MULXQ 56(R9), R14, BX |
| ADCXQ R11, R14 |
| ADOXQ 56(R8), R14 |
| MOVQ R13, 48(R10) |
| MOVQ R14, 56(R10) |
| ADCXQ DI, BX |
| ADOXQ DI, BX |
| LEAQ 64(R8), R8 // ADD $64, R8 |
| LEAQ 64(R9), R9 // ADD $64, R9 |
| LEAQ 64(R10), R10 // ADD $64, R10 |
| SUBQ $1, SI; JNZ alt8cont |
| alt8done: |
| MOVQ BX, c+88(FP) |
| RET |