| // Copyright 2025 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| // Code generated by 'go generate' (with ./internal/asmgen). DO NOT EDIT. |
| |
| //go:build !math_big_pure_go |
| |
| #include "textflag.h" |
| |
| // func addVV(z, x, y []Word) (c Word) |
//
// addVV stores x[i] + y[i] + carry into z[i] for i = 0 .. len(z)-1, with the
// carry starting at 0, and returns the final carry (0 or 1) in c.
// Only len(z) is loaded; x and y are read for that same number of words.
// NOTE(review): presumably callers guarantee len(x), len(y) >= len(z) - confirm.
//
// Frame layout (8-byte words): z slice at 0, x slice at 24, y slice at 48,
// result c at 72.
//
// Register use:
//   R0      count of 4-word groups still to do (len(z) >> 2)
//   R1, R2  read cursors into x and y (post-incremented by the loads)
//   R3      write cursor into z (post-incremented by the stores)
//   R4      count of leftover single words (len(z) & 3); reused as data in loop4
//   R4-R11  data words
//
// The carry is kept in the C flag for the whole routine. The loop counters
// are therefore updated with SUB (not SUBS) and tested with CBZ/CBNZ, none of
// which write the flags.
| TEXT ·addVV(SB), NOSPLIT, $0 |
| MOVD z_len+8(FP), R0 |
| MOVD x_base+24(FP), R1 |
| MOVD y_base+48(FP), R2 |
| MOVD z_base+0(FP), R3 |
| // compute unrolled loop lengths |
// R4 = len(z) mod 4 (single-word iterations), R0 = len(z) / 4 (4-word iterations).
| AND $3, R0, R4 |
| LSR $2, R0 |
// Adding zero cannot carry out, so this leaves R0 unchanged and sets C = 0.
| ADDS ZR, R0 // clear carry |
// The leftover words are handled first, then the 4-word groups.
| loop1: |
| CBZ R4, loop1done |
| loop1cont: |
| // unroll 1X |
// z[i] = x[i] + y[i] + C; ADCS writes the new carry back to C.
| MOVD.P 8(R1), R5 |
| MOVD.P 8(R2), R6 |
| ADCS R6, R5 |
| MOVD.P R5, 8(R3) |
| SUB $1, R4 |
| CBNZ R4, loop1cont |
| loop1done: |
| loop4: |
| CBZ R0, loop4done |
| loop4cont: |
| // unroll 4X |
// Each LDP.P loads two words and then advances its cursor by 32 bytes, so the
// following plain LDP reaches the third and fourth words at offset -16.
| LDP.P 32(R1), (R4, R5) |
| LDP -16(R1), (R6, R7) |
| LDP.P 32(R2), (R8, R9) |
| LDP -16(R2), (R10, R11) |
// Four-link carry chain, lowest word first.
| ADCS R8, R4 |
| ADCS R9, R5 |
| ADCS R10, R6 |
| ADCS R11, R7 |
// Same addressing pattern as the loads: store two, advance by 32, store two at -16.
| STP.P (R4, R5), 32(R3) |
| STP (R6, R7), -16(R3) |
| SUB $1, R0 |
| CBNZ R0, loop4cont |
| loop4done: |
// R1 = 0 + 0 + C, i.e. the carry flag materialised as 0 or 1.
| ADC ZR, ZR, R1 // save & convert add carry |
| MOVD R1, c+72(FP) |
| RET |
| |
| // func subVV(z, x, y []Word) (c Word) |
//
// subVV stores x[i] - y[i] - borrow into z[i] for i = 0 .. len(z)-1, with the
// borrow starting at 0, and returns the final borrow (0 or 1) in c.
// Only len(z) is loaded; x and y are read for that same number of words.
// NOTE(review): presumably callers guarantee len(x), len(y) >= len(z) - confirm.
//
// Frame layout (8-byte words): z slice at 0, x slice at 24, y slice at 48,
// result c at 72.
//
// Register use:
//   R0      count of 4-word groups still to do (len(z) >> 2)
//   R1, R2  read cursors into x and y (post-incremented by the loads)
//   R3      write cursor into z (post-incremented by the stores)
//   R4      count of leftover single words (len(z) & 3); reused as data in loop4
//   R4-R11  data words
//
// The borrow is kept in the C flag for the whole routine. For subtraction the
// A64 C flag is inverted: C = 1 means "no borrow", C = 0 means "borrow".
// The loop counters are updated with SUB (not SUBS) and tested with CBZ/CBNZ,
// none of which write the flags.
| TEXT ·subVV(SB), NOSPLIT, $0 |
| MOVD z_len+8(FP), R0 |
| MOVD x_base+24(FP), R1 |
| MOVD y_base+48(FP), R2 |
| MOVD z_base+0(FP), R3 |
| // compute unrolled loop lengths |
// R4 = len(z) mod 4 (single-word iterations), R0 = len(z) / 4 (4-word iterations).
| AND $3, R0, R4 |
| LSR $2, R0 |
// Subtracting zero never borrows, so this leaves R0 unchanged and sets C = 1,
// which is the "no borrow" state SBCS expects on entry.
| SUBS ZR, R0 // clear carry |
// The leftover words are handled first, then the 4-word groups.
| loop1: |
| CBZ R4, loop1done |
| loop1cont: |
| // unroll 1X |
// z[i] = x[i] - y[i] - borrow; SBCS writes the new (inverted) borrow back to C.
| MOVD.P 8(R1), R5 |
| MOVD.P 8(R2), R6 |
| SBCS R6, R5 |
| MOVD.P R5, 8(R3) |
| SUB $1, R4 |
| CBNZ R4, loop1cont |
| loop1done: |
| loop4: |
| CBZ R0, loop4done |
| loop4cont: |
| // unroll 4X |
// Each LDP.P loads two words and then advances its cursor by 32 bytes, so the
// following plain LDP reaches the third and fourth words at offset -16.
| LDP.P 32(R1), (R4, R5) |
| LDP -16(R1), (R6, R7) |
| LDP.P 32(R2), (R8, R9) |
| LDP -16(R2), (R10, R11) |
// Four-link borrow chain, lowest word first.
| SBCS R8, R4 |
| SBCS R9, R5 |
| SBCS R10, R6 |
| SBCS R11, R7 |
// Same addressing pattern as the loads: store two, advance by 32, store two at -16.
| STP.P (R4, R5), 32(R3) |
| STP (R6, R7), -16(R3) |
| SUB $1, R0 |
| CBNZ R0, loop4cont |
| loop4done: |
// R1 - R1 - (1 - C) yields 0 when there was no borrow and -1 when there was;
// the previous contents of R1 (the x cursor) cancel out.
| SBC R1, R1 // save carry |
// Negate so the result is 0 or 1 instead of 0 or -1.
| SUB R1, ZR, R1 // convert sub carry |
| MOVD R1, c+72(FP) |
| RET |
| |
| // func lshVU(z, x []Word, s uint) (c Word) |
//
// lshVU writes x << s into z, treating x[0 .. len(z)-1] as one multi-word
// value with x[0] least significant, and returns in c the bits shifted out of
// the top word: x[len(z)-1] >> (64-s). If len(z) == 0 nothing is written and
// c is 0. Only len(z) is loaded; x is read for that same number of words.
// NOTE(review): presumably callers guarantee 0 < s < 64 - confirm. The 64-s
// count computed below is only meaningful in that range (A64 register shifts
// use the count modulo 64).
//
// Frame layout (8-byte words): z slice at 0, x slice at 24, s at 48,
// result c at 56.
//
// Words are processed from the highest index down to index 0.
// NOTE(review): presumably this direction allows z to overlap x at a higher
// address - confirm against callers.
//
// Register use:
//   R0      remaining word count, later count of 4-word groups
//   R1      s
//   R2      read cursor into x, starts one past the last word, pre-decremented
//   R3      write cursor into z, starts one past the last word, pre-decremented
//   R4      pending bits: the most recently read word << s, waiting to be
//           ORed with the top bits of the next lower word
//   R5      64 - s
//   R6      c, then count of leftover single words, then data in loop4
//   R6-R10  data words
| TEXT ·lshVU(SB), NOSPLIT, $0 |
| MOVD z_len+8(FP), R0 |
| CBZ R0, ret0 |
| MOVD s+48(FP), R1 |
| MOVD x_base+24(FP), R2 |
| MOVD z_base+0(FP), R3 |
| // run loop backward |
// Point both cursors one word past the end: base + len(z)*8.
| ADD R0<<3, R2, R2 |
| ADD R0<<3, R3, R3 |
| // shift first word into carry |
// Load the top word x[len(z)-1]. Its high s bits become the result c and its
// remaining bits, shifted up, become the first pending value in R4.
| MOVD.W -8(R2), R4 |
| MOVD $64, R5 |
| SUB R1, R5 |
| LSR R5, R4, R6 |
| LSL R1, R4 |
| MOVD R6, c+56(FP) |
| // shift remaining words |
// One word has been consumed; len(z)-1 remain to be read.
| SUB $1, R0 |
| // compute unrolled loop lengths |
// R6 = remaining mod 4 (single-word iterations), R0 = remaining / 4.
| AND $3, R0, R6 |
| LSR $2, R0 |
| loop1: |
| CBZ R6, loop1done |
| loop1cont: |
| // unroll 1X |
// Read the next lower word, combine its top bits with the pending value to
// form the output word one index above it, then make its low bits pending.
| MOVD.W -8(R2), R7 |
| LSR R5, R7, R8 |
| ORR R4, R8 |
| LSL R1, R7, R4 |
| MOVD.W R8, -8(R3) |
| SUB $1, R6 |
| CBNZ R6, loop1cont |
| loop1done: |
| loop4: |
| CBZ R0, loop4done |
| loop4cont: |
| // unroll 4X |
// LDP.W moves R2 down by 32 bytes first. After both loads the group is held,
// from lowest address to highest, in R9, R8, R7, R6.
| LDP.W -32(R2), (R9, R8) |
| LDP 16(R2), (R7, R6) |
// Process from the highest word (R6) down to the lowest (R9). Each step has
// the same shape as the 1X body; the outputs land in R10, R6, R7, R8
// (highest to lowest) and R4 ends up holding the lowest word << s.
| LSR R5, R6, R10 |
| ORR R4, R10 |
| LSL R1, R6, R4 |
| LSR R5, R7, R6 |
| ORR R4, R6 |
| LSL R1, R7, R4 |
| LSR R5, R8, R7 |
| ORR R4, R7 |
| LSL R1, R8, R4 |
| LSR R5, R9, R8 |
| ORR R4, R8 |
| LSL R1, R9, R4 |
// STP.W moves R3 down by 32 bytes first, then the four outputs are stored in
// ascending address order: R8, R7, R6, R10.
| STP.W (R8, R7), -32(R3) |
| STP (R6, R10), 16(R3) |
| SUB $1, R0 |
| CBNZ R0, loop4cont |
| loop4done: |
| // store final shifted bits |
// The pending value is now x[0] << s; it becomes z[0].
| MOVD.W R4, -8(R3) |
| RET |
| ret0: |
// Empty input: return c = 0 without touching z or x.
| MOVD ZR, c+56(FP) |
| RET |
| |
| // func rshVU(z, x []Word, s uint) (c Word) |
//
// rshVU writes x >> s into z, treating x[0 .. len(z)-1] as one multi-word
// value with x[0] least significant, and returns in c the bits shifted out of
// the bottom word: x[0] << (64-s). If len(z) == 0 nothing is written and c
// is 0. Only len(z) is loaded; x is read for that same number of words.
// NOTE(review): presumably callers guarantee 0 < s < 64 - confirm. The 64-s
// count computed below is only meaningful in that range (A64 register shifts
// use the count modulo 64).
//
// Frame layout (8-byte words): z slice at 0, x slice at 24, s at 48,
// result c at 56.
//
// Words are processed from index 0 upward.
//
// Register use:
//   R0      remaining word count, later count of 4-word groups
//   R1      s
//   R2      read cursor into x (post-incremented by the loads)
//   R3      write cursor into z (post-incremented by the stores)
//   R4      pending bits: the most recently read word >> s, waiting to be
//           ORed with the low bits of the next higher word
//   R5      64 - s
//   R6      c, then count of leftover single words, then data in loop4
//   R6-R10  data words
| TEXT ·rshVU(SB), NOSPLIT, $0 |
| MOVD z_len+8(FP), R0 |
| CBZ R0, ret0 |
| MOVD s+48(FP), R1 |
| MOVD x_base+24(FP), R2 |
| MOVD z_base+0(FP), R3 |
| // shift first word into carry |
// Load x[0]. Its low s bits, moved to the top of the word, become the result
// c and its remaining bits, shifted down, become the first pending value.
| MOVD.P 8(R2), R4 |
| MOVD $64, R5 |
| SUB R1, R5 |
| LSL R5, R4, R6 |
| LSR R1, R4 |
| MOVD R6, c+56(FP) |
| // shift remaining words |
// One word has been consumed; len(z)-1 remain to be read.
| SUB $1, R0 |
| // compute unrolled loop lengths |
// R6 = remaining mod 4 (single-word iterations), R0 = remaining / 4.
| AND $3, R0, R6 |
| LSR $2, R0 |
| loop1: |
| CBZ R6, loop1done |
| loop1cont: |
| // unroll 1X |
// Read the next higher word, combine its low bits with the pending value to
// form the output word one index below it, then make its high bits pending.
| MOVD.P 8(R2), R7 |
| LSL R5, R7, R8 |
| ORR R4, R8 |
| LSR R1, R7, R4 |
| MOVD.P R8, 8(R3) |
| SUB $1, R6 |
| CBNZ R6, loop1cont |
| loop1done: |
| loop4: |
| CBZ R0, loop4done |
| loop4cont: |
| // unroll 4X |
// LDP.P loads two words and then advances R2 by 32 bytes, so the plain LDP
// reaches the third and fourth words at offset -16. The group is held, from
// lowest address to highest, in R6, R7, R8, R9.
| LDP.P 32(R2), (R6, R7) |
| LDP -16(R2), (R8, R9) |
// Process from the lowest word (R6) up to the highest (R9). Each step has the
// same shape as the 1X body; the outputs land in R10, R6, R7, R8 (lowest to
// highest) and R4 ends up holding the highest word >> s.
| LSL R5, R6, R10 |
| ORR R4, R10 |
| LSR R1, R6, R4 |
| LSL R5, R7, R6 |
| ORR R4, R6 |
| LSR R1, R7, R4 |
| LSL R5, R8, R7 |
| ORR R4, R7 |
| LSR R1, R8, R4 |
| LSL R5, R9, R8 |
| ORR R4, R8 |
| LSR R1, R9, R4 |
// Store the four outputs in ascending address order: R10, R6, R7, R8.
| STP.P (R10, R6), 32(R3) |
| STP (R7, R8), -16(R3) |
| SUB $1, R0 |
| CBNZ R0, loop4cont |
| loop4done: |
| // store final shifted bits |
// The pending value is now x[len(z)-1] >> s; it becomes z[len(z)-1].
| MOVD.P R4, 8(R3) |
| RET |
| ret0: |
// Empty input: return c = 0 without touching z or x.
| MOVD ZR, c+56(FP) |
| RET |
| |
| // func mulAddVWW(z, x []Word, m, a Word) (c Word) |
//
// mulAddVWW computes, for i = 0 .. len(z)-1,
//     (carry, z[i]) = x[i]*m + carry
// as a 128-bit result, with the carry word starting at a, and returns the
// final carry word in c. Only len(z) is loaded; x is read for that same
// number of words. With len(z) == 0 both loops are skipped and c = a.
// NOTE(review): presumably callers guarantee len(x) >= len(z) - confirm.
//
// Frame layout (8-byte words): z slice at 0, x slice at 24, m at 48, a at 56,
// result c at 64.
//
// Register use:
//   R0        m
//   R1        running carry word (starts as a)
//   R2        count of 8-word groups still to do (len(z) >> 3)
//   R3        read cursor into x (post-incremented by the loads)
//   R4        write cursor into z (post-incremented by the stores)
//   R5        count of leftover single words (len(z) & 7); reused as data in loop8
//   R5-R12    data words / low product halves
//   R13, R14  high product halves, used alternately in loop8 (R7 in loop1)
| TEXT ·mulAddVWW(SB), NOSPLIT, $0 |
| MOVD m+48(FP), R0 |
| MOVD a+56(FP), R1 |
| MOVD z_len+8(FP), R2 |
| MOVD x_base+24(FP), R3 |
| MOVD z_base+0(FP), R4 |
| // compute unrolled loop lengths |
// R5 = len(z) mod 8 (single-word iterations), R2 = len(z) / 8 (8-word iterations).
| AND $7, R2, R5 |
| LSR $3, R2 |
// The leftover words are handled first, then the 8-word groups.
| loop1: |
| CBZ R5, loop1done |
| loop1cont: |
| // unroll 1X |
| MOVD.P 8(R3), R6 |
| // multiply |
// R7:R6 = x[i] * m (UMULH gives the high 64 bits, MUL the low 64 bits).
// Then z[i] = low + carry, and the new carry = high + carry-out of that add.
// The high half of a 64x64 product is at most 2^64-2, so the ADC cannot wrap.
| UMULH R0, R6, R7 |
| MUL R0, R6 |
| ADDS R1, R6 |
| ADC ZR, R7, R1 |
| MOVD.P R6, 8(R4) |
| SUB $1, R5 |
| CBNZ R5, loop1cont |
| loop1done: |
| loop8: |
| CBZ R2, loop8done |
| loop8cont: |
| // unroll 8X |
// LDP.P loads two words and then advances R3 by 64 bytes; the following
// plain LDPs reach the other six words at negative offsets.
| LDP.P 64(R3), (R5, R6) |
| LDP -48(R3), (R7, R8) |
| LDP -32(R3), (R9, R10) |
| LDP -16(R3), (R11, R12) |
| // multiply |
// One carry chain across all eight words. ADDS starts it by adding the
// incoming carry word to the first low half; each ADCS then adds the previous
// word's high half plus the C flag to the next low half. UMULH and MUL do not
// write the flags, so they can sit between the links of the chain.
| UMULH R0, R5, R13 |
| MUL R0, R5 |
| ADDS R1, R5 |
| UMULH R0, R6, R14 |
| MUL R0, R6 |
| ADCS R13, R6 |
| UMULH R0, R7, R13 |
| MUL R0, R7 |
| ADCS R14, R7 |
| UMULH R0, R8, R14 |
| MUL R0, R8 |
| ADCS R13, R8 |
| UMULH R0, R9, R13 |
| MUL R0, R9 |
| ADCS R14, R9 |
| UMULH R0, R10, R14 |
| MUL R0, R10 |
| ADCS R13, R10 |
| UMULH R0, R11, R13 |
| MUL R0, R11 |
| ADCS R14, R11 |
| UMULH R0, R12, R14 |
| MUL R0, R12 |
| ADCS R13, R12 |
// Close the chain: the outgoing carry word is the last high half plus C.
| ADC ZR, R14, R1 |
| STP.P (R5, R6), 64(R4) |
| STP (R7, R8), -48(R4) |
| STP (R9, R10), -32(R4) |
| STP (R11, R12), -16(R4) |
| SUB $1, R2 |
| CBNZ R2, loop8cont |
| loop8done: |
| MOVD R1, c+64(FP) |
| RET |
| |
| // func addMulVVWW(z, x, y []Word, m, a Word) (c Word) |
//
// addMulVVWW computes, for i = 0 .. len(z)-1,
//     (carry, z[i]) = x[i] + y[i]*m + carry
// as a 128-bit result, with the carry word starting at a, and returns the
// final carry word in c. Only len(z) is loaded; x and y are read for that
// same number of words. With len(z) == 0 both loops are skipped and c = a.
// NOTE(review): presumably callers guarantee len(x), len(y) >= len(z) - confirm.
//
// Frame layout (8-byte words): z slice at 0, x slice at 24, y slice at 48,
// m at 72, a at 80, result c at 88.
//
// Register use:
//   R0        m
//   R1        running carry word (starts as a)
//   R2        count of 8-word groups still to do (len(z) >> 3)
//   R3, R4    read cursors into x and y (post-incremented by the loads)
//   R5        write cursor into z (post-incremented by the stores)
//   R6        count of leftover single words (len(z) & 7); reused as data in loop8
//   R6-R13    x words in loop8
//   R14-R17, R19-R22  y words, then low product halves, then results in loop8
//   R23, R24  high product halves, used alternately in loop8 (R9 in loop1)
// R18 is not used by this routine.
// NOTE(review): presumably skipped because R18 is the platform-reserved
// register on some arm64 targets - confirm.
| TEXT ·addMulVVWW(SB), NOSPLIT, $0 |
| MOVD m+72(FP), R0 |
| MOVD a+80(FP), R1 |
| MOVD z_len+8(FP), R2 |
| MOVD x_base+24(FP), R3 |
| MOVD y_base+48(FP), R4 |
| MOVD z_base+0(FP), R5 |
| // compute unrolled loop lengths |
// R6 = len(z) mod 8 (single-word iterations), R2 = len(z) / 8 (8-word iterations).
| AND $7, R2, R6 |
| LSR $3, R2 |
// The leftover words are handled first, then the 8-word groups.
| loop1: |
| CBZ R6, loop1done |
| loop1cont: |
| // unroll 1X |
| MOVD.P 8(R3), R7 |
| MOVD.P 8(R4), R8 |
| // multiply |
// R9:R8 = y[i] * m; add the incoming carry word to the low half and fold the
// carry-out into the high half, which becomes the new carry word in R1.
| UMULH R0, R8, R9 |
| MUL R0, R8 |
| ADDS R1, R8 |
| ADC ZR, R9, R1 |
| // add |
// Add x[i] to the low half and fold that carry-out into R1 as well.
| ADDS R7, R8 |
| ADC ZR, R1 |
| MOVD.P R8, 8(R5) |
| SUB $1, R6 |
| CBNZ R6, loop1cont |
| loop1done: |
| loop8: |
| CBZ R2, loop8done |
| loop8cont: |
| // unroll 8X |
// Each LDP.P loads two words and then advances its cursor by 64 bytes; the
// following plain LDPs reach the other six words at negative offsets.
| LDP.P 64(R3), (R6, R7) |
| LDP -48(R3), (R8, R9) |
| LDP -32(R3), (R10, R11) |
| LDP -16(R3), (R12, R13) |
| LDP.P 64(R4), (R14, R15) |
| LDP -48(R4), (R16, R17) |
| LDP -32(R4), (R19, R20) |
| LDP -16(R4), (R21, R22) |
| // multiply |
// First carry chain: y[i..i+7]*m plus the incoming carry word. ADDS starts
// it; each ADCS adds the previous word's high half plus the C flag to the
// next low half. UMULH and MUL do not write the flags, so they can sit
// between the links of the chain.
| UMULH R0, R14, R23 |
| MUL R0, R14 |
| ADDS R1, R14 |
| UMULH R0, R15, R24 |
| MUL R0, R15 |
| ADCS R23, R15 |
| UMULH R0, R16, R23 |
| MUL R0, R16 |
| ADCS R24, R16 |
| UMULH R0, R17, R24 |
| MUL R0, R17 |
| ADCS R23, R17 |
| UMULH R0, R19, R23 |
| MUL R0, R19 |
| ADCS R24, R19 |
| UMULH R0, R20, R24 |
| MUL R0, R20 |
| ADCS R23, R20 |
| UMULH R0, R21, R23 |
| MUL R0, R21 |
| ADCS R24, R21 |
| UMULH R0, R22, R24 |
| MUL R0, R22 |
| ADCS R23, R22 |
// Close the first chain: R1 = last high half + C.
| ADC ZR, R24, R1 |
| // add |
// Second carry chain: add the eight x words to the eight product words,
// lowest first, then fold the final carry-out into R1.
| ADDS R6, R14 |
| ADCS R7, R15 |
| ADCS R8, R16 |
| ADCS R9, R17 |
| ADCS R10, R19 |
| ADCS R11, R20 |
| ADCS R12, R21 |
| ADCS R13, R22 |
| ADC ZR, R1 |
| STP.P (R14, R15), 64(R5) |
| STP (R16, R17), -48(R5) |
| STP (R19, R20), -32(R5) |
| STP (R21, R22), -16(R5) |
| SUB $1, R2 |
| CBNZ R2, loop8cont |
| loop8done: |
| MOVD R1, c+88(FP) |
| RET |