| // Copyright 2025 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| // Code generated by 'go generate' (with ./internal/asmgen). DO NOT EDIT. |
| |
| //go:build !math_big_pure_go |
| |
| #include "textflag.h" |
| |
| // func addVV(z, x, y []Word) (c Word): stores x[i] + y[i] (with carry propagation) into z[i] for z_len words and returns the final carry in c; jumps to addVVvec instead when hasVX is nonzero. |
| TEXT ·addVV(SB), NOSPLIT, $0 |
| MOVB ·hasVX(SB), R1 |
| CMPBEQ R1, $0, novec |
| JMP ·addVVvec(SB) |
| novec: |
| MOVD $0, R0 |
| MOVD z_len+8(FP), R1 |
| MOVD x_base+24(FP), R2 |
| MOVD y_base+48(FP), R3 |
| MOVD z_base+0(FP), R4 |
| // compute unrolled loop lengths: R5 = z_len & 3 words for the 1X loop, R1 = z_len >> 2 iterations of the 4X loop |
| MOVD R1, R5 |
| AND $3, R5 |
| SRD $2, R1 |
| ADDC R0, R1 // clear carry: R0 is zero, so R1 keeps its value and the ADDE chain below starts with no carry |
| loop1: |
| CMPBEQ R5, $0, loop1done |
| loop1cont: |
| // unroll 1X: load one word each from x (R2) and y (R3), add with carry, store the sum to z (R4) |
| MOVD 0(R2), R6 |
| MOVD 0(R3), R7 |
| ADDE R7, R6 |
| MOVD R6, 0(R4) |
| LAY 8(R2), R2 // ADD $8, R2 (written as LAY, presumably so the carry flag is left untouched - TODO confirm) |
| LAY 8(R3), R3 // ADD $8, R3 |
| LAY 8(R4), R4 // ADD $8, R4 |
| LAY -1(R5), R5 // ADD $-1, R5 |
| CMPBNE R5, $0, loop1cont |
| loop1done: |
| loop4: |
| CMPBEQ R1, $0, loop4done |
| loop4cont: |
| // unroll 4X in batches of 2: each batch loads two words of x and two of y, chains two ADDEs, and stores two words of z; R5-R8 are scratch here because the 1X counter in R5 is already exhausted |
| MOVD 0(R2), R5 |
| MOVD 8(R2), R6 |
| MOVD 0(R3), R7 |
| MOVD 8(R3), R8 |
| ADDE R7, R5 |
| ADDE R8, R6 |
| MOVD R5, 0(R4) |
| MOVD R6, 8(R4) |
| MOVD 16(R2), R5 |
| MOVD 24(R2), R6 |
| MOVD 16(R3), R7 |
| MOVD 24(R3), R8 |
| ADDE R7, R5 |
| ADDE R8, R6 |
| MOVD R5, 16(R4) |
| MOVD R6, 24(R4) |
| LAY 32(R2), R2 // ADD $32, R2 |
| LAY 32(R3), R3 // ADD $32, R3 |
| LAY 32(R4), R4 // ADD $32, R4 |
| LAY -1(R1), R1 // ADD $-1, R1 |
| CMPBNE R1, $0, loop4cont |
| loop4done: |
| ADDE R0, R0, R2 // save & convert add carry: R2 = 0 + 0 + carry, i.e. 0 or 1 |
| MOVD R2, c+72(FP) |
| RET |
| |
| // func subVV(z, x, y []Word) (c Word): stores x[i] - y[i] (with borrow propagation) into z[i] for z_len words and returns the final borrow (0 or 1) in c; jumps to subVVvec instead when hasVX is nonzero. |
| TEXT ·subVV(SB), NOSPLIT, $0 |
| MOVB ·hasVX(SB), R1 |
| CMPBEQ R1, $0, novec |
| JMP ·subVVvec(SB) |
| novec: |
| MOVD $0, R0 |
| MOVD z_len+8(FP), R1 |
| MOVD x_base+24(FP), R2 |
| MOVD y_base+48(FP), R3 |
| MOVD z_base+0(FP), R4 |
| // compute unrolled loop lengths: R5 = z_len & 3 words for the 1X loop, R1 = z_len >> 2 iterations of the 4X loop |
| MOVD R1, R5 |
| AND $3, R5 |
| SRD $2, R1 |
| SUBC R0, R1 // clear carry: R0 is zero, so R1 keeps its value and only the flag state is reset before the SUBE chain |
| loop1: |
| CMPBEQ R5, $0, loop1done |
| loop1cont: |
| // unroll 1X: load one word each from x (R2) and y (R3), subtract y from x with borrow, store the difference to z (R4) |
| MOVD 0(R2), R6 |
| MOVD 0(R3), R7 |
| SUBE R7, R6 |
| MOVD R6, 0(R4) |
| LAY 8(R2), R2 // ADD $8, R2 (written as LAY, presumably so the carry flag is left untouched - TODO confirm) |
| LAY 8(R3), R3 // ADD $8, R3 |
| LAY 8(R4), R4 // ADD $8, R4 |
| LAY -1(R5), R5 // ADD $-1, R5 |
| CMPBNE R5, $0, loop1cont |
| loop1done: |
| loop4: |
| CMPBEQ R1, $0, loop4done |
| loop4cont: |
| // unroll 4X in batches of 2: each batch loads two words of x and two of y, chains two SUBEs, and stores two words of z; R5-R8 are scratch here because the 1X counter in R5 is already exhausted |
| MOVD 0(R2), R5 |
| MOVD 8(R2), R6 |
| MOVD 0(R3), R7 |
| MOVD 8(R3), R8 |
| SUBE R7, R5 |
| SUBE R8, R6 |
| MOVD R5, 0(R4) |
| MOVD R6, 8(R4) |
| MOVD 16(R2), R5 |
| MOVD 24(R2), R6 |
| MOVD 16(R3), R7 |
| MOVD 24(R3), R8 |
| SUBE R7, R5 |
| SUBE R8, R6 |
| MOVD R5, 16(R4) |
| MOVD R6, 24(R4) |
| LAY 32(R2), R2 // ADD $32, R2 |
| LAY 32(R3), R3 // ADD $32, R3 |
| LAY 32(R4), R4 // ADD $32, R4 |
| LAY -1(R1), R1 // ADD $-1, R1 |
| CMPBNE R1, $0, loop4cont |
| loop4done: |
| SUBE R2, R2 // save carry: R2 - R2 - borrow leaves 0 (no borrow) or -1 (borrow) |
| NEG R2 // convert sub carry: negate so the returned value is 0 or 1 |
| MOVD R2, c+72(FP) |
| RET |
| |
| // func lshVU(z, x []Word, s uint) (c Word): stores x shifted left by s bits into z for z_len words, working from the highest-index word down; c gets the bits shifted out of x[z_len-1], or 0 when z_len is 0 (ret0). |
| TEXT ·lshVU(SB), NOSPLIT, $0 |
| MOVD $0, R0 |
| MOVD z_len+8(FP), R1 |
| CMPBEQ R1, $0, ret0 |
| MOVD s+48(FP), R2 |
| MOVD x_base+24(FP), R3 |
| MOVD z_base+0(FP), R4 |
| // run loop backward: advance R3 (x) and R4 (z) by z_len*8 bytes so both point just past the last word |
| SLD $3, R1, R5 |
| LAY (R5)(R3), R3 // ADD R5, R3 |
| SLD $3, R1, R5 |
| LAY (R5)(R4), R4 // ADD R5, R4 |
| // shift first word into carry: R5 = x[z_len-1], R6 = 64 - s, c = R5 >> R6, and R5 is left holding R5 << s as the pending bits for the next store. NOTE(review): presumably callers guarantee 0 < s < 64 - confirm |
| MOVD -8(R3), R5 |
| MOVD $64, R6 |
| SUBC R2, R6 |
| SRD R6, R5, R7 |
| SLD R2, R5 |
| MOVD R7, c+56(FP) |
| // shift remaining words: z_len-1 words of x are still to be loaded |
| SUBC $1, R1 |
| // compute unrolled loop lengths: R7 = remaining & 3 words for the 1X loop, R1 = remaining >> 2 iterations of the 4X loop |
| MOVD R1, R7 |
| AND $3, R7 |
| SRD $2, R1 |
| loop1: |
| CMPBEQ R7, $0, loop1done |
| loop1cont: |
| // unroll 1X: R8 = next lower word of x; store (R8 >> R6) | pending bits into the z word one index higher, then R5 = R8 << s becomes the new pending bits |
| MOVD -16(R3), R8 |
| SRD R6, R8, R9 |
| OR R5, R9 |
| SLD R2, R8, R5 |
| MOVD R9, -8(R4) |
| LAY -8(R3), R3 // ADD $-8, R3 |
| LAY -8(R4), R4 // ADD $-8, R4 |
| LAY -1(R7), R7 // ADD $-1, R7 |
| CMPBNE R7, $0, loop1cont |
| loop1done: |
| loop4: |
| CMPBEQ R1, $0, loop4done |
| loop4cont: |
| // unroll 4X in batches of 2: the same step applied to two words per batch; R7/R8 hold the loaded x words, R9/R7 the results stored to z, and R5 carries the pending bits between steps |
| MOVD -16(R3), R7 |
| MOVD -24(R3), R8 |
| SRD R6, R7, R9 |
| OR R5, R9 |
| SLD R2, R7, R5 |
| SRD R6, R8, R7 |
| OR R5, R7 |
| SLD R2, R8, R5 |
| MOVD R9, -8(R4) |
| MOVD R7, -16(R4) |
| MOVD -32(R3), R7 |
| MOVD -40(R3), R8 |
| SRD R6, R7, R9 |
| OR R5, R9 |
| SLD R2, R7, R5 |
| SRD R6, R8, R7 |
| OR R5, R7 |
| SLD R2, R8, R5 |
| MOVD R9, -24(R4) |
| MOVD R7, -32(R4) |
| LAY -32(R3), R3 // ADD $-32, R3 |
| LAY -32(R4), R4 // ADD $-32, R4 |
| LAY -1(R1), R1 // ADD $-1, R1 |
| CMPBNE R1, $0, loop4cont |
| loop4done: |
| // store final shifted bits: the pending bits in R5 (x[0] << s) become the lowest word z[0] |
| MOVD R5, -8(R4) |
| RET |
| ret0: |
| MOVD R0, c+56(FP) |
| RET |
| |
| // func rshVU(z, x []Word, s uint) (c Word): stores x shifted right by s bits into z for z_len words, working from index 0 upward; c gets the bits shifted out of x[0] (as x[0] << (64 - s)), or 0 when z_len is 0 (ret0). |
| TEXT ·rshVU(SB), NOSPLIT, $0 |
| MOVD $0, R0 |
| MOVD z_len+8(FP), R1 |
| CMPBEQ R1, $0, ret0 |
| MOVD s+48(FP), R2 |
| MOVD x_base+24(FP), R3 |
| MOVD z_base+0(FP), R4 |
| // shift first word into carry: R5 = x[0], R6 = 64 - s, c = R5 << R6, and R5 is left holding R5 >> s as the pending bits for the next store. NOTE(review): presumably callers guarantee 0 < s < 64 - confirm |
| MOVD 0(R3), R5 |
| MOVD $64, R6 |
| SUBC R2, R6 |
| SLD R6, R5, R7 |
| SRD R2, R5 |
| MOVD R7, c+56(FP) |
| // shift remaining words: z_len-1 words of x are still to be loaded |
| SUBC $1, R1 |
| // compute unrolled loop lengths: R7 = remaining & 3 words for the 1X loop, R1 = remaining >> 2 iterations of the 4X loop |
| MOVD R1, R7 |
| AND $3, R7 |
| SRD $2, R1 |
| loop1: |
| CMPBEQ R7, $0, loop1done |
| loop1cont: |
| // unroll 1X: R8 = next higher word of x; store (R8 << R6) | pending bits into the z word one index lower, then R5 = R8 >> s becomes the new pending bits |
| MOVD 8(R3), R8 |
| SLD R6, R8, R9 |
| OR R5, R9 |
| SRD R2, R8, R5 |
| MOVD R9, 0(R4) |
| LAY 8(R3), R3 // ADD $8, R3 |
| LAY 8(R4), R4 // ADD $8, R4 |
| LAY -1(R7), R7 // ADD $-1, R7 |
| CMPBNE R7, $0, loop1cont |
| loop1done: |
| loop4: |
| CMPBEQ R1, $0, loop4done |
| loop4cont: |
| // unroll 4X in batches of 2: the same step applied to two words per batch; R7/R8 hold the loaded x words, R9/R7 the results stored to z, and R5 carries the pending bits between steps |
| MOVD 8(R3), R7 |
| MOVD 16(R3), R8 |
| SLD R6, R7, R9 |
| OR R5, R9 |
| SRD R2, R7, R5 |
| SLD R6, R8, R7 |
| OR R5, R7 |
| SRD R2, R8, R5 |
| MOVD R9, 0(R4) |
| MOVD R7, 8(R4) |
| MOVD 24(R3), R7 |
| MOVD 32(R3), R8 |
| SLD R6, R7, R9 |
| OR R5, R9 |
| SRD R2, R7, R5 |
| SLD R6, R8, R7 |
| OR R5, R7 |
| SRD R2, R8, R5 |
| MOVD R9, 16(R4) |
| MOVD R7, 24(R4) |
| LAY 32(R3), R3 // ADD $32, R3 |
| LAY 32(R4), R4 // ADD $32, R4 |
| LAY -1(R1), R1 // ADD $-1, R1 |
| CMPBNE R1, $0, loop4cont |
| loop4done: |
| // store final shifted bits: the pending bits in R5 (x[z_len-1] >> s) become the highest word z[z_len-1] |
| MOVD R5, 0(R4) |
| RET |
| ret0: |
| MOVD R0, c+56(FP) |
| RET |
| |
| // func mulAddVWW(z, x []Word, m, a Word) (c Word): for z_len words stores the low word of x[i]*m + carry into z[i], where carry starts as a and is then the high word of the previous step; c gets the final carry word. |
| TEXT ·mulAddVWW(SB), NOSPLIT, $0 |
| MOVD $0, R0 |
| MOVD m+48(FP), R1 |
| MOVD a+56(FP), R2 |
| MOVD z_len+8(FP), R3 |
| MOVD x_base+24(FP), R4 |
| MOVD z_base+0(FP), R5 |
| // compute unrolled loop lengths: R6 = z_len & 3 words for the 1X loop, R3 = z_len >> 2 iterations of the 4X loop |
| MOVD R3, R6 |
| AND $3, R6 |
| SRD $2, R3 |
| loop1: |
| CMPBEQ R6, $0, loop1done |
| loop1cont: |
| // unroll 1X in batches of 1: R11 = x[i]; after the multiply-add below R11 is stored to z[i] and R2 holds the carry word for the next step |
| MOVD 0(R4), R11 |
| // multiply: MLGR leaves the 128-bit product R1*R11 in the R10:R11 pair (high:low); ADDC adds the carry word R2 into the low half and ADDE folds the resulting carry into the high half, written to R2 |
| MLGR R1, R10 |
| ADDC R2, R11 |
| ADDE R0, R10, R2 |
| MOVD R11, 0(R5) |
| LAY 8(R4), R4 // ADD $8, R4 |
| LAY 8(R5), R5 // ADD $8, R5 |
| LAY -1(R6), R6 // ADD $-1, R6 |
| CMPBNE R6, $0, loop1cont |
| loop1done: |
| loop4: |
| CMPBEQ R3, $0, loop4done |
| loop4cont: |
| // unroll 4X in batches of 1: four copies of the 1X step at byte offsets 0, 8, 16 and 24, with R2 threading the carry word through them |
| MOVD 0(R4), R11 |
| // multiply: word at offset 0 |
| MLGR R1, R10 |
| ADDC R2, R11 |
| ADDE R0, R10, R2 |
| MOVD R11, 0(R5) |
| MOVD 8(R4), R11 |
| // multiply: word at offset 8 |
| MLGR R1, R10 |
| ADDC R2, R11 |
| ADDE R0, R10, R2 |
| MOVD R11, 8(R5) |
| MOVD 16(R4), R11 |
| // multiply: word at offset 16 |
| MLGR R1, R10 |
| ADDC R2, R11 |
| ADDE R0, R10, R2 |
| MOVD R11, 16(R5) |
| MOVD 24(R4), R11 |
| // multiply: word at offset 24 |
| MLGR R1, R10 |
| ADDC R2, R11 |
| ADDE R0, R10, R2 |
| MOVD R11, 24(R5) |
| LAY 32(R4), R4 // ADD $32, R4 |
| LAY 32(R5), R5 // ADD $32, R5 |
| LAY -1(R3), R3 // ADD $-1, R3 |
| CMPBNE R3, $0, loop4cont |
| loop4done: |
| MOVD R2, c+64(FP) |
| RET |
| |
| // func addMulVVWW(z, x, y []Word, m, a Word) (c Word): for z_len words stores the low word of x[i] + y[i]*m + carry into z[i], where carry starts as a and is then the overflow word of the previous step; c gets the final carry word. |
| TEXT ·addMulVVWW(SB), NOSPLIT, $0 |
| MOVD $0, R0 |
| MOVD m+72(FP), R1 |
| MOVD a+80(FP), R2 |
| MOVD z_len+8(FP), R3 |
| MOVD x_base+24(FP), R4 |
| MOVD y_base+48(FP), R5 |
| MOVD z_base+0(FP), R6 |
| // compute unrolled loop lengths: R7 = z_len & 3 words for the 1X loop, R3 = z_len >> 2 iterations of the 4X loop |
| MOVD R3, R7 |
| AND $3, R7 |
| SRD $2, R3 |
| loop1: |
| CMPBEQ R7, $0, loop1done |
| loop1cont: |
| // unroll 1X in batches of 1: R8 = x[i], R11 = y[i]; after the multiply and add below R11 is stored to z[i] and R2 holds the carry word for the next step |
| MOVD 0(R4), R8 |
| MOVD 0(R5), R11 |
| // multiply: MLGR leaves the 128-bit product R1*R11 in the R10:R11 pair (high:low); ADDC adds the carry word R2 into the low half and ADDE folds the resulting carry into the high half, written to R2 |
| MLGR R1, R10 |
| ADDC R2, R11 |
| ADDE R0, R10, R2 |
| // add: add the x word into the low half and propagate any carry into R2 |
| ADDC R8, R11 |
| ADDE R0, R2 |
| MOVD R11, 0(R6) |
| LAY 8(R4), R4 // ADD $8, R4 |
| LAY 8(R5), R5 // ADD $8, R5 |
| LAY 8(R6), R6 // ADD $8, R6 |
| LAY -1(R7), R7 // ADD $-1, R7 |
| CMPBNE R7, $0, loop1cont |
| loop1done: |
| loop4: |
| CMPBEQ R3, $0, loop4done |
| loop4cont: |
| // unroll 4X in batches of 1: four copies of the 1X step at byte offsets 0, 8, 16 and 24; R7 (the exhausted 1X counter) is reused to hold the x word |
| MOVD 0(R4), R7 |
| MOVD 0(R5), R11 |
| // multiply: y word at offset 0 times m, plus carry word |
| MLGR R1, R10 |
| ADDC R2, R11 |
| ADDE R0, R10, R2 |
| // add: x word at offset 0 |
| ADDC R7, R11 |
| ADDE R0, R2 |
| MOVD R11, 0(R6) |
| MOVD 8(R4), R7 |
| MOVD 8(R5), R11 |
| // multiply: y word at offset 8 times m, plus carry word |
| MLGR R1, R10 |
| ADDC R2, R11 |
| ADDE R0, R10, R2 |
| // add: x word at offset 8 |
| ADDC R7, R11 |
| ADDE R0, R2 |
| MOVD R11, 8(R6) |
| MOVD 16(R4), R7 |
| MOVD 16(R5), R11 |
| // multiply: y word at offset 16 times m, plus carry word |
| MLGR R1, R10 |
| ADDC R2, R11 |
| ADDE R0, R10, R2 |
| // add: x word at offset 16 |
| ADDC R7, R11 |
| ADDE R0, R2 |
| MOVD R11, 16(R6) |
| MOVD 24(R4), R7 |
| MOVD 24(R5), R11 |
| // multiply: y word at offset 24 times m, plus carry word |
| MLGR R1, R10 |
| ADDC R2, R11 |
| ADDE R0, R10, R2 |
| // add: x word at offset 24 |
| ADDC R7, R11 |
| ADDE R0, R2 |
| MOVD R11, 24(R6) |
| LAY 32(R4), R4 // ADD $32, R4 |
| LAY 32(R5), R5 // ADD $32, R5 |
| LAY 32(R6), R6 // ADD $32, R6 |
| LAY -1(R3), R3 // ADD $-1, R3 |
| CMPBNE R3, $0, loop4cont |
| loop4done: |
| MOVD R2, c+88(FP) |
| RET |