// blob: b2af9251245a7eff88b2047695c367eb687fce6d
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Code generated by 'go generate' (with ./internal/asmgen). DO NOT EDIT.
//go:build !math_big_pure_go
#include "textflag.h"
// func addVV(z, x, y []Word) (c Word)
//
// addVV stores x[i] + y[i] + carry into z[i] for each of the z_len words,
// propagating the carry from word to word, and returns the final carry in c.
// Only z_len is loaded; x and y are walked with that same count, so their
// lengths are not checked here.
//
// This target has no carry flag, so each add-with-carry is spelled out as
// two additions, each followed by an unsigned compare that detects
// wrap-around (SGTU a, b, c sets c = 1 when a > b, unsigned).
//
// Register roles:
//	R4  = number of 4-word iterations (z_len >> 2)
//	R5  = x pointer, R6 = y pointer, R7 = z pointer
//	R8  = number of single-word iterations (z_len & 3); reused for data in the 4X loop
//	R28 = running carry, R30 = scratch holding the first partial carry
TEXT ·addVV(SB), NOSPLIT, $0
MOVV z_len+8(FP), R4
MOVV x_base+24(FP), R5
MOVV y_base+48(FP), R6
MOVV z_base+0(FP), R7
// compute unrolled loop lengths:
// R8 = z_len & 3 (leftover words, done first), R4 = z_len >> 2 (groups of four)
AND $3, R4, R8
SRLV $2, R4
XOR R28, R28 // clear carry
loop1:
BEQ R8, loop1done // skip when there are no leftover words
loop1cont:
// unroll 1X
MOVV 0(R5), R9
MOVV 0(R6), R10
ADDVU R10, R9 // ADCS R10, R9, R9 (cr=R28)
SGTU R10, R9, R30 // R30 = 1 if x+y wrapped (y > sum)
ADDVU R28, R9 // add the incoming carry
SGTU R28, R9, R28 // R28 = 1 if adding the carry wrapped
ADDVU R30, R28 // carry out = sum of the two partial carries
MOVV R9, 0(R7)
// advance all three pointers by one 8-byte word
ADDVU $8, R5
ADDVU $8, R6
ADDVU $8, R7
SUBVU $1, R8
BNE R8, loop1cont
loop1done:
loop4:
BEQ R4, loop4done // skip when fewer than four words remain
loop4cont:
// unroll 4X
// load four words of x into R8-R11 and four words of y into R12-R15
MOVV 0(R5), R8
MOVV 8(R5), R9
MOVV 16(R5), R10
MOVV 24(R5), R11
MOVV 0(R6), R12
MOVV 8(R6), R13
MOVV 16(R6), R14
MOVV 24(R6), R15
// word 0: same five-instruction add-with-carry as in the 1X loop
ADDVU R12, R8 // ADCS R12, R8, R8 (cr=R28)
SGTU R12, R8, R30 // ...
ADDVU R28, R8 // ...
SGTU R28, R8, R28 // ...
ADDVU R30, R28 // ...
// word 1
ADDVU R13, R9 // ADCS R13, R9, R9 (cr=R28)
SGTU R13, R9, R30 // ...
ADDVU R28, R9 // ...
SGTU R28, R9, R28 // ...
ADDVU R30, R28 // ...
// word 2
ADDVU R14, R10 // ADCS R14, R10, R10 (cr=R28)
SGTU R14, R10, R30 // ...
ADDVU R28, R10 // ...
SGTU R28, R10, R28 // ...
ADDVU R30, R28 // ...
// word 3
ADDVU R15, R11 // ADCS R15, R11, R11 (cr=R28)
SGTU R15, R11, R30 // ...
ADDVU R28, R11 // ...
SGTU R28, R11, R28 // ...
ADDVU R30, R28 // ...
// store the four sums
MOVV R8, 0(R7)
MOVV R9, 8(R7)
MOVV R10, 16(R7)
MOVV R11, 24(R7)
// advance all three pointers by four words
ADDVU $32, R5
ADDVU $32, R6
ADDVU $32, R7
SUBVU $1, R4
BNE R4, loop4cont
loop4done:
// return the final carry as c
MOVV R28, c+72(FP)
RET
// func subVV(z, x, y []Word) (c Word)
//
// subVV stores x[i] - y[i] - borrow into z[i] for each of the z_len words,
// propagating the borrow from word to word, and returns the final borrow in c.
// Only z_len is loaded; x and y are walked with that same count, so their
// lengths are not checked here.
//
// This target has no carry flag, so each subtract-with-borrow is spelled out
// as two subtractions, each preceded by an unsigned compare that detects
// whether it will wrap (SGTU a, b, c sets c = 1 when a > b, unsigned).
//
// Register roles:
//	R4  = number of 4-word iterations (z_len >> 2)
//	R5  = x pointer, R6 = y pointer, R7 = z pointer
//	R8  = number of single-word iterations (z_len & 3); reused for data in the 4X loop
//	R28 = running borrow, R30 = scratch holding the first partial borrow
TEXT ·subVV(SB), NOSPLIT, $0
MOVV z_len+8(FP), R4
MOVV x_base+24(FP), R5
MOVV y_base+48(FP), R6
MOVV z_base+0(FP), R7
// compute unrolled loop lengths:
// R8 = z_len & 3 (leftover words, done first), R4 = z_len >> 2 (groups of four)
AND $3, R4, R8
SRLV $2, R4
XOR R28, R28 // clear carry
loop1:
BEQ R8, loop1done // skip when there are no leftover words
loop1cont:
// unroll 1X
MOVV 0(R5), R9
MOVV 0(R6), R10
SGTU R28, R9, R30 // SBCS R10, R9, R9
SUBVU R28, R9 // subtract the incoming borrow (R30 = 1 if that wraps)
SGTU R10, R9, R28 // R28 = 1 if subtracting y will wrap (y > x - borrow)
SUBVU R10, R9 // subtract y
ADDVU R30, R28 // borrow out = sum of the two partial borrows
MOVV R9, 0(R7)
// advance all three pointers by one 8-byte word
ADDVU $8, R5
ADDVU $8, R6
ADDVU $8, R7
SUBVU $1, R8
BNE R8, loop1cont
loop1done:
loop4:
BEQ R4, loop4done // skip when fewer than four words remain
loop4cont:
// unroll 4X
// load four words of x into R8-R11 and four words of y into R12-R15
MOVV 0(R5), R8
MOVV 8(R5), R9
MOVV 16(R5), R10
MOVV 24(R5), R11
MOVV 0(R6), R12
MOVV 8(R6), R13
MOVV 16(R6), R14
MOVV 24(R6), R15
// word 0: same five-instruction subtract-with-borrow as in the 1X loop
SGTU R28, R8, R30 // SBCS R12, R8, R8
SUBVU R28, R8 // ...
SGTU R12, R8, R28 // ...
SUBVU R12, R8 // ...
ADDVU R30, R28 // ...
// word 1
SGTU R28, R9, R30 // SBCS R13, R9, R9
SUBVU R28, R9 // ...
SGTU R13, R9, R28 // ...
SUBVU R13, R9 // ...
ADDVU R30, R28 // ...
// word 2
SGTU R28, R10, R30 // SBCS R14, R10, R10
SUBVU R28, R10 // ...
SGTU R14, R10, R28 // ...
SUBVU R14, R10 // ...
ADDVU R30, R28 // ...
// word 3
SGTU R28, R11, R30 // SBCS R15, R11, R11
SUBVU R28, R11 // ...
SGTU R15, R11, R28 // ...
SUBVU R15, R11 // ...
ADDVU R30, R28 // ...
// store the four differences
MOVV R8, 0(R7)
MOVV R9, 8(R7)
MOVV R10, 16(R7)
MOVV R11, 24(R7)
// advance all three pointers by four words
ADDVU $32, R5
ADDVU $32, R6
ADDVU $32, R7
SUBVU $1, R4
BNE R4, loop4cont
loop4done:
// return the final borrow as c
MOVV R28, c+72(FP)
RET
// func lshVU(z, x []Word, s uint) (c Word)
//
// lshVU writes the z_len-word value x shifted left by s bits into z and
// returns in c the bits shifted out of the top word, x[z_len-1] >> (64 - s).
// When z_len is 0 it returns c = 0 without reading x or writing z.
//
// Each output word is (x[i] << s) | (x[i-1] >> (64 - s)); the lowest output
// word is just x[0] << s. Words are processed from the highest index down.
//
// NOTE(review): 64 - s is used directly as a shift count, so this presumably
// requires 0 < s < 64 — confirm against the callers' contract.
//
// Register roles:
//	R4  = remaining word count, then number of 4-word iterations
//	R5  = s, R9 = 64 - s
//	R6  = x pointer, R7 = z pointer (both start one past the last word and move down)
//	R8  = pending high part: (current word << s), waiting for the top bits of the next lower word
//	R10 = number of single-word iterations; reused for data in the 4X loop
TEXT ·lshVU(SB), NOSPLIT, $0
MOVV z_len+8(FP), R4
BEQ R4, ret0 // empty vector: just return c = 0
MOVV s+48(FP), R5
MOVV x_base+24(FP), R6
MOVV z_base+0(FP), R7
// run loop backward
// R8 = z_len * 8 bytes; point R6 and R7 one past the last word
SLLV $3, R4, R8
ADDVU R8, R6
// NOTE(review): R8 already holds z_len << 3 from two lines above and has not
// been modified since, so this recomputation looks redundant.
SLLV $3, R4, R8
ADDVU R8, R7
// shift first word into carry
MOVV -8(R6), R8
MOVV $64, R9
SUBVU R5, R9 // R9 = 64 - s
SRLV R9, R8, R10 // R10 = bits shifted out of the top word
SLLV R5, R8 // R8 = top word << s, pending its low bits
MOVV R10, c+56(FP)
// shift remaining words
SUBVU $1, R4
// compute unrolled loop lengths:
// R10 = count & 3 (leftover words, done first), R4 = count >> 2 (groups of four)
AND $3, R4, R10
SRLV $2, R4
loop1:
BEQ R10, loop1done // skip when there are no leftover words
loop1cont:
// unroll 1X
MOVV -16(R6), R11 // next lower input word
SRLV R9, R11, R12 // its top s bits ...
OR R8, R12 // ... complete the pending output word
SLLV R5, R11, R8 // new pending high part
MOVV R12, -8(R7)
// step both pointers down by one 8-byte word
ADDVU $-8, R6
ADDVU $-8, R7
SUBVU $1, R10
BNE R10, loop1cont
loop1done:
loop4:
BEQ R4, loop4done // skip when fewer than four words remain
loop4cont:
// unroll 4X
// load the next four lower input words
MOVV -16(R6), R10
MOVV -24(R6), R11
MOVV -32(R6), R12
MOVV -40(R6), R13
// Same three-step pattern as the 1X loop, four times. Each input register
// is reused to hold an output word once its own value has been consumed.
SRLV R9, R10, R14
OR R8, R14
SLLV R5, R10, R8
SRLV R9, R11, R10
OR R8, R10
SLLV R5, R11, R8
SRLV R9, R12, R11
OR R8, R11
SLLV R5, R12, R8
SRLV R9, R13, R12
OR R8, R12
SLLV R5, R13, R8
// store the four completed output words
MOVV R14, -8(R7)
MOVV R10, -16(R7)
MOVV R11, -24(R7)
MOVV R12, -32(R7)
// step both pointers down by four words
ADDVU $-32, R6
ADDVU $-32, R7
SUBVU $1, R4
BNE R4, loop4cont
loop4done:
// store final shifted bits
// (the lowest output word has no lower neighbour to take bits from)
MOVV R8, -8(R7)
RET
ret0:
// z_len == 0: R0 is the zero register, so c = 0
MOVV R0, c+56(FP)
RET
// func rshVU(z, x []Word, s uint) (c Word)
//
// rshVU writes the z_len-word value x shifted right by s bits into z and
// returns in c the bits shifted out of the bottom word, x[0] << (64 - s).
// When z_len is 0 it returns c = 0 without reading x or writing z.
//
// Each output word is (x[i] >> s) | (x[i+1] << (64 - s)); the highest output
// word is just x[z_len-1] >> s. Words are processed from index 0 upward.
//
// NOTE(review): 64 - s is used directly as a shift count, so this presumably
// requires 0 < s < 64 — confirm against the callers' contract.
//
// Register roles:
//	R4  = remaining word count, then number of 4-word iterations
//	R5  = s, R9 = 64 - s
//	R6  = x pointer, R7 = z pointer (both move up)
//	R8  = pending low part: (current word >> s), waiting for the low bits of the next higher word
//	R10 = number of single-word iterations; reused for data in the 4X loop
TEXT ·rshVU(SB), NOSPLIT, $0
MOVV z_len+8(FP), R4
BEQ R4, ret0 // empty vector: just return c = 0
MOVV s+48(FP), R5
MOVV x_base+24(FP), R6
MOVV z_base+0(FP), R7
// shift first word into carry
MOVV 0(R6), R8
MOVV $64, R9
SUBVU R5, R9 // R9 = 64 - s
SLLV R9, R8, R10 // R10 = bits shifted out of the bottom word
SRLV R5, R8 // R8 = bottom word >> s, pending its high bits
MOVV R10, c+56(FP)
// shift remaining words
SUBVU $1, R4
// compute unrolled loop lengths:
// R10 = count & 3 (leftover words, done first), R4 = count >> 2 (groups of four)
AND $3, R4, R10
SRLV $2, R4
loop1:
BEQ R10, loop1done // skip when there are no leftover words
loop1cont:
// unroll 1X
MOVV 8(R6), R11 // next higher input word
SLLV R9, R11, R12 // its low s bits, moved to the top ...
OR R8, R12 // ... complete the pending output word
SRLV R5, R11, R8 // new pending low part
MOVV R12, 0(R7)
// step both pointers up by one 8-byte word
ADDVU $8, R6
ADDVU $8, R7
SUBVU $1, R10
BNE R10, loop1cont
loop1done:
loop4:
BEQ R4, loop4done // skip when fewer than four words remain
loop4cont:
// unroll 4X
// load the next four higher input words
MOVV 8(R6), R10
MOVV 16(R6), R11
MOVV 24(R6), R12
MOVV 32(R6), R13
// Same three-step pattern as the 1X loop, four times. Each input register
// is reused to hold an output word once its own value has been consumed.
SLLV R9, R10, R14
OR R8, R14
SRLV R5, R10, R8
SLLV R9, R11, R10
OR R8, R10
SRLV R5, R11, R8
SLLV R9, R12, R11
OR R8, R11
SRLV R5, R12, R8
SLLV R9, R13, R12
OR R8, R12
SRLV R5, R13, R8
// store the four completed output words
MOVV R14, 0(R7)
MOVV R10, 8(R7)
MOVV R11, 16(R7)
MOVV R12, 24(R7)
// step both pointers up by four words
ADDVU $32, R6
ADDVU $32, R7
SUBVU $1, R4
BNE R4, loop4cont
loop4done:
// store final shifted bits
// (the highest output word has no higher neighbour to take bits from)
MOVV R8, 0(R7)
RET
ret0:
// z_len == 0: R0 is the zero register, so c = 0
MOVV R0, c+56(FP)
RET
// func mulAddVWW(z, x []Word, m, a Word) (c Word)
//
// mulAddVWW computes, for each of the z_len words, the 128-bit value
// x[i]*m + carry, stores its low 64 bits in z[i] and keeps its high 64 bits
// as the carry into the next word. The carry starts as a and its final value
// is returned in c. Only z_len is loaded; x is walked with that same count,
// so its length is not checked here.
//
// Register roles:
//	R4  = m
//	R5  = running carry word (initially a)
//	R6  = number of 4-word iterations (z_len >> 2)
//	R7  = x pointer, R8 = z pointer
//	R9  = number of single-word iterations (z_len & 3); reused for data in the 4X loop
//	R28 = carry bit out of the low-word addition
TEXT ·mulAddVWW(SB), NOSPLIT, $0
MOVV m+48(FP), R4
MOVV a+56(FP), R5
MOVV z_len+8(FP), R6
MOVV x_base+24(FP), R7
MOVV z_base+0(FP), R8
// compute unrolled loop lengths:
// R9 = z_len & 3 (leftover words, done first), R6 = z_len >> 2 (groups of four)
AND $3, R6, R9
SRLV $2, R6
loop1:
BEQ R9, loop1done // skip when there are no leftover words
loop1cont:
// unroll 1X
MOVV 0(R7), R10
// synthetic carry, one column at a time
MULV R4, R10, R11 // R11 = low 64 bits of x[i]*m
MULHVU R4, R10, R12 // R12 = high 64 bits of x[i]*m (unsigned)
ADDVU R5, R11, R10 // ADDS R5, R11, R10 (cr=R28)
SGTU R5, R10, R28 // R28 = 1 if low + carry wrapped (carry > sum)
ADDVU R28, R12, R5 // ADC $0, R12, R5
MOVV R10, 0(R8)
// advance both pointers by one 8-byte word
ADDVU $8, R7
ADDVU $8, R8
SUBVU $1, R9
BNE R9, loop1cont
loop1done:
loop4:
BEQ R6, loop4done // skip when fewer than four words remain
loop4cont:
// unroll 4X
// load four words of x into R9-R12
MOVV 0(R7), R9
MOVV 8(R7), R10
MOVV 16(R7), R11
MOVV 24(R7), R12
// synthetic carry, one column at a time
// Each column repeats the 1X pattern, using R13/R14 for the product halves
// and overwriting the input register with the output word.
// word 0
MULV R4, R9, R13
MULHVU R4, R9, R14
ADDVU R5, R13, R9 // ADDS R5, R13, R9 (cr=R28)
SGTU R5, R9, R28 // ...
ADDVU R28, R14, R5 // ADC $0, R14, R5
// word 1
MULV R4, R10, R13
MULHVU R4, R10, R14
ADDVU R5, R13, R10 // ADDS R5, R13, R10 (cr=R28)
SGTU R5, R10, R28 // ...
ADDVU R28, R14, R5 // ADC $0, R14, R5
// word 2
MULV R4, R11, R13
MULHVU R4, R11, R14
ADDVU R5, R13, R11 // ADDS R5, R13, R11 (cr=R28)
SGTU R5, R11, R28 // ...
ADDVU R28, R14, R5 // ADC $0, R14, R5
// word 3
MULV R4, R12, R13
MULHVU R4, R12, R14
ADDVU R5, R13, R12 // ADDS R5, R13, R12 (cr=R28)
SGTU R5, R12, R28 // ...
ADDVU R28, R14, R5 // ADC $0, R14, R5
// store the four output words
MOVV R9, 0(R8)
MOVV R10, 8(R8)
MOVV R11, 16(R8)
MOVV R12, 24(R8)
// advance both pointers by four words
ADDVU $32, R7
ADDVU $32, R8
SUBVU $1, R6
BNE R6, loop4cont
loop4done:
// return the final carry word as c
MOVV R5, c+64(FP)
RET
// func addMulVVWW(z, x, y []Word, m, a Word) (c Word)
//
// addMulVVWW computes, for each of the z_len words, the 128-bit value
// x[i] + y[i]*m + carry, stores its low 64 bits in z[i] and keeps its high
// 64 bits as the carry into the next word. The carry starts as a and its
// final value is returned in c. Only z_len is loaded; x and y are walked with
// that same count, so their lengths are not checked here.
//
// Register roles:
//	R4  = m
//	R5  = running carry word (initially a)
//	R6  = number of 4-word iterations (z_len >> 2)
//	R7  = x pointer, R8 = y pointer, R9 = z pointer
//	R10 = number of single-word iterations (z_len & 3); reused for data in the 4X loop
//	R28 = carry bit out of each low-word addition
TEXT ·addMulVVWW(SB), NOSPLIT, $0
MOVV m+72(FP), R4
MOVV a+80(FP), R5
MOVV z_len+8(FP), R6
MOVV x_base+24(FP), R7
MOVV y_base+48(FP), R8
MOVV z_base+0(FP), R9
// compute unrolled loop lengths:
// R10 = z_len & 3 (leftover words, done first), R6 = z_len >> 2 (groups of four)
AND $3, R6, R10
SRLV $2, R6
loop1:
BEQ R10, loop1done // skip when there are no leftover words
loop1cont:
// unroll 1X
MOVV 0(R7), R11
MOVV 0(R8), R12
// synthetic carry, one column at a time
MULV R4, R12, R13 // R13 = low 64 bits of y[i]*m
MULHVU R4, R12, R14 // R14 = high 64 bits of y[i]*m (unsigned)
ADDVU R11, R13 // ADDS R11, R13, R13 (cr=R28)
SGTU R11, R13, R28 // R28 = 1 if adding x[i] wrapped
ADDVU R28, R14 // ADC $0, R14, R14
ADDVU R5, R13, R12 // ADDS R5, R13, R12 (cr=R28)
SGTU R5, R12, R28 // R28 = 1 if adding the carry word wrapped
ADDVU R28, R14, R5 // ADC $0, R14, R5
MOVV R12, 0(R9)
// advance all three pointers by one 8-byte word
ADDVU $8, R7
ADDVU $8, R8
ADDVU $8, R9
SUBVU $1, R10
BNE R10, loop1cont
loop1done:
loop4:
BEQ R6, loop4done // skip when fewer than four words remain
loop4cont:
// unroll 4X
// load four words of x into R10-R13 and four words of y into R14-R17
MOVV 0(R7), R10
MOVV 8(R7), R11
MOVV 16(R7), R12
MOVV 24(R7), R13
MOVV 0(R8), R14
MOVV 8(R8), R15
MOVV 16(R8), R16
MOVV 24(R8), R17
// synthetic carry, one column at a time
// Each column repeats the 1X pattern, using R18/R19 for the product halves
// and overwriting the y register with the output word.
// word 0
MULV R4, R14, R18
MULHVU R4, R14, R19
ADDVU R10, R18 // ADDS R10, R18, R18 (cr=R28)
SGTU R10, R18, R28 // ...
ADDVU R28, R19 // ADC $0, R19, R19
ADDVU R5, R18, R14 // ADDS R5, R18, R14 (cr=R28)
SGTU R5, R14, R28 // ...
ADDVU R28, R19, R5 // ADC $0, R19, R5
// word 1
MULV R4, R15, R18
MULHVU R4, R15, R19
ADDVU R11, R18 // ADDS R11, R18, R18 (cr=R28)
SGTU R11, R18, R28 // ...
ADDVU R28, R19 // ADC $0, R19, R19
ADDVU R5, R18, R15 // ADDS R5, R18, R15 (cr=R28)
SGTU R5, R15, R28 // ...
ADDVU R28, R19, R5 // ADC $0, R19, R5
// word 2
MULV R4, R16, R18
MULHVU R4, R16, R19
ADDVU R12, R18 // ADDS R12, R18, R18 (cr=R28)
SGTU R12, R18, R28 // ...
ADDVU R28, R19 // ADC $0, R19, R19
ADDVU R5, R18, R16 // ADDS R5, R18, R16 (cr=R28)
SGTU R5, R16, R28 // ...
ADDVU R28, R19, R5 // ADC $0, R19, R5
// word 3
MULV R4, R17, R18
MULHVU R4, R17, R19
ADDVU R13, R18 // ADDS R13, R18, R18 (cr=R28)
SGTU R13, R18, R28 // ...
ADDVU R28, R19 // ADC $0, R19, R19
ADDVU R5, R18, R17 // ADDS R5, R18, R17 (cr=R28)
SGTU R5, R17, R28 // ...
ADDVU R28, R19, R5 // ADC $0, R19, R5
// store the four output words
MOVV R14, 0(R9)
MOVV R15, 8(R9)
MOVV R16, 16(R9)
MOVV R17, 24(R9)
// advance all three pointers by four words
ADDVU $32, R7
ADDVU $32, R8
ADDVU $32, R9
SUBVU $1, R6
BNE R6, loop4cont
loop4done:
// return the final carry word as c
MOVV R5, c+88(FP)
RET