blob: 874930352b9a56a004fe7afecf261480670b9f8b [file] [log] [blame] [edit]
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Code generated by 'go generate' (with ./internal/asmgen). DO NOT EDIT.
//go:build !math_big_pure_go
#include "textflag.h"
// func addVV(z, x, y []Word) (c Word)
//
// addVV adds x and y word by word with carry propagation, writes z_len
// result words through z_base, and returns the final carry (0 or 1) in c.
// Only z_len is read; the lengths of x and y are not checked here.
//
// Register use:
//   R0      count of 4-word iterations (z_len / 4)
//   R1      read pointer into x; reused at the end to hold c
//   R2      read pointer into y
//   R3      write pointer into z
//   R4      count of 1-word iterations (z_len % 4); data word in the 4X loop
//   R5-R11  data words
//
// Invariant: the carry flag is live from the ADDS that clears it until the
// final ADC. The loop counters are maintained with SUB and CBZ/CBNZ, which
// do not write the condition flags, so the carry chain is never broken.
TEXT ·addVV(SB), NOSPLIT, $0
MOVD z_len+8(FP), R0
MOVD x_base+24(FP), R1
MOVD y_base+48(FP), R2
MOVD z_base+0(FP), R3
// compute unrolled loop lengths
// R4 = z_len % 4 (words handled one at a time), R0 = z_len / 4 (groups of four)
AND $3, R0, R4
LSR $2, R0
// Adding zero cannot produce a carry, so this leaves C = 0 for the first ADCS.
ADDS ZR, R0 // clear carry
loop1:
CBZ R4, loop1done
loop1cont:
// unroll 1X
// z[i] = x[i] + y[i] + carry; all three pointers advance by one 8-byte word.
MOVD.P 8(R1), R5
MOVD.P 8(R2), R6
ADCS R6, R5
MOVD.P R5, 8(R3)
SUB $1, R4
CBNZ R4, loop1cont
loop1done:
loop4:
CBZ R0, loop4done
loop4cont:
// unroll 4X
// The post-indexed LDP advances the pointer by 32 bytes after loading words
// 0-1; the following LDP at offset -16 then picks up words 2-3 of the group.
LDP.P 32(R1), (R4, R5)
LDP -16(R1), (R6, R7)
LDP.P 32(R2), (R8, R9)
LDP -16(R2), (R10, R11)
// Four chained adds, lowest word first, each consuming the previous carry.
ADCS R8, R4
ADCS R9, R5
ADCS R10, R6
ADCS R11, R7
// Same addressing pattern as the loads: store words 0-1, bump by 32, then 2-3.
STP.P (R4, R5), 32(R3)
STP (R6, R7), -16(R3)
SUB $1, R0
CBNZ R0, loop4cont
loop4done:
// R1 = 0 + 0 + C, i.e. the carry flag materialized as a 0/1 word.
ADC ZR, ZR, R1 // save & convert add carry
MOVD R1, c+72(FP)
RET
// func subVV(z, x, y []Word) (c Word)
//
// subVV subtracts y from x word by word with borrow propagation, writes
// z_len result words through z_base, and returns the final borrow (0 or 1)
// in c. Only z_len is read; the lengths of x and y are not checked here.
//
// Register use:
//   R0      count of 4-word iterations (z_len / 4)
//   R1      read pointer into x; reused at the end to hold c
//   R2      read pointer into y
//   R3      write pointer into z
//   R4      count of 1-word iterations (z_len % 4); data word in the 4X loop
//   R5-R11  data words
//
// Invariant: the carry flag holds the (inverted) borrow from the SUBS below
// until the final SBC. The loop counters are maintained with SUB and
// CBZ/CBNZ, which do not write the condition flags.
TEXT ·subVV(SB), NOSPLIT, $0
MOVD z_len+8(FP), R0
MOVD x_base+24(FP), R1
MOVD y_base+48(FP), R2
MOVD z_base+0(FP), R3
// compute unrolled loop lengths
// R4 = z_len % 4 (words handled one at a time), R0 = z_len / 4 (groups of four)
AND $3, R0, R4
LSR $2, R0
// Subtracting zero never borrows, so this puts the flags in the "no borrow"
// state expected by the first SBCS.
SUBS ZR, R0 // clear carry
loop1:
CBZ R4, loop1done
loop1cont:
// unroll 1X
// z[i] = x[i] - y[i] - borrow; all three pointers advance by one 8-byte word.
MOVD.P 8(R1), R5
MOVD.P 8(R2), R6
SBCS R6, R5
MOVD.P R5, 8(R3)
SUB $1, R4
CBNZ R4, loop1cont
loop1done:
loop4:
CBZ R0, loop4done
loop4cont:
// unroll 4X
// The post-indexed LDP advances the pointer by 32 bytes after loading words
// 0-1; the following LDP at offset -16 then picks up words 2-3 of the group.
LDP.P 32(R1), (R4, R5)
LDP -16(R1), (R6, R7)
LDP.P 32(R2), (R8, R9)
LDP -16(R2), (R10, R11)
// Four chained subtracts, lowest word first, each consuming the previous borrow.
SBCS R8, R4
SBCS R9, R5
SBCS R10, R6
SBCS R11, R7
// Same addressing pattern as the loads: store words 0-1, bump by 32, then 2-3.
STP.P (R4, R5), 32(R3)
STP (R6, R7), -16(R3)
SUB $1, R0
CBNZ R0, loop4cont
loop4done:
// R1 - R1 - borrow yields 0 when there was no borrow and -1 (all ones) otherwise.
SBC R1, R1 // save carry
// Negate (0 - R1) to turn the 0/-1 value into the 0/1 word returned in c.
SUB R1, ZR, R1 // convert sub carry
MOVD R1, c+72(FP)
RET
// func lshVU(z, x []Word, s uint) (c Word)
//
// lshVU shifts the z_len-word vector x left by s bits, writing the result
// through z_base and returning in c the bits shifted out of the top word.
// When z_len is zero nothing is read or written and c is 0.
//
// The vector is walked from the highest word down to the lowest using
// pre-decrementing (.W) addressing.
//
// NOTE(review): the code computes 64-s as a register shift count without
// special-casing s == 0; presumably callers guarantee 0 < s < 64 - confirm.
//
// Register use:
//   R0      z_len, then count of 4-word iterations over the remaining words
//   R1      s (left shift count)
//   R2      read pointer into x, starts one past the last word
//   R3      write pointer into z, starts one past the last word
//   R4      bits of the previous (higher) word already shifted left by s,
//           waiting to be combined with the next lower word
//   R5      64 - s (right shift count)
//   R6      carry-out, then count of 1-word iterations, then data in the 4X loop
//   R7-R10  data words
TEXT ·lshVU(SB), NOSPLIT, $0
MOVD z_len+8(FP), R0
CBZ R0, ret0
MOVD s+48(FP), R1
MOVD x_base+24(FP), R2
MOVD z_base+0(FP), R3
// run loop backward
// Advance both pointers by z_len*8 bytes so they point just past the end.
ADD R0<<3, R2, R2
ADD R0<<3, R3, R3
// shift first word into carry
// R4 = top word of x. Its high bits (R4 >> (64-s)) are the return value; its
// low bits (R4 << s) stay in R4 as the pending part of the top output word.
MOVD.W -8(R2), R4
MOVD $64, R5
SUB R1, R5
LSR R5, R4, R6
LSL R1, R4
// c is stored now, which frees R6 for reuse as a loop counter below.
MOVD R6, c+56(FP)
// shift remaining words
SUB $1, R0
// compute unrolled loop lengths
// R6 = (z_len-1) % 4 single-word steps, R0 = (z_len-1) / 4 four-word steps
AND $3, R0, R6
LSR $2, R0
loop1:
CBZ R6, loop1done
loop1cont:
// unroll 1X
// Load the next lower word, OR its high bits into the pending value to
// finish one output word, and keep its low bits (shifted left) as the new
// pending value.
MOVD.W -8(R2), R7
LSR R5, R7, R8
ORR R4, R8
LSL R1, R7, R4
MOVD.W R8, -8(R3)
SUB $1, R6
CBNZ R6, loop1cont
loop1done:
loop4:
CBZ R0, loop4done
loop4cont:
// unroll 4X
// Pre-decrement R2 by 32 and load the four words of this group. Register
// numbers descend with address: R9 is the lowest word, R6 the highest.
LDP.W -32(R2), (R9, R8)
LDP 16(R2), (R7, R6)
// Process from the highest word (R6) down to the lowest (R9). Each step
// writes its finished output word into a register whose input has already
// been consumed: outputs end up in R10, R6, R7, R8 (highest to lowest).
LSR R5, R6, R10
ORR R4, R10
LSL R1, R6, R4
LSR R5, R7, R6
ORR R4, R6
LSL R1, R7, R4
LSR R5, R8, R7
ORR R4, R7
LSL R1, R8, R4
LSR R5, R9, R8
ORR R4, R8
LSL R1, R9, R4
// Pre-decrement R3 by 32 and store the four outputs in ascending address order.
STP.W (R8, R7), -32(R3)
STP (R6, R10), 16(R3)
SUB $1, R0
CBNZ R0, loop4cont
loop4done:
// store final shifted bits
// R4 holds the lowest input word shifted left by s; it becomes the lowest
// output word, with nothing below it to OR in.
MOVD.W R4, -8(R3)
RET
ret0:
// Empty vector: return c = 0.
MOVD ZR, c+56(FP)
RET
// func rshVU(z, x []Word, s uint) (c Word)
//
// rshVU shifts the z_len-word vector x right by s bits, writing the result
// through z_base and returning in c the bits shifted out of the lowest word
// (placed in the high bits of c by the left shift of 64-s).
// When z_len is zero nothing is read or written and c is 0.
//
// The vector is walked from the lowest word up to the highest using
// post-incrementing (.P) addressing. The write pointer lags the read pointer
// by one word, because each output word needs bits from the next input word.
//
// NOTE(review): the code computes 64-s as a register shift count without
// special-casing s == 0; presumably callers guarantee 0 < s < 64 - confirm.
//
// Register use:
//   R0      z_len, then count of 4-word iterations over the remaining words
//   R1      s (right shift count)
//   R2      read pointer into x
//   R3      write pointer into z
//   R4      bits of the previous (lower) word already shifted right by s,
//           waiting to be combined with the next higher word
//   R5      64 - s (left shift count)
//   R6      carry-out, then count of 1-word iterations, then data in the 4X loop
//   R7-R10  data words
TEXT ·rshVU(SB), NOSPLIT, $0
MOVD z_len+8(FP), R0
CBZ R0, ret0
MOVD s+48(FP), R1
MOVD x_base+24(FP), R2
MOVD z_base+0(FP), R3
// shift first word into carry
// R4 = lowest word of x. Its low bits (R4 << (64-s)) are the return value;
// its high bits (R4 >> s) stay in R4 as the pending part of the lowest
// output word.
MOVD.P 8(R2), R4
MOVD $64, R5
SUB R1, R5
LSL R5, R4, R6
LSR R1, R4
// c is stored now, which frees R6 for reuse as a loop counter below.
MOVD R6, c+56(FP)
// shift remaining words
SUB $1, R0
// compute unrolled loop lengths
// R6 = (z_len-1) % 4 single-word steps, R0 = (z_len-1) / 4 four-word steps
AND $3, R0, R6
LSR $2, R0
loop1:
CBZ R6, loop1done
loop1cont:
// unroll 1X
// Load the next higher word, OR its low bits (moved to the top) into the
// pending value to finish one output word, and keep its high bits (shifted
// right) as the new pending value.
MOVD.P 8(R2), R7
LSL R5, R7, R8
ORR R4, R8
LSR R1, R7, R4
MOVD.P R8, 8(R3)
SUB $1, R6
CBNZ R6, loop1cont
loop1done:
loop4:
CBZ R0, loop4done
loop4cont:
// unroll 4X
// Load four consecutive words (R6 lowest ... R9 highest) and advance R2 by 32.
LDP.P 32(R2), (R6, R7)
LDP -16(R2), (R8, R9)
// Process from the lowest word (R6) up to the highest (R9). Each step writes
// its finished output word into a register whose input has already been
// consumed: outputs end up in R10, R6, R7, R8 (lowest to highest).
LSL R5, R6, R10
ORR R4, R10
LSR R1, R6, R4
LSL R5, R7, R6
ORR R4, R6
LSR R1, R7, R4
LSL R5, R8, R7
ORR R4, R7
LSR R1, R8, R4
LSL R5, R9, R8
ORR R4, R8
LSR R1, R9, R4
// Store the four outputs in ascending address order and advance R3 by 32.
STP.P (R10, R6), 32(R3)
STP (R7, R8), -16(R3)
SUB $1, R0
CBNZ R0, loop4cont
loop4done:
// store final shifted bits
// R4 holds the highest input word shifted right by s; it becomes the highest
// output word, with nothing above it to OR in.
MOVD.P R4, 8(R3)
RET
ret0:
// Empty vector: return c = 0.
MOVD ZR, c+56(FP)
RET
// func mulAddVWW(z, x []Word, m, a Word) (c Word)
//
// mulAddVWW multiplies each word of x by m, adding a running carry word that
// starts as a, writes the low halves as z_len result words through z_base,
// and returns the final carry word in c. In effect z = x*m + a with c as the
// word that overflows past the top of z. Only z_len is read; the length of x
// is not checked here. With z_len == 0 both loops are skipped and c = a.
//
// Register use:
//   R0       m
//   R1       running carry word (initially a, finally c)
//   R2       z_len, then count of 8-word iterations
//   R3       read pointer into x
//   R4       write pointer into z
//   R5       count of 1-word iterations (z_len % 8); data word in the 8X loop
//   R6-R12   data words / low product halves
//   R7       high product half in the 1X loop
//   R13,R14  alternating high product halves in the 8X loop
TEXT ·mulAddVWW(SB), NOSPLIT, $0
MOVD m+48(FP), R0
MOVD a+56(FP), R1
MOVD z_len+8(FP), R2
MOVD x_base+24(FP), R3
MOVD z_base+0(FP), R4
// compute unrolled loop lengths
// R5 = z_len % 8 (words handled one at a time), R2 = z_len / 8 (groups of eight)
AND $7, R2, R5
LSR $3, R2
loop1:
CBZ R5, loop1done
loop1cont:
// unroll 1X
MOVD.P 8(R3), R6
// multiply
// R7:R6 = x[i] * m (UMULH gives the high half, MUL the low half). Adding the
// incoming carry word to the low half may carry into the high half; the
// resulting high half becomes the carry word for the next element.
UMULH R0, R6, R7
MUL R0, R6
ADDS R1, R6
ADC ZR, R7, R1
MOVD.P R6, 8(R4)
SUB $1, R5
CBNZ R5, loop1cont
loop1done:
loop8:
CBZ R2, loop8done
loop8cont:
// unroll 8X
// One post-indexed LDP advances R3 by 64; the other three use negative
// offsets from the advanced pointer to reach the rest of the group.
LDP.P 64(R3), (R5, R6)
LDP -48(R3), (R7, R8)
LDP -32(R3), (R9, R10)
LDP -16(R3), (R11, R12)
// multiply
// A single carry chain runs through all eight words: the low half of each
// product is added to the high half of the previous product plus the carry
// flag. MUL and UMULH do not write the flags, so they can sit between the
// ADCS instructions. R13 and R14 alternate as the "previous high half".
UMULH R0, R5, R13
MUL R0, R5
ADDS R1, R5
UMULH R0, R6, R14
MUL R0, R6
ADCS R13, R6
UMULH R0, R7, R13
MUL R0, R7
ADCS R14, R7
UMULH R0, R8, R14
MUL R0, R8
ADCS R13, R8
UMULH R0, R9, R13
MUL R0, R9
ADCS R14, R9
UMULH R0, R10, R14
MUL R0, R10
ADCS R13, R10
UMULH R0, R11, R13
MUL R0, R11
ADCS R14, R11
UMULH R0, R12, R14
MUL R0, R12
ADCS R13, R12
// Fold the last high half and the outstanding carry flag into the carry
// word for the next group (or the return value).
ADC ZR, R14, R1
STP.P (R5, R6), 64(R4)
STP (R7, R8), -48(R4)
STP (R9, R10), -32(R4)
STP (R11, R12), -16(R4)
SUB $1, R2
CBNZ R2, loop8cont
loop8done:
MOVD R1, c+64(FP)
RET
// func addMulVVWW(z, x, y []Word, m, a Word) (c Word)
//
// addMulVVWW multiplies each word of y by m, adds a running carry word that
// starts as a, adds the corresponding word of x, writes the z_len result
// words through z_base, and returns the final carry word in c. In effect
// z = x + y*m + a with c as the word that overflows past the top of z.
// Only z_len is read; the lengths of x and y are not checked here. With
// z_len == 0 both loops are skipped and c = a.
//
// Register use:
//   R0       m
//   R1       running carry word (initially a, finally c)
//   R2       z_len, then count of 8-word iterations
//   R3       read pointer into x
//   R4       read pointer into y
//   R5       write pointer into z
//   R6       count of 1-word iterations (z_len % 8); x data in the 8X loop
//   R6-R13   x words in the 8X loop (R7 in the 1X loop)
//   R14-R17, R19-R22  y words / results in the 8X loop (R8 in the 1X loop)
//   R9       high product half in the 1X loop
//   R23,R24  alternating high product halves in the 8X loop
//
// R18 is deliberately not used - presumably because it is reserved as the
// platform register on some operating systems; confirm against the Go
// assembler documentation.
TEXT ·addMulVVWW(SB), NOSPLIT, $0
MOVD m+72(FP), R0
MOVD a+80(FP), R1
MOVD z_len+8(FP), R2
MOVD x_base+24(FP), R3
MOVD y_base+48(FP), R4
MOVD z_base+0(FP), R5
// compute unrolled loop lengths
// R6 = z_len % 8 (words handled one at a time), R2 = z_len / 8 (groups of eight)
AND $7, R2, R6
LSR $3, R2
loop1:
CBZ R6, loop1done
loop1cont:
// unroll 1X
MOVD.P 8(R3), R7
MOVD.P 8(R4), R8
// multiply
// R9:R8 = y[i] * m; add the incoming carry word to the low half and let the
// high half (plus any carry out) become the new carry word.
UMULH R0, R8, R9
MUL R0, R8
ADDS R1, R8
ADC ZR, R9, R1
// add
// Add x[i] to the low half; a carry out of this add bumps the carry word.
ADDS R7, R8
ADC ZR, R1
MOVD.P R8, 8(R5)
SUB $1, R6
CBNZ R6, loop1cont
loop1done:
loop8:
CBZ R2, loop8done
loop8cont:
// unroll 8X
// For each source, one post-indexed LDP advances the pointer by 64 and the
// other three use negative offsets from the advanced pointer.
LDP.P 64(R3), (R6, R7)
LDP -48(R3), (R8, R9)
LDP -32(R3), (R10, R11)
LDP -16(R3), (R12, R13)
LDP.P 64(R4), (R14, R15)
LDP -48(R4), (R16, R17)
LDP -32(R4), (R19, R20)
LDP -16(R4), (R21, R22)
// multiply
// First carry chain: the low half of each y[i]*m is added to the high half
// of the previous product plus the carry flag. MUL and UMULH do not write
// the flags, so they can sit between the ADCS instructions. R23 and R24
// alternate as the "previous high half".
UMULH R0, R14, R23
MUL R0, R14
ADDS R1, R14
UMULH R0, R15, R24
MUL R0, R15
ADCS R23, R15
UMULH R0, R16, R23
MUL R0, R16
ADCS R24, R16
UMULH R0, R17, R24
MUL R0, R17
ADCS R23, R17
UMULH R0, R19, R23
MUL R0, R19
ADCS R24, R19
UMULH R0, R20, R24
MUL R0, R20
ADCS R23, R20
UMULH R0, R21, R23
MUL R0, R21
ADCS R24, R21
UMULH R0, R22, R24
MUL R0, R22
ADCS R23, R22
// Close the first chain: last high half plus the outstanding carry flag.
ADC ZR, R24, R1
// add
// Second carry chain: add the eight x words to the eight product words,
// then fold the final carry flag into the carry word.
ADDS R6, R14
ADCS R7, R15
ADCS R8, R16
ADCS R9, R17
ADCS R10, R19
ADCS R11, R20
ADCS R12, R21
ADCS R13, R22
ADC ZR, R1
STP.P (R14, R15), 64(R5)
STP (R16, R17), -48(R5)
STP (R19, R20), -32(R5)
STP (R21, R22), -16(R5)
SUB $1, R2
CBNZ R2, loop8cont
loop8done:
MOVD R1, c+88(FP)
RET