blob: ba1e6118cc8bb9f23dafb34307576ef876c58ed4 [file] [log] [blame]
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build !purego
#include "textflag.h"
// func addMulVVW1024(z, x *uint, y uint) (c uint)
TEXT ·addMulVVW1024(SB), $0-32
MOVD $16, R0
JMP addMulVVWx(SB)
// func addMulVVW1536(z, x *uint, y uint) (c uint)
TEXT ·addMulVVW1536(SB), $0-32
MOVD $24, R0
JMP addMulVVWx(SB)
// func addMulVVW2048(z, x *uint, y uint) (c uint)
TEXT ·addMulVVW2048(SB), $0-32
MOVD $32, R0
JMP addMulVVWx(SB)
TEXT addMulVVWx(SB), NOFRAME|NOSPLIT, $0
MOVD z+0(FP), R1
MOVD x+8(FP), R2
MOVD y+16(FP), R3
MOVD $0, R4
// The main loop of this code operates on a block of 4 words every iteration
// performing [R4:R12:R11:R10:R9] = R4 + R3 * [R8:R7:R6:R5] + [R12:R11:R10:R9]
// where R4 is carried from the previous iteration, R8:R7:R6:R5 hold the next
// 4 words of x, R3 is y and R12:R11:R10:R9 are part of the result z.
loop:
CBZ R0, done
LDP.P 16(R2), (R5, R6)
LDP.P 16(R2), (R7, R8)
LDP (R1), (R9, R10)
ADDS R4, R9
MUL R6, R3, R14
ADCS R14, R10
MUL R7, R3, R15
LDP 16(R1), (R11, R12)
ADCS R15, R11
MUL R8, R3, R16
ADCS R16, R12
UMULH R8, R3, R20
ADC $0, R20
MUL R5, R3, R13
ADDS R13, R9
UMULH R5, R3, R17
ADCS R17, R10
UMULH R6, R3, R21
STP.P (R9, R10), 16(R1)
ADCS R21, R11
UMULH R7, R3, R19
ADCS R19, R12
STP.P (R11, R12), 16(R1)
ADC $0, R20, R4
SUB $4, R0
B loop
done:
MOVD R4, c+24(FP)
RET