blob: 5ed3de68e2aa2e33ce2ffaf977da18369091b09f [file] [log] [blame]
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !math_big_pure_go,ppc64 !math_big_pure_go,ppc64le
#include "textflag.h"
// This file provides fast assembly versions for the elementary
// arithmetic operations on vectors implemented in arith.go.
// func mulWW(x, y Word) (z1, z0 Word)
// Full 128-bit unsigned product of x and y: z1 receives the upper
// 64 bits, z0 the lower 64 bits.
TEXT ·mulWW(SB), NOSPLIT, $0
	MOVD y+8(FP), R4
	MOVD x+0(FP), R3
	MULLD R3, R4, R5	// low word of x*y
	MULHDU R3, R4, R6	// high word of x*y (unsigned)
	MOVD R5, z0+24(FP)
	MOVD R6, z1+16(FP)
	RET
// func addVV(z, x, y []Word) (c Word)
// z[i] = x[i] + y[i] for all i, carrying; c is the carry out of the
// last word (0 when len(z) == 0).
TEXT ·addVV(SB), NOSPLIT, $0
	MOVD z+0(FP), R3	// R3 = &z[0]
	MOVD x+24(FP), R4	// R4 = &x[0]
	MOVD y+48(FP), R5	// R5 = &y[0]
	MOVD z_len+8(FP), R6	// R6 = len(z), the iteration count
	MOVD R0, R7		// R7 = byte offset of element i
	MOVD R0, R8		// R8 = c = 0
	ADDC R8, R8		// 0 + 0: R8 stays 0 and CA is cleared
	CMP R0, R6
	BEQ finish		// nothing to add
	MOVD R6, CTR
next:
	MOVD (R4)(R7), R9	// x[i]
	MOVD (R5)(R7), R10	// y[i]
	ADDE R10, R9, R11	// x[i] + y[i] + CA, carry out goes back to CA
	MOVD R11, (R3)(R7)	// z[i]
	ADD $8, R7		// next word; CA must survive to the next ADDE
	BC 16, 0, next		// bdnz
finish:
	ADDZE R8		// c = 0 + CA
	MOVD R8, c+72(FP)
	RET
// func subVV(z, x, y []Word) (c Word)
// z[i] = x[i] - y[i] for all i, borrowing; c is 1 if the last word
// borrowed and 0 otherwise (0 when len(z) == 0).
TEXT ·subVV(SB), NOSPLIT, $0
	MOVD z+0(FP), R3	// R3 = &z[0]
	MOVD x+24(FP), R4	// R4 = &x[0]
	MOVD y+48(FP), R5	// R5 = &y[0]
	MOVD z_len+8(FP), R6	// R6 = len(z), the iteration count
	MOVD R0, R7		// R7 = byte offset of element i
	MOVD R0, R8		// R8 = c = 0
	// 0 - 0 leaves R0 at zero and CA = 1. For the subtract-extended
	// chain below CA = 1 means "no borrow pending"; the result is
	// inverted before it is returned.
	SUBC R0, R0
	CMP R0, R6
	BEQ finish		// nothing to subtract
	MOVD R6, CTR
	// amd64 saves and restores CF around its loop bookkeeping; here
	// nothing in the loop other than SUBE changes CA, so it is simply
	// read back once after the loop.
next:
	MOVD (R4)(R7), R9	// x[i]
	MOVD (R5)(R7), R10	// y[i]
	SUBE R10, R9, R11	// x[i] - y[i] - borrow
	MOVD R11, (R3)(R7)	// z[i]
	ADD $8, R7		// next word; CA must survive to the next SUBE
	BC 16, 0, next		// bdnz
finish:
	ADDZE R8		// R8 = CA
	XOR $1, R8		// borrow = !CA
	MOVD R8, c+72(FP)
	RET
// addVW has no assembly implementation here; control is handed straight
// to addVW_g (defined outside this file, presumably the portable Go
// version in arith.go - confirm).
TEXT ·addVW(SB), NOSPLIT, $0
	JMP ·addVW_g(SB)
// subVW has no assembly implementation here; control is handed straight
// to subVW_g (defined outside this file, presumably the portable Go
// version in arith.go - confirm).
TEXT ·subVW(SB), NOSPLIT, $0
	JMP ·subVW_g(SB)
// shlVU has no assembly implementation here; control is handed straight
// to shlVU_g (defined outside this file, presumably the portable Go
// version in arith.go - confirm).
TEXT ·shlVU(SB), NOSPLIT, $0
	JMP ·shlVU_g(SB)
// shrVU has no assembly implementation here; control is handed straight
// to shrVU_g (defined outside this file, presumably the portable Go
// version in arith.go - confirm).
TEXT ·shrVU(SB), NOSPLIT, $0
	JMP ·shrVU_g(SB)
// func mulAddVWW(z, x []Word, y, r Word) (c Word)
// With c starting at r, for each i in order:
//	z[i] = low word of  (x[i]*y + c)
//	c    = high word of (x[i]*y + c)
// The final c is returned. len(z) gives the iteration count.
//
// Elements are addressed with a running byte offset that advances by 8
// per word (the same scheme addVV and subVV use) instead of multiplying
// the element index by 8 on every iteration.
TEXT ·mulAddVWW(SB), NOSPLIT, $0
	MOVD z+0(FP), R10	// R10 = &z[0]
	MOVD x+24(FP), R8	// R8 = &x[0]
	MOVD y+48(FP), R9	// R9 = y
	MOVD r+56(FP), R4	// R4 = c = r
	MOVD z_len+8(FP), R11	// R11 = len(z)
	MOVD R0, R5		// R5 = byte offset of element i
	CMP R0, R11
	BGE done		// 0 >= len(z): nothing to do, c = r
	MOVD R11, CTR
loop:
	MOVD (R8)(R5), R20	// x[i]
	MULLD R9, R20, R6	// low word of x[i]*y
	MULHDU R9, R20, R7	// high word of x[i]*y
	ADDC R4, R6		// low += c, carry out in CA
	ADDZE R7		// high += CA (cannot overflow)
	MOVD R6, (R10)(R5)	// z[i] = low
	MOVD R7, R4		// c = high
	ADD $8, R5		// advance to the next word
	BC 16, 0, loop		// bdnz
done:
	MOVD R4, c+64(FP)
	RET
// func addMulVVW(z, x []Word, y Word) (c Word)
// With c starting at 0, for each i in order:
//	z[i] = low word of  (x[i]*y + z[i] + c)
//	c    = high word of (x[i]*y + z[i] + c)
// The final c is returned. len(z) gives the iteration count.
//
// The even-length prefix is processed two words per iteration; a
// trailing odd word (if any) is handled by the single-word loop.
// Elements are addressed with a running byte offset, so the loop bounds
// are kept as byte lengths and no per-iteration multiply is needed.
TEXT ·addMulVVW(SB), NOSPLIT, $0
	MOVD z+0(FP), R10	// R10 = &z[0]
	MOVD x+24(FP), R8	// R8 = &x[0]
	MOVD y+48(FP), R9	// R9 = y
	MOVD z_len+8(FP), R22	// R22 = len(z)
	MOVD R0, R4		// R4 = c = 0
	MOVD R0, R19		// R19 = byte offset of element i
	SLD $3, R22		// R22 = len(z)*8: end offset of z
	MOVD $-16, R23
	AND R22, R23		// R23 = (len(z) &^ 1)*8: end of the even prefix
	CMP R19, R23
	BGE tail		// fewer than two words: skip the paired loop
pair:
	ADD $8, R19, R25	// R25 = byte offset of element i+1
	MOVD (R10)(R19), R11	// R11 = z[i]
	MOVD (R8)(R19), R16	// R16 = x[i]
	MOVD (R10)(R25), R17	// R17 = z[i+1]
	MOVD (R8)(R25), R18	// R18 = x[i+1]
	MULLD R9, R16, R12	// low word of x[i]*y
	MULHDU R9, R16, R14	// high word of x[i]*y
	MULLD R9, R18, R6	// low word of x[i+1]*y
	MULHDU R9, R18, R7	// high word of x[i+1]*y
	ADDC R4, R12		// low += incoming carry
	ADDZE R14		// high += CA
	ADDC R11, R12		// low += z[i]
	ADDZE R14		// high += CA; R14 is the carry into word i+1
	MOVD R12, (R10)(R19)	// z[i] = low
	ADDC R14, R6		// same sequence for word i+1
	ADDZE R7
	ADDC R17, R6
	ADDZE R7
	MOVD R6, (R10)(R25)	// z[i+1] = low
	MOVD R7, R4		// c = high word of the second product
	ADD $16, R19		// advance two words
	CMP R19, R23
	BLT pair
	JMP tail
single:
	MOVD (R10)(R19), R11	// R11 = z[i]
	MOVD (R8)(R19), R16	// R16 = x[i]
	MULLD R9, R16, R12	// low word of x[i]*y
	MULHDU R9, R16, R14	// high word of x[i]*y
	ADDC R4, R12		// low += incoming carry
	ADDZE R14
	ADDC R11, R12		// low += z[i]
	ADDZE R14
	MOVD R12, (R10)(R19)	// z[i] = low
	MOVD R14, R4		// c = high
	ADD $8, R19		// advance one word
tail:
	CMP R19, R22
	BLT single		// words remain past the even prefix
	MOVD R4, c+56(FP)
	RET
// func divWW(x1, x0, y Word) (q, r Word)
// Divides the two-word value x1:x0 by y, producing a one-word quotient q
// and remainder r. When x1 >= y (unsigned; this includes y == 0) the
// quotient cannot fit in one word and both results are set to all ones.
TEXT ·divWW(SB), NOSPLIT, $0
	MOVD y+16(FP), R9	// R9 = divisor
	MOVD x1+0(FP), R7	// R7 = high dividend word
	MOVD x0+8(FP), R8	// R8 = low dividend word
	CMPU R7, R9
	BLT divide
	// x1 >= y: report overflow as q = r = ^0.
	MOVD $-1, R3
	MOVD R3, q+24(FP)
	MOVD R3, r+32(FP)
	RET
divide:
	// from the programmer's note in ch. 3 of the ISA manual, p.74
	DIVDEU R9, R7, R10	// q1 = (x1 << 64) / y
	DIVDU R9, R8, R11	// q2 = x0 / y
	MULLD R9, R10, R12	// q1*y, low word
	MULLD R9, R11, R14	// q2*y
	ADD R11, R10, R3	// q = q1 + q2
	SUB R14, R8, R15	// r2 = x0 - q2*y
	SUB R12, R15, R4	// r = r2 - q1*y, modulo 2^64
	CMPU R4, R15
	BLT fixup		// r < r2: the remainder sum wrapped
	CMPU R4, R9
	BLT store		// r < y: already in range
fixup:
	ADD $1, R3		// q++
	SUB R9, R4, R4		// r -= y
store:
	MOVD R3, q+24(FP)
	MOVD R4, r+32(FP)
	RET
// divWVW has no assembly implementation here; control is handed straight
// to divWVW_g (defined outside this file, presumably the portable Go
// version in arith.go - confirm).
TEXT ·divWVW(SB), NOSPLIT, $0
	JMP ·divWVW_g(SB)