blob: 60abe6eaa9525bdcc0faf53de99671aa4bc39cfe [file] [log] [blame]
// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// This file provides fast assembly versions for the elementary
// arithmetic operations on vectors implemented in arith.go.
// CFLAG is the bit index of the carry flag inside a saved CPSR value.
// The carry-propagating routines in this file copy CPSR into a register
// and shift it right by CFLAG to pull the carry out as 0 or 1.
#define CFLAG 29 // bit position of carry flag
// func addVV(z, x, y []Word) (c Word)
//
// Word-by-word addition with carry: each word of z receives the sum of
// the matching words of x and y plus the carry from the previous word.
// The word count is taken from the length word of z at 4(FP). The carry
// out of the last word (0 or 1) is returned in c; for a zero-length z
// the result is 0.
//
// Registers:
//	R0	saved copy of CPSR (carries the C flag across the loop test)
//	R1	write cursor into z
//	R2	read cursor into x
//	R3	read cursor into y
//	R4	address one past the last word of z
//	R5, R6	current x and y words
TEXT ·addVV(SB),7,$0
	MOVW	z+0(FP), R1
	MOVW	z_len+4(FP), R4
	MOVW	x+12(FP), R2
	MOVW	y+24(FP), R3
	ADD	R4<<2, R1, R4		// R4 = z + 4*len
	MOVW	$0, R0			// start with the saved carry clear
	B	addVV_check

addVV_loop:
	MOVW.P	4(R3), R6		// R6 = next y word, advance R3
	MOVW.P	4(R2), R5		// R5 = next x word, advance R2
	MOVW	R0, CPSR		// bring back the carry of the previous word
	ADC.S	R6, R5			// R5 = R5 + R6 + carry, updating the flags
	MOVW	CPSR, R0		// save the flags: the CMP below overwrites them
	MOVW.P	R5, 4(R1)		// store the sum, advance R1

addVV_check:
	CMP	R1, R4
	BNE	addVV_loop

	MOVW	R0>>CFLAG, R0
	AND	$1, R0			// keep only the carry bit
	MOVW	R0, c+36(FP)
	RET
// func subVV(z, x, y []Word) (c Word)
//
// Word-by-word subtraction with borrow: each word of z receives the
// matching word of x minus the matching word of y minus the borrow from
// the previous word. The word count is taken from the length word of z
// at 4(FP). The borrow out of the last word (0 or 1) is returned in c.
//
// SBC subtracts an extra 1 when the C flag is clear, so the saved flags
// start with C set (no borrow) and the final C bit is inverted to turn
// it into a borrow.
//
// Registers:
//	R0	saved copy of CPSR
//	R1	write cursor into z
//	R2	read cursor into x
//	R3	read cursor into y
//	R4	address one past the last word of z
//	R5, R6	current x and y words
TEXT ·subVV(SB),7,$0
	MOVW	z+0(FP), R1
	MOVW	z_len+4(FP), R4
	MOVW	x+12(FP), R2
	MOVW	y+24(FP), R3
	ADD	R4<<2, R1, R4		// R4 = z + 4*len
	MOVW	$(1<<CFLAG), R0		// C set: no borrow pending
	B	subVV_check

subVV_loop:
	MOVW.P	4(R3), R6		// R6 = next y word, advance R3
	MOVW.P	4(R2), R5		// R5 = next x word, advance R2
	MOVW	R0, CPSR		// bring back the flags of the previous word
	SBC.S	R6, R5			// R5 = R5 - R6 - (1 - C), updating the flags
	MOVW	CPSR, R0		// save the flags: the CMP below overwrites them
	MOVW.P	R5, 4(R1)		// store the difference, advance R1

subVV_check:
	CMP	R1, R4
	BNE	subVV_loop

	MOVW	R0>>CFLAG, R0
	AND	$1, R0			// keep only the C bit
	EOR	$1, R0			// borrow is the inverse of C
	MOVW	R0, c+36(FP)
	RET
// func addVW(z, x []Word, y Word) (c Word)
//
// Adds the single word y to the vector x and writes the result to z.
// The first word is x[0] + y; every later word is x[i] plus the carry
// from the word before it. The carry out of the last word (0 or 1) is
// returned in c. When z has zero length, y itself is returned in c.
//
// Registers:
//	R0	saved copy of CPSR
//	R1	write cursor into z
//	R2	read cursor into x
//	R3	y
//	R4	address one past the last word of z
//	R5	current word
TEXT ·addVW(SB),7,$0
	MOVW	z+0(FP), R1
	MOVW	z_len+4(FP), R4
	MOVW	x+12(FP), R2
	MOVW	y+24(FP), R3
	ADD	R4<<2, R1, R4		// R4 = z + 4*len
	CMP	R1, R4
	BEQ	addVW_empty

	// First word: plain add of y, which produces the initial carry.
	MOVW.P	4(R2), R5
	ADD.S	R3, R5
	MOVW	CPSR, R0		// save the flags before the loop test
	MOVW.P	R5, 4(R1)
	B	addVW_check

addVW_loop:
	MOVW.P	4(R2), R5		// R5 = next x word, advance R2
	MOVW	R0, CPSR		// bring back the carry of the previous word
	ADC.S	$0, R5			// R5 += carry
	MOVW	CPSR, R0
	MOVW.P	R5, 4(R1)		// store, advance R1

addVW_check:
	CMP	R1, R4
	BNE	addVW_loop

	MOVW	R0>>CFLAG, R0
	AND	$1, R0			// keep only the carry bit
	MOVW	R0, c+28(FP)
	RET

addVW_empty:
	MOVW	R3, c+28(FP)		// nothing to add into: hand y back as c
	RET
// func subVW(z, x []Word, y Word) (c Word)
//
// Subtracts the single word y from the vector x and writes the result
// to z. The first word is x[0] - y; every later word is x[i] minus the
// borrow from the word before it. The borrow out of the last word
// (0 or 1) is returned in c. When z has zero length, y itself is
// returned in c.
//
// Registers:
//	R0	saved copy of CPSR
//	R1	write cursor into z
//	R2	read cursor into x
//	R3	y
//	R4	address one past the last word of z
//	R5	current word
TEXT ·subVW(SB),7,$0
	MOVW	z+0(FP), R1
	MOVW	z_len+4(FP), R4
	MOVW	x+12(FP), R2
	MOVW	y+24(FP), R3
	ADD	R4<<2, R1, R4		// R4 = z + 4*len
	CMP	R1, R4
	BEQ	subVW_empty

	// First word: plain subtract of y, which produces the initial flags.
	MOVW.P	4(R2), R5
	SUB.S	R3, R5
	MOVW	CPSR, R0		// save the flags before the loop test
	MOVW.P	R5, 4(R1)
	B	subVW_check

subVW_loop:
	MOVW.P	4(R2), R5		// R5 = next x word, advance R2
	MOVW	R0, CPSR		// bring back the flags of the previous word
	SBC.S	$0, R5			// R5 -= (1 - C)
	MOVW	CPSR, R0
	MOVW.P	R5, 4(R1)		// store, advance R1

subVW_check:
	CMP	R1, R4
	BNE	subVW_loop

	MOVW	R0>>CFLAG, R0
	AND	$1, R0			// keep only the C bit
	EOR	$1, R0			// borrow is the inverse of C
	MOVW	R0, c+28(FP)
	RET

subVW_empty:
	MOVW	R3, c+28(FP)		// nothing to subtract from: hand y back as c
	RET
// func shlVU(z, x []Word, s uint) (c Word)
//
// Shifts the vector x left by s bits into z, working from the highest
// word down to the lowest. With n the word count read from the length
// word of z:
//	c    = x[n-1] >> (32-s)
//	z[i] = x[i]<<s | x[i-1]>>(32-s)   for i = n-1 .. 1
//	z[0] = x[0]<<s
// When n is 0 nothing is written and c is 0. When s is 0 the words are
// copied unchanged (highest first) and c is 0.
//
// Registers (shifting path):
//	R1	z + 4, the point at which the loop stops (z[0] is stored after it)
//	R2	read cursor into x, moving downwards
//	R3	s
//	R4	32 - s
//	R5	write cursor into z, moving downwards
//	R6	word just loaded from x
//	R7	partially assembled output word
TEXT ·shlVU(SB),7,$0
	MOVW	z_len+4(FP), R5
	CMP	$0, R5
	BEQ	shlVU_zero

	MOVW	z+0(FP), R1
	MOVW	x+12(FP), R2
	ADD	R5<<2, R2, R2		// R2 = one past the last word of x
	ADD	R5<<2, R1, R5		// R5 = one past the last word of z
	MOVW	s+24(FP), R3
	CMP	$0, R3			// shift 0 is special
	BEQ	shlVU_copy

	ADD	$4, R1			// stop one word early
	MOVW	$32, R4
	SUB	R3, R4			// R4 = 32 - s

	// Top word: its high bits leave the vector and become c.
	MOVW.W	-4(R2), R6
	MOVW	R6<<R3, R7
	MOVW	R6>>R4, R6
	MOVW	R6, c+28(FP)
	B	shlVU_check

shlVU_loop:
	MOVW.W	-4(R2), R6		// step down to the next lower x word
	ORR	R6>>R4, R7		// its high bits complete the pending word
	MOVW.W	R7, -4(R5)		// step the z cursor down and store
	MOVW	R6<<R3, R7		// its low bits start the next output word

shlVU_check:
	CMP	R1, R5
	BNE	shlVU_loop

	MOVW	R7, -4(R5)		// lowest word gets only the shifted-in zeros
	RET

shlVU_copy:	// copy loop, because shift 0 == shift 32
	MOVW.W	-4(R2), R6
	MOVW.W	R6, -4(R5)
	CMP	R1, R5
	BNE	shlVU_copy

shlVU_zero:
	MOVW	$0, R1
	MOVW	R1, c+28(FP)
	RET
// func shrVU(z, x []Word, s uint) (c Word)
//
// Shifts the vector x right by s bits into z, working from the lowest
// word up to the highest. With n the word count read from the length
// word of z:
//	c      = x[0] << (32-s)
//	z[i]   = x[i]>>s | x[i+1]<<(32-s)   for i = 0 .. n-2
//	z[n-1] = x[n-1]>>s
// When n is 0 nothing is written and c is 0. When s is 0 the words are
// copied unchanged (lowest first) and c is 0.
//
// Registers (shifting path):
//	R1	write cursor into z, moving upwards
//	R2	read cursor into x, moving upwards
//	R3	s
//	R4	32 - s
//	R5	address of the last word of z (loop stops there)
//	R6	word just loaded from x
//	R7	partially assembled output word
TEXT ·shrVU(SB),7,$0
	MOVW	z_len+4(FP), R5
	CMP	$0, R5
	BEQ	shrVU_zero

	MOVW	z+0(FP), R1
	MOVW	x+12(FP), R2
	ADD	R5<<2, R1, R5		// R5 = one past the last word of z
	MOVW	s+24(FP), R3
	CMP	$0, R3			// shift 0 is special
	BEQ	shrVU_copy

	SUB	$4, R5			// stop one word early
	MOVW	$32, R4
	SUB	R3, R4			// R4 = 32 - s

	// first word: its low bits leave the vector and become c
	MOVW.P	4(R2), R6
	MOVW	R6>>R3, R7
	MOVW	R6<<R4, R6
	MOVW	R6, c+28(FP)
	B	shrVU_check

	// word loop
shrVU_loop:
	MOVW.P	4(R2), R6		// load the next higher x word
	ORR	R6<<R4, R7		// its low bits complete the pending word
	MOVW.P	R7, 4(R1)		// store and advance the z cursor
	MOVW	R6>>R3, R7		// its high bits start the next output word

shrVU_check:
	CMP	R1, R5
	BNE	shrVU_loop

	MOVW	R7, 0(R1)		// highest word gets only the shifted-in zeros
	RET

shrVU_copy:	// copy loop, because shift 0 == shift 32
	MOVW.P	4(R2), R6
	MOVW.P	R6, 4(R1)
	CMP	R1, R5
	BNE	shrVU_copy

shrVU_zero:
	MOVW	$0, R1
	MOVW	R1, c+28(FP)
	RET
// func mulAddVWW(z, x []Word, y, r Word) (c Word)
//
// Multiplies the vector x by the single word y, adding r into the lowest
// word. For every word the 64-bit product x[i]*y plus the running carry
// word is formed; its low half is stored to z and its high half becomes
// the carry word for the next position. The final carry word is returned
// in c (r itself when z has zero length).
//
// Registers:
//	R0	constant 0, used to add just the carry flag
//	R1	write cursor into z
//	R2	read cursor into x
//	R3	y
//	R4	running carry word (starts as r)
//	R5	address one past the last word of z
//	R7:R6	high:low halves of the current product
TEXT ·mulAddVWW(SB),7,$0
	MOVW	z+0(FP), R1
	MOVW	z_len+4(FP), R5
	MOVW	x+12(FP), R2
	MOVW	y+24(FP), R3
	MOVW	r+28(FP), R4
	ADD	R5<<2, R1, R5		// R5 = z + 4*len
	MOVW	$0, R0
	B	mulAddVWW_check

mulAddVWW_loop:
	MOVW.P	4(R2), R6		// R6 = next x word, advance R2
	MULLU	R6, R3, (R7, R6)	// R7:R6 = x word * y
	ADD.S	R4, R6			// low half += carry word, setting C
	ADC	R0, R7, R4		// next carry word = high half + C
	MOVW.P	R6, 4(R1)		// store low half, advance R1

mulAddVWW_check:
	CMP	R1, R5
	BNE	mulAddVWW_loop

	MOVW	R4, c+32(FP)
	RET
// func addMulVVW(z, x []Word, y Word) (c Word)
//
// Multiply-accumulate: adds x*y into z in place. For every word the
// 64-bit product x[i]*y is formed, the running carry word and the
// current z word are added to it, the low half is written back to z and
// the high half becomes the carry word for the next position. The final
// carry word is returned in c (0 when z has zero length).
//
// Registers:
//	R0	constant 0, used to add just the carry flag
//	R1	read/write cursor into z
//	R2	read cursor into x
//	R3	y
//	R4	running carry word; also briefly holds the loaded z word
//	R5	address one past the last word of z
//	R7:R6	high:low halves of the current product
TEXT ·addMulVVW(SB),7,$0
	MOVW	z+0(FP), R1
	MOVW	z_len+4(FP), R5
	MOVW	x+12(FP), R2
	MOVW	y+24(FP), R3
	ADD	R5<<2, R1, R5		// R5 = z + 4*len
	MOVW	$0, R0
	MOVW	$0, R4			// no carry word yet
	B	addMulVVW_check

addMulVVW_loop:
	MOVW.P	4(R2), R6		// R6 = next x word, advance R2
	MULLU	R6, R3, (R7, R6)	// R7:R6 = x word * y
	ADD.S	R4, R6			// low half += carry word, setting C
	ADC	R0, R7			// high half += C
	MOVW	0(R1), R4		// carry word is consumed; reuse R4 for the z word
	ADD.S	R4, R6			// low half += z word, setting C
	ADC	R0, R7, R4		// next carry word = high half + C
	MOVW.P	R6, 4(R1)		// write low half back to z, advance R1

addMulVVW_check:
	CMP	R1, R5
	BNE	addMulVVW_loop

	MOVW	R4, c+28(FP)
	RET
// divWVW(z* Word, xn Word, x []Word, y Word) (r Word)
// NOTE(review): `z* Word` is not valid Go syntax and this line lacks the
// `func` keyword used by the other signature comments in this file;
// confirm the parameter list against the Go declaration.
//
// There is no ARM-specific implementation here. The body is a single
// plain branch (B, not BL) to divWVW_g, so no arguments are read or
// rewritten by this stub; presumably divWVW_g then runs on the caller's
// argument frame and returns straight to the caller.
TEXT ·divWVW(SB),7,$0
// ARM has no multiword division, so use portable code.
B ·divWVW_g(SB)
// func divWW(x1, x0, y Word) (q, r Word)
//
// There is no ARM-specific implementation here. The body is a single
// plain branch (B, not BL) to divWW_g, so no arguments are read or
// rewritten by this stub; presumably divWW_g then runs on the caller's
// argument frame and returns straight to the caller.
TEXT ·divWW(SB),7,$0
// ARM has no multiword division, so use portable code.
B ·divWW_g(SB)
// func mulWW(x, y Word) (z1, z0 Word)
//
// Full-width unsigned multiply of two words: the high half of x*y is
// returned in z1 and the low half in z0.
TEXT ·mulWW(SB),7,$0
	MOVW	y+4(FP), R1
	MOVW	x+0(FP), R0
	MULLU	R0, R1, (R3, R2)	// R3:R2 = x * y (high:low)
	MOVW	R2, z0+12(FP)
	MOVW	R3, z1+8(FP)
	RET