// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !math_big_pure_go

#include "textflag.h"

// This file provides fast assembly versions for the elementary
// arithmetic operations on vectors implemented in arith.go.

// TODO: Consider re-implementing using Advanced SIMD
// once the assembler supports those instructions.

// func mulWW(x, y Word) (z1, z0 Word)
//
// mulWW multiplies x by y and returns the double-width product as two
// words: z1 receives the high 64 bits and z0 the low 64 bits.
TEXT ·mulWW(SB),NOSPLIT,$0
	MOVD	x+0(FP), R0
	MOVD	y+8(FP), R1
	MUL	R0, R1, R2	// R2 = low 64 bits of x*y
	UMULH	R0, R1, R3	// R3 = high 64 bits of the unsigned product
	MOVD	R3, z1+16(FP)
	MOVD	R2, z0+24(FP)
	RET

// func addVV(z, x, y []Word) (c Word)
//
// addVV stores x[i] + y[i] plus the running carry into z[i] for each of
// the len(z) words and returns the final carry (0 or 1) in c.
// Only z_len is read here; x and y are walked for the same number of
// words without any length check of their own.
//
// Register use:
//	R0       number of words still to process
//	R8, R9   read cursors into x and y (post-incremented)
//	R10      write cursor into z (post-incremented)
//	R11-R14  words of x, overwritten with the sums
//	R15-R19  words of y (R18 is not used)
//
// The carry is kept in the C flag across the whole function. Every
// instruction between two ADCS (SUB, TBZ, CBZ, loads, stores, B) is one
// that leaves the flags alone.
TEXT ·addVV(SB),NOSPLIT,$0
	MOVD	z_len+8(FP), R0
	MOVD	x+24(FP), R8
	MOVD	y+48(FP), R9
	MOVD	z+0(FP), R10
	ADDS	$0, R0		// clear carry flag
	TBZ	$0, R0, two	// count is even: no single leading word
	MOVD.P	8(R8), R11
	MOVD.P	8(R9), R15
	ADCS	R15, R11
	MOVD.P	R11, 8(R10)
	SUB	$1, R0
two:
	TBZ	$1, R0, loop	// bit 1 clear: remaining count is a multiple of 4
	LDP.P	16(R8), (R11, R12)
	LDP.P	16(R9), (R15, R16)
	ADCS	R15, R11
	ADCS	R16, R12
	STP.P	(R11, R12), 16(R10)
	SUB	$2, R0
loop:
	CBZ	R0, done	// careful not to touch the carry flag
	LDP.P	32(R8), (R11, R12)	// advance by 4 words, load words 0 and 1
	LDP	-16(R8), (R13, R14)	// words 2 and 3 relative to the advanced cursor
	LDP.P	32(R9), (R15, R16)
	LDP	-16(R9), (R17, R19)
	ADCS	R15, R11
	ADCS	R16, R12
	ADCS	R17, R13
	ADCS	R19, R14
	STP.P	(R11, R12), 32(R10)
	STP	(R13, R14), -16(R10)
	SUB	$4, R0
	B	loop
done:
	CSET	HS, R0		// extract carry flag
	MOVD	R0, c+72(FP)
	RET

// func subVV(z, x, y []Word) (c Word)
//
// subVV stores x[i] - y[i] minus the running borrow into z[i] for each of
// the len(z) words and returns the final borrow (0 or 1) in c.
// Only z_len is read here; x and y are walked for the same number of
// words without any length check of their own.
//
// Register use:
//	R0       number of words still to process
//	R8, R9   read cursors into x and y (post-incremented)
//	R10      write cursor into z (post-incremented)
//	R11-R14  words of x, overwritten with the differences
//	R15-R19  words of y (R18 is not used)
//
// The borrow is kept in the C flag in inverted form: C set means "no
// borrow". That is why the flag is set before the first SBCS and why the
// result is extracted with the LO (carry clear) condition. Every
// instruction between two SBCS is one that leaves the flags alone.
TEXT ·subVV(SB),NOSPLIT,$0
	MOVD	z_len+8(FP), R0
	MOVD	x+24(FP), R8
	MOVD	y+48(FP), R9
	MOVD	z+0(FP), R10
	CMP	R0, R0		// set carry flag
	TBZ	$0, R0, two	// count is even: no single leading word
	MOVD.P	8(R8), R11
	MOVD.P	8(R9), R15
	SBCS	R15, R11
	MOVD.P	R11, 8(R10)
	SUB	$1, R0
two:
	TBZ	$1, R0, loop	// bit 1 clear: remaining count is a multiple of 4
	LDP.P	16(R8), (R11, R12)
	LDP.P	16(R9), (R15, R16)
	SBCS	R15, R11
	SBCS	R16, R12
	STP.P	(R11, R12), 16(R10)
	SUB	$2, R0
loop:
	CBZ	R0, done	// careful not to touch the carry flag
	LDP.P	32(R8), (R11, R12)	// advance by 4 words, load words 0 and 1
	LDP	-16(R8), (R13, R14)	// words 2 and 3 relative to the advanced cursor
	LDP.P	32(R9), (R15, R16)
	LDP	-16(R9), (R17, R19)
	SBCS	R15, R11
	SBCS	R16, R12
	SBCS	R17, R13
	SBCS	R19, R14
	STP.P	(R11, R12), 32(R10)
	STP	(R13, R14), -16(R10)
	SUB	$4, R0
	B	loop
done:
	CSET	LO, R0		// extract carry flag
	MOVD	R0, c+72(FP)
	RET

// vwOneOp handles one word in addVW/subVW: it loads the next word through
// R1 into R4, executes "instr op1, R4" on it and stores R4 through R3.
// Both cursors are post-incremented by 8 bytes; R4 is clobbered.
#define vwOneOp(instr, op1)				\
	MOVD.P	8(R1), R4;				\
	instr	op1, R4;				\
	MOVD.P	R4, 8(R3);

// handle the first 1~4 elements before starting iteration in addVW/subVW
//
// instr1 is applied with R2 to the first word and instr2 with $0 to each
// following word. After each of the first three words, counter is
// decremented and control branches to target once it reaches zero.
// If counter starts at 0 it goes negative and never matches, so all four
// words are processed and execution falls through past the macro.
#define vwPreIter(instr1, instr2, counter, target)	\
	vwOneOp(instr1, R2);				\
	SUB	$1, counter;				\
	CBZ	counter, target;			\
	vwOneOp(instr2, $0);				\
	SUB	$1, counter;				\
	CBZ	counter, target;			\
	vwOneOp(instr2, $0);				\
	SUB	$1, counter;				\
	CBZ	counter, target;			\
	vwOneOp(instr2, $0);

// do one iteration of add or sub in addVW/subVW
//
// Branches to exit when counter is zero. Otherwise it loads four words
// through R1, applies "instr $0, Rn, Rm" to each of them in order, stores
// the four results through R3 and subtracts 4 from counter.
// Nothing here except instr itself touches the carry flag: CBZ, the
// loads and stores, and SUB all leave the flags alone.
// Clobbers R4-R11.
#define vwOneIter(instr, counter, exit)			\
	CBZ	counter, exit;				\
	LDP.P	32(R1), (R4, R5);			\
	LDP	-16(R1), (R6, R7);			\
	instr	$0, R4, R8;				\
	instr	$0, R5, R9;				\
	instr	$0, R6, R10;				\
	instr	$0, R7, R11;				\
	STP.P	(R8, R9), 32(R3);			\
	STP	(R10, R11), -16(R3);			\
	SUB	$4, counter;

// do one iteration of copy in addVW/subVW
//
// Branches to exit when counter is zero. Otherwise it copies four words
// unchanged from the R1 cursor to the R3 cursor, advances both by 32
// bytes and subtracts 4 from counter. Clobbers R4-R7.
#define vwOneIterCopy(counter, exit)			\
	CBZ	counter, exit;				\
	LDP.P	32(R1), (R4, R5);			\
	LDP	-16(R1), (R6, R7);			\
	STP.P	(R4, R5), 32(R3);			\
	STP	(R6, R7), -16(R3);			\
	SUB	$4, counter;

// func addVW(z, x []Word, y Word) (c Word)
//
// addVW adds the single word y to the multi-word value x, writes the
// len(z) result words to z and returns the final carry in c.
// When len(z) is 0 nothing is written and c is y itself.
//
// The 'large' branch handles large 'z'. It checks the carry flag on every iteration
// and switches to copy if we are done with carries. The copying is skipped as well
// if 'x' and 'z' happen to share the same underlying storage.
// The overhead of the checking and branching is visible when 'z' are small (~5%),
// so set a threshold of 32, and remain the small-sized part entirely untouched.
//
// Register use:
//	R0       number of words still to process
//	R1       read cursor into x (post-incremented)
//	R2       y on entry, later the extracted carry
//	R3       write cursor into z (post-incremented)
//	R4-R11   scratch for data words
//	R10      in the large path, first holds len(z) & 3
TEXT ·addVW(SB),NOSPLIT,$0
	MOVD	z+0(FP), R3
	MOVD	z_len+8(FP), R0
	MOVD	x+24(FP), R1
	MOVD	y+48(FP), R2
	CMP	$32, R0
	BGE	large		// large-sized 'z' and 'x'
	CBZ	R0, len0	// the length of z is 0
	MOVD.P	8(R1), R4
	ADDS	R2, R4		// z[0] = x[0] + y, set carry
	MOVD.P	R4, 8(R3)
	SUB	$1, R0
	CBZ	R0, len1	// the length of z is 1
	TBZ	$0, R0, two
	MOVD.P	8(R1), R4	// do it once
	ADCS	$0, R4
	MOVD.P	R4, 8(R3)
	SUB	$1, R0
two:				// do it twice
	TBZ	$1, R0, loop
	LDP.P	16(R1), (R4, R5)
	ADCS	$0, R4, R8	// c, z[i] = x[i] + c
	ADCS	$0, R5, R9
	STP.P	(R8, R9), 16(R3)
	SUB	$2, R0
loop:				// do four times per round
	vwOneIter(ADCS, R0, len1)
	B	loop
len1:
	CSET	HS, R2		// extract carry flag
len0:
	MOVD	R2, c+56(FP)
done:
	RET
large:
	AND	$0x3, R0, R10	// R10 = len(z) mod 4
	AND	$~0x3, R0	// R0 = len(z) rounded down to a multiple of 4
	// unrolling for the first 1~4 elements to avoid saving the carry
	// flag in each step, adjust $R0 if we unrolled 4 elements
	vwPreIter(ADDS, ADCS, R10, add4)
	SUB	$4, R0
add4:
	BCC	copy		// carry clear: the remaining words of x pass through unchanged
	vwOneIter(ADCS, R0, len1)
	B	add4
copy:
	MOVD	ZR, c+56(FP)	// no carry is pending, so the result carry is 0
	CMP	R1, R3		// both cursors advanced equally; equal means z aliases x
	BEQ	done
copy_4:				// no carry flag, copy the rest
	vwOneIterCopy(R0, done)
	B	copy_4

// func subVW(z, x []Word, y Word) (c Word)
//
// subVW subtracts the single word y from the multi-word value x, writes
// the len(z) result words to z and returns the final borrow in c.
// When len(z) is 0 nothing is written and c is y itself.
//
// The 'large' branch handles large 'z'. It checks the carry flag on every iteration
// and switches to copy if we are done with carries. The copying is skipped as well
// if 'x' and 'z' happen to share the same underlying storage.
// The overhead of the checking and branching is visible when 'z' are small (~5%),
// so set a threshold of 32, and remain the small-sized part entirely untouched.
//
// The borrow is held in the C flag in inverted form (C set means no
// borrow), hence BCS to detect "no borrow pending" and CSET LO to
// extract the result.
//
// Register use:
//	R0       number of words still to process
//	R1       read cursor into x (post-incremented)
//	R2       y on entry, later the extracted borrow
//	R3       write cursor into z (post-incremented)
//	R4-R11   scratch for data words
//	R10      in the large path, first holds len(z) & 3
TEXT ·subVW(SB),NOSPLIT,$0
	MOVD	z+0(FP), R3
	MOVD	z_len+8(FP), R0
	MOVD	x+24(FP), R1
	MOVD	y+48(FP), R2
	CMP	$32, R0
	BGE	large		// large-sized 'z' and 'x'
	CBZ	R0, len0	// the length of z is 0
	MOVD.P	8(R1), R4
	SUBS	R2, R4		// z[0] = x[0] - y, set carry
	MOVD.P	R4, 8(R3)
	SUB	$1, R0
	CBZ	R0, len1	// the length of z is 1
	TBZ	$0, R0, two	// do it once
	MOVD.P	8(R1), R4
	SBCS	$0, R4
	MOVD.P	R4, 8(R3)
	SUB	$1, R0
two:				// do it twice
	TBZ	$1, R0, loop
	LDP.P	16(R1), (R4, R5)
	SBCS	$0, R4, R8	// c, z[i] = x[i] - c
	SBCS	$0, R5, R9
	STP.P	(R8, R9), 16(R3)
	SUB	$2, R0
loop:				// do four times per round
	vwOneIter(SBCS, R0, len1)
	B	loop
len1:
	CSET	LO, R2		// extract carry flag
len0:
	MOVD	R2, c+56(FP)
done:
	RET
large:
	AND	$0x3, R0, R10	// R10 = len(z) mod 4
	AND	$~0x3, R0	// R0 = len(z) rounded down to a multiple of 4
	// unrolling for the first 1~4 elements to avoid saving the carry
	// flag in each step, adjust $R0 if we unrolled 4 elements
	vwPreIter(SUBS, SBCS, R10, sub4)
	SUB	$4, R0
sub4:
	BCS	copy		// carry set: no borrow pending, the rest of x passes through
	vwOneIter(SBCS, R0, len1)
	B	sub4
copy:
	MOVD	ZR, c+56(FP)	// no borrow is pending, so the result borrow is 0
	CMP	R1, R3		// both cursors advanced equally; equal means z aliases x
	BEQ	done
copy_4:				// no carry flag, copy the rest
	vwOneIterCopy(R0, done)
	B	copy_4

// func shlVU(z, x []Word, s uint) (c Word)
//
// shlVU shifts the len(z)-word value x left by s bits, writes the result
// to z and returns in c the bits shifted out of the top word,
// x[n-1] >> (64 - s). c is 0 when len(z) is 0 or s is 0; for s == 0 the
// words are simply copied (and not even that if z and x are the same
// storage).
//
// This implementation handles the shift operation from the high word to the low word,
// which may be an error for the case where the low word of x overlaps with the high
// word of z. When calling this function directly, you need to pay attention to this
// situation.
//
// Register use (shift path):
//	R0   write cursor into z, starts at &z[n] and is pre-decremented
//	R1   number of words still to process
//	R2   read cursor into x, starts at &x[n] and is pre-decremented
//	R3   s
//	R4   64 - s
//	R5   return value
//	R8   x[i] << s of the most recently read word, not yet stored
TEXT ·shlVU(SB),NOSPLIT,$0
	LDP	z+0(FP), (R0, R1)	// R0 = z.ptr, R1 = len(z)
	MOVD	x+24(FP), R2
	MOVD	s+48(FP), R3
	ADD	R1<<3, R0	// R0 = &z[n]
	ADD	R1<<3, R2	// R2 = &x[n]
	CBZ	R1, len0
	CBZ	R3, copy	// if the number of shift is 0, just copy x to z

	MOVD	$64, R4
	SUB	R3, R4
	// handling the most significant element x[n-1]
	MOVD.W	-8(R2), R6
	LSR	R4, R6, R5	// return value
	LSL	R3, R6, R8	// x[i] << s
	SUB	$1, R1
	TBZ	$0, R1, two	// remaining count is even: skip the single-word step
	MOVD.W	-8(R2), R6
	LSR	R4, R6, R7
	ORR	R8, R7
	LSL	R3, R6, R8
	SUB	$1, R1
	MOVD.W	R7, -8(R0)
two:
	TBZ	$1, R1, loop
	LDP.W	-16(R2), (R6, R7)	// R7 is the higher of the two words
	LSR	R4, R7, R10
	ORR	R8, R10
	LSL	R3, R7
	LSR	R4, R6, R9
	ORR	R7, R9
	LSL	R3, R6, R8
	SUB	$2, R1
	STP.W	(R9, R10), -16(R0)
loop:
	CBZ	R1, done
	LDP.W	-32(R2), (R10, R11)	// step back 4 words, load the two lowest
	LDP	16(R2), (R12, R13)	// the two highest; R13 is the topmost
	LSR	R4, R13, R23
	ORR	R8, R23		// z[i] = (x[i] << s) | (x[i-1] >> (64 - s))
	LSL	R3, R13
	LSR	R4, R12, R22
	ORR	R13, R22
	LSL	R3, R12
	LSR	R4, R11, R21
	ORR	R12, R21
	LSL	R3, R11
	LSR	R4, R10, R20
	ORR	R11, R20
	LSL	R3, R10, R8
	STP.W	(R20, R21), -32(R0)
	STP	(R22, R23), 16(R0)
	SUB	$4, R1
	B	loop
done:
	MOVD.W	R8, -8(R0)	// the first element x[0]
	MOVD	R5, c+56(FP)	// the part moved out from x[n-1]
	RET
copy:
	CMP	R0, R2		// same end address: z and x are the same storage
	BEQ	len0
	TBZ	$0, R1, ctwo
	MOVD.W	-8(R2), R4
	MOVD.W	R4, -8(R0)
	SUB	$1, R1
ctwo:
	TBZ	$1, R1, cloop
	LDP.W	-16(R2), (R4, R5)
	STP.W	(R4, R5), -16(R0)
	SUB	$2, R1
cloop:
	CBZ	R1, len0
	LDP.W	-32(R2), (R4, R5)
	LDP	16(R2), (R6, R7)
	STP.W	(R4, R5), -32(R0)
	STP	(R6, R7), 16(R0)
	SUB	$4, R1
	B	cloop
len0:
	MOVD	$0, c+56(FP)
	RET

// func shrVU(z, x []Word, s uint) (c Word)
//
// shrVU shifts the len(z)-word value x right by s bits, writes the result
// to z and returns in c the bits shifted out of the bottom word,
// x[0] << (64 - s). c is 0 when len(z) is 0 or s is 0; for s == 0 the
// words are simply copied (and not even that if z and x are the same
// storage).
//
// This implementation handles the shift operation from the low word to the high word,
// which may be an error for the case where the high word of x overlaps with the low
// word of z. When calling this function directly, you need to pay attention to this
// situation.
//
// Register use (shift path):
//	R0   write cursor into z (post-incremented)
//	R1   number of words still to process
//	R2   read cursor into x (post-incremented)
//	R3   s
//	R4   64 - s
//	R8   x[i] >> s of the most recently read word, not yet stored
TEXT ·shrVU(SB),NOSPLIT,$0
	MOVD	z+0(FP), R0
	MOVD	z_len+8(FP), R1
	MOVD	x+24(FP), R2
	MOVD	s+48(FP), R3
	MOVD	$0, R8
	MOVD	$64, R4
	SUB	R3, R4
	CBZ	R1, len0
	CBZ	R3, copy	// if the number of shift is 0, just copy x to z

	MOVD.P	8(R2), R20
	LSR	R3, R20, R8
	LSL	R4, R20
	MOVD	R20, c+56(FP)	// deal with the first element
	SUB	$1, R1

	TBZ	$0, R1, two	// remaining count is even: skip the single-word step
	MOVD.P	8(R2), R6
	LSL	R4, R6, R20
	ORR	R8, R20
	LSR	R3, R6, R8
	MOVD.P	R20, 8(R0)
	SUB	$1, R1
two:
	TBZ	$1, R1, loop
	LDP.P	16(R2), (R6, R7)
	LSL	R4, R6, R20
	LSR	R3, R6
	ORR	R8, R20
	LSL	R4, R7, R21
	LSR	R3, R7, R8
	ORR	R6, R21
	STP.P	(R20, R21), 16(R0)
	SUB	$2, R1
loop:
	CBZ	R1, done
	LDP.P	32(R2), (R10, R11)	// advance by 4 words, load words 0 and 1
	LDP	-16(R2), (R12, R13)	// words 2 and 3 relative to the advanced cursor
	LSL	R4, R10, R20
	LSR	R3, R10
	ORR	R8, R20		// z[i] = (x[i] >> s) | (x[i+1] << (64 - s))
	LSL	R4, R11, R21
	LSR	R3, R11
	ORR	R10, R21
	LSL	R4, R12, R22
	LSR	R3, R12
	ORR	R11, R22
	LSL	R4, R13, R23
	LSR	R3, R13, R8
	ORR	R12, R23
	STP.P	(R20, R21), 32(R0)
	STP	(R22, R23), -16(R0)
	SUB	$4, R1
	B	loop
done:
	MOVD	R8, (R0)	// deal with the last element
	RET
copy:
	CMP	R0, R2		// same start address: z and x are the same storage
	BEQ	len0
	TBZ	$0, R1, ctwo
	MOVD.P	8(R2), R3
	MOVD.P	R3, 8(R0)
	SUB	$1, R1
ctwo:
	TBZ	$1, R1, cloop
	LDP.P	16(R2), (R4, R5)
	STP.P	(R4, R5), 16(R0)
	SUB	$2, R1
cloop:
	CBZ	R1, len0
	LDP.P	32(R2), (R4, R5)
	LDP	-16(R2), (R6, R7)
	STP.P	(R4, R5), 32(R0)
	STP	(R6, R7), -16(R0)
	SUB	$4, R1
	B	cloop
len0:
	MOVD	$0, c+56(FP)
	RET

// func mulAddVWW(z, x []Word, y, r Word) (c Word)
//
// mulAddVWW computes z = x*y + r over len(z) words and returns the word
// carried out of the top in c. For each word the low half of x[i]*y plus
// the incoming carry word is stored to z[i], and the high half plus the
// carry bit of that addition becomes the carry word for the next position.
// With len(z) == 0 nothing is written and c is r.
//
// Register use:
//	R0   number of words still to process
//	R1   write cursor into z (post-incremented)
//	R2   read cursor into x (post-incremented)
//	R3   y
//	R4   carry word, initially r
TEXT ·mulAddVWW(SB),NOSPLIT,$0
	MOVD	z+0(FP), R1
	MOVD	z_len+8(FP), R0
	MOVD	x+24(FP), R2
	MOVD	y+48(FP), R3
	MOVD	r+56(FP), R4
	// c, z = x * y + r
	TBZ	$0, R0, two	// count is even: no single leading word
	MOVD.P	8(R2), R5
	MUL	R3, R5, R7
	UMULH	R3, R5, R8
	ADDS	R4, R7
	ADC	$0, R8, R4	// c, z[i] = x[i] * y + r
	MOVD.P	R7, 8(R1)
	SUB	$1, R0
two:
	TBZ	$1, R0, loop
	LDP.P	16(R2), (R5, R6)
	MUL	R3, R5, R10
	UMULH	R3, R5, R11
	ADDS	R4, R10		// word 0: low product + incoming carry word
	MUL	R3, R6, R12
	UMULH	R3, R6, R13
	ADCS	R12, R11	// word 1: high product 0 + low product 1 + carry bit
	ADC	$0, R13, R4	// outgoing carry word
	STP.P	(R10, R11), 16(R1)
	SUB	$2, R0
loop:
	CBZ	R0, done
	LDP.P	32(R2), (R5, R6)
	LDP	-16(R2), (R7, R8)
	MUL	R3, R5, R10
	UMULH	R3, R5, R11
	ADDS	R4, R10		// word 0: low product + incoming carry word
	MUL	R3, R6, R12
	UMULH	R3, R6, R13
	ADCS	R11, R12	// word 1
	MUL	R3, R7, R14
	UMULH	R3, R7, R15
	ADCS	R13, R14	// word 2
	MUL	R3, R8, R16
	UMULH	R3, R8, R17
	ADCS	R15, R16	// word 3
	ADC	$0, R17, R4	// outgoing carry word
	STP.P	(R10, R12), 32(R1)
	STP	(R14, R16), -16(R1)
	SUB	$4, R0
	B	loop
done:
	MOVD	R4, c+64(FP)
	RET

// func addMulVVW(z, x []Word, y Word) (c Word)
//
// addMulVVW computes z += x*y over len(z) words, updating z in place, and
// returns the word carried out of the top in c.
// With len(z) == 0 nothing is written and c is 0.
//
// Register use:
//	R0   number of words still to process
//	R1   cursor into z (read, then written back and advanced)
//	R2   read cursor into x (post-incremented)
//	R3   y
//	R4   carry word between steps, initially 0
TEXT ·addMulVVW(SB),NOSPLIT,$0
	MOVD	z+0(FP), R1
	MOVD	z_len+8(FP), R0
	MOVD	x+24(FP), R2
	MOVD	y+48(FP), R3
	MOVD	$0, R4

	TBZ	$0, R0, two	// count is even: no single leading word

	// One word. This step can only run first, while R4 is still zero,
	// so the incoming carry word does not need to be added.
	MOVD.P	8(R2), R5
	MOVD	(R1), R6

	MUL	R5, R3, R7
	UMULH	R5, R3, R8

	ADDS	R7, R6
	ADC	$0, R8, R4

	MOVD.P	R6, 8(R1)
	SUB	$1, R0

two:
	TBZ	$1, R0, loop

	LDP.P	16(R2), (R5, R10)
	LDP	(R1), (R6, R11)

	MUL	R10, R3, R13
	UMULH	R10, R3, R12

	MUL	R5, R3, R7
	UMULH	R5, R3, R8

	// first pass: add the incoming carry word and the low half of x[1]*y
	ADDS	R4, R6
	ADCS	R13, R11
	ADC	$0, R12

	// second pass: add both halves of x[0]*y
	ADDS	R7, R6
	ADCS	R8, R11
	ADC	$0, R12, R4

	STP.P	(R6, R11), 16(R1)
	SUB	$2, R0

// The main loop of this code operates on a block of 4 words every iteration
// performing [R4:R12:R11:R10:R9] = R4 + R3 * [R8:R7:R6:R5] + [R12:R11:R10:R9]
// where R4 is carried from the previous iteration, R8:R7:R6:R5 hold the next
// 4 words of x, R3 is y and R12:R11:R10:R9 are part of the result z.
//
// The sum is built in two carry chains. The first adds R4 and the low
// product halves of x words 1..3, leaving the top in R20. The second adds
// the low half of x word 0 and the high halves of x words 0..2.
loop:
	CBZ	R0, done

	LDP.P	16(R2), (R5, R6)
	LDP.P	16(R2), (R7, R8)

	LDP	(R1), (R9, R10)
	ADDS	R4, R9
	MUL	R6, R3, R14
	ADCS	R14, R10
	MUL	R7, R3, R15
	LDP	16(R1), (R11, R12)
	ADCS	R15, R11
	MUL	R8, R3, R16
	ADCS	R16, R12
	UMULH	R8, R3, R20
	ADC	$0, R20

	MUL	R5, R3, R13
	ADDS	R13, R9
	UMULH	R5, R3, R17
	ADCS	R17, R10
	UMULH	R6, R3, R21
	STP.P	(R9, R10), 16(R1)
	ADCS	R21, R11
	UMULH	R7, R3, R19
	ADCS	R19, R12
	STP.P	(R11, R12), 16(R1)
	ADC	$0, R20, R4

	SUB	$4, R0
	B	loop

done:
	MOVD	R4, c+56(FP)
	RET