| // Copyright 2013 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| // +build !math_big_pure_go |
| |
| #include "textflag.h" |
| |
| // This file provides fast assembly versions for the elementary |
| // arithmetic operations on vectors implemented in arith.go. |
| |
| // TODO: Consider re-implementing using Advanced SIMD |
| // once the assembler supports those instructions. |
| |
| // func mulWW(x, y Word) (z1, z0 Word) |
| // Full 64x64 -> 128 bit unsigned multiply: z1 receives the high word of x*y, |
| // z0 the low word. |
| TEXT ·mulWW(SB),NOSPLIT,$0 |
| MOVD y+8(FP), R5 // R5 = y |
| MOVD x+0(FP), R4 // R4 = x |
| UMULH R4, R5, R6 // R6 = upper 64 bits of x*y |
| MUL R4, R5, R7 // R7 = lower 64 bits of x*y |
| MOVD R7, z0+24(FP) |
| MOVD R6, z1+16(FP) |
| RET |
| |
| |
| // func addVV(z, x, y []Word) (c Word) |
| // Stores x[i] + y[i] + carry into z[i] for every i in [0, len(z)) and returns the |
| // final carry (0 or 1) in c. The carry stays in the C flag from start to finish, so |
| // between two ADCS only instructions that leave the flags alone are used. |
| // The length is consumed as 1 word (if odd), 2 words (if bit 1 is set), then 4 per round. |
| TEXT ·addVV(SB),NOSPLIT,$0 |
| MOVD z+0(FP), R6 // R6 = z cursor |
| MOVD x+24(FP), R4 // R4 = x cursor |
| MOVD y+48(FP), R5 // R5 = y cursor |
| MOVD z_len+8(FP), R3 // R3 = words left |
| ADDS $0, R3 // adding zero never carries: C = 0 |
| TBZ $0, R3, pair // even count: skip the single word |
| MOVD.P 8(R5), R12 |
| MOVD.P 8(R4), R8 |
| ADCS R12, R8 |
| MOVD.P R8, 8(R6) |
| SUB $1, R3 |
| pair: |
| TBZ $1, R3, quad // count is a multiple of 4: skip the pair |
| LDP.P 16(R5), (R12, R13) |
| LDP.P 16(R4), (R8, R9) |
| ADCS R12, R8 |
| ADCS R13, R9 |
| STP.P (R8, R9), 16(R6) |
| SUB $2, R3 |
| quad: |
| CBZ R3, finish // CBZ leaves the carry flag intact |
| LDP.P 32(R5), (R12, R13) |
| LDP -16(R5), (R14, R15) |
| LDP.P 32(R4), (R8, R9) |
| LDP -16(R4), (R10, R11) |
| ADCS R12, R8 |
| ADCS R13, R9 |
| ADCS R14, R10 |
| ADCS R15, R11 |
| STP.P (R8, R9), 32(R6) |
| STP (R10, R11), -16(R6) |
| SUB $4, R3 |
| B quad |
| finish: |
| CSET HS, R3 // R3 = 1 when the C flag is set, else 0 |
| MOVD R3, c+72(FP) |
| RET |
| |
| |
| // func subVV(z, x, y []Word) (c Word) |
| // Stores x[i] - y[i] - borrow into z[i] for every i in [0, len(z)) and returns the |
| // final borrow (0 or 1) in c. The borrow is tracked through the C flag (C clear means |
| // a borrow is pending), so between two SBCS only flag-preserving instructions are used. |
| // The length is consumed as 1 word (if odd), 2 words (if bit 1 is set), then 4 per round. |
| TEXT ·subVV(SB),NOSPLIT,$0 |
| MOVD z+0(FP), R6 // R6 = z cursor |
| MOVD x+24(FP), R4 // R4 = x cursor |
| MOVD y+48(FP), R5 // R5 = y cursor |
| MOVD z_len+8(FP), R3 // R3 = words left |
| CMP R3, R3 // comparing a value with itself sets C: no borrow yet |
| TBZ $0, R3, pair // even count: skip the single word |
| MOVD.P 8(R5), R12 |
| MOVD.P 8(R4), R8 |
| SBCS R12, R8 |
| MOVD.P R8, 8(R6) |
| SUB $1, R3 |
| pair: |
| TBZ $1, R3, quad // count is a multiple of 4: skip the pair |
| LDP.P 16(R5), (R12, R13) |
| LDP.P 16(R4), (R8, R9) |
| SBCS R12, R8 |
| SBCS R13, R9 |
| STP.P (R8, R9), 16(R6) |
| SUB $2, R3 |
| quad: |
| CBZ R3, finish // CBZ leaves the carry flag intact |
| LDP.P 32(R5), (R12, R13) |
| LDP -16(R5), (R14, R15) |
| LDP.P 32(R4), (R8, R9) |
| LDP -16(R4), (R10, R11) |
| SBCS R12, R8 |
| SBCS R13, R9 |
| SBCS R14, R10 |
| SBCS R15, R11 |
| STP.P (R8, R9), 32(R6) |
| STP (R10, R11), -16(R6) |
| SUB $4, R3 |
| B quad |
| finish: |
| CSET LO, R3 // R3 = 1 when the C flag is clear (borrow), else 0 |
| MOVD R3, c+72(FP) |
| RET |
| |
| // vwOneOp handles a single word in addVW/subVW: load the next word through R1 |
| // (post-incremented by 8), apply "instr op1" to it in R4, and store the result |
| // through R3 (post-incremented by 8). R4 is overwritten. |
| #define vwOneOp(instr, op1) \ |
| MOVD.P 8(R1), R4; instr op1, R4; MOVD.P R4, 8(R3); |
| |
| // vwPreIter processes the first 1 to 4 words before the 4-word rounds of addVW/subVW. |
| // The first word is combined with R2 using instr1; each later word is combined with $0 |
| // using instr2. After each of the first three words counter is decremented and control |
| // goes to target as soon as it reaches zero; otherwise a fourth word is processed and |
| // execution falls through. |
| #define vwPreIter(instr1, instr2, counter, target) \ |
| vwOneOp(instr1, R2); SUB $1, counter; CBZ counter, target; \ |
| vwOneOp(instr2, $0); SUB $1, counter; CBZ counter, target; \ |
| vwOneOp(instr2, $0); SUB $1, counter; CBZ counter, target; \ |
| vwOneOp(instr2, $0); |
| |
| // vwOneIter performs one 4-word round of add or sub in addVW/subVW. |
| // It branches to exit when counter is zero; otherwise it loads four words through R1, |
| // applies "instr $0" to them in ascending order, stores the four results through R3 |
| // and subtracts 4 from counter. R1 and R3 advance by 32 bytes; R4-R11 are overwritten. |
| // Careful not to touch the carry flag: apart from instr, only CBZ, LDP, STP and SUB |
| // are used here. |
| #define vwOneIter(instr, counter, exit) \ |
| CBZ counter, exit; \ |
| LDP.P 32(R1), (R4, R5); LDP -16(R1), (R6, R7); \ |
| instr $0, R4, R8; instr $0, R5, R9; \ |
| instr $0, R6, R10; instr $0, R7, R11; \ |
| STP.P (R8, R9), 32(R3); STP (R10, R11), -16(R3); \ |
| SUB $4, counter; |
| |
| // vwOneIterCopy performs one 4-word round of plain copying in addVW/subVW. |
| // It branches to exit when counter is zero; otherwise it loads four words through R1, |
| // stores them unchanged through R3 and subtracts 4 from counter. R1 and R3 advance |
| // by 32 bytes; R4-R7 are overwritten. |
| #define vwOneIterCopy(counter, exit) \ |
| CBZ counter, exit; \ |
| LDP.P 32(R1), (R4, R5); LDP -16(R1), (R6, R7); \ |
| STP.P (R4, R5), 32(R3); STP (R6, R7), -16(R3); \ |
| SUB $4, counter; |
| |
| // func addVW(z, x []Word, y Word) (c Word) |
| // Adds the single word y to the vector x, writes the len(z) result words to z and |
| // returns the final carry in c (for an empty z, c is y itself). |
| // Vectors shorter than 32 words take the straight-line path: one word, then optionally |
| // one more and two more, then rounds of four, with the carry held in the C flag. |
| // Longer vectors take the 'bulk' path, which tests the carry before every round of four |
| // and, once it is clear, only copies the remaining words - or skips even that when z and |
| // x point at the same storage. The test and branch cost about 5% on small vectors, which |
| // is why the short path is kept separate below the threshold of 32. |
| TEXT ·addVW(SB),NOSPLIT,$0 |
| MOVD y+48(FP), R2 // R2 = y |
| MOVD x+24(FP), R1 // R1 = x cursor, advanced by the vw* macros |
| MOVD z_len+8(FP), R0 // R0 = words left |
| MOVD z+0(FP), R3 // R3 = z cursor, advanced by the vw* macros |
| CMP $32, R0 |
| BGE bulk |
| CBZ R0, storeC // empty z: return y unchanged |
| MOVD.P 8(R1), R12 |
| ADDS R2, R12 // z[0] = x[0] + y, starts the carry chain |
| MOVD.P R12, 8(R3) |
| SUB $1, R0 |
| CBZ R0, carryOut // len(z) == 1 |
| TBZ $0, R0, pair |
| MOVD.P 8(R1), R12 // odd remainder: one word |
| ADCS $0, R12 |
| MOVD.P R12, 8(R3) |
| SUB $1, R0 |
| pair: |
| TBZ $1, R0, quad |
| LDP.P 16(R1), (R12, R13) |
| ADCS $0, R12, R14 // z[i] = x[i] + carry |
| ADCS $0, R13, R15 |
| STP.P (R14, R15), 16(R3) |
| SUB $2, R0 |
| quad: |
| vwOneIter(ADCS, R0, carryOut) |
| B quad |
| carryOut: |
| CSET HS, R2 // R2 = 1 when the C flag is set, else 0 |
| storeC: |
| MOVD R2, c+56(FP) |
| out: |
| RET |
| bulk: |
| AND $0x3, R0, R10 // R10 = len(z) mod 4 |
| AND $~0x3, R0 // R0 = len(z) rounded down to a multiple of 4 |
| // Peel off the first 1 to 4 words so that the carry flag does not have to be saved |
| // at every step; R0 is adjusted only when four words were peeled. |
| vwPreIter(ADDS, ADCS, R10, bulk4) |
| SUB $4, R0 |
| bulk4: |
| BCC copyRest // carry clear: the rest of z equals the rest of x |
| vwOneIter(ADCS, R0, carryOut) |
| B bulk4 |
| copyRest: |
| MOVD ZR, c+56(FP) |
| CMP R1, R3 |
| BEQ out // same storage: nothing to copy |
| copy4: |
| vwOneIterCopy(R0, out) |
| B copy4 |
| |
| // func subVW(z, x []Word, y Word) (c Word) |
| // Subtracts the single word y from the vector x, writes the len(z) result words to z |
| // and returns the final borrow in c (for an empty z, c is y itself). |
| // Vectors shorter than 32 words take the straight-line path: one word, then optionally |
| // one more and two more, then rounds of four, with the borrow tracked in the C flag |
| // (C clear means a borrow is pending). |
| // Longer vectors take the 'bulk' path, which tests the flag before every round of four |
| // and, once no borrow is pending, only copies the remaining words - or skips even that |
| // when z and x point at the same storage. The test and branch cost about 5% on small |
| // vectors, which is why the short path is kept separate below the threshold of 32. |
| TEXT ·subVW(SB),NOSPLIT,$0 |
| MOVD y+48(FP), R2 // R2 = y |
| MOVD x+24(FP), R1 // R1 = x cursor, advanced by the vw* macros |
| MOVD z_len+8(FP), R0 // R0 = words left |
| MOVD z+0(FP), R3 // R3 = z cursor, advanced by the vw* macros |
| CMP $32, R0 |
| BGE bulk |
| CBZ R0, storeC // empty z: return y unchanged |
| MOVD.P 8(R1), R12 |
| SUBS R2, R12 // z[0] = x[0] - y, starts the borrow chain |
| MOVD.P R12, 8(R3) |
| SUB $1, R0 |
| CBZ R0, borrowOut // len(z) == 1 |
| TBZ $0, R0, pair |
| MOVD.P 8(R1), R12 // odd remainder: one word |
| SBCS $0, R12 |
| MOVD.P R12, 8(R3) |
| SUB $1, R0 |
| pair: |
| TBZ $1, R0, quad |
| LDP.P 16(R1), (R12, R13) |
| SBCS $0, R12, R14 // z[i] = x[i] - borrow |
| SBCS $0, R13, R15 |
| STP.P (R14, R15), 16(R3) |
| SUB $2, R0 |
| quad: |
| vwOneIter(SBCS, R0, borrowOut) |
| B quad |
| borrowOut: |
| CSET LO, R2 // R2 = 1 when the C flag is clear (borrow), else 0 |
| storeC: |
| MOVD R2, c+56(FP) |
| out: |
| RET |
| bulk: |
| AND $0x3, R0, R10 // R10 = len(z) mod 4 |
| AND $~0x3, R0 // R0 = len(z) rounded down to a multiple of 4 |
| // Peel off the first 1 to 4 words so that the carry flag does not have to be saved |
| // at every step; R0 is adjusted only when four words were peeled. |
| vwPreIter(SUBS, SBCS, R10, bulk4) |
| SUB $4, R0 |
| bulk4: |
| BCS copyRest // C set, no borrow pending: the rest of z equals the rest of x |
| vwOneIter(SBCS, R0, borrowOut) |
| B bulk4 |
| copyRest: |
| MOVD ZR, c+56(FP) |
| CMP R1, R3 |
| BEQ out // same storage: nothing to copy |
| copy4: |
| vwOneIterCopy(R0, out) |
| B copy4 |
| |
| // func shlVU(z, x []Word, s uint) (c Word) |
| // Shifts the vector x left by s bits into z: z[i] = x[i]<<s | x[i-1]>>(64-s), and |
| // returns in c the bits pushed out of the top word, x[n-1]>>(64-s). For an empty z, |
| // or when s is 0 (z becomes a plain copy of x), c is 0. |
| // The words are walked from the most significant one down to x[0], which may give a |
| // wrong result when the low word of x overlaps the high word of z. Anyone calling |
| // this function directly has to watch out for that situation. |
| TEXT ·shlVU(SB),NOSPLIT,$0 |
| MOVD z+0(FP), R0 |
| MOVD z_len+8(FP), R1 // R1 = n = len(z), words left |
| MOVD s+48(FP), R3 // R3 = s |
| MOVD x+24(FP), R2 |
| ADD R1<<3, R2 // R2 = &x[n], moves downwards |
| ADD R1<<3, R0 // R0 = &z[n], moves downwards |
| CBZ R1, empty |
| CBZ R3, plain // s == 0: just copy x to z |
| MOVD $64, R4 |
| SUB R3, R4 // R4 = 64 - s |
| // Most significant word x[n-1]: the bits shifted out are the return value, the |
| // shifted word waits in R8 for the low part supplied by the next word down. |
| MOVD.W -8(R2), R6 |
| LSL R3, R6, R8 // R8 = x[n-1] << s |
| LSR R4, R6, R5 // R5 = c |
| SUB $1, R1 |
| TBZ $0, R1, pair |
| MOVD.W -8(R2), R6 // odd remainder: one word |
| LSR R4, R6, R7 |
| ORR R8, R7 // complete the pending result word |
| LSL R3, R6, R8 // new pending part |
| MOVD.W R7, -8(R0) |
| SUB $1, R1 |
| pair: |
| TBZ $1, R1, quad |
| LDP.W -16(R2), (R6, R7) // R6 = lower word, R7 = upper word |
| LSR R4, R7, R10 |
| ORR R8, R10 |
| LSL R3, R7 |
| LSR R4, R6, R9 |
| ORR R7, R9 |
| LSL R3, R6, R8 |
| STP.W (R9, R10), -16(R0) |
| SUB $2, R1 |
| quad: |
| CBZ R1, last |
| LDP.W -32(R2), (R10, R11) // R10..R13 = four words, lowest first |
| LDP 16(R2), (R12, R13) |
| LSR R4, R13, R23 |
| ORR R8, R23 // z[i] = (x[i] << s) | (x[i-1] >> (64 - s)) |
| LSL R3, R13 |
| LSR R4, R12, R22 |
| ORR R13, R22 |
| LSL R3, R12 |
| LSR R4, R11, R21 |
| ORR R12, R21 |
| LSL R3, R11 |
| LSR R4, R10, R20 |
| ORR R11, R20 |
| LSL R3, R10, R8 |
| STP.W (R20, R21), -32(R0) |
| STP (R22, R23), 16(R0) |
| SUB $4, R1 |
| B quad |
| last: |
| MOVD.W R8, -8(R0) // z[0] = x[0] << s |
| MOVD R5, c+56(FP) // the part moved out from x[n-1] |
| RET |
| plain: |
| CMP R0, R2 |
| BEQ empty // source and destination coincide: nothing to move |
| TBZ $0, R1, plainPair |
| MOVD.W -8(R2), R4 |
| MOVD.W R4, -8(R0) |
| SUB $1, R1 |
| plainPair: |
| TBZ $1, R1, plainQuad |
| LDP.W -16(R2), (R4, R5) |
| STP.W (R4, R5), -16(R0) |
| SUB $2, R1 |
| plainQuad: |
| CBZ R1, empty |
| LDP.W -32(R2), (R4, R5) |
| LDP 16(R2), (R6, R7) |
| STP.W (R4, R5), -32(R0) |
| STP (R6, R7), 16(R0) |
| SUB $4, R1 |
| B plainQuad |
| empty: |
| MOVD $0, c+56(FP) |
| RET |
| |
| // func shrVU(z, x []Word, s uint) (c Word) |
| // Shifts the vector x right by s bits into z: z[i] = x[i]>>s | x[i+1]<<(64-s), and |
| // returns in c the bits pushed out of the bottom word, x[0]<<(64-s). For an empty z, |
| // or when s is 0 (z becomes a plain copy of x), c is 0. |
| // The words are walked from x[0] up to the most significant one, which may give a |
| // wrong result when the high word of x overlaps the low word of z. Anyone calling |
| // this function directly has to watch out for that situation. |
| TEXT ·shrVU(SB),NOSPLIT,$0 |
| MOVD z+0(FP), R0 // R0 = z cursor |
| MOVD x+24(FP), R2 // R2 = x cursor |
| MOVD z_len+8(FP), R1 // R1 = words left |
| MOVD s+48(FP), R3 // R3 = s |
| MOVD $0, R8 |
| MOVD $64, R4 |
| SUB R3, R4 // R4 = 64 - s |
| CBZ R1, empty |
| CBZ R3, plain // s == 0: just copy x to z |
| |
| // Lowest word x[0]: the bits shifted out are the return value, the shifted word |
| // waits in R8 for the high part supplied by the next word up. |
| MOVD.P 8(R2), R20 |
| LSR R3, R20, R8 // R8 = x[0] >> s |
| LSL R4, R20 |
| MOVD R20, c+56(FP) // c = x[0] << (64 - s) |
| SUB $1, R1 |
| |
| TBZ $0, R1, pair |
| MOVD.P 8(R2), R6 // odd remainder: one word |
| LSL R4, R6, R20 |
| ORR R8, R20 // complete the pending result word |
| LSR R3, R6, R8 // new pending part |
| MOVD.P R20, 8(R0) |
| SUB $1, R1 |
| pair: |
| TBZ $1, R1, quad |
| LDP.P 16(R2), (R6, R7) |
| LSL R4, R6, R20 |
| ORR R8, R20 |
| LSR R3, R6 |
| LSL R4, R7, R21 |
| ORR R6, R21 |
| LSR R3, R7, R8 |
| STP.P (R20, R21), 16(R0) |
| SUB $2, R1 |
| quad: |
| CBZ R1, last |
| LDP.P 32(R2), (R10, R11) // R10..R13 = four words, lowest first |
| LDP -16(R2), (R12, R13) |
| LSL R4, R10, R20 |
| LSR R3, R10 |
| ORR R8, R20 // z[i] = (x[i] >> s) | (x[i+1] << (64 - s)) |
| LSL R4, R11, R21 |
| LSR R3, R11 |
| ORR R10, R21 |
| LSL R4, R12, R22 |
| LSR R3, R12 |
| ORR R11, R22 |
| LSL R4, R13, R23 |
| LSR R3, R13, R8 |
| ORR R12, R23 |
| STP.P (R20, R21), 32(R0) |
| STP (R22, R23), -16(R0) |
| SUB $4, R1 |
| B quad |
| last: |
| MOVD R8, (R0) // top word: z[n-1] = x[n-1] >> s |
| RET |
| plain: |
| CMP R0, R2 |
| BEQ empty // source and destination coincide: nothing to move |
| TBZ $0, R1, plainPair |
| MOVD.P 8(R2), R3 |
| MOVD.P R3, 8(R0) |
| SUB $1, R1 |
| plainPair: |
| TBZ $1, R1, plainQuad |
| LDP.P 16(R2), (R4, R5) |
| STP.P (R4, R5), 16(R0) |
| SUB $2, R1 |
| plainQuad: |
| CBZ R1, empty |
| LDP.P 32(R2), (R4, R5) |
| LDP -16(R2), (R6, R7) |
| STP.P (R4, R5), 32(R0) |
| STP (R6, R7), -16(R0) |
| SUB $4, R1 |
| B plainQuad |
| empty: |
| MOVD $0, c+56(FP) |
| RET |
| |
| |
| // func mulAddVWW(z, x []Word, y, r Word) (c Word) |
| // Computes z = x * y + r over len(z) words and returns the final carry word in c. |
| // R4 carries the running high word: it starts as r and, after each group of words, |
| // holds the part of the product that spills into the next word. |
| // The length is consumed as 1 word (if odd), 2 words (if bit 1 is set), then 4 per round. |
| TEXT ·mulAddVWW(SB),NOSPLIT,$0 |
| MOVD r+56(FP), R4 // R4 = r, the incoming carry word |
| MOVD y+48(FP), R3 // R3 = y |
| MOVD x+24(FP), R2 // R2 = x cursor |
| MOVD z_len+8(FP), R0 // R0 = words left |
| MOVD z+0(FP), R1 // R1 = z cursor |
| TBZ $0, R0, pair |
| // one word: c, z[i] = x[i] * y + r |
| MOVD.P 8(R2), R5 |
| UMULH R3, R5, R8 |
| MUL R3, R5, R7 |
| ADDS R4, R7 // low word plus incoming carry |
| ADC $0, R8, R4 // outgoing carry = high word + C |
| MOVD.P R7, 8(R1) |
| SUB $1, R0 |
| pair: |
| TBZ $1, R0, quad |
| LDP.P 16(R2), (R5, R6) |
| UMULH R3, R5, R11 |
| MUL R3, R5, R10 |
| ADDS R4, R10 |
| UMULH R3, R6, R13 |
| MUL R3, R6, R12 |
| ADCS R12, R11 // second result word = high(x[i]*y) + low(x[i+1]*y) + C |
| ADC $0, R13, R4 |
| |
| STP.P (R10, R11), 16(R1) |
| SUB $2, R0 |
| quad: |
| CBZ R0, finish |
| LDP.P 32(R2), (R5, R6) |
| LDP -16(R2), (R7, R8) |
| |
| UMULH R3, R5, R11 |
| MUL R3, R5, R10 |
| ADDS R4, R10 |
| UMULH R3, R6, R13 |
| MUL R3, R6, R12 |
| ADCS R11, R12 |
| |
| UMULH R3, R7, R15 |
| MUL R3, R7, R14 |
| ADCS R13, R14 |
| UMULH R3, R8, R17 |
| MUL R3, R8, R16 |
| ADCS R15, R16 |
| ADC $0, R17, R4 // carry word for the next round |
| |
| STP.P (R10, R12), 32(R1) |
| STP (R14, R16), -16(R1) |
| SUB $4, R0 |
| B quad |
| finish: |
| MOVD R4, c+64(FP) |
| RET |
| |
| |
| // func addMulVVW(z, x []Word, y Word) (c Word) |
| // Computes z += x * y over len(z) words and returns the final carry word in c. |
| // R4 holds the carry word handed from one group of words to the next; it starts at 0. |
| // The length is consumed as 1 word (if odd), 2 words (if bit 1 is set), then 4 per round. |
| TEXT ·addMulVVW(SB),NOSPLIT,$0 |
| MOVD y+48(FP), R3 // R3 = y |
| MOVD x+24(FP), R2 // R2 = x cursor |
| MOVD z_len+8(FP), R0 // R0 = words left |
| MOVD z+0(FP), R1 // R1 = z cursor |
| MOVD $0, R4 // no carry yet |
| |
| TBZ $0, R0, pair |
| |
| // one word: z[0] += x[0] * y |
| MOVD (R1), R6 |
| MOVD.P 8(R2), R5 |
| |
| UMULH R3, R5, R8 |
| MUL R3, R5, R7 |
| |
| ADDS R7, R6 |
| ADC $0, R8, R4 // carry = high word + C |
| |
| MOVD.P R6, 8(R1) |
| SUB $1, R0 |
| |
| pair: |
| TBZ $1, R0, quad |
| |
| LDP (R1), (R6, R11) // R6, R11 = two words of z |
| LDP.P 16(R2), (R5, R10) // R5, R10 = two words of x |
| |
| MUL R3, R5, R7 |
| UMULH R3, R5, R8 |
| |
| MUL R3, R10, R13 |
| UMULH R3, R10, R12 |
| |
| ADDS R4, R6 // first pass: carry-in and the low word of the second product |
| ADCS R13, R11 |
| ADC $0, R12 |
| |
| ADDS R7, R6 // second pass: both words of the first product |
| ADCS R8, R11 |
| ADC $0, R12, R4 |
| |
| STP.P (R6, R11), 16(R1) |
| SUB $2, R0 |
| |
| // The main loop of this code operates on a block of 4 words every iteration |
| // performing [R4:R12:R11:R10:R9] = R4 + R3 * [R8:R7:R6:R5] + [R12:R11:R10:R9] |
| // where R4 is carried from the previous iteration, R8:R7:R6:R5 hold the next |
| // 4 words of x, R3 is y and R12:R11:R10:R9 are part of the result z. |
| // The sum is built in two carry chains: the first adds R4 and the low product words |
| // of x[1..3], the second adds the low product word of x[0] and the high product |
| // words of x[0..2]; R20 collects the high product word of x[3] plus both carries. |
| quad: |
| CBZ R0, finish |
| |
| LDP.P 16(R2), (R5, R6) |
| LDP.P 16(R2), (R7, R8) |
| |
| LDP (R1), (R9, R10) |
| ADDS R4, R9 |
| MUL R3, R6, R14 |
| ADCS R14, R10 |
| MUL R3, R7, R15 |
| LDP 16(R1), (R11, R12) |
| ADCS R15, R11 |
| MUL R3, R8, R16 |
| ADCS R16, R12 |
| UMULH R3, R8, R20 |
| ADC $0, R20 |
| |
| MUL R3, R5, R13 |
| ADDS R13, R9 |
| UMULH R3, R5, R17 |
| ADCS R17, R10 |
| UMULH R3, R6, R21 |
| STP.P (R9, R10), 16(R1) |
| ADCS R21, R11 |
| UMULH R3, R7, R19 |
| ADCS R19, R12 |
| STP.P (R11, R12), 16(R1) |
| ADC $0, R20, R4 // carry word for the next round |
| |
| SUB $4, R0 |
| B quad |
| |
| finish: |
| MOVD R4, c+56(FP) |
| RET |
| |
| |