| // Copyright 2013 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| // +build !math_big_pure_go |
| |
| #include "textflag.h" |
| |
| // This file provides fast assembly versions for the elementary |
| // arithmetic operations on vectors implemented in arith.go. |
| |
| // TODO: Consider re-implementing using Advanced SIMD |
| // once the assembler supports those instructions. |
| |
| // func mulWW(x, y Word) (z1, z0 Word) |
| // |
| // mulWW multiplies the two 64-bit words x and y and returns the 128-bit |
| // product split in two: z1 is the high word, z0 the low word. |
| TEXT ·mulWW(SB),NOSPLIT,$0 |
| MOVD y+8(FP), R5 |
| MOVD x+0(FP), R4 |
| UMULH R4, R5, R6 // R6 = upper 64 bits of x*y |
| MUL R4, R5, R7 // R7 = lower 64 bits of x*y |
| MOVD R7, z0+24(FP) |
| MOVD R6, z1+16(FP) |
| RET |
| |
| |
| // func divWW(x1, x0, y Word) (q, r Word) |
| // |
| // No assembly body: control is transferred directly to divWW_g, which is |
| // defined outside this file, with the caller's argument frame untouched. |
| TEXT ·divWW(SB),NOSPLIT,$0 |
| JMP ·divWW_g(SB) // ARM64 has no multiword division |
| |
| |
| // func addVV(z, x, y []Word) (c Word) |
| // |
| // addVV computes z[i] = x[i] + y[i] + carry for i in [0, len(z)), rippling |
| // the carry from each word into the next, and returns the final carry |
| // (0 or 1) as c. Only z's length is loaded; x and y are read for that many |
| // words. |
| // |
| // Registers: R0 = words left, R8 = x cursor, R9 = y cursor, R10 = z cursor. |
| // The carry lives in the C flag from the first ADCS until the final CSET, so |
| // nothing in between may modify the condition flags. |
| TEXT ·addVV(SB),NOSPLIT,$0 |
| MOVD z_len+8(FP), R0 |
| MOVD x+24(FP), R8 |
| MOVD y+48(FP), R9 |
| MOVD z+0(FP), R10 |
| ADDS $0, R0 // clear carry flag (adding zero never carries) |
| TBZ $0, R0, two // even length: no single leading word to handle |
| MOVD.P 8(R8), R11 |
| MOVD.P 8(R9), R15 |
| ADCS R15, R11 |
| MOVD.P R11, 8(R10) |
| SUB $1, R0 |
| two: |
| TBZ $1, R0, loop // remaining length is a multiple of 4: no word pair to handle |
| LDP.P 16(R8), (R11, R12) |
| LDP.P 16(R9), (R15, R16) |
| ADCS R15, R11 |
| ADCS R16, R12 |
| STP.P (R11, R12), 16(R10) |
| SUB $2, R0 |
| loop: |
| CBZ R0, done // careful not to touch the carry flag |
| LDP.P 32(R8), (R11, R12) // words 0,1 of the block; cursor moves past all four |
| LDP -16(R8), (R13, R14) // words 2,3 of the same block |
| LDP.P 32(R9), (R15, R16) |
| LDP -16(R9), (R17, R19) |
| ADCS R15, R11 |
| ADCS R16, R12 |
| ADCS R17, R13 |
| ADCS R19, R14 |
| STP.P (R11, R12), 32(R10) |
| STP (R13, R14), -16(R10) |
| SUB $4, R0 |
| B loop |
| done: |
| CSET HS, R0 // extract carry flag: R0 = 1 if C is set, else 0 |
| MOVD R0, c+72(FP) |
| RET |
| |
| |
| // func subVV(z, x, y []Word) (c Word) |
| // |
| // subVV computes z[i] = x[i] - y[i] - borrow for i in [0, len(z)), rippling |
| // the borrow from each word into the next, and returns the final borrow |
| // (0 or 1) as c. Only z's length is loaded; x and y are read for that many |
| // words. |
| // |
| // Registers: R0 = words left, R8 = x cursor, R9 = y cursor, R10 = z cursor. |
| // The borrow is tracked in the C flag (C set = no borrow, C clear = borrow) |
| // from the first SBCS until the final CSET, so nothing in between may modify |
| // the condition flags. |
| TEXT ·subVV(SB),NOSPLIT,$0 |
| MOVD z_len+8(FP), R0 |
| MOVD x+24(FP), R8 |
| MOVD y+48(FP), R9 |
| MOVD z+0(FP), R10 |
| CMP R0, R0 // set carry flag (equal operands: no borrow pending) |
| TBZ $0, R0, two // even length: no single leading word to handle |
| MOVD.P 8(R8), R11 |
| MOVD.P 8(R9), R15 |
| SBCS R15, R11 |
| MOVD.P R11, 8(R10) |
| SUB $1, R0 |
| two: |
| TBZ $1, R0, loop // remaining length is a multiple of 4: no word pair to handle |
| LDP.P 16(R8), (R11, R12) |
| LDP.P 16(R9), (R15, R16) |
| SBCS R15, R11 |
| SBCS R16, R12 |
| STP.P (R11, R12), 16(R10) |
| SUB $2, R0 |
| loop: |
| CBZ R0, done // careful not to touch the carry flag |
| LDP.P 32(R8), (R11, R12) // words 0,1 of the block; cursor moves past all four |
| LDP -16(R8), (R13, R14) // words 2,3 of the same block |
| LDP.P 32(R9), (R15, R16) |
| LDP -16(R9), (R17, R19) |
| SBCS R15, R11 |
| SBCS R16, R12 |
| SBCS R17, R13 |
| SBCS R19, R14 |
| STP.P (R11, R12), 32(R10) |
| STP (R13, R14), -16(R10) |
| SUB $4, R0 |
| B loop |
| done: |
| CSET LO, R0 // extract borrow: R0 = 1 if C is clear, else 0 |
| MOVD R0, c+72(FP) |
| RET |
| |
| |
| // func addVW(z, x []Word, y Word) (c Word) |
| // |
| // addVW adds the single word y to the vector x: z[0] = x[0] + y, after which |
| // the carry is rippled through the remaining len(z)-1 words of x. The final |
| // carry (0 or 1) is returned as c. When len(z) is 0 nothing is stored and c |
| // is y itself. |
| // |
| // Registers: R0 = words left, R1 = x cursor, R2 = y (reused for c), |
| // R3 = z cursor. The carry lives in the C flag between the ADDS and the |
| // final CSET. |
| TEXT ·addVW(SB),NOSPLIT,$0 |
| MOVD z+0(FP), R3 |
| MOVD z_len+8(FP), R0 |
| MOVD x+24(FP), R1 |
| MOVD y+48(FP), R2 |
| CBZ R0, len0 // the length of z is 0: return y unchanged as c |
| MOVD.P 8(R1), R4 |
| ADDS R2, R4 // z[0] = x[0] + y, set carry |
| MOVD.P R4, 8(R3) |
| SUB $1, R0 |
| CBZ R0, len1 // the length of z is 1 |
| TBZ $0, R0, two |
| MOVD.P 8(R1), R4 // do it once |
| ADCS $0, R4 |
| MOVD.P R4, 8(R3) |
| SUB $1, R0 |
| two: // do it twice |
| TBZ $1, R0, loop |
| LDP.P 16(R1), (R4, R5) |
| ADCS $0, R4, R8 // c, z[i] = x[i] + c |
| ADCS $0, R5, R9 |
| STP.P (R8, R9), 16(R3) |
| SUB $2, R0 |
| loop: // do four times per round |
| CBZ R0, len1 // careful not to touch the carry flag |
| LDP.P 32(R1), (R4, R5) // words 0,1 of the block; cursor moves past all four |
| LDP -16(R1), (R6, R7) // words 2,3 of the same block |
| ADCS $0, R4, R8 |
| ADCS $0, R5, R9 |
| ADCS $0, R6, R10 |
| ADCS $0, R7, R11 |
| STP.P (R8, R9), 32(R3) |
| STP (R10, R11), -16(R3) |
| SUB $4, R0 |
| B loop |
| len1: |
| CSET HS, R2 // extract carry flag: R2 = 1 if C is set, else 0 |
| len0: |
| MOVD R2, c+56(FP) |
| RET |
| |
| // func subVW(z, x []Word, y Word) (c Word) |
| // |
| // subVW subtracts the single word y from the vector x: z[0] = x[0] - y, after |
| // which the borrow is rippled through the remaining len(z)-1 words of x. The |
| // final borrow (0 or 1) is returned as c. When len(z) is 0 nothing is stored |
| // and c is y itself. |
| // |
| // Registers: R0 = words left, R1 = x cursor, R2 = y (reused for c), |
| // R3 = z cursor. The borrow is tracked in the C flag (C clear = borrow) |
| // between the SUBS and the final CSET. |
| TEXT ·subVW(SB),NOSPLIT,$0 |
| MOVD z+0(FP), R3 |
| MOVD z_len+8(FP), R0 |
| MOVD x+24(FP), R1 |
| MOVD y+48(FP), R2 |
| CBZ R0, len0 // the length of z is 0: return y unchanged as c |
| MOVD.P 8(R1), R4 |
| SUBS R2, R4 // z[0] = x[0] - y, set carry |
| MOVD.P R4, 8(R3) |
| SUB $1, R0 |
| CBZ R0, len1 // the length of z is 1 |
| TBZ $0, R0, two // do it once, only if an odd number of words remain |
| MOVD.P 8(R1), R4 |
| SBCS $0, R4 |
| MOVD.P R4, 8(R3) |
| SUB $1, R0 |
| two: // do it twice |
| TBZ $1, R0, loop |
| LDP.P 16(R1), (R4, R5) |
| SBCS $0, R4, R8 // c, z[i] = x[i] - c |
| SBCS $0, R5, R9 |
| STP.P (R8, R9), 16(R3) |
| SUB $2, R0 |
| loop: // do four times per round |
| CBZ R0, len1 // careful not to touch the carry flag |
| LDP.P 32(R1), (R4, R5) // words 0,1 of the block; cursor moves past all four |
| LDP -16(R1), (R6, R7) // words 2,3 of the same block |
| SBCS $0, R4, R8 |
| SBCS $0, R5, R9 |
| SBCS $0, R6, R10 |
| SBCS $0, R7, R11 |
| STP.P (R8, R9), 32(R3) |
| STP (R10, R11), -16(R3) |
| SUB $4, R0 |
| B loop |
| len1: |
| CSET LO, R2 // extract borrow: R2 = 1 if C is clear, else 0 |
| len0: |
| MOVD R2, c+56(FP) |
| RET |
| |
| |
| // func shlVU(z, x []Word, s uint) (c Word) |
| // |
| // shlVU shifts the len(z)-word vector x left by s bits into z and returns in |
| // c the bits shifted out of the most significant word, x[n-1] >> (64 - s). |
| // With len(z) == 0 nothing is stored and c is 0; with s == 0, x is copied to |
| // z and c is 0. |
| // |
| // The words are processed from the most significant down to the least |
| // significant. Every store to z[i] therefore happens after x[i] and all |
| // higher words of x have been loaded, so the result is correct when z and x |
| // overlap with z at the same or a higher address than x (shifting in place |
| // towards the top of one backing array). The opposite overlap, where the low |
| // words of x coincide with the high words of z, is not supported. |
| // NOTE(review): shift counts of 64 or more are not handled specially; |
| // presumably callers guarantee s < 64 — confirm. |
| // |
| // Registers: R0 = z cursor (moving down from &z[n]), R1 = words left, |
| // R2 = x cursor (moving down from &x[n]), R3 = s, R4 = 64 - s, |
| // R5 = return value, R8 = x[i] << s waiting to be merged into z[i]. |
| TEXT ·shlVU(SB),NOSPLIT,$0 |
| MOVD z+0(FP), R0 |
| MOVD z_len+8(FP), R1 |
| MOVD x+24(FP), R2 |
| MOVD s+48(FP), R3 |
| CBZ R1, len0 |
| LSL $3, R1, R4 // R4 = len(z) in bytes |
| ADD R4, R0 // R0 = &z[n] |
| ADD R4, R2 // R2 = &x[n] |
| CBZ R3, copy // if the number of shift is 0, just copy x to z |
| MOVD $64, R4 |
| SUB R3, R4 // R4 = 64 - s |
| |
| // most significant word x[n-1]: its top s bits are the return value |
| MOVD.W -8(R2), R6 |
| LSR R4, R6, R5 // c = x[n-1] >> (64 - s) |
| LSL R3, R6, R8 // upper part of z[n-1] |
| SUB $1, R1 |
| |
| TBZ $0, R1, two |
| MOVD.W -8(R2), R6 |
| LSR R4, R6, R7 |
| ORR R8, R7 // z[i+1] = (x[i+1] << s) | (x[i] >> (64 - s)) |
| LSL R3, R6, R8 |
| SUB $1, R1 |
| MOVD.W R7, -8(R0) |
| two: |
| TBZ $1, R1, loop |
| LDP.W -16(R2), (R6, R7) // R6 = x[i-1], R7 = x[i] |
| LSR R4, R7, R10 |
| ORR R8, R10 // z[i+1] |
| LSL R3, R7 |
| LSR R4, R6, R9 |
| ORR R7, R9 // z[i] |
| LSL R3, R6, R8 |
| SUB $2, R1 |
| STP.W (R9, R10), -16(R0) |
| loop: |
| CBZ R1, done |
| LDP.W -32(R2), (R10, R11) // R10 = x[i-3], R11 = x[i-2] |
| LDP 16(R2), (R12, R13) // R12 = x[i-1], R13 = x[i] |
| LSR R4, R13, R23 |
| ORR R8, R23 // z[i+1] = (x[i+1] << s) | (x[i] >> (64 - s)) |
| LSL R3, R13 |
| LSR R4, R12, R22 |
| ORR R13, R22 // z[i] |
| LSL R3, R12 |
| LSR R4, R11, R21 |
| ORR R12, R21 // z[i-1] |
| LSL R3, R11 |
| LSR R4, R10, R20 |
| ORR R11, R20 // z[i-2] |
| LSL R3, R10, R8 |
| STP.W (R20, R21), -32(R0) // all four loads are done before any store |
| STP (R22, R23), 16(R0) |
| SUB $4, R1 |
| B loop |
| done: |
| MOVD.W R8, -8(R0) // z[0] = x[0] << s, nothing to merge from below |
| MOVD R5, c+56(FP) // the part moved out from the last element |
| RET |
| copy: |
| CMP R0, R2 |
| BEQ len0 // z and x are the same memory: nothing to copy |
| TBZ $0, R1, ctwo |
| MOVD.W -8(R2), R4 |
| MOVD.W R4, -8(R0) |
| SUB $1, R1 |
| ctwo: |
| TBZ $1, R1, cloop |
| LDP.W -16(R2), (R4, R5) |
| STP.W (R4, R5), -16(R0) |
| SUB $2, R1 |
| cloop: |
| CBZ R1, len0 |
| LDP.W -32(R2), (R4, R5) |
| LDP 16(R2), (R6, R7) |
| STP.W (R4, R5), -32(R0) |
| STP (R6, R7), 16(R0) |
| SUB $4, R1 |
| B cloop |
| len0: |
| MOVD $0, c+56(FP) |
| RET |
| |
| |
| // func shrVU(z, x []Word, s uint) (c Word) |
| // |
| // shrVU shifts the len(z)-word vector x right by s bits into z, walking from |
| // the least significant word to the most significant. c receives the bits |
| // shifted out of x[0], left-aligned in the word: x[0] << (64 - s). |
| // With len(z) == 0 nothing is stored and c is 0; with s == 0, x is copied to |
| // z word by word and c is 0. |
| // NOTE(review): shift counts of 64 or more are not handled specially; |
| // presumably callers guarantee s < 64 — confirm. |
| // |
| // Registers: R0 = z cursor, R1 = words left, R2 = x cursor, R3 = s, |
| // R4 = 64 - s, R8 = x[i] >> s waiting to be merged with the low bits of |
| // x[i+1]. |
| TEXT ·shrVU(SB),NOSPLIT,$0 |
| MOVD z+0(FP), R0 |
| MOVD z_len+8(FP), R1 |
| MOVD x+24(FP), R2 |
| MOVD s+48(FP), R3 |
| MOVD $0, R8 // overwritten by the first-element step before it is ever read |
| MOVD $64, R4 |
| SUB R3, R4 // R4 = 64 - s |
| CBZ R1, len0 |
| CBZ R3, copy // if the number of shift is 0, just copy x to z |
| |
| MOVD.P 8(R2), R20 |
| LSR R3, R20, R8 // lower part of z[0] |
| LSL R4, R20 |
| MOVD R20, c+56(FP) // deal with the first element |
| SUB $1, R1 |
| |
| TBZ $0, R1, two |
| MOVD.P 8(R2), R6 |
| LSL R4, R6, R20 |
| ORR R8, R20 // z[i] = (x[i] >> s) | (x[i+1] << (64 - s)) |
| LSR R3, R6, R8 |
| MOVD.P R20, 8(R0) |
| SUB $1, R1 |
| two: |
| TBZ $1, R1, loop |
| LDP.P 16(R2), (R6, R7) |
| LSL R4, R6, R20 |
| LSR R3, R6 |
| ORR R8, R20 |
| LSL R4, R7, R21 |
| LSR R3, R7, R8 |
| ORR R6, R21 |
| STP.P (R20, R21), 16(R0) |
| SUB $2, R1 |
| loop: |
| CBZ R1, done |
| LDP.P 32(R2), (R10, R11) // words 0,1 of the block; cursor moves past all four |
| LDP -16(R2), (R12, R13) // words 2,3 of the same block |
| LSL R4, R10, R20 |
| LSR R3, R10 |
| ORR R8, R20 // z[i] = (x[i] >> s) | (x[i+1] << (64 - s)) |
| LSL R4, R11, R21 |
| LSR R3, R11 |
| ORR R10, R21 |
| LSL R4, R12, R22 |
| LSR R3, R12 |
| ORR R11, R22 |
| LSL R4, R13, R23 |
| LSR R3, R13, R8 |
| ORR R12, R23 |
| STP.P (R20, R21), 32(R0) |
| STP (R22, R23), -16(R0) |
| SUB $4, R1 |
| B loop |
| done: |
| MOVD R8, (R0) // deal with the last element: nothing above it to merge in |
| RET |
| copy: |
| TBZ $0, R1, ctwo |
| MOVD.P 8(R2), R3 |
| MOVD.P R3, 8(R0) |
| SUB $1, R1 |
| ctwo: |
| TBZ $1, R1, cloop |
| LDP.P 16(R2), (R4, R5) |
| STP.P (R4, R5), 16(R0) |
| SUB $2, R1 |
| cloop: |
| CBZ R1, len0 |
| LDP.P 32(R2), (R4, R5) |
| LDP -16(R2), (R6, R7) |
| STP.P (R4, R5), 32(R0) |
| STP (R6, R7), -16(R0) |
| SUB $4, R1 |
| B cloop |
| len0: |
| MOVD $0, c+56(FP) |
| RET |
| |
| |
| // func mulAddVWW(z, x []Word, y, r Word) (c Word) |
| // |
| // mulAddVWW walks x one word at a time. For each i in [0, len(z)) it forms |
| // the 128-bit value x[i]*y + carry, stores the low word in z[i] and keeps the |
| // high word as the carry for the next step. The carry starts out as r and |
| // its final value is returned as c (so c == r when len(z) is 0). |
| // |
| // Registers: R8 = words left, R9 = z cursor, R10 = x cursor, R11 = y, |
| // R12 = running carry. |
| TEXT ·mulAddVWW(SB),NOSPLIT,$0 |
| MOVD z_len+8(FP), R8 |
| MOVD z+0(FP), R9 |
| MOVD x+24(FP), R10 |
| MOVD y+48(FP), R11 |
| MOVD r+56(FP), R12 |
| CBZ R8, finish // empty vector: the carry is just r |
| next: |
| MOVD.P 8(R10), R13 // R13 = x[i] |
| MUL R13, R11, R14 // R14 = low word of x[i]*y |
| UMULH R13, R11, R15 // R15 = high word of x[i]*y |
| ADDS R12, R14 // add the incoming carry to the low word |
| ADC $0, R15, R12 // new carry = high word + overflow of that add |
| MOVD.P R14, 8(R9) // z[i] = low word |
| SUB $1, R8 |
| CBNZ R8, next |
| finish: |
| MOVD R12, c+64(FP) |
| RET |
| |
| |
| // func addMulVVW(z, x []Word, y Word) (c Word) |
| // |
| // addMulVVW computes z[i] += x[i]*y for i in [0, len(z)), carrying the |
| // overflow of each word into the next, and returns the final carry word as |
| // c. The length is consumed as one single word if it is odd, then one word |
| // pair if bit 1 of the remainder is set, then blocks of four words. |
| // |
| // Registers: R0 = words left, R1 = z cursor, R2 = x cursor, R3 = y, |
| // R4 = carry word handed from one step to the next. |
| TEXT ·addMulVVW(SB),NOSPLIT,$0 |
| MOVD z+0(FP), R1 |
| MOVD z_len+8(FP), R0 |
| MOVD x+24(FP), R2 |
| MOVD y+48(FP), R3 |
| MOVD $0, R4 |
| |
| TBZ $0, R0, two |
| |
| MOVD.P 8(R2), R5 |
| MOVD (R1), R6 |
| |
| MUL R5, R3, R7 |
| UMULH R5, R3, R8 |
| |
| ADDS R7, R6 // z[0] += low word of x[0]*y |
| ADC $0, R8, R4 // carry out = high word + overflow of that add |
| |
| MOVD.P R6, 8(R1) |
| SUB $1, R0 |
| |
| two: |
| TBZ $1, R0, loop |
| |
| LDP.P 16(R2), (R5, R10) |
| LDP (R1), (R6, R11) |
| |
| MUL R10, R3, R13 |
| UMULH R10, R3, R12 |
| |
| MUL R5, R3, R7 |
| UMULH R5, R3, R8 |
| |
| // first pass: add the incoming carry and the low half of the upper product |
| ADDS R4, R6 |
| ADCS R13, R11 |
| ADC $0, R12 |
| |
| // second pass: add the lower product; R4 becomes the carry out of the pair |
| ADDS R7, R6 |
| ADCS R8, R11 |
| ADC $0, R12, R4 |
| |
| STP.P (R6, R11), 16(R1) |
| SUB $2, R0 |
| |
| // The main loop of this code operates on a block of 4 words every iteration |
| // performing [R4:R12:R11:R10:R9] = R4 + R3 * [R8:R7:R6:R5] + [R12:R11:R10:R9] |
| // where R4 is carried from the previous iteration, R8:R7:R6:R5 hold the next |
| // 4 words of x, R3 is y and R12:R11:R10:R9 are part of the result z. |
| // The additions run as two carry chains: the first adds the incoming carry |
| // and the low product halves of words 1..3, the second adds the low half of |
| // word 0 and the high halves of words 0..2. Multiplies, loads and stores are |
| // interleaved between them; none of those modify the carry flag. |
| loop: |
| CBZ R0, done |
| |
| LDP.P 16(R2), (R5, R6) |
| LDP.P 16(R2), (R7, R8) |
| |
| LDP (R1), (R9, R10) |
| ADDS R4, R9 // first chain starts: z[0] += carry in |
| MUL R6, R3, R14 |
| ADCS R14, R10 |
| MUL R7, R3, R15 |
| LDP 16(R1), (R11, R12) |
| ADCS R15, R11 |
| MUL R8, R3, R16 |
| ADCS R16, R12 |
| UMULH R8, R3, R20 |
| ADC $0, R20 // R20 = high word of x[3]*y + carry out of the first chain |
| |
| MUL R5, R3, R13 |
| ADDS R13, R9 // second chain starts: z[0] += low word of x[0]*y |
| UMULH R5, R3, R17 |
| ADCS R17, R10 |
| UMULH R6, R3, R21 |
| STP.P (R9, R10), 16(R1) |
| ADCS R21, R11 |
| UMULH R7, R3, R19 |
| ADCS R19, R12 |
| STP.P (R11, R12), 16(R1) |
| ADC $0, R20, R4 // carry into the next block |
| |
| SUB $4, R0 |
| B loop |
| |
| done: |
| MOVD R4, c+56(FP) |
| RET |
| |
| // func divWVW(z []Word, xn Word, x []Word, y Word) (r Word) |
| // |
| // No assembly body: control is transferred directly to divWVW_g, which is |
| // defined outside this file, with the caller's argument frame untouched. |
| TEXT ·divWVW(SB),NOSPLIT,$0 |
| JMP ·divWVW_g(SB) |