| // Copyright 2013 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| // +build !math_big_pure_go,ppc64 !math_big_pure_go,ppc64le |
| |
| #include "textflag.h" |
| |
| // This file provides fast assembly versions for the elementary |
| // arithmetic operations on vectors implemented in arith.go. |
| |
| // func mulWW(x, y Word) (z1, z0 Word) |
| TEXT ·mulWW(SB), NOSPLIT, $0 |
| MOVD x+0(FP), R4 |
| MOVD y+8(FP), R5 |
| MULHDU R4, R5, R6 |
| MULLD R4, R5, R7 |
| MOVD R6, z1+16(FP) |
| MOVD R7, z0+24(FP) |
| RET |
| |
| // func addVV(z, y, y []Word) (c Word) |
| // z[i] = x[i] + y[i] for all i, carrying |
| TEXT ·addVV(SB), NOSPLIT, $0 |
| MOVD z_len+8(FP), R7 |
| MOVD x+24(FP), R8 |
| MOVD y+48(FP), R9 |
| MOVD z+0(FP), R10 |
| |
| MOVD R0, R4 |
| MOVD R0, R6 // R6 will be the address index |
| ADDC R4, R4 // clear CA |
| MOVD R7, CTR |
| |
| CMP R0, R7 |
| BEQ done |
| |
| loop: |
| MOVD (R8)(R6), R11 // x[i] |
| MOVD (R9)(R6), R12 // y[i] |
| ADDE R12, R11, R15 // x[i] + y[i] + CA |
| MOVD R15, (R10)(R6) // z[i] |
| |
| ADD $8, R6 |
| BC 16, 0, loop // bdnz |
| |
| done: |
| ADDZE R4 |
| MOVD R4, c+72(FP) |
| RET |
| |
| // func subVV(z, x, y []Word) (c Word) |
| // z[i] = x[i] - y[i] for all i, carrying |
| TEXT ·subVV(SB), NOSPLIT, $0 |
| MOVD z_len+8(FP), R7 |
| MOVD x+24(FP), R8 |
| MOVD y+48(FP), R9 |
| MOVD z+0(FP), R10 |
| |
| MOVD R0, R4 // c = 0 |
| MOVD R0, R6 |
| SUBC R0, R0 // clear CA |
| MOVD R7, CTR |
| |
| CMP R0, R7 |
| BEQ sublend |
| |
| // amd64 saves and restores CF, but I believe they only have to do that because all of |
| // their math operations clobber it - we should just be able to recover it at the end. |
| subloop: |
| MOVD (R8)(R6), R11 // x[i] |
| MOVD (R9)(R6), R12 // y[i] |
| |
| SUBE R12, R11, R15 |
| MOVD R15, (R10)(R6) |
| |
| ADD $8, R6 |
| BC 16, 0, subloop // bdnz |
| |
| sublend: |
| |
| ADDZE R4 |
| XOR $1, R4 |
| MOVD R4, c+72(FP) |
| RET |
| |
| TEXT ·addVW(SB), NOSPLIT, $0 |
| BR ·addVW_g(SB) |
| |
| TEXT ·subVW(SB), NOSPLIT, $0 |
| BR ·subVW_g(SB) |
| |
| TEXT ·shlVU(SB), NOSPLIT, $0 |
| BR ·shlVU_g(SB) |
| |
| TEXT ·shrVU(SB), NOSPLIT, $0 |
| BR ·shrVU_g(SB) |
| |
| // func mulAddVWW(z, x []Word, y, r Word) (c Word) |
| TEXT ·mulAddVWW(SB), NOSPLIT, $0 |
| MOVD z+0(FP), R10 // R10 = z[] |
| MOVD x+24(FP), R8 // R8 = x[] |
| MOVD y+48(FP), R9 // R9 = y |
| MOVD r+56(FP), R4 // R4 = r = c |
| MOVD z_len+8(FP), R11 // R11 = z_len |
| |
| MOVD R0, R3 // R3 will be the index register |
| CMP R0, R11 |
| MOVD R11, CTR // Initialize loop counter |
| BEQ done |
| |
| loop: |
| MOVD (R8)(R3), R20 // x[i] |
| MULLD R9, R20, R6 // R6 = z0 = Low-order(x[i]*y) |
| MULHDU R9, R20, R7 // R7 = z1 = High-order(x[i]*y) |
| ADDC R4, R6 // Compute sum for z1 and z0 |
| ADDZE R7 |
| MOVD R6, (R10)(R3) // z[i] |
| MOVD R7, R4 // c |
| ADD $8, R3 |
| BC 16, 0, loop // bdnz |
| |
| done: |
| MOVD R4, c+64(FP) |
| RET |
| |
| // func addMulVVW(z, x []Word, y Word) (c Word) |
| TEXT ·addMulVVW(SB), NOSPLIT, $0 |
| MOVD z+0(FP), R10 // R10 = z[] |
| MOVD x+24(FP), R8 // R8 = x[] |
| MOVD y+48(FP), R9 // R9 = y |
| MOVD z_len+8(FP), R22 // R22 = z_len |
| |
| MOVD R0, R3 // R3 will be the index register |
| CMP R0, R22 |
| MOVD R0, R4 // R4 = c = 0 |
| MOVD R22, CTR // Initialize loop counter |
| BEQ done |
| |
| loop: |
| MOVD (R8)(R3), R20 // Load x[i] |
| MOVD (R10)(R3), R21 // Load z[i] |
| MULLD R9, R20, R6 // R6 = Low-order(x[i]*y) |
| MULHDU R9, R20, R7 // R7 = High-order(x[i]*y) |
| ADDC R21, R6 // R6 = z0 |
| ADDZE R7 // R7 = z1 |
| ADDC R4, R6 // R6 = z0 + c + 0 |
| ADDZE R7, R4 // c += z1 |
| MOVD R6, (R10)(R3) // Store z[i] |
| ADD $8, R3 |
| BC 16, 0, loop // bdnz |
| |
| done: |
| MOVD R4, c+56(FP) |
| RET |
| |
| // func divWW(x1, x0, y Word) (q, r Word) |
| TEXT ·divWW(SB), NOSPLIT, $0 |
| MOVD x1+0(FP), R4 |
| MOVD x0+8(FP), R5 |
| MOVD y+16(FP), R6 |
| |
| CMPU R4, R6 |
| BGE divbigger |
| |
| // from the programmer's note in ch. 3 of the ISA manual, p.74 |
| DIVDEU R6, R4, R3 |
| DIVDU R6, R5, R7 |
| MULLD R6, R3, R8 |
| MULLD R6, R7, R20 |
| SUB R20, R5, R10 |
| ADD R7, R3, R3 |
| SUB R8, R10, R4 |
| CMPU R4, R10 |
| BLT adjust |
| CMPU R4, R6 |
| BLT end |
| |
| adjust: |
| MOVD $1, R21 |
| ADD R21, R3, R3 |
| SUB R6, R4, R4 |
| |
| end: |
| MOVD R3, q+24(FP) |
| MOVD R4, r+32(FP) |
| |
| RET |
| |
| divbigger: |
| MOVD $-1, R7 |
| MOVD R7, q+24(FP) |
| MOVD R7, r+32(FP) |
| RET |
| |
| TEXT ·divWVW(SB), NOSPLIT, $0 |
| BR ·divWVW_g(SB) |