|  | // Copyright 2013 The Go Authors. All rights reserved. | 
|  | // Use of this source code is governed by a BSD-style | 
|  | // license that can be found in the LICENSE file. | 
|  |  | 
|  | //go:build !math_big_pure_go && (ppc64 || ppc64le) | 
|  |  | 
|  | #include "textflag.h" | 
|  |  | 
|  | // This file provides fast assembly versions for the elementary | 
|  | // arithmetic operations on vectors implemented in arith.go. | 
|  |  | 
|  | // func addVV(z, x, y []Word) (c Word) | 
|  | // z[i] = x[i] + y[i] for all i, carrying | 
|  | // | 
|  | // Only z_len is read from the frame; x and y are indexed with the same | 
|  | // count (assumes both hold at least len(z) words -- TODO confirm callers). | 
|  | // The carry out of the last word is returned in c; c is 0 when len(z) is 0. | 
|  | // | 
|  | // The carry is chained through the CA bit: ADDC on the first word starts | 
|  | // the chain and every later word uses ADDE. The counter updates, compares, | 
|  | // loads/stores and branches placed between them are expected to leave CA | 
|  | // untouched, so the instruction order here matters. | 
|  | // | 
|  | // Registers: R7 = words left, R8/R9/R10 = cursors into x/y/z, R4 = c. | 
|  | TEXT ·addVV(SB), NOSPLIT, $0 | 
|  | MOVD  z_len+8(FP), R7   // R7 = z_len | 
|  | MOVD  x+24(FP), R8      // R8 = x[] | 
|  | MOVD  y+48(FP), R9      // R9 = y[] | 
|  | MOVD  z+0(FP), R10      // R10 = z[] | 
|  |  | 
|  | // If z_len = 0, we are done. R4 (the result c) is cleared from R0 | 
|  | // before the branch so the early exit returns 0. | 
|  | CMP   R0, R7 | 
|  | MOVD  R0, R4 | 
|  | BEQ   done | 
|  |  | 
|  | // Process the first iteration out of the loop so we can | 
|  | // use MOVDU and avoid 3 index registers updates. | 
|  | MOVD  0(R8), R11      // R11 = x[i] | 
|  | MOVD  0(R9), R12      // R12 = y[i] | 
|  | ADD   $-1, R7         // R7 = z_len - 1 | 
|  | ADDC  R12, R11, R15   // R15 = x[i] + y[i], set CA | 
|  | CMP   R0, R7 | 
|  | MOVD  R15, 0(R10)     // z[i] | 
|  | BEQ   final          // If z_len was 1, we are done | 
|  |  | 
|  | SRD   $2, R7, R5      // R5 = (z_len-1)/4, the number of full 4-word groups left | 
|  | CMP   R0, R5 | 
|  | MOVD  R5, CTR         // Set up loop counter | 
|  | BEQ   tail            // If R5 = 0, we can't use the loop | 
|  |  | 
|  | // Process 4 elements per iteration. Unrolling this loop | 
|  | // means a performance trade-off: we will lose performance | 
|  | // for small values of z_len (0.90x in the worst case), but | 
|  | // gain significant performance as z_len increases (up to | 
|  | // 1.45x). | 
|  |  | 
|  | PCALIGN $16 | 
|  | loop: | 
|  | MOVD  8(R8), R11      // R11 = x[i] | 
|  | MOVD  16(R8), R12     // R12 = x[i+1] | 
|  | MOVD  24(R8), R14     // R14 = x[i+2] | 
|  | MOVDU 32(R8), R15     // R15 = x[i+3], and advance R8 by 32 bytes | 
|  | MOVD  8(R9), R16      // R16 = y[i] | 
|  | MOVD  16(R9), R17     // R17 = y[i+1] | 
|  | MOVD  24(R9), R18     // R18 = y[i+2] | 
|  | MOVDU 32(R9), R19     // R19 = y[i+3], and advance R9 by 32 bytes | 
|  | ADDE  R11, R16, R20   // R20 = x[i] + y[i] + CA | 
|  | ADDE  R12, R17, R21   // R21 = x[i+1] + y[i+1] + CA | 
|  | ADDE  R14, R18, R22   // R22 = x[i+2] + y[i+2] + CA | 
|  | ADDE  R15, R19, R23   // R23 = x[i+3] + y[i+3] + CA | 
|  | MOVD  R20, 8(R10)     // z[i] | 
|  | MOVD  R21, 16(R10)    // z[i+1] | 
|  | MOVD  R22, 24(R10)    // z[i+2] | 
|  | MOVDU R23, 32(R10)    // z[i+3], and advance R10 by 32 bytes | 
|  | ADD   $-4, R7         // R7 -= 4 (words still to process) | 
|  | BC  16, 0, loop       // bdnz: decrement CTR, loop while it is non-zero | 
|  |  | 
|  | // We may have more elements to read | 
|  | CMP   R0, R7 | 
|  | BEQ   final | 
|  |  | 
|  | // Process the remaining elements, one at a time. | 
|  | // Between 1 and 3 words are left here, so the code below is three | 
|  | // straight-line copies of the single-word step; the last copy needs no | 
|  | // count check and no pointer update (plain MOVD instead of MOVDU). | 
|  | tail: | 
|  | MOVDU 8(R8), R11      // R11 = x[i] | 
|  | MOVDU 8(R9), R16      // R16 = y[i] | 
|  | ADD   $-1, R7         // R7 -= 1 | 
|  | ADDE  R11, R16, R20   // R20 = x[i] + y[i] + CA | 
|  | CMP   R0, R7 | 
|  | MOVDU R20, 8(R10)     // z[i] | 
|  | BEQ   final           // If R7 = 0, we are done | 
|  |  | 
|  | MOVDU 8(R8), R11 | 
|  | MOVDU 8(R9), R16 | 
|  | ADD   $-1, R7 | 
|  | ADDE  R11, R16, R20 | 
|  | CMP   R0, R7 | 
|  | MOVDU R20, 8(R10) | 
|  | BEQ   final | 
|  |  | 
|  | MOVD  8(R8), R11 | 
|  | MOVD  8(R9), R16 | 
|  | ADDE  R11, R16, R20 | 
|  | MOVD  R20, 8(R10) | 
|  |  | 
|  | final: | 
|  | ADDZE R4              // Capture CA: R4 = 0 + CA | 
|  |  | 
|  | done: | 
|  | MOVD  R4, c+72(FP) | 
|  | RET | 
|  |  | 
|  | // func subVV(z, x, y []Word) (c Word) | 
|  | // z[i] = x[i] - y[i] for all i, carrying | 
|  | // | 
|  | // Only z_len is read from the frame; x and y are indexed with the same | 
|  | // count (assumes both hold at least len(z) words -- TODO confirm callers). | 
|  | // c is 0 when len(z) is 0. | 
|  | // | 
|  | // The borrow is chained through the CA bit: SUBC on the first word starts | 
|  | // the chain and every later word uses SUBE. The returned c is the final CA | 
|  | // inverted (see final:), so the instructions between the subtractions must | 
|  | // leave CA untouched and the instruction order here matters. | 
|  | // | 
|  | // Registers: R7 = words left, R8/R9/R10 = cursors into x/y/z, R4 = c. | 
|  | TEXT ·subVV(SB), NOSPLIT, $0 | 
|  | MOVD  z_len+8(FP), R7 // R7 = z_len | 
|  | MOVD  x+24(FP), R8    // R8 = x[] | 
|  | MOVD  y+48(FP), R9    // R9 = y[] | 
|  | MOVD  z+0(FP), R10    // R10 = z[] | 
|  |  | 
|  | // If z_len = 0, we are done. R4 (the result c) is cleared from R0 | 
|  | // before the branch so the early exit returns 0. | 
|  | CMP   R0, R7 | 
|  | MOVD  R0, R4 | 
|  | BEQ   done | 
|  |  | 
|  | // Process the first iteration out of the loop so we can | 
|  | // use MOVDU and avoid 3 index registers updates. | 
|  | MOVD  0(R8), R11      // R11 = x[i] | 
|  | MOVD  0(R9), R12      // R12 = y[i] | 
|  | ADD   $-1, R7         // R7 = z_len - 1 | 
|  | SUBC  R12, R11, R15   // R15 = x[i] - y[i], set CA | 
|  | CMP   R0, R7 | 
|  | MOVD  R15, 0(R10)     // z[i] | 
|  | BEQ   final           // If z_len was 1, we are done | 
|  |  | 
|  | SRD   $2, R7, R5      // R5 = (z_len-1)/4, the number of full 4-word groups left | 
|  | CMP   R0, R5 | 
|  | MOVD  R5, CTR         // Set up loop counter | 
|  | BEQ   tail            // If R5 = 0, we can't use the loop | 
|  |  | 
|  | // Process 4 elements per iteration. Unrolling this loop | 
|  | // means a performance trade-off: we will lose performance | 
|  | // for small values of z_len (0.92x in the worst case), but | 
|  | // gain significant performance as z_len increases (up to | 
|  | // 1.45x). | 
|  |  | 
|  | PCALIGN $16 | 
|  | loop: | 
|  | MOVD  8(R8), R11      // R11 = x[i] | 
|  | MOVD  16(R8), R12     // R12 = x[i+1] | 
|  | MOVD  24(R8), R14     // R14 = x[i+2] | 
|  | MOVDU 32(R8), R15     // R15 = x[i+3], and advance R8 by 32 bytes | 
|  | MOVD  8(R9), R16      // R16 = y[i] | 
|  | MOVD  16(R9), R17     // R17 = y[i+1] | 
|  | MOVD  24(R9), R18     // R18 = y[i+2] | 
|  | MOVDU 32(R9), R19     // R19 = y[i+3], and advance R9 by 32 bytes | 
|  | SUBE  R16, R11, R20   // R20 = x[i] - y[i], borrow taken from CA | 
|  | SUBE  R17, R12, R21   // R21 = x[i+1] - y[i+1], borrow taken from CA | 
|  | SUBE  R18, R14, R22   // R22 = x[i+2] - y[i+2], borrow taken from CA | 
|  | SUBE  R19, R15, R23   // R23 = x[i+3] - y[i+3], borrow taken from CA | 
|  | MOVD  R20, 8(R10)     // z[i] | 
|  | MOVD  R21, 16(R10)    // z[i+1] | 
|  | MOVD  R22, 24(R10)    // z[i+2] | 
|  | MOVDU R23, 32(R10)    // z[i+3], and advance R10 by 32 bytes | 
|  | ADD   $-4, R7         // R7 -= 4 (words still to process) | 
|  | BC  16, 0, loop       // bdnz: decrement CTR, loop while it is non-zero | 
|  |  | 
|  | // We may have more elements to read | 
|  | CMP   R0, R7 | 
|  | BEQ   final | 
|  |  | 
|  | // Process the remaining elements, one at a time. | 
|  | // Between 1 and 3 words are left here, so the code below is three | 
|  | // straight-line copies of the single-word step; the last copy needs no | 
|  | // count check and no pointer update (plain MOVD instead of MOVDU). | 
|  | tail: | 
|  | MOVDU 8(R8), R11      // R11 = x[i] | 
|  | MOVDU 8(R9), R16      // R16 = y[i] | 
|  | ADD   $-1, R7         // R7 -= 1 | 
|  | SUBE  R16, R11, R20   // R20 = x[i] - y[i], borrow taken from CA | 
|  | CMP   R0, R7 | 
|  | MOVDU R20, 8(R10)     // z[i] | 
|  | BEQ   final           // If R7 = 0, we are done | 
|  |  | 
|  | MOVDU 8(R8), R11 | 
|  | MOVDU 8(R9), R16 | 
|  | ADD   $-1, R7 | 
|  | SUBE  R16, R11, R20 | 
|  | CMP   R0, R7 | 
|  | MOVDU R20, 8(R10) | 
|  | BEQ   final | 
|  |  | 
|  | MOVD  8(R8), R11 | 
|  | MOVD  8(R9), R16 | 
|  | SUBE  R16, R11, R20 | 
|  | MOVD  R20, 8(R10) | 
|  |  | 
|  | // Capture CA into R4 (R4 = 0 + CA), then flip its low bit: the value | 
|  | // returned in c is the inverse of the final CA. | 
|  | final: | 
|  | ADDZE R4 | 
|  | XOR   $1, R4 | 
|  |  | 
|  | done: | 
|  | MOVD  R4, c+72(FP) | 
|  | RET | 
|  |  | 
|  | // func addVW(z, x []Word, y Word) (c Word) | 
|  | // | 
|  | // Adds the single word y to the vector x, storing len(z) result words in z | 
|  | // and returning the carry out of the last word in c. When len(z) is 0 the | 
|  | // function returns y unchanged (R4 still holds y at done:). | 
|  | // Only z_len is read; x is assumed to hold at least len(z) words | 
|  | // (TODO confirm callers). | 
|  | // | 
|  | // Registers: R11 = words left, R8/R10 = cursors into x/z, R4 = y, then c. | 
|  | TEXT ·addVW(SB), NOSPLIT, $0 | 
|  | MOVD z+0(FP), R10	// R10 = z[] | 
|  | MOVD x+24(FP), R8	// R8 = x[] | 
|  | MOVD y+48(FP), R4	// R4 = y = c | 
|  | MOVD z_len+8(FP), R11	// R11 = z_len | 
|  |  | 
|  | CMP   R0, R11		// If z_len is zero, return | 
|  | BEQ   done | 
|  |  | 
|  | // We will process the first iteration out of the loop so we capture | 
|  | // the value of c. In the subsequent iterations, we will rely on the | 
|  | // value of CA set here. | 
|  | MOVD  0(R8), R20	// R20 = x[i] | 
|  | ADD   $-1, R11		// R11 = z_len - 1 | 
|  | ADDC  R20, R4, R6	// R6 = x[i] + c, set CA | 
|  | CMP   R0, R11		// If z_len was 1, we are done | 
|  | MOVD  R6, 0(R10)	// z[i] | 
|  | BEQ   final | 
|  |  | 
|  | // We will read 4 elements per iteration. | 
|  | // DCBT issues a cache-touch hint for the data at x before the loop. | 
|  | SRD   $2, R11, R9	// R9 = (z_len-1)/4, the number of full 4-word groups left | 
|  | DCBT  (R8) | 
|  | CMP   R0, R9 | 
|  | MOVD  R9, CTR		// Set up the loop counter | 
|  | BEQ   tail		// If R9 = 0, we can't use the loop | 
|  | PCALIGN $16 | 
|  |  | 
|  | // From here on only the carry propagates: each word is x[i] + CA. | 
|  | loop: | 
|  | MOVD  8(R8), R20	// R20 = x[i] | 
|  | MOVD  16(R8), R21	// R21 = x[i+1] | 
|  | MOVD  24(R8), R22	// R22 = x[i+2] | 
|  | MOVDU 32(R8), R23	// R23 = x[i+3], and advance R8 by 32 bytes | 
|  | ADDZE R20, R24		// R24 = x[i] + CA | 
|  | ADDZE R21, R25		// R25 = x[i+1] + CA | 
|  | ADDZE R22, R26		// R26 = x[i+2] + CA | 
|  | ADDZE R23, R27		// R27 = x[i+3] + CA | 
|  | MOVD  R24, 8(R10)	// z[i] | 
|  | MOVD  R25, 16(R10)	// z[i+1] | 
|  | MOVD  R26, 24(R10)	// z[i+2] | 
|  | MOVDU R27, 32(R10)	// z[i+3], and advance R10 by 32 bytes | 
|  | ADD   $-4, R11		// R11 -= 4 (words still to process) | 
|  | BC    16, 0, loop	// bdnz: decrement CTR, loop while it is non-zero | 
|  |  | 
|  | // We may have some elements to read | 
|  | CMP R0, R11 | 
|  | BEQ final | 
|  |  | 
|  | // Between 1 and 3 words are left here; they are handled by three | 
|  | // straight-line copies of the single-word step. The last copy needs | 
|  | // no count check and no pointer update. | 
|  | tail: | 
|  | MOVDU 8(R8), R20 | 
|  | ADDZE R20, R24 | 
|  | ADD $-1, R11 | 
|  | MOVDU R24, 8(R10) | 
|  | CMP R0, R11 | 
|  | BEQ final | 
|  |  | 
|  | MOVDU 8(R8), R20 | 
|  | ADDZE R20, R24 | 
|  | ADD $-1, R11 | 
|  | MOVDU R24, 8(R10) | 
|  | CMP R0, R11 | 
|  | BEQ final | 
|  |  | 
|  | MOVD 8(R8), R20 | 
|  | ADDZE R20, R24 | 
|  | MOVD R24, 8(R10) | 
|  |  | 
|  | final: | 
|  | ADDZE R0, R4		// c = CA | 
|  | done: | 
|  | MOVD  R4, c+56(FP) | 
|  | RET | 
|  |  | 
|  | // func subVW(z, x []Word, y Word) (c Word) | 
|  | // | 
|  | // Subtracts the single word y from the vector x, storing len(z) result | 
|  | // words in z and returning the borrow out of the last word in c. When | 
|  | // len(z) is 0 the function returns y unchanged (R4 still holds y at done:). | 
|  | // Only z_len is read; x is assumed to hold at least len(z) words | 
|  | // (TODO confirm callers). | 
|  | // | 
|  | // Registers: R11 = words left, R8/R10 = cursors into x/z, R4 = y, then c. | 
|  | TEXT ·subVW(SB), NOSPLIT, $0 | 
|  | MOVD  z+0(FP), R10	// R10 = z[] | 
|  | MOVD  x+24(FP), R8	// R8 = x[] | 
|  | MOVD  y+48(FP), R4	// R4 = y = c | 
|  | MOVD  z_len+8(FP), R11	// R11 = z_len | 
|  |  | 
|  | CMP   R0, R11		// If z_len is zero, return | 
|  | BEQ   done | 
|  |  | 
|  | // We will process the first iteration out of the loop so we capture | 
|  | // the value of c. In the subsequent iterations, we will rely on the | 
|  | // value of CA set here. | 
|  | MOVD  0(R8), R20	// R20 = x[i] | 
|  | ADD   $-1, R11		// R11 = z_len - 1 | 
|  | SUBC  R4, R20, R6	// R6 = x[i] - c, set CA | 
|  | CMP   R0, R11		// If z_len was 1, we are done | 
|  | MOVD  R6, 0(R10)	// z[i] | 
|  | BEQ   final | 
|  |  | 
|  | // We will read 4 elements per iteration. | 
|  | // DCBT issues a cache-touch hint for the data at x before the loop. | 
|  | SRD   $2, R11, R9	// R9 = (z_len-1)/4, the number of full 4-word groups left | 
|  | DCBT  (R8) | 
|  | CMP   R0, R9 | 
|  | MOVD  R9, CTR		// Set up the loop counter | 
|  | BEQ   tail		// If R9 = 0, we can't use the loop | 
|  |  | 
|  | // The loop here is almost the same as the one used in s390x, but | 
|  | // we don't need to capture CA every iteration because we've already | 
|  | // done that above. | 
|  | // Each SUBE subtracts R0 (zero) from the word in place, so only the | 
|  | // borrow carried in CA is applied. | 
|  |  | 
|  | PCALIGN $16 | 
|  | loop: | 
|  | MOVD  8(R8), R20 | 
|  | MOVD  16(R8), R21 | 
|  | MOVD  24(R8), R22 | 
|  | MOVDU 32(R8), R23 | 
|  | SUBE  R0, R20 | 
|  | SUBE  R0, R21 | 
|  | SUBE  R0, R22 | 
|  | SUBE  R0, R23 | 
|  | MOVD  R20, 8(R10) | 
|  | MOVD  R21, 16(R10) | 
|  | MOVD  R22, 24(R10) | 
|  | MOVDU R23, 32(R10) | 
|  | ADD   $-4, R11 | 
|  | BC    16, 0, loop	// bdnz: decrement CTR, loop while it is non-zero | 
|  |  | 
|  | // We may have some elements to read | 
|  | CMP   R0, R11 | 
|  | BEQ   final | 
|  |  | 
|  | // Between 1 and 3 words are left here; they are handled by three | 
|  | // straight-line copies of the single-word step. The last copy needs | 
|  | // no count check and no pointer update. | 
|  | tail: | 
|  | MOVDU 8(R8), R20 | 
|  | SUBE  R0, R20 | 
|  | ADD   $-1, R11 | 
|  | MOVDU R20, 8(R10) | 
|  | CMP   R0, R11 | 
|  | BEQ   final | 
|  |  | 
|  | MOVDU 8(R8), R20 | 
|  | SUBE  R0, R20 | 
|  | ADD   $-1, R11 | 
|  | MOVDU R20, 8(R10) | 
|  | CMP   R0, R11 | 
|  | BEQ   final | 
|  |  | 
|  | MOVD  8(R8), R20 | 
|  | SUBE  R0, R20 | 
|  | MOVD  R20, 8(R10) | 
|  |  | 
|  | final: | 
|  | // Capture CA | 
|  | // R4 - R4 with the borrow applied leaves 0 (no borrow) or -1 (borrow); | 
|  | // NEG then turns that into the 0/1 value returned in c. | 
|  | SUBE  R4, R4 | 
|  | NEG   R4, R4 | 
|  |  | 
|  | done: | 
|  | MOVD  R4, c+56(FP) | 
|  | RET | 
|  |  | 
|  | // func shlVU(z, x []Word, s uint) (c Word) | 
|  | // | 
|  | // Shifts the vector x left by s bits into z, walking from the most | 
|  | // significant word (index len(z)-1) down to index 0, and returns | 
|  | // c = x[len(z)-1] >> (64-s), the bits shifted out of the top word. | 
|  | // The caller must ensure s < 64. In the shift path only len(z) is used, | 
|  | // so x is assumed to hold at least len(z) words (TODO confirm callers). | 
|  | // | 
|  | // When s == 0 the function copies min(len(x), len(z)) words from x to z | 
|  | // and returns 0. The copy runs backward when z starts inside the source | 
|  | // range (0 <= z-x < byte length), forward otherwise. A zero-length copy | 
|  | // returns immediately: both copy loops test their bound at the bottom and | 
|  | // would otherwise move one word outside the slices. | 
|  | TEXT ·shlVU(SB), NOSPLIT, $0 | 
|  | MOVD    z+0(FP), R3 | 
|  | MOVD    x+24(FP), R6 | 
|  | MOVD    s+48(FP), R9 | 
|  | MOVD    z_len+8(FP), R4 | 
|  | MOVD    x_len+32(FP), R7 | 
|  | CMP     R9, R0          // s==0 copy(z,x) | 
|  | BEQ     zeroshift | 
|  | CMP     R4, R0          // len(z)==0 return | 
|  | BEQ     done | 
|  |  | 
|  | ADD     $-1, R4, R5     // len(z)-1 | 
|  | SUBC    R9, $64, R4     // ŝ=_W-s, we skip & by _W-1 as the caller ensures s < _W(64) | 
|  | SLD     $3, R5, R7      // byte offset of the last word | 
|  | ADD     R6, R7, R15     // save starting address &x[len(z)-1] | 
|  | ADD     R3, R7, R16     // save starting address &z[len(z)-1] | 
|  | MOVD    (R6)(R7), R14 | 
|  | SRD     R4, R14, R7     // compute x[len(z)-1]>>ŝ into R7 | 
|  | CMP     R5, R0          // iterate from i=len(z)-1 to 0 | 
|  | BEQ     loopexit        // Already at end? | 
|  | MOVD	0(R15),R10	// x[i] | 
|  | PCALIGN $16 | 
|  | shloop: | 
|  | SLD     R9, R10, R10    // x[i]<<s | 
|  | MOVDU   -8(R15), R14    // load x[i-1] and step R15 down one word | 
|  | SRD     R4, R14, R11    // x[i-1]>>ŝ | 
|  | OR      R11, R10, R10 | 
|  | MOVD    R10, 0(R16)     // z[i]=x[i]<<s | x[i-1]>>ŝ | 
|  | MOVD	R14, R10	// reuse x[i-1] for next iteration | 
|  | ADD     $-8, R16        // i-- | 
|  | CMP     R15, R6         // &x[i-1]>&x[0]? | 
|  | BGT     shloop | 
|  | loopexit: | 
|  | MOVD    0(R6), R4 | 
|  | SLD     R9, R4, R4 | 
|  | MOVD    R4, 0(R3)       // z[0]=x[0]<<s | 
|  | MOVD    R7, c+56(FP)    // store pre-computed x[len(z)-1]>>ŝ into c | 
|  | RET | 
|  |  | 
|  | zeroshift: | 
|  | CMP     R6, R0          // x is null, nothing to copy | 
|  | BEQ     done | 
|  | CMP     R6, R3          // if x is same as z, nothing to copy | 
|  | BEQ     done | 
|  | CMP     R7, R4 | 
|  | ISEL    $0, R7, R4, R7  // Take the lower bound of lengths of x,z | 
|  | CMP     R7, R0          // zero words to copy: the bottom-tested loops below must not run | 
|  | BEQ     done | 
|  | SLD     $3, R7, R7      // word count -> byte count | 
|  | SUB     R6, R3, R11     // dest - src | 
|  | CMPU    R11, R7, CR2    // < len? | 
|  | BLT     CR2, backward   // there is overlap, copy backwards | 
|  | MOVD    $0, R14 | 
|  | // shlVU processes backwards, but added a forward copy option | 
|  | // since it's faster on POWER | 
|  | repeat: | 
|  | MOVD    (R6)(R14), R15  // Copy 8 bytes at a time | 
|  | MOVD    R15, (R3)(R14) | 
|  | ADD     $8, R14 | 
|  | CMP     R14, R7         // More 8 bytes left? | 
|  | BLT     repeat | 
|  | BR      done | 
|  | backward: | 
|  | ADD     $-8,R7, R14     // byte offset of the last word to copy | 
|  | repeatback: | 
|  | MOVD    (R6)(R14), R15  // copy x into z backwards | 
|  | MOVD    R15, (R3)(R14)  // copy 8 bytes at a time | 
|  | SUB     $8, R14 | 
|  | CMP     R14, $-8        // More 8 bytes left? | 
|  | BGT     repeatback | 
|  |  | 
|  | done: | 
|  | MOVD    R0, c+56(FP)    // c=0 | 
|  | RET | 
|  |  | 
|  | // func shrVU(z, x []Word, s uint) (c Word) | 
|  | // | 
|  | // Shifts the vector x right by s bits into z, walking from index 0 up to | 
|  | // len(z)-1, and returns c = x[0] << (64-s), the bits shifted out of the | 
|  | // lowest word. The caller must ensure s < 64. In the shift path only | 
|  | // len(z) is used, so x is assumed to hold at least len(z) words | 
|  | // (TODO confirm callers). | 
|  | // | 
|  | // For len(z) >= 3 two result words are produced per iteration with VSX | 
|  | // loads/stores and vector shifts; a single leftover pair and the final | 
|  | // word are handled by scalar code. | 
|  | // | 
|  | // When s == 0 the function copies min(len(x), len(z)) words from x to z, | 
|  | // front to back, and returns 0. A zero-length copy returns immediately: | 
|  | // the copy loop tests its bound at the bottom and would otherwise move one | 
|  | // word outside the slices. | 
|  | TEXT ·shrVU(SB), NOSPLIT, $0 | 
|  | MOVD    z+0(FP), R3 | 
|  | MOVD    x+24(FP), R6 | 
|  | MOVD    s+48(FP), R9 | 
|  | MOVD    z_len+8(FP), R4 | 
|  | MOVD    x_len+32(FP), R7 | 
|  |  | 
|  | CMP     R9, R0          // s==0, copy(z,x) | 
|  | BEQ     zeroshift | 
|  | CMP     R4, R0          // len(z)==0 return | 
|  | BEQ     done | 
|  | SUBC    R9, $64, R5     // ŝ=_W-s, we skip & by _W-1 as the caller ensures s < _W(64) | 
|  |  | 
|  | MOVD    0(R6), R7 | 
|  | SLD     R5, R7, R7      // compute x[0]<<ŝ | 
|  | MOVD    $1, R8          // iterate from i=1 to i<len(z) | 
|  | CMP     R8, R4 | 
|  | BGE     loopexit        // Already at end? | 
|  |  | 
|  | // vectorize if len(z) is >=3, else jump to scalar loop | 
|  | CMP     R4, $3 | 
|  | BLT     scalar | 
|  | // VS38/VS39 are written through their VSX names and read back as | 
|  | // V6/V7; likewise VS32/VS33/VS37 below pair with V0/V1/V5. | 
|  | MTVSRD  R9, VS38        // s | 
|  | VSPLTB  $7, V6, V4      // V4 = shift count s, splatted for VSRD | 
|  | MTVSRD  R5, VS39        // ŝ | 
|  | VSPLTB  $7, V7, V2      // V2 = shift count ŝ, splatted for VSLD | 
|  | ADD     $-2, R4, R16    // R16 = len(z)-2, last i at which a full pair fits | 
|  | PCALIGN $16 | 
|  | loopback: | 
|  | ADD     $-1, R8, R10 | 
|  | SLD     $3, R10         // R10 = byte offset of x[i-1] | 
|  | LXVD2X  (R6)(R10), VS32 // load x[i-1], x[i] | 
|  | SLD     $3, R8, R12     // R12 = byte offset of x[i] | 
|  | LXVD2X  (R6)(R12), VS33 // load x[i], x[i+1] | 
|  |  | 
|  | VSRD    V0, V4, V3      // x[i-1]>>s, x[i]>>s | 
|  | VSLD    V1, V2, V5      // x[i]<<ŝ, x[i+1]<<ŝ | 
|  | VOR     V3, V5, V5      // Or(|) the two registers together | 
|  | STXVD2X VS37, (R3)(R10) // store into z[i-1] and z[i] | 
|  | ADD     $2, R8          // Done processing 2 entries, i and i+1 | 
|  | CMP     R8, R16         // Are there at least a couple of more entries left? | 
|  | BLE     loopback | 
|  | CMP     R8, R4          // Are we at the last element? | 
|  | BEQ     loopexit | 
|  | // One pair is left (i == len(z)-1): compute z[len(z)-2] with scalar code. | 
|  | scalar: | 
|  | ADD     $-1, R8, R10 | 
|  | SLD     $3, R10 | 
|  | MOVD    (R6)(R10),R11 | 
|  | SRD     R9, R11, R11    // x[len(z)-2] >> s | 
|  | SLD     $3, R8, R12 | 
|  | MOVD    (R6)(R12), R12 | 
|  | SLD     R5, R12, R12    // x[len(z)-1]<<ŝ | 
|  | OR      R12, R11, R11   // x[len(z)-2]>>s | x[len(z)-1]<<ŝ | 
|  | MOVD    R11, (R3)(R10)  // z[len(z)-2]=x[len(z)-2]>>s | x[len(z)-1]<<ŝ | 
|  | loopexit: | 
|  | ADD     $-1, R4 | 
|  | SLD     $3, R4          // R4 = byte offset of the last word | 
|  | MOVD    (R6)(R4), R5 | 
|  | SRD     R9, R5, R5      // x[len(z)-1]>>s | 
|  | MOVD    R5, (R3)(R4)    // z[len(z)-1]=x[len(z)-1]>>s | 
|  | MOVD    R7, c+56(FP)    // store pre-computed x[0]<<ŝ into c | 
|  | RET | 
|  |  | 
|  | zeroshift: | 
|  | CMP     R6, R0          // x is null, nothing to copy | 
|  | BEQ     done | 
|  | CMP     R6, R3          // if x is same as z, nothing to copy | 
|  | BEQ     done | 
|  | CMP     R7, R4 | 
|  | ISEL    $0, R7, R4, R7  // Take the lower bounds of lengths of x, z | 
|  | CMP     R7, R0          // zero words to copy: the bottom-tested loop below must not run | 
|  | BEQ     done | 
|  | SLD     $3, R7, R7      // word count -> byte count | 
|  | MOVD    $0, R14 | 
|  | repeat: | 
|  | MOVD    (R6)(R14), R15  // copy 8 bytes at a time | 
|  | MOVD    R15, (R3)(R14)  // shrVU processes bytes only forwards | 
|  | ADD     $8, R14 | 
|  | CMP     R14, R7         // More 8 bytes left? | 
|  | BLT     repeat | 
|  | done: | 
|  | MOVD    R0, c+56(FP) | 
|  | RET | 
|  |  | 
|  | // func mulAddVWW(z, x []Word, y, r Word) (c Word) | 
|  | // | 
|  | // Multiplies the vector x by the single word y and adds r, storing the | 
|  | // low len(z) words in z and returning the final high word in c. For each | 
|  | // word: (hi, lo) = x[i]*y; z[i] = lo + c; c = hi + carry, where c starts | 
|  | // as r. When len(z) is 0 the function returns r unchanged. | 
|  | // Only z_len is read; x is assumed to hold at least len(z) words | 
|  | // (TODO confirm callers). | 
|  | // | 
|  | // Registers: R11 = words left, R8/R10 = cursors into x/z, R9 = y, R4 = c. | 
|  | TEXT ·mulAddVWW(SB), NOSPLIT, $0 | 
|  | MOVD    z+0(FP), R10      // R10 = z[] | 
|  | MOVD    x+24(FP), R8      // R8 = x[] | 
|  | MOVD    y+48(FP), R9      // R9 = y | 
|  | MOVD    r+56(FP), R4      // R4 = r = c | 
|  | MOVD    z_len+8(FP), R11  // R11 = z_len | 
|  |  | 
|  | // If z_len = 0, return c = r | 
|  | CMP     R0, R11 | 
|  | BEQ     done | 
|  |  | 
|  | // The first word is processed outside the loop so that the loop and | 
|  | // tail can use MOVDU with a fixed +8 displacement. | 
|  | MOVD    0(R8), R20 | 
|  | ADD     $-1, R11 | 
|  | MULLD   R9, R20, R6       // R6 = z0 = Low-order(x[i]*y) | 
|  | MULHDU  R9, R20, R7       // R7 = z1 = High-order(x[i]*y) | 
|  | ADDC    R4, R6            // R6 = z0 + r | 
|  | ADDZE   R7                // R7 = z1 + CA | 
|  | CMP     R0, R11 | 
|  | MOVD    R7, R4            // R4 = c | 
|  | MOVD    R6, 0(R10)        // z[i] | 
|  | BEQ     done | 
|  |  | 
|  | // We will read 4 elements per iteration. | 
|  | // DCBT issues a cache-touch hint for the data at x before the loop. | 
|  | SRD     $2, R11, R14      // R14 = (z_len-1)/4, the number of full 4-word groups left | 
|  | DCBT    (R8) | 
|  | CMP     R0, R14 | 
|  | MOVD    R14, CTR          // Set up the loop counter | 
|  | BEQ     tail              // If R14 = 0, we can't use the loop | 
|  | PCALIGN $16 | 
|  |  | 
|  | // Each word's high half (plus carry) becomes the addend for the next | 
|  | // word: R4 -> R20 -> R21 -> R22 -> R23 -> R4. | 
|  | loop: | 
|  | MOVD    8(R8), R20        // R20 = x[i] | 
|  | MOVD    16(R8), R21       // R21 = x[i+1] | 
|  | MOVD    24(R8), R22       // R22 = x[i+2] | 
|  | MOVDU   32(R8), R23       // R23 = x[i+3] | 
|  | MULLD   R9, R20, R24      // R24 = z0[i] | 
|  | MULHDU  R9, R20, R20      // R20 = z1[i] | 
|  | ADDC    R4, R24           // R24 = z0[i] + c | 
|  | ADDZE   R20               // R20 = z1[i] + CA | 
|  | MULLD   R9, R21, R25 | 
|  | MULHDU  R9, R21, R21 | 
|  | ADDC    R20, R25 | 
|  | ADDZE   R21 | 
|  | MULLD   R9, R22, R26 | 
|  | MULHDU  R9, R22, R22 | 
|  | MULLD   R9, R23, R27 | 
|  | MULHDU  R9, R23, R23 | 
|  | ADDC    R21, R26 | 
|  | ADDZE   R22 | 
|  | MOVD    R24, 8(R10)       // z[i] | 
|  | MOVD    R25, 16(R10)      // z[i+1] | 
|  | ADDC    R22, R27 | 
|  | ADDZE   R23,R4		  // update carry: R4 = z1[i+3] + CA | 
|  | MOVD    R26, 24(R10)      // z[i+2] | 
|  | MOVDU   R27, 32(R10)      // z[i+3] | 
|  | ADD     $-4, R11          // R11 -= 4 (words still to process) | 
|  | BC      16, 0, loop       // bdnz: decrement CTR, loop while it is non-zero | 
|  |  | 
|  | // We may have some elements to read | 
|  | CMP   R0, R11 | 
|  | BEQ   done | 
|  |  | 
|  | // Process the remaining elements, one at a time. | 
|  | // Between 1 and 3 words are left here; the last of the three copies | 
|  | // below needs no count check and no pointer update. | 
|  | tail: | 
|  | MOVDU   8(R8), R20        // R20 = x[i] | 
|  | MULLD   R9, R20, R24      // R24 = z0[i] | 
|  | MULHDU  R9, R20, R25      // R25 = z1[i] | 
|  | ADD     $-1, R11          // R11 -= 1 | 
|  | ADDC    R4, R24 | 
|  | ADDZE   R25 | 
|  | MOVDU   R24, 8(R10)       // z[i] | 
|  | CMP     R0, R11 | 
|  | MOVD    R25, R4           // R4 = c | 
|  | BEQ     done              // If R11 = 0, we are done | 
|  |  | 
|  | MOVDU   8(R8), R20 | 
|  | MULLD   R9, R20, R24 | 
|  | MULHDU  R9, R20, R25 | 
|  | ADD     $-1, R11 | 
|  | ADDC    R4, R24 | 
|  | ADDZE   R25 | 
|  | MOVDU   R24, 8(R10) | 
|  | CMP     R0, R11 | 
|  | MOVD    R25, R4 | 
|  | BEQ     done | 
|  |  | 
|  | MOVD    8(R8), R20 | 
|  | MULLD   R9, R20, R24 | 
|  | MULHDU  R9, R20, R25 | 
|  | ADD     $-1, R11 | 
|  | ADDC    R4, R24 | 
|  | ADDZE   R25 | 
|  | MOVD    R24, 8(R10) | 
|  | MOVD    R25, R4 | 
|  |  | 
|  | done: | 
|  | MOVD    R4, c+64(FP) | 
|  | RET | 
|  |  | 
|  | // func addMulVVW(z, x []Word, y Word) (c Word) | 
|  | // | 
|  | // Multiplies the vector x by the single word y and accumulates the | 
|  | // product into z in place. For each word: (z1, z0) = x[i]*y + z[i] + c; | 
|  | // z[i] = z0; c = z1, with c starting at 0. The final c is returned, and | 
|  | // is 0 when len(z) is 0. | 
|  | // Only z_len is read; x is assumed to hold at least len(z) words | 
|  | // (TODO confirm callers). | 
|  | // | 
|  | // Registers: R3 = byte offset of word i, R8/R10 = bases of x/z, R9 = y, | 
|  | // R4 = c. CTR counts the remaining words. | 
|  | TEXT ·addMulVVW(SB), NOSPLIT, $0 | 
|  | MOVD z+0(FP), R10	// R10 = z[] | 
|  | MOVD x+24(FP), R8	// R8 = x[] | 
|  | MOVD y+48(FP), R9	// R9 = y | 
|  | MOVD z_len+8(FP), R22	// R22 = z_len | 
|  |  | 
|  | MOVD R0, R3		// R3 will be the index register | 
|  | CMP  R0, R22 | 
|  | MOVD R0, R4		// R4 = c = 0 | 
|  | MOVD R22, CTR		// Initialize loop counter | 
|  | BEQ  done | 
|  | PCALIGN $16 | 
|  |  | 
|  | loop: | 
|  | MOVD  (R8)(R3), R20	// Load x[i] | 
|  | MOVD  (R10)(R3), R21	// Load z[i] | 
|  | MULLD  R9, R20, R6	// R6 = Low-order(x[i]*y) | 
|  | MULHDU R9, R20, R7	// R7 = High-order(x[i]*y) | 
|  | ADDC   R21, R6		// R6 = z0 = low + z[i], set CA | 
|  | ADDZE  R7		// R7 = z1 = high + CA | 
|  | ADDC   R4, R6		// R6 = z0 + c, set CA | 
|  | ADDZE  R7, R4           // R4 = c = z1 + CA | 
|  | MOVD   R6, (R10)(R3)	// Store z[i] | 
|  | ADD    $8, R3 | 
|  | BC  16, 0, loop		// bdnz: decrement CTR, loop while it is non-zero | 
|  |  | 
|  | done: | 
|  | MOVD R4, c+56(FP) | 
|  | RET | 
|  |  | 
|  |  |