| // Copyright 2016 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| // +build !math_big_pure_go,s390x |
| |
| #include "textflag.h" |
| |
| // This file provides fast assembly versions for the elementary |
| // arithmetic operations on vectors implemented in arith.go. |
| |
| TEXT ·mulWW(SB), NOSPLIT, $0 |
| MOVD x+0(FP), R3 |
| MOVD y+8(FP), R4 |
| MULHDU R3, R4 |
| MOVD R10, z1+16(FP) |
| MOVD R11, z0+24(FP) |
| RET |
| |
| |
| // DI = R3, CX = R4, SI = r10, r8 = r8, r9=r9, r10 = r2 , r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0) + use R11 |
| // func addVV(z, x, y []Word) (c Word) |
| |
| TEXT ·addVV(SB), NOSPLIT, $0 |
| MOVD addvectorfacility+0x00(SB), R1 |
| BR (R1) |
| |
| TEXT ·addVV_check(SB), NOSPLIT, $0 |
| MOVB ·hasVX(SB), R1 |
| CMPBEQ R1, $1, vectorimpl // vectorfacility = 1, vector supported |
| MOVD $addvectorfacility+0x00(SB), R1 |
| MOVD $·addVV_novec(SB), R2 |
| MOVD R2, 0(R1) |
| |
| // MOVD $·addVV_novec(SB), 0(R1) |
| BR ·addVV_novec(SB) |
| |
| vectorimpl: |
| MOVD $addvectorfacility+0x00(SB), R1 |
| MOVD $·addVV_vec(SB), R2 |
| MOVD R2, 0(R1) |
| |
| // MOVD $·addVV_vec(SB), 0(R1) |
| BR ·addVV_vec(SB) |
| |
| GLOBL addvectorfacility+0x00(SB), NOPTR, $8 |
| DATA addvectorfacility+0x00(SB)/8, $·addVV_check(SB) |
| |
| TEXT ·addVV_vec(SB), NOSPLIT, $0 |
| MOVD z_len+8(FP), R3 |
| MOVD x+24(FP), R8 |
| MOVD y+48(FP), R9 |
| MOVD z+0(FP), R2 |
| |
| MOVD $0, R4 // c = 0 |
| MOVD $0, R0 // make sure it's zero |
| MOVD $0, R10 // i = 0 |
| |
| // s/JL/JMP/ below to disable the unrolled loop |
| SUB $4, R3 |
| BLT v1 |
| SUB $12, R3 // n -= 16 |
| BLT A1 // if n < 0 goto A1 |
| |
| MOVD R8, R5 |
| MOVD R9, R6 |
| MOVD R2, R7 |
| |
| // n >= 0 |
| // regular loop body unrolled 16x |
| VZERO V0 // c = 0 |
| |
| UU1: |
| VLM 0(R5), V1, V4 // 64-bytes into V1..V8 |
| ADD $64, R5 |
| VPDI $0x4, V1, V1, V1 // flip the doublewords to big-endian order |
| VPDI $0x4, V2, V2, V2 // flip the doublewords to big-endian order |
| |
| VLM 0(R6), V9, V12 // 64-bytes into V9..V16 |
| ADD $64, R6 |
| VPDI $0x4, V9, V9, V9 // flip the doublewords to big-endian order |
| VPDI $0x4, V10, V10, V10 // flip the doublewords to big-endian order |
| |
| VACCCQ V1, V9, V0, V25 |
| VACQ V1, V9, V0, V17 |
| VACCCQ V2, V10, V25, V26 |
| VACQ V2, V10, V25, V18 |
| |
| VLM 0(R5), V5, V6 // 32-bytes into V1..V8 |
| VLM 0(R6), V13, V14 // 32-bytes into V9..V16 |
| ADD $32, R5 |
| ADD $32, R6 |
| |
| VPDI $0x4, V3, V3, V3 // flip the doublewords to big-endian order |
| VPDI $0x4, V4, V4, V4 // flip the doublewords to big-endian order |
| VPDI $0x4, V11, V11, V11 // flip the doublewords to big-endian order |
| VPDI $0x4, V12, V12, V12 // flip the doublewords to big-endian order |
| |
| VACCCQ V3, V11, V26, V27 |
| VACQ V3, V11, V26, V19 |
| VACCCQ V4, V12, V27, V28 |
| VACQ V4, V12, V27, V20 |
| |
| VLM 0(R5), V7, V8 // 32-bytes into V1..V8 |
| VLM 0(R6), V15, V16 // 32-bytes into V9..V16 |
| ADD $32, R5 |
| ADD $32, R6 |
| |
| VPDI $0x4, V5, V5, V5 // flip the doublewords to big-endian order |
| VPDI $0x4, V6, V6, V6 // flip the doublewords to big-endian order |
| VPDI $0x4, V13, V13, V13 // flip the doublewords to big-endian order |
| VPDI $0x4, V14, V14, V14 // flip the doublewords to big-endian order |
| |
| VACCCQ V5, V13, V28, V29 |
| VACQ V5, V13, V28, V21 |
| VACCCQ V6, V14, V29, V30 |
| VACQ V6, V14, V29, V22 |
| |
| VPDI $0x4, V7, V7, V7 // flip the doublewords to big-endian order |
| VPDI $0x4, V8, V8, V8 // flip the doublewords to big-endian order |
| VPDI $0x4, V15, V15, V15 // flip the doublewords to big-endian order |
| VPDI $0x4, V16, V16, V16 // flip the doublewords to big-endian order |
| |
| VACCCQ V7, V15, V30, V31 |
| VACQ V7, V15, V30, V23 |
| VACCCQ V8, V16, V31, V0 // V0 has carry-over |
| VACQ V8, V16, V31, V24 |
| |
| VPDI $0x4, V17, V17, V17 // flip the doublewords to big-endian order |
| VPDI $0x4, V18, V18, V18 // flip the doublewords to big-endian order |
| VPDI $0x4, V19, V19, V19 // flip the doublewords to big-endian order |
| VPDI $0x4, V20, V20, V20 // flip the doublewords to big-endian order |
| VPDI $0x4, V21, V21, V21 // flip the doublewords to big-endian order |
| VPDI $0x4, V22, V22, V22 // flip the doublewords to big-endian order |
| VPDI $0x4, V23, V23, V23 // flip the doublewords to big-endian order |
| VPDI $0x4, V24, V24, V24 // flip the doublewords to big-endian order |
| VSTM V17, V24, 0(R7) // 128-bytes into z |
| ADD $128, R7 |
| ADD $128, R10 // i += 16 |
| SUB $16, R3 // n -= 16 |
| BGE UU1 // if n >= 0 goto U1 |
| VLGVG $1, V0, R4 // put cf into R4 |
| NEG R4, R4 // save cf |
| |
| A1: |
| ADD $12, R3 // n += 16 |
| |
| // s/JL/JMP/ below to disable the unrolled loop |
| BLT v1 // if n < 0 goto v1 |
| |
| U1: // n >= 0 |
| // regular loop body unrolled 4x |
| MOVD 0(R8)(R10*1), R5 |
| MOVD 8(R8)(R10*1), R6 |
| MOVD 16(R8)(R10*1), R7 |
| MOVD 24(R8)(R10*1), R1 |
| ADDC R4, R4 // restore CF |
| MOVD 0(R9)(R10*1), R11 |
| ADDE R11, R5 |
| MOVD 8(R9)(R10*1), R11 |
| ADDE R11, R6 |
| MOVD 16(R9)(R10*1), R11 |
| ADDE R11, R7 |
| MOVD 24(R9)(R10*1), R11 |
| ADDE R11, R1 |
| MOVD R0, R4 |
| ADDE R4, R4 // save CF |
| NEG R4, R4 |
| MOVD R5, 0(R2)(R10*1) |
| MOVD R6, 8(R2)(R10*1) |
| MOVD R7, 16(R2)(R10*1) |
| MOVD R1, 24(R2)(R10*1) |
| |
| ADD $32, R10 // i += 4 |
| SUB $4, R3 // n -= 4 |
| BGE U1 // if n >= 0 goto U1 |
| |
| v1: |
| ADD $4, R3 // n += 4 |
| BLE E1 // if n <= 0 goto E1 |
| |
| L1: // n > 0 |
| ADDC R4, R4 // restore CF |
| MOVD 0(R8)(R10*1), R5 |
| MOVD 0(R9)(R10*1), R11 |
| ADDE R11, R5 |
| MOVD R5, 0(R2)(R10*1) |
| MOVD R0, R4 |
| ADDE R4, R4 // save CF |
| NEG R4, R4 |
| |
| ADD $8, R10 // i++ |
| SUB $1, R3 // n-- |
| BGT L1 // if n > 0 goto L1 |
| |
| E1: |
| NEG R4, R4 |
| MOVD R4, c+72(FP) // return c |
| RET |
| |
| TEXT ·addVV_novec(SB), NOSPLIT, $0 |
| novec: |
| MOVD z_len+8(FP), R3 |
| MOVD x+24(FP), R8 |
| MOVD y+48(FP), R9 |
| MOVD z+0(FP), R2 |
| |
| MOVD $0, R4 // c = 0 |
| MOVD $0, R0 // make sure it's zero |
| MOVD $0, R10 // i = 0 |
| |
| // s/JL/JMP/ below to disable the unrolled loop |
| SUB $4, R3 // n -= 4 |
| BLT v1n // if n < 0 goto v1n |
| |
| U1n: // n >= 0 |
| // regular loop body unrolled 4x |
| MOVD 0(R8)(R10*1), R5 |
| MOVD 8(R8)(R10*1), R6 |
| MOVD 16(R8)(R10*1), R7 |
| MOVD 24(R8)(R10*1), R1 |
| ADDC R4, R4 // restore CF |
| MOVD 0(R9)(R10*1), R11 |
| ADDE R11, R5 |
| MOVD 8(R9)(R10*1), R11 |
| ADDE R11, R6 |
| MOVD 16(R9)(R10*1), R11 |
| ADDE R11, R7 |
| MOVD 24(R9)(R10*1), R11 |
| ADDE R11, R1 |
| MOVD R0, R4 |
| ADDE R4, R4 // save CF |
| NEG R4, R4 |
| MOVD R5, 0(R2)(R10*1) |
| MOVD R6, 8(R2)(R10*1) |
| MOVD R7, 16(R2)(R10*1) |
| MOVD R1, 24(R2)(R10*1) |
| |
| ADD $32, R10 // i += 4 |
| SUB $4, R3 // n -= 4 |
| BGE U1n // if n >= 0 goto U1n |
| |
| v1n: |
| ADD $4, R3 // n += 4 |
| BLE E1n // if n <= 0 goto E1n |
| |
| L1n: // n > 0 |
| ADDC R4, R4 // restore CF |
| MOVD 0(R8)(R10*1), R5 |
| MOVD 0(R9)(R10*1), R11 |
| ADDE R11, R5 |
| MOVD R5, 0(R2)(R10*1) |
| MOVD R0, R4 |
| ADDE R4, R4 // save CF |
| NEG R4, R4 |
| |
| ADD $8, R10 // i++ |
| SUB $1, R3 // n-- |
| BGT L1n // if n > 0 goto L1n |
| |
| E1n: |
| NEG R4, R4 |
| MOVD R4, c+72(FP) // return c |
| RET |
| |
| TEXT ·subVV(SB), NOSPLIT, $0 |
| MOVD subvectorfacility+0x00(SB), R1 |
| BR (R1) |
| |
| TEXT ·subVV_check(SB), NOSPLIT, $0 |
| MOVB ·hasVX(SB), R1 |
| CMPBEQ R1, $1, vectorimpl // vectorfacility = 1, vector supported |
| MOVD $subvectorfacility+0x00(SB), R1 |
| MOVD $·subVV_novec(SB), R2 |
| MOVD R2, 0(R1) |
| |
| // MOVD $·subVV_novec(SB), 0(R1) |
| BR ·subVV_novec(SB) |
| |
| vectorimpl: |
| MOVD $subvectorfacility+0x00(SB), R1 |
| MOVD $·subVV_vec(SB), R2 |
| MOVD R2, 0(R1) |
| |
| // MOVD $·subVV_vec(SB), 0(R1) |
| BR ·subVV_vec(SB) |
| |
| GLOBL subvectorfacility+0x00(SB), NOPTR, $8 |
| DATA subvectorfacility+0x00(SB)/8, $·subVV_check(SB) |
| |
| // DI = R3, CX = R4, SI = r10, r8 = r8, r9=r9, r10 = r2 , r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0) + use R11 |
| // func subVV(z, x, y []Word) (c Word) |
| // (same as addVV except for SUBC/SUBE instead of ADDC/ADDE and label names) |
| TEXT ·subVV_vec(SB), NOSPLIT, $0 |
| MOVD z_len+8(FP), R3 |
| MOVD x+24(FP), R8 |
| MOVD y+48(FP), R9 |
| MOVD z+0(FP), R2 |
| MOVD $0, R4 // c = 0 |
| MOVD $0, R0 // make sure it's zero |
| MOVD $0, R10 // i = 0 |
| |
| // s/JL/JMP/ below to disable the unrolled loop |
| SUB $4, R3 // n -= 4 |
| BLT v1 // if n < 0 goto v1 |
| SUB $12, R3 // n -= 16 |
| BLT A1 // if n < 0 goto A1 |
| |
| MOVD R8, R5 |
| MOVD R9, R6 |
| MOVD R2, R7 |
| |
| // n >= 0 |
| // regular loop body unrolled 16x |
| VZERO V0 // cf = 0 |
| MOVD $1, R4 // for 390 subtraction cf starts as 1 (no borrow) |
| VLVGG $1, R4, V0 // put carry into V0 |
| |
| UU1: |
| VLM 0(R5), V1, V4 // 64-bytes into V1..V8 |
| ADD $64, R5 |
| VPDI $0x4, V1, V1, V1 // flip the doublewords to big-endian order |
| VPDI $0x4, V2, V2, V2 // flip the doublewords to big-endian order |
| |
| VLM 0(R6), V9, V12 // 64-bytes into V9..V16 |
| ADD $64, R6 |
| VPDI $0x4, V9, V9, V9 // flip the doublewords to big-endian order |
| VPDI $0x4, V10, V10, V10 // flip the doublewords to big-endian order |
| |
| VSBCBIQ V1, V9, V0, V25 |
| VSBIQ V1, V9, V0, V17 |
| VSBCBIQ V2, V10, V25, V26 |
| VSBIQ V2, V10, V25, V18 |
| |
| VLM 0(R5), V5, V6 // 32-bytes into V1..V8 |
| VLM 0(R6), V13, V14 // 32-bytes into V9..V16 |
| ADD $32, R5 |
| ADD $32, R6 |
| |
| VPDI $0x4, V3, V3, V3 // flip the doublewords to big-endian order |
| VPDI $0x4, V4, V4, V4 // flip the doublewords to big-endian order |
| VPDI $0x4, V11, V11, V11 // flip the doublewords to big-endian order |
| VPDI $0x4, V12, V12, V12 // flip the doublewords to big-endian order |
| |
| VSBCBIQ V3, V11, V26, V27 |
| VSBIQ V3, V11, V26, V19 |
| VSBCBIQ V4, V12, V27, V28 |
| VSBIQ V4, V12, V27, V20 |
| |
| VLM 0(R5), V7, V8 // 32-bytes into V1..V8 |
| VLM 0(R6), V15, V16 // 32-bytes into V9..V16 |
| ADD $32, R5 |
| ADD $32, R6 |
| |
| VPDI $0x4, V5, V5, V5 // flip the doublewords to big-endian order |
| VPDI $0x4, V6, V6, V6 // flip the doublewords to big-endian order |
| VPDI $0x4, V13, V13, V13 // flip the doublewords to big-endian order |
| VPDI $0x4, V14, V14, V14 // flip the doublewords to big-endian order |
| |
| VSBCBIQ V5, V13, V28, V29 |
| VSBIQ V5, V13, V28, V21 |
| VSBCBIQ V6, V14, V29, V30 |
| VSBIQ V6, V14, V29, V22 |
| |
| VPDI $0x4, V7, V7, V7 // flip the doublewords to big-endian order |
| VPDI $0x4, V8, V8, V8 // flip the doublewords to big-endian order |
| VPDI $0x4, V15, V15, V15 // flip the doublewords to big-endian order |
| VPDI $0x4, V16, V16, V16 // flip the doublewords to big-endian order |
| |
| VSBCBIQ V7, V15, V30, V31 |
| VSBIQ V7, V15, V30, V23 |
| VSBCBIQ V8, V16, V31, V0 // V0 has carry-over |
| VSBIQ V8, V16, V31, V24 |
| |
| VPDI $0x4, V17, V17, V17 // flip the doublewords to big-endian order |
| VPDI $0x4, V18, V18, V18 // flip the doublewords to big-endian order |
| VPDI $0x4, V19, V19, V19 // flip the doublewords to big-endian order |
| VPDI $0x4, V20, V20, V20 // flip the doublewords to big-endian order |
| VPDI $0x4, V21, V21, V21 // flip the doublewords to big-endian order |
| VPDI $0x4, V22, V22, V22 // flip the doublewords to big-endian order |
| VPDI $0x4, V23, V23, V23 // flip the doublewords to big-endian order |
| VPDI $0x4, V24, V24, V24 // flip the doublewords to big-endian order |
| VSTM V17, V24, 0(R7) // 128-bytes into z |
| ADD $128, R7 |
| ADD $128, R10 // i += 16 |
| SUB $16, R3 // n -= 16 |
| BGE UU1 // if n >= 0 goto U1 |
| VLGVG $1, V0, R4 // put cf into R4 |
| SUB $1, R4 // save cf |
| |
| A1: |
| ADD $12, R3 // n += 16 |
| BLT v1 // if n < 0 goto v1 |
| |
| U1: // n >= 0 |
| // regular loop body unrolled 4x |
| MOVD 0(R8)(R10*1), R5 |
| MOVD 8(R8)(R10*1), R6 |
| MOVD 16(R8)(R10*1), R7 |
| MOVD 24(R8)(R10*1), R1 |
| MOVD R0, R11 |
| SUBC R4, R11 // restore CF |
| MOVD 0(R9)(R10*1), R11 |
| SUBE R11, R5 |
| MOVD 8(R9)(R10*1), R11 |
| SUBE R11, R6 |
| MOVD 16(R9)(R10*1), R11 |
| SUBE R11, R7 |
| MOVD 24(R9)(R10*1), R11 |
| SUBE R11, R1 |
| MOVD R0, R4 |
| SUBE R4, R4 // save CF |
| MOVD R5, 0(R2)(R10*1) |
| MOVD R6, 8(R2)(R10*1) |
| MOVD R7, 16(R2)(R10*1) |
| MOVD R1, 24(R2)(R10*1) |
| |
| ADD $32, R10 // i += 4 |
| SUB $4, R3 // n -= 4 |
| BGE U1 // if n >= 0 goto U1n |
| |
| v1: |
| ADD $4, R3 // n += 4 |
| BLE E1 // if n <= 0 goto E1 |
| |
| L1: // n > 0 |
| MOVD R0, R11 |
| SUBC R4, R11 // restore CF |
| MOVD 0(R8)(R10*1), R5 |
| MOVD 0(R9)(R10*1), R11 |
| SUBE R11, R5 |
| MOVD R5, 0(R2)(R10*1) |
| MOVD R0, R4 |
| SUBE R4, R4 // save CF |
| |
| ADD $8, R10 // i++ |
| SUB $1, R3 // n-- |
| BGT L1 // if n > 0 goto L1n |
| |
| E1: |
| NEG R4, R4 |
| MOVD R4, c+72(FP) // return c |
| RET |
| |
| // DI = R3, CX = R4, SI = r10, r8 = r8, r9=r9, r10 = r2 , r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0) + use R11 |
| // func subVV(z, x, y []Word) (c Word) |
| // (same as addVV except for SUBC/SUBE instead of ADDC/ADDE and label names) |
| TEXT ·subVV_novec(SB), NOSPLIT, $0 |
| MOVD z_len+8(FP), R3 |
| MOVD x+24(FP), R8 |
| MOVD y+48(FP), R9 |
| MOVD z+0(FP), R2 |
| |
| MOVD $0, R4 // c = 0 |
| MOVD $0, R0 // make sure it's zero |
| MOVD $0, R10 // i = 0 |
| |
| // s/JL/JMP/ below to disable the unrolled loop |
| SUB $4, R3 // n -= 4 |
| BLT v1 // if n < 0 goto v1 |
| |
| U1: // n >= 0 |
| // regular loop body unrolled 4x |
| MOVD 0(R8)(R10*1), R5 |
| MOVD 8(R8)(R10*1), R6 |
| MOVD 16(R8)(R10*1), R7 |
| MOVD 24(R8)(R10*1), R1 |
| MOVD R0, R11 |
| SUBC R4, R11 // restore CF |
| MOVD 0(R9)(R10*1), R11 |
| SUBE R11, R5 |
| MOVD 8(R9)(R10*1), R11 |
| SUBE R11, R6 |
| MOVD 16(R9)(R10*1), R11 |
| SUBE R11, R7 |
| MOVD 24(R9)(R10*1), R11 |
| SUBE R11, R1 |
| MOVD R0, R4 |
| SUBE R4, R4 // save CF |
| MOVD R5, 0(R2)(R10*1) |
| MOVD R6, 8(R2)(R10*1) |
| MOVD R7, 16(R2)(R10*1) |
| MOVD R1, 24(R2)(R10*1) |
| |
| ADD $32, R10 // i += 4 |
| SUB $4, R3 // n -= 4 |
| BGE U1 // if n >= 0 goto U1 |
| |
| v1: |
| ADD $4, R3 // n += 4 |
| BLE E1 // if n <= 0 goto E1 |
| |
| L1: // n > 0 |
| MOVD R0, R11 |
| SUBC R4, R11 // restore CF |
| MOVD 0(R8)(R10*1), R5 |
| MOVD 0(R9)(R10*1), R11 |
| SUBE R11, R5 |
| MOVD R5, 0(R2)(R10*1) |
| MOVD R0, R4 |
| SUBE R4, R4 // save CF |
| |
| ADD $8, R10 // i++ |
| SUB $1, R3 // n-- |
| BGT L1 // if n > 0 goto L1 |
| |
| E1: |
| NEG R4, R4 |
| MOVD R4, c+72(FP) // return c |
| RET |
| |
| TEXT ·addVW(SB), NOSPLIT, $0 |
| MOVD z_len+8(FP), R5 // length of z |
| MOVD x+24(FP), R6 |
| MOVD y+48(FP), R7 // c = y |
| MOVD z+0(FP), R8 |
| |
| CMPBEQ R5, $0, returnC // if len(z) == 0, we can have an early return |
| |
| // Add the first two words, and determine which path (copy path or loop path) to take based on the carry flag. |
| ADDC 0(R6), R7 |
| MOVD R7, 0(R8) |
| CMPBEQ R5, $1, returnResult // len(z) == 1 |
| MOVD $0, R9 |
| ADDE 8(R6), R9 |
| MOVD R9, 8(R8) |
| CMPBEQ R5, $2, returnResult // len(z) == 2 |
| |
| // Update the counters |
| MOVD $16, R12 // i = 2 |
| MOVD $-2(R5), R5 // n = n - 2 |
| |
| loopOverEachWord: |
| BRC $12, copySetup // carry = 0, copy the rest |
| MOVD $1, R9 |
| |
| // Originally we used the carry flag generated in the previous iteration |
| // (i.e: ADDE could be used here to do the addition). However, since we |
| // already know carry is 1 (otherwise we will go to copy section), we can use |
| // ADDC here so the current iteration does not depend on the carry flag |
| // generated in the previous iteration. This could be useful when branch prediction happens. |
| ADDC 0(R6)(R12*1), R9 |
| MOVD R9, 0(R8)(R12*1) // z[i] = x[i] + c |
| |
| MOVD $8(R12), R12 // i++ |
| BRCTG R5, loopOverEachWord // n-- |
| |
| // Return the current carry value |
| returnResult: |
| MOVD $0, R0 |
| ADDE R0, R0 |
| MOVD R0, c+56(FP) |
| RET |
| |
| // Update position of x(R6) and z(R8) based on the current counter value and perform copying. |
| // With the assumption that x and z will not overlap with each other or x and z will |
| // point to same memory region, we can use a faster version of copy using only MVC here. |
| // In the following implementation, we have three copy loops, each copying a word, 4 words, and |
| // 32 words at a time. Via benchmarking, this implementation is faster than calling runtime·memmove. |
| copySetup: |
| ADD R12, R6 |
| ADD R12, R8 |
| |
| CMPBGE R5, $4, mediumLoop |
| |
| smallLoop: // does a loop unrolling to copy word when n < 4 |
| CMPBEQ R5, $0, returnZero |
| MVC $8, 0(R6), 0(R8) |
| CMPBEQ R5, $1, returnZero |
| MVC $8, 8(R6), 8(R8) |
| CMPBEQ R5, $2, returnZero |
| MVC $8, 16(R6), 16(R8) |
| |
| returnZero: |
| MOVD $0, c+56(FP) // return 0 as carry |
| RET |
| |
| mediumLoop: |
| CMPBLT R5, $4, smallLoop |
| CMPBLT R5, $32, mediumLoopBody |
| |
| largeLoop: // Copying 256 bytes at a time. |
| MVC $256, 0(R6), 0(R8) |
| MOVD $256(R6), R6 |
| MOVD $256(R8), R8 |
| MOVD $-32(R5), R5 |
| CMPBGE R5, $32, largeLoop |
| BR mediumLoop |
| |
| mediumLoopBody: // Copying 32 bytes at a time |
| MVC $32, 0(R6), 0(R8) |
| MOVD $32(R6), R6 |
| MOVD $32(R8), R8 |
| MOVD $-4(R5), R5 |
| CMPBGE R5, $4, mediumLoopBody |
| BR smallLoop |
| |
| returnC: |
| MOVD R7, c+56(FP) |
| RET |
| |
| TEXT ·subVW(SB), NOSPLIT, $0 |
| MOVD z_len+8(FP), R5 |
| MOVD x+24(FP), R6 |
| MOVD y+48(FP), R7 // The borrow bit passed in |
| MOVD z+0(FP), R8 |
| MOVD $0, R0 // R0 is a temporary variable used during computation. Ensure it has zero in it. |
| |
| CMPBEQ R5, $0, returnC // len(z) == 0, have an early return |
| |
| // Subtract the first two words, and determine which path (copy path or loop path) to take based on the borrow flag |
| MOVD 0(R6), R9 |
| SUBC R7, R9 |
| MOVD R9, 0(R8) |
| CMPBEQ R5, $1, returnResult |
| MOVD 8(R6), R9 |
| SUBE R0, R9 |
| MOVD R9, 8(R8) |
| CMPBEQ R5, $2, returnResult |
| |
| // Update the counters |
| MOVD $16, R12 // i = 2 |
| MOVD $-2(R5), R5 // n = n - 2 |
| |
| loopOverEachWord: |
| BRC $3, copySetup // no borrow, copy the rest |
| MOVD 0(R6)(R12*1), R9 |
| |
| // Originally we used the borrow flag generated in the previous iteration |
| // (i.e: SUBE could be used here to do the subtraction). However, since we |
| // already know borrow is 1 (otherwise we will go to copy section), we can |
| // use SUBC here so the current iteration does not depend on the borrow flag |
| // generated in the previous iteration. This could be useful when branch prediction happens. |
| SUBC $1, R9 |
| MOVD R9, 0(R8)(R12*1) // z[i] = x[i] - 1 |
| |
| MOVD $8(R12), R12 // i++ |
| BRCTG R5, loopOverEachWord // n-- |
| |
| // return the current borrow value |
| returnResult: |
| SUBE R0, R0 |
| NEG R0, R0 |
| MOVD R0, c+56(FP) |
| RET |
| |
| // Update position of x(R6) and z(R8) based on the current counter value and perform copying. |
| // With the assumption that x and z will not overlap with each other or x and z will |
| // point to same memory region, we can use a faster version of copy using only MVC here. |
| // In the following implementation, we have three copy loops, each copying a word, 4 words, and |
| // 32 words at a time. Via benchmarking, this implementation is faster than calling runtime·memmove. |
| copySetup: |
| ADD R12, R6 |
| ADD R12, R8 |
| |
| CMPBGE R5, $4, mediumLoop |
| |
| smallLoop: // does a loop unrolling to copy word when n < 4 |
| CMPBEQ R5, $0, returnZero |
| MVC $8, 0(R6), 0(R8) |
| CMPBEQ R5, $1, returnZero |
| MVC $8, 8(R6), 8(R8) |
| CMPBEQ R5, $2, returnZero |
| MVC $8, 16(R6), 16(R8) |
| |
| returnZero: |
| MOVD $0, c+56(FP) // return 0 as borrow |
| RET |
| |
| mediumLoop: |
| CMPBLT R5, $4, smallLoop |
| CMPBLT R5, $32, mediumLoopBody |
| |
| largeLoop: // Copying 256 bytes at a time |
| MVC $256, 0(R6), 0(R8) |
| MOVD $256(R6), R6 |
| MOVD $256(R8), R8 |
| MOVD $-32(R5), R5 |
| CMPBGE R5, $32, largeLoop |
| BR mediumLoop |
| |
| mediumLoopBody: // Copying 32 bytes at a time |
| MVC $32, 0(R6), 0(R8) |
| MOVD $32(R6), R6 |
| MOVD $32(R8), R8 |
| MOVD $-4(R5), R5 |
| CMPBGE R5, $4, mediumLoopBody |
| BR smallLoop |
| |
| returnC: |
| MOVD R7, c+56(FP) |
| RET |
| |
| // func shlVU(z, x []Word, s uint) (c Word) |
| TEXT ·shlVU(SB), NOSPLIT, $0 |
| MOVD z_len+8(FP), R5 |
| MOVD $0, R0 |
| SUB $1, R5 // n-- |
| BLT X8b // n < 0 (n <= 0) |
| |
| // n > 0 |
| MOVD s+48(FP), R4 |
| CMPBEQ R0, R4, Z80 // handle 0 case beq |
| MOVD $64, R6 |
| CMPBEQ R6, R4, Z864 // handle 64 case beq |
| MOVD z+0(FP), R2 |
| MOVD x+24(FP), R8 |
| SLD $3, R5 // n = n*8 |
| SUB R4, R6, R7 |
| MOVD (R8)(R5*1), R10 // w1 = x[i-1] |
| SRD R7, R10, R3 |
| MOVD R3, c+56(FP) |
| |
| MOVD $0, R1 // i = 0 |
| BR E8 |
| |
| // i < n-1 |
| L8: |
| MOVD R10, R3 // w = w1 |
| MOVD -8(R8)(R5*1), R10 // w1 = x[i+1] |
| |
| SLD R4, R3 // w<<s | w1>>ŝ |
| SRD R7, R10, R6 |
| OR R6, R3 |
| MOVD R3, (R2)(R5*1) // z[i] = w<<s | w1>>ŝ |
| SUB $8, R5 // i-- |
| |
| E8: |
| CMPBGT R5, R0, L8 // i < n-1 |
| |
| // i >= n-1 |
| X8a: |
| SLD R4, R10 // w1<<s |
| MOVD R10, (R2) // z[0] = w1<<s |
| RET |
| |
| X8b: |
| MOVD R0, c+56(FP) |
| RET |
| |
| Z80: |
| MOVD z+0(FP), R2 |
| MOVD x+24(FP), R8 |
| SLD $3, R5 // n = n*8 |
| |
| MOVD (R8), R10 |
| MOVD $0, R3 |
| MOVD R3, c+56(FP) |
| |
| MOVD $0, R1 // i = 0 |
| BR E8Z |
| |
| // i < n-1 |
| L8Z: |
| MOVD R10, R3 |
| MOVD 8(R8)(R1*1), R10 |
| |
| MOVD R3, (R2)(R1*1) |
| ADD $8, R1 |
| |
| E8Z: |
| CMPBLT R1, R5, L8Z |
| |
| // i >= n-1 |
| MOVD R10, (R2)(R5*1) |
| RET |
| |
| Z864: |
| MOVD z+0(FP), R2 |
| MOVD x+24(FP), R8 |
| SLD $3, R5 // n = n*8 |
| MOVD (R8)(R5*1), R3 // w1 = x[n-1] |
| MOVD R3, c+56(FP) // z[i] = x[n-1] |
| |
| BR E864 |
| |
| // i < n-1 |
| L864: |
| MOVD -8(R8)(R5*1), R3 |
| |
| MOVD R3, (R2)(R5*1) // z[i] = x[n-1] |
| SUB $8, R5 // i-- |
| |
| E864: |
| CMPBGT R5, R0, L864 // i < n-1 |
| |
| MOVD R0, (R2) // z[n-1] = 0 |
| RET |
| |
| // CX = R4, r8 = r8, r10 = r2 , r11 = r5, DX = r3, AX = r10 , BX = R1 , 64-count = r7 (R0 set to 0) temp = R6 |
| // func shrVU(z, x []Word, s uint) (c Word) |
| TEXT ·shrVU(SB), NOSPLIT, $0 |
| MOVD z_len+8(FP), R5 |
| MOVD $0, R0 |
| SUB $1, R5 // n-- |
| BLT X9b // n < 0 (n <= 0) |
| |
| // n > 0 |
| MOVD s+48(FP), R4 |
| CMPBEQ R0, R4, ZB0 // handle 0 case beq |
| MOVD $64, R6 |
| CMPBEQ R6, R4, ZB64 // handle 64 case beq |
| MOVD z+0(FP), R2 |
| MOVD x+24(FP), R8 |
| SLD $3, R5 // n = n*8 |
| SUB R4, R6, R7 |
| MOVD (R8), R10 // w1 = x[0] |
| SLD R7, R10, R3 |
| MOVD R3, c+56(FP) |
| |
| MOVD $0, R1 // i = 0 |
| BR E9 |
| |
| // i < n-1 |
| L9: |
| MOVD R10, R3 // w = w1 |
| MOVD 8(R8)(R1*1), R10 // w1 = x[i+1] |
| |
| SRD R4, R3 // w>>s | w1<<s |
| SLD R7, R10, R6 |
| OR R6, R3 |
| MOVD R3, (R2)(R1*1) // z[i] = w>>s | w1<<s |
| ADD $8, R1 // i++ |
| |
| E9: |
| CMPBLT R1, R5, L9 // i < n-1 |
| |
| // i >= n-1 |
| X9a: |
| SRD R4, R10 // w1>>s |
| MOVD R10, (R2)(R5*1) // z[n-1] = w1>>s |
| RET |
| |
| X9b: |
| MOVD R0, c+56(FP) |
| RET |
| |
| ZB0: |
| MOVD z+0(FP), R2 |
| MOVD x+24(FP), R8 |
| SLD $3, R5 // n = n*8 |
| |
| MOVD (R8), R10 // w1 = x[0] |
| MOVD $0, R3 // R10 << 64 |
| MOVD R3, c+56(FP) |
| |
| MOVD $0, R1 // i = 0 |
| BR E9Z |
| |
| // i < n-1 |
| L9Z: |
| MOVD R10, R3 // w = w1 |
| MOVD 8(R8)(R1*1), R10 // w1 = x[i+1] |
| |
| MOVD R3, (R2)(R1*1) // z[i] = w>>s | w1<<s |
| ADD $8, R1 // i++ |
| |
| E9Z: |
| CMPBLT R1, R5, L9Z // i < n-1 |
| |
| // i >= n-1 |
| MOVD R10, (R2)(R5*1) // z[n-1] = w1>>s |
| RET |
| |
| ZB64: |
| MOVD z+0(FP), R2 |
| MOVD x+24(FP), R8 |
| SLD $3, R5 // n = n*8 |
| MOVD (R8), R3 // w1 = x[0] |
| MOVD R3, c+56(FP) |
| |
| MOVD $0, R1 // i = 0 |
| BR E964 |
| |
| // i < n-1 |
| L964: |
| MOVD 8(R8)(R1*1), R3 // w1 = x[i+1] |
| |
| MOVD R3, (R2)(R1*1) // z[i] = w>>s | w1<<s |
| ADD $8, R1 // i++ |
| |
| E964: |
| CMPBLT R1, R5, L964 // i < n-1 |
| |
| // i >= n-1 |
| MOVD $0, R10 // w1>>s |
| MOVD R10, (R2)(R5*1) // z[n-1] = w1>>s |
| RET |
| |
| // CX = R4, r8 = r8, r9=r9, r10 = r2 , r11 = r5, DX = r3, AX = r6 , BX = R1 , (R0 set to 0) + use R11 + use R7 for i |
| // func mulAddVWW(z, x []Word, y, r Word) (c Word) |
| TEXT ·mulAddVWW(SB), NOSPLIT, $0 |
| MOVD z+0(FP), R2 |
| MOVD x+24(FP), R8 |
| MOVD y+48(FP), R9 |
| MOVD r+56(FP), R4 // c = r |
| MOVD z_len+8(FP), R5 |
| MOVD $0, R1 // i = 0 |
| MOVD $0, R7 // i*8 = 0 |
| MOVD $0, R0 // make sure it's zero |
| BR E5 |
| |
| L5: |
| MOVD (R8)(R1*1), R6 |
| MULHDU R9, R6 |
| ADDC R4, R11 // add to low order bits |
| ADDE R0, R6 |
| MOVD R11, (R2)(R1*1) |
| MOVD R6, R4 |
| ADD $8, R1 // i*8 + 8 |
| ADD $1, R7 // i++ |
| |
| E5: |
| CMPBLT R7, R5, L5 // i < n |
| |
| MOVD R4, c+64(FP) |
| RET |
| |
| // func addMulVVW(z, x []Word, y Word) (c Word) |
| // CX = R4, r8 = r8, r9=r9, r10 = r2 , r11 = r5, AX = r11, DX = R6, r12=r12, BX = R1 , (R0 set to 0) + use R11 + use R7 for i |
| TEXT ·addMulVVW(SB), NOSPLIT, $0 |
| MOVD z+0(FP), R2 |
| MOVD x+24(FP), R8 |
| MOVD y+48(FP), R9 |
| MOVD z_len+8(FP), R5 |
| |
| MOVD $0, R1 // i*8 = 0 |
| MOVD $0, R7 // i = 0 |
| MOVD $0, R0 // make sure it's zero |
| MOVD $0, R4 // c = 0 |
| |
| MOVD R5, R12 |
| AND $-2, R12 |
| CMPBGE R5, $2, A6 |
| BR E6 |
| |
| A6: |
| MOVD (R8)(R1*1), R6 |
| MULHDU R9, R6 |
| MOVD (R2)(R1*1), R10 |
| ADDC R10, R11 // add to low order bits |
| ADDE R0, R6 |
| ADDC R4, R11 |
| ADDE R0, R6 |
| MOVD R6, R4 |
| MOVD R11, (R2)(R1*1) |
| |
| MOVD (8)(R8)(R1*1), R6 |
| MULHDU R9, R6 |
| MOVD (8)(R2)(R1*1), R10 |
| ADDC R10, R11 // add to low order bits |
| ADDE R0, R6 |
| ADDC R4, R11 |
| ADDE R0, R6 |
| MOVD R6, R4 |
| MOVD R11, (8)(R2)(R1*1) |
| |
| ADD $16, R1 // i*8 + 8 |
| ADD $2, R7 // i++ |
| |
| CMPBLT R7, R12, A6 |
| BR E6 |
| |
| L6: |
| MOVD (R8)(R1*1), R6 |
| MULHDU R9, R6 |
| MOVD (R2)(R1*1), R10 |
| ADDC R10, R11 // add to low order bits |
| ADDE R0, R6 |
| ADDC R4, R11 |
| ADDE R0, R6 |
| MOVD R6, R4 |
| MOVD R11, (R2)(R1*1) |
| |
| ADD $8, R1 // i*8 + 8 |
| ADD $1, R7 // i++ |
| |
| E6: |
| CMPBLT R7, R5, L6 // i < n |
| |
| MOVD R4, c+56(FP) |
| RET |
| |