| // Copyright 2016 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| //go:build !math_big_pure_go |
| |
| #include "textflag.h" |
| |
// addVVvec adds z_len words of x and y, word by word with carry propagation,
// stores the sums into z and returns the final carry (0 or 1) in c.
// Only z_len is read; x and y are assumed to hold at least that many words.
// NOTE(review): the frame offsets used below (z+0, z_len+8, x+24, y+48, c+72)
// fit a Go declaration of the form func(z, x, y []Word) (c Word); the Go-side
// declaration is not visible here, so confirm against it.
//
// Register roles:
//   R2, R8, R9 = base pointers of z, x, y
//   R3         = n, words left to process, kept biased by the unroll factor
//   R10        = i, byte offset into x/y/z (8 bytes per word)
//   R4         = carry saved between iterations: 0 = no carry, -1 = carry
//   R0         = constant zero
//   R5, R6, R7 = running x/y/z pointers in the 16x loop, scratch afterwards
//   R1, R11    = scratch
//   V0         = carry into the next 128-bit add of the 16x loop
//
// Three loops are used: UU1 (16 words per iteration, vector instructions),
// U1 (4 words per iteration) and L1 (1 word per iteration).
| TEXT ·addVVvec(SB), NOSPLIT, $0 |
// Load n = len(z) and the x, y, z base pointers from the argument frame.
| MOVD z_len+8(FP), R3 |
| MOVD x+24(FP), R8 |
| MOVD y+48(FP), R9 |
| MOVD z+0(FP), R2 |
| |
| MOVD $0, R4 // c = 0 |
| MOVD $0, R0 // make sure it's zero |
| MOVD $0, R10 // i = 0 (byte offset) |
| |
| // s/JL/JMP/ below to disable the unrolled loop (the conditional branch used here is BLT, not JL) |
// n -= 4; with fewer than 4 words go straight to the one-word loop.
| SUB $4, R3 |
| BLT v1 |
| SUB $12, R3 // n -= 12 (n is now biased by -16 in total) |
| BLT A1 // if n < 0 (fewer than 16 words) goto A1 |
| |
// Running pointers for the 16x loop; R10 is still advanced alongside them.
| MOVD R8, R5 |
| MOVD R9, R6 |
| MOVD R2, R7 |
| |
| // n >= 0 |
| // regular loop body unrolled 16x |
| VZERO V0 // c = 0 |
| |
| UU1: |
| VLM 0(R5), V1, V4 // load 64 bytes of x into V1..V4 |
| ADD $64, R5 |
| VPDI $0x4, V1, V1, V1 // flip the doublewords to big-endian order |
| VPDI $0x4, V2, V2, V2 // flip the doublewords to big-endian order |
| |
| VLM 0(R6), V9, V12 // load 64 bytes of y into V9..V12 |
| ADD $64, R6 |
| VPDI $0x4, V9, V9, V9 // flip the doublewords to big-endian order |
| VPDI $0x4, V10, V10, V10 // flip the doublewords to big-endian order |
| |
// Each VACCCQ/VACQ pair handles one 128-bit (two-word) chunk of x and y with
// a carry-in: VACCCQ produces the carry-out, VACQ the sum. The carry is
// chained V0 -> V25 -> V26 -> ... -> V31 -> V0; the sums land in V17..V24.
| VACCCQ V1, V9, V0, V25 |
| VACQ V1, V9, V0, V17 |
| VACCCQ V2, V10, V25, V26 |
| VACQ V2, V10, V25, V18 |
| |
| VLM 0(R5), V5, V6 // load next 32 bytes of x into V5..V6 |
| VLM 0(R6), V13, V14 // load next 32 bytes of y into V13..V14 |
| ADD $32, R5 |
| ADD $32, R6 |
| |
| VPDI $0x4, V3, V3, V3 // flip the doublewords to big-endian order |
| VPDI $0x4, V4, V4, V4 // flip the doublewords to big-endian order |
| VPDI $0x4, V11, V11, V11 // flip the doublewords to big-endian order |
| VPDI $0x4, V12, V12, V12 // flip the doublewords to big-endian order |
| |
| VACCCQ V3, V11, V26, V27 |
| VACQ V3, V11, V26, V19 |
| VACCCQ V4, V12, V27, V28 |
| VACQ V4, V12, V27, V20 |
| |
| VLM 0(R5), V7, V8 // load last 32 bytes of x into V7..V8 |
| VLM 0(R6), V15, V16 // load last 32 bytes of y into V15..V16 |
| ADD $32, R5 |
| ADD $32, R6 |
| |
| VPDI $0x4, V5, V5, V5 // flip the doublewords to big-endian order |
| VPDI $0x4, V6, V6, V6 // flip the doublewords to big-endian order |
| VPDI $0x4, V13, V13, V13 // flip the doublewords to big-endian order |
| VPDI $0x4, V14, V14, V14 // flip the doublewords to big-endian order |
| |
| VACCCQ V5, V13, V28, V29 |
| VACQ V5, V13, V28, V21 |
| VACCCQ V6, V14, V29, V30 |
| VACQ V6, V14, V29, V22 |
| |
| VPDI $0x4, V7, V7, V7 // flip the doublewords to big-endian order |
| VPDI $0x4, V8, V8, V8 // flip the doublewords to big-endian order |
| VPDI $0x4, V15, V15, V15 // flip the doublewords to big-endian order |
| VPDI $0x4, V16, V16, V16 // flip the doublewords to big-endian order |
| |
| VACCCQ V7, V15, V30, V31 |
| VACQ V7, V15, V30, V23 |
| VACCCQ V8, V16, V31, V0 // V0 has carry-over for the next iteration |
| VACQ V8, V16, V31, V24 |
| |
| VPDI $0x4, V17, V17, V17 // flip the doublewords back to memory (word) order |
| VPDI $0x4, V18, V18, V18 // flip the doublewords back to memory (word) order |
| VPDI $0x4, V19, V19, V19 // flip the doublewords back to memory (word) order |
| VPDI $0x4, V20, V20, V20 // flip the doublewords back to memory (word) order |
| VPDI $0x4, V21, V21, V21 // flip the doublewords back to memory (word) order |
| VPDI $0x4, V22, V22, V22 // flip the doublewords back to memory (word) order |
| VPDI $0x4, V23, V23, V23 // flip the doublewords back to memory (word) order |
| VPDI $0x4, V24, V24, V24 // flip the doublewords back to memory (word) order |
| VSTM V17, V24, 0(R7) // store 128 bytes (V17..V24) into z |
| ADD $128, R7 |
| ADD $128, R10 // i += 16 words (128 bytes) |
| SUB $16, R3 // n -= 16 |
| BGE UU1 // if n >= 0 goto UU1 |
| VLGVG $1, V0, R4 // put cf (0 or 1) from doubleword 1 of V0 into R4 |
| NEG R4, R4 // save cf as 0 / -1, the form ADDC R4, R4 below expects |
| |
| A1: |
| ADD $12, R3 // n += 12 (n stays biased by -4 for the 4x loop) |
| |
| // s/JL/JMP/ below to disable the unrolled loop (the conditional branch used here is BLT, not JL) |
| BLT v1 // if n < 0 goto v1 |
| |
| U1: // n >= 0 |
| // regular loop body unrolled 4x |
// The carry lives in the condition code from ADDC through the last ADDE;
// only loads (MOVD) are interleaved in between.
| MOVD 0(R8)(R10*1), R5 |
| MOVD 8(R8)(R10*1), R6 |
| MOVD 16(R8)(R10*1), R7 |
| MOVD 24(R8)(R10*1), R1 |
| ADDC R4, R4 // restore CF: -1 + -1 carries out, 0 + 0 does not |
| MOVD 0(R9)(R10*1), R11 |
| ADDE R11, R5 |
| MOVD 8(R9)(R10*1), R11 |
| ADDE R11, R6 |
| MOVD 16(R9)(R10*1), R11 |
| ADDE R11, R7 |
| MOVD 24(R9)(R10*1), R11 |
| ADDE R11, R1 |
// R4 = 0 + 0 + CF, then negated, so the saved carry is again 0 or -1.
| MOVD R0, R4 |
| ADDE R4, R4 // save CF |
| NEG R4, R4 |
| MOVD R5, 0(R2)(R10*1) |
| MOVD R6, 8(R2)(R10*1) |
| MOVD R7, 16(R2)(R10*1) |
| MOVD R1, 24(R2)(R10*1) |
| |
| ADD $32, R10 // i += 4 words (32 bytes) |
| SUB $4, R3 // n -= 4 |
| BGE U1 // if n >= 0 goto U1 |
| |
| v1: |
| ADD $4, R3 // n += 4 (removes the bias: n = words left) |
| BLE E1 // if n <= 0 goto E1 |
| |
| L1: // n > 0 |
| ADDC R4, R4 // restore CF |
| MOVD 0(R8)(R10*1), R5 |
| MOVD 0(R9)(R10*1), R11 |
| ADDE R11, R5 |
| MOVD R5, 0(R2)(R10*1) |
// Save the carry as 0 / -1 for the next iteration or for E1.
| MOVD R0, R4 |
| ADDE R4, R4 // save CF |
| NEG R4, R4 |
| |
| ADD $8, R10 // i++ (one word, 8 bytes) |
| SUB $1, R3 // n-- |
| BGT L1 // if n > 0 goto L1 |
| |
| E1: |
// Turn the saved carry (0 / -1) into the returned value (0 / 1).
| NEG R4, R4 |
| MOVD R4, c+72(FP) // return c |
| RET |
| |
// subVVvec processes z_len words of x and y with a chain of
// subtract-with-borrow instructions, stores the results into z and returns
// the final borrow (0 or 1) in c.
// Only z_len is read; x and y are assumed to hold at least that many words.
// NOTE(review): the frame offsets used below (z+0, z_len+8, x+24, y+48, c+72)
// fit a Go declaration of the form func(z, x, y []Word) (c Word), presumably
// computing z = x - y; the Go-side declaration is not visible here, so
// confirm against it.
//
// Register roles:
//   R2, R8, R9 = base pointers of z, x, y
//   R3         = n, words left to process, kept biased by the unroll factor
//   R10        = i, byte offset into x/y/z (8 bytes per word)
//   R4         = borrow saved between iterations: 0 = no borrow, -1 = borrow
//   R0         = constant zero
//   R5, R6, R7 = running x/y/z pointers in the 16x loop, scratch afterwards
//   R1, R11    = scratch
//   V0         = borrow indication into the next 128-bit subtract of the
//                16x loop (1 = no borrow, 0 = borrow)
//
// Three loops are used: UU1 (16 words per iteration, vector instructions),
// U1 (4 words per iteration) and L1 (1 word per iteration).
| TEXT ·subVVvec(SB), NOSPLIT, $0 |
// Load n = len(z) and the x, y, z base pointers from the argument frame.
| MOVD z_len+8(FP), R3 |
| MOVD x+24(FP), R8 |
| MOVD y+48(FP), R9 |
| MOVD z+0(FP), R2 |
| MOVD $0, R4 // c = 0 |
| MOVD $0, R0 // make sure it's zero |
| MOVD $0, R10 // i = 0 (byte offset) |
| |
| // s/JL/JMP/ below to disable the unrolled loop (the conditional branch used here is BLT, not JL) |
| SUB $4, R3 // n -= 4 |
| BLT v1 // if n < 0 (fewer than 4 words) goto v1 |
| SUB $12, R3 // n -= 12 (n is now biased by -16 in total) |
| BLT A1 // if n < 0 (fewer than 16 words) goto A1 |
| |
// Running pointers for the 16x loop; R10 is still advanced alongside them.
| MOVD R8, R5 |
| MOVD R9, R6 |
| MOVD R2, R7 |
| |
| // n >= 0 |
| // regular loop body unrolled 16x |
| VZERO V0 // cf = 0 |
| MOVD $1, R4 // for 390 subtraction cf starts as 1 (no borrow); R4 is only scratch here and is reloaded after the loop |
| VLVGG $1, R4, V0 // put carry into doubleword 1 of V0 |
| |
| UU1: |
| VLM 0(R5), V1, V4 // load 64 bytes of x into V1..V4 |
| ADD $64, R5 |
| VPDI $0x4, V1, V1, V1 // flip the doublewords to big-endian order |
| VPDI $0x4, V2, V2, V2 // flip the doublewords to big-endian order |
| |
| VLM 0(R6), V9, V12 // load 64 bytes of y into V9..V12 |
| ADD $64, R6 |
| VPDI $0x4, V9, V9, V9 // flip the doublewords to big-endian order |
| VPDI $0x4, V10, V10, V10 // flip the doublewords to big-endian order |
| |
// Each VSBCBIQ/VSBIQ pair handles one 128-bit (two-word) chunk of x and y
// with a borrow indication in: VSBCBIQ produces the borrow indication out,
// VSBIQ the difference. The indication is chained
// V0 -> V25 -> V26 -> ... -> V31 -> V0; the results land in V17..V24.
| VSBCBIQ V1, V9, V0, V25 |
| VSBIQ V1, V9, V0, V17 |
| VSBCBIQ V2, V10, V25, V26 |
| VSBIQ V2, V10, V25, V18 |
| |
| VLM 0(R5), V5, V6 // load next 32 bytes of x into V5..V6 |
| VLM 0(R6), V13, V14 // load next 32 bytes of y into V13..V14 |
| ADD $32, R5 |
| ADD $32, R6 |
| |
| VPDI $0x4, V3, V3, V3 // flip the doublewords to big-endian order |
| VPDI $0x4, V4, V4, V4 // flip the doublewords to big-endian order |
| VPDI $0x4, V11, V11, V11 // flip the doublewords to big-endian order |
| VPDI $0x4, V12, V12, V12 // flip the doublewords to big-endian order |
| |
| VSBCBIQ V3, V11, V26, V27 |
| VSBIQ V3, V11, V26, V19 |
| VSBCBIQ V4, V12, V27, V28 |
| VSBIQ V4, V12, V27, V20 |
| |
| VLM 0(R5), V7, V8 // load last 32 bytes of x into V7..V8 |
| VLM 0(R6), V15, V16 // load last 32 bytes of y into V15..V16 |
| ADD $32, R5 |
| ADD $32, R6 |
| |
| VPDI $0x4, V5, V5, V5 // flip the doublewords to big-endian order |
| VPDI $0x4, V6, V6, V6 // flip the doublewords to big-endian order |
| VPDI $0x4, V13, V13, V13 // flip the doublewords to big-endian order |
| VPDI $0x4, V14, V14, V14 // flip the doublewords to big-endian order |
| |
| VSBCBIQ V5, V13, V28, V29 |
| VSBIQ V5, V13, V28, V21 |
| VSBCBIQ V6, V14, V29, V30 |
| VSBIQ V6, V14, V29, V22 |
| |
| VPDI $0x4, V7, V7, V7 // flip the doublewords to big-endian order |
| VPDI $0x4, V8, V8, V8 // flip the doublewords to big-endian order |
| VPDI $0x4, V15, V15, V15 // flip the doublewords to big-endian order |
| VPDI $0x4, V16, V16, V16 // flip the doublewords to big-endian order |
| |
| VSBCBIQ V7, V15, V30, V31 |
| VSBIQ V7, V15, V30, V23 |
| VSBCBIQ V8, V16, V31, V0 // V0 has carry-over for the next iteration |
| VSBIQ V8, V16, V31, V24 |
| |
| VPDI $0x4, V17, V17, V17 // flip the doublewords back to memory (word) order |
| VPDI $0x4, V18, V18, V18 // flip the doublewords back to memory (word) order |
| VPDI $0x4, V19, V19, V19 // flip the doublewords back to memory (word) order |
| VPDI $0x4, V20, V20, V20 // flip the doublewords back to memory (word) order |
| VPDI $0x4, V21, V21, V21 // flip the doublewords back to memory (word) order |
| VPDI $0x4, V22, V22, V22 // flip the doublewords back to memory (word) order |
| VPDI $0x4, V23, V23, V23 // flip the doublewords back to memory (word) order |
| VPDI $0x4, V24, V24, V24 // flip the doublewords back to memory (word) order |
| VSTM V17, V24, 0(R7) // store 128 bytes (V17..V24) into z |
| ADD $128, R7 |
| ADD $128, R10 // i += 16 words (128 bytes) |
| SUB $16, R3 // n -= 16 |
| BGE UU1 // if n >= 0 goto UU1 |
| VLGVG $1, V0, R4 // put cf (1 = no borrow, 0 = borrow) from doubleword 1 of V0 into R4 |
| SUB $1, R4 // save cf as 0 (no borrow) / -1 (borrow) |
| |
| A1: |
| ADD $12, R3 // n += 12 (n stays biased by -4 for the 4x loop) |
| BLT v1 // if n < 0 goto v1 |
| |
| U1: // n >= 0 |
| // regular loop body unrolled 4x |
// The borrow lives in the condition code from SUBC through the last SUBE;
// only loads (MOVD) are interleaved in between.
| MOVD 0(R8)(R10*1), R5 |
| MOVD 8(R8)(R10*1), R6 |
| MOVD 16(R8)(R10*1), R7 |
| MOVD 24(R8)(R10*1), R1 |
| MOVD R0, R11 |
| SUBC R4, R11 // restore CF: 0 - R4 borrows only when R4 is -1 |
| MOVD 0(R9)(R10*1), R11 |
| SUBE R11, R5 |
| MOVD 8(R9)(R10*1), R11 |
| SUBE R11, R6 |
| MOVD 16(R9)(R10*1), R11 |
| SUBE R11, R7 |
| MOVD 24(R9)(R10*1), R11 |
| SUBE R11, R1 |
// R4 = 0 - 0 - borrow, so the saved borrow is again 0 or -1.
| MOVD R0, R4 |
| SUBE R4, R4 // save CF |
| MOVD R5, 0(R2)(R10*1) |
| MOVD R6, 8(R2)(R10*1) |
| MOVD R7, 16(R2)(R10*1) |
| MOVD R1, 24(R2)(R10*1) |
| |
| ADD $32, R10 // i += 4 words (32 bytes) |
| SUB $4, R3 // n -= 4 |
| BGE U1 // if n >= 0 goto U1 |
| |
| v1: |
| ADD $4, R3 // n += 4 (removes the bias: n = words left) |
| BLE E1 // if n <= 0 goto E1 |
| |
| L1: // n > 0 |
| MOVD R0, R11 |
| SUBC R4, R11 // restore CF |
| MOVD 0(R8)(R10*1), R5 |
| MOVD 0(R9)(R10*1), R11 |
| SUBE R11, R5 |
| MOVD R5, 0(R2)(R10*1) |
// Save the borrow as 0 / -1 for the next iteration or for E1.
| MOVD R0, R4 |
| SUBE R4, R4 // save CF |
| |
| ADD $8, R10 // i++ (one word, 8 bytes) |
| SUB $1, R3 // n-- |
| BGT L1 // if n > 0 goto L1 |
| |
| E1: |
// Turn the saved borrow (0 / -1) into the returned value (0 / 1).
| NEG R4, R4 |
| MOVD R4, c+72(FP) // return c |
| RET |