// blob: 4520d161d779d112330c7c21f894e01b3ad67e05 [file] [log] [blame]
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !math_big_pure_go,s390x
#include "textflag.h"
// This file provides fast assembly versions for the elementary
// arithmetic operations on vectors implemented in arith.go.
// hasVectorFacility stores 1 in its one-byte result when vector instructions
// can be used and 0 otherwise (presumably declared in Go as
// func hasVectorFacility() bool — confirm against the Go declaration).
//
// It stores three doublewords of the facility list into the 24-byte frame
// with STFLE, tests bit 0x40 of byte 16 of that list (facility bit 129 when
// bits are numbered from the most significant bit of byte 0), and then
// executes a pair of vector instructions to confirm that a value written to
// a vector register can be read back.
TEXT ·hasVectorFacility(SB),NOSPLIT,$24-1
	MOVD	$x-24(SP), R1
	XC	$24, 0(R1), 0(R1)	// zero the 24-byte facility buffer
	MOVD	$2, R0			// STFLE operand: doublewords to store, minus one
	WORD	$0xB2B01000		// STFLE 0(R1)
	XOR	R0, R0			// put R0 back to zero
	MOVBZ	z-8(SP), R1		// byte 16 of the facility list
	AND	$0x40, R1
	BEQ	unavailable		// facility bit clear

	// Facility bit set: insert 0xF into byte 0 of V16 and read it back.
	VLEIB	$0, $0xF, V16
	VLGVB	$0, V16, R1
	CMPBNE	R1, $0xF, unavailable
	MOVB	$1, ret+0(FP)		// vector facility usable
	RET

unavailable:
	MOVB	$0, ret+0(FP)		// vector facility not usable
	RET
// func mulWW(x, y Word) (z1, z0 Word)
//
// mulWW multiplies x by y and returns the two halves of the product.
// After MULHDU the code takes z1 from R10 and z0 from R11, i.e. it relies on
// the registers the assembler uses while expanding MULHDU.
// NOTE(review): presumably z1 is the high word and z0 the low word — confirm
// against the Go declaration and the assembler's MULHDU expansion.
TEXT ·mulWW(SB),NOSPLIT,$0
	MOVD	y+8(FP), R4
	MOVD	x+0(FP), R3
	MULHDU	R3, R4
	MOVD	R11, z0+24(FP)
	MOVD	R10, z1+16(FP)
	RET
// func divWW(x1, x0, y Word) (q, r Word)
//
// divWW divides the two-word value held in R10 (x1) and R11 (x0) by y using
// DLGR. The quotient is taken from R11 and the remainder from R10.
TEXT ·divWW(SB),NOSPLIT,$0
	MOVD	y+16(FP), R5		// divisor
	MOVD	x0+8(FP), R11
	MOVD	x1+0(FP), R10
	WORD	$0xb98700a5		// DLGR R10, R5
	MOVD	R10, r+32(FP)		// remainder
	MOVD	R11, q+24(FP)		// quotient
	RET
// func addVV(z, x, y []Word) (c Word)
//
// addVV is a trampoline: it branches to the routine whose address is stored
// in addvectorfacility.
//
// Register map noted by the original author for the addVV implementations
// (the left-hand names look like x86 registers, presumably from a port):
// DI = R3, CX = R4, SI = R10, r8 = R8, r9 = R9, r10 = R2, r11 = R5,
// r12 = R6, r13 = R7, r14 = R1; R0 is kept at zero and R11 is used as well.
TEXT ·addVV(SB),NOSPLIT,$0
	MOVD	addvectorfacility+0(SB), R1
	BR	(R1)
// addVV_check is the routine addvectorfacility initially points at (see the
// DATA directive below). It reads hasVX, stores the address of addVV_vec
// (hasVX == 1) or addVV_novec (otherwise) into addvectorfacility, and then
// branches to the routine it selected.
TEXT ·addVV_check(SB),NOSPLIT, $0
	MOVB	·hasVX(SB), R1
	CMPBEQ	R1, $1, usevec		// hasVX == 1: vector version

	// Scalar version.
	MOVD	$addvectorfacility+0(SB), R1
	MOVD	$·addVV_novec(SB), R2
	MOVD	R2, 0(R1)
	BR	·addVV_novec(SB)

usevec:
	MOVD	$addvectorfacility+0(SB), R1
	MOVD	$·addVV_vec(SB), R2
	MOVD	R2, 0(R1)
	BR	·addVV_vec(SB)

// addvectorfacility holds the address addVV branches to.
GLOBL addvectorfacility+0(SB), NOPTR, $8
DATA addvectorfacility+0(SB)/8, $·addVV_check(SB)
// addVV_vec is the addVV implementation that uses vector instructions:
// z[i] = x[i] + y[i] for the len(z) words, with the final carry returned in c.
//
// The work is done in three stages:
//   UU1: 16 words per iteration using 128-bit vector adds with carry,
//   U1:  4 words per iteration using ADDC/ADDE,
//   L1:  1 word per iteration.
//
// Registers: R2 = z, R8 = x, R9 = y, R3 = remaining count (biased, see the
// SUB/ADD pairs), R10 = byte offset of the current word, R0 = 0,
// R5/R6/R7 = running x/y/z pointers inside the vector loop.
// R4 holds the carry between stages as 0 or -1 so that "ADDC R4, R4"
// re-creates the carry flag; it is negated to 0 or 1 at E1.
TEXT ·addVV_vec(SB),NOSPLIT,$0
MOVD z_len+8(FP), R3
MOVD x+24(FP), R8
MOVD y+48(FP), R9
MOVD z+0(FP), R2
MOVD $0, R4 // c = 0
MOVD $0, R0 // make sure it's zero
MOVD $0, R10 // i = 0
// replace the BLT instructions below with unconditional branches to disable the unrolled loops
SUB $4, R3 // n -= 4
BLT v1 // if n < 0 goto v1 (fewer than 4 words)
SUB $12, R3 // n -= 12 (16 in total)
BLT A1 // if n < 0 goto A1 (fewer than 16 words)
MOVD R8, R5 // running pointer into x
MOVD R9, R6 // running pointer into y
MOVD R2, R7 // running pointer into z
// n >= 0
// regular loop body unrolled 16x
VZERO V0 // c = 0
UU1: VLM 0(R5), V1, V4 // 64 bytes of x into V1..V4
ADD $64, R5
VPDI $0x4,V1,V1,V1 // swap the two doublewords
VPDI $0x4,V2,V2,V2 // swap the two doublewords
VLM 0(R6), V9, V12 // 64 bytes of y into V9..V12
ADD $64, R6
VPDI $0x4,V9,V9,V9 // swap the two doublewords
VPDI $0x4,V10,V10,V10 // swap the two doublewords
VACCCQ V1, V9, V0, V25 // carry out of V1 + V9 + carry(V0)
VACQ V1, V9, V0, V17 // sum of V1 + V9 + carry(V0)
VACCCQ V2, V10, V25, V26
VACQ V2, V10, V25, V18
VLM 0(R5), V5, V6 // 32 bytes of x into V5..V6
VLM 0(R6), V13, V14 // 32 bytes of y into V13..V14
ADD $32, R5
ADD $32, R6
VPDI $0x4,V3,V3,V3 // swap the two doublewords
VPDI $0x4,V4,V4,V4 // swap the two doublewords
VPDI $0x4,V11,V11,V11 // swap the two doublewords
VPDI $0x4,V12,V12,V12 // swap the two doublewords
VACCCQ V3, V11, V26, V27
VACQ V3, V11, V26, V19
VACCCQ V4, V12, V27, V28
VACQ V4, V12, V27, V20
VLM 0(R5), V7, V8 // 32 bytes of x into V7..V8
VLM 0(R6), V15, V16 // 32 bytes of y into V15..V16
ADD $32, R5
ADD $32, R6
VPDI $0x4,V5,V5,V5 // swap the two doublewords
VPDI $0x4,V6,V6,V6 // swap the two doublewords
VPDI $0x4,V13,V13,V13 // swap the two doublewords
VPDI $0x4,V14,V14,V14 // swap the two doublewords
VACCCQ V5, V13, V28, V29
VACQ V5, V13, V28, V21
VACCCQ V6, V14, V29, V30
VACQ V6, V14, V29, V22
VPDI $0x4,V7,V7,V7 // swap the two doublewords
VPDI $0x4,V8,V8,V8 // swap the two doublewords
VPDI $0x4,V15,V15,V15 // swap the two doublewords
VPDI $0x4,V16,V16,V16 // swap the two doublewords
VACCCQ V7, V15, V30, V31
VACQ V7, V15, V30, V23
VACCCQ V8, V16, V31, V0 // V0 carries into the next iteration
VACQ V8, V16, V31, V24
VPDI $0x4,V17,V17,V17 // swap the doublewords back for storing
VPDI $0x4,V18,V18,V18 // swap the doublewords back for storing
VPDI $0x4,V19,V19,V19 // swap the doublewords back for storing
VPDI $0x4,V20,V20,V20 // swap the doublewords back for storing
VPDI $0x4,V21,V21,V21 // swap the doublewords back for storing
VPDI $0x4,V22,V22,V22 // swap the doublewords back for storing
VPDI $0x4,V23,V23,V23 // swap the doublewords back for storing
VPDI $0x4,V24,V24,V24 // swap the doublewords back for storing
VSTM V17, V24, 0(R7) // 128 bytes (V17..V24) into z
ADD $128, R7
ADD $128, R10 // i += 16 (byte offset += 128)
SUB $16, R3 // n -= 16
BGE UU1 // if n >= 0 goto UU1
VLGVG $1, V0, R4 // carry (0 or 1) from V0 into R4
NEG R4, R4 // keep it as 0 or -1 for the scalar loops
A1: ADD $12, R3 // n += 12: back to the "n - 4" bias
// replace the BLT below with an unconditional branch to disable the 4x loop
BLT v1 // if n < 0 goto v1
U1: // n >= 0
// regular loop body unrolled 4x
MOVD 0(R8)(R10*1), R5
MOVD 8(R8)(R10*1), R6
MOVD 16(R8)(R10*1), R7
MOVD 24(R8)(R10*1), R1
ADDC R4, R4 // restore CF from R4 (0 or -1)
MOVD 0(R9)(R10*1), R11
ADDE R11, R5
MOVD 8(R9)(R10*1), R11
ADDE R11, R6
MOVD 16(R9)(R10*1), R11
ADDE R11, R7
MOVD 24(R9)(R10*1), R11
ADDE R11, R1
MOVD R0, R4
ADDE R4, R4 // save CF: R4 = 0 + 0 + carry
NEG R4, R4 // store it as 0 or -1
MOVD R5, 0(R2)(R10*1)
MOVD R6, 8(R2)(R10*1)
MOVD R7, 16(R2)(R10*1)
MOVD R1, 24(R2)(R10*1)
ADD $32, R10 // i += 4 (byte offset += 32)
SUB $4, R3 // n -= 4
BGE U1 // if n >= 0 goto U1
v1: ADD $4, R3 // n += 4: remove the bias
BLE E1 // if n <= 0 goto E1
L1: // n > 0
ADDC R4, R4 // restore CF from R4 (0 or -1)
MOVD 0(R8)(R10*1), R5
MOVD 0(R9)(R10*1), R11
ADDE R11, R5
MOVD R5, 0(R2)(R10*1)
MOVD R0, R4
ADDE R4, R4 // save CF
NEG R4, R4 // store it as 0 or -1
ADD $8, R10 // i++ (byte offset += 8)
SUB $1, R3 // n--
BGT L1 // if n > 0 goto L1
E1: NEG R4, R4 // convert 0 / -1 into 0 / 1
MOVD R4, c+72(FP) // return c
RET
// addVV_novec is the addVV implementation that uses only scalar
// instructions: z[i] = x[i] + y[i] for the len(z) words, with the final
// carry returned in c.
//
// Registers: R2 = z, R8 = x, R9 = y, R3 = remaining count, R10 = byte offset
// of the current word, R0 = 0. Between iterations the carry is parked in R4
// as 0 or -1 so that "ADDC R4, R4" re-creates the carry flag.
TEXT ·addVV_novec(SB),NOSPLIT,$0
	MOVD	z+0(FP), R2
	MOVD	z_len+8(FP), R3
	MOVD	x+24(FP), R8
	MOVD	y+48(FP), R9
	MOVD	$0, R0			// constant zero
	MOVD	$0, R4			// no carry yet
	MOVD	$0, R10			// byte offset 0

	// Bias the count by 4; a negative result means fewer than four words.
	// Turning this BLT into an unconditional branch disables the 4x loop.
	SUB	$4, R3
	BLT	tail

quad:	// four words per iteration
	MOVD	0(R8)(R10*1), R5
	MOVD	8(R8)(R10*1), R6
	MOVD	16(R8)(R10*1), R7
	MOVD	24(R8)(R10*1), R1
	ADDC	R4, R4			// re-create the carry flag from R4
	MOVD	0(R9)(R10*1), R11
	ADDE	R11, R5
	MOVD	8(R9)(R10*1), R11
	ADDE	R11, R6
	MOVD	16(R9)(R10*1), R11
	ADDE	R11, R7
	MOVD	24(R9)(R10*1), R11
	ADDE	R11, R1
	MOVD	R0, R4
	ADDE	R4, R4			// R4 = carry out (0 or 1)
	NEG	R4, R4			// park it as 0 or -1
	MOVD	R5, 0(R2)(R10*1)
	MOVD	R6, 8(R2)(R10*1)
	MOVD	R7, 16(R2)(R10*1)
	MOVD	R1, 24(R2)(R10*1)
	ADD	$32, R10		// advance four words
	SUB	$4, R3
	BGE	quad

tail:
	ADD	$4, R3			// remove the bias
	BLE	done			// nothing left

single:	// one word per iteration
	ADDC	R4, R4			// re-create the carry flag from R4
	MOVD	0(R8)(R10*1), R5
	MOVD	0(R9)(R10*1), R11
	ADDE	R11, R5
	MOVD	R5, 0(R2)(R10*1)
	MOVD	R0, R4
	ADDE	R4, R4			// R4 = carry out (0 or 1)
	NEG	R4, R4			// park it as 0 or -1
	ADD	$8, R10			// advance one word
	SUB	$1, R3
	BGT	single

done:
	NEG	R4, R4			// 0 / -1 becomes 0 / 1
	MOVD	R4, c+72(FP)
	RET
// func subVV(z, x, y []Word) (c Word)
//
// subVV is a trampoline: it branches to the routine whose address is stored
// in subvectorfacility.
TEXT ·subVV(SB),NOSPLIT,$0
	MOVD	subvectorfacility+0(SB), R1
	BR	(R1)
// subVV_check is the routine subvectorfacility initially points at (see the
// DATA directive below). It reads hasVX, stores the address of subVV_vec
// (hasVX == 1) or subVV_novec (otherwise) into subvectorfacility, and then
// branches to the routine it selected.
TEXT ·subVV_check(SB),NOSPLIT,$0
	MOVB	·hasVX(SB), R1
	CMPBEQ	R1, $1, usevec		// hasVX == 1: vector version

	// Scalar version.
	MOVD	$subvectorfacility+0(SB), R1
	MOVD	$·subVV_novec(SB), R2
	MOVD	R2, 0(R1)
	BR	·subVV_novec(SB)

usevec:
	MOVD	$subvectorfacility+0(SB), R1
	MOVD	$·subVV_vec(SB), R2
	MOVD	R2, 0(R1)
	BR	·subVV_vec(SB)

// subvectorfacility holds the address subVV branches to.
GLOBL subvectorfacility+0(SB), NOPTR, $8
DATA subvectorfacility+0(SB)/8, $·subVV_check(SB)
// Register map noted by the original author (the left-hand names look like
// x86 registers, presumably from a port):
// DI = R3, CX = R4, SI = r10, r8 = r8, r9=r9, r10 = r2 , r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0) + use R11
// func subVV(z, x, y []Word) (c Word)
// (same as addVV except for SUBC/SUBE instead of ADDC/ADDE and label names)
//
// subVV_vec is the subVV implementation that uses vector instructions:
// z[i] = x[i] - y[i] for the len(z) words, with the final borrow returned in c.
//
// Stages: UU1 handles 16 words per iteration with 128-bit vector subtracts,
// U1 handles 4 words per iteration with SUBC/SUBE, L1 handles 1 word.
// The vector carry indication starts at 1 (meaning "no borrow").
// R4 holds the borrow between stages as 0 (no borrow) or -1 (borrow);
// "SUBC R4, R11" with R11 = 0 re-creates the flag and E1 negates R4 to 0 / 1.
TEXT ·subVV_vec(SB),NOSPLIT,$0
MOVD z_len+8(FP), R3
MOVD x+24(FP), R8
MOVD y+48(FP), R9
MOVD z+0(FP), R2
MOVD $0, R4 // c = 0
MOVD $0, R0 // make sure it's zero
MOVD $0, R10 // i = 0
// replace the BLT instructions below with unconditional branches to disable the unrolled loops
SUB $4, R3 // n -= 4
BLT v1 // if n < 0 goto v1 (fewer than 4 words)
SUB $12, R3 // n -= 12 (16 in total)
BLT A1 // if n < 0 goto A1 (fewer than 16 words)
MOVD R8, R5 // running pointer into x
MOVD R9, R6 // running pointer into y
MOVD R2, R7 // running pointer into z
// n >= 0
// regular loop body unrolled 16x
VZERO V0 // cf = 0
MOVD $1, R4 // for 390 subtraction cf starts as 1 (no borrow)
VLVGG $1, R4, V0 //put carry into V0
UU1: VLM 0(R5), V1, V4 // 64 bytes of x into V1..V4
ADD $64, R5
VPDI $0x4,V1,V1,V1 // swap the two doublewords
VPDI $0x4,V2,V2,V2 // swap the two doublewords
VLM 0(R6), V9, V12 // 64 bytes of y into V9..V12
ADD $64, R6
VPDI $0x4,V9,V9,V9 // swap the two doublewords
VPDI $0x4,V10,V10,V10 // swap the two doublewords
VSBCBIQ V1, V9, V0, V25 // borrow indication of V1 - V9 with borrow-in V0
VSBIQ V1, V9, V0, V17 // difference V1 - V9 with borrow-in V0
VSBCBIQ V2, V10, V25, V26
VSBIQ V2, V10, V25, V18
VLM 0(R5), V5, V6 // 32 bytes of x into V5..V6
VLM 0(R6), V13, V14 // 32 bytes of y into V13..V14
ADD $32, R5
ADD $32, R6
VPDI $0x4,V3,V3,V3 // swap the two doublewords
VPDI $0x4,V4,V4,V4 // swap the two doublewords
VPDI $0x4,V11,V11,V11 // swap the two doublewords
VPDI $0x4,V12,V12,V12 // swap the two doublewords
VSBCBIQ V3, V11, V26, V27
VSBIQ V3, V11, V26, V19
VSBCBIQ V4, V12, V27, V28
VSBIQ V4, V12, V27, V20
VLM 0(R5), V7, V8 // 32 bytes of x into V7..V8
VLM 0(R6), V15, V16 // 32 bytes of y into V15..V16
ADD $32, R5
ADD $32, R6
VPDI $0x4,V5,V5,V5 // swap the two doublewords
VPDI $0x4,V6,V6,V6 // swap the two doublewords
VPDI $0x4,V13,V13,V13 // swap the two doublewords
VPDI $0x4,V14,V14,V14 // swap the two doublewords
VSBCBIQ V5, V13, V28, V29
VSBIQ V5, V13, V28, V21
VSBCBIQ V6, V14, V29, V30
VSBIQ V6, V14, V29, V22
VPDI $0x4,V7,V7,V7 // swap the two doublewords
VPDI $0x4,V8,V8,V8 // swap the two doublewords
VPDI $0x4,V15,V15,V15 // swap the two doublewords
VPDI $0x4,V16,V16,V16 // swap the two doublewords
VSBCBIQ V7, V15, V30, V31
VSBIQ V7, V15, V30, V23
VSBCBIQ V8, V16, V31, V0 // V0 carries into the next iteration
VSBIQ V8, V16, V31, V24
VPDI $0x4,V17,V17,V17 // swap the doublewords back for storing
VPDI $0x4,V18,V18,V18 // swap the doublewords back for storing
VPDI $0x4,V19,V19,V19 // swap the doublewords back for storing
VPDI $0x4,V20,V20,V20 // swap the doublewords back for storing
VPDI $0x4,V21,V21,V21 // swap the doublewords back for storing
VPDI $0x4,V22,V22,V22 // swap the doublewords back for storing
VPDI $0x4,V23,V23,V23 // swap the doublewords back for storing
VPDI $0x4,V24,V24,V24 // swap the doublewords back for storing
VSTM V17, V24, 0(R7) // 128 bytes (V17..V24) into z
ADD $128, R7
ADD $128, R10 // i += 16 (byte offset += 128)
SUB $16, R3 // n -= 16
BGE UU1 // if n >= 0 goto UU1
VLGVG $1, V0, R4 // carry indication (1 = no borrow) into R4
SUB $1, R4 // save cf as 0 (no borrow) or -1 (borrow)
A1: ADD $12, R3 // n += 12: back to the "n - 4" bias
BLT v1 // if n < 0 goto v1
U1: // n >= 0
// regular loop body unrolled 4x
MOVD 0(R8)(R10*1), R5
MOVD 8(R8)(R10*1), R6
MOVD 16(R8)(R10*1), R7
MOVD 24(R8)(R10*1), R1
MOVD R0, R11
SUBC R4, R11 // restore CF: 0 - R4 borrows exactly when R4 is -1
MOVD 0(R9)(R10*1), R11
SUBE R11, R5
MOVD 8(R9)(R10*1), R11
SUBE R11, R6
MOVD 16(R9)(R10*1), R11
SUBE R11, R7
MOVD 24(R9)(R10*1), R11
SUBE R11, R1
MOVD R0, R4
SUBE R4, R4 // save CF: R4 = 0 (no borrow) or -1 (borrow)
MOVD R5, 0(R2)(R10*1)
MOVD R6, 8(R2)(R10*1)
MOVD R7, 16(R2)(R10*1)
MOVD R1, 24(R2)(R10*1)
ADD $32, R10 // i += 4 (byte offset += 32)
SUB $4, R3 // n -= 4
BGE U1 // if n >= 0 goto U1
v1: ADD $4, R3 // n += 4: remove the bias
BLE E1 // if n <= 0 goto E1
L1: // n > 0
MOVD R0, R11
SUBC R4, R11 // restore CF
MOVD 0(R8)(R10*1), R5
MOVD 0(R9)(R10*1), R11
SUBE R11, R5
MOVD R5, 0(R2)(R10*1)
MOVD R0, R4
SUBE R4, R4 // save CF
ADD $8, R10 // i++ (byte offset += 8)
SUB $1, R3 // n--
BGT L1 // if n > 0 goto L1
E1: NEG R4, R4 // convert 0 / -1 into 0 / 1
MOVD R4, c+72(FP) // return c
RET
// func subVV(z, x, y []Word) (c Word)
//
// subVV_novec is the subVV implementation that uses only scalar
// instructions: z[i] = x[i] - y[i] for the len(z) words, with the final
// borrow returned in c. It has the same shape as addVV_novec with SUBC/SUBE
// in place of ADDC/ADDE.
//
// Registers: R2 = z, R8 = x, R9 = y, R3 = remaining count, R10 = byte offset
// of the current word, R0 = 0. Between iterations the borrow is parked in R4
// as 0 (no borrow) or -1 (borrow); subtracting R4 from zero re-creates the
// flag.
TEXT ·subVV_novec(SB),NOSPLIT,$0
	MOVD	z+0(FP), R2
	MOVD	z_len+8(FP), R3
	MOVD	x+24(FP), R8
	MOVD	y+48(FP), R9
	MOVD	$0, R0			// constant zero
	MOVD	$0, R4			// no borrow yet
	MOVD	$0, R10			// byte offset 0

	// Bias the count by 4; a negative result means fewer than four words.
	// Turning this BLT into an unconditional branch disables the 4x loop.
	SUB	$4, R3
	BLT	tail

quad:	// four words per iteration
	MOVD	0(R8)(R10*1), R5
	MOVD	8(R8)(R10*1), R6
	MOVD	16(R8)(R10*1), R7
	MOVD	24(R8)(R10*1), R1
	MOVD	R0, R11
	SUBC	R4, R11			// 0 - R4: re-create the borrow flag
	MOVD	0(R9)(R10*1), R11
	SUBE	R11, R5
	MOVD	8(R9)(R10*1), R11
	SUBE	R11, R6
	MOVD	16(R9)(R10*1), R11
	SUBE	R11, R7
	MOVD	24(R9)(R10*1), R11
	SUBE	R11, R1
	MOVD	R0, R4
	SUBE	R4, R4			// park the borrow: 0 or -1
	MOVD	R5, 0(R2)(R10*1)
	MOVD	R6, 8(R2)(R10*1)
	MOVD	R7, 16(R2)(R10*1)
	MOVD	R1, 24(R2)(R10*1)
	ADD	$32, R10		// advance four words
	SUB	$4, R3
	BGE	quad

tail:
	ADD	$4, R3			// remove the bias
	BLE	done			// nothing left

single:	// one word per iteration
	MOVD	R0, R11
	SUBC	R4, R11			// 0 - R4: re-create the borrow flag
	MOVD	0(R8)(R10*1), R5
	MOVD	0(R9)(R10*1), R11
	SUBE	R11, R5
	MOVD	R5, 0(R2)(R10*1)
	MOVD	R0, R4
	SUBE	R4, R4			// park the borrow: 0 or -1
	ADD	$8, R10			// advance one word
	SUB	$1, R3
	BGT	single

done:
	NEG	R4, R4			// 0 / -1 becomes 0 / 1
	MOVD	R4, c+72(FP)
	RET
// func addVW(z, x []Word, y Word) (c Word)
//
// addVW is a trampoline: it branches to the routine whose address is stored
// in addwvectorfacility.
TEXT ·addVW(SB),NOSPLIT,$0
	MOVD	addwvectorfacility+0(SB), R1
	BR	(R1)
// addVW_check is the routine addwvectorfacility initially points at (see the
// DATA directive below). It reads hasVX, stores the address of addVW_vec
// (hasVX == 1) or addVW_novec (otherwise) into addwvectorfacility, and then
// branches to the routine it selected.
TEXT ·addVW_check(SB),NOSPLIT,$0
	MOVB	·hasVX(SB), R1
	CMPBEQ	R1, $1, usevec		// hasVX == 1: vector version

	// Scalar version.
	MOVD	$addwvectorfacility+0(SB), R1
	MOVD	$·addVW_novec(SB), R2
	MOVD	R2, 0(R1)
	BR	·addVW_novec(SB)

usevec:
	MOVD	$addwvectorfacility+0(SB), R1
	MOVD	$·addVW_vec(SB), R2
	MOVD	R2, 0(R1)
	BR	·addVW_vec(SB)

// addwvectorfacility holds the address addVW branches to.
GLOBL addwvectorfacility+0(SB), NOPTR, $8
DATA addwvectorfacility+0(SB)/8, $·addVW_check(SB)
// func addVW_vec(z, x []Word, y Word) (c Word)
//
// addVW_vec is the addVW implementation that uses vector instructions:
// z = x + y, where y is a single word added to the least significant word
// and the carry is propagated through the len(z) words; the final carry is
// returned in c.
//
// Stages: UU1 handles 16 words per iteration with 128-bit vector adds,
// U4 handles 4 words per iteration with ADDC/ADDE, L4 handles 1 word.
// R4 holds the value still to be added (y at first, then the carry 0 or 1).
// In the vector loop V9 holds y for the first 128-bit add only and is
// cleared right after it, so later adds propagate only the carry.
TEXT ·addVW_vec(SB),NOSPLIT,$0
MOVD z_len+8(FP), R3
MOVD x+24(FP), R8
MOVD y+48(FP), R4 // c = y
MOVD z+0(FP), R2
MOVD $0, R0 // make sure it's zero
MOVD $0, R10 // i = 0
MOVD R8, R5 // running pointer into x (vector loop)
MOVD R2, R7 // running pointer into z (vector loop)
// replace the BLT instructions below with unconditional branches to disable the unrolled loops
SUB $4, R3 // n -= 4
BLT v10 // if n < 0 goto v10
SUB $12, R3 // n -= 12 (16 in total)
BLT A10 // if n < 0 goto A10 (fewer than 16 words)
// n >= 0
// regular loop body unrolled 16x
VZERO V0 // prepare V0 to be final carry register
VZERO V9 // to ensure upper half is zero
VLVGG $1, R4, V9 // V9 = y in doubleword 1
UU1: VLM 0(R5), V1, V4 // 64-bytes into V1..V4
ADD $64, R5
VPDI $0x4,V1,V1,V1 // swap the two doublewords
VPDI $0x4,V2,V2,V2 // swap the two doublewords
VACCCQ V1, V9, V0, V25 // carry out of V1 + V9 + carry(V0)
VACQ V1, V9, V0, V17 // sum of V1 + V9 + carry(V0)
VZERO V9 // y has been added; only carries remain
VACCCQ V2, V9, V25, V26
VACQ V2, V9, V25, V18
VLM 0(R5), V5, V6 // 32-bytes into V5..V6
ADD $32, R5
VPDI $0x4,V3,V3,V3 // swap the two doublewords
VPDI $0x4,V4,V4,V4 // swap the two doublewords
VACCCQ V3, V9, V26, V27
VACQ V3, V9, V26, V19
VACCCQ V4, V9, V27, V28
VACQ V4, V9, V27, V20
VLM 0(R5), V7, V8 // 32-bytes into V7..V8
ADD $32, R5
VPDI $0x4,V5,V5,V5 // swap the two doublewords
VPDI $0x4,V6,V6,V6 // swap the two doublewords
VACCCQ V5, V9, V28, V29
VACQ V5, V9, V28, V21
VACCCQ V6, V9, V29, V30
VACQ V6, V9, V29, V22
VPDI $0x4,V7,V7,V7 // swap the two doublewords
VPDI $0x4,V8,V8,V8 // swap the two doublewords
VACCCQ V7, V9, V30, V31
VACQ V7, V9, V30, V23
VACCCQ V8, V9, V31, V0 // V0 carries into the next iteration
VACQ V8, V9, V31, V24
VPDI $0x4,V17,V17,V17 // swap the doublewords back for storing
VPDI $0x4,V18,V18,V18 // swap the doublewords back for storing
VPDI $0x4,V19,V19,V19 // swap the doublewords back for storing
VPDI $0x4,V20,V20,V20 // swap the doublewords back for storing
VPDI $0x4,V21,V21,V21 // swap the doublewords back for storing
VPDI $0x4,V22,V22,V22 // swap the doublewords back for storing
VPDI $0x4,V23,V23,V23 // swap the doublewords back for storing
VPDI $0x4,V24,V24,V24 // swap the doublewords back for storing
VSTM V17, V24, 0(R7) // 128-bytes into z
ADD $128, R7
ADD $128, R10 // i += 16 (byte offset += 128)
SUB $16, R3 // n -= 16
BGE UU1 // if n >= 0 goto UU1
VLGVG $1, V0, R4 // put cf into R4 in case we branch to v10
A10: ADD $12, R3 // n += 12: back to the "n - 4" bias
// replace the BLT below with an unconditional branch to disable the 4x loop
BLT v10 // if n < 0 goto v10
U4: // n >= 0
// regular loop body unrolled 4x
MOVD 0(R8)(R10*1), R5
MOVD 8(R8)(R10*1), R6
MOVD 16(R8)(R10*1), R7
MOVD 24(R8)(R10*1), R1
ADDC R4, R5 // add y (or the incoming carry) to the lowest word
ADDE R0, R6 // propagate the carry (R0 = 0)
ADDE R0, R7
ADDE R0, R1
ADDE R0, R0 // R0 = carry out (0 or 1)
MOVD R0, R4 // save CF
SUB R0, R0 // R0 back to zero
MOVD R5, 0(R2)(R10*1)
MOVD R6, 8(R2)(R10*1)
MOVD R7, 16(R2)(R10*1)
MOVD R1, 24(R2)(R10*1)
ADD $32, R10 // i += 4 -> i +=32
SUB $4, R3 // n -= 4
BGE U4 // if n >= 0 goto U4
v10: ADD $4, R3 // n += 4: remove the bias
BLE E10 // if n <= 0 goto E10
L4: // n > 0
MOVD 0(R8)(R10*1), R5
ADDC R4, R5
ADDE R0, R0 // R0 = carry out (0 or 1)
MOVD R0, R4 // save CF
SUB R0, R0 // R0 back to zero
MOVD R5, 0(R2)(R10*1)
ADD $8, R10 // i++ (byte offset += 8)
SUB $1, R3 // n--
BGT L4 // if n > 0 goto L4
E10: MOVD R4, c+56(FP) // return c
RET
// addVW_novec is the addVW implementation that uses only scalar
// instructions: z = x + y, where y is a single word added to the least
// significant word and the carry is propagated through the len(z) words; the
// final carry is returned in c.
//
// Registers: R2 = z, R8 = x, R3 = remaining count, R10 = byte offset of the
// current word, R0 = 0 (briefly used to capture the carry), R4 = the value
// still to be added (y at first, afterwards the carry 0 or 1).
TEXT ·addVW_novec(SB),NOSPLIT,$0
	MOVD	z+0(FP), R2
	MOVD	z_len+8(FP), R3
	MOVD	x+24(FP), R8
	MOVD	y+48(FP), R4		// c = y
	MOVD	$0, R0			// constant zero
	MOVD	$0, R10			// byte offset 0

	// Bias the count by 4; a negative result means fewer than four words.
	// Turning this BLT into an unconditional branch disables the 4x loop.
	SUB	$4, R3
	BLT	tail

quad:	// four words per iteration
	MOVD	0(R8)(R10*1), R5
	MOVD	8(R8)(R10*1), R6
	MOVD	16(R8)(R10*1), R7
	MOVD	24(R8)(R10*1), R1
	ADDC	R4, R5			// add c to the lowest word
	ADDE	R0, R6			// ripple the carry upwards
	ADDE	R0, R7
	ADDE	R0, R1
	ADDE	R0, R0			// R0 = carry out (0 or 1)
	MOVD	R0, R4			// c = carry out
	SUB	R0, R0			// R0 back to zero
	MOVD	R5, 0(R2)(R10*1)
	MOVD	R6, 8(R2)(R10*1)
	MOVD	R7, 16(R2)(R10*1)
	MOVD	R1, 24(R2)(R10*1)
	ADD	$32, R10		// advance four words
	SUB	$4, R3
	BGE	quad

tail:
	ADD	$4, R3			// remove the bias
	BLE	done			// nothing left

single:	// one word per iteration
	MOVD	0(R8)(R10*1), R5
	ADDC	R4, R5
	ADDE	R0, R0			// R0 = carry out (0 or 1)
	MOVD	R0, R4			// c = carry out
	SUB	R0, R0			// R0 back to zero
	MOVD	R5, 0(R2)(R10*1)
	ADD	$8, R10			// advance one word
	SUB	$1, R3
	BGT	single

done:
	MOVD	R4, c+56(FP)
	RET
// func subVW(z, x []Word, y Word) (c Word)
//
// subVW is a trampoline: it branches to the routine whose address is stored
// in subwvectorfacility.
TEXT ·subVW(SB),NOSPLIT,$0
	MOVD	subwvectorfacility+0(SB), R1
	BR	(R1)
// subVW_check is the routine subwvectorfacility initially points at (see the
// DATA directive below). It reads hasVX, stores the address of subVW_vec
// (hasVX == 1) or subVW_novec (otherwise) into subwvectorfacility, and then
// branches to the routine it selected.
TEXT ·subVW_check(SB),NOSPLIT,$0
	MOVB	·hasVX(SB), R1
	CMPBEQ	R1, $1, usevec		// hasVX == 1: vector version

	// Scalar version.
	MOVD	$subwvectorfacility+0(SB), R1
	MOVD	$·subVW_novec(SB), R2
	MOVD	R2, 0(R1)
	BR	·subVW_novec(SB)

usevec:
	MOVD	$subwvectorfacility+0(SB), R1
	MOVD	$·subVW_vec(SB), R2
	MOVD	R2, 0(R1)
	BR	·subVW_vec(SB)

// subwvectorfacility holds the address subVW branches to.
GLOBL subwvectorfacility+0(SB), NOPTR, $8
DATA subwvectorfacility+0(SB)/8, $·subVW_check(SB)
// func subVW(z, x []Word, y Word) (c Word)
//
// subVW_vec is the subVW implementation that uses vector instructions:
// z = x - y, where y is a single word subtracted from the least significant
// word and the borrow is propagated through the len(z) words; the final
// borrow is returned in c.
//
// Stages: UU1 handles 16 words per iteration with 128-bit vector subtracts,
// U4 handles 4 words per iteration with SUBC/SUBE, L4 handles 1 word.
// R4 holds the value still to be subtracted (y at first, then the borrow as
// 0 or 1). In the vector loop the carry indication starts at 1 ("no borrow")
// and V9 holds y for the first 128-bit subtract only.
TEXT ·subVW_vec(SB),NOSPLIT,$0
MOVD z_len+8(FP), R3
MOVD x+24(FP), R8
MOVD y+48(FP), R4 // c = y
MOVD z+0(FP), R2
MOVD $0, R0 // make sure it's zero
MOVD $0, R10 // i = 0
MOVD R8, R5 // running pointer into x (vector loop)
MOVD R2, R7 // running pointer into z (vector loop)
// replace the BLT instructions below with unconditional branches to disable the unrolled loops
SUB $4, R3 // n -= 4
BLT v11 // if n < 0 goto v11
SUB $12, R3 // n -= 12 (16 in total)
BLT A11 // if n < 0 goto A11 (fewer than 16 words)
VZERO V0
MOVD $1, R6 // prepare V0 to be final carry register
VLVGG $1, R6, V0 // borrow is initially "no borrow"
VZERO V9 // to ensure upper half is zero
VLVGG $1, R4, V9 // V9 = y in doubleword 1
// n >= 0
// regular loop body unrolled 16x
UU1: VLM 0(R5), V1, V4 // 64-bytes into V1..V4
ADD $64, R5
VPDI $0x4,V1,V1,V1 // swap the two doublewords
VPDI $0x4,V2,V2,V2 // swap the two doublewords
VSBCBIQ V1, V9, V0, V25 // borrow indication of V1 - V9 with borrow-in V0
VSBIQ V1, V9, V0, V17 // difference V1 - V9 with borrow-in V0
VZERO V9 // y has been subtracted; only borrows remain
VSBCBIQ V2, V9, V25, V26
VSBIQ V2, V9, V25, V18
VLM 0(R5), V5, V6 // 32-bytes into V5..V6
ADD $32, R5
VPDI $0x4,V3,V3,V3 // swap the two doublewords
VPDI $0x4,V4,V4,V4 // swap the two doublewords
VSBCBIQ V3, V9, V26, V27
VSBIQ V3, V9, V26, V19
VSBCBIQ V4, V9, V27, V28
VSBIQ V4, V9, V27, V20
VLM 0(R5), V7, V8 // 32-bytes into V7..V8
ADD $32, R5
VPDI $0x4,V5,V5,V5 // swap the two doublewords
VPDI $0x4,V6,V6,V6 // swap the two doublewords
VSBCBIQ V5, V9, V28, V29
VSBIQ V5, V9, V28, V21
VSBCBIQ V6, V9, V29, V30
VSBIQ V6, V9, V29, V22
VPDI $0x4,V7,V7,V7 // swap the two doublewords
VPDI $0x4,V8,V8,V8 // swap the two doublewords
VSBCBIQ V7, V9, V30, V31
VSBIQ V7, V9, V30, V23
VSBCBIQ V8, V9, V31, V0 // V0 has carry-over
VSBIQ V8, V9, V31, V24
VPDI $0x4,V17,V17,V17 // swap the doublewords back for storing
VPDI $0x4,V18,V18,V18 // swap the doublewords back for storing
VPDI $0x4,V19,V19,V19 // swap the doublewords back for storing
VPDI $0x4,V20,V20,V20 // swap the doublewords back for storing
VPDI $0x4,V21,V21,V21 // swap the doublewords back for storing
VPDI $0x4,V22,V22,V22 // swap the doublewords back for storing
VPDI $0x4,V23,V23,V23 // swap the doublewords back for storing
VPDI $0x4,V24,V24,V24 // swap the doublewords back for storing
VSTM V17, V24, 0(R7) // 128-bytes into z
ADD $128, R7
ADD $128, R10 // i += 16 (byte offset += 128)
SUB $16, R3 // n -= 16
BGE UU1 // if n >= 0 goto UU1
VLGVG $1, V0, R4 // put cf into R4 in case we branch to v11
SUB $1, R4 // 1 (no borrow) -> 0, 0 (borrow) -> -1
NEG R4, R4 // borrow as 0 or 1, the amount to subtract next
A11: ADD $12, R3 // n += 12: back to the "n - 4" bias
BLT v11 // if n < 0 goto v11
// n >= 0
// regular loop body unrolled 4x
U4: // n >= 0
// regular loop body unrolled 4x
MOVD 0(R8)(R10*1), R5
MOVD 8(R8)(R10*1), R6
MOVD 16(R8)(R10*1), R7
MOVD 24(R8)(R10*1), R1
SUBC R4, R5 // subtract y (or the incoming borrow) from the lowest word
SUBE R0, R6 // propagate the borrow (R0 = 0)
SUBE R0, R7
SUBE R0, R1
SUBE R4, R4 // save CF: R4 = 0 (no borrow) or -1 (borrow)
NEG R4, R4 // borrow as 0 or 1
MOVD R5, 0(R2)(R10*1)
MOVD R6, 8(R2)(R10*1)
MOVD R7, 16(R2)(R10*1)
MOVD R1, 24(R2)(R10*1)
ADD $32, R10 // i += 4 -> i +=32
SUB $4, R3 // n -= 4
BGE U4 // if n >= 0 goto U4
v11: ADD $4, R3 // n += 4: remove the bias
BLE E11 // if n <= 0 goto E11
L4: // n > 0
MOVD 0(R8)(R10*1), R5
SUBC R4, R5
SUBE R4, R4 // save CF: R4 = 0 (no borrow) or -1 (borrow)
NEG R4, R4 // borrow as 0 or 1
MOVD R5, 0(R2)(R10*1)
ADD $8, R10 // i++ (byte offset += 8)
SUB $1, R3 // n--
BGT L4 // if n > 0 goto L4
E11: MOVD R4, c+56(FP) // return c
RET
// func subVW(z, x []Word, y Word) (c Word)
//
// subVW_novec is the subVW implementation that uses only scalar
// instructions: z = x - y, where y is a single word subtracted from the
// least significant word and the borrow is propagated through the len(z)
// words; the final borrow is returned in c. It has the same shape as
// addVW_novec with SUBC/SUBE in place of ADDC/ADDE.
//
// Registers: R2 = z, R8 = x, R3 = remaining count, R10 = byte offset of the
// current word, R0 = 0, R4 = the value still to be subtracted (y at first,
// afterwards the borrow 0 or 1).
TEXT ·subVW_novec(SB),NOSPLIT,$0
	MOVD	z+0(FP), R2
	MOVD	z_len+8(FP), R3
	MOVD	x+24(FP), R8
	MOVD	y+48(FP), R4		// c = y
	MOVD	$0, R0			// constant zero
	MOVD	$0, R10			// byte offset 0

	// Bias the count by 4; a negative result means fewer than four words.
	// Turning this BLT into an unconditional branch disables the 4x loop.
	SUB	$4, R3
	BLT	tail

quad:	// four words per iteration
	MOVD	0(R8)(R10*1), R5
	MOVD	8(R8)(R10*1), R6
	MOVD	16(R8)(R10*1), R7
	MOVD	24(R8)(R10*1), R1
	SUBC	R4, R5			// subtract c from the lowest word
	SUBE	R0, R6			// ripple the borrow upwards
	SUBE	R0, R7
	SUBE	R0, R1
	SUBE	R4, R4			// R4 = 0 (no borrow) or -1 (borrow)
	NEG	R4, R4			// c = borrow as 0 or 1
	MOVD	R5, 0(R2)(R10*1)
	MOVD	R6, 8(R2)(R10*1)
	MOVD	R7, 16(R2)(R10*1)
	MOVD	R1, 24(R2)(R10*1)
	ADD	$32, R10		// advance four words
	SUB	$4, R3
	BGE	quad

tail:
	ADD	$4, R3			// remove the bias
	BLE	done			// nothing left

single:	// one word per iteration
	MOVD	0(R8)(R10*1), R5
	SUBC	R4, R5
	SUBE	R4, R4			// R4 = 0 (no borrow) or -1 (borrow)
	NEG	R4, R4			// c = borrow as 0 or 1
	MOVD	R5, 0(R2)(R10*1)
	ADD	$8, R10			// advance one word
	SUB	$1, R3
	BGT	single

done:
	MOVD	R4, c+56(FP)
	RET
// func shlVU(z, x []Word, s uint) (c Word)
//
// shlVU stores x << s in z for the n = len(z) words of x and returns in c the
// bits shifted out of the most significant word. When n == 0 it only sets
// c = 0.
//
// Every path walks from the most significant word down to word 0, so the
// result is correct when z overlaps x at the same or a higher address.
// The s == 0 path previously copied upwards with a one-word look-ahead,
// which overwrote source words before they were read whenever z started two
// or more words above x; it now copies downwards like the other paths.
//
// s == 0 and s == 64 get their own paths, presumably because the general
// path would otherwise need a shift by 64 bits (64 - s or s), which the
// shift instructions do not perform as "shift everything out" — confirm
// against the instruction reference.
//
// Registers: R2 = z, R8 = x, R5 = byte offset of the current word (starts at
// (n-1)*8), R4 = s, R7 = 64 - s, R10 = w1, R3 = w, R6 = scratch, R0 = 0.
TEXT ·shlVU(SB),NOSPLIT,$0
	MOVD	z_len+8(FP), R5
	MOVD	$0, R0
	SUB	$1, R5			// n--
	BLT	X8b			// n < 0: empty vector
	MOVD	s+48(FP), R4
	CMPBEQ	R0, R4, Z80		// s == 0: plain copy
	MOVD	$64, R6
	CMPBEQ	R6, R4, Z864		// s == 64: move up by one whole word
	MOVD	z+0(FP), R2
	MOVD	x+24(FP), R8
	SLD	$3, R5			// R5 = (n-1)*8
	SUB	R4, R6, R7		// R7 = 64 - s
	MOVD	(R8)(R5*1), R10		// w1 = x[n-1]
	SRD	R7, R10, R3
	MOVD	R3, c+56(FP)		// c = x[n-1] >> (64-s)
	BR	E8

L8:	MOVD	R10, R3			// w = w1
	MOVD	-8(R8)(R5*1), R10	// w1 = x[i-1]
	SLD	R4, R3
	SRD	R7, R10, R6
	OR	R6, R3
	MOVD	R3, (R2)(R5*1)		// z[i] = w<<s | w1>>(64-s)
	SUB	$8, R5			// i--
E8:	CMPBGT	R5, R0, L8		// while i > 0

	SLD	R4, R10			// w1<<s
	MOVD	R10, (R2)		// z[0] = w1<<s
	RET

X8b:	MOVD	R0, c+56(FP)		// empty vector: c = 0
	RET

	// s == 0: z = x, c = 0. Copy from the top word downwards so that an
	// overlapping z at a higher address never clobbers unread words of x.
Z80:	MOVD	z+0(FP), R2
	MOVD	x+24(FP), R8
	SLD	$3, R5			// R5 = (n-1)*8
	MOVD	R0, c+56(FP)		// c = 0
	BR	E8Z

L8Z:	MOVD	(R8)(R5*1), R3
	MOVD	R3, (R2)(R5*1)		// z[i] = x[i]
	SUB	$8, R5			// i--
E8Z:	CMPBGT	R5, R0, L8Z		// while i > 0

	MOVD	(R8), R3
	MOVD	R3, (R2)		// z[0] = x[0]
	RET

	// s == 64: z[i] = x[i-1], z[0] = 0, c = x[n-1].
Z864:	MOVD	z+0(FP), R2
	MOVD	x+24(FP), R8
	SLD	$3, R5			// R5 = (n-1)*8
	MOVD	(R8)(R5*1), R3		// x[n-1]
	MOVD	R3, c+56(FP)		// c = x[n-1]
	BR	E864

L864:	MOVD	-8(R8)(R5*1), R3
	MOVD	R3, (R2)(R5*1)		// z[i] = x[i-1]
	SUB	$8, R5			// i--
E864:	CMPBGT	R5, R0, L864		// while i > 0

	MOVD	R0, (R2)		// z[0] = 0
	RET
// Register map noted by the original author (the left-hand names look like
// x86 registers, presumably from a port):
// CX = R4, r8 = r8, r10 = r2 , r11 = r5, DX = r3, AX = r10 , BX = R1 , 64-count = r7 (R0 set to 0) temp = R6
// func shrVU(z, x []Word, s uint) (c Word)
//
// shrVU stores x >> s in z for the n = len(z) words of x and returns in c the
// bits shifted out of the least significant word. When n == 0 it only sets
// c = 0. All paths walk from word 0 upwards.
//
// Paths: the general path (0 < s < 64), ZB0 for s == 0 (plain copy, c = 0)
// and ZB64 for s == 64 (z[i] = x[i+1], z[n-1] = 0, c = x[0]).
//
// Registers: R2 = z, R8 = x, R1 = byte offset of word i, R5 = (n-1)*8,
// R4 = s, R7 = 64 - s, R10 = w1, R3 = w, R6 = scratch, R0 = 0.
TEXT ·shrVU(SB),NOSPLIT,$0
MOVD z_len+8(FP), R5
MOVD $0, R0
SUB $1, R5 // n--
BLT X9b // n < 0 (n <= 0)
// n > 0
MOVD s+48(FP), R4
CMPBEQ R0, R4, ZB0 // s == 0: handled separately
MOVD $64, R6
CMPBEQ R6, R4, ZB64 // s == 64: handled separately
MOVD z+0(FP), R2
MOVD x+24(FP), R8
SLD $3, R5 // R5 = (n-1)*8, byte offset of the last word
SUB R4, R6, R7 // R7 = 64 - s
MOVD (R8), R10 // w1 = x[0]
SLD R7, R10, R3
MOVD R3, c+56(FP) // c = x[0] << (64-s)
MOVD $0, R1 // i = 0
BR E9
// i < n-1
L9: MOVD R10, R3 // w = w1
MOVD 8(R8)(R1*1), R10 // w1 = x[i+1]
SRD R4, R3 // w>>s
SLD R7, R10, R6 // w1<<(64-s)
OR R6, R3
MOVD R3, (R2)(R1*1) // z[i] = w>>s | w1<<(64-s)
ADD $8, R1 // i++
E9: CMPBLT R1, R5, L9 // i < n-1
// i >= n-1
X9a: SRD R4, R10 // w1>>s
MOVD R10, (R2)(R5*1) // z[n-1] = w1>>s
RET
X9b: MOVD R0, c+56(FP) // empty vector: c = 0
RET
// s == 0: z = x, c = 0
ZB0: MOVD z+0(FP), R2
MOVD x+24(FP), R8
SLD $3, R5 // R5 = (n-1)*8
MOVD (R8), R10 // w1 = x[0]
MOVD $0, R3
MOVD R3, c+56(FP) // c = 0
MOVD $0, R1 // i = 0
BR E9Z
// i < n-1
L9Z: MOVD R10, R3 // w = w1
MOVD 8(R8)(R1*1), R10 // w1 = x[i+1]
MOVD R3, (R2)(R1*1) // z[i] = w
ADD $8, R1 // i++
E9Z: CMPBLT R1, R5, L9Z // i < n-1
// i >= n-1
MOVD R10, (R2)(R5*1) // z[n-1] = w1
RET
// s == 64: z[i] = x[i+1], z[n-1] = 0, c = x[0]
ZB64: MOVD z+0(FP), R2
MOVD x+24(FP), R8
SLD $3, R5 // R5 = (n-1)*8
MOVD (R8), R3 // x[0]
MOVD R3, c+56(FP) // c = x[0]
MOVD $0, R1 // i = 0
BR E964
// i < n-1
L964: MOVD 8(R8)(R1*1), R3 // x[i+1]
MOVD R3, (R2)(R1*1) // z[i] = x[i+1]
ADD $8, R1 // i++
E964: CMPBLT R1, R5, L964 // i < n-1
// i >= n-1
MOVD $0, R10
MOVD R10, (R2)(R5*1) // z[n-1] = 0
RET
// func mulAddVWW(z, x []Word, y, r Word) (c Word)
//
// mulAddVWW computes z = x*y + r over the len(z) words: each iteration
// multiplies one word of x by y, adds the running carry c (initially r) to
// the low half, stores it in z, and keeps the high half plus the carry of
// that addition as the next c. The final c is returned.
//
// Registers: R2 = z, R8 = x, R9 = y, R5 = n, R4 = c, R7 = word index i,
// R1 = byte offset i*8, R0 = 0. After MULHDU the code uses R6 as the high
// half and R11 as the low half of the product.
TEXT ·mulAddVWW(SB),NOSPLIT,$0
	MOVD	z+0(FP), R2
	MOVD	z_len+8(FP), R5
	MOVD	x+24(FP), R8
	MOVD	y+48(FP), R9
	MOVD	r+56(FP), R4		// c = r
	MOVD	$0, R0			// constant zero
	MOVD	$0, R1			// byte offset i*8 = 0
	MOVD	$0, R7			// i = 0
	BR	check

next:
	MOVD	(R8)(R1*1), R6		// x[i]
	MULHDU	R9, R6
	ADDC	R4, R11			// low half += c
	ADDE	R0, R6			// high half += carry
	MOVD	R11, (R2)(R1*1)		// z[i] = low half
	MOVD	R6, R4			// c = high half
	ADD	$8, R1			// byte offset += 8
	ADD	$1, R7			// i++

check:
	CMPBLT	R7, R5, next		// while i < n
	MOVD	R4, c+64(FP)
	RET
// func addMulVVW(z, x []Word, y Word) (c Word)
//
// addMulVVW computes z += x*y over the len(z) words and returns the final
// carry word c. Each step multiplies one word of x by y, adds z[i] and then
// the running carry to the low half (folding both carries into the high
// half), stores the low half back to z[i], and keeps the high half as the
// next carry.
//
// The "pair" loop handles two words per iteration while i < (n &^ 1); the
// "single" loop handles whatever remains.
//
// Registers: R2 = z, R8 = x, R9 = y, R5 = n, R12 = n with bit 0 cleared,
// R4 = c, R7 = word index i, R1 = byte offset i*8, R10 = z[i], R0 = 0.
// After MULHDU the code uses R6 as the high half and R11 as the low half.
TEXT ·addMulVVW(SB),NOSPLIT,$0
	MOVD	z+0(FP), R2
	MOVD	z_len+8(FP), R5
	MOVD	x+24(FP), R8
	MOVD	y+48(FP), R9
	MOVD	$0, R0			// constant zero
	MOVD	$0, R1			// byte offset i*8 = 0
	MOVD	$0, R4			// c = 0
	MOVD	$0, R7			// i = 0
	MOVD	R5, R12
	AND	$-2, R12		// R12 = n rounded down to an even count
	CMPBLT	R5, $2, rest		// fewer than two words: skip the pair loop

pair:
	// word i
	MOVD	(R8)(R1*1), R6
	MULHDU	R9, R6
	MOVD	(R2)(R1*1), R10
	ADDC	R10, R11		// low half += z[i]
	ADDE	R0, R6
	ADDC	R4, R11			// low half += c
	ADDE	R0, R6
	MOVD	R6, R4			// c = high half
	MOVD	R11, (R2)(R1*1)

	// word i+1
	MOVD	8(R8)(R1*1), R6
	MULHDU	R9, R6
	MOVD	8(R2)(R1*1), R10
	ADDC	R10, R11		// low half += z[i+1]
	ADDE	R0, R6
	ADDC	R4, R11			// low half += c
	ADDE	R0, R6
	MOVD	R6, R4			// c = high half
	MOVD	R11, 8(R2)(R1*1)

	ADD	$16, R1			// byte offset += 16
	ADD	$2, R7			// i += 2
	CMPBLT	R7, R12, pair
	BR	rest

single:
	MOVD	(R8)(R1*1), R6
	MULHDU	R9, R6
	MOVD	(R2)(R1*1), R10
	ADDC	R10, R11		// low half += z[i]
	ADDE	R0, R6
	ADDC	R4, R11			// low half += c
	ADDE	R0, R6
	MOVD	R6, R4			// c = high half
	MOVD	R11, (R2)(R1*1)
	ADD	$8, R1			// byte offset += 8
	ADD	$1, R7			// i++

rest:
	CMPBLT	R7, R5, single		// while i < n
	MOVD	R4, c+56(FP)
	RET
// func divWVW(z []Word, xn Word, x []Word, y Word) (r Word)
//
// divWVW divides the multi-word value formed by xn followed by the len(z)
// words of x (most significant first, i.e. walking x from its last word down
// to word 0) by the single word y. Each step runs DLGR on R10 (running
// remainder, initially xn) and R11 (x[i]); the quotient word from R11 is
// stored in z[i] and R10 carries the remainder to the next step. The final
// remainder is returned in r.
//
// Registers: R2 = z, R8 = x, R9 = y, R7 = word index i, R1 = byte offset
// i*8, R0 = 0.
TEXT ·divWVW(SB),NOSPLIT,$0
	MOVD	z+0(FP), R2
	MOVD	z_len+8(FP), R7		// i = len(z)
	MOVD	xn+24(FP), R10		// r = xn
	MOVD	x+32(FP), R8
	MOVD	y+56(FP), R9
	MOVD	$0, R0			// constant zero
	SLD	$3, R7, R1		// byte offset = i*8
	BR	step

divide:
	MOVD	(R8)(R1*1), R11		// x[i]
	WORD	$0xB98700A9		// DLGR R10, R9
	MOVD	R11, (R2)(R1*1)		// z[i] = quotient word

step:
	SUB	$1, R7			// i--
	SUB	$8, R1			// byte offset -= 8
	BGE	divide			// while byte offset >= 0
	MOVD	R10, r+64(FP)
	RET