| // Copyright 2016 The Go Authors. All rights reserved. | 
 | // Use of this source code is governed by a BSD-style | 
 | // license that can be found in the LICENSE file. | 
 |  | 
 | // +build !math_big_pure_go,s390x | 
 |  | 
 | #include "textflag.h" | 
 |  | 
 | // This file provides fast assembly versions for the elementary | 
 | // arithmetic operations on vectors implemented in arith.go. | 
 |  | 
 | // hasVectorFacility stores 1 in the single result byte ret+0(FP) when the | 
 | // vector facility looks usable and 0 otherwise. | 
 | // | 
 | // It clears a 24-byte scratch area at x-24(SP), lets STFLE fill it, and tests | 
 | // the 0x40 bit of the byte at z-8(SP), i.e. offset 16 of that area. When the | 
 | // bit is set it additionally writes a byte into V16 and reads it back, and | 
 | // reports success only if the value 0xF round-trips. | 
 | // NOTE(review): 0x40 at byte 16 is presumably facility bit 129 (vector | 
 | // facility) — confirm against the z/Architecture Principles of Operation. | 
 | TEXT ·hasVectorFacility(SB),NOSPLIT,$24-1 | 
 |         MOVD    $x-24(SP), R1     // R1 = address of the 24-byte scratch area | 
 |         XC      $24, 0(R1), 0(R1) // clear the storage | 
 |         MOVD    $2, R0            // R0 is the number of double words stored -1 | 
 |         WORD    $0xB2B01000       // STFLE 0(R1) | 
 |         XOR     R0, R0            // reset the value of R0 | 
 |         MOVBZ   z-8(SP), R1       // byte at offset 16 of the scratch area | 
 |         AND     $0x40, R1         // isolate the tested bit | 
 |         BEQ     novector          // bit clear: report no vector support | 
 | vectorinstalled: | 
 |         // check if the vector instruction has been enabled | 
 |         VLEIB   $0, $0xF, V16 | 
 |         VLGVB   $0, V16, R1 | 
 |         CMPBNE  R1, $0xF, novector // value did not round-trip through V16 | 
 |         MOVB    $1, ret+0(FP) // have vx | 
 |         RET | 
 | novector: | 
 |         MOVB    $0, ret+0(FP) // no vx | 
 |         RET | 
 |  | 
 | // mulWW multiplies the words x+0(FP) and y+8(FP) and stores R10 to z1+16(FP) | 
 | // and R11 to z0+24(FP). | 
 | // NOTE(review): R10 and R11 are never written explicitly here; the code | 
 | // relies on MULHDU leaving the high word of the product in R10 and the low | 
 | // word in R11 — confirm against the assembler's expansion of MULHDU. | 
 | TEXT ·mulWW(SB),NOSPLIT,$0 | 
 | 	MOVD	x+0(FP), R3 | 
 | 	MOVD	y+8(FP), R4 | 
 | 	MULHDU	R3, R4 | 
 | 	MOVD	R10, z1+16(FP)	// presumably the high word of x*y | 
 | 	MOVD	R11, z0+24(FP)	// presumably the low word of x*y | 
 | 	RET | 
 |  | 
 | // func divWW(x1, x0, y Word) (q, r Word) | 
 | // | 
 | // Loads x1 into R10, x0 into R11 and y into R5, executes the raw-encoded | 
 | // "dlgr r10,r5", then returns R11 as q and R10 as r. The register choice is | 
 | // fixed by the instruction encoding in the WORD directive. | 
 | TEXT ·divWW(SB),NOSPLIT,$0 | 
 | 	MOVD	x1+0(FP), R10	// R10 = x1 | 
 | 	MOVD	x0+8(FP), R11	// R11 = x0 | 
 | 	MOVD	y+16(FP), R5	// R5 = y (divisor) | 
 | 	WORD	$0xb98700a5 // dlgr r10,r5 | 
 | 	MOVD	R11, q+24(FP)	// quotient is taken from R11 | 
 | 	MOVD	R10, r+32(FP)	// remainder is taken from R10 | 
 | 	RET | 
 |  | 
 | // Register map kept from the original source (x86-style names on the left, | 
 | // s390x registers on the right): | 
 | // DI = R3, CX = R4, SI = r10, r8 = r8, r9=r9, r10 = r2 , r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0) + use R11 | 
 | // | 
 | // func addVV(z, x, y []Word) (c Word) | 
 | // | 
 | // addVV itself is only a trampoline: it loads the code address currently | 
 | // stored in addvectorfacility and branches to it, leaving the caller's | 
 | // argument frame untouched for the selected implementation. | 
 |  | 
 |  | 
 | TEXT ·addVV(SB),NOSPLIT,$0 | 
 | 	MOVD	addvectorfacility+0x00(SB),R1	// R1 = current implementation | 
 | 	BR	(R1) | 
 | 	 | 
 | // addVV_check selects the addVV implementation. It reads ·hasVX, picks | 
 | // ·addVV_vec when the flag equals 1 and ·addVV_novec otherwise, stores the | 
 | // chosen address into addvectorfacility, and then branches to it. | 
 | TEXT ·addVV_check(SB),NOSPLIT, $0 | 
 | 	MOVD	$addvectorfacility+0x00(SB), R1	// R1 = address of the dispatch slot | 
 | 	MOVB	·hasVX(SB), R2 | 
 | 	CMPBNE	R2, $1, scalar			// flag != 1: vector not supported | 
 | 	MOVD	$·addVV_vec(SB), R2 | 
 | 	BR	install | 
 | scalar: | 
 | 	MOVD	$·addVV_novec(SB), R2 | 
 | install: | 
 | 	MOVD	R2, 0(R1)			// remember the choice in the slot | 
 | 	BR	(R2)				// continue in the chosen implementation | 
 |  | 
 | // addvectorfacility is the 8-byte slot that ·addVV branches through. It | 
 | // starts out holding the address of ·addVV_check, which overwrites it with | 
 | // the implementation it selects. | 
 | GLOBL addvectorfacility+0x00(SB), NOPTR, $8 | 
 | DATA addvectorfacility+0x00(SB)/8, $·addVV_check(SB) | 
 |  | 
 | // addVV_vec is the implementation of addVV that uses vector instructions. | 
 | // It reads its arguments from the addVV frame: z at 0(FP), len(z) at 8(FP), | 
 | // x at 24(FP), y at 48(FP), and stores the carry out at c+72(FP). | 
 | // | 
 | // Work is done in three stages: blocks of 16 words with vector registers | 
 | // (UU1), blocks of 4 words with ADDC/ADDE (U1), then single words (L1). | 
 | // | 
 | // Register use: | 
 | //   R2, R8, R9   base addresses of z, x, y | 
 | //   R3           remaining word count, kept biased by the SUB/ADD pairs | 
 | //   R10          byte offset of the next word, used by the scalar loops | 
 | //   R5, R6, R7   running pointers into x, y, z inside the vector loop | 
 | //   R4           carry between blocks, held as 0 or -1 | 
 | //   R0           zero | 
 | //   R1, R5-R7, R11  scratch inside the scalar loops | 
 | TEXT ·addVV_vec(SB),NOSPLIT,$0 | 
 | 	MOVD	z_len+8(FP), R3 | 
 | 	MOVD	x+24(FP), R8 | 
 | 	MOVD	y+48(FP), R9 | 
 | 	MOVD	z+0(FP), R2 | 
 |  | 
 | 	MOVD	$0, R4		// c = 0 | 
 | 	MOVD	$0, R0		// make sure it's zero | 
 | 	MOVD	$0, R10		// i = 0 | 
 |  | 
 |  | 
 | 	// fewer than 4 words: skip both unrolled loops | 
 | 	SUB	$4, R3 | 
 | 	BLT	v1 | 
 | 	SUB     $12, R3                 // n -= 12, i.e. n = len(z) - 16 | 
 |         BLT     A1                      // if n < 0 goto A1 | 
 |         | 
 | 	MOVD	R8, R5 | 
 | 	MOVD	R9, R6 | 
 | 	MOVD	R2, R7 | 
 | 	// n >= 0 | 
 | 	// regular loop body unrolled 16x | 
 | 	// The carry is threaded V0 -> V25 -> V26 -> ... -> V31 -> V0; the sums | 
 | 	// are collected in V17..V24. | 
 | 	VZERO	V0			// c = 0 | 
 | UU1:	VLM	0(R5), V1, V4		// 64 bytes of x into V1..V4 | 
 | 	ADD	$64, R5 | 
 | 	VPDI	$0x4,V1,V1,V1		// flip the doublewords to big-endian order | 
 | 	VPDI	$0x4,V2,V2,V2		// flip the doublewords to big-endian order | 
 |  | 
 |  | 
 | 	VLM	0(R6), V9, V12  	// 64 bytes of y into V9..V12 | 
 | 	ADD	$64, R6 | 
 | 	VPDI	$0x4,V9,V9,V9		// flip the doublewords to big-endian order | 
 | 	VPDI	$0x4,V10,V10,V10	// flip the doublewords to big-endian order | 
 |  | 
 | 	VACCCQ	V1, V9, V0, V25 | 
 | 	VACQ	V1, V9, V0, V17 | 
 | 	VACCCQ	V2, V10, V25, V26 | 
 | 	VACQ	V2, V10, V25, V18 | 
 |  | 
 |  | 
 | 	VLM	0(R5), V5, V6		// next 32 bytes of x into V5..V6 | 
 | 	VLM	0(R6), V13, V14  	// next 32 bytes of y into V13..V14 | 
 | 	ADD	$32, R5 | 
 | 	ADD	$32, R6 | 
 |  | 
 | 	VPDI	$0x4,V3,V3,V3		// flip the doublewords to big-endian order | 
 | 	VPDI	$0x4,V4,V4,V4		// flip the doublewords to big-endian order | 
 | 	VPDI	$0x4,V11,V11,V11	// flip the doublewords to big-endian order | 
 | 	VPDI	$0x4,V12,V12,V12	// flip the doublewords to big-endian order | 
 |  | 
 | 	VACCCQ	V3, V11, V26, V27 | 
 | 	VACQ	V3, V11, V26, V19 | 
 | 	VACCCQ	V4, V12, V27, V28 | 
 | 	VACQ	V4, V12, V27, V20 | 
 |  | 
 | 	VLM	0(R5), V7, V8		// last 32 bytes of x into V7..V8 | 
 | 	VLM	0(R6), V15, V16  	// last 32 bytes of y into V15..V16 | 
 | 	ADD	$32, R5 | 
 | 	ADD	$32, R6 | 
 |  | 
 | 	VPDI	$0x4,V5,V5,V5		// flip the doublewords to big-endian order | 
 | 	VPDI	$0x4,V6,V6,V6		// flip the doublewords to big-endian order | 
 | 	VPDI	$0x4,V13,V13,V13	// flip the doublewords to big-endian order | 
 | 	VPDI	$0x4,V14,V14,V14	// flip the doublewords to big-endian order | 
 |  | 
 | 	VACCCQ	V5, V13, V28, V29 | 
 | 	VACQ	V5, V13, V28, V21 | 
 | 	VACCCQ	V6, V14, V29, V30 | 
 | 	VACQ	V6, V14, V29, V22 | 
 |  | 
 | 	VPDI	$0x4,V7,V7,V7		// flip the doublewords to big-endian order | 
 | 	VPDI	$0x4,V8,V8,V8		// flip the doublewords to big-endian order | 
 | 	VPDI	$0x4,V15,V15,V15	// flip the doublewords to big-endian order | 
 | 	VPDI	$0x4,V16,V16,V16	// flip the doublewords to big-endian order | 
 |  | 
 | 	VACCCQ	V7, V15, V30, V31 | 
 | 	VACQ	V7, V15, V30, V23 | 
 | 	VACCCQ	V8, V16, V31, V0	//V0 has carry-over | 
 | 	VACQ	V8, V16, V31, V24 | 
 |  | 
 | 	VPDI	$0x4,V17,V17,V17	// flip the doublewords back to memory order | 
 | 	VPDI	$0x4,V18,V18,V18	// flip the doublewords back to memory order | 
 | 	VPDI	$0x4,V19,V19,V19	// flip the doublewords back to memory order | 
 | 	VPDI	$0x4,V20,V20,V20	// flip the doublewords back to memory order | 
 | 	VPDI	$0x4,V21,V21,V21	// flip the doublewords back to memory order | 
 | 	VPDI	$0x4,V22,V22,V22	// flip the doublewords back to memory order | 
 | 	VPDI	$0x4,V23,V23,V23	// flip the doublewords back to memory order | 
 | 	VPDI	$0x4,V24,V24,V24	// flip the doublewords back to memory order | 
 | 	VSTM	V17, V24, 0(R7)  	// 128-bytes into z | 
 | 	ADD	$128, R7 | 
 | 	ADD	$128, R10	// i += 16 | 
 | 	SUB	$16,  R3	// n -= 16 | 
 | 	BGE	UU1		// if n >= 0 goto UU1 | 
 | 	VLGVG	$1, V0, R4	// put cf into R4 | 
 | 	NEG	R4, R4		// keep it as 0 or -1, the form the scalar loops restore from | 
 |  | 
 | A1:	ADD	$12, R3		// n += 12, back to the len-4 bias of the 4x loop | 
 |  | 
 |  | 
 | 	// fewer than 4 words left: skip the 4x loop | 
 | 	BLT	v1		// if n < 0 goto v1 | 
 |  | 
 | U1:	// n >= 0 | 
 | 	// regular loop body unrolled 4x | 
 | 	MOVD	0(R8)(R10*1), R5 | 
 | 	MOVD	8(R8)(R10*1), R6 | 
 | 	MOVD	16(R8)(R10*1), R7 | 
 | 	MOVD	24(R8)(R10*1), R1 | 
 | 	ADDC	R4, R4		// restore CF | 
 | 	MOVD	0(R9)(R10*1), R11 | 
 | 	ADDE	R11, R5 | 
 | 	MOVD	8(R9)(R10*1), R11 | 
 | 	ADDE	R11, R6 | 
 | 	MOVD	16(R9)(R10*1), R11 | 
 | 	ADDE	R11, R7 | 
 | 	MOVD	24(R9)(R10*1), R11 | 
 | 	ADDE	R11, R1 | 
 | 	MOVD	R0, R4 | 
 | 	ADDE	R4, R4		// save CF | 
 | 	NEG	R4, R4 | 
 | 	MOVD	R5, 0(R2)(R10*1) | 
 | 	MOVD	R6, 8(R2)(R10*1) | 
 | 	MOVD	R7, 16(R2)(R10*1) | 
 | 	MOVD	R1, 24(R2)(R10*1) | 
 |  | 
 |  | 
 | 	ADD	$32, R10	// i += 4 | 
 | 	SUB	$4,  R3		// n -= 4 | 
 | 	BGE	U1		// if n >= 0 goto U1 | 
 |  | 
 | v1:	ADD	$4, R3		// n += 4 | 
 | 	BLE	E1		// if n <= 0 goto E1 | 
 |  | 
 | L1:	// n > 0 | 
 | 	ADDC	R4, R4		// restore CF | 
 | 	MOVD	0(R8)(R10*1), R5 | 
 | 	MOVD	0(R9)(R10*1), R11 | 
 | 	ADDE	R11, R5 | 
 | 	MOVD	R5, 0(R2)(R10*1) | 
 | 	MOVD	R0, R4 | 
 | 	ADDE	R4, R4		// save CF | 
 | 	NEG 	R4, R4 | 
 |  | 
 | 	ADD	$8, R10		// i++ | 
 | 	SUB	$1, R3		// n-- | 
 | 	BGT	L1		// if n > 0 goto L1 | 
 |  | 
 | E1:	NEG	R4, R4		// turn the 0/-1 carry state into 0/1 | 
 | 	MOVD	R4, c+72(FP)	// return c | 
 | 	RET | 
 |  | 
 | // addVV_novec is the implementation of addVV that uses only general | 
 | // registers. It reads its arguments from the addVV frame (z at 0(FP), len(z) | 
 | // at 8(FP), x at 24(FP), y at 48(FP)) and stores the carry out at c+72(FP). | 
 | // | 
 | // Words are added four at a time (U1n), then one at a time (L1n). Between | 
 | // blocks the carry is parked in R4 as 0 or -1 so that "ADDC R4, R4" | 
 | // regenerates the carry flag; the loads and stores placed between ADDC and | 
 | // the ADDE chain must not disturb that flag. | 
 | // | 
 | // Register use: R2/R8/R9 = z/x/y, R3 = biased remaining count, R10 = byte | 
 | // offset, R0 = zero, R1/R5/R6/R7/R11 = scratch. | 
 | TEXT ·addVV_novec(SB),NOSPLIT,$0 | 
 | novec: | 
 | 	MOVD	z_len+8(FP), R3 | 
 | 	MOVD	x+24(FP), R8 | 
 | 	MOVD	y+48(FP), R9 | 
 | 	MOVD	z+0(FP), R2 | 
 |  | 
 | 	MOVD	$0, R4		// c = 0 | 
 | 	MOVD	$0, R0		// make sure it's zero | 
 | 	MOVD	$0, R10		// i = 0 | 
 |  | 
 | 	// fewer than 4 words: skip the unrolled loop | 
 | 	SUB	$4, R3		// n -= 4 | 
 | 	BLT	v1n		// if n < 0 goto v1n | 
 | U1n:	// n >= 0 | 
 | 	// regular loop body unrolled 4x | 
 | 	MOVD	0(R8)(R10*1), R5 | 
 | 	MOVD	8(R8)(R10*1), R6 | 
 | 	MOVD	16(R8)(R10*1), R7 | 
 | 	MOVD	24(R8)(R10*1), R1 | 
 | 	ADDC	R4, R4		// restore CF | 
 | 	MOVD	0(R9)(R10*1), R11 | 
 | 	ADDE	R11, R5 | 
 | 	MOVD	8(R9)(R10*1), R11 | 
 | 	ADDE	R11, R6 | 
 | 	MOVD	16(R9)(R10*1), R11 | 
 | 	ADDE	R11, R7 | 
 | 	MOVD	24(R9)(R10*1), R11 | 
 | 	ADDE	R11, R1 | 
 | 	MOVD	R0, R4 | 
 | 	ADDE	R4, R4		// save CF | 
 | 	NEG	R4, R4 | 
 | 	MOVD	R5, 0(R2)(R10*1) | 
 | 	MOVD	R6, 8(R2)(R10*1) | 
 | 	MOVD	R7, 16(R2)(R10*1) | 
 | 	MOVD	R1, 24(R2)(R10*1) | 
 |  | 
 |  | 
 | 	ADD	$32, R10	// i += 4 | 
 | 	SUB	$4,  R3		// n -= 4 | 
 | 	BGE	U1n		// if n >= 0 goto U1n | 
 |  | 
 | v1n:	ADD	$4, R3		// n += 4 | 
 | 	BLE	E1n		// if n <= 0 goto E1n | 
 |  | 
 | L1n:	// n > 0 | 
 | 	ADDC	R4, R4		// restore CF | 
 | 	MOVD	0(R8)(R10*1), R5 | 
 | 	MOVD	0(R9)(R10*1), R11 | 
 | 	ADDE	R11, R5 | 
 | 	MOVD	R5, 0(R2)(R10*1) | 
 | 	MOVD	R0, R4 | 
 | 	ADDE	R4, R4		// save CF | 
 | 	NEG 	R4, R4 | 
 |  | 
 | 	ADD	$8, R10		// i++ | 
 | 	SUB	$1, R3		// n-- | 
 | 	BGT L1n			// if n > 0 goto L1n | 
 |  | 
 | E1n:	NEG	R4, R4		// turn the 0/-1 carry state into 0/1 | 
 | 	MOVD	R4, c+72(FP)	// return c | 
 | 	RET | 
 |  | 
 |  | 
 | // subVV is a trampoline: it loads the code address currently stored in | 
 | // subvectorfacility and branches to it, leaving the caller's argument frame | 
 | // untouched for the selected implementation. | 
 | TEXT ·subVV(SB),NOSPLIT,$0 | 
 | 	MOVD	subvectorfacility+0x00(SB),R1	// R1 = current implementation | 
 | 	BR	(R1) | 
 | 	 | 
 | // subVV_check selects the subVV implementation. It reads ·hasVX, picks | 
 | // ·subVV_vec when the flag equals 1 and ·subVV_novec otherwise, stores the | 
 | // chosen address into subvectorfacility, and then branches to it. | 
 | TEXT ·subVV_check(SB),NOSPLIT,$0 | 
 | 	MOVD	$subvectorfacility+0x00(SB), R1	// R1 = address of the dispatch slot | 
 | 	MOVB	·hasVX(SB), R2 | 
 | 	CMPBNE	R2, $1, scalar			// flag != 1: vector not supported | 
 | 	MOVD	$·subVV_vec(SB), R2 | 
 | 	BR	install | 
 | scalar: | 
 | 	MOVD	$·subVV_novec(SB), R2 | 
 | install: | 
 | 	MOVD	R2, 0(R1)			// remember the choice in the slot | 
 | 	BR	(R2)				// continue in the chosen implementation | 
 |  | 
 | // subvectorfacility is the 8-byte slot that ·subVV branches through. It | 
 | // starts out holding the address of ·subVV_check, which overwrites it with | 
 | // the implementation it selects. | 
 | GLOBL subvectorfacility+0x00(SB), NOPTR, $8 | 
 | DATA subvectorfacility+0x00(SB)/8, $·subVV_check(SB) | 
 |  | 
 | // Register map kept from the original source (x86-style names on the left): | 
 | // DI = R3, CX = R4, SI = r10, r8 = r8, r9=r9, r10 = r2 , r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0) + use R11 | 
 | // | 
 | // subVV_vec is the implementation of subVV that uses vector instructions. | 
 | // It reads its arguments from the frame of | 
 | //	func subVV(z, x, y []Word) (c Word) | 
 | // and has the same three-stage shape as addVV_vec, with VSBCBIQ/VSBIQ in the | 
 | // 16-word loop and SUBC/SUBE in the scalar loops. | 
 | // | 
 | // Borrow handling: the vector loop starts with 1 in V0 ("no borrow") and | 
 | // hands over V0-1 in R4, i.e. 0 or -1. The scalar loops regenerate the flag | 
 | // with "SUBC R4, R11" on a zeroed R11 and save it again with "SUBE R4, R4". | 
 | TEXT ·subVV_vec(SB),NOSPLIT,$0 | 
 | 	MOVD	z_len+8(FP), R3 | 
 | 	MOVD	x+24(FP), R8 | 
 | 	MOVD	y+48(FP), R9 | 
 | 	MOVD	z+0(FP), R2 | 
 | 	MOVD	$0, R4		// c = 0 | 
 | 	MOVD	$0, R0		// make sure it's zero | 
 | 	MOVD	$0, R10		// i = 0 | 
 | 	 | 
 | 	// fewer than 4 words: skip both unrolled loops | 
 | 	SUB	$4, R3		// n -= 4 | 
 | 	BLT	v1		// if n < 0 goto v1 | 
 | 	SUB     $12, R3         // n -= 12, i.e. n = len(z) - 16 | 
 |         BLT     A1              // if n < 0 goto A1 | 
 |  | 
 | 	MOVD	R8, R5 | 
 | 	MOVD	R9, R6 | 
 | 	MOVD	R2, R7 | 
 |  | 
 | 	// n >= 0 | 
 | 	// regular loop body unrolled 16x | 
 | 	VZERO	V0		// cf = 0 | 
 | 	MOVD	$1, R4		// for 390 subtraction cf starts as 1 (no borrow) | 
 | 	VLVGG	$1, R4, V0	//put carry into V0 | 
 |  | 
 | UU1:	VLM	0(R5), V1, V4		// 64 bytes of x into V1..V4 | 
 | 	ADD	$64, R5 | 
 | 	VPDI	$0x4,V1,V1,V1		// flip the doublewords to big-endian order | 
 | 	VPDI	$0x4,V2,V2,V2		// flip the doublewords to big-endian order | 
 |  | 
 |  | 
 | 	VLM	0(R6), V9, V12  	// 64 bytes of y into V9..V12 | 
 | 	ADD	$64, R6 | 
 | 	VPDI	$0x4,V9,V9,V9		// flip the doublewords to big-endian order | 
 | 	VPDI	$0x4,V10,V10,V10	// flip the doublewords to big-endian order | 
 |  | 
 | 	VSBCBIQ	V1, V9, V0, V25 | 
 | 	VSBIQ	V1, V9, V0, V17 | 
 | 	VSBCBIQ	V2, V10, V25, V26 | 
 | 	VSBIQ	V2, V10, V25, V18 | 
 |  | 
 |  | 
 | 	VLM	0(R5), V5, V6		// next 32 bytes of x into V5..V6 | 
 | 	VLM	0(R6), V13, V14  	// next 32 bytes of y into V13..V14 | 
 | 	ADD	$32, R5 | 
 | 	ADD	$32, R6 | 
 |  | 
 | 	VPDI	$0x4,V3,V3,V3		// flip the doublewords to big-endian order | 
 | 	VPDI	$0x4,V4,V4,V4		// flip the doublewords to big-endian order | 
 | 	VPDI	$0x4,V11,V11,V11	// flip the doublewords to big-endian order | 
 | 	VPDI	$0x4,V12,V12,V12	// flip the doublewords to big-endian order | 
 |  | 
 | 	VSBCBIQ	V3, V11, V26, V27 | 
 | 	VSBIQ	V3, V11, V26, V19 | 
 | 	VSBCBIQ	V4, V12, V27, V28 | 
 | 	VSBIQ	V4, V12, V27, V20 | 
 |  | 
 | 	VLM	0(R5), V7, V8		// last 32 bytes of x into V7..V8 | 
 | 	VLM	0(R6), V15, V16  	// last 32 bytes of y into V15..V16 | 
 | 	ADD	$32, R5 | 
 | 	ADD	$32, R6 | 
 |  | 
 | 	VPDI	$0x4,V5,V5,V5		// flip the doublewords to big-endian order | 
 | 	VPDI	$0x4,V6,V6,V6		// flip the doublewords to big-endian order | 
 | 	VPDI	$0x4,V13,V13,V13	// flip the doublewords to big-endian order | 
 | 	VPDI	$0x4,V14,V14,V14	// flip the doublewords to big-endian order | 
 |  | 
 | 	VSBCBIQ	V5, V13, V28, V29 | 
 | 	VSBIQ	V5, V13, V28, V21 | 
 | 	VSBCBIQ	V6, V14, V29, V30 | 
 | 	VSBIQ	V6, V14, V29, V22 | 
 |  | 
 | 	VPDI	$0x4,V7,V7,V7		// flip the doublewords to big-endian order | 
 | 	VPDI	$0x4,V8,V8,V8		// flip the doublewords to big-endian order | 
 | 	VPDI	$0x4,V15,V15,V15	// flip the doublewords to big-endian order | 
 | 	VPDI	$0x4,V16,V16,V16	// flip the doublewords to big-endian order | 
 |  | 
 | 	VSBCBIQ	V7, V15, V30, V31 | 
 | 	VSBIQ	V7, V15, V30, V23 | 
 | 	VSBCBIQ	V8, V16, V31, V0	//V0 has carry-over | 
 | 	VSBIQ	V8, V16, V31, V24 | 
 |  | 
 | 	VPDI	$0x4,V17,V17,V17	// flip the doublewords back to memory order | 
 | 	VPDI	$0x4,V18,V18,V18	// flip the doublewords back to memory order | 
 | 	VPDI	$0x4,V19,V19,V19	// flip the doublewords back to memory order | 
 | 	VPDI	$0x4,V20,V20,V20	// flip the doublewords back to memory order | 
 | 	VPDI	$0x4,V21,V21,V21	// flip the doublewords back to memory order | 
 | 	VPDI	$0x4,V22,V22,V22	// flip the doublewords back to memory order | 
 | 	VPDI	$0x4,V23,V23,V23	// flip the doublewords back to memory order | 
 | 	VPDI	$0x4,V24,V24,V24	// flip the doublewords back to memory order | 
 | 	VSTM	V17, V24, 0(R7)   // 128-bytes into z | 
 | 	ADD	$128, R7 | 
 | 	ADD	$128, R10	// i += 16 | 
 | 	SUB	$16,  R3	// n -= 16 | 
 | 	BGE	UU1		// if n >= 0 goto UU1 | 
 | 	VLGVG	$1, V0, R4	// put cf into R4 | 
 | 	SUB	$1, R4		// 1 (no borrow) -> 0, 0 (borrow) -> -1 | 
 |  | 
 | A1:	ADD	$12, R3		// n += 12, back to the len-4 bias of the 4x loop | 
 | 	BLT	v1		// if n < 0 goto v1 | 
 | 	 | 
 | U1:	// n >= 0 | 
 | 	// regular loop body unrolled 4x | 
 | 	MOVD	0(R8)(R10*1), R5 | 
 | 	MOVD	8(R8)(R10*1), R6 | 
 | 	MOVD	16(R8)(R10*1), R7 | 
 | 	MOVD	24(R8)(R10*1), R1 | 
 | 	MOVD	R0, R11 | 
 | 	SUBC	R4, R11		// restore CF | 
 | 	MOVD	0(R9)(R10*1), R11 | 
 | 	SUBE	R11, R5 | 
 | 	MOVD	8(R9)(R10*1), R11 | 
 | 	SUBE	R11, R6 | 
 | 	MOVD	16(R9)(R10*1), R11 | 
 | 	SUBE	R11, R7 | 
 | 	MOVD	24(R9)(R10*1), R11 | 
 | 	SUBE	R11, R1 | 
 | 	MOVD	R0, R4 | 
 | 	SUBE	R4, R4		// save CF | 
 | 	MOVD	R5, 0(R2)(R10*1) | 
 | 	MOVD	R6, 8(R2)(R10*1) | 
 | 	MOVD	R7, 16(R2)(R10*1) | 
 | 	MOVD	R1, 24(R2)(R10*1) | 
 |  | 
 | 	ADD	$32, R10	// i += 4 | 
 | 	SUB	$4,  R3		// n -= 4 | 
 | 	BGE	U1		// if n >= 0 goto U1 | 
 |  | 
 | v1:	ADD	$4, R3		// n += 4 | 
 | 	BLE	E1		// if n <= 0 goto E1 | 
 |  | 
 | L1:	// n > 0 | 
 | 	MOVD	R0, R11 | 
 | 	SUBC	R4, R11		// restore CF | 
 | 	MOVD	0(R8)(R10*1), R5 | 
 | 	MOVD	0(R9)(R10*1), R11 | 
 | 	SUBE	R11, R5 | 
 | 	MOVD	R5, 0(R2)(R10*1) | 
 | 	MOVD	R0, R4 | 
 | 	SUBE	R4, R4		// save CF | 
 |  | 
 | 	ADD	$8, R10		// i++ | 
 | 	SUB	$1, R3		// n-- | 
 | 	BGT	L1		// if n > 0 goto L1 | 
 |  | 
 | E1:	NEG	R4, R4		// turn the 0/-1 borrow state into 0/1 | 
 | 	MOVD	R4, c+72(FP)	// return c | 
 | 	RET | 
 |  | 
 |  | 
 | // Register map kept from the original source (x86-style names on the left): | 
 | // DI = R3, CX = R4, SI = r10, r8 = r8, r9=r9, r10 = r2 , r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0) + use R11 | 
 | // | 
 | // subVV_novec is the implementation of subVV that uses only general | 
 | // registers. It reads its arguments from the frame of | 
 | //	func subVV(z, x, y []Word) (c Word) | 
 | // (same shape as addVV_novec except for SUBC/SUBE instead of ADDC/ADDE and | 
 | // label names). | 
 | // | 
 | // Between blocks the borrow is parked in R4 as 0 or -1; "SUBC R4, R11" on a | 
 | // zeroed R11 regenerates the flag and "SUBE R4, R4" on a zeroed R4 saves it. | 
 | TEXT ·subVV_novec(SB),NOSPLIT,$0 | 
 | 	MOVD z_len+8(FP), R3 | 
 | 	MOVD x+24(FP), R8 | 
 | 	MOVD y+48(FP), R9 | 
 | 	MOVD z+0(FP), R2 | 
 |  | 
 | 	MOVD $0, R4		// c = 0 | 
 | 	MOVD $0, R0		// make sure it's zero | 
 | 	MOVD $0, R10		// i = 0 | 
 |  | 
 | 	// fewer than 4 words: skip the unrolled loop | 
 | 	SUB  $4, R3		// n -= 4 | 
 | 	BLT v1			// if n < 0 goto v1 | 
 |  | 
 | U1:	// n >= 0 | 
 | 	// regular loop body unrolled 4x | 
 | 	MOVD 0(R8)(R10*1), R5 | 
 | 	MOVD 8(R8)(R10*1), R6 | 
 | 	MOVD 16(R8)(R10*1), R7 | 
 | 	MOVD 24(R8)(R10*1), R1 | 
 | 	MOVD R0, R11 | 
 | 	SUBC R4, R11		// restore CF | 
 | 	MOVD 0(R9)(R10*1), R11 | 
 | 	SUBE R11, R5 | 
 | 	MOVD 8(R9)(R10*1), R11 | 
 | 	SUBE R11, R6 | 
 | 	MOVD 16(R9)(R10*1), R11 | 
 | 	SUBE R11, R7 | 
 | 	MOVD 24(R9)(R10*1), R11 | 
 | 	SUBE R11, R1 | 
 | 	MOVD R0, R4 | 
 | 	SUBE R4, R4		// save CF | 
 | 	MOVD R5, 0(R2)(R10*1) | 
 | 	MOVD R6, 8(R2)(R10*1) | 
 | 	MOVD R7, 16(R2)(R10*1) | 
 | 	MOVD R1, 24(R2)(R10*1) | 
 |  | 
 |  | 
 | 	ADD  $32, R10		// i += 4 | 
 | 	SUB  $4,  R3		// n -= 4 | 
 | 	BGE  U1			// if n >= 0 goto U1 | 
 |  | 
 | v1:	ADD  $4, R3		// n += 4 | 
 | 	BLE E1			// if n <= 0 goto E1 | 
 |  | 
 | L1:	// n > 0 | 
 | 	MOVD R0, R11 | 
 | 	SUBC R4, R11		// restore CF | 
 | 	MOVD 0(R8)(R10*1), R5 | 
 | 	MOVD 0(R9)(R10*1), R11 | 
 | 	SUBE R11, R5 | 
 | 	MOVD R5, 0(R2)(R10*1) | 
 | 	MOVD R0, R4 | 
 | 	SUBE R4, R4		// save CF | 
 |  | 
 | 	ADD  $8, R10		// i++ | 
 | 	SUB  $1, R3		// n-- | 
 | 	BGT L1			// if n > 0 goto L1 | 
 |  | 
 | E1:	NEG  R4, R4		// turn the 0/-1 borrow state into 0/1 | 
 | 	MOVD R4, c+72(FP)	// return c | 
 | 	RET | 
 |  | 
 | // addVW is a trampoline: it loads the code address currently stored in | 
 | // addwvectorfacility and branches to it, leaving the caller's argument frame | 
 | // untouched for the selected implementation. | 
 | TEXT ·addVW(SB),NOSPLIT,$0 | 
 | 	MOVD	addwvectorfacility+0x00(SB),R1	// R1 = current implementation | 
 | 	BR	(R1) | 
 | 	 | 
 | // addVW_check selects the addVW implementation. It reads ·hasVX, picks | 
 | // ·addVW_vec when the flag equals 1 and ·addVW_novec otherwise, stores the | 
 | // chosen address into addwvectorfacility, and then branches to it. | 
 | TEXT ·addVW_check(SB),NOSPLIT,$0 | 
 | 	MOVD	$addwvectorfacility+0x00(SB), R1	// R1 = address of the dispatch slot | 
 | 	MOVB	·hasVX(SB), R2 | 
 | 	CMPBNE	R2, $1, scalar			// flag != 1: vector not supported | 
 | 	MOVD	$·addVW_vec(SB), R2 | 
 | 	BR	install | 
 | scalar: | 
 | 	MOVD	$·addVW_novec(SB), R2 | 
 | install: | 
 | 	MOVD	R2, 0(R1)			// remember the choice in the slot | 
 | 	BR	(R2)				// continue in the chosen implementation | 
 |  | 
 | // addwvectorfacility is the 8-byte slot that ·addVW branches through. It | 
 | // starts out holding the address of ·addVW_check, which overwrites it with | 
 | // the implementation it selects. | 
 | GLOBL addwvectorfacility+0x00(SB), NOPTR, $8 | 
 | DATA addwvectorfacility+0x00(SB)/8, $·addVW_check(SB) | 
 |  | 
 |  | 
 | // func addVW_vec(z, x []Word, y Word) (c Word) | 
 | // | 
 | // Vector implementation of addVW: adds the single word y to the vector x, | 
 | // writes the result to z and stores the final carry at c+56(FP). | 
 | // | 
 | // Work is done in three stages: blocks of 16 words with vector registers | 
 | // (UU1), blocks of 4 words with ADDC/ADDE (U4), then single words (L4). | 
 | // | 
 | // Register use: | 
 | //   R2, R8      base addresses of z, x | 
 | //   R3          remaining word count, kept biased by the SUB/ADD pairs | 
 | //   R10         byte offset of the next word, used by the scalar loops | 
 | //   R5, R7      running pointers into x, z inside the vector loop | 
 | //   R4          the word still to be added: y at first, afterwards the | 
 | //               carry (0 or 1) out of the previous block | 
 | //   V9          addend for the vector loop: y for the first pair of words, | 
 | //               zero from then on | 
 | //   R0          zero (briefly used to capture the carry, then cleared) | 
 | TEXT ·addVW_vec(SB),NOSPLIT,$0 | 
 | 	MOVD	z_len+8(FP), R3 | 
 | 	MOVD	x+24(FP), R8 | 
 | 	MOVD	y+48(FP), R4	// c = y | 
 | 	MOVD	z+0(FP), R2 | 
 |  | 
 | 	MOVD	$0, R0		// make sure it's zero | 
 | 	MOVD	$0, R10		// i = 0 | 
 | 	MOVD	R8, R5 | 
 | 	MOVD	R2, R7 | 
 |  | 
 | 	// fewer than 4 words: skip both unrolled loops | 
 | 	SUB	$4, R3			// n -= 4 | 
 | 	BLT	v10			// if n < 0 goto v10 | 
 | 	SUB	$12, R3			// n -= 12, i.e. n = len(z) - 16 | 
 | 	BLT	A10 | 
 |  | 
 | 	// n >= 0 | 
 | 	// regular loop body unrolled 16x | 
 |  | 
 | 	VZERO	V0			// prepare V0 to be final carry register | 
 | 	VZERO	V9			// to ensure upper half is zero | 
 | 	VLVGG	$1, R4, V9		// V9 = y | 
 | UU1:	VLM	0(R5), V1, V4		// 64-bytes into V1..V4 | 
 | 	ADD	$64, R5 | 
 | 	VPDI	$0x4,V1,V1,V1		// flip the doublewords to big-endian order | 
 | 	VPDI	$0x4,V2,V2,V2		// flip the doublewords to big-endian order | 
 |  | 
 |  | 
 | 	VACCCQ	V1, V9, V0, V25 | 
 | 	VACQ	V1, V9, V0, V17 | 
 | 	VZERO	V9			// y has been added; only the carry propagates now | 
 | 	VACCCQ	V2, V9, V25, V26 | 
 | 	VACQ	V2, V9, V25, V18 | 
 |  | 
 |  | 
 | 	VLM	0(R5), V5, V6		// 32-bytes into V5..V6 | 
 | 	ADD	$32, R5 | 
 |  | 
 | 	VPDI	$0x4,V3,V3,V3		// flip the doublewords to big-endian order | 
 | 	VPDI	$0x4,V4,V4,V4		// flip the doublewords to big-endian order | 
 |  | 
 | 	VACCCQ	V3, V9, V26, V27 | 
 | 	VACQ	V3, V9, V26, V19 | 
 | 	VACCCQ	V4, V9, V27, V28 | 
 | 	VACQ	V4, V9, V27, V20 | 
 |  | 
 | 	VLM	0(R5), V7, V8		// 32-bytes into V7..V8 | 
 | 	ADD	$32, R5 | 
 |  | 
 | 	VPDI	$0x4,V5,V5,V5		// flip the doublewords to big-endian order | 
 | 	VPDI	$0x4,V6,V6,V6		// flip the doublewords to big-endian order | 
 |  | 
 | 	VACCCQ	V5, V9, V28, V29 | 
 | 	VACQ	V5, V9, V28, V21 | 
 | 	VACCCQ	V6, V9, V29, V30 | 
 | 	VACQ	V6, V9, V29, V22 | 
 |  | 
 | 	VPDI	$0x4,V7,V7,V7		// flip the doublewords to big-endian order | 
 | 	VPDI	$0x4,V8,V8,V8		// flip the doublewords to big-endian order | 
 |  | 
 | 	VACCCQ	V7, V9, V30, V31 | 
 | 	VACQ	V7, V9, V30, V23 | 
 | 	VACCCQ	V8, V9, V31, V0	//V0 has carry-over | 
 | 	VACQ	V8, V9, V31, V24 | 
 |  | 
 | 	VPDI	$0x4,V17,V17,V17	// flip the doublewords back to memory order | 
 | 	VPDI	$0x4,V18,V18,V18	// flip the doublewords back to memory order | 
 | 	VPDI	$0x4,V19,V19,V19	// flip the doublewords back to memory order | 
 | 	VPDI	$0x4,V20,V20,V20	// flip the doublewords back to memory order | 
 | 	VPDI	$0x4,V21,V21,V21	// flip the doublewords back to memory order | 
 | 	VPDI	$0x4,V22,V22,V22	// flip the doublewords back to memory order | 
 | 	VPDI	$0x4,V23,V23,V23	// flip the doublewords back to memory order | 
 | 	VPDI	$0x4,V24,V24,V24	// flip the doublewords back to memory order | 
 | 	VSTM	V17, V24, 0(R7)   	// 128-bytes into z | 
 | 	ADD	$128, R7 | 
 | 	ADD	$128, R10		// i += 16 | 
 | 	SUB	$16,  R3		// n -= 16 | 
 | 	BGE	UU1		// if n >= 0 goto UU1 | 
 | 	VLGVG	$1, V0, R4	// put cf into R4 in case we branch to v10 | 
 |  | 
 | A10:	ADD	$12, R3		// n += 12, back to the len-4 bias of the 4x loop | 
 |  | 
 |  | 
 | 	// fewer than 4 words left: skip the 4x loop | 
 |  | 
 | 	BLT	v10		// if n < 0 goto v10 | 
 |  | 
 |  | 
 | U4:	// n >= 0 | 
 | 	// regular loop body unrolled 4x | 
 | 	MOVD 0(R8)(R10*1), R5 | 
 | 	MOVD 8(R8)(R10*1), R6 | 
 | 	MOVD 16(R8)(R10*1), R7 | 
 | 	MOVD 24(R8)(R10*1), R1 | 
 | 	ADDC R4, R5		// add y (or the incoming carry) to the first word | 
 | 	ADDE R0, R6		// ripple the carry through the other three | 
 | 	ADDE R0, R7 | 
 | 	ADDE R0, R1 | 
 | 	ADDE R0, R0		// R0 = carry out | 
 | 	MOVD R0, R4		// save CF | 
 | 	SUB  R0, R0		// R0 back to zero | 
 | 	MOVD R5, 0(R2)(R10*1) | 
 | 	MOVD R6, 8(R2)(R10*1) | 
 | 	MOVD R7, 16(R2)(R10*1) | 
 | 	MOVD R1, 24(R2)(R10*1) | 
 |  | 
 | 	ADD $32, R10		// i += 4 -> i +=32 | 
 | 	SUB $4, R3		// n -= 4 | 
 | 	BGE U4			// if n >= 0 goto U4 | 
 |  | 
 | v10:	ADD $4, R3		// n += 4 | 
 | 	BLE E10			// if n <= 0 goto E10 | 
 |  | 
 |  | 
 | L4:	// n > 0 | 
 | 	MOVD	0(R8)(R10*1), R5 | 
 | 	ADDC	R4, R5 | 
 | 	ADDE	R0, R0		// R0 = carry out | 
 | 	MOVD	R0, R4		// save CF | 
 | 	SUB 	R0, R0		// R0 back to zero | 
 | 	MOVD	R5, 0(R2)(R10*1) | 
 |  | 
 | 	ADD	$8, R10		// i++ | 
 | 	SUB	$1, R3		// n-- | 
 | 	BGT	L4		// if n > 0 goto L4 | 
 |  | 
 | E10:	MOVD	R4, c+56(FP)	// return c | 
 |  | 
 | 	RET | 
 |  | 
 |  | 
 | // addVW_novec is the implementation of addVW that uses only general | 
 | // registers. It reads z at 0(FP), len(z) at 8(FP), x at 24(FP) and the word | 
 | // y at 48(FP), and stores the final carry at c+56(FP). | 
 | // | 
 | // R4 holds the word still to be added: y at first, afterwards the carry | 
 | // (0 or 1) out of the previous block. R0 is kept at zero except for the two | 
 | // instructions that capture the carry in it. | 
 | TEXT ·addVW_novec(SB),NOSPLIT,$0 | 
 | // Register map kept from the original source (x86-style names on the left): | 
 | //DI = R3, CX = R4, SI = r10, r8 = r8, r10 = r2 , r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0) | 
 | 	MOVD z_len+8(FP), R3 | 
 | 	MOVD x+24(FP), R8 | 
 | 	MOVD y+48(FP), R4	// c = y | 
 | 	MOVD z+0(FP), R2 | 
 | 	MOVD $0, R0		// make sure it's 0 | 
 | 	MOVD $0, R10		// i = 0 | 
 |  | 
 | 	// fewer than 4 words: skip the unrolled loop | 
 | 	SUB $4, R3		// n -= 4 | 
 | 	BLT v4			// if n < 0 goto v4 | 
 |  | 
 | U4:	// n >= 0 | 
 | 	// regular loop body unrolled 4x | 
 | 	MOVD 0(R8)(R10*1), R5 | 
 | 	MOVD 8(R8)(R10*1), R6 | 
 | 	MOVD 16(R8)(R10*1), R7 | 
 | 	MOVD 24(R8)(R10*1), R1 | 
 | 	ADDC R4, R5		// add y (or the incoming carry) to the first word | 
 | 	ADDE R0, R6		// ripple the carry through the other three | 
 | 	ADDE R0, R7 | 
 | 	ADDE R0, R1 | 
 | 	ADDE R0, R0		// R0 = carry out | 
 | 	MOVD R0, R4		// save CF | 
 | 	SUB  R0, R0		// R0 back to zero | 
 | 	MOVD R5, 0(R2)(R10*1) | 
 | 	MOVD R6, 8(R2)(R10*1) | 
 | 	MOVD R7, 16(R2)(R10*1) | 
 | 	MOVD R1, 24(R2)(R10*1) | 
 |  | 
 | 	ADD $32, R10		// i += 4 -> i +=32 | 
 | 	SUB $4, R3		// n -= 4 | 
 | 	BGE U4			// if n >= 0 goto U4 | 
 |  | 
 | v4:	ADD $4, R3		// n += 4 | 
 | 	BLE E4			// if n <= 0 goto E4 | 
 |  | 
 | L4:	// n > 0 | 
 | 	MOVD 0(R8)(R10*1), R5 | 
 | 	ADDC R4, R5 | 
 | 	ADDE R0, R0		// R0 = carry out | 
 | 	MOVD R0, R4		// save CF | 
 | 	SUB  R0, R0		// R0 back to zero | 
 | 	MOVD R5, 0(R2)(R10*1) | 
 |  | 
 | 	ADD  $8, R10		// i++ | 
 | 	SUB  $1, R3		// n-- | 
 | 	BGT L4			// if n > 0 goto L4 | 
 |  | 
 | E4:	MOVD R4, c+56(FP)	// return c | 
 |  | 
 | 	RET | 
 |  | 
 | // subVW is a trampoline: it loads the code address currently stored in | 
 | // subwvectorfacility and branches to it, leaving the caller's argument frame | 
 | // untouched for the selected implementation. | 
 | TEXT ·subVW(SB),NOSPLIT,$0 | 
 | 	MOVD	subwvectorfacility+0x00(SB),R1	// R1 = current implementation | 
 | 	BR	(R1) | 
 | 	 | 
 | // subVW_check selects the subVW implementation. It reads ·hasVX, picks | 
 | // ·subVW_vec when the flag equals 1 and ·subVW_novec otherwise, stores the | 
 | // chosen address into subwvectorfacility, and then branches to it. | 
 | TEXT ·subVW_check(SB),NOSPLIT,$0 | 
 | 	MOVD	$subwvectorfacility+0x00(SB), R1	// R1 = address of the dispatch slot | 
 | 	MOVB	·hasVX(SB), R2 | 
 | 	CMPBNE	R2, $1, scalar			// flag != 1: vector not supported | 
 | 	MOVD	$·subVW_vec(SB), R2 | 
 | 	BR	install | 
 | scalar: | 
 | 	MOVD	$·subVW_novec(SB), R2 | 
 | install: | 
 | 	MOVD	R2, 0(R1)			// remember the choice in the slot | 
 | 	BR	(R2)				// continue in the chosen implementation | 
 |  | 
 | // subwvectorfacility is the 8-byte slot that ·subVW branches through. It | 
 | // starts out holding the address of ·subVW_check, which overwrites it with | 
 | // the implementation it selects. | 
 | GLOBL subwvectorfacility+0x00(SB), NOPTR, $8 | 
 | DATA subwvectorfacility+0x00(SB)/8, $·subVW_check(SB) | 
 |  | 
 | // func subVW(z, x []Word, y Word) (c Word) | 
 | // | 
 | // subVW_vec is the vector implementation of subVW: subtracts the single | 
 | // word y from the vector x, writes the result to z and stores the final | 
 | // borrow at c+56(FP). | 
 | // | 
 | // Work is done in three stages: blocks of 16 words with vector registers | 
 | // (UU1), blocks of 4 words with SUBC/SUBE (U4), then single words (L4). | 
 | // | 
 | // Register use: | 
 | //   R2, R8      base addresses of z, x | 
 | //   R3          remaining word count, kept biased by the SUB/ADD pairs | 
 | //   R10         byte offset of the next word, used by the scalar loops | 
 | //   R5, R7      running pointers into x, z inside the vector loop | 
 | //   R4          the word still to be subtracted: y at first, afterwards | 
 | //               the borrow (0 or 1) out of the previous block | 
 | //   V0          borrow indication for the vector loop, 1 meaning "no borrow" | 
 | //   V9          subtrahend for the vector loop: y for the first pair of | 
 | //               words, zero from then on | 
 | //   R0          zero | 
 | TEXT ·subVW_vec(SB),NOSPLIT,$0 | 
 | 	MOVD	z_len+8(FP), R3 | 
 | 	MOVD	x+24(FP), R8 | 
 | 	MOVD	y+48(FP), R4	// c = y | 
 | 	MOVD	z+0(FP), R2 | 
 |  | 
 | 	MOVD	$0, R0		// make sure it's zero | 
 | 	MOVD	$0, R10		// i = 0 | 
 | 	MOVD	R8, R5 | 
 | 	MOVD	R2, R7 | 
 |  | 
 | 	// fewer than 4 words: skip both unrolled loops | 
 | 	SUB	$4, R3			// n -= 4 | 
 | 	BLT	v11			// if n < 0 goto v11 | 
 | 	SUB	$12, R3			// n -= 12, i.e. n = len(z) - 16 | 
 | 	BLT	A11 | 
 |  | 
 | 	VZERO	V0 | 
 | 	MOVD	$1, R6			// prepare V0 to be final carry register | 
 | 	VLVGG	$1, R6, V0		// borrow is initially "no borrow" | 
 | 	VZERO	V9			// to ensure upper half is zero | 
 | 	VLVGG	$1, R4, V9		// V9 = y | 
 |  | 
 | 	// n >= 0 | 
 | 	// regular loop body unrolled 16x | 
 |  | 
 |  | 
 | UU1:	VLM	0(R5), V1, V4		// 64-bytes into V1..V4 | 
 | 	ADD	$64, R5 | 
 | 	VPDI	$0x4,V1,V1,V1		// flip the doublewords to big-endian order | 
 | 	VPDI	$0x4,V2,V2,V2		// flip the doublewords to big-endian order | 
 |  | 
 |  | 
 | 	VSBCBIQ	V1, V9, V0, V25 | 
 | 	VSBIQ	V1, V9, V0, V17 | 
 | 	VZERO	V9			// y has been subtracted; only the borrow propagates now | 
 | 	VSBCBIQ	V2, V9, V25, V26 | 
 | 	VSBIQ	V2, V9, V25, V18 | 
 |  | 
 | 	VLM	0(R5), V5, V6		// 32-bytes into V5..V6 | 
 | 	ADD	$32, R5 | 
 |  | 
 | 	VPDI	$0x4,V3,V3,V3		// flip the doublewords to big-endian order | 
 | 	VPDI	$0x4,V4,V4,V4		// flip the doublewords to big-endian order | 
 |  | 
 |  | 
 | 	VSBCBIQ	V3, V9, V26, V27 | 
 | 	VSBIQ	V3, V9, V26, V19 | 
 | 	VSBCBIQ	V4, V9, V27, V28 | 
 | 	VSBIQ	V4, V9, V27, V20 | 
 |  | 
 | 	VLM	0(R5), V7, V8		// 32-bytes into V7..V8 | 
 | 	ADD	$32, R5 | 
 |  | 
 | 	VPDI	$0x4,V5,V5,V5		// flip the doublewords to big-endian order | 
 | 	VPDI	$0x4,V6,V6,V6		// flip the doublewords to big-endian order | 
 |  | 
 | 	VSBCBIQ	V5, V9, V28, V29 | 
 | 	VSBIQ	V5, V9, V28, V21 | 
 | 	VSBCBIQ	V6, V9, V29, V30 | 
 | 	VSBIQ	V6, V9, V29, V22 | 
 |  | 
 | 	VPDI	$0x4,V7,V7,V7		// flip the doublewords to big-endian order | 
 | 	VPDI	$0x4,V8,V8,V8		// flip the doublewords to big-endian order | 
 |  | 
 | 	VSBCBIQ	V7, V9, V30, V31 | 
 | 	VSBIQ	V7, V9, V30, V23 | 
 | 	VSBCBIQ	V8, V9, V31, V0	// V0 has carry-over | 
 | 	VSBIQ	V8, V9, V31, V24 | 
 |  | 
 | 	VPDI	$0x4,V17,V17,V17	// flip the doublewords back to memory order | 
 | 	VPDI	$0x4,V18,V18,V18	// flip the doublewords back to memory order | 
 | 	VPDI	$0x4,V19,V19,V19	// flip the doublewords back to memory order | 
 | 	VPDI	$0x4,V20,V20,V20	// flip the doublewords back to memory order | 
 | 	VPDI	$0x4,V21,V21,V21	// flip the doublewords back to memory order | 
 | 	VPDI	$0x4,V22,V22,V22	// flip the doublewords back to memory order | 
 | 	VPDI	$0x4,V23,V23,V23	// flip the doublewords back to memory order | 
 | 	VPDI	$0x4,V24,V24,V24	// flip the doublewords back to memory order | 
 | 	VSTM	V17, V24, 0(R7)   	// 128-bytes into z | 
 | 	ADD	$128, R7 | 
 | 	ADD	$128, R10		// i += 16 | 
 | 	SUB	$16,  R3		// n -= 16 | 
 | 	BGE	UU1			// if n >= 0 goto UU1 | 
 | 	VLGVG	$1, V0, R4		// put cf into R4 in case we branch to v11 | 
 | 	SUB	$1, R4			// 1 (no borrow) -> 0, 0 (borrow) -> -1 | 
 | 	NEG	R4, R4			// R4 = borrow as 0 or 1, the next word to subtract | 
 | A11:	ADD	$12, R3			// n += 12, back to the len-4 bias of the 4x loop | 
 |  | 
 | 	BLT	v11			// if n < 0 goto v11 | 
 |  | 
 | 	// n >= 0 | 
 | 	// regular loop body unrolled 4x | 
 |  | 
 | U4:	// n >= 0 | 
 | 	// regular loop body unrolled 4x | 
 | 	MOVD 0(R8)(R10*1), R5 | 
 | 	MOVD 8(R8)(R10*1), R6 | 
 | 	MOVD 16(R8)(R10*1), R7 | 
 | 	MOVD 24(R8)(R10*1), R1 | 
 | 	SUBC R4, R5 //SLGR  -> SUBC | 
 | 	SUBE R0, R6 //SLBGR -> SUBE | 
 | 	SUBE R0, R7 | 
 | 	SUBE R0, R1 | 
 | 	SUBE R4, R4		// save CF | 
 | 	NEG  R4, R4		// R4 = borrow as 0 or 1 | 
 | 	MOVD R5, 0(R2)(R10*1) | 
 | 	MOVD R6, 8(R2)(R10*1) | 
 | 	MOVD R7, 16(R2)(R10*1) | 
 | 	MOVD R1, 24(R2)(R10*1) | 
 |  | 
 | 	ADD $32, R10		// i += 4 -> i +=32 | 
 | 	SUB $4, R3		// n -= 4 | 
 | 	BGE U4			// if n >= 0 goto U4 | 
 |  | 
 | v11:	ADD $4, R3		// n += 4 | 
 | 	BLE E11			// if n <= 0 goto E11 | 
 |  | 
 | L4:	// n > 0 | 
 |  | 
 | 	MOVD	0(R8)(R10*1), R5 | 
 | 	SUBC	R4, R5 | 
 | 	SUBE	R4, R4		// save CF | 
 | 	NEG	R4, R4		// R4 = borrow as 0 or 1 | 
 | 	MOVD	R5, 0(R2)(R10*1) | 
 |  | 
 | 	ADD	$8, R10		// i++ | 
 | 	SUB	$1, R3		// n-- | 
 | 	BGT	L4		// if n > 0 goto L4 | 
 |  | 
 | E11:	MOVD	R4, c+56(FP)	// return c | 
 |  | 
 | 	RET | 
 |  | 
 | // Register map kept from the original source (x86-style names on the left): | 
 | //DI = R3, CX = R4, SI = r10, r8 = r8, r10 = r2 , r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0) | 
 | // | 
 | // subVW_novec is the implementation of subVW that uses only general | 
 | // registers. It reads its arguments from the frame of | 
 | //	func subVW(z, x []Word, y Word) (c Word) | 
 | // (same shape as addVW_novec except for SUBC/SUBE instead of ADDC/ADDE and | 
 | // label names). | 
 | // | 
 | // R4 holds the word still to be subtracted: y at first, afterwards the | 
 | // borrow (0 or 1) out of the previous block, rebuilt by SUBE R4,R4 / NEG. | 
 | TEXT ·subVW_novec(SB),NOSPLIT,$0 | 
 | 	MOVD z_len+8(FP), R3 | 
 | 	MOVD x+24(FP), R8 | 
 | 	MOVD y+48(FP), R4	// c = y | 
 | 	MOVD z+0(FP), R2 | 
 | 	MOVD $0, R0		// make sure it's 0 | 
 | 	MOVD $0, R10		// i = 0 | 
 |  | 
 | 	// fewer than 4 words: skip the unrolled loop | 
 | 	SUB $4, R3		// n -= 4 | 
 | 	BLT v4			// if n < 0 goto v4 | 
 |  | 
 | U4:	// n >= 0 | 
 | 	// regular loop body unrolled 4x | 
 | 	MOVD 0(R8)(R10*1), R5 | 
 | 	MOVD 8(R8)(R10*1), R6 | 
 | 	MOVD 16(R8)(R10*1), R7 | 
 | 	MOVD 24(R8)(R10*1), R1 | 
 | 	SUBC R4, R5 //SLGR  -> SUBC | 
 | 	SUBE R0, R6 //SLBGR -> SUBE | 
 | 	SUBE R0, R7 | 
 | 	SUBE R0, R1 | 
 | 	SUBE R4, R4		// save CF | 
 | 	NEG  R4, R4		// R4 = borrow as 0 or 1 | 
 | 	MOVD R5, 0(R2)(R10*1) | 
 | 	MOVD R6, 8(R2)(R10*1) | 
 | 	MOVD R7, 16(R2)(R10*1) | 
 | 	MOVD R1, 24(R2)(R10*1) | 
 |  | 
 | 	ADD $32, R10		// i += 4 -> i +=32 | 
 | 	SUB $4, R3		// n -= 4 | 
 | 	BGE U4			// if n >= 0 goto U4 | 
 |  | 
 | v4:	ADD $4, R3		// n += 4 | 
 | 	BLE E4			// if n <= 0 goto E4 | 
 |  | 
 | L4:	// n > 0 | 
 | 	MOVD 0(R8)(R10*1), R5 | 
 | 	SUBC R4, R5 | 
 | 	SUBE R4, R4		// save CF | 
 | 	NEG  R4, R4		// R4 = borrow as 0 or 1 | 
 | 	MOVD R5, 0(R2)(R10*1) | 
 |  | 
 | 	ADD  $8, R10		// i++ | 
 | 	SUB  $1, R3		// n-- | 
 | 	BGT L4			// if n > 0 goto L4 | 
 |  | 
 | E4:	MOVD R4, c+56(FP)	// return c | 
 |  | 
 | 	RET | 
 |  | 
 | // func shlVU(z, x []Word, s uint) (c Word) | 
 | // | 
 | // With n = len(z) > 0 and 0 < s < 64 this computes | 
 | //	c    = x[n-1] >> (64-s) | 
 | //	z[i] = x[i]<<s | x[i-1]>>(64-s)   for i = n-1 down to 1 | 
 | //	z[0] = x[0] << s | 
 | // Two shift counts get their own paths: s == 0 is a plain copy with c = 0 | 
 | // (Z80), and s == 64 moves every word up by one with c = x[n-1] and | 
 | // z[0] = 0 (Z864). For n == 0 only c = 0 is stored. | 
 | // | 
 | // Every path walks from the top word down to word 0, so a z that overlaps x | 
 | // at the same or a higher address is never overwritten before it is read. | 
 | // | 
 | // Register use: R2 = z, R8 = x, R4 = s, R7 = 64-s, R5 = byte offset of word | 
 | // i, R10 = w1, R3/R6 = scratch, R0 = zero. | 
 | TEXT ·shlVU(SB),NOSPLIT,$0 | 
 | 	MOVD	z_len+8(FP), R5 | 
 | 	MOVD	$0, R0 | 
 | 	SUB	$1, R5             // R5 = n-1 | 
 | 	BLT	X8b                // n <= 0: nothing to shift | 
 |  | 
 | 	// n > 0 | 
 | 	MOVD	s+48(FP), R4 | 
 | 	CMPBEQ	R0, R4, Z80	   // s == 0 | 
 | 	MOVD	$64, R6 | 
 | 	CMPBEQ	R6, R4, Z864	   // s == 64 | 
 | 	MOVD	z+0(FP), R2 | 
 | 	MOVD	x+24(FP), R8 | 
 | 	SLD	$3, R5             // R5 = (n-1)*8, byte offset of the top word | 
 | 	SUB	R4, R6, R7         // R7 = 64-s | 
 | 	MOVD	(R8)(R5*1), R10    // w1 = x[n-1] | 
 | 	SRD	R7, R10, R3 | 
 | 	MOVD	R3, c+56(FP)       // c = x[n-1] >> (64-s) | 
 | 	BR	E8 | 
 |  | 
 | 	// i > 0 | 
 | L8:	MOVD	R10, R3             // w = w1 | 
 | 	MOVD	-8(R8)(R5*1), R10   // w1 = x[i-1] | 
 |  | 
 | 	SLD	R4,  R3             // w<<s | 
 | 	SRD	R7, R10, R6         // w1>>(64-s) | 
 | 	OR 	R6, R3 | 
 | 	MOVD	R3, (R2)(R5*1)      // z[i] = w<<s | w1>>(64-s) | 
 | 	SUB	$8, R5              // i-- | 
 |  | 
 | E8:	CMPBGT	R5, R0, L8	    // while i > 0 | 
 |  | 
 | 	// i == 0 | 
 | X8a:	SLD	R4, R10             // w1<<s | 
 | 	MOVD	R10, (R2)           // z[0] = w1<<s | 
 | 	RET | 
 |  | 
 | X8b:	MOVD	R0, c+56(FP)       // empty vector: c = 0 | 
 | 	RET | 
 |  | 
 | 	// s == 0: copy x to z and return c = 0. The copy runs from the top word | 
 | 	// down, the same direction as the shifting loop; an upward copy would | 
 | 	// overwrite words of x not yet read when z overlaps x at a higher address. | 
 | Z80:	MOVD	z+0(FP), R2 | 
 | 	MOVD	x+24(FP), R8 | 
 | 	SLD	$3, R5             // R5 = (n-1)*8, byte offset of the top word | 
 | 	MOVD	R0, c+56(FP)       // nothing is shifted out | 
 |  | 
 | 	// R5 >= 0 on entry because n > 0, so the body runs at least once | 
 | L8Z:	MOVD	(R8)(R5*1), R3 | 
 | 	MOVD	R3, (R2)(R5*1)     // z[i] = x[i] | 
 | 	SUB	$8, R5             // i-- | 
 | 	CMPBGE	R5, R0, L8Z        // while i >= 0 | 
 | 	RET | 
 |  | 
 | 	// s == 64: every word moves up by one position | 
 | Z864:	MOVD	z+0(FP), R2 | 
 | 	MOVD	x+24(FP), R8 | 
 | 	SLD	$3, R5             // R5 = (n-1)*8, byte offset of the top word | 
 | 	MOVD	(R8)(R5*1), R3     // x[n-1] | 
 | 	MOVD	R3, c+56(FP)       // c = x[n-1] | 
 |  | 
 | 	BR	E864 | 
 |  | 
 | 	// i > 0 | 
 | L864:	MOVD	-8(R8)(R5*1), R3 | 
 |  | 
 | 	MOVD	R3, (R2)(R5*1)     // z[i] = x[i-1] | 
 | 	SUB	$8, R5             // i-- | 
 |  | 
 | E864:	CMPBGT	R5, R0, L864       // while i > 0 | 
 |  | 
 | 	MOVD	R0, (R2)           // z[0] = 0 | 
 | 	RET | 
 |  | 
 |  | 
 | // CX = R4, r8 = r8, r10 = r2 , r11 = r5, DX = r3, AX = r10 , BX = R1 , 64-count = r7 (R0 set to 0) temp = R6 | 
// NOTE(review): the left-hand names in the mapping above (CX, DX, AX, BX, ...)
// look like x86 register names, presumably from the version this routine was
// ported from; the right-hand names are the registers used below.
//
// shrVU shifts the multi-word value x right by s bits and stores the result
// in z; x[0] is treated as the least significant word. The number of words
// processed is len(z) (the length of x is not read). The result c is
// x[0] << (64-s), i.e. the bits shifted out at the low end, placed at the top
// of the word. When len(z) == 0 only c = 0 is stored.
//
// s == 0 and s == 64 take the copy-only paths ZB0 and ZB64; every other value
// of s goes through the shifting loop L9. NOTE(review): presumably the split
// exists because the register shifts cannot express a shift by 64 - confirm.
//
// Registers in this function:
//   R0  = 0
//   R2  = &z[0], R8 = &x[0]
//   R4  = s, R7 = 64 - s, R6 = 64 and later scratch
//   R5  = len(z)-1, then (len(z)-1)*8 (byte offset of the last word)
//   R1  = byte offset of the current word (i*8)
//   R10 = w1 (most recently loaded source word), R3 = w / word being assembled
 | // func shrVU(z, x []Word, s uint) (c Word) | 
 | TEXT ·shrVU(SB),NOSPLIT,$0 | 
 | 	MOVD	z_len+8(FP), R5 | 
 | 	MOVD	$0, R0 | 
 | 	SUB	$1, R5             // R5 = len(z) - 1 | 
 | 	BLT	X9b                // len(z)-1 < 0, i.e. len(z) <= 0: nothing to shift | 
 |  | 
 | 	// len(z) > 0 | 
 | 	MOVD	s+48(FP), R4 | 
 | 	CMPBEQ	R0, R4, ZB0	// s == 0: plain copy | 
 | 	MOVD	$64, R6 | 
 | 	CMPBEQ 	R6, R4, ZB64	// s == 64: copy moved down by one whole word | 
 | 	MOVD	z+0(FP), R2 | 
 | 	MOVD	x+24(FP), R8 | 
 | 	SLD	$3, R5		// R5 = (len(z)-1)*8, byte offset of the last word | 
	// R7 = 64 - s (R6 still holds 64)
 | 	SUB	R4, R6, R7 | 
 | 	MOVD	(R8), R10	// w1 = x[0] | 
	// c = x[0] << (64-s)
 | 	SLD	R7, R10, R3 | 
 | 	MOVD	R3, c+56(FP) | 
 |  | 
 | 	MOVD	$0, R1		// byte offset i*8 = 0 | 
 | 	BR 	E9 | 
 |  | 
 | 	// loop body, run while i < n-1: z[i] = x[i]>>s | x[i+1]<<(64-s) | 
 | L9:	MOVD	R10, R3		// w = w1 (x[i], loaded on the previous pass) | 
 | 	MOVD	8(R8)(R1*1), R10	// w1 = x[i+1] | 
 |  | 
 | 	SRD	R4,  R3		// R3 = w>>s | 
	// R6 = w1<<(64-s)
 | 	SLD	R7, R10, R6 | 
 | 	OR	R6, R3 | 
 | 	MOVD	R3, (R2)(R1*1)	// z[i] = w>>s | w1<<(64-s) | 
 | 	ADD	$8, R1		// i++ (offset advances by one 8-byte word) | 
 |  | 
 | E9:	CMPBLT	R1, R5, L9	// continue while i*8 < (n-1)*8, i.e. i < n-1 | 
 |  | 
 | 	// i == n-1: the top word has no higher word to take bits from | 
 | X9a:	SRD	R4, R10		// w1>>s | 
 | 	MOVD	R10, (R2)(R5*1)	// z[n-1] = w1>>s | 
 | 	RET | 
 |  | 
	// len(z) == 0: return c = 0
 | X9b:	MOVD	R0, c+56(FP) | 
 | 	RET | 
 |  | 
	// s == 0: c = 0 and z[i] = x[i] for every i.
 | ZB0:	MOVD	z+0(FP), R2 | 
 | 	MOVD	x+24(FP), R8 | 
 | 	SLD	$3, R5		// R5 = (n-1)*8 | 
 |  | 
 | 	MOVD	(R8), R10	// w1 = x[0] | 
 | 	MOVD	$0, R3		// c = 0: no bits are shifted out when s == 0 | 
 | 	MOVD	R3, c+56(FP) | 
 |  | 
 | 	MOVD	$0, R1		// byte offset i*8 = 0 | 
 | 	BR	E9Z | 
 |  | 
 | 	// i < n-1 | 
 | L9Z:	MOVD	R10, R3		// w = w1 (x[i]) | 
 | 	MOVD	8(R8)(R1*1), R10	// w1 = x[i+1], read before z[i] is written | 
 |  | 
 | 	MOVD	R3, (R2)(R1*1)	// z[i] = x[i] | 
 | 	ADD	$8, R1		// i++ | 
 |  | 
 | E9Z:	CMPBLT	R1, R5, L9Z	// i < n-1 | 
 |  | 
 | 	// i == n-1 | 
 | 	MOVD	R10, (R2)(R5*1)	// z[n-1] = x[n-1] | 
 | 	RET | 
 |  | 
	// s == 64: c = x[0], z[i] = x[i+1] for i < n-1, and z[n-1] = 0.
 | ZB64:	MOVD	z+0(FP), R2 | 
 | 	MOVD	x+24(FP), R8 | 
 | 	SLD	$3, R5		// R5 = (n-1)*8 | 
 | 	MOVD	(R8), R3	// R3 = x[0], returned whole as c | 
 | 	MOVD	R3, c+56(FP) | 
 |  | 
 | 	MOVD	$0, R1		// byte offset i*8 = 0 | 
 | 	BR	E964 | 
 |  | 
 | 	// i < n-1 | 
 | L964:	MOVD	8(R8)(R1*1), R3	// R3 = x[i+1] | 
 |  | 
 | 	MOVD	R3, (R2)(R1*1)	// z[i] = x[i+1] | 
 | 	ADD	$8, R1		// i++ | 
 |  | 
 | E964:	CMPBLT	R1, R5, L964	// i < n-1 | 
 |  | 
 | 	// i == n-1 | 
 | 	MOVD	$0, R10            // the top word is zero after a 64-bit shift | 
 | 	MOVD	R10, (R2)(R5*1)    // z[n-1] = 0 | 
 | 	RET | 
 |  | 
 | // CX = R4, r8 = r8, r9=r9, r10 = r2 , r11 = r5, DX = r3, AX = r6 , BX = R1 , (R0 set to 0) + use R11 + use R7 for i | 
// NOTE(review): the left-hand names in the mapping above look like x86
// register names, presumably from the version this routine was ported from.
//
// mulAddVWW computes z = x*y + r one word at a time and returns the final
// carry word in c. For each i in [0, len(z)): x[i]*y + c is formed as a
// two-word value, its low word is stored in z[i] and its high word becomes
// the next c; c starts out as r.
//
// Registers in this function:
//   R0  = 0, R2 = &z[0], R8 = &x[0], R9 = y, R5 = len(z)
//   R4  = c (carry word), R1 = byte offset i*8, R7 = i
//   R6  = x[i] before MULHDU, used as the high product word after it
//   R11 = used as the low product word.
// NOTE(review): R11 is never written explicitly in this function; it is read
// straight after MULHDU, so the code relies on MULHDU leaving the low 64 bits
// of the product there (mulWW in this file reads R10/R11 after MULHDU in the
// same way) - confirm against the assembler's expansion of MULHDU.
 | // func mulAddVWW(z, x []Word, y, r Word) (c Word) | 
 | TEXT ·mulAddVWW(SB),NOSPLIT,$0 | 
 | 	MOVD	z+0(FP), R2 | 
 | 	MOVD	x+24(FP), R8 | 
 | 	MOVD	y+48(FP), R9 | 
 | 	MOVD	r+56(FP), R4	// c = r | 
 | 	MOVD	z_len+8(FP), R5 | 
 | 	MOVD	$0, R1		// i*8 = 0 (byte offset used for addressing) | 
 | 	MOVD	$0, R7		// i = 0 (word counter compared against len(z)) | 
 | 	MOVD	$0, R0		// make sure it's zero | 
 | 	BR	E5 | 
 |  | 
	// R6 = x[i]; after MULHDU, R6 is used as the high word of x[i]*y.
 | L5:	MOVD	(R8)(R1*1), R6 | 
 | 	MULHDU	R9, R6 | 
 | 	ADDC	R4, R11 	// lo += c; the carry out is picked up by the ADDE below | 
 | 	ADDE	R0, R6 | 
	// z[i] = lo, c = hi (hi now includes the carry; R0 contributes 0)
 | 	MOVD	R11, (R2)(R1*1) | 
 | 	MOVD	R6, R4 | 
 | 	ADD	$8, R1		// byte offset += 8 | 
 | 	ADD	$1, R7		// i++ | 
 |  | 
 | E5:	CMPBLT	R7, R5, L5	// i < n | 
 |  | 
 | 	MOVD	R4, c+64(FP) | 
 | 	RET | 
 |  | 
 | // func addMulVVW(z, x []Word, y Word) (c Word) | 
 | // CX = R4, r8 = r8, r9=r9, r10 = r2 , r11 = r5, AX = r11, DX = R6, r12=r12, BX = R1 , (R0 set to 0) + use R11 + use R7 for i | 
// NOTE(review): the left-hand names in the mapping above look like x86
// register names, presumably from the version this routine was ported from.
//
// addMulVVW computes z += x*y one word at a time and returns the final carry
// word in c. For each i in [0, len(z)): x[i]*y + z[i] + c is formed as a
// two-word value, its low word is written back to z[i] and its high word
// becomes the next c; c starts at 0.
//
// The A6 loop handles two words per pass while i < R12 (len(z) with its low
// bit cleared); the L6 loop then handles whatever is left and is the only
// loop run when len(z) < 2.
//
// Registers in this function:
//   R0  = 0, R2 = &z[0], R8 = &x[0], R9 = y, R5 = len(z), R12 = len(z) AND -2
//   R4  = c (carry word), R1 = byte offset i*8, R7 = i
//   R6  = x[i] before MULHDU, used as the high product word after it
//   R10 = z[i] (loaded after MULHDU), R11 = used as the low product word.
// NOTE(review): R11 is never written explicitly in this function; it is read
// straight after MULHDU, so the code relies on MULHDU leaving the low 64 bits
// of the product there (mulWW in this file reads R10/R11 after MULHDU in the
// same way) - confirm against the assembler's expansion of MULHDU.
 | TEXT ·addMulVVW(SB),NOSPLIT,$0 | 
 | 	MOVD	z+0(FP), R2 | 
 | 	MOVD	x+24(FP), R8 | 
 | 	MOVD	y+48(FP), R9 | 
 | 	MOVD	z_len+8(FP), R5 | 
 |  | 
 | 	MOVD	$0, R1		// i*8 = 0 | 
 | 	MOVD	$0, R7		// i = 0 | 
 | 	MOVD	$0, R0		// make sure it's zero | 
 | 	MOVD	$0, R4		// c = 0 | 
 |  | 
	// R12 = len(z) with the low bit cleared: bound for the two-word loop.
 | 	MOVD	R5, R12 | 
 | 	AND	$-2, R12 | 
	// At least two words: enter the unrolled loop; otherwise go to the tail loop.
 | 	CMPBGE	R5, $2, A6 | 
 | 	BR	E6 | 
 |  | 
	// Unrolled body, words i and i+1. Per word: hi:lo = x*y, lo += z,
	// hi += carry, lo += c, hi += carry, c = hi, z = lo.
 | A6:	MOVD	(R8)(R1*1), R6 | 
 | 	MULHDU	R9, R6 | 
 | 	MOVD	(R2)(R1*1), R10 | 
 | 	ADDC	R10, R11	// lo += z[i]; carry goes into hi via the ADDE below | 
 | 	ADDE	R0, R6 | 
 | 	ADDC	R4, R11 | 
 | 	ADDE	R0, R6 | 
 | 	MOVD	R6, R4 | 
 | 	MOVD	R11, (R2)(R1*1) | 
 |  | 
 | 	MOVD	(8)(R8)(R1*1), R6 | 
 | 	MULHDU	R9, R6 | 
 | 	MOVD	(8)(R2)(R1*1), R10 | 
 | 	ADDC	R10, R11	// lo += z[i+1]; carry goes into hi via the ADDE below | 
 | 	ADDE	R0, R6 | 
 | 	ADDC	R4, R11 | 
 | 	ADDE	R0, R6 | 
 | 	MOVD	R6, R4 | 
 | 	MOVD	R11, (8)(R2)(R1*1) | 
 |  | 
 | 	ADD	$16, R1		// byte offset += 16 (two words) | 
 | 	ADD	$2, R7		// i += 2 | 
 |  | 
 | 	CMPBLT	R7, R12, A6 | 
 | 	BR	E6 | 
 |  | 
	// Tail body, one word per pass; same sequence as in A6.
 | L6:	MOVD	(R8)(R1*1), R6 | 
 | 	MULHDU	R9, R6 | 
 | 	MOVD	(R2)(R1*1), R10 | 
 | 	ADDC	R10, R11	// lo += z[i]; carry goes into hi via the ADDE below | 
 | 	ADDE	R0, R6 | 
 | 	ADDC	R4, R11 | 
 | 	ADDE	R0, R6 | 
 | 	MOVD	R6, R4 | 
 | 	MOVD	R11, (R2)(R1*1) | 
 |  | 
 | 	ADD	$8, R1		// byte offset += 8 | 
 | 	ADD	$1, R7		// i++ | 
 |  | 
 | E6:	CMPBLT	R7, R5, L6	// i < n | 
 |  | 
 | 	MOVD	R4, c+56(FP) | 
 | 	RET | 
 |  | 
 | // func divWVW(z []Word, xn Word, x []Word, y Word) (r Word) | 
 | // CX = R4, r8 = r8, r9=r9, r10 = r2 , r11 = r5, AX = r11, DX = R6, r12=r12, BX = R1(*8) , (R0 set to 0) + use R11 + use R7 for i | 
// NOTE(review): the mapping above appears to be carried over from addMulVVW;
// R4, R5, R6 and R12 are not used in this function.
//
// divWVW divides word by word, from the highest index down. The running
// remainder r starts as xn; for i = len(z)-1 down to 0 the two-word value
// r:x[i] (r is the high word) is divided by y, the quotient is stored in z[i]
// and the remainder becomes the new r. The final remainder is returned.
//
// Registers in this function:
//   R2  = &z[0], R8 = &x[0], R9 = y
//   R10 = r (high word of the dividend, remainder after the divide)
//   R11 = x[i] (low word of the dividend, quotient after the divide)
//   R1  = byte offset i*8, R7 = i (decremented in step with R1 but not
//         otherwise read), R0 = 0
 | TEXT ·divWVW(SB),NOSPLIT,$0 | 
 | 	MOVD	z+0(FP), R2 | 
 | 	MOVD	xn+24(FP), R10	// r = xn | 
 | 	MOVD	x+32(FP), R8 | 
 | 	MOVD	y+56(FP), R9 | 
 | 	MOVD	z_len+8(FP), R7	// i = len(z) | 
 | 	SLD	$3, R7, R1		// R1 = i*8 (byte offset) | 
 | 	MOVD	$0, R0		// make sure it's zero | 
 | 	BR	E7 | 
 |  | 
	// R11 = x[i]; the dividend is R10:R11 = r:x[i].
 | L7:	MOVD	(R8)(R1*1), R11 | 
 | 	WORD	$0xB98700A9	// DLGR R10,R9: R10:R11 / R9, quotient -> R11, remainder -> R10 (divWW stores them the same way) | 
 | 	MOVD	R11, (R2)(R1*1) | 
 |  | 
 | E7:	SUB	$1, R7		// i-- | 
 | 	SUB	$8, R1 | 
 | 	BGE	L7		// loop while i >= 0; the branch follows the SUB on R1 directly above | 
 |  | 
 | 	MOVD	R10, r+64(FP) | 
 | 	RET |