cmd/compile: use vsx loads and stores for LoweredMove, LoweredZero on ppc64x

This improves the code generated for LoweredMove and LoweredZero by
using LXVD2X and STXVD2X to move or zero 16 bytes at a time. These
instructions are used when the size to be moved or zeroed is >= 64
bytes. The same instructions are already used in the asm
implementations of memmove and memclr.
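
For reference, a minimal Go sketch (not part of this CL) of code
whose compilation hits these ops; the 128-byte size is chosen to be
over the 64-byte threshold:

package p

// zero128 lowers to a LoweredZero on ppc64x: the clear is >= 64
// bytes, so the inline STXVD2X loop added by this CL is emitted.
func zero128(a *[128]byte) {
	*a = [128]byte{}
}

// move128 lowers to a LoweredMove for the same reason, copying
// 32 bytes per iteration with LXVD2X/STXVD2X pairs.
func move128(dst, src *[128]byte) {
	*dst = *src
}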

Some examples where this shows an improvement on power8:

MakeSlice/Byte                                  27.3ns ± 1%     25.2ns ± 0%    -7.69%
MakeSlice/Int16                                 40.2ns ± 0%     35.2ns ± 0%   -12.39%
MakeSlice/Int                                   94.9ns ± 1%     77.9ns ± 0%   -17.92%
MakeSlice/Ptr                                    129ns ± 1%      103ns ± 0%   -20.16%
MakeSlice/Struct/24                              176ns ± 1%      131ns ± 0%   -25.67%
MakeSlice/Struct/32                              200ns ± 1%      142ns ± 0%   -29.09%
MakeSlice/Struct/40                              220ns ± 2%      156ns ± 0%   -28.82%
GrowSlice/Byte                                  81.4ns ± 0%     73.4ns ± 0%    -9.88%
GrowSlice/Int16                                  118ns ± 1%       98ns ± 0%   -17.03%
GrowSlice/Int                                    178ns ± 1%      134ns ± 1%   -24.65%
GrowSlice/Ptr                                    249ns ± 4%      212ns ± 0%   -14.94%
GrowSlice/Struct/24                              294ns ± 5%      215ns ± 0%   -27.08%
GrowSlice/Struct/32                              315ns ± 1%      248ns ± 0%   -21.49%
GrowSlice/Struct/40                              382ns ± 4%      289ns ± 1%   -24.38%
ExtendSlice/IntSlice                             109ns ± 1%       90ns ± 1%   -17.51%
ExtendSlice/PointerSlice                         142ns ± 2%      118ns ± 0%   -16.75%
ExtendSlice/NoGrow                              6.02ns ± 0%     5.88ns ± 0%    -2.33%
Append                                          27.2ns ± 0%     27.6ns ± 0%    +1.38%
AppendGrowByte                                  4.20ms ± 3%     2.60ms ± 0%   -38.18%
AppendGrowString                                 134ms ± 3%      102ms ± 2%   -23.62%
AppendSlice/1Bytes                              5.65ns ± 0%     5.67ns ± 0%    +0.35%
AppendSlice/4Bytes                              6.40ns ± 0%     6.55ns ± 0%    +2.34%
AppendSlice/7Bytes                              8.74ns ± 0%     8.84ns ± 0%    +1.14%
AppendSlice/8Bytes                              5.68ns ± 0%     5.70ns ± 0%    +0.40%
AppendSlice/15Bytes                             9.31ns ± 0%     9.39ns ± 0%    +0.86%
AppendSlice/16Bytes                             14.0ns ± 0%      5.8ns ± 0%   -58.32%
AppendSlice/32Bytes                             5.72ns ± 0%     5.68ns ± 0%    -0.66%
AppendSliceLarge/1024Bytes                       918ns ± 8%      615ns ± 1%   -33.00%
AppendSliceLarge/4096Bytes                      3.25µs ± 1%     1.92µs ± 1%   -40.84%
AppendSliceLarge/16384Bytes                     8.70µs ± 2%     4.69µs ± 0%   -46.08%
AppendSliceLarge/65536Bytes                     18.1µs ± 3%      7.9µs ± 0%   -56.30%
AppendSliceLarge/262144Bytes                    69.8µs ± 2%     25.9µs ± 0%   -62.91%
AppendSliceLarge/1048576Bytes                    258µs ± 1%       93µs ± 0%   -63.96%
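
For a rough reproduction, a minimal benchmark sketch (illustrative;
not one of the benchmarks listed above) that exercises the new
inline zeroing path directly:

package zero_test

import "testing"

var sink [128]byte

// Clearing 128 bytes is over the 64-byte threshold, so the
// compiler emits the inline STXVD2X loop for this store.
func BenchmarkZero128(b *testing.B) {
	for i := 0; i < b.N; i++ {
		sink = [128]byte{}
	}
}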

Change-Id: I21625dbe231a2029ddb9f7d73f5a6417b35c1e49
Reviewed-on: https://go-review.googlesource.com/c/go/+/199639
Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
diff --git a/src/cmd/compile/internal/ppc64/ssa.go b/src/cmd/compile/internal/ppc64/ssa.go
index 69847c3..4f852b8 100644
--- a/src/cmd/compile/internal/ppc64/ssa.go
+++ b/src/cmd/compile/internal/ppc64/ssa.go
@@ -855,13 +855,13 @@
 		// for sizes >= 64 generate a loop as follows:
 
 		// set up loop counter in CTR, used by BC
+		//	 XXLXOR VS32,VS32,VS32
 		//	 MOVD len/32,REG_TMP
 		//	 MOVD REG_TMP,CTR
+		//	 MOVD $16,REG_TMP
 		//	 loop:
-		//	 MOVD R0,(R3)
-		//	 MOVD R0,8(R3)
-		//	 MOVD R0,16(R3)
-		//	 MOVD R0,24(R3)
+		//	 STXVD2X VS32,(R0)(R3)
+		//	 STXVD2X VS32,(R31)(R3)
 		//	 ADD  $32,R3
 		//	 BC   16, 0, loop
 		//
@@ -895,8 +895,16 @@
 		// only generate a loop if there is more
 		// than 1 iteration.
 		if ctr > 1 {
+			// Set up VS32 (V0) to hold 0s
+			p := s.Prog(ppc64.AXXLXOR)
+			p.From.Type = obj.TYPE_REG
+			p.From.Reg = ppc64.REG_VS32
+			p.To.Type = obj.TYPE_REG
+			p.To.Reg = ppc64.REG_VS32
+			p.Reg = ppc64.REG_VS32
+
 			// Set up CTR loop counter
-			p := s.Prog(ppc64.AMOVD)
+			p = s.Prog(ppc64.AMOVD)
 			p.From.Type = obj.TYPE_CONST
 			p.From.Offset = ctr
 			p.To.Type = obj.TYPE_REG
@@ -908,23 +916,35 @@
 			p.To.Type = obj.TYPE_REG
 			p.To.Reg = ppc64.REG_CTR
 
-			// generate 4 MOVDs
+			// Set up R31 to hold index value 16
+			p = s.Prog(ppc64.AMOVD)
+			p.From.Type = obj.TYPE_CONST
+			p.From.Offset = 16
+			p.To.Type = obj.TYPE_REG
+			p.To.Reg = ppc64.REGTMP
+
+			// generate 2 STXVD2Xs, each storing 16 bytes
 			// when this is a loop then the top must be saved
 			var top *obj.Prog
-			for offset := int64(0); offset < 32; offset += 8 {
-				// This is the top of loop
-				p := s.Prog(ppc64.AMOVD)
-				p.From.Type = obj.TYPE_REG
-				p.From.Reg = ppc64.REG_R0
-				p.To.Type = obj.TYPE_MEM
-				p.To.Reg = v.Args[0].Reg()
-				p.To.Offset = offset
-				// Save the top of loop
-				if top == nil {
-					top = p
-				}
+			// This is the top of loop
+			p = s.Prog(ppc64.ASTXVD2X)
+			p.From.Type = obj.TYPE_REG
+			p.From.Reg = ppc64.REG_VS32
+			p.To.Type = obj.TYPE_MEM
+			p.To.Reg = v.Args[0].Reg()
+			p.To.Index = ppc64.REGZERO
+			// Save the top of loop
+			if top == nil {
+				top = p
 			}
 
+			p = s.Prog(ppc64.ASTXVD2X)
+			p.From.Type = obj.TYPE_REG
+			p.From.Reg = ppc64.REG_VS32
+			p.To.Type = obj.TYPE_MEM
+			p.To.Reg = v.Args[0].Reg()
+			p.To.Index = ppc64.REGTMP
+
 			// Increment address for the
 			// 4 doublewords just zeroed.
 			p = s.Prog(ppc64.AADD)
@@ -994,30 +1014,27 @@
 		// When moving >= 64 bytes a loop is used
 		//	MOVD len/32,REG_TMP
 		//	MOVD REG_TMP,CTR
+		//	MOVD $16,REG_TMP
 		// top:
-		//	MOVD (R4),R7
-		//	MOVD 8(R4),R8
-		//	MOVD 16(R4),R9
-		//	MOVD 24(R4),R10
-		//	ADD  R4,$32
-		//	MOVD R7,(R3)
-		//	MOVD R8,8(R3)
-		//	MOVD R9,16(R3)
-		//	MOVD R10,24(R3)
-		//	ADD  R3,$32
+		//	LXVD2X (R0)(R4),VS32
+		//	LXVD2X (R31)(R4),VS33
+		//	ADD $32,R4
+		//	STXVD2X VS32,(R0)(R3)
+		//	STXVD2X VS33,(R31)(R3)
+		//	ADD $32,R3
 		//	BC 16,0,top
 		// Bytes not moved by this loop are moved
 		// with a combination of the following instructions,
 		// starting with the largest sizes and generating as
 		// many as needed, using the appropriate offset value.
-		//	MOVD  n(R4),R7
-		//	MOVD  R7,n(R3)
-		//	MOVW  n1(R4),R7
-		//	MOVW  R7,n1(R3)
-		//	MOVH  n2(R4),R7
-		//	MOVH  R7,n2(R3)
-		//	MOVB  n3(R4),R7
-		//	MOVB  R7,n3(R3)
+		//	MOVD  n(R4),R14
+		//	MOVD  R14,n(R3)
+		//	MOVW  n1(R4),R14
+		//	MOVW  R14,n1(R3)
+		//	MOVH  n2(R4),R14
+		//	MOVH  R14,n2(R3)
+		//	MOVB  n3(R4),R14
+		//	MOVB  R14,n3(R3)
 
 		// Each loop iteration moves 32 bytes
 		ctr := v.AuxInt / 32
@@ -1030,7 +1047,6 @@
 
 		// The set of registers used here, must match the clobbered reg list
 		// in PPC64Ops.go.
-		useregs := []int16{ppc64.REG_R7, ppc64.REG_R8, ppc64.REG_R9, ppc64.REG_R10}
 		offset := int64(0)
 
 		// top of the loop
@@ -1050,22 +1066,35 @@
 			p.To.Type = obj.TYPE_REG
 			p.To.Reg = ppc64.REG_CTR
 
-			// Generate all the MOVDs for loads
-			// based off the same register, increasing
-			// the offset by 8 for each instruction
-			for _, rg := range useregs {
-				p := s.Prog(ppc64.AMOVD)
-				p.From.Type = obj.TYPE_MEM
-				p.From.Reg = src_reg
-				p.From.Offset = offset
-				p.To.Type = obj.TYPE_REG
-				p.To.Reg = rg
-				if top == nil {
-					top = p
-				}
-				offset += 8
+			// Use REGTMP as index reg
+			p = s.Prog(ppc64.AMOVD)
+			p.From.Type = obj.TYPE_CONST
+			p.From.Offset = 16
+			p.To.Type = obj.TYPE_REG
+			p.To.Reg = ppc64.REGTMP
+
+			// Generate 16 byte loads and stores.
+			// Use temp register for index (16)
+			// on the second one.
+			p = s.Prog(ppc64.ALXVD2X)
+			p.From.Type = obj.TYPE_MEM
+			p.From.Reg = src_reg
+			p.From.Index = ppc64.REGZERO
+			p.To.Type = obj.TYPE_REG
+			p.To.Reg = ppc64.REG_VS32
+
+			if top == nil {
+				top = p
 			}
-			// increment the src_reg for next iteration
+
+			p = s.Prog(ppc64.ALXVD2X)
+			p.From.Type = obj.TYPE_MEM
+			p.From.Reg = src_reg
+			p.From.Index = ppc64.REGTMP
+			p.To.Type = obj.TYPE_REG
+			p.To.Reg = ppc64.REG_VS33
+
+			// increment the src reg for next iteration
 			p = s.Prog(ppc64.AADD)
 			p.Reg = src_reg
 			p.From.Type = obj.TYPE_CONST
@@ -1073,20 +1102,22 @@
 			p.To.Type = obj.TYPE_REG
 			p.To.Reg = src_reg
 
-			// generate the MOVDs for stores, based
-			// off the same register, using the same
-			// offsets as in the loads.
-			offset = int64(0)
-			for _, rg := range useregs {
-				p := s.Prog(ppc64.AMOVD)
-				p.From.Type = obj.TYPE_REG
-				p.From.Reg = rg
-				p.To.Type = obj.TYPE_MEM
-				p.To.Reg = dst_reg
-				p.To.Offset = offset
-				offset += 8
-			}
-			// increment the dst_reg for next iteration
+			// generate 16 byte stores
+			p = s.Prog(ppc64.ASTXVD2X)
+			p.From.Type = obj.TYPE_REG
+			p.From.Reg = ppc64.REG_VS32
+			p.To.Type = obj.TYPE_MEM
+			p.To.Reg = dst_reg
+			p.To.Index = ppc64.REGZERO
+
+			p = s.Prog(ppc64.ASTXVD2X)
+			p.From.Type = obj.TYPE_REG
+			p.From.Reg = ppc64.REG_VS33
+			p.To.Type = obj.TYPE_MEM
+			p.To.Reg = dst_reg
+			p.To.Index = ppc64.REGTMP
+
+			// increment the dst reg for next iteration
 			p = s.Prog(ppc64.AADD)
 			p.Reg = dst_reg
 			p.From.Type = obj.TYPE_CONST
@@ -1114,6 +1145,57 @@
 			rem += 32
 		}
 
+		if rem >= 16 {
+			// Generate 16 byte loads and stores.
+			// Use temp register for index (value 16)
+			// on the second one.
+			p := s.Prog(ppc64.ALXVD2X)
+			p.From.Type = obj.TYPE_MEM
+			p.From.Reg = src_reg
+			p.From.Index = ppc64.REGZERO
+			p.To.Type = obj.TYPE_REG
+			p.To.Reg = ppc64.REG_VS32
+
+			p = s.Prog(ppc64.ASTXVD2X)
+			p.From.Type = obj.TYPE_REG
+			p.From.Reg = ppc64.REG_VS32
+			p.To.Type = obj.TYPE_MEM
+			p.To.Reg = dst_reg
+			p.To.Index = ppc64.REGZERO
+
+			offset = 16
+			rem -= 16
+
+			if rem >= 16 {
+				// Use REGTMP as index reg
+				p = s.Prog(ppc64.AMOVD)
+				p.From.Type = obj.TYPE_CONST
+				p.From.Offset = 16
+				p.To.Type = obj.TYPE_REG
+				p.To.Reg = ppc64.REGTMP
+
+				// Generate 16 byte loads and stores.
+				// Use temp register for index (16)
+				// on the second one.
+				p = s.Prog(ppc64.ALXVD2X)
+				p.From.Type = obj.TYPE_MEM
+				p.From.Reg = src_reg
+				p.From.Index = ppc64.REGTMP
+				p.To.Type = obj.TYPE_REG
+				p.To.Reg = ppc64.REG_VS32
+
+				p = s.Prog(ppc64.ASTXVD2X)
+				p.From.Type = obj.TYPE_REG
+				p.From.Reg = ppc64.REG_VS32
+				p.To.Type = obj.TYPE_MEM
+				p.To.Reg = dst_reg
+				p.To.Index = ppc64.REGTMP
+
+				offset = 32
+				rem -= 16
+			}
+		}
+
 		// Generate all the remaining load and store pairs, starting with
 		// as many 8 byte moves as possible, then 4, 2, 1.
 		for rem > 0 {
@@ -1129,7 +1211,7 @@
 			// Load
 			p := s.Prog(op)
 			p.To.Type = obj.TYPE_REG
-			p.To.Reg = ppc64.REG_R7
+			p.To.Reg = ppc64.REG_R14
 			p.From.Type = obj.TYPE_MEM
 			p.From.Reg = src_reg
 			p.From.Offset = offset
@@ -1137,7 +1219,7 @@
 			// Store
 			p = s.Prog(op)
 			p.From.Type = obj.TYPE_REG
-			p.From.Reg = ppc64.REG_R7
+			p.From.Reg = ppc64.REG_R14
 			p.To.Type = obj.TYPE_MEM
 			p.To.Reg = dst_reg
 			p.To.Offset = offset
diff --git a/src/cmd/compile/internal/ssa/gen/PPC64Ops.go b/src/cmd/compile/internal/ssa/gen/PPC64Ops.go
index 5505db5..a6bcc26 100644
--- a/src/cmd/compile/internal/ssa/gen/PPC64Ops.go
+++ b/src/cmd/compile/internal/ssa/gen/PPC64Ops.go
@@ -416,13 +416,13 @@
 		// a loop is generated when there is more than one iteration
 		// needed to clear 4 doublewords
 		//
+		//	XXLXOR	VS32,VS32,VS32
 		// 	MOVD	$len/32,R31
 		//	MOVD	R31,CTR
+		//	MOVD	$16,R31
 		//	loop:
-		//	MOVD	R0,(R3)
-		//	MOVD	R0,8(R3)
-		//	MOVD	R0,16(R3)
-		//	MOVD	R0,24(R3)
+		//	STXVD2X VS32,(R0)(R3)
+		//	STXVD2X	VS32,(R31)(R3)
 		//	ADD	R3,32
 		//	BC	loop
 
@@ -448,33 +448,38 @@
 			typ:            "Mem",
 			faultOnNilArg0: true,
 		},
+		// R31 is temp register
 		// Loop code:
-		//	MOVD len/32,REG_TMP  only for loop
-		//	MOVD REG_TMP,CTR     only for loop
+		//	MOVD len/32,R31		set up loop ctr
+		//	MOVD R31,CTR
+		//	MOVD $16,R31		index register
 		// loop:
-		//	MOVD (R4),R7
-		//	MOVD 8(R4),R8
-		//	MOVD 16(R4),R9
-		//	MOVD 24(R4),R10
-		//	ADD  R4,$32          only with loop
-		//	MOVD R7,(R3)
-		//	MOVD R8,8(R3)
-		//	MOVD R9,16(R3)
-		//	MOVD R10,24(R3)
-		//	ADD  R3,$32          only with loop
-		//	BC 16,0,loop         only with loop
+		//	LXVD2X (R0)(R4),VS32
+		//	LXVD2X (R31)(R4),VS33
+		//	ADD  R4,$32          increment src
+		//	STXVD2X VS32,(R0)(R3)
+		//	STXVD2X VS33,(R31)(R3)
+		//	ADD  R3,$32          increment dst
+		//	BC 16,0,loop         branch ctr
+		// For this purpose, VS32 and VS33 are treated as
+		// scratch registers. Since regalloc does not
+		// track vector registers, marking them as
+		// clobbered would have no effect.
+		// TODO: mark these as clobbered once regalloc
+		// manages vector registers.
+		//
 		// Bytes not moved by this loop are moved
 		// with a combination of the following instructions,
 		// starting with the largest sizes and generating as
 		// many as needed, using the appropriate offset value.
-		//	MOVD  n(R4),R7
-		//	MOVD  R7,n(R3)
-		//	MOVW  n1(R4),R7
-		//	MOVW  R7,n1(R3)
-		//	MOVH  n2(R4),R7
-		//	MOVH  R7,n2(R3)
-		//	MOVB  n3(R4),R7
-		//	MOVB  R7,n3(R3)
+		//	MOVD  n(R4),R14
+		//	MOVD  R14,n(R3)
+		//	MOVW  n1(R4),R14
+		//	MOVW  R14,n1(R3)
+		//	MOVH  n2(R4),R14
+		//	MOVH  R14,n2(R3)
+		//	MOVB  n3(R4),R14
+		//	MOVB  R14,n3(R3)
 
 		{
 			name:      "LoweredMove",
@@ -482,7 +487,7 @@
 			argLength: 3,
 			reg: regInfo{
 				inputs:   []regMask{buildReg("R3"), buildReg("R4")},
-				clobbers: buildReg("R3 R4 R7 R8 R9 R10"),
+				clobbers: buildReg("R3 R4 R14"),
 			},
 			clobberFlags:   true,
 			typ:            "Mem",
diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go
index 795c6bb..c30654d 100644
--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go
@@ -24486,7 +24486,7 @@
 				{0, 8},  // R3
 				{1, 16}, // R4
 			},
-			clobbers: 1944, // R3 R4 R7 R8 R9 R10
+			clobbers: 16408, // R3 R4 R14
 		},
 	},
 	{
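
As a sanity check (not part of the CL), the old and new clobber
masks decode as one bit per register; the bit positions are inferred
from the inputs entries above ({0, 8} is R3, so bit 3; {1, 16} is
R4, so bit 4):

package main

import "fmt"

func main() {
	oldMask := 1<<3 | 1<<4 | 1<<7 | 1<<8 | 1<<9 | 1<<10 // R3 R4 R7 R8 R9 R10
	newMask := 1<<3 | 1<<4 | 1<<14                      // R3 R4 R14
	fmt.Println(oldMask, newMask)                       // prints 1944 16408
}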