cmd/compile: use vsx loads and stores for LoweredMove, LoweredZero on ppc64x
This improves the code generated for LoweredMove and LoweredZero by
using LXVD2X and STXVD2X to move 16 bytes at a time. These instructions
are now used if the size to be moved or zeroed is >= 64. These same
instructions have already been used in the asm implementations for
memmove and memclr.
Some examples where this shows an improvement on power8:
MakeSlice/Byte 27.3ns ± 1% 25.2ns ± 0% -7.69%
MakeSlice/Int16 40.2ns ± 0% 35.2ns ± 0% -12.39%
MakeSlice/Int 94.9ns ± 1% 77.9ns ± 0% -17.92%
MakeSlice/Ptr 129ns ± 1% 103ns ± 0% -20.16%
MakeSlice/Struct/24 176ns ± 1% 131ns ± 0% -25.67%
MakeSlice/Struct/32 200ns ± 1% 142ns ± 0% -29.09%
MakeSlice/Struct/40 220ns ± 2% 156ns ± 0% -28.82%
GrowSlice/Byte 81.4ns ± 0% 73.4ns ± 0% -9.88%
GrowSlice/Int16 118ns ± 1% 98ns ± 0% -17.03%
GrowSlice/Int 178ns ± 1% 134ns ± 1% -24.65%
GrowSlice/Ptr 249ns ± 4% 212ns ± 0% -14.94%
GrowSlice/Struct/24 294ns ± 5% 215ns ± 0% -27.08%
GrowSlice/Struct/32 315ns ± 1% 248ns ± 0% -21.49%
GrowSlice/Struct/40 382ns ± 4% 289ns ± 1% -24.38%
ExtendSlice/IntSlice 109ns ± 1% 90ns ± 1% -17.51%
ExtendSlice/PointerSlice 142ns ± 2% 118ns ± 0% -16.75%
ExtendSlice/NoGrow 6.02ns ± 0% 5.88ns ± 0% -2.33%
Append 27.2ns ± 0% 27.6ns ± 0% +1.38%
AppendGrowByte 4.20ms ± 3% 2.60ms ± 0% -38.18%
AppendGrowString 134ms ± 3% 102ms ± 2% -23.62%
AppendSlice/1Bytes 5.65ns ± 0% 5.67ns ± 0% +0.35%
AppendSlice/4Bytes 6.40ns ± 0% 6.55ns ± 0% +2.34%
AppendSlice/7Bytes 8.74ns ± 0% 8.84ns ± 0% +1.14%
AppendSlice/8Bytes 5.68ns ± 0% 5.70ns ± 0% +0.40%
AppendSlice/15Bytes 9.31ns ± 0% 9.39ns ± 0% +0.86%
AppendSlice/16Bytes 14.0ns ± 0% 5.8ns ± 0% -58.32%
AppendSlice/32Bytes 5.72ns ± 0% 5.68ns ± 0% -0.66%
AppendSliceLarge/1024Bytes 918ns ± 8% 615ns ± 1% -33.00%
AppendSliceLarge/4096Bytes 3.25µs ± 1% 1.92µs ± 1% -40.84%
AppendSliceLarge/16384Bytes 8.70µs ± 2% 4.69µs ± 0% -46.08%
AppendSliceLarge/65536Bytes 18.1µs ± 3% 7.9µs ± 0% -56.30%
AppendSliceLarge/262144Bytes 69.8µs ± 2% 25.9µs ± 0% -62.91%
AppendSliceLarge/1048576Bytes 258µs ± 1% 93µs ± 0% -63.96%
Change-Id: I21625dbe231a2029ddb9f7d73f5a6417b35c1e49
Reviewed-on: https://go-review.googlesource.com/c/go/+/199639
Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
diff --git a/src/cmd/compile/internal/ppc64/ssa.go b/src/cmd/compile/internal/ppc64/ssa.go
index 69847c3..4f852b8 100644
--- a/src/cmd/compile/internal/ppc64/ssa.go
+++ b/src/cmd/compile/internal/ppc64/ssa.go
@@ -855,13 +855,13 @@
// for sizes >= 64 generate a loop as follows:
// set up loop counter in CTR, used by BC
+ // XXLXOR VS32,VS32,VS32
// MOVD len/32,REG_TMP
// MOVD REG_TMP,CTR
+ // MOVD $16,REG_TMP
// loop:
- // MOVD R0,(R3)
- // MOVD R0,8(R3)
- // MOVD R0,16(R3)
- // MOVD R0,24(R3)
+ // STXVD2X VS32,(R0)(R3)
+ // STXVD2X VS32,(R31)(R3)
// ADD $32,R3
// BC 16, 0, loop
//
@@ -895,8 +895,16 @@
// only generate a loop if there is more
// than 1 iteration.
if ctr > 1 {
+ // Set up VS32 (V0) to hold 0s
+ p := s.Prog(ppc64.AXXLXOR)
+ p.From.Type = obj.TYPE_REG
+ p.From.Reg = ppc64.REG_VS32
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = ppc64.REG_VS32
+ p.Reg = ppc64.REG_VS32
+
// Set up CTR loop counter
- p := s.Prog(ppc64.AMOVD)
+ p = s.Prog(ppc64.AMOVD)
p.From.Type = obj.TYPE_CONST
p.From.Offset = ctr
p.To.Type = obj.TYPE_REG
@@ -908,23 +916,35 @@
p.To.Type = obj.TYPE_REG
p.To.Reg = ppc64.REG_CTR
- // generate 4 MOVDs
+ // Set up R31 to hold index value 16
+ p = s.Prog(ppc64.AMOVD)
+ p.From.Type = obj.TYPE_CONST
+ p.From.Offset = 16
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = ppc64.REGTMP
+
+ // generate 2 STXVD2Xs to store 16 bytes
// when this is a loop then the top must be saved
var top *obj.Prog
- for offset := int64(0); offset < 32; offset += 8 {
- // This is the top of loop
- p := s.Prog(ppc64.AMOVD)
- p.From.Type = obj.TYPE_REG
- p.From.Reg = ppc64.REG_R0
- p.To.Type = obj.TYPE_MEM
- p.To.Reg = v.Args[0].Reg()
- p.To.Offset = offset
- // Save the top of loop
- if top == nil {
- top = p
- }
+ // This is the top of loop
+ p = s.Prog(ppc64.ASTXVD2X)
+ p.From.Type = obj.TYPE_REG
+ p.From.Reg = ppc64.REG_VS32
+ p.To.Type = obj.TYPE_MEM
+ p.To.Reg = v.Args[0].Reg()
+ p.To.Index = ppc64.REGZERO
+ // Save the top of loop
+ if top == nil {
+ top = p
}
+ p = s.Prog(ppc64.ASTXVD2X)
+ p.From.Type = obj.TYPE_REG
+ p.From.Reg = ppc64.REG_VS32
+ p.To.Type = obj.TYPE_MEM
+ p.To.Reg = v.Args[0].Reg()
+ p.To.Index = ppc64.REGTMP
+
// Increment address for the
// 4 doublewords just zeroed.
p = s.Prog(ppc64.AADD)
@@ -994,30 +1014,27 @@
// When moving >= 64 bytes a loop is used
// MOVD len/32,REG_TMP
// MOVD REG_TMP,CTR
+ // MOVD $16,REG_TMP
// top:
- // MOVD (R4),R7
- // MOVD 8(R4),R8
- // MOVD 16(R4),R9
- // MOVD 24(R4),R10
- // ADD R4,$32
- // MOVD R7,(R3)
- // MOVD R8,8(R3)
- // MOVD R9,16(R3)
- // MOVD R10,24(R3)
- // ADD R3,$32
+ // LXVD2X (R0)(R4),VS32
+ // LXVD2X (R31)(R4),VS33
+ // ADD $32,R4
+ // STXVD2X VS32,(R0)(R3)
+ // STXVD2X VS33,(R31)(R3)
+ // ADD $32,R3
// BC 16,0,top
// Bytes not moved by this loop are moved
// with a combination of the following instructions,
// starting with the largest sizes and generating as
// many as needed, using the appropriate offset value.
- // MOVD n(R4),R7
- // MOVD R7,n(R3)
- // MOVW n1(R4),R7
- // MOVW R7,n1(R3)
- // MOVH n2(R4),R7
- // MOVH R7,n2(R3)
- // MOVB n3(R4),R7
- // MOVB R7,n3(R3)
+ // MOVD n(R4),R14
+ // MOVD R14,n(R3)
+ // MOVW n1(R4),R14
+ // MOVW R14,n1(R3)
+ // MOVH n2(R4),R14
+ // MOVH R14,n2(R3)
+ // MOVB n3(R4),R14
+ // MOVB R14,n3(R3)
// Each loop iteration moves 32 bytes
ctr := v.AuxInt / 32
@@ -1030,7 +1047,6 @@
// The set of registers used here, must match the clobbered reg list
// in PPC64Ops.go.
- useregs := []int16{ppc64.REG_R7, ppc64.REG_R8, ppc64.REG_R9, ppc64.REG_R10}
offset := int64(0)
// top of the loop
@@ -1050,22 +1066,35 @@
p.To.Type = obj.TYPE_REG
p.To.Reg = ppc64.REG_CTR
- // Generate all the MOVDs for loads
- // based off the same register, increasing
- // the offset by 8 for each instruction
- for _, rg := range useregs {
- p := s.Prog(ppc64.AMOVD)
- p.From.Type = obj.TYPE_MEM
- p.From.Reg = src_reg
- p.From.Offset = offset
- p.To.Type = obj.TYPE_REG
- p.To.Reg = rg
- if top == nil {
- top = p
- }
- offset += 8
+ // Use REGTMP as index reg
+ p = s.Prog(ppc64.AMOVD)
+ p.From.Type = obj.TYPE_CONST
+ p.From.Offset = 16
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = ppc64.REGTMP
+
+ // Generate 16 byte loads and stores.
+ // Use temp register for index (16)
+ // on the second one.
+ p = s.Prog(ppc64.ALXVD2X)
+ p.From.Type = obj.TYPE_MEM
+ p.From.Reg = src_reg
+ p.From.Index = ppc64.REGZERO
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = ppc64.REG_VS32
+
+ if top == nil {
+ top = p
}
- // increment the src_reg for next iteration
+
+ p = s.Prog(ppc64.ALXVD2X)
+ p.From.Type = obj.TYPE_MEM
+ p.From.Reg = src_reg
+ p.From.Index = ppc64.REGTMP
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = ppc64.REG_VS33
+
+ // increment the src reg for next iteration
p = s.Prog(ppc64.AADD)
p.Reg = src_reg
p.From.Type = obj.TYPE_CONST
@@ -1073,20 +1102,22 @@
p.To.Type = obj.TYPE_REG
p.To.Reg = src_reg
- // generate the MOVDs for stores, based
- // off the same register, using the same
- // offsets as in the loads.
- offset = int64(0)
- for _, rg := range useregs {
- p := s.Prog(ppc64.AMOVD)
- p.From.Type = obj.TYPE_REG
- p.From.Reg = rg
- p.To.Type = obj.TYPE_MEM
- p.To.Reg = dst_reg
- p.To.Offset = offset
- offset += 8
- }
- // increment the dst_reg for next iteration
+ // generate 16 byte stores
+ p = s.Prog(ppc64.ASTXVD2X)
+ p.From.Type = obj.TYPE_REG
+ p.From.Reg = ppc64.REG_VS32
+ p.To.Type = obj.TYPE_MEM
+ p.To.Reg = dst_reg
+ p.To.Index = ppc64.REGZERO
+
+ p = s.Prog(ppc64.ASTXVD2X)
+ p.From.Type = obj.TYPE_REG
+ p.From.Reg = ppc64.REG_VS33
+ p.To.Type = obj.TYPE_MEM
+ p.To.Reg = dst_reg
+ p.To.Index = ppc64.REGTMP
+
+ // increment the dst reg for next iteration
p = s.Prog(ppc64.AADD)
p.Reg = dst_reg
p.From.Type = obj.TYPE_CONST
@@ -1114,6 +1145,57 @@
rem += 32
}
+ if rem >= 16 {
+ // Generate 16 byte loads and stores.
+ // Use temp register for index (value 16)
+ // on the second one.
+ p := s.Prog(ppc64.ALXVD2X)
+ p.From.Type = obj.TYPE_MEM
+ p.From.Reg = src_reg
+ p.From.Index = ppc64.REGZERO
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = ppc64.REG_VS32
+
+ p = s.Prog(ppc64.ASTXVD2X)
+ p.From.Type = obj.TYPE_REG
+ p.From.Reg = ppc64.REG_VS32
+ p.To.Type = obj.TYPE_MEM
+ p.To.Reg = dst_reg
+ p.To.Index = ppc64.REGZERO
+
+ offset = 16
+ rem -= 16
+
+ if rem >= 16 {
+ // Use REGTMP as index reg
+ p = s.Prog(ppc64.AMOVD)
+ p.From.Type = obj.TYPE_CONST
+ p.From.Offset = 16
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = ppc64.REGTMP
+
+ // Generate 16 byte loads and stores.
+ // Use temp register for index (16)
+ // on the second one.
+ p = s.Prog(ppc64.ALXVD2X)
+ p.From.Type = obj.TYPE_MEM
+ p.From.Reg = src_reg
+ p.From.Index = ppc64.REGTMP
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = ppc64.REG_VS32
+
+ p = s.Prog(ppc64.ASTXVD2X)
+ p.From.Type = obj.TYPE_REG
+ p.From.Reg = ppc64.REG_VS32
+ p.To.Type = obj.TYPE_MEM
+ p.To.Reg = dst_reg
+ p.To.Index = ppc64.REGTMP
+
+ offset = 32
+ rem -= 16
+ }
+ }
+
// Generate all the remaining load and store pairs, starting with
// as many 8 byte moves as possible, then 4, 2, 1.
for rem > 0 {
@@ -1129,7 +1211,7 @@
// Load
p := s.Prog(op)
p.To.Type = obj.TYPE_REG
- p.To.Reg = ppc64.REG_R7
+ p.To.Reg = ppc64.REG_R14
p.From.Type = obj.TYPE_MEM
p.From.Reg = src_reg
p.From.Offset = offset
@@ -1137,7 +1219,7 @@
// Store
p = s.Prog(op)
p.From.Type = obj.TYPE_REG
- p.From.Reg = ppc64.REG_R7
+ p.From.Reg = ppc64.REG_R14
p.To.Type = obj.TYPE_MEM
p.To.Reg = dst_reg
p.To.Offset = offset
diff --git a/src/cmd/compile/internal/ssa/gen/PPC64Ops.go b/src/cmd/compile/internal/ssa/gen/PPC64Ops.go
index 5505db5..a6bcc26 100644
--- a/src/cmd/compile/internal/ssa/gen/PPC64Ops.go
+++ b/src/cmd/compile/internal/ssa/gen/PPC64Ops.go
@@ -416,13 +416,13 @@
// a loop is generated when there is more than one iteration
// needed to clear 4 doublewords
//
+ // XXLXOR VS32,VS32,VS32
// MOVD $len/32,R31
// MOVD R31,CTR
+ // MOVD $16,R31
// loop:
- // MOVD R0,(R3)
- // MOVD R0,8(R3)
- // MOVD R0,16(R3)
- // MOVD R0,24(R3)
+ // STXVD2X VS32,(R0)(R3)
+ // STXVD2X VS32,(R31)(R3)
// ADD R3,32
// BC loop
@@ -448,33 +448,38 @@
typ: "Mem",
faultOnNilArg0: true,
},
+ // R31 is temp register
// Loop code:
- // MOVD len/32,REG_TMP only for loop
- // MOVD REG_TMP,CTR only for loop
+ // MOVD len/32,R31 set up loop ctr
+ // MOVD R31,CTR
+ // MOVD $16,R31 index register
// loop:
- // MOVD (R4),R7
- // MOVD 8(R4),R8
- // MOVD 16(R4),R9
- // MOVD 24(R4),R10
- // ADD R4,$32 only with loop
- // MOVD R7,(R3)
- // MOVD R8,8(R3)
- // MOVD R9,16(R3)
- // MOVD R10,24(R3)
- // ADD R3,$32 only with loop
- // BC 16,0,loop only with loop
+ // LXVD2X (R0)(R4),VS32
+ // LXVD2X (R31)(R4),VS33
+ // ADD R4,$32 increment src
+ // STXVD2X VS32,(R0)(R3)
+ // STXVD2X VS33,(R31)(R3)
+ // ADD R3,$32 increment dst
+ // BC 16,0,loop branch ctr
+ // For this purpose, VS32 and VS33 are treated as
+ // scratch registers. Since regalloc does not
+ // track vector registers, even if it could be marked
+ // as clobbered it would have no effect.
+ // TODO: If vector registers are managed by regalloc
+ // mark these as clobbered.
+ //
// Bytes not moved by this loop are moved
// with a combination of the following instructions,
// starting with the largest sizes and generating as
// many as needed, using the appropriate offset value.
- // MOVD n(R4),R7
- // MOVD R7,n(R3)
- // MOVW n1(R4),R7
- // MOVW R7,n1(R3)
- // MOVH n2(R4),R7
- // MOVH R7,n2(R3)
- // MOVB n3(R4),R7
- // MOVB R7,n3(R3)
+ // MOVD n(R4),R14
+ // MOVD R14,n(R3)
+ // MOVW n1(R4),R14
+ // MOVW R14,n1(R3)
+ // MOVH n2(R4),R14
+ // MOVH R14,n2(R3)
+ // MOVB n3(R4),R14
+ // MOVB R14,n3(R3)
{
name: "LoweredMove",
@@ -482,7 +487,7 @@
argLength: 3,
reg: regInfo{
inputs: []regMask{buildReg("R3"), buildReg("R4")},
- clobbers: buildReg("R3 R4 R7 R8 R9 R10"),
+ clobbers: buildReg("R3 R4 R14"),
},
clobberFlags: true,
typ: "Mem",
diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go
index 795c6bb..c30654d 100644
--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go
@@ -24486,7 +24486,7 @@
{0, 8}, // R3
{1, 16}, // R4
},
- clobbers: 1944, // R3 R4 R7 R8 R9 R10
+ clobbers: 16408, // R3 R4 R14
},
},
{