cmd/compile: intrinsify math/bits/ReverseBytes{16|32|64} for ppc64/power10

This change intrinsifies ReverseBytes{16|32|64} by generating the
corresponding new instructions in Power10: brh, brd and brw and
adds a verification test for the same.
On Power 9 and 8, the .go code performs optimally as it is.

Performance improvement seen on Power10:
ReverseBytes32  1.38ns ± 0%  1.18ns ± 0%  -14.2
ReverseBytes64  1.52ns ± 0%  1.11ns ± 0%  -26.87
ReverseBytes16  1.41ns ± 1%  1.18ns ± 0%  -16.47

Change-Id: I88f127f3ab9ba24a772becc21ad90acfba324b37
Reviewed-on: https://go-review.googlesource.com/c/go/+/446675
Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
diff --git a/src/cmd/compile/internal/ppc64/ssa.go b/src/cmd/compile/internal/ppc64/ssa.go
index d567a12..08a2a0c 100644
--- a/src/cmd/compile/internal/ppc64/ssa.go
+++ b/src/cmd/compile/internal/ppc64/ssa.go
@@ -670,7 +670,7 @@
 	case ssa.OpPPC64NEG, ssa.OpPPC64FNEG, ssa.OpPPC64FSQRT, ssa.OpPPC64FSQRTS, ssa.OpPPC64FFLOOR, ssa.OpPPC64FTRUNC, ssa.OpPPC64FCEIL,
 		ssa.OpPPC64FCTIDZ, ssa.OpPPC64FCTIWZ, ssa.OpPPC64FCFID, ssa.OpPPC64FCFIDS, ssa.OpPPC64FRSP, ssa.OpPPC64CNTLZD, ssa.OpPPC64CNTLZW,
 		ssa.OpPPC64POPCNTD, ssa.OpPPC64POPCNTW, ssa.OpPPC64POPCNTB, ssa.OpPPC64MFVSRD, ssa.OpPPC64MTVSRD, ssa.OpPPC64FABS, ssa.OpPPC64FNABS,
-		ssa.OpPPC64FROUND, ssa.OpPPC64CNTTZW, ssa.OpPPC64CNTTZD:
+		ssa.OpPPC64FROUND, ssa.OpPPC64CNTTZW, ssa.OpPPC64CNTTZD, ssa.OpPPC64BRH, ssa.OpPPC64BRW, ssa.OpPPC64BRD:
 		r := v.Reg()
 		p := s.Prog(v.Op.Asm())
 		p.To.Type = obj.TYPE_REG
diff --git a/src/cmd/compile/internal/ssa/_gen/PPC64.rules b/src/cmd/compile/internal/ssa/_gen/PPC64.rules
index 2eda1af..7953125 100644
--- a/src/cmd/compile/internal/ssa/_gen/PPC64.rules
+++ b/src/cmd/compile/internal/ssa/_gen/PPC64.rules
@@ -1273,3 +1273,5 @@
 (PrefetchCache ptr mem)          => (DCBT ptr mem [0])
 (PrefetchCacheStreamed ptr mem)  => (DCBT ptr mem [16])
 
+// Use byte reverse instructions on Power10
+(Bswap(16|32|64) x) && buildcfg.GOPPC64>=10 => (BR(H|W|D) x)
diff --git a/src/cmd/compile/internal/ssa/_gen/PPC64Ops.go b/src/cmd/compile/internal/ssa/_gen/PPC64Ops.go
index 88d85ed..10e8f1d 100644
--- a/src/cmd/compile/internal/ssa/_gen/PPC64Ops.go
+++ b/src/cmd/compile/internal/ssa/_gen/PPC64Ops.go
@@ -295,6 +295,9 @@
 		{name: "XORCC", argLength: 2, reg: gp21, asm: "XORCC", commutative: true, clobberFlags: true, typ: "(Int,Flags)"},   // arg0^arg1 sets CC
 		{name: "EQV", argLength: 2, reg: gp21, asm: "EQV", typ: "Int64", commutative: true},                                 // arg0^^arg1
 		{name: "NEG", argLength: 1, reg: gp11, asm: "NEG"},                                                                  // -arg0 (integer)
+		{name: "BRD", argLength: 1, reg: gp11, asm: "BRD"},                                                                  // reversebytes64(arg0)
+		{name: "BRW", argLength: 1, reg: gp11, asm: "BRW"},                                                                  // reversebytes32(arg0)
+		{name: "BRH", argLength: 1, reg: gp11, asm: "BRH"},                                                                  // reversebytes16(arg0)
 		{name: "FNEG", argLength: 1, reg: fp11, asm: "FNEG"},                                                                // -arg0 (floating point)
 		{name: "FSQRT", argLength: 1, reg: fp11, asm: "FSQRT"},                                                              // sqrt(arg0) (floating point)
 		{name: "FSQRTS", argLength: 1, reg: fp11, asm: "FSQRTS"},                                                            // sqrt(arg0) (floating point, single precision)
diff --git a/src/cmd/compile/internal/ssa/_gen/genericOps.go b/src/cmd/compile/internal/ssa/_gen/genericOps.go
index cb4470a..6ecccc3 100644
--- a/src/cmd/compile/internal/ssa/_gen/genericOps.go
+++ b/src/cmd/compile/internal/ssa/_gen/genericOps.go
@@ -238,6 +238,7 @@
 	{name: "BitLen32", argLength: 1},     // Number of bits in arg[0] (returns 0-32)
 	{name: "BitLen64", argLength: 1},     // Number of bits in arg[0] (returns 0-64)
 
+	{name: "Bswap16", argLength: 1}, // Swap bytes
 	{name: "Bswap32", argLength: 1}, // Swap bytes
 	{name: "Bswap64", argLength: 1}, // Swap bytes
 
diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go
index 6c26213..59e1a5e 100644
--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go
@@ -2161,6 +2161,9 @@
 	OpPPC64XORCC
 	OpPPC64EQV
 	OpPPC64NEG
+	OpPPC64BRD
+	OpPPC64BRW
+	OpPPC64BRH
 	OpPPC64FNEG
 	OpPPC64FSQRT
 	OpPPC64FSQRTS
@@ -2962,6 +2965,7 @@
 	OpBitLen16
 	OpBitLen32
 	OpBitLen64
+	OpBswap16
 	OpBswap32
 	OpBswap64
 	OpBitRev8
@@ -29014,6 +29018,45 @@
 		},
 	},
 	{
+		name:   "BRD",
+		argLen: 1,
+		asm:    ppc64.ABRD,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 1073733630}, // SP SB R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29
+			},
+			outputs: []outputInfo{
+				{0, 1073733624}, // R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29
+			},
+		},
+	},
+	{
+		name:   "BRW",
+		argLen: 1,
+		asm:    ppc64.ABRW,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 1073733630}, // SP SB R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29
+			},
+			outputs: []outputInfo{
+				{0, 1073733624}, // R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29
+			},
+		},
+	},
+	{
+		name:   "BRH",
+		argLen: 1,
+		asm:    ppc64.ABRH,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 1073733630}, // SP SB R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29
+			},
+			outputs: []outputInfo{
+				{0, 1073733624}, // R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29
+			},
+		},
+	},
+	{
 		name:   "FNEG",
 		argLen: 1,
 		asm:    ppc64.AFNEG,
@@ -38565,6 +38608,11 @@
 		generic: true,
 	},
 	{
+		name:    "Bswap16",
+		argLen:  1,
+		generic: true,
+	},
+	{
 		name:    "Bswap32",
 		argLen:  1,
 		generic: true,
diff --git a/src/cmd/compile/internal/ssa/rewritePPC64.go b/src/cmd/compile/internal/ssa/rewritePPC64.go
index aee570d..7b6e3be 100644
--- a/src/cmd/compile/internal/ssa/rewritePPC64.go
+++ b/src/cmd/compile/internal/ssa/rewritePPC64.go
@@ -107,6 +107,12 @@
 		return rewriteValuePPC64_OpBitLen32(v)
 	case OpBitLen64:
 		return rewriteValuePPC64_OpBitLen64(v)
+	case OpBswap16:
+		return rewriteValuePPC64_OpBswap16(v)
+	case OpBswap32:
+		return rewriteValuePPC64_OpBswap32(v)
+	case OpBswap64:
+		return rewriteValuePPC64_OpBswap64(v)
 	case OpCeil:
 		v.Op = OpPPC64FCEIL
 		return true
@@ -1122,6 +1128,54 @@
 		return true
 	}
 }
+func rewriteValuePPC64_OpBswap16(v *Value) bool {
+	v_0 := v.Args[0]
+	// match: (Bswap16 x)
+	// cond: buildcfg.GOPPC64>=10
+	// result: (BRH x)
+	for {
+		x := v_0
+		if !(buildcfg.GOPPC64 >= 10) {
+			break
+		}
+		v.reset(OpPPC64BRH)
+		v.AddArg(x)
+		return true
+	}
+	return false
+}
+func rewriteValuePPC64_OpBswap32(v *Value) bool {
+	v_0 := v.Args[0]
+	// match: (Bswap32 x)
+	// cond: buildcfg.GOPPC64>=10
+	// result: (BRW x)
+	for {
+		x := v_0
+		if !(buildcfg.GOPPC64 >= 10) {
+			break
+		}
+		v.reset(OpPPC64BRW)
+		v.AddArg(x)
+		return true
+	}
+	return false
+}
+func rewriteValuePPC64_OpBswap64(v *Value) bool {
+	v_0 := v.Args[0]
+	// match: (Bswap64 x)
+	// cond: buildcfg.GOPPC64>=10
+	// result: (BRD x)
+	for {
+		x := v_0
+		if !(buildcfg.GOPPC64 >= 10) {
+			break
+		}
+		v.reset(OpPPC64BRD)
+		v.AddArg(x)
+		return true
+	}
+	return false
+}
 func rewriteValuePPC64_OpCom16(v *Value) bool {
 	v_0 := v.Args[0]
 	// match: (Com16 x)
diff --git a/src/cmd/compile/internal/ssagen/ssa.go b/src/cmd/compile/internal/ssagen/ssa.go
index 654db8f..48f813a 100644
--- a/src/cmd/compile/internal/ssagen/ssa.go
+++ b/src/cmd/compile/internal/ssagen/ssa.go
@@ -4000,17 +4000,23 @@
 		},
 		sys.ARM64, sys.PPC64)
 
+	/* Use only on Power10 as the new byte reverse instructions that Power10 provide
+	   make it worthwhile as an intrinsic */
+	brev_arch := []sys.ArchFamily{sys.AMD64, sys.ARM64, sys.ARM, sys.S390X}
+	if buildcfg.GOPPC64 >= 10 {
+		brev_arch = append(brev_arch, sys.PPC64)
+	}
 	/******** runtime/internal/sys ********/
 	addF("runtime/internal/sys", "Bswap32",
 		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
 			return s.newValue1(ssa.OpBswap32, types.Types[types.TUINT32], args[0])
 		},
-		sys.AMD64, sys.ARM64, sys.ARM, sys.S390X)
+		brev_arch...)
 	addF("runtime/internal/sys", "Bswap64",
 		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
 			return s.newValue1(ssa.OpBswap64, types.Types[types.TUINT64], args[0])
 		},
-		sys.AMD64, sys.ARM64, sys.ARM, sys.S390X)
+		brev_arch...)
 
 	/****** Prefetch ******/
 	makePrefetchFunc := func(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
@@ -4537,7 +4543,16 @@
 	alias("math/bits", "ReverseBytes64", "runtime/internal/sys", "Bswap64", all...)
 	alias("math/bits", "ReverseBytes32", "runtime/internal/sys", "Bswap32", all...)
 	// ReverseBytes inlines correctly, no need to intrinsify it.
-	// ReverseBytes16 lowers to a rotate, no need for anything special here.
+	// Nothing special is needed for targets where ReverseBytes16 lowers to a rotate
+	// On Power10, 16-bit rotate is not available so use BRH instruction
+	if buildcfg.GOPPC64 >= 10 {
+		addF("math/bits", "ReverseBytes16",
+			func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+				return s.newValue1(ssa.OpBswap16, types.Types[types.TUINT], args[0])
+			},
+			sys.PPC64)
+	}
+
 	addF("math/bits", "Len64",
 		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
 			return s.newValue1(ssa.OpBitLen64, types.Types[types.TINT], args[0])
diff --git a/test/codegen/mathbits.go b/test/codegen/mathbits.go
index 09aa5a1..0eed27a 100644
--- a/test/codegen/mathbits.go
+++ b/test/codegen/mathbits.go
@@ -198,6 +198,7 @@
 	// amd64:"BSWAPQ"
 	// s390x:"MOVDBR"
 	// arm64:"REV"
+	// ppc64x/power10: "BRD"
 	return bits.ReverseBytes64(n)
 }
 
@@ -205,6 +206,7 @@
 	// amd64:"BSWAPL"
 	// s390x:"MOVWBR"
 	// arm64:"REVW"
+	// ppc64x/power10: "BRW"
 	return bits.ReverseBytes32(n)
 }
 
@@ -214,6 +216,7 @@
 	// arm/5:"SLL","SRL","ORR"
 	// arm/6:"REV16"
 	// arm/7:"REV16"
+	// ppc64x/power10: "BRH"
 	return bits.ReverseBytes16(n)
 }
 
diff --git a/test/run.go b/test/run.go
index 8eff84d..611fb02 100644
--- a/test/run.go
+++ b/test/run.go
@@ -1649,8 +1649,8 @@
 		"loong64": {},
 		"mips":    {"GOMIPS", "hardfloat", "softfloat"},
 		"mips64":  {"GOMIPS64", "hardfloat", "softfloat"},
-		"ppc64":   {"GOPPC64", "power8", "power9"},
-		"ppc64le": {"GOPPC64", "power8", "power9"},
+		"ppc64":   {"GOPPC64", "power8", "power9", "power10"},
+		"ppc64le": {"GOPPC64", "power8", "power9", "power10"},
 		"ppc64x":  {}, // A pseudo-arch representing both ppc64 and ppc64le
 		"s390x":   {},
 		"wasm":    {},