cmd/compile: clean up and optimize s390x multiplication rules

Some of the existing optimizations aren't triggered because they
are handled by the generic rules so this CL removes them. Also
some constraints were copied without much thought from the amd64
rules and they don't make sense on s390x, so we remove those
constraints.

Finally, add a 'multiply by the sum of two powers of two'
optimization. This makes sense on s390x as shifts are low latency
and can also sometimes be optimized further (especially if we add
support for RISBG instructions).

name                   old time/op  new time/op  delta
IntMulByConst/3-8      1.70ns ±11%  1.10ns ± 5%  -35.26%  (p=0.000 n=10+10)
IntMulByConst/5-8      1.64ns ± 7%  1.10ns ± 4%  -32.94%  (p=0.000 n=10+9)
IntMulByConst/12-8     1.65ns ± 6%  1.20ns ± 4%  -27.16%  (p=0.000 n=10+9)
IntMulByConst/120-8    1.66ns ± 4%  1.22ns ±13%  -26.43%  (p=0.000 n=10+10)
IntMulByConst/-120-8   1.65ns ± 7%  1.19ns ± 4%  -28.06%  (p=0.000 n=9+10)
IntMulByConst/65537-8  0.86ns ± 9%  1.12ns ±12%  +30.41%  (p=0.000 n=10+10)
IntMulByConst/65538-8  1.65ns ± 5%  1.23ns ± 5%  -25.11%  (p=0.000 n=10+10)

Change-Id: Ib196e6bff1e97febfd266134d0a2b2a62897989f
Reviewed-on: https://go-review.googlesource.com/c/go/+/248937
Run-TryBot: Michael Munday <mike.munday@ibm.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
diff --git a/src/cmd/compile/internal/ssa/gen/S390X.rules b/src/cmd/compile/internal/ssa/gen/S390X.rules
index d3234c1..5e4c436 100644
--- a/src/cmd/compile/internal/ssa/gen/S390X.rules
+++ b/src/cmd/compile/internal/ssa/gen/S390X.rules
@@ -716,20 +716,40 @@
 (ANDWconst [0xFF] x) => (MOVBZreg x)
 (ANDWconst [0xFFFF] x) => (MOVHZreg x)
 
-// strength reduction
-(MULLDconst [-1] x) => (NEG x)
-(MULLDconst [0] _) => (MOVDconst [0])
-(MULLDconst [1] x) => x
-(MULLDconst [c] x) && isPowerOfTwo(c) -> (SLDconst [log2(c)] x)
-(MULLDconst [c] x) && isPowerOfTwo(c+1) && c >= 15 -> (SUB (SLDconst <v.Type> [log2(c+1)] x) x)
-(MULLDconst [c] x) && isPowerOfTwo(c-1) && c >= 17 -> (ADD (SLDconst <v.Type> [log2(c-1)] x) x)
+// Strength reduce multiplication to the sum (or difference) of two powers of two.
+//
+// Examples:
+//     5x -> 4x + 1x
+//    10x -> 8x + 2x
+//   120x -> 128x - 8x
+//  -120x -> 8x - 128x
+//
+// We know that the rightmost bit of any positive value, once isolated, must either
+// be a power of 2 (because it is a single bit) or 0 (if the original value is 0).
+// In all of these rules we use a rightmost bit calculation to determine one operand
+// for the addition or subtraction. We then just need to calculate if the other
+// operand is a valid power of 2 before we can match the rule.
+//
+// Notes:
+//   - the generic rules have already matched single powers of two so we ignore them here
+//   - isPowerOfTwo32 asserts that its argument is greater than 0
+//   - c&(c-1) = clear rightmost bit
+//   - c&^(c-1) = isolate rightmost bit
 
-(MULLWconst [-1] x) => (NEGW x)
-(MULLWconst [0] _) => (MOVDconst [0])
-(MULLWconst [1] x) => x
-(MULLWconst [c] x) && isPowerOfTwo(c) -> (SLWconst [log2(c)] x)
-(MULLWconst [c] x) && isPowerOfTwo(c+1) && c >= 15 -> (SUBW (SLWconst <v.Type> [log2(c+1)] x) x)
-(MULLWconst [c] x) && isPowerOfTwo(c-1) && c >= 17 -> (ADDW (SLWconst <v.Type> [log2(c-1)] x) x)
+// c = 2ˣ + 2ʸ => c - 2ˣ = 2ʸ
+(MULL(D|W)const <t> x [c]) && isPowerOfTwo32(c&(c-1))
+  => ((ADD|ADDW) (SL(D|W)const <t> x [int8(log32(c&(c-1)))])
+                 (SL(D|W)const <t> x [int8(log32(c&^(c-1)))]))
+
+// c = 2ʸ - 2ˣ => c + 2ˣ = 2ʸ
+(MULL(D|W)const <t> x [c]) && isPowerOfTwo32(c+(c&^(c-1)))
+  => ((SUB|SUBW) (SL(D|W)const <t> x [int8(log32(c+(c&^(c-1))))])
+                 (SL(D|W)const <t> x [int8(log32(c&^(c-1)))]))
+
+// c = 2ˣ - 2ʸ => -c + 2ˣ = 2ʸ
+(MULL(D|W)const <t> x [c]) && isPowerOfTwo32(-c+(-c&^(-c-1)))
+  => ((SUB|SUBW) (SL(D|W)const <t> x [int8(log32(-c&^(-c-1)))])
+                 (SL(D|W)const <t> x [int8(log32(-c+(-c&^(-c-1))))]))
 
 // Fold ADD into MOVDaddr. Odd offsets from SB shouldn't be folded (LARL can't handle them).
 (ADDconst [c] (MOVDaddr [d] {s} x:(SB))) && ((c+d)&1 == 0) && is32Bit(c+d) -> (MOVDaddr [c+d] {s} x)
@@ -1133,6 +1153,9 @@
 (XORconst [0] x)                  => x
 (XORWconst [c] x) && int32(c)==0   => x
 
+// Shifts by zero (may be inserted during multiplication strength reduction).
+((SLD|SLW|SRD|SRW|SRAD|SRAW)const x [0]) => x
+
 // Convert constant subtracts to constant adds.
 (SUBconst [c] x) && c != -(1<<31) => (ADDconst [-c] x)
 (SUBWconst [c] x) -> (ADDWconst [int64(int32(-c))] x)
diff --git a/src/cmd/compile/internal/ssa/rewriteS390X.go b/src/cmd/compile/internal/ssa/rewriteS390X.go
index dc9b143..536f8db 100644
--- a/src/cmd/compile/internal/ssa/rewriteS390X.go
+++ b/src/cmd/compile/internal/ssa/rewriteS390X.go
@@ -732,8 +732,12 @@
 		return rewriteValueS390X_OpS390XRLLG(v)
 	case OpS390XSLD:
 		return rewriteValueS390X_OpS390XSLD(v)
+	case OpS390XSLDconst:
+		return rewriteValueS390X_OpS390XSLDconst(v)
 	case OpS390XSLW:
 		return rewriteValueS390X_OpS390XSLW(v)
+	case OpS390XSLWconst:
+		return rewriteValueS390X_OpS390XSLWconst(v)
 	case OpS390XSRAD:
 		return rewriteValueS390X_OpS390XSRAD(v)
 	case OpS390XSRADconst:
@@ -748,6 +752,8 @@
 		return rewriteValueS390X_OpS390XSRDconst(v)
 	case OpS390XSRW:
 		return rewriteValueS390X_OpS390XSRW(v)
+	case OpS390XSRWconst:
+		return rewriteValueS390X_OpS390XSRWconst(v)
 	case OpS390XSTM2:
 		return rewriteValueS390X_OpS390XSTM2(v)
 	case OpS390XSTMG2:
@@ -13853,81 +13859,64 @@
 func rewriteValueS390X_OpS390XMULLDconst(v *Value) bool {
 	v_0 := v.Args[0]
 	b := v.Block
-	// match: (MULLDconst [-1] x)
-	// result: (NEG x)
+	// match: (MULLDconst <t> x [c])
+	// cond: isPowerOfTwo32(c&(c-1))
+	// result: (ADD (SLDconst <t> x [int8(log32(c&(c-1)))]) (SLDconst <t> x [int8(log32(c&^(c-1)))]))
 	for {
-		if auxIntToInt32(v.AuxInt) != -1 {
-			break
-		}
+		t := v.Type
+		c := auxIntToInt32(v.AuxInt)
 		x := v_0
-		v.reset(OpS390XNEG)
-		v.AddArg(x)
-		return true
-	}
-	// match: (MULLDconst [0] _)
-	// result: (MOVDconst [0])
-	for {
-		if auxIntToInt32(v.AuxInt) != 0 {
-			break
-		}
-		v.reset(OpS390XMOVDconst)
-		v.AuxInt = int64ToAuxInt(0)
-		return true
-	}
-	// match: (MULLDconst [1] x)
-	// result: x
-	for {
-		if auxIntToInt32(v.AuxInt) != 1 {
-			break
-		}
-		x := v_0
-		v.copyOf(x)
-		return true
-	}
-	// match: (MULLDconst [c] x)
-	// cond: isPowerOfTwo(c)
-	// result: (SLDconst [log2(c)] x)
-	for {
-		c := v.AuxInt
-		x := v_0
-		if !(isPowerOfTwo(c)) {
-			break
-		}
-		v.reset(OpS390XSLDconst)
-		v.AuxInt = log2(c)
-		v.AddArg(x)
-		return true
-	}
-	// match: (MULLDconst [c] x)
-	// cond: isPowerOfTwo(c+1) && c >= 15
-	// result: (SUB (SLDconst <v.Type> [log2(c+1)] x) x)
-	for {
-		c := v.AuxInt
-		x := v_0
-		if !(isPowerOfTwo(c+1) && c >= 15) {
-			break
-		}
-		v.reset(OpS390XSUB)
-		v0 := b.NewValue0(v.Pos, OpS390XSLDconst, v.Type)
-		v0.AuxInt = log2(c + 1)
-		v0.AddArg(x)
-		v.AddArg2(v0, x)
-		return true
-	}
-	// match: (MULLDconst [c] x)
-	// cond: isPowerOfTwo(c-1) && c >= 17
-	// result: (ADD (SLDconst <v.Type> [log2(c-1)] x) x)
-	for {
-		c := v.AuxInt
-		x := v_0
-		if !(isPowerOfTwo(c-1) && c >= 17) {
+		if !(isPowerOfTwo32(c & (c - 1))) {
 			break
 		}
 		v.reset(OpS390XADD)
-		v0 := b.NewValue0(v.Pos, OpS390XSLDconst, v.Type)
-		v0.AuxInt = log2(c - 1)
+		v0 := b.NewValue0(v.Pos, OpS390XSLDconst, t)
+		v0.AuxInt = int8ToAuxInt(int8(log32(c & (c - 1))))
 		v0.AddArg(x)
-		v.AddArg2(v0, x)
+		v1 := b.NewValue0(v.Pos, OpS390XSLDconst, t)
+		v1.AuxInt = int8ToAuxInt(int8(log32(c &^ (c - 1))))
+		v1.AddArg(x)
+		v.AddArg2(v0, v1)
+		return true
+	}
+	// match: (MULLDconst <t> x [c])
+	// cond: isPowerOfTwo32(c+(c&^(c-1)))
+	// result: (SUB (SLDconst <t> x [int8(log32(c+(c&^(c-1))))]) (SLDconst <t> x [int8(log32(c&^(c-1)))]))
+	for {
+		t := v.Type
+		c := auxIntToInt32(v.AuxInt)
+		x := v_0
+		if !(isPowerOfTwo32(c + (c &^ (c - 1)))) {
+			break
+		}
+		v.reset(OpS390XSUB)
+		v0 := b.NewValue0(v.Pos, OpS390XSLDconst, t)
+		v0.AuxInt = int8ToAuxInt(int8(log32(c + (c &^ (c - 1)))))
+		v0.AddArg(x)
+		v1 := b.NewValue0(v.Pos, OpS390XSLDconst, t)
+		v1.AuxInt = int8ToAuxInt(int8(log32(c &^ (c - 1))))
+		v1.AddArg(x)
+		v.AddArg2(v0, v1)
+		return true
+	}
+	// match: (MULLDconst <t> x [c])
+	// cond: isPowerOfTwo32(-c+(-c&^(-c-1)))
+	// result: (SUB (SLDconst <t> x [int8(log32(-c&^(-c-1)))]) (SLDconst <t> x [int8(log32(-c+(-c&^(-c-1))))]))
+	for {
+		t := v.Type
+		c := auxIntToInt32(v.AuxInt)
+		x := v_0
+		if !(isPowerOfTwo32(-c + (-c &^ (-c - 1)))) {
+			break
+		}
+		v.reset(OpS390XSUB)
+		v0 := b.NewValue0(v.Pos, OpS390XSLDconst, t)
+		v0.AuxInt = int8ToAuxInt(int8(log32(-c &^ (-c - 1))))
+		v0.AddArg(x)
+		v1 := b.NewValue0(v.Pos, OpS390XSLDconst, t)
+		v1.AuxInt = int8ToAuxInt(int8(log32(-c + (-c &^ (-c - 1)))))
+		v1.AddArg(x)
+		v.AddArg2(v0, v1)
 		return true
 	}
 	// match: (MULLDconst [c] (MOVDconst [d]))
@@ -14097,81 +14086,64 @@
 func rewriteValueS390X_OpS390XMULLWconst(v *Value) bool {
 	v_0 := v.Args[0]
 	b := v.Block
-	// match: (MULLWconst [-1] x)
-	// result: (NEGW x)
+	// match: (MULLWconst <t> x [c])
+	// cond: isPowerOfTwo32(c&(c-1))
+	// result: (ADDW (SLWconst <t> x [int8(log32(c&(c-1)))]) (SLWconst <t> x [int8(log32(c&^(c-1)))]))
 	for {
-		if auxIntToInt32(v.AuxInt) != -1 {
-			break
-		}
+		t := v.Type
+		c := auxIntToInt32(v.AuxInt)
 		x := v_0
-		v.reset(OpS390XNEGW)
-		v.AddArg(x)
-		return true
-	}
-	// match: (MULLWconst [0] _)
-	// result: (MOVDconst [0])
-	for {
-		if auxIntToInt32(v.AuxInt) != 0 {
-			break
-		}
-		v.reset(OpS390XMOVDconst)
-		v.AuxInt = int64ToAuxInt(0)
-		return true
-	}
-	// match: (MULLWconst [1] x)
-	// result: x
-	for {
-		if auxIntToInt32(v.AuxInt) != 1 {
-			break
-		}
-		x := v_0
-		v.copyOf(x)
-		return true
-	}
-	// match: (MULLWconst [c] x)
-	// cond: isPowerOfTwo(c)
-	// result: (SLWconst [log2(c)] x)
-	for {
-		c := v.AuxInt
-		x := v_0
-		if !(isPowerOfTwo(c)) {
-			break
-		}
-		v.reset(OpS390XSLWconst)
-		v.AuxInt = log2(c)
-		v.AddArg(x)
-		return true
-	}
-	// match: (MULLWconst [c] x)
-	// cond: isPowerOfTwo(c+1) && c >= 15
-	// result: (SUBW (SLWconst <v.Type> [log2(c+1)] x) x)
-	for {
-		c := v.AuxInt
-		x := v_0
-		if !(isPowerOfTwo(c+1) && c >= 15) {
-			break
-		}
-		v.reset(OpS390XSUBW)
-		v0 := b.NewValue0(v.Pos, OpS390XSLWconst, v.Type)
-		v0.AuxInt = log2(c + 1)
-		v0.AddArg(x)
-		v.AddArg2(v0, x)
-		return true
-	}
-	// match: (MULLWconst [c] x)
-	// cond: isPowerOfTwo(c-1) && c >= 17
-	// result: (ADDW (SLWconst <v.Type> [log2(c-1)] x) x)
-	for {
-		c := v.AuxInt
-		x := v_0
-		if !(isPowerOfTwo(c-1) && c >= 17) {
+		if !(isPowerOfTwo32(c & (c - 1))) {
 			break
 		}
 		v.reset(OpS390XADDW)
-		v0 := b.NewValue0(v.Pos, OpS390XSLWconst, v.Type)
-		v0.AuxInt = log2(c - 1)
+		v0 := b.NewValue0(v.Pos, OpS390XSLWconst, t)
+		v0.AuxInt = int8ToAuxInt(int8(log32(c & (c - 1))))
 		v0.AddArg(x)
-		v.AddArg2(v0, x)
+		v1 := b.NewValue0(v.Pos, OpS390XSLWconst, t)
+		v1.AuxInt = int8ToAuxInt(int8(log32(c &^ (c - 1))))
+		v1.AddArg(x)
+		v.AddArg2(v0, v1)
+		return true
+	}
+	// match: (MULLWconst <t> x [c])
+	// cond: isPowerOfTwo32(c+(c&^(c-1)))
+	// result: (SUBW (SLWconst <t> x [int8(log32(c+(c&^(c-1))))]) (SLWconst <t> x [int8(log32(c&^(c-1)))]))
+	for {
+		t := v.Type
+		c := auxIntToInt32(v.AuxInt)
+		x := v_0
+		if !(isPowerOfTwo32(c + (c &^ (c - 1)))) {
+			break
+		}
+		v.reset(OpS390XSUBW)
+		v0 := b.NewValue0(v.Pos, OpS390XSLWconst, t)
+		v0.AuxInt = int8ToAuxInt(int8(log32(c + (c &^ (c - 1)))))
+		v0.AddArg(x)
+		v1 := b.NewValue0(v.Pos, OpS390XSLWconst, t)
+		v1.AuxInt = int8ToAuxInt(int8(log32(c &^ (c - 1))))
+		v1.AddArg(x)
+		v.AddArg2(v0, v1)
+		return true
+	}
+	// match: (MULLWconst <t> x [c])
+	// cond: isPowerOfTwo32(-c+(-c&^(-c-1)))
+	// result: (SUBW (SLWconst <t> x [int8(log32(-c&^(-c-1)))]) (SLWconst <t> x [int8(log32(-c+(-c&^(-c-1))))]))
+	for {
+		t := v.Type
+		c := auxIntToInt32(v.AuxInt)
+		x := v_0
+		if !(isPowerOfTwo32(-c + (-c &^ (-c - 1)))) {
+			break
+		}
+		v.reset(OpS390XSUBW)
+		v0 := b.NewValue0(v.Pos, OpS390XSLWconst, t)
+		v0.AuxInt = int8ToAuxInt(int8(log32(-c &^ (-c - 1))))
+		v0.AddArg(x)
+		v1 := b.NewValue0(v.Pos, OpS390XSLWconst, t)
+		v1.AuxInt = int8ToAuxInt(int8(log32(-c + (-c &^ (-c - 1)))))
+		v1.AddArg(x)
+		v.AddArg2(v0, v1)
 		return true
 	}
 	// match: (MULLWconst [c] (MOVDconst [d]))
@@ -16826,6 +16798,20 @@
 	}
 	return false
 }
+func rewriteValueS390X_OpS390XSLDconst(v *Value) bool {
+	v_0 := v.Args[0]
+	// match: (SLDconst x [0])
+	// result: x
+	for {
+		if auxIntToInt8(v.AuxInt) != 0 {
+			break
+		}
+		x := v_0
+		v.copyOf(x)
+		return true
+	}
+	return false
+}
 func rewriteValueS390X_OpS390XSLW(v *Value) bool {
 	v_1 := v.Args[1]
 	v_0 := v.Args[0]
@@ -16960,6 +16946,20 @@
 	}
 	return false
 }
+func rewriteValueS390X_OpS390XSLWconst(v *Value) bool {
+	v_0 := v.Args[0]
+	// match: (SLWconst x [0])
+	// result: x
+	for {
+		if auxIntToInt8(v.AuxInt) != 0 {
+			break
+		}
+		x := v_0
+		v.copyOf(x)
+		return true
+	}
+	return false
+}
 func rewriteValueS390X_OpS390XSRAD(v *Value) bool {
 	v_1 := v.Args[1]
 	v_0 := v.Args[0]
@@ -17096,6 +17096,16 @@
 }
 func rewriteValueS390X_OpS390XSRADconst(v *Value) bool {
 	v_0 := v.Args[0]
+	// match: (SRADconst x [0])
+	// result: x
+	for {
+		if auxIntToInt8(v.AuxInt) != 0 {
+			break
+		}
+		x := v_0
+		v.copyOf(x)
+		return true
+	}
 	// match: (SRADconst [c] (MOVDconst [d]))
 	// result: (MOVDconst [d>>uint64(c)])
 	for {
@@ -17246,6 +17256,16 @@
 }
 func rewriteValueS390X_OpS390XSRAWconst(v *Value) bool {
 	v_0 := v.Args[0]
+	// match: (SRAWconst x [0])
+	// result: x
+	for {
+		if auxIntToInt8(v.AuxInt) != 0 {
+			break
+		}
+		x := v_0
+		v.copyOf(x)
+		return true
+	}
 	// match: (SRAWconst [c] (MOVDconst [d]))
 	// result: (MOVDconst [int64(int32(d))>>uint64(c)])
 	for {
@@ -17416,6 +17436,16 @@
 		v.AddArg(v0)
 		return true
 	}
+	// match: (SRDconst x [0])
+	// result: x
+	for {
+		if auxIntToInt8(v.AuxInt) != 0 {
+			break
+		}
+		x := v_0
+		v.copyOf(x)
+		return true
+	}
 	return false
 }
 func rewriteValueS390X_OpS390XSRW(v *Value) bool {
@@ -17552,6 +17582,20 @@
 	}
 	return false
 }
+func rewriteValueS390X_OpS390XSRWconst(v *Value) bool {
+	v_0 := v.Args[0]
+	// match: (SRWconst x [0])
+	// result: x
+	for {
+		if auxIntToInt8(v.AuxInt) != 0 {
+			break
+		}
+		x := v_0
+		v.copyOf(x)
+		return true
+	}
+	return false
+}
 func rewriteValueS390X_OpS390XSTM2(v *Value) bool {
 	v_3 := v.Args[3]
 	v_2 := v.Args[2]
diff --git a/src/cmd/compile/internal/test/mulconst_test.go b/src/cmd/compile/internal/test/mulconst_test.go
new file mode 100644
index 0000000..314cab3
--- /dev/null
+++ b/src/cmd/compile/internal/test/mulconst_test.go
@@ -0,0 +1,242 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package test
+
+import "testing"
+
+// Benchmark multiplication of an integer by various constants.
+//
+// The comment above each sub-benchmark provides an example of how the
+// target multiplication operation might be implemented using shift
+// (multiplication by a power of 2), addition and subtraction
+// operations. It is platform-dependent whether these transformations
+// are actually applied.
+
+var (
+	mulSinkI32 int32
+	mulSinkI64 int64
+	mulSinkU32 uint32
+	mulSinkU64 uint64
+)
+
+func BenchmarkMulconstI32(b *testing.B) {
+	// 3x = 2x + x
+	b.Run("3", func(b *testing.B) {
+		x := int32(1)
+		for i := 0; i < b.N; i++ {
+			x *= 3
+		}
+		mulSinkI32 = x
+	})
+	// 5x = 4x + x
+	b.Run("5", func(b *testing.B) {
+		x := int32(1)
+		for i := 0; i < b.N; i++ {
+			x *= 5
+		}
+		mulSinkI32 = x
+	})
+	// 12x = 8x + 4x
+	b.Run("12", func(b *testing.B) {
+		x := int32(1)
+		for i := 0; i < b.N; i++ {
+			x *= 12
+		}
+		mulSinkI32 = x
+	})
+	// 120x = 128x - 8x
+	b.Run("120", func(b *testing.B) {
+		x := int32(1)
+		for i := 0; i < b.N; i++ {
+			x *= 120
+		}
+		mulSinkI32 = x
+	})
+	// -120x = 8x - 120x
+	b.Run("-120", func(b *testing.B) {
+		x := int32(1)
+		for i := 0; i < b.N; i++ {
+			x *= -120
+		}
+		mulSinkI32 = x
+	})
+	// 65537x = 65536x + x
+	b.Run("65537", func(b *testing.B) {
+		x := int32(1)
+		for i := 0; i < b.N; i++ {
+			x *= 65537
+		}
+		mulSinkI32 = x
+	})
+	// 65538x = 65536x + 2x
+	b.Run("65538", func(b *testing.B) {
+		x := int32(1)
+		for i := 0; i < b.N; i++ {
+			x *= 65538
+		}
+		mulSinkI32 = x
+	})
+}
+
+func BenchmarkMulconstI64(b *testing.B) {
+	// 3x = 2x + x
+	b.Run("3", func(b *testing.B) {
+		x := int64(1)
+		for i := 0; i < b.N; i++ {
+			x *= 3
+		}
+		mulSinkI64 = x
+	})
+	// 5x = 4x + x
+	b.Run("5", func(b *testing.B) {
+		x := int64(1)
+		for i := 0; i < b.N; i++ {
+			x *= 5
+		}
+		mulSinkI64 = x
+	})
+	// 12x = 8x + 4x
+	b.Run("12", func(b *testing.B) {
+		x := int64(1)
+		for i := 0; i < b.N; i++ {
+			x *= 12
+		}
+		mulSinkI64 = x
+	})
+	// 120x = 128x - 8x
+	b.Run("120", func(b *testing.B) {
+		x := int64(1)
+		for i := 0; i < b.N; i++ {
+			x *= 120
+		}
+		mulSinkI64 = x
+	})
+	// -120x = 8x - 120x
+	b.Run("-120", func(b *testing.B) {
+		x := int64(1)
+		for i := 0; i < b.N; i++ {
+			x *= -120
+		}
+		mulSinkI64 = x
+	})
+	// 65537x = 65536x + x
+	b.Run("65537", func(b *testing.B) {
+		x := int64(1)
+		for i := 0; i < b.N; i++ {
+			x *= 65537
+		}
+		mulSinkI64 = x
+	})
+	// 65538x = 65536x + 2x
+	b.Run("65538", func(b *testing.B) {
+		x := int64(1)
+		for i := 0; i < b.N; i++ {
+			x *= 65538
+		}
+		mulSinkI64 = x
+	})
+}
+
+func BenchmarkMulconstU32(b *testing.B) {
+	// 3x = 2x + x
+	b.Run("3", func(b *testing.B) {
+		x := uint32(1)
+		for i := 0; i < b.N; i++ {
+			x *= 3
+		}
+		mulSinkU32 = x
+	})
+	// 5x = 4x + x
+	b.Run("5", func(b *testing.B) {
+		x := uint32(1)
+		for i := 0; i < b.N; i++ {
+			x *= 5
+		}
+		mulSinkU32 = x
+	})
+	// 12x = 8x + 4x
+	b.Run("12", func(b *testing.B) {
+		x := uint32(1)
+		for i := 0; i < b.N; i++ {
+			x *= 12
+		}
+		mulSinkU32 = x
+	})
+	// 120x = 128x - 8x
+	b.Run("120", func(b *testing.B) {
+		x := uint32(1)
+		for i := 0; i < b.N; i++ {
+			x *= 120
+		}
+		mulSinkU32 = x
+	})
+	// 65537x = 65536x + x
+	b.Run("65537", func(b *testing.B) {
+		x := uint32(1)
+		for i := 0; i < b.N; i++ {
+			x *= 65537
+		}
+		mulSinkU32 = x
+	})
+	// 65538x = 65536x + 2x
+	b.Run("65538", func(b *testing.B) {
+		x := uint32(1)
+		for i := 0; i < b.N; i++ {
+			x *= 65538
+		}
+		mulSinkU32 = x
+	})
+}
+
+func BenchmarkMulconstU64(b *testing.B) {
+	// 3x = 2x + x
+	b.Run("3", func(b *testing.B) {
+		x := uint64(1)
+		for i := 0; i < b.N; i++ {
+			x *= 3
+		}
+		mulSinkU64 = x
+	})
+	// 5x = 4x + x
+	b.Run("5", func(b *testing.B) {
+		x := uint64(1)
+		for i := 0; i < b.N; i++ {
+			x *= 5
+		}
+		mulSinkU64 = x
+	})
+	// 12x = 8x + 4x
+	b.Run("12", func(b *testing.B) {
+		x := uint64(1)
+		for i := 0; i < b.N; i++ {
+			x *= 12
+		}
+		mulSinkU64 = x
+	})
+	// 120x = 128x - 8x
+	b.Run("120", func(b *testing.B) {
+		x := uint64(1)
+		for i := 0; i < b.N; i++ {
+			x *= 120
+		}
+		mulSinkU64 = x
+	})
+	// 65537x = 65536x + x
+	b.Run("65537", func(b *testing.B) {
+		x := uint64(1)
+		for i := 0; i < b.N; i++ {
+			x *= 65537
+		}
+		mulSinkU64 = x
+	})
+	// 65538x = 65536x + 2x
+	b.Run("65538", func(b *testing.B) {
+		x := uint64(1)
+		for i := 0; i < b.N; i++ {
+			x *= 65538
+		}
+		mulSinkU64 = x
+	})
+}
diff --git a/test/codegen/arithmetic.go b/test/codegen/arithmetic.go
index 8f25974..9f30ec8 100644
--- a/test/codegen/arithmetic.go
+++ b/test/codegen/arithmetic.go
@@ -71,9 +71,15 @@
 	// 386:`SHLL\t[$]5`,`LEAL\t\(.*\)\(.*\*2\),`,-`IMULL`
 	// arm64:`LSL\t[$]5`,`ADD\sR[0-9]+<<1,\sR[0-9]+`,-`MUL`
 	// arm:`SLL\t[$]5`,`ADD\sR[0-9]+<<1,\sR[0-9]+`,-`MUL`
+	// s390x:`SLD\t[$]5`,`SLD\t[$]6`,-`MULLD`
 	return n * 96
 }
 
+func Mul_n120(n int) int {
+	// s390x:`SLD\t[$]3`,`SLD\t[$]7`,-`MULLD`
+	return n * -120
+}
+
 func MulMemSrc(a []uint32, b []float32) {
 	// 386:`IMULL\s4\([A-Z]+\),\s[A-Z]+`
 	a[0] *= a[1]