cmd/compile: use BMI1 instructions for GOAMD64=v3 and higher
BMI1 includes four instructions (ANDN, BLSI, BLSMSK, BLSR) that are
easy to peephole-optimize, and which GCC seems to favor whenever they
are available and applicable.
Updates #45453.
Change-Id: I0274184057058f5c579e5bc3ea9c414396d3cf46
Reviewed-on: https://go-review.googlesource.com/c/go/+/351130
Run-TryBot: Matthew Dempsky <mdempsky@google.com>
Trust: Matthew Dempsky <mdempsky@google.com>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
diff --git a/src/cmd/compile/internal/amd64/ssa.go b/src/cmd/compile/internal/amd64/ssa.go
index 30131bd..68266d3 100644
--- a/src/cmd/compile/internal/amd64/ssa.go
+++ b/src/cmd/compile/internal/amd64/ssa.go
@@ -263,6 +263,23 @@
p.To.Reg = lo
p.SetFrom3Reg(hi)
+ case ssa.OpAMD64BLSIQ, ssa.OpAMD64BLSIL,
+ ssa.OpAMD64BLSMSKQ, ssa.OpAMD64BLSMSKL,
+ ssa.OpAMD64BLSRQ, ssa.OpAMD64BLSRL:
+ p := s.Prog(v.Op.Asm())
+ p.From.Type = obj.TYPE_REG
+ p.From.Reg = v.Args[0].Reg()
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = v.Reg()
+
+ case ssa.OpAMD64ANDNQ, ssa.OpAMD64ANDNL:
+ p := s.Prog(v.Op.Asm())
+ p.From.Type = obj.TYPE_REG
+ p.From.Reg = v.Args[0].Reg()
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = v.Reg()
+ p.SetFrom3Reg(v.Args[1].Reg())
+
case ssa.OpAMD64DIVQU, ssa.OpAMD64DIVLU, ssa.OpAMD64DIVWU:
// Arg[0] (the dividend) is in AX.
// Arg[1] (the divisor) can be in any other register.
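A note on the ANDN case above: ANDN is a three-operand VEX
instruction computing dest = ^src1 & src2, while the SSA op is
defined as arg0 &^ arg1 (see AMD64Ops.go below). The complemented
operand (arg1) is therefore supplied via SetFrom3Reg, with arg0 in
From. Assuming I have the assembler's operand order right, the
emitted form reads like:

    ANDNQ R8, R9, AX    // AX = R8 &^ R9 (registers chosen for illustration)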
diff --git a/src/cmd/compile/internal/ssa/gen/AMD64.rules b/src/cmd/compile/internal/ssa/gen/AMD64.rules
index bfed3bc..edb1a48 100644
--- a/src/cmd/compile/internal/ssa/gen/AMD64.rules
+++ b/src/cmd/compile/internal/ssa/gen/AMD64.rules
@@ -639,6 +639,7 @@
// Recognize bit clearing: a &^= 1<<b
(AND(Q|L) (NOT(Q|L) (SHL(Q|L) (MOV(Q|L)const [1]) y)) x) => (BTR(Q|L) x y)
+(ANDN(Q|L) x (SHL(Q|L) (MOV(Q|L)const [1]) y)) => (BTR(Q|L) x y)
(ANDQconst [c] x) && isUint64PowerOfTwo(int64(^c)) && uint64(^c) >= 128
=> (BTRQconst [int8(log32(^c))] x)
(ANDLconst [c] x) && isUint32PowerOfTwo(int64(^c)) && uint64(^c) >= 128
@@ -2204,3 +2205,9 @@
// Prefetch instructions
(PrefetchCache ...) => (PrefetchT0 ...)
(PrefetchCacheStreamed ...) => (PrefetchNTA ...)
+
+// CPUID feature: BMI1.
+(AND(Q|L) x (NOT(Q|L) y)) && buildcfg.GOAMD64 >= 3 => (ANDN(Q|L) x y)
+(AND(Q|L) x (NEG(Q|L) x)) && buildcfg.GOAMD64 >= 3 => (BLSI(Q|L) x)
+(XOR(Q|L) x (ADD(Q|L)const [-1] x)) && buildcfg.GOAMD64 >= 3 => (BLSMSK(Q|L) x)
+(AND(Q|L) x (ADD(Q|L)const [-1] x)) && buildcfg.GOAMD64 >= 3 => (BLSR(Q|L) x)
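The ANDN => BTR rule added up in the bit-clearing block exists
because the AND+NOT => ANDN rule here would otherwise hide the
established BTR pattern: once the AND of a NOT is rewritten to ANDN,
the original BTR rule no longer matches, so a follow-up rule on the
ANDN form restores it. In Go terms (illustrative):

    x &^ (1 << b)   // should still compile to BTRQ, not SHLQ+ANDNQ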
diff --git a/src/cmd/compile/internal/ssa/gen/AMD64Ops.go b/src/cmd/compile/internal/ssa/gen/AMD64Ops.go
index 51cbf5f..6e4c514 100644
--- a/src/cmd/compile/internal/ssa/gen/AMD64Ops.go
+++ b/src/cmd/compile/internal/ssa/gen/AMD64Ops.go
@@ -908,6 +908,16 @@
// Do prefetch arg0 address. arg0=addr, arg1=memory. Instruction variant selects locality hint
{name: "PrefetchT0", argLength: 2, reg: prefreg, asm: "PREFETCHT0", hasSideEffects: true},
{name: "PrefetchNTA", argLength: 2, reg: prefreg, asm: "PREFETCHNTA", hasSideEffects: true},
+
+ // CPUID feature: BMI1.
+ {name: "ANDNQ", argLength: 2, reg: gp21, asm: "ANDNQ", clobberFlags: true}, // arg0 &^ arg1
+ {name: "ANDNL", argLength: 2, reg: gp21, asm: "ANDNL", clobberFlags: true}, // arg0 &^ arg1
+ {name: "BLSIQ", argLength: 1, reg: gp11, asm: "BLSIQ", clobberFlags: true}, // arg0 & -arg0
+ {name: "BLSIL", argLength: 1, reg: gp11, asm: "BLSIL", clobberFlags: true}, // arg0 & -arg0
+ {name: "BLSMSKQ", argLength: 1, reg: gp11, asm: "BLSMSKQ", clobberFlags: true}, // arg0 ^ (arg0 - 1)
+ {name: "BLSMSKL", argLength: 1, reg: gp11, asm: "BLSMSKL", clobberFlags: true}, // arg0 ^ (arg0 - 1)
+ {name: "BLSRQ", argLength: 1, reg: gp11, asm: "BLSRQ", clobberFlags: true}, // arg0 & (arg0 - 1)
+ {name: "BLSRL", argLength: 1, reg: gp11, asm: "BLSRL", clobberFlags: true}, // arg0 & (arg0 - 1)
}
var AMD64blocks = []blockData{
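All eight ops use the stock one- and two-input general-purpose
register shapes and set clobberFlags, since the BMI1 instructions
write the arithmetic flags. For reference, gp11 and gp21 are defined
near the top of AMD64Ops.go, approximately (quoted from memory):

    gp11 = regInfo{inputs: []regMask{gp}, outputs: gponly}
    gp21 = regInfo{inputs: []regMask{gp, gp}, outputs: gponly}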
diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go
index ceb0a24..fed3bc3 100644
--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go
@@ -1033,6 +1033,14 @@
OpAMD64ORLlock
OpAMD64PrefetchT0
OpAMD64PrefetchNTA
+ OpAMD64ANDNQ
+ OpAMD64ANDNL
+ OpAMD64BLSIQ
+ OpAMD64BLSIL
+ OpAMD64BLSMSKQ
+ OpAMD64BLSMSKL
+ OpAMD64BLSRQ
+ OpAMD64BLSRL
OpARMADD
OpARMADDconst
@@ -13628,6 +13636,120 @@
},
},
},
+ {
+ name: "ANDNQ",
+ argLen: 2,
+ clobberFlags: true,
+ asm: x86.AANDNQ,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
+ {1, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
+ },
+ outputs: []outputInfo{
+ {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
+ },
+ },
+ },
+ {
+ name: "ANDNL",
+ argLen: 2,
+ clobberFlags: true,
+ asm: x86.AANDNL,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
+ {1, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
+ },
+ outputs: []outputInfo{
+ {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
+ },
+ },
+ },
+ {
+ name: "BLSIQ",
+ argLen: 1,
+ clobberFlags: true,
+ asm: x86.ABLSIQ,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
+ },
+ outputs: []outputInfo{
+ {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
+ },
+ },
+ },
+ {
+ name: "BLSIL",
+ argLen: 1,
+ clobberFlags: true,
+ asm: x86.ABLSIL,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
+ },
+ outputs: []outputInfo{
+ {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
+ },
+ },
+ },
+ {
+ name: "BLSMSKQ",
+ argLen: 1,
+ clobberFlags: true,
+ asm: x86.ABLSMSKQ,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
+ },
+ outputs: []outputInfo{
+ {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
+ },
+ },
+ },
+ {
+ name: "BLSMSKL",
+ argLen: 1,
+ clobberFlags: true,
+ asm: x86.ABLSMSKL,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
+ },
+ outputs: []outputInfo{
+ {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
+ },
+ },
+ },
+ {
+ name: "BLSRQ",
+ argLen: 1,
+ clobberFlags: true,
+ asm: x86.ABLSRQ,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
+ },
+ outputs: []outputInfo{
+ {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
+ },
+ },
+ },
+ {
+ name: "BLSRL",
+ argLen: 1,
+ clobberFlags: true,
+ asm: x86.ABLSRL,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
+ },
+ outputs: []outputInfo{
+ {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
+ },
+ },
+ },
{
name: "ADD",
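The opGen.go tables above and the rewriteAMD64.go matchers below are
machine-generated from AMD64Ops.go and AMD64.rules; they are
regenerated rather than edited by hand (if memory serves, by running
the generator from the ssa/gen directory):

    cd src/cmd/compile/internal/ssa/gen && go run *.go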
diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go
index e20161c..906260f 100644
--- a/src/cmd/compile/internal/ssa/rewriteAMD64.go
+++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go
@@ -3,6 +3,7 @@
package ssa
+import "internal/buildcfg"
import "math"
import "cmd/internal/obj"
import "cmd/compile/internal/types"
@@ -53,6 +54,10 @@
return rewriteValueAMD64_OpAMD64ANDLload(v)
case OpAMD64ANDLmodify:
return rewriteValueAMD64_OpAMD64ANDLmodify(v)
+ case OpAMD64ANDNL:
+ return rewriteValueAMD64_OpAMD64ANDNL(v)
+ case OpAMD64ANDNQ:
+ return rewriteValueAMD64_OpAMD64ANDNQ(v)
case OpAMD64ANDQ:
return rewriteValueAMD64_OpAMD64ANDQ(v)
case OpAMD64ANDQconst:
@@ -2759,6 +2764,55 @@
}
break
}
+ // match: (ANDL x (NOTL y))
+ // cond: buildcfg.GOAMD64 >= 3
+ // result: (ANDNL x y)
+ for {
+ for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+ x := v_0
+ if v_1.Op != OpAMD64NOTL {
+ continue
+ }
+ y := v_1.Args[0]
+ if !(buildcfg.GOAMD64 >= 3) {
+ continue
+ }
+ v.reset(OpAMD64ANDNL)
+ v.AddArg2(x, y)
+ return true
+ }
+ break
+ }
+ // match: (ANDL x (NEGL x))
+ // cond: buildcfg.GOAMD64 >= 3
+ // result: (BLSIL x)
+ for {
+ for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+ x := v_0
+ if v_1.Op != OpAMD64NEGL || x != v_1.Args[0] || !(buildcfg.GOAMD64 >= 3) {
+ continue
+ }
+ v.reset(OpAMD64BLSIL)
+ v.AddArg(x)
+ return true
+ }
+ break
+ }
+ // match: (ANDL x (ADDLconst [-1] x))
+ // cond: buildcfg.GOAMD64 >= 3
+ // result: (BLSRL x)
+ for {
+ for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+ x := v_0
+ if v_1.Op != OpAMD64ADDLconst || auxIntToInt32(v_1.AuxInt) != -1 || x != v_1.Args[0] || !(buildcfg.GOAMD64 >= 3) {
+ continue
+ }
+ v.reset(OpAMD64BLSRL)
+ v.AddArg(x)
+ return true
+ }
+ break
+ }
return false
}
func rewriteValueAMD64_OpAMD64ANDLconst(v *Value) bool {
@@ -3037,6 +3091,48 @@
}
return false
}
+func rewriteValueAMD64_OpAMD64ANDNL(v *Value) bool {
+ v_1 := v.Args[1]
+ v_0 := v.Args[0]
+ // match: (ANDNL x (SHLL (MOVLconst [1]) y))
+ // result: (BTRL x y)
+ for {
+ x := v_0
+ if v_1.Op != OpAMD64SHLL {
+ break
+ }
+ y := v_1.Args[1]
+ v_1_0 := v_1.Args[0]
+ if v_1_0.Op != OpAMD64MOVLconst || auxIntToInt32(v_1_0.AuxInt) != 1 {
+ break
+ }
+ v.reset(OpAMD64BTRL)
+ v.AddArg2(x, y)
+ return true
+ }
+ return false
+}
+func rewriteValueAMD64_OpAMD64ANDNQ(v *Value) bool {
+ v_1 := v.Args[1]
+ v_0 := v.Args[0]
+ // match: (ANDNQ x (SHLQ (MOVQconst [1]) y))
+ // result: (BTRQ x y)
+ for {
+ x := v_0
+ if v_1.Op != OpAMD64SHLQ {
+ break
+ }
+ y := v_1.Args[1]
+ v_1_0 := v_1.Args[0]
+ if v_1_0.Op != OpAMD64MOVQconst || auxIntToInt64(v_1_0.AuxInt) != 1 {
+ break
+ }
+ v.reset(OpAMD64BTRQ)
+ v.AddArg2(x, y)
+ return true
+ }
+ return false
+}
func rewriteValueAMD64_OpAMD64ANDQ(v *Value) bool {
v_1 := v.Args[1]
v_0 := v.Args[0]
@@ -3138,6 +3234,55 @@
}
break
}
+ // match: (ANDQ x (NOTQ y))
+ // cond: buildcfg.GOAMD64 >= 3
+ // result: (ANDNQ x y)
+ for {
+ for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+ x := v_0
+ if v_1.Op != OpAMD64NOTQ {
+ continue
+ }
+ y := v_1.Args[0]
+ if !(buildcfg.GOAMD64 >= 3) {
+ continue
+ }
+ v.reset(OpAMD64ANDNQ)
+ v.AddArg2(x, y)
+ return true
+ }
+ break
+ }
+ // match: (ANDQ x (NEGQ x))
+ // cond: buildcfg.GOAMD64 >= 3
+ // result: (BLSIQ x)
+ for {
+ for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+ x := v_0
+ if v_1.Op != OpAMD64NEGQ || x != v_1.Args[0] || !(buildcfg.GOAMD64 >= 3) {
+ continue
+ }
+ v.reset(OpAMD64BLSIQ)
+ v.AddArg(x)
+ return true
+ }
+ break
+ }
+ // match: (ANDQ x (ADDQconst [-1] x))
+ // cond: buildcfg.GOAMD64 >= 3
+ // result: (BLSRQ x)
+ for {
+ for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+ x := v_0
+ if v_1.Op != OpAMD64ADDQconst || auxIntToInt32(v_1.AuxInt) != -1 || x != v_1.Args[0] || !(buildcfg.GOAMD64 >= 3) {
+ continue
+ }
+ v.reset(OpAMD64BLSRQ)
+ v.AddArg(x)
+ return true
+ }
+ break
+ }
return false
}
func rewriteValueAMD64_OpAMD64ANDQconst(v *Value) bool {
@@ -26474,6 +26619,21 @@
}
break
}
+ // match: (XORL x (ADDLconst [-1] x))
+ // cond: buildcfg.GOAMD64 >= 3
+ // result: (BLSMSKL x)
+ for {
+ for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+ x := v_0
+ if v_1.Op != OpAMD64ADDLconst || auxIntToInt32(v_1.AuxInt) != -1 || x != v_1.Args[0] || !(buildcfg.GOAMD64 >= 3) {
+ continue
+ }
+ v.reset(OpAMD64BLSMSKL)
+ v.AddArg(x)
+ return true
+ }
+ break
+ }
return false
}
func rewriteValueAMD64_OpAMD64XORLconst(v *Value) bool {
@@ -26950,6 +27110,21 @@
}
break
}
+ // match: (XORQ x (ADDQconst [-1] x))
+ // cond: buildcfg.GOAMD64 >= 3
+ // result: (BLSMSKQ x)
+ for {
+ for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+ x := v_0
+ if v_1.Op != OpAMD64ADDQconst || auxIntToInt32(v_1.AuxInt) != -1 || x != v_1.Args[0] || !(buildcfg.GOAMD64 >= 3) {
+ continue
+ }
+ v.reset(OpAMD64BLSMSKQ)
+ v.AddArg(x)
+ return true
+ }
+ break
+ }
return false
}
func rewriteValueAMD64_OpAMD64XORQconst(v *Value) bool {
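One detail of the generated matchers worth noting: AND and XOR are
commutative, so each new pattern is wrapped in a two-iteration loop
(for _i0 := 0; _i0 <= 1; ...) that swaps v_0 and v_1 on the second
pass, letting a single rule match the operands in either order.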
diff --git a/test/codegen/bmi.go b/test/codegen/bmi.go
new file mode 100644
index 0000000..0c25e0b
--- /dev/null
+++ b/test/codegen/bmi.go
@@ -0,0 +1,47 @@
+// asmcheck
+
+// Copyright 2021 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package codegen
+
+func andn64(x, y int64) int64 {
+ // amd64/v3:"ANDNQ"
+ return x &^ y
+}
+
+func andn32(x, y int32) int32 {
+ // amd64/v3:"ANDNL"
+ return x &^ y
+}
+
+func blsi64(x int64) int64 {
+ // amd64/v3:"BLSIQ"
+ return x & -x
+}
+
+func blsi32(x int32) int32 {
+ // amd64/v3:"BLSIL"
+ return x & -x
+}
+
+func blsmsk64(x int64) int64 {
+ // amd64/v3:"BLSMSKQ"
+ return x ^ (x - 1)
+}
+
+func blsmsk32(x int32) int32 {
+ // amd64/v3:"BLSMSKL"
+ return x ^ (x - 1)
+}
+
+func blsr64(x int64) int64 {
+ // amd64/v3:"BLSRQ"
+ return x & (x - 1)
+}
+
+func blsr32(x int32) int32 {
+ // amd64/v3:"BLSRL"
+ return x & (x - 1)
+}
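The asmcheck comments assert that each function compiles to the
expected instruction when building for amd64/v3 (GOAMD64=v3 or
higher). If memory serves, the codegen checks run through the usual
test harness, e.g.:

    cd test && go run run.go -- codegen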