cmd/compile: add SARX instruction for GOAMD64>=3
name old time/op new time/op delta
ShiftArithmeticRight-8 0.68ns ± 5% 0.30ns ± 6% -56.14% (p=0.000 n=10+10)
Change-Id: I052a0d7b9e6526d526276444e588b0cc288beff4
Reviewed-on: https://go-review.googlesource.com/c/go/+/399055
Run-TryBot: Wayne Zuo <wdvxdr@golangcn.org>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
Auto-Submit: Keith Randall <khr@golang.org>
Reviewed-by: Keith Randall <khr@google.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
diff --git a/src/cmd/compile/internal/amd64/ssa.go b/src/cmd/compile/internal/amd64/ssa.go
index 2dae55b..9fde775 100644
--- a/src/cmd/compile/internal/amd64/ssa.go
+++ b/src/cmd/compile/internal/amd64/ssa.go
@@ -282,6 +282,10 @@
p.To.Reg = v.Reg()
p.SetFrom3Reg(v.Args[1].Reg())
+ case ssa.OpAMD64SARXL, ssa.OpAMD64SARXQ:
+ p := opregreg(s, v.Op.Asm(), v.Reg(), v.Args[1].Reg())
+ p.SetFrom3Reg(v.Args[0].Reg())
+
case ssa.OpAMD64SHLXLload, ssa.OpAMD64SHLXQload,
ssa.OpAMD64SHRXLload, ssa.OpAMD64SHRXQload:
p := opregreg(s, v.Op.Asm(), v.Reg(), v.Args[1].Reg())
diff --git a/src/cmd/compile/internal/amd64/versions_test.go b/src/cmd/compile/internal/amd64/versions_test.go
index 11b4d84..248f070 100644
--- a/src/cmd/compile/internal/amd64/versions_test.go
+++ b/src/cmd/compile/internal/amd64/versions_test.go
@@ -239,6 +239,7 @@
// native objdump doesn't include [QL] on linux.
"popcnt": {"popcntq", "popcntl", "popcnt"},
"bmi1": {"andnq", "andnl", "andn", "blsiq", "blsil", "blsi", "blsmskq", "blsmskl", "blsmsk", "blsrq", "blsrl", "blsr", "tzcntq", "tzcntl", "tzcnt"},
+ "bmi2": {"sarxq", "sarxl", "sarx", "shlxq", "shlxl", "shlx", "shrxq", "shrxl", "shrx"},
"sse41": {"roundsd"},
"fma": {"vfmadd231sd"},
"movbe": {"movbeqq", "movbeq", "movbell", "movbel", "movbe"},
diff --git a/src/cmd/compile/internal/ssa/gen/AMD64.rules b/src/cmd/compile/internal/ssa/gen/AMD64.rules
index d50bdf2..3a9de8d 100644
--- a/src/cmd/compile/internal/ssa/gen/AMD64.rules
+++ b/src/cmd/compile/internal/ssa/gen/AMD64.rules
@@ -206,6 +206,9 @@
(Rsh16x(64|32|16|8) x y) && shiftIsBounded(v) => (SARW x y)
(Rsh8x(64|32|16|8) x y) && shiftIsBounded(v) => (SARB x y)
+// Prefer SARX instruction because it has less register restriction on the shift input.
+(SAR(Q|L) x y) && buildcfg.GOAMD64 >= 3 => (SARX(Q|L) x y)
+
// Lowering integer comparisons
(Less(64|32|16|8) x y) => (SETL (CMP(Q|L|W|B) x y))
(Less(64|32|16|8)U x y) => (SETB (CMP(Q|L|W|B) x y))
@@ -803,28 +806,29 @@
(SARL x (MOV(Q|L)const [c])) => (SARLconst [int8(c&31)] x)
(SARW x (MOV(Q|L)const [c])) => (SARWconst [int8(min(int64(c)&31,15))] x)
(SARB x (MOV(Q|L)const [c])) => (SARBconst [int8(min(int64(c)&31,7))] x)
-
+(SARXQ x (MOV(Q|L)const [c])) => (SARQconst [int8(c&63)] x)
+(SARXL x (MOV(Q|L)const [c])) => (SARLconst [int8(c&31)] x)
// Operations which don't affect the low 6/5 bits of the shift amount are NOPs.
-((SHLQ|SHRQ|SARQ) x (ADDQconst [c] y)) && c & 63 == 0 => ((SHLQ|SHRQ|SARQ) x y)
-((SHLQ|SHRQ|SARQ) x (NEGQ <t> (ADDQconst [c] y))) && c & 63 == 0 => ((SHLQ|SHRQ|SARQ) x (NEGQ <t> y))
-((SHLQ|SHRQ|SARQ) x (ANDQconst [c] y)) && c & 63 == 63 => ((SHLQ|SHRQ|SARQ) x y)
-((SHLQ|SHRQ|SARQ) x (NEGQ <t> (ANDQconst [c] y))) && c & 63 == 63 => ((SHLQ|SHRQ|SARQ) x (NEGQ <t> y))
+((SHLQ|SHRQ|SARQ|SARXQ) x (ADDQconst [c] y)) && c & 63 == 0 => ((SHLQ|SHRQ|SARQ|SARXQ) x y)
+((SHLQ|SHRQ|SARQ|SARXQ) x (NEGQ <t> (ADDQconst [c] y))) && c & 63 == 0 => ((SHLQ|SHRQ|SARQ|SARXQ) x (NEGQ <t> y))
+((SHLQ|SHRQ|SARQ|SARXQ) x (ANDQconst [c] y)) && c & 63 == 63 => ((SHLQ|SHRQ|SARQ|SARXQ) x y)
+((SHLQ|SHRQ|SARQ|SARXQ) x (NEGQ <t> (ANDQconst [c] y))) && c & 63 == 63 => ((SHLQ|SHRQ|SARQ|SARXQ) x (NEGQ <t> y))
-((SHLL|SHRL|SARL) x (ADDQconst [c] y)) && c & 31 == 0 => ((SHLL|SHRL|SARL) x y)
-((SHLL|SHRL|SARL) x (NEGQ <t> (ADDQconst [c] y))) && c & 31 == 0 => ((SHLL|SHRL|SARL) x (NEGQ <t> y))
-((SHLL|SHRL|SARL) x (ANDQconst [c] y)) && c & 31 == 31 => ((SHLL|SHRL|SARL) x y)
-((SHLL|SHRL|SARL) x (NEGQ <t> (ANDQconst [c] y))) && c & 31 == 31 => ((SHLL|SHRL|SARL) x (NEGQ <t> y))
+((SHLL|SHRL|SARL|SARXL) x (ADDQconst [c] y)) && c & 31 == 0 => ((SHLL|SHRL|SARL|SARXL) x y)
+((SHLL|SHRL|SARL|SARXL) x (NEGQ <t> (ADDQconst [c] y))) && c & 31 == 0 => ((SHLL|SHRL|SARL|SARXL) x (NEGQ <t> y))
+((SHLL|SHRL|SARL|SARXL) x (ANDQconst [c] y)) && c & 31 == 31 => ((SHLL|SHRL|SARL|SARXL) x y)
+((SHLL|SHRL|SARL|SARXL) x (NEGQ <t> (ANDQconst [c] y))) && c & 31 == 31 => ((SHLL|SHRL|SARL|SARXL) x (NEGQ <t> y))
-((SHLQ|SHRQ|SARQ) x (ADDLconst [c] y)) && c & 63 == 0 => ((SHLQ|SHRQ|SARQ) x y)
-((SHLQ|SHRQ|SARQ) x (NEGL <t> (ADDLconst [c] y))) && c & 63 == 0 => ((SHLQ|SHRQ|SARQ) x (NEGL <t> y))
-((SHLQ|SHRQ|SARQ) x (ANDLconst [c] y)) && c & 63 == 63 => ((SHLQ|SHRQ|SARQ) x y)
-((SHLQ|SHRQ|SARQ) x (NEGL <t> (ANDLconst [c] y))) && c & 63 == 63 => ((SHLQ|SHRQ|SARQ) x (NEGL <t> y))
+((SHLQ|SHRQ|SARQ|SARXQ) x (ADDLconst [c] y)) && c & 63 == 0 => ((SHLQ|SHRQ|SARQ|SARXQ) x y)
+((SHLQ|SHRQ|SARQ|SARXQ) x (NEGL <t> (ADDLconst [c] y))) && c & 63 == 0 => ((SHLQ|SHRQ|SARQ|SARXQ) x (NEGL <t> y))
+((SHLQ|SHRQ|SARQ|SARXQ) x (ANDLconst [c] y)) && c & 63 == 63 => ((SHLQ|SHRQ|SARQ|SARXQ) x y)
+((SHLQ|SHRQ|SARQ|SARXQ) x (NEGL <t> (ANDLconst [c] y))) && c & 63 == 63 => ((SHLQ|SHRQ|SARQ|SARXQ) x (NEGL <t> y))
-((SHLL|SHRL|SARL) x (ADDLconst [c] y)) && c & 31 == 0 => ((SHLL|SHRL|SARL) x y)
-((SHLL|SHRL|SARL) x (NEGL <t> (ADDLconst [c] y))) && c & 31 == 0 => ((SHLL|SHRL|SARL) x (NEGL <t> y))
-((SHLL|SHRL|SARL) x (ANDLconst [c] y)) && c & 31 == 31 => ((SHLL|SHRL|SARL) x y)
-((SHLL|SHRL|SARL) x (NEGL <t> (ANDLconst [c] y))) && c & 31 == 31 => ((SHLL|SHRL|SARL) x (NEGL <t> y))
+((SHLL|SHRL|SARL|SARXL) x (ADDLconst [c] y)) && c & 31 == 0 => ((SHLL|SHRL|SARL|SARXL) x y)
+((SHLL|SHRL|SARL|SARXL) x (NEGL <t> (ADDLconst [c] y))) && c & 31 == 0 => ((SHLL|SHRL|SARL|SARXL) x (NEGL <t> y))
+((SHLL|SHRL|SARL|SARXL) x (ANDLconst [c] y)) && c & 31 == 31 => ((SHLL|SHRL|SARL|SARXL) x y)
+((SHLL|SHRL|SARL|SARXL) x (NEGL <t> (ANDLconst [c] y))) && c & 31 == 31 => ((SHLL|SHRL|SARL|SARXL) x (NEGL <t> y))
// Constant rotate instructions
((ADDQ|ORQ|XORQ) (SHLQconst x [c]) (SHRQconst x [d])) && d==64-c => (ROLQconst x [c])
diff --git a/src/cmd/compile/internal/ssa/gen/AMD64Ops.go b/src/cmd/compile/internal/ssa/gen/AMD64Ops.go
index ab84504..2eec6e0 100644
--- a/src/cmd/compile/internal/ssa/gen/AMD64Ops.go
+++ b/src/cmd/compile/internal/ssa/gen/AMD64Ops.go
@@ -953,6 +953,9 @@
{name: "MOVBEQstoreidx8", argLength: 4, reg: gpstoreidx, asm: "MOVBEQ", scale: 8, aux: "SymOff", symEffect: "Write"}, // swap and store 8 bytes in arg2 to arg0+8*arg1+auxint+aux. arg3=mem
// CPUID feature: BMI2.
+ {name: "SARXQ", argLength: 2, reg: gp21, asm: "SARXQ"}, // signed arg0 >> arg1, shift amount is mod 64
+ {name: "SARXL", argLength: 2, reg: gp21, asm: "SARXL"}, // signed int32(arg0) >> arg1, shift amount is mod 32
+
{name: "SHLXLload", argLength: 3, reg: gp21shxload, asm: "SHLXL", aux: "SymOff", typ: "Uint32", faultOnNilArg0: true, symEffect: "Read"}, // *(arg0+auxint+aux) << arg1, arg2=mem, shift amount is mod 32
{name: "SHLXQload", argLength: 3, reg: gp21shxload, asm: "SHLXQ", aux: "SymOff", typ: "Uint64", faultOnNilArg0: true, symEffect: "Read"}, // *(arg0+auxint+aux) << arg1, arg2=mem, shift amount is mod 64
{name: "SHRXLload", argLength: 3, reg: gp21shxload, asm: "SHRXL", aux: "SymOff", typ: "Uint32", faultOnNilArg0: true, symEffect: "Read"}, // unsigned *(arg0+auxint+aux) >> arg1, arg2=mem, shift amount is mod 32
diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go
index 1c941e8..976d887 100644
--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go
@@ -1062,6 +1062,8 @@
OpAMD64MOVBELstoreidx8
OpAMD64MOVBEQstoreidx1
OpAMD64MOVBEQstoreidx8
+ OpAMD64SARXQ
+ OpAMD64SARXL
OpAMD64SHLXLload
OpAMD64SHLXQload
OpAMD64SHRXLload
@@ -14118,6 +14120,34 @@
},
},
{
+ name: "SARXQ",
+ argLen: 2,
+ asm: x86.ASARXQ,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
+ {1, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
+ },
+ outputs: []outputInfo{
+ {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
+ },
+ },
+ },
+ {
+ name: "SARXL",
+ argLen: 2,
+ asm: x86.ASARXL,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
+ {1, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
+ },
+ outputs: []outputInfo{
+ {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
+ },
+ },
+ },
+ {
name: "SHLXLload",
auxType: auxSymOff,
argLen: 3,
diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go
index addfaaa..81f1f1a 100644
--- a/src/cmd/compile/internal/ssa/rewriteAMD64.go
+++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go
@@ -382,6 +382,10 @@
return rewriteValueAMD64_OpAMD64SARW(v)
case OpAMD64SARWconst:
return rewriteValueAMD64_OpAMD64SARWconst(v)
+ case OpAMD64SARXL:
+ return rewriteValueAMD64_OpAMD64SARXL(v)
+ case OpAMD64SARXQ:
+ return rewriteValueAMD64_OpAMD64SARXQ(v)
case OpAMD64SBBLcarrymask:
return rewriteValueAMD64_OpAMD64SBBLcarrymask(v)
case OpAMD64SBBQ:
@@ -19844,6 +19848,19 @@
v_1 := v.Args[1]
v_0 := v.Args[0]
b := v.Block
+ // match: (SARL x y)
+ // cond: buildcfg.GOAMD64 >= 3
+ // result: (SARXL x y)
+ for {
+ x := v_0
+ y := v_1
+ if !(buildcfg.GOAMD64 >= 3) {
+ break
+ }
+ v.reset(OpAMD64SARXL)
+ v.AddArg2(x, y)
+ return true
+ }
// match: (SARL x (MOVQconst [c]))
// result: (SARLconst [int8(c&31)] x)
for {
@@ -20066,6 +20083,19 @@
v_1 := v.Args[1]
v_0 := v.Args[0]
b := v.Block
+ // match: (SARQ x y)
+ // cond: buildcfg.GOAMD64 >= 3
+ // result: (SARXQ x y)
+ for {
+ x := v_0
+ y := v_1
+ if !(buildcfg.GOAMD64 >= 3) {
+ break
+ }
+ v.reset(OpAMD64SARXQ)
+ v.AddArg2(x, y)
+ return true
+ }
// match: (SARQ x (MOVQconst [c]))
// result: (SARQconst [int8(c&63)] x)
for {
@@ -20341,6 +20371,398 @@
}
return false
}
+func rewriteValueAMD64_OpAMD64SARXL(v *Value) bool {
+ v_1 := v.Args[1]
+ v_0 := v.Args[0]
+ b := v.Block
+ // match: (SARXL x (MOVQconst [c]))
+ // result: (SARLconst [int8(c&31)] x)
+ for {
+ x := v_0
+ if v_1.Op != OpAMD64MOVQconst {
+ break
+ }
+ c := auxIntToInt64(v_1.AuxInt)
+ v.reset(OpAMD64SARLconst)
+ v.AuxInt = int8ToAuxInt(int8(c & 31))
+ v.AddArg(x)
+ return true
+ }
+ // match: (SARXL x (MOVLconst [c]))
+ // result: (SARLconst [int8(c&31)] x)
+ for {
+ x := v_0
+ if v_1.Op != OpAMD64MOVLconst {
+ break
+ }
+ c := auxIntToInt32(v_1.AuxInt)
+ v.reset(OpAMD64SARLconst)
+ v.AuxInt = int8ToAuxInt(int8(c & 31))
+ v.AddArg(x)
+ return true
+ }
+ // match: (SARXL x (ADDQconst [c] y))
+ // cond: c & 31 == 0
+ // result: (SARXL x y)
+ for {
+ x := v_0
+ if v_1.Op != OpAMD64ADDQconst {
+ break
+ }
+ c := auxIntToInt32(v_1.AuxInt)
+ y := v_1.Args[0]
+ if !(c&31 == 0) {
+ break
+ }
+ v.reset(OpAMD64SARXL)
+ v.AddArg2(x, y)
+ return true
+ }
+ // match: (SARXL x (NEGQ <t> (ADDQconst [c] y)))
+ // cond: c & 31 == 0
+ // result: (SARXL x (NEGQ <t> y))
+ for {
+ x := v_0
+ if v_1.Op != OpAMD64NEGQ {
+ break
+ }
+ t := v_1.Type
+ v_1_0 := v_1.Args[0]
+ if v_1_0.Op != OpAMD64ADDQconst {
+ break
+ }
+ c := auxIntToInt32(v_1_0.AuxInt)
+ y := v_1_0.Args[0]
+ if !(c&31 == 0) {
+ break
+ }
+ v.reset(OpAMD64SARXL)
+ v0 := b.NewValue0(v.Pos, OpAMD64NEGQ, t)
+ v0.AddArg(y)
+ v.AddArg2(x, v0)
+ return true
+ }
+ // match: (SARXL x (ANDQconst [c] y))
+ // cond: c & 31 == 31
+ // result: (SARXL x y)
+ for {
+ x := v_0
+ if v_1.Op != OpAMD64ANDQconst {
+ break
+ }
+ c := auxIntToInt32(v_1.AuxInt)
+ y := v_1.Args[0]
+ if !(c&31 == 31) {
+ break
+ }
+ v.reset(OpAMD64SARXL)
+ v.AddArg2(x, y)
+ return true
+ }
+ // match: (SARXL x (NEGQ <t> (ANDQconst [c] y)))
+ // cond: c & 31 == 31
+ // result: (SARXL x (NEGQ <t> y))
+ for {
+ x := v_0
+ if v_1.Op != OpAMD64NEGQ {
+ break
+ }
+ t := v_1.Type
+ v_1_0 := v_1.Args[0]
+ if v_1_0.Op != OpAMD64ANDQconst {
+ break
+ }
+ c := auxIntToInt32(v_1_0.AuxInt)
+ y := v_1_0.Args[0]
+ if !(c&31 == 31) {
+ break
+ }
+ v.reset(OpAMD64SARXL)
+ v0 := b.NewValue0(v.Pos, OpAMD64NEGQ, t)
+ v0.AddArg(y)
+ v.AddArg2(x, v0)
+ return true
+ }
+ // match: (SARXL x (ADDLconst [c] y))
+ // cond: c & 31 == 0
+ // result: (SARXL x y)
+ for {
+ x := v_0
+ if v_1.Op != OpAMD64ADDLconst {
+ break
+ }
+ c := auxIntToInt32(v_1.AuxInt)
+ y := v_1.Args[0]
+ if !(c&31 == 0) {
+ break
+ }
+ v.reset(OpAMD64SARXL)
+ v.AddArg2(x, y)
+ return true
+ }
+ // match: (SARXL x (NEGL <t> (ADDLconst [c] y)))
+ // cond: c & 31 == 0
+ // result: (SARXL x (NEGL <t> y))
+ for {
+ x := v_0
+ if v_1.Op != OpAMD64NEGL {
+ break
+ }
+ t := v_1.Type
+ v_1_0 := v_1.Args[0]
+ if v_1_0.Op != OpAMD64ADDLconst {
+ break
+ }
+ c := auxIntToInt32(v_1_0.AuxInt)
+ y := v_1_0.Args[0]
+ if !(c&31 == 0) {
+ break
+ }
+ v.reset(OpAMD64SARXL)
+ v0 := b.NewValue0(v.Pos, OpAMD64NEGL, t)
+ v0.AddArg(y)
+ v.AddArg2(x, v0)
+ return true
+ }
+ // match: (SARXL x (ANDLconst [c] y))
+ // cond: c & 31 == 31
+ // result: (SARXL x y)
+ for {
+ x := v_0
+ if v_1.Op != OpAMD64ANDLconst {
+ break
+ }
+ c := auxIntToInt32(v_1.AuxInt)
+ y := v_1.Args[0]
+ if !(c&31 == 31) {
+ break
+ }
+ v.reset(OpAMD64SARXL)
+ v.AddArg2(x, y)
+ return true
+ }
+ // match: (SARXL x (NEGL <t> (ANDLconst [c] y)))
+ // cond: c & 31 == 31
+ // result: (SARXL x (NEGL <t> y))
+ for {
+ x := v_0
+ if v_1.Op != OpAMD64NEGL {
+ break
+ }
+ t := v_1.Type
+ v_1_0 := v_1.Args[0]
+ if v_1_0.Op != OpAMD64ANDLconst {
+ break
+ }
+ c := auxIntToInt32(v_1_0.AuxInt)
+ y := v_1_0.Args[0]
+ if !(c&31 == 31) {
+ break
+ }
+ v.reset(OpAMD64SARXL)
+ v0 := b.NewValue0(v.Pos, OpAMD64NEGL, t)
+ v0.AddArg(y)
+ v.AddArg2(x, v0)
+ return true
+ }
+ return false
+}
+func rewriteValueAMD64_OpAMD64SARXQ(v *Value) bool {
+ v_1 := v.Args[1]
+ v_0 := v.Args[0]
+ b := v.Block
+ // match: (SARXQ x (MOVQconst [c]))
+ // result: (SARQconst [int8(c&63)] x)
+ for {
+ x := v_0
+ if v_1.Op != OpAMD64MOVQconst {
+ break
+ }
+ c := auxIntToInt64(v_1.AuxInt)
+ v.reset(OpAMD64SARQconst)
+ v.AuxInt = int8ToAuxInt(int8(c & 63))
+ v.AddArg(x)
+ return true
+ }
+ // match: (SARXQ x (MOVLconst [c]))
+ // result: (SARQconst [int8(c&63)] x)
+ for {
+ x := v_0
+ if v_1.Op != OpAMD64MOVLconst {
+ break
+ }
+ c := auxIntToInt32(v_1.AuxInt)
+ v.reset(OpAMD64SARQconst)
+ v.AuxInt = int8ToAuxInt(int8(c & 63))
+ v.AddArg(x)
+ return true
+ }
+ // match: (SARXQ x (ADDQconst [c] y))
+ // cond: c & 63 == 0
+ // result: (SARXQ x y)
+ for {
+ x := v_0
+ if v_1.Op != OpAMD64ADDQconst {
+ break
+ }
+ c := auxIntToInt32(v_1.AuxInt)
+ y := v_1.Args[0]
+ if !(c&63 == 0) {
+ break
+ }
+ v.reset(OpAMD64SARXQ)
+ v.AddArg2(x, y)
+ return true
+ }
+ // match: (SARXQ x (NEGQ <t> (ADDQconst [c] y)))
+ // cond: c & 63 == 0
+ // result: (SARXQ x (NEGQ <t> y))
+ for {
+ x := v_0
+ if v_1.Op != OpAMD64NEGQ {
+ break
+ }
+ t := v_1.Type
+ v_1_0 := v_1.Args[0]
+ if v_1_0.Op != OpAMD64ADDQconst {
+ break
+ }
+ c := auxIntToInt32(v_1_0.AuxInt)
+ y := v_1_0.Args[0]
+ if !(c&63 == 0) {
+ break
+ }
+ v.reset(OpAMD64SARXQ)
+ v0 := b.NewValue0(v.Pos, OpAMD64NEGQ, t)
+ v0.AddArg(y)
+ v.AddArg2(x, v0)
+ return true
+ }
+ // match: (SARXQ x (ANDQconst [c] y))
+ // cond: c & 63 == 63
+ // result: (SARXQ x y)
+ for {
+ x := v_0
+ if v_1.Op != OpAMD64ANDQconst {
+ break
+ }
+ c := auxIntToInt32(v_1.AuxInt)
+ y := v_1.Args[0]
+ if !(c&63 == 63) {
+ break
+ }
+ v.reset(OpAMD64SARXQ)
+ v.AddArg2(x, y)
+ return true
+ }
+ // match: (SARXQ x (NEGQ <t> (ANDQconst [c] y)))
+ // cond: c & 63 == 63
+ // result: (SARXQ x (NEGQ <t> y))
+ for {
+ x := v_0
+ if v_1.Op != OpAMD64NEGQ {
+ break
+ }
+ t := v_1.Type
+ v_1_0 := v_1.Args[0]
+ if v_1_0.Op != OpAMD64ANDQconst {
+ break
+ }
+ c := auxIntToInt32(v_1_0.AuxInt)
+ y := v_1_0.Args[0]
+ if !(c&63 == 63) {
+ break
+ }
+ v.reset(OpAMD64SARXQ)
+ v0 := b.NewValue0(v.Pos, OpAMD64NEGQ, t)
+ v0.AddArg(y)
+ v.AddArg2(x, v0)
+ return true
+ }
+ // match: (SARXQ x (ADDLconst [c] y))
+ // cond: c & 63 == 0
+ // result: (SARXQ x y)
+ for {
+ x := v_0
+ if v_1.Op != OpAMD64ADDLconst {
+ break
+ }
+ c := auxIntToInt32(v_1.AuxInt)
+ y := v_1.Args[0]
+ if !(c&63 == 0) {
+ break
+ }
+ v.reset(OpAMD64SARXQ)
+ v.AddArg2(x, y)
+ return true
+ }
+ // match: (SARXQ x (NEGL <t> (ADDLconst [c] y)))
+ // cond: c & 63 == 0
+ // result: (SARXQ x (NEGL <t> y))
+ for {
+ x := v_0
+ if v_1.Op != OpAMD64NEGL {
+ break
+ }
+ t := v_1.Type
+ v_1_0 := v_1.Args[0]
+ if v_1_0.Op != OpAMD64ADDLconst {
+ break
+ }
+ c := auxIntToInt32(v_1_0.AuxInt)
+ y := v_1_0.Args[0]
+ if !(c&63 == 0) {
+ break
+ }
+ v.reset(OpAMD64SARXQ)
+ v0 := b.NewValue0(v.Pos, OpAMD64NEGL, t)
+ v0.AddArg(y)
+ v.AddArg2(x, v0)
+ return true
+ }
+ // match: (SARXQ x (ANDLconst [c] y))
+ // cond: c & 63 == 63
+ // result: (SARXQ x y)
+ for {
+ x := v_0
+ if v_1.Op != OpAMD64ANDLconst {
+ break
+ }
+ c := auxIntToInt32(v_1.AuxInt)
+ y := v_1.Args[0]
+ if !(c&63 == 63) {
+ break
+ }
+ v.reset(OpAMD64SARXQ)
+ v.AddArg2(x, y)
+ return true
+ }
+ // match: (SARXQ x (NEGL <t> (ANDLconst [c] y)))
+ // cond: c & 63 == 63
+ // result: (SARXQ x (NEGL <t> y))
+ for {
+ x := v_0
+ if v_1.Op != OpAMD64NEGL {
+ break
+ }
+ t := v_1.Type
+ v_1_0 := v_1.Args[0]
+ if v_1_0.Op != OpAMD64ANDLconst {
+ break
+ }
+ c := auxIntToInt32(v_1_0.AuxInt)
+ y := v_1_0.Args[0]
+ if !(c&63 == 63) {
+ break
+ }
+ v.reset(OpAMD64SARXQ)
+ v0 := b.NewValue0(v.Pos, OpAMD64NEGL, t)
+ v0.AddArg(y)
+ v.AddArg2(x, v0)
+ return true
+ }
+ return false
+}
func rewriteValueAMD64_OpAMD64SBBLcarrymask(v *Value) bool {
v_0 := v.Args[0]
// match: (SBBLcarrymask (FlagEQ))
diff --git a/src/cmd/compile/internal/test/shift_test.go b/src/cmd/compile/internal/test/shift_test.go
index ea88f0a..58c8dde 100644
--- a/src/cmd/compile/internal/test/shift_test.go
+++ b/src/cmd/compile/internal/test/shift_test.go
@@ -1029,3 +1029,13 @@
}
}
}
+
+var shiftSink64 int64
+
+func BenchmarkShiftArithmeticRight(b *testing.B) {
+ x := shiftSink64
+ for i := 0; i < b.N; i++ {
+ x = x >> (i & 63)
+ }
+ shiftSink64 = x
+}
diff --git a/test/codegen/bmi.go b/test/codegen/bmi.go
index 2908d1b..9dd2b00 100644
--- a/test/codegen/bmi.go
+++ b/test/codegen/bmi.go
@@ -46,6 +46,16 @@
return x & (x - 1)
}
+func sarx64(x, y int64) int64 {
+ // amd64/v3:"SARXQ"
+ return x >> y
+}
+
+func sarx32(x, y int32) int32 {
+ // amd64/v3:"SARXL"
+ return x >> y
+}
+
func shlrx64(x []uint64, i int, s uint64) uint64 {
// amd64/v3: `SHRXQ\t[A-Z]+[0-9]*, \([A-Z]+[0-9]*\)\([A-Z]+[0-9]*\*8\), [A-Z]+[0-9]*`
s = x[i] >> i