cmd/compile: add SARX instruction for GOAMD64>=3

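SARX is the BMI2 variant of SAR: unlike SARQ/SARL, it accepts the shift
count in any general-purpose register instead of requiring CX, so it
places fewer restrictions on the register allocator. When GOAMD64 >= 3,
lower SARQ/SARL to SARXQ/SARXL, and keep folding constant shift counts
back to SARQconst/SARLconst as before.

For illustration, a minimal sketch mirroring the new codegen test in
test/codegen/bmi.go (the function name is illustrative):

	package p

	func sar64(x, y int64) int64 {
		// amd64/v3: the shift lowers to SARXQ; earlier levels use
		// SARQ, whose variable count must live in CX.
		return x >> y
	}
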
name                    old time/op  new time/op  delta
ShiftArithmeticRight-8  0.68ns ± 5%  0.30ns ± 6%  -56.14%  (p=0.000 n=10+10)
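
To reproduce on a BMI2-capable machine (a sketch; exact flags may vary),
compare the default level against v3:

	GOAMD64=v1 go test -bench=ShiftArithmeticRight cmd/compile/internal/test
	GOAMD64=v3 go test -bench=ShiftArithmeticRight cmd/compile/internal/test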

Change-Id: I052a0d7b9e6526d526276444e588b0cc288beff4
Reviewed-on: https://go-review.googlesource.com/c/go/+/399055
Run-TryBot: Wayne Zuo <wdvxdr@golangcn.org>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
Auto-Submit: Keith Randall <khr@golang.org>
Reviewed-by: Keith Randall <khr@google.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
diff --git a/src/cmd/compile/internal/amd64/ssa.go b/src/cmd/compile/internal/amd64/ssa.go
index 2dae55b..9fde775 100644
--- a/src/cmd/compile/internal/amd64/ssa.go
+++ b/src/cmd/compile/internal/amd64/ssa.go
@@ -282,6 +282,10 @@
 		p.To.Reg = v.Reg()
 		p.SetFrom3Reg(v.Args[1].Reg())
 
+	case ssa.OpAMD64SARXL, ssa.OpAMD64SARXQ:
+		p := opregreg(s, v.Op.Asm(), v.Reg(), v.Args[1].Reg())
+		p.SetFrom3Reg(v.Args[0].Reg())
+
 	case ssa.OpAMD64SHLXLload, ssa.OpAMD64SHLXQload,
 		ssa.OpAMD64SHRXLload, ssa.OpAMD64SHRXQload:
 		p := opregreg(s, v.Op.Asm(), v.Reg(), v.Args[1].Reg())
diff --git a/src/cmd/compile/internal/amd64/versions_test.go b/src/cmd/compile/internal/amd64/versions_test.go
index 11b4d84..248f070 100644
--- a/src/cmd/compile/internal/amd64/versions_test.go
+++ b/src/cmd/compile/internal/amd64/versions_test.go
@@ -239,6 +239,7 @@
 	// native objdump doesn't include [QL] on linux.
 	"popcnt": {"popcntq", "popcntl", "popcnt"},
 	"bmi1":   {"andnq", "andnl", "andn", "blsiq", "blsil", "blsi", "blsmskq", "blsmskl", "blsmsk", "blsrq", "blsrl", "blsr", "tzcntq", "tzcntl", "tzcnt"},
+	"bmi2":   {"sarxq", "sarxl", "sarx", "shlxq", "shlxl", "shlx", "shrxq", "shrxl", "shrx"},
 	"sse41":  {"roundsd"},
 	"fma":    {"vfmadd231sd"},
 	"movbe":  {"movbeqq", "movbeq", "movbell", "movbel", "movbe"},
diff --git a/src/cmd/compile/internal/ssa/gen/AMD64.rules b/src/cmd/compile/internal/ssa/gen/AMD64.rules
index d50bdf2..3a9de8d 100644
--- a/src/cmd/compile/internal/ssa/gen/AMD64.rules
+++ b/src/cmd/compile/internal/ssa/gen/AMD64.rules
@@ -206,6 +206,9 @@
 (Rsh16x(64|32|16|8) x y) && shiftIsBounded(v) => (SARW x y)
 (Rsh8x(64|32|16|8) x y)  && shiftIsBounded(v) => (SARB x y)
 
+// Prefer the SARX instruction because it places fewer register restrictions on the shift input.
+(SAR(Q|L) x y) && buildcfg.GOAMD64 >= 3 => (SARX(Q|L) x y)
+
 // Lowering integer comparisons
 (Less(64|32|16|8)      x y) => (SETL  (CMP(Q|L|W|B)     x y))
 (Less(64|32|16|8)U     x y) => (SETB  (CMP(Q|L|W|B)     x y))
@@ -803,28 +806,29 @@
 (SARL x (MOV(Q|L)const [c])) => (SARLconst [int8(c&31)] x)
 (SARW x (MOV(Q|L)const [c])) => (SARWconst [int8(min(int64(c)&31,15))] x)
 (SARB x (MOV(Q|L)const [c])) => (SARBconst [int8(min(int64(c)&31,7))] x)
-
+(SARXQ x (MOV(Q|L)const [c])) => (SARQconst [int8(c&63)] x)
+(SARXL x (MOV(Q|L)const [c])) => (SARLconst [int8(c&31)] x)
 
 // Operations which don't affect the low 6/5 bits of the shift amount are NOPs.
-((SHLQ|SHRQ|SARQ) x (ADDQconst [c] y)) && c & 63 == 0  => ((SHLQ|SHRQ|SARQ) x y)
-((SHLQ|SHRQ|SARQ) x (NEGQ <t> (ADDQconst [c] y))) && c & 63 == 0  => ((SHLQ|SHRQ|SARQ) x (NEGQ <t> y))
-((SHLQ|SHRQ|SARQ) x (ANDQconst [c] y)) && c & 63 == 63 => ((SHLQ|SHRQ|SARQ) x y)
-((SHLQ|SHRQ|SARQ) x (NEGQ <t> (ANDQconst [c] y))) && c & 63 == 63 => ((SHLQ|SHRQ|SARQ) x (NEGQ <t> y))
+((SHLQ|SHRQ|SARQ|SARXQ) x (ADDQconst [c] y)) && c & 63 == 0  => ((SHLQ|SHRQ|SARQ|SARXQ) x y)
+((SHLQ|SHRQ|SARQ|SARXQ) x (NEGQ <t> (ADDQconst [c] y))) && c & 63 == 0  => ((SHLQ|SHRQ|SARQ|SARXQ) x (NEGQ <t> y))
+((SHLQ|SHRQ|SARQ|SARXQ) x (ANDQconst [c] y)) && c & 63 == 63 => ((SHLQ|SHRQ|SARQ|SARXQ) x y)
+((SHLQ|SHRQ|SARQ|SARXQ) x (NEGQ <t> (ANDQconst [c] y))) && c & 63 == 63 => ((SHLQ|SHRQ|SARQ|SARXQ) x (NEGQ <t> y))
 
-((SHLL|SHRL|SARL) x (ADDQconst [c] y)) && c & 31 == 0  => ((SHLL|SHRL|SARL) x y)
-((SHLL|SHRL|SARL) x (NEGQ <t> (ADDQconst [c] y))) && c & 31 == 0  => ((SHLL|SHRL|SARL) x (NEGQ <t> y))
-((SHLL|SHRL|SARL) x (ANDQconst [c] y)) && c & 31 == 31 => ((SHLL|SHRL|SARL) x y)
-((SHLL|SHRL|SARL) x (NEGQ <t> (ANDQconst [c] y))) && c & 31 == 31 => ((SHLL|SHRL|SARL) x (NEGQ <t> y))
+((SHLL|SHRL|SARL|SARXL) x (ADDQconst [c] y)) && c & 31 == 0  => ((SHLL|SHRL|SARL|SARXL) x y)
+((SHLL|SHRL|SARL|SARXL) x (NEGQ <t> (ADDQconst [c] y))) && c & 31 == 0  => ((SHLL|SHRL|SARL|SARXL) x (NEGQ <t> y))
+((SHLL|SHRL|SARL|SARXL) x (ANDQconst [c] y)) && c & 31 == 31 => ((SHLL|SHRL|SARL|SARXL) x y)
+((SHLL|SHRL|SARL|SARXL) x (NEGQ <t> (ANDQconst [c] y))) && c & 31 == 31 => ((SHLL|SHRL|SARL|SARXL) x (NEGQ <t> y))
 
-((SHLQ|SHRQ|SARQ) x (ADDLconst [c] y)) && c & 63 == 0  => ((SHLQ|SHRQ|SARQ) x y)
-((SHLQ|SHRQ|SARQ) x (NEGL <t> (ADDLconst [c] y))) && c & 63 == 0  => ((SHLQ|SHRQ|SARQ) x (NEGL <t> y))
-((SHLQ|SHRQ|SARQ) x (ANDLconst [c] y)) && c & 63 == 63 => ((SHLQ|SHRQ|SARQ) x y)
-((SHLQ|SHRQ|SARQ) x (NEGL <t> (ANDLconst [c] y))) && c & 63 == 63 => ((SHLQ|SHRQ|SARQ) x (NEGL <t> y))
+((SHLQ|SHRQ|SARQ|SARXQ) x (ADDLconst [c] y)) && c & 63 == 0  => ((SHLQ|SHRQ|SARQ|SARXQ) x y)
+((SHLQ|SHRQ|SARQ|SARXQ) x (NEGL <t> (ADDLconst [c] y))) && c & 63 == 0  => ((SHLQ|SHRQ|SARQ|SARXQ) x (NEGL <t> y))
+((SHLQ|SHRQ|SARQ|SARXQ) x (ANDLconst [c] y)) && c & 63 == 63 => ((SHLQ|SHRQ|SARQ|SARXQ) x y)
+((SHLQ|SHRQ|SARQ|SARXQ) x (NEGL <t> (ANDLconst [c] y))) && c & 63 == 63 => ((SHLQ|SHRQ|SARQ|SARXQ) x (NEGL <t> y))
 
-((SHLL|SHRL|SARL) x (ADDLconst [c] y)) && c & 31 == 0  => ((SHLL|SHRL|SARL) x y)
-((SHLL|SHRL|SARL) x (NEGL <t> (ADDLconst [c] y))) && c & 31 == 0  => ((SHLL|SHRL|SARL) x (NEGL <t> y))
-((SHLL|SHRL|SARL) x (ANDLconst [c] y)) && c & 31 == 31 => ((SHLL|SHRL|SARL) x y)
-((SHLL|SHRL|SARL) x (NEGL <t> (ANDLconst [c] y))) && c & 31 == 31 => ((SHLL|SHRL|SARL) x (NEGL <t> y))
+((SHLL|SHRL|SARL|SARXL) x (ADDLconst [c] y)) && c & 31 == 0  => ((SHLL|SHRL|SARL|SARXL) x y)
+((SHLL|SHRL|SARL|SARXL) x (NEGL <t> (ADDLconst [c] y))) && c & 31 == 0  => ((SHLL|SHRL|SARL|SARXL) x (NEGL <t> y))
+((SHLL|SHRL|SARL|SARXL) x (ANDLconst [c] y)) && c & 31 == 31 => ((SHLL|SHRL|SARL|SARXL) x y)
+((SHLL|SHRL|SARL|SARXL) x (NEGL <t> (ANDLconst [c] y))) && c & 31 == 31 => ((SHLL|SHRL|SARL|SARXL) x (NEGL <t> y))
 
 // Constant rotate instructions
 ((ADDQ|ORQ|XORQ) (SHLQconst x [c]) (SHRQconst x [d])) && d==64-c => (ROLQconst x [c])
diff --git a/src/cmd/compile/internal/ssa/gen/AMD64Ops.go b/src/cmd/compile/internal/ssa/gen/AMD64Ops.go
index ab84504..2eec6e0 100644
--- a/src/cmd/compile/internal/ssa/gen/AMD64Ops.go
+++ b/src/cmd/compile/internal/ssa/gen/AMD64Ops.go
@@ -953,6 +953,9 @@
 		{name: "MOVBEQstoreidx8", argLength: 4, reg: gpstoreidx, asm: "MOVBEQ", scale: 8, aux: "SymOff", symEffect: "Write"},                    // swap and store 8 bytes in arg2 to arg0+8*arg1+auxint+aux. arg3=mem
 
 		// CPUID feature: BMI2.
+		{name: "SARXQ", argLength: 2, reg: gp21, asm: "SARXQ"}, // signed arg0 >> arg1, shift amount is mod 64
+		{name: "SARXL", argLength: 2, reg: gp21, asm: "SARXL"}, // signed int32(arg0) >> arg1, shift amount is mod 32
+
 		{name: "SHLXLload", argLength: 3, reg: gp21shxload, asm: "SHLXL", aux: "SymOff", typ: "Uint32", faultOnNilArg0: true, symEffect: "Read"}, // *(arg0+auxint+aux) << arg1, arg2=mem, shift amount is mod 32
 		{name: "SHLXQload", argLength: 3, reg: gp21shxload, asm: "SHLXQ", aux: "SymOff", typ: "Uint64", faultOnNilArg0: true, symEffect: "Read"}, // *(arg0+auxint+aux) << arg1, arg2=mem, shift amount is mod 64
 		{name: "SHRXLload", argLength: 3, reg: gp21shxload, asm: "SHRXL", aux: "SymOff", typ: "Uint32", faultOnNilArg0: true, symEffect: "Read"}, // unsigned *(arg0+auxint+aux) >> arg1, arg2=mem, shift amount is mod 32
diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go
index 1c941e8..976d887 100644
--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go
@@ -1062,6 +1062,8 @@
 	OpAMD64MOVBELstoreidx8
 	OpAMD64MOVBEQstoreidx1
 	OpAMD64MOVBEQstoreidx8
+	OpAMD64SARXQ
+	OpAMD64SARXL
 	OpAMD64SHLXLload
 	OpAMD64SHLXQload
 	OpAMD64SHRXLload
@@ -14118,6 +14120,34 @@
 		},
 	},
 	{
+		name:   "SARXQ",
+		argLen: 2,
+		asm:    x86.ASARXQ,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
+				{1, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
+			},
+			outputs: []outputInfo{
+				{0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
+			},
+		},
+	},
+	{
+		name:   "SARXL",
+		argLen: 2,
+		asm:    x86.ASARXL,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
+				{1, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
+			},
+			outputs: []outputInfo{
+				{0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
+			},
+		},
+	},
+	{
 		name:           "SHLXLload",
 		auxType:        auxSymOff,
 		argLen:         3,
diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go
index addfaaa..81f1f1a 100644
--- a/src/cmd/compile/internal/ssa/rewriteAMD64.go
+++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go
@@ -382,6 +382,10 @@
 		return rewriteValueAMD64_OpAMD64SARW(v)
 	case OpAMD64SARWconst:
 		return rewriteValueAMD64_OpAMD64SARWconst(v)
+	case OpAMD64SARXL:
+		return rewriteValueAMD64_OpAMD64SARXL(v)
+	case OpAMD64SARXQ:
+		return rewriteValueAMD64_OpAMD64SARXQ(v)
 	case OpAMD64SBBLcarrymask:
 		return rewriteValueAMD64_OpAMD64SBBLcarrymask(v)
 	case OpAMD64SBBQ:
@@ -19844,6 +19848,19 @@
 	v_1 := v.Args[1]
 	v_0 := v.Args[0]
 	b := v.Block
+	// match: (SARL x y)
+	// cond: buildcfg.GOAMD64 >= 3
+	// result: (SARXL x y)
+	for {
+		x := v_0
+		y := v_1
+		if !(buildcfg.GOAMD64 >= 3) {
+			break
+		}
+		v.reset(OpAMD64SARXL)
+		v.AddArg2(x, y)
+		return true
+	}
 	// match: (SARL x (MOVQconst [c]))
 	// result: (SARLconst [int8(c&31)] x)
 	for {
@@ -20066,6 +20083,19 @@
 	v_1 := v.Args[1]
 	v_0 := v.Args[0]
 	b := v.Block
+	// match: (SARQ x y)
+	// cond: buildcfg.GOAMD64 >= 3
+	// result: (SARXQ x y)
+	for {
+		x := v_0
+		y := v_1
+		if !(buildcfg.GOAMD64 >= 3) {
+			break
+		}
+		v.reset(OpAMD64SARXQ)
+		v.AddArg2(x, y)
+		return true
+	}
 	// match: (SARQ x (MOVQconst [c]))
 	// result: (SARQconst [int8(c&63)] x)
 	for {
@@ -20341,6 +20371,398 @@
 	}
 	return false
 }
+func rewriteValueAMD64_OpAMD64SARXL(v *Value) bool {
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	b := v.Block
+	// match: (SARXL x (MOVQconst [c]))
+	// result: (SARLconst [int8(c&31)] x)
+	for {
+		x := v_0
+		if v_1.Op != OpAMD64MOVQconst {
+			break
+		}
+		c := auxIntToInt64(v_1.AuxInt)
+		v.reset(OpAMD64SARLconst)
+		v.AuxInt = int8ToAuxInt(int8(c & 31))
+		v.AddArg(x)
+		return true
+	}
+	// match: (SARXL x (MOVLconst [c]))
+	// result: (SARLconst [int8(c&31)] x)
+	for {
+		x := v_0
+		if v_1.Op != OpAMD64MOVLconst {
+			break
+		}
+		c := auxIntToInt32(v_1.AuxInt)
+		v.reset(OpAMD64SARLconst)
+		v.AuxInt = int8ToAuxInt(int8(c & 31))
+		v.AddArg(x)
+		return true
+	}
+	// match: (SARXL x (ADDQconst [c] y))
+	// cond: c & 31 == 0
+	// result: (SARXL x y)
+	for {
+		x := v_0
+		if v_1.Op != OpAMD64ADDQconst {
+			break
+		}
+		c := auxIntToInt32(v_1.AuxInt)
+		y := v_1.Args[0]
+		if !(c&31 == 0) {
+			break
+		}
+		v.reset(OpAMD64SARXL)
+		v.AddArg2(x, y)
+		return true
+	}
+	// match: (SARXL x (NEGQ <t> (ADDQconst [c] y)))
+	// cond: c & 31 == 0
+	// result: (SARXL x (NEGQ <t> y))
+	for {
+		x := v_0
+		if v_1.Op != OpAMD64NEGQ {
+			break
+		}
+		t := v_1.Type
+		v_1_0 := v_1.Args[0]
+		if v_1_0.Op != OpAMD64ADDQconst {
+			break
+		}
+		c := auxIntToInt32(v_1_0.AuxInt)
+		y := v_1_0.Args[0]
+		if !(c&31 == 0) {
+			break
+		}
+		v.reset(OpAMD64SARXL)
+		v0 := b.NewValue0(v.Pos, OpAMD64NEGQ, t)
+		v0.AddArg(y)
+		v.AddArg2(x, v0)
+		return true
+	}
+	// match: (SARXL x (ANDQconst [c] y))
+	// cond: c & 31 == 31
+	// result: (SARXL x y)
+	for {
+		x := v_0
+		if v_1.Op != OpAMD64ANDQconst {
+			break
+		}
+		c := auxIntToInt32(v_1.AuxInt)
+		y := v_1.Args[0]
+		if !(c&31 == 31) {
+			break
+		}
+		v.reset(OpAMD64SARXL)
+		v.AddArg2(x, y)
+		return true
+	}
+	// match: (SARXL x (NEGQ <t> (ANDQconst [c] y)))
+	// cond: c & 31 == 31
+	// result: (SARXL x (NEGQ <t> y))
+	for {
+		x := v_0
+		if v_1.Op != OpAMD64NEGQ {
+			break
+		}
+		t := v_1.Type
+		v_1_0 := v_1.Args[0]
+		if v_1_0.Op != OpAMD64ANDQconst {
+			break
+		}
+		c := auxIntToInt32(v_1_0.AuxInt)
+		y := v_1_0.Args[0]
+		if !(c&31 == 31) {
+			break
+		}
+		v.reset(OpAMD64SARXL)
+		v0 := b.NewValue0(v.Pos, OpAMD64NEGQ, t)
+		v0.AddArg(y)
+		v.AddArg2(x, v0)
+		return true
+	}
+	// match: (SARXL x (ADDLconst [c] y))
+	// cond: c & 31 == 0
+	// result: (SARXL x y)
+	for {
+		x := v_0
+		if v_1.Op != OpAMD64ADDLconst {
+			break
+		}
+		c := auxIntToInt32(v_1.AuxInt)
+		y := v_1.Args[0]
+		if !(c&31 == 0) {
+			break
+		}
+		v.reset(OpAMD64SARXL)
+		v.AddArg2(x, y)
+		return true
+	}
+	// match: (SARXL x (NEGL <t> (ADDLconst [c] y)))
+	// cond: c & 31 == 0
+	// result: (SARXL x (NEGL <t> y))
+	for {
+		x := v_0
+		if v_1.Op != OpAMD64NEGL {
+			break
+		}
+		t := v_1.Type
+		v_1_0 := v_1.Args[0]
+		if v_1_0.Op != OpAMD64ADDLconst {
+			break
+		}
+		c := auxIntToInt32(v_1_0.AuxInt)
+		y := v_1_0.Args[0]
+		if !(c&31 == 0) {
+			break
+		}
+		v.reset(OpAMD64SARXL)
+		v0 := b.NewValue0(v.Pos, OpAMD64NEGL, t)
+		v0.AddArg(y)
+		v.AddArg2(x, v0)
+		return true
+	}
+	// match: (SARXL x (ANDLconst [c] y))
+	// cond: c & 31 == 31
+	// result: (SARXL x y)
+	for {
+		x := v_0
+		if v_1.Op != OpAMD64ANDLconst {
+			break
+		}
+		c := auxIntToInt32(v_1.AuxInt)
+		y := v_1.Args[0]
+		if !(c&31 == 31) {
+			break
+		}
+		v.reset(OpAMD64SARXL)
+		v.AddArg2(x, y)
+		return true
+	}
+	// match: (SARXL x (NEGL <t> (ANDLconst [c] y)))
+	// cond: c & 31 == 31
+	// result: (SARXL x (NEGL <t> y))
+	for {
+		x := v_0
+		if v_1.Op != OpAMD64NEGL {
+			break
+		}
+		t := v_1.Type
+		v_1_0 := v_1.Args[0]
+		if v_1_0.Op != OpAMD64ANDLconst {
+			break
+		}
+		c := auxIntToInt32(v_1_0.AuxInt)
+		y := v_1_0.Args[0]
+		if !(c&31 == 31) {
+			break
+		}
+		v.reset(OpAMD64SARXL)
+		v0 := b.NewValue0(v.Pos, OpAMD64NEGL, t)
+		v0.AddArg(y)
+		v.AddArg2(x, v0)
+		return true
+	}
+	return false
+}
+func rewriteValueAMD64_OpAMD64SARXQ(v *Value) bool {
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	b := v.Block
+	// match: (SARXQ x (MOVQconst [c]))
+	// result: (SARQconst [int8(c&63)] x)
+	for {
+		x := v_0
+		if v_1.Op != OpAMD64MOVQconst {
+			break
+		}
+		c := auxIntToInt64(v_1.AuxInt)
+		v.reset(OpAMD64SARQconst)
+		v.AuxInt = int8ToAuxInt(int8(c & 63))
+		v.AddArg(x)
+		return true
+	}
+	// match: (SARXQ x (MOVLconst [c]))
+	// result: (SARQconst [int8(c&63)] x)
+	for {
+		x := v_0
+		if v_1.Op != OpAMD64MOVLconst {
+			break
+		}
+		c := auxIntToInt32(v_1.AuxInt)
+		v.reset(OpAMD64SARQconst)
+		v.AuxInt = int8ToAuxInt(int8(c & 63))
+		v.AddArg(x)
+		return true
+	}
+	// match: (SARXQ x (ADDQconst [c] y))
+	// cond: c & 63 == 0
+	// result: (SARXQ x y)
+	for {
+		x := v_0
+		if v_1.Op != OpAMD64ADDQconst {
+			break
+		}
+		c := auxIntToInt32(v_1.AuxInt)
+		y := v_1.Args[0]
+		if !(c&63 == 0) {
+			break
+		}
+		v.reset(OpAMD64SARXQ)
+		v.AddArg2(x, y)
+		return true
+	}
+	// match: (SARXQ x (NEGQ <t> (ADDQconst [c] y)))
+	// cond: c & 63 == 0
+	// result: (SARXQ x (NEGQ <t> y))
+	for {
+		x := v_0
+		if v_1.Op != OpAMD64NEGQ {
+			break
+		}
+		t := v_1.Type
+		v_1_0 := v_1.Args[0]
+		if v_1_0.Op != OpAMD64ADDQconst {
+			break
+		}
+		c := auxIntToInt32(v_1_0.AuxInt)
+		y := v_1_0.Args[0]
+		if !(c&63 == 0) {
+			break
+		}
+		v.reset(OpAMD64SARXQ)
+		v0 := b.NewValue0(v.Pos, OpAMD64NEGQ, t)
+		v0.AddArg(y)
+		v.AddArg2(x, v0)
+		return true
+	}
+	// match: (SARXQ x (ANDQconst [c] y))
+	// cond: c & 63 == 63
+	// result: (SARXQ x y)
+	for {
+		x := v_0
+		if v_1.Op != OpAMD64ANDQconst {
+			break
+		}
+		c := auxIntToInt32(v_1.AuxInt)
+		y := v_1.Args[0]
+		if !(c&63 == 63) {
+			break
+		}
+		v.reset(OpAMD64SARXQ)
+		v.AddArg2(x, y)
+		return true
+	}
+	// match: (SARXQ x (NEGQ <t> (ANDQconst [c] y)))
+	// cond: c & 63 == 63
+	// result: (SARXQ x (NEGQ <t> y))
+	for {
+		x := v_0
+		if v_1.Op != OpAMD64NEGQ {
+			break
+		}
+		t := v_1.Type
+		v_1_0 := v_1.Args[0]
+		if v_1_0.Op != OpAMD64ANDQconst {
+			break
+		}
+		c := auxIntToInt32(v_1_0.AuxInt)
+		y := v_1_0.Args[0]
+		if !(c&63 == 63) {
+			break
+		}
+		v.reset(OpAMD64SARXQ)
+		v0 := b.NewValue0(v.Pos, OpAMD64NEGQ, t)
+		v0.AddArg(y)
+		v.AddArg2(x, v0)
+		return true
+	}
+	// match: (SARXQ x (ADDLconst [c] y))
+	// cond: c & 63 == 0
+	// result: (SARXQ x y)
+	for {
+		x := v_0
+		if v_1.Op != OpAMD64ADDLconst {
+			break
+		}
+		c := auxIntToInt32(v_1.AuxInt)
+		y := v_1.Args[0]
+		if !(c&63 == 0) {
+			break
+		}
+		v.reset(OpAMD64SARXQ)
+		v.AddArg2(x, y)
+		return true
+	}
+	// match: (SARXQ x (NEGL <t> (ADDLconst [c] y)))
+	// cond: c & 63 == 0
+	// result: (SARXQ x (NEGL <t> y))
+	for {
+		x := v_0
+		if v_1.Op != OpAMD64NEGL {
+			break
+		}
+		t := v_1.Type
+		v_1_0 := v_1.Args[0]
+		if v_1_0.Op != OpAMD64ADDLconst {
+			break
+		}
+		c := auxIntToInt32(v_1_0.AuxInt)
+		y := v_1_0.Args[0]
+		if !(c&63 == 0) {
+			break
+		}
+		v.reset(OpAMD64SARXQ)
+		v0 := b.NewValue0(v.Pos, OpAMD64NEGL, t)
+		v0.AddArg(y)
+		v.AddArg2(x, v0)
+		return true
+	}
+	// match: (SARXQ x (ANDLconst [c] y))
+	// cond: c & 63 == 63
+	// result: (SARXQ x y)
+	for {
+		x := v_0
+		if v_1.Op != OpAMD64ANDLconst {
+			break
+		}
+		c := auxIntToInt32(v_1.AuxInt)
+		y := v_1.Args[0]
+		if !(c&63 == 63) {
+			break
+		}
+		v.reset(OpAMD64SARXQ)
+		v.AddArg2(x, y)
+		return true
+	}
+	// match: (SARXQ x (NEGL <t> (ANDLconst [c] y)))
+	// cond: c & 63 == 63
+	// result: (SARXQ x (NEGL <t> y))
+	for {
+		x := v_0
+		if v_1.Op != OpAMD64NEGL {
+			break
+		}
+		t := v_1.Type
+		v_1_0 := v_1.Args[0]
+		if v_1_0.Op != OpAMD64ANDLconst {
+			break
+		}
+		c := auxIntToInt32(v_1_0.AuxInt)
+		y := v_1_0.Args[0]
+		if !(c&63 == 63) {
+			break
+		}
+		v.reset(OpAMD64SARXQ)
+		v0 := b.NewValue0(v.Pos, OpAMD64NEGL, t)
+		v0.AddArg(y)
+		v.AddArg2(x, v0)
+		return true
+	}
+	return false
+}
 func rewriteValueAMD64_OpAMD64SBBLcarrymask(v *Value) bool {
 	v_0 := v.Args[0]
 	// match: (SBBLcarrymask (FlagEQ))
diff --git a/src/cmd/compile/internal/test/shift_test.go b/src/cmd/compile/internal/test/shift_test.go
index ea88f0a..58c8dde 100644
--- a/src/cmd/compile/internal/test/shift_test.go
+++ b/src/cmd/compile/internal/test/shift_test.go
@@ -1029,3 +1029,13 @@
 		}
 	}
 }
+
+var shiftSink64 int64
+
+func BenchmarkShiftArithmeticRight(b *testing.B) {
+	x := shiftSink64
+	for i := 0; i < b.N; i++ {
+		x = x >> (i & 63)
+	}
+	shiftSink64 = x
+}
diff --git a/test/codegen/bmi.go b/test/codegen/bmi.go
index 2908d1b..9dd2b00 100644
--- a/test/codegen/bmi.go
+++ b/test/codegen/bmi.go
@@ -46,6 +46,16 @@
 	return x & (x - 1)
 }
 
+func sarx64(x, y int64) int64 {
+	// amd64/v3:"SARXQ"
+	return x >> y
+}
+
+func sarx32(x, y int32) int32 {
+	// amd64/v3:"SARXL"
+	return x >> y
+}
+
 func shlrx64(x []uint64, i int, s uint64) uint64 {
 	// amd64/v3: `SHRXQ\t[A-Z]+[0-9]*, \([A-Z]+[0-9]*\)\([A-Z]+[0-9]*\*8\), [A-Z]+[0-9]*`
 	s = x[i] >> i