cmd/compile: implement non-constant rotates using ROR on arm64

Add rules to match Go code like:
	y &= 63
	x << y | x >> (64-y)
or
	y &= 63
	x >> y | x << (64-y)
as a single ROR instruction, making math/bits.RotateLeft faster on arm64.
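
For reference (not part of this change), a minimal sketch of the kinds of
functions the new rules recognize; the function names (rotl64, rotr32, lib)
are illustrative only:

	package rotate

	import "math/bits"

	// rotl64 is the first pattern above: y is masked to [0, 63], so
	// x<<y | x>>(64-y) is a plain 64-bit rotate left (x>>64 is 0 in Go,
	// which keeps the y == 0 case correct).
	func rotl64(x uint64, y uint) uint64 {
		y &= 63
		return x<<y | x>>(64-y)
	}

	// rotr32 is the 32-bit right-rotate form matched by the RORW rules.
	func rotr32(x uint32, y uint) uint32 {
		y &= 31
		return x>>y | x<<(32-y)
	}

	// lib uses the intrinsic directly; with this change it also lowers
	// to a single ROR on arm64.
	func lib(x uint64, k int) uint64 {
		return bits.RotateLeft64(x, k)
	}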

Extends CL 132435 to arm64.

Benchmarks of math/bits.RotateLeftXX:
name            old time/op       new time/op       delta
RotateLeft-8    3.548750ns +- 1%  2.003750ns +- 0%  -43.54%  (p=0.000 n=8+8)
RotateLeft8-8   3.925000ns +- 0%  3.925000ns +- 0%     ~     (p=1.000 n=8+8)
RotateLeft16-8  3.925000ns +- 0%  3.927500ns +- 0%     ~     (p=0.608 n=8+8)
RotateLeft32-8  3.925000ns +- 0%  2.002500ns +- 0%  -48.98%  (p=0.000 n=8+8)
RotateLeft64-8  3.536250ns +- 0%  2.003750ns +- 0%  -43.34%  (p=0.000 n=8+8)
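
arm64 has no ROL/ROLW, so a left rotate is lowered to ROR/RORW of the negated
shift amount; a small Go model of that identity (illustrative only, not
compiler code; ror64 and rotlViaRor64 are made-up names):

	// ror64 models the ROR instruction: rotate right by the low 6 bits of k.
	func ror64(x uint64, k uint) uint64 {
		k &= 63
		return x>>k | x<<((64-k)&63)
	}

	// Rotating left by y equals rotating right by -y once the amount is
	// reduced mod 64, which is why the rules emit (ROR x (NEG y)).
	func rotlViaRor64(x uint64, y uint) uint64 {
		return ror64(x, -y)
	}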

Change-Id: I77622cd7f39b917427e060647321f5513973232c
Reviewed-on: https://go-review.googlesource.com/122542
Run-TryBot: Ben Shi <powerman1st@163.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
diff --git a/src/cmd/compile/internal/arm64/ssa.go b/src/cmd/compile/internal/arm64/ssa.go
index db7064c..1926541 100644
--- a/src/cmd/compile/internal/arm64/ssa.go
+++ b/src/cmd/compile/internal/arm64/ssa.go
@@ -195,7 +195,9 @@
 		ssa.OpARM64FNMULS,
 		ssa.OpARM64FNMULD,
 		ssa.OpARM64FDIVS,
-		ssa.OpARM64FDIVD:
+		ssa.OpARM64FDIVD,
+		ssa.OpARM64ROR,
+		ssa.OpARM64RORW:
 		r := v.Reg()
 		r1 := v.Args[0].Reg()
 		r2 := v.Args[1].Reg()
diff --git a/src/cmd/compile/internal/gc/ssa.go b/src/cmd/compile/internal/gc/ssa.go
index 00ff7d4..bb076f8 100644
--- a/src/cmd/compile/internal/gc/ssa.go
+++ b/src/cmd/compile/internal/gc/ssa.go
@@ -3361,12 +3361,12 @@
 		func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
 			return s.newValue2(ssa.OpRotateLeft32, types.Types[TUINT32], args[0], args[1])
 		},
-		sys.AMD64, sys.S390X)
+		sys.AMD64, sys.ARM64, sys.S390X)
 	addF("math/bits", "RotateLeft64",
 		func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
 			return s.newValue2(ssa.OpRotateLeft64, types.Types[TUINT64], args[0], args[1])
 		},
-		sys.AMD64, sys.S390X)
+		sys.AMD64, sys.ARM64, sys.S390X)
 	alias("math/bits", "RotateLeft", "math/bits", "RotateLeft64", p8...)
 
 	makeOnesCountAMD64 := func(op64 ssa.Op, op32 ssa.Op) func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
diff --git a/src/cmd/compile/internal/ssa/gen/ARM64.rules b/src/cmd/compile/internal/ssa/gen/ARM64.rules
index 6c8f386..4c9d9a6 100644
--- a/src/cmd/compile/internal/ssa/gen/ARM64.rules
+++ b/src/cmd/compile/internal/ssa/gen/ARM64.rules
@@ -89,6 +89,10 @@
 (Round x) -> (FRINTAD x)
 (Trunc x) -> (FRINTZD x)
 
+// lowering rotates
+(RotateLeft32 x y) -> (RORW x (NEG <y.Type> y))
+(RotateLeft64 x y) -> (ROR x (NEG <y.Type> y))
+
 (Ctz64NonZero x) -> (Ctz64 x)
 (Ctz32NonZero x) -> (Ctz32 x)
 
@@ -127,6 +131,8 @@
 // shifts
 // hardware instruction uses only the low 6 bits of the shift
 // we compare to 64 to ensure Go semantics for large shifts
+// The rules for rotates with a non-constant shift are derived from the shift
+// rules below; if those rules change, please also update the rotate rules.
 (Lsh64x64 <t> x y) -> (CSEL {OpARM64LessThanU} (SLL <t> x y) (Const64 <t> [0]) (CMPconst [64] y))
 (Lsh64x32 <t> x y) -> (CSEL {OpARM64LessThanU} (SLL <t> x (ZeroExt32to64 y)) (Const64 <t> [0]) (CMPconst [64] (ZeroExt32to64 y)))
 (Lsh64x16 <t> x y) -> (CSEL {OpARM64LessThanU} (SLL <t> x (ZeroExt16to64 y)) (Const64 <t> [0]) (CMPconst [64] (ZeroExt16to64 y)))
@@ -1592,7 +1598,7 @@
 (ORNshiftRL x (SRLconst x [c]) [d]) && c==d -> (MOVDconst [-1])
 (ORNshiftRA x (SRAconst x [c]) [d]) && c==d -> (MOVDconst [-1])
 
-// Generate rotates
+// Generate rotates with const shift
 (ADDshiftLL [c] (SRLconst x [64-c]) x) -> (RORconst [64-c] x)
 ( ORshiftLL [c] (SRLconst x [64-c]) x) -> (RORconst [64-c] x)
 (XORshiftLL [c] (SRLconst x [64-c]) x) -> (RORconst [64-c] x)
@@ -1610,6 +1616,38 @@
 ( ORshiftRL <t> [c] (SLLconst x [32-c]) (MOVWUreg x)) && c < 32 && t.Size() == 4 -> (RORWconst [c] x)
 (XORshiftRL <t> [c] (SLLconst x [32-c]) (MOVWUreg x)) && c < 32 && t.Size() == 4 -> (RORWconst [c] x)
 
+(RORconst [c] (RORconst [d] x)) -> (RORconst [(c+d)&63] x)
+(RORWconst [c] (RORWconst [d] x)) -> (RORWconst [(c+d)&31] x)
+
+// Generate rotates with non-const shift.
+// These rules match Go source code like
+//	y &= 63
+//	x << y | x >> (64-y)
+// "|" can also be "^" or "+".
+// As arm64 does not have a ROL instruction, ROL(x, y) is replaced by ROR(x, -y).
+((ADD|OR|XOR) (SLL x (ANDconst <t> [63] y))
+	(CSEL0 <typ.UInt64> {cc} (SRL <typ.UInt64> x (SUB <t> (MOVDconst [64]) (ANDconst <t> [63] y)))
+		(CMPconst [64] (SUB <t> (MOVDconst [64]) (ANDconst <t> [63] y))))) && cc.(Op) == OpARM64LessThanU
+	-> (ROR x (NEG <t> y))
+((ADD|OR|XOR) (SRL <typ.UInt64> x (ANDconst <t> [63] y))
+	(CSEL0 <typ.UInt64> {cc} (SLL x (SUB <t> (MOVDconst [64]) (ANDconst <t> [63] y)))
+		(CMPconst [64] (SUB <t> (MOVDconst [64]) (ANDconst <t> [63] y))))) && cc.(Op) == OpARM64LessThanU
+	-> (ROR x y)
+
+// These rules match Go source code like
+//	y &= 31
+//	x << y | x >> (32-y)
+// "|" can also be "^" or "+".
+// As arm64 does not have a ROLW instruction, ROLW(x, y) is replaced by RORW(x, -y).
+((ADD|OR|XOR) (SLL x (ANDconst <t> [31] y))
+	(CSEL0 <typ.UInt32> {cc} (SRL <typ.UInt32> (MOVWUreg x) (SUB <t> (MOVDconst [32]) (ANDconst <t> [31] y)))
+		(CMPconst [64]  (SUB <t> (MOVDconst [32]) (ANDconst <t> [31] y))))) && cc.(Op) == OpARM64LessThanU
+	-> (RORW x (NEG <t> y))
+((ADD|OR|XOR) (SRL <typ.UInt32> (MOVWUreg x) (ANDconst <t> [31] y))
+	(CSEL0 <typ.UInt32> {cc} (SLL x (SUB <t> (MOVDconst [32]) (ANDconst <t> [31] y)))
+		(CMPconst [64]  (SUB <t> (MOVDconst [32]) (ANDconst <t> [31] y))))) && cc.(Op) == OpARM64LessThanU
+	-> (RORW x y)
+
 // Extract from reg pair
 (ADDshiftLL [c] (SRLconst x [64-c]) x2) -> (EXTRconst [64-c] x2 x)
 ( ORshiftLL [c] (SRLconst x [64-c]) x2) -> (EXTRconst [64-c] x2 x)
diff --git a/src/cmd/compile/internal/ssa/gen/ARM64Ops.go b/src/cmd/compile/internal/ssa/gen/ARM64Ops.go
index eb0ad53..43230fb 100644
--- a/src/cmd/compile/internal/ssa/gen/ARM64Ops.go
+++ b/src/cmd/compile/internal/ssa/gen/ARM64Ops.go
@@ -248,6 +248,8 @@
 		{name: "SRLconst", argLength: 1, reg: gp11, asm: "LSR", aux: "Int64"},     // arg0 >> auxInt, unsigned
 		{name: "SRA", argLength: 2, reg: gp21, asm: "ASR"},                        // arg0 >> arg1, signed, shift amount is mod 64
 		{name: "SRAconst", argLength: 1, reg: gp11, asm: "ASR", aux: "Int64"},     // arg0 >> auxInt, signed
+		{name: "ROR", argLength: 2, reg: gp21, asm: "ROR"},                        // arg0 right rotate by (arg1 mod 64) bits
+		{name: "RORW", argLength: 2, reg: gp21, asm: "RORW"},                      // arg0 right rotate by (arg1 mod 32) bits
 		{name: "RORconst", argLength: 1, reg: gp11, asm: "ROR", aux: "Int64"},     // arg0 right rotate by auxInt bits
 		{name: "RORWconst", argLength: 1, reg: gp11, asm: "RORW", aux: "Int64"},   // uint32(arg0) right rotate by auxInt bits
 		{name: "EXTRconst", argLength: 2, reg: gp21, asm: "EXTR", aux: "Int64"},   // extract 64 bits from arg0:arg1 starting at lsb auxInt
diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go
index 5bf7021..30c5787 100644
--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go
@@ -1139,6 +1139,8 @@
 	OpARM64SRLconst
 	OpARM64SRA
 	OpARM64SRAconst
+	OpARM64ROR
+	OpARM64RORW
 	OpARM64RORconst
 	OpARM64RORWconst
 	OpARM64EXTRconst
@@ -15105,6 +15107,34 @@
 		},
 	},
 	{
+		name:   "ROR",
+		argLen: 2,
+		asm:    arm64.AROR,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 805044223}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 g R30
+				{1, 805044223}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 g R30
+			},
+			outputs: []outputInfo{
+				{0, 670826495}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 R30
+			},
+		},
+	},
+	{
+		name:   "RORW",
+		argLen: 2,
+		asm:    arm64.ARORW,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 805044223}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 g R30
+				{1, 805044223}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 g R30
+			},
+			outputs: []outputInfo{
+				{0, 670826495}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 R30
+			},
+		},
+	},
+	{
 		name:    "RORconst",
 		auxType: auxInt64,
 		argLen:  1,
diff --git a/src/cmd/compile/internal/ssa/rewriteARM64.go b/src/cmd/compile/internal/ssa/rewriteARM64.go
index 219bc36..1dcbd93 100644
--- a/src/cmd/compile/internal/ssa/rewriteARM64.go
+++ b/src/cmd/compile/internal/ssa/rewriteARM64.go
@@ -16,7 +16,7 @@
 func rewriteValueARM64(v *Value) bool {
 	switch v.Op {
 	case OpARM64ADD:
-		return rewriteValueARM64_OpARM64ADD_0(v) || rewriteValueARM64_OpARM64ADD_10(v)
+		return rewriteValueARM64_OpARM64ADD_0(v) || rewriteValueARM64_OpARM64ADD_10(v) || rewriteValueARM64_OpARM64ADD_20(v)
 	case OpARM64ADDconst:
 		return rewriteValueARM64_OpARM64ADDconst_0(v)
 	case OpARM64ADDshiftLL:
@@ -256,7 +256,7 @@
 	case OpARM64NotEqual:
 		return rewriteValueARM64_OpARM64NotEqual_0(v)
 	case OpARM64OR:
-		return rewriteValueARM64_OpARM64OR_0(v) || rewriteValueARM64_OpARM64OR_10(v) || rewriteValueARM64_OpARM64OR_20(v) || rewriteValueARM64_OpARM64OR_30(v)
+		return rewriteValueARM64_OpARM64OR_0(v) || rewriteValueARM64_OpARM64OR_10(v) || rewriteValueARM64_OpARM64OR_20(v) || rewriteValueARM64_OpARM64OR_30(v) || rewriteValueARM64_OpARM64OR_40(v)
 	case OpARM64ORN:
 		return rewriteValueARM64_OpARM64ORN_0(v)
 	case OpARM64ORNshiftLL:
@@ -273,6 +273,10 @@
 		return rewriteValueARM64_OpARM64ORshiftRA_0(v)
 	case OpARM64ORshiftRL:
 		return rewriteValueARM64_OpARM64ORshiftRL_0(v)
+	case OpARM64RORWconst:
+		return rewriteValueARM64_OpARM64RORWconst_0(v)
+	case OpARM64RORconst:
+		return rewriteValueARM64_OpARM64RORconst_0(v)
 	case OpARM64SLL:
 		return rewriteValueARM64_OpARM64SLL_0(v)
 	case OpARM64SLLconst:
@@ -733,6 +737,10 @@
 		return rewriteValueARM64_OpPopCount32_0(v)
 	case OpPopCount64:
 		return rewriteValueARM64_OpPopCount64_0(v)
+	case OpRotateLeft32:
+		return rewriteValueARM64_OpRotateLeft32_0(v)
+	case OpRotateLeft64:
+		return rewriteValueARM64_OpRotateLeft64_0(v)
 	case OpRound:
 		return rewriteValueARM64_OpRound_0(v)
 	case OpRound32F:
@@ -1090,6 +1098,10 @@
 	return false
 }
 func rewriteValueARM64_OpARM64ADD_10(v *Value) bool {
+	b := v.Block
+	_ = b
+	typ := &b.Func.Config.Types
+	_ = typ
 	// match: (ADD x (NEG y))
 	// cond:
 	// result: (SUB x y)
@@ -1248,6 +1260,925 @@
 		v.AddArg(y)
 		return true
 	}
+	// match: (ADD (SLL x (ANDconst <t> [63] y)) (CSEL0 <typ.UInt64> {cc} (SRL <typ.UInt64> x (SUB <t> (MOVDconst [64]) (ANDconst <t> [63] y))) (CMPconst [64] (SUB <t> (MOVDconst [64]) (ANDconst <t> [63] y)))))
+	// cond: cc.(Op) == OpARM64LessThanU
+	// result: (ROR x (NEG <t> y))
+	for {
+		_ = v.Args[1]
+		v_0 := v.Args[0]
+		if v_0.Op != OpARM64SLL {
+			break
+		}
+		_ = v_0.Args[1]
+		x := v_0.Args[0]
+		v_0_1 := v_0.Args[1]
+		if v_0_1.Op != OpARM64ANDconst {
+			break
+		}
+		t := v_0_1.Type
+		if v_0_1.AuxInt != 63 {
+			break
+		}
+		y := v_0_1.Args[0]
+		v_1 := v.Args[1]
+		if v_1.Op != OpARM64CSEL0 {
+			break
+		}
+		if v_1.Type != typ.UInt64 {
+			break
+		}
+		cc := v_1.Aux
+		_ = v_1.Args[1]
+		v_1_0 := v_1.Args[0]
+		if v_1_0.Op != OpARM64SRL {
+			break
+		}
+		if v_1_0.Type != typ.UInt64 {
+			break
+		}
+		_ = v_1_0.Args[1]
+		if x != v_1_0.Args[0] {
+			break
+		}
+		v_1_0_1 := v_1_0.Args[1]
+		if v_1_0_1.Op != OpARM64SUB {
+			break
+		}
+		if v_1_0_1.Type != t {
+			break
+		}
+		_ = v_1_0_1.Args[1]
+		v_1_0_1_0 := v_1_0_1.Args[0]
+		if v_1_0_1_0.Op != OpARM64MOVDconst {
+			break
+		}
+		if v_1_0_1_0.AuxInt != 64 {
+			break
+		}
+		v_1_0_1_1 := v_1_0_1.Args[1]
+		if v_1_0_1_1.Op != OpARM64ANDconst {
+			break
+		}
+		if v_1_0_1_1.Type != t {
+			break
+		}
+		if v_1_0_1_1.AuxInt != 63 {
+			break
+		}
+		if y != v_1_0_1_1.Args[0] {
+			break
+		}
+		v_1_1 := v_1.Args[1]
+		if v_1_1.Op != OpARM64CMPconst {
+			break
+		}
+		if v_1_1.AuxInt != 64 {
+			break
+		}
+		v_1_1_0 := v_1_1.Args[0]
+		if v_1_1_0.Op != OpARM64SUB {
+			break
+		}
+		if v_1_1_0.Type != t {
+			break
+		}
+		_ = v_1_1_0.Args[1]
+		v_1_1_0_0 := v_1_1_0.Args[0]
+		if v_1_1_0_0.Op != OpARM64MOVDconst {
+			break
+		}
+		if v_1_1_0_0.AuxInt != 64 {
+			break
+		}
+		v_1_1_0_1 := v_1_1_0.Args[1]
+		if v_1_1_0_1.Op != OpARM64ANDconst {
+			break
+		}
+		if v_1_1_0_1.Type != t {
+			break
+		}
+		if v_1_1_0_1.AuxInt != 63 {
+			break
+		}
+		if y != v_1_1_0_1.Args[0] {
+			break
+		}
+		if !(cc.(Op) == OpARM64LessThanU) {
+			break
+		}
+		v.reset(OpARM64ROR)
+		v.AddArg(x)
+		v0 := b.NewValue0(v.Pos, OpARM64NEG, t)
+		v0.AddArg(y)
+		v.AddArg(v0)
+		return true
+	}
+	// match: (ADD (CSEL0 <typ.UInt64> {cc} (SRL <typ.UInt64> x (SUB <t> (MOVDconst [64]) (ANDconst <t> [63] y))) (CMPconst [64] (SUB <t> (MOVDconst [64]) (ANDconst <t> [63] y)))) (SLL x (ANDconst <t> [63] y)))
+	// cond: cc.(Op) == OpARM64LessThanU
+	// result: (ROR x (NEG <t> y))
+	for {
+		_ = v.Args[1]
+		v_0 := v.Args[0]
+		if v_0.Op != OpARM64CSEL0 {
+			break
+		}
+		if v_0.Type != typ.UInt64 {
+			break
+		}
+		cc := v_0.Aux
+		_ = v_0.Args[1]
+		v_0_0 := v_0.Args[0]
+		if v_0_0.Op != OpARM64SRL {
+			break
+		}
+		if v_0_0.Type != typ.UInt64 {
+			break
+		}
+		_ = v_0_0.Args[1]
+		x := v_0_0.Args[0]
+		v_0_0_1 := v_0_0.Args[1]
+		if v_0_0_1.Op != OpARM64SUB {
+			break
+		}
+		t := v_0_0_1.Type
+		_ = v_0_0_1.Args[1]
+		v_0_0_1_0 := v_0_0_1.Args[0]
+		if v_0_0_1_0.Op != OpARM64MOVDconst {
+			break
+		}
+		if v_0_0_1_0.AuxInt != 64 {
+			break
+		}
+		v_0_0_1_1 := v_0_0_1.Args[1]
+		if v_0_0_1_1.Op != OpARM64ANDconst {
+			break
+		}
+		if v_0_0_1_1.Type != t {
+			break
+		}
+		if v_0_0_1_1.AuxInt != 63 {
+			break
+		}
+		y := v_0_0_1_1.Args[0]
+		v_0_1 := v_0.Args[1]
+		if v_0_1.Op != OpARM64CMPconst {
+			break
+		}
+		if v_0_1.AuxInt != 64 {
+			break
+		}
+		v_0_1_0 := v_0_1.Args[0]
+		if v_0_1_0.Op != OpARM64SUB {
+			break
+		}
+		if v_0_1_0.Type != t {
+			break
+		}
+		_ = v_0_1_0.Args[1]
+		v_0_1_0_0 := v_0_1_0.Args[0]
+		if v_0_1_0_0.Op != OpARM64MOVDconst {
+			break
+		}
+		if v_0_1_0_0.AuxInt != 64 {
+			break
+		}
+		v_0_1_0_1 := v_0_1_0.Args[1]
+		if v_0_1_0_1.Op != OpARM64ANDconst {
+			break
+		}
+		if v_0_1_0_1.Type != t {
+			break
+		}
+		if v_0_1_0_1.AuxInt != 63 {
+			break
+		}
+		if y != v_0_1_0_1.Args[0] {
+			break
+		}
+		v_1 := v.Args[1]
+		if v_1.Op != OpARM64SLL {
+			break
+		}
+		_ = v_1.Args[1]
+		if x != v_1.Args[0] {
+			break
+		}
+		v_1_1 := v_1.Args[1]
+		if v_1_1.Op != OpARM64ANDconst {
+			break
+		}
+		if v_1_1.Type != t {
+			break
+		}
+		if v_1_1.AuxInt != 63 {
+			break
+		}
+		if y != v_1_1.Args[0] {
+			break
+		}
+		if !(cc.(Op) == OpARM64LessThanU) {
+			break
+		}
+		v.reset(OpARM64ROR)
+		v.AddArg(x)
+		v0 := b.NewValue0(v.Pos, OpARM64NEG, t)
+		v0.AddArg(y)
+		v.AddArg(v0)
+		return true
+	}
+	return false
+}
+func rewriteValueARM64_OpARM64ADD_20(v *Value) bool {
+	b := v.Block
+	_ = b
+	typ := &b.Func.Config.Types
+	_ = typ
+	// match: (ADD (SRL <typ.UInt64> x (ANDconst <t> [63] y)) (CSEL0 <typ.UInt64> {cc} (SLL x (SUB <t> (MOVDconst [64]) (ANDconst <t> [63] y))) (CMPconst [64] (SUB <t> (MOVDconst [64]) (ANDconst <t> [63] y)))))
+	// cond: cc.(Op) == OpARM64LessThanU
+	// result: (ROR x y)
+	for {
+		_ = v.Args[1]
+		v_0 := v.Args[0]
+		if v_0.Op != OpARM64SRL {
+			break
+		}
+		if v_0.Type != typ.UInt64 {
+			break
+		}
+		_ = v_0.Args[1]
+		x := v_0.Args[0]
+		v_0_1 := v_0.Args[1]
+		if v_0_1.Op != OpARM64ANDconst {
+			break
+		}
+		t := v_0_1.Type
+		if v_0_1.AuxInt != 63 {
+			break
+		}
+		y := v_0_1.Args[0]
+		v_1 := v.Args[1]
+		if v_1.Op != OpARM64CSEL0 {
+			break
+		}
+		if v_1.Type != typ.UInt64 {
+			break
+		}
+		cc := v_1.Aux
+		_ = v_1.Args[1]
+		v_1_0 := v_1.Args[0]
+		if v_1_0.Op != OpARM64SLL {
+			break
+		}
+		_ = v_1_0.Args[1]
+		if x != v_1_0.Args[0] {
+			break
+		}
+		v_1_0_1 := v_1_0.Args[1]
+		if v_1_0_1.Op != OpARM64SUB {
+			break
+		}
+		if v_1_0_1.Type != t {
+			break
+		}
+		_ = v_1_0_1.Args[1]
+		v_1_0_1_0 := v_1_0_1.Args[0]
+		if v_1_0_1_0.Op != OpARM64MOVDconst {
+			break
+		}
+		if v_1_0_1_0.AuxInt != 64 {
+			break
+		}
+		v_1_0_1_1 := v_1_0_1.Args[1]
+		if v_1_0_1_1.Op != OpARM64ANDconst {
+			break
+		}
+		if v_1_0_1_1.Type != t {
+			break
+		}
+		if v_1_0_1_1.AuxInt != 63 {
+			break
+		}
+		if y != v_1_0_1_1.Args[0] {
+			break
+		}
+		v_1_1 := v_1.Args[1]
+		if v_1_1.Op != OpARM64CMPconst {
+			break
+		}
+		if v_1_1.AuxInt != 64 {
+			break
+		}
+		v_1_1_0 := v_1_1.Args[0]
+		if v_1_1_0.Op != OpARM64SUB {
+			break
+		}
+		if v_1_1_0.Type != t {
+			break
+		}
+		_ = v_1_1_0.Args[1]
+		v_1_1_0_0 := v_1_1_0.Args[0]
+		if v_1_1_0_0.Op != OpARM64MOVDconst {
+			break
+		}
+		if v_1_1_0_0.AuxInt != 64 {
+			break
+		}
+		v_1_1_0_1 := v_1_1_0.Args[1]
+		if v_1_1_0_1.Op != OpARM64ANDconst {
+			break
+		}
+		if v_1_1_0_1.Type != t {
+			break
+		}
+		if v_1_1_0_1.AuxInt != 63 {
+			break
+		}
+		if y != v_1_1_0_1.Args[0] {
+			break
+		}
+		if !(cc.(Op) == OpARM64LessThanU) {
+			break
+		}
+		v.reset(OpARM64ROR)
+		v.AddArg(x)
+		v.AddArg(y)
+		return true
+	}
+	// match: (ADD (CSEL0 <typ.UInt64> {cc} (SLL x (SUB <t> (MOVDconst [64]) (ANDconst <t> [63] y))) (CMPconst [64] (SUB <t> (MOVDconst [64]) (ANDconst <t> [63] y)))) (SRL <typ.UInt64> x (ANDconst <t> [63] y)))
+	// cond: cc.(Op) == OpARM64LessThanU
+	// result: (ROR x y)
+	for {
+		_ = v.Args[1]
+		v_0 := v.Args[0]
+		if v_0.Op != OpARM64CSEL0 {
+			break
+		}
+		if v_0.Type != typ.UInt64 {
+			break
+		}
+		cc := v_0.Aux
+		_ = v_0.Args[1]
+		v_0_0 := v_0.Args[0]
+		if v_0_0.Op != OpARM64SLL {
+			break
+		}
+		_ = v_0_0.Args[1]
+		x := v_0_0.Args[0]
+		v_0_0_1 := v_0_0.Args[1]
+		if v_0_0_1.Op != OpARM64SUB {
+			break
+		}
+		t := v_0_0_1.Type
+		_ = v_0_0_1.Args[1]
+		v_0_0_1_0 := v_0_0_1.Args[0]
+		if v_0_0_1_0.Op != OpARM64MOVDconst {
+			break
+		}
+		if v_0_0_1_0.AuxInt != 64 {
+			break
+		}
+		v_0_0_1_1 := v_0_0_1.Args[1]
+		if v_0_0_1_1.Op != OpARM64ANDconst {
+			break
+		}
+		if v_0_0_1_1.Type != t {
+			break
+		}
+		if v_0_0_1_1.AuxInt != 63 {
+			break
+		}
+		y := v_0_0_1_1.Args[0]
+		v_0_1 := v_0.Args[1]
+		if v_0_1.Op != OpARM64CMPconst {
+			break
+		}
+		if v_0_1.AuxInt != 64 {
+			break
+		}
+		v_0_1_0 := v_0_1.Args[0]
+		if v_0_1_0.Op != OpARM64SUB {
+			break
+		}
+		if v_0_1_0.Type != t {
+			break
+		}
+		_ = v_0_1_0.Args[1]
+		v_0_1_0_0 := v_0_1_0.Args[0]
+		if v_0_1_0_0.Op != OpARM64MOVDconst {
+			break
+		}
+		if v_0_1_0_0.AuxInt != 64 {
+			break
+		}
+		v_0_1_0_1 := v_0_1_0.Args[1]
+		if v_0_1_0_1.Op != OpARM64ANDconst {
+			break
+		}
+		if v_0_1_0_1.Type != t {
+			break
+		}
+		if v_0_1_0_1.AuxInt != 63 {
+			break
+		}
+		if y != v_0_1_0_1.Args[0] {
+			break
+		}
+		v_1 := v.Args[1]
+		if v_1.Op != OpARM64SRL {
+			break
+		}
+		if v_1.Type != typ.UInt64 {
+			break
+		}
+		_ = v_1.Args[1]
+		if x != v_1.Args[0] {
+			break
+		}
+		v_1_1 := v_1.Args[1]
+		if v_1_1.Op != OpARM64ANDconst {
+			break
+		}
+		if v_1_1.Type != t {
+			break
+		}
+		if v_1_1.AuxInt != 63 {
+			break
+		}
+		if y != v_1_1.Args[0] {
+			break
+		}
+		if !(cc.(Op) == OpARM64LessThanU) {
+			break
+		}
+		v.reset(OpARM64ROR)
+		v.AddArg(x)
+		v.AddArg(y)
+		return true
+	}
+	// match: (ADD (SLL x (ANDconst <t> [31] y)) (CSEL0 <typ.UInt32> {cc} (SRL <typ.UInt32> (MOVWUreg x) (SUB <t> (MOVDconst [32]) (ANDconst <t> [31] y))) (CMPconst [64] (SUB <t> (MOVDconst [32]) (ANDconst <t> [31] y)))))
+	// cond: cc.(Op) == OpARM64LessThanU
+	// result: (RORW x (NEG <t> y))
+	for {
+		_ = v.Args[1]
+		v_0 := v.Args[0]
+		if v_0.Op != OpARM64SLL {
+			break
+		}
+		_ = v_0.Args[1]
+		x := v_0.Args[0]
+		v_0_1 := v_0.Args[1]
+		if v_0_1.Op != OpARM64ANDconst {
+			break
+		}
+		t := v_0_1.Type
+		if v_0_1.AuxInt != 31 {
+			break
+		}
+		y := v_0_1.Args[0]
+		v_1 := v.Args[1]
+		if v_1.Op != OpARM64CSEL0 {
+			break
+		}
+		if v_1.Type != typ.UInt32 {
+			break
+		}
+		cc := v_1.Aux
+		_ = v_1.Args[1]
+		v_1_0 := v_1.Args[0]
+		if v_1_0.Op != OpARM64SRL {
+			break
+		}
+		if v_1_0.Type != typ.UInt32 {
+			break
+		}
+		_ = v_1_0.Args[1]
+		v_1_0_0 := v_1_0.Args[0]
+		if v_1_0_0.Op != OpARM64MOVWUreg {
+			break
+		}
+		if x != v_1_0_0.Args[0] {
+			break
+		}
+		v_1_0_1 := v_1_0.Args[1]
+		if v_1_0_1.Op != OpARM64SUB {
+			break
+		}
+		if v_1_0_1.Type != t {
+			break
+		}
+		_ = v_1_0_1.Args[1]
+		v_1_0_1_0 := v_1_0_1.Args[0]
+		if v_1_0_1_0.Op != OpARM64MOVDconst {
+			break
+		}
+		if v_1_0_1_0.AuxInt != 32 {
+			break
+		}
+		v_1_0_1_1 := v_1_0_1.Args[1]
+		if v_1_0_1_1.Op != OpARM64ANDconst {
+			break
+		}
+		if v_1_0_1_1.Type != t {
+			break
+		}
+		if v_1_0_1_1.AuxInt != 31 {
+			break
+		}
+		if y != v_1_0_1_1.Args[0] {
+			break
+		}
+		v_1_1 := v_1.Args[1]
+		if v_1_1.Op != OpARM64CMPconst {
+			break
+		}
+		if v_1_1.AuxInt != 64 {
+			break
+		}
+		v_1_1_0 := v_1_1.Args[0]
+		if v_1_1_0.Op != OpARM64SUB {
+			break
+		}
+		if v_1_1_0.Type != t {
+			break
+		}
+		_ = v_1_1_0.Args[1]
+		v_1_1_0_0 := v_1_1_0.Args[0]
+		if v_1_1_0_0.Op != OpARM64MOVDconst {
+			break
+		}
+		if v_1_1_0_0.AuxInt != 32 {
+			break
+		}
+		v_1_1_0_1 := v_1_1_0.Args[1]
+		if v_1_1_0_1.Op != OpARM64ANDconst {
+			break
+		}
+		if v_1_1_0_1.Type != t {
+			break
+		}
+		if v_1_1_0_1.AuxInt != 31 {
+			break
+		}
+		if y != v_1_1_0_1.Args[0] {
+			break
+		}
+		if !(cc.(Op) == OpARM64LessThanU) {
+			break
+		}
+		v.reset(OpARM64RORW)
+		v.AddArg(x)
+		v0 := b.NewValue0(v.Pos, OpARM64NEG, t)
+		v0.AddArg(y)
+		v.AddArg(v0)
+		return true
+	}
+	// match: (ADD (CSEL0 <typ.UInt32> {cc} (SRL <typ.UInt32> (MOVWUreg x) (SUB <t> (MOVDconst [32]) (ANDconst <t> [31] y))) (CMPconst [64] (SUB <t> (MOVDconst [32]) (ANDconst <t> [31] y)))) (SLL x (ANDconst <t> [31] y)))
+	// cond: cc.(Op) == OpARM64LessThanU
+	// result: (RORW x (NEG <t> y))
+	for {
+		_ = v.Args[1]
+		v_0 := v.Args[0]
+		if v_0.Op != OpARM64CSEL0 {
+			break
+		}
+		if v_0.Type != typ.UInt32 {
+			break
+		}
+		cc := v_0.Aux
+		_ = v_0.Args[1]
+		v_0_0 := v_0.Args[0]
+		if v_0_0.Op != OpARM64SRL {
+			break
+		}
+		if v_0_0.Type != typ.UInt32 {
+			break
+		}
+		_ = v_0_0.Args[1]
+		v_0_0_0 := v_0_0.Args[0]
+		if v_0_0_0.Op != OpARM64MOVWUreg {
+			break
+		}
+		x := v_0_0_0.Args[0]
+		v_0_0_1 := v_0_0.Args[1]
+		if v_0_0_1.Op != OpARM64SUB {
+			break
+		}
+		t := v_0_0_1.Type
+		_ = v_0_0_1.Args[1]
+		v_0_0_1_0 := v_0_0_1.Args[0]
+		if v_0_0_1_0.Op != OpARM64MOVDconst {
+			break
+		}
+		if v_0_0_1_0.AuxInt != 32 {
+			break
+		}
+		v_0_0_1_1 := v_0_0_1.Args[1]
+		if v_0_0_1_1.Op != OpARM64ANDconst {
+			break
+		}
+		if v_0_0_1_1.Type != t {
+			break
+		}
+		if v_0_0_1_1.AuxInt != 31 {
+			break
+		}
+		y := v_0_0_1_1.Args[0]
+		v_0_1 := v_0.Args[1]
+		if v_0_1.Op != OpARM64CMPconst {
+			break
+		}
+		if v_0_1.AuxInt != 64 {
+			break
+		}
+		v_0_1_0 := v_0_1.Args[0]
+		if v_0_1_0.Op != OpARM64SUB {
+			break
+		}
+		if v_0_1_0.Type != t {
+			break
+		}
+		_ = v_0_1_0.Args[1]
+		v_0_1_0_0 := v_0_1_0.Args[0]
+		if v_0_1_0_0.Op != OpARM64MOVDconst {
+			break
+		}
+		if v_0_1_0_0.AuxInt != 32 {
+			break
+		}
+		v_0_1_0_1 := v_0_1_0.Args[1]
+		if v_0_1_0_1.Op != OpARM64ANDconst {
+			break
+		}
+		if v_0_1_0_1.Type != t {
+			break
+		}
+		if v_0_1_0_1.AuxInt != 31 {
+			break
+		}
+		if y != v_0_1_0_1.Args[0] {
+			break
+		}
+		v_1 := v.Args[1]
+		if v_1.Op != OpARM64SLL {
+			break
+		}
+		_ = v_1.Args[1]
+		if x != v_1.Args[0] {
+			break
+		}
+		v_1_1 := v_1.Args[1]
+		if v_1_1.Op != OpARM64ANDconst {
+			break
+		}
+		if v_1_1.Type != t {
+			break
+		}
+		if v_1_1.AuxInt != 31 {
+			break
+		}
+		if y != v_1_1.Args[0] {
+			break
+		}
+		if !(cc.(Op) == OpARM64LessThanU) {
+			break
+		}
+		v.reset(OpARM64RORW)
+		v.AddArg(x)
+		v0 := b.NewValue0(v.Pos, OpARM64NEG, t)
+		v0.AddArg(y)
+		v.AddArg(v0)
+		return true
+	}
+	// match: (ADD (SRL <typ.UInt32> (MOVWUreg x) (ANDconst <t> [31] y)) (CSEL0 <typ.UInt32> {cc} (SLL x (SUB <t> (MOVDconst [32]) (ANDconst <t> [31] y))) (CMPconst [64] (SUB <t> (MOVDconst [32]) (ANDconst <t> [31] y)))))
+	// cond: cc.(Op) == OpARM64LessThanU
+	// result: (RORW x y)
+	for {
+		_ = v.Args[1]
+		v_0 := v.Args[0]
+		if v_0.Op != OpARM64SRL {
+			break
+		}
+		if v_0.Type != typ.UInt32 {
+			break
+		}
+		_ = v_0.Args[1]
+		v_0_0 := v_0.Args[0]
+		if v_0_0.Op != OpARM64MOVWUreg {
+			break
+		}
+		x := v_0_0.Args[0]
+		v_0_1 := v_0.Args[1]
+		if v_0_1.Op != OpARM64ANDconst {
+			break
+		}
+		t := v_0_1.Type
+		if v_0_1.AuxInt != 31 {
+			break
+		}
+		y := v_0_1.Args[0]
+		v_1 := v.Args[1]
+		if v_1.Op != OpARM64CSEL0 {
+			break
+		}
+		if v_1.Type != typ.UInt32 {
+			break
+		}
+		cc := v_1.Aux
+		_ = v_1.Args[1]
+		v_1_0 := v_1.Args[0]
+		if v_1_0.Op != OpARM64SLL {
+			break
+		}
+		_ = v_1_0.Args[1]
+		if x != v_1_0.Args[0] {
+			break
+		}
+		v_1_0_1 := v_1_0.Args[1]
+		if v_1_0_1.Op != OpARM64SUB {
+			break
+		}
+		if v_1_0_1.Type != t {
+			break
+		}
+		_ = v_1_0_1.Args[1]
+		v_1_0_1_0 := v_1_0_1.Args[0]
+		if v_1_0_1_0.Op != OpARM64MOVDconst {
+			break
+		}
+		if v_1_0_1_0.AuxInt != 32 {
+			break
+		}
+		v_1_0_1_1 := v_1_0_1.Args[1]
+		if v_1_0_1_1.Op != OpARM64ANDconst {
+			break
+		}
+		if v_1_0_1_1.Type != t {
+			break
+		}
+		if v_1_0_1_1.AuxInt != 31 {
+			break
+		}
+		if y != v_1_0_1_1.Args[0] {
+			break
+		}
+		v_1_1 := v_1.Args[1]
+		if v_1_1.Op != OpARM64CMPconst {
+			break
+		}
+		if v_1_1.AuxInt != 64 {
+			break
+		}
+		v_1_1_0 := v_1_1.Args[0]
+		if v_1_1_0.Op != OpARM64SUB {
+			break
+		}
+		if v_1_1_0.Type != t {
+			break
+		}
+		_ = v_1_1_0.Args[1]
+		v_1_1_0_0 := v_1_1_0.Args[0]
+		if v_1_1_0_0.Op != OpARM64MOVDconst {
+			break
+		}
+		if v_1_1_0_0.AuxInt != 32 {
+			break
+		}
+		v_1_1_0_1 := v_1_1_0.Args[1]
+		if v_1_1_0_1.Op != OpARM64ANDconst {
+			break
+		}
+		if v_1_1_0_1.Type != t {
+			break
+		}
+		if v_1_1_0_1.AuxInt != 31 {
+			break
+		}
+		if y != v_1_1_0_1.Args[0] {
+			break
+		}
+		if !(cc.(Op) == OpARM64LessThanU) {
+			break
+		}
+		v.reset(OpARM64RORW)
+		v.AddArg(x)
+		v.AddArg(y)
+		return true
+	}
+	// match: (ADD (CSEL0 <typ.UInt32> {cc} (SLL x (SUB <t> (MOVDconst [32]) (ANDconst <t> [31] y))) (CMPconst [64] (SUB <t> (MOVDconst [32]) (ANDconst <t> [31] y)))) (SRL <typ.UInt32> (MOVWUreg x) (ANDconst <t> [31] y)))
+	// cond: cc.(Op) == OpARM64LessThanU
+	// result: (RORW x y)
+	for {
+		_ = v.Args[1]
+		v_0 := v.Args[0]
+		if v_0.Op != OpARM64CSEL0 {
+			break
+		}
+		if v_0.Type != typ.UInt32 {
+			break
+		}
+		cc := v_0.Aux
+		_ = v_0.Args[1]
+		v_0_0 := v_0.Args[0]
+		if v_0_0.Op != OpARM64SLL {
+			break
+		}
+		_ = v_0_0.Args[1]
+		x := v_0_0.Args[0]
+		v_0_0_1 := v_0_0.Args[1]
+		if v_0_0_1.Op != OpARM64SUB {
+			break
+		}
+		t := v_0_0_1.Type
+		_ = v_0_0_1.Args[1]
+		v_0_0_1_0 := v_0_0_1.Args[0]
+		if v_0_0_1_0.Op != OpARM64MOVDconst {
+			break
+		}
+		if v_0_0_1_0.AuxInt != 32 {
+			break
+		}
+		v_0_0_1_1 := v_0_0_1.Args[1]
+		if v_0_0_1_1.Op != OpARM64ANDconst {
+			break
+		}
+		if v_0_0_1_1.Type != t {
+			break
+		}
+		if v_0_0_1_1.AuxInt != 31 {
+			break
+		}
+		y := v_0_0_1_1.Args[0]
+		v_0_1 := v_0.Args[1]
+		if v_0_1.Op != OpARM64CMPconst {
+			break
+		}
+		if v_0_1.AuxInt != 64 {
+			break
+		}
+		v_0_1_0 := v_0_1.Args[0]
+		if v_0_1_0.Op != OpARM64SUB {
+			break
+		}
+		if v_0_1_0.Type != t {
+			break
+		}
+		_ = v_0_1_0.Args[1]
+		v_0_1_0_0 := v_0_1_0.Args[0]
+		if v_0_1_0_0.Op != OpARM64MOVDconst {
+			break
+		}
+		if v_0_1_0_0.AuxInt != 32 {
+			break
+		}
+		v_0_1_0_1 := v_0_1_0.Args[1]
+		if v_0_1_0_1.Op != OpARM64ANDconst {
+			break
+		}
+		if v_0_1_0_1.Type != t {
+			break
+		}
+		if v_0_1_0_1.AuxInt != 31 {
+			break
+		}
+		if y != v_0_1_0_1.Args[0] {
+			break
+		}
+		v_1 := v.Args[1]
+		if v_1.Op != OpARM64SRL {
+			break
+		}
+		if v_1.Type != typ.UInt32 {
+			break
+		}
+		_ = v_1.Args[1]
+		v_1_0 := v_1.Args[0]
+		if v_1_0.Op != OpARM64MOVWUreg {
+			break
+		}
+		if x != v_1_0.Args[0] {
+			break
+		}
+		v_1_1 := v_1.Args[1]
+		if v_1_1.Op != OpARM64ANDconst {
+			break
+		}
+		if v_1_1.Type != t {
+			break
+		}
+		if v_1_1.AuxInt != 31 {
+			break
+		}
+		if y != v_1_1.Args[0] {
+			break
+		}
+		if !(cc.(Op) == OpARM64LessThanU) {
+			break
+		}
+		v.reset(OpARM64RORW)
+		v.AddArg(x)
+		v.AddArg(y)
+		return true
+	}
 	return false
 }
 func rewriteValueARM64_OpARM64ADDconst_0(v *Value) bool {
@@ -17135,6 +18066,8 @@
 func rewriteValueARM64_OpARM64OR_10(v *Value) bool {
 	b := v.Block
 	_ = b
+	typ := &b.Func.Config.Types
+	_ = typ
 	// match: (OR x1:(SRAconst [c] y) x0)
 	// cond: clobberIfDead(x1)
 	// result: (ORshiftRA x0 y [c])
@@ -17156,6 +18089,918 @@
 		v.AddArg(y)
 		return true
 	}
+	// match: (OR (SLL x (ANDconst <t> [63] y)) (CSEL0 <typ.UInt64> {cc} (SRL <typ.UInt64> x (SUB <t> (MOVDconst [64]) (ANDconst <t> [63] y))) (CMPconst [64] (SUB <t> (MOVDconst [64]) (ANDconst <t> [63] y)))))
+	// cond: cc.(Op) == OpARM64LessThanU
+	// result: (ROR x (NEG <t> y))
+	for {
+		_ = v.Args[1]
+		v_0 := v.Args[0]
+		if v_0.Op != OpARM64SLL {
+			break
+		}
+		_ = v_0.Args[1]
+		x := v_0.Args[0]
+		v_0_1 := v_0.Args[1]
+		if v_0_1.Op != OpARM64ANDconst {
+			break
+		}
+		t := v_0_1.Type
+		if v_0_1.AuxInt != 63 {
+			break
+		}
+		y := v_0_1.Args[0]
+		v_1 := v.Args[1]
+		if v_1.Op != OpARM64CSEL0 {
+			break
+		}
+		if v_1.Type != typ.UInt64 {
+			break
+		}
+		cc := v_1.Aux
+		_ = v_1.Args[1]
+		v_1_0 := v_1.Args[0]
+		if v_1_0.Op != OpARM64SRL {
+			break
+		}
+		if v_1_0.Type != typ.UInt64 {
+			break
+		}
+		_ = v_1_0.Args[1]
+		if x != v_1_0.Args[0] {
+			break
+		}
+		v_1_0_1 := v_1_0.Args[1]
+		if v_1_0_1.Op != OpARM64SUB {
+			break
+		}
+		if v_1_0_1.Type != t {
+			break
+		}
+		_ = v_1_0_1.Args[1]
+		v_1_0_1_0 := v_1_0_1.Args[0]
+		if v_1_0_1_0.Op != OpARM64MOVDconst {
+			break
+		}
+		if v_1_0_1_0.AuxInt != 64 {
+			break
+		}
+		v_1_0_1_1 := v_1_0_1.Args[1]
+		if v_1_0_1_1.Op != OpARM64ANDconst {
+			break
+		}
+		if v_1_0_1_1.Type != t {
+			break
+		}
+		if v_1_0_1_1.AuxInt != 63 {
+			break
+		}
+		if y != v_1_0_1_1.Args[0] {
+			break
+		}
+		v_1_1 := v_1.Args[1]
+		if v_1_1.Op != OpARM64CMPconst {
+			break
+		}
+		if v_1_1.AuxInt != 64 {
+			break
+		}
+		v_1_1_0 := v_1_1.Args[0]
+		if v_1_1_0.Op != OpARM64SUB {
+			break
+		}
+		if v_1_1_0.Type != t {
+			break
+		}
+		_ = v_1_1_0.Args[1]
+		v_1_1_0_0 := v_1_1_0.Args[0]
+		if v_1_1_0_0.Op != OpARM64MOVDconst {
+			break
+		}
+		if v_1_1_0_0.AuxInt != 64 {
+			break
+		}
+		v_1_1_0_1 := v_1_1_0.Args[1]
+		if v_1_1_0_1.Op != OpARM64ANDconst {
+			break
+		}
+		if v_1_1_0_1.Type != t {
+			break
+		}
+		if v_1_1_0_1.AuxInt != 63 {
+			break
+		}
+		if y != v_1_1_0_1.Args[0] {
+			break
+		}
+		if !(cc.(Op) == OpARM64LessThanU) {
+			break
+		}
+		v.reset(OpARM64ROR)
+		v.AddArg(x)
+		v0 := b.NewValue0(v.Pos, OpARM64NEG, t)
+		v0.AddArg(y)
+		v.AddArg(v0)
+		return true
+	}
+	// match: (OR (CSEL0 <typ.UInt64> {cc} (SRL <typ.UInt64> x (SUB <t> (MOVDconst [64]) (ANDconst <t> [63] y))) (CMPconst [64] (SUB <t> (MOVDconst [64]) (ANDconst <t> [63] y)))) (SLL x (ANDconst <t> [63] y)))
+	// cond: cc.(Op) == OpARM64LessThanU
+	// result: (ROR x (NEG <t> y))
+	for {
+		_ = v.Args[1]
+		v_0 := v.Args[0]
+		if v_0.Op != OpARM64CSEL0 {
+			break
+		}
+		if v_0.Type != typ.UInt64 {
+			break
+		}
+		cc := v_0.Aux
+		_ = v_0.Args[1]
+		v_0_0 := v_0.Args[0]
+		if v_0_0.Op != OpARM64SRL {
+			break
+		}
+		if v_0_0.Type != typ.UInt64 {
+			break
+		}
+		_ = v_0_0.Args[1]
+		x := v_0_0.Args[0]
+		v_0_0_1 := v_0_0.Args[1]
+		if v_0_0_1.Op != OpARM64SUB {
+			break
+		}
+		t := v_0_0_1.Type
+		_ = v_0_0_1.Args[1]
+		v_0_0_1_0 := v_0_0_1.Args[0]
+		if v_0_0_1_0.Op != OpARM64MOVDconst {
+			break
+		}
+		if v_0_0_1_0.AuxInt != 64 {
+			break
+		}
+		v_0_0_1_1 := v_0_0_1.Args[1]
+		if v_0_0_1_1.Op != OpARM64ANDconst {
+			break
+		}
+		if v_0_0_1_1.Type != t {
+			break
+		}
+		if v_0_0_1_1.AuxInt != 63 {
+			break
+		}
+		y := v_0_0_1_1.Args[0]
+		v_0_1 := v_0.Args[1]
+		if v_0_1.Op != OpARM64CMPconst {
+			break
+		}
+		if v_0_1.AuxInt != 64 {
+			break
+		}
+		v_0_1_0 := v_0_1.Args[0]
+		if v_0_1_0.Op != OpARM64SUB {
+			break
+		}
+		if v_0_1_0.Type != t {
+			break
+		}
+		_ = v_0_1_0.Args[1]
+		v_0_1_0_0 := v_0_1_0.Args[0]
+		if v_0_1_0_0.Op != OpARM64MOVDconst {
+			break
+		}
+		if v_0_1_0_0.AuxInt != 64 {
+			break
+		}
+		v_0_1_0_1 := v_0_1_0.Args[1]
+		if v_0_1_0_1.Op != OpARM64ANDconst {
+			break
+		}
+		if v_0_1_0_1.Type != t {
+			break
+		}
+		if v_0_1_0_1.AuxInt != 63 {
+			break
+		}
+		if y != v_0_1_0_1.Args[0] {
+			break
+		}
+		v_1 := v.Args[1]
+		if v_1.Op != OpARM64SLL {
+			break
+		}
+		_ = v_1.Args[1]
+		if x != v_1.Args[0] {
+			break
+		}
+		v_1_1 := v_1.Args[1]
+		if v_1_1.Op != OpARM64ANDconst {
+			break
+		}
+		if v_1_1.Type != t {
+			break
+		}
+		if v_1_1.AuxInt != 63 {
+			break
+		}
+		if y != v_1_1.Args[0] {
+			break
+		}
+		if !(cc.(Op) == OpARM64LessThanU) {
+			break
+		}
+		v.reset(OpARM64ROR)
+		v.AddArg(x)
+		v0 := b.NewValue0(v.Pos, OpARM64NEG, t)
+		v0.AddArg(y)
+		v.AddArg(v0)
+		return true
+	}
+	// match: (OR (SRL <typ.UInt64> x (ANDconst <t> [63] y)) (CSEL0 <typ.UInt64> {cc} (SLL x (SUB <t> (MOVDconst [64]) (ANDconst <t> [63] y))) (CMPconst [64] (SUB <t> (MOVDconst [64]) (ANDconst <t> [63] y)))))
+	// cond: cc.(Op) == OpARM64LessThanU
+	// result: (ROR x y)
+	for {
+		_ = v.Args[1]
+		v_0 := v.Args[0]
+		if v_0.Op != OpARM64SRL {
+			break
+		}
+		if v_0.Type != typ.UInt64 {
+			break
+		}
+		_ = v_0.Args[1]
+		x := v_0.Args[0]
+		v_0_1 := v_0.Args[1]
+		if v_0_1.Op != OpARM64ANDconst {
+			break
+		}
+		t := v_0_1.Type
+		if v_0_1.AuxInt != 63 {
+			break
+		}
+		y := v_0_1.Args[0]
+		v_1 := v.Args[1]
+		if v_1.Op != OpARM64CSEL0 {
+			break
+		}
+		if v_1.Type != typ.UInt64 {
+			break
+		}
+		cc := v_1.Aux
+		_ = v_1.Args[1]
+		v_1_0 := v_1.Args[0]
+		if v_1_0.Op != OpARM64SLL {
+			break
+		}
+		_ = v_1_0.Args[1]
+		if x != v_1_0.Args[0] {
+			break
+		}
+		v_1_0_1 := v_1_0.Args[1]
+		if v_1_0_1.Op != OpARM64SUB {
+			break
+		}
+		if v_1_0_1.Type != t {
+			break
+		}
+		_ = v_1_0_1.Args[1]
+		v_1_0_1_0 := v_1_0_1.Args[0]
+		if v_1_0_1_0.Op != OpARM64MOVDconst {
+			break
+		}
+		if v_1_0_1_0.AuxInt != 64 {
+			break
+		}
+		v_1_0_1_1 := v_1_0_1.Args[1]
+		if v_1_0_1_1.Op != OpARM64ANDconst {
+			break
+		}
+		if v_1_0_1_1.Type != t {
+			break
+		}
+		if v_1_0_1_1.AuxInt != 63 {
+			break
+		}
+		if y != v_1_0_1_1.Args[0] {
+			break
+		}
+		v_1_1 := v_1.Args[1]
+		if v_1_1.Op != OpARM64CMPconst {
+			break
+		}
+		if v_1_1.AuxInt != 64 {
+			break
+		}
+		v_1_1_0 := v_1_1.Args[0]
+		if v_1_1_0.Op != OpARM64SUB {
+			break
+		}
+		if v_1_1_0.Type != t {
+			break
+		}
+		_ = v_1_1_0.Args[1]
+		v_1_1_0_0 := v_1_1_0.Args[0]
+		if v_1_1_0_0.Op != OpARM64MOVDconst {
+			break
+		}
+		if v_1_1_0_0.AuxInt != 64 {
+			break
+		}
+		v_1_1_0_1 := v_1_1_0.Args[1]
+		if v_1_1_0_1.Op != OpARM64ANDconst {
+			break
+		}
+		if v_1_1_0_1.Type != t {
+			break
+		}
+		if v_1_1_0_1.AuxInt != 63 {
+			break
+		}
+		if y != v_1_1_0_1.Args[0] {
+			break
+		}
+		if !(cc.(Op) == OpARM64LessThanU) {
+			break
+		}
+		v.reset(OpARM64ROR)
+		v.AddArg(x)
+		v.AddArg(y)
+		return true
+	}
+	// match: (OR (CSEL0 <typ.UInt64> {cc} (SLL x (SUB <t> (MOVDconst [64]) (ANDconst <t> [63] y))) (CMPconst [64] (SUB <t> (MOVDconst [64]) (ANDconst <t> [63] y)))) (SRL <typ.UInt64> x (ANDconst <t> [63] y)))
+	// cond: cc.(Op) == OpARM64LessThanU
+	// result: (ROR x y)
+	for {
+		_ = v.Args[1]
+		v_0 := v.Args[0]
+		if v_0.Op != OpARM64CSEL0 {
+			break
+		}
+		if v_0.Type != typ.UInt64 {
+			break
+		}
+		cc := v_0.Aux
+		_ = v_0.Args[1]
+		v_0_0 := v_0.Args[0]
+		if v_0_0.Op != OpARM64SLL {
+			break
+		}
+		_ = v_0_0.Args[1]
+		x := v_0_0.Args[0]
+		v_0_0_1 := v_0_0.Args[1]
+		if v_0_0_1.Op != OpARM64SUB {
+			break
+		}
+		t := v_0_0_1.Type
+		_ = v_0_0_1.Args[1]
+		v_0_0_1_0 := v_0_0_1.Args[0]
+		if v_0_0_1_0.Op != OpARM64MOVDconst {
+			break
+		}
+		if v_0_0_1_0.AuxInt != 64 {
+			break
+		}
+		v_0_0_1_1 := v_0_0_1.Args[1]
+		if v_0_0_1_1.Op != OpARM64ANDconst {
+			break
+		}
+		if v_0_0_1_1.Type != t {
+			break
+		}
+		if v_0_0_1_1.AuxInt != 63 {
+			break
+		}
+		y := v_0_0_1_1.Args[0]
+		v_0_1 := v_0.Args[1]
+		if v_0_1.Op != OpARM64CMPconst {
+			break
+		}
+		if v_0_1.AuxInt != 64 {
+			break
+		}
+		v_0_1_0 := v_0_1.Args[0]
+		if v_0_1_0.Op != OpARM64SUB {
+			break
+		}
+		if v_0_1_0.Type != t {
+			break
+		}
+		_ = v_0_1_0.Args[1]
+		v_0_1_0_0 := v_0_1_0.Args[0]
+		if v_0_1_0_0.Op != OpARM64MOVDconst {
+			break
+		}
+		if v_0_1_0_0.AuxInt != 64 {
+			break
+		}
+		v_0_1_0_1 := v_0_1_0.Args[1]
+		if v_0_1_0_1.Op != OpARM64ANDconst {
+			break
+		}
+		if v_0_1_0_1.Type != t {
+			break
+		}
+		if v_0_1_0_1.AuxInt != 63 {
+			break
+		}
+		if y != v_0_1_0_1.Args[0] {
+			break
+		}
+		v_1 := v.Args[1]
+		if v_1.Op != OpARM64SRL {
+			break
+		}
+		if v_1.Type != typ.UInt64 {
+			break
+		}
+		_ = v_1.Args[1]
+		if x != v_1.Args[0] {
+			break
+		}
+		v_1_1 := v_1.Args[1]
+		if v_1_1.Op != OpARM64ANDconst {
+			break
+		}
+		if v_1_1.Type != t {
+			break
+		}
+		if v_1_1.AuxInt != 63 {
+			break
+		}
+		if y != v_1_1.Args[0] {
+			break
+		}
+		if !(cc.(Op) == OpARM64LessThanU) {
+			break
+		}
+		v.reset(OpARM64ROR)
+		v.AddArg(x)
+		v.AddArg(y)
+		return true
+	}
+	// match: (OR (SLL x (ANDconst <t> [31] y)) (CSEL0 <typ.UInt32> {cc} (SRL <typ.UInt32> (MOVWUreg x) (SUB <t> (MOVDconst [32]) (ANDconst <t> [31] y))) (CMPconst [64] (SUB <t> (MOVDconst [32]) (ANDconst <t> [31] y)))))
+	// cond: cc.(Op) == OpARM64LessThanU
+	// result: (RORW x (NEG <t> y))
+	for {
+		_ = v.Args[1]
+		v_0 := v.Args[0]
+		if v_0.Op != OpARM64SLL {
+			break
+		}
+		_ = v_0.Args[1]
+		x := v_0.Args[0]
+		v_0_1 := v_0.Args[1]
+		if v_0_1.Op != OpARM64ANDconst {
+			break
+		}
+		t := v_0_1.Type
+		if v_0_1.AuxInt != 31 {
+			break
+		}
+		y := v_0_1.Args[0]
+		v_1 := v.Args[1]
+		if v_1.Op != OpARM64CSEL0 {
+			break
+		}
+		if v_1.Type != typ.UInt32 {
+			break
+		}
+		cc := v_1.Aux
+		_ = v_1.Args[1]
+		v_1_0 := v_1.Args[0]
+		if v_1_0.Op != OpARM64SRL {
+			break
+		}
+		if v_1_0.Type != typ.UInt32 {
+			break
+		}
+		_ = v_1_0.Args[1]
+		v_1_0_0 := v_1_0.Args[0]
+		if v_1_0_0.Op != OpARM64MOVWUreg {
+			break
+		}
+		if x != v_1_0_0.Args[0] {
+			break
+		}
+		v_1_0_1 := v_1_0.Args[1]
+		if v_1_0_1.Op != OpARM64SUB {
+			break
+		}
+		if v_1_0_1.Type != t {
+			break
+		}
+		_ = v_1_0_1.Args[1]
+		v_1_0_1_0 := v_1_0_1.Args[0]
+		if v_1_0_1_0.Op != OpARM64MOVDconst {
+			break
+		}
+		if v_1_0_1_0.AuxInt != 32 {
+			break
+		}
+		v_1_0_1_1 := v_1_0_1.Args[1]
+		if v_1_0_1_1.Op != OpARM64ANDconst {
+			break
+		}
+		if v_1_0_1_1.Type != t {
+			break
+		}
+		if v_1_0_1_1.AuxInt != 31 {
+			break
+		}
+		if y != v_1_0_1_1.Args[0] {
+			break
+		}
+		v_1_1 := v_1.Args[1]
+		if v_1_1.Op != OpARM64CMPconst {
+			break
+		}
+		if v_1_1.AuxInt != 64 {
+			break
+		}
+		v_1_1_0 := v_1_1.Args[0]
+		if v_1_1_0.Op != OpARM64SUB {
+			break
+		}
+		if v_1_1_0.Type != t {
+			break
+		}
+		_ = v_1_1_0.Args[1]
+		v_1_1_0_0 := v_1_1_0.Args[0]
+		if v_1_1_0_0.Op != OpARM64MOVDconst {
+			break
+		}
+		if v_1_1_0_0.AuxInt != 32 {
+			break
+		}
+		v_1_1_0_1 := v_1_1_0.Args[1]
+		if v_1_1_0_1.Op != OpARM64ANDconst {
+			break
+		}
+		if v_1_1_0_1.Type != t {
+			break
+		}
+		if v_1_1_0_1.AuxInt != 31 {
+			break
+		}
+		if y != v_1_1_0_1.Args[0] {
+			break
+		}
+		if !(cc.(Op) == OpARM64LessThanU) {
+			break
+		}
+		v.reset(OpARM64RORW)
+		v.AddArg(x)
+		v0 := b.NewValue0(v.Pos, OpARM64NEG, t)
+		v0.AddArg(y)
+		v.AddArg(v0)
+		return true
+	}
+	// match: (OR (CSEL0 <typ.UInt32> {cc} (SRL <typ.UInt32> (MOVWUreg x) (SUB <t> (MOVDconst [32]) (ANDconst <t> [31] y))) (CMPconst [64] (SUB <t> (MOVDconst [32]) (ANDconst <t> [31] y)))) (SLL x (ANDconst <t> [31] y)))
+	// cond: cc.(Op) == OpARM64LessThanU
+	// result: (RORW x (NEG <t> y))
+	for {
+		_ = v.Args[1]
+		v_0 := v.Args[0]
+		if v_0.Op != OpARM64CSEL0 {
+			break
+		}
+		if v_0.Type != typ.UInt32 {
+			break
+		}
+		cc := v_0.Aux
+		_ = v_0.Args[1]
+		v_0_0 := v_0.Args[0]
+		if v_0_0.Op != OpARM64SRL {
+			break
+		}
+		if v_0_0.Type != typ.UInt32 {
+			break
+		}
+		_ = v_0_0.Args[1]
+		v_0_0_0 := v_0_0.Args[0]
+		if v_0_0_0.Op != OpARM64MOVWUreg {
+			break
+		}
+		x := v_0_0_0.Args[0]
+		v_0_0_1 := v_0_0.Args[1]
+		if v_0_0_1.Op != OpARM64SUB {
+			break
+		}
+		t := v_0_0_1.Type
+		_ = v_0_0_1.Args[1]
+		v_0_0_1_0 := v_0_0_1.Args[0]
+		if v_0_0_1_0.Op != OpARM64MOVDconst {
+			break
+		}
+		if v_0_0_1_0.AuxInt != 32 {
+			break
+		}
+		v_0_0_1_1 := v_0_0_1.Args[1]
+		if v_0_0_1_1.Op != OpARM64ANDconst {
+			break
+		}
+		if v_0_0_1_1.Type != t {
+			break
+		}
+		if v_0_0_1_1.AuxInt != 31 {
+			break
+		}
+		y := v_0_0_1_1.Args[0]
+		v_0_1 := v_0.Args[1]
+		if v_0_1.Op != OpARM64CMPconst {
+			break
+		}
+		if v_0_1.AuxInt != 64 {
+			break
+		}
+		v_0_1_0 := v_0_1.Args[0]
+		if v_0_1_0.Op != OpARM64SUB {
+			break
+		}
+		if v_0_1_0.Type != t {
+			break
+		}
+		_ = v_0_1_0.Args[1]
+		v_0_1_0_0 := v_0_1_0.Args[0]
+		if v_0_1_0_0.Op != OpARM64MOVDconst {
+			break
+		}
+		if v_0_1_0_0.AuxInt != 32 {
+			break
+		}
+		v_0_1_0_1 := v_0_1_0.Args[1]
+		if v_0_1_0_1.Op != OpARM64ANDconst {
+			break
+		}
+		if v_0_1_0_1.Type != t {
+			break
+		}
+		if v_0_1_0_1.AuxInt != 31 {
+			break
+		}
+		if y != v_0_1_0_1.Args[0] {
+			break
+		}
+		v_1 := v.Args[1]
+		if v_1.Op != OpARM64SLL {
+			break
+		}
+		_ = v_1.Args[1]
+		if x != v_1.Args[0] {
+			break
+		}
+		v_1_1 := v_1.Args[1]
+		if v_1_1.Op != OpARM64ANDconst {
+			break
+		}
+		if v_1_1.Type != t {
+			break
+		}
+		if v_1_1.AuxInt != 31 {
+			break
+		}
+		if y != v_1_1.Args[0] {
+			break
+		}
+		if !(cc.(Op) == OpARM64LessThanU) {
+			break
+		}
+		v.reset(OpARM64RORW)
+		v.AddArg(x)
+		v0 := b.NewValue0(v.Pos, OpARM64NEG, t)
+		v0.AddArg(y)
+		v.AddArg(v0)
+		return true
+	}
+	// match: (OR (SRL <typ.UInt32> (MOVWUreg x) (ANDconst <t> [31] y)) (CSEL0 <typ.UInt32> {cc} (SLL x (SUB <t> (MOVDconst [32]) (ANDconst <t> [31] y))) (CMPconst [64] (SUB <t> (MOVDconst [32]) (ANDconst <t> [31] y)))))
+	// cond: cc.(Op) == OpARM64LessThanU
+	// result: (RORW x y)
+	for {
+		_ = v.Args[1]
+		v_0 := v.Args[0]
+		if v_0.Op != OpARM64SRL {
+			break
+		}
+		if v_0.Type != typ.UInt32 {
+			break
+		}
+		_ = v_0.Args[1]
+		v_0_0 := v_0.Args[0]
+		if v_0_0.Op != OpARM64MOVWUreg {
+			break
+		}
+		x := v_0_0.Args[0]
+		v_0_1 := v_0.Args[1]
+		if v_0_1.Op != OpARM64ANDconst {
+			break
+		}
+		t := v_0_1.Type
+		if v_0_1.AuxInt != 31 {
+			break
+		}
+		y := v_0_1.Args[0]
+		v_1 := v.Args[1]
+		if v_1.Op != OpARM64CSEL0 {
+			break
+		}
+		if v_1.Type != typ.UInt32 {
+			break
+		}
+		cc := v_1.Aux
+		_ = v_1.Args[1]
+		v_1_0 := v_1.Args[0]
+		if v_1_0.Op != OpARM64SLL {
+			break
+		}
+		_ = v_1_0.Args[1]
+		if x != v_1_0.Args[0] {
+			break
+		}
+		v_1_0_1 := v_1_0.Args[1]
+		if v_1_0_1.Op != OpARM64SUB {
+			break
+		}
+		if v_1_0_1.Type != t {
+			break
+		}
+		_ = v_1_0_1.Args[1]
+		v_1_0_1_0 := v_1_0_1.Args[0]
+		if v_1_0_1_0.Op != OpARM64MOVDconst {
+			break
+		}
+		if v_1_0_1_0.AuxInt != 32 {
+			break
+		}
+		v_1_0_1_1 := v_1_0_1.Args[1]
+		if v_1_0_1_1.Op != OpARM64ANDconst {
+			break
+		}
+		if v_1_0_1_1.Type != t {
+			break
+		}
+		if v_1_0_1_1.AuxInt != 31 {
+			break
+		}
+		if y != v_1_0_1_1.Args[0] {
+			break
+		}
+		v_1_1 := v_1.Args[1]
+		if v_1_1.Op != OpARM64CMPconst {
+			break
+		}
+		if v_1_1.AuxInt != 64 {
+			break
+		}
+		v_1_1_0 := v_1_1.Args[0]
+		if v_1_1_0.Op != OpARM64SUB {
+			break
+		}
+		if v_1_1_0.Type != t {
+			break
+		}
+		_ = v_1_1_0.Args[1]
+		v_1_1_0_0 := v_1_1_0.Args[0]
+		if v_1_1_0_0.Op != OpARM64MOVDconst {
+			break
+		}
+		if v_1_1_0_0.AuxInt != 32 {
+			break
+		}
+		v_1_1_0_1 := v_1_1_0.Args[1]
+		if v_1_1_0_1.Op != OpARM64ANDconst {
+			break
+		}
+		if v_1_1_0_1.Type != t {
+			break
+		}
+		if v_1_1_0_1.AuxInt != 31 {
+			break
+		}
+		if y != v_1_1_0_1.Args[0] {
+			break
+		}
+		if !(cc.(Op) == OpARM64LessThanU) {
+			break
+		}
+		v.reset(OpARM64RORW)
+		v.AddArg(x)
+		v.AddArg(y)
+		return true
+	}
+	// match: (OR (CSEL0 <typ.UInt32> {cc} (SLL x (SUB <t> (MOVDconst [32]) (ANDconst <t> [31] y))) (CMPconst [64] (SUB <t> (MOVDconst [32]) (ANDconst <t> [31] y)))) (SRL <typ.UInt32> (MOVWUreg x) (ANDconst <t> [31] y)))
+	// cond: cc.(Op) == OpARM64LessThanU
+	// result: (RORW x y)
+	for {
+		_ = v.Args[1]
+		v_0 := v.Args[0]
+		if v_0.Op != OpARM64CSEL0 {
+			break
+		}
+		if v_0.Type != typ.UInt32 {
+			break
+		}
+		cc := v_0.Aux
+		_ = v_0.Args[1]
+		v_0_0 := v_0.Args[0]
+		if v_0_0.Op != OpARM64SLL {
+			break
+		}
+		_ = v_0_0.Args[1]
+		x := v_0_0.Args[0]
+		v_0_0_1 := v_0_0.Args[1]
+		if v_0_0_1.Op != OpARM64SUB {
+			break
+		}
+		t := v_0_0_1.Type
+		_ = v_0_0_1.Args[1]
+		v_0_0_1_0 := v_0_0_1.Args[0]
+		if v_0_0_1_0.Op != OpARM64MOVDconst {
+			break
+		}
+		if v_0_0_1_0.AuxInt != 32 {
+			break
+		}
+		v_0_0_1_1 := v_0_0_1.Args[1]
+		if v_0_0_1_1.Op != OpARM64ANDconst {
+			break
+		}
+		if v_0_0_1_1.Type != t {
+			break
+		}
+		if v_0_0_1_1.AuxInt != 31 {
+			break
+		}
+		y := v_0_0_1_1.Args[0]
+		v_0_1 := v_0.Args[1]
+		if v_0_1.Op != OpARM64CMPconst {
+			break
+		}
+		if v_0_1.AuxInt != 64 {
+			break
+		}
+		v_0_1_0 := v_0_1.Args[0]
+		if v_0_1_0.Op != OpARM64SUB {
+			break
+		}
+		if v_0_1_0.Type != t {
+			break
+		}
+		_ = v_0_1_0.Args[1]
+		v_0_1_0_0 := v_0_1_0.Args[0]
+		if v_0_1_0_0.Op != OpARM64MOVDconst {
+			break
+		}
+		if v_0_1_0_0.AuxInt != 32 {
+			break
+		}
+		v_0_1_0_1 := v_0_1_0.Args[1]
+		if v_0_1_0_1.Op != OpARM64ANDconst {
+			break
+		}
+		if v_0_1_0_1.Type != t {
+			break
+		}
+		if v_0_1_0_1.AuxInt != 31 {
+			break
+		}
+		if y != v_0_1_0_1.Args[0] {
+			break
+		}
+		v_1 := v.Args[1]
+		if v_1.Op != OpARM64SRL {
+			break
+		}
+		if v_1.Type != typ.UInt32 {
+			break
+		}
+		_ = v_1.Args[1]
+		v_1_0 := v_1.Args[0]
+		if v_1_0.Op != OpARM64MOVWUreg {
+			break
+		}
+		if x != v_1_0.Args[0] {
+			break
+		}
+		v_1_1 := v_1.Args[1]
+		if v_1_1.Op != OpARM64ANDconst {
+			break
+		}
+		if v_1_1.Type != t {
+			break
+		}
+		if v_1_1.AuxInt != 31 {
+			break
+		}
+		if y != v_1_1.Args[0] {
+			break
+		}
+		if !(cc.(Op) == OpARM64LessThanU) {
+			break
+		}
+		v.reset(OpARM64RORW)
+		v.AddArg(x)
+		v.AddArg(y)
+		return true
+	}
 	// match: (OR (UBFIZ [bfc] x) (ANDconst [ac] y))
 	// cond: ac == ^((1<<uint(getARM64BFwidth(bfc))-1) << uint(getARM64BFlsb(bfc)))
 	// result: (BFI [bfc] y x)
@@ -17182,6 +19027,11 @@
 		v.AddArg(x)
 		return true
 	}
+	return false
+}
+func rewriteValueARM64_OpARM64OR_20(v *Value) bool {
+	b := v.Block
+	_ = b
 	// match: (OR (ANDconst [ac] y) (UBFIZ [bfc] x))
 	// cond: ac == ^((1<<uint(getARM64BFwidth(bfc))-1) << uint(getARM64BFlsb(bfc)))
 	// result: (BFI [bfc] y x)
@@ -17848,11 +19698,6 @@
 		v0.AddArg(mem)
 		return true
 	}
-	return false
-}
-func rewriteValueARM64_OpARM64OR_20(v *Value) bool {
-	b := v.Block
-	_ = b
 	// match: (OR <t> y3:(MOVDnop x3:(MOVBUloadidx ptr idx mem)) o0:(ORshiftLL [8] o1:(ORshiftLL [16] s0:(SLLconst [24] y0:(MOVDnop x0:(MOVBUloadidx ptr (ADDconst [3] idx) mem))) y1:(MOVDnop x1:(MOVBUloadidx ptr (ADDconst [2] idx) mem))) y2:(MOVDnop x2:(MOVBUloadidx ptr (ADDconst [1] idx) mem))))
 	// cond: x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1 && y0.Uses == 1 && y1.Uses == 1 && y2.Uses == 1 && y3.Uses == 1 && o0.Uses == 1 && o1.Uses == 1 && s0.Uses == 1 && mergePoint(b,x0,x1,x2,x3) != nil && clobber(x0) && clobber(x1) && clobber(x2) && clobber(x3) && clobber(y0) && clobber(y1) && clobber(y2) && clobber(y3) && clobber(o0) && clobber(o1) && clobber(s0)
 	// result: @mergePoint(b,x0,x1,x2,x3) (MOVWUloadidx <t> ptr idx mem)
@@ -18203,6 +20048,11 @@
 		v0.AddArg(mem)
 		return true
 	}
+	return false
+}
+func rewriteValueARM64_OpARM64OR_30(v *Value) bool {
+	b := v.Block
+	_ = b
 	// match: (OR <t> y7:(MOVDnop x7:(MOVBUload [i0] {s} p mem)) o0:(ORshiftLL [8] o1:(ORshiftLL [16] o2:(ORshiftLL [24] o3:(ORshiftLL [32] o4:(ORshiftLL [40] o5:(ORshiftLL [48] s0:(SLLconst [56] y0:(MOVDnop x0:(MOVBUload [i7] {s} p mem))) y1:(MOVDnop x1:(MOVBUload [i6] {s} p mem))) y2:(MOVDnop x2:(MOVBUload [i5] {s} p mem))) y3:(MOVDnop x3:(MOVBUload [i4] {s} p mem))) y4:(MOVDnop x4:(MOVBUload [i3] {s} p mem))) y5:(MOVDnop x5:(MOVBUload [i2] {s} p mem))) y6:(MOVDnop x6:(MOVBUload [i1] {s} p mem))))
 	// cond: i1 == i0+1 && i2 == i0+2 && i3 == i0+3 && i4 == i0+4 && i5 == i0+5 && i6 == i0+6 && i7 == i0+7 && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1 && x4.Uses == 1 && x5.Uses == 1 && x6.Uses == 1 && x7.Uses == 1 && y0.Uses == 1 && y1.Uses == 1 && y2.Uses == 1 && y3.Uses == 1 && y4.Uses == 1 && y5.Uses == 1 && y6.Uses == 1 && y7.Uses == 1 && o0.Uses == 1 && o1.Uses == 1 && o2.Uses == 1 && o3.Uses == 1 && o4.Uses == 1 && o5.Uses == 1 && s0.Uses == 1 && mergePoint(b,x0,x1,x2,x3,x4,x5,x6,x7) != nil && clobber(x0) && clobber(x1) && clobber(x2) && clobber(x3) && clobber(x4) && clobber(x5) && clobber(x6) && clobber(x7) && clobber(y0) && clobber(y1) && clobber(y2) && clobber(y3) && clobber(y4) && clobber(y5) && clobber(y6) && clobber(y7) && clobber(o0) && clobber(o1) && clobber(o2) && clobber(o3) && clobber(o4) && clobber(o5) && clobber(s0)
 	// result: @mergePoint(b,x0,x1,x2,x3,x4,x5,x6,x7) (MOVDload <t> {s} (OffPtr <p.Type> [i0] p) mem)
@@ -19759,11 +21609,6 @@
 		v0.AddArg(v1)
 		return true
 	}
-	return false
-}
-func rewriteValueARM64_OpARM64OR_30(v *Value) bool {
-	b := v.Block
-	_ = b
 	// match: (OR <t> y3:(MOVDnop x3:(MOVBUload [3] {s} p mem)) o0:(ORshiftLL [8] o1:(ORshiftLL [16] s0:(SLLconst [24] y0:(MOVDnop x0:(MOVBUloadidx ptr0 idx0 mem))) y1:(MOVDnop x1:(MOVBUload [1] {s} p1:(ADD ptr1 idx1) mem))) y2:(MOVDnop x2:(MOVBUload [2] {s} p mem))))
 	// cond: s == nil && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1 && y0.Uses == 1 && y1.Uses == 1 && y2.Uses == 1 && y3.Uses == 1 && o0.Uses == 1 && o1.Uses == 1 && s0.Uses == 1 && mergePoint(b,x0,x1,x2,x3) != nil && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1)) && isSamePtr(p1, p) && clobber(x0) && clobber(x1) && clobber(x2) && clobber(x3) && clobber(y0) && clobber(y1) && clobber(y2) && clobber(y3) && clobber(o0) && clobber(o1) && clobber(s0)
 	// result: @mergePoint(b,x0,x1,x2,x3) (REVW <t> (MOVWUloadidx <t> ptr0 idx0 mem))
@@ -20012,6 +21857,11 @@
 		v0.AddArg(v1)
 		return true
 	}
+	return false
+}
+func rewriteValueARM64_OpARM64OR_40(v *Value) bool {
+	b := v.Block
+	_ = b
 	// match: (OR <t> y3:(MOVDnop x3:(MOVBUloadidx ptr (ADDconst [3] idx) mem)) o0:(ORshiftLL [8] o1:(ORshiftLL [16] s0:(SLLconst [24] y0:(MOVDnop x0:(MOVBUloadidx ptr idx mem))) y1:(MOVDnop x1:(MOVBUloadidx ptr (ADDconst [1] idx) mem))) y2:(MOVDnop x2:(MOVBUloadidx ptr (ADDconst [2] idx) mem))))
 	// cond: x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1 && y0.Uses == 1 && y1.Uses == 1 && y2.Uses == 1 && y3.Uses == 1 && o0.Uses == 1 && o1.Uses == 1 && s0.Uses == 1 && mergePoint(b,x0,x1,x2,x3) != nil && clobber(x0) && clobber(x1) && clobber(x2) && clobber(x3) && clobber(y0) && clobber(y1) && clobber(y2) && clobber(y3) && clobber(o0) && clobber(o1) && clobber(s0)
 	// result: @mergePoint(b,x0,x1,x2,x3) (REVW <t> (MOVWUloadidx <t> ptr idx mem))
@@ -24228,6 +26078,44 @@
 	}
 	return false
 }
+func rewriteValueARM64_OpARM64RORWconst_0(v *Value) bool {
+	// match: (RORWconst [c] (RORWconst [d] x))
+	// cond:
+	// result: (RORWconst [(c+d)&31] x)
+	for {
+		c := v.AuxInt
+		v_0 := v.Args[0]
+		if v_0.Op != OpARM64RORWconst {
+			break
+		}
+		d := v_0.AuxInt
+		x := v_0.Args[0]
+		v.reset(OpARM64RORWconst)
+		v.AuxInt = (c + d) & 31
+		v.AddArg(x)
+		return true
+	}
+	return false
+}
+func rewriteValueARM64_OpARM64RORconst_0(v *Value) bool {
+	// match: (RORconst [c] (RORconst [d] x))
+	// cond:
+	// result: (RORconst [(c+d)&63] x)
+	for {
+		c := v.AuxInt
+		v_0 := v.Args[0]
+		if v_0.Op != OpARM64RORconst {
+			break
+		}
+		d := v_0.AuxInt
+		x := v_0.Args[0]
+		v.reset(OpARM64RORconst)
+		v.AuxInt = (c + d) & 63
+		v.AddArg(x)
+		return true
+	}
+	return false
+}
 func rewriteValueARM64_OpARM64SLL_0(v *Value) bool {
 	// match: (SLL x (MOVDconst [c]))
 	// cond:
@@ -25995,6 +27883,10 @@
 	return false
 }
 func rewriteValueARM64_OpARM64XOR_10(v *Value) bool {
+	b := v.Block
+	_ = b
+	typ := &b.Func.Config.Types
+	_ = typ
 	// match: (XOR x1:(SRAconst [c] y) x0)
 	// cond: clobberIfDead(x1)
 	// result: (XORshiftRA x0 y [c])
@@ -26016,6 +27908,918 @@
 		v.AddArg(y)
 		return true
 	}
+	// match: (XOR (SLL x (ANDconst <t> [63] y)) (CSEL0 <typ.UInt64> {cc} (SRL <typ.UInt64> x (SUB <t> (MOVDconst [64]) (ANDconst <t> [63] y))) (CMPconst [64] (SUB <t> (MOVDconst [64]) (ANDconst <t> [63] y)))))
+	// cond: cc.(Op) == OpARM64LessThanU
+	// result: (ROR x (NEG <t> y))
+	for {
+		_ = v.Args[1]
+		v_0 := v.Args[0]
+		if v_0.Op != OpARM64SLL {
+			break
+		}
+		_ = v_0.Args[1]
+		x := v_0.Args[0]
+		v_0_1 := v_0.Args[1]
+		if v_0_1.Op != OpARM64ANDconst {
+			break
+		}
+		t := v_0_1.Type
+		if v_0_1.AuxInt != 63 {
+			break
+		}
+		y := v_0_1.Args[0]
+		v_1 := v.Args[1]
+		if v_1.Op != OpARM64CSEL0 {
+			break
+		}
+		if v_1.Type != typ.UInt64 {
+			break
+		}
+		cc := v_1.Aux
+		_ = v_1.Args[1]
+		v_1_0 := v_1.Args[0]
+		if v_1_0.Op != OpARM64SRL {
+			break
+		}
+		if v_1_0.Type != typ.UInt64 {
+			break
+		}
+		_ = v_1_0.Args[1]
+		if x != v_1_0.Args[0] {
+			break
+		}
+		v_1_0_1 := v_1_0.Args[1]
+		if v_1_0_1.Op != OpARM64SUB {
+			break
+		}
+		if v_1_0_1.Type != t {
+			break
+		}
+		_ = v_1_0_1.Args[1]
+		v_1_0_1_0 := v_1_0_1.Args[0]
+		if v_1_0_1_0.Op != OpARM64MOVDconst {
+			break
+		}
+		if v_1_0_1_0.AuxInt != 64 {
+			break
+		}
+		v_1_0_1_1 := v_1_0_1.Args[1]
+		if v_1_0_1_1.Op != OpARM64ANDconst {
+			break
+		}
+		if v_1_0_1_1.Type != t {
+			break
+		}
+		if v_1_0_1_1.AuxInt != 63 {
+			break
+		}
+		if y != v_1_0_1_1.Args[0] {
+			break
+		}
+		v_1_1 := v_1.Args[1]
+		if v_1_1.Op != OpARM64CMPconst {
+			break
+		}
+		if v_1_1.AuxInt != 64 {
+			break
+		}
+		v_1_1_0 := v_1_1.Args[0]
+		if v_1_1_0.Op != OpARM64SUB {
+			break
+		}
+		if v_1_1_0.Type != t {
+			break
+		}
+		_ = v_1_1_0.Args[1]
+		v_1_1_0_0 := v_1_1_0.Args[0]
+		if v_1_1_0_0.Op != OpARM64MOVDconst {
+			break
+		}
+		if v_1_1_0_0.AuxInt != 64 {
+			break
+		}
+		v_1_1_0_1 := v_1_1_0.Args[1]
+		if v_1_1_0_1.Op != OpARM64ANDconst {
+			break
+		}
+		if v_1_1_0_1.Type != t {
+			break
+		}
+		if v_1_1_0_1.AuxInt != 63 {
+			break
+		}
+		if y != v_1_1_0_1.Args[0] {
+			break
+		}
+		if !(cc.(Op) == OpARM64LessThanU) {
+			break
+		}
+		v.reset(OpARM64ROR)
+		v.AddArg(x)
+		v0 := b.NewValue0(v.Pos, OpARM64NEG, t)
+		v0.AddArg(y)
+		v.AddArg(v0)
+		return true
+	}
+	// match: (XOR (CSEL0 <typ.UInt64> {cc} (SRL <typ.UInt64> x (SUB <t> (MOVDconst [64]) (ANDconst <t> [63] y))) (CMPconst [64] (SUB <t> (MOVDconst [64]) (ANDconst <t> [63] y)))) (SLL x (ANDconst <t> [63] y)))
+	// cond: cc.(Op) == OpARM64LessThanU
+	// result: (ROR x (NEG <t> y))
+	for {
+		_ = v.Args[1]
+		v_0 := v.Args[0]
+		if v_0.Op != OpARM64CSEL0 {
+			break
+		}
+		if v_0.Type != typ.UInt64 {
+			break
+		}
+		cc := v_0.Aux
+		_ = v_0.Args[1]
+		v_0_0 := v_0.Args[0]
+		if v_0_0.Op != OpARM64SRL {
+			break
+		}
+		if v_0_0.Type != typ.UInt64 {
+			break
+		}
+		_ = v_0_0.Args[1]
+		x := v_0_0.Args[0]
+		v_0_0_1 := v_0_0.Args[1]
+		if v_0_0_1.Op != OpARM64SUB {
+			break
+		}
+		t := v_0_0_1.Type
+		_ = v_0_0_1.Args[1]
+		v_0_0_1_0 := v_0_0_1.Args[0]
+		if v_0_0_1_0.Op != OpARM64MOVDconst {
+			break
+		}
+		if v_0_0_1_0.AuxInt != 64 {
+			break
+		}
+		v_0_0_1_1 := v_0_0_1.Args[1]
+		if v_0_0_1_1.Op != OpARM64ANDconst {
+			break
+		}
+		if v_0_0_1_1.Type != t {
+			break
+		}
+		if v_0_0_1_1.AuxInt != 63 {
+			break
+		}
+		y := v_0_0_1_1.Args[0]
+		v_0_1 := v_0.Args[1]
+		if v_0_1.Op != OpARM64CMPconst {
+			break
+		}
+		if v_0_1.AuxInt != 64 {
+			break
+		}
+		v_0_1_0 := v_0_1.Args[0]
+		if v_0_1_0.Op != OpARM64SUB {
+			break
+		}
+		if v_0_1_0.Type != t {
+			break
+		}
+		_ = v_0_1_0.Args[1]
+		v_0_1_0_0 := v_0_1_0.Args[0]
+		if v_0_1_0_0.Op != OpARM64MOVDconst {
+			break
+		}
+		if v_0_1_0_0.AuxInt != 64 {
+			break
+		}
+		v_0_1_0_1 := v_0_1_0.Args[1]
+		if v_0_1_0_1.Op != OpARM64ANDconst {
+			break
+		}
+		if v_0_1_0_1.Type != t {
+			break
+		}
+		if v_0_1_0_1.AuxInt != 63 {
+			break
+		}
+		if y != v_0_1_0_1.Args[0] {
+			break
+		}
+		v_1 := v.Args[1]
+		if v_1.Op != OpARM64SLL {
+			break
+		}
+		_ = v_1.Args[1]
+		if x != v_1.Args[0] {
+			break
+		}
+		v_1_1 := v_1.Args[1]
+		if v_1_1.Op != OpARM64ANDconst {
+			break
+		}
+		if v_1_1.Type != t {
+			break
+		}
+		if v_1_1.AuxInt != 63 {
+			break
+		}
+		if y != v_1_1.Args[0] {
+			break
+		}
+		if !(cc.(Op) == OpARM64LessThanU) {
+			break
+		}
+		v.reset(OpARM64ROR)
+		v.AddArg(x)
+		v0 := b.NewValue0(v.Pos, OpARM64NEG, t)
+		v0.AddArg(y)
+		v.AddArg(v0)
+		return true
+	}
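+	// The two cases above (one per operand order) are the 64-bit
+	// rotate-left idiom written with ^ instead of |, roughly Go code
+	// like:
+	//
+	//	y &= 63
+	//	r := x<<y ^ x>>(64-y)
+	//
+	// The CSEL0-guarded SRL comes from the bounds-checked lowering of
+	// the variable shift by 64-y; the whole tree folds to a rotate
+	// right by -y, i.e. a rotate left by y.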
+	// match: (XOR (SRL <typ.UInt64> x (ANDconst <t> [63] y)) (CSEL0 <typ.UInt64> {cc} (SLL x (SUB <t> (MOVDconst [64]) (ANDconst <t> [63] y))) (CMPconst [64] (SUB <t> (MOVDconst [64]) (ANDconst <t> [63] y)))))
+	// cond: cc.(Op) == OpARM64LessThanU
+	// result: (ROR x y)
+	for {
+		_ = v.Args[1]
+		v_0 := v.Args[0]
+		if v_0.Op != OpARM64SRL {
+			break
+		}
+		if v_0.Type != typ.UInt64 {
+			break
+		}
+		_ = v_0.Args[1]
+		x := v_0.Args[0]
+		v_0_1 := v_0.Args[1]
+		if v_0_1.Op != OpARM64ANDconst {
+			break
+		}
+		t := v_0_1.Type
+		if v_0_1.AuxInt != 63 {
+			break
+		}
+		y := v_0_1.Args[0]
+		v_1 := v.Args[1]
+		if v_1.Op != OpARM64CSEL0 {
+			break
+		}
+		if v_1.Type != typ.UInt64 {
+			break
+		}
+		cc := v_1.Aux
+		_ = v_1.Args[1]
+		v_1_0 := v_1.Args[0]
+		if v_1_0.Op != OpARM64SLL {
+			break
+		}
+		_ = v_1_0.Args[1]
+		if x != v_1_0.Args[0] {
+			break
+		}
+		v_1_0_1 := v_1_0.Args[1]
+		if v_1_0_1.Op != OpARM64SUB {
+			break
+		}
+		if v_1_0_1.Type != t {
+			break
+		}
+		_ = v_1_0_1.Args[1]
+		v_1_0_1_0 := v_1_0_1.Args[0]
+		if v_1_0_1_0.Op != OpARM64MOVDconst {
+			break
+		}
+		if v_1_0_1_0.AuxInt != 64 {
+			break
+		}
+		v_1_0_1_1 := v_1_0_1.Args[1]
+		if v_1_0_1_1.Op != OpARM64ANDconst {
+			break
+		}
+		if v_1_0_1_1.Type != t {
+			break
+		}
+		if v_1_0_1_1.AuxInt != 63 {
+			break
+		}
+		if y != v_1_0_1_1.Args[0] {
+			break
+		}
+		v_1_1 := v_1.Args[1]
+		if v_1_1.Op != OpARM64CMPconst {
+			break
+		}
+		if v_1_1.AuxInt != 64 {
+			break
+		}
+		v_1_1_0 := v_1_1.Args[0]
+		if v_1_1_0.Op != OpARM64SUB {
+			break
+		}
+		if v_1_1_0.Type != t {
+			break
+		}
+		_ = v_1_1_0.Args[1]
+		v_1_1_0_0 := v_1_1_0.Args[0]
+		if v_1_1_0_0.Op != OpARM64MOVDconst {
+			break
+		}
+		if v_1_1_0_0.AuxInt != 64 {
+			break
+		}
+		v_1_1_0_1 := v_1_1_0.Args[1]
+		if v_1_1_0_1.Op != OpARM64ANDconst {
+			break
+		}
+		if v_1_1_0_1.Type != t {
+			break
+		}
+		if v_1_1_0_1.AuxInt != 63 {
+			break
+		}
+		if y != v_1_1_0_1.Args[0] {
+			break
+		}
+		if !(cc.(Op) == OpARM64LessThanU) {
+			break
+		}
+		v.reset(OpARM64ROR)
+		v.AddArg(x)
+		v.AddArg(y)
+		return true
+	}
+	// match: (XOR (CSEL0 <typ.UInt64> {cc} (SLL x (SUB <t> (MOVDconst [64]) (ANDconst <t> [63] y))) (CMPconst [64] (SUB <t> (MOVDconst [64]) (ANDconst <t> [63] y)))) (SRL <typ.UInt64> x (ANDconst <t> [63] y)))
+	// cond: cc.(Op) == OpARM64LessThanU
+	// result: (ROR x y)
+	for {
+		_ = v.Args[1]
+		v_0 := v.Args[0]
+		if v_0.Op != OpARM64CSEL0 {
+			break
+		}
+		if v_0.Type != typ.UInt64 {
+			break
+		}
+		cc := v_0.Aux
+		_ = v_0.Args[1]
+		v_0_0 := v_0.Args[0]
+		if v_0_0.Op != OpARM64SLL {
+			break
+		}
+		_ = v_0_0.Args[1]
+		x := v_0_0.Args[0]
+		v_0_0_1 := v_0_0.Args[1]
+		if v_0_0_1.Op != OpARM64SUB {
+			break
+		}
+		t := v_0_0_1.Type
+		_ = v_0_0_1.Args[1]
+		v_0_0_1_0 := v_0_0_1.Args[0]
+		if v_0_0_1_0.Op != OpARM64MOVDconst {
+			break
+		}
+		if v_0_0_1_0.AuxInt != 64 {
+			break
+		}
+		v_0_0_1_1 := v_0_0_1.Args[1]
+		if v_0_0_1_1.Op != OpARM64ANDconst {
+			break
+		}
+		if v_0_0_1_1.Type != t {
+			break
+		}
+		if v_0_0_1_1.AuxInt != 63 {
+			break
+		}
+		y := v_0_0_1_1.Args[0]
+		v_0_1 := v_0.Args[1]
+		if v_0_1.Op != OpARM64CMPconst {
+			break
+		}
+		if v_0_1.AuxInt != 64 {
+			break
+		}
+		v_0_1_0 := v_0_1.Args[0]
+		if v_0_1_0.Op != OpARM64SUB {
+			break
+		}
+		if v_0_1_0.Type != t {
+			break
+		}
+		_ = v_0_1_0.Args[1]
+		v_0_1_0_0 := v_0_1_0.Args[0]
+		if v_0_1_0_0.Op != OpARM64MOVDconst {
+			break
+		}
+		if v_0_1_0_0.AuxInt != 64 {
+			break
+		}
+		v_0_1_0_1 := v_0_1_0.Args[1]
+		if v_0_1_0_1.Op != OpARM64ANDconst {
+			break
+		}
+		if v_0_1_0_1.Type != t {
+			break
+		}
+		if v_0_1_0_1.AuxInt != 63 {
+			break
+		}
+		if y != v_0_1_0_1.Args[0] {
+			break
+		}
+		v_1 := v.Args[1]
+		if v_1.Op != OpARM64SRL {
+			break
+		}
+		if v_1.Type != typ.UInt64 {
+			break
+		}
+		_ = v_1.Args[1]
+		if x != v_1.Args[0] {
+			break
+		}
+		v_1_1 := v_1.Args[1]
+		if v_1_1.Op != OpARM64ANDconst {
+			break
+		}
+		if v_1_1.Type != t {
+			break
+		}
+		if v_1_1.AuxInt != 63 {
+			break
+		}
+		if y != v_1_1.Args[0] {
+			break
+		}
+		if !(cc.(Op) == OpARM64LessThanU) {
+			break
+		}
+		v.reset(OpARM64ROR)
+		v.AddArg(x)
+		v.AddArg(y)
+		return true
+	}
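+	// The rotate-right counterpart of the previous two cases: Go code
+	// roughly like
+	//
+	//	y &= 63
+	//	r := x>>y ^ x<<(64-y)
+	//
+	// rotates right by y, so it folds to ROR x y with no NEG.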
+	// match: (XOR (SLL x (ANDconst <t> [31] y)) (CSEL0 <typ.UInt32> {cc} (SRL <typ.UInt32> (MOVWUreg x) (SUB <t> (MOVDconst [32]) (ANDconst <t> [31] y))) (CMPconst [64] (SUB <t> (MOVDconst [32]) (ANDconst <t> [31] y)))))
+	// cond: cc.(Op) == OpARM64LessThanU
+	// result: (RORW x (NEG <t> y))
+	for {
+		_ = v.Args[1]
+		v_0 := v.Args[0]
+		if v_0.Op != OpARM64SLL {
+			break
+		}
+		_ = v_0.Args[1]
+		x := v_0.Args[0]
+		v_0_1 := v_0.Args[1]
+		if v_0_1.Op != OpARM64ANDconst {
+			break
+		}
+		t := v_0_1.Type
+		if v_0_1.AuxInt != 31 {
+			break
+		}
+		y := v_0_1.Args[0]
+		v_1 := v.Args[1]
+		if v_1.Op != OpARM64CSEL0 {
+			break
+		}
+		if v_1.Type != typ.UInt32 {
+			break
+		}
+		cc := v_1.Aux
+		_ = v_1.Args[1]
+		v_1_0 := v_1.Args[0]
+		if v_1_0.Op != OpARM64SRL {
+			break
+		}
+		if v_1_0.Type != typ.UInt32 {
+			break
+		}
+		_ = v_1_0.Args[1]
+		v_1_0_0 := v_1_0.Args[0]
+		if v_1_0_0.Op != OpARM64MOVWUreg {
+			break
+		}
+		if x != v_1_0_0.Args[0] {
+			break
+		}
+		v_1_0_1 := v_1_0.Args[1]
+		if v_1_0_1.Op != OpARM64SUB {
+			break
+		}
+		if v_1_0_1.Type != t {
+			break
+		}
+		_ = v_1_0_1.Args[1]
+		v_1_0_1_0 := v_1_0_1.Args[0]
+		if v_1_0_1_0.Op != OpARM64MOVDconst {
+			break
+		}
+		if v_1_0_1_0.AuxInt != 32 {
+			break
+		}
+		v_1_0_1_1 := v_1_0_1.Args[1]
+		if v_1_0_1_1.Op != OpARM64ANDconst {
+			break
+		}
+		if v_1_0_1_1.Type != t {
+			break
+		}
+		if v_1_0_1_1.AuxInt != 31 {
+			break
+		}
+		if y != v_1_0_1_1.Args[0] {
+			break
+		}
+		v_1_1 := v_1.Args[1]
+		if v_1_1.Op != OpARM64CMPconst {
+			break
+		}
+		if v_1_1.AuxInt != 64 {
+			break
+		}
+		v_1_1_0 := v_1_1.Args[0]
+		if v_1_1_0.Op != OpARM64SUB {
+			break
+		}
+		if v_1_1_0.Type != t {
+			break
+		}
+		_ = v_1_1_0.Args[1]
+		v_1_1_0_0 := v_1_1_0.Args[0]
+		if v_1_1_0_0.Op != OpARM64MOVDconst {
+			break
+		}
+		if v_1_1_0_0.AuxInt != 32 {
+			break
+		}
+		v_1_1_0_1 := v_1_1_0.Args[1]
+		if v_1_1_0_1.Op != OpARM64ANDconst {
+			break
+		}
+		if v_1_1_0_1.Type != t {
+			break
+		}
+		if v_1_1_0_1.AuxInt != 31 {
+			break
+		}
+		if y != v_1_1_0_1.Args[0] {
+			break
+		}
+		if !(cc.(Op) == OpARM64LessThanU) {
+			break
+		}
+		v.reset(OpARM64RORW)
+		v.AddArg(x)
+		v0 := b.NewValue0(v.Pos, OpARM64NEG, t)
+		v0.AddArg(y)
+		v.AddArg(v0)
+		return true
+	}
+	// match: (XOR (CSEL0 <typ.UInt32> {cc} (SRL <typ.UInt32> (MOVWUreg x) (SUB <t> (MOVDconst [32]) (ANDconst <t> [31] y))) (CMPconst [64] (SUB <t> (MOVDconst [32]) (ANDconst <t> [31] y)))) (SLL x (ANDconst <t> [31] y)))
+	// cond: cc.(Op) == OpARM64LessThanU
+	// result: (RORW x (NEG <t> y))
+	for {
+		_ = v.Args[1]
+		v_0 := v.Args[0]
+		if v_0.Op != OpARM64CSEL0 {
+			break
+		}
+		if v_0.Type != typ.UInt32 {
+			break
+		}
+		cc := v_0.Aux
+		_ = v_0.Args[1]
+		v_0_0 := v_0.Args[0]
+		if v_0_0.Op != OpARM64SRL {
+			break
+		}
+		if v_0_0.Type != typ.UInt32 {
+			break
+		}
+		_ = v_0_0.Args[1]
+		v_0_0_0 := v_0_0.Args[0]
+		if v_0_0_0.Op != OpARM64MOVWUreg {
+			break
+		}
+		x := v_0_0_0.Args[0]
+		v_0_0_1 := v_0_0.Args[1]
+		if v_0_0_1.Op != OpARM64SUB {
+			break
+		}
+		t := v_0_0_1.Type
+		_ = v_0_0_1.Args[1]
+		v_0_0_1_0 := v_0_0_1.Args[0]
+		if v_0_0_1_0.Op != OpARM64MOVDconst {
+			break
+		}
+		if v_0_0_1_0.AuxInt != 32 {
+			break
+		}
+		v_0_0_1_1 := v_0_0_1.Args[1]
+		if v_0_0_1_1.Op != OpARM64ANDconst {
+			break
+		}
+		if v_0_0_1_1.Type != t {
+			break
+		}
+		if v_0_0_1_1.AuxInt != 31 {
+			break
+		}
+		y := v_0_0_1_1.Args[0]
+		v_0_1 := v_0.Args[1]
+		if v_0_1.Op != OpARM64CMPconst {
+			break
+		}
+		if v_0_1.AuxInt != 64 {
+			break
+		}
+		v_0_1_0 := v_0_1.Args[0]
+		if v_0_1_0.Op != OpARM64SUB {
+			break
+		}
+		if v_0_1_0.Type != t {
+			break
+		}
+		_ = v_0_1_0.Args[1]
+		v_0_1_0_0 := v_0_1_0.Args[0]
+		if v_0_1_0_0.Op != OpARM64MOVDconst {
+			break
+		}
+		if v_0_1_0_0.AuxInt != 32 {
+			break
+		}
+		v_0_1_0_1 := v_0_1_0.Args[1]
+		if v_0_1_0_1.Op != OpARM64ANDconst {
+			break
+		}
+		if v_0_1_0_1.Type != t {
+			break
+		}
+		if v_0_1_0_1.AuxInt != 31 {
+			break
+		}
+		if y != v_0_1_0_1.Args[0] {
+			break
+		}
+		v_1 := v.Args[1]
+		if v_1.Op != OpARM64SLL {
+			break
+		}
+		_ = v_1.Args[1]
+		if x != v_1.Args[0] {
+			break
+		}
+		v_1_1 := v_1.Args[1]
+		if v_1_1.Op != OpARM64ANDconst {
+			break
+		}
+		if v_1_1.Type != t {
+			break
+		}
+		if v_1_1.AuxInt != 31 {
+			break
+		}
+		if y != v_1_1.Args[0] {
+			break
+		}
+		if !(cc.(Op) == OpARM64LessThanU) {
+			break
+		}
+		v.reset(OpARM64RORW)
+		v.AddArg(x)
+		v0 := b.NewValue0(v.Pos, OpARM64NEG, t)
+		v0.AddArg(y)
+		v.AddArg(v0)
+		return true
+	}
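+	// 32-bit rotate left written with ^, for a uint32 x, roughly:
+	//
+	//	y &= 31
+	//	r := x<<y ^ x>>(32-y)
+	//
+	// The right shift operates on the zero-extended value, which is why
+	// the SRL operand is (MOVWUreg x); the result is a rotate right by
+	// -y, i.e. RORW x (NEG y).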
+	// match: (XOR (SRL <typ.UInt32> (MOVWUreg x) (ANDconst <t> [31] y)) (CSEL0 <typ.UInt32> {cc} (SLL x (SUB <t> (MOVDconst [32]) (ANDconst <t> [31] y))) (CMPconst [64] (SUB <t> (MOVDconst [32]) (ANDconst <t> [31] y)))))
+	// cond: cc.(Op) == OpARM64LessThanU
+	// result: (RORW x y)
+	for {
+		_ = v.Args[1]
+		v_0 := v.Args[0]
+		if v_0.Op != OpARM64SRL {
+			break
+		}
+		if v_0.Type != typ.UInt32 {
+			break
+		}
+		_ = v_0.Args[1]
+		v_0_0 := v_0.Args[0]
+		if v_0_0.Op != OpARM64MOVWUreg {
+			break
+		}
+		x := v_0_0.Args[0]
+		v_0_1 := v_0.Args[1]
+		if v_0_1.Op != OpARM64ANDconst {
+			break
+		}
+		t := v_0_1.Type
+		if v_0_1.AuxInt != 31 {
+			break
+		}
+		y := v_0_1.Args[0]
+		v_1 := v.Args[1]
+		if v_1.Op != OpARM64CSEL0 {
+			break
+		}
+		if v_1.Type != typ.UInt32 {
+			break
+		}
+		cc := v_1.Aux
+		_ = v_1.Args[1]
+		v_1_0 := v_1.Args[0]
+		if v_1_0.Op != OpARM64SLL {
+			break
+		}
+		_ = v_1_0.Args[1]
+		if x != v_1_0.Args[0] {
+			break
+		}
+		v_1_0_1 := v_1_0.Args[1]
+		if v_1_0_1.Op != OpARM64SUB {
+			break
+		}
+		if v_1_0_1.Type != t {
+			break
+		}
+		_ = v_1_0_1.Args[1]
+		v_1_0_1_0 := v_1_0_1.Args[0]
+		if v_1_0_1_0.Op != OpARM64MOVDconst {
+			break
+		}
+		if v_1_0_1_0.AuxInt != 32 {
+			break
+		}
+		v_1_0_1_1 := v_1_0_1.Args[1]
+		if v_1_0_1_1.Op != OpARM64ANDconst {
+			break
+		}
+		if v_1_0_1_1.Type != t {
+			break
+		}
+		if v_1_0_1_1.AuxInt != 31 {
+			break
+		}
+		if y != v_1_0_1_1.Args[0] {
+			break
+		}
+		v_1_1 := v_1.Args[1]
+		if v_1_1.Op != OpARM64CMPconst {
+			break
+		}
+		if v_1_1.AuxInt != 64 {
+			break
+		}
+		v_1_1_0 := v_1_1.Args[0]
+		if v_1_1_0.Op != OpARM64SUB {
+			break
+		}
+		if v_1_1_0.Type != t {
+			break
+		}
+		_ = v_1_1_0.Args[1]
+		v_1_1_0_0 := v_1_1_0.Args[0]
+		if v_1_1_0_0.Op != OpARM64MOVDconst {
+			break
+		}
+		if v_1_1_0_0.AuxInt != 32 {
+			break
+		}
+		v_1_1_0_1 := v_1_1_0.Args[1]
+		if v_1_1_0_1.Op != OpARM64ANDconst {
+			break
+		}
+		if v_1_1_0_1.Type != t {
+			break
+		}
+		if v_1_1_0_1.AuxInt != 31 {
+			break
+		}
+		if y != v_1_1_0_1.Args[0] {
+			break
+		}
+		if !(cc.(Op) == OpARM64LessThanU) {
+			break
+		}
+		v.reset(OpARM64RORW)
+		v.AddArg(x)
+		v.AddArg(y)
+		return true
+	}
+	// match: (XOR (CSEL0 <typ.UInt32> {cc} (SLL x (SUB <t> (MOVDconst [32]) (ANDconst <t> [31] y))) (CMPconst [64] (SUB <t> (MOVDconst [32]) (ANDconst <t> [31] y)))) (SRL <typ.UInt32> (MOVWUreg x) (ANDconst <t> [31] y)))
+	// cond: cc.(Op) == OpARM64LessThanU
+	// result: (RORW x y)
+	for {
+		_ = v.Args[1]
+		v_0 := v.Args[0]
+		if v_0.Op != OpARM64CSEL0 {
+			break
+		}
+		if v_0.Type != typ.UInt32 {
+			break
+		}
+		cc := v_0.Aux
+		_ = v_0.Args[1]
+		v_0_0 := v_0.Args[0]
+		if v_0_0.Op != OpARM64SLL {
+			break
+		}
+		_ = v_0_0.Args[1]
+		x := v_0_0.Args[0]
+		v_0_0_1 := v_0_0.Args[1]
+		if v_0_0_1.Op != OpARM64SUB {
+			break
+		}
+		t := v_0_0_1.Type
+		_ = v_0_0_1.Args[1]
+		v_0_0_1_0 := v_0_0_1.Args[0]
+		if v_0_0_1_0.Op != OpARM64MOVDconst {
+			break
+		}
+		if v_0_0_1_0.AuxInt != 32 {
+			break
+		}
+		v_0_0_1_1 := v_0_0_1.Args[1]
+		if v_0_0_1_1.Op != OpARM64ANDconst {
+			break
+		}
+		if v_0_0_1_1.Type != t {
+			break
+		}
+		if v_0_0_1_1.AuxInt != 31 {
+			break
+		}
+		y := v_0_0_1_1.Args[0]
+		v_0_1 := v_0.Args[1]
+		if v_0_1.Op != OpARM64CMPconst {
+			break
+		}
+		if v_0_1.AuxInt != 64 {
+			break
+		}
+		v_0_1_0 := v_0_1.Args[0]
+		if v_0_1_0.Op != OpARM64SUB {
+			break
+		}
+		if v_0_1_0.Type != t {
+			break
+		}
+		_ = v_0_1_0.Args[1]
+		v_0_1_0_0 := v_0_1_0.Args[0]
+		if v_0_1_0_0.Op != OpARM64MOVDconst {
+			break
+		}
+		if v_0_1_0_0.AuxInt != 32 {
+			break
+		}
+		v_0_1_0_1 := v_0_1_0.Args[1]
+		if v_0_1_0_1.Op != OpARM64ANDconst {
+			break
+		}
+		if v_0_1_0_1.Type != t {
+			break
+		}
+		if v_0_1_0_1.AuxInt != 31 {
+			break
+		}
+		if y != v_0_1_0_1.Args[0] {
+			break
+		}
+		v_1 := v.Args[1]
+		if v_1.Op != OpARM64SRL {
+			break
+		}
+		if v_1.Type != typ.UInt32 {
+			break
+		}
+		_ = v_1.Args[1]
+		v_1_0 := v_1.Args[0]
+		if v_1_0.Op != OpARM64MOVWUreg {
+			break
+		}
+		if x != v_1_0.Args[0] {
+			break
+		}
+		v_1_1 := v_1.Args[1]
+		if v_1_1.Op != OpARM64ANDconst {
+			break
+		}
+		if v_1_1.Type != t {
+			break
+		}
+		if v_1_1.AuxInt != 31 {
+			break
+		}
+		if y != v_1_1.Args[0] {
+			break
+		}
+		if !(cc.(Op) == OpARM64LessThanU) {
+			break
+		}
+		v.reset(OpARM64RORW)
+		v.AddArg(x)
+		v.AddArg(y)
+		return true
+	}
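+	// And the 32-bit rotate-right counterpart, roughly:
+	//
+	//	y &= 31
+	//	r := x>>y ^ x<<(32-y)
+	//
+	// which folds to RORW x y directly.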
 	return false
 }
 func rewriteValueARM64_OpARM64XORconst_0(v *Value) bool {
@@ -30534,6 +33338,42 @@
 		return true
 	}
 }
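+// Lowerings for the generic RotateLeft32/RotateLeft64 ops. arm64 only
+// has a rotate-right instruction, but rotating left by y is the same as
+// rotating right by width-y, and because RORW/ROR read the shift
+// register modulo the width, width-y and -y are interchangeable, so a
+// single NEG of the shift amount is enough.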
+func rewriteValueARM64_OpRotateLeft32_0(v *Value) bool {
+	b := v.Block
+	_ = b
+	// match: (RotateLeft32 x y)
+	// cond:
+	// result: (RORW x (NEG <y.Type> y))
+	for {
+		_ = v.Args[1]
+		x := v.Args[0]
+		y := v.Args[1]
+		v.reset(OpARM64RORW)
+		v.AddArg(x)
+		v0 := b.NewValue0(v.Pos, OpARM64NEG, y.Type)
+		v0.AddArg(y)
+		v.AddArg(v0)
+		return true
+	}
+}
+func rewriteValueARM64_OpRotateLeft64_0(v *Value) bool {
+	b := v.Block
+	_ = b
+	// match: (RotateLeft64 x y)
+	// cond:
+	// result: (ROR x (NEG <y.Type> y))
+	for {
+		_ = v.Args[1]
+		x := v.Args[0]
+		y := v.Args[1]
+		v.reset(OpARM64ROR)
+		v.AddArg(x)
+		v0 := b.NewValue0(v.Pos, OpARM64NEG, y.Type)
+		v0.AddArg(y)
+		v.AddArg(v0)
+		return true
+	}
+}
 func rewriteValueARM64_OpRound_0(v *Value) bool {
 	// match: (Round x)
 	// cond:
diff --git a/test/codegen/mathbits.go b/test/codegen/mathbits.go
index b8844c5..28354ed 100644
--- a/test/codegen/mathbits.go
+++ b/test/codegen/mathbits.go
@@ -195,6 +195,7 @@
 
 func RotateLeftVariable(n uint, m int) uint {
 	// amd64:"ROLQ"
+	// arm64:"ROR"
 	// ppc64:"ROTL"
 	// s390x:"RLLG"
 	return bits.RotateLeft(n, m)
@@ -202,6 +203,7 @@
 
 func RotateLeftVariable64(n uint64, m int) uint64 {
 	// amd64:"ROLQ"
+	// arm64:"ROR"
 	// ppc64:"ROTL"
 	// s390x:"RLLG"
 	return bits.RotateLeft64(n, m)
@@ -209,6 +211,7 @@
 
 func RotateLeftVariable32(n uint32, m int) uint32 {
 	// amd64:"ROLL"
+	// arm64:"RORW"
 	// ppc64:"ROTLW"
 	// s390x:"RLL"
 	return bits.RotateLeft32(n, m)