cmd/compile: add some intrinsics to SSA back end

One intrinsic (Ctz) was needed to help get the very best
performance out of a future GC; as long as that one was
being added, I also added Bswap, since that is sometimes
a handy thing to have.  I had intended to fill out the
bit-scan intrinsic family, but the mismatch between the
"bit scan reverse" instruction, which yields the index of
the highest set bit, and "count leading zeroes" was large
enough to cause me to leave Clz unlowered for now -- it
poses a dilemma that I'd rather dodge right now.

These intrinsics are not exposed for general use; that is
a separate issue that would require a proposal through the
API change process (https://github.com/golang/proposal).

All intrinsics are tested, both that they are substituted
on the appropriate architectures and that they produce the
expected results.
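
For illustration, a hedged sketch of the expected results
(the helper names here are hypothetical, not the actual
test entry points):

	// ctz64(0)  == 64    // zero input yields the bit width
	// ctz64(96) == 5     // 96 == 0b1100000, five trailing zeroes
	// bswap32(0x01020304) == 0x04030201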

Change-Id: I5848037cfd97de4f75bdc33bdd89bba00af4a8ee
Reviewed-on: https://go-review.googlesource.com/20564
Reviewed-by: Keith Randall <khr@golang.org>
Run-TryBot: David Chase <drchase@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
diff --git a/src/cmd/compile/internal/ssa/compile.go b/src/cmd/compile/internal/ssa/compile.go
index b8e2b42..d6c2bf8 100644
--- a/src/cmd/compile/internal/ssa/compile.go
+++ b/src/cmd/compile/internal/ssa/compile.go
@@ -120,6 +120,10 @@
 // Run consistency checker between each phase
 var checkEnabled = false
 
+// Debug output and disabling for intrinsics, set via the -d=ssa/intrinsics/... debug option
+var IntrinsicsDebug int
+var IntrinsicsDisable bool
+
 // PhaseOption sets the specified flag in the specified ssa phase,
 // returning empty string if this was successful or a string explaining
 // the error if it was not.
@@ -157,6 +161,20 @@
 		}
 	}
 
+	if phase == "intrinsics" {
+		switch flag {
+		case "on":
+			IntrinsicsDisable = val == 0
+		case "off":
+			IntrinsicsDisable = val != 0
+		case "debug":
+			IntrinsicsDebug = val
+		default:
+			return fmt.Sprintf("Did not find a flag matching %s in -d=ssa/%s debug option", flag, phase)
+		}
+		return ""
+	}
+
 	underphase := strings.Replace(phase, "_", " ", -1)
 	var re *regexp.Regexp
 	if phase[0] == '~' {
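
A minimal sketch of exercising the new flags through PhaseOption,
assuming it runs inside package ssa and that PhaseOption's signature
is (phase, flag string, val int) string, consistent with the doc
comment above (the command-line spelling -d=ssa/intrinsics/off
reaches this same path, per the error message):

	func exampleIntrinsicFlags() {
		// A non-empty return value means the flag was not recognized.
		if msg := PhaseOption("intrinsics", "off", 1); msg != "" {
			panic(msg)
		}
		if msg := PhaseOption("intrinsics", "debug", 2); msg != "" {
			panic(msg)
		}
		// Now IntrinsicsDisable == true and IntrinsicsDebug == 2.
	}
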
diff --git a/src/cmd/compile/internal/ssa/gen/AMD64.rules b/src/cmd/compile/internal/ssa/gen/AMD64.rules
index b595912..cc21097 100644
--- a/src/cmd/compile/internal/ssa/gen/AMD64.rules
+++ b/src/cmd/compile/internal/ssa/gen/AMD64.rules
@@ -92,6 +92,38 @@
 (Com16 x) -> (NOTW x)
 (Com8 x) -> (NOTB x)
 
+// The CMPQconst [0] below is redundant because BSF also sets Z for a zero input, but it is not clear how to remove it.
+(Ctz64 <t> x) -> (CMOVQEQconst (BSFQ <t> x) (CMPQconst x [0]) [64])
+(Ctz32 <t> x) -> (CMOVLEQconst (BSFL <t> x) (CMPLconst x [0]) [32])
+(Ctz16 <t> x) -> (CMOVWEQconst (BSFW <t> x) (CMPWconst x [0]) [16])
+
+(CMOVQEQconst x (InvertFlags y) [c]) -> (CMOVQNEconst x y [c])
+(CMOVLEQconst x (InvertFlags y) [c]) -> (CMOVLNEconst x y [c])
+(CMOVWEQconst x (InvertFlags y) [c]) -> (CMOVWNEconst x y [c])
+
+(CMOVQEQconst _ (FlagEQ) [c]) -> (Const64 [c])
+(CMOVLEQconst _ (FlagEQ) [c]) -> (Const32 [c])
+(CMOVWEQconst _ (FlagEQ) [c]) -> (Const16 [c])
+
+(CMOVQEQconst x (FlagLT_ULT)) -> x
+(CMOVLEQconst x (FlagLT_ULT)) -> x
+(CMOVWEQconst x (FlagLT_ULT)) -> x
+
+(CMOVQEQconst x (FlagLT_UGT)) -> x
+(CMOVLEQconst x (FlagLT_UGT)) -> x
+(CMOVWEQconst x (FlagLT_UGT)) -> x
+
+(CMOVQEQconst x (FlagGT_ULT)) -> x
+(CMOVLEQconst x (FlagGT_ULT)) -> x
+(CMOVWEQconst x (FlagGT_ULT)) -> x
+
+(CMOVQEQconst x (FlagGT_UGT)) -> x
+(CMOVLEQconst x (FlagGT_UGT)) -> x
+(CMOVWEQconst x (FlagGT_UGT)) -> x
+
+(Bswap64 x) -> (BSWAPQ x)
+(Bswap32 x) -> (BSWAPL x)
+
 (Sqrt x) -> (SQRTSD x)
 
 // Note: we always extend to 64 bits even though some ops don't need that many result bits.
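
Why the CMOV in the Ctz rules above: BSF leaves its output undefined
when the input is zero, so each rule pairs it with a conditional move
that substitutes the operand width when CMP*const x [0] sets Z. A
reference sketch of the intended semantics (not compiler code):

	// ctz64 mirrors (Ctz64 x): the x == 0 case is the
	// CMOVQEQconst ... [64] arm of the rule above.
	func ctz64(x uint64) uint64 {
		if x == 0 {
			return 64
		}
		var n uint64
		for x&1 == 0 { // count low-order zero bits, as BSFQ does
			x >>= 1
			n++
		}
		return n
	}

	// bswap32 mirrors (Bswap32 x) -> (BSWAPL x).
	func bswap32(x uint32) uint32 {
		return x>>24 | (x>>8)&0xff00 | (x<<8)&0xff0000 | x<<24
	}
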
diff --git a/src/cmd/compile/internal/ssa/gen/AMD64Ops.go b/src/cmd/compile/internal/ssa/gen/AMD64Ops.go
index 116e3ff..9dc09aa 100644
--- a/src/cmd/compile/internal/ssa/gen/AMD64Ops.go
+++ b/src/cmd/compile/internal/ssa/gen/AMD64Ops.go
@@ -103,9 +103,13 @@
 		gp11mod = regInfo{inputs: []regMask{ax, gpsp &^ dx}, outputs: []regMask{dx},
 			clobbers: ax | flags}
 
-		gp2flags  = regInfo{inputs: []regMask{gpsp, gpsp}, outputs: flagsonly}
-		gp1flags  = regInfo{inputs: []regMask{gpsp}, outputs: flagsonly}
-		flagsgp   = regInfo{inputs: flagsonly, outputs: gponly}
+		gp2flags = regInfo{inputs: []regMask{gpsp, gpsp}, outputs: flagsonly}
+		gp1flags = regInfo{inputs: []regMask{gpsp}, outputs: flagsonly}
+		flagsgp  = regInfo{inputs: flagsonly, outputs: gponly}
+
+		// for CMOVconst -- AX is clobbered to hold the constant temporarily; an input arriving in AX is moved out before the temporary is loaded.
+		gp1flagsgp = regInfo{inputs: []regMask{gp, flags}, clobbers: ax | flags, outputs: []regMask{gp &^ ax}}
+
 		readflags = regInfo{inputs: flagsonly, outputs: gponly}
 		flagsgpax = regInfo{inputs: flagsonly, clobbers: ax | flags, outputs: []regMask{gp &^ ax}}
 
@@ -307,6 +311,25 @@
 		{name: "NOTW", argLength: 1, reg: gp11, asm: "NOTL", resultInArg0: true}, // ^arg0
 		{name: "NOTB", argLength: 1, reg: gp11, asm: "NOTL", resultInArg0: true}, // ^arg0
 
+		{name: "BSFQ", argLength: 1, reg: gp11, asm: "BSFQ"}, // arg0 # of low-order zeroes ; undef if zero
+		{name: "BSFL", argLength: 1, reg: gp11, asm: "BSFL"}, // arg0 # of low-order zeroes ; undef if zero
+		{name: "BSFW", argLength: 1, reg: gp11, asm: "BSFW"}, // arg0 # of low-order zeroes ; undef if zero
+
+		{name: "BSRQ", argLength: 1, reg: gp11, asm: "BSRQ"}, // arg0 # of high-order zeroes ; undef if zero
+		{name: "BSRL", argLength: 1, reg: gp11, asm: "BSRL"}, // arg0 # of high-order zeroes ; undef if zero
+		{name: "BSRW", argLength: 1, reg: gp11, asm: "BSRW"}, // arg0 # of high-order zeroes ; undef if zero
+
+		// Note: the assembler moves the whole register for these ops, which is why the W variants assemble as CMOVL*.
+		{name: "CMOVQEQconst", argLength: 2, reg: gp1flagsgp, asm: "CMOVQEQ", typ: "UInt64", aux: "Int64", resultInArg0: true}, // replace arg0 w/ constant if Z set
+		{name: "CMOVLEQconst", argLength: 2, reg: gp1flagsgp, asm: "CMOVLEQ", typ: "UInt32", aux: "Int32", resultInArg0: true}, // replace arg0 w/ constant if Z set
+		{name: "CMOVWEQconst", argLength: 2, reg: gp1flagsgp, asm: "CMOVLEQ", typ: "UInt16", aux: "Int16", resultInArg0: true}, // replace arg0 w/ constant if Z set
+		{name: "CMOVQNEconst", argLength: 2, reg: gp1flagsgp, asm: "CMOVQNE", typ: "UInt64", aux: "Int64", resultInArg0: true}, // replace arg0 w/ constant if Z not set
+		{name: "CMOVLNEconst", argLength: 2, reg: gp1flagsgp, asm: "CMOVLNE", typ: "UInt32", aux: "Int32", resultInArg0: true}, // replace arg0 w/ constant if Z not set
+		{name: "CMOVWNEconst", argLength: 2, reg: gp1flagsgp, asm: "CMOVLNE", typ: "UInt16", aux: "Int16", resultInArg0: true}, // replace arg0 w/ constant if Z not set
+
+		{name: "BSWAPQ", argLength: 1, reg: gp11, asm: "BSWAPQ", resultInArg0: true}, // arg0 swap bytes
+		{name: "BSWAPL", argLength: 1, reg: gp11, asm: "BSWAPL", resultInArg0: true}, // arg0 swap bytes
+
 		{name: "SQRTSD", argLength: 1, reg: fp11, asm: "SQRTSD"}, // sqrt(arg0)
 
 		{name: "SBBQcarrymask", argLength: 1, reg: flagsgp, asm: "SBBQ"}, // (int64)(-1) if carry is set, 0 if carry is clear.
diff --git a/src/cmd/compile/internal/ssa/gen/genericOps.go b/src/cmd/compile/internal/ssa/gen/genericOps.go
index ab5e335..6d92926 100644
--- a/src/cmd/compile/internal/ssa/gen/genericOps.go
+++ b/src/cmd/compile/internal/ssa/gen/genericOps.go
@@ -237,6 +237,17 @@
 	{name: "Com32", argLength: 1},
 	{name: "Com64", argLength: 1},
 
+	{name: "Ctz16", argLength: 1}, // Count trailing (low  order) zeroes (returns 0-16)
+	{name: "Ctz32", argLength: 1}, // Count trailing zeroes (returns 0-32)
+	{name: "Ctz64", argLength: 1}, // Count trailing zeroes (returns 0-64)
+
+	{name: "Clz16", argLength: 1}, // Count leading (high order) zeroes (returns 0-16)
+	{name: "Clz32", argLength: 1}, // Count leading zeroes (returns 0-32)
+	{name: "Clz64", argLength: 1}, // Count leading zeroes (returns 0-64)
+
+	{name: "Bswap32", argLength: 1}, // Swap bytes
+	{name: "Bswap64", argLength: 1}, // Swap bytes
+
 	{name: "Sqrt", argLength: 1}, // sqrt(arg0), float64 only
 
 	// Data movement, max argument length for Phi is indefinite so just pick
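
Clz16/32/64 are declared here, but per the commit message no AMD64
lowering is added for them in this change: BSR returns the index of
the highest set bit, not a zero count, so the translation is less
direct than for Ctz. A reference sketch of the intended semantics
(not compiler code):

	// For nonzero x, BSRQ(x) == 63 - clz64(x), which is the
	// mismatch the commit message dodges.
	func clz64(x uint64) uint64 {
		var n uint64 = 64
		for x != 0 { // every remaining bit shrinks the zero count
			x >>= 1
			n--
		}
		return n
	}
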
diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go
index 3ff2b5a..e76efd4 100644
--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go
@@ -237,6 +237,20 @@
 	OpAMD64NOTL
 	OpAMD64NOTW
 	OpAMD64NOTB
+	OpAMD64BSFQ
+	OpAMD64BSFL
+	OpAMD64BSFW
+	OpAMD64BSRQ
+	OpAMD64BSRL
+	OpAMD64BSRW
+	OpAMD64CMOVQEQconst
+	OpAMD64CMOVLEQconst
+	OpAMD64CMOVWEQconst
+	OpAMD64CMOVQNEconst
+	OpAMD64CMOVLNEconst
+	OpAMD64CMOVWNEconst
+	OpAMD64BSWAPQ
+	OpAMD64BSWAPL
 	OpAMD64SQRTSD
 	OpAMD64SBBQcarrymask
 	OpAMD64SBBLcarrymask
@@ -521,6 +535,14 @@
 	OpCom16
 	OpCom32
 	OpCom64
+	OpCtz16
+	OpCtz32
+	OpCtz64
+	OpClz16
+	OpClz32
+	OpClz64
+	OpBswap32
+	OpBswap64
 	OpSqrt
 	OpPhi
 	OpCopy
@@ -2804,6 +2826,222 @@
 		},
 	},
 	{
+		name:   "BSFQ",
+		argLen: 1,
+		asm:    x86.ABSFQ,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 65535}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R14 R15
+			},
+			clobbers: 8589934592, // FLAGS
+			outputs: []regMask{
+				65519, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R14 R15
+			},
+		},
+	},
+	{
+		name:   "BSFL",
+		argLen: 1,
+		asm:    x86.ABSFL,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 65535}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R14 R15
+			},
+			clobbers: 8589934592, // FLAGS
+			outputs: []regMask{
+				65519, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R14 R15
+			},
+		},
+	},
+	{
+		name:   "BSFW",
+		argLen: 1,
+		asm:    x86.ABSFW,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 65535}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R14 R15
+			},
+			clobbers: 8589934592, // FLAGS
+			outputs: []regMask{
+				65519, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R14 R15
+			},
+		},
+	},
+	{
+		name:   "BSRQ",
+		argLen: 1,
+		asm:    x86.ABSRQ,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 65535}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R14 R15
+			},
+			clobbers: 8589934592, // FLAGS
+			outputs: []regMask{
+				65519, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R14 R15
+			},
+		},
+	},
+	{
+		name:   "BSRL",
+		argLen: 1,
+		asm:    x86.ABSRL,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 65535}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R14 R15
+			},
+			clobbers: 8589934592, // FLAGS
+			outputs: []regMask{
+				65519, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R14 R15
+			},
+		},
+	},
+	{
+		name:   "BSRW",
+		argLen: 1,
+		asm:    x86.ABSRW,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 65535}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R14 R15
+			},
+			clobbers: 8589934592, // FLAGS
+			outputs: []regMask{
+				65519, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R14 R15
+			},
+		},
+	},
+	{
+		name:         "CMOVQEQconst",
+		auxType:      auxInt64,
+		argLen:       2,
+		resultInArg0: true,
+		asm:          x86.ACMOVQEQ,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{1, 8589934592}, // FLAGS
+				{0, 65519},      // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R14 R15
+			},
+			clobbers: 8589934593, // AX FLAGS
+			outputs: []regMask{
+				65518, // CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R14 R15
+			},
+		},
+	},
+	{
+		name:         "CMOVLEQconst",
+		auxType:      auxInt32,
+		argLen:       2,
+		resultInArg0: true,
+		asm:          x86.ACMOVLEQ,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{1, 8589934592}, // FLAGS
+				{0, 65519},      // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R14 R15
+			},
+			clobbers: 8589934593, // AX FLAGS
+			outputs: []regMask{
+				65518, // CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R14 R15
+			},
+		},
+	},
+	{
+		name:         "CMOVWEQconst",
+		auxType:      auxInt16,
+		argLen:       2,
+		resultInArg0: true,
+		asm:          x86.ACMOVLEQ,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{1, 8589934592}, // FLAGS
+				{0, 65519},      // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R14 R15
+			},
+			clobbers: 8589934593, // AX FLAGS
+			outputs: []regMask{
+				65518, // CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R14 R15
+			},
+		},
+	},
+	{
+		name:         "CMOVQNEconst",
+		auxType:      auxInt64,
+		argLen:       2,
+		resultInArg0: true,
+		asm:          x86.ACMOVQNE,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{1, 8589934592}, // FLAGS
+				{0, 65519},      // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R14 R15
+			},
+			clobbers: 8589934593, // AX FLAGS
+			outputs: []regMask{
+				65518, // CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R14 R15
+			},
+		},
+	},
+	{
+		name:         "CMOVLNEconst",
+		auxType:      auxInt32,
+		argLen:       2,
+		resultInArg0: true,
+		asm:          x86.ACMOVLNE,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{1, 8589934592}, // FLAGS
+				{0, 65519},      // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R14 R15
+			},
+			clobbers: 8589934593, // AX FLAGS
+			outputs: []regMask{
+				65518, // CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R14 R15
+			},
+		},
+	},
+	{
+		name:         "CMOVWNEconst",
+		auxType:      auxInt16,
+		argLen:       2,
+		resultInArg0: true,
+		asm:          x86.ACMOVLNE,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{1, 8589934592}, // FLAGS
+				{0, 65519},      // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R14 R15
+			},
+			clobbers: 8589934593, // AX FLAGS
+			outputs: []regMask{
+				65518, // CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R14 R15
+			},
+		},
+	},
+	{
+		name:         "BSWAPQ",
+		argLen:       1,
+		resultInArg0: true,
+		asm:          x86.ABSWAPQ,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 65535}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R14 R15
+			},
+			clobbers: 8589934592, // FLAGS
+			outputs: []regMask{
+				65519, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R14 R15
+			},
+		},
+	},
+	{
+		name:         "BSWAPL",
+		argLen:       1,
+		resultInArg0: true,
+		asm:          x86.ABSWAPL,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 65535}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R14 R15
+			},
+			clobbers: 8589934592, // FLAGS
+			outputs: []regMask{
+				65519, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R14 R15
+			},
+		},
+	},
+	{
 		name:   "SQRTSD",
 		argLen: 1,
 		asm:    x86.ASQRTSD,
@@ -4982,6 +5220,46 @@
 		generic: true,
 	},
 	{
+		name:    "Ctz16",
+		argLen:  1,
+		generic: true,
+	},
+	{
+		name:    "Ctz32",
+		argLen:  1,
+		generic: true,
+	},
+	{
+		name:    "Ctz64",
+		argLen:  1,
+		generic: true,
+	},
+	{
+		name:    "Clz16",
+		argLen:  1,
+		generic: true,
+	},
+	{
+		name:    "Clz32",
+		argLen:  1,
+		generic: true,
+	},
+	{
+		name:    "Clz64",
+		argLen:  1,
+		generic: true,
+	},
+	{
+		name:    "Bswap32",
+		argLen:  1,
+		generic: true,
+	},
+	{
+		name:    "Bswap64",
+		argLen:  1,
+		generic: true,
+	},
+	{
 		name:    "Sqrt",
 		argLen:  1,
 		generic: true,
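
The numeric register masks in the generated table are one bit per
register; a sketch decoding the recurring constants, with bit
positions inferred from the generated comments (AX == bit 0 through
R15 == bit 15, SP == bit 4, FLAGS == bit 33):

	const (
		maskGPSP    = 1<<16 - 1            // 65535: all 16 GP registers, incl. SP
		maskGP      = maskGPSP &^ (1 << 4) // 65519: SP excluded
		maskGPnoAX  = maskGP &^ (1 << 0)   // 65518: SP and AX excluded
		maskFLAGS   = 1 << 33              // 8589934592
		maskAXFLAGS = maskFLAGS | 1        // 8589934593: the CMOV*const clobbers
	)
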
diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go
index 0469738..8dd1b15 100644
--- a/src/cmd/compile/internal/ssa/rewriteAMD64.go
+++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go
@@ -66,6 +66,16 @@
 		return rewriteValueAMD64_OpAnd8(v, config)
 	case OpAvg64u:
 		return rewriteValueAMD64_OpAvg64u(v, config)
+	case OpBswap32:
+		return rewriteValueAMD64_OpBswap32(v, config)
+	case OpBswap64:
+		return rewriteValueAMD64_OpBswap64(v, config)
+	case OpAMD64CMOVLEQconst:
+		return rewriteValueAMD64_OpAMD64CMOVLEQconst(v, config)
+	case OpAMD64CMOVQEQconst:
+		return rewriteValueAMD64_OpAMD64CMOVQEQconst(v, config)
+	case OpAMD64CMOVWEQconst:
+		return rewriteValueAMD64_OpAMD64CMOVWEQconst(v, config)
 	case OpAMD64CMPB:
 		return rewriteValueAMD64_OpAMD64CMPB(v, config)
 	case OpAMD64CMPBconst:
@@ -110,6 +120,12 @@
 		return rewriteValueAMD64_OpConstNil(v, config)
 	case OpConvert:
 		return rewriteValueAMD64_OpConvert(v, config)
+	case OpCtz16:
+		return rewriteValueAMD64_OpCtz16(v, config)
+	case OpCtz32:
+		return rewriteValueAMD64_OpCtz32(v, config)
+	case OpCtz64:
+		return rewriteValueAMD64_OpCtz64(v, config)
 	case OpCvt32Fto32:
 		return rewriteValueAMD64_OpCvt32Fto32(v, config)
 	case OpCvt32Fto64:
@@ -2119,6 +2135,307 @@
 	}
 	return false
 }
+func rewriteValueAMD64_OpBswap32(v *Value, config *Config) bool {
+	b := v.Block
+	_ = b
+	// match: (Bswap32 x)
+	// cond:
+	// result: (BSWAPL x)
+	for {
+		x := v.Args[0]
+		v.reset(OpAMD64BSWAPL)
+		v.AddArg(x)
+		return true
+	}
+	return false
+}
+func rewriteValueAMD64_OpBswap64(v *Value, config *Config) bool {
+	b := v.Block
+	_ = b
+	// match: (Bswap64 x)
+	// cond:
+	// result: (BSWAPQ x)
+	for {
+		x := v.Args[0]
+		v.reset(OpAMD64BSWAPQ)
+		v.AddArg(x)
+		return true
+	}
+	return false
+}
+func rewriteValueAMD64_OpAMD64CMOVLEQconst(v *Value, config *Config) bool {
+	b := v.Block
+	_ = b
+	// match: (CMOVLEQconst x (InvertFlags y) [c])
+	// cond:
+	// result: (CMOVLNEconst x y [c])
+	for {
+		x := v.Args[0]
+		v_1 := v.Args[1]
+		if v_1.Op != OpAMD64InvertFlags {
+			break
+		}
+		y := v_1.Args[0]
+		c := v.AuxInt
+		v.reset(OpAMD64CMOVLNEconst)
+		v.AddArg(x)
+		v.AddArg(y)
+		v.AuxInt = c
+		return true
+	}
+	// match: (CMOVLEQconst _ (FlagEQ) [c])
+	// cond:
+	// result: (Const32 [c])
+	for {
+		v_1 := v.Args[1]
+		if v_1.Op != OpAMD64FlagEQ {
+			break
+		}
+		c := v.AuxInt
+		v.reset(OpConst32)
+		v.AuxInt = c
+		return true
+	}
+	// match: (CMOVLEQconst x (FlagLT_ULT))
+	// cond:
+	// result: x
+	for {
+		x := v.Args[0]
+		v_1 := v.Args[1]
+		if v_1.Op != OpAMD64FlagLT_ULT {
+			break
+		}
+		v.reset(OpCopy)
+		v.Type = x.Type
+		v.AddArg(x)
+		return true
+	}
+	// match: (CMOVLEQconst x (FlagLT_UGT))
+	// cond:
+	// result: x
+	for {
+		x := v.Args[0]
+		v_1 := v.Args[1]
+		if v_1.Op != OpAMD64FlagLT_UGT {
+			break
+		}
+		v.reset(OpCopy)
+		v.Type = x.Type
+		v.AddArg(x)
+		return true
+	}
+	// match: (CMOVLEQconst x (FlagGT_ULT))
+	// cond:
+	// result: x
+	for {
+		x := v.Args[0]
+		v_1 := v.Args[1]
+		if v_1.Op != OpAMD64FlagGT_ULT {
+			break
+		}
+		v.reset(OpCopy)
+		v.Type = x.Type
+		v.AddArg(x)
+		return true
+	}
+	// match: (CMOVLEQconst x (FlagGT_UGT))
+	// cond:
+	// result: x
+	for {
+		x := v.Args[0]
+		v_1 := v.Args[1]
+		if v_1.Op != OpAMD64FlagGT_UGT {
+			break
+		}
+		v.reset(OpCopy)
+		v.Type = x.Type
+		v.AddArg(x)
+		return true
+	}
+	return false
+}
+func rewriteValueAMD64_OpAMD64CMOVQEQconst(v *Value, config *Config) bool {
+	b := v.Block
+	_ = b
+	// match: (CMOVQEQconst x (InvertFlags y) [c])
+	// cond:
+	// result: (CMOVQNEconst x y [c])
+	for {
+		x := v.Args[0]
+		v_1 := v.Args[1]
+		if v_1.Op != OpAMD64InvertFlags {
+			break
+		}
+		y := v_1.Args[0]
+		c := v.AuxInt
+		v.reset(OpAMD64CMOVQNEconst)
+		v.AddArg(x)
+		v.AddArg(y)
+		v.AuxInt = c
+		return true
+	}
+	// match: (CMOVQEQconst _ (FlagEQ) [c])
+	// cond:
+	// result: (Const64 [c])
+	for {
+		v_1 := v.Args[1]
+		if v_1.Op != OpAMD64FlagEQ {
+			break
+		}
+		c := v.AuxInt
+		v.reset(OpConst64)
+		v.AuxInt = c
+		return true
+	}
+	// match: (CMOVQEQconst x (FlagLT_ULT))
+	// cond:
+	// result: x
+	for {
+		x := v.Args[0]
+		v_1 := v.Args[1]
+		if v_1.Op != OpAMD64FlagLT_ULT {
+			break
+		}
+		v.reset(OpCopy)
+		v.Type = x.Type
+		v.AddArg(x)
+		return true
+	}
+	// match: (CMOVQEQconst x (FlagLT_UGT))
+	// cond:
+	// result: x
+	for {
+		x := v.Args[0]
+		v_1 := v.Args[1]
+		if v_1.Op != OpAMD64FlagLT_UGT {
+			break
+		}
+		v.reset(OpCopy)
+		v.Type = x.Type
+		v.AddArg(x)
+		return true
+	}
+	// match: (CMOVQEQconst x (FlagGT_ULT))
+	// cond:
+	// result: x
+	for {
+		x := v.Args[0]
+		v_1 := v.Args[1]
+		if v_1.Op != OpAMD64FlagGT_ULT {
+			break
+		}
+		v.reset(OpCopy)
+		v.Type = x.Type
+		v.AddArg(x)
+		return true
+	}
+	// match: (CMOVQEQconst x (FlagGT_UGT))
+	// cond:
+	// result: x
+	for {
+		x := v.Args[0]
+		v_1 := v.Args[1]
+		if v_1.Op != OpAMD64FlagGT_UGT {
+			break
+		}
+		v.reset(OpCopy)
+		v.Type = x.Type
+		v.AddArg(x)
+		return true
+	}
+	return false
+}
+func rewriteValueAMD64_OpAMD64CMOVWEQconst(v *Value, config *Config) bool {
+	b := v.Block
+	_ = b
+	// match: (CMOVWEQconst x (InvertFlags y) [c])
+	// cond:
+	// result: (CMOVWNEconst x y [c])
+	for {
+		x := v.Args[0]
+		v_1 := v.Args[1]
+		if v_1.Op != OpAMD64InvertFlags {
+			break
+		}
+		y := v_1.Args[0]
+		c := v.AuxInt
+		v.reset(OpAMD64CMOVWNEconst)
+		v.AddArg(x)
+		v.AddArg(y)
+		v.AuxInt = c
+		return true
+	}
+	// match: (CMOVWEQconst _ (FlagEQ) [c])
+	// cond:
+	// result: (Const16 [c])
+	for {
+		v_1 := v.Args[1]
+		if v_1.Op != OpAMD64FlagEQ {
+			break
+		}
+		c := v.AuxInt
+		v.reset(OpConst16)
+		v.AuxInt = c
+		return true
+	}
+	// match: (CMOVWEQconst x (FlagLT_ULT))
+	// cond:
+	// result: x
+	for {
+		x := v.Args[0]
+		v_1 := v.Args[1]
+		if v_1.Op != OpAMD64FlagLT_ULT {
+			break
+		}
+		v.reset(OpCopy)
+		v.Type = x.Type
+		v.AddArg(x)
+		return true
+	}
+	// match: (CMOVWEQconst x (FlagLT_UGT))
+	// cond:
+	// result: x
+	for {
+		x := v.Args[0]
+		v_1 := v.Args[1]
+		if v_1.Op != OpAMD64FlagLT_UGT {
+			break
+		}
+		v.reset(OpCopy)
+		v.Type = x.Type
+		v.AddArg(x)
+		return true
+	}
+	// match: (CMOVWEQconst x (FlagGT_ULT))
+	// cond:
+	// result: x
+	for {
+		x := v.Args[0]
+		v_1 := v.Args[1]
+		if v_1.Op != OpAMD64FlagGT_ULT {
+			break
+		}
+		v.reset(OpCopy)
+		v.Type = x.Type
+		v.AddArg(x)
+		return true
+	}
+	// match: (CMOVWEQconst x (FlagGT_UGT))
+	// cond:
+	// result: x
+	for {
+		x := v.Args[0]
+		v_1 := v.Args[1]
+		if v_1.Op != OpAMD64FlagGT_UGT {
+			break
+		}
+		v.reset(OpCopy)
+		v.Type = x.Type
+		v.AddArg(x)
+		return true
+	}
+	return false
+}
 func rewriteValueAMD64_OpAMD64CMPB(v *Value, config *Config) bool {
 	b := v.Block
 	_ = b
@@ -3026,6 +3343,72 @@
 	}
 	return false
 }
+func rewriteValueAMD64_OpCtz16(v *Value, config *Config) bool {
+	b := v.Block
+	_ = b
+	// match: (Ctz16 <t> x)
+	// cond:
+	// result: (CMOVWEQconst (BSFW <t> x) (CMPWconst x [0]) [16])
+	for {
+		t := v.Type
+		x := v.Args[0]
+		v.reset(OpAMD64CMOVWEQconst)
+		v0 := b.NewValue0(v.Line, OpAMD64BSFW, t)
+		v0.AddArg(x)
+		v.AddArg(v0)
+		v1 := b.NewValue0(v.Line, OpAMD64CMPWconst, TypeFlags)
+		v1.AddArg(x)
+		v1.AuxInt = 0
+		v.AddArg(v1)
+		v.AuxInt = 16
+		return true
+	}
+	return false
+}
+func rewriteValueAMD64_OpCtz32(v *Value, config *Config) bool {
+	b := v.Block
+	_ = b
+	// match: (Ctz32 <t> x)
+	// cond:
+	// result: (CMOVLEQconst (BSFL <t> x) (CMPLconst x [0]) [32])
+	for {
+		t := v.Type
+		x := v.Args[0]
+		v.reset(OpAMD64CMOVLEQconst)
+		v0 := b.NewValue0(v.Line, OpAMD64BSFL, t)
+		v0.AddArg(x)
+		v.AddArg(v0)
+		v1 := b.NewValue0(v.Line, OpAMD64CMPLconst, TypeFlags)
+		v1.AddArg(x)
+		v1.AuxInt = 0
+		v.AddArg(v1)
+		v.AuxInt = 32
+		return true
+	}
+	return false
+}
+func rewriteValueAMD64_OpCtz64(v *Value, config *Config) bool {
+	b := v.Block
+	_ = b
+	// match: (Ctz64 <t> x)
+	// cond:
+	// result: (CMOVQEQconst (BSFQ <t> x) (CMPQconst x [0]) [64])
+	for {
+		t := v.Type
+		x := v.Args[0]
+		v.reset(OpAMD64CMOVQEQconst)
+		v0 := b.NewValue0(v.Line, OpAMD64BSFQ, t)
+		v0.AddArg(x)
+		v.AddArg(v0)
+		v1 := b.NewValue0(v.Line, OpAMD64CMPQconst, TypeFlags)
+		v1.AddArg(x)
+		v1.AuxInt = 0
+		v.AddArg(v1)
+		v.AuxInt = 64
+		return true
+	}
+	return false
+}
 func rewriteValueAMD64_OpCvt32Fto32(v *Value, config *Config) bool {
 	b := v.Block
 	_ = b