src/cmd/compile/internal/ssa/gen/AMD64.rules - go - Git at Google

 // Copyright 2015 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

 // x86 register conventions:
 //  - Integer types live in the low portion of registers.  Upper portions are junk.
 //  - Boolean types use the low-order byte of a register.  Upper bytes are junk.
 //  - We do not use AH,BH,CH,DH registers.
 //  - Floating-point types will live in the low natural slot of an sse2 register.
 //    Unused portions are junk.

 // Lowering arithmetic
 (Add64 x y) -> (ADDQ x y)
 (AddPtr x y) -> (ADDQ x y)
 (Add32 x y) -> (ADDL x y)
 (Add16 x y) -> (ADDW x y)
 (Add8 x y) -> (ADDB x y)
 (Add32F x y) -> (ADDSS x y)
 (Add64F x y) -> (ADDSD x y)

 (Sub64 x y) -> (SUBQ x y)
 (SubPtr x y) -> (SUBQ x y)
 (Sub32 x y) -> (SUBL x y)
 (Sub16 x y) -> (SUBW x y)
 (Sub8 x y) -> (SUBB x y)
 (Sub32F x y) -> (SUBSS x y)
 (Sub64F x y) -> (SUBSD x y)

 (Mul64 x y) -> (MULQ x y)
 (Mul32 x y) -> (MULL x y)
 (Mul16 x y) -> (MULW x y)
 (Mul8 x y) -> (MULB x y)
 (Mul32F x y) -> (MULSS x y)
 (Mul64F x y) -> (MULSD x y)

 (Div32F x y) -> (DIVSS x y)
 (Div64F x y) -> (DIVSD x y)

 // Integer division.  64/32/16-bit divides map directly onto the same-width op.
 // There is no 8-bit divide lowering here: 8-bit operands are first sign-extended
 // (signed) or zero-extended (unsigned) to 16 bits and then use the 16-bit op.
 (Div64 x y) -> (DIVQ x y)
 (Div64u x y) -> (DIVQU x y)
 (Div32 x y) -> (DIVL x y)
 (Div32u x y) -> (DIVLU x y)
 (Div16 x y) -> (DIVW x y)
 (Div16u x y) -> (DIVWU x y)
 (Div8 x y) ->  (DIVW (SignExt8to16 x) (SignExt8to16 y))
 (Div8u x y) ->  (DIVWU (ZeroExt8to16 x) (ZeroExt8to16 y))

 (Hmul32 x y) -> (HMULL x y)
 (Hmul32u x y) -> (HMULLU x y)
 (Hmul16 x y) -> (HMULW x y)
 (Hmul16u x y) -> (HMULWU x y)
 (Hmul8 x y) ->  (HMULB x y)
 (Hmul8u x y) ->  (HMULBU x y)

 // Integer remainder.  Mirrors the division rules: 64/32/16-bit use the
 // same-width op, while 8-bit operands are sign- or zero-extended to 16 bits
 // and use the 16-bit op.
 (Mod64 x y) -> (MODQ x y)
 (Mod64u x y) -> (MODQU x y)
 (Mod32 x y) -> (MODL x y)
 (Mod32u x y) -> (MODLU x y)
 (Mod16 x y) -> (MODW x y)
 (Mod16u x y) -> (MODWU x y)
 (Mod8 x y) ->  (MODW (SignExt8to16 x) (SignExt8to16 y))
 (Mod8u x y) ->  (MODWU (ZeroExt8to16 x) (ZeroExt8to16 y))

 (And64 x y) -> (ANDQ x y)
 (And32 x y) -> (ANDL x y)
 (And16 x y) -> (ANDW x y)
 (And8 x y) -> (ANDB x y)

 (Or64 x y) -> (ORQ x y)
 (Or32 x y) -> (ORL x y)
 (Or16 x y) -> (ORW x y)
 (Or8 x y) -> (ORB x y)

 (Xor64 x y) -> (XORQ x y)
 (Xor32 x y) -> (XORL x y)
 (Xor16 x y) -> (XORW x y)
 (Xor8 x y) -> (XORB x y)

 (Neg64 x) -> (NEGQ x)
 (Neg32 x) -> (NEGL x)
 (Neg16 x) -> (NEGW x)
 (Neg8 x) -> (NEGB x)
 (Neg32F x) -> (PXOR x (MOVSSconst <config.Frontend().TypeFloat32()> [f2i(math.Copysign(0, -1))]))
 (Neg64F x) -> (PXOR x (MOVSDconst <config.Frontend().TypeFloat64()> [f2i(math.Copysign(0, -1))]))

 (Com64 x) -> (NOTQ x)
 (Com32 x) -> (NOTL x)
 (Com16 x) -> (NOTW x)
 (Com8 x) -> (NOTB x)

 (Sqrt x) -> (SQRTSD x)

 // Note: we always extend to 64 bits even though some ops don't need that many result bits.
 (SignExt8to16 x) -> (MOVBQSX x)
 (SignExt8to32 x) -> (MOVBQSX x)
 (SignExt8to64 x) -> (MOVBQSX x)
 (SignExt16to32 x) -> (MOVWQSX x)
 (SignExt16to64 x) -> (MOVWQSX x)
 (SignExt32to64 x) -> (MOVLQSX x)

 (ZeroExt8to16 x) -> (MOVBQZX x)
 (ZeroExt8to32 x) -> (MOVBQZX x)
 (ZeroExt8to64 x) -> (MOVBQZX x)
 (ZeroExt16to32 x) -> (MOVWQZX x)
 (ZeroExt16to64 x) -> (MOVWQZX x)
 (ZeroExt32to64 x) -> (MOVLQZX x)

 (Cvt32to32F x) -> (CVTSL2SS x)
 (Cvt32to64F x) -> (CVTSL2SD x)
 (Cvt64to32F x) -> (CVTSQ2SS x)
 (Cvt64to64F x) -> (CVTSQ2SD x)

 (Cvt32Fto32 x) -> (CVTTSS2SL x)
 (Cvt32Fto64 x) -> (CVTTSS2SQ x)
 (Cvt64Fto32 x) -> (CVTTSD2SL x)
 (Cvt64Fto64 x) -> (CVTTSD2SQ x)

 (Cvt32Fto64F x) -> (CVTSS2SD x)
 (Cvt64Fto32F x) -> (CVTSD2SS x)

 // Because we ignore high parts of registers, truncates are just copies.
 (Trunc16to8 x) -> x
 (Trunc32to8 x) -> x
 (Trunc32to16 x) -> x
 (Trunc64to8 x) -> x
 (Trunc64to16 x) -> x
 (Trunc64to32 x) -> x

 // Lowering shifts
 // Unsigned shifts need to return 0 if shift amount is >= width of shifted value.
 //   result = (arg << shift) & (shift >= argbits ? 0 : 0xffffffffffffffff)
 // Note: for small shifts we generate 32 bits of mask even when we don't need it all.
 (Lsh64x64 <t> x y) -> (ANDQ (SHLQ <t> x y) (SBBQcarrymask <t> (CMPQconst [64] y)))
 (Lsh64x32 <t> x y) -> (ANDQ (SHLQ <t> x y) (SBBQcarrymask <t> (CMPLconst [64] y)))
 (Lsh64x16 <t> x y) -> (ANDQ (SHLQ <t> x y) (SBBQcarrymask <t> (CMPWconst [64] y)))
 (Lsh64x8 <t> x y)  -> (ANDQ (SHLQ <t> x y) (SBBQcarrymask <t> (CMPBconst [64] y)))

 (Lsh32x64 <t> x y) -> (ANDL (SHLL <t> x y) (SBBLcarrymask <t> (CMPQconst [32] y)))
 (Lsh32x32 <t> x y) -> (ANDL (SHLL <t> x y) (SBBLcarrymask <t> (CMPLconst [32] y)))
 (Lsh32x16 <t> x y) -> (ANDL (SHLL <t> x y) (SBBLcarrymask <t> (CMPWconst [32] y)))
 (Lsh32x8 <t> x y)  -> (ANDL (SHLL <t> x y) (SBBLcarrymask <t> (CMPBconst [32] y)))

 (Lsh16x64 <t> x y) -> (ANDW (SHLW <t> x y) (SBBLcarrymask <t> (CMPQconst [16] y)))
 (Lsh16x32 <t> x y) -> (ANDW (SHLW <t> x y) (SBBLcarrymask <t> (CMPLconst [16] y)))
 (Lsh16x16 <t> x y) -> (ANDW (SHLW <t> x y) (SBBLcarrymask <t> (CMPWconst [16] y)))
 (Lsh16x8 <t> x y)  -> (ANDW (SHLW <t> x y) (SBBLcarrymask <t> (CMPBconst [16] y)))

 (Lsh8x64 <t> x y)  -> (ANDB (SHLB <t> x y) (SBBLcarrymask <t> (CMPQconst [8] y)))
 (Lsh8x32 <t> x y)  -> (ANDB (SHLB <t> x y) (SBBLcarrymask <t> (CMPLconst [8] y)))
 (Lsh8x16 <t> x y)  -> (ANDB (SHLB <t> x y) (SBBLcarrymask <t> (CMPWconst [8] y)))
 (Lsh8x8 <t> x y)   -> (ANDB (SHLB <t> x y) (SBBLcarrymask <t> (CMPBconst [8] y)))

 (Lrot64 <t> x [c]) -> (ROLQconst <t> [c&63] x)
 (Lrot32 <t> x [c]) -> (ROLLconst <t> [c&31] x)
 (Lrot16 <t> x [c]) -> (ROLWconst <t> [c&15] x)
 (Lrot8 <t> x [c])  -> (ROLBconst <t> [c&7] x)

 (Rsh64Ux64 <t> x y) -> (ANDQ (SHRQ <t> x y) (SBBQcarrymask <t> (CMPQconst [64] y)))
 (Rsh64Ux32 <t> x y) -> (ANDQ (SHRQ <t> x y) (SBBQcarrymask <t> (CMPLconst [64] y)))
 (Rsh64Ux16 <t> x y) -> (ANDQ (SHRQ <t> x y) (SBBQcarrymask <t> (CMPWconst [64] y)))
 (Rsh64Ux8 <t> x y)  -> (ANDQ (SHRQ <t> x y) (SBBQcarrymask <t> (CMPBconst [64] y)))

 (Rsh32Ux64 <t> x y) -> (ANDL (SHRL <t> x y) (SBBLcarrymask <t> (CMPQconst [32] y)))
 (Rsh32Ux32 <t> x y) -> (ANDL (SHRL <t> x y) (SBBLcarrymask <t> (CMPLconst [32] y)))
 (Rsh32Ux16 <t> x y) -> (ANDL (SHRL <t> x y) (SBBLcarrymask <t> (CMPWconst [32] y)))
 (Rsh32Ux8 <t> x y)  -> (ANDL (SHRL <t> x y) (SBBLcarrymask <t> (CMPBconst [32] y)))

 (Rsh16Ux64 <t> x y) -> (ANDW (SHRW <t> x y) (SBBLcarrymask <t> (CMPQconst [16] y)))
 (Rsh16Ux32 <t> x y) -> (ANDW (SHRW <t> x y) (SBBLcarrymask <t> (CMPLconst [16] y)))
 (Rsh16Ux16 <t> x y) -> (ANDW (SHRW <t> x y) (SBBLcarrymask <t> (CMPWconst [16] y)))
 (Rsh16Ux8 <t> x y)  -> (ANDW (SHRW <t> x y) (SBBLcarrymask <t> (CMPBconst [16] y)))

 (Rsh8Ux64 <t> x y)  -> (ANDB (SHRB <t> x y) (SBBLcarrymask <t> (CMPQconst [8] y)))
 (Rsh8Ux32 <t> x y)  -> (ANDB (SHRB <t> x y) (SBBLcarrymask <t> (CMPLconst [8] y)))
 (Rsh8Ux16 <t> x y)  -> (ANDB (SHRB <t> x y) (SBBLcarrymask <t> (CMPWconst [8] y)))
 (Rsh8Ux8 <t> x y)   -> (ANDB (SHRB <t> x y) (SBBLcarrymask <t> (CMPBconst [8] y)))

 // Signed right shift needs to return 0/-1 if shift amount is >= width of shifted value.
 // We implement this by setting the shift value to -1 (all ones) if the shift value is >= width.
 // Note: for small shift widths we generate 32 bits of mask even when we don't need it all.
 (Rsh64x64 <t> x y) -> (SARQ <t> x (ORQ <y.Type> y (NOTQ <y.Type> (SBBQcarrymask <y.Type> (CMPQconst [64] y)))))
 (Rsh64x32 <t> x y) -> (SARQ <t> x (ORL <y.Type> y (NOTL <y.Type> (SBBLcarrymask <y.Type> (CMPLconst [64] y)))))
 (Rsh64x16 <t> x y) -> (SARQ <t> x (ORW <y.Type> y (NOTL <y.Type> (SBBLcarrymask <y.Type> (CMPWconst [64] y)))))
 (Rsh64x8 <t> x y)  -> (SARQ <t> x (ORB <y.Type> y (NOTL <y.Type> (SBBLcarrymask <y.Type> (CMPBconst [64] y)))))

 (Rsh32x64 <t> x y) -> (SARL <t> x (ORQ <y.Type> y (NOTQ <y.Type> (SBBQcarrymask <y.Type> (CMPQconst [32] y)))))
 (Rsh32x32 <t> x y) -> (SARL <t> x (ORL <y.Type> y (NOTL <y.Type> (SBBLcarrymask <y.Type> (CMPLconst [32] y)))))
 (Rsh32x16 <t> x y) -> (SARL <t> x (ORW <y.Type> y (NOTL <y.Type> (SBBLcarrymask <y.Type> (CMPWconst [32] y)))))
 (Rsh32x8 <t> x y)  -> (SARL <t> x (ORB <y.Type> y (NOTL <y.Type> (SBBLcarrymask <y.Type> (CMPBconst [32] y)))))

 (Rsh16x64 <t> x y) -> (SARW <t> x (ORQ <y.Type> y (NOTQ <y.Type> (SBBQcarrymask <y.Type> (CMPQconst [16] y)))))
 (Rsh16x32 <t> x y) -> (SARW <t> x (ORL <y.Type> y (NOTL <y.Type> (SBBLcarrymask <y.Type> (CMPLconst [16] y)))))
 (Rsh16x16 <t> x y) -> (SARW <t> x (ORW <y.Type> y (NOTL <y.Type> (SBBLcarrymask <y.Type> (CMPWconst [16] y)))))
 (Rsh16x8 <t> x y)  -> (SARW <t> x (ORB <y.Type> y (NOTL <y.Type> (SBBLcarrymask <y.Type> (CMPBconst [16] y)))))

 (Rsh8x64 <t> x y)  -> (SARB <t> x (ORQ <y.Type> y (NOTQ <y.Type> (SBBQcarrymask <y.Type> (CMPQconst [8] y)))))
 (Rsh8x32 <t> x y)  -> (SARB <t> x (ORL <y.Type> y (NOTL <y.Type> (SBBLcarrymask <y.Type> (CMPLconst [8] y)))))
 (Rsh8x16 <t> x y)  -> (SARB <t> x (ORW <y.Type> y (NOTL <y.Type> (SBBLcarrymask <y.Type> (CMPWconst [8] y)))))
 (Rsh8x8 <t> x y)   -> (SARB <t> x (ORB <y.Type> y (NOTL <y.Type> (SBBLcarrymask <y.Type> (CMPBconst [8] y)))))

 (Less64 x y) -> (SETL (CMPQ x y))
 (Less32 x y) -> (SETL (CMPL x y))
 (Less16 x y) -> (SETL (CMPW x y))
 (Less8  x y) -> (SETL (CMPB x y))
 (Less64U x y) -> (SETB (CMPQ x y))
 (Less32U x y) -> (SETB (CMPL x y))
 (Less16U x y) -> (SETB (CMPW x y))
 (Less8U  x y) -> (SETB (CMPB x y))
 // Use SETGF with reversed operands to dodge NaN case
 (Less64F x y) -> (SETGF (UCOMISD y x))
 (Less32F x y) -> (SETGF (UCOMISS y x))

 (Leq64 x y) -> (SETLE (CMPQ x y))
 (Leq32 x y) -> (SETLE (CMPL x y))
 (Leq16 x y) -> (SETLE (CMPW x y))
 (Leq8  x y) -> (SETLE (CMPB x y))
 (Leq64U x y) -> (SETBE (CMPQ x y))
 (Leq32U x y) -> (SETBE (CMPL x y))
 (Leq16U x y) -> (SETBE (CMPW x y))
 (Leq8U  x y) -> (SETBE (CMPB x y))
 // Use SETGEF with reversed operands to dodge NaN case
 (Leq64F x y) -> (SETGEF (UCOMISD y x))
 (Leq32F x y) -> (SETGEF (UCOMISS y x))

 (Greater64 x y) -> (SETG (CMPQ x y))
 (Greater32 x y) -> (SETG (CMPL x y))
 (Greater16 x y) -> (SETG (CMPW x y))
 (Greater8  x y) -> (SETG (CMPB x y))
 (Greater64U x y) -> (SETA (CMPQ x y))
 (Greater32U x y) -> (SETA (CMPL x y))
 (Greater16U x y) -> (SETA (CMPW x y))
 (Greater8U  x y) -> (SETA (CMPB x y))
 // Note Go assembler gets UCOMISx operand order wrong, but it is right here
 // Bug is accommodated at generation of assembly language.
 (Greater64F x y) -> (SETGF (UCOMISD x y))
 (Greater32F x y) -> (SETGF (UCOMISS x y))

 (Geq64 x y) -> (SETGE (CMPQ x y))
 (Geq32 x y) -> (SETGE (CMPL x y))
 (Geq16 x y) -> (SETGE (CMPW x y))
 (Geq8  x y) -> (SETGE (CMPB x y))
 (Geq64U x y) -> (SETAE (CMPQ x y))
 (Geq32U x y) -> (SETAE (CMPL x y))
 (Geq16U x y) -> (SETAE (CMPW x y))
 (Geq8U  x y) -> (SETAE (CMPB x y))
 // Note Go assembler gets UCOMISx operand order wrong, but it is right here
 // Bug is accommodated at generation of assembly language.
 (Geq64F x y) -> (SETGEF (UCOMISD x y))
 (Geq32F x y) -> (SETGEF (UCOMISS x y))

 (Eq64 x y) -> (SETEQ (CMPQ x y))
 (Eq32 x y) -> (SETEQ (CMPL x y))
 (Eq16 x y) -> (SETEQ (CMPW x y))
 (Eq8 x y) -> (SETEQ (CMPB x y))
 (EqPtr x y) -> (SETEQ (CMPQ x y))
 (Eq64F x y) -> (SETEQF (UCOMISD x y))
 (Eq32F x y) -> (SETEQF (UCOMISS x y))

 (Neq64 x y) -> (SETNE (CMPQ x y))
 (Neq32 x y) -> (SETNE (CMPL x y))
 (Neq16 x y) -> (SETNE (CMPW x y))
 (Neq8 x y) -> (SETNE (CMPB x y))
 (NeqPtr x y) -> (SETNE (CMPQ x y))
 (Neq64F x y) -> (SETNEF (UCOMISD x y))
 (Neq32F x y) -> (SETNEF (UCOMISS x y))

 // Lowering loads: the load instruction is selected from the type of the
 // loaded value.  Pointers use the 64-bit integer load and booleans use the
 // byte load.
 (Load <t> ptr mem) && (is64BitInt(t) || isPtr(t)) -> (MOVQload ptr mem)
 (Load <t> ptr mem) && is32BitInt(t) -> (MOVLload ptr mem)
 (Load <t> ptr mem) && is16BitInt(t) -> (MOVWload ptr mem)
 (Load <t> ptr mem) && (t.IsBoolean() || is8BitInt(t)) -> (MOVBload ptr mem)
 (Load <t> ptr mem) && is32BitFloat(t) -> (MOVSSload ptr mem)
 (Load <t> ptr mem) && is64BitFloat(t) -> (MOVSDload ptr mem)

 // These more-specific FP versions of Store pattern should come first.
 (Store [8] ptr val mem) && is64BitFloat(val.Type) -> (MOVSDstore ptr val mem)
 (Store [4] ptr val mem) && is32BitFloat(val.Type) -> (MOVSSstore ptr val mem)

 (Store [8] ptr val mem) -> (MOVQstore ptr val mem)
 (Store [4] ptr val mem) -> (MOVLstore ptr val mem)
 (Store [2] ptr val mem) -> (MOVWstore ptr val mem)
 (Store [1] ptr val mem) -> (MOVBstore ptr val mem)

 // We want this to stick out so the to/from ptr conversion is obvious
 (Convert <t> x mem) -> (MOVQconvert <t> x mem)

 // checks
 (IsNonNil p) -> (SETNE (TESTQ p p))
 (IsInBounds idx len) -> (SETB (CMPQ idx len))
 (IsSliceInBounds idx len) -> (SETBE (CMPQ idx len))
 (NilCheck ptr mem) -> (LoweredNilCheck ptr mem)

 (GetG mem) -> (LoweredGetG mem)
 (GetClosurePtr) -> (LoweredGetClosurePtr)

 // Small moves
 // Sizes 1, 2, 4, 8 and 16 are a single load/store pair.
 (Move [0] _ _ mem) -> mem
 (Move [1] dst src mem) -> (MOVBstore dst (MOVBload src mem) mem)
 (Move [2] dst src mem) -> (MOVWstore dst (MOVWload src mem) mem)
 (Move [4] dst src mem) -> (MOVLstore dst (MOVLload src mem) mem)
 (Move [8] dst src mem) -> (MOVQstore dst (MOVQload src mem) mem)
 (Move [16] dst src mem) -> (MOVOstore dst (MOVOload src mem) mem)
 // Sizes 3, 5 and 6 are two non-overlapping pieces: the low piece at offset 0,
 // then the remainder at the offset given in [..].
 (Move [3] dst src mem) ->
 	(MOVBstore [2] dst (MOVBload [2] src mem)
 		(MOVWstore dst (MOVWload src mem) mem))
 (Move [5] dst src mem) ->
 	(MOVBstore [4] dst (MOVBload [4] src mem)
 		(MOVLstore dst (MOVLload src mem) mem))
 (Move [6] dst src mem) ->
 	(MOVWstore [4] dst (MOVWload [4] src mem)
 		(MOVLstore dst (MOVLload src mem) mem))
 // Size 7 uses two 4-byte pieces at offsets 0 and 3; they overlap by one byte.
 (Move [7] dst src mem) ->
 	(MOVLstore [3] dst (MOVLload [3] src mem)
 		(MOVLstore dst (MOVLload src mem) mem))
 // Sizes 9..15 use two 8-byte pieces at offsets 0 and size-8, which overlap.
 (Move [size] dst src mem) && size > 8 && size < 16 ->
 	(MOVQstore [size-8] dst (MOVQload [size-8] src mem)
 		(MOVQstore dst (MOVQload src mem) mem))

 // Adjust moves to be a multiple of 16 bytes.
 // The leading bytes are copied with one 8-byte (remainder <= 8) or one 16-byte
 // (remainder > 8) load/store at offset 0.  Both pointers are then advanced by
 // size%16 and the remaining size-size%16 bytes are left as a new Move.
 // The leading copy is wider than size%16 unless the remainder is exactly 8,
 // so it overlaps the start of the remaining Move.
 (Move [size] dst src mem) && size > 16 && size%16 != 0 && size%16 <= 8 ->
 	(Move [size-size%16] (ADDQconst <dst.Type> dst [size%16]) (ADDQconst <src.Type> src [size%16])
 		(MOVQstore dst (MOVQload src mem) mem))
 (Move [size] dst src mem) && size > 16 && size%16 != 0 && size%16 > 8 ->
 	(Move [size-size%16] (ADDQconst <dst.Type> dst [size%16]) (ADDQconst <src.Type> src [size%16])
 		(MOVOstore dst (MOVOload src mem) mem))

 // Medium copying uses a duff device.
 (Move [size] dst src mem) && size >= 32 && size <= 16*64 && size%16 == 0 ->
 	(DUFFCOPY [14*(64-size/16)] dst src mem)
 // 14 and 64 are magic constants.  14 is the number of bytes to encode:
 //	MOVUPS	(SI), X0
 //	ADDQ	$16, SI
 //	MOVUPS	X0, (DI)
 //	ADDQ	$16, DI
 // and 64 is the number of such blocks.  See src/runtime/duff_amd64.s:duffcopy.

 // Large copying uses REP MOVSQ.
 (Move [size] dst src mem) && size > 16*64 && size%8 == 0 ->
 	(REPMOVSQ dst src (MOVQconst [size/8]) mem)

 (Not x) -> (XORBconst [1] x)

 (OffPtr [off] ptr) -> (ADDQconst [off] ptr)

 (Const8 [val]) -> (MOVBconst [val])
 (Const16 [val]) -> (MOVWconst [val])
 (Const32 [val]) -> (MOVLconst [val])
 (Const64 [val]) -> (MOVQconst [val])
 (Const32F [val]) -> (MOVSSconst [val])
 (Const64F [val]) -> (MOVSDconst [val])
 (ConstNil) -> (MOVQconst [0])
 (ConstBool [b]) -> (MOVBconst [b])

 (Addr {sym} base) -> (LEAQ {sym} base)

 (ITab (Load ptr mem)) -> (MOVQload ptr mem)

 // block rewrites
 (If (SETL  cmp) yes no) -> (LT  cmp yes no)
 (If (SETLE cmp) yes no) -> (LE  cmp yes no)
 (If (SETG  cmp) yes no) -> (GT  cmp yes no)
 (If (SETGE cmp) yes no) -> (GE  cmp yes no)
 (If (SETEQ cmp) yes no) -> (EQ  cmp yes no)
 (If (SETNE cmp) yes no) -> (NE  cmp yes no)
 (If (SETB  cmp) yes no) -> (ULT cmp yes no)
 (If (SETBE cmp) yes no) -> (ULE cmp yes no)
 (If (SETA  cmp) yes no) -> (UGT cmp yes no)
 (If (SETAE cmp) yes no) -> (UGE cmp yes no)

 // Special case for floating point - LF/LEF not generated
 // (floating-point less-than compares are lowered to SETGF/SETGEF with the
 // operands swapped, so only the GF/GEF/EQF/NEF setters reach this point).
 // SETGF/SETGEF branch via the unsigned UGT/UGE block kinds; SETEQF/SETNEF use
 // the dedicated EQF/NEF block kinds.  These must agree with the corresponding
 // (NE (TESTB (SETxxF cmp)) ...) rules: SETNEF maps to NEF, not EQF, otherwise
 // a floating-point != branch is compiled as an == branch.
 (If (SETGF  cmp) yes no) -> (UGT  cmp yes no)
 (If (SETGEF cmp) yes no) -> (UGE  cmp yes no)
 (If (SETEQF cmp) yes no) -> (EQF  cmp yes no)
 (If (SETNEF cmp) yes no) -> (NEF  cmp yes no)

 // Fallback for any If whose condition was not matched by a more specific rule
 // above: test the boolean byte against itself and branch if it is non-zero.
 (If cond yes no) -> (NE (TESTB cond cond) yes no)

 // If the tested boolean turns out to be the result of a SETcc, branch on the
 // original flags directly instead of materializing and re-testing the boolean.
 (NE (TESTB (SETL  cmp)) yes no) -> (LT  cmp yes no)
 (NE (TESTB (SETLE cmp)) yes no) -> (LE  cmp yes no)
 (NE (TESTB (SETG  cmp)) yes no) -> (GT  cmp yes no)
 (NE (TESTB (SETGE cmp)) yes no) -> (GE  cmp yes no)
 (NE (TESTB (SETEQ cmp)) yes no) -> (EQ  cmp yes no)
 (NE (TESTB (SETNE cmp)) yes no) -> (NE  cmp yes no)
 (NE (TESTB (SETB  cmp)) yes no) -> (ULT cmp yes no)
 (NE (TESTB (SETBE cmp)) yes no) -> (ULE cmp yes no)
 (NE (TESTB (SETA  cmp)) yes no) -> (UGT cmp yes no)
 (NE (TESTB (SETAE cmp)) yes no) -> (UGE cmp yes no)

 // Special case for floating point - LF/LEF not generated
 (NE (TESTB (SETGF  cmp)) yes no) -> (UGT  cmp yes no)
 (NE (TESTB (SETGEF cmp)) yes no) -> (UGE  cmp yes no)
 (NE (TESTB (SETEQF cmp)) yes no) -> (EQF  cmp yes no)
 (NE (TESTB (SETNEF cmp)) yes no) -> (NEF  cmp yes no)

 // Disabled because it interferes with the pattern match above and makes worse code.
 // (SETNEF x) -> (ORQ (SETNE <config.Frontend().TypeInt8()> x) (SETNAN <config.Frontend().TypeInt8()> x))
 // (SETEQF x) -> (ANDQ (SETEQ <config.Frontend().TypeInt8()> x) (SETORD <config.Frontend().TypeInt8()> x))

 (StaticCall [argwid] {target} mem) -> (CALLstatic [argwid] {target} mem)
 (ClosureCall [argwid] entry closure mem) -> (CALLclosure [argwid] entry closure mem)
 (DeferCall [argwid] mem) -> (CALLdefer [argwid] mem)
 (GoCall [argwid] mem) -> (CALLgo [argwid] mem)
 (InterCall [argwid] entry mem) -> (CALLinter [argwid] entry mem)

 // Rules below here apply some simple optimizations after lowering.
 // TODO: Should this be a separate pass?

 // fold constants into instructions
 (ADDQ x (MOVQconst [c])) && is32Bit(c) -> (ADDQconst [c] x)
 (ADDQ (MOVQconst [c]) x) && is32Bit(c) -> (ADDQconst [c] x)
 (ADDL x (MOVLconst [c])) -> (ADDLconst [c] x)
 (ADDL (MOVLconst [c]) x) -> (ADDLconst [c] x)
 (ADDW x (MOVWconst [c])) -> (ADDWconst [c] x)
 (ADDW (MOVWconst [c]) x) -> (ADDWconst [c] x)
 (ADDB x (MOVBconst [c])) -> (ADDBconst [c] x)
 (ADDB (MOVBconst [c]) x) -> (ADDBconst [c] x)

 // Subtraction is not commutative, so the two operand orders differ:
 //   x - c  becomes a subtract-constant;
 //   c - x  is rewritten as -(x - c), i.e. a subtract-constant followed by a negate.
 // The 64-bit forms additionally require the constant to pass is32Bit.
 (SUBQ x (MOVQconst [c])) && is32Bit(c) -> (SUBQconst x [c])
 (SUBQ (MOVQconst [c]) x) && is32Bit(c) -> (NEGQ (SUBQconst <v.Type> x [c]))
 (SUBL x (MOVLconst [c])) -> (SUBLconst x [c])
 (SUBL (MOVLconst [c]) x) -> (NEGL (SUBLconst <v.Type> x [c]))
 (SUBW x (MOVWconst [c])) -> (SUBWconst x [c])
 (SUBW (MOVWconst [c]) x) -> (NEGW (SUBWconst <v.Type> x [c]))
 (SUBB x (MOVBconst [c])) -> (SUBBconst x [c])
 (SUBB (MOVBconst [c]) x) -> (NEGB (SUBBconst <v.Type> x [c]))

 (MULQ x (MOVQconst [c])) && is32Bit(c) -> (MULQconst [c] x)
 (MULQ (MOVQconst [c]) x) && is32Bit(c) -> (MULQconst [c] x)
 (MULL x (MOVLconst [c])) -> (MULLconst [c] x)
 (MULL (MOVLconst [c]) x) -> (MULLconst [c] x)
 (MULW x (MOVWconst [c])) -> (MULWconst [c] x)
 (MULW (MOVWconst [c]) x) -> (MULWconst [c] x)
 (MULB x (MOVBconst [c])) -> (MULBconst [c] x)
 (MULB (MOVBconst [c]) x) -> (MULBconst [c] x)

 (ANDQ x (MOVQconst [c])) && is32Bit(c) -> (ANDQconst [c] x)
 (ANDQ (MOVQconst [c]) x) && is32Bit(c) -> (ANDQconst [c] x)
 (ANDL x (MOVLconst [c])) -> (ANDLconst [c] x)
 (ANDL (MOVLconst [c]) x) -> (ANDLconst [c] x)
 (ANDW x (MOVLconst [c])) -> (ANDWconst [c] x)
 (ANDW (MOVLconst [c]) x) -> (ANDWconst [c] x)
 (ANDW x (MOVWconst [c])) -> (ANDWconst [c] x)
 (ANDW (MOVWconst [c]) x) -> (ANDWconst [c] x)
 (ANDB x (MOVLconst [c])) -> (ANDBconst [c] x)
 (ANDB (MOVLconst [c]) x) -> (ANDBconst [c] x)
 (ANDB x (MOVBconst [c])) -> (ANDBconst [c] x)
 (ANDB (MOVBconst [c]) x) -> (ANDBconst [c] x)

 (ORQ x (MOVQconst [c])) && is32Bit(c) -> (ORQconst [c] x)
 (ORQ (MOVQconst [c]) x) && is32Bit(c) -> (ORQconst [c] x)
 (ORL x (MOVLconst [c])) -> (ORLconst [c] x)
 (ORL (MOVLconst [c]) x) -> (ORLconst [c] x)
 (ORW x (MOVWconst [c])) -> (ORWconst [c] x)
 (ORW (MOVWconst [c]) x) -> (ORWconst [c] x)
 (ORB x (MOVBconst [c])) -> (ORBconst [c] x)
 (ORB (MOVBconst [c]) x) -> (ORBconst [c] x)

 (XORQ x (MOVQconst [c])) && is32Bit(c) -> (XORQconst [c] x)
 (XORQ (MOVQconst [c]) x) && is32Bit(c) -> (XORQconst [c] x)
 (XORL x (MOVLconst [c])) -> (XORLconst [c] x)
 (XORL (MOVLconst [c]) x) -> (XORLconst [c] x)
 (XORW x (MOVWconst [c])) -> (XORWconst [c] x)
 (XORW (MOVWconst [c]) x) -> (XORWconst [c] x)
 (XORB x (MOVBconst [c])) -> (XORBconst [c] x)
 (XORB (MOVBconst [c]) x) -> (XORBconst [c] x)

 (SHLQ x (MOVQconst [c])) -> (SHLQconst [c&63] x)
 (SHLL x (MOVLconst [c])) -> (SHLLconst [c&31] x)
 (SHLW x (MOVWconst [c])) -> (SHLWconst [c&31] x)
 (SHLB x (MOVBconst [c])) -> (SHLBconst [c&31] x)

 (SHRQ x (MOVQconst [c])) -> (SHRQconst [c&63] x)
 (SHRL x (MOVLconst [c])) -> (SHRLconst [c&31] x)
 (SHRW x (MOVWconst [c])) -> (SHRWconst [c&31] x)
 (SHRB x (MOVBconst [c])) -> (SHRBconst [c&31] x)

 (SARQ x (MOVQconst [c])) -> (SARQconst [c&63] x)
 (SARL x (MOVLconst [c])) -> (SARLconst [c&31] x)
 (SARW x (MOVWconst [c])) -> (SARWconst [c&31] x)
 (SARB x (MOVBconst [c])) -> (SARBconst [c&31] x)

 // Note: the word and byte shifts keep the low 5 bits (not the low 4 or 3 bits)
 // because the x86 instructions are defined to use all 5 bits of the shift even
 // for the small shifts.  I don't think we'll ever generate a weird shift (e.g.
 // (SHLW x (MOVWconst [24])), but just in case.

 // Fold a constant operand into the compare.  When the constant is the left
 // operand the compare is emitted with the operands swapped and wrapped in
 // InvertFlags; the consumers of the flags (SETcc rules and branch rules
 // elsewhere in this file) absorb InvertFlags by switching to the mirrored
 // condition.  The 64-bit forms additionally require the constant to pass is32Bit.
 (CMPQ x (MOVQconst [c])) && is32Bit(c) -> (CMPQconst x [c])
 (CMPQ (MOVQconst [c]) x) && is32Bit(c) -> (InvertFlags (CMPQconst x [c]))
 (CMPL x (MOVLconst [c])) -> (CMPLconst x [c])
 (CMPL (MOVLconst [c]) x) -> (InvertFlags (CMPLconst x [c]))
 (CMPW x (MOVWconst [c])) -> (CMPWconst x [c])
 (CMPW (MOVWconst [c]) x) -> (InvertFlags (CMPWconst x [c]))
 (CMPB x (MOVBconst [c])) -> (CMPBconst x [c])
 (CMPB (MOVBconst [c]) x) -> (InvertFlags (CMPBconst x [c]))

 // strength reduction
 // Replace multiplication by selected constants with cheaper operations:
 // -1 is a negate, 0 is the constant 0, 1 is the operand itself, and powers
 // of two are a left shift by log2(c).
 // 3, 5 and 9 use a scaled-index LEAQ with the same value as base and index
 // (assuming LEAQn a b computes a + n*b, so x + 2x, x + 4x, x + 8x -
 // NOTE(review): confirm against the LEAQ2/4/8 op definitions).
 (MULQconst [-1] x) -> (NEGQ x)
 (MULQconst [0] _) -> (MOVQconst [0])
 (MULQconst [1] x) -> x
 (MULQconst [3] x) -> (LEAQ2 x x)
 (MULQconst [5] x) -> (LEAQ4 x x)
 (MULQconst [9] x) -> (LEAQ8 x x)
 (MULQconst [c] x) && isPowerOfTwo(c) -> (SHLQconst [log2(c)] x)

 // fold add/shift into leaq
 (ADDQ x (SHLQconst [3] y)) -> (LEAQ8 x y)
 (ADDQconst [c] (LEAQ8 [d] x y)) -> (LEAQ8 [addOff(c, d)] x y)

 // reverse ordering of compare instruction
 (SETL (InvertFlags x)) -> (SETG x)
 (SETG (InvertFlags x)) -> (SETL x)
 (SETB (InvertFlags x)) -> (SETA x)
 (SETA (InvertFlags x)) -> (SETB x)
 (SETLE (InvertFlags x)) -> (SETGE x)
 (SETGE (InvertFlags x)) -> (SETLE x)
 (SETBE (InvertFlags x)) -> (SETAE x)
 (SETAE (InvertFlags x)) -> (SETBE x)
 (SETEQ (InvertFlags x)) -> (SETEQ x)
 (SETNE (InvertFlags x)) -> (SETNE x)

 // sign extended loads
 // Note: The combined instruction must end up in the same block
 // as the original load.  If not, we end up making a value with
 // memory type live in two different blocks, which can lead to
 // multiple memory values alive simultaneously.
 (MOVBQSX (MOVBload [off] {sym} ptr mem)) -> @v.Args[0].Block (MOVBQSXload <v.Type> [off] {sym} ptr mem)
 (MOVBQZX (MOVBload [off] {sym} ptr mem)) -> @v.Args[0].Block (MOVBQZXload <v.Type> [off] {sym} ptr mem)
 // TODO: more

 // Don't extend before storing
 (MOVLstore [off] {sym} ptr (MOVLQSX x) mem) -> (MOVLstore [off] {sym} ptr x mem)
 (MOVWstore [off] {sym} ptr (MOVWQSX x) mem) -> (MOVWstore [off] {sym} ptr x mem)
 (MOVBstore [off] {sym} ptr (MOVBQSX x) mem) -> (MOVBstore [off] {sym} ptr x mem)
 (MOVLstore [off] {sym} ptr (MOVLQZX x) mem) -> (MOVLstore [off] {sym} ptr x mem)
 (MOVWstore [off] {sym} ptr (MOVWQZX x) mem) -> (MOVWstore [off] {sym} ptr x mem)
 (MOVBstore [off] {sym} ptr (MOVBQZX x) mem) -> (MOVBstore [off] {sym} ptr x mem)

 // fold constants into memory operations
 // Note that this is not always a good idea because if not all the uses of
 // the ADDQconst get eliminated, we still have to compute the ADDQconst and we now
 // have potentially two live values (ptr and (ADDQconst [off] ptr)) instead of one.
 // Nevertheless, let's do it!
 (MOVQload  [off1] {sym} (ADDQconst [off2] ptr) mem) -> (MOVQload  [addOff(off1, off2)] {sym} ptr mem)
 (MOVLload  [off1] {sym} (ADDQconst [off2] ptr) mem) -> (MOVLload  [addOff(off1, off2)] {sym} ptr mem)
 (MOVWload  [off1] {sym} (ADDQconst [off2] ptr) mem) -> (MOVWload  [addOff(off1, off2)] {sym} ptr mem)
 (MOVBload  [off1] {sym} (ADDQconst [off2] ptr) mem) -> (MOVBload  [addOff(off1, off2)] {sym} ptr mem)
 (MOVSSload [off1] {sym} (ADDQconst [off2] ptr) mem) -> (MOVSSload [addOff(off1, off2)] {sym} ptr mem)
 (MOVSDload [off1] {sym} (ADDQconst [off2] ptr) mem) -> (MOVSDload [addOff(off1, off2)] {sym} ptr mem)
 (MOVOload  [off1] {sym} (ADDQconst [off2] ptr) mem) -> (MOVOload  [addOff(off1, off2)] {sym} ptr mem)

 (MOVQstore  [off1] {sym} (ADDQconst [off2] ptr) val mem) -> (MOVQstore  [addOff(off1, off2)] {sym} ptr val mem)
 (MOVLstore  [off1] {sym} (ADDQconst [off2] ptr) val mem) -> (MOVLstore  [addOff(off1, off2)] {sym} ptr val mem)
 (MOVWstore  [off1] {sym} (ADDQconst [off2] ptr) val mem) -> (MOVWstore  [addOff(off1, off2)] {sym} ptr val mem)
 (MOVBstore  [off1] {sym} (ADDQconst [off2] ptr) val mem) -> (MOVBstore  [addOff(off1, off2)] {sym} ptr val mem)
 (MOVSSstore [off1] {sym} (ADDQconst [off2] ptr) val mem) -> (MOVSSstore [addOff(off1, off2)] {sym} ptr val mem)
 (MOVSDstore [off1] {sym} (ADDQconst [off2] ptr) val mem) -> (MOVSDstore [addOff(off1, off2)] {sym} ptr val mem)
 (MOVOstore  [off1] {sym} (ADDQconst [off2] ptr) val mem) -> (MOVOstore  [addOff(off1, off2)] {sym} ptr val mem)

 // Fold constants into stores.
 (MOVQstore [off] {sym} ptr (MOVQconst [c]) mem) && validStoreConst(c,off) ->
 	(MOVQstoreconst [makeStoreConst(c,off)] {sym} ptr mem)
 (MOVLstore [off] {sym} ptr (MOVLconst [c]) mem) && validStoreConstOff(off) ->
 	(MOVLstoreconst [makeStoreConst(int64(int32(c)),off)] {sym} ptr mem)
 (MOVWstore [off] {sym} ptr (MOVWconst [c]) mem) && validStoreConstOff(off) ->
 	(MOVWstoreconst [makeStoreConst(int64(int16(c)),off)] {sym} ptr mem)
 (MOVBstore [off] {sym} ptr (MOVBconst [c]) mem) && validStoreConstOff(off) ->
 	(MOVBstoreconst [makeStoreConst(int64(int8(c)),off)] {sym} ptr mem)

 // Fold address offsets into constant stores.
 (MOVQstoreconst [sc] {s} (ADDQconst [off] ptr) mem) && StoreConst(sc).canAdd(off) ->
 	(MOVQstoreconst [StoreConst(sc).add(off)] {s} ptr mem)
 (MOVLstoreconst [sc] {s} (ADDQconst [off] ptr) mem) && StoreConst(sc).canAdd(off) ->
 	(MOVLstoreconst [StoreConst(sc).add(off)] {s} ptr mem)
 (MOVWstoreconst [sc] {s} (ADDQconst [off] ptr) mem) && StoreConst(sc).canAdd(off) ->
 	(MOVWstoreconst [StoreConst(sc).add(off)] {s} ptr mem)
 (MOVBstoreconst [sc] {s} (ADDQconst [off] ptr) mem) && StoreConst(sc).canAdd(off) ->
 	(MOVBstoreconst [StoreConst(sc).add(off)] {s} ptr mem)

 // We need to fold LEAQ into the MOVx ops so that the live variable analysis knows
 // what variables are being read/written by the ops.
 (MOVQload  [off1] {sym1} (LEAQ [off2] {sym2} base) mem) && canMergeSym(sym1, sym2) ->
 	(MOVQload  [addOff(off1,off2)] {mergeSym(sym1,sym2)} base mem)
 (MOVLload  [off1] {sym1} (LEAQ [off2] {sym2} base) mem) && canMergeSym(sym1, sym2) ->
 	(MOVLload  [addOff(off1,off2)] {mergeSym(sym1,sym2)} base mem)
 (MOVWload  [off1] {sym1} (LEAQ [off2] {sym2} base) mem) && canMergeSym(sym1, sym2) ->
 	(MOVWload  [addOff(off1,off2)] {mergeSym(sym1,sym2)} base mem)
 (MOVBload  [off1] {sym1} (LEAQ [off2] {sym2} base) mem) && canMergeSym(sym1, sym2) ->
 	(MOVBload  [addOff(off1,off2)] {mergeSym(sym1,sym2)} base mem)
 (MOVSSload [off1] {sym1} (LEAQ [off2] {sym2} base) mem) && canMergeSym(sym1, sym2) ->
 	(MOVSSload [addOff(off1,off2)] {mergeSym(sym1,sym2)} base mem)
 (MOVSDload [off1] {sym1} (LEAQ [off2] {sym2} base) mem) && canMergeSym(sym1, sym2) ->
 	(MOVSDload [addOff(off1,off2)] {mergeSym(sym1,sym2)} base mem)
 (MOVOload [off1] {sym1} (LEAQ [off2] {sym2} base) mem) && canMergeSym(sym1, sym2) ->
 	(MOVOload [addOff(off1,off2)] {mergeSym(sym1,sym2)} base mem)

 (MOVQstore  [off1] {sym1} (LEAQ [off2] {sym2} base) val mem) && canMergeSym(sym1, sym2) ->
 	(MOVQstore  [addOff(off1,off2)] {mergeSym(sym1,sym2)} base val mem)
 (MOVLstore  [off1] {sym1} (LEAQ [off2] {sym2} base) val mem) && canMergeSym(sym1, sym2) ->
 	(MOVLstore  [addOff(off1,off2)] {mergeSym(sym1,sym2)} base val mem)
 (MOVWstore  [off1] {sym1} (LEAQ [off2] {sym2} base) val mem) && canMergeSym(sym1, sym2) ->
 	(MOVWstore  [addOff(off1,off2)] {mergeSym(sym1,sym2)} base val mem)
 (MOVBstore  [off1] {sym1} (LEAQ [off2] {sym2} base) val mem) && canMergeSym(sym1, sym2) ->
 	(MOVBstore  [addOff(off1,off2)] {mergeSym(sym1,sym2)} base val mem)
 (MOVSSstore [off1] {sym1} (LEAQ [off2] {sym2} base) val mem) && canMergeSym(sym1, sym2) ->
 	(MOVSSstore [addOff(off1,off2)] {mergeSym(sym1,sym2)} base val mem)
 (MOVSDstore [off1] {sym1} (LEAQ [off2] {sym2} base) val mem) && canMergeSym(sym1, sym2) ->
 	(MOVSDstore [addOff(off1,off2)] {mergeSym(sym1,sym2)} base val mem)
 (MOVOstore [off1] {sym1} (LEAQ [off2] {sym2} base) val mem) && canMergeSym(sym1, sym2) ->
 	(MOVOstore [addOff(off1,off2)] {mergeSym(sym1,sym2)} base val mem)

 (MOVQstoreconst [sc] {sym1} (LEAQ [off] {sym2} ptr) mem) && canMergeSym(sym1, sym2) && StoreConst(sc).canAdd(off) ->
 	(MOVQstoreconst [StoreConst(sc).add(off)] {mergeSym(sym1, sym2)} ptr mem)
 (MOVLstoreconst [sc] {sym1} (LEAQ [off] {sym2} ptr) mem) && canMergeSym(sym1, sym2) && StoreConst(sc).canAdd(off) ->
 	(MOVLstoreconst [StoreConst(sc).add(off)] {mergeSym(sym1, sym2)} ptr mem)
 (MOVWstoreconst [sc] {sym1} (LEAQ [off] {sym2} ptr) mem) && canMergeSym(sym1, sym2) && StoreConst(sc).canAdd(off) ->
 	(MOVWstoreconst [StoreConst(sc).add(off)] {mergeSym(sym1, sym2)} ptr mem)
 (MOVBstoreconst [sc] {sym1} (LEAQ [off] {sym2} ptr) mem) && canMergeSym(sym1, sym2) && StoreConst(sc).canAdd(off) ->
 	(MOVBstoreconst [StoreConst(sc).add(off)] {mergeSym(sym1, sym2)} ptr mem)

 // indexed loads and stores
 // Fold a constant pointer adjustment into the instruction's own offset.
 // ADDQconst is matched on its offset only: the symbol belongs to the
 // load/store and is carried through unchanged.  (Repeating {sym} on the
 // ADDQconst would turn into an equality check against the load/store's symbol
 // and needlessly block the fold whenever that symbol is set.)
 (MOVQloadidx8 [off1] {sym} (ADDQconst [off2] ptr) idx mem) -> (MOVQloadidx8 [addOff(off1, off2)] {sym} ptr idx mem)
 (MOVQstoreidx8 [off1] {sym} (ADDQconst [off2] ptr) idx val mem) -> (MOVQstoreidx8 [addOff(off1, off2)] {sym} ptr idx val mem)
 (MOVSSloadidx4 [off1] {sym} (ADDQconst [off2] ptr) idx mem) -> (MOVSSloadidx4 [addOff(off1, off2)] {sym} ptr idx mem)
 (MOVSSstoreidx4 [off1] {sym} (ADDQconst [off2] ptr) idx val mem) -> (MOVSSstoreidx4 [addOff(off1, off2)] {sym} ptr idx val mem)
 (MOVSDloadidx8 [off1] {sym} (ADDQconst [off2] ptr) idx mem) -> (MOVSDloadidx8 [addOff(off1, off2)] {sym} ptr idx mem)
 (MOVSDstoreidx8 [off1] {sym} (ADDQconst [off2] ptr) idx val mem) -> (MOVSDstoreidx8 [addOff(off1, off2)] {sym} ptr idx val mem)

 (MOVQload [off1] {sym1} (LEAQ8 [off2] {sym2} ptr idx) mem) && canMergeSym(sym1, sym2) ->
 	(MOVQloadidx8 [addOff(off1, off2)] {mergeSym(sym1,sym2)} ptr idx mem)
 (MOVQstore [off1] {sym1} (LEAQ8 [off2] {sym2} ptr idx) val mem) && canMergeSym(sym1, sym2) ->
 	(MOVQstoreidx8 [addOff(off1, off2)] {mergeSym(sym1,sym2)} ptr idx val mem)

 (MOVSSload [off1] {sym1} (LEAQ4 [off2] {sym2} ptr idx) mem) && canMergeSym(sym1, sym2) ->
 	(MOVSSloadidx4 [addOff(off1, off2)] {mergeSym(sym1,sym2)} ptr idx mem)
 (MOVSSstore [off1] {sym1} (LEAQ4 [off2] {sym2} ptr idx) val mem) && canMergeSym(sym1, sym2) ->
 	(MOVSSstoreidx4 [addOff(off1, off2)] {mergeSym(sym1,sym2)} ptr idx val mem)

 (MOVSDload [off1] {sym1} (LEAQ8 [off2] {sym2} ptr idx) mem) && canMergeSym(sym1, sym2) ->
 	(MOVSDloadidx8 [addOff(off1, off2)] {mergeSym(sym1,sym2)} ptr idx mem)
 (MOVSDstore [off1] {sym1} (LEAQ8 [off2] {sym2} ptr idx) val mem) && canMergeSym(sym1, sym2) ->
 	(MOVSDstoreidx8 [addOff(off1, off2)] {mergeSym(sym1,sym2)} ptr idx val mem)

 // lower Zero instructions with word sizes
 // Sizes 1, 2, 4 and 8 are a single constant store of 0 at offset 0.
 (Zero [0] _ mem) -> mem
 (Zero [1] destptr mem) -> (MOVBstoreconst [0] destptr mem)
 (Zero [2] destptr mem) -> (MOVWstoreconst [0] destptr mem)
 (Zero [4] destptr mem) -> (MOVLstoreconst [0] destptr mem)
 (Zero [8] destptr mem) -> (MOVQstoreconst [0] destptr mem)

 // Sizes 3, 5, 6 and 7 use two constant stores.  makeStoreConst(0,off) packs
 // the value 0 together with the byte offset of the second store.
 // For size 7 the two 4-byte stores (offsets 0 and 3) overlap by one byte.
 (Zero [3] destptr mem) ->
 	(MOVBstoreconst [makeStoreConst(0,2)] destptr
 		(MOVWstoreconst [0] destptr mem))
 (Zero [5] destptr mem) ->
 	(MOVBstoreconst [makeStoreConst(0,4)] destptr
 		(MOVLstoreconst [0] destptr mem))
 (Zero [6] destptr mem) ->
 	(MOVWstoreconst [makeStoreConst(0,4)] destptr
 		(MOVLstoreconst [0] destptr mem))
 (Zero [7] destptr mem) ->
 	(MOVLstoreconst [makeStoreConst(0,3)] destptr
 		(MOVLstoreconst [0] destptr mem))

 // Sizes above 8 that are not a multiple of 8: zero the first 8 bytes with one
 // MOVQstoreconst, then zero the remaining whole words starting n%8 bytes into the
 // region. The two ranges overlap, and the inner Zero size is a multiple of 8.
 (Zero [n] dst mem) && n > 8 && n%8 != 0 ->
 	(Zero [n-n%8] (ADDQconst [n%8] dst) (MOVQstoreconst [0] dst mem))

 // Two, three or four 8-byte words are zeroed with a chain of MOVQstoreconst ops,
 // one per word. The innermost store uses auxint 0; each further store uses
 // makeStoreConst(0, 8), (0, 16), (0, 24) in turn.
 (Zero [16] dst mem) ->
 	(MOVQstoreconst [makeStoreConst(0,8)] dst
 	(MOVQstoreconst [0] dst mem))
 (Zero [24] dst mem) ->
 	(MOVQstoreconst [makeStoreConst(0,16)] dst
 	(MOVQstoreconst [makeStoreConst(0,8)] dst
 	(MOVQstoreconst [0] dst mem)))
 (Zero [32] dst mem) ->
 	(MOVQstoreconst [makeStoreConst(0,24)] dst
 	(MOVQstoreconst [makeStoreConst(0,16)] dst
 	(MOVQstoreconst [makeStoreConst(0,8)] dst
 	(MOVQstoreconst [0] dst mem))))

 // Medium sizes (up to 1024 bytes, multiple of 8) go through DUFFZERO.
 // DUFFZERO is only emitted for multiples of 16. A size that is a multiple of 8 but
 // not of 16 first zeroes one 8-byte word with a MOVQstore and then re-enters Zero
 // for the remaining size-8 bytes at dst+8.
 (Zero [n] dst mem) && n%8 == 0 && n%16 != 0 && n <= 1024 ->
 	(Zero [n-8] (ADDQconst [8] dst) (MOVQstore dst (MOVQconst [0]) mem))
 (Zero [n] dst mem) && n%16 == 0 && n <= 1024 ->
 	(DUFFZERO [duffStart(n)] (ADDQconst [duffAdj(n)] dst) (MOVOconst [0]) mem)

 // Large sizes (above 1024 bytes, multiple of 8) use REPSTOSQ, which is given the
 // destination, a count of n/8 and a zero constant.
 (Zero [n] dst mem) && n%8 == 0 && n > 1024 ->
 	(REPSTOSQ dst (MOVQconst [n/8]) (MOVQconst [0]) mem)

 // Absorb InvertFlags into the branch that consumes it: ordered conditions are
 // replaced by their mirror image on the un-inverted flags, while EQ and NE are
 // kept as they are.

 // signed orderings
 (LT (InvertFlags flags) yes no) -> (GT flags yes no)
 (GT (InvertFlags flags) yes no) -> (LT flags yes no)
 (LE (InvertFlags flags) yes no) -> (GE flags yes no)
 (GE (InvertFlags flags) yes no) -> (LE flags yes no)

 // unsigned orderings
 (ULT (InvertFlags flags) yes no) -> (UGT flags yes no)
 (UGT (InvertFlags flags) yes no) -> (ULT flags yes no)
 (ULE (InvertFlags flags) yes no) -> (UGE flags yes no)
 (UGE (InvertFlags flags) yes no) -> (ULE flags yes no)

 // equality
 (EQ (InvertFlags flags) yes no) -> (EQ flags yes no)
 (NE (InvertFlags flags) yes no) -> (NE flags yes no)

 // Get rid of overflow code for constant shifts.
 // When the compare feeding SBBQcarrymask/SBBLcarrymask has a constant on both
 // sides, the mask is known at compile time: -1 if inBoundsN(val, bound) holds for
 // the compare width N, 0 if it does not. Rules are grouped by compare width.

 // 64-bit compare
 (SBBQcarrymask (CMPQconst [bound] (MOVQconst [val]))) &&  inBounds64(val, bound) -> (MOVQconst [-1])
 (SBBQcarrymask (CMPQconst [bound] (MOVQconst [val]))) && !inBounds64(val, bound) -> (MOVQconst [0])
 (SBBLcarrymask (CMPQconst [bound] (MOVQconst [val]))) &&  inBounds64(val, bound) -> (MOVLconst [-1])
 (SBBLcarrymask (CMPQconst [bound] (MOVQconst [val]))) && !inBounds64(val, bound) -> (MOVLconst [0])

 // 32-bit compare
 (SBBQcarrymask (CMPLconst [bound] (MOVLconst [val]))) &&  inBounds32(val, bound) -> (MOVQconst [-1])
 (SBBQcarrymask (CMPLconst [bound] (MOVLconst [val]))) && !inBounds32(val, bound) -> (MOVQconst [0])
 (SBBLcarrymask (CMPLconst [bound] (MOVLconst [val]))) &&  inBounds32(val, bound) -> (MOVLconst [-1])
 (SBBLcarrymask (CMPLconst [bound] (MOVLconst [val]))) && !inBounds32(val, bound) -> (MOVLconst [0])

 // 16-bit compare
 (SBBQcarrymask (CMPWconst [bound] (MOVWconst [val]))) &&  inBounds16(val, bound) -> (MOVQconst [-1])
 (SBBQcarrymask (CMPWconst [bound] (MOVWconst [val]))) && !inBounds16(val, bound) -> (MOVQconst [0])
 (SBBLcarrymask (CMPWconst [bound] (MOVWconst [val]))) &&  inBounds16(val, bound) -> (MOVLconst [-1])
 (SBBLcarrymask (CMPWconst [bound] (MOVWconst [val]))) && !inBounds16(val, bound) -> (MOVLconst [0])

 // 8-bit compare
 (SBBQcarrymask (CMPBconst [bound] (MOVBconst [val]))) &&  inBounds8(val, bound) -> (MOVQconst [-1])
 (SBBQcarrymask (CMPBconst [bound] (MOVBconst [val]))) && !inBounds8(val, bound) -> (MOVQconst [0])
 (SBBLcarrymask (CMPBconst [bound] (MOVBconst [val]))) &&  inBounds8(val, bound) -> (MOVLconst [-1])
 (SBBLcarrymask (CMPBconst [bound] (MOVBconst [val]))) && !inBounds8(val, bound) -> (MOVLconst [0])

 // Remove redundant *const ops, i.e. those whose immediate makes them trivial:
 //   add 0, sub 0, or 0, xor 0, and -1   -> the operand itself
 //   and 0                               -> constant 0
 //   or -1                               -> constant -1
 // The Q forms match the immediate literally; the L/W/B forms compare only the
 // immediate truncated to int32/int16/int8.

 // 64-bit
 (ADDQconst [0] arg)  -> arg
 (SUBQconst [0] arg)  -> arg
 (ANDQconst [0] _)    -> (MOVQconst [0])
 (ANDQconst [-1] arg) -> arg
 (ORQconst [0] arg)   -> arg
 (ORQconst [-1] _)    -> (MOVQconst [-1])
 (XORQconst [0] arg)  -> arg

 // 32-bit
 (ADDLconst [imm] arg) && int32(imm) == 0  -> arg
 (SUBLconst [imm] arg) && int32(imm) == 0  -> arg
 (ANDLconst [imm] _)   && int32(imm) == 0  -> (MOVLconst [0])
 (ANDLconst [imm] arg) && int32(imm) == -1 -> arg
 (ORLconst [imm] arg)  && int32(imm) == 0  -> arg
 (ORLconst [imm] _)    && int32(imm) == -1 -> (MOVLconst [-1])
 (XORLconst [imm] arg) && int32(imm) == 0  -> arg

 // 16-bit
 (ADDWconst [imm] arg) && int16(imm) == 0  -> arg
 (SUBWconst [imm] arg) && int16(imm) == 0  -> arg
 (ANDWconst [imm] _)   && int16(imm) == 0  -> (MOVWconst [0])
 (ANDWconst [imm] arg) && int16(imm) == -1 -> arg
 (ORWconst [imm] arg)  && int16(imm) == 0  -> arg
 (ORWconst [imm] _)    && int16(imm) == -1 -> (MOVWconst [-1])
 (XORWconst [imm] arg) && int16(imm) == 0  -> arg

 // 8-bit
 (ADDBconst [imm] arg) && int8(imm) == 0  -> arg
 (SUBBconst [imm] arg) && int8(imm) == 0  -> arg
 (ANDBconst [imm] _)   && int8(imm) == 0  -> (MOVBconst [0])
 (ANDBconst [imm] arg) && int8(imm) == -1 -> arg
 (ORBconst [imm] arg)  && int8(imm) == 0  -> arg
 (ORBconst [imm] _)    && int8(imm) == -1 -> (MOVBconst [-1])
 (XORBconst [imm] arg) && int8(imm) == 0  -> arg

 // generic constant folding
 // TODO: more of this

 // Add/subtract an immediate to/from a constant: evaluate at compile time.
 (ADDQconst [imm] (MOVQconst [val])) -> (MOVQconst [imm+val])
 (SUBQconst [imm] (MOVQconst [val])) -> (MOVQconst [val-imm])
 (ADDLconst [imm] (MOVLconst [val])) -> (MOVLconst [imm+val])
 (SUBLconst [imm] (MOVLconst [val])) -> (MOVLconst [val-imm])
 (ADDWconst [imm] (MOVWconst [val])) -> (MOVWconst [imm+val])
 (SUBWconst [imm] (MOVWconst [val])) -> (MOVWconst [val-imm])
 (ADDBconst [imm] (MOVBconst [val])) -> (MOVBconst [imm+val])
 (SUBBconst [imm] (MOVBconst [val])) -> (MOVBconst [val-imm])

 // Two stacked immediate adds (or subtracts) collapse into a single add of the
 // combined immediate; for subtracts the combined immediate is negated.
 (ADDQconst [outer] (ADDQconst [inner] x)) -> (ADDQconst [outer+inner] x)
 (SUBQconst [outer] (SUBQconst [inner] x)) -> (ADDQconst [-outer-inner] x)
 (ADDLconst [outer] (ADDLconst [inner] x)) -> (ADDLconst [outer+inner] x)
 (SUBLconst [outer] (SUBLconst [inner] x)) -> (ADDLconst [-outer-inner] x)
 (ADDWconst [outer] (ADDWconst [inner] x)) -> (ADDWconst [outer+inner] x)
 (SUBWconst [outer] (SUBWconst [inner] x)) -> (ADDWconst [-outer-inner] x)
 (ADDBconst [outer] (ADDBconst [inner] x)) -> (ADDBconst [outer+inner] x)
 (SUBBconst [outer] (SUBBconst [inner] x)) -> (ADDBconst [-outer-inner] x)
 // Constant-fold arithmetic right shifts of a constant.
 // SARL/SARW/SARB operate on only the low 32/16/8 bits of their operand; per the
 // register conventions at the top of this file the upper portion of an integer
 // value is junk. The constant is therefore sign-extended from the operand width
 // before shifting, so that bits of d above that width cannot be shifted down
 // into the meaningful low bits of the result.
 (SARQconst [c] (MOVQconst [d])) -> (MOVQconst [d>>uint64(c)])
 (SARLconst [c] (MOVQconst [d])) -> (MOVQconst [int64(int32(d))>>uint64(c)])
 (SARWconst [c] (MOVQconst [d])) -> (MOVQconst [int64(int16(d))>>uint64(c)])
 (SARBconst [c] (MOVQconst [d])) -> (MOVQconst [int64(int8(d))>>uint64(c)])
 // Constant-fold negate, multiply, and/or/xor with an immediate, and bitwise not,
 // when the operand is itself a constant. Rules are grouped by operand width.

 // 64-bit
 (NEGQ (MOVQconst [val])) -> (MOVQconst [-val])
 (NOTQ (MOVQconst [val])) -> (MOVQconst [^val])
 (MULQconst [imm] (MOVQconst [val])) -> (MOVQconst [imm*val])
 (ANDQconst [imm] (MOVQconst [val])) -> (MOVQconst [imm&val])
 (ORQconst [imm] (MOVQconst [val]))  -> (MOVQconst [imm|val])
 (XORQconst [imm] (MOVQconst [val])) -> (MOVQconst [imm^val])

 // 32-bit
 (NEGL (MOVLconst [val])) -> (MOVLconst [-val])
 (NOTL (MOVLconst [val])) -> (MOVLconst [^val])
 (MULLconst [imm] (MOVLconst [val])) -> (MOVLconst [imm*val])
 (ANDLconst [imm] (MOVLconst [val])) -> (MOVLconst [imm&val])
 (ORLconst [imm] (MOVLconst [val]))  -> (MOVLconst [imm|val])
 (XORLconst [imm] (MOVLconst [val])) -> (MOVLconst [imm^val])

 // 16-bit
 (NEGW (MOVWconst [val])) -> (MOVWconst [-val])
 (NOTW (MOVWconst [val])) -> (MOVWconst [^val])
 (MULWconst [imm] (MOVWconst [val])) -> (MOVWconst [imm*val])
 (ANDWconst [imm] (MOVWconst [val])) -> (MOVWconst [imm&val])
 (ORWconst [imm] (MOVWconst [val]))  -> (MOVWconst [imm|val])
 (XORWconst [imm] (MOVWconst [val])) -> (MOVWconst [imm^val])

 // 8-bit
 (NEGB (MOVBconst [val])) -> (MOVBconst [-val])
 (NOTB (MOVBconst [val])) -> (MOVBconst [^val])
 (MULBconst [imm] (MOVBconst [val])) -> (MOVBconst [imm*val])
 (ANDBconst [imm] (MOVBconst [val])) -> (MOVBconst [imm&val])
 (ORBconst [imm] (MOVBconst [val]))  -> (MOVBconst [imm|val])
 (XORBconst [imm] (MOVBconst [val])) -> (MOVBconst [imm^val])

 // generic simplifications
 // TODO: more of this
 //
 // Per width:
 //   lhs + (-rhs)   -> lhs - rhs
 //   a - a, a ^ a   -> constant 0
 //   a & a, a | a   -> a

 // 64-bit
 (ADDQ lhs (NEGQ rhs)) -> (SUBQ lhs rhs)
 (SUBQ a a) -> (MOVQconst [0])
 (XORQ a a) -> (MOVQconst [0])
 (ANDQ a a) -> a
 (ORQ a a)  -> a

 // 32-bit
 (ADDL lhs (NEGL rhs)) -> (SUBL lhs rhs)
 (SUBL a a) -> (MOVLconst [0])
 (XORL a a) -> (MOVLconst [0])
 (ANDL a a) -> a
 (ORL a a)  -> a

 // 16-bit
 (ADDW lhs (NEGW rhs)) -> (SUBW lhs rhs)
 (SUBW a a) -> (MOVWconst [0])
 (XORW a a) -> (MOVWconst [0])
 (ANDW a a) -> a
 (ORW a a)  -> a

 // 8-bit
 (ADDB lhs (NEGB rhs)) -> (SUBB lhs rhs)
 (SUBB a a) -> (MOVBconst [0])
 (XORB a a) -> (MOVBconst [0])
 (ANDB a a) -> a
 (ORB a a)  -> a