// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// x86 register conventions:
//  - Integer types live in the low portion of registers.  Upper portions are junk.
//  - Boolean types use the low-order byte of a register.  Upper bytes are junk.
//  - We do not use AH,BH,CH,DH registers.
//  - Floating-point types will live in the low natural slot of an sse2 register.
//    Unused portions are junk.

// Lowering arithmetic
(Add64 x y) -> (ADDQ x y)
(AddPtr x y) -> (ADDQ x y)
(Add32 x y) -> (ADDL x y)
(Add16 x y) -> (ADDW x y)
(Add8 x y) -> (ADDB x y)
(Add32F x y) -> (ADDSS x y)
(Add64F x y) -> (ADDSD x y)

(Sub64 x y) -> (SUBQ x y)
(SubPtr x y) -> (SUBQ x y)
(Sub32 x y) -> (SUBL x y)
(Sub16 x y) -> (SUBW x y)
(Sub8 x y) -> (SUBB x y)
(Sub32F x y) -> (SUBSS x y)
(Sub64F x y) -> (SUBSD x y)

(Mul64 x y) -> (MULQ x y)
(Mul32 x y) -> (MULL x y)
(Mul16 x y) -> (MULW x y)
(Mul8 x y) -> (MULB x y)
(Mul32F x y) -> (MULSS x y)
(Mul64F x y) -> (MULSD x y)

(Div32F x y) -> (DIVSS x y)
(Div64F x y) -> (DIVSD x y)

(Div64 x y) -> (DIVQ x y)
(Div64u x y) -> (DIVQU x y)
(Div32 x y) -> (DIVL x y)
(Div32u x y) -> (DIVLU x y)
(Div16 x y) -> (DIVW x y)
(Div16u x y) -> (DIVWU x y)
(Div8 x y) ->  (DIVW (SignExt8to16 x) (SignExt8to16 y))
(Div8u x y) ->  (DIVWU (ZeroExt8to16 x) (ZeroExt8to16 y))

(Hmul32 x y) -> (HMULL x y)
(Hmul32u x y) -> (HMULLU x y)
(Hmul16 x y) -> (HMULW x y)
(Hmul16u x y) -> (HMULWU x y)
(Hmul8 x y) ->  (HMULB x y)
(Hmul8u x y) ->  (HMULBU x y)

(Mod64 x y) -> (MODQ x y)
(Mod64u x y) -> (MODQU x y)
(Mod32 x y) -> (MODL x y)
(Mod32u x y) -> (MODLU x y)
(Mod16 x y) -> (MODW x y)
(Mod16u x y) -> (MODWU x y)
(Mod8 x y) ->  (MODW (SignExt8to16 x) (SignExt8to16 y))
(Mod8u x y) ->  (MODWU (ZeroExt8to16 x) (ZeroExt8to16 y))

(And64 x y) -> (ANDQ x y)
(And32 x y) -> (ANDL x y)
(And16 x y) -> (ANDW x y)
(And8 x y) -> (ANDB x y)

(Or64 x y) -> (ORQ x y)
(Or32 x y) -> (ORL x y)
(Or16 x y) -> (ORW x y)
(Or8 x y) -> (ORB x y)

(Xor64 x y) -> (XORQ x y)
(Xor32 x y) -> (XORL x y)
(Xor16 x y) -> (XORW x y)
(Xor8 x y) -> (XORB x y)

(Neg64 x) -> (NEGQ x)
(Neg32 x) -> (NEGL x)
(Neg16 x) -> (NEGW x)
(Neg8 x) -> (NEGB x)
(Neg32F x) -> (PXOR x (MOVSSconst <config.Frontend().TypeFloat32()> [f2i(math.Copysign(0, -1))]))
(Neg64F x) -> (PXOR x (MOVSDconst <config.Frontend().TypeFloat64()> [f2i(math.Copysign(0, -1))]))

(Com64 x) -> (NOTQ x)
(Com32 x) -> (NOTL x)
(Com16 x) -> (NOTW x)
(Com8 x) -> (NOTB x)

(Sqrt x) -> (SQRTSD x)

// Note: we always extend to 64 bits even though some ops don't need that many result bits.
(SignExt8to16 x) -> (MOVBQSX x)
(SignExt8to32 x) -> (MOVBQSX x)
(SignExt8to64 x) -> (MOVBQSX x)
(SignExt16to32 x) -> (MOVWQSX x)
(SignExt16to64 x) -> (MOVWQSX x)
(SignExt32to64 x) -> (MOVLQSX x)

(ZeroExt8to16 x) -> (MOVBQZX x)
(ZeroExt8to32 x) -> (MOVBQZX x)
(ZeroExt8to64 x) -> (MOVBQZX x)
(ZeroExt16to32 x) -> (MOVWQZX x)
(ZeroExt16to64 x) -> (MOVWQZX x)
(ZeroExt32to64 x) -> (MOVLQZX x)

(Cvt32to32F x) -> (CVTSL2SS x)
(Cvt32to64F x) -> (CVTSL2SD x)
(Cvt64to32F x) -> (CVTSQ2SS x)
(Cvt64to64F x) -> (CVTSQ2SD x)

(Cvt32Fto32 x) -> (CVTTSS2SL x)
(Cvt32Fto64 x) -> (CVTTSS2SQ x)
(Cvt64Fto32 x) -> (CVTTSD2SL x)
(Cvt64Fto64 x) -> (CVTTSD2SQ x)

(Cvt32Fto64F x) -> (CVTSS2SD x)
(Cvt64Fto32F x) -> (CVTSD2SS x)

// Because we ignore high parts of registers, truncates are just copies.
(Trunc16to8 x) -> x
(Trunc32to8 x) -> x
(Trunc32to16 x) -> x
(Trunc64to8 x) -> x
(Trunc64to16 x) -> x
(Trunc64to32 x) -> x

// Lowering shifts
// Unsigned shifts need to return 0 if shift amount is >= width of shifted value.
//   result = (arg << shift) & (shift >= argbits ? 0 : 0xffffffffffffffff)
// Note: for small shifts we generate 32 bits of mask even when we don't need it all.
(Lsh64x64 <t> x y) -> (ANDQ (SHLQ <t> x y) (SBBQcarrymask <t> (CMPQconst [64] y)))
(Lsh64x32 <t> x y) -> (ANDQ (SHLQ <t> x y) (SBBQcarrymask <t> (CMPLconst [64] y)))
(Lsh64x16 <t> x y) -> (ANDQ (SHLQ <t> x y) (SBBQcarrymask <t> (CMPWconst [64] y)))
(Lsh64x8 <t> x y)  -> (ANDQ (SHLQ <t> x y) (SBBQcarrymask <t> (CMPBconst [64] y)))

(Lsh32x64 <t> x y) -> (ANDL (SHLL <t> x y) (SBBLcarrymask <t> (CMPQconst [32] y)))
(Lsh32x32 <t> x y) -> (ANDL (SHLL <t> x y) (SBBLcarrymask <t> (CMPLconst [32] y)))
(Lsh32x16 <t> x y) -> (ANDL (SHLL <t> x y) (SBBLcarrymask <t> (CMPWconst [32] y)))
(Lsh32x8 <t> x y)  -> (ANDL (SHLL <t> x y) (SBBLcarrymask <t> (CMPBconst [32] y)))

(Lsh16x64 <t> x y) -> (ANDW (SHLW <t> x y) (SBBLcarrymask <t> (CMPQconst [16] y)))
(Lsh16x32 <t> x y) -> (ANDW (SHLW <t> x y) (SBBLcarrymask <t> (CMPLconst [16] y)))
(Lsh16x16 <t> x y) -> (ANDW (SHLW <t> x y) (SBBLcarrymask <t> (CMPWconst [16] y)))
(Lsh16x8 <t> x y)  -> (ANDW (SHLW <t> x y) (SBBLcarrymask <t> (CMPBconst [16] y)))

(Lsh8x64 <t> x y)  -> (ANDB (SHLB <t> x y) (SBBLcarrymask <t> (CMPQconst [8] y)))
(Lsh8x32 <t> x y)  -> (ANDB (SHLB <t> x y) (SBBLcarrymask <t> (CMPLconst [8] y)))
(Lsh8x16 <t> x y)  -> (ANDB (SHLB <t> x y) (SBBLcarrymask <t> (CMPWconst [8] y)))
(Lsh8x8 <t> x y)   -> (ANDB (SHLB <t> x y) (SBBLcarrymask <t> (CMPBconst [8] y)))

(Lrot64 <t> x [c]) -> (ROLQconst <t> [c&63] x)
(Lrot32 <t> x [c]) -> (ROLLconst <t> [c&31] x)
(Lrot16 <t> x [c]) -> (ROLWconst <t> [c&15] x)
(Lrot8 <t> x [c])  -> (ROLBconst <t> [c&7] x)

(Rsh64Ux64 <t> x y) -> (ANDQ (SHRQ <t> x y) (SBBQcarrymask <t> (CMPQconst [64] y)))
(Rsh64Ux32 <t> x y) -> (ANDQ (SHRQ <t> x y) (SBBQcarrymask <t> (CMPLconst [64] y)))
(Rsh64Ux16 <t> x y) -> (ANDQ (SHRQ <t> x y) (SBBQcarrymask <t> (CMPWconst [64] y)))
(Rsh64Ux8 <t> x y)  -> (ANDQ (SHRQ <t> x y) (SBBQcarrymask <t> (CMPBconst [64] y)))

(Rsh32Ux64 <t> x y) -> (ANDL (SHRL <t> x y) (SBBLcarrymask <t> (CMPQconst [32] y)))
(Rsh32Ux32 <t> x y) -> (ANDL (SHRL <t> x y) (SBBLcarrymask <t> (CMPLconst [32] y)))
(Rsh32Ux16 <t> x y) -> (ANDL (SHRL <t> x y) (SBBLcarrymask <t> (CMPWconst [32] y)))
(Rsh32Ux8 <t> x y)  -> (ANDL (SHRL <t> x y) (SBBLcarrymask <t> (CMPBconst [32] y)))

(Rsh16Ux64 <t> x y) -> (ANDW (SHRW <t> x y) (SBBLcarrymask <t> (CMPQconst [16] y)))
(Rsh16Ux32 <t> x y) -> (ANDW (SHRW <t> x y) (SBBLcarrymask <t> (CMPLconst [16] y)))
(Rsh16Ux16 <t> x y) -> (ANDW (SHRW <t> x y) (SBBLcarrymask <t> (CMPWconst [16] y)))
(Rsh16Ux8 <t> x y)  -> (ANDW (SHRW <t> x y) (SBBLcarrymask <t> (CMPBconst [16] y)))

(Rsh8Ux64 <t> x y)  -> (ANDB (SHRB <t> x y) (SBBLcarrymask <t> (CMPQconst [8] y)))
(Rsh8Ux32 <t> x y)  -> (ANDB (SHRB <t> x y) (SBBLcarrymask <t> (CMPLconst [8] y)))
(Rsh8Ux16 <t> x y)  -> (ANDB (SHRB <t> x y) (SBBLcarrymask <t> (CMPWconst [8] y)))
(Rsh8Ux8 <t> x y)   -> (ANDB (SHRB <t> x y) (SBBLcarrymask <t> (CMPBconst [8] y)))

// Signed right shift needs to return 0/-1 if shift amount is >= width of shifted value.
// We implement this by setting the shift value to -1 (all ones) if the shift value is >= width.
// Note: for small shift widths we generate 32 bits of mask even when we don't need it all.
(Rsh64x64 <t> x y) -> (SARQ <t> x (ORQ <y.Type> y (NOTQ <y.Type> (SBBQcarrymask <y.Type> (CMPQconst [64] y)))))
(Rsh64x32 <t> x y) -> (SARQ <t> x (ORL <y.Type> y (NOTL <y.Type> (SBBLcarrymask <y.Type> (CMPLconst [64] y)))))
(Rsh64x16 <t> x y) -> (SARQ <t> x (ORW <y.Type> y (NOTL <y.Type> (SBBLcarrymask <y.Type> (CMPWconst [64] y)))))
(Rsh64x8 <t> x y)  -> (SARQ <t> x (ORB <y.Type> y (NOTL <y.Type> (SBBLcarrymask <y.Type> (CMPBconst [64] y)))))

(Rsh32x64 <t> x y) -> (SARL <t> x (ORQ <y.Type> y (NOTQ <y.Type> (SBBQcarrymask <y.Type> (CMPQconst [32] y)))))
(Rsh32x32 <t> x y) -> (SARL <t> x (ORL <y.Type> y (NOTL <y.Type> (SBBLcarrymask <y.Type> (CMPLconst [32] y)))))
(Rsh32x16 <t> x y) -> (SARL <t> x (ORW <y.Type> y (NOTL <y.Type> (SBBLcarrymask <y.Type> (CMPWconst [32] y)))))
(Rsh32x8 <t> x y)  -> (SARL <t> x (ORB <y.Type> y (NOTL <y.Type> (SBBLcarrymask <y.Type> (CMPBconst [32] y)))))

(Rsh16x64 <t> x y) -> (SARW <t> x (ORQ <y.Type> y (NOTQ <y.Type> (SBBQcarrymask <y.Type> (CMPQconst [16] y)))))
(Rsh16x32 <t> x y) -> (SARW <t> x (ORL <y.Type> y (NOTL <y.Type> (SBBLcarrymask <y.Type> (CMPLconst [16] y)))))
(Rsh16x16 <t> x y) -> (SARW <t> x (ORW <y.Type> y (NOTL <y.Type> (SBBLcarrymask <y.Type> (CMPWconst [16] y)))))
(Rsh16x8 <t> x y)  -> (SARW <t> x (ORB <y.Type> y (NOTL <y.Type> (SBBLcarrymask <y.Type> (CMPBconst [16] y)))))

(Rsh8x64 <t> x y)  -> (SARB <t> x (ORQ <y.Type> y (NOTQ <y.Type> (SBBQcarrymask <y.Type> (CMPQconst [8] y)))))
(Rsh8x32 <t> x y)  -> (SARB <t> x (ORL <y.Type> y (NOTL <y.Type> (SBBLcarrymask <y.Type> (CMPLconst [8] y)))))
(Rsh8x16 <t> x y)  -> (SARB <t> x (ORW <y.Type> y (NOTL <y.Type> (SBBLcarrymask <y.Type> (CMPWconst [8] y)))))
(Rsh8x8 <t> x y)   -> (SARB <t> x (ORB <y.Type> y (NOTL <y.Type> (SBBLcarrymask <y.Type> (CMPBconst [8] y)))))

(Less64 x y) -> (SETL (CMPQ x y))
(Less32 x y) -> (SETL (CMPL x y))
(Less16 x y) -> (SETL (CMPW x y))
(Less8  x y) -> (SETL (CMPB x y))
(Less64U x y) -> (SETB (CMPQ x y))
(Less32U x y) -> (SETB (CMPL x y))
(Less16U x y) -> (SETB (CMPW x y))
(Less8U  x y) -> (SETB (CMPB x y))
// Use SETGF with reversed operands to dodge NaN case
(Less64F x y) -> (SETGF (UCOMISD y x))
(Less32F x y) -> (SETGF (UCOMISS y x))

(Leq64 x y) -> (SETLE (CMPQ x y))
(Leq32 x y) -> (SETLE (CMPL x y))
(Leq16 x y) -> (SETLE (CMPW x y))
(Leq8  x y) -> (SETLE (CMPB x y))
(Leq64U x y) -> (SETBE (CMPQ x y))
(Leq32U x y) -> (SETBE (CMPL x y))
(Leq16U x y) -> (SETBE (CMPW x y))
(Leq8U  x y) -> (SETBE (CMPB x y))
// Use SETGEF with reversed operands to dodge NaN case
(Leq64F x y) -> (SETGEF (UCOMISD y x))
(Leq32F x y) -> (SETGEF (UCOMISS y x))

(Greater64 x y) -> (SETG (CMPQ x y))
(Greater32 x y) -> (SETG (CMPL x y))
(Greater16 x y) -> (SETG (CMPW x y))
(Greater8  x y) -> (SETG (CMPB x y))
(Greater64U x y) -> (SETA (CMPQ x y))
(Greater32U x y) -> (SETA (CMPL x y))
(Greater16U x y) -> (SETA (CMPW x y))
(Greater8U  x y) -> (SETA (CMPB x y))
// Note Go assembler gets UCOMISx operand order wrong, but it is right here
// Bug is accommodated at generation of assembly language.
(Greater64F x y) -> (SETGF (UCOMISD x y))
(Greater32F x y) -> (SETGF (UCOMISS x y))

(Geq64 x y) -> (SETGE (CMPQ x y))
(Geq32 x y) -> (SETGE (CMPL x y))
(Geq16 x y) -> (SETGE (CMPW x y))
(Geq8  x y) -> (SETGE (CMPB x y))
(Geq64U x y) -> (SETAE (CMPQ x y))
(Geq32U x y) -> (SETAE (CMPL x y))
(Geq16U x y) -> (SETAE (CMPW x y))
(Geq8U  x y) -> (SETAE (CMPB x y))
// Note Go assembler gets UCOMISx operand order wrong, but it is right here
// Bug is accommodated at generation of assembly language.
(Geq64F x y) -> (SETGEF (UCOMISD x y))
(Geq32F x y) -> (SETGEF (UCOMISS x y))

(Eq64 x y) -> (SETEQ (CMPQ x y))
(Eq32 x y) -> (SETEQ (CMPL x y))
(Eq16 x y) -> (SETEQ (CMPW x y))
(Eq8 x y) -> (SETEQ (CMPB x y))
(EqPtr x y) -> (SETEQ (CMPQ x y))
(Eq64F x y) -> (SETEQF (UCOMISD x y))
(Eq32F x y) -> (SETEQF (UCOMISS x y))

(Neq64 x y) -> (SETNE (CMPQ x y))
(Neq32 x y) -> (SETNE (CMPL x y))
(Neq16 x y) -> (SETNE (CMPW x y))
(Neq8 x y) -> (SETNE (CMPB x y))
(NeqPtr x y) -> (SETNE (CMPQ x y))
(Neq64F x y) -> (SETNEF (UCOMISD x y))
(Neq32F x y) -> (SETNEF (UCOMISS x y))

(Load <t> ptr mem) && (is64BitInt(t) || isPtr(t)) -> (MOVQload ptr mem)
(Load <t> ptr mem) && is32BitInt(t) -> (MOVLload ptr mem)
(Load <t> ptr mem) && is16BitInt(t) -> (MOVWload ptr mem)
(Load <t> ptr mem) && (t.IsBoolean() || is8BitInt(t)) -> (MOVBload ptr mem)
(Load <t> ptr mem) && is32BitFloat(t) -> (MOVSSload ptr mem)
(Load <t> ptr mem) && is64BitFloat(t) -> (MOVSDload ptr mem)

// These more-specific FP versions of Store pattern should come first.
(Store [8] ptr val mem) && is64BitFloat(val.Type) -> (MOVSDstore ptr val mem)
(Store [4] ptr val mem) && is32BitFloat(val.Type) -> (MOVSSstore ptr val mem)

(Store [8] ptr val mem) -> (MOVQstore ptr val mem)
(Store [4] ptr val mem) -> (MOVLstore ptr val mem)
(Store [2] ptr val mem) -> (MOVWstore ptr val mem)
(Store [1] ptr val mem) -> (MOVBstore ptr val mem)

// We want this to stick out so the to/from ptr conversion is obvious
(Convert <t> x mem) -> (MOVQconvert <t> x mem)

// checks
(IsNonNil p) -> (SETNE (TESTQ p p))
(IsInBounds idx len) -> (SETB (CMPQ idx len))
(IsSliceInBounds idx len) -> (SETBE (CMPQ idx len))
(NilCheck ptr mem) -> (LoweredNilCheck ptr mem)

(GetG mem) -> (LoweredGetG mem)
(GetClosurePtr) -> (LoweredGetClosurePtr)

// Small moves
(Move [0] _ _ mem) -> mem
(Move [1] dst src mem) -> (MOVBstore dst (MOVBload src mem) mem)
(Move [2] dst src mem) -> (MOVWstore dst (MOVWload src mem) mem)
(Move [4] dst src mem) -> (MOVLstore dst (MOVLload src mem) mem)
(Move [8] dst src mem) -> (MOVQstore dst (MOVQload src mem) mem)
(Move [16] dst src mem) -> (MOVOstore dst (MOVOload src mem) mem)
(Move [3] dst src mem) ->
	(MOVBstore [2] dst (MOVBload [2] src mem)
		(MOVWstore dst (MOVWload src mem) mem))
(Move [5] dst src mem) ->
	(MOVBstore [4] dst (MOVBload [4] src mem)
		(MOVLstore dst (MOVLload src mem) mem))
(Move [6] dst src mem) ->
	(MOVWstore [4] dst (MOVWload [4] src mem)
		(MOVLstore dst (MOVLload src mem) mem))
(Move [7] dst src mem) ->
	(MOVLstore [3] dst (MOVLload [3] src mem)
		(MOVLstore dst (MOVLload src mem) mem))
(Move [size] dst src mem) && size > 8 && size < 16 ->
	(MOVQstore [size-8] dst (MOVQload [size-8] src mem)
		(MOVQstore dst (MOVQload src mem) mem))

// Adjust moves to be a multiple of 16 bytes.
(Move [size] dst src mem) && size > 16 && size%16 != 0 && size%16 <= 8 ->
	(Move [size-size%16] (ADDQconst <dst.Type> dst [size%16]) (ADDQconst <src.Type> src [size%16])
		(MOVQstore dst (MOVQload src mem) mem))
(Move [size] dst src mem) && size > 16 && size%16 != 0 && size%16 > 8 ->
	(Move [size-size%16] (ADDQconst <dst.Type> dst [size%16]) (ADDQconst <src.Type> src [size%16])
		(MOVOstore dst (MOVOload src mem) mem))

// Medium copying uses a duff device.
(Move [size] dst src mem) && size >= 32 && size <= 16*64 && size%16 == 0 ->
	(DUFFCOPY [14*(64-size/16)] dst src mem)
// 14 and 64 are magic constants.  14 is the number of bytes to encode:
//	MOVUPS	(SI), X0
//	ADDQ	$16, SI
//	MOVUPS	X0, (DI)
//	ADDQ	$16, DI
// and 64 is the number of such blocks.  See src/runtime/duff_amd64.s:duffcopy.

// Large copying uses REP MOVSQ.
(Move [size] dst src mem) && size > 16*64 && size%8 == 0 ->
	(REPMOVSQ dst src (MOVQconst [size/8]) mem)

(Not x) -> (XORBconst [1] x)

(OffPtr [off] ptr) -> (ADDQconst [off] ptr)

(Const8 [val]) -> (MOVBconst [val])
(Const16 [val]) -> (MOVWconst [val])
(Const32 [val]) -> (MOVLconst [val])
(Const64 [val]) -> (MOVQconst [val])
(Const32F [val]) -> (MOVSSconst [val])
(Const64F [val]) -> (MOVSDconst [val])
(ConstNil) -> (MOVQconst [0])
(ConstBool [b]) -> (MOVBconst [b])

(Addr {sym} base) -> (LEAQ {sym} base)

(ITab (Load ptr mem)) -> (MOVQload ptr mem)

// block rewrites
(If (SETL  cmp) yes no) -> (LT  cmp yes no)
(If (SETLE cmp) yes no) -> (LE  cmp yes no)
(If (SETG  cmp) yes no) -> (GT  cmp yes no)
(If (SETGE cmp) yes no) -> (GE  cmp yes no)
(If (SETEQ cmp) yes no) -> (EQ  cmp yes no)
(If (SETNE cmp) yes no) -> (NE  cmp yes no)
(If (SETB  cmp) yes no) -> (ULT cmp yes no)
(If (SETBE cmp) yes no) -> (ULE cmp yes no)
(If (SETA  cmp) yes no) -> (UGT cmp yes no)
(If (SETAE cmp) yes no) -> (UGE cmp yes no)

// Special case for floating point - LF/LEF not generated
(If (SETGF  cmp) yes no) -> (UGT  cmp yes no)
(If (SETGEF cmp) yes no) -> (UGE  cmp yes no)
(If (SETEQF cmp) yes no) -> (EQF  cmp yes no)
(If (SETNEF cmp) yes no) -> (EQF  cmp yes no)

(If cond yes no) -> (NE (TESTB cond cond) yes no)

(NE (TESTB (SETL  cmp)) yes no) -> (LT  cmp yes no)
(NE (TESTB (SETLE cmp)) yes no) -> (LE  cmp yes no)
(NE (TESTB (SETG  cmp)) yes no) -> (GT  cmp yes no)
(NE (TESTB (SETGE cmp)) yes no) -> (GE  cmp yes no)
(NE (TESTB (SETEQ cmp)) yes no) -> (EQ  cmp yes no)
(NE (TESTB (SETNE cmp)) yes no) -> (NE  cmp yes no)
(NE (TESTB (SETB  cmp)) yes no) -> (ULT cmp yes no)
(NE (TESTB (SETBE cmp)) yes no) -> (ULE cmp yes no)
(NE (TESTB (SETA  cmp)) yes no) -> (UGT cmp yes no)
(NE (TESTB (SETAE cmp)) yes no) -> (UGE cmp yes no)

// Special case for floating point - LF/LEF not generated
(NE (TESTB (SETGF  cmp)) yes no) -> (UGT  cmp yes no)
(NE (TESTB (SETGEF cmp)) yes no) -> (UGE  cmp yes no)
(NE (TESTB (SETEQF cmp)) yes no) -> (EQF  cmp yes no)
(NE (TESTB (SETNEF cmp)) yes no) -> (NEF  cmp yes no)

// Disabled because it interferes with the pattern match above and makes worse code.
// (SETNEF x) -> (ORQ (SETNE <config.Frontend().TypeInt8()> x) (SETNAN <config.Frontend().TypeInt8()> x))
// (SETEQF x) -> (ANDQ (SETEQ <config.Frontend().TypeInt8()> x) (SETORD <config.Frontend().TypeInt8()> x))

(StaticCall [argwid] {target} mem) -> (CALLstatic [argwid] {target} mem)
(ClosureCall [argwid] entry closure mem) -> (CALLclosure [argwid] entry closure mem)
(DeferCall [argwid] mem) -> (CALLdefer [argwid] mem)
(GoCall [argwid] mem) -> (CALLgo [argwid] mem)
(InterCall [argwid] entry mem) -> (CALLinter [argwid] entry mem)

// Rules below here apply some simple optimizations after lowering.
// TODO: Should this be a separate pass?

// fold constants into instructions
(ADDQ x (MOVQconst [c])) && is32Bit(c) -> (ADDQconst [c] x)
(ADDQ (MOVQconst [c]) x) && is32Bit(c) -> (ADDQconst [c] x)
(ADDL x (MOVLconst [c])) -> (ADDLconst [c] x)
(ADDL (MOVLconst [c]) x) -> (ADDLconst [c] x)
(ADDW x (MOVWconst [c])) -> (ADDWconst [c] x)
(ADDW (MOVWconst [c]) x) -> (ADDWconst [c] x)
(ADDB x (MOVBconst [c])) -> (ADDBconst [c] x)
(ADDB (MOVBconst [c]) x) -> (ADDBconst [c] x)

(SUBQ x (MOVQconst [c])) && is32Bit(c) -> (SUBQconst x [c])
(SUBQ (MOVQconst [c]) x) && is32Bit(c) -> (NEGQ (SUBQconst <v.Type> x [c]))
(SUBL x (MOVLconst [c])) -> (SUBLconst x [c])
(SUBL (MOVLconst [c]) x) -> (NEGL (SUBLconst <v.Type> x [c]))
(SUBW x (MOVWconst [c])) -> (SUBWconst x [c])
(SUBW (MOVWconst [c]) x) -> (NEGW (SUBWconst <v.Type> x [c]))
(SUBB x (MOVBconst [c])) -> (SUBBconst x [c])
(SUBB (MOVBconst [c]) x) -> (NEGB (SUBBconst <v.Type> x [c]))

(MULQ x (MOVQconst [c])) && is32Bit(c) -> (MULQconst [c] x)
(MULQ (MOVQconst [c]) x) && is32Bit(c) -> (MULQconst [c] x)
(MULL x (MOVLconst [c])) -> (MULLconst [c] x)
(MULL (MOVLconst [c]) x) -> (MULLconst [c] x)
(MULW x (MOVWconst [c])) -> (MULWconst [c] x)
(MULW (MOVWconst [c]) x) -> (MULWconst [c] x)
(MULB x (MOVBconst [c])) -> (MULBconst [c] x)
(MULB (MOVBconst [c]) x) -> (MULBconst [c] x)

(ANDQ x (MOVQconst [c])) && is32Bit(c) -> (ANDQconst [c] x)
(ANDQ (MOVQconst [c]) x) && is32Bit(c) -> (ANDQconst [c] x)
(ANDL x (MOVLconst [c])) -> (ANDLconst [c] x)
(ANDL (MOVLconst [c]) x) -> (ANDLconst [c] x)
(ANDW x (MOVLconst [c])) -> (ANDWconst [c] x)
(ANDW (MOVLconst [c]) x) -> (ANDWconst [c] x)
(ANDW x (MOVWconst [c])) -> (ANDWconst [c] x)
(ANDW (MOVWconst [c]) x) -> (ANDWconst [c] x)
(ANDB x (MOVLconst [c])) -> (ANDBconst [c] x)
(ANDB (MOVLconst [c]) x) -> (ANDBconst [c] x)
(ANDB x (MOVBconst [c])) -> (ANDBconst [c] x)
(ANDB (MOVBconst [c]) x) -> (ANDBconst [c] x)

(ORQ x (MOVQconst [c])) && is32Bit(c) -> (ORQconst [c] x)
(ORQ (MOVQconst [c]) x) && is32Bit(c) -> (ORQconst [c] x)
(ORL x (MOVLconst [c])) -> (ORLconst [c] x)
(ORL (MOVLconst [c]) x) -> (ORLconst [c] x)
(ORW x (MOVWconst [c])) -> (ORWconst [c] x)
(ORW (MOVWconst [c]) x) -> (ORWconst [c] x)
(ORB x (MOVBconst [c])) -> (ORBconst [c] x)
(ORB (MOVBconst [c]) x) -> (ORBconst [c] x)

(XORQ x (MOVQconst [c])) && is32Bit(c) -> (XORQconst [c] x)
(XORQ (MOVQconst [c]) x) && is32Bit(c) -> (XORQconst [c] x)
(XORL x (MOVLconst [c])) -> (XORLconst [c] x)
(XORL (MOVLconst [c]) x) -> (XORLconst [c] x)
(XORW x (MOVWconst [c])) -> (XORWconst [c] x)
(XORW (MOVWconst [c]) x) -> (XORWconst [c] x)
(XORB x (MOVBconst [c])) -> (XORBconst [c] x)
(XORB (MOVBconst [c]) x) -> (XORBconst [c] x)

(SHLQ x (MOVQconst [c])) -> (SHLQconst [c&63] x)
(SHLL x (MOVLconst [c])) -> (SHLLconst [c&31] x)
(SHLW x (MOVWconst [c])) -> (SHLWconst [c&31] x)
(SHLB x (MOVBconst [c])) -> (SHLBconst [c&31] x)

(SHRQ x (MOVQconst [c])) -> (SHRQconst [c&63] x)
(SHRL x (MOVLconst [c])) -> (SHRLconst [c&31] x)
(SHRW x (MOVWconst [c])) -> (SHRWconst [c&31] x)
(SHRB x (MOVBconst [c])) -> (SHRBconst [c&31] x)

(SARQ x (MOVQconst [c])) -> (SARQconst [c&63] x)
(SARL x (MOVLconst [c])) -> (SARLconst [c&31] x)
(SARW x (MOVWconst [c])) -> (SARWconst [c&31] x)
(SARB x (MOVBconst [c])) -> (SARBconst [c&31] x)

// Note: the word and byte shifts keep the low 5 bits (not the low 4 or 3 bits)
// because the x86 instructions are defined to use all 5 bits of the shift even
// for the small shifts.  I don't think we'll ever generate a weird shift (e.g.
// (SHLW x (MOVWconst [24])), but just in case.

(CMPQ x (MOVQconst [c])) && is32Bit(c) -> (CMPQconst x [c])
(CMPQ (MOVQconst [c]) x) && is32Bit(c) -> (InvertFlags (CMPQconst x [c]))
(CMPL x (MOVLconst [c])) -> (CMPLconst x [c])
(CMPL (MOVLconst [c]) x) -> (InvertFlags (CMPLconst x [c]))
(CMPW x (MOVWconst [c])) -> (CMPWconst x [c])
(CMPW (MOVWconst [c]) x) -> (InvertFlags (CMPWconst x [c]))
(CMPB x (MOVBconst [c])) -> (CMPBconst x [c])
(CMPB (MOVBconst [c]) x) -> (InvertFlags (CMPBconst x [c]))

// strength reduction
(MULQconst [-1] x) -> (NEGQ x)
(MULQconst [0] _) -> (MOVQconst [0])
(MULQconst [1] x) -> x
(MULQconst [3] x) -> (LEAQ2 x x)
(MULQconst [5] x) -> (LEAQ4 x x)
(MULQconst [9] x) -> (LEAQ8 x x)
(MULQconst [c] x) && isPowerOfTwo(c) -> (SHLQconst [log2(c)] x)

// fold add/shift into leaq
(ADDQ x (SHLQconst [3] y)) -> (LEAQ8 x y)
(ADDQconst [c] (LEAQ8 [d] x y)) -> (LEAQ8 [addOff(c, d)] x y)

// reverse ordering of compare instruction
(SETL (InvertFlags x)) -> (SETG x)
(SETG (InvertFlags x)) -> (SETL x)
(SETB (InvertFlags x)) -> (SETA x)
(SETA (InvertFlags x)) -> (SETB x)
(SETLE (InvertFlags x)) -> (SETGE x)
(SETGE (InvertFlags x)) -> (SETLE x)
(SETBE (InvertFlags x)) -> (SETAE x)
(SETAE (InvertFlags x)) -> (SETBE x)
(SETEQ (InvertFlags x)) -> (SETEQ x)
(SETNE (InvertFlags x)) -> (SETNE x)

// sign extended loads
// Note: The combined instruction must end up in the same block
// as the original load.  If not, we end up making a value with
// memory type live in two different blocks, which can lead to
// multiple memory values alive simultaneously.
(MOVBQSX (MOVBload [off] {sym} ptr mem)) -> @v.Args[0].Block (MOVBQSXload <v.Type> [off] {sym} ptr mem)
(MOVBQZX (MOVBload [off] {sym} ptr mem)) -> @v.Args[0].Block (MOVBQZXload <v.Type> [off] {sym} ptr mem)
// TODO: more

// Don't extend before storing
(MOVLstore [off] {sym} ptr (MOVLQSX x) mem) -> (MOVLstore [off] {sym} ptr x mem)
(MOVWstore [off] {sym} ptr (MOVWQSX x) mem) -> (MOVWstore [off] {sym} ptr x mem)
(MOVBstore [off] {sym} ptr (MOVBQSX x) mem) -> (MOVBstore [off] {sym} ptr x mem)
(MOVLstore [off] {sym} ptr (MOVLQZX x) mem) -> (MOVLstore [off] {sym} ptr x mem)
(MOVWstore [off] {sym} ptr (MOVWQZX x) mem) -> (MOVWstore [off] {sym} ptr x mem)
(MOVBstore [off] {sym} ptr (MOVBQZX x) mem) -> (MOVBstore [off] {sym} ptr x mem)

// fold constants into memory operations
// Note that this is not always a good idea because if not all the uses of
// the ADDQconst get eliminated, we still have to compute the ADDQconst and we now
// have potentially two live values (ptr and (ADDQconst [off] ptr)) instead of one.
// Nevertheless, let's do it!
(MOVQload  [off1] {sym} (ADDQconst [off2] ptr) mem) -> (MOVQload  [addOff(off1, off2)] {sym} ptr mem)
(MOVLload  [off1] {sym} (ADDQconst [off2] ptr) mem) -> (MOVLload  [addOff(off1, off2)] {sym} ptr mem)
(MOVWload  [off1] {sym} (ADDQconst [off2] ptr) mem) -> (MOVWload  [addOff(off1, off2)] {sym} ptr mem)
(MOVBload  [off1] {sym} (ADDQconst [off2] ptr) mem) -> (MOVBload  [addOff(off1, off2)] {sym} ptr mem)
(MOVSSload [off1] {sym} (ADDQconst [off2] ptr) mem) -> (MOVSSload [addOff(off1, off2)] {sym} ptr mem)
(MOVSDload [off1] {sym} (ADDQconst [off2] ptr) mem) -> (MOVSDload [addOff(off1, off2)] {sym} ptr mem)
(MOVOload  [off1] {sym} (ADDQconst [off2] ptr) mem) -> (MOVOload  [addOff(off1, off2)] {sym} ptr mem)

(MOVQstore  [off1] {sym} (ADDQconst [off2] ptr) val mem) -> (MOVQstore  [addOff(off1, off2)] {sym} ptr val mem)
(MOVLstore  [off1] {sym} (ADDQconst [off2] ptr) val mem) -> (MOVLstore  [addOff(off1, off2)] {sym} ptr val mem)
(MOVWstore  [off1] {sym} (ADDQconst [off2] ptr) val mem) -> (MOVWstore  [addOff(off1, off2)] {sym} ptr val mem)
(MOVBstore  [off1] {sym} (ADDQconst [off2] ptr) val mem) -> (MOVBstore  [addOff(off1, off2)] {sym} ptr val mem)
(MOVSSstore [off1] {sym} (ADDQconst [off2] ptr) val mem) -> (MOVSSstore [addOff(off1, off2)] {sym} ptr val mem)
(MOVSDstore [off1] {sym} (ADDQconst [off2] ptr) val mem) -> (MOVSDstore [addOff(off1, off2)] {sym} ptr val mem)
(MOVOstore  [off1] {sym} (ADDQconst [off2] ptr) val mem) -> (MOVOstore  [addOff(off1, off2)] {sym} ptr val mem)

// Fold constants into stores.
(MOVQstore [off] {sym} ptr (MOVQconst [c]) mem) && validStoreConst(c,off) ->
	(MOVQstoreconst [makeStoreConst(c,off)] {sym} ptr mem)
(MOVLstore [off] {sym} ptr (MOVLconst [c]) mem) && validStoreConstOff(off) ->
	(MOVLstoreconst [makeStoreConst(int64(int32(c)),off)] {sym} ptr mem)
(MOVWstore [off] {sym} ptr (MOVWconst [c]) mem) && validStoreConstOff(off) ->
	(MOVWstoreconst [makeStoreConst(int64(int16(c)),off)] {sym} ptr mem)
(MOVBstore [off] {sym} ptr (MOVBconst [c]) mem) && validStoreConstOff(off) ->
	(MOVBstoreconst [makeStoreConst(int64(int8(c)),off)] {sym} ptr mem)

// Fold address offsets into constant stores.
(MOVQstoreconst [sc] {s} (ADDQconst [off] ptr) mem) && StoreConst(sc).canAdd(off) ->
	(MOVQstoreconst [StoreConst(sc).add(off)] {s} ptr mem)
(MOVLstoreconst [sc] {s} (ADDQconst [off] ptr) mem) && StoreConst(sc).canAdd(off) ->
	(MOVLstoreconst [StoreConst(sc).add(off)] {s} ptr mem)
(MOVWstoreconst [sc] {s} (ADDQconst [off] ptr) mem) && StoreConst(sc).canAdd(off) ->
	(MOVWstoreconst [StoreConst(sc).add(off)] {s} ptr mem)
(MOVBstoreconst [sc] {s} (ADDQconst [off] ptr) mem) && StoreConst(sc).canAdd(off) ->
	(MOVBstoreconst [StoreConst(sc).add(off)] {s} ptr mem)

// We need to fold LEAQ into the MOVx ops so that the live variable analysis knows
// what variables are being read/written by the ops.
(MOVQload  [off1] {sym1} (LEAQ [off2] {sym2} base) mem) && canMergeSym(sym1, sym2) ->
	(MOVQload  [addOff(off1,off2)] {mergeSym(sym1,sym2)} base mem)
(MOVLload  [off1] {sym1} (LEAQ [off2] {sym2} base) mem) && canMergeSym(sym1, sym2) ->
	(MOVLload  [addOff(off1,off2)] {mergeSym(sym1,sym2)} base mem)
(MOVWload  [off1] {sym1} (LEAQ [off2] {sym2} base) mem) && canMergeSym(sym1, sym2) ->
	(MOVWload  [addOff(off1,off2)] {mergeSym(sym1,sym2)} base mem)
(MOVBload  [off1] {sym1} (LEAQ [off2] {sym2} base) mem) && canMergeSym(sym1, sym2) ->
	(MOVBload  [addOff(off1,off2)] {mergeSym(sym1,sym2)} base mem)
(MOVSSload [off1] {sym1} (LEAQ [off2] {sym2} base) mem) && canMergeSym(sym1, sym2) ->
	(MOVSSload [addOff(off1,off2)] {mergeSym(sym1,sym2)} base mem)
(MOVSDload [off1] {sym1} (LEAQ [off2] {sym2} base) mem) && canMergeSym(sym1, sym2) ->
	(MOVSDload [addOff(off1,off2)] {mergeSym(sym1,sym2)} base mem)
(MOVOload [off1] {sym1} (LEAQ [off2] {sym2} base) mem) && canMergeSym(sym1, sym2) ->
	(MOVOload [addOff(off1,off2)] {mergeSym(sym1,sym2)} base mem)

(MOVQstore  [off1] {sym1} (LEAQ [off2] {sym2} base) val mem) && canMergeSym(sym1, sym2) ->
	(MOVQstore  [addOff(off1,off2)] {mergeSym(sym1,sym2)} base val mem)
(MOVLstore  [off1] {sym1} (LEAQ [off2] {sym2} base) val mem) && canMergeSym(sym1, sym2) ->
	(MOVLstore  [addOff(off1,off2)] {mergeSym(sym1,sym2)} base val mem)
(MOVWstore  [off1] {sym1} (LEAQ [off2] {sym2} base) val mem) && canMergeSym(sym1, sym2) ->
	(MOVWstore  [addOff(off1,off2)] {mergeSym(sym1,sym2)} base val mem)
(MOVBstore  [off1] {sym1} (LEAQ [off2] {sym2} base) val mem) && canMergeSym(sym1, sym2) ->
	(MOVBstore  [addOff(off1,off2)] {mergeSym(sym1,sym2)} base val mem)
(MOVSSstore [off1] {sym1} (LEAQ [off2] {sym2} base) val mem) && canMergeSym(sym1, sym2) ->
	(MOVSSstore [addOff(off1,off2)] {mergeSym(sym1,sym2)} base val mem)
(MOVSDstore [off1] {sym1} (LEAQ [off2] {sym2} base) val mem) && canMergeSym(sym1, sym2) ->
	(MOVSDstore [addOff(off1,off2)] {mergeSym(sym1,sym2)} base val mem)
(MOVOstore [off1] {sym1} (LEAQ [off2] {sym2} base) val mem) && canMergeSym(sym1, sym2) ->
	(MOVOstore [addOff(off1,off2)] {mergeSym(sym1,sym2)} base val mem)

(MOVQstoreconst [sc] {sym1} (LEAQ [off] {sym2} ptr) mem) && canMergeSym(sym1, sym2) && StoreConst(sc).canAdd(off) ->
	(MOVQstoreconst [StoreConst(sc).add(off)] {mergeSym(sym1, sym2)} ptr mem)
(MOVLstoreconst [sc] {sym1} (LEAQ [off] {sym2} ptr) mem) && canMergeSym(sym1, sym2) && StoreConst(sc).canAdd(off) ->
	(MOVLstoreconst [StoreConst(sc).add(off)] {mergeSym(sym1, sym2)} ptr mem)
(MOVWstoreconst [sc] {sym1} (LEAQ [off] {sym2} ptr) mem) && canMergeSym(sym1, sym2) && StoreConst(sc).canAdd(off) ->
	(MOVWstoreconst [StoreConst(sc).add(off)] {mergeSym(sym1, sym2)} ptr mem)
(MOVBstoreconst [sc] {sym1} (LEAQ [off] {sym2} ptr) mem) && canMergeSym(sym1, sym2) && StoreConst(sc).canAdd(off) ->
	(MOVBstoreconst [StoreConst(sc).add(off)] {mergeSym(sym1, sym2)} ptr mem)

// indexed loads and stores
(MOVQloadidx8 [off1] {sym} (ADDQconst [off2] ptr) idx mem) -> (MOVQloadidx8 [addOff(off1, off2)] {sym} ptr idx mem)
(MOVQstoreidx8 [off1] {sym} (ADDQconst [off2] ptr) idx val mem) -> (MOVQstoreidx8 [addOff(off1, off2)] {sym} ptr idx val mem)
(MOVSSloadidx4 [off1] {sym} (ADDQconst [off2] {sym} ptr) idx mem) -> (MOVSSloadidx4 [addOff(off1, off2)] {sym} ptr idx mem)
(MOVSSstoreidx4 [off1] {sym} (ADDQconst [off2] {sym} ptr) idx val mem) -> (MOVSSstoreidx4 [addOff(off1, off2)] {sym} ptr idx val mem)
(MOVSDloadidx8 [off1] {sym} (ADDQconst [off2] {sym} ptr) idx mem) -> (MOVSDloadidx8 [addOff(off1, off2)] {sym} ptr idx mem)
(MOVSDstoreidx8 [off1] {sym} (ADDQconst [off2] {sym} ptr) idx val mem) -> (MOVSDstoreidx8 [addOff(off1, off2)] {sym} ptr idx val mem)

(MOVQload [off1] {sym1} (LEAQ8 [off2] {sym2} ptr idx) mem) && canMergeSym(sym1, sym2) ->
	(MOVQloadidx8 [addOff(off1, off2)] {mergeSym(sym1,sym2)} ptr idx mem)
(MOVQstore [off1] {sym1} (LEAQ8 [off2] {sym2} ptr idx) val mem) && canMergeSym(sym1, sym2) ->
	(MOVQstoreidx8 [addOff(off1, off2)] {mergeSym(sym1,sym2)} ptr idx val mem)

(MOVSSload [off1] {sym1} (LEAQ4 [off2] {sym2} ptr idx) mem) && canMergeSym(sym1, sym2) ->
	(MOVSSloadidx4 [addOff(off1, off2)] {mergeSym(sym1,sym2)} ptr idx mem)
(MOVSSstore [off1] {sym1} (LEAQ4 [off2] {sym2} ptr idx) val mem) && canMergeSym(sym1, sym2) ->
	(MOVSSstoreidx4 [addOff(off1, off2)] {mergeSym(sym1,sym2)} ptr idx val mem)

(MOVSDload [off1] {sym1} (LEAQ8 [off2] {sym2} ptr idx) mem) && canMergeSym(sym1, sym2) ->
	(MOVSDloadidx8 [addOff(off1, off2)] {mergeSym(sym1,sym2)} ptr idx mem)
(MOVSDstore [off1] {sym1} (LEAQ8 [off2] {sym2} ptr idx) val mem) && canMergeSym(sym1, sym2) ->
	(MOVSDstoreidx8 [addOff(off1, off2)] {mergeSym(sym1,sym2)} ptr idx val mem)

// lower Zero instructions with word sizes
(Zero [0] _ mem) -> mem
(Zero [1] destptr mem) -> (MOVBstoreconst [0] destptr mem)
(Zero [2] destptr mem) -> (MOVWstoreconst [0] destptr mem)
(Zero [4] destptr mem) -> (MOVLstoreconst [0] destptr mem)
(Zero [8] destptr mem) -> (MOVQstoreconst [0] destptr mem)

(Zero [3] destptr mem) ->
	(MOVBstoreconst [makeStoreConst(0,2)] destptr
		(MOVWstoreconst [0] destptr mem))
(Zero [5] destptr mem) ->
	(MOVBstoreconst [makeStoreConst(0,4)] destptr
		(MOVLstoreconst [0] destptr mem))
(Zero [6] destptr mem) ->
	(MOVWstoreconst [makeStoreConst(0,4)] destptr
		(MOVLstoreconst [0] destptr mem))
(Zero [7] destptr mem) ->
	(MOVLstoreconst [makeStoreConst(0,3)] destptr
		(MOVLstoreconst [0] destptr mem))

// Strip off any fractional word zeroing.
(Zero [size] destptr mem) && size%8 != 0 && size > 8 ->
	(Zero [size-size%8] (ADDQconst destptr [size%8])
		(MOVQstoreconst [0] destptr mem))

// Zero small numbers of words directly.
(Zero [16] destptr mem) ->
	(MOVQstoreconst [makeStoreConst(0,8)] destptr
		(MOVQstoreconst [0] destptr mem))
(Zero [24] destptr mem) ->
	(MOVQstoreconst [makeStoreConst(0,16)] destptr
		(MOVQstoreconst [makeStoreConst(0,8)] destptr
			(MOVQstoreconst [0] destptr mem)))
(Zero [32] destptr mem) ->
	(MOVQstoreconst [makeStoreConst(0,24)] destptr
		(MOVQstoreconst [makeStoreConst(0,16)] destptr
			(MOVQstoreconst [makeStoreConst(0,8)] destptr
				(MOVQstoreconst [0] destptr mem))))

// Medium zeroing uses a duff device.
(Zero [size] destptr mem) && size <= 1024 && size%8 == 0 && size%16 != 0 ->
	(Zero [size-8] (ADDQconst [8] destptr) (MOVQstore destptr (MOVQconst [0]) mem))
(Zero [size] destptr mem) && size <= 1024 && size%16 == 0 ->
	(DUFFZERO [duffStart(size)] (ADDQconst [duffAdj(size)] destptr) (MOVOconst [0]) mem)

// Large zeroing uses REP STOSQ.
(Zero [size] destptr mem) && size > 1024 && size%8 == 0 ->
	(REPSTOSQ destptr (MOVQconst [size/8]) (MOVQconst [0]) mem)

// Absorb InvertFlags into branches.
(LT (InvertFlags cmp) yes no) -> (GT cmp yes no)
(GT (InvertFlags cmp) yes no) -> (LT cmp yes no)
(LE (InvertFlags cmp) yes no) -> (GE cmp yes no)
(GE (InvertFlags cmp) yes no) -> (LE cmp yes no)
(ULT (InvertFlags cmp) yes no) -> (UGT cmp yes no)
(UGT (InvertFlags cmp) yes no) -> (ULT cmp yes no)
(ULE (InvertFlags cmp) yes no) -> (UGE cmp yes no)
(UGE (InvertFlags cmp) yes no) -> (ULE cmp yes no)
(EQ (InvertFlags cmp) yes no) -> (EQ cmp yes no)
(NE (InvertFlags cmp) yes no) -> (NE cmp yes no)

// get rid of overflow code for constant shifts
(SBBQcarrymask (CMPQconst [c] (MOVQconst [d]))) &&  inBounds64(d, c) -> (MOVQconst [-1])
(SBBQcarrymask (CMPQconst [c] (MOVQconst [d]))) && !inBounds64(d, c) -> (MOVQconst [0])
(SBBQcarrymask (CMPLconst [c] (MOVLconst [d]))) &&  inBounds32(d, c) -> (MOVQconst [-1])
(SBBQcarrymask (CMPLconst [c] (MOVLconst [d]))) && !inBounds32(d, c) -> (MOVQconst [0])
(SBBQcarrymask (CMPWconst [c] (MOVWconst [d]))) &&  inBounds16(d, c) -> (MOVQconst [-1])
(SBBQcarrymask (CMPWconst [c] (MOVWconst [d]))) && !inBounds16(d, c) -> (MOVQconst [0])
(SBBQcarrymask (CMPBconst [c] (MOVBconst [d]))) &&  inBounds8(d, c)  -> (MOVQconst [-1])
(SBBQcarrymask (CMPBconst [c] (MOVBconst [d]))) && !inBounds8(d, c)  -> (MOVQconst [0])
(SBBLcarrymask (CMPQconst [c] (MOVQconst [d]))) &&  inBounds64(d, c) -> (MOVLconst [-1])
(SBBLcarrymask (CMPQconst [c] (MOVQconst [d]))) && !inBounds64(d, c) -> (MOVLconst [0])
(SBBLcarrymask (CMPLconst [c] (MOVLconst [d]))) &&  inBounds32(d, c) -> (MOVLconst [-1])
(SBBLcarrymask (CMPLconst [c] (MOVLconst [d]))) && !inBounds32(d, c) -> (MOVLconst [0])
(SBBLcarrymask (CMPWconst [c] (MOVWconst [d]))) &&  inBounds16(d, c) -> (MOVLconst [-1])
(SBBLcarrymask (CMPWconst [c] (MOVWconst [d]))) && !inBounds16(d, c) -> (MOVLconst [0])
(SBBLcarrymask (CMPBconst [c] (MOVBconst [d]))) &&  inBounds8(d, c)  -> (MOVLconst [-1])
(SBBLcarrymask (CMPBconst [c] (MOVBconst [d]))) && !inBounds8(d, c)  -> (MOVLconst [0])

// Remove redundant *const ops
(ADDQconst [0] x) -> x
(ADDLconst [c] x) && int32(c)==0 -> x
(ADDWconst [c] x) && int16(c)==0 -> x
(ADDBconst [c] x) && int8(c)==0 -> x
(SUBQconst [0] x) -> x
(SUBLconst [c] x) && int32(c) == 0 -> x
(SUBWconst [c] x) && int16(c) == 0 -> x
(SUBBconst [c] x) && int8(c) == 0 -> x
(ANDQconst [0] _)                 -> (MOVQconst [0])
(ANDLconst [c] _) && int32(c)==0  -> (MOVLconst [0])
(ANDWconst [c] _) && int16(c)==0  -> (MOVWconst [0])
(ANDBconst [c] _) && int8(c)==0   -> (MOVBconst [0])
(ANDQconst [-1] x)                -> x
(ANDLconst [c] x) && int32(c)==-1 -> x
(ANDWconst [c] x) && int16(c)==-1 -> x
(ANDBconst [c] x) && int8(c)==-1  -> x
(ORQconst [0] x)                  -> x
(ORLconst [c] x) && int32(c)==0   -> x
(ORWconst [c] x) && int16(c)==0   -> x
(ORBconst [c] x) && int8(c)==0    -> x
(ORQconst [-1] _)                 -> (MOVQconst [-1])
(ORLconst [c] _) && int32(c)==-1  -> (MOVLconst [-1])
(ORWconst [c] _) && int16(c)==-1  -> (MOVWconst [-1])
(ORBconst [c] _) && int8(c)==-1   -> (MOVBconst [-1])
(XORQconst [0] x)                  -> x
(XORLconst [c] x) && int32(c)==0   -> x
(XORWconst [c] x) && int16(c)==0   -> x
(XORBconst [c] x) && int8(c)==0    -> x

// generic constant folding
// TODO: more of this
(ADDQconst [c] (MOVQconst [d])) -> (MOVQconst [c+d])
(ADDLconst [c] (MOVLconst [d])) -> (MOVLconst [c+d])
(ADDWconst [c] (MOVWconst [d])) -> (MOVWconst [c+d])
(ADDBconst [c] (MOVBconst [d])) -> (MOVBconst [c+d])
(ADDQconst [c] (ADDQconst [d] x)) -> (ADDQconst [c+d] x)
(ADDLconst [c] (ADDLconst [d] x)) -> (ADDLconst [c+d] x)
(ADDWconst [c] (ADDWconst [d] x)) -> (ADDWconst [c+d] x)
(ADDBconst [c] (ADDBconst [d] x)) -> (ADDBconst [c+d] x)
(SUBQconst [c] (MOVQconst [d])) -> (MOVQconst [d-c])
(SUBLconst [c] (MOVLconst [d])) -> (MOVLconst [d-c])
(SUBWconst [c] (MOVWconst [d])) -> (MOVWconst [d-c])
(SUBBconst [c] (MOVBconst [d])) -> (MOVBconst [d-c])
(SUBQconst [c] (SUBQconst [d] x)) -> (ADDQconst [-c-d] x)
(SUBLconst [c] (SUBLconst [d] x)) -> (ADDLconst [-c-d] x)
(SUBWconst [c] (SUBWconst [d] x)) -> (ADDWconst [-c-d] x)
(SUBBconst [c] (SUBBconst [d] x)) -> (ADDBconst [-c-d] x)
(SARQconst [c] (MOVQconst [d])) -> (MOVQconst [d>>uint64(c)])
(SARLconst [c] (MOVQconst [d])) -> (MOVQconst [d>>uint64(c)])
(SARWconst [c] (MOVQconst [d])) -> (MOVQconst [d>>uint64(c)])
(SARBconst [c] (MOVQconst [d])) -> (MOVQconst [d>>uint64(c)])
(NEGQ (MOVQconst [c])) -> (MOVQconst [-c])
(NEGL (MOVLconst [c])) -> (MOVLconst [-c])
(NEGW (MOVWconst [c])) -> (MOVWconst [-c])
(NEGB (MOVBconst [c])) -> (MOVBconst [-c])
(MULQconst [c] (MOVQconst [d])) -> (MOVQconst [c*d])
(MULLconst [c] (MOVLconst [d])) -> (MOVLconst [c*d])
(MULWconst [c] (MOVWconst [d])) -> (MOVWconst [c*d])
(MULBconst [c] (MOVBconst [d])) -> (MOVBconst [c*d])
(ANDQconst [c] (MOVQconst [d])) -> (MOVQconst [c&d])
(ANDLconst [c] (MOVLconst [d])) -> (MOVLconst [c&d])
(ANDWconst [c] (MOVWconst [d])) -> (MOVWconst [c&d])
(ANDBconst [c] (MOVBconst [d])) -> (MOVBconst [c&d])
(ORQconst [c] (MOVQconst [d])) -> (MOVQconst [c|d])
(ORLconst [c] (MOVLconst [d])) -> (MOVLconst [c|d])
(ORWconst [c] (MOVWconst [d])) -> (MOVWconst [c|d])
(ORBconst [c] (MOVBconst [d])) -> (MOVBconst [c|d])
(XORQconst [c] (MOVQconst [d])) -> (MOVQconst [c^d])
(XORLconst [c] (MOVLconst [d])) -> (MOVLconst [c^d])
(XORWconst [c] (MOVWconst [d])) -> (MOVWconst [c^d])
(XORBconst [c] (MOVBconst [d])) -> (MOVBconst [c^d])
(NOTQ (MOVQconst [c])) -> (MOVQconst [^c])
(NOTL (MOVLconst [c])) -> (MOVLconst [^c])
(NOTW (MOVWconst [c])) -> (MOVWconst [^c])
(NOTB (MOVBconst [c])) -> (MOVBconst [^c])

// generic simplifications
// TODO: more of this
(ADDQ x (NEGQ y)) -> (SUBQ x y)
(ADDL x (NEGL y)) -> (SUBL x y)
(ADDW x (NEGW y)) -> (SUBW x y)
(ADDB x (NEGB y)) -> (SUBB x y)
(SUBQ x x) -> (MOVQconst [0])
(SUBL x x) -> (MOVLconst [0])
(SUBW x x) -> (MOVWconst [0])
(SUBB x x) -> (MOVBconst [0])
(ANDQ x x) -> x
(ANDL x x) -> x
(ANDW x x) -> x
(ANDB x x) -> x
(ORQ x x) -> x
(ORL x x) -> x
(ORW x x) -> x
(ORB x x) -> x
(XORQ x x) -> (MOVQconst [0])
(XORL x x) -> (MOVLconst [0])
(XORW x x) -> (MOVWconst [0])
(XORB x x) -> (MOVBconst [0])

