[dev.ssa] cmd/compile: use INC/DEC instead of add when we can

INC/DEC produces slightly faster and smaller code.
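
For example (an illustrative sketch, not code from this CL), in a
simple counter loop

	func count(n int) int {
		s := 0
		for i := 0; i < n; i++ {
			s++ // +1 in place: eligible for INCQ instead of ADDQ $1
		}
		return s
	}

the increments can now be emitted as INCQ whenever the value stays in
the same register (the analogous decrement case uses DECQ), saving one
byte per instruction since inc/dec carries no immediate operand.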

Change-Id: I329d9bdb01b90041be45e053d9df640818bf0c2d
Reviewed-on: https://go-review.googlesource.com/19238
Run-TryBot: Ilya Tocar <ilya.tocar@intel.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: David Chase <drchase@google.com>
diff --git a/src/cmd/compile/internal/gc/ssa.go b/src/cmd/compile/internal/gc/ssa.go
index b7019d6..35a4929 100644
--- a/src/cmd/compile/internal/gc/ssa.go
+++ b/src/cmd/compile/internal/gc/ssa.go
@@ -3985,12 +3985,47 @@
 		r := regnum(v)
 		a := regnum(v.Args[0])
 		if r == a {
-			p := Prog(v.Op.Asm())
-			p.From.Type = obj.TYPE_CONST
-			p.From.Offset = v.AuxInt
-			p.To.Type = obj.TYPE_REG
-			p.To.Reg = r
-			return
+			if v.AuxInt == 1 {
+				var asm int
+				switch v.Op {
+				// The software optimization manual recommends add $1,reg.
+				// But inc/dec is 1 byte smaller. ICC always uses inc;
+				// Clang/GCC choose depending on flags, but prefer add.
+				// Experiments show that inc/dec is both a little faster
+				// and makes the binary a little smaller.
+				case ssa.OpAMD64ADDQconst:
+					asm = x86.AINCQ
+				case ssa.OpAMD64ADDLconst:
+					asm = x86.AINCL
+				case ssa.OpAMD64ADDWconst:
+					asm = x86.AINCW
+				}
+				p := Prog(asm)
+				p.To.Type = obj.TYPE_REG
+				p.To.Reg = r
+				return
+			} else if v.AuxInt == -1 {
+				var asm int
+				switch v.Op {
+				case ssa.OpAMD64ADDQconst:
+					asm = x86.ADECQ
+				case ssa.OpAMD64ADDLconst:
+					asm = x86.ADECL
+				case ssa.OpAMD64ADDWconst:
+					asm = x86.ADECW
+				}
+				p := Prog(asm)
+				p.To.Type = obj.TYPE_REG
+				p.To.Reg = r
+				return
+			} else {
+				p := Prog(v.Op.Asm())
+				p.From.Type = obj.TYPE_CONST
+				p.From.Offset = v.AuxInt
+				p.To.Type = obj.TYPE_REG
+				p.To.Reg = r
+				return
+			}
 		}
 		var asm int
 		switch v.Op {
@@ -4027,15 +4062,83 @@
 		//p.From3 = new(obj.Addr)
 		//p.From3.Type = obj.TYPE_REG
 		//p.From3.Reg = regnum(v.Args[0])
+	case ssa.OpAMD64SUBQconst, ssa.OpAMD64SUBLconst, ssa.OpAMD64SUBWconst:
+		x := regnum(v.Args[0])
+		r := regnum(v)
+		// We have a 3-operand add (lea), so transforming a = b - const into
+		// a = b + (-const) saves us one instruction, except that we can't
+		// fit -(-1 << 31) into lea's 4-byte offset; that case uses sub.
+		// The 2-address cases are handled just fine below.
+		if v.AuxInt == -1<<31 {
+			if x != r {
+				// This code compensates for the fact that the register allocator
+				// doesn't understand 2-address instructions yet.  TODO: fix that.
+				p := Prog(moveByType(v.Type))
+				p.From.Type = obj.TYPE_REG
+				p.From.Reg = x
+				p.To.Type = obj.TYPE_REG
+				p.To.Reg = r
+			}
+			p := Prog(v.Op.Asm())
+			p.From.Type = obj.TYPE_CONST
+			p.From.Offset = v.AuxInt
+			p.To.Type = obj.TYPE_REG
+			p.To.Reg = r
+		} else if x == r && v.AuxInt == -1 {
+			var asm int
+			// x = x - (-1) is the same as x++.
+			// See the OpAMD64ADDQconst comments about inc vs add $1,reg.
+			switch v.Op {
+			case ssa.OpAMD64SUBQconst:
+				asm = x86.AINCQ
+			case ssa.OpAMD64SUBLconst:
+				asm = x86.AINCL
+			case ssa.OpAMD64SUBWconst:
+				asm = x86.AINCW
+			}
+			p := Prog(asm)
+			p.To.Type = obj.TYPE_REG
+			p.To.Reg = r
+		} else if x == r && v.AuxInt == 1 {
+			var asm int
+			switch v.Op {
+			case ssa.OpAMD64SUBQconst:
+				asm = x86.ADECQ
+			case ssa.OpAMD64SUBLconst:
+				asm = x86.ADECL
+			case ssa.OpAMD64SUBWconst:
+				asm = x86.ADECW
+			}
+			p := Prog(asm)
+			p.To.Type = obj.TYPE_REG
+			p.To.Reg = r
+		} else {
+			var asm int
+			switch v.Op {
+			case ssa.OpAMD64SUBQconst:
+				asm = x86.ALEAQ
+			case ssa.OpAMD64SUBLconst:
+				asm = x86.ALEAL
+			case ssa.OpAMD64SUBWconst:
+				asm = x86.ALEAW
+			}
+			p := Prog(asm)
+			p.From.Type = obj.TYPE_MEM
+			p.From.Reg = x
+			p.From.Offset = -v.AuxInt
+			p.To.Type = obj.TYPE_REG
+			p.To.Reg = r
+		}
+
 	case ssa.OpAMD64ADDBconst,
 		ssa.OpAMD64ANDQconst, ssa.OpAMD64ANDLconst, ssa.OpAMD64ANDWconst, ssa.OpAMD64ANDBconst,
 		ssa.OpAMD64ORQconst, ssa.OpAMD64ORLconst, ssa.OpAMD64ORWconst, ssa.OpAMD64ORBconst,
 		ssa.OpAMD64XORQconst, ssa.OpAMD64XORLconst, ssa.OpAMD64XORWconst, ssa.OpAMD64XORBconst,
-		ssa.OpAMD64SUBQconst, ssa.OpAMD64SUBLconst, ssa.OpAMD64SUBWconst, ssa.OpAMD64SUBBconst,
-		ssa.OpAMD64SHLQconst, ssa.OpAMD64SHLLconst, ssa.OpAMD64SHLWconst, ssa.OpAMD64SHLBconst,
-		ssa.OpAMD64SHRQconst, ssa.OpAMD64SHRLconst, ssa.OpAMD64SHRWconst, ssa.OpAMD64SHRBconst,
-		ssa.OpAMD64SARQconst, ssa.OpAMD64SARLconst, ssa.OpAMD64SARWconst, ssa.OpAMD64SARBconst,
-		ssa.OpAMD64ROLQconst, ssa.OpAMD64ROLLconst, ssa.OpAMD64ROLWconst, ssa.OpAMD64ROLBconst:
+		ssa.OpAMD64SUBBconst, ssa.OpAMD64SHLQconst, ssa.OpAMD64SHLLconst, ssa.OpAMD64SHLWconst,
+		ssa.OpAMD64SHLBconst, ssa.OpAMD64SHRQconst, ssa.OpAMD64SHRLconst, ssa.OpAMD64SHRWconst,
+		ssa.OpAMD64SHRBconst, ssa.OpAMD64SARQconst, ssa.OpAMD64SARLconst, ssa.OpAMD64SARWconst,
+		ssa.OpAMD64SARBconst, ssa.OpAMD64ROLQconst, ssa.OpAMD64ROLLconst, ssa.OpAMD64ROLWconst,
+		ssa.OpAMD64ROLBconst:
 		// This code compensates for the fact that the register allocator
 		// doesn't understand 2-address instructions yet.  TODO: fix that.
 		x := regnum(v.Args[0])