draw: add a fast path for Gray src images.

benchmark                      old ns/op      new ns/op      delta
BenchmarkScaleSrcGray          9296680        552705         -94.05%
BenchmarkTformABSrcGray        6323894        817986         -87.07%
BenchmarkTformCRSrcGray        39229583       4193194        -89.31%

Change-Id: Ie7d43dfe323d49b245b47c3206b5aad2b50cb7fb
Reviewed-on: https://go-review.googlesource.com/7711
Reviewed-by: Rob Pike <r@golang.org>
diff --git a/draw/gen.go b/draw/gen.go
index b429465..eccafee 100644
--- a/draw/gen.go
+++ b/draw/gen.go
@@ -197,7 +197,7 @@
 		switch d.dType {
 		default:
 			return ";"
-		case "*image.RGBA":
+		case "*image.Gray", "*image.RGBA":
 			return "d := dst.PixOffset(dr.Min.X+adr.Min.X, dr.Min.Y+int(dy))"
 		}
 
@@ -205,7 +205,7 @@
 		switch d.dType {
 		default:
 			return ";"
-		case "*image.RGBA":
+		case "*image.Gray", "*image.RGBA":
 			return "d := dst.PixOffset(dr.Min.X+int(dx), dr.Min.Y+adr.Min.Y)"
 		}
 
@@ -214,16 +214,24 @@
 		if len(args) != 4 {
 			return ""
 		}
-		return fmt.Sprintf(""+
-			"%sr = %s*%sr + %s*%sr\n"+
-			"%sg = %s*%sg + %s*%sg\n"+
-			"%sb = %s*%sb + %s*%sb\n"+
-			"%sa = %s*%sa + %s*%sa",
-			args[3], args[0], args[1], args[2], args[3],
-			args[3], args[0], args[1], args[2], args[3],
-			args[3], args[0], args[1], args[2], args[3],
-			args[3], args[0], args[1], args[2], args[3],
-		)
+		switch d.sType {
+		default:
+			return fmt.Sprintf(""+
+				"%sr = %s*%sr + %s*%sr\n"+
+				"%sg = %s*%sg + %s*%sg\n"+
+				"%sb = %s*%sb + %s*%sb\n"+
+				"%sa = %s*%sa + %s*%sa",
+				args[3], args[0], args[1], args[2], args[3],
+				args[3], args[0], args[1], args[2], args[3],
+				args[3], args[0], args[1], args[2], args[3],
+				args[3], args[0], args[1], args[2], args[3],
+			)
+		case "*image.Gray":
+			return fmt.Sprintf(""+
+				"%sr = %s*%sr + %s*%sr",
+				args[3], args[0], args[1], args[2], args[3],
+			)
+		}
 
 	case "outputu":
 		args, _ := splitArgs(suffix)
@@ -234,23 +242,49 @@
 		default:
 			log.Fatalf("bad dType %q", d.dType)
 		case "Image":
-			return fmt.Sprintf(""+
-				"dstColorRGBA64.R = uint16(%sr)\n"+
-				"dstColorRGBA64.G = uint16(%sg)\n"+
-				"dstColorRGBA64.B = uint16(%sb)\n"+
-				"dstColorRGBA64.A = uint16(%sa)\n"+
-				"dst.Set(%s, %s, dstColor)",
-				args[2], args[2], args[2], args[2],
-				args[0], args[1],
-			)
+			switch d.sType {
+			default:
+				return fmt.Sprintf(""+
+					"dstColorRGBA64.R = uint16(%sr)\n"+
+					"dstColorRGBA64.G = uint16(%sg)\n"+
+					"dstColorRGBA64.B = uint16(%sb)\n"+
+					"dstColorRGBA64.A = uint16(%sa)\n"+
+					"dst.Set(%s, %s, dstColor)",
+					args[2], args[2], args[2], args[2],
+					args[0], args[1],
+				)
+			case "*image.Gray":
+				return fmt.Sprintf(""+
+					"out := uint16(%sr)\n"+
+					"dstColorRGBA64.R = out\n"+
+					"dstColorRGBA64.G = out\n"+
+					"dstColorRGBA64.B = out\n"+
+					"dstColorRGBA64.A = 0xffff\n"+
+					"dst.Set(%s, %s, dstColor)",
+					args[2],
+					args[0], args[1],
+				)
+			}
 		case "*image.RGBA":
-			return fmt.Sprintf(""+
-				"dst.Pix[d+0] = uint8(uint32(%sr) >> 8)\n"+
-				"dst.Pix[d+1] = uint8(uint32(%sg) >> 8)\n"+
-				"dst.Pix[d+2] = uint8(uint32(%sb) >> 8)\n"+
-				"dst.Pix[d+3] = uint8(uint32(%sa) >> 8)",
-				args[2], args[2], args[2], args[2],
-			)
+			switch d.sType {
+			default:
+				return fmt.Sprintf(""+
+					"dst.Pix[d+0] = uint8(uint32(%sr) >> 8)\n"+
+					"dst.Pix[d+1] = uint8(uint32(%sg) >> 8)\n"+
+					"dst.Pix[d+2] = uint8(uint32(%sb) >> 8)\n"+
+					"dst.Pix[d+3] = uint8(uint32(%sa) >> 8)",
+					args[2], args[2], args[2], args[2],
+				)
+			case "*image.Gray":
+				return fmt.Sprintf(""+
+					"out := uint8(uint32(%sr) >> 8)\n"+
+					"dst.Pix[d+0] = out\n"+
+					"dst.Pix[d+1] = out\n"+
+					"dst.Pix[d+2] = out\n"+
+					"dst.Pix[d+3] = 0xff",
+					args[2],
+				)
+			}
 		}
 
 	case "outputf":
@@ -263,29 +297,55 @@
 		default:
 			log.Fatalf("bad dType %q", d.dType)
 		case "Image":
-			ret = fmt.Sprintf(""+
-				"dstColorRGBA64.R = %s(%sr * %s)\n"+
-				"dstColorRGBA64.G = %s(%sg * %s)\n"+
-				"dstColorRGBA64.B = %s(%sb * %s)\n"+
-				"dstColorRGBA64.A = %s(%sa * %s)\n"+
-				"dst.Set(%s, %s, dstColor)",
-				args[2], args[3], args[4],
-				args[2], args[3], args[4],
-				args[2], args[3], args[4],
-				args[2], args[3], args[4],
-				args[0], args[1],
-			)
+			switch d.sType {
+			default:
+				ret = fmt.Sprintf(""+
+					"dstColorRGBA64.R = %s(%sr * %s)\n"+
+					"dstColorRGBA64.G = %s(%sg * %s)\n"+
+					"dstColorRGBA64.B = %s(%sb * %s)\n"+
+					"dstColorRGBA64.A = %s(%sa * %s)\n"+
+					"dst.Set(%s, %s, dstColor)",
+					args[2], args[3], args[4],
+					args[2], args[3], args[4],
+					args[2], args[3], args[4],
+					args[2], args[3], args[4],
+					args[0], args[1],
+				)
+			case "*image.Gray":
+				ret = fmt.Sprintf(""+
+					"out := %s(%sr * %s)\n"+
+					"dstColorRGBA64.R = out\n"+
+					"dstColorRGBA64.G = out\n"+
+					"dstColorRGBA64.B = out\n"+
+					"dstColorRGBA64.A = 0xffff\n"+
+					"dst.Set(%s, %s, dstColor)",
+					args[2], args[3], args[4],
+					args[0], args[1],
+				)
+			}
 		case "*image.RGBA":
-			ret = fmt.Sprintf(""+
-				"dst.Pix[d+0] = uint8(%s(%sr * %s) >> 8)\n"+
-				"dst.Pix[d+1] = uint8(%s(%sg * %s) >> 8)\n"+
-				"dst.Pix[d+2] = uint8(%s(%sb * %s) >> 8)\n"+
-				"dst.Pix[d+3] = uint8(%s(%sa * %s) >> 8)",
-				args[2], args[3], args[4],
-				args[2], args[3], args[4],
-				args[2], args[3], args[4],
-				args[2], args[3], args[4],
-			)
+			switch d.sType {
+			default:
+				ret = fmt.Sprintf(""+
+					"dst.Pix[d+0] = uint8(%s(%sr * %s) >> 8)\n"+
+					"dst.Pix[d+1] = uint8(%s(%sg * %s) >> 8)\n"+
+					"dst.Pix[d+2] = uint8(%s(%sb * %s) >> 8)\n"+
+					"dst.Pix[d+3] = uint8(%s(%sa * %s) >> 8)",
+					args[2], args[3], args[4],
+					args[2], args[3], args[4],
+					args[2], args[3], args[4],
+					args[2], args[3], args[4],
+				)
+			case "*image.Gray":
+				ret = fmt.Sprintf(""+
+					"out := uint8(%s(%sr * %s) >> 8)\n"+
+					"dst.Pix[d+0] = out\n"+
+					"dst.Pix[d+1] = out\n"+
+					"dst.Pix[d+2] = out\n"+
+					"dst.Pix[d+3] = 0xff",
+					args[2], args[3], args[4],
+				)
+			}
 		}
 		return strings.Replace(ret, " * 1)", ")", -1)
 
@@ -308,12 +368,20 @@
 		switch d.sType {
 		default:
 			log.Fatalf("bad sType %q", d.sType)
-		case "image.Image", "*image.Gray", "*image.NRGBA", "*image.Uniform", "*image.YCbCr": // TODO: separate code for concrete types.
+		case "image.Image", "*image.NRGBA", "*image.Uniform", "*image.YCbCr": // TODO: separate code for concrete types.
 			fmt.Fprintf(buf, "%sr%s, %sg%s, %sb%s, %sa%s := "+
 				"src.At(%s, %s).RGBA()\n",
 				lhs, tmp, lhs, tmp, lhs, tmp, lhs, tmp,
 				args[0], args[1],
 			)
+		case "*image.Gray":
+			// TODO: there's no need to multiply by 0x101 if the next thing
+			// we're going to do is shift right by 8.
+			fmt.Fprintf(buf, "%si := src.PixOffset(%s, %s)\n"+
+				"%sr%s := uint32(src.Pix[%si]) * 0x101\n",
+				lhs, args[0], args[1],
+				lhs, tmp, lhs,
+			)
 		case "*image.RGBA":
 			// TODO: there's no need to multiply by 0x101 if the next thing
 			// we're going to do is shift right by 8.
@@ -331,16 +399,24 @@
 		}
 
 		if dollar == "srcf" {
-			fmt.Fprintf(buf, ""+
-				"%sr %s float64(%sru)%s\n"+
-				"%sg %s float64(%sgu)%s\n"+
-				"%sb %s float64(%sbu)%s\n"+
-				"%sa %s float64(%sau)%s\n",
-				lhs, eqOp, lhs, extra,
-				lhs, eqOp, lhs, extra,
-				lhs, eqOp, lhs, extra,
-				lhs, eqOp, lhs, extra,
-			)
+			switch d.sType {
+			default:
+				fmt.Fprintf(buf, ""+
+					"%sr %s float64(%sru)%s\n"+
+					"%sg %s float64(%sgu)%s\n"+
+					"%sb %s float64(%sbu)%s\n"+
+					"%sa %s float64(%sau)%s\n",
+					lhs, eqOp, lhs, extra,
+					lhs, eqOp, lhs, extra,
+					lhs, eqOp, lhs, extra,
+					lhs, eqOp, lhs, extra,
+				)
+			case "*image.Gray":
+				fmt.Fprintf(buf, ""+
+					"%sr %s float64(%sru)%s\n",
+					lhs, eqOp, lhs, extra,
+				)
+			}
 		}
 
 		return strings.TrimSpace(buf.String())
@@ -362,6 +438,27 @@
 			return strings.Replace(suffix, "for dy, s", "for _, s", 1)
 		}
 		return suffix
+
+	case "tweakP":
+		if d.sType == "*image.Gray" {
+			if strings.HasPrefix(strings.TrimSpace(suffix), "pa * ") {
+				return "1,"
+			}
+			return "pr,"
+		}
+		return suffix
+
+	case "tweakPr":
+		if d.sType == "*image.Gray" {
+			return "pr *= s.invTotalWeightFFFF"
+		}
+		return ";"
+
+	case "tweakVarP":
+		if d.sType == "*image.Gray" {
+			return strings.Replace(suffix, "var pr, pg, pb, pa", "var pr", 1)
+		}
+		return suffix
 	}
 	return ""
 }
@@ -690,15 +787,16 @@
 			t := 0
 			for y := int32(0); y < z.sh; y++ {
 				for _, s := range z.horizontal.sources {
-					var pr, pg, pb, pa float64
+					$tweakVarP var pr, pg, pb, pa float64
 					for _, c := range z.horizontal.contribs[s.i:s.j] {
 						p += $srcf[sr.Min.X + int(c.coord), sr.Min.Y + int(y)] * c.weight
 					}
+					$tweakPr
 					tmp[t] = [4]float64{
-						pr * s.invTotalWeightFFFF,
-						pg * s.invTotalWeightFFFF,
-						pb * s.invTotalWeightFFFF,
-						pa * s.invTotalWeightFFFF,
+						$tweakP pr * s.invTotalWeightFFFF,
+						$tweakP pg * s.invTotalWeightFFFF,
+						$tweakP pb * s.invTotalWeightFFFF,
+						$tweakP pa * s.invTotalWeightFFFF,
 					}
 					t++
 				}
@@ -804,7 +902,7 @@
 						yWeights[y] /= totalYWeight
 					}
 
-					var pr, pg, pb, pa float64
+					$tweakVarP var pr, pg, pb, pa float64
 					for ky := iy; ky < jy; ky++ {
 						yWeight := yWeights[ky - iy]
 						for kx := ix; kx < jx; kx++ {
diff --git a/draw/impl.go b/draw/impl.go
index 8ebdd6f..cc0ed77 100644
--- a/draw/impl.go
+++ b/draw/impl.go
@@ -96,11 +96,13 @@
 		d := dst.PixOffset(dr.Min.X+adr.Min.X, dr.Min.Y+int(dy))
 		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
 			sx := (2*uint64(dx) + 1) * sw / dw2
-			pr, pg, pb, pa := src.At(sr.Min.X+int(sx), sr.Min.Y+int(sy)).RGBA()
-			dst.Pix[d+0] = uint8(uint32(pr) >> 8)
-			dst.Pix[d+1] = uint8(uint32(pg) >> 8)
-			dst.Pix[d+2] = uint8(uint32(pb) >> 8)
-			dst.Pix[d+3] = uint8(uint32(pa) >> 8)
+			pi := src.PixOffset(sr.Min.X+int(sx), sr.Min.Y+int(sy))
+			pr := uint32(src.Pix[pi]) * 0x101
+			out := uint8(uint32(pr) >> 8)
+			dst.Pix[d+0] = out
+			dst.Pix[d+1] = out
+			dst.Pix[d+2] = out
+			dst.Pix[d+3] = 0xff
 		}
 	}
 }
@@ -237,11 +239,13 @@
 			if !(image.Point{sx0, sy0}).In(sr) {
 				continue
 			}
-			pr, pg, pb, pa := src.At(sx0, sy0).RGBA()
-			dst.Pix[d+0] = uint8(uint32(pr) >> 8)
-			dst.Pix[d+1] = uint8(uint32(pg) >> 8)
-			dst.Pix[d+2] = uint8(uint32(pb) >> 8)
-			dst.Pix[d+3] = uint8(uint32(pa) >> 8)
+			pi := src.PixOffset(sx0, sy0)
+			pr := uint32(src.Pix[pi]) * 0x101
+			out := uint8(uint32(pr) >> 8)
+			dst.Pix[d+0] = out
+			dst.Pix[d+1] = out
+			dst.Pix[d+2] = out
+			dst.Pix[d+3] = 0xff
 		}
 	}
 }
@@ -493,42 +497,26 @@
 				xFrac0, xFrac1 = 1, 0
 			}
 
-			s00ru, s00gu, s00bu, s00au := src.At(sr.Min.X+int(sx0), sr.Min.Y+int(sy0)).RGBA()
+			s00i := src.PixOffset(sr.Min.X+int(sx0), sr.Min.Y+int(sy0))
+			s00ru := uint32(src.Pix[s00i]) * 0x101
 			s00r := float64(s00ru)
-			s00g := float64(s00gu)
-			s00b := float64(s00bu)
-			s00a := float64(s00au)
-			s10ru, s10gu, s10bu, s10au := src.At(sr.Min.X+int(sx1), sr.Min.Y+int(sy0)).RGBA()
+			s10i := src.PixOffset(sr.Min.X+int(sx1), sr.Min.Y+int(sy0))
+			s10ru := uint32(src.Pix[s10i]) * 0x101
 			s10r := float64(s10ru)
-			s10g := float64(s10gu)
-			s10b := float64(s10bu)
-			s10a := float64(s10au)
 			s10r = xFrac1*s00r + xFrac0*s10r
-			s10g = xFrac1*s00g + xFrac0*s10g
-			s10b = xFrac1*s00b + xFrac0*s10b
-			s10a = xFrac1*s00a + xFrac0*s10a
-			s01ru, s01gu, s01bu, s01au := src.At(sr.Min.X+int(sx0), sr.Min.Y+int(sy1)).RGBA()
+			s01i := src.PixOffset(sr.Min.X+int(sx0), sr.Min.Y+int(sy1))
+			s01ru := uint32(src.Pix[s01i]) * 0x101
 			s01r := float64(s01ru)
-			s01g := float64(s01gu)
-			s01b := float64(s01bu)
-			s01a := float64(s01au)
-			s11ru, s11gu, s11bu, s11au := src.At(sr.Min.X+int(sx1), sr.Min.Y+int(sy1)).RGBA()
+			s11i := src.PixOffset(sr.Min.X+int(sx1), sr.Min.Y+int(sy1))
+			s11ru := uint32(src.Pix[s11i]) * 0x101
 			s11r := float64(s11ru)
-			s11g := float64(s11gu)
-			s11b := float64(s11bu)
-			s11a := float64(s11au)
 			s11r = xFrac1*s01r + xFrac0*s11r
-			s11g = xFrac1*s01g + xFrac0*s11g
-			s11b = xFrac1*s01b + xFrac0*s11b
-			s11a = xFrac1*s01a + xFrac0*s11a
 			s11r = yFrac1*s10r + yFrac0*s11r
-			s11g = yFrac1*s10g + yFrac0*s11g
-			s11b = yFrac1*s10b + yFrac0*s11b
-			s11a = yFrac1*s10a + yFrac0*s11a
-			dst.Pix[d+0] = uint8(uint32(s11r) >> 8)
-			dst.Pix[d+1] = uint8(uint32(s11g) >> 8)
-			dst.Pix[d+2] = uint8(uint32(s11b) >> 8)
-			dst.Pix[d+3] = uint8(uint32(s11a) >> 8)
+			out := uint8(uint32(s11r) >> 8)
+			dst.Pix[d+0] = out
+			dst.Pix[d+1] = out
+			dst.Pix[d+2] = out
+			dst.Pix[d+3] = 0xff
 		}
 	}
 }
@@ -1066,42 +1054,26 @@
 				yFrac0, yFrac1 = 1, 0
 			}
 
-			s00ru, s00gu, s00bu, s00au := src.At(sx0, sy0).RGBA()
+			s00i := src.PixOffset(sx0, sy0)
+			s00ru := uint32(src.Pix[s00i]) * 0x101
 			s00r := float64(s00ru)
-			s00g := float64(s00gu)
-			s00b := float64(s00bu)
-			s00a := float64(s00au)
-			s10ru, s10gu, s10bu, s10au := src.At(sx1, sy0).RGBA()
+			s10i := src.PixOffset(sx1, sy0)
+			s10ru := uint32(src.Pix[s10i]) * 0x101
 			s10r := float64(s10ru)
-			s10g := float64(s10gu)
-			s10b := float64(s10bu)
-			s10a := float64(s10au)
 			s10r = xFrac1*s00r + xFrac0*s10r
-			s10g = xFrac1*s00g + xFrac0*s10g
-			s10b = xFrac1*s00b + xFrac0*s10b
-			s10a = xFrac1*s00a + xFrac0*s10a
-			s01ru, s01gu, s01bu, s01au := src.At(sx0, sy1).RGBA()
+			s01i := src.PixOffset(sx0, sy1)
+			s01ru := uint32(src.Pix[s01i]) * 0x101
 			s01r := float64(s01ru)
-			s01g := float64(s01gu)
-			s01b := float64(s01bu)
-			s01a := float64(s01au)
-			s11ru, s11gu, s11bu, s11au := src.At(sx1, sy1).RGBA()
+			s11i := src.PixOffset(sx1, sy1)
+			s11ru := uint32(src.Pix[s11i]) * 0x101
 			s11r := float64(s11ru)
-			s11g := float64(s11gu)
-			s11b := float64(s11bu)
-			s11a := float64(s11au)
 			s11r = xFrac1*s01r + xFrac0*s11r
-			s11g = xFrac1*s01g + xFrac0*s11g
-			s11b = xFrac1*s01b + xFrac0*s11b
-			s11a = xFrac1*s01a + xFrac0*s11a
 			s11r = yFrac1*s10r + yFrac0*s11r
-			s11g = yFrac1*s10g + yFrac0*s11g
-			s11b = yFrac1*s10b + yFrac0*s11b
-			s11a = yFrac1*s10a + yFrac0*s11a
-			dst.Pix[d+0] = uint8(uint32(s11r) >> 8)
-			dst.Pix[d+1] = uint8(uint32(s11g) >> 8)
-			dst.Pix[d+2] = uint8(uint32(s11b) >> 8)
-			dst.Pix[d+3] = uint8(uint32(s11a) >> 8)
+			out := uint8(uint32(s11r) >> 8)
+			dst.Pix[d+0] = out
+			dst.Pix[d+1] = out
+			dst.Pix[d+2] = out
+			dst.Pix[d+3] = 0xff
 		}
 	}
 }
@@ -1709,19 +1681,18 @@
 	t := 0
 	for y := int32(0); y < z.sh; y++ {
 		for _, s := range z.horizontal.sources {
-			var pr, pg, pb, pa float64
+			var pr float64
 			for _, c := range z.horizontal.contribs[s.i:s.j] {
-				pru, pgu, pbu, pau := src.At(sr.Min.X+int(c.coord), sr.Min.Y+int(y)).RGBA()
+				pi := src.PixOffset(sr.Min.X+int(c.coord), sr.Min.Y+int(y))
+				pru := uint32(src.Pix[pi]) * 0x101
 				pr += float64(pru) * c.weight
-				pg += float64(pgu) * c.weight
-				pb += float64(pbu) * c.weight
-				pa += float64(pau) * c.weight
 			}
+			pr *= s.invTotalWeightFFFF
 			tmp[t] = [4]float64{
-				pr * s.invTotalWeightFFFF,
-				pg * s.invTotalWeightFFFF,
-				pb * s.invTotalWeightFFFF,
-				pa * s.invTotalWeightFFFF,
+				pr,
+				pr,
+				pr,
+				1,
 			}
 			t++
 		}
@@ -1965,21 +1936,20 @@
 				yWeights[y] /= totalYWeight
 			}
 
-			var pr, pg, pb, pa float64
+			var pr float64
 			for ky := iy; ky < jy; ky++ {
 				yWeight := yWeights[ky-iy]
 				for kx := ix; kx < jx; kx++ {
-					pru, pgu, pbu, pau := src.At(kx, ky).RGBA()
+					pi := src.PixOffset(kx, ky)
+					pru := uint32(src.Pix[pi]) * 0x101
 					pr += float64(pru) * xWeights[kx-ix] * yWeight
-					pg += float64(pgu) * xWeights[kx-ix] * yWeight
-					pb += float64(pbu) * xWeights[kx-ix] * yWeight
-					pa += float64(pau) * xWeights[kx-ix] * yWeight
 				}
 			}
-			dst.Pix[d+0] = uint8(fffftou(pr) >> 8)
-			dst.Pix[d+1] = uint8(fffftou(pg) >> 8)
-			dst.Pix[d+2] = uint8(fffftou(pb) >> 8)
-			dst.Pix[d+3] = uint8(fffftou(pa) >> 8)
+			out := uint8(fffftou(pr) >> 8)
+			dst.Pix[d+0] = out
+			dst.Pix[d+1] = out
+			dst.Pix[d+2] = out
+			dst.Pix[d+3] = 0xff
 		}
 	}
 }