draw: add a fast path for NRGBA src images.

benchmark                      old ns/op      new ns/op      delta
BenchmarkScaleSrcNRGBA         14142583       2043782        -85.55%
BenchmarkTformABSrcNRGBA       9846421        1993564        -79.75%
BenchmarkTformCRSrcNRGBA       62041569       13866457       -77.65%

Change-Id: I1edf699dfc6436c0da7e3ab221684406ab1e362f
Reviewed-on: https://go-review.googlesource.com/7793
Reviewed-by: Rob Pike <r@golang.org>
diff --git a/draw/gen.go b/draw/gen.go
index 1450346..4048e19 100644
--- a/draw/gen.go
+++ b/draw/gen.go
@@ -215,7 +215,7 @@
 		switch d.dType {
 		default:
 			return ";"
-		case "*image.Gray", "*image.RGBA":
+		case "*image.RGBA":
 			return "d := dst.PixOffset(dr.Min.X+adr.Min.X, dr.Min.Y+int(dy))"
 		}
 
@@ -223,7 +223,7 @@
 		switch d.dType {
 		default:
 			return ";"
-		case "*image.Gray", "*image.RGBA":
+		case "*image.RGBA":
 			return "d := dst.PixOffset(dr.Min.X+int(dx), dr.Min.Y+adr.Min.Y)"
 		}
 
@@ -386,7 +386,7 @@
 		switch d.sType {
 		default:
 			log.Fatalf("bad sType %q", d.sType)
-		case "image.Image", "*image.NRGBA", "*image.Uniform", "*image.YCbCr": // TODO: separate code for concrete types.
+		case "image.Image", "*image.Uniform", "*image.YCbCr": // TODO: separate code for concrete types.
 			fmt.Fprintf(buf, "%sr%s, %sg%s, %sb%s, %sa%s := "+
 				"src.At(%s, %s).RGBA()\n",
 				lhs, tmp, lhs, tmp, lhs, tmp, lhs, tmp,
@@ -400,6 +400,20 @@
 				lhs, args[0], args[1],
 				lhs, tmp, lhs,
 			)
+		case "*image.NRGBA":
+			// TODO: the multiply by 0x101 could be dropped when the next step is a shift right by 8,
+			// but the division by 0xff would then round differently (off by one for some inputs).
+			fmt.Fprintf(buf, "%si := src.PixOffset(%s, %s)\n"+
+				"%sa%s := uint32(src.Pix[%si+3]) * 0x101\n"+
+				"%sr%s := uint32(src.Pix[%si+0]) * %sa%s / 0xff\n"+
+				"%sg%s := uint32(src.Pix[%si+1]) * %sa%s / 0xff\n"+
+				"%sb%s := uint32(src.Pix[%si+2]) * %sa%s / 0xff\n",
+				lhs, args[0], args[1],
+				lhs, tmp, lhs,
+				lhs, tmp, lhs, lhs, tmp,
+				lhs, tmp, lhs, lhs, tmp,
+				lhs, tmp, lhs, lhs, tmp,
+			)
 		case "*image.RGBA":
 			// TODO: there's no need to multiply by 0x101 if the next thing
 			// we're going to do is shift right by 8.
diff --git a/draw/impl.go b/draw/impl.go
index 02eaabf..06b8432 100644
--- a/draw/impl.go
+++ b/draw/impl.go
@@ -139,7 +139,11 @@
 		d := dst.PixOffset(dr.Min.X+adr.Min.X, dr.Min.Y+int(dy))
 		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
 			sx := (2*uint64(dx) + 1) * sw / dw2
-			pr, pg, pb, pa := src.At(sr.Min.X+int(sx), sr.Min.Y+int(sy)).RGBA()
+			pi := src.PixOffset(sr.Min.X+int(sx), sr.Min.Y+int(sy))
+			pa := uint32(src.Pix[pi+3]) * 0x101
+			pr := uint32(src.Pix[pi+0]) * pa / 0xff
+			pg := uint32(src.Pix[pi+1]) * pa / 0xff
+			pb := uint32(src.Pix[pi+2]) * pa / 0xff
 			dst.Pix[d+0] = uint8(uint32(pr) >> 8)
 			dst.Pix[d+1] = uint8(uint32(pg) >> 8)
 			dst.Pix[d+2] = uint8(uint32(pb) >> 8)
@@ -341,7 +345,11 @@
 			if !(image.Point{sx0, sy0}).In(sr) {
 				continue
 			}
-			pr, pg, pb, pa := src.At(sx0, sy0).RGBA()
+			pi := src.PixOffset(sx0, sy0)
+			pa := uint32(src.Pix[pi+3]) * 0x101
+			pr := uint32(src.Pix[pi+0]) * pa / 0xff
+			pg := uint32(src.Pix[pi+1]) * pa / 0xff
+			pb := uint32(src.Pix[pi+2]) * pa / 0xff
 			dst.Pix[d+0] = uint8(uint32(pr) >> 8)
 			dst.Pix[d+1] = uint8(uint32(pg) >> 8)
 			dst.Pix[d+2] = uint8(uint32(pb) >> 8)
@@ -724,12 +732,20 @@
 				xFrac0, xFrac1 = 1, 0
 			}
 
-			s00ru, s00gu, s00bu, s00au := src.At(sr.Min.X+int(sx0), sr.Min.Y+int(sy0)).RGBA()
+			s00i := src.PixOffset(sr.Min.X+int(sx0), sr.Min.Y+int(sy0))
+			s00au := uint32(src.Pix[s00i+3]) * 0x101
+			s00ru := uint32(src.Pix[s00i+0]) * s00au / 0xff
+			s00gu := uint32(src.Pix[s00i+1]) * s00au / 0xff
+			s00bu := uint32(src.Pix[s00i+2]) * s00au / 0xff
 			s00r := float64(s00ru)
 			s00g := float64(s00gu)
 			s00b := float64(s00bu)
 			s00a := float64(s00au)
-			s10ru, s10gu, s10bu, s10au := src.At(sr.Min.X+int(sx1), sr.Min.Y+int(sy0)).RGBA()
+			s10i := src.PixOffset(sr.Min.X+int(sx1), sr.Min.Y+int(sy0))
+			s10au := uint32(src.Pix[s10i+3]) * 0x101
+			s10ru := uint32(src.Pix[s10i+0]) * s10au / 0xff
+			s10gu := uint32(src.Pix[s10i+1]) * s10au / 0xff
+			s10bu := uint32(src.Pix[s10i+2]) * s10au / 0xff
 			s10r := float64(s10ru)
 			s10g := float64(s10gu)
 			s10b := float64(s10bu)
@@ -738,12 +754,20 @@
 			s10g = xFrac1*s00g + xFrac0*s10g
 			s10b = xFrac1*s00b + xFrac0*s10b
 			s10a = xFrac1*s00a + xFrac0*s10a
-			s01ru, s01gu, s01bu, s01au := src.At(sr.Min.X+int(sx0), sr.Min.Y+int(sy1)).RGBA()
+			s01i := src.PixOffset(sr.Min.X+int(sx0), sr.Min.Y+int(sy1))
+			s01au := uint32(src.Pix[s01i+3]) * 0x101
+			s01ru := uint32(src.Pix[s01i+0]) * s01au / 0xff
+			s01gu := uint32(src.Pix[s01i+1]) * s01au / 0xff
+			s01bu := uint32(src.Pix[s01i+2]) * s01au / 0xff
 			s01r := float64(s01ru)
 			s01g := float64(s01gu)
 			s01b := float64(s01bu)
 			s01a := float64(s01au)
-			s11ru, s11gu, s11bu, s11au := src.At(sr.Min.X+int(sx1), sr.Min.Y+int(sy1)).RGBA()
+			s11i := src.PixOffset(sr.Min.X+int(sx1), sr.Min.Y+int(sy1))
+			s11au := uint32(src.Pix[s11i+3]) * 0x101
+			s11ru := uint32(src.Pix[s11i+0]) * s11au / 0xff
+			s11gu := uint32(src.Pix[s11i+1]) * s11au / 0xff
+			s11bu := uint32(src.Pix[s11i+2]) * s11au / 0xff
 			s11r := float64(s11ru)
 			s11g := float64(s11gu)
 			s11b := float64(s11bu)
@@ -1520,12 +1544,20 @@
 				yFrac0, yFrac1 = 1, 0
 			}
 
-			s00ru, s00gu, s00bu, s00au := src.At(sx0, sy0).RGBA()
+			s00i := src.PixOffset(sx0, sy0)
+			s00au := uint32(src.Pix[s00i+3]) * 0x101
+			s00ru := uint32(src.Pix[s00i+0]) * s00au / 0xff
+			s00gu := uint32(src.Pix[s00i+1]) * s00au / 0xff
+			s00bu := uint32(src.Pix[s00i+2]) * s00au / 0xff
 			s00r := float64(s00ru)
 			s00g := float64(s00gu)
 			s00b := float64(s00bu)
 			s00a := float64(s00au)
-			s10ru, s10gu, s10bu, s10au := src.At(sx1, sy0).RGBA()
+			s10i := src.PixOffset(sx1, sy0)
+			s10au := uint32(src.Pix[s10i+3]) * 0x101
+			s10ru := uint32(src.Pix[s10i+0]) * s10au / 0xff
+			s10gu := uint32(src.Pix[s10i+1]) * s10au / 0xff
+			s10bu := uint32(src.Pix[s10i+2]) * s10au / 0xff
 			s10r := float64(s10ru)
 			s10g := float64(s10gu)
 			s10b := float64(s10bu)
@@ -1534,12 +1566,20 @@
 			s10g = xFrac1*s00g + xFrac0*s10g
 			s10b = xFrac1*s00b + xFrac0*s10b
 			s10a = xFrac1*s00a + xFrac0*s10a
-			s01ru, s01gu, s01bu, s01au := src.At(sx0, sy1).RGBA()
+			s01i := src.PixOffset(sx0, sy1)
+			s01au := uint32(src.Pix[s01i+3]) * 0x101
+			s01ru := uint32(src.Pix[s01i+0]) * s01au / 0xff
+			s01gu := uint32(src.Pix[s01i+1]) * s01au / 0xff
+			s01bu := uint32(src.Pix[s01i+2]) * s01au / 0xff
 			s01r := float64(s01ru)
 			s01g := float64(s01gu)
 			s01b := float64(s01bu)
 			s01a := float64(s01au)
-			s11ru, s11gu, s11bu, s11au := src.At(sx1, sy1).RGBA()
+			s11i := src.PixOffset(sx1, sy1)
+			s11au := uint32(src.Pix[s11i+3]) * 0x101
+			s11ru := uint32(src.Pix[s11i+0]) * s11au / 0xff
+			s11gu := uint32(src.Pix[s11i+1]) * s11au / 0xff
+			s11bu := uint32(src.Pix[s11i+2]) * s11au / 0xff
 			s11r := float64(s11ru)
 			s11g := float64(s11gu)
 			s11b := float64(s11bu)
@@ -2371,7 +2411,11 @@
 		for _, s := range z.horizontal.sources {
 			var pr, pg, pb, pa float64
 			for _, c := range z.horizontal.contribs[s.i:s.j] {
-				pru, pgu, pbu, pau := src.At(sr.Min.X+int(c.coord), sr.Min.Y+int(y)).RGBA()
+				pi := src.PixOffset(sr.Min.X+int(c.coord), sr.Min.Y+int(y))
+				pau := uint32(src.Pix[pi+3]) * 0x101
+				pru := uint32(src.Pix[pi+0]) * pau / 0xff
+				pgu := uint32(src.Pix[pi+1]) * pau / 0xff
+				pbu := uint32(src.Pix[pi+2]) * pau / 0xff
 				pr += float64(pru) * c.weight
 				pg += float64(pgu) * c.weight
 				pb += float64(pbu) * c.weight
@@ -2768,7 +2812,11 @@
 			for ky := iy; ky < jy; ky++ {
 				yWeight := yWeights[ky-iy]
 				for kx := ix; kx < jx; kx++ {
-					pru, pgu, pbu, pau := src.At(kx, ky).RGBA()
+					pi := src.PixOffset(kx, ky)
+					pau := uint32(src.Pix[pi+3]) * 0x101
+					pru := uint32(src.Pix[pi+0]) * pau / 0xff
+					pgu := uint32(src.Pix[pi+1]) * pau / 0xff
+					pbu := uint32(src.Pix[pi+2]) * pau / 0xff
 					pr += float64(pru) * xWeights[kx-ix] * yWeight
 					pg += float64(pgu) * xWeights[kx-ix] * yWeight
 					pb += float64(pbu) * xWeights[kx-ix] * yWeight