draw: optimize Kernel.Transform.

benchmark                      old ns/op      new ns/op      delta
BenchmarkTformCRSrcGray        6111610        5344117        -12.56%
BenchmarkTformCRSrcNRGBA       62070281       59295178       -4.47%
BenchmarkTformCRSrcRGBA        13840290       10612547       -23.32%
BenchmarkTformCRSrcUniform     591637         587621         -0.68%
BenchmarkTformCRSrcYCbCr       72219184       69404747       -3.90%

As of current origin/master, Gray and RGBA have fast paths but the other src
image types do not. Those other types carry more per-pixel overhead, so their
relative improvement is smaller.

Change-Id: Ibbae91cd3cb3c139efb1dcc8fda1cb6432505189
Reviewed-on: https://go-review.googlesource.com/7794
Reviewed-by: Rob Pike <r@golang.org>
diff --git a/draw/gen.go b/draw/gen.go
index 4048e19..79ef527 100644
--- a/draw/gen.go
+++ b/draw/gen.go
@@ -958,7 +958,8 @@
 					for ky := iy; ky < jy; ky++ {
 						yWeight := yWeights[ky - iy]
 						for kx := ix; kx < jx; kx++ {
-							p += $srcf[kx, ky] * xWeights[kx - ix] * yWeight
+							w := xWeights[kx - ix] * yWeight
+							p += $srcf[kx, ky] * w
 						}
 					}
 					$outputf[dr.Min.X + int(dx), dr.Min.Y + int(dy), fffftou, p, 1]
diff --git a/draw/impl.go b/draw/impl.go
index 06b8432..89e7cd0 100644
--- a/draw/impl.go
+++ b/draw/impl.go
@@ -2719,9 +2719,10 @@
 			for ky := iy; ky < jy; ky++ {
 				yWeight := yWeights[ky-iy]
 				for kx := ix; kx < jx; kx++ {
+					w := xWeights[kx-ix] * yWeight
 					pi := src.PixOffset(kx, ky)
 					pru := uint32(src.Pix[pi]) * 0x101
-					pr += float64(pru) * xWeights[kx-ix] * yWeight
+					pr += float64(pru) * w
 				}
 			}
 			out := uint8(fffftou(pr) >> 8)
@@ -2812,15 +2813,16 @@
 			for ky := iy; ky < jy; ky++ {
 				yWeight := yWeights[ky-iy]
 				for kx := ix; kx < jx; kx++ {
+					w := xWeights[kx-ix] * yWeight
 					pi := src.PixOffset(kx, ky)
 					pau := uint32(src.Pix[pi+3]) * 0x101
 					pru := uint32(src.Pix[pi+0]) * pau / 0xff
 					pgu := uint32(src.Pix[pi+1]) * pau / 0xff
 					pbu := uint32(src.Pix[pi+2]) * pau / 0xff
-					pr += float64(pru) * xWeights[kx-ix] * yWeight
-					pg += float64(pgu) * xWeights[kx-ix] * yWeight
-					pb += float64(pbu) * xWeights[kx-ix] * yWeight
-					pa += float64(pau) * xWeights[kx-ix] * yWeight
+					pr += float64(pru) * w
+					pg += float64(pgu) * w
+					pb += float64(pbu) * w
+					pa += float64(pau) * w
 				}
 			}
 			dst.Pix[d+0] = uint8(fffftou(pr) >> 8)
@@ -2910,15 +2912,16 @@
 			for ky := iy; ky < jy; ky++ {
 				yWeight := yWeights[ky-iy]
 				for kx := ix; kx < jx; kx++ {
+					w := xWeights[kx-ix] * yWeight
 					pi := src.PixOffset(kx, ky)
 					pru := uint32(src.Pix[pi+0]) * 0x101
 					pgu := uint32(src.Pix[pi+1]) * 0x101
 					pbu := uint32(src.Pix[pi+2]) * 0x101
 					pau := uint32(src.Pix[pi+3]) * 0x101
-					pr += float64(pru) * xWeights[kx-ix] * yWeight
-					pg += float64(pgu) * xWeights[kx-ix] * yWeight
-					pb += float64(pbu) * xWeights[kx-ix] * yWeight
-					pa += float64(pau) * xWeights[kx-ix] * yWeight
+					pr += float64(pru) * w
+					pg += float64(pgu) * w
+					pb += float64(pbu) * w
+					pa += float64(pau) * w
 				}
 			}
 			dst.Pix[d+0] = uint8(fffftou(pr) >> 8)
@@ -3008,11 +3011,12 @@
 			for ky := iy; ky < jy; ky++ {
 				yWeight := yWeights[ky-iy]
 				for kx := ix; kx < jx; kx++ {
+					w := xWeights[kx-ix] * yWeight
 					pru, pgu, pbu, pau := src.At(kx, ky).RGBA()
-					pr += float64(pru) * xWeights[kx-ix] * yWeight
-					pg += float64(pgu) * xWeights[kx-ix] * yWeight
-					pb += float64(pbu) * xWeights[kx-ix] * yWeight
-					pa += float64(pau) * xWeights[kx-ix] * yWeight
+					pr += float64(pru) * w
+					pg += float64(pgu) * w
+					pb += float64(pbu) * w
+					pa += float64(pau) * w
 				}
 			}
 			dst.Pix[d+0] = uint8(fffftou(pr) >> 8)
@@ -3102,11 +3106,12 @@
 			for ky := iy; ky < jy; ky++ {
 				yWeight := yWeights[ky-iy]
 				for kx := ix; kx < jx; kx++ {
+					w := xWeights[kx-ix] * yWeight
 					pru, pgu, pbu, pau := src.At(kx, ky).RGBA()
-					pr += float64(pru) * xWeights[kx-ix] * yWeight
-					pg += float64(pgu) * xWeights[kx-ix] * yWeight
-					pb += float64(pbu) * xWeights[kx-ix] * yWeight
-					pa += float64(pau) * xWeights[kx-ix] * yWeight
+					pr += float64(pru) * w
+					pg += float64(pgu) * w
+					pb += float64(pbu) * w
+					pa += float64(pau) * w
 				}
 			}
 			dst.Pix[d+0] = uint8(fffftou(pr) >> 8)
@@ -3196,11 +3201,12 @@
 			for ky := iy; ky < jy; ky++ {
 				yWeight := yWeights[ky-iy]
 				for kx := ix; kx < jx; kx++ {
+					w := xWeights[kx-ix] * yWeight
 					pru, pgu, pbu, pau := src.At(kx, ky).RGBA()
-					pr += float64(pru) * xWeights[kx-ix] * yWeight
-					pg += float64(pgu) * xWeights[kx-ix] * yWeight
-					pb += float64(pbu) * xWeights[kx-ix] * yWeight
-					pa += float64(pau) * xWeights[kx-ix] * yWeight
+					pr += float64(pru) * w
+					pg += float64(pgu) * w
+					pb += float64(pbu) * w
+					pa += float64(pau) * w
 				}
 			}
 			dst.Pix[d+0] = uint8(fffftou(pr) >> 8)
@@ -3290,11 +3296,12 @@
 			for ky := iy; ky < jy; ky++ {
 				yWeight := yWeights[ky-iy]
 				for kx := ix; kx < jx; kx++ {
+					w := xWeights[kx-ix] * yWeight
 					pru, pgu, pbu, pau := src.At(kx, ky).RGBA()
-					pr += float64(pru) * xWeights[kx-ix] * yWeight
-					pg += float64(pgu) * xWeights[kx-ix] * yWeight
-					pb += float64(pbu) * xWeights[kx-ix] * yWeight
-					pa += float64(pau) * xWeights[kx-ix] * yWeight
+					pr += float64(pru) * w
+					pg += float64(pgu) * w
+					pb += float64(pbu) * w
+					pa += float64(pau) * w
 				}
 			}
 			dst.Pix[d+0] = uint8(fffftou(pr) >> 8)
@@ -3384,11 +3391,12 @@
 			for ky := iy; ky < jy; ky++ {
 				yWeight := yWeights[ky-iy]
 				for kx := ix; kx < jx; kx++ {
+					w := xWeights[kx-ix] * yWeight
 					pru, pgu, pbu, pau := src.At(kx, ky).RGBA()
-					pr += float64(pru) * xWeights[kx-ix] * yWeight
-					pg += float64(pgu) * xWeights[kx-ix] * yWeight
-					pb += float64(pbu) * xWeights[kx-ix] * yWeight
-					pa += float64(pau) * xWeights[kx-ix] * yWeight
+					pr += float64(pru) * w
+					pg += float64(pgu) * w
+					pb += float64(pbu) * w
+					pa += float64(pau) * w
 				}
 			}
 			dst.Pix[d+0] = uint8(fffftou(pr) >> 8)
@@ -3478,11 +3486,12 @@
 			for ky := iy; ky < jy; ky++ {
 				yWeight := yWeights[ky-iy]
 				for kx := ix; kx < jx; kx++ {
+					w := xWeights[kx-ix] * yWeight
 					pru, pgu, pbu, pau := src.At(kx, ky).RGBA()
-					pr += float64(pru) * xWeights[kx-ix] * yWeight
-					pg += float64(pgu) * xWeights[kx-ix] * yWeight
-					pb += float64(pbu) * xWeights[kx-ix] * yWeight
-					pa += float64(pau) * xWeights[kx-ix] * yWeight
+					pr += float64(pru) * w
+					pg += float64(pgu) * w
+					pb += float64(pbu) * w
+					pa += float64(pau) * w
 				}
 			}
 			dst.Pix[d+0] = uint8(fffftou(pr) >> 8)
@@ -3573,11 +3582,12 @@
 			for ky := iy; ky < jy; ky++ {
 				yWeight := yWeights[ky-iy]
 				for kx := ix; kx < jx; kx++ {
+					w := xWeights[kx-ix] * yWeight
 					pru, pgu, pbu, pau := src.At(kx, ky).RGBA()
-					pr += float64(pru) * xWeights[kx-ix] * yWeight
-					pg += float64(pgu) * xWeights[kx-ix] * yWeight
-					pb += float64(pbu) * xWeights[kx-ix] * yWeight
-					pa += float64(pau) * xWeights[kx-ix] * yWeight
+					pr += float64(pru) * w
+					pg += float64(pgu) * w
+					pb += float64(pbu) * w
+					pa += float64(pau) * w
 				}
 			}
 			dstColorRGBA64.R = fffftou(pr)