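draw: optimize some multiply-by-zeroes in Kernel.Transform.

When a kernel weight is zero (either yWeight, or the product
xWeights[kx-ix] * yWeight), skip the source pixel lookup and the
accumulation for that term.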

benchmark                      old ns/op     new ns/op     delta
BenchmarkTformCRSrcGray        5096041       4820642       -5.40%
BenchmarkTformCRSrcNRGBA       10476578      8414331       -19.68%
BenchmarkTformCRSrcRGBA        10361135      7954413       -23.23%
BenchmarkTformCRSrcYCbCr       11952218      9824899       -17.80%

Change-Id: I8b4cfe68ecae85e447ae65ceecf185261445a8a2
Reviewed-on: https://go-review.googlesource.com/7991
Reviewed-by: Rob Pike <r@golang.org>
diff --git a/draw/gen.go b/draw/gen.go
index b670521..3ff3d84 100644
--- a/draw/gen.go
+++ b/draw/gen.go
@@ -1131,10 +1131,12 @@
 
 					var pr, pg, pb, pa float64 $tweakVarP
 					for ky := iy; ky < jy; ky++ {
-						yWeight := yWeights[ky - iy]
-						for kx := ix; kx < jx; kx++ {
-							w := xWeights[kx - ix] * yWeight
-							p += $srcf[kx, ky] * w
+						if yWeight := yWeights[ky - iy]; yWeight != 0 {
+							for kx := ix; kx < jx; kx++ {
+								if w := xWeights[kx - ix] * yWeight; w != 0 {
+									p += $srcf[kx, ky] * w
+								}
+							}
 						}
 					}
 					$outputf[dr.Min.X + int(dx), dr.Min.Y + int(dy), fffftou, p, 1]
diff --git a/draw/impl.go b/draw/impl.go
index 9b4ca07..f144531 100644
--- a/draw/impl.go
+++ b/draw/impl.go
@@ -3706,12 +3706,14 @@
 
 			var pr float64
 			for ky := iy; ky < jy; ky++ {
-				yWeight := yWeights[ky-iy]
-				for kx := ix; kx < jx; kx++ {
-					w := xWeights[kx-ix] * yWeight
-					pi := (ky-src.Rect.Min.Y)*src.Stride + (kx - src.Rect.Min.X)
-					pru := uint32(src.Pix[pi]) * 0x101
-					pr += float64(pru) * w
+				if yWeight := yWeights[ky-iy]; yWeight != 0 {
+					for kx := ix; kx < jx; kx++ {
+						if w := xWeights[kx-ix] * yWeight; w != 0 {
+							pi := (ky-src.Rect.Min.Y)*src.Stride + (kx - src.Rect.Min.X)
+							pru := uint32(src.Pix[pi]) * 0x101
+							pr += float64(pru) * w
+						}
+					}
 				}
 			}
 			out := uint8(fffftou(pr) >> 8)
@@ -3803,18 +3805,20 @@
 
 			var pr, pg, pb, pa float64
 			for ky := iy; ky < jy; ky++ {
-				yWeight := yWeights[ky-iy]
-				for kx := ix; kx < jx; kx++ {
-					w := xWeights[kx-ix] * yWeight
-					pi := (ky-src.Rect.Min.Y)*src.Stride + (kx-src.Rect.Min.X)*4
-					pau := uint32(src.Pix[pi+3]) * 0x101
-					pru := uint32(src.Pix[pi+0]) * pau / 0xff
-					pgu := uint32(src.Pix[pi+1]) * pau / 0xff
-					pbu := uint32(src.Pix[pi+2]) * pau / 0xff
-					pr += float64(pru) * w
-					pg += float64(pgu) * w
-					pb += float64(pbu) * w
-					pa += float64(pau) * w
+				if yWeight := yWeights[ky-iy]; yWeight != 0 {
+					for kx := ix; kx < jx; kx++ {
+						if w := xWeights[kx-ix] * yWeight; w != 0 {
+							pi := (ky-src.Rect.Min.Y)*src.Stride + (kx-src.Rect.Min.X)*4
+							pau := uint32(src.Pix[pi+3]) * 0x101
+							pru := uint32(src.Pix[pi+0]) * pau / 0xff
+							pgu := uint32(src.Pix[pi+1]) * pau / 0xff
+							pbu := uint32(src.Pix[pi+2]) * pau / 0xff
+							pr += float64(pru) * w
+							pg += float64(pgu) * w
+							pb += float64(pbu) * w
+							pa += float64(pau) * w
+						}
+					}
 				}
 			}
 			dst.Pix[d+0] = uint8(fffftou(pr) >> 8)
@@ -3905,18 +3909,20 @@
 
 			var pr, pg, pb, pa float64
 			for ky := iy; ky < jy; ky++ {
-				yWeight := yWeights[ky-iy]
-				for kx := ix; kx < jx; kx++ {
-					w := xWeights[kx-ix] * yWeight
-					pi := (ky-src.Rect.Min.Y)*src.Stride + (kx-src.Rect.Min.X)*4
-					pru := uint32(src.Pix[pi+0]) * 0x101
-					pgu := uint32(src.Pix[pi+1]) * 0x101
-					pbu := uint32(src.Pix[pi+2]) * 0x101
-					pau := uint32(src.Pix[pi+3]) * 0x101
-					pr += float64(pru) * w
-					pg += float64(pgu) * w
-					pb += float64(pbu) * w
-					pa += float64(pau) * w
+				if yWeight := yWeights[ky-iy]; yWeight != 0 {
+					for kx := ix; kx < jx; kx++ {
+						if w := xWeights[kx-ix] * yWeight; w != 0 {
+							pi := (ky-src.Rect.Min.Y)*src.Stride + (kx-src.Rect.Min.X)*4
+							pru := uint32(src.Pix[pi+0]) * 0x101
+							pgu := uint32(src.Pix[pi+1]) * 0x101
+							pbu := uint32(src.Pix[pi+2]) * 0x101
+							pau := uint32(src.Pix[pi+3]) * 0x101
+							pr += float64(pru) * w
+							pg += float64(pgu) * w
+							pb += float64(pbu) * w
+							pa += float64(pau) * w
+						}
+					}
 				}
 			}
 			dst.Pix[d+0] = uint8(fffftou(pr) >> 8)
@@ -4007,41 +4013,43 @@
 
 			var pr, pg, pb float64
 			for ky := iy; ky < jy; ky++ {
-				yWeight := yWeights[ky-iy]
-				for kx := ix; kx < jx; kx++ {
-					w := xWeights[kx-ix] * yWeight
-					pi := (ky-src.Rect.Min.Y)*src.YStride + (kx - src.Rect.Min.X)
-					pj := (ky-src.Rect.Min.Y)*src.CStride + (kx - src.Rect.Min.X)
+				if yWeight := yWeights[ky-iy]; yWeight != 0 {
+					for kx := ix; kx < jx; kx++ {
+						if w := xWeights[kx-ix] * yWeight; w != 0 {
+							pi := (ky-src.Rect.Min.Y)*src.YStride + (kx - src.Rect.Min.X)
+							pj := (ky-src.Rect.Min.Y)*src.CStride + (kx - src.Rect.Min.X)
 
-					// This is an inline version of image/color/ycbcr.go's func YCbCrToRGB.
-					pyy1 := int(src.Y[pi])<<16 + 1<<15
-					pcb1 := int(src.Cb[pj]) - 128
-					pcr1 := int(src.Cr[pj]) - 128
-					pr8 := (pyy1 + 91881*pcr1) >> 16
-					pg8 := (pyy1 - 22554*pcb1 - 46802*pcr1) >> 16
-					pb8 := (pyy1 + 116130*pcb1) >> 16
-					if pr8 < 0 {
-						pr8 = 0
-					} else if pr8 > 0xff {
-						pr8 = 0xff
-					}
-					if pg8 < 0 {
-						pg8 = 0
-					} else if pg8 > 0xff {
-						pg8 = 0xff
-					}
-					if pb8 < 0 {
-						pb8 = 0
-					} else if pb8 > 0xff {
-						pb8 = 0xff
-					}
+							// This is an inline version of image/color/ycbcr.go's func YCbCrToRGB.
+							pyy1 := int(src.Y[pi])<<16 + 1<<15
+							pcb1 := int(src.Cb[pj]) - 128
+							pcr1 := int(src.Cr[pj]) - 128
+							pr8 := (pyy1 + 91881*pcr1) >> 16
+							pg8 := (pyy1 - 22554*pcb1 - 46802*pcr1) >> 16
+							pb8 := (pyy1 + 116130*pcb1) >> 16
+							if pr8 < 0 {
+								pr8 = 0
+							} else if pr8 > 0xff {
+								pr8 = 0xff
+							}
+							if pg8 < 0 {
+								pg8 = 0
+							} else if pg8 > 0xff {
+								pg8 = 0xff
+							}
+							if pb8 < 0 {
+								pb8 = 0
+							} else if pb8 > 0xff {
+								pb8 = 0xff
+							}
 
-					pru := uint32(pr8) * 0x101
-					pgu := uint32(pg8) * 0x101
-					pbu := uint32(pb8) * 0x101
-					pr += float64(pru) * w
-					pg += float64(pgu) * w
-					pb += float64(pbu) * w
+							pru := uint32(pr8) * 0x101
+							pgu := uint32(pg8) * 0x101
+							pbu := uint32(pb8) * 0x101
+							pr += float64(pru) * w
+							pg += float64(pgu) * w
+							pb += float64(pbu) * w
+						}
+					}
 				}
 			}
 			dst.Pix[d+0] = uint8(fffftou(pr) >> 8)
@@ -4132,41 +4140,43 @@
 
 			var pr, pg, pb float64
 			for ky := iy; ky < jy; ky++ {
-				yWeight := yWeights[ky-iy]
-				for kx := ix; kx < jx; kx++ {
-					w := xWeights[kx-ix] * yWeight
-					pi := (ky-src.Rect.Min.Y)*src.YStride + (kx - src.Rect.Min.X)
-					pj := (ky-src.Rect.Min.Y)*src.CStride + ((kx)/2 - src.Rect.Min.X/2)
+				if yWeight := yWeights[ky-iy]; yWeight != 0 {
+					for kx := ix; kx < jx; kx++ {
+						if w := xWeights[kx-ix] * yWeight; w != 0 {
+							pi := (ky-src.Rect.Min.Y)*src.YStride + (kx - src.Rect.Min.X)
+							pj := (ky-src.Rect.Min.Y)*src.CStride + ((kx)/2 - src.Rect.Min.X/2)
 
-					// This is an inline version of image/color/ycbcr.go's func YCbCrToRGB.
-					pyy1 := int(src.Y[pi])<<16 + 1<<15
-					pcb1 := int(src.Cb[pj]) - 128
-					pcr1 := int(src.Cr[pj]) - 128
-					pr8 := (pyy1 + 91881*pcr1) >> 16
-					pg8 := (pyy1 - 22554*pcb1 - 46802*pcr1) >> 16
-					pb8 := (pyy1 + 116130*pcb1) >> 16
-					if pr8 < 0 {
-						pr8 = 0
-					} else if pr8 > 0xff {
-						pr8 = 0xff
-					}
-					if pg8 < 0 {
-						pg8 = 0
-					} else if pg8 > 0xff {
-						pg8 = 0xff
-					}
-					if pb8 < 0 {
-						pb8 = 0
-					} else if pb8 > 0xff {
-						pb8 = 0xff
-					}
+							// This is an inline version of image/color/ycbcr.go's func YCbCrToRGB.
+							pyy1 := int(src.Y[pi])<<16 + 1<<15
+							pcb1 := int(src.Cb[pj]) - 128
+							pcr1 := int(src.Cr[pj]) - 128
+							pr8 := (pyy1 + 91881*pcr1) >> 16
+							pg8 := (pyy1 - 22554*pcb1 - 46802*pcr1) >> 16
+							pb8 := (pyy1 + 116130*pcb1) >> 16
+							if pr8 < 0 {
+								pr8 = 0
+							} else if pr8 > 0xff {
+								pr8 = 0xff
+							}
+							if pg8 < 0 {
+								pg8 = 0
+							} else if pg8 > 0xff {
+								pg8 = 0xff
+							}
+							if pb8 < 0 {
+								pb8 = 0
+							} else if pb8 > 0xff {
+								pb8 = 0xff
+							}
 
-					pru := uint32(pr8) * 0x101
-					pgu := uint32(pg8) * 0x101
-					pbu := uint32(pb8) * 0x101
-					pr += float64(pru) * w
-					pg += float64(pgu) * w
-					pb += float64(pbu) * w
+							pru := uint32(pr8) * 0x101
+							pgu := uint32(pg8) * 0x101
+							pbu := uint32(pb8) * 0x101
+							pr += float64(pru) * w
+							pg += float64(pgu) * w
+							pb += float64(pbu) * w
+						}
+					}
 				}
 			}
 			dst.Pix[d+0] = uint8(fffftou(pr) >> 8)
@@ -4257,41 +4267,43 @@
 
 			var pr, pg, pb float64
 			for ky := iy; ky < jy; ky++ {
-				yWeight := yWeights[ky-iy]
-				for kx := ix; kx < jx; kx++ {
-					w := xWeights[kx-ix] * yWeight
-					pi := (ky-src.Rect.Min.Y)*src.YStride + (kx - src.Rect.Min.X)
-					pj := ((ky)/2-src.Rect.Min.Y/2)*src.CStride + ((kx)/2 - src.Rect.Min.X/2)
+				if yWeight := yWeights[ky-iy]; yWeight != 0 {
+					for kx := ix; kx < jx; kx++ {
+						if w := xWeights[kx-ix] * yWeight; w != 0 {
+							pi := (ky-src.Rect.Min.Y)*src.YStride + (kx - src.Rect.Min.X)
+							pj := ((ky)/2-src.Rect.Min.Y/2)*src.CStride + ((kx)/2 - src.Rect.Min.X/2)
 
-					// This is an inline version of image/color/ycbcr.go's func YCbCrToRGB.
-					pyy1 := int(src.Y[pi])<<16 + 1<<15
-					pcb1 := int(src.Cb[pj]) - 128
-					pcr1 := int(src.Cr[pj]) - 128
-					pr8 := (pyy1 + 91881*pcr1) >> 16
-					pg8 := (pyy1 - 22554*pcb1 - 46802*pcr1) >> 16
-					pb8 := (pyy1 + 116130*pcb1) >> 16
-					if pr8 < 0 {
-						pr8 = 0
-					} else if pr8 > 0xff {
-						pr8 = 0xff
-					}
-					if pg8 < 0 {
-						pg8 = 0
-					} else if pg8 > 0xff {
-						pg8 = 0xff
-					}
-					if pb8 < 0 {
-						pb8 = 0
-					} else if pb8 > 0xff {
-						pb8 = 0xff
-					}
+							// This is an inline version of image/color/ycbcr.go's func YCbCrToRGB.
+							pyy1 := int(src.Y[pi])<<16 + 1<<15
+							pcb1 := int(src.Cb[pj]) - 128
+							pcr1 := int(src.Cr[pj]) - 128
+							pr8 := (pyy1 + 91881*pcr1) >> 16
+							pg8 := (pyy1 - 22554*pcb1 - 46802*pcr1) >> 16
+							pb8 := (pyy1 + 116130*pcb1) >> 16
+							if pr8 < 0 {
+								pr8 = 0
+							} else if pr8 > 0xff {
+								pr8 = 0xff
+							}
+							if pg8 < 0 {
+								pg8 = 0
+							} else if pg8 > 0xff {
+								pg8 = 0xff
+							}
+							if pb8 < 0 {
+								pb8 = 0
+							} else if pb8 > 0xff {
+								pb8 = 0xff
+							}
 
-					pru := uint32(pr8) * 0x101
-					pgu := uint32(pg8) * 0x101
-					pbu := uint32(pb8) * 0x101
-					pr += float64(pru) * w
-					pg += float64(pgu) * w
-					pb += float64(pbu) * w
+							pru := uint32(pr8) * 0x101
+							pgu := uint32(pg8) * 0x101
+							pbu := uint32(pb8) * 0x101
+							pr += float64(pru) * w
+							pg += float64(pgu) * w
+							pb += float64(pbu) * w
+						}
+					}
 				}
 			}
 			dst.Pix[d+0] = uint8(fffftou(pr) >> 8)
@@ -4382,41 +4394,43 @@
 
 			var pr, pg, pb float64
 			for ky := iy; ky < jy; ky++ {
-				yWeight := yWeights[ky-iy]
-				for kx := ix; kx < jx; kx++ {
-					w := xWeights[kx-ix] * yWeight
-					pi := (ky-src.Rect.Min.Y)*src.YStride + (kx - src.Rect.Min.X)
-					pj := ((ky)/2-src.Rect.Min.Y/2)*src.CStride + (kx - src.Rect.Min.X)
+				if yWeight := yWeights[ky-iy]; yWeight != 0 {
+					for kx := ix; kx < jx; kx++ {
+						if w := xWeights[kx-ix] * yWeight; w != 0 {
+							pi := (ky-src.Rect.Min.Y)*src.YStride + (kx - src.Rect.Min.X)
+							pj := ((ky)/2-src.Rect.Min.Y/2)*src.CStride + (kx - src.Rect.Min.X)
 
-					// This is an inline version of image/color/ycbcr.go's func YCbCrToRGB.
-					pyy1 := int(src.Y[pi])<<16 + 1<<15
-					pcb1 := int(src.Cb[pj]) - 128
-					pcr1 := int(src.Cr[pj]) - 128
-					pr8 := (pyy1 + 91881*pcr1) >> 16
-					pg8 := (pyy1 - 22554*pcb1 - 46802*pcr1) >> 16
-					pb8 := (pyy1 + 116130*pcb1) >> 16
-					if pr8 < 0 {
-						pr8 = 0
-					} else if pr8 > 0xff {
-						pr8 = 0xff
-					}
-					if pg8 < 0 {
-						pg8 = 0
-					} else if pg8 > 0xff {
-						pg8 = 0xff
-					}
-					if pb8 < 0 {
-						pb8 = 0
-					} else if pb8 > 0xff {
-						pb8 = 0xff
-					}
+							// This is an inline version of image/color/ycbcr.go's func YCbCrToRGB.
+							pyy1 := int(src.Y[pi])<<16 + 1<<15
+							pcb1 := int(src.Cb[pj]) - 128
+							pcr1 := int(src.Cr[pj]) - 128
+							pr8 := (pyy1 + 91881*pcr1) >> 16
+							pg8 := (pyy1 - 22554*pcb1 - 46802*pcr1) >> 16
+							pb8 := (pyy1 + 116130*pcb1) >> 16
+							if pr8 < 0 {
+								pr8 = 0
+							} else if pr8 > 0xff {
+								pr8 = 0xff
+							}
+							if pg8 < 0 {
+								pg8 = 0
+							} else if pg8 > 0xff {
+								pg8 = 0xff
+							}
+							if pb8 < 0 {
+								pb8 = 0
+							} else if pb8 > 0xff {
+								pb8 = 0xff
+							}
 
-					pru := uint32(pr8) * 0x101
-					pgu := uint32(pg8) * 0x101
-					pbu := uint32(pb8) * 0x101
-					pr += float64(pru) * w
-					pg += float64(pgu) * w
-					pb += float64(pbu) * w
+							pru := uint32(pr8) * 0x101
+							pgu := uint32(pg8) * 0x101
+							pbu := uint32(pb8) * 0x101
+							pr += float64(pru) * w
+							pg += float64(pgu) * w
+							pb += float64(pbu) * w
+						}
+					}
 				}
 			}
 			dst.Pix[d+0] = uint8(fffftou(pr) >> 8)
@@ -4507,14 +4521,16 @@
 
 			var pr, pg, pb, pa float64
 			for ky := iy; ky < jy; ky++ {
-				yWeight := yWeights[ky-iy]
-				for kx := ix; kx < jx; kx++ {
-					w := xWeights[kx-ix] * yWeight
-					pru, pgu, pbu, pau := src.At(kx, ky).RGBA()
-					pr += float64(pru) * w
-					pg += float64(pgu) * w
-					pb += float64(pbu) * w
-					pa += float64(pau) * w
+				if yWeight := yWeights[ky-iy]; yWeight != 0 {
+					for kx := ix; kx < jx; kx++ {
+						if w := xWeights[kx-ix] * yWeight; w != 0 {
+							pru, pgu, pbu, pau := src.At(kx, ky).RGBA()
+							pr += float64(pru) * w
+							pg += float64(pgu) * w
+							pb += float64(pbu) * w
+							pa += float64(pau) * w
+						}
+					}
 				}
 			}
 			dst.Pix[d+0] = uint8(fffftou(pr) >> 8)
@@ -4606,14 +4622,16 @@
 
 			var pr, pg, pb, pa float64
 			for ky := iy; ky < jy; ky++ {
-				yWeight := yWeights[ky-iy]
-				for kx := ix; kx < jx; kx++ {
-					w := xWeights[kx-ix] * yWeight
-					pru, pgu, pbu, pau := src.At(kx, ky).RGBA()
-					pr += float64(pru) * w
-					pg += float64(pgu) * w
-					pb += float64(pbu) * w
-					pa += float64(pau) * w
+				if yWeight := yWeights[ky-iy]; yWeight != 0 {
+					for kx := ix; kx < jx; kx++ {
+						if w := xWeights[kx-ix] * yWeight; w != 0 {
+							pru, pgu, pbu, pau := src.At(kx, ky).RGBA()
+							pr += float64(pru) * w
+							pg += float64(pgu) * w
+							pb += float64(pbu) * w
+							pa += float64(pau) * w
+						}
+					}
 				}
 			}
 			dstColorRGBA64.R = fffftou(pr)