draw: clip scaling to the dst bounds.

This is necessary for the upcoming RGBA dst fast path. The RGBA.Set slow
path clips automatically, but accessing RGBA.Pix directly does not, so
the scalers must clip to the dst bounds themselves.

Benchmarks look like noise to me:
benchmark                     old ns/op      new ns/op      delta
BenchmarkScaleLargeDownNN     6212108        6131166        -1.30%
BenchmarkScaleLargeDownAB     15586042       15656681       +0.45%
BenchmarkScaleLargeDownBL     1518783517     1508124217     -0.70%
BenchmarkScaleLargeDownCR     2998969089     2978114154     -0.70%
BenchmarkScaleDownNN          1821187        1809314        -0.65%
BenchmarkScaleDownAB          4286983        4248974        -0.89%
BenchmarkScaleDownBL          29396818       30181926       +2.67%
BenchmarkScaleDownCR          56441945       57952417       +2.68%
BenchmarkScaleUpNN            90325384       89734496       -0.65%
BenchmarkScaleUpAB            211613922      211625435      +0.01%
BenchmarkScaleUpBL            119730880      120817135      +0.91%
BenchmarkScaleUpCR            178592665      182305702      +2.08%
BenchmarkScaleSrcNRGBA        13271034       13210760       -0.45%
BenchmarkScaleSrcRGBA         13082234       12997551       -0.65%
BenchmarkScaleSrcUniform      4003966        3934184        -1.74%
BenchmarkScaleSrcYCbCr        15939182       15900123       -0.25%

Change-Id: Ibf2843bb3c4eb695b58030e7314053c669533016
Reviewed-on: https://go-review.googlesource.com/6073
Reviewed-by: Rob Pike <r@golang.org>
diff --git a/draw/gen.go b/draw/gen.go
index 8fef308..387b879 100644
--- a/draw/gen.go
+++ b/draw/gen.go
@@ -347,16 +347,21 @@
 			if z.dw <= 0 || z.dh <= 0 || z.sw <= 0 || z.sh <= 0 {
 				return
 			}
-			$switch z.scale_$dTypeRN_$sTypeRN(dst, dp, src, sp)
+			// dr is the affected destination pixels, relative to dp.
+			dr := dst.Bounds().Sub(dp).Intersect(image.Rectangle{Max: image.Point{int(z.dw), int(z.dh)}})
+			if dr.Empty() {
+				return
+			}
+			$switch z.scale_$dTypeRN_$sTypeRN(dst, dp, dr, src, sp)
 		}
 	`
 
 	codeNNLeaf = `
-		func (z *nnScaler) scale_$dTypeRN_$sTypeRN(dst $dType, dp image.Point, src $sType, sp image.Point) {
+		func (z *nnScaler) scale_$dTypeRN_$sTypeRN(dst $dType, dp image.Point, dr image.Rectangle, src $sType, sp image.Point) {
 			$dstColorDecl
-			for dy := int32(0); dy < z.dh; dy++ {
+			for dy := int32(dr.Min.Y); dy < int32(dr.Max.Y); dy++ {
 				sy := (2*uint64(dy) + 1) * uint64(z.sh) / (2 * uint64(z.dh))
-				for dx := int32(0); dx < z.dw; dx++ {
+				for dx := int32(dr.Min.X); dx < int32(dr.Max.X); dx++ {
 					sx := (2*uint64(dx) + 1) * uint64(z.sw) / (2 * uint64(z.dw))
 					p := $srcu[sx, sy]
 					$outputu[dx, dy, p]
@@ -366,11 +371,11 @@
 	`
 
 	codeABLLeaf = `
-		func (z *ablScaler) scale_$dTypeRN_$sTypeRN(dst $dType, dp image.Point, src $sType, sp image.Point) {
+		func (z *ablScaler) scale_$dTypeRN_$sTypeRN(dst $dType, dp image.Point, dr image.Rectangle, src $sType, sp image.Point) {
 			yscale := float64(z.sh) / float64(z.dh)
 			xscale := float64(z.sw) / float64(z.dw)
 			$dstColorDecl
-			for dy := int32(0); dy < z.dh; dy++ {
+			for dy := int32(dr.Min.Y); dy < int32(dr.Max.Y); dy++ {
 				sy := (float64(dy)+0.5)*yscale - 0.5
 				sy0 := int32(sy)
 				yFrac0 := sy - float64(sy0)
@@ -383,7 +388,7 @@
 					sy1 = sy0
 					yFrac0, yFrac1 = 1, 0
 				}
-				for dx := int32(0); dx < z.dw; dx++ {
+				for dx := int32(dr.Min.X); dx < int32(dr.Max.X); dx++ {
 					sx := (float64(dx)+0.5)*xscale - 0.5
 					sx0 := int32(sx)
 					xFrac0 := sx - float64(sx0)
@@ -414,13 +419,18 @@
 			if z.dw <= 0 || z.dh <= 0 || z.sw <= 0 || z.sh <= 0 {
 				return
 			}
+			// dr is the affected destination pixels, relative to dp.
+			dr := dst.Bounds().Sub(dp).Intersect(image.Rectangle{Max: image.Point{int(z.dw), int(z.dh)}})
+			if dr.Empty() {
+				return
+			}
 			// Create a temporary buffer:
 			// scaleX distributes the source image's columns over the temporary image.
 			// scaleY distributes the temporary image's rows over the destination image.
 			// TODO: is it worth having a sync.Pool for this temporary buffer?
 			tmp := make([][4]float64, z.dw*z.sh)
 			$switchS z.scaleX_$sTypeRN(tmp, src, sp)
-			$switchD z.scaleY_$dTypeRN(dst, dp, tmp)
+			$switchD z.scaleY_$dTypeRN(dst, dp, dr, tmp)
 		}
 	`
 
@@ -446,19 +456,19 @@
 	`
 
 	codeKernelLeafY = `
-		func (z *kernelScaler) scaleY_$dTypeRN(dst $dType, dp image.Point, tmp [][4]float64) {
+		func (z *kernelScaler) scaleY_$dTypeRN(dst $dType, dp image.Point, dr image.Rectangle, tmp [][4]float64) {
 			$dstColorDecl
-			for x := int32(0); x < z.dw; x++ {
-				for y, s := range z.vertical.sources {
+			for dx := int32(dr.Min.X); dx < int32(dr.Max.X); dx++ {
+				for dy, s := range z.vertical.sources[dr.Min.Y:dr.Max.Y] {
 					var pr, pg, pb, pa float64
 					for _, c := range z.vertical.contribs[s.i:s.j] {
-						p := &tmp[c.coord*z.dw+x]
+						p := &tmp[c.coord*z.dw+dx]
 						pr += p[0] * c.weight
 						pg += p[1] * c.weight
 						pb += p[2] * c.weight
 						pa += p[3] * c.weight
 					}
-					$outputf[x, y, p, s.invTotalWeight]
+					$outputf[dx, dr.Min.Y+dy, p, s.invTotalWeight]
 				}
 			}
 		}
diff --git a/draw/impl.go b/draw/impl.go
index 2c18717..0cfee91 100644
--- a/draw/impl.go
+++ b/draw/impl.go
@@ -11,34 +11,39 @@
 	if z.dw <= 0 || z.dh <= 0 || z.sw <= 0 || z.sh <= 0 {
 		return
 	}
+	// dr is the affected destination pixels, relative to dp.
+	dr := dst.Bounds().Sub(dp).Intersect(image.Rectangle{Max: image.Point{int(z.dw), int(z.dh)}})
+	if dr.Empty() {
+		return
+	}
 	switch dst := dst.(type) {
 	case *image.RGBA:
 		switch src := src.(type) {
 		case *image.NRGBA:
-			z.scale_RGBA_NRGBA(dst, dp, src, sp)
+			z.scale_RGBA_NRGBA(dst, dp, dr, src, sp)
 		case *image.RGBA:
-			z.scale_RGBA_RGBA(dst, dp, src, sp)
+			z.scale_RGBA_RGBA(dst, dp, dr, src, sp)
 		case *image.Uniform:
-			z.scale_RGBA_Uniform(dst, dp, src, sp)
+			z.scale_RGBA_Uniform(dst, dp, dr, src, sp)
 		case *image.YCbCr:
-			z.scale_RGBA_YCbCr(dst, dp, src, sp)
+			z.scale_RGBA_YCbCr(dst, dp, dr, src, sp)
 		default:
-			z.scale_RGBA_Image(dst, dp, src, sp)
+			z.scale_RGBA_Image(dst, dp, dr, src, sp)
 		}
 	default:
 		switch src := src.(type) {
 		default:
-			z.scale_Image_Image(dst, dp, src, sp)
+			z.scale_Image_Image(dst, dp, dr, src, sp)
 		}
 	}
 }
 
-func (z *nnScaler) scale_RGBA_NRGBA(dst *image.RGBA, dp image.Point, src *image.NRGBA, sp image.Point) {
+func (z *nnScaler) scale_RGBA_NRGBA(dst *image.RGBA, dp image.Point, dr image.Rectangle, src *image.NRGBA, sp image.Point) {
 	dstColorRGBA64 := &color.RGBA64{}
 	dstColor := color.Color(dstColorRGBA64)
-	for dy := int32(0); dy < z.dh; dy++ {
+	for dy := int32(dr.Min.Y); dy < int32(dr.Max.Y); dy++ {
 		sy := (2*uint64(dy) + 1) * uint64(z.sh) / (2 * uint64(z.dh))
-		for dx := int32(0); dx < z.dw; dx++ {
+		for dx := int32(dr.Min.X); dx < int32(dr.Max.X); dx++ {
 			sx := (2*uint64(dx) + 1) * uint64(z.sw) / (2 * uint64(z.dw))
 			pr, pg, pb, pa := src.At(sp.X+int(sx), sp.Y+int(sy)).RGBA()
 			dstColorRGBA64.R = uint16(pr)
@@ -50,12 +55,12 @@
 	}
 }
 
-func (z *nnScaler) scale_RGBA_RGBA(dst *image.RGBA, dp image.Point, src *image.RGBA, sp image.Point) {
+func (z *nnScaler) scale_RGBA_RGBA(dst *image.RGBA, dp image.Point, dr image.Rectangle, src *image.RGBA, sp image.Point) {
 	dstColorRGBA64 := &color.RGBA64{}
 	dstColor := color.Color(dstColorRGBA64)
-	for dy := int32(0); dy < z.dh; dy++ {
+	for dy := int32(dr.Min.Y); dy < int32(dr.Max.Y); dy++ {
 		sy := (2*uint64(dy) + 1) * uint64(z.sh) / (2 * uint64(z.dh))
-		for dx := int32(0); dx < z.dw; dx++ {
+		for dx := int32(dr.Min.X); dx < int32(dr.Max.X); dx++ {
 			sx := (2*uint64(dx) + 1) * uint64(z.sw) / (2 * uint64(z.dw))
 			pr, pg, pb, pa := src.At(sp.X+int(sx), sp.Y+int(sy)).RGBA()
 			dstColorRGBA64.R = uint16(pr)
@@ -67,12 +72,12 @@
 	}
 }
 
-func (z *nnScaler) scale_RGBA_Uniform(dst *image.RGBA, dp image.Point, src *image.Uniform, sp image.Point) {
+func (z *nnScaler) scale_RGBA_Uniform(dst *image.RGBA, dp image.Point, dr image.Rectangle, src *image.Uniform, sp image.Point) {
 	dstColorRGBA64 := &color.RGBA64{}
 	dstColor := color.Color(dstColorRGBA64)
-	for dy := int32(0); dy < z.dh; dy++ {
+	for dy := int32(dr.Min.Y); dy < int32(dr.Max.Y); dy++ {
 		sy := (2*uint64(dy) + 1) * uint64(z.sh) / (2 * uint64(z.dh))
-		for dx := int32(0); dx < z.dw; dx++ {
+		for dx := int32(dr.Min.X); dx < int32(dr.Max.X); dx++ {
 			sx := (2*uint64(dx) + 1) * uint64(z.sw) / (2 * uint64(z.dw))
 			pr, pg, pb, pa := src.At(sp.X+int(sx), sp.Y+int(sy)).RGBA()
 			dstColorRGBA64.R = uint16(pr)
@@ -84,12 +89,12 @@
 	}
 }
 
-func (z *nnScaler) scale_RGBA_YCbCr(dst *image.RGBA, dp image.Point, src *image.YCbCr, sp image.Point) {
+func (z *nnScaler) scale_RGBA_YCbCr(dst *image.RGBA, dp image.Point, dr image.Rectangle, src *image.YCbCr, sp image.Point) {
 	dstColorRGBA64 := &color.RGBA64{}
 	dstColor := color.Color(dstColorRGBA64)
-	for dy := int32(0); dy < z.dh; dy++ {
+	for dy := int32(dr.Min.Y); dy < int32(dr.Max.Y); dy++ {
 		sy := (2*uint64(dy) + 1) * uint64(z.sh) / (2 * uint64(z.dh))
-		for dx := int32(0); dx < z.dw; dx++ {
+		for dx := int32(dr.Min.X); dx < int32(dr.Max.X); dx++ {
 			sx := (2*uint64(dx) + 1) * uint64(z.sw) / (2 * uint64(z.dw))
 			pr, pg, pb, pa := src.At(sp.X+int(sx), sp.Y+int(sy)).RGBA()
 			dstColorRGBA64.R = uint16(pr)
@@ -101,12 +106,12 @@
 	}
 }
 
-func (z *nnScaler) scale_RGBA_Image(dst *image.RGBA, dp image.Point, src image.Image, sp image.Point) {
+func (z *nnScaler) scale_RGBA_Image(dst *image.RGBA, dp image.Point, dr image.Rectangle, src image.Image, sp image.Point) {
 	dstColorRGBA64 := &color.RGBA64{}
 	dstColor := color.Color(dstColorRGBA64)
-	for dy := int32(0); dy < z.dh; dy++ {
+	for dy := int32(dr.Min.Y); dy < int32(dr.Max.Y); dy++ {
 		sy := (2*uint64(dy) + 1) * uint64(z.sh) / (2 * uint64(z.dh))
-		for dx := int32(0); dx < z.dw; dx++ {
+		for dx := int32(dr.Min.X); dx < int32(dr.Max.X); dx++ {
 			sx := (2*uint64(dx) + 1) * uint64(z.sw) / (2 * uint64(z.dw))
 			pr, pg, pb, pa := src.At(sp.X+int(sx), sp.Y+int(sy)).RGBA()
 			dstColorRGBA64.R = uint16(pr)
@@ -118,12 +123,12 @@
 	}
 }
 
-func (z *nnScaler) scale_Image_Image(dst Image, dp image.Point, src image.Image, sp image.Point) {
+func (z *nnScaler) scale_Image_Image(dst Image, dp image.Point, dr image.Rectangle, src image.Image, sp image.Point) {
 	dstColorRGBA64 := &color.RGBA64{}
 	dstColor := color.Color(dstColorRGBA64)
-	for dy := int32(0); dy < z.dh; dy++ {
+	for dy := int32(dr.Min.Y); dy < int32(dr.Max.Y); dy++ {
 		sy := (2*uint64(dy) + 1) * uint64(z.sh) / (2 * uint64(z.dh))
-		for dx := int32(0); dx < z.dw; dx++ {
+		for dx := int32(dr.Min.X); dx < int32(dr.Max.X); dx++ {
 			sx := (2*uint64(dx) + 1) * uint64(z.sw) / (2 * uint64(z.dw))
 			pr, pg, pb, pa := src.At(sp.X+int(sx), sp.Y+int(sy)).RGBA()
 			dstColorRGBA64.R = uint16(pr)
@@ -139,34 +144,39 @@
 	if z.dw <= 0 || z.dh <= 0 || z.sw <= 0 || z.sh <= 0 {
 		return
 	}
+	// dr is the affected destination pixels, relative to dp.
+	dr := dst.Bounds().Sub(dp).Intersect(image.Rectangle{Max: image.Point{int(z.dw), int(z.dh)}})
+	if dr.Empty() {
+		return
+	}
 	switch dst := dst.(type) {
 	case *image.RGBA:
 		switch src := src.(type) {
 		case *image.NRGBA:
-			z.scale_RGBA_NRGBA(dst, dp, src, sp)
+			z.scale_RGBA_NRGBA(dst, dp, dr, src, sp)
 		case *image.RGBA:
-			z.scale_RGBA_RGBA(dst, dp, src, sp)
+			z.scale_RGBA_RGBA(dst, dp, dr, src, sp)
 		case *image.Uniform:
-			z.scale_RGBA_Uniform(dst, dp, src, sp)
+			z.scale_RGBA_Uniform(dst, dp, dr, src, sp)
 		case *image.YCbCr:
-			z.scale_RGBA_YCbCr(dst, dp, src, sp)
+			z.scale_RGBA_YCbCr(dst, dp, dr, src, sp)
 		default:
-			z.scale_RGBA_Image(dst, dp, src, sp)
+			z.scale_RGBA_Image(dst, dp, dr, src, sp)
 		}
 	default:
 		switch src := src.(type) {
 		default:
-			z.scale_Image_Image(dst, dp, src, sp)
+			z.scale_Image_Image(dst, dp, dr, src, sp)
 		}
 	}
 }
 
-func (z *ablScaler) scale_RGBA_NRGBA(dst *image.RGBA, dp image.Point, src *image.NRGBA, sp image.Point) {
+func (z *ablScaler) scale_RGBA_NRGBA(dst *image.RGBA, dp image.Point, dr image.Rectangle, src *image.NRGBA, sp image.Point) {
 	yscale := float64(z.sh) / float64(z.dh)
 	xscale := float64(z.sw) / float64(z.dw)
 	dstColorRGBA64 := &color.RGBA64{}
 	dstColor := color.Color(dstColorRGBA64)
-	for dy := int32(0); dy < z.dh; dy++ {
+	for dy := int32(dr.Min.Y); dy < int32(dr.Max.Y); dy++ {
 		sy := (float64(dy)+0.5)*yscale - 0.5
 		sy0 := int32(sy)
 		yFrac0 := sy - float64(sy0)
@@ -179,7 +189,7 @@
 			sy1 = sy0
 			yFrac0, yFrac1 = 1, 0
 		}
-		for dx := int32(0); dx < z.dw; dx++ {
+		for dx := int32(dr.Min.X); dx < int32(dr.Max.X); dx++ {
 			sx := (float64(dx)+0.5)*xscale - 0.5
 			sx0 := int32(sx)
 			xFrac0 := sx - float64(sx0)
@@ -233,12 +243,12 @@
 	}
 }
 
-func (z *ablScaler) scale_RGBA_RGBA(dst *image.RGBA, dp image.Point, src *image.RGBA, sp image.Point) {
+func (z *ablScaler) scale_RGBA_RGBA(dst *image.RGBA, dp image.Point, dr image.Rectangle, src *image.RGBA, sp image.Point) {
 	yscale := float64(z.sh) / float64(z.dh)
 	xscale := float64(z.sw) / float64(z.dw)
 	dstColorRGBA64 := &color.RGBA64{}
 	dstColor := color.Color(dstColorRGBA64)
-	for dy := int32(0); dy < z.dh; dy++ {
+	for dy := int32(dr.Min.Y); dy < int32(dr.Max.Y); dy++ {
 		sy := (float64(dy)+0.5)*yscale - 0.5
 		sy0 := int32(sy)
 		yFrac0 := sy - float64(sy0)
@@ -251,7 +261,7 @@
 			sy1 = sy0
 			yFrac0, yFrac1 = 1, 0
 		}
-		for dx := int32(0); dx < z.dw; dx++ {
+		for dx := int32(dr.Min.X); dx < int32(dr.Max.X); dx++ {
 			sx := (float64(dx)+0.5)*xscale - 0.5
 			sx0 := int32(sx)
 			xFrac0 := sx - float64(sx0)
@@ -305,12 +315,12 @@
 	}
 }
 
-func (z *ablScaler) scale_RGBA_Uniform(dst *image.RGBA, dp image.Point, src *image.Uniform, sp image.Point) {
+func (z *ablScaler) scale_RGBA_Uniform(dst *image.RGBA, dp image.Point, dr image.Rectangle, src *image.Uniform, sp image.Point) {
 	yscale := float64(z.sh) / float64(z.dh)
 	xscale := float64(z.sw) / float64(z.dw)
 	dstColorRGBA64 := &color.RGBA64{}
 	dstColor := color.Color(dstColorRGBA64)
-	for dy := int32(0); dy < z.dh; dy++ {
+	for dy := int32(dr.Min.Y); dy < int32(dr.Max.Y); dy++ {
 		sy := (float64(dy)+0.5)*yscale - 0.5
 		sy0 := int32(sy)
 		yFrac0 := sy - float64(sy0)
@@ -323,7 +333,7 @@
 			sy1 = sy0
 			yFrac0, yFrac1 = 1, 0
 		}
-		for dx := int32(0); dx < z.dw; dx++ {
+		for dx := int32(dr.Min.X); dx < int32(dr.Max.X); dx++ {
 			sx := (float64(dx)+0.5)*xscale - 0.5
 			sx0 := int32(sx)
 			xFrac0 := sx - float64(sx0)
@@ -377,12 +387,12 @@
 	}
 }
 
-func (z *ablScaler) scale_RGBA_YCbCr(dst *image.RGBA, dp image.Point, src *image.YCbCr, sp image.Point) {
+func (z *ablScaler) scale_RGBA_YCbCr(dst *image.RGBA, dp image.Point, dr image.Rectangle, src *image.YCbCr, sp image.Point) {
 	yscale := float64(z.sh) / float64(z.dh)
 	xscale := float64(z.sw) / float64(z.dw)
 	dstColorRGBA64 := &color.RGBA64{}
 	dstColor := color.Color(dstColorRGBA64)
-	for dy := int32(0); dy < z.dh; dy++ {
+	for dy := int32(dr.Min.Y); dy < int32(dr.Max.Y); dy++ {
 		sy := (float64(dy)+0.5)*yscale - 0.5
 		sy0 := int32(sy)
 		yFrac0 := sy - float64(sy0)
@@ -395,7 +405,7 @@
 			sy1 = sy0
 			yFrac0, yFrac1 = 1, 0
 		}
-		for dx := int32(0); dx < z.dw; dx++ {
+		for dx := int32(dr.Min.X); dx < int32(dr.Max.X); dx++ {
 			sx := (float64(dx)+0.5)*xscale - 0.5
 			sx0 := int32(sx)
 			xFrac0 := sx - float64(sx0)
@@ -449,12 +459,12 @@
 	}
 }
 
-func (z *ablScaler) scale_RGBA_Image(dst *image.RGBA, dp image.Point, src image.Image, sp image.Point) {
+func (z *ablScaler) scale_RGBA_Image(dst *image.RGBA, dp image.Point, dr image.Rectangle, src image.Image, sp image.Point) {
 	yscale := float64(z.sh) / float64(z.dh)
 	xscale := float64(z.sw) / float64(z.dw)
 	dstColorRGBA64 := &color.RGBA64{}
 	dstColor := color.Color(dstColorRGBA64)
-	for dy := int32(0); dy < z.dh; dy++ {
+	for dy := int32(dr.Min.Y); dy < int32(dr.Max.Y); dy++ {
 		sy := (float64(dy)+0.5)*yscale - 0.5
 		sy0 := int32(sy)
 		yFrac0 := sy - float64(sy0)
@@ -467,7 +477,7 @@
 			sy1 = sy0
 			yFrac0, yFrac1 = 1, 0
 		}
-		for dx := int32(0); dx < z.dw; dx++ {
+		for dx := int32(dr.Min.X); dx < int32(dr.Max.X); dx++ {
 			sx := (float64(dx)+0.5)*xscale - 0.5
 			sx0 := int32(sx)
 			xFrac0 := sx - float64(sx0)
@@ -521,12 +531,12 @@
 	}
 }
 
-func (z *ablScaler) scale_Image_Image(dst Image, dp image.Point, src image.Image, sp image.Point) {
+func (z *ablScaler) scale_Image_Image(dst Image, dp image.Point, dr image.Rectangle, src image.Image, sp image.Point) {
 	yscale := float64(z.sh) / float64(z.dh)
 	xscale := float64(z.sw) / float64(z.dw)
 	dstColorRGBA64 := &color.RGBA64{}
 	dstColor := color.Color(dstColorRGBA64)
-	for dy := int32(0); dy < z.dh; dy++ {
+	for dy := int32(dr.Min.Y); dy < int32(dr.Max.Y); dy++ {
 		sy := (float64(dy)+0.5)*yscale - 0.5
 		sy0 := int32(sy)
 		yFrac0 := sy - float64(sy0)
@@ -539,7 +549,7 @@
 			sy1 = sy0
 			yFrac0, yFrac1 = 1, 0
 		}
-		for dx := int32(0); dx < z.dw; dx++ {
+		for dx := int32(dr.Min.X); dx < int32(dr.Max.X); dx++ {
 			sx := (float64(dx)+0.5)*xscale - 0.5
 			sx0 := int32(sx)
 			xFrac0 := sx - float64(sx0)
@@ -597,6 +607,11 @@
 	if z.dw <= 0 || z.dh <= 0 || z.sw <= 0 || z.sh <= 0 {
 		return
 	}
+	// dr is the affected destination pixels, relative to dp.
+	dr := dst.Bounds().Sub(dp).Intersect(image.Rectangle{Max: image.Point{int(z.dw), int(z.dh)}})
+	if dr.Empty() {
+		return
+	}
 	// Create a temporary buffer:
 	// scaleX distributes the source image's columns over the temporary image.
 	// scaleY distributes the temporary image's rows over the destination image.
@@ -616,9 +631,9 @@
 	}
 	switch dst := dst.(type) {
 	case *image.RGBA:
-		z.scaleY_RGBA(dst, dp, tmp)
+		z.scaleY_RGBA(dst, dp, dr, tmp)
 	default:
-		z.scaleY_Image(dst, dp, tmp)
+		z.scaleY_Image(dst, dp, dr, tmp)
 	}
 }
 
@@ -737,14 +752,14 @@
 	}
 }
 
-func (z *kernelScaler) scaleY_RGBA(dst *image.RGBA, dp image.Point, tmp [][4]float64) {
+func (z *kernelScaler) scaleY_RGBA(dst *image.RGBA, dp image.Point, dr image.Rectangle, tmp [][4]float64) {
 	dstColorRGBA64 := &color.RGBA64{}
 	dstColor := color.Color(dstColorRGBA64)
-	for x := int32(0); x < z.dw; x++ {
-		for y, s := range z.vertical.sources {
+	for dx := int32(dr.Min.X); dx < int32(dr.Max.X); dx++ {
+		for dy, s := range z.vertical.sources[dr.Min.Y:dr.Max.Y] {
 			var pr, pg, pb, pa float64
 			for _, c := range z.vertical.contribs[s.i:s.j] {
-				p := &tmp[c.coord*z.dw+x]
+				p := &tmp[c.coord*z.dw+dx]
 				pr += p[0] * c.weight
 				pg += p[1] * c.weight
 				pb += p[2] * c.weight
@@ -754,19 +769,19 @@
 			dstColorRGBA64.G = ftou(pg * s.invTotalWeight)
 			dstColorRGBA64.B = ftou(pb * s.invTotalWeight)
 			dstColorRGBA64.A = ftou(pa * s.invTotalWeight)
-			dst.Set(dp.X+int(x), dp.Y+int(y), dstColor)
+			dst.Set(dp.X+int(dx), dp.Y+int(dr.Min.Y+dy), dstColor)
 		}
 	}
 }
 
-func (z *kernelScaler) scaleY_Image(dst Image, dp image.Point, tmp [][4]float64) {
+func (z *kernelScaler) scaleY_Image(dst Image, dp image.Point, dr image.Rectangle, tmp [][4]float64) {
 	dstColorRGBA64 := &color.RGBA64{}
 	dstColor := color.Color(dstColorRGBA64)
-	for x := int32(0); x < z.dw; x++ {
-		for y, s := range z.vertical.sources {
+	for dx := int32(dr.Min.X); dx < int32(dr.Max.X); dx++ {
+		for dy, s := range z.vertical.sources[dr.Min.Y:dr.Max.Y] {
 			var pr, pg, pb, pa float64
 			for _, c := range z.vertical.contribs[s.i:s.j] {
-				p := &tmp[c.coord*z.dw+x]
+				p := &tmp[c.coord*z.dw+dx]
 				pr += p[0] * c.weight
 				pg += p[1] * c.weight
 				pb += p[2] * c.weight
@@ -776,7 +791,7 @@
 			dstColorRGBA64.G = ftou(pg * s.invTotalWeight)
 			dstColorRGBA64.B = ftou(pb * s.invTotalWeight)
 			dstColorRGBA64.A = ftou(pa * s.invTotalWeight)
-			dst.Set(dp.X+int(x), dp.Y+int(y), dstColor)
+			dst.Set(dp.X+int(dx), dp.Y+int(dr.Min.Y+dy), dstColor)
 		}
 	}
 }
diff --git a/draw/scale_test.go b/draw/scale_test.go
index 0404d8e..cceaad4 100644
--- a/draw/scale_test.go
+++ b/draw/scale_test.go
@@ -84,6 +84,53 @@
 func TestScaleDown(t *testing.T) { testScale(t, 100, 100, "down", "280x360.jpeg") }
 func TestScaleUp(t *testing.T)   { testScale(t, 75, 100, "up", "14x18.png") }
 
+func fillPix(r *rand.Rand, pixs ...[]byte) {
+	for _, pix := range pixs {
+		for i := range pix {
+			pix[i] = uint8(r.Intn(256))
+		}
+	}
+}
+
+func TestScaleClipCommute(t *testing.T) {
+	src := image.NewNRGBA(image.Rect(0, 0, 20, 20))
+	fillPix(rand.New(rand.NewSource(0)), src.Pix)
+
+	outer := image.Rect(1, 1, 8, 5)
+	inner := image.Rect(2, 3, 6, 5)
+	qs := []Interpolator{
+		NearestNeighbor,
+		ApproxBiLinear,
+		CatmullRom,
+	}
+	for _, q := range qs {
+		dst0 := image.NewRGBA(image.Rect(1, 1, 10, 10))
+		dst1 := image.NewRGBA(image.Rect(1, 1, 10, 10))
+		for i := range dst0.Pix {
+			dst0.Pix[i] = uint8(i / 4)
+			dst1.Pix[i] = uint8(i / 4)
+		}
+
+		// Scale then clip.
+		Scale(dst0, outer, src, src.Bounds(), q)
+		dst0 = dst0.SubImage(inner).(*image.RGBA)
+
+		// Clip then scale.
+		dst1 = dst1.SubImage(inner).(*image.RGBA)
+		Scale(dst1, outer, src, src.Bounds(), q)
+
+	loop:
+		for y := inner.Min.Y; y < inner.Max.Y; y++ {
+			for x := inner.Min.X; x < inner.Max.X; x++ {
+				if c0, c1 := dst0.RGBAAt(x, y), dst1.RGBAAt(x, y); c0 != c1 {
+					t.Errorf("q=%T: at (%d, %d): c0=%v, c1=%v", q, x, y, c0, c1)
+					break loop
+				}
+			}
+		}
+	}
+}
+
 // The fooWrapper types wrap the dst or src image to avoid triggering the
 // type-specific fast path implementations.
 type (
@@ -152,19 +199,13 @@
 
 func srcNRGBA(boundsHint image.Rectangle) (image.Image, error) {
 	m := image.NewNRGBA(boundsHint)
-	r := rand.New(rand.NewSource(1))
-	for i := range m.Pix {
-		m.Pix[i] = uint8(r.Intn(256))
-	}
+	fillPix(rand.New(rand.NewSource(1)), m.Pix)
 	return m, nil
 }
 
 func srcRGBA(boundsHint image.Rectangle) (image.Image, error) {
 	m := image.NewRGBA(boundsHint)
-	r := rand.New(rand.NewSource(2))
-	for i := range m.Pix {
-		m.Pix[i] = uint8(r.Intn(256))
-	}
+	fillPix(rand.New(rand.NewSource(2)), m.Pix)
 	// RGBA is alpha-premultiplied, so the R, G and B values should
 	// be <= the A values.
 	for i := 0; i < len(m.Pix); i += 4 {
@@ -181,16 +222,7 @@
 
 func srcYCbCr(boundsHint image.Rectangle) (image.Image, error) {
 	m := image.NewYCbCr(boundsHint, image.YCbCrSubsampleRatio420)
-	r := rand.New(rand.NewSource(3))
-	for i := range m.Y {
-		m.Y[i] = uint8(r.Intn(256))
-	}
-	for i := range m.Cb {
-		m.Cb[i] = uint8(r.Intn(256))
-	}
-	for i := range m.Cr {
-		m.Cr[i] = uint8(r.Intn(256))
-	}
+	fillPix(rand.New(rand.NewSource(3)), m.Y, m.Cb, m.Cr)
 	return m, nil
 }