draw: implement Kernel.Transform.

Also fix the NN (NearestNeighbor) and ABL (ApproxBiLinear) fast paths so
that they apply only when the Pix elements can be accessed without
src-bounds checking.
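
For illustration, a minimal sketch of how a caller might exercise the new
Kernel.Transform. The matrix values mirror the transformMatrix helper in
draw/scale_test.go; the rotateScale helper name and the 640x480 dst size
are arbitrary and only for this example:

    package main

    import (
        "image"

        "golang.org/x/image/draw"
        "golang.org/x/image/math/f64"
    )

    // rotateScale scales src by 3.75, rotates it by 30 degrees and
    // translates it by (40, 10), mapping src coordinates to dst coordinates.
    func rotateScale(src image.Image) *image.RGBA {
        const scale, cos30, sin30 = 3.75, 0.866025404, 0.5
        s2d := &f64.Aff3{
            +scale * cos30, -scale * sin30, 40,
            +scale * sin30, +scale * cos30, 10,
        }
        dst := image.NewRGBA(image.Rect(0, 0, 640, 480))
        // CatmullRom is a *Kernel, so this goes through Kernel.Transform.
        draw.CatmullRom.Transform(dst, s2d, src, src.Bounds(), nil)
        return dst
    }

    func main() {
        src := image.NewRGBA(image.Rect(0, 0, 100, 100))
        _ = rotateScale(src)
    }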

Change-Id: Ie9fc96b28e0665df49d00c4c53cb81385faee4db
Reviewed-on: https://go-review.googlesource.com/7675
Reviewed-by: Rob Pike <r@golang.org>
diff --git a/draw/gen.go b/draw/gen.go
index ced1c9a..b429465 100644
--- a/draw/gen.go
+++ b/draw/gen.go
@@ -116,6 +116,12 @@
 			dType: dType,
 		})
 	}
+	for _, t := range dsTypes {
+		expn(w, codeKernelTransformLeaf, &data{
+			dType: t.dType,
+			sType: t.sType,
+		})
+	}
 }
 
 func expn(w *bytes.Buffer, code string, d *data) {
@@ -154,6 +160,9 @@
 	return line
 }
 
+// expnDollar expands a "$foo" fragment in a line of generated code. It returns
+// the empty string if there was a problem. It returns ";" if the generated
+// code is a no-op.
 func expnDollar(prefix, dollar, suffix string, d *data) string {
 	switch dollar {
 	case "dType":
@@ -246,32 +255,39 @@
 
 	case "outputf":
 		args, _ := splitArgs(suffix)
-		if len(args) != 4 {
+		if len(args) != 5 {
 			return ""
 		}
+		ret := ""
 		switch d.dType {
 		default:
 			log.Fatalf("bad dType %q", d.dType)
 		case "Image":
-			return fmt.Sprintf(""+
-				"dstColorRGBA64.R = ftou(%sr * %s)\n"+
-				"dstColorRGBA64.G = ftou(%sg * %s)\n"+
-				"dstColorRGBA64.B = ftou(%sb * %s)\n"+
-				"dstColorRGBA64.A = ftou(%sa * %s)\n"+
+			ret = fmt.Sprintf(""+
+				"dstColorRGBA64.R = %s(%sr * %s)\n"+
+				"dstColorRGBA64.G = %s(%sg * %s)\n"+
+				"dstColorRGBA64.B = %s(%sb * %s)\n"+
+				"dstColorRGBA64.A = %s(%sa * %s)\n"+
 				"dst.Set(%s, %s, dstColor)",
-				args[2], args[3], args[2], args[3], args[2], args[3], args[2], args[3],
+				args[2], args[3], args[4],
+				args[2], args[3], args[4],
+				args[2], args[3], args[4],
+				args[2], args[3], args[4],
 				args[0], args[1],
 			)
 		case "*image.RGBA":
-			return fmt.Sprintf(""+
-				"dst.Pix[d+0] = uint8(ftou(%sr * %s) >> 8)\n"+
-				"dst.Pix[d+1] = uint8(ftou(%sg * %s) >> 8)\n"+
-				"dst.Pix[d+2] = uint8(ftou(%sb * %s) >> 8)\n"+
-				"dst.Pix[d+3] = uint8(ftou(%sa * %s) >> 8)\n"+
-				"d += dst.Stride",
-				args[2], args[3], args[2], args[3], args[2], args[3], args[2], args[3],
+			ret = fmt.Sprintf(""+
+				"dst.Pix[d+0] = uint8(%s(%sr * %s) >> 8)\n"+
+				"dst.Pix[d+1] = uint8(%s(%sg * %s) >> 8)\n"+
+				"dst.Pix[d+2] = uint8(%s(%sb * %s) >> 8)\n"+
+				"dst.Pix[d+3] = uint8(%s(%sa * %s) >> 8)",
+				args[2], args[3], args[4],
+				args[2], args[3], args[4],
+				args[2], args[3], args[4],
+				args[2], args[3], args[4],
 			)
 		}
+		return strings.Replace(ret, " * 1)", ")", -1)
 
 	case "srcf", "srcu":
 		lhs, eqOp := splitEq(prefix)
@@ -329,6 +345,12 @@
 
 		return strings.TrimSpace(buf.String())
 
+	case "tweakD":
+		if d.dType == "*image.RGBA" {
+			return "d += dst.Stride"
+		}
+		return ";"
+
 	case "tweakDx":
 		if d.dType == "*image.RGBA" {
 			return strings.Replace(suffix, "dx++", "dx, d = dx+1, d+4", 1)
@@ -444,7 +466,14 @@
 				return
 			}
 			d2s := invert(s2d)
-			$switch z.transform_$dTypeRN_$sTypeRN(dst, dr, adr, &d2s, src, sr)
+			// sr is the source pixels. If it extends beyond the src bounds,
+			// we cannot use the type-specific fast paths, as they access
+			// the Pix fields directly without bounds checking.
+			if !sr.In(src.Bounds()) {
+				z.transform_Image_Image(dst, dr, adr, &d2s, src, sr)
+			} else {
+				$switch z.transform_$dTypeRN_$sTypeRN(dst, dr, adr, &d2s, src, sr)
+			}
 		}
 	`
 
@@ -475,6 +504,7 @@
 				$preInner
 				$tweakDx for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx++ {
 					dxf := float64(dr.Min.X + int(dx)) + 0.5
+					// TODO: change the src origin so that we can say int(f) instead of int(math.Floor(f)).
 					sx0 := int(math.Floor(d2s[0]*dxf + d2s[1]*dyf + d2s[2]))
 					sy0 := int(math.Floor(d2s[3]*dxf + d2s[4]*dyf + d2s[5]))
 					if !(image.Point{sx0, sy0}).In(sr) {
@@ -549,6 +579,7 @@
 				$preInner
 				$tweakDx for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx++ {
 					dxf := float64(dr.Min.X + int(dx)) + 0.5
+					// TODO: change the src origin so that we can say int(f) instead of int(math.Floor(f)).
 					sx := d2s[0]*dxf + d2s[1]*dyf + d2s[2]
 					sy := d2s[3]*dxf + d2s[4]*dyf + d2s[5]
 					if !(image.Point{int(math.Floor(sx)), int(math.Floor(sy))}).In(sr) {
@@ -625,8 +656,32 @@
 			$switchD z.scaleY_$dTypeRN(dst, dr, adr, tmp)
 		}
 
-		func (z *Kernel) Transform(dst Image, m *f64.Aff3, src image.Image, sr image.Rectangle, opts *Options) {
-			panic("unimplemented")
+		func (q *Kernel) Transform(dst Image, s2d *f64.Aff3, src image.Image, sr image.Rectangle, opts *Options) {
+			dr := transformRect(s2d, &sr)
+			// adr is the affected destination pixels, relative to dr.Min.
+			adr := dst.Bounds().Intersect(dr).Sub(dr.Min)
+			if adr.Empty() || sr.Empty() {
+				return
+			}
+			d2s := invert(s2d)
+
+			xscale := abs(d2s[0])
+			if s := abs(d2s[1]); xscale < s {
+				xscale = s
+			}
+			yscale := abs(d2s[3])
+			if s := abs(d2s[4]); yscale < s {
+				yscale = s
+			}
+
+			// sr is the source pixels. If it extends beyond the src bounds,
+			// we cannot use the type-specific fast paths, as they access
+			// the Pix fields directly without bounds checking.
+			if !sr.In(src.Bounds()) {
+				q.transform_Image_Image(dst, dr, adr, &d2s, src, sr, xscale, yscale)
+			} else {
+				$switch q.transform_$dTypeRN_$sTypeRN(dst, dr, adr, &d2s, src, sr, xscale, yscale)
+			}
 		}
 	`
 
@@ -665,7 +720,98 @@
 						pb += p[2] * c.weight
 						pa += p[3] * c.weight
 					}
-					$outputf[dr.Min.X + int(dx), dr.Min.Y + int(adr.Min.Y + dy), p, s.invTotalWeight]
+					$outputf[dr.Min.X + int(dx), dr.Min.Y + int(adr.Min.Y + dy), ftou, p, s.invTotalWeight]
+					$tweakD
+				}
+			}
+		}
+	`
+
+	codeKernelTransformLeaf = `
+		func (q *Kernel) transform_$dTypeRN_$sTypeRN(dst $dType, dr, adr image.Rectangle, d2s *f64.Aff3, src $sType, sr image.Rectangle, xscale, yscale float64) {
+			// When shrinking, broaden the effective kernel support so that we still
+			// visit every source pixel.
+			xHalfWidth, xKernelArgScale := q.Support, 1.0
+			if xscale > 1 {
+				xHalfWidth *= xscale
+				xKernelArgScale = 1 / xscale
+			}
+			yHalfWidth, yKernelArgScale := q.Support, 1.0
+			if yscale > 1 {
+				yHalfWidth *= yscale
+				yKernelArgScale = 1 / yscale
+			}
+
+			xWeights := make([]float64, 1 + 2*int(math.Ceil(xHalfWidth)))
+			yWeights := make([]float64, 1 + 2*int(math.Ceil(yHalfWidth)))
+
+			$preOuter
+			for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+				dyf := float64(dr.Min.Y + int(dy)) + 0.5
+				$preInner
+				$tweakDx for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx++ {
+					dxf := float64(dr.Min.X + int(dx)) + 0.5
+					// TODO: change the src origin so that we can say int(f) instead of int(math.Floor(f)).
+					sx := d2s[0]*dxf + d2s[1]*dyf + d2s[2]
+					sy := d2s[3]*dxf + d2s[4]*dyf + d2s[5]
+					if !(image.Point{int(math.Floor(sx)), int(math.Floor(sy))}).In(sr) {
+						continue
+					}
+
+					sx -= 0.5
+					ix := int(math.Floor(sx - xHalfWidth))
+					if ix < sr.Min.X {
+						ix = sr.Min.X
+					}
+					jx := int(math.Ceil(sx + xHalfWidth))
+					if jx > sr.Max.X {
+						jx = sr.Max.X
+					}
+
+					totalXWeight := 0.0
+					for kx := ix; kx < jx; kx++ {
+						xWeight := 0.0
+						if t := abs((sx - float64(kx)) * xKernelArgScale); t < q.Support {
+							xWeight = q.At(t)
+						}
+						xWeights[kx - ix] = xWeight
+						totalXWeight += xWeight
+					}
+					for x := range xWeights[:jx-ix] {
+						xWeights[x] /= totalXWeight
+					}
+
+					sy -= 0.5
+					iy := int(math.Floor(sy - yHalfWidth))
+					if iy < sr.Min.Y {
+						iy = sr.Min.Y
+					}
+					jy := int(math.Ceil(sy + yHalfWidth))
+					if jy > sr.Max.Y {
+						jy = sr.Max.Y
+					}
+
+					totalYWeight := 0.0
+					for ky := iy; ky < jy; ky++ {
+						yWeight := 0.0
+						if t := abs((sy - float64(ky)) * yKernelArgScale); t < q.Support {
+							yWeight = q.At(t)
+						}
+						yWeights[ky - iy] = yWeight
+						totalYWeight += yWeight
+					}
+					for y := range yWeights[:jy-iy] {
+						yWeights[y] /= totalYWeight
+					}
+
+					var pr, pg, pb, pa float64
+					for ky := iy; ky < jy; ky++ {
+						yWeight := yWeights[ky - iy]
+						for kx := ix; kx < jx; kx++ {
+							p += $srcf[kx, ky] * xWeights[kx - ix] * yWeight
+						}
+					}
+					$outputf[dr.Min.X + int(dx), dr.Min.Y + int(dy), fffftou, p, 1]
 				}
 			}
 		}
diff --git a/draw/impl.go b/draw/impl.go
index e6f215a..8ebdd6f 100644
--- a/draw/impl.go
+++ b/draw/impl.go
@@ -55,26 +55,33 @@
 		return
 	}
 	d2s := invert(s2d)
-	switch dst := dst.(type) {
-	case *image.RGBA:
-		switch src := src.(type) {
-		case *image.Gray:
-			z.transform_RGBA_Gray(dst, dr, adr, &d2s, src, sr)
-		case *image.NRGBA:
-			z.transform_RGBA_NRGBA(dst, dr, adr, &d2s, src, sr)
+	// sr is the source pixels. If it extends beyond the src bounds,
+	// we cannot use the type-specific fast paths, as they access
+	// the Pix fields directly without bounds checking.
+	if !sr.In(src.Bounds()) {
+		z.transform_Image_Image(dst, dr, adr, &d2s, src, sr)
+	} else {
+		switch dst := dst.(type) {
 		case *image.RGBA:
-			z.transform_RGBA_RGBA(dst, dr, adr, &d2s, src, sr)
-		case *image.Uniform:
-			z.transform_RGBA_Uniform(dst, dr, adr, &d2s, src, sr)
-		case *image.YCbCr:
-			z.transform_RGBA_YCbCr(dst, dr, adr, &d2s, src, sr)
+			switch src := src.(type) {
+			case *image.Gray:
+				z.transform_RGBA_Gray(dst, dr, adr, &d2s, src, sr)
+			case *image.NRGBA:
+				z.transform_RGBA_NRGBA(dst, dr, adr, &d2s, src, sr)
+			case *image.RGBA:
+				z.transform_RGBA_RGBA(dst, dr, adr, &d2s, src, sr)
+			case *image.Uniform:
+				z.transform_RGBA_Uniform(dst, dr, adr, &d2s, src, sr)
+			case *image.YCbCr:
+				z.transform_RGBA_YCbCr(dst, dr, adr, &d2s, src, sr)
+			default:
+				z.transform_RGBA_Image(dst, dr, adr, &d2s, src, sr)
+			}
 		default:
-			z.transform_RGBA_Image(dst, dr, adr, &d2s, src, sr)
-		}
-	default:
-		switch src := src.(type) {
-		default:
-			z.transform_Image_Image(dst, dr, adr, &d2s, src, sr)
+			switch src := src.(type) {
+			default:
+				z.transform_Image_Image(dst, dr, adr, &d2s, src, sr)
+			}
 		}
 	}
 }
@@ -224,6 +231,7 @@
 		d := dst.PixOffset(dr.Min.X+adr.Min.X, dr.Min.Y+int(dy))
 		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
 			dxf := float64(dr.Min.X+int(dx)) + 0.5
+			// TODO: change the src origin so that we can say int(f) instead of int(math.Floor(f)).
 			sx0 := int(math.Floor(d2s[0]*dxf + d2s[1]*dyf + d2s[2]))
 			sy0 := int(math.Floor(d2s[3]*dxf + d2s[4]*dyf + d2s[5]))
 			if !(image.Point{sx0, sy0}).In(sr) {
@@ -244,6 +252,7 @@
 		d := dst.PixOffset(dr.Min.X+adr.Min.X, dr.Min.Y+int(dy))
 		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
 			dxf := float64(dr.Min.X+int(dx)) + 0.5
+			// TODO: change the src origin so that we can say int(f) instead of int(math.Floor(f)).
 			sx0 := int(math.Floor(d2s[0]*dxf + d2s[1]*dyf + d2s[2]))
 			sy0 := int(math.Floor(d2s[3]*dxf + d2s[4]*dyf + d2s[5]))
 			if !(image.Point{sx0, sy0}).In(sr) {
@@ -264,6 +273,7 @@
 		d := dst.PixOffset(dr.Min.X+adr.Min.X, dr.Min.Y+int(dy))
 		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
 			dxf := float64(dr.Min.X+int(dx)) + 0.5
+			// TODO: change the src origin so that we can say int(f) instead of int(math.Floor(f)).
 			sx0 := int(math.Floor(d2s[0]*dxf + d2s[1]*dyf + d2s[2]))
 			sy0 := int(math.Floor(d2s[3]*dxf + d2s[4]*dyf + d2s[5]))
 			if !(image.Point{sx0, sy0}).In(sr) {
@@ -288,6 +298,7 @@
 		d := dst.PixOffset(dr.Min.X+adr.Min.X, dr.Min.Y+int(dy))
 		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
 			dxf := float64(dr.Min.X+int(dx)) + 0.5
+			// TODO: change the src origin so that we can say int(f) instead of int(math.Floor(f)).
 			sx0 := int(math.Floor(d2s[0]*dxf + d2s[1]*dyf + d2s[2]))
 			sy0 := int(math.Floor(d2s[3]*dxf + d2s[4]*dyf + d2s[5]))
 			if !(image.Point{sx0, sy0}).In(sr) {
@@ -308,6 +319,7 @@
 		d := dst.PixOffset(dr.Min.X+adr.Min.X, dr.Min.Y+int(dy))
 		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
 			dxf := float64(dr.Min.X+int(dx)) + 0.5
+			// TODO: change the src origin so that we can say int(f) instead of int(math.Floor(f)).
 			sx0 := int(math.Floor(d2s[0]*dxf + d2s[1]*dyf + d2s[2]))
 			sy0 := int(math.Floor(d2s[3]*dxf + d2s[4]*dyf + d2s[5]))
 			if !(image.Point{sx0, sy0}).In(sr) {
@@ -328,6 +340,7 @@
 		d := dst.PixOffset(dr.Min.X+adr.Min.X, dr.Min.Y+int(dy))
 		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
 			dxf := float64(dr.Min.X+int(dx)) + 0.5
+			// TODO: change the src origin so that we can say int(f) instead of int(math.Floor(f)).
 			sx0 := int(math.Floor(d2s[0]*dxf + d2s[1]*dyf + d2s[2]))
 			sy0 := int(math.Floor(d2s[3]*dxf + d2s[4]*dyf + d2s[5]))
 			if !(image.Point{sx0, sy0}).In(sr) {
@@ -349,6 +362,7 @@
 		dyf := float64(dr.Min.Y+int(dy)) + 0.5
 		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx++ {
 			dxf := float64(dr.Min.X+int(dx)) + 0.5
+			// TODO: change the src origin so that we can say int(f) instead of int(math.Floor(f)).
 			sx0 := int(math.Floor(d2s[0]*dxf + d2s[1]*dyf + d2s[2]))
 			sy0 := int(math.Floor(d2s[3]*dxf + d2s[4]*dyf + d2s[5]))
 			if !(image.Point{sx0, sy0}).In(sr) {
@@ -409,26 +423,33 @@
 		return
 	}
 	d2s := invert(s2d)
-	switch dst := dst.(type) {
-	case *image.RGBA:
-		switch src := src.(type) {
-		case *image.Gray:
-			z.transform_RGBA_Gray(dst, dr, adr, &d2s, src, sr)
-		case *image.NRGBA:
-			z.transform_RGBA_NRGBA(dst, dr, adr, &d2s, src, sr)
+	// sr is the source pixels. If it extends beyond the src bounds,
+	// we cannot use the type-specific fast paths, as they access
+	// the Pix fields directly without bounds checking.
+	if !sr.In(src.Bounds()) {
+		z.transform_Image_Image(dst, dr, adr, &d2s, src, sr)
+	} else {
+		switch dst := dst.(type) {
 		case *image.RGBA:
-			z.transform_RGBA_RGBA(dst, dr, adr, &d2s, src, sr)
-		case *image.Uniform:
-			z.transform_RGBA_Uniform(dst, dr, adr, &d2s, src, sr)
-		case *image.YCbCr:
-			z.transform_RGBA_YCbCr(dst, dr, adr, &d2s, src, sr)
+			switch src := src.(type) {
+			case *image.Gray:
+				z.transform_RGBA_Gray(dst, dr, adr, &d2s, src, sr)
+			case *image.NRGBA:
+				z.transform_RGBA_NRGBA(dst, dr, adr, &d2s, src, sr)
+			case *image.RGBA:
+				z.transform_RGBA_RGBA(dst, dr, adr, &d2s, src, sr)
+			case *image.Uniform:
+				z.transform_RGBA_Uniform(dst, dr, adr, &d2s, src, sr)
+			case *image.YCbCr:
+				z.transform_RGBA_YCbCr(dst, dr, adr, &d2s, src, sr)
+			default:
+				z.transform_RGBA_Image(dst, dr, adr, &d2s, src, sr)
+			}
 		default:
-			z.transform_RGBA_Image(dst, dr, adr, &d2s, src, sr)
-		}
-	default:
-		switch src := src.(type) {
-		default:
-			z.transform_Image_Image(dst, dr, adr, &d2s, src, sr)
+			switch src := src.(type) {
+			default:
+				z.transform_Image_Image(dst, dr, adr, &d2s, src, sr)
+			}
 		}
 	}
 }
@@ -1010,6 +1031,7 @@
 		d := dst.PixOffset(dr.Min.X+adr.Min.X, dr.Min.Y+int(dy))
 		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
 			dxf := float64(dr.Min.X+int(dx)) + 0.5
+			// TODO: change the src origin so that we can say int(f) instead of int(math.Floor(f)).
 			sx := d2s[0]*dxf + d2s[1]*dyf + d2s[2]
 			sy := d2s[3]*dxf + d2s[4]*dyf + d2s[5]
 			if !(image.Point{int(math.Floor(sx)), int(math.Floor(sy))}).In(sr) {
@@ -1090,6 +1112,7 @@
 		d := dst.PixOffset(dr.Min.X+adr.Min.X, dr.Min.Y+int(dy))
 		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
 			dxf := float64(dr.Min.X+int(dx)) + 0.5
+			// TODO: change the src origin so that we can say int(f) instead of int(math.Floor(f)).
 			sx := d2s[0]*dxf + d2s[1]*dyf + d2s[2]
 			sy := d2s[3]*dxf + d2s[4]*dyf + d2s[5]
 			if !(image.Point{int(math.Floor(sx)), int(math.Floor(sy))}).In(sr) {
@@ -1170,6 +1193,7 @@
 		d := dst.PixOffset(dr.Min.X+adr.Min.X, dr.Min.Y+int(dy))
 		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
 			dxf := float64(dr.Min.X+int(dx)) + 0.5
+			// TODO: change the src origin so that we can say int(f) instead of int(math.Floor(f)).
 			sx := d2s[0]*dxf + d2s[1]*dyf + d2s[2]
 			sy := d2s[3]*dxf + d2s[4]*dyf + d2s[5]
 			if !(image.Point{int(math.Floor(sx)), int(math.Floor(sy))}).In(sr) {
@@ -1266,6 +1290,7 @@
 		d := dst.PixOffset(dr.Min.X+adr.Min.X, dr.Min.Y+int(dy))
 		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
 			dxf := float64(dr.Min.X+int(dx)) + 0.5
+			// TODO: change the src origin so that we can say int(f) instead of int(math.Floor(f)).
 			sx := d2s[0]*dxf + d2s[1]*dyf + d2s[2]
 			sy := d2s[3]*dxf + d2s[4]*dyf + d2s[5]
 			if !(image.Point{int(math.Floor(sx)), int(math.Floor(sy))}).In(sr) {
@@ -1346,6 +1371,7 @@
 		d := dst.PixOffset(dr.Min.X+adr.Min.X, dr.Min.Y+int(dy))
 		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
 			dxf := float64(dr.Min.X+int(dx)) + 0.5
+			// TODO: change the src origin so that we can say int(f) instead of int(math.Floor(f)).
 			sx := d2s[0]*dxf + d2s[1]*dyf + d2s[2]
 			sy := d2s[3]*dxf + d2s[4]*dyf + d2s[5]
 			if !(image.Point{int(math.Floor(sx)), int(math.Floor(sy))}).In(sr) {
@@ -1426,6 +1452,7 @@
 		d := dst.PixOffset(dr.Min.X+adr.Min.X, dr.Min.Y+int(dy))
 		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
 			dxf := float64(dr.Min.X+int(dx)) + 0.5
+			// TODO: change the src origin so that we can say int(f) instead of int(math.Floor(f)).
 			sx := d2s[0]*dxf + d2s[1]*dyf + d2s[2]
 			sy := d2s[3]*dxf + d2s[4]*dyf + d2s[5]
 			if !(image.Point{int(math.Floor(sx)), int(math.Floor(sy))}).In(sr) {
@@ -1507,6 +1534,7 @@
 		dyf := float64(dr.Min.Y+int(dy)) + 0.5
 		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx++ {
 			dxf := float64(dr.Min.X+int(dx)) + 0.5
+			// TODO: change the src origin so that we can say int(f) instead of int(math.Floor(f)).
 			sx := d2s[0]*dxf + d2s[1]*dyf + d2s[2]
 			sy := d2s[3]*dxf + d2s[4]*dyf + d2s[5]
 			if !(image.Point{int(math.Floor(sx)), int(math.Floor(sy))}).In(sr) {
@@ -1628,8 +1656,53 @@
 	}
 }
 
-func (z *Kernel) Transform(dst Image, m *f64.Aff3, src image.Image, sr image.Rectangle, opts *Options) {
-	panic("unimplemented")
+func (q *Kernel) Transform(dst Image, s2d *f64.Aff3, src image.Image, sr image.Rectangle, opts *Options) {
+	dr := transformRect(s2d, &sr)
+	// adr is the affected destination pixels, relative to dr.Min.
+	adr := dst.Bounds().Intersect(dr).Sub(dr.Min)
+	if adr.Empty() || sr.Empty() {
+		return
+	}
+	d2s := invert(s2d)
+
+	xscale := abs(d2s[0])
+	if s := abs(d2s[1]); xscale < s {
+		xscale = s
+	}
+	yscale := abs(d2s[3])
+	if s := abs(d2s[4]); yscale < s {
+		yscale = s
+	}
+
+	// sr is the source pixels. If it extends beyond the src bounds,
+	// we cannot use the type-specific fast paths, as they access
+	// the Pix fields directly without bounds checking.
+	if !sr.In(src.Bounds()) {
+		q.transform_Image_Image(dst, dr, adr, &d2s, src, sr, xscale, yscale)
+	} else {
+		switch dst := dst.(type) {
+		case *image.RGBA:
+			switch src := src.(type) {
+			case *image.Gray:
+				q.transform_RGBA_Gray(dst, dr, adr, &d2s, src, sr, xscale, yscale)
+			case *image.NRGBA:
+				q.transform_RGBA_NRGBA(dst, dr, adr, &d2s, src, sr, xscale, yscale)
+			case *image.RGBA:
+				q.transform_RGBA_RGBA(dst, dr, adr, &d2s, src, sr, xscale, yscale)
+			case *image.Uniform:
+				q.transform_RGBA_Uniform(dst, dr, adr, &d2s, src, sr, xscale, yscale)
+			case *image.YCbCr:
+				q.transform_RGBA_YCbCr(dst, dr, adr, &d2s, src, sr, xscale, yscale)
+			default:
+				q.transform_RGBA_Image(dst, dr, adr, &d2s, src, sr, xscale, yscale)
+			}
+		default:
+			switch src := src.(type) {
+			default:
+				q.transform_Image_Image(dst, dr, adr, &d2s, src, sr, xscale, yscale)
+			}
+		}
+	}
 }
 
 func (z *kernelScaler) scaleX_Gray(tmp [][4]float64, src *image.Gray, sr image.Rectangle) {
@@ -1816,3 +1889,667 @@
 		}
 	}
 }
+
+func (q *Kernel) transform_RGBA_Gray(dst *image.RGBA, dr, adr image.Rectangle, d2s *f64.Aff3, src *image.Gray, sr image.Rectangle, xscale, yscale float64) {
+	// When shrinking, broaden the effective kernel support so that we still
+	// visit every source pixel.
+	xHalfWidth, xKernelArgScale := q.Support, 1.0
+	if xscale > 1 {
+		xHalfWidth *= xscale
+		xKernelArgScale = 1 / xscale
+	}
+	yHalfWidth, yKernelArgScale := q.Support, 1.0
+	if yscale > 1 {
+		yHalfWidth *= yscale
+		yKernelArgScale = 1 / yscale
+	}
+
+	xWeights := make([]float64, 1+2*int(math.Ceil(xHalfWidth)))
+	yWeights := make([]float64, 1+2*int(math.Ceil(yHalfWidth)))
+
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		dyf := float64(dr.Min.Y+int(dy)) + 0.5
+		d := dst.PixOffset(dr.Min.X+adr.Min.X, dr.Min.Y+int(dy))
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
+			dxf := float64(dr.Min.X+int(dx)) + 0.5
+			// TODO: change the src origin so that we can say int(f) instead of int(math.Floor(f)).
+			sx := d2s[0]*dxf + d2s[1]*dyf + d2s[2]
+			sy := d2s[3]*dxf + d2s[4]*dyf + d2s[5]
+			if !(image.Point{int(math.Floor(sx)), int(math.Floor(sy))}).In(sr) {
+				continue
+			}
+
+			sx -= 0.5
+			ix := int(math.Floor(sx - xHalfWidth))
+			if ix < sr.Min.X {
+				ix = sr.Min.X
+			}
+			jx := int(math.Ceil(sx + xHalfWidth))
+			if jx > sr.Max.X {
+				jx = sr.Max.X
+			}
+
+			totalXWeight := 0.0
+			for kx := ix; kx < jx; kx++ {
+				xWeight := 0.0
+				if t := abs((sx - float64(kx)) * xKernelArgScale); t < q.Support {
+					xWeight = q.At(t)
+				}
+				xWeights[kx-ix] = xWeight
+				totalXWeight += xWeight
+			}
+			for x := range xWeights[:jx-ix] {
+				xWeights[x] /= totalXWeight
+			}
+
+			sy -= 0.5
+			iy := int(math.Floor(sy - yHalfWidth))
+			if iy < sr.Min.Y {
+				iy = sr.Min.Y
+			}
+			jy := int(math.Ceil(sy + yHalfWidth))
+			if jy > sr.Max.Y {
+				jy = sr.Max.Y
+			}
+
+			totalYWeight := 0.0
+			for ky := iy; ky < jy; ky++ {
+				yWeight := 0.0
+				if t := abs((sy - float64(ky)) * yKernelArgScale); t < q.Support {
+					yWeight = q.At(t)
+				}
+				yWeights[ky-iy] = yWeight
+				totalYWeight += yWeight
+			}
+			for y := range yWeights[:jy-iy] {
+				yWeights[y] /= totalYWeight
+			}
+
+			var pr, pg, pb, pa float64
+			for ky := iy; ky < jy; ky++ {
+				yWeight := yWeights[ky-iy]
+				for kx := ix; kx < jx; kx++ {
+					pru, pgu, pbu, pau := src.At(kx, ky).RGBA()
+					pr += float64(pru) * xWeights[kx-ix] * yWeight
+					pg += float64(pgu) * xWeights[kx-ix] * yWeight
+					pb += float64(pbu) * xWeights[kx-ix] * yWeight
+					pa += float64(pau) * xWeights[kx-ix] * yWeight
+				}
+			}
+			dst.Pix[d+0] = uint8(fffftou(pr) >> 8)
+			dst.Pix[d+1] = uint8(fffftou(pg) >> 8)
+			dst.Pix[d+2] = uint8(fffftou(pb) >> 8)
+			dst.Pix[d+3] = uint8(fffftou(pa) >> 8)
+		}
+	}
+}
+
+func (q *Kernel) transform_RGBA_NRGBA(dst *image.RGBA, dr, adr image.Rectangle, d2s *f64.Aff3, src *image.NRGBA, sr image.Rectangle, xscale, yscale float64) {
+	// When shrinking, broaden the effective kernel support so that we still
+	// visit every source pixel.
+	xHalfWidth, xKernelArgScale := q.Support, 1.0
+	if xscale > 1 {
+		xHalfWidth *= xscale
+		xKernelArgScale = 1 / xscale
+	}
+	yHalfWidth, yKernelArgScale := q.Support, 1.0
+	if yscale > 1 {
+		yHalfWidth *= yscale
+		yKernelArgScale = 1 / yscale
+	}
+
+	xWeights := make([]float64, 1+2*int(math.Ceil(xHalfWidth)))
+	yWeights := make([]float64, 1+2*int(math.Ceil(yHalfWidth)))
+
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		dyf := float64(dr.Min.Y+int(dy)) + 0.5
+		d := dst.PixOffset(dr.Min.X+adr.Min.X, dr.Min.Y+int(dy))
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
+			dxf := float64(dr.Min.X+int(dx)) + 0.5
+			// TODO: change the src origin so that we can say int(f) instead of int(math.Floor(f)).
+			sx := d2s[0]*dxf + d2s[1]*dyf + d2s[2]
+			sy := d2s[3]*dxf + d2s[4]*dyf + d2s[5]
+			if !(image.Point{int(math.Floor(sx)), int(math.Floor(sy))}).In(sr) {
+				continue
+			}
+
+			sx -= 0.5
+			ix := int(math.Floor(sx - xHalfWidth))
+			if ix < sr.Min.X {
+				ix = sr.Min.X
+			}
+			jx := int(math.Ceil(sx + xHalfWidth))
+			if jx > sr.Max.X {
+				jx = sr.Max.X
+			}
+
+			totalXWeight := 0.0
+			for kx := ix; kx < jx; kx++ {
+				xWeight := 0.0
+				if t := abs((sx - float64(kx)) * xKernelArgScale); t < q.Support {
+					xWeight = q.At(t)
+				}
+				xWeights[kx-ix] = xWeight
+				totalXWeight += xWeight
+			}
+			for x := range xWeights[:jx-ix] {
+				xWeights[x] /= totalXWeight
+			}
+
+			sy -= 0.5
+			iy := int(math.Floor(sy - yHalfWidth))
+			if iy < sr.Min.Y {
+				iy = sr.Min.Y
+			}
+			jy := int(math.Ceil(sy + yHalfWidth))
+			if jy > sr.Max.Y {
+				jy = sr.Max.Y
+			}
+
+			totalYWeight := 0.0
+			for ky := iy; ky < jy; ky++ {
+				yWeight := 0.0
+				if t := abs((sy - float64(ky)) * yKernelArgScale); t < q.Support {
+					yWeight = q.At(t)
+				}
+				yWeights[ky-iy] = yWeight
+				totalYWeight += yWeight
+			}
+			for y := range yWeights[:jy-iy] {
+				yWeights[y] /= totalYWeight
+			}
+
+			var pr, pg, pb, pa float64
+			for ky := iy; ky < jy; ky++ {
+				yWeight := yWeights[ky-iy]
+				for kx := ix; kx < jx; kx++ {
+					pru, pgu, pbu, pau := src.At(kx, ky).RGBA()
+					pr += float64(pru) * xWeights[kx-ix] * yWeight
+					pg += float64(pgu) * xWeights[kx-ix] * yWeight
+					pb += float64(pbu) * xWeights[kx-ix] * yWeight
+					pa += float64(pau) * xWeights[kx-ix] * yWeight
+				}
+			}
+			dst.Pix[d+0] = uint8(fffftou(pr) >> 8)
+			dst.Pix[d+1] = uint8(fffftou(pg) >> 8)
+			dst.Pix[d+2] = uint8(fffftou(pb) >> 8)
+			dst.Pix[d+3] = uint8(fffftou(pa) >> 8)
+		}
+	}
+}
+
+func (q *Kernel) transform_RGBA_RGBA(dst *image.RGBA, dr, adr image.Rectangle, d2s *f64.Aff3, src *image.RGBA, sr image.Rectangle, xscale, yscale float64) {
+	// When shrinking, broaden the effective kernel support so that we still
+	// visit every source pixel.
+	xHalfWidth, xKernelArgScale := q.Support, 1.0
+	if xscale > 1 {
+		xHalfWidth *= xscale
+		xKernelArgScale = 1 / xscale
+	}
+	yHalfWidth, yKernelArgScale := q.Support, 1.0
+	if yscale > 1 {
+		yHalfWidth *= yscale
+		yKernelArgScale = 1 / yscale
+	}
+
+	xWeights := make([]float64, 1+2*int(math.Ceil(xHalfWidth)))
+	yWeights := make([]float64, 1+2*int(math.Ceil(yHalfWidth)))
+
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		dyf := float64(dr.Min.Y+int(dy)) + 0.5
+		d := dst.PixOffset(dr.Min.X+adr.Min.X, dr.Min.Y+int(dy))
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
+			dxf := float64(dr.Min.X+int(dx)) + 0.5
+			// TODO: change the src origin so that we can say int(f) instead of int(math.Floor(f)).
+			sx := d2s[0]*dxf + d2s[1]*dyf + d2s[2]
+			sy := d2s[3]*dxf + d2s[4]*dyf + d2s[5]
+			if !(image.Point{int(math.Floor(sx)), int(math.Floor(sy))}).In(sr) {
+				continue
+			}
+
+			sx -= 0.5
+			ix := int(math.Floor(sx - xHalfWidth))
+			if ix < sr.Min.X {
+				ix = sr.Min.X
+			}
+			jx := int(math.Ceil(sx + xHalfWidth))
+			if jx > sr.Max.X {
+				jx = sr.Max.X
+			}
+
+			totalXWeight := 0.0
+			for kx := ix; kx < jx; kx++ {
+				xWeight := 0.0
+				if t := abs((sx - float64(kx)) * xKernelArgScale); t < q.Support {
+					xWeight = q.At(t)
+				}
+				xWeights[kx-ix] = xWeight
+				totalXWeight += xWeight
+			}
+			for x := range xWeights[:jx-ix] {
+				xWeights[x] /= totalXWeight
+			}
+
+			sy -= 0.5
+			iy := int(math.Floor(sy - yHalfWidth))
+			if iy < sr.Min.Y {
+				iy = sr.Min.Y
+			}
+			jy := int(math.Ceil(sy + yHalfWidth))
+			if jy > sr.Max.Y {
+				jy = sr.Max.Y
+			}
+
+			totalYWeight := 0.0
+			for ky := iy; ky < jy; ky++ {
+				yWeight := 0.0
+				if t := abs((sy - float64(ky)) * yKernelArgScale); t < q.Support {
+					yWeight = q.At(t)
+				}
+				yWeights[ky-iy] = yWeight
+				totalYWeight += yWeight
+			}
+			for y := range yWeights[:jy-iy] {
+				yWeights[y] /= totalYWeight
+			}
+
+			var pr, pg, pb, pa float64
+			for ky := iy; ky < jy; ky++ {
+				yWeight := yWeights[ky-iy]
+				for kx := ix; kx < jx; kx++ {
+					pi := src.PixOffset(kx, ky)
+					pru := uint32(src.Pix[pi+0]) * 0x101
+					pgu := uint32(src.Pix[pi+1]) * 0x101
+					pbu := uint32(src.Pix[pi+2]) * 0x101
+					pau := uint32(src.Pix[pi+3]) * 0x101
+					pr += float64(pru) * xWeights[kx-ix] * yWeight
+					pg += float64(pgu) * xWeights[kx-ix] * yWeight
+					pb += float64(pbu) * xWeights[kx-ix] * yWeight
+					pa += float64(pau) * xWeights[kx-ix] * yWeight
+				}
+			}
+			dst.Pix[d+0] = uint8(fffftou(pr) >> 8)
+			dst.Pix[d+1] = uint8(fffftou(pg) >> 8)
+			dst.Pix[d+2] = uint8(fffftou(pb) >> 8)
+			dst.Pix[d+3] = uint8(fffftou(pa) >> 8)
+		}
+	}
+}
+
+func (q *Kernel) transform_RGBA_Uniform(dst *image.RGBA, dr, adr image.Rectangle, d2s *f64.Aff3, src *image.Uniform, sr image.Rectangle, xscale, yscale float64) {
+	// When shrinking, broaden the effective kernel support so that we still
+	// visit every source pixel.
+	xHalfWidth, xKernelArgScale := q.Support, 1.0
+	if xscale > 1 {
+		xHalfWidth *= xscale
+		xKernelArgScale = 1 / xscale
+	}
+	yHalfWidth, yKernelArgScale := q.Support, 1.0
+	if yscale > 1 {
+		yHalfWidth *= yscale
+		yKernelArgScale = 1 / yscale
+	}
+
+	xWeights := make([]float64, 1+2*int(math.Ceil(xHalfWidth)))
+	yWeights := make([]float64, 1+2*int(math.Ceil(yHalfWidth)))
+
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		dyf := float64(dr.Min.Y+int(dy)) + 0.5
+		d := dst.PixOffset(dr.Min.X+adr.Min.X, dr.Min.Y+int(dy))
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
+			dxf := float64(dr.Min.X+int(dx)) + 0.5
+			// TODO: change the src origin so that we can say int(f) instead of int(math.Floor(f)).
+			sx := d2s[0]*dxf + d2s[1]*dyf + d2s[2]
+			sy := d2s[3]*dxf + d2s[4]*dyf + d2s[5]
+			if !(image.Point{int(math.Floor(sx)), int(math.Floor(sy))}).In(sr) {
+				continue
+			}
+
+			sx -= 0.5
+			ix := int(math.Floor(sx - xHalfWidth))
+			if ix < sr.Min.X {
+				ix = sr.Min.X
+			}
+			jx := int(math.Ceil(sx + xHalfWidth))
+			if jx > sr.Max.X {
+				jx = sr.Max.X
+			}
+
+			totalXWeight := 0.0
+			for kx := ix; kx < jx; kx++ {
+				xWeight := 0.0
+				if t := abs((sx - float64(kx)) * xKernelArgScale); t < q.Support {
+					xWeight = q.At(t)
+				}
+				xWeights[kx-ix] = xWeight
+				totalXWeight += xWeight
+			}
+			for x := range xWeights[:jx-ix] {
+				xWeights[x] /= totalXWeight
+			}
+
+			sy -= 0.5
+			iy := int(math.Floor(sy - yHalfWidth))
+			if iy < sr.Min.Y {
+				iy = sr.Min.Y
+			}
+			jy := int(math.Ceil(sy + yHalfWidth))
+			if jy > sr.Max.Y {
+				jy = sr.Max.Y
+			}
+
+			totalYWeight := 0.0
+			for ky := iy; ky < jy; ky++ {
+				yWeight := 0.0
+				if t := abs((sy - float64(ky)) * yKernelArgScale); t < q.Support {
+					yWeight = q.At(t)
+				}
+				yWeights[ky-iy] = yWeight
+				totalYWeight += yWeight
+			}
+			for y := range yWeights[:jy-iy] {
+				yWeights[y] /= totalYWeight
+			}
+
+			var pr, pg, pb, pa float64
+			for ky := iy; ky < jy; ky++ {
+				yWeight := yWeights[ky-iy]
+				for kx := ix; kx < jx; kx++ {
+					pru, pgu, pbu, pau := src.At(kx, ky).RGBA()
+					pr += float64(pru) * xWeights[kx-ix] * yWeight
+					pg += float64(pgu) * xWeights[kx-ix] * yWeight
+					pb += float64(pbu) * xWeights[kx-ix] * yWeight
+					pa += float64(pau) * xWeights[kx-ix] * yWeight
+				}
+			}
+			dst.Pix[d+0] = uint8(fffftou(pr) >> 8)
+			dst.Pix[d+1] = uint8(fffftou(pg) >> 8)
+			dst.Pix[d+2] = uint8(fffftou(pb) >> 8)
+			dst.Pix[d+3] = uint8(fffftou(pa) >> 8)
+		}
+	}
+}
+
+func (q *Kernel) transform_RGBA_YCbCr(dst *image.RGBA, dr, adr image.Rectangle, d2s *f64.Aff3, src *image.YCbCr, sr image.Rectangle, xscale, yscale float64) {
+	// When shrinking, broaden the effective kernel support so that we still
+	// visit every source pixel.
+	xHalfWidth, xKernelArgScale := q.Support, 1.0
+	if xscale > 1 {
+		xHalfWidth *= xscale
+		xKernelArgScale = 1 / xscale
+	}
+	yHalfWidth, yKernelArgScale := q.Support, 1.0
+	if yscale > 1 {
+		yHalfWidth *= yscale
+		yKernelArgScale = 1 / yscale
+	}
+
+	xWeights := make([]float64, 1+2*int(math.Ceil(xHalfWidth)))
+	yWeights := make([]float64, 1+2*int(math.Ceil(yHalfWidth)))
+
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		dyf := float64(dr.Min.Y+int(dy)) + 0.5
+		d := dst.PixOffset(dr.Min.X+adr.Min.X, dr.Min.Y+int(dy))
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
+			dxf := float64(dr.Min.X+int(dx)) + 0.5
+			// TODO: change the src origin so that we can say int(f) instead of int(math.Floor(f)).
+			sx := d2s[0]*dxf + d2s[1]*dyf + d2s[2]
+			sy := d2s[3]*dxf + d2s[4]*dyf + d2s[5]
+			if !(image.Point{int(math.Floor(sx)), int(math.Floor(sy))}).In(sr) {
+				continue
+			}
+
+			sx -= 0.5
+			ix := int(math.Floor(sx - xHalfWidth))
+			if ix < sr.Min.X {
+				ix = sr.Min.X
+			}
+			jx := int(math.Ceil(sx + xHalfWidth))
+			if jx > sr.Max.X {
+				jx = sr.Max.X
+			}
+
+			totalXWeight := 0.0
+			for kx := ix; kx < jx; kx++ {
+				xWeight := 0.0
+				if t := abs((sx - float64(kx)) * xKernelArgScale); t < q.Support {
+					xWeight = q.At(t)
+				}
+				xWeights[kx-ix] = xWeight
+				totalXWeight += xWeight
+			}
+			for x := range xWeights[:jx-ix] {
+				xWeights[x] /= totalXWeight
+			}
+
+			sy -= 0.5
+			iy := int(math.Floor(sy - yHalfWidth))
+			if iy < sr.Min.Y {
+				iy = sr.Min.Y
+			}
+			jy := int(math.Ceil(sy + yHalfWidth))
+			if jy > sr.Max.Y {
+				jy = sr.Max.Y
+			}
+
+			totalYWeight := 0.0
+			for ky := iy; ky < jy; ky++ {
+				yWeight := 0.0
+				if t := abs((sy - float64(ky)) * yKernelArgScale); t < q.Support {
+					yWeight = q.At(t)
+				}
+				yWeights[ky-iy] = yWeight
+				totalYWeight += yWeight
+			}
+			for y := range yWeights[:jy-iy] {
+				yWeights[y] /= totalYWeight
+			}
+
+			var pr, pg, pb, pa float64
+			for ky := iy; ky < jy; ky++ {
+				yWeight := yWeights[ky-iy]
+				for kx := ix; kx < jx; kx++ {
+					pru, pgu, pbu, pau := src.At(kx, ky).RGBA()
+					pr += float64(pru) * xWeights[kx-ix] * yWeight
+					pg += float64(pgu) * xWeights[kx-ix] * yWeight
+					pb += float64(pbu) * xWeights[kx-ix] * yWeight
+					pa += float64(pau) * xWeights[kx-ix] * yWeight
+				}
+			}
+			dst.Pix[d+0] = uint8(fffftou(pr) >> 8)
+			dst.Pix[d+1] = uint8(fffftou(pg) >> 8)
+			dst.Pix[d+2] = uint8(fffftou(pb) >> 8)
+			dst.Pix[d+3] = uint8(fffftou(pa) >> 8)
+		}
+	}
+}
+
+func (q *Kernel) transform_RGBA_Image(dst *image.RGBA, dr, adr image.Rectangle, d2s *f64.Aff3, src image.Image, sr image.Rectangle, xscale, yscale float64) {
+	// When shrinking, broaden the effective kernel support so that we still
+	// visit every source pixel.
+	xHalfWidth, xKernelArgScale := q.Support, 1.0
+	if xscale > 1 {
+		xHalfWidth *= xscale
+		xKernelArgScale = 1 / xscale
+	}
+	yHalfWidth, yKernelArgScale := q.Support, 1.0
+	if yscale > 1 {
+		yHalfWidth *= yscale
+		yKernelArgScale = 1 / yscale
+	}
+
+	xWeights := make([]float64, 1+2*int(math.Ceil(xHalfWidth)))
+	yWeights := make([]float64, 1+2*int(math.Ceil(yHalfWidth)))
+
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		dyf := float64(dr.Min.Y+int(dy)) + 0.5
+		d := dst.PixOffset(dr.Min.X+adr.Min.X, dr.Min.Y+int(dy))
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
+			dxf := float64(dr.Min.X+int(dx)) + 0.5
+			// TODO: change the src origin so that we can say int(f) instead of int(math.Floor(f)).
+			sx := d2s[0]*dxf + d2s[1]*dyf + d2s[2]
+			sy := d2s[3]*dxf + d2s[4]*dyf + d2s[5]
+			if !(image.Point{int(math.Floor(sx)), int(math.Floor(sy))}).In(sr) {
+				continue
+			}
+
+			sx -= 0.5
+			ix := int(math.Floor(sx - xHalfWidth))
+			if ix < sr.Min.X {
+				ix = sr.Min.X
+			}
+			jx := int(math.Ceil(sx + xHalfWidth))
+			if jx > sr.Max.X {
+				jx = sr.Max.X
+			}
+
+			totalXWeight := 0.0
+			for kx := ix; kx < jx; kx++ {
+				xWeight := 0.0
+				if t := abs((sx - float64(kx)) * xKernelArgScale); t < q.Support {
+					xWeight = q.At(t)
+				}
+				xWeights[kx-ix] = xWeight
+				totalXWeight += xWeight
+			}
+			for x := range xWeights[:jx-ix] {
+				xWeights[x] /= totalXWeight
+			}
+
+			sy -= 0.5
+			iy := int(math.Floor(sy - yHalfWidth))
+			if iy < sr.Min.Y {
+				iy = sr.Min.Y
+			}
+			jy := int(math.Ceil(sy + yHalfWidth))
+			if jy > sr.Max.Y {
+				jy = sr.Max.Y
+			}
+
+			totalYWeight := 0.0
+			for ky := iy; ky < jy; ky++ {
+				yWeight := 0.0
+				if t := abs((sy - float64(ky)) * yKernelArgScale); t < q.Support {
+					yWeight = q.At(t)
+				}
+				yWeights[ky-iy] = yWeight
+				totalYWeight += yWeight
+			}
+			for y := range yWeights[:jy-iy] {
+				yWeights[y] /= totalYWeight
+			}
+
+			var pr, pg, pb, pa float64
+			for ky := iy; ky < jy; ky++ {
+				yWeight := yWeights[ky-iy]
+				for kx := ix; kx < jx; kx++ {
+					pru, pgu, pbu, pau := src.At(kx, ky).RGBA()
+					pr += float64(pru) * xWeights[kx-ix] * yWeight
+					pg += float64(pgu) * xWeights[kx-ix] * yWeight
+					pb += float64(pbu) * xWeights[kx-ix] * yWeight
+					pa += float64(pau) * xWeights[kx-ix] * yWeight
+				}
+			}
+			dst.Pix[d+0] = uint8(fffftou(pr) >> 8)
+			dst.Pix[d+1] = uint8(fffftou(pg) >> 8)
+			dst.Pix[d+2] = uint8(fffftou(pb) >> 8)
+			dst.Pix[d+3] = uint8(fffftou(pa) >> 8)
+		}
+	}
+}
+
+func (q *Kernel) transform_Image_Image(dst Image, dr, adr image.Rectangle, d2s *f64.Aff3, src image.Image, sr image.Rectangle, xscale, yscale float64) {
+	// When shrinking, broaden the effective kernel support so that we still
+	// visit every source pixel.
+	xHalfWidth, xKernelArgScale := q.Support, 1.0
+	if xscale > 1 {
+		xHalfWidth *= xscale
+		xKernelArgScale = 1 / xscale
+	}
+	yHalfWidth, yKernelArgScale := q.Support, 1.0
+	if yscale > 1 {
+		yHalfWidth *= yscale
+		yKernelArgScale = 1 / yscale
+	}
+
+	xWeights := make([]float64, 1+2*int(math.Ceil(xHalfWidth)))
+	yWeights := make([]float64, 1+2*int(math.Ceil(yHalfWidth)))
+
+	dstColorRGBA64 := &color.RGBA64{}
+	dstColor := color.Color(dstColorRGBA64)
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		dyf := float64(dr.Min.Y+int(dy)) + 0.5
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx++ {
+			dxf := float64(dr.Min.X+int(dx)) + 0.5
+			// TODO: change the src origin so that we can say int(f) instead of int(math.Floor(f)).
+			sx := d2s[0]*dxf + d2s[1]*dyf + d2s[2]
+			sy := d2s[3]*dxf + d2s[4]*dyf + d2s[5]
+			if !(image.Point{int(math.Floor(sx)), int(math.Floor(sy))}).In(sr) {
+				continue
+			}
+
+			sx -= 0.5
+			ix := int(math.Floor(sx - xHalfWidth))
+			if ix < sr.Min.X {
+				ix = sr.Min.X
+			}
+			jx := int(math.Ceil(sx + xHalfWidth))
+			if jx > sr.Max.X {
+				jx = sr.Max.X
+			}
+
+			totalXWeight := 0.0
+			for kx := ix; kx < jx; kx++ {
+				xWeight := 0.0
+				if t := abs((sx - float64(kx)) * xKernelArgScale); t < q.Support {
+					xWeight = q.At(t)
+				}
+				xWeights[kx-ix] = xWeight
+				totalXWeight += xWeight
+			}
+			for x := range xWeights[:jx-ix] {
+				xWeights[x] /= totalXWeight
+			}
+
+			sy -= 0.5
+			iy := int(math.Floor(sy - yHalfWidth))
+			if iy < sr.Min.Y {
+				iy = sr.Min.Y
+			}
+			jy := int(math.Ceil(sy + yHalfWidth))
+			if jy > sr.Max.Y {
+				jy = sr.Max.Y
+			}
+
+			totalYWeight := 0.0
+			for ky := iy; ky < jy; ky++ {
+				yWeight := 0.0
+				if t := abs((sy - float64(ky)) * yKernelArgScale); t < q.Support {
+					yWeight = q.At(t)
+				}
+				yWeights[ky-iy] = yWeight
+				totalYWeight += yWeight
+			}
+			for y := range yWeights[:jy-iy] {
+				yWeights[y] /= totalYWeight
+			}
+
+			var pr, pg, pb, pa float64
+			for ky := iy; ky < jy; ky++ {
+				yWeight := yWeights[ky-iy]
+				for kx := ix; kx < jx; kx++ {
+					pru, pgu, pbu, pau := src.At(kx, ky).RGBA()
+					pr += float64(pru) * xWeights[kx-ix] * yWeight
+					pg += float64(pgu) * xWeights[kx-ix] * yWeight
+					pb += float64(pbu) * xWeights[kx-ix] * yWeight
+					pa += float64(pau) * xWeights[kx-ix] * yWeight
+				}
+			}
+			dstColorRGBA64.R = fffftou(pr)
+			dstColorRGBA64.G = fffftou(pg)
+			dstColorRGBA64.B = fffftou(pb)
+			dstColorRGBA64.A = fffftou(pa)
+			dst.Set(dr.Min.X+int(dx), dr.Min.Y+int(dy), dstColor)
+		}
+	}
+}
diff --git a/draw/scale.go b/draw/scale.go
index 4f94086..d570f6e 100644
--- a/draw/scale.go
+++ b/draw/scale.go
@@ -86,21 +86,21 @@
 }
 
 // Scale implements the Scaler interface.
-func (k *Kernel) Scale(dst Image, dr image.Rectangle, src image.Image, sr image.Rectangle, opts *Options) {
-	k.NewScaler(dr.Dx(), dr.Dy(), sr.Dx(), sr.Dy()).Scale(dst, dr, src, sr, opts)
+func (q *Kernel) Scale(dst Image, dr image.Rectangle, src image.Image, sr image.Rectangle, opts *Options) {
+	q.NewScaler(dr.Dx(), dr.Dy(), sr.Dx(), sr.Dy()).Scale(dst, dr, src, sr, opts)
 }
 
 // NewScaler returns a Scaler that is optimized for scaling multiple times with
 // the same fixed destination and source width and height.
-func (k *Kernel) NewScaler(dw, dh, sw, sh int) Scaler {
+func (q *Kernel) NewScaler(dw, dh, sw, sh int) Scaler {
 	return &kernelScaler{
-		kernel:     k,
+		kernel:     q,
 		dw:         int32(dw),
 		dh:         int32(dh),
 		sw:         int32(sw),
 		sh:         int32(sh),
-		horizontal: newDistrib(k, int32(dw), int32(sw)),
-		vertical:   newDistrib(k, int32(dh), int32(sh)),
+		horizontal: newDistrib(q, int32(dw), int32(sw)),
+		vertical:   newDistrib(q, int32(dh), int32(sh)),
 	}
 }
 
@@ -181,6 +181,8 @@
 func newDistrib(q *Kernel, dw, sw int32) distrib {
 	scale := float64(sw) / float64(dw)
 	halfWidth, kernelArgScale := q.Support, 1.0
+	// When shrinking, broaden the effective kernel support so that we still
+	// visit every source pixel.
 	if scale > 1 {
 		halfWidth *= scale
 		kernelArgScale = 1 / scale
@@ -199,25 +201,22 @@
 			i = 0
 		}
 		j := int32(math.Ceil(center + halfWidth))
-		if j >= sw {
-			j = sw - 1
+		if j > sw {
+			j = sw
 			if j < i {
 				j = i
 			}
 		}
 		sources[x] = source{i: i, j: j, invTotalWeight: center}
-		n += j - i + 1
+		n += j - i
 	}
 
 	contribs := make([]contrib, 0, n)
 	for k, b := range sources {
 		totalWeight := 0.0
 		l := int32(len(contribs))
-		for coord := b.i; coord <= b.j; coord++ {
-			t := (b.invTotalWeight - float64(coord)) * kernelArgScale
-			if t < 0 {
-				t = -t
-			}
+		for coord := b.i; coord < b.j; coord++ {
+			t := abs((b.invTotalWeight - float64(coord)) * kernelArgScale)
 			if t >= q.Support {
 				continue
 			}
@@ -240,11 +239,34 @@
 	return distrib{sources, contribs}
 }
 
+// abs is like math.Abs, but it doesn't care about negative zero, infinities or
+// NaNs.
+func abs(f float64) float64 {
+	if f < 0 {
+		f = -f
+	}
+	return f
+}
+
+// ftou converts the range [0.0, 1.0] to [0, 0xffff].
 func ftou(f float64) uint16 {
 	i := int32(0xffff*f + 0.5)
 	if i > 0xffff {
 		return 0xffff
-	} else if i > 0 {
+	}
+	if i > 0 {
+		return uint16(i)
+	}
+	return 0
+}
+
+// fffftou converts the range [0.0, 65535.0] to [0, 0xffff].
+func fffftou(f float64) uint16 {
+	i := int32(f + 0.5)
+	if i > 0xffff {
+		return 0xffff
+	}
+	if i > 0 {
 		return uint16(i)
 	}
 	return 0
@@ -275,6 +297,17 @@
 	}
 }
 
+func matMul(p, q *f64.Aff3) f64.Aff3 {
+	return f64.Aff3{
+		p[3*0+0]*q[3*0+0] + p[3*0+1]*q[3*1+0],
+		p[3*0+0]*q[3*0+1] + p[3*0+1]*q[3*1+1],
+		p[3*0+0]*q[3*0+2] + p[3*0+1]*q[3*1+2] + p[3*0+2],
+		p[3*1+0]*q[3*0+0] + p[3*1+1]*q[3*1+0],
+		p[3*1+0]*q[3*0+1] + p[3*1+1]*q[3*1+1],
+		p[3*1+0]*q[3*0+2] + p[3*1+1]*q[3*1+2] + p[3*1+2],
+	}
+}
+
 // transformRect returns a rectangle dr that contains sr transformed by s2d.
 func transformRect(s2d *f64.Aff3, sr *image.Rectangle) (dr image.Rectangle) {
 	ps := [...]image.Point{
diff --git a/draw/scale_test.go b/draw/scale_test.go
index c15810e..4ed60d6 100644
--- a/draw/scale_test.go
+++ b/draw/scale_test.go
@@ -23,13 +23,25 @@
 
 var genGoldenFiles = flag.Bool("gen_golden_files", false, "whether to generate the TestXxx golden files.")
 
-var transformMatrix = func() *f64.Aff3 {
+var transformMatrix = func(tx, ty float64) *f64.Aff3 {
 	const scale, cos30, sin30 = 3.75, 0.866025404, 0.5
 	return &f64.Aff3{
-		+scale * cos30, -scale * sin30, 40,
-		+scale * sin30, +scale * cos30, 10,
+		+scale * cos30, -scale * sin30, tx,
+		+scale * sin30, +scale * cos30, ty,
 	}
-}()
+}
+
+func encode(filename string, m image.Image) error {
+	f, err := os.Create(filename)
+	if err != nil {
+		return fmt.Errorf("Create: %v", err)
+	}
+	defer f.Close()
+	if err := png.Encode(f, m); err != nil {
+		return fmt.Errorf("Encode: %v", err)
+	}
+	return nil
+}
 
 // testInterp tests that interpolating the source image gives the exact
 // destination image. This is to ensure that any refactoring or optimization of
@@ -58,25 +70,14 @@
 
 		got := image.NewRGBA(image.Rect(0, 0, w, h))
 		if direction == "rotate" {
-			if name == "bl" || name == "cr" {
-				// TODO: implement Kernel.Transform.
-				continue
-			}
-			q.Transform(got, transformMatrix, src, src.Bounds(), nil)
+			q.Transform(got, transformMatrix(40, 10), src, src.Bounds(), nil)
 		} else {
 			q.Scale(got, got.Bounds(), src, src.Bounds(), nil)
 		}
 
 		if *genGoldenFiles {
-			g, err := os.Create(goldenFilename)
-			if err != nil {
-				t.Errorf("Create: %v", err)
-				continue
-			}
-			defer g.Close()
-			if err := png.Encode(g, got); err != nil {
-				t.Errorf("Encode: %v", err)
-				continue
+			if err := encode(goldenFilename, got); err != nil {
+				t.Error(err)
 			}
 			continue
 		}
@@ -132,11 +133,6 @@
 	}
 	for _, transform := range []bool{false, true} {
 		for _, q := range qs {
-			if transform && q == CatmullRom {
-				// TODO: implement Kernel.Transform.
-				continue
-			}
-
 			dst0 := image.NewRGBA(image.Rect(1, 1, 10, 10))
 			dst1 := image.NewRGBA(image.Rect(1, 1, 10, 10))
 			for i := range dst0.Pix {
@@ -147,7 +143,7 @@
 			var interp func(dst *image.RGBA)
 			if transform {
 				interp = func(dst *image.RGBA) {
-					q.Transform(dst, transformMatrix, src, src.Bounds(), nil)
+					q.Transform(dst, transformMatrix(2, 1), src, src.Bounds(), nil)
 				}
 			} else {
 				interp = func(dst *image.RGBA) {
@@ -216,20 +212,32 @@
 		{-8, +8},
 		{-8, -8},
 	}
+	m00 := transformMatrix(0, 0)
 
-	for _, q := range qs {
-		want := image.NewRGBA(image.Rect(0, 0, 200, 200))
-		q.Scale(want, want.Bounds(), src, src.Bounds(), nil)
-		for _, delta := range deltas {
-			tsrc := &translatedImage{src, delta}
-
-			got := image.NewRGBA(image.Rect(0, 0, 200, 200))
-			q.Scale(got, got.Bounds(), tsrc, tsrc.Bounds(), nil)
-			if !bytes.Equal(got.Pix, want.Pix) {
-				t.Errorf("pix differ for delta=%v, q=%T", delta, q)
+	for _, transform := range []bool{false, true} {
+		for _, q := range qs {
+			want := image.NewRGBA(image.Rect(0, 0, 200, 200))
+			if transform {
+				q.Transform(want, m00, src, src.Bounds(), nil)
+			} else {
+				q.Scale(want, want.Bounds(), src, src.Bounds(), nil)
 			}
-
-			// TODO: Transform, once Kernel.Transform is implemented.
+			for _, delta := range deltas {
+				tsrc := &translatedImage{src, delta}
+				got := image.NewRGBA(image.Rect(0, 0, 200, 200))
+				if transform {
+					m := matMul(m00, &f64.Aff3{
+						1, 0, -float64(delta.X),
+						0, 1, -float64(delta.Y),
+					})
+					q.Transform(got, &m, tsrc, tsrc.Bounds(), nil)
+				} else {
+					q.Scale(got, got.Bounds(), tsrc, tsrc.Bounds(), nil)
+				}
+				if !bytes.Equal(got.Pix, want.Pix) {
+					t.Errorf("pix differ for delta=%v, transform=%t, q=%T", delta, transform, q)
+				}
+			}
 		}
 	}
 }
@@ -285,18 +293,27 @@
 	for _, dr := range drs {
 		for _, src := range srcs {
 			for _, sr := range srs {
-				for _, q := range qs {
-					dst0 := image.NewRGBA(drs[0])
-					dst1 := image.NewRGBA(drs[0])
-					Draw(dst0, dst0.Bounds(), blue, image.Point{}, Src)
-					Draw(dstWrapper{dst1}, dst1.Bounds(), srcWrapper{blue}, image.Point{}, Src)
-					q.Scale(dst0, dr, src, sr, nil)
-					q.Scale(dstWrapper{dst1}, dr, srcWrapper{src}, sr, nil)
-					if !bytes.Equal(dst0.Pix, dst1.Pix) {
-						t.Errorf("pix differ for dr=%v, src=%T, sr=%v, q=%T", dr, src, sr, q)
-					}
+				for _, transform := range []bool{false, true} {
+					for _, q := range qs {
+						dst0 := image.NewRGBA(drs[0])
+						dst1 := image.NewRGBA(drs[0])
+						Draw(dst0, dst0.Bounds(), blue, image.Point{}, Src)
+						Draw(dstWrapper{dst1}, dst1.Bounds(), srcWrapper{blue}, image.Point{}, Src)
 
-					// TODO: Transform, once Kernel.Transform is implemented.
+						if transform {
+							m := transformMatrix(2, 1)
+							q.Transform(dst0, m, src, sr, nil)
+							q.Transform(dstWrapper{dst1}, m, srcWrapper{src}, sr, nil)
+						} else {
+							q.Scale(dst0, dr, src, sr, nil)
+							q.Scale(dstWrapper{dst1}, dr, srcWrapper{src}, sr, nil)
+						}
+
+						if !bytes.Equal(dst0.Pix, dst1.Pix) {
+							t.Errorf("pix differ for dr=%v, src=%T, sr=%v, transform=%t, q=%T",
+								dr, src, sr, transform, q)
+						}
+					}
 				}
 			}
 		}
@@ -385,10 +402,11 @@
 		b.Fatal(err)
 	}
 	sr := src.Bounds()
+	m := transformMatrix(40, 10)
 
 	b.ResetTimer()
 	for i := 0; i < b.N; i++ {
-		q.Transform(dst, transformMatrix, src, sr, nil)
+		q.Transform(dst, m, src, sr, nil)
 	}
 }
 
@@ -413,8 +431,14 @@
 func BenchmarkScaleSrcUniform(b *testing.B) { benchScale(b, srcUniform, 200, 150, ApproxBiLinear) }
 func BenchmarkScaleSrcYCbCr(b *testing.B)   { benchScale(b, srcYCbCr, 200, 150, ApproxBiLinear) }
 
-func BenchmarkTformSrcGray(b *testing.B)    { benchTform(b, srcGray, 200, 150, ApproxBiLinear) }
-func BenchmarkTformSrcNRGBA(b *testing.B)   { benchTform(b, srcNRGBA, 200, 150, ApproxBiLinear) }
-func BenchmarkTformSrcRGBA(b *testing.B)    { benchTform(b, srcRGBA, 200, 150, ApproxBiLinear) }
-func BenchmarkTformSrcUniform(b *testing.B) { benchTform(b, srcUniform, 200, 150, ApproxBiLinear) }
-func BenchmarkTformSrcYCbCr(b *testing.B)   { benchTform(b, srcYCbCr, 200, 150, ApproxBiLinear) }
+func BenchmarkTformABSrcGray(b *testing.B)    { benchTform(b, srcGray, 200, 150, ApproxBiLinear) }
+func BenchmarkTformABSrcNRGBA(b *testing.B)   { benchTform(b, srcNRGBA, 200, 150, ApproxBiLinear) }
+func BenchmarkTformABSrcRGBA(b *testing.B)    { benchTform(b, srcRGBA, 200, 150, ApproxBiLinear) }
+func BenchmarkTformABSrcUniform(b *testing.B) { benchTform(b, srcUniform, 200, 150, ApproxBiLinear) }
+func BenchmarkTformABSrcYCbCr(b *testing.B)   { benchTform(b, srcYCbCr, 200, 150, ApproxBiLinear) }
+
+func BenchmarkTformCRSrcGray(b *testing.B)    { benchTform(b, srcGray, 200, 150, CatmullRom) }
+func BenchmarkTformCRSrcNRGBA(b *testing.B)   { benchTform(b, srcNRGBA, 200, 150, CatmullRom) }
+func BenchmarkTformCRSrcRGBA(b *testing.B)    { benchTform(b, srcRGBA, 200, 150, CatmullRom) }
+func BenchmarkTformCRSrcUniform(b *testing.B) { benchTform(b, srcUniform, 200, 150, CatmullRom) }
+func BenchmarkTformCRSrcYCbCr(b *testing.B)   { benchTform(b, srcYCbCr, 200, 150, CatmullRom) }
diff --git a/testdata/go-turns-two-rotate-bl.png b/testdata/go-turns-two-rotate-bl.png
new file mode 100644
index 0000000..b4e1279
--- /dev/null
+++ b/testdata/go-turns-two-rotate-bl.png
Binary files differ
diff --git a/testdata/go-turns-two-rotate-cr.png b/testdata/go-turns-two-rotate-cr.png
new file mode 100644
index 0000000..0b64d02
--- /dev/null
+++ b/testdata/go-turns-two-rotate-cr.png
Binary files differ