draw: implement NearestNeighbor and ApproxBiLinear Transform.

Change-Id: I70a5e3703dea436354e9591fce7b704ec749c2d1
Reviewed-on: https://go-review.googlesource.com/7541
Reviewed-by: Rob Pike <r@golang.org>
diff --git a/draw/example_test.go b/draw/example_test.go
index 02e4c5d..f8545ad 100644
--- a/draw/example_test.go
+++ b/draw/example_test.go
@@ -9,7 +9,6 @@
 	"image"
 	"image/png"
 	"log"
-	"math"
 	"os"
 
 	"golang.org/x/image/draw"
@@ -34,19 +33,17 @@
 		draw.ApproxBiLinear,
 		draw.CatmullRom,
 	}
-	c, s := math.Cos(math.Pi/3), math.Sin(math.Pi/3)
+	const cos60, sin60 = 0.5, 0.866025404
 	t := &f64.Aff3{
-		+2 * c, -2 * s, 100,
-		+2 * s, +2 * c, 100,
+		+2 * cos60, -2 * sin60, 100,
+		+2 * sin60, +2 * cos60, 100,
 	}
 
 	draw.Copy(dst, image.Point{20, 30}, src, sr, nil)
 	for i, q := range qs {
 		q.Scale(dst, image.Rect(200+10*i, 100*i, 600+10*i, 150+100*i), src, sr, nil)
 	}
-	// TODO: delete the "_ = t" and uncomment this when Transform is implemented.
-	// draw.NearestNeighbor.Transform(dst, t, src, sr, nil)
-	_ = t
+	draw.NearestNeighbor.Transform(dst, t, src, sr, nil)
 
 	// Change false to true to write the resultant image to disk.
 	if false {
diff --git a/draw/gen.go b/draw/gen.go
index bc11b3e..587b969 100644
--- a/draw/gen.go
+++ b/draw/gen.go
@@ -27,12 +27,13 @@
 		"package draw\n\nimport (\n" +
 		"\"image\"\n" +
 		"\"image/color\"\n" +
+		"\"math\"\n" +
 		"\n" +
 		"\"golang.org/x/image/math/f64\"\n" +
 		")\n")
 
-	gen(w, "nnInterpolator", codeNNScaleLeaf)
-	gen(w, "ablInterpolator", codeABLScaleLeaf)
+	gen(w, "nnInterpolator", codeNNScaleLeaf, codeNNTransformLeaf)
+	gen(w, "ablInterpolator", codeABLScaleLeaf, codeABLTransformLeaf)
 	genKernel(w)
 
 	if *debug {
@@ -90,14 +91,16 @@
 	receiver string
 }
 
-func gen(w *bytes.Buffer, receiver string, code string) {
+func gen(w *bytes.Buffer, receiver string, codes ...string) {
 	expn(w, codeRoot, &data{receiver: receiver})
-	for _, t := range dsTypes {
-		expn(w, code, &data{
-			dType:    t.dType,
-			sType:    t.sType,
-			receiver: receiver,
-		})
+	for _, code := range codes {
+		for _, t := range dsTypes {
+			expn(w, code, &data{
+				dType:    t.dType,
+				sType:    t.sType,
+				receiver: receiver,
+			})
+		}
 	}
 }
 
@@ -227,7 +230,7 @@
 				"dstColorRGBA64.G = uint16(%sg)\n"+
 				"dstColorRGBA64.B = uint16(%sb)\n"+
 				"dstColorRGBA64.A = uint16(%sa)\n"+
-				"dst.Set(dr.Min.X+int(%s), dr.Min.Y+int(%s), dstColor)",
+				"dst.Set(%s, %s, dstColor)",
 				args[2], args[2], args[2], args[2],
 				args[0], args[1],
 			)
@@ -236,8 +239,7 @@
 				"dst.Pix[d+0] = uint8(uint32(%sr) >> 8)\n"+
 				"dst.Pix[d+1] = uint8(uint32(%sg) >> 8)\n"+
 				"dst.Pix[d+2] = uint8(uint32(%sb) >> 8)\n"+
-				"dst.Pix[d+3] = uint8(uint32(%sa) >> 8)\n"+
-				"d += 4",
+				"dst.Pix[d+3] = uint8(uint32(%sa) >> 8)",
 				args[2], args[2], args[2], args[2],
 			)
 		}
@@ -256,7 +258,7 @@
 				"dstColorRGBA64.G = ftou(%sg * %s)\n"+
 				"dstColorRGBA64.B = ftou(%sb * %s)\n"+
 				"dstColorRGBA64.A = ftou(%sa * %s)\n"+
-				"dst.Set(dr.Min.X+int(%s), dr.Min.Y+int(%s), dstColor)",
+				"dst.Set(%s, %s, dstColor)",
 				args[2], args[3], args[2], args[3], args[2], args[3], args[2], args[3],
 				args[0], args[1],
 			)
@@ -292,14 +294,14 @@
 			log.Fatalf("bad sType %q", d.sType)
 		case "image.Image", "*image.Gray", "*image.NRGBA", "*image.Uniform", "*image.YCbCr": // TODO: separate code for concrete types.
 			fmt.Fprintf(buf, "%sr%s, %sg%s, %sb%s, %sa%s := "+
-				"src.At(sr.Min.X + int(%s), sr.Min.Y+int(%s)).RGBA()\n",
+				"src.At(%s, %s).RGBA()\n",
 				lhs, tmp, lhs, tmp, lhs, tmp, lhs, tmp,
 				args[0], args[1],
 			)
 		case "*image.RGBA":
 			// TODO: there's no need to multiply by 0x101 if the next thing
 			// we're going to do is shift right by 8.
-			fmt.Fprintf(buf, "%si := src.PixOffset(sr.Min.X + int(%s), sr.Min.Y+int(%s))\n"+
+			fmt.Fprintf(buf, "%si := src.PixOffset(%s, %s)\n"+
 				"%sr%s := uint32(src.Pix[%si+0]) * 0x101\n"+
 				"%sg%s := uint32(src.Pix[%si+1]) * 0x101\n"+
 				"%sb%s := uint32(src.Pix[%si+2]) * 0x101\n"+
@@ -327,6 +329,12 @@
 
 		return strings.TrimSpace(buf.String())
 
+	case "tweakDx":
+		if d.dType == "*image.RGBA" {
+			return strings.Replace(suffix, "dx++", "dx, d = dx+1, d+4", 1)
+		}
+		return suffix
+
 	case "tweakDy":
 		if d.dType == "*image.RGBA" {
 			return strings.Replace(suffix, "for dy, s", "for _, s", 1)
@@ -428,8 +436,15 @@
 			}
 		}
 
-		func (z $receiver) Transform(dst Image, m *f64.Aff3, src image.Image, sr image.Rectangle, opts *Options) {
-			panic("unimplemented")
+		func (z $receiver) Transform(dst Image, s2d *f64.Aff3, src image.Image, sr image.Rectangle, opts *Options) {
+			dr := transformRect(s2d, &sr)
+			// adr is the affected destination pixels, relative to dr.Min.
+			adr := dst.Bounds().Intersect(dr).Sub(dr.Min)
+			if adr.Empty() || sr.Empty() {
+				return
+			}
+			d2s := invert(s2d)
+			z.transform_Image_Image(dst, dr, adr, &d2s, src, sr)
 		}
 	`
 
@@ -443,10 +458,30 @@
 			for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
 				sy := (2*uint64(dy) + 1) * sh / dh2
 				$preInner
-				for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx++ {
+				$tweakDx for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx++ {
 					sx := (2*uint64(dx) + 1) * sw / dw2
-					p := $srcu[sx, sy]
-					$outputu[dx, dy, p]
+					p := $srcu[sr.Min.X + int(sx), sr.Min.Y + int(sy)]
+					$outputu[dr.Min.X + int(dx), dr.Min.Y + int(dy), p]
+				}
+			}
+		}
+	`
+
+	codeNNTransformLeaf = `
+		func (nnInterpolator) transform_$dTypeRN_$sTypeRN(dst $dType, dr, adr image.Rectangle, d2s *f64.Aff3, src $sType, sr image.Rectangle) {
+			$preOuter
+			for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+				dyf := float64(dr.Min.Y + int(dy)) + 0.5
+				$preInner
+				$tweakDx for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx++ {
+					dxf := float64(dr.Min.X + int(dx)) + 0.5
+					sx0 := int(math.Floor(d2s[0]*dxf + d2s[1]*dyf + d2s[2]))
+					sy0 := int(math.Floor(d2s[3]*dxf + d2s[4]*dyf + d2s[5]))
+					if !(image.Point{sx0, sy0}).In(sr) {
+						continue
+					}
+					p := $srcu[sx0, sy0]
+					$outputu[dr.Min.X + int(dx), dr.Min.Y + int(dy), p]
 				}
 			}
 		}
@@ -458,9 +493,14 @@
 			sh := int32(sr.Dy())
 			yscale := float64(sh) / float64(dr.Dy())
 			xscale := float64(sw) / float64(dr.Dx())
+			swMinus1, shMinus1 := sw - 1, sh - 1
 			$preOuter
+
 			for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
 				sy := (float64(dy)+0.5)*yscale - 0.5
+				// If sy < 0, we will clamp sy0 to 0 anyway, so it doesn't matter if
+				// we say int32(sy) instead of int32(math.Floor(sy)). Similarly for
+				// sx, below.
 				sy0 := int32(sy)
 				yFrac0 := sy - float64(sy0)
 				yFrac1 := 1 - yFrac0
@@ -468,12 +508,13 @@
 				if sy < 0 {
 					sy0, sy1 = 0, 0
 					yFrac0, yFrac1 = 0, 1
-				} else if sy1 >= sh {
-					sy1 = sy0
+				} else if sy1 > shMinus1 {
+					sy0, sy1 = shMinus1, shMinus1
 					yFrac0, yFrac1 = 1, 0
 				}
 				$preInner
-				for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx++ {
+
+				$tweakDx for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx++ {
 					sx := (float64(dx)+0.5)*xscale - 0.5
 					sx0 := int32(sx)
 					xFrac0 := sx - float64(sx0)
@@ -482,10 +523,66 @@
 					if sx < 0 {
 						sx0, sx1 = 0, 0
 						xFrac0, xFrac1 = 0, 1
-					} else if sx1 >= sw {
-						sx1 = sx0
+					} else if sx1 > swMinus1 {
+						sx0, sx1 = swMinus1, swMinus1
 						xFrac0, xFrac1 = 1, 0
 					}
+
+					s00 := $srcf[sr.Min.X + int(sx0), sr.Min.Y + int(sy0)]
+					s10 := $srcf[sr.Min.X + int(sx1), sr.Min.Y + int(sy0)]
+					$blend[xFrac1, s00, xFrac0, s10]
+					s01 := $srcf[sr.Min.X + int(sx0), sr.Min.Y + int(sy1)]
+					s11 := $srcf[sr.Min.X + int(sx1), sr.Min.Y + int(sy1)]
+					$blend[xFrac1, s01, xFrac0, s11]
+					$blend[yFrac1, s10, yFrac0, s11]
+					$outputu[dr.Min.X + int(dx), dr.Min.Y + int(dy), s11]
+				}
+			}
+		}
+	`
+
+	codeABLTransformLeaf = `
+		func (ablInterpolator) transform_$dTypeRN_$sTypeRN(dst $dType, dr, adr image.Rectangle, d2s *f64.Aff3, src $sType, sr image.Rectangle) {
+			$preOuter
+			for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+				dyf := float64(dr.Min.Y + int(dy)) + 0.5
+				$preInner
+				$tweakDx for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx++ {
+					dxf := float64(dr.Min.X + int(dx)) + 0.5
+					sx := d2s[0]*dxf + d2s[1]*dyf + d2s[2]
+					sy := d2s[3]*dxf + d2s[4]*dyf + d2s[5]
+					if !(image.Point{int(math.Floor(sx)), int(math.Floor(sy))}).In(sr) {
+						continue
+					}
+
+					sx -= 0.5
+					sxf := math.Floor(sx)
+					xFrac0 := sx - sxf
+					xFrac1 := 1 - xFrac0
+					sx0 := int(sxf)
+					sx1 := sx0 + 1
+					if sx0 < sr.Min.X {
+						sx0, sx1 = sr.Min.X, sr.Min.X
+						xFrac0, xFrac1 = 0, 1
+					} else if sx1 >= sr.Max.X {
+						sx0, sx1 = sr.Max.X-1, sr.Max.X-1
+						xFrac0, xFrac1 = 1, 0
+					}
+
+					sy -= 0.5
+					syf := math.Floor(sy)
+					yFrac0 := sy - syf
+					yFrac1 := 1 - yFrac0
+					sy0 := int(syf)
+					sy1 := sy0 + 1
+					if sy0 < sr.Min.Y {
+						sy0, sy1 = sr.Min.Y, sr.Min.Y
+						yFrac0, yFrac1 = 0, 1
+					} else if sy1 >= sr.Max.Y {
+						sy0, sy1 = sr.Max.Y-1, sr.Max.Y-1
+						yFrac0, yFrac1 = 1, 0
+					}
+
 					s00 := $srcf[sx0, sy0]
 					s10 := $srcf[sx1, sy0]
 					$blend[xFrac1, s00, xFrac0, s10]
@@ -493,7 +590,7 @@
 					s11 := $srcf[sx1, sy1]
 					$blend[xFrac1, s01, xFrac0, s11]
 					$blend[yFrac1, s10, yFrac0, s11]
-					$outputu[dx, dy, s11]
+					$outputu[dr.Min.X + int(dx), dr.Min.Y + int(dy), s11]
 				}
 			}
 		}
@@ -540,7 +637,7 @@
 				for _, s := range z.horizontal.sources {
 					var pr, pg, pb, pa float64
 					for _, c := range z.horizontal.contribs[s.i:s.j] {
-						p += $srcf[c.coord, y] * c.weight
+						p += $srcf[sr.Min.X + int(c.coord), sr.Min.Y + int(y)] * c.weight
 					}
 					tmp[t] = [4]float64{
 						pr * s.invTotalWeightFFFF,
@@ -568,7 +665,7 @@
 						pb += p[2] * c.weight
 						pa += p[3] * c.weight
 					}
-					$outputf[dx, adr.Min.Y+dy, p, s.invTotalWeight]
+					$outputf[dr.Min.X + int(dx), dr.Min.Y + int(adr.Min.Y + dy), p, s.invTotalWeight]
 				}
 			}
 		}
diff --git a/draw/impl.go b/draw/impl.go
index ff9f988..cba9349 100644
--- a/draw/impl.go
+++ b/draw/impl.go
@@ -5,6 +5,7 @@
 import (
 	"image"
 	"image/color"
+	"math"
 
 	"golang.org/x/image/math/f64"
 )
@@ -46,8 +47,15 @@
 	}
 }
 
-func (z nnInterpolator) Transform(dst Image, m *f64.Aff3, src image.Image, sr image.Rectangle, opts *Options) {
-	panic("unimplemented")
+func (z nnInterpolator) Transform(dst Image, s2d *f64.Aff3, src image.Image, sr image.Rectangle, opts *Options) {
+	dr := transformRect(s2d, &sr)
+	// adr is the affected destination pixels, relative to dr.Min.
+	adr := dst.Bounds().Intersect(dr).Sub(dr.Min)
+	if adr.Empty() || sr.Empty() {
+		return
+	}
+	d2s := invert(s2d)
+	z.transform_Image_Image(dst, dr, adr, &d2s, src, sr)
 }
 
 func (nnInterpolator) scale_RGBA_Gray(dst *image.RGBA, dr, adr image.Rectangle, src *image.Gray, sr image.Rectangle) {
@@ -58,14 +66,13 @@
 	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
 		sy := (2*uint64(dy) + 1) * sh / dh2
 		d := dst.PixOffset(dr.Min.X+adr.Min.X, dr.Min.Y+int(dy))
-		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx++ {
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
 			sx := (2*uint64(dx) + 1) * sw / dw2
 			pr, pg, pb, pa := src.At(sr.Min.X+int(sx), sr.Min.Y+int(sy)).RGBA()
 			dst.Pix[d+0] = uint8(uint32(pr) >> 8)
 			dst.Pix[d+1] = uint8(uint32(pg) >> 8)
 			dst.Pix[d+2] = uint8(uint32(pb) >> 8)
 			dst.Pix[d+3] = uint8(uint32(pa) >> 8)
-			d += 4
 		}
 	}
 }
@@ -78,14 +85,13 @@
 	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
 		sy := (2*uint64(dy) + 1) * sh / dh2
 		d := dst.PixOffset(dr.Min.X+adr.Min.X, dr.Min.Y+int(dy))
-		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx++ {
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
 			sx := (2*uint64(dx) + 1) * sw / dw2
 			pr, pg, pb, pa := src.At(sr.Min.X+int(sx), sr.Min.Y+int(sy)).RGBA()
 			dst.Pix[d+0] = uint8(uint32(pr) >> 8)
 			dst.Pix[d+1] = uint8(uint32(pg) >> 8)
 			dst.Pix[d+2] = uint8(uint32(pb) >> 8)
 			dst.Pix[d+3] = uint8(uint32(pa) >> 8)
-			d += 4
 		}
 	}
 }
@@ -98,7 +104,7 @@
 	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
 		sy := (2*uint64(dy) + 1) * sh / dh2
 		d := dst.PixOffset(dr.Min.X+adr.Min.X, dr.Min.Y+int(dy))
-		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx++ {
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
 			sx := (2*uint64(dx) + 1) * sw / dw2
 			pi := src.PixOffset(sr.Min.X+int(sx), sr.Min.Y+int(sy))
 			pr := uint32(src.Pix[pi+0]) * 0x101
@@ -109,7 +115,6 @@
 			dst.Pix[d+1] = uint8(uint32(pg) >> 8)
 			dst.Pix[d+2] = uint8(uint32(pb) >> 8)
 			dst.Pix[d+3] = uint8(uint32(pa) >> 8)
-			d += 4
 		}
 	}
 }
@@ -122,14 +127,13 @@
 	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
 		sy := (2*uint64(dy) + 1) * sh / dh2
 		d := dst.PixOffset(dr.Min.X+adr.Min.X, dr.Min.Y+int(dy))
-		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx++ {
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
 			sx := (2*uint64(dx) + 1) * sw / dw2
 			pr, pg, pb, pa := src.At(sr.Min.X+int(sx), sr.Min.Y+int(sy)).RGBA()
 			dst.Pix[d+0] = uint8(uint32(pr) >> 8)
 			dst.Pix[d+1] = uint8(uint32(pg) >> 8)
 			dst.Pix[d+2] = uint8(uint32(pb) >> 8)
 			dst.Pix[d+3] = uint8(uint32(pa) >> 8)
-			d += 4
 		}
 	}
 }
@@ -142,14 +146,13 @@
 	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
 		sy := (2*uint64(dy) + 1) * sh / dh2
 		d := dst.PixOffset(dr.Min.X+adr.Min.X, dr.Min.Y+int(dy))
-		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx++ {
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
 			sx := (2*uint64(dx) + 1) * sw / dw2
 			pr, pg, pb, pa := src.At(sr.Min.X+int(sx), sr.Min.Y+int(sy)).RGBA()
 			dst.Pix[d+0] = uint8(uint32(pr) >> 8)
 			dst.Pix[d+1] = uint8(uint32(pg) >> 8)
 			dst.Pix[d+2] = uint8(uint32(pb) >> 8)
 			dst.Pix[d+3] = uint8(uint32(pa) >> 8)
-			d += 4
 		}
 	}
 }
@@ -162,14 +165,13 @@
 	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
 		sy := (2*uint64(dy) + 1) * sh / dh2
 		d := dst.PixOffset(dr.Min.X+adr.Min.X, dr.Min.Y+int(dy))
-		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx++ {
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
 			sx := (2*uint64(dx) + 1) * sw / dw2
 			pr, pg, pb, pa := src.At(sr.Min.X+int(sx), sr.Min.Y+int(sy)).RGBA()
 			dst.Pix[d+0] = uint8(uint32(pr) >> 8)
 			dst.Pix[d+1] = uint8(uint32(pg) >> 8)
 			dst.Pix[d+2] = uint8(uint32(pb) >> 8)
 			dst.Pix[d+3] = uint8(uint32(pa) >> 8)
-			d += 4
 		}
 	}
 }
@@ -195,6 +197,152 @@
 	}
 }
 
+func (nnInterpolator) transform_RGBA_Gray(dst *image.RGBA, dr, adr image.Rectangle, d2s *f64.Aff3, src *image.Gray, sr image.Rectangle) {
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		dyf := float64(dr.Min.Y+int(dy)) + 0.5
+		d := dst.PixOffset(dr.Min.X+adr.Min.X, dr.Min.Y+int(dy))
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
+			dxf := float64(dr.Min.X+int(dx)) + 0.5
+			sx0 := int(math.Floor(d2s[0]*dxf + d2s[1]*dyf + d2s[2]))
+			sy0 := int(math.Floor(d2s[3]*dxf + d2s[4]*dyf + d2s[5]))
+			if !(image.Point{sx0, sy0}).In(sr) {
+				continue
+			}
+			pr, pg, pb, pa := src.At(sx0, sy0).RGBA()
+			dst.Pix[d+0] = uint8(uint32(pr) >> 8)
+			dst.Pix[d+1] = uint8(uint32(pg) >> 8)
+			dst.Pix[d+2] = uint8(uint32(pb) >> 8)
+			dst.Pix[d+3] = uint8(uint32(pa) >> 8)
+		}
+	}
+}
+
+func (nnInterpolator) transform_RGBA_NRGBA(dst *image.RGBA, dr, adr image.Rectangle, d2s *f64.Aff3, src *image.NRGBA, sr image.Rectangle) {
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		dyf := float64(dr.Min.Y+int(dy)) + 0.5
+		d := dst.PixOffset(dr.Min.X+adr.Min.X, dr.Min.Y+int(dy))
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
+			dxf := float64(dr.Min.X+int(dx)) + 0.5
+			sx0 := int(math.Floor(d2s[0]*dxf + d2s[1]*dyf + d2s[2]))
+			sy0 := int(math.Floor(d2s[3]*dxf + d2s[4]*dyf + d2s[5]))
+			if !(image.Point{sx0, sy0}).In(sr) {
+				continue
+			}
+			pr, pg, pb, pa := src.At(sx0, sy0).RGBA()
+			dst.Pix[d+0] = uint8(uint32(pr) >> 8)
+			dst.Pix[d+1] = uint8(uint32(pg) >> 8)
+			dst.Pix[d+2] = uint8(uint32(pb) >> 8)
+			dst.Pix[d+3] = uint8(uint32(pa) >> 8)
+		}
+	}
+}
+
+func (nnInterpolator) transform_RGBA_RGBA(dst *image.RGBA, dr, adr image.Rectangle, d2s *f64.Aff3, src *image.RGBA, sr image.Rectangle) {
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		dyf := float64(dr.Min.Y+int(dy)) + 0.5
+		d := dst.PixOffset(dr.Min.X+adr.Min.X, dr.Min.Y+int(dy))
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
+			dxf := float64(dr.Min.X+int(dx)) + 0.5
+			sx0 := int(math.Floor(d2s[0]*dxf + d2s[1]*dyf + d2s[2]))
+			sy0 := int(math.Floor(d2s[3]*dxf + d2s[4]*dyf + d2s[5]))
+			if !(image.Point{sx0, sy0}).In(sr) {
+				continue
+			}
+			pi := src.PixOffset(sx0, sy0)
+			pr := uint32(src.Pix[pi+0]) * 0x101
+			pg := uint32(src.Pix[pi+1]) * 0x101
+			pb := uint32(src.Pix[pi+2]) * 0x101
+			pa := uint32(src.Pix[pi+3]) * 0x101
+			dst.Pix[d+0] = uint8(uint32(pr) >> 8)
+			dst.Pix[d+1] = uint8(uint32(pg) >> 8)
+			dst.Pix[d+2] = uint8(uint32(pb) >> 8)
+			dst.Pix[d+3] = uint8(uint32(pa) >> 8)
+		}
+	}
+}
+
+func (nnInterpolator) transform_RGBA_Uniform(dst *image.RGBA, dr, adr image.Rectangle, d2s *f64.Aff3, src *image.Uniform, sr image.Rectangle) {
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		dyf := float64(dr.Min.Y+int(dy)) + 0.5
+		d := dst.PixOffset(dr.Min.X+adr.Min.X, dr.Min.Y+int(dy))
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
+			dxf := float64(dr.Min.X+int(dx)) + 0.5
+			sx0 := int(math.Floor(d2s[0]*dxf + d2s[1]*dyf + d2s[2]))
+			sy0 := int(math.Floor(d2s[3]*dxf + d2s[4]*dyf + d2s[5]))
+			if !(image.Point{sx0, sy0}).In(sr) {
+				continue
+			}
+			pr, pg, pb, pa := src.At(sx0, sy0).RGBA()
+			dst.Pix[d+0] = uint8(uint32(pr) >> 8)
+			dst.Pix[d+1] = uint8(uint32(pg) >> 8)
+			dst.Pix[d+2] = uint8(uint32(pb) >> 8)
+			dst.Pix[d+3] = uint8(uint32(pa) >> 8)
+		}
+	}
+}
+
+func (nnInterpolator) transform_RGBA_YCbCr(dst *image.RGBA, dr, adr image.Rectangle, d2s *f64.Aff3, src *image.YCbCr, sr image.Rectangle) {
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		dyf := float64(dr.Min.Y+int(dy)) + 0.5
+		d := dst.PixOffset(dr.Min.X+adr.Min.X, dr.Min.Y+int(dy))
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
+			dxf := float64(dr.Min.X+int(dx)) + 0.5
+			sx0 := int(math.Floor(d2s[0]*dxf + d2s[1]*dyf + d2s[2]))
+			sy0 := int(math.Floor(d2s[3]*dxf + d2s[4]*dyf + d2s[5]))
+			if !(image.Point{sx0, sy0}).In(sr) {
+				continue
+			}
+			pr, pg, pb, pa := src.At(sx0, sy0).RGBA()
+			dst.Pix[d+0] = uint8(uint32(pr) >> 8)
+			dst.Pix[d+1] = uint8(uint32(pg) >> 8)
+			dst.Pix[d+2] = uint8(uint32(pb) >> 8)
+			dst.Pix[d+3] = uint8(uint32(pa) >> 8)
+		}
+	}
+}
+
+func (nnInterpolator) transform_RGBA_Image(dst *image.RGBA, dr, adr image.Rectangle, d2s *f64.Aff3, src image.Image, sr image.Rectangle) {
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		dyf := float64(dr.Min.Y+int(dy)) + 0.5
+		d := dst.PixOffset(dr.Min.X+adr.Min.X, dr.Min.Y+int(dy))
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
+			dxf := float64(dr.Min.X+int(dx)) + 0.5
+			sx0 := int(math.Floor(d2s[0]*dxf + d2s[1]*dyf + d2s[2]))
+			sy0 := int(math.Floor(d2s[3]*dxf + d2s[4]*dyf + d2s[5]))
+			if !(image.Point{sx0, sy0}).In(sr) {
+				continue
+			}
+			pr, pg, pb, pa := src.At(sx0, sy0).RGBA()
+			dst.Pix[d+0] = uint8(uint32(pr) >> 8)
+			dst.Pix[d+1] = uint8(uint32(pg) >> 8)
+			dst.Pix[d+2] = uint8(uint32(pb) >> 8)
+			dst.Pix[d+3] = uint8(uint32(pa) >> 8)
+		}
+	}
+}
+
+func (nnInterpolator) transform_Image_Image(dst Image, dr, adr image.Rectangle, d2s *f64.Aff3, src image.Image, sr image.Rectangle) {
+	dstColorRGBA64 := &color.RGBA64{}
+	dstColor := color.Color(dstColorRGBA64)
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		dyf := float64(dr.Min.Y+int(dy)) + 0.5
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx++ {
+			dxf := float64(dr.Min.X+int(dx)) + 0.5
+			sx0 := int(math.Floor(d2s[0]*dxf + d2s[1]*dyf + d2s[2]))
+			sy0 := int(math.Floor(d2s[3]*dxf + d2s[4]*dyf + d2s[5]))
+			if !(image.Point{sx0, sy0}).In(sr) {
+				continue
+			}
+			pr, pg, pb, pa := src.At(sx0, sy0).RGBA()
+			dstColorRGBA64.R = uint16(pr)
+			dstColorRGBA64.G = uint16(pg)
+			dstColorRGBA64.B = uint16(pb)
+			dstColorRGBA64.A = uint16(pa)
+			dst.Set(dr.Min.X+int(dx), dr.Min.Y+int(dy), dstColor)
+		}
+	}
+}
+
 func (z ablInterpolator) Scale(dst Image, dr image.Rectangle, src image.Image, sr image.Rectangle, opts *Options) {
 	// adr is the affected destination pixels, relative to dr.Min.
 	adr := dst.Bounds().Intersect(dr).Sub(dr.Min)
@@ -232,8 +380,15 @@
 	}
 }
 
-func (z ablInterpolator) Transform(dst Image, m *f64.Aff3, src image.Image, sr image.Rectangle, opts *Options) {
-	panic("unimplemented")
+func (z ablInterpolator) Transform(dst Image, s2d *f64.Aff3, src image.Image, sr image.Rectangle, opts *Options) {
+	dr := transformRect(s2d, &sr)
+	// adr is the affected destination pixels, relative to dr.Min.
+	adr := dst.Bounds().Intersect(dr).Sub(dr.Min)
+	if adr.Empty() || sr.Empty() {
+		return
+	}
+	d2s := invert(s2d)
+	z.transform_Image_Image(dst, dr, adr, &d2s, src, sr)
 }
 
 func (ablInterpolator) scale_RGBA_Gray(dst *image.RGBA, dr, adr image.Rectangle, src *image.Gray, sr image.Rectangle) {
@@ -241,8 +396,13 @@
 	sh := int32(sr.Dy())
 	yscale := float64(sh) / float64(dr.Dy())
 	xscale := float64(sw) / float64(dr.Dx())
+	swMinus1, shMinus1 := sw-1, sh-1
+
 	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
 		sy := (float64(dy)+0.5)*yscale - 0.5
+		// If sy < 0, we will clamp sy0 to 0 anyway, so it doesn't matter if
+		// we say int32(sy) instead of int32(math.Floor(sy)). Similarly for
+		// sx, below.
 		sy0 := int32(sy)
 		yFrac0 := sy - float64(sy0)
 		yFrac1 := 1 - yFrac0
@@ -250,12 +410,13 @@
 		if sy < 0 {
 			sy0, sy1 = 0, 0
 			yFrac0, yFrac1 = 0, 1
-		} else if sy1 >= sh {
-			sy1 = sy0
+		} else if sy1 > shMinus1 {
+			sy0, sy1 = shMinus1, shMinus1
 			yFrac0, yFrac1 = 1, 0
 		}
 		d := dst.PixOffset(dr.Min.X+adr.Min.X, dr.Min.Y+int(dy))
-		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx++ {
+
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
 			sx := (float64(dx)+0.5)*xscale - 0.5
 			sx0 := int32(sx)
 			xFrac0 := sx - float64(sx0)
@@ -264,10 +425,11 @@
 			if sx < 0 {
 				sx0, sx1 = 0, 0
 				xFrac0, xFrac1 = 0, 1
-			} else if sx1 >= sw {
-				sx1 = sx0
+			} else if sx1 > swMinus1 {
+				sx0, sx1 = swMinus1, swMinus1
 				xFrac0, xFrac1 = 1, 0
 			}
+
 			s00ru, s00gu, s00bu, s00au := src.At(sr.Min.X+int(sx0), sr.Min.Y+int(sy0)).RGBA()
 			s00r := float64(s00ru)
 			s00g := float64(s00gu)
@@ -304,7 +466,6 @@
 			dst.Pix[d+1] = uint8(uint32(s11g) >> 8)
 			dst.Pix[d+2] = uint8(uint32(s11b) >> 8)
 			dst.Pix[d+3] = uint8(uint32(s11a) >> 8)
-			d += 4
 		}
 	}
 }
@@ -314,8 +475,13 @@
 	sh := int32(sr.Dy())
 	yscale := float64(sh) / float64(dr.Dy())
 	xscale := float64(sw) / float64(dr.Dx())
+	swMinus1, shMinus1 := sw-1, sh-1
+
 	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
 		sy := (float64(dy)+0.5)*yscale - 0.5
+		// If sy < 0, we will clamp sy0 to 0 anyway, so it doesn't matter if
+		// we say int32(sy) instead of int32(math.Floor(sy)). Similarly for
+		// sx, below.
 		sy0 := int32(sy)
 		yFrac0 := sy - float64(sy0)
 		yFrac1 := 1 - yFrac0
@@ -323,12 +489,13 @@
 		if sy < 0 {
 			sy0, sy1 = 0, 0
 			yFrac0, yFrac1 = 0, 1
-		} else if sy1 >= sh {
-			sy1 = sy0
+		} else if sy1 > shMinus1 {
+			sy0, sy1 = shMinus1, shMinus1
 			yFrac0, yFrac1 = 1, 0
 		}
 		d := dst.PixOffset(dr.Min.X+adr.Min.X, dr.Min.Y+int(dy))
-		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx++ {
+
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
 			sx := (float64(dx)+0.5)*xscale - 0.5
 			sx0 := int32(sx)
 			xFrac0 := sx - float64(sx0)
@@ -337,10 +504,11 @@
 			if sx < 0 {
 				sx0, sx1 = 0, 0
 				xFrac0, xFrac1 = 0, 1
-			} else if sx1 >= sw {
-				sx1 = sx0
+			} else if sx1 > swMinus1 {
+				sx0, sx1 = swMinus1, swMinus1
 				xFrac0, xFrac1 = 1, 0
 			}
+
 			s00ru, s00gu, s00bu, s00au := src.At(sr.Min.X+int(sx0), sr.Min.Y+int(sy0)).RGBA()
 			s00r := float64(s00ru)
 			s00g := float64(s00gu)
@@ -377,7 +545,6 @@
 			dst.Pix[d+1] = uint8(uint32(s11g) >> 8)
 			dst.Pix[d+2] = uint8(uint32(s11b) >> 8)
 			dst.Pix[d+3] = uint8(uint32(s11a) >> 8)
-			d += 4
 		}
 	}
 }
@@ -387,8 +554,13 @@
 	sh := int32(sr.Dy())
 	yscale := float64(sh) / float64(dr.Dy())
 	xscale := float64(sw) / float64(dr.Dx())
+	swMinus1, shMinus1 := sw-1, sh-1
+
 	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
 		sy := (float64(dy)+0.5)*yscale - 0.5
+		// If sy < 0, we will clamp sy0 to 0 anyway, so it doesn't matter if
+		// we say int32(sy) instead of int32(math.Floor(sy)). Similarly for
+		// sx, below.
 		sy0 := int32(sy)
 		yFrac0 := sy - float64(sy0)
 		yFrac1 := 1 - yFrac0
@@ -396,12 +568,13 @@
 		if sy < 0 {
 			sy0, sy1 = 0, 0
 			yFrac0, yFrac1 = 0, 1
-		} else if sy1 >= sh {
-			sy1 = sy0
+		} else if sy1 > shMinus1 {
+			sy0, sy1 = shMinus1, shMinus1
 			yFrac0, yFrac1 = 1, 0
 		}
 		d := dst.PixOffset(dr.Min.X+adr.Min.X, dr.Min.Y+int(dy))
-		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx++ {
+
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
 			sx := (float64(dx)+0.5)*xscale - 0.5
 			sx0 := int32(sx)
 			xFrac0 := sx - float64(sx0)
@@ -410,10 +583,11 @@
 			if sx < 0 {
 				sx0, sx1 = 0, 0
 				xFrac0, xFrac1 = 0, 1
-			} else if sx1 >= sw {
-				sx1 = sx0
+			} else if sx1 > swMinus1 {
+				sx0, sx1 = swMinus1, swMinus1
 				xFrac0, xFrac1 = 1, 0
 			}
+
 			s00i := src.PixOffset(sr.Min.X+int(sx0), sr.Min.Y+int(sy0))
 			s00ru := uint32(src.Pix[s00i+0]) * 0x101
 			s00gu := uint32(src.Pix[s00i+1]) * 0x101
@@ -466,7 +640,6 @@
 			dst.Pix[d+1] = uint8(uint32(s11g) >> 8)
 			dst.Pix[d+2] = uint8(uint32(s11b) >> 8)
 			dst.Pix[d+3] = uint8(uint32(s11a) >> 8)
-			d += 4
 		}
 	}
 }
@@ -476,8 +649,13 @@
 	sh := int32(sr.Dy())
 	yscale := float64(sh) / float64(dr.Dy())
 	xscale := float64(sw) / float64(dr.Dx())
+	swMinus1, shMinus1 := sw-1, sh-1
+
 	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
 		sy := (float64(dy)+0.5)*yscale - 0.5
+		// If sy < 0, we will clamp sy0 to 0 anyway, so it doesn't matter if
+		// we say int32(sy) instead of int32(math.Floor(sy)). Similarly for
+		// sx, below.
 		sy0 := int32(sy)
 		yFrac0 := sy - float64(sy0)
 		yFrac1 := 1 - yFrac0
@@ -485,12 +663,13 @@
 		if sy < 0 {
 			sy0, sy1 = 0, 0
 			yFrac0, yFrac1 = 0, 1
-		} else if sy1 >= sh {
-			sy1 = sy0
+		} else if sy1 > shMinus1 {
+			sy0, sy1 = shMinus1, shMinus1
 			yFrac0, yFrac1 = 1, 0
 		}
 		d := dst.PixOffset(dr.Min.X+adr.Min.X, dr.Min.Y+int(dy))
-		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx++ {
+
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
 			sx := (float64(dx)+0.5)*xscale - 0.5
 			sx0 := int32(sx)
 			xFrac0 := sx - float64(sx0)
@@ -499,10 +678,11 @@
 			if sx < 0 {
 				sx0, sx1 = 0, 0
 				xFrac0, xFrac1 = 0, 1
-			} else if sx1 >= sw {
-				sx1 = sx0
+			} else if sx1 > swMinus1 {
+				sx0, sx1 = swMinus1, swMinus1
 				xFrac0, xFrac1 = 1, 0
 			}
+
 			s00ru, s00gu, s00bu, s00au := src.At(sr.Min.X+int(sx0), sr.Min.Y+int(sy0)).RGBA()
 			s00r := float64(s00ru)
 			s00g := float64(s00gu)
@@ -539,7 +719,6 @@
 			dst.Pix[d+1] = uint8(uint32(s11g) >> 8)
 			dst.Pix[d+2] = uint8(uint32(s11b) >> 8)
 			dst.Pix[d+3] = uint8(uint32(s11a) >> 8)
-			d += 4
 		}
 	}
 }
@@ -549,8 +728,13 @@
 	sh := int32(sr.Dy())
 	yscale := float64(sh) / float64(dr.Dy())
 	xscale := float64(sw) / float64(dr.Dx())
+	swMinus1, shMinus1 := sw-1, sh-1
+
 	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
 		sy := (float64(dy)+0.5)*yscale - 0.5
+		// If sy < 0, we will clamp sy0 to 0 anyway, so it doesn't matter if
+		// we say int32(sy) instead of int32(math.Floor(sy)). Similarly for
+		// sx, below.
 		sy0 := int32(sy)
 		yFrac0 := sy - float64(sy0)
 		yFrac1 := 1 - yFrac0
@@ -558,12 +742,13 @@
 		if sy < 0 {
 			sy0, sy1 = 0, 0
 			yFrac0, yFrac1 = 0, 1
-		} else if sy1 >= sh {
-			sy1 = sy0
+		} else if sy1 > shMinus1 {
+			sy0, sy1 = shMinus1, shMinus1
 			yFrac0, yFrac1 = 1, 0
 		}
 		d := dst.PixOffset(dr.Min.X+adr.Min.X, dr.Min.Y+int(dy))
-		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx++ {
+
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
 			sx := (float64(dx)+0.5)*xscale - 0.5
 			sx0 := int32(sx)
 			xFrac0 := sx - float64(sx0)
@@ -572,10 +757,11 @@
 			if sx < 0 {
 				sx0, sx1 = 0, 0
 				xFrac0, xFrac1 = 0, 1
-			} else if sx1 >= sw {
-				sx1 = sx0
+			} else if sx1 > swMinus1 {
+				sx0, sx1 = swMinus1, swMinus1
 				xFrac0, xFrac1 = 1, 0
 			}
+
 			s00ru, s00gu, s00bu, s00au := src.At(sr.Min.X+int(sx0), sr.Min.Y+int(sy0)).RGBA()
 			s00r := float64(s00ru)
 			s00g := float64(s00gu)
@@ -612,7 +798,6 @@
 			dst.Pix[d+1] = uint8(uint32(s11g) >> 8)
 			dst.Pix[d+2] = uint8(uint32(s11b) >> 8)
 			dst.Pix[d+3] = uint8(uint32(s11a) >> 8)
-			d += 4
 		}
 	}
 }
@@ -622,8 +807,13 @@
 	sh := int32(sr.Dy())
 	yscale := float64(sh) / float64(dr.Dy())
 	xscale := float64(sw) / float64(dr.Dx())
+	swMinus1, shMinus1 := sw-1, sh-1
+
 	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
 		sy := (float64(dy)+0.5)*yscale - 0.5
+		// If sy < 0, we will clamp sy0 to 0 anyway, so it doesn't matter if
+		// we say int32(sy) instead of int32(math.Floor(sy)). Similarly for
+		// sx, below.
 		sy0 := int32(sy)
 		yFrac0 := sy - float64(sy0)
 		yFrac1 := 1 - yFrac0
@@ -631,12 +821,13 @@
 		if sy < 0 {
 			sy0, sy1 = 0, 0
 			yFrac0, yFrac1 = 0, 1
-		} else if sy1 >= sh {
-			sy1 = sy0
+		} else if sy1 > shMinus1 {
+			sy0, sy1 = shMinus1, shMinus1
 			yFrac0, yFrac1 = 1, 0
 		}
 		d := dst.PixOffset(dr.Min.X+adr.Min.X, dr.Min.Y+int(dy))
-		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx++ {
+
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
 			sx := (float64(dx)+0.5)*xscale - 0.5
 			sx0 := int32(sx)
 			xFrac0 := sx - float64(sx0)
@@ -645,10 +836,11 @@
 			if sx < 0 {
 				sx0, sx1 = 0, 0
 				xFrac0, xFrac1 = 0, 1
-			} else if sx1 >= sw {
-				sx1 = sx0
+			} else if sx1 > swMinus1 {
+				sx0, sx1 = swMinus1, swMinus1
 				xFrac0, xFrac1 = 1, 0
 			}
+
 			s00ru, s00gu, s00bu, s00au := src.At(sr.Min.X+int(sx0), sr.Min.Y+int(sy0)).RGBA()
 			s00r := float64(s00ru)
 			s00g := float64(s00gu)
@@ -685,7 +877,6 @@
 			dst.Pix[d+1] = uint8(uint32(s11g) >> 8)
 			dst.Pix[d+2] = uint8(uint32(s11b) >> 8)
 			dst.Pix[d+3] = uint8(uint32(s11a) >> 8)
-			d += 4
 		}
 	}
 }
@@ -695,10 +886,15 @@
 	sh := int32(sr.Dy())
 	yscale := float64(sh) / float64(dr.Dy())
 	xscale := float64(sw) / float64(dr.Dx())
+	swMinus1, shMinus1 := sw-1, sh-1
 	dstColorRGBA64 := &color.RGBA64{}
 	dstColor := color.Color(dstColorRGBA64)
+
 	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
 		sy := (float64(dy)+0.5)*yscale - 0.5
+		// If sy < 0, we will clamp sy0 to 0 anyway, so it doesn't matter if
+		// we say int32(sy) instead of int32(math.Floor(sy)). Similarly for
+		// sx, below.
 		sy0 := int32(sy)
 		yFrac0 := sy - float64(sy0)
 		yFrac1 := 1 - yFrac0
@@ -706,10 +902,11 @@
 		if sy < 0 {
 			sy0, sy1 = 0, 0
 			yFrac0, yFrac1 = 0, 1
-		} else if sy1 >= sh {
-			sy1 = sy0
+		} else if sy1 > shMinus1 {
+			sy0, sy1 = shMinus1, shMinus1
 			yFrac0, yFrac1 = 1, 0
 		}
+
 		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx++ {
 			sx := (float64(dx)+0.5)*xscale - 0.5
 			sx0 := int32(sx)
@@ -719,10 +916,11 @@
 			if sx < 0 {
 				sx0, sx1 = 0, 0
 				xFrac0, xFrac1 = 0, 1
-			} else if sx1 >= sw {
-				sx1 = sx0
+			} else if sx1 > swMinus1 {
+				sx0, sx1 = swMinus1, swMinus1
 				xFrac0, xFrac1 = 1, 0
 			}
+
 			s00ru, s00gu, s00bu, s00au := src.At(sr.Min.X+int(sx0), sr.Min.Y+int(sy0)).RGBA()
 			s00r := float64(s00ru)
 			s00g := float64(s00gu)
@@ -764,6 +962,584 @@
 	}
 }
 
+func (ablInterpolator) transform_RGBA_Gray(dst *image.RGBA, dr, adr image.Rectangle, d2s *f64.Aff3, src *image.Gray, sr image.Rectangle) {
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		dyf := float64(dr.Min.Y+int(dy)) + 0.5
+		d := dst.PixOffset(dr.Min.X+adr.Min.X, dr.Min.Y+int(dy))
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
+			dxf := float64(dr.Min.X+int(dx)) + 0.5
+			sx := d2s[0]*dxf + d2s[1]*dyf + d2s[2]
+			sy := d2s[3]*dxf + d2s[4]*dyf + d2s[5]
+			if !(image.Point{int(math.Floor(sx)), int(math.Floor(sy))}).In(sr) {
+				continue
+			}
+
+			sx -= 0.5
+			sxf := math.Floor(sx)
+			xFrac0 := sx - sxf
+			xFrac1 := 1 - xFrac0
+			sx0 := int(sxf)
+			sx1 := sx0 + 1
+			if sx0 < sr.Min.X {
+				sx0, sx1 = sr.Min.X, sr.Min.X
+				xFrac0, xFrac1 = 0, 1
+			} else if sx1 >= sr.Max.X {
+				sx0, sx1 = sr.Max.X-1, sr.Max.X-1
+				xFrac0, xFrac1 = 1, 0
+			}
+
+			sy -= 0.5
+			syf := math.Floor(sy)
+			yFrac0 := sy - syf
+			yFrac1 := 1 - yFrac0
+			sy0 := int(syf)
+			sy1 := sy0 + 1
+			if sy0 < sr.Min.Y {
+				sy0, sy1 = sr.Min.Y, sr.Min.Y
+				yFrac0, yFrac1 = 0, 1
+			} else if sy1 >= sr.Max.Y {
+				sy0, sy1 = sr.Max.Y-1, sr.Max.Y-1
+				yFrac0, yFrac1 = 1, 0
+			}
+
+			s00ru, s00gu, s00bu, s00au := src.At(sx0, sy0).RGBA()
+			s00r := float64(s00ru)
+			s00g := float64(s00gu)
+			s00b := float64(s00bu)
+			s00a := float64(s00au)
+			s10ru, s10gu, s10bu, s10au := src.At(sx1, sy0).RGBA()
+			s10r := float64(s10ru)
+			s10g := float64(s10gu)
+			s10b := float64(s10bu)
+			s10a := float64(s10au)
+			s10r = xFrac1*s00r + xFrac0*s10r
+			s10g = xFrac1*s00g + xFrac0*s10g
+			s10b = xFrac1*s00b + xFrac0*s10b
+			s10a = xFrac1*s00a + xFrac0*s10a
+			s01ru, s01gu, s01bu, s01au := src.At(sx0, sy1).RGBA()
+			s01r := float64(s01ru)
+			s01g := float64(s01gu)
+			s01b := float64(s01bu)
+			s01a := float64(s01au)
+			s11ru, s11gu, s11bu, s11au := src.At(sx1, sy1).RGBA()
+			s11r := float64(s11ru)
+			s11g := float64(s11gu)
+			s11b := float64(s11bu)
+			s11a := float64(s11au)
+			s11r = xFrac1*s01r + xFrac0*s11r
+			s11g = xFrac1*s01g + xFrac0*s11g
+			s11b = xFrac1*s01b + xFrac0*s11b
+			s11a = xFrac1*s01a + xFrac0*s11a
+			s11r = yFrac1*s10r + yFrac0*s11r
+			s11g = yFrac1*s10g + yFrac0*s11g
+			s11b = yFrac1*s10b + yFrac0*s11b
+			s11a = yFrac1*s10a + yFrac0*s11a
+			dst.Pix[d+0] = uint8(uint32(s11r) >> 8)
+			dst.Pix[d+1] = uint8(uint32(s11g) >> 8)
+			dst.Pix[d+2] = uint8(uint32(s11b) >> 8)
+			dst.Pix[d+3] = uint8(uint32(s11a) >> 8)
+		}
+	}
+}
+
+func (ablInterpolator) transform_RGBA_NRGBA(dst *image.RGBA, dr, adr image.Rectangle, d2s *f64.Aff3, src *image.NRGBA, sr image.Rectangle) {
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		dyf := float64(dr.Min.Y+int(dy)) + 0.5
+		d := dst.PixOffset(dr.Min.X+adr.Min.X, dr.Min.Y+int(dy))
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
+			dxf := float64(dr.Min.X+int(dx)) + 0.5
+			sx := d2s[0]*dxf + d2s[1]*dyf + d2s[2]
+			sy := d2s[3]*dxf + d2s[4]*dyf + d2s[5]
+			if !(image.Point{int(math.Floor(sx)), int(math.Floor(sy))}).In(sr) {
+				continue
+			}
+
+			sx -= 0.5
+			sxf := math.Floor(sx)
+			xFrac0 := sx - sxf
+			xFrac1 := 1 - xFrac0
+			sx0 := int(sxf)
+			sx1 := sx0 + 1
+			if sx0 < sr.Min.X {
+				sx0, sx1 = sr.Min.X, sr.Min.X
+				xFrac0, xFrac1 = 0, 1
+			} else if sx1 >= sr.Max.X {
+				sx0, sx1 = sr.Max.X-1, sr.Max.X-1
+				xFrac0, xFrac1 = 1, 0
+			}
+
+			sy -= 0.5
+			syf := math.Floor(sy)
+			yFrac0 := sy - syf
+			yFrac1 := 1 - yFrac0
+			sy0 := int(syf)
+			sy1 := sy0 + 1
+			if sy0 < sr.Min.Y {
+				sy0, sy1 = sr.Min.Y, sr.Min.Y
+				yFrac0, yFrac1 = 0, 1
+			} else if sy1 >= sr.Max.Y {
+				sy0, sy1 = sr.Max.Y-1, sr.Max.Y-1
+				yFrac0, yFrac1 = 1, 0
+			}
+
+			s00ru, s00gu, s00bu, s00au := src.At(sx0, sy0).RGBA()
+			s00r := float64(s00ru)
+			s00g := float64(s00gu)
+			s00b := float64(s00bu)
+			s00a := float64(s00au)
+			s10ru, s10gu, s10bu, s10au := src.At(sx1, sy0).RGBA()
+			s10r := float64(s10ru)
+			s10g := float64(s10gu)
+			s10b := float64(s10bu)
+			s10a := float64(s10au)
+			s10r = xFrac1*s00r + xFrac0*s10r
+			s10g = xFrac1*s00g + xFrac0*s10g
+			s10b = xFrac1*s00b + xFrac0*s10b
+			s10a = xFrac1*s00a + xFrac0*s10a
+			s01ru, s01gu, s01bu, s01au := src.At(sx0, sy1).RGBA()
+			s01r := float64(s01ru)
+			s01g := float64(s01gu)
+			s01b := float64(s01bu)
+			s01a := float64(s01au)
+			s11ru, s11gu, s11bu, s11au := src.At(sx1, sy1).RGBA()
+			s11r := float64(s11ru)
+			s11g := float64(s11gu)
+			s11b := float64(s11bu)
+			s11a := float64(s11au)
+			s11r = xFrac1*s01r + xFrac0*s11r
+			s11g = xFrac1*s01g + xFrac0*s11g
+			s11b = xFrac1*s01b + xFrac0*s11b
+			s11a = xFrac1*s01a + xFrac0*s11a
+			s11r = yFrac1*s10r + yFrac0*s11r
+			s11g = yFrac1*s10g + yFrac0*s11g
+			s11b = yFrac1*s10b + yFrac0*s11b
+			s11a = yFrac1*s10a + yFrac0*s11a
+			dst.Pix[d+0] = uint8(uint32(s11r) >> 8)
+			dst.Pix[d+1] = uint8(uint32(s11g) >> 8)
+			dst.Pix[d+2] = uint8(uint32(s11b) >> 8)
+			dst.Pix[d+3] = uint8(uint32(s11a) >> 8)
+		}
+	}
+}
+
+func (ablInterpolator) transform_RGBA_RGBA(dst *image.RGBA, dr, adr image.Rectangle, d2s *f64.Aff3, src *image.RGBA, sr image.Rectangle) {
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		dyf := float64(dr.Min.Y+int(dy)) + 0.5
+		d := dst.PixOffset(dr.Min.X+adr.Min.X, dr.Min.Y+int(dy))
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
+			dxf := float64(dr.Min.X+int(dx)) + 0.5
+			sx := d2s[0]*dxf + d2s[1]*dyf + d2s[2]
+			sy := d2s[3]*dxf + d2s[4]*dyf + d2s[5]
+			if !(image.Point{int(math.Floor(sx)), int(math.Floor(sy))}).In(sr) {
+				continue
+			}
+
+			sx -= 0.5
+			sxf := math.Floor(sx)
+			xFrac0 := sx - sxf
+			xFrac1 := 1 - xFrac0
+			sx0 := int(sxf)
+			sx1 := sx0 + 1
+			if sx0 < sr.Min.X {
+				sx0, sx1 = sr.Min.X, sr.Min.X
+				xFrac0, xFrac1 = 0, 1
+			} else if sx1 >= sr.Max.X {
+				sx0, sx1 = sr.Max.X-1, sr.Max.X-1
+				xFrac0, xFrac1 = 1, 0
+			}
+
+			sy -= 0.5
+			syf := math.Floor(sy)
+			yFrac0 := sy - syf
+			yFrac1 := 1 - yFrac0
+			sy0 := int(syf)
+			sy1 := sy0 + 1
+			if sy0 < sr.Min.Y {
+				sy0, sy1 = sr.Min.Y, sr.Min.Y
+				yFrac0, yFrac1 = 0, 1
+			} else if sy1 >= sr.Max.Y {
+				sy0, sy1 = sr.Max.Y-1, sr.Max.Y-1
+				yFrac0, yFrac1 = 1, 0
+			}
+
+			s00i := src.PixOffset(sx0, sy0)
+			s00ru := uint32(src.Pix[s00i+0]) * 0x101
+			s00gu := uint32(src.Pix[s00i+1]) * 0x101
+			s00bu := uint32(src.Pix[s00i+2]) * 0x101
+			s00au := uint32(src.Pix[s00i+3]) * 0x101
+			s00r := float64(s00ru)
+			s00g := float64(s00gu)
+			s00b := float64(s00bu)
+			s00a := float64(s00au)
+			s10i := src.PixOffset(sx1, sy0)
+			s10ru := uint32(src.Pix[s10i+0]) * 0x101
+			s10gu := uint32(src.Pix[s10i+1]) * 0x101
+			s10bu := uint32(src.Pix[s10i+2]) * 0x101
+			s10au := uint32(src.Pix[s10i+3]) * 0x101
+			s10r := float64(s10ru)
+			s10g := float64(s10gu)
+			s10b := float64(s10bu)
+			s10a := float64(s10au)
+			s10r = xFrac1*s00r + xFrac0*s10r
+			s10g = xFrac1*s00g + xFrac0*s10g
+			s10b = xFrac1*s00b + xFrac0*s10b
+			s10a = xFrac1*s00a + xFrac0*s10a
+			s01i := src.PixOffset(sx0, sy1)
+			s01ru := uint32(src.Pix[s01i+0]) * 0x101
+			s01gu := uint32(src.Pix[s01i+1]) * 0x101
+			s01bu := uint32(src.Pix[s01i+2]) * 0x101
+			s01au := uint32(src.Pix[s01i+3]) * 0x101
+			s01r := float64(s01ru)
+			s01g := float64(s01gu)
+			s01b := float64(s01bu)
+			s01a := float64(s01au)
+			s11i := src.PixOffset(sx1, sy1)
+			s11ru := uint32(src.Pix[s11i+0]) * 0x101
+			s11gu := uint32(src.Pix[s11i+1]) * 0x101
+			s11bu := uint32(src.Pix[s11i+2]) * 0x101
+			s11au := uint32(src.Pix[s11i+3]) * 0x101
+			s11r := float64(s11ru)
+			s11g := float64(s11gu)
+			s11b := float64(s11bu)
+			s11a := float64(s11au)
+			s11r = xFrac1*s01r + xFrac0*s11r
+			s11g = xFrac1*s01g + xFrac0*s11g
+			s11b = xFrac1*s01b + xFrac0*s11b
+			s11a = xFrac1*s01a + xFrac0*s11a
+			s11r = yFrac1*s10r + yFrac0*s11r
+			s11g = yFrac1*s10g + yFrac0*s11g
+			s11b = yFrac1*s10b + yFrac0*s11b
+			s11a = yFrac1*s10a + yFrac0*s11a
+			dst.Pix[d+0] = uint8(uint32(s11r) >> 8)
+			dst.Pix[d+1] = uint8(uint32(s11g) >> 8)
+			dst.Pix[d+2] = uint8(uint32(s11b) >> 8)
+			dst.Pix[d+3] = uint8(uint32(s11a) >> 8)
+		}
+	}
+}
+
+func (ablInterpolator) transform_RGBA_Uniform(dst *image.RGBA, dr, adr image.Rectangle, d2s *f64.Aff3, src *image.Uniform, sr image.Rectangle) {
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		dyf := float64(dr.Min.Y+int(dy)) + 0.5
+		d := dst.PixOffset(dr.Min.X+adr.Min.X, dr.Min.Y+int(dy))
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
+			dxf := float64(dr.Min.X+int(dx)) + 0.5
+			sx := d2s[0]*dxf + d2s[1]*dyf + d2s[2]
+			sy := d2s[3]*dxf + d2s[4]*dyf + d2s[5]
+			if !(image.Point{int(math.Floor(sx)), int(math.Floor(sy))}).In(sr) {
+				continue
+			}
+
+			sx -= 0.5
+			sxf := math.Floor(sx)
+			xFrac0 := sx - sxf
+			xFrac1 := 1 - xFrac0
+			sx0 := int(sxf)
+			sx1 := sx0 + 1
+			if sx0 < sr.Min.X {
+				sx0, sx1 = sr.Min.X, sr.Min.X
+				xFrac0, xFrac1 = 0, 1
+			} else if sx1 >= sr.Max.X {
+				sx0, sx1 = sr.Max.X-1, sr.Max.X-1
+				xFrac0, xFrac1 = 1, 0
+			}
+
+			sy -= 0.5
+			syf := math.Floor(sy)
+			yFrac0 := sy - syf
+			yFrac1 := 1 - yFrac0
+			sy0 := int(syf)
+			sy1 := sy0 + 1
+			if sy0 < sr.Min.Y {
+				sy0, sy1 = sr.Min.Y, sr.Min.Y
+				yFrac0, yFrac1 = 0, 1
+			} else if sy1 >= sr.Max.Y {
+				sy0, sy1 = sr.Max.Y-1, sr.Max.Y-1
+				yFrac0, yFrac1 = 1, 0
+			}
+
+			s00ru, s00gu, s00bu, s00au := src.At(sx0, sy0).RGBA()
+			s00r := float64(s00ru)
+			s00g := float64(s00gu)
+			s00b := float64(s00bu)
+			s00a := float64(s00au)
+			s10ru, s10gu, s10bu, s10au := src.At(sx1, sy0).RGBA()
+			s10r := float64(s10ru)
+			s10g := float64(s10gu)
+			s10b := float64(s10bu)
+			s10a := float64(s10au)
+			s10r = xFrac1*s00r + xFrac0*s10r
+			s10g = xFrac1*s00g + xFrac0*s10g
+			s10b = xFrac1*s00b + xFrac0*s10b
+			s10a = xFrac1*s00a + xFrac0*s10a
+			s01ru, s01gu, s01bu, s01au := src.At(sx0, sy1).RGBA()
+			s01r := float64(s01ru)
+			s01g := float64(s01gu)
+			s01b := float64(s01bu)
+			s01a := float64(s01au)
+			s11ru, s11gu, s11bu, s11au := src.At(sx1, sy1).RGBA()
+			s11r := float64(s11ru)
+			s11g := float64(s11gu)
+			s11b := float64(s11bu)
+			s11a := float64(s11au)
+			s11r = xFrac1*s01r + xFrac0*s11r
+			s11g = xFrac1*s01g + xFrac0*s11g
+			s11b = xFrac1*s01b + xFrac0*s11b
+			s11a = xFrac1*s01a + xFrac0*s11a
+			s11r = yFrac1*s10r + yFrac0*s11r
+			s11g = yFrac1*s10g + yFrac0*s11g
+			s11b = yFrac1*s10b + yFrac0*s11b
+			s11a = yFrac1*s10a + yFrac0*s11a
+			dst.Pix[d+0] = uint8(uint32(s11r) >> 8)
+			dst.Pix[d+1] = uint8(uint32(s11g) >> 8)
+			dst.Pix[d+2] = uint8(uint32(s11b) >> 8)
+			dst.Pix[d+3] = uint8(uint32(s11a) >> 8)
+		}
+	}
+}
+
+func (ablInterpolator) transform_RGBA_YCbCr(dst *image.RGBA, dr, adr image.Rectangle, d2s *f64.Aff3, src *image.YCbCr, sr image.Rectangle) {
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		dyf := float64(dr.Min.Y+int(dy)) + 0.5
+		d := dst.PixOffset(dr.Min.X+adr.Min.X, dr.Min.Y+int(dy))
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
+			dxf := float64(dr.Min.X+int(dx)) + 0.5
+			sx := d2s[0]*dxf + d2s[1]*dyf + d2s[2]
+			sy := d2s[3]*dxf + d2s[4]*dyf + d2s[5]
+			if !(image.Point{int(math.Floor(sx)), int(math.Floor(sy))}).In(sr) {
+				continue
+			}
+
+			sx -= 0.5
+			sxf := math.Floor(sx)
+			xFrac0 := sx - sxf
+			xFrac1 := 1 - xFrac0
+			sx0 := int(sxf)
+			sx1 := sx0 + 1
+			if sx0 < sr.Min.X {
+				sx0, sx1 = sr.Min.X, sr.Min.X
+				xFrac0, xFrac1 = 0, 1
+			} else if sx1 >= sr.Max.X {
+				sx0, sx1 = sr.Max.X-1, sr.Max.X-1
+				xFrac0, xFrac1 = 1, 0
+			}
+
+			sy -= 0.5
+			syf := math.Floor(sy)
+			yFrac0 := sy - syf
+			yFrac1 := 1 - yFrac0
+			sy0 := int(syf)
+			sy1 := sy0 + 1
+			if sy0 < sr.Min.Y {
+				sy0, sy1 = sr.Min.Y, sr.Min.Y
+				yFrac0, yFrac1 = 0, 1
+			} else if sy1 >= sr.Max.Y {
+				sy0, sy1 = sr.Max.Y-1, sr.Max.Y-1
+				yFrac0, yFrac1 = 1, 0
+			}
+
+			s00ru, s00gu, s00bu, s00au := src.At(sx0, sy0).RGBA()
+			s00r := float64(s00ru)
+			s00g := float64(s00gu)
+			s00b := float64(s00bu)
+			s00a := float64(s00au)
+			s10ru, s10gu, s10bu, s10au := src.At(sx1, sy0).RGBA()
+			s10r := float64(s10ru)
+			s10g := float64(s10gu)
+			s10b := float64(s10bu)
+			s10a := float64(s10au)
+			s10r = xFrac1*s00r + xFrac0*s10r
+			s10g = xFrac1*s00g + xFrac0*s10g
+			s10b = xFrac1*s00b + xFrac0*s10b
+			s10a = xFrac1*s00a + xFrac0*s10a
+			s01ru, s01gu, s01bu, s01au := src.At(sx0, sy1).RGBA()
+			s01r := float64(s01ru)
+			s01g := float64(s01gu)
+			s01b := float64(s01bu)
+			s01a := float64(s01au)
+			s11ru, s11gu, s11bu, s11au := src.At(sx1, sy1).RGBA()
+			s11r := float64(s11ru)
+			s11g := float64(s11gu)
+			s11b := float64(s11bu)
+			s11a := float64(s11au)
+			s11r = xFrac1*s01r + xFrac0*s11r
+			s11g = xFrac1*s01g + xFrac0*s11g
+			s11b = xFrac1*s01b + xFrac0*s11b
+			s11a = xFrac1*s01a + xFrac0*s11a
+			s11r = yFrac1*s10r + yFrac0*s11r
+			s11g = yFrac1*s10g + yFrac0*s11g
+			s11b = yFrac1*s10b + yFrac0*s11b
+			s11a = yFrac1*s10a + yFrac0*s11a
+			dst.Pix[d+0] = uint8(uint32(s11r) >> 8)
+			dst.Pix[d+1] = uint8(uint32(s11g) >> 8)
+			dst.Pix[d+2] = uint8(uint32(s11b) >> 8)
+			dst.Pix[d+3] = uint8(uint32(s11a) >> 8)
+		}
+	}
+}
+
+func (ablInterpolator) transform_RGBA_Image(dst *image.RGBA, dr, adr image.Rectangle, d2s *f64.Aff3, src image.Image, sr image.Rectangle) {
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		dyf := float64(dr.Min.Y+int(dy)) + 0.5
+		d := dst.PixOffset(dr.Min.X+adr.Min.X, dr.Min.Y+int(dy))
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
+			dxf := float64(dr.Min.X+int(dx)) + 0.5
+			sx := d2s[0]*dxf + d2s[1]*dyf + d2s[2]
+			sy := d2s[3]*dxf + d2s[4]*dyf + d2s[5]
+			if !(image.Point{int(math.Floor(sx)), int(math.Floor(sy))}).In(sr) {
+				continue
+			}
+
+			sx -= 0.5
+			sxf := math.Floor(sx)
+			xFrac0 := sx - sxf
+			xFrac1 := 1 - xFrac0
+			sx0 := int(sxf)
+			sx1 := sx0 + 1
+			if sx0 < sr.Min.X {
+				sx0, sx1 = sr.Min.X, sr.Min.X
+				xFrac0, xFrac1 = 0, 1
+			} else if sx1 >= sr.Max.X {
+				sx0, sx1 = sr.Max.X-1, sr.Max.X-1
+				xFrac0, xFrac1 = 1, 0
+			}
+
+			sy -= 0.5
+			syf := math.Floor(sy)
+			yFrac0 := sy - syf
+			yFrac1 := 1 - yFrac0
+			sy0 := int(syf)
+			sy1 := sy0 + 1
+			if sy0 < sr.Min.Y {
+				sy0, sy1 = sr.Min.Y, sr.Min.Y
+				yFrac0, yFrac1 = 0, 1
+			} else if sy1 >= sr.Max.Y {
+				sy0, sy1 = sr.Max.Y-1, sr.Max.Y-1
+				yFrac0, yFrac1 = 1, 0
+			}
+
+			s00ru, s00gu, s00bu, s00au := src.At(sx0, sy0).RGBA()
+			s00r := float64(s00ru)
+			s00g := float64(s00gu)
+			s00b := float64(s00bu)
+			s00a := float64(s00au)
+			s10ru, s10gu, s10bu, s10au := src.At(sx1, sy0).RGBA()
+			s10r := float64(s10ru)
+			s10g := float64(s10gu)
+			s10b := float64(s10bu)
+			s10a := float64(s10au)
+			s10r = xFrac1*s00r + xFrac0*s10r
+			s10g = xFrac1*s00g + xFrac0*s10g
+			s10b = xFrac1*s00b + xFrac0*s10b
+			s10a = xFrac1*s00a + xFrac0*s10a
+			s01ru, s01gu, s01bu, s01au := src.At(sx0, sy1).RGBA()
+			s01r := float64(s01ru)
+			s01g := float64(s01gu)
+			s01b := float64(s01bu)
+			s01a := float64(s01au)
+			s11ru, s11gu, s11bu, s11au := src.At(sx1, sy1).RGBA()
+			s11r := float64(s11ru)
+			s11g := float64(s11gu)
+			s11b := float64(s11bu)
+			s11a := float64(s11au)
+			s11r = xFrac1*s01r + xFrac0*s11r
+			s11g = xFrac1*s01g + xFrac0*s11g
+			s11b = xFrac1*s01b + xFrac0*s11b
+			s11a = xFrac1*s01a + xFrac0*s11a
+			s11r = yFrac1*s10r + yFrac0*s11r
+			s11g = yFrac1*s10g + yFrac0*s11g
+			s11b = yFrac1*s10b + yFrac0*s11b
+			s11a = yFrac1*s10a + yFrac0*s11a
+			dst.Pix[d+0] = uint8(uint32(s11r) >> 8)
+			dst.Pix[d+1] = uint8(uint32(s11g) >> 8)
+			dst.Pix[d+2] = uint8(uint32(s11b) >> 8)
+			dst.Pix[d+3] = uint8(uint32(s11a) >> 8)
+		}
+	}
+}
+
+func (ablInterpolator) transform_Image_Image(dst Image, dr, adr image.Rectangle, d2s *f64.Aff3, src image.Image, sr image.Rectangle) {
+	dstColorRGBA64 := &color.RGBA64{}
+	dstColor := color.Color(dstColorRGBA64)
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		dyf := float64(dr.Min.Y+int(dy)) + 0.5
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx++ {
+			dxf := float64(dr.Min.X+int(dx)) + 0.5
+			sx := d2s[0]*dxf + d2s[1]*dyf + d2s[2]
+			sy := d2s[3]*dxf + d2s[4]*dyf + d2s[5]
+			if !(image.Point{int(math.Floor(sx)), int(math.Floor(sy))}).In(sr) {
+				continue
+			}
+
+			sx -= 0.5
+			sxf := math.Floor(sx)
+			xFrac0 := sx - sxf
+			xFrac1 := 1 - xFrac0
+			sx0 := int(sxf)
+			sx1 := sx0 + 1
+			if sx0 < sr.Min.X {
+				sx0, sx1 = sr.Min.X, sr.Min.X
+				xFrac0, xFrac1 = 0, 1
+			} else if sx1 >= sr.Max.X {
+				sx0, sx1 = sr.Max.X-1, sr.Max.X-1
+				xFrac0, xFrac1 = 1, 0
+			}
+
+			sy -= 0.5
+			syf := math.Floor(sy)
+			yFrac0 := sy - syf
+			yFrac1 := 1 - yFrac0
+			sy0 := int(syf)
+			sy1 := sy0 + 1
+			if sy0 < sr.Min.Y {
+				sy0, sy1 = sr.Min.Y, sr.Min.Y
+				yFrac0, yFrac1 = 0, 1
+			} else if sy1 >= sr.Max.Y {
+				sy0, sy1 = sr.Max.Y-1, sr.Max.Y-1
+				yFrac0, yFrac1 = 1, 0
+			}
+
+			s00ru, s00gu, s00bu, s00au := src.At(sx0, sy0).RGBA()
+			s00r := float64(s00ru)
+			s00g := float64(s00gu)
+			s00b := float64(s00bu)
+			s00a := float64(s00au)
+			s10ru, s10gu, s10bu, s10au := src.At(sx1, sy0).RGBA()
+			s10r := float64(s10ru)
+			s10g := float64(s10gu)
+			s10b := float64(s10bu)
+			s10a := float64(s10au)
+			s10r = xFrac1*s00r + xFrac0*s10r
+			s10g = xFrac1*s00g + xFrac0*s10g
+			s10b = xFrac1*s00b + xFrac0*s10b
+			s10a = xFrac1*s00a + xFrac0*s10a
+			s01ru, s01gu, s01bu, s01au := src.At(sx0, sy1).RGBA()
+			s01r := float64(s01ru)
+			s01g := float64(s01gu)
+			s01b := float64(s01bu)
+			s01a := float64(s01au)
+			s11ru, s11gu, s11bu, s11au := src.At(sx1, sy1).RGBA()
+			s11r := float64(s11ru)
+			s11g := float64(s11gu)
+			s11b := float64(s11bu)
+			s11a := float64(s11au)
+			s11r = xFrac1*s01r + xFrac0*s11r
+			s11g = xFrac1*s01g + xFrac0*s11g
+			s11b = xFrac1*s01b + xFrac0*s11b
+			s11a = xFrac1*s01a + xFrac0*s11a
+			s11r = yFrac1*s10r + yFrac0*s11r
+			s11g = yFrac1*s10g + yFrac0*s11g
+			s11b = yFrac1*s10b + yFrac0*s11b
+			s11a = yFrac1*s10a + yFrac0*s11a
+			dstColorRGBA64.R = uint16(s11r)
+			dstColorRGBA64.G = uint16(s11g)
+			dstColorRGBA64.B = uint16(s11b)
+			dstColorRGBA64.A = uint16(s11a)
+			dst.Set(dr.Min.X+int(dx), dr.Min.Y+int(dy), dstColor)
+		}
+	}
+}
+
 func (z *kernelScaler) Scale(dst Image, dr image.Rectangle, src image.Image, sr image.Rectangle, opts *Options) {
 	if z.dw != int32(dr.Dx()) || z.dh != int32(dr.Dy()) || z.sw != int32(sr.Dx()) || z.sh != int32(sr.Dy()) {
 		z.kernel.Scale(dst, dr, src, sr, opts)
diff --git a/draw/scale.go b/draw/scale.go
index 8fc1644..4f94086 100644
--- a/draw/scale.go
+++ b/draw/scale.go
@@ -249,3 +249,72 @@
 	}
 	return 0
 }
+
+// invert returns the inverse of m.
+//
+// TODO: move this into the f64 package, once we work out the convention for
+// matrix methods in that package: do they modify the receiver, take a dst
+// pointer argument, or return a new value?
+func invert(m *f64.Aff3) f64.Aff3 {
+	m00 := +m[3*1+1]
+	m01 := -m[3*0+1]
+	m02 := +m[3*1+2]*m[3*0+1] - m[3*1+1]*m[3*0+2]
+	m10 := -m[3*1+0]
+	m11 := +m[3*0+0]
+	m12 := +m[3*1+0]*m[3*0+2] - m[3*1+2]*m[3*0+0]
+
+	det := m00*m11 - m10*m01
+
+	return f64.Aff3{
+		m00 / det,
+		m01 / det,
+		m02 / det,
+		m10 / det,
+		m11 / det,
+		m12 / det,
+	}
+}
+
+// transformRect returns a rectangle dr that contains sr transformed by s2d.
+func transformRect(s2d *f64.Aff3, sr *image.Rectangle) (dr image.Rectangle) {
+	ps := [...]image.Point{
+		{sr.Min.X, sr.Min.Y},
+		{sr.Max.X, sr.Min.Y},
+		{sr.Min.X, sr.Max.Y},
+		{sr.Max.X, sr.Max.Y},
+	}
+	for i, p := range ps {
+		sxf := float64(p.X)
+		syf := float64(p.Y)
+		dx := int(math.Floor(s2d[0]*sxf + s2d[1]*syf + s2d[2]))
+		dy := int(math.Floor(s2d[3]*sxf + s2d[4]*syf + s2d[5]))
+
+		// The +1 adjustments below are because an image.Rectangle is inclusive
+		// on the low end but exclusive on the high end.
+
+		if i == 0 {
+			dr = image.Rectangle{
+				Min: image.Point{dx + 0, dy + 0},
+				Max: image.Point{dx + 1, dy + 1},
+			}
+			continue
+		}
+
+		if dr.Min.X > dx {
+			dr.Min.X = dx
+		}
+		dx++
+		if dr.Max.X < dx {
+			dr.Max.X = dx
+		}
+
+		if dr.Min.Y > dy {
+			dr.Min.Y = dy
+		}
+		dy++
+		if dr.Max.Y < dy {
+			dr.Max.Y = dy
+		}
+	}
+	return dr
+}
diff --git a/draw/scale_test.go b/draw/scale_test.go
index 30a82c6..c15810e 100644
--- a/draw/scale_test.go
+++ b/draw/scale_test.go
@@ -16,18 +16,28 @@
 	"reflect"
 	"testing"
 
+	"golang.org/x/image/math/f64"
+
 	_ "image/jpeg"
 )
 
-var genScaleFiles = flag.Bool("gen_scale_files", false, "whether to generate the TestScaleXxx golden files.")
+var genGoldenFiles = flag.Bool("gen_golden_files", false, "whether to generate the TestXxx golden files.")
 
-// testScale tests that scaling the source image gives the exact destination
-// image. This is to ensure that any refactoring or optimization of the scaling
-// code doesn't change the scaling behavior. Changing the actual algorithm or
-// kernel used by any particular quality setting will obviously change the
-// resultant pixels. In such a case, use the gen_scale_files flag to regenerate
-// the golden files.
-func testScale(t *testing.T, w int, h int, direction, srcFilename string) {
+var transformMatrix = func() *f64.Aff3 {
+	const scale, cos30, sin30 = 3.75, 0.866025404, 0.5
+	return &f64.Aff3{
+		+scale * cos30, -scale * sin30, 40,
+		+scale * sin30, +scale * cos30, 10,
+	}
+}()
+
+// testInterp tests that interpolating the source image gives the exact
+// destination image. This is to ensure that any refactoring or optimization of
+// the interpolation code doesn't change the behavior. Changing the actual
+// algorithm or kernel used by any particular quality setting will obviously
+// change the resultant pixels. In such a case, use the gen_golden_files flag
+// to regenerate the golden files.
+func testInterp(t *testing.T, w int, h int, direction, srcFilename string) {
 	f, err := os.Open("../testdata/go-turns-two-" + srcFilename)
 	if err != nil {
 		t.Fatalf("Open: %v", err)
@@ -44,12 +54,21 @@
 		"cr": CatmullRom,
 	}
 	for name, q := range testCases {
-		gotFilename := fmt.Sprintf("../testdata/go-turns-two-%s-%s.png", direction, name)
+		goldenFilename := fmt.Sprintf("../testdata/go-turns-two-%s-%s.png", direction, name)
 
 		got := image.NewRGBA(image.Rect(0, 0, w, h))
-		q.Scale(got, got.Bounds(), src, src.Bounds(), nil)
-		if *genScaleFiles {
-			g, err := os.Create(gotFilename)
+		if direction == "rotate" {
+			if name == "bl" || name == "cr" {
+				// TODO: implement Kernel.Transform.
+				continue
+			}
+			q.Transform(got, transformMatrix, src, src.Bounds(), nil)
+		} else {
+			q.Scale(got, got.Bounds(), src, src.Bounds(), nil)
+		}
+
+		if *genGoldenFiles {
+			g, err := os.Create(goldenFilename)
 			if err != nil {
 				t.Errorf("Create: %v", err)
 				continue
@@ -62,27 +81,35 @@
 			continue
 		}
 
-		g, err := os.Open(gotFilename)
+		g, err := os.Open(goldenFilename)
 		if err != nil {
 			t.Errorf("Open: %v", err)
 			continue
 		}
 		defer g.Close()
-		want, err := png.Decode(g)
+		wantRaw, err := png.Decode(g)
 		if err != nil {
 			t.Errorf("Decode: %v", err)
 			continue
 		}
+		// convert wantRaw to RGBA.
+		want, ok := wantRaw.(*image.RGBA)
+		if !ok {
+			b := wantRaw.Bounds()
+			want = image.NewRGBA(b)
+			Draw(want, b, wantRaw, b.Min, Src)
+		}
 
 		if !reflect.DeepEqual(got, want) {
-			t.Errorf("%s: actual image differs from golden image", gotFilename)
+			t.Errorf("%s: actual image differs from golden image", goldenFilename)
 			continue
 		}
 	}
 }
 
-func TestScaleDown(t *testing.T) { testScale(t, 100, 100, "down", "280x360.jpeg") }
-func TestScaleUp(t *testing.T)   { testScale(t, 75, 100, "up", "14x18.png") }
+func TestScaleDown(t *testing.T) { testInterp(t, 100, 100, "down", "280x360.jpeg") }
+func TestScaleUp(t *testing.T)   { testInterp(t, 75, 100, "up", "14x18.png") }
+func TestTransform(t *testing.T) { testInterp(t, 100, 100, "rotate", "14x18.png") }
 
 func fillPix(r *rand.Rand, pixs ...[]byte) {
 	for _, pix := range pixs {
@@ -92,7 +119,7 @@
 	}
 }
 
-func TestScaleClipCommute(t *testing.T) {
+func TestInterpClipCommute(t *testing.T) {
 	src := image.NewNRGBA(image.Rect(0, 0, 20, 20))
 	fillPix(rand.New(rand.NewSource(0)), src.Pix)
 
@@ -103,28 +130,46 @@
 		ApproxBiLinear,
 		CatmullRom,
 	}
-	for _, q := range qs {
-		dst0 := image.NewRGBA(image.Rect(1, 1, 10, 10))
-		dst1 := image.NewRGBA(image.Rect(1, 1, 10, 10))
-		for i := range dst0.Pix {
-			dst0.Pix[i] = uint8(i / 4)
-			dst1.Pix[i] = uint8(i / 4)
-		}
+	for _, transform := range []bool{false, true} {
+		for _, q := range qs {
+			if transform && q == CatmullRom {
+				// TODO: implement Kernel.Transform.
+				continue
+			}
 
-		// Scale then clip.
-		q.Scale(dst0, outer, src, src.Bounds(), nil)
-		dst0 = dst0.SubImage(inner).(*image.RGBA)
+			dst0 := image.NewRGBA(image.Rect(1, 1, 10, 10))
+			dst1 := image.NewRGBA(image.Rect(1, 1, 10, 10))
+			for i := range dst0.Pix {
+				dst0.Pix[i] = uint8(i / 4)
+				dst1.Pix[i] = uint8(i / 4)
+			}
 
-		// Clip then scale.
-		dst1 = dst1.SubImage(inner).(*image.RGBA)
-		q.Scale(dst1, outer, src, src.Bounds(), nil)
+			var interp func(dst *image.RGBA)
+			if transform {
+				interp = func(dst *image.RGBA) {
+					q.Transform(dst, transformMatrix, src, src.Bounds(), nil)
+				}
+			} else {
+				interp = func(dst *image.RGBA) {
+					q.Scale(dst, outer, src, src.Bounds(), nil)
+				}
+			}
 
-	loop:
-		for y := inner.Min.Y; y < inner.Max.Y; y++ {
-			for x := inner.Min.X; x < inner.Max.X; x++ {
-				if c0, c1 := dst0.RGBAAt(x, y), dst1.RGBAAt(x, y); c0 != c1 {
-					t.Errorf("q=%T: at (%d, %d): c0=%v, c1=%v", q, x, y, c0, c1)
-					break loop
+			// Interpolate then clip.
+			interp(dst0)
+			dst0 = dst0.SubImage(inner).(*image.RGBA)
+
+			// Clip then interpolate.
+			dst1 = dst1.SubImage(inner).(*image.RGBA)
+			interp(dst1)
+
+		loop:
+			for y := inner.Min.Y; y < inner.Max.Y; y++ {
+				for x := inner.Min.X; x < inner.Max.X; x++ {
+					if c0, c1 := dst0.RGBAAt(x, y), dst1.RGBAAt(x, y); c0 != c1 {
+						t.Errorf("q=%T: at (%d, %d): c0=%v, c1=%v", q, x, y, c0, c1)
+						break loop
+					}
 				}
 			}
 		}
@@ -184,7 +229,7 @@
 				t.Errorf("pix differ for delta=%v, q=%T", delta, q)
 			}
 
-			// TODO: Transform.
+			// TODO: Transform, once Kernel.Transform is implemented.
 		}
 	}
 }
@@ -250,6 +295,8 @@
 					if !bytes.Equal(dst0.Pix, dst1.Pix) {
 						t.Errorf("pix differ for dr=%v, src=%T, sr=%v, q=%T", dr, src, sr, q)
 					}
+
+					// TODO: Transform, once Kernel.Transform is implemented.
 				}
 			}
 		}
@@ -331,6 +378,20 @@
 	}
 }
 
+func benchTform(b *testing.B, srcf func(image.Rectangle) (image.Image, error), w int, h int, q Interpolator) {
+	dst := image.NewRGBA(image.Rect(0, 0, w, h))
+	src, err := srcf(image.Rect(0, 0, 1024, 768))
+	if err != nil {
+		b.Fatal(err)
+	}
+	sr := src.Bounds()
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		q.Transform(dst, transformMatrix, src, sr, nil)
+	}
+}
+
 func BenchmarkScaleLargeDownNN(b *testing.B) { benchScale(b, srcYCbCrLarge, 200, 150, NearestNeighbor) }
 func BenchmarkScaleLargeDownAB(b *testing.B) { benchScale(b, srcYCbCrLarge, 200, 150, ApproxBiLinear) }
 func BenchmarkScaleLargeDownBL(b *testing.B) { benchScale(b, srcYCbCrLarge, 200, 150, BiLinear) }
@@ -351,3 +412,9 @@
 func BenchmarkScaleSrcRGBA(b *testing.B)    { benchScale(b, srcRGBA, 200, 150, ApproxBiLinear) }
 func BenchmarkScaleSrcUniform(b *testing.B) { benchScale(b, srcUniform, 200, 150, ApproxBiLinear) }
 func BenchmarkScaleSrcYCbCr(b *testing.B)   { benchScale(b, srcYCbCr, 200, 150, ApproxBiLinear) }
+
+func BenchmarkTformSrcGray(b *testing.B)    { benchTform(b, srcGray, 200, 150, ApproxBiLinear) }
+func BenchmarkTformSrcNRGBA(b *testing.B)   { benchTform(b, srcNRGBA, 200, 150, ApproxBiLinear) }
+func BenchmarkTformSrcRGBA(b *testing.B)    { benchTform(b, srcRGBA, 200, 150, ApproxBiLinear) }
+func BenchmarkTformSrcUniform(b *testing.B) { benchTform(b, srcUniform, 200, 150, ApproxBiLinear) }
+func BenchmarkTformSrcYCbCr(b *testing.B)   { benchTform(b, srcYCbCr, 200, 150, ApproxBiLinear) }
diff --git a/testdata/go-turns-two-rotate-ab.png b/testdata/go-turns-two-rotate-ab.png
new file mode 100644
index 0000000..b04ab3c
--- /dev/null
+++ b/testdata/go-turns-two-rotate-ab.png
Binary files differ
diff --git a/testdata/go-turns-two-rotate-nn.png b/testdata/go-turns-two-rotate-nn.png
new file mode 100644
index 0000000..da93978
--- /dev/null
+++ b/testdata/go-turns-two-rotate-nn.png
Binary files differ