diff --git a/draw/gen.go b/draw/gen.go
index 4ba769a..8fef308 100644
--- a/draw/gen.go
+++ b/draw/gen.go
@@ -43,13 +43,39 @@
 	}
 }
 
-// dsTypes are the space-separated (dst image type, src image type) pairs to
-// generate scale_DType_SType implementations for. The last element in the
-// slice should be the fallback pair "Image image.Image".
-//
-// TODO: add more concrete types: *image.RGBA, *image.YCbCr, etc.
-var dsTypes = []string{
-	"Image image.Image",
+var (
+	// dsTypes are the (dst image type, src image type) pairs to generate
+	// scale_DType_SType implementations for. The last element in the slice
+	// should be the fallback pair ("Image", "image.Image").
+	//
+	// TODO: add *image.CMYK src type after Go 1.5 is released.
+	dsTypes = []struct{ dType, sType string }{
+		{"*image.RGBA", "*image.NRGBA"},
+		{"*image.RGBA", "*image.RGBA"},
+		{"*image.RGBA", "*image.Uniform"},
+		{"*image.RGBA", "*image.YCbCr"},
+		{"*image.RGBA", "image.Image"},
+		{"Image", "image.Image"},
+	}
+	dTypes, sTypes []string
+	sTypesForDType = map[string][]string{}
+)
+
+func init() {
+	dTypesSeen := map[string]bool{}
+	sTypesSeen := map[string]bool{}
+	for _, t := range dsTypes {
+		if !sTypesSeen[t.sType] {
+			sTypesSeen[t.sType] = true
+			sTypes = append(sTypes, t.sType)
+		}
+		if !dTypesSeen[t.dType] {
+			dTypesSeen[t.dType] = true
+			dTypes = append(dTypes, t.dType)
+		}
+		sTypesForDType[t.dType] = append(sTypesForDType[t.dType], t.sType)
+	}
+	sTypesForDType["anyDType"] = sTypes
 }
 
 type data struct {
@@ -60,12 +86,10 @@
 
 func gen(w *bytes.Buffer, receiver string, code string) {
 	expn(w, codeRoot, &data{receiver: receiver})
-
-	for _, dsType := range dsTypes {
-		dType, sType := split(dsType, " ")
+	for _, t := range dsTypes {
 		expn(w, code, &data{
-			dType:    dType,
-			sType:    sType,
+			dType:    t.dType,
+			sType:    t.sType,
 			receiver: receiver,
 		})
 	}
@@ -73,55 +97,51 @@
 
 func genKernel(w *bytes.Buffer) {
 	expn(w, codeKernelRoot, &data{})
-
-	dTypesSeen := map[string]bool{}
-	sTypesSeen := map[string]bool{}
-	for _, dsType := range dsTypes {
-		dType, sType := split(dsType, " ")
-		if !sTypesSeen[sType] {
-			sTypesSeen[sType] = true
-			expn(w, codeKernelLeafX, &data{
-				sType: sType,
-			})
-		}
-		if !dTypesSeen[dType] {
-			dTypesSeen[dType] = true
-			expn(w, codeKernelLeafY, &data{
-				dType: dType,
-			})
-		}
+	for _, sType := range sTypes {
+		expn(w, codeKernelLeafX, &data{
+			sType: sType,
+		})
+	}
+	for _, dType := range dTypes {
+		expn(w, codeKernelLeafY, &data{
+			dType: dType,
+		})
 	}
 }
 
 func expn(w *bytes.Buffer, code string, d *data) {
 	for _, line := range strings.Split(code, "\n") {
-		for {
-			i := strings.IndexByte(line, '$')
-			if i < 0 {
-				break
-			}
-			prefix, s := line[:i], line[i+1:]
-
-			i = len(s)
-			for j, c := range s {
-				if !('A' <= c && c <= 'Z' || 'a' <= c && c <= 'z') {
-					i = j
-					break
-				}
-			}
-			dollar, suffix := s[:i], s[i:]
-
-			e := expnLine(prefix, dollar, suffix, d)
-			if e == "" {
-				log.Fatalf("couldn't expand %q", line)
-			}
-			line = e
-		}
-		fmt.Fprintln(w, line)
+		fmt.Fprintln(w, expnLine(line, d))
 	}
 }
 
-func expnLine(prefix, dollar, suffix string, d *data) string {
+func expnLine(line string, d *data) string {
+	for {
+		i := strings.IndexByte(line, '$')
+		if i < 0 {
+			break
+		}
+		prefix, s := line[:i], line[i+1:]
+
+		i = len(s)
+		for j, c := range s {
+			if !('A' <= c && c <= 'Z' || 'a' <= c && c <= 'z') {
+				i = j
+				break
+			}
+		}
+		dollar, suffix := s[:i], s[i:]
+
+		e := expnDollar(prefix, dollar, suffix, d)
+		if e == "" {
+			log.Fatalf("couldn't expand %q", line)
+		}
+		line = e
+	}
+	return line
+}
+
+func expnDollar(prefix, dollar, suffix string, d *data) string {
 	switch dollar {
 	case "dType":
 		return prefix + d.dType + suffix
@@ -134,8 +154,15 @@
 	case "receiver":
 		return prefix + d.receiver + suffix
 
+	case "switch":
+		return expnSwitch("", true, suffix)
+	case "switchD":
+		return expnSwitch("", false, suffix)
+	case "switchS":
+		return expnSwitch("anyDType", false, suffix)
+
 	case "dstColorDecl":
-		if d.dType == "Image" {
+		if d.dType == "Image" || d.dType == "*image.RGBA" { // TODO: separate code for concrete types.
 			return "dstColorRGBA64 := &color.RGBA64{}\n" +
 				"dstColor := color.Color(dstColorRGBA64)"
 		}
@@ -165,7 +192,7 @@
 		switch d.dType {
 		default:
 			log.Fatalf("bad dType %q", d.dType)
-		case "Image":
+		case "Image", "*image.RGBA": // TODO: separate code for concrete types.
 			return fmt.Sprintf(""+
 				"dstColorRGBA64.R = uint16(%sr)\n"+
 				"dstColorRGBA64.G = uint16(%sg)\n"+
@@ -185,7 +212,7 @@
 		switch d.dType {
 		default:
 			log.Fatalf("bad dType %q", d.dType)
-		case "Image":
+		case "Image", "*image.RGBA": // TODO: separate code for concrete types.
 			return fmt.Sprintf(""+
 				"dstColorRGBA64.R = ftou(%sr * %s)\n"+
 				"dstColorRGBA64.G = ftou(%sg * %s)\n"+
@@ -216,7 +243,7 @@
 		switch d.sType {
 		default:
 			log.Fatalf("bad sType %q", d.sType)
-		case "image.Image":
+		case "image.Image", "*image.NRGBA", "*image.RGBA", "*image.Uniform", "*image.YCbCr": // TODO: separate code for concrete types.
 			fmt.Fprintf(buf, "%sr%s, %sg%s, %sb%s, %sa%s := "+
 				"src.At(sp.X + int(%s), sp.Y+int(%s)).RGBA()\n",
 				lhs, tmp, lhs, tmp, lhs, tmp, lhs, tmp, args[0], args[1])
@@ -240,6 +267,37 @@
 	return ""
 }
 
+func expnSwitch(dType string, expandBoth bool, template string) string {
+	switchVar := "dst"
+	if dType != "" {
+		switchVar = "src"
+	}
+	lines := []string{fmt.Sprintf("switch %s := %s.(type) {", switchVar, switchVar)}
+
+	fallback, values := "Image", dTypes
+	if dType != "" {
+		fallback, values = "image.Image", sTypesForDType[dType]
+	}
+	for _, v := range values {
+		if v == fallback {
+			lines = append(lines, "default:")
+		} else {
+			lines = append(lines, fmt.Sprintf("case %s:", v))
+		}
+
+		if dType != "" {
+			lines = append(lines, expnLine(template, &data{dType: dType, sType: v}))
+		} else if !expandBoth {
+			lines = append(lines, expnLine(template, &data{dType: v}))
+		} else {
+			lines = append(lines, expnSwitch(v, false, template))
+		}
+	}
+
+	lines = append(lines, "}")
+	return strings.Join(lines, "\n")
+}
+
 func split(s, sep string) (string, string) {
 	if i := strings.Index(s, sep); i >= 0 {
 		return strings.TrimSpace(s[:i]), strings.TrimSpace(s[i+len(sep):])
@@ -289,8 +347,7 @@
 			if z.dw <= 0 || z.dh <= 0 || z.sw <= 0 || z.sh <= 0 {
 				return
 			}
-			// TODO: generate type switches for the different dsTypes.
-			z.scale_Image_Image(dst, dp, src, sp)
+			$switch z.scale_$dTypeRN_$sTypeRN(dst, dp, src, sp)
 		}
 	`
 
@@ -362,9 +419,8 @@
 			// scaleY distributes the temporary image's rows over the destination image.
 			// TODO: is it worth having a sync.Pool for this temporary buffer?
 			tmp := make([][4]float64, z.dw*z.sh)
-			// TODO: generate type switches for the different dTypes and sTypes.
-			z.scaleX_Image(tmp, src, sp)
-			z.scaleY_Image(dst, dp, tmp)
+			$switchS z.scaleX_$sTypeRN(tmp, src, sp)
+			$switchD z.scaleY_$dTypeRN(dst, dp, tmp)
 		}
 	`
 
diff --git a/draw/impl.go b/draw/impl.go
index cf164aa..2c18717 100644
--- a/draw/impl.go
+++ b/draw/impl.go
@@ -11,8 +11,111 @@
 	if z.dw <= 0 || z.dh <= 0 || z.sw <= 0 || z.sh <= 0 {
 		return
 	}
-	// TODO: generate type switches for the different dsTypes.
-	z.scale_Image_Image(dst, dp, src, sp)
+	switch dst := dst.(type) {
+	case *image.RGBA:
+		switch src := src.(type) {
+		case *image.NRGBA:
+			z.scale_RGBA_NRGBA(dst, dp, src, sp)
+		case *image.RGBA:
+			z.scale_RGBA_RGBA(dst, dp, src, sp)
+		case *image.Uniform:
+			z.scale_RGBA_Uniform(dst, dp, src, sp)
+		case *image.YCbCr:
+			z.scale_RGBA_YCbCr(dst, dp, src, sp)
+		default:
+			z.scale_RGBA_Image(dst, dp, src, sp)
+		}
+	default:
+		switch src := src.(type) {
+		default:
+			z.scale_Image_Image(dst, dp, src, sp)
+		}
+	}
+}
+
+func (z *nnScaler) scale_RGBA_NRGBA(dst *image.RGBA, dp image.Point, src *image.NRGBA, sp image.Point) {
+	dstColorRGBA64 := &color.RGBA64{}
+	dstColor := color.Color(dstColorRGBA64)
+	for dy := int32(0); dy < z.dh; dy++ {
+		sy := (2*uint64(dy) + 1) * uint64(z.sh) / (2 * uint64(z.dh))
+		for dx := int32(0); dx < z.dw; dx++ {
+			sx := (2*uint64(dx) + 1) * uint64(z.sw) / (2 * uint64(z.dw))
+			pr, pg, pb, pa := src.At(sp.X+int(sx), sp.Y+int(sy)).RGBA()
+			dstColorRGBA64.R = uint16(pr)
+			dstColorRGBA64.G = uint16(pg)
+			dstColorRGBA64.B = uint16(pb)
+			dstColorRGBA64.A = uint16(pa)
+			dst.Set(dp.X+int(dx), dp.Y+int(dy), dstColor)
+		}
+	}
+}
+
+func (z *nnScaler) scale_RGBA_RGBA(dst *image.RGBA, dp image.Point, src *image.RGBA, sp image.Point) {
+	dstColorRGBA64 := &color.RGBA64{}
+	dstColor := color.Color(dstColorRGBA64)
+	for dy := int32(0); dy < z.dh; dy++ {
+		sy := (2*uint64(dy) + 1) * uint64(z.sh) / (2 * uint64(z.dh))
+		for dx := int32(0); dx < z.dw; dx++ {
+			sx := (2*uint64(dx) + 1) * uint64(z.sw) / (2 * uint64(z.dw))
+			pr, pg, pb, pa := src.At(sp.X+int(sx), sp.Y+int(sy)).RGBA()
+			dstColorRGBA64.R = uint16(pr)
+			dstColorRGBA64.G = uint16(pg)
+			dstColorRGBA64.B = uint16(pb)
+			dstColorRGBA64.A = uint16(pa)
+			dst.Set(dp.X+int(dx), dp.Y+int(dy), dstColor)
+		}
+	}
+}
+
+func (z *nnScaler) scale_RGBA_Uniform(dst *image.RGBA, dp image.Point, src *image.Uniform, sp image.Point) {
+	dstColorRGBA64 := &color.RGBA64{}
+	dstColor := color.Color(dstColorRGBA64)
+	for dy := int32(0); dy < z.dh; dy++ {
+		sy := (2*uint64(dy) + 1) * uint64(z.sh) / (2 * uint64(z.dh))
+		for dx := int32(0); dx < z.dw; dx++ {
+			sx := (2*uint64(dx) + 1) * uint64(z.sw) / (2 * uint64(z.dw))
+			pr, pg, pb, pa := src.At(sp.X+int(sx), sp.Y+int(sy)).RGBA()
+			dstColorRGBA64.R = uint16(pr)
+			dstColorRGBA64.G = uint16(pg)
+			dstColorRGBA64.B = uint16(pb)
+			dstColorRGBA64.A = uint16(pa)
+			dst.Set(dp.X+int(dx), dp.Y+int(dy), dstColor)
+		}
+	}
+}
+
+func (z *nnScaler) scale_RGBA_YCbCr(dst *image.RGBA, dp image.Point, src *image.YCbCr, sp image.Point) {
+	dstColorRGBA64 := &color.RGBA64{}
+	dstColor := color.Color(dstColorRGBA64)
+	for dy := int32(0); dy < z.dh; dy++ {
+		sy := (2*uint64(dy) + 1) * uint64(z.sh) / (2 * uint64(z.dh))
+		for dx := int32(0); dx < z.dw; dx++ {
+			sx := (2*uint64(dx) + 1) * uint64(z.sw) / (2 * uint64(z.dw))
+			pr, pg, pb, pa := src.At(sp.X+int(sx), sp.Y+int(sy)).RGBA()
+			dstColorRGBA64.R = uint16(pr)
+			dstColorRGBA64.G = uint16(pg)
+			dstColorRGBA64.B = uint16(pb)
+			dstColorRGBA64.A = uint16(pa)
+			dst.Set(dp.X+int(dx), dp.Y+int(dy), dstColor)
+		}
+	}
+}
+
+func (z *nnScaler) scale_RGBA_Image(dst *image.RGBA, dp image.Point, src image.Image, sp image.Point) {
+	dstColorRGBA64 := &color.RGBA64{}
+	dstColor := color.Color(dstColorRGBA64)
+	for dy := int32(0); dy < z.dh; dy++ {
+		sy := (2*uint64(dy) + 1) * uint64(z.sh) / (2 * uint64(z.dh))
+		for dx := int32(0); dx < z.dw; dx++ {
+			sx := (2*uint64(dx) + 1) * uint64(z.sw) / (2 * uint64(z.dw))
+			pr, pg, pb, pa := src.At(sp.X+int(sx), sp.Y+int(sy)).RGBA()
+			dstColorRGBA64.R = uint16(pr)
+			dstColorRGBA64.G = uint16(pg)
+			dstColorRGBA64.B = uint16(pb)
+			dstColorRGBA64.A = uint16(pa)
+			dst.Set(dp.X+int(dx), dp.Y+int(dy), dstColor)
+		}
+	}
 }
 
 func (z *nnScaler) scale_Image_Image(dst Image, dp image.Point, src image.Image, sp image.Point) {
@@ -36,8 +139,386 @@
 	if z.dw <= 0 || z.dh <= 0 || z.sw <= 0 || z.sh <= 0 {
 		return
 	}
-	// TODO: generate type switches for the different dsTypes.
-	z.scale_Image_Image(dst, dp, src, sp)
+	switch dst := dst.(type) {
+	case *image.RGBA:
+		switch src := src.(type) {
+		case *image.NRGBA:
+			z.scale_RGBA_NRGBA(dst, dp, src, sp)
+		case *image.RGBA:
+			z.scale_RGBA_RGBA(dst, dp, src, sp)
+		case *image.Uniform:
+			z.scale_RGBA_Uniform(dst, dp, src, sp)
+		case *image.YCbCr:
+			z.scale_RGBA_YCbCr(dst, dp, src, sp)
+		default:
+			z.scale_RGBA_Image(dst, dp, src, sp)
+		}
+	default:
+		switch src := src.(type) {
+		default:
+			z.scale_Image_Image(dst, dp, src, sp)
+		}
+	}
+}
+
+func (z *ablScaler) scale_RGBA_NRGBA(dst *image.RGBA, dp image.Point, src *image.NRGBA, sp image.Point) {
+	yscale := float64(z.sh) / float64(z.dh)
+	xscale := float64(z.sw) / float64(z.dw)
+	dstColorRGBA64 := &color.RGBA64{}
+	dstColor := color.Color(dstColorRGBA64)
+	for dy := int32(0); dy < z.dh; dy++ {
+		sy := (float64(dy)+0.5)*yscale - 0.5
+		sy0 := int32(sy)
+		yFrac0 := sy - float64(sy0)
+		yFrac1 := 1 - yFrac0
+		sy1 := sy0 + 1
+		if sy < 0 {
+			sy0, sy1 = 0, 0
+			yFrac0, yFrac1 = 0, 1
+		} else if sy1 >= z.sh {
+			sy1 = sy0
+			yFrac0, yFrac1 = 1, 0
+		}
+		for dx := int32(0); dx < z.dw; dx++ {
+			sx := (float64(dx)+0.5)*xscale - 0.5
+			sx0 := int32(sx)
+			xFrac0 := sx - float64(sx0)
+			xFrac1 := 1 - xFrac0
+			sx1 := sx0 + 1
+			if sx < 0 {
+				sx0, sx1 = 0, 0
+				xFrac0, xFrac1 = 0, 1
+			} else if sx1 >= z.sw {
+				sx1 = sx0
+				xFrac0, xFrac1 = 1, 0
+			}
+			s00ru, s00gu, s00bu, s00au := src.At(sp.X+int(sx0), sp.Y+int(sy0)).RGBA()
+			s00r := float64(s00ru)
+			s00g := float64(s00gu)
+			s00b := float64(s00bu)
+			s00a := float64(s00au)
+			s10ru, s10gu, s10bu, s10au := src.At(sp.X+int(sx1), sp.Y+int(sy0)).RGBA()
+			s10r := float64(s10ru)
+			s10g := float64(s10gu)
+			s10b := float64(s10bu)
+			s10a := float64(s10au)
+			s10r = xFrac1*s00r + xFrac0*s10r
+			s10g = xFrac1*s00g + xFrac0*s10g
+			s10b = xFrac1*s00b + xFrac0*s10b
+			s10a = xFrac1*s00a + xFrac0*s10a
+			s01ru, s01gu, s01bu, s01au := src.At(sp.X+int(sx0), sp.Y+int(sy1)).RGBA()
+			s01r := float64(s01ru)
+			s01g := float64(s01gu)
+			s01b := float64(s01bu)
+			s01a := float64(s01au)
+			s11ru, s11gu, s11bu, s11au := src.At(sp.X+int(sx1), sp.Y+int(sy1)).RGBA()
+			s11r := float64(s11ru)
+			s11g := float64(s11gu)
+			s11b := float64(s11bu)
+			s11a := float64(s11au)
+			s11r = xFrac1*s01r + xFrac0*s11r
+			s11g = xFrac1*s01g + xFrac0*s11g
+			s11b = xFrac1*s01b + xFrac0*s11b
+			s11a = xFrac1*s01a + xFrac0*s11a
+			s11r = yFrac1*s10r + yFrac0*s11r
+			s11g = yFrac1*s10g + yFrac0*s11g
+			s11b = yFrac1*s10b + yFrac0*s11b
+			s11a = yFrac1*s10a + yFrac0*s11a
+			dstColorRGBA64.R = uint16(s11r)
+			dstColorRGBA64.G = uint16(s11g)
+			dstColorRGBA64.B = uint16(s11b)
+			dstColorRGBA64.A = uint16(s11a)
+			dst.Set(dp.X+int(dx), dp.Y+int(dy), dstColor)
+		}
+	}
+}
+
+func (z *ablScaler) scale_RGBA_RGBA(dst *image.RGBA, dp image.Point, src *image.RGBA, sp image.Point) {
+	yscale := float64(z.sh) / float64(z.dh)
+	xscale := float64(z.sw) / float64(z.dw)
+	dstColorRGBA64 := &color.RGBA64{}
+	dstColor := color.Color(dstColorRGBA64)
+	for dy := int32(0); dy < z.dh; dy++ {
+		sy := (float64(dy)+0.5)*yscale - 0.5
+		sy0 := int32(sy)
+		yFrac0 := sy - float64(sy0)
+		yFrac1 := 1 - yFrac0
+		sy1 := sy0 + 1
+		if sy < 0 {
+			sy0, sy1 = 0, 0
+			yFrac0, yFrac1 = 0, 1
+		} else if sy1 >= z.sh {
+			sy1 = sy0
+			yFrac0, yFrac1 = 1, 0
+		}
+		for dx := int32(0); dx < z.dw; dx++ {
+			sx := (float64(dx)+0.5)*xscale - 0.5
+			sx0 := int32(sx)
+			xFrac0 := sx - float64(sx0)
+			xFrac1 := 1 - xFrac0
+			sx1 := sx0 + 1
+			if sx < 0 {
+				sx0, sx1 = 0, 0
+				xFrac0, xFrac1 = 0, 1
+			} else if sx1 >= z.sw {
+				sx1 = sx0
+				xFrac0, xFrac1 = 1, 0
+			}
+			s00ru, s00gu, s00bu, s00au := src.At(sp.X+int(sx0), sp.Y+int(sy0)).RGBA()
+			s00r := float64(s00ru)
+			s00g := float64(s00gu)
+			s00b := float64(s00bu)
+			s00a := float64(s00au)
+			s10ru, s10gu, s10bu, s10au := src.At(sp.X+int(sx1), sp.Y+int(sy0)).RGBA()
+			s10r := float64(s10ru)
+			s10g := float64(s10gu)
+			s10b := float64(s10bu)
+			s10a := float64(s10au)
+			s10r = xFrac1*s00r + xFrac0*s10r
+			s10g = xFrac1*s00g + xFrac0*s10g
+			s10b = xFrac1*s00b + xFrac0*s10b
+			s10a = xFrac1*s00a + xFrac0*s10a
+			s01ru, s01gu, s01bu, s01au := src.At(sp.X+int(sx0), sp.Y+int(sy1)).RGBA()
+			s01r := float64(s01ru)
+			s01g := float64(s01gu)
+			s01b := float64(s01bu)
+			s01a := float64(s01au)
+			s11ru, s11gu, s11bu, s11au := src.At(sp.X+int(sx1), sp.Y+int(sy1)).RGBA()
+			s11r := float64(s11ru)
+			s11g := float64(s11gu)
+			s11b := float64(s11bu)
+			s11a := float64(s11au)
+			s11r = xFrac1*s01r + xFrac0*s11r
+			s11g = xFrac1*s01g + xFrac0*s11g
+			s11b = xFrac1*s01b + xFrac0*s11b
+			s11a = xFrac1*s01a + xFrac0*s11a
+			s11r = yFrac1*s10r + yFrac0*s11r
+			s11g = yFrac1*s10g + yFrac0*s11g
+			s11b = yFrac1*s10b + yFrac0*s11b
+			s11a = yFrac1*s10a + yFrac0*s11a
+			dstColorRGBA64.R = uint16(s11r)
+			dstColorRGBA64.G = uint16(s11g)
+			dstColorRGBA64.B = uint16(s11b)
+			dstColorRGBA64.A = uint16(s11a)
+			dst.Set(dp.X+int(dx), dp.Y+int(dy), dstColor)
+		}
+	}
+}
+
+func (z *ablScaler) scale_RGBA_Uniform(dst *image.RGBA, dp image.Point, src *image.Uniform, sp image.Point) {
+	yscale := float64(z.sh) / float64(z.dh)
+	xscale := float64(z.sw) / float64(z.dw)
+	dstColorRGBA64 := &color.RGBA64{}
+	dstColor := color.Color(dstColorRGBA64)
+	for dy := int32(0); dy < z.dh; dy++ {
+		sy := (float64(dy)+0.5)*yscale - 0.5
+		sy0 := int32(sy)
+		yFrac0 := sy - float64(sy0)
+		yFrac1 := 1 - yFrac0
+		sy1 := sy0 + 1
+		if sy < 0 {
+			sy0, sy1 = 0, 0
+			yFrac0, yFrac1 = 0, 1
+		} else if sy1 >= z.sh {
+			sy1 = sy0
+			yFrac0, yFrac1 = 1, 0
+		}
+		for dx := int32(0); dx < z.dw; dx++ {
+			sx := (float64(dx)+0.5)*xscale - 0.5
+			sx0 := int32(sx)
+			xFrac0 := sx - float64(sx0)
+			xFrac1 := 1 - xFrac0
+			sx1 := sx0 + 1
+			if sx < 0 {
+				sx0, sx1 = 0, 0
+				xFrac0, xFrac1 = 0, 1
+			} else if sx1 >= z.sw {
+				sx1 = sx0
+				xFrac0, xFrac1 = 1, 0
+			}
+			s00ru, s00gu, s00bu, s00au := src.At(sp.X+int(sx0), sp.Y+int(sy0)).RGBA()
+			s00r := float64(s00ru)
+			s00g := float64(s00gu)
+			s00b := float64(s00bu)
+			s00a := float64(s00au)
+			s10ru, s10gu, s10bu, s10au := src.At(sp.X+int(sx1), sp.Y+int(sy0)).RGBA()
+			s10r := float64(s10ru)
+			s10g := float64(s10gu)
+			s10b := float64(s10bu)
+			s10a := float64(s10au)
+			s10r = xFrac1*s00r + xFrac0*s10r
+			s10g = xFrac1*s00g + xFrac0*s10g
+			s10b = xFrac1*s00b + xFrac0*s10b
+			s10a = xFrac1*s00a + xFrac0*s10a
+			s01ru, s01gu, s01bu, s01au := src.At(sp.X+int(sx0), sp.Y+int(sy1)).RGBA()
+			s01r := float64(s01ru)
+			s01g := float64(s01gu)
+			s01b := float64(s01bu)
+			s01a := float64(s01au)
+			s11ru, s11gu, s11bu, s11au := src.At(sp.X+int(sx1), sp.Y+int(sy1)).RGBA()
+			s11r := float64(s11ru)
+			s11g := float64(s11gu)
+			s11b := float64(s11bu)
+			s11a := float64(s11au)
+			s11r = xFrac1*s01r + xFrac0*s11r
+			s11g = xFrac1*s01g + xFrac0*s11g
+			s11b = xFrac1*s01b + xFrac0*s11b
+			s11a = xFrac1*s01a + xFrac0*s11a
+			s11r = yFrac1*s10r + yFrac0*s11r
+			s11g = yFrac1*s10g + yFrac0*s11g
+			s11b = yFrac1*s10b + yFrac0*s11b
+			s11a = yFrac1*s10a + yFrac0*s11a
+			dstColorRGBA64.R = uint16(s11r)
+			dstColorRGBA64.G = uint16(s11g)
+			dstColorRGBA64.B = uint16(s11b)
+			dstColorRGBA64.A = uint16(s11a)
+			dst.Set(dp.X+int(dx), dp.Y+int(dy), dstColor)
+		}
+	}
+}
+
+func (z *ablScaler) scale_RGBA_YCbCr(dst *image.RGBA, dp image.Point, src *image.YCbCr, sp image.Point) {
+	yscale := float64(z.sh) / float64(z.dh)
+	xscale := float64(z.sw) / float64(z.dw)
+	dstColorRGBA64 := &color.RGBA64{}
+	dstColor := color.Color(dstColorRGBA64)
+	for dy := int32(0); dy < z.dh; dy++ {
+		sy := (float64(dy)+0.5)*yscale - 0.5
+		sy0 := int32(sy)
+		yFrac0 := sy - float64(sy0)
+		yFrac1 := 1 - yFrac0
+		sy1 := sy0 + 1
+		if sy < 0 {
+			sy0, sy1 = 0, 0
+			yFrac0, yFrac1 = 0, 1
+		} else if sy1 >= z.sh {
+			sy1 = sy0
+			yFrac0, yFrac1 = 1, 0
+		}
+		for dx := int32(0); dx < z.dw; dx++ {
+			sx := (float64(dx)+0.5)*xscale - 0.5
+			sx0 := int32(sx)
+			xFrac0 := sx - float64(sx0)
+			xFrac1 := 1 - xFrac0
+			sx1 := sx0 + 1
+			if sx < 0 {
+				sx0, sx1 = 0, 0
+				xFrac0, xFrac1 = 0, 1
+			} else if sx1 >= z.sw {
+				sx1 = sx0
+				xFrac0, xFrac1 = 1, 0
+			}
+			s00ru, s00gu, s00bu, s00au := src.At(sp.X+int(sx0), sp.Y+int(sy0)).RGBA()
+			s00r := float64(s00ru)
+			s00g := float64(s00gu)
+			s00b := float64(s00bu)
+			s00a := float64(s00au)
+			s10ru, s10gu, s10bu, s10au := src.At(sp.X+int(sx1), sp.Y+int(sy0)).RGBA()
+			s10r := float64(s10ru)
+			s10g := float64(s10gu)
+			s10b := float64(s10bu)
+			s10a := float64(s10au)
+			s10r = xFrac1*s00r + xFrac0*s10r
+			s10g = xFrac1*s00g + xFrac0*s10g
+			s10b = xFrac1*s00b + xFrac0*s10b
+			s10a = xFrac1*s00a + xFrac0*s10a
+			s01ru, s01gu, s01bu, s01au := src.At(sp.X+int(sx0), sp.Y+int(sy1)).RGBA()
+			s01r := float64(s01ru)
+			s01g := float64(s01gu)
+			s01b := float64(s01bu)
+			s01a := float64(s01au)
+			s11ru, s11gu, s11bu, s11au := src.At(sp.X+int(sx1), sp.Y+int(sy1)).RGBA()
+			s11r := float64(s11ru)
+			s11g := float64(s11gu)
+			s11b := float64(s11bu)
+			s11a := float64(s11au)
+			s11r = xFrac1*s01r + xFrac0*s11r
+			s11g = xFrac1*s01g + xFrac0*s11g
+			s11b = xFrac1*s01b + xFrac0*s11b
+			s11a = xFrac1*s01a + xFrac0*s11a
+			s11r = yFrac1*s10r + yFrac0*s11r
+			s11g = yFrac1*s10g + yFrac0*s11g
+			s11b = yFrac1*s10b + yFrac0*s11b
+			s11a = yFrac1*s10a + yFrac0*s11a
+			dstColorRGBA64.R = uint16(s11r)
+			dstColorRGBA64.G = uint16(s11g)
+			dstColorRGBA64.B = uint16(s11b)
+			dstColorRGBA64.A = uint16(s11a)
+			dst.Set(dp.X+int(dx), dp.Y+int(dy), dstColor)
+		}
+	}
+}
+
+func (z *ablScaler) scale_RGBA_Image(dst *image.RGBA, dp image.Point, src image.Image, sp image.Point) {
+	yscale := float64(z.sh) / float64(z.dh)
+	xscale := float64(z.sw) / float64(z.dw)
+	dstColorRGBA64 := &color.RGBA64{}
+	dstColor := color.Color(dstColorRGBA64)
+	for dy := int32(0); dy < z.dh; dy++ {
+		sy := (float64(dy)+0.5)*yscale - 0.5
+		sy0 := int32(sy)
+		yFrac0 := sy - float64(sy0)
+		yFrac1 := 1 - yFrac0
+		sy1 := sy0 + 1
+		if sy < 0 {
+			sy0, sy1 = 0, 0
+			yFrac0, yFrac1 = 0, 1
+		} else if sy1 >= z.sh {
+			sy1 = sy0
+			yFrac0, yFrac1 = 1, 0
+		}
+		for dx := int32(0); dx < z.dw; dx++ {
+			sx := (float64(dx)+0.5)*xscale - 0.5
+			sx0 := int32(sx)
+			xFrac0 := sx - float64(sx0)
+			xFrac1 := 1 - xFrac0
+			sx1 := sx0 + 1
+			if sx < 0 {
+				sx0, sx1 = 0, 0
+				xFrac0, xFrac1 = 0, 1
+			} else if sx1 >= z.sw {
+				sx1 = sx0
+				xFrac0, xFrac1 = 1, 0
+			}
+			s00ru, s00gu, s00bu, s00au := src.At(sp.X+int(sx0), sp.Y+int(sy0)).RGBA()
+			s00r := float64(s00ru)
+			s00g := float64(s00gu)
+			s00b := float64(s00bu)
+			s00a := float64(s00au)
+			s10ru, s10gu, s10bu, s10au := src.At(sp.X+int(sx1), sp.Y+int(sy0)).RGBA()
+			s10r := float64(s10ru)
+			s10g := float64(s10gu)
+			s10b := float64(s10bu)
+			s10a := float64(s10au)
+			s10r = xFrac1*s00r + xFrac0*s10r
+			s10g = xFrac1*s00g + xFrac0*s10g
+			s10b = xFrac1*s00b + xFrac0*s10b
+			s10a = xFrac1*s00a + xFrac0*s10a
+			s01ru, s01gu, s01bu, s01au := src.At(sp.X+int(sx0), sp.Y+int(sy1)).RGBA()
+			s01r := float64(s01ru)
+			s01g := float64(s01gu)
+			s01b := float64(s01bu)
+			s01a := float64(s01au)
+			s11ru, s11gu, s11bu, s11au := src.At(sp.X+int(sx1), sp.Y+int(sy1)).RGBA()
+			s11r := float64(s11ru)
+			s11g := float64(s11gu)
+			s11b := float64(s11bu)
+			s11a := float64(s11au)
+			s11r = xFrac1*s01r + xFrac0*s11r
+			s11g = xFrac1*s01g + xFrac0*s11g
+			s11b = xFrac1*s01b + xFrac0*s11b
+			s11a = xFrac1*s01a + xFrac0*s11a
+			s11r = yFrac1*s10r + yFrac0*s11r
+			s11g = yFrac1*s10g + yFrac0*s11g
+			s11b = yFrac1*s10b + yFrac0*s11b
+			s11a = yFrac1*s10a + yFrac0*s11a
+			dstColorRGBA64.R = uint16(s11r)
+			dstColorRGBA64.G = uint16(s11g)
+			dstColorRGBA64.B = uint16(s11b)
+			dstColorRGBA64.A = uint16(s11a)
+			dst.Set(dp.X+int(dx), dp.Y+int(dy), dstColor)
+		}
+	}
 }
 
 func (z *ablScaler) scale_Image_Image(dst Image, dp image.Point, src image.Image, sp image.Point) {
@@ -121,9 +602,116 @@
 	// scaleY distributes the temporary image's rows over the destination image.
 	// TODO: is it worth having a sync.Pool for this temporary buffer?
 	tmp := make([][4]float64, z.dw*z.sh)
-	// TODO: generate type switches for the different dTypes and sTypes.
-	z.scaleX_Image(tmp, src, sp)
-	z.scaleY_Image(dst, dp, tmp)
+	switch src := src.(type) {
+	case *image.NRGBA:
+		z.scaleX_NRGBA(tmp, src, sp)
+	case *image.RGBA:
+		z.scaleX_RGBA(tmp, src, sp)
+	case *image.Uniform:
+		z.scaleX_Uniform(tmp, src, sp)
+	case *image.YCbCr:
+		z.scaleX_YCbCr(tmp, src, sp)
+	default:
+		z.scaleX_Image(tmp, src, sp)
+	}
+	switch dst := dst.(type) {
+	case *image.RGBA:
+		z.scaleY_RGBA(dst, dp, tmp)
+	default:
+		z.scaleY_Image(dst, dp, tmp)
+	}
+}
+
+func (z *kernelScaler) scaleX_NRGBA(tmp [][4]float64, src *image.NRGBA, sp image.Point) {
+	t := 0
+	for y := int32(0); y < z.sh; y++ {
+		for _, s := range z.horizontal.sources {
+			var pr, pg, pb, pa float64
+			for _, c := range z.horizontal.contribs[s.i:s.j] {
+				pru, pgu, pbu, pau := src.At(sp.X+int(c.coord), sp.Y+int(y)).RGBA()
+				pr += float64(pru) * c.weight
+				pg += float64(pgu) * c.weight
+				pb += float64(pbu) * c.weight
+				pa += float64(pau) * c.weight
+			}
+			tmp[t] = [4]float64{
+				pr * s.invTotalWeightFFFF,
+				pg * s.invTotalWeightFFFF,
+				pb * s.invTotalWeightFFFF,
+				pa * s.invTotalWeightFFFF,
+			}
+			t++
+		}
+	}
+}
+
+func (z *kernelScaler) scaleX_RGBA(tmp [][4]float64, src *image.RGBA, sp image.Point) {
+	t := 0
+	for y := int32(0); y < z.sh; y++ {
+		for _, s := range z.horizontal.sources {
+			var pr, pg, pb, pa float64
+			for _, c := range z.horizontal.contribs[s.i:s.j] {
+				pru, pgu, pbu, pau := src.At(sp.X+int(c.coord), sp.Y+int(y)).RGBA()
+				pr += float64(pru) * c.weight
+				pg += float64(pgu) * c.weight
+				pb += float64(pbu) * c.weight
+				pa += float64(pau) * c.weight
+			}
+			tmp[t] = [4]float64{
+				pr * s.invTotalWeightFFFF,
+				pg * s.invTotalWeightFFFF,
+				pb * s.invTotalWeightFFFF,
+				pa * s.invTotalWeightFFFF,
+			}
+			t++
+		}
+	}
+}
+
+func (z *kernelScaler) scaleX_Uniform(tmp [][4]float64, src *image.Uniform, sp image.Point) {
+	t := 0
+	for y := int32(0); y < z.sh; y++ {
+		for _, s := range z.horizontal.sources {
+			var pr, pg, pb, pa float64
+			for _, c := range z.horizontal.contribs[s.i:s.j] {
+				pru, pgu, pbu, pau := src.At(sp.X+int(c.coord), sp.Y+int(y)).RGBA()
+				pr += float64(pru) * c.weight
+				pg += float64(pgu) * c.weight
+				pb += float64(pbu) * c.weight
+				pa += float64(pau) * c.weight
+			}
+			tmp[t] = [4]float64{
+				pr * s.invTotalWeightFFFF,
+				pg * s.invTotalWeightFFFF,
+				pb * s.invTotalWeightFFFF,
+				pa * s.invTotalWeightFFFF,
+			}
+			t++
+		}
+	}
+}
+
+func (z *kernelScaler) scaleX_YCbCr(tmp [][4]float64, src *image.YCbCr, sp image.Point) {
+	t := 0
+	for y := int32(0); y < z.sh; y++ {
+		for _, s := range z.horizontal.sources {
+			var pr, pg, pb, pa float64
+			for _, c := range z.horizontal.contribs[s.i:s.j] {
+				pru, pgu, pbu, pau := src.At(sp.X+int(c.coord), sp.Y+int(y)).RGBA()
+				pr += float64(pru) * c.weight
+				pg += float64(pgu) * c.weight
+				pb += float64(pbu) * c.weight
+				pa += float64(pau) * c.weight
+			}
+			tmp[t] = [4]float64{
+				pr * s.invTotalWeightFFFF,
+				pg * s.invTotalWeightFFFF,
+				pb * s.invTotalWeightFFFF,
+				pa * s.invTotalWeightFFFF,
+			}
+			t++
+		}
+	}
 }
 
 func (z *kernelScaler) scaleX_Image(tmp [][4]float64, src image.Image, sp image.Point) {
@@ -149,6 +737,28 @@
 	}
 }
 
+func (z *kernelScaler) scaleY_RGBA(dst *image.RGBA, dp image.Point, tmp [][4]float64) {
+	dstColorRGBA64 := &color.RGBA64{}
+	dstColor := color.Color(dstColorRGBA64)
+	for x := int32(0); x < z.dw; x++ {
+		for y, s := range z.vertical.sources {
+			var pr, pg, pb, pa float64
+			for _, c := range z.vertical.contribs[s.i:s.j] {
+				p := &tmp[c.coord*z.dw+x]
+				pr += p[0] * c.weight
+				pg += p[1] * c.weight
+				pb += p[2] * c.weight
+				pa += p[3] * c.weight
+			}
+			dstColorRGBA64.R = ftou(pr * s.invTotalWeight)
+			dstColorRGBA64.G = ftou(pg * s.invTotalWeight)
+			dstColorRGBA64.B = ftou(pb * s.invTotalWeight)
+			dstColorRGBA64.A = ftou(pa * s.invTotalWeight)
+			dst.Set(dp.X+int(x), dp.Y+int(y), dstColor)
+		}
+	}
+}
+
 func (z *kernelScaler) scaleY_Image(dst Image, dp image.Point, tmp [][4]float64) {
 	dstColorRGBA64 := &color.RGBA64{}
 	dstColor := color.Color(dstColorRGBA64)
diff --git a/draw/scale_test.go b/draw/scale_test.go
index bc99393..6d4baac 100644
--- a/draw/scale_test.go
+++ b/draw/scale_test.go
@@ -81,26 +81,52 @@
 func TestScaleDown(t *testing.T) { testScale(t, 100, 100, "down", "280x360.jpeg") }
 func TestScaleUp(t *testing.T)   { testScale(t, 75, 100, "up", "14x18.png") }
 
-func benchScale(b *testing.B, largeSrc bool, w int, h int, q Interpolator) {
-	var src image.Image
-	if largeSrc {
-		// 3072 x 2304 is over 7 million pixels at 4:3, comparable to a
-		// 2015 smart-phone camera's output.
-		src = image.NewYCbCr(image.Rect(0, 0, 3072, 2304), image.YCbCrSubsampleRatio420)
-	} else {
-		// tux.png is a 386 x 395 image.
-		f, err := os.Open("../testdata/tux.png")
-		if err != nil {
-			b.Fatalf("Open: %v", err)
-		}
-		defer f.Close()
-		src, err = png.Decode(f)
-		if err != nil {
-			b.Fatalf("Decode: %v", err)
-		}
-	}
+// TODO: test that scaling concrete types like *image.RGBA and *image.YCbCr
+// gives the same results as scaling those images wrapped in another Image or
+// image.Image type that would skip the fast-path type switch.
 
+func srcNRGBA() (image.Image, error) {
+	return image.NewNRGBA(image.Rect(0, 0, 1024, 768)), nil
+}
+
+func srcRGBA() (image.Image, error) {
+	return image.NewRGBA(image.Rect(0, 0, 1024, 768)), nil
+}
+
+func srcUniform() (image.Image, error) {
+	return image.White, nil
+}
+
+func srcYCbCr() (image.Image, error) {
+	return image.NewYCbCr(image.Rect(0, 0, 1024, 768), image.YCbCrSubsampleRatio420), nil
+}
+
+func srcYCbCrLarge() (image.Image, error) {
+	// 3072 x 2304 is over 7 million pixels at 4:3, comparable to a
+	// 2015 smart-phone camera's output.
+	return image.NewYCbCr(image.Rect(0, 0, 3072, 2304), image.YCbCrSubsampleRatio420), nil
+}
+
+func srcTux() (image.Image, error) {
+	// tux.png is a 386 x 395 image.
+	f, err := os.Open("../testdata/tux.png")
+	if err != nil {
+		return nil, fmt.Errorf("Open: %v", err)
+	}
+	defer f.Close()
+	src, err := png.Decode(f)
+	if err != nil {
+		return nil, fmt.Errorf("Decode: %v", err)
+	}
+	return src, nil
+}
+
+func benchScale(b *testing.B, srcf func() (image.Image, error), w int, h int, q Interpolator) {
 	dst := image.NewRGBA(image.Rect(0, 0, w, h))
+	src, err := srcf()
+	if err != nil {
+		b.Fatal(err)
+	}
 	dr, sr := dst.Bounds(), src.Bounds()
 	scaler := q.NewScaler(int32(dr.Dx()), int32(dr.Dy()), int32(sr.Dx()), int32(sr.Dy()))
 
@@ -110,15 +136,22 @@
 	}
 }
 
-func BenchmarkScaleLargeDownNN(b *testing.B) { benchScale(b, true, 200, 150, NearestNeighbor) }
-func BenchmarkScaleLargeDownAB(b *testing.B) { benchScale(b, true, 200, 150, ApproxBiLinear) }
-func BenchmarkScaleLargeDownBL(b *testing.B) { benchScale(b, true, 200, 150, BiLinear) }
-func BenchmarkScaleLargeDownCR(b *testing.B) { benchScale(b, true, 200, 150, CatmullRom) }
-func BenchmarkScaleDownNN(b *testing.B)      { benchScale(b, false, 120, 80, NearestNeighbor) }
-func BenchmarkScaleDownAB(b *testing.B)      { benchScale(b, false, 120, 80, ApproxBiLinear) }
-func BenchmarkScaleDownBL(b *testing.B)      { benchScale(b, false, 120, 80, BiLinear) }
-func BenchmarkScaleDownCR(b *testing.B)      { benchScale(b, false, 120, 80, CatmullRom) }
-func BenchmarkScaleUpNN(b *testing.B)        { benchScale(b, false, 800, 600, NearestNeighbor) }
-func BenchmarkScaleUpAB(b *testing.B)        { benchScale(b, false, 800, 600, ApproxBiLinear) }
-func BenchmarkScaleUpBL(b *testing.B)        { benchScale(b, false, 800, 600, BiLinear) }
-func BenchmarkScaleUpCR(b *testing.B)        { benchScale(b, false, 800, 600, CatmullRom) }
+func BenchmarkScaleLargeDownNN(b *testing.B) { benchScale(b, srcYCbCrLarge, 200, 150, NearestNeighbor) }
+func BenchmarkScaleLargeDownAB(b *testing.B) { benchScale(b, srcYCbCrLarge, 200, 150, ApproxBiLinear) }
+func BenchmarkScaleLargeDownBL(b *testing.B) { benchScale(b, srcYCbCrLarge, 200, 150, BiLinear) }
+func BenchmarkScaleLargeDownCR(b *testing.B) { benchScale(b, srcYCbCrLarge, 200, 150, CatmullRom) }
+
+func BenchmarkScaleDownNN(b *testing.B) { benchScale(b, srcTux, 120, 80, NearestNeighbor) }
+func BenchmarkScaleDownAB(b *testing.B) { benchScale(b, srcTux, 120, 80, ApproxBiLinear) }
+func BenchmarkScaleDownBL(b *testing.B) { benchScale(b, srcTux, 120, 80, BiLinear) }
+func BenchmarkScaleDownCR(b *testing.B) { benchScale(b, srcTux, 120, 80, CatmullRom) }
+
+func BenchmarkScaleUpNN(b *testing.B) { benchScale(b, srcTux, 800, 600, NearestNeighbor) }
+func BenchmarkScaleUpAB(b *testing.B) { benchScale(b, srcTux, 800, 600, ApproxBiLinear) }
+func BenchmarkScaleUpBL(b *testing.B) { benchScale(b, srcTux, 800, 600, BiLinear) }
+func BenchmarkScaleUpCR(b *testing.B) { benchScale(b, srcTux, 800, 600, CatmullRom) }
+
+func BenchmarkScaleSrcNRGBA(b *testing.B)   { benchScale(b, srcNRGBA, 200, 150, ApproxBiLinear) }
+func BenchmarkScaleSrcRGBA(b *testing.B)    { benchScale(b, srcRGBA, 200, 150, ApproxBiLinear) }
+func BenchmarkScaleSrcUniform(b *testing.B) { benchScale(b, srcUniform, 200, 150, ApproxBiLinear) }
+func BenchmarkScaleSrcYCbCr(b *testing.B)   { benchScale(b, srcYCbCr, 200, 150, ApproxBiLinear) }
