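draw: RGBA dst fast path for scaling.

When the destination is an *image.RGBA, the generated scalers now compute
a Pix offset with dst.PixOffset once per row (once per column in the
kernel scaler's scaleY pass) and store the four 8-bit channels directly
into dst.Pix, instead of filling a color.RGBA64 and calling dst.Set for
every pixel. The generator keeps the color.RGBA64-based code for the
generic Image destination type.
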
benchmark                     old ns/op      new ns/op      delta
BenchmarkScaleLargeDownNN     6124873        3348203        -45.33%
BenchmarkScaleLargeDownAB     15608417       12626534       -19.10%
BenchmarkScaleLargeDownBL     1503354937     1482605150     -1.38%
BenchmarkScaleLargeDownCR     2987623786     2937846270     -1.67%
BenchmarkScaleDownNN          1793478        935896         -47.82%
BenchmarkScaleDownAB          4277596        3405613        -20.38%
BenchmarkScaleDownBL          29932226       29268085       -2.22%
BenchmarkScaleDownCR          57563042       57322266       -0.42%
BenchmarkScaleUpNN            89694138       46216098       -48.47%
BenchmarkScaleUpAB            212318283      169267373      -20.28%
BenchmarkScaleUpBL            120899444      80215032       -33.65%
BenchmarkScaleUpCR            181116518      140140247      -22.62%
BenchmarkScaleSrcNRGBA        13229017       10620746       -19.72%
BenchmarkScaleSrcRGBA         12993292       10155919       -21.84%
BenchmarkScaleSrcUniform      3964808        1146947        -71.07%
BenchmarkScaleSrcYCbCr        15871184       12779895       -19.48%

Change-Id: I7d92bd9f4c20692c5a52ea31019fe3852e657535
Reviewed-on: https://go-review.googlesource.com/6230
Reviewed-by: Rob Pike <r@golang.org>
diff --git a/draw/gen.go b/draw/gen.go
index 387b879..7b7bfc3 100644
--- a/draw/gen.go
+++ b/draw/gen.go
@@ -111,7 +111,11 @@
 
 func expn(w *bytes.Buffer, code string, d *data) {
 	for _, line := range strings.Split(code, "\n") {
-		fmt.Fprintln(w, expnLine(line, d))
+		line = expnLine(line, d)
+		if line == ";" {
+			continue
+		}
+		fmt.Fprintln(w, line)
 	}
 }
 
@@ -161,12 +165,31 @@
 	case "switchS":
 		return expnSwitch("anyDType", false, suffix)
 
-	case "dstColorDecl":
-		if d.dType == "Image" || d.dType == "*image.RGBA" { // TODO: separate code for concrete types.
-			return "dstColorRGBA64 := &color.RGBA64{}\n" +
+	case "preOuter":
+		switch d.dType {
+		default:
+			return ";"
+		case "Image":
+			return "" +
+				"dstColorRGBA64 := &color.RGBA64{}\n" +
 				"dstColor := color.Color(dstColorRGBA64)"
 		}
-		return ";"
+
+	case "preInner":
+		switch d.dType {
+		default:
+			return ";"
+		case "*image.RGBA":
+			return "d := dst.PixOffset(dp.X+dr.Min.X, dp.Y+int(dy))"
+		}
+
+	case "preKernelInner":
+		switch d.dType {
+		default:
+			return ";"
+		case "*image.RGBA":
+			return "d := dst.PixOffset(dp.X+int(dx), dp.Y+dr.Min.Y)"
+		}
 
 	case "blend":
 		args, _ := splitArgs(suffix)
@@ -192,7 +215,7 @@
 		switch d.dType {
 		default:
 			log.Fatalf("bad dType %q", d.dType)
-		case "Image", "*image.RGBA": // TODO: separate code for concrete types.
+		case "Image":
 			return fmt.Sprintf(""+
 				"dstColorRGBA64.R = uint16(%sr)\n"+
 				"dstColorRGBA64.G = uint16(%sg)\n"+
@@ -202,6 +225,15 @@
 				args[2], args[2], args[2], args[2],
 				args[0], args[1],
 			)
+		case "*image.RGBA":
+			return fmt.Sprintf(""+
+				"dst.Pix[d+0] = uint8(uint32(%sr) >> 8)\n"+
+				"dst.Pix[d+1] = uint8(uint32(%sg) >> 8)\n"+
+				"dst.Pix[d+2] = uint8(uint32(%sb) >> 8)\n"+
+				"dst.Pix[d+3] = uint8(uint32(%sa) >> 8)\n"+
+				"d += 4",
+				args[2], args[2], args[2], args[2],
+			)
 		}
 
 	case "outputf":
@@ -212,7 +244,7 @@
 		switch d.dType {
 		default:
 			log.Fatalf("bad dType %q", d.dType)
-		case "Image", "*image.RGBA": // TODO: separate code for concrete types.
+		case "Image":
 			return fmt.Sprintf(""+
 				"dstColorRGBA64.R = ftou(%sr * %s)\n"+
 				"dstColorRGBA64.G = ftou(%sg * %s)\n"+
@@ -222,6 +254,15 @@
 				args[2], args[3], args[2], args[3], args[2], args[3], args[2], args[3],
 				args[0], args[1],
 			)
+		case "*image.RGBA":
+			return fmt.Sprintf(""+
+				"dst.Pix[d+0] = uint8(ftou(%sr * %s) >> 8)\n"+
+				"dst.Pix[d+1] = uint8(ftou(%sg * %s) >> 8)\n"+
+				"dst.Pix[d+2] = uint8(ftou(%sb * %s) >> 8)\n"+
+				"dst.Pix[d+3] = uint8(ftou(%sa * %s) >> 8)\n"+
+				"d += dst.Stride",
+				args[2], args[3], args[2], args[3], args[2], args[3], args[2], args[3],
+			)
 		}
 
 	case "srcf", "srcu":
@@ -263,6 +304,12 @@
 		}
 
 		return strings.TrimSpace(buf.String())
+
+	case "tweakDy":
+		if d.dType == "*image.RGBA" {
+			return strings.Replace(suffix, "for dy, s", "for _, s", 1)
+		}
+		return suffix
 	}
 	return ""
 }
@@ -358,9 +405,10 @@
 
 	codeNNLeaf = `
 		func (z *nnScaler) scale_$dTypeRN_$sTypeRN(dst $dType, dp image.Point, dr image.Rectangle, src $sType, sp image.Point) {
-			$dstColorDecl
+			$preOuter
 			for dy := int32(dr.Min.Y); dy < int32(dr.Max.Y); dy++ {
 				sy := (2*uint64(dy) + 1) * uint64(z.sh) / (2 * uint64(z.dh))
+				$preInner
 				for dx := int32(dr.Min.X); dx < int32(dr.Max.X); dx++ {
 					sx := (2*uint64(dx) + 1) * uint64(z.sw) / (2 * uint64(z.dw))
 					p := $srcu[sx, sy]
@@ -374,7 +422,7 @@
 		func (z *ablScaler) scale_$dTypeRN_$sTypeRN(dst $dType, dp image.Point, dr image.Rectangle, src $sType, sp image.Point) {
 			yscale := float64(z.sh) / float64(z.dh)
 			xscale := float64(z.sw) / float64(z.dw)
-			$dstColorDecl
+			$preOuter
 			for dy := int32(dr.Min.Y); dy < int32(dr.Max.Y); dy++ {
 				sy := (float64(dy)+0.5)*yscale - 0.5
 				sy0 := int32(sy)
@@ -388,6 +436,7 @@
 					sy1 = sy0
 					yFrac0, yFrac1 = 1, 0
 				}
+				$preInner
 				for dx := int32(dr.Min.X); dx < int32(dr.Max.X); dx++ {
 					sx := (float64(dx)+0.5)*xscale - 0.5
 					sx0 := int32(sx)
@@ -457,9 +506,10 @@
 
 	codeKernelLeafY = `
 		func (z *kernelScaler) scaleY_$dTypeRN(dst $dType, dp image.Point, dr image.Rectangle, tmp [][4]float64) {
-			$dstColorDecl
+			$preOuter
 			for dx := int32(dr.Min.X); dx < int32(dr.Max.X); dx++ {
-				for dy, s := range z.vertical.sources[dr.Min.Y:dr.Max.Y] {
+				$preKernelInner
+				$tweakDy for dy, s := range z.vertical.sources[dr.Min.Y:dr.Max.Y] {
 					var pr, pg, pb, pa float64
 					for _, c := range z.vertical.contribs[s.i:s.j] {
 						p := &tmp[c.coord*z.dw+dx]
diff --git a/draw/impl.go b/draw/impl.go
index 0cfee91..c284c4c 100644
--- a/draw/impl.go
+++ b/draw/impl.go
@@ -39,86 +39,81 @@
 }
 
 func (z *nnScaler) scale_RGBA_NRGBA(dst *image.RGBA, dp image.Point, dr image.Rectangle, src *image.NRGBA, sp image.Point) {
-	dstColorRGBA64 := &color.RGBA64{}
-	dstColor := color.Color(dstColorRGBA64)
 	for dy := int32(dr.Min.Y); dy < int32(dr.Max.Y); dy++ {
 		sy := (2*uint64(dy) + 1) * uint64(z.sh) / (2 * uint64(z.dh))
+		d := dst.PixOffset(dp.X+dr.Min.X, dp.Y+int(dy))
 		for dx := int32(dr.Min.X); dx < int32(dr.Max.X); dx++ {
 			sx := (2*uint64(dx) + 1) * uint64(z.sw) / (2 * uint64(z.dw))
 			pr, pg, pb, pa := src.At(sp.X+int(sx), sp.Y+int(sy)).RGBA()
-			dstColorRGBA64.R = uint16(pr)
-			dstColorRGBA64.G = uint16(pg)
-			dstColorRGBA64.B = uint16(pb)
-			dstColorRGBA64.A = uint16(pa)
-			dst.Set(dp.X+int(dx), dp.Y+int(dy), dstColor)
+			dst.Pix[d+0] = uint8(uint32(pr) >> 8)
+			dst.Pix[d+1] = uint8(uint32(pg) >> 8)
+			dst.Pix[d+2] = uint8(uint32(pb) >> 8)
+			dst.Pix[d+3] = uint8(uint32(pa) >> 8)
+			d += 4
 		}
 	}
 }
 
 func (z *nnScaler) scale_RGBA_RGBA(dst *image.RGBA, dp image.Point, dr image.Rectangle, src *image.RGBA, sp image.Point) {
-	dstColorRGBA64 := &color.RGBA64{}
-	dstColor := color.Color(dstColorRGBA64)
 	for dy := int32(dr.Min.Y); dy < int32(dr.Max.Y); dy++ {
 		sy := (2*uint64(dy) + 1) * uint64(z.sh) / (2 * uint64(z.dh))
+		d := dst.PixOffset(dp.X+dr.Min.X, dp.Y+int(dy))
 		for dx := int32(dr.Min.X); dx < int32(dr.Max.X); dx++ {
 			sx := (2*uint64(dx) + 1) * uint64(z.sw) / (2 * uint64(z.dw))
 			pr, pg, pb, pa := src.At(sp.X+int(sx), sp.Y+int(sy)).RGBA()
-			dstColorRGBA64.R = uint16(pr)
-			dstColorRGBA64.G = uint16(pg)
-			dstColorRGBA64.B = uint16(pb)
-			dstColorRGBA64.A = uint16(pa)
-			dst.Set(dp.X+int(dx), dp.Y+int(dy), dstColor)
+			dst.Pix[d+0] = uint8(uint32(pr) >> 8)
+			dst.Pix[d+1] = uint8(uint32(pg) >> 8)
+			dst.Pix[d+2] = uint8(uint32(pb) >> 8)
+			dst.Pix[d+3] = uint8(uint32(pa) >> 8)
+			d += 4
 		}
 	}
 }
 
 func (z *nnScaler) scale_RGBA_Uniform(dst *image.RGBA, dp image.Point, dr image.Rectangle, src *image.Uniform, sp image.Point) {
-	dstColorRGBA64 := &color.RGBA64{}
-	dstColor := color.Color(dstColorRGBA64)
 	for dy := int32(dr.Min.Y); dy < int32(dr.Max.Y); dy++ {
 		sy := (2*uint64(dy) + 1) * uint64(z.sh) / (2 * uint64(z.dh))
+		d := dst.PixOffset(dp.X+dr.Min.X, dp.Y+int(dy))
 		for dx := int32(dr.Min.X); dx < int32(dr.Max.X); dx++ {
 			sx := (2*uint64(dx) + 1) * uint64(z.sw) / (2 * uint64(z.dw))
 			pr, pg, pb, pa := src.At(sp.X+int(sx), sp.Y+int(sy)).RGBA()
-			dstColorRGBA64.R = uint16(pr)
-			dstColorRGBA64.G = uint16(pg)
-			dstColorRGBA64.B = uint16(pb)
-			dstColorRGBA64.A = uint16(pa)
-			dst.Set(dp.X+int(dx), dp.Y+int(dy), dstColor)
+			dst.Pix[d+0] = uint8(uint32(pr) >> 8)
+			dst.Pix[d+1] = uint8(uint32(pg) >> 8)
+			dst.Pix[d+2] = uint8(uint32(pb) >> 8)
+			dst.Pix[d+3] = uint8(uint32(pa) >> 8)
+			d += 4
 		}
 	}
 }
 
 func (z *nnScaler) scale_RGBA_YCbCr(dst *image.RGBA, dp image.Point, dr image.Rectangle, src *image.YCbCr, sp image.Point) {
-	dstColorRGBA64 := &color.RGBA64{}
-	dstColor := color.Color(dstColorRGBA64)
 	for dy := int32(dr.Min.Y); dy < int32(dr.Max.Y); dy++ {
 		sy := (2*uint64(dy) + 1) * uint64(z.sh) / (2 * uint64(z.dh))
+		d := dst.PixOffset(dp.X+dr.Min.X, dp.Y+int(dy))
 		for dx := int32(dr.Min.X); dx < int32(dr.Max.X); dx++ {
 			sx := (2*uint64(dx) + 1) * uint64(z.sw) / (2 * uint64(z.dw))
 			pr, pg, pb, pa := src.At(sp.X+int(sx), sp.Y+int(sy)).RGBA()
-			dstColorRGBA64.R = uint16(pr)
-			dstColorRGBA64.G = uint16(pg)
-			dstColorRGBA64.B = uint16(pb)
-			dstColorRGBA64.A = uint16(pa)
-			dst.Set(dp.X+int(dx), dp.Y+int(dy), dstColor)
+			dst.Pix[d+0] = uint8(uint32(pr) >> 8)
+			dst.Pix[d+1] = uint8(uint32(pg) >> 8)
+			dst.Pix[d+2] = uint8(uint32(pb) >> 8)
+			dst.Pix[d+3] = uint8(uint32(pa) >> 8)
+			d += 4
 		}
 	}
 }
 
 func (z *nnScaler) scale_RGBA_Image(dst *image.RGBA, dp image.Point, dr image.Rectangle, src image.Image, sp image.Point) {
-	dstColorRGBA64 := &color.RGBA64{}
-	dstColor := color.Color(dstColorRGBA64)
 	for dy := int32(dr.Min.Y); dy < int32(dr.Max.Y); dy++ {
 		sy := (2*uint64(dy) + 1) * uint64(z.sh) / (2 * uint64(z.dh))
+		d := dst.PixOffset(dp.X+dr.Min.X, dp.Y+int(dy))
 		for dx := int32(dr.Min.X); dx < int32(dr.Max.X); dx++ {
 			sx := (2*uint64(dx) + 1) * uint64(z.sw) / (2 * uint64(z.dw))
 			pr, pg, pb, pa := src.At(sp.X+int(sx), sp.Y+int(sy)).RGBA()
-			dstColorRGBA64.R = uint16(pr)
-			dstColorRGBA64.G = uint16(pg)
-			dstColorRGBA64.B = uint16(pb)
-			dstColorRGBA64.A = uint16(pa)
-			dst.Set(dp.X+int(dx), dp.Y+int(dy), dstColor)
+			dst.Pix[d+0] = uint8(uint32(pr) >> 8)
+			dst.Pix[d+1] = uint8(uint32(pg) >> 8)
+			dst.Pix[d+2] = uint8(uint32(pb) >> 8)
+			dst.Pix[d+3] = uint8(uint32(pa) >> 8)
+			d += 4
 		}
 	}
 }
@@ -174,8 +169,6 @@
 func (z *ablScaler) scale_RGBA_NRGBA(dst *image.RGBA, dp image.Point, dr image.Rectangle, src *image.NRGBA, sp image.Point) {
 	yscale := float64(z.sh) / float64(z.dh)
 	xscale := float64(z.sw) / float64(z.dw)
-	dstColorRGBA64 := &color.RGBA64{}
-	dstColor := color.Color(dstColorRGBA64)
 	for dy := int32(dr.Min.Y); dy < int32(dr.Max.Y); dy++ {
 		sy := (float64(dy)+0.5)*yscale - 0.5
 		sy0 := int32(sy)
@@ -189,6 +182,7 @@
 			sy1 = sy0
 			yFrac0, yFrac1 = 1, 0
 		}
+		d := dst.PixOffset(dp.X+dr.Min.X, dp.Y+int(dy))
 		for dx := int32(dr.Min.X); dx < int32(dr.Max.X); dx++ {
 			sx := (float64(dx)+0.5)*xscale - 0.5
 			sx0 := int32(sx)
@@ -234,11 +228,11 @@
 			s11g = yFrac1*s10g + yFrac0*s11g
 			s11b = yFrac1*s10b + yFrac0*s11b
 			s11a = yFrac1*s10a + yFrac0*s11a
-			dstColorRGBA64.R = uint16(s11r)
-			dstColorRGBA64.G = uint16(s11g)
-			dstColorRGBA64.B = uint16(s11b)
-			dstColorRGBA64.A = uint16(s11a)
-			dst.Set(dp.X+int(dx), dp.Y+int(dy), dstColor)
+			dst.Pix[d+0] = uint8(uint32(s11r) >> 8)
+			dst.Pix[d+1] = uint8(uint32(s11g) >> 8)
+			dst.Pix[d+2] = uint8(uint32(s11b) >> 8)
+			dst.Pix[d+3] = uint8(uint32(s11a) >> 8)
+			d += 4
 		}
 	}
 }
@@ -246,8 +240,6 @@
 func (z *ablScaler) scale_RGBA_RGBA(dst *image.RGBA, dp image.Point, dr image.Rectangle, src *image.RGBA, sp image.Point) {
 	yscale := float64(z.sh) / float64(z.dh)
 	xscale := float64(z.sw) / float64(z.dw)
-	dstColorRGBA64 := &color.RGBA64{}
-	dstColor := color.Color(dstColorRGBA64)
 	for dy := int32(dr.Min.Y); dy < int32(dr.Max.Y); dy++ {
 		sy := (float64(dy)+0.5)*yscale - 0.5
 		sy0 := int32(sy)
@@ -261,6 +253,7 @@
 			sy1 = sy0
 			yFrac0, yFrac1 = 1, 0
 		}
+		d := dst.PixOffset(dp.X+dr.Min.X, dp.Y+int(dy))
 		for dx := int32(dr.Min.X); dx < int32(dr.Max.X); dx++ {
 			sx := (float64(dx)+0.5)*xscale - 0.5
 			sx0 := int32(sx)
@@ -306,11 +299,11 @@
 			s11g = yFrac1*s10g + yFrac0*s11g
 			s11b = yFrac1*s10b + yFrac0*s11b
 			s11a = yFrac1*s10a + yFrac0*s11a
-			dstColorRGBA64.R = uint16(s11r)
-			dstColorRGBA64.G = uint16(s11g)
-			dstColorRGBA64.B = uint16(s11b)
-			dstColorRGBA64.A = uint16(s11a)
-			dst.Set(dp.X+int(dx), dp.Y+int(dy), dstColor)
+			dst.Pix[d+0] = uint8(uint32(s11r) >> 8)
+			dst.Pix[d+1] = uint8(uint32(s11g) >> 8)
+			dst.Pix[d+2] = uint8(uint32(s11b) >> 8)
+			dst.Pix[d+3] = uint8(uint32(s11a) >> 8)
+			d += 4
 		}
 	}
 }
@@ -318,8 +311,6 @@
 func (z *ablScaler) scale_RGBA_Uniform(dst *image.RGBA, dp image.Point, dr image.Rectangle, src *image.Uniform, sp image.Point) {
 	yscale := float64(z.sh) / float64(z.dh)
 	xscale := float64(z.sw) / float64(z.dw)
-	dstColorRGBA64 := &color.RGBA64{}
-	dstColor := color.Color(dstColorRGBA64)
 	for dy := int32(dr.Min.Y); dy < int32(dr.Max.Y); dy++ {
 		sy := (float64(dy)+0.5)*yscale - 0.5
 		sy0 := int32(sy)
@@ -333,6 +324,7 @@
 			sy1 = sy0
 			yFrac0, yFrac1 = 1, 0
 		}
+		d := dst.PixOffset(dp.X+dr.Min.X, dp.Y+int(dy))
 		for dx := int32(dr.Min.X); dx < int32(dr.Max.X); dx++ {
 			sx := (float64(dx)+0.5)*xscale - 0.5
 			sx0 := int32(sx)
@@ -378,11 +370,11 @@
 			s11g = yFrac1*s10g + yFrac0*s11g
 			s11b = yFrac1*s10b + yFrac0*s11b
 			s11a = yFrac1*s10a + yFrac0*s11a
-			dstColorRGBA64.R = uint16(s11r)
-			dstColorRGBA64.G = uint16(s11g)
-			dstColorRGBA64.B = uint16(s11b)
-			dstColorRGBA64.A = uint16(s11a)
-			dst.Set(dp.X+int(dx), dp.Y+int(dy), dstColor)
+			dst.Pix[d+0] = uint8(uint32(s11r) >> 8)
+			dst.Pix[d+1] = uint8(uint32(s11g) >> 8)
+			dst.Pix[d+2] = uint8(uint32(s11b) >> 8)
+			dst.Pix[d+3] = uint8(uint32(s11a) >> 8)
+			d += 4
 		}
 	}
 }
@@ -390,8 +382,6 @@
 func (z *ablScaler) scale_RGBA_YCbCr(dst *image.RGBA, dp image.Point, dr image.Rectangle, src *image.YCbCr, sp image.Point) {
 	yscale := float64(z.sh) / float64(z.dh)
 	xscale := float64(z.sw) / float64(z.dw)
-	dstColorRGBA64 := &color.RGBA64{}
-	dstColor := color.Color(dstColorRGBA64)
 	for dy := int32(dr.Min.Y); dy < int32(dr.Max.Y); dy++ {
 		sy := (float64(dy)+0.5)*yscale - 0.5
 		sy0 := int32(sy)
@@ -405,6 +395,7 @@
 			sy1 = sy0
 			yFrac0, yFrac1 = 1, 0
 		}
+		d := dst.PixOffset(dp.X+dr.Min.X, dp.Y+int(dy))
 		for dx := int32(dr.Min.X); dx < int32(dr.Max.X); dx++ {
 			sx := (float64(dx)+0.5)*xscale - 0.5
 			sx0 := int32(sx)
@@ -450,11 +441,11 @@
 			s11g = yFrac1*s10g + yFrac0*s11g
 			s11b = yFrac1*s10b + yFrac0*s11b
 			s11a = yFrac1*s10a + yFrac0*s11a
-			dstColorRGBA64.R = uint16(s11r)
-			dstColorRGBA64.G = uint16(s11g)
-			dstColorRGBA64.B = uint16(s11b)
-			dstColorRGBA64.A = uint16(s11a)
-			dst.Set(dp.X+int(dx), dp.Y+int(dy), dstColor)
+			dst.Pix[d+0] = uint8(uint32(s11r) >> 8)
+			dst.Pix[d+1] = uint8(uint32(s11g) >> 8)
+			dst.Pix[d+2] = uint8(uint32(s11b) >> 8)
+			dst.Pix[d+3] = uint8(uint32(s11a) >> 8)
+			d += 4
 		}
 	}
 }
@@ -462,8 +453,6 @@
 func (z *ablScaler) scale_RGBA_Image(dst *image.RGBA, dp image.Point, dr image.Rectangle, src image.Image, sp image.Point) {
 	yscale := float64(z.sh) / float64(z.dh)
 	xscale := float64(z.sw) / float64(z.dw)
-	dstColorRGBA64 := &color.RGBA64{}
-	dstColor := color.Color(dstColorRGBA64)
 	for dy := int32(dr.Min.Y); dy < int32(dr.Max.Y); dy++ {
 		sy := (float64(dy)+0.5)*yscale - 0.5
 		sy0 := int32(sy)
@@ -477,6 +466,7 @@
 			sy1 = sy0
 			yFrac0, yFrac1 = 1, 0
 		}
+		d := dst.PixOffset(dp.X+dr.Min.X, dp.Y+int(dy))
 		for dx := int32(dr.Min.X); dx < int32(dr.Max.X); dx++ {
 			sx := (float64(dx)+0.5)*xscale - 0.5
 			sx0 := int32(sx)
@@ -522,11 +512,11 @@
 			s11g = yFrac1*s10g + yFrac0*s11g
 			s11b = yFrac1*s10b + yFrac0*s11b
 			s11a = yFrac1*s10a + yFrac0*s11a
-			dstColorRGBA64.R = uint16(s11r)
-			dstColorRGBA64.G = uint16(s11g)
-			dstColorRGBA64.B = uint16(s11b)
-			dstColorRGBA64.A = uint16(s11a)
-			dst.Set(dp.X+int(dx), dp.Y+int(dy), dstColor)
+			dst.Pix[d+0] = uint8(uint32(s11r) >> 8)
+			dst.Pix[d+1] = uint8(uint32(s11g) >> 8)
+			dst.Pix[d+2] = uint8(uint32(s11b) >> 8)
+			dst.Pix[d+3] = uint8(uint32(s11a) >> 8)
+			d += 4
 		}
 	}
 }
@@ -753,10 +743,9 @@
 }
 
 func (z *kernelScaler) scaleY_RGBA(dst *image.RGBA, dp image.Point, dr image.Rectangle, tmp [][4]float64) {
-	dstColorRGBA64 := &color.RGBA64{}
-	dstColor := color.Color(dstColorRGBA64)
 	for dx := int32(dr.Min.X); dx < int32(dr.Max.X); dx++ {
-		for dy, s := range z.vertical.sources[dr.Min.Y:dr.Max.Y] {
+		d := dst.PixOffset(dp.X+int(dx), dp.Y+dr.Min.Y)
+		for _, s := range z.vertical.sources[dr.Min.Y:dr.Max.Y] {
 			var pr, pg, pb, pa float64
 			for _, c := range z.vertical.contribs[s.i:s.j] {
 				p := &tmp[c.coord*z.dw+dx]
@@ -765,11 +754,11 @@
 				pb += p[2] * c.weight
 				pa += p[3] * c.weight
 			}
-			dstColorRGBA64.R = ftou(pr * s.invTotalWeight)
-			dstColorRGBA64.G = ftou(pg * s.invTotalWeight)
-			dstColorRGBA64.B = ftou(pb * s.invTotalWeight)
-			dstColorRGBA64.A = ftou(pa * s.invTotalWeight)
-			dst.Set(dp.X+int(dx), dp.Y+int(dr.Min.Y+dy), dstColor)
+			dst.Pix[d+0] = uint8(ftou(pr*s.invTotalWeight) >> 8)
+			dst.Pix[d+1] = uint8(ftou(pg*s.invTotalWeight) >> 8)
+			dst.Pix[d+2] = uint8(ftou(pb*s.invTotalWeight) >> 8)
+			dst.Pix[d+3] = uint8(ftou(pa*s.invTotalWeight) >> 8)
+			d += dst.Stride
 		}
 	}
 }