draw: RGBA src fast path for scaling.

benchmark                     old ns/op      new ns/op      delta
BenchmarkScaleSrcRGBA         15124800       2091946        -86.17%

Change-Id: Id8d3088793ebf1d75b929fcf6945987817e87463
Reviewed-on: https://go-review.googlesource.com/6234
Reviewed-by: Rob Pike <r@golang.org>
diff --git a/draw/gen.go b/draw/gen.go
index 7b7bfc3..8ffea52 100644
--- a/draw/gen.go
+++ b/draw/gen.go
@@ -284,10 +284,26 @@
 		switch d.sType {
 		default:
 			log.Fatalf("bad sType %q", d.sType)
-		case "image.Image", "*image.NRGBA", "*image.RGBA", "*image.Uniform", "*image.YCbCr": // TODO: separate code for concrete types.
+		case "image.Image", "*image.NRGBA", "*image.Uniform", "*image.YCbCr": // TODO: separate code for concrete types.
 			fmt.Fprintf(buf, "%sr%s, %sg%s, %sb%s, %sa%s := "+
 				"src.At(sp.X + int(%s), sp.Y+int(%s)).RGBA()\n",
-				lhs, tmp, lhs, tmp, lhs, tmp, lhs, tmp, args[0], args[1])
+				lhs, tmp, lhs, tmp, lhs, tmp, lhs, tmp,
+				args[0], args[1],
+			)
+		case "*image.RGBA":
+			// TODO: there's no need to multiply by 0x101 if the next thing
+			// we're going to do is shift right by 8.
+			fmt.Fprintf(buf, "%si := src.PixOffset(sp.X + int(%s), sp.Y+int(%s))\n"+
+				"%sr%s := uint32(src.Pix[%si+0]) * 0x101\n"+
+				"%sg%s := uint32(src.Pix[%si+1]) * 0x101\n"+
+				"%sb%s := uint32(src.Pix[%si+2]) * 0x101\n"+
+				"%sa%s := uint32(src.Pix[%si+3]) * 0x101\n",
+				lhs, args[0], args[1],
+				lhs, tmp, lhs,
+				lhs, tmp, lhs,
+				lhs, tmp, lhs,
+				lhs, tmp, lhs,
+			)
 		}
 
 		if dollar == "srcf" {
@@ -399,7 +415,14 @@
 			if dr.Empty() {
 				return
 			}
-			$switch z.scale_$dTypeRN_$sTypeRN(dst, dp, dr, src, sp)
+			// sr is the source pixels. If it extends beyond the src bounds,
+			// we cannot use the type-specific fast paths, as they access
+			// the Pix fields directly without bounds checking.
+			if sr := (image.Rectangle{sp, sp.Add(image.Point{int(z.sw), int(z.sh)})}); !sr.In(src.Bounds()) {
+				z.scale_Image_Image(dst, dp, dr, src, sp)
+			} else {
+				$switch z.scale_$dTypeRN_$sTypeRN(dst, dp, dr, src, sp)
+			}
 		}
 	`
 
@@ -478,7 +501,16 @@
 			// scaleY distributes the temporary image's rows over the destination image.
 			// TODO: is it worth having a sync.Pool for this temporary buffer?
 			tmp := make([][4]float64, z.dw*z.sh)
-			$switchS z.scaleX_$sTypeRN(tmp, src, sp)
+
+			// sr is the source pixels. If it extends beyond the src bounds,
+			// we cannot use the type-specific fast paths, as they access
+			// the Pix fields directly without bounds checking.
+			if sr := (image.Rectangle{sp, sp.Add(image.Point{int(z.sw), int(z.sh)})}); !sr.In(src.Bounds()) {
+				z.scaleX_Image(tmp, src, sp)
+			} else {
+				$switchS z.scaleX_$sTypeRN(tmp, src, sp)
+			}
+
 			$switchD z.scaleY_$dTypeRN(dst, dp, dr, tmp)
 		}
 	`
diff --git a/draw/impl.go b/draw/impl.go
index c284c4c..3c07d31 100644
--- a/draw/impl.go
+++ b/draw/impl.go
@@ -16,24 +16,31 @@
 	if dr.Empty() {
 		return
 	}
-	switch dst := dst.(type) {
-	case *image.RGBA:
-		switch src := src.(type) {
-		case *image.NRGBA:
-			z.scale_RGBA_NRGBA(dst, dp, dr, src, sp)
+	// sr is the source pixels. If it extends beyond the src bounds,
+	// we cannot use the type-specific fast paths, as they access
+	// the Pix fields directly without bounds checking.
+	if sr := (image.Rectangle{sp, sp.Add(image.Point{int(z.sw), int(z.sh)})}); !sr.In(src.Bounds()) {
+		z.scale_Image_Image(dst, dp, dr, src, sp)
+	} else {
+		switch dst := dst.(type) {
 		case *image.RGBA:
-			z.scale_RGBA_RGBA(dst, dp, dr, src, sp)
-		case *image.Uniform:
-			z.scale_RGBA_Uniform(dst, dp, dr, src, sp)
-		case *image.YCbCr:
-			z.scale_RGBA_YCbCr(dst, dp, dr, src, sp)
+			switch src := src.(type) {
+			case *image.NRGBA:
+				z.scale_RGBA_NRGBA(dst, dp, dr, src, sp)
+			case *image.RGBA:
+				z.scale_RGBA_RGBA(dst, dp, dr, src, sp)
+			case *image.Uniform:
+				z.scale_RGBA_Uniform(dst, dp, dr, src, sp)
+			case *image.YCbCr:
+				z.scale_RGBA_YCbCr(dst, dp, dr, src, sp)
+			default:
+				z.scale_RGBA_Image(dst, dp, dr, src, sp)
+			}
 		default:
-			z.scale_RGBA_Image(dst, dp, dr, src, sp)
-		}
-	default:
-		switch src := src.(type) {
-		default:
-			z.scale_Image_Image(dst, dp, dr, src, sp)
+			switch src := src.(type) {
+			default:
+				z.scale_Image_Image(dst, dp, dr, src, sp)
+			}
 		}
 	}
 }
@@ -60,7 +67,11 @@
 		d := dst.PixOffset(dp.X+dr.Min.X, dp.Y+int(dy))
 		for dx := int32(dr.Min.X); dx < int32(dr.Max.X); dx++ {
 			sx := (2*uint64(dx) + 1) * uint64(z.sw) / (2 * uint64(z.dw))
-			pr, pg, pb, pa := src.At(sp.X+int(sx), sp.Y+int(sy)).RGBA()
+			pi := src.PixOffset(sp.X+int(sx), sp.Y+int(sy))
+			pr := uint32(src.Pix[pi+0]) * 0x101
+			pg := uint32(src.Pix[pi+1]) * 0x101
+			pb := uint32(src.Pix[pi+2]) * 0x101
+			pa := uint32(src.Pix[pi+3]) * 0x101
 			dst.Pix[d+0] = uint8(uint32(pr) >> 8)
 			dst.Pix[d+1] = uint8(uint32(pg) >> 8)
 			dst.Pix[d+2] = uint8(uint32(pb) >> 8)
@@ -144,24 +155,31 @@
 	if dr.Empty() {
 		return
 	}
-	switch dst := dst.(type) {
-	case *image.RGBA:
-		switch src := src.(type) {
-		case *image.NRGBA:
-			z.scale_RGBA_NRGBA(dst, dp, dr, src, sp)
+	// sr is the source pixels. If it extends beyond the src bounds,
+	// we cannot use the type-specific fast paths, as they access
+	// the Pix fields directly without bounds checking.
+	if sr := (image.Rectangle{sp, sp.Add(image.Point{int(z.sw), int(z.sh)})}); !sr.In(src.Bounds()) {
+		z.scale_Image_Image(dst, dp, dr, src, sp)
+	} else {
+		switch dst := dst.(type) {
 		case *image.RGBA:
-			z.scale_RGBA_RGBA(dst, dp, dr, src, sp)
-		case *image.Uniform:
-			z.scale_RGBA_Uniform(dst, dp, dr, src, sp)
-		case *image.YCbCr:
-			z.scale_RGBA_YCbCr(dst, dp, dr, src, sp)
+			switch src := src.(type) {
+			case *image.NRGBA:
+				z.scale_RGBA_NRGBA(dst, dp, dr, src, sp)
+			case *image.RGBA:
+				z.scale_RGBA_RGBA(dst, dp, dr, src, sp)
+			case *image.Uniform:
+				z.scale_RGBA_Uniform(dst, dp, dr, src, sp)
+			case *image.YCbCr:
+				z.scale_RGBA_YCbCr(dst, dp, dr, src, sp)
+			default:
+				z.scale_RGBA_Image(dst, dp, dr, src, sp)
+			}
 		default:
-			z.scale_RGBA_Image(dst, dp, dr, src, sp)
-		}
-	default:
-		switch src := src.(type) {
-		default:
-			z.scale_Image_Image(dst, dp, dr, src, sp)
+			switch src := src.(type) {
+			default:
+				z.scale_Image_Image(dst, dp, dr, src, sp)
+			}
 		}
 	}
 }
@@ -267,12 +285,20 @@
 				sx1 = sx0
 				xFrac0, xFrac1 = 1, 0
 			}
-			s00ru, s00gu, s00bu, s00au := src.At(sp.X+int(sx0), sp.Y+int(sy0)).RGBA()
+			s00i := src.PixOffset(sp.X+int(sx0), sp.Y+int(sy0))
+			s00ru := uint32(src.Pix[s00i+0]) * 0x101
+			s00gu := uint32(src.Pix[s00i+1]) * 0x101
+			s00bu := uint32(src.Pix[s00i+2]) * 0x101
+			s00au := uint32(src.Pix[s00i+3]) * 0x101
 			s00r := float64(s00ru)
 			s00g := float64(s00gu)
 			s00b := float64(s00bu)
 			s00a := float64(s00au)
-			s10ru, s10gu, s10bu, s10au := src.At(sp.X+int(sx1), sp.Y+int(sy0)).RGBA()
+			s10i := src.PixOffset(sp.X+int(sx1), sp.Y+int(sy0))
+			s10ru := uint32(src.Pix[s10i+0]) * 0x101
+			s10gu := uint32(src.Pix[s10i+1]) * 0x101
+			s10bu := uint32(src.Pix[s10i+2]) * 0x101
+			s10au := uint32(src.Pix[s10i+3]) * 0x101
 			s10r := float64(s10ru)
 			s10g := float64(s10gu)
 			s10b := float64(s10bu)
@@ -281,12 +307,20 @@
 			s10g = xFrac1*s00g + xFrac0*s10g
 			s10b = xFrac1*s00b + xFrac0*s10b
 			s10a = xFrac1*s00a + xFrac0*s10a
-			s01ru, s01gu, s01bu, s01au := src.At(sp.X+int(sx0), sp.Y+int(sy1)).RGBA()
+			s01i := src.PixOffset(sp.X+int(sx0), sp.Y+int(sy1))
+			s01ru := uint32(src.Pix[s01i+0]) * 0x101
+			s01gu := uint32(src.Pix[s01i+1]) * 0x101
+			s01bu := uint32(src.Pix[s01i+2]) * 0x101
+			s01au := uint32(src.Pix[s01i+3]) * 0x101
 			s01r := float64(s01ru)
 			s01g := float64(s01gu)
 			s01b := float64(s01bu)
 			s01a := float64(s01au)
-			s11ru, s11gu, s11bu, s11au := src.At(sp.X+int(sx1), sp.Y+int(sy1)).RGBA()
+			s11i := src.PixOffset(sp.X+int(sx1), sp.Y+int(sy1))
+			s11ru := uint32(src.Pix[s11i+0]) * 0x101
+			s11gu := uint32(src.Pix[s11i+1]) * 0x101
+			s11bu := uint32(src.Pix[s11i+2]) * 0x101
+			s11au := uint32(src.Pix[s11i+3]) * 0x101
 			s11r := float64(s11ru)
 			s11g := float64(s11gu)
 			s11b := float64(s11bu)
@@ -607,18 +641,27 @@
 	// scaleY distributes the temporary image's rows over the destination image.
 	// TODO: is it worth having a sync.Pool for this temporary buffer?
 	tmp := make([][4]float64, z.dw*z.sh)
-	switch src := src.(type) {
-	case *image.NRGBA:
-		z.scaleX_NRGBA(tmp, src, sp)
-	case *image.RGBA:
-		z.scaleX_RGBA(tmp, src, sp)
-	case *image.Uniform:
-		z.scaleX_Uniform(tmp, src, sp)
-	case *image.YCbCr:
-		z.scaleX_YCbCr(tmp, src, sp)
-	default:
+
+	// sr is the source pixels. If it extends beyond the src bounds,
+	// we cannot use the type-specific fast paths, as they access
+	// the Pix fields directly without bounds checking.
+	if sr := (image.Rectangle{sp, sp.Add(image.Point{int(z.sw), int(z.sh)})}); !sr.In(src.Bounds()) {
 		z.scaleX_Image(tmp, src, sp)
+	} else {
+		switch src := src.(type) {
+		case *image.NRGBA:
+			z.scaleX_NRGBA(tmp, src, sp)
+		case *image.RGBA:
+			z.scaleX_RGBA(tmp, src, sp)
+		case *image.Uniform:
+			z.scaleX_Uniform(tmp, src, sp)
+		case *image.YCbCr:
+			z.scaleX_YCbCr(tmp, src, sp)
+		default:
+			z.scaleX_Image(tmp, src, sp)
+		}
 	}
+
 	switch dst := dst.(type) {
 	case *image.RGBA:
 		z.scaleY_RGBA(dst, dp, dr, tmp)
@@ -656,7 +699,11 @@
 		for _, s := range z.horizontal.sources {
 			var pr, pg, pb, pa float64
 			for _, c := range z.horizontal.contribs[s.i:s.j] {
-				pru, pgu, pbu, pau := src.At(sp.X+int(c.coord), sp.Y+int(y)).RGBA()
+				pi := src.PixOffset(sp.X+int(c.coord), sp.Y+int(y))
+				pru := uint32(src.Pix[pi+0]) * 0x101
+				pgu := uint32(src.Pix[pi+1]) * 0x101
+				pbu := uint32(src.Pix[pi+2]) * 0x101
+				pau := uint32(src.Pix[pi+3]) * 0x101
 				pr += float64(pru) * c.weight
 				pg += float64(pgu) * c.weight
 				pb += float64(pbu) * c.weight