vector: add a fast path for RGBA dst images.

name                      old time/op  new time/op  delta
GlyphRGBA16Over-8         25.1µs ± 2%   6.0µs ± 0%  -76.09%  (p=0.000 n=10+10)
GlyphRGBA16Src-8          17.3µs ± 2%   5.4µs ± 0%  -68.68%  (p=0.000 n=10+10)
GlyphRGBA32Over-8         93.2µs ± 2%  14.5µs ± 0%  -84.41%  (p=0.000 n=10+10)
GlyphRGBA32Src-8          59.3µs ± 2%  12.4µs ± 0%  -79.04%    (p=0.000 n=9+9)
GlyphRGBA64Over-8          350µs ± 2%    45µs ± 0%  -87.21%    (p=0.000 n=9+9)
GlyphRGBA64Src-8           223µs ± 2%    37µs ± 0%  -83.58%    (p=0.000 n=9+9)
GlyphRGBA128Over-8        1.37ms ± 2%  0.16ms ± 1%  -88.19%   (p=0.000 n=9+10)
GlyphRGBA128Src-8          868µs ± 2%   128µs ± 0%  -85.21%   (p=0.000 n=10+9)
GlyphRGBA256Over-8        5.50ms ± 3%  0.59ms ± 0%  -89.31%  (p=0.000 n=10+10)
GlyphRGBA256Src-8         3.47ms ± 2%  0.45ms ± 0%  -86.91%   (p=0.000 n=9+10)

Change-Id: I3df60e6b7147872367715361c9d1ed52951b22e0
Reviewed-on: https://go-review.googlesource.com/29699
Reviewed-by: David Crawshaw <crawshaw@golang.org>
diff --git a/vector/vector.go b/vector/vector.go
index e11c721..2374995 100644
--- a/vector/vector.go
+++ b/vector/vector.go
@@ -228,7 +228,7 @@
 	// r.Add(sp.Sub(r.Min)).
 
 	if src, ok := src.(*image.Uniform); ok {
-		_, _, _, srcA := src.RGBA()
+		srcR, srcG, srcB, srcA := src.RGBA()
 		switch dst := dst.(type) {
 		case *image.Alpha:
 			// Fast path for glyph rendering.
@@ -240,6 +240,13 @@
 				}
 				return
 			}
+		case *image.RGBA:
+			if z.DrawOp == draw.Over {
+				z.rasterizeDstRGBASrcUniformOpOver(dst, r, srcR, srcG, srcB, srcA)
+			} else {
+				z.rasterizeDstRGBASrcUniformOpSrc(dst, r, srcR, srcG, srcB, srcA)
+			}
+			return
 		}
 	}
 
@@ -309,6 +316,43 @@
 	}
 }
 
+func (z *Rasterizer) rasterizeDstRGBASrcUniformOpOver(dst *image.RGBA, r image.Rectangle, sr, sg, sb, sa uint32) {
+	z.accumulateMask()
+	pix := dst.Pix[dst.PixOffset(r.Min.X, r.Min.Y):] // pix[0] is the first byte of the pixel at r.Min.
+	for y, y1 := 0, r.Max.Y-r.Min.Y; y < y1; y++ {
+		for x, x1 := 0, r.Max.X-r.Min.X; x < x1; x++ {
+			ma := z.bufU32[y*z.size.X+x] // Mask value for (x, y), indexed row-major with stride z.size.X.
+
+			// Like rasterizeOpOver's formula, specialized for an *image.RGBA dst and
+			// a uniform src: a scales dst; *0x101 and >>8 convert 8 <-> 16 bits.
+			a := 0xffff - (sa * ma / 0xffff)
+			i := y*dst.Stride + 4*x // 4 bytes per pixel, written below as R, G, B, A.
+			pix[i+0] = uint8(((uint32(pix[i+0])*0x101*a + sr*ma) / 0xffff) >> 8)
+			pix[i+1] = uint8(((uint32(pix[i+1])*0x101*a + sg*ma) / 0xffff) >> 8)
+			pix[i+2] = uint8(((uint32(pix[i+2])*0x101*a + sb*ma) / 0xffff) >> 8)
+			pix[i+3] = uint8(((uint32(pix[i+3])*0x101*a + sa*ma) / 0xffff) >> 8)
+		}
+	}
+}
+
+func (z *Rasterizer) rasterizeDstRGBASrcUniformOpSrc(dst *image.RGBA, r image.Rectangle, sr, sg, sb, sa uint32) {
+	z.accumulateMask()
+	pix := dst.Pix[dst.PixOffset(r.Min.X, r.Min.Y):] // pix[0] is the first byte of the pixel at r.Min.
+	for y, y1 := 0, r.Max.Y-r.Min.Y; y < y1; y++ {
+		for x, x1 := 0, r.Max.X-r.Min.X; x < x1; x++ {
+			ma := z.bufU32[y*z.size.X+x] // Mask value for (x, y), indexed row-major with stride z.size.X.
+
+			// Like rasterizeOpSrc's formula, specialized for an *image.RGBA dst and
+			// a uniform src: dst is overwritten by src scaled by ma, then >>8 to 8 bits.
+			i := y*dst.Stride + 4*x // 4 bytes per pixel, written below as R, G, B, A.
+			pix[i+0] = uint8((sr * ma / 0xffff) >> 8)
+			pix[i+1] = uint8((sg * ma / 0xffff) >> 8)
+			pix[i+2] = uint8((sb * ma / 0xffff) >> 8)
+			pix[i+3] = uint8((sa * ma / 0xffff) >> 8)
+		}
+	}
+}
+
 func (z *Rasterizer) rasterizeOpOver(dst draw.Image, r image.Rectangle, src image.Image, sp image.Point) {
 	z.accumulateMask()
 	out := color.RGBA64{}