draw: implement the Over operator.

Change-Id: Id207b8f2fa5233175285800477e60f111ef4af63
Reviewed-on: https://go-review.googlesource.com/8744
Reviewed-by: Rob Pike <r@golang.org>
diff --git a/draw/example_test.go b/draw/example_test.go
index f8545ad..948be8d 100644
--- a/draw/example_test.go
+++ b/draw/example_test.go
@@ -7,6 +7,7 @@
 import (
 	"fmt"
 	"image"
+	"image/color"
 	"image/png"
 	"log"
 	"os"
@@ -28,6 +29,8 @@
 
 	sr := src.Bounds()
 	dst := image.NewRGBA(image.Rect(0, 0, 400, 300))
+	green := image.NewUniform(color.RGBA{0x00, 0x1f, 0x00, 0xff})
+	draw.Copy(dst, image.Point{}, green, dst.Bounds(), nil)
 	qs := []draw.Interpolator{
 		draw.NearestNeighbor,
 		draw.ApproxBiLinear,
@@ -45,6 +48,33 @@
 	}
 	draw.NearestNeighbor.Transform(dst, t, src, sr, nil)
 
+	red := image.NewNRGBA(image.Rect(0, 0, 16, 16))
+	for y := 0; y < 16; y++ {
+		for x := 0; x < 16; x++ {
+			red.SetNRGBA(x, y, color.NRGBA{
+				R: uint8(x * 0x11),
+				A: uint8(y * 0x11),
+			})
+		}
+	}
+	red.SetNRGBA(0, 0, color.NRGBA{0xff, 0xff, 0x00, 0xff})
+	red.SetNRGBA(15, 15, color.NRGBA{0xff, 0xff, 0x00, 0xff})
+
+	ops := []draw.Op{
+		draw.Over,
+		draw.Src,
+	}
+	for i, op := range ops {
+		q, opts := draw.NearestNeighbor, &draw.Options{Op: op}
+		dr := image.Rect(120+10*i, 150+60*i, 170+10*i, 200+60*i)
+		q.Scale(dst, dr, red, red.Bounds(), opts)
+		t := &f64.Aff3{
+			+cos60, -sin60, float64(190 + 10*i),
+			+sin60, +cos60, float64(140 + 50*i),
+		}
+		q.Transform(dst, t, red, red.Bounds(), opts)
+	}
+
 	// Change false to true to write the resultant image to disk.
 	if false {
 		fDst, err := os.Create("out.png")
diff --git a/draw/gen.go b/draw/gen.go
index 3295c2d..048434c 100644
--- a/draw/gen.go
+++ b/draw/gen.go
@@ -305,161 +305,256 @@
 		`
 
 	case "outputu":
-		// TODO: handle op==Over, not just op==Src.
 		args, _ := splitArgs(suffix)
 		if len(args) != 3 {
 			return ""
 		}
-		switch d.dType {
-		default:
-			log.Fatalf("bad dType %q", d.dType)
-		case "Image":
-			switch d.sType {
+
+		switch d.op {
+		case "Over":
+			switch d.dType {
 			default:
+				log.Fatalf("bad dType %q", d.dType)
+			case "Image":
 				return fmt.Sprintf(""+
-					"dstColorRGBA64.R = uint16(%sr)\n"+
-					"dstColorRGBA64.G = uint16(%sg)\n"+
-					"dstColorRGBA64.B = uint16(%sb)\n"+
-					"dstColorRGBA64.A = uint16(%sa)\n"+
+					"qr, qg, qb, qa := dst.At(%s, %s).RGBA()\n"+
+					"%sa1 := 0xffff - uint32(%sa)\n"+
+					"dstColorRGBA64.R = uint16(qr*%sa1/0xffff + uint32(%sr))\n"+
+					"dstColorRGBA64.G = uint16(qg*%sa1/0xffff + uint32(%sg))\n"+
+					"dstColorRGBA64.B = uint16(qb*%sa1/0xffff + uint32(%sb))\n"+
+					"dstColorRGBA64.A = uint16(qa*%sa1/0xffff + uint32(%sa))\n"+
 					"dst.Set(%s, %s, dstColor)",
-					args[2], args[2], args[2], args[2],
+					args[0], args[1],
+					args[2], args[2],
+					args[2], args[2],
+					args[2], args[2],
+					args[2], args[2],
+					args[2], args[2],
 					args[0], args[1],
 				)
-			case "*image.Gray":
+			case "*image.RGBA":
 				return fmt.Sprintf(""+
-					"out := uint16(%sr)\n"+
-					"dstColorRGBA64.R = out\n"+
-					"dstColorRGBA64.G = out\n"+
-					"dstColorRGBA64.B = out\n"+
-					"dstColorRGBA64.A = 0xffff\n"+
-					"dst.Set(%s, %s, dstColor)",
-					args[2],
-					args[0], args[1],
-				)
-			case "*image.YCbCr":
-				return fmt.Sprintf(""+
-					"dstColorRGBA64.R = uint16(%sr)\n"+
-					"dstColorRGBA64.G = uint16(%sg)\n"+
-					"dstColorRGBA64.B = uint16(%sb)\n"+
-					"dstColorRGBA64.A = 0xffff\n"+
-					"dst.Set(%s, %s, dstColor)",
-					args[2], args[2], args[2],
-					args[0], args[1],
+					"%sa1 := (0xffff - uint32(%sa)) * 0x101\n"+
+					"dst.Pix[d+0] = uint8((uint32(dst.Pix[d+0])*%sa1/0xffff + uint32(%sr)) >> 8)\n"+
+					"dst.Pix[d+1] = uint8((uint32(dst.Pix[d+1])*%sa1/0xffff + uint32(%sg)) >> 8)\n"+
+					"dst.Pix[d+2] = uint8((uint32(dst.Pix[d+2])*%sa1/0xffff + uint32(%sb)) >> 8)\n"+
+					"dst.Pix[d+3] = uint8((uint32(dst.Pix[d+3])*%sa1/0xffff + uint32(%sa)) >> 8)",
+					args[2], args[2],
+					args[2], args[2],
+					args[2], args[2],
+					args[2], args[2],
+					args[2], args[2],
 				)
 			}
-		case "*image.RGBA":
-			switch d.sType {
+
+		case "Src":
+			switch d.dType {
 			default:
-				return fmt.Sprintf(""+
-					"dst.Pix[d+0] = uint8(uint32(%sr) >> 8)\n"+
-					"dst.Pix[d+1] = uint8(uint32(%sg) >> 8)\n"+
-					"dst.Pix[d+2] = uint8(uint32(%sb) >> 8)\n"+
-					"dst.Pix[d+3] = uint8(uint32(%sa) >> 8)",
-					args[2], args[2], args[2], args[2],
-				)
-			case "*image.Gray":
-				return fmt.Sprintf(""+
-					"out := uint8(uint32(%sr) >> 8)\n"+
-					"dst.Pix[d+0] = out\n"+
-					"dst.Pix[d+1] = out\n"+
-					"dst.Pix[d+2] = out\n"+
-					"dst.Pix[d+3] = 0xff",
-					args[2],
-				)
-			case "*image.YCbCr":
-				return fmt.Sprintf(""+
-					"dst.Pix[d+0] = uint8(uint32(%sr) >> 8)\n"+
-					"dst.Pix[d+1] = uint8(uint32(%sg) >> 8)\n"+
-					"dst.Pix[d+2] = uint8(uint32(%sb) >> 8)\n"+
-					"dst.Pix[d+3] = 0xff",
-					args[2], args[2], args[2],
-				)
+				log.Fatalf("bad dType %q", d.dType)
+			case "Image":
+				switch d.sType {
+				default:
+					return fmt.Sprintf(""+
+						"dstColorRGBA64.R = uint16(%sr)\n"+
+						"dstColorRGBA64.G = uint16(%sg)\n"+
+						"dstColorRGBA64.B = uint16(%sb)\n"+
+						"dstColorRGBA64.A = uint16(%sa)\n"+
+						"dst.Set(%s, %s, dstColor)",
+						args[2], args[2], args[2], args[2],
+						args[0], args[1],
+					)
+				case "*image.Gray":
+					return fmt.Sprintf(""+
+						"out := uint16(%sr)\n"+
+						"dstColorRGBA64.R = out\n"+
+						"dstColorRGBA64.G = out\n"+
+						"dstColorRGBA64.B = out\n"+
+						"dstColorRGBA64.A = 0xffff\n"+
+						"dst.Set(%s, %s, dstColor)",
+						args[2],
+						args[0], args[1],
+					)
+				case "*image.YCbCr":
+					return fmt.Sprintf(""+
+						"dstColorRGBA64.R = uint16(%sr)\n"+
+						"dstColorRGBA64.G = uint16(%sg)\n"+
+						"dstColorRGBA64.B = uint16(%sb)\n"+
+						"dstColorRGBA64.A = 0xffff\n"+
+						"dst.Set(%s, %s, dstColor)",
+						args[2], args[2], args[2],
+						args[0], args[1],
+					)
+				}
+			case "*image.RGBA":
+				switch d.sType {
+				default:
+					return fmt.Sprintf(""+
+						"dst.Pix[d+0] = uint8(uint32(%sr) >> 8)\n"+
+						"dst.Pix[d+1] = uint8(uint32(%sg) >> 8)\n"+
+						"dst.Pix[d+2] = uint8(uint32(%sb) >> 8)\n"+
+						"dst.Pix[d+3] = uint8(uint32(%sa) >> 8)",
+						args[2], args[2], args[2], args[2],
+					)
+				case "*image.Gray":
+					return fmt.Sprintf(""+
+						"out := uint8(uint32(%sr) >> 8)\n"+
+						"dst.Pix[d+0] = out\n"+
+						"dst.Pix[d+1] = out\n"+
+						"dst.Pix[d+2] = out\n"+
+						"dst.Pix[d+3] = 0xff",
+						args[2],
+					)
+				case "*image.YCbCr":
+					return fmt.Sprintf(""+
+						"dst.Pix[d+0] = uint8(uint32(%sr) >> 8)\n"+
+						"dst.Pix[d+1] = uint8(uint32(%sg) >> 8)\n"+
+						"dst.Pix[d+2] = uint8(uint32(%sb) >> 8)\n"+
+						"dst.Pix[d+3] = 0xff",
+						args[2], args[2], args[2],
+					)
+				}
 			}
 		}
 
 	case "outputf":
-		// TODO: handle op==Over, not just op==Src.
 		args, _ := splitArgs(suffix)
 		if len(args) != 5 {
 			return ""
 		}
 		ret := ""
-		switch d.dType {
-		default:
-			log.Fatalf("bad dType %q", d.dType)
-		case "Image":
-			switch d.sType {
+
+		switch d.op {
+		case "Over":
+			switch d.dType {
 			default:
+				log.Fatalf("bad dType %q", d.dType)
+			case "Image":
 				ret = fmt.Sprintf(""+
-					"dstColorRGBA64.R = %s(%sr * %s)\n"+
-					"dstColorRGBA64.G = %s(%sg * %s)\n"+
-					"dstColorRGBA64.B = %s(%sb * %s)\n"+
-					"dstColorRGBA64.A = %s(%sa * %s)\n"+
+					"qr, qg, qb, qa := dst.At(%s, %s).RGBA()\n"+
+					"%sr0 := uint32(%s(%sr * %s))\n"+
+					"%sg0 := uint32(%s(%sg * %s))\n"+
+					"%sb0 := uint32(%s(%sb * %s))\n"+
+					"%sa0 := uint32(%s(%sa * %s))\n"+
+					"%sa1 := 0xffff - %sa0\n"+
+					"dstColorRGBA64.R = uint16(qr*%sa1/0xffff + %sr0)\n"+
+					"dstColorRGBA64.G = uint16(qg*%sa1/0xffff + %sg0)\n"+
+					"dstColorRGBA64.B = uint16(qb*%sa1/0xffff + %sb0)\n"+
+					"dstColorRGBA64.A = uint16(qa*%sa1/0xffff + %sa0)\n"+
 					"dst.Set(%s, %s, dstColor)",
-					args[2], args[3], args[4],
-					args[2], args[3], args[4],
-					args[2], args[3], args[4],
-					args[2], args[3], args[4],
+					args[0], args[1],
+					args[3], args[2], args[3], args[4],
+					args[3], args[2], args[3], args[4],
+					args[3], args[2], args[3], args[4],
+					args[3], args[2], args[3], args[4],
+					args[3], args[3],
+					args[3], args[3],
+					args[3], args[3],
+					args[3], args[3],
+					args[3], args[3],
 					args[0], args[1],
 				)
-			case "*image.Gray":
+			case "*image.RGBA":
 				ret = fmt.Sprintf(""+
-					"out := %s(%sr * %s)\n"+
-					"dstColorRGBA64.R = out\n"+
-					"dstColorRGBA64.G = out\n"+
-					"dstColorRGBA64.B = out\n"+
-					"dstColorRGBA64.A = 0xffff\n"+
-					"dst.Set(%s, %s, dstColor)",
-					args[2], args[3], args[4],
-					args[0], args[1],
-				)
-			case "*image.YCbCr":
-				ret = fmt.Sprintf(""+
-					"dstColorRGBA64.R = %s(%sr * %s)\n"+
-					"dstColorRGBA64.G = %s(%sg * %s)\n"+
-					"dstColorRGBA64.B = %s(%sb * %s)\n"+
-					"dstColorRGBA64.A = 0xffff\n"+
-					"dst.Set(%s, %s, dstColor)",
-					args[2], args[3], args[4],
-					args[2], args[3], args[4],
-					args[2], args[3], args[4],
-					args[0], args[1],
+					"%sr0 := uint32(%s(%sr * %s))\n"+
+					"%sg0 := uint32(%s(%sg * %s))\n"+
+					"%sb0 := uint32(%s(%sb * %s))\n"+
+					"%sa0 := uint32(%s(%sa * %s))\n"+
+					"%sa1 := (0xffff - uint32(%sa0)) * 0x101\n"+
+					"dst.Pix[d+0] = uint8((uint32(dst.Pix[d+0])*%sa1/0xffff + %sr0) >> 8)\n"+
+					"dst.Pix[d+1] = uint8((uint32(dst.Pix[d+1])*%sa1/0xffff + %sg0) >> 8)\n"+
+					"dst.Pix[d+2] = uint8((uint32(dst.Pix[d+2])*%sa1/0xffff + %sb0) >> 8)\n"+
+					"dst.Pix[d+3] = uint8((uint32(dst.Pix[d+3])*%sa1/0xffff + %sa0) >> 8)",
+					args[3], args[2], args[3], args[4],
+					args[3], args[2], args[3], args[4],
+					args[3], args[2], args[3], args[4],
+					args[3], args[2], args[3], args[4],
+					args[3], args[3],
+					args[3], args[3],
+					args[3], args[3],
+					args[3], args[3],
+					args[3], args[3],
 				)
 			}
-		case "*image.RGBA":
-			switch d.sType {
+
+		case "Src":
+			switch d.dType {
 			default:
-				ret = fmt.Sprintf(""+
-					"dst.Pix[d+0] = uint8(%s(%sr * %s) >> 8)\n"+
-					"dst.Pix[d+1] = uint8(%s(%sg * %s) >> 8)\n"+
-					"dst.Pix[d+2] = uint8(%s(%sb * %s) >> 8)\n"+
-					"dst.Pix[d+3] = uint8(%s(%sa * %s) >> 8)",
-					args[2], args[3], args[4],
-					args[2], args[3], args[4],
-					args[2], args[3], args[4],
-					args[2], args[3], args[4],
-				)
-			case "*image.Gray":
-				ret = fmt.Sprintf(""+
-					"out := uint8(%s(%sr * %s) >> 8)\n"+
-					"dst.Pix[d+0] = out\n"+
-					"dst.Pix[d+1] = out\n"+
-					"dst.Pix[d+2] = out\n"+
-					"dst.Pix[d+3] = 0xff",
-					args[2], args[3], args[4],
-				)
-			case "*image.YCbCr":
-				ret = fmt.Sprintf(""+
-					"dst.Pix[d+0] = uint8(%s(%sr * %s) >> 8)\n"+
-					"dst.Pix[d+1] = uint8(%s(%sg * %s) >> 8)\n"+
-					"dst.Pix[d+2] = uint8(%s(%sb * %s) >> 8)\n"+
-					"dst.Pix[d+3] = 0xff",
-					args[2], args[3], args[4],
-					args[2], args[3], args[4],
-					args[2], args[3], args[4],
-				)
+				log.Fatalf("bad dType %q", d.dType)
+			case "Image":
+				switch d.sType {
+				default:
+					ret = fmt.Sprintf(""+
+						"dstColorRGBA64.R = %s(%sr * %s)\n"+
+						"dstColorRGBA64.G = %s(%sg * %s)\n"+
+						"dstColorRGBA64.B = %s(%sb * %s)\n"+
+						"dstColorRGBA64.A = %s(%sa * %s)\n"+
+						"dst.Set(%s, %s, dstColor)",
+						args[2], args[3], args[4],
+						args[2], args[3], args[4],
+						args[2], args[3], args[4],
+						args[2], args[3], args[4],
+						args[0], args[1],
+					)
+				case "*image.Gray":
+					ret = fmt.Sprintf(""+
+						"out := %s(%sr * %s)\n"+
+						"dstColorRGBA64.R = out\n"+
+						"dstColorRGBA64.G = out\n"+
+						"dstColorRGBA64.B = out\n"+
+						"dstColorRGBA64.A = 0xffff\n"+
+						"dst.Set(%s, %s, dstColor)",
+						args[2], args[3], args[4],
+						args[0], args[1],
+					)
+				case "*image.YCbCr":
+					ret = fmt.Sprintf(""+
+						"dstColorRGBA64.R = %s(%sr * %s)\n"+
+						"dstColorRGBA64.G = %s(%sg * %s)\n"+
+						"dstColorRGBA64.B = %s(%sb * %s)\n"+
+						"dstColorRGBA64.A = 0xffff\n"+
+						"dst.Set(%s, %s, dstColor)",
+						args[2], args[3], args[4],
+						args[2], args[3], args[4],
+						args[2], args[3], args[4],
+						args[0], args[1],
+					)
+				}
+			case "*image.RGBA":
+				switch d.sType {
+				default:
+					ret = fmt.Sprintf(""+
+						"dst.Pix[d+0] = uint8(%s(%sr * %s) >> 8)\n"+
+						"dst.Pix[d+1] = uint8(%s(%sg * %s) >> 8)\n"+
+						"dst.Pix[d+2] = uint8(%s(%sb * %s) >> 8)\n"+
+						"dst.Pix[d+3] = uint8(%s(%sa * %s) >> 8)",
+						args[2], args[3], args[4],
+						args[2], args[3], args[4],
+						args[2], args[3], args[4],
+						args[2], args[3], args[4],
+					)
+				case "*image.Gray":
+					ret = fmt.Sprintf(""+
+						"out := uint8(%s(%sr * %s) >> 8)\n"+
+						"dst.Pix[d+0] = out\n"+
+						"dst.Pix[d+1] = out\n"+
+						"dst.Pix[d+2] = out\n"+
+						"dst.Pix[d+3] = 0xff",
+						args[2], args[3], args[4],
+					)
+				case "*image.YCbCr":
+					ret = fmt.Sprintf(""+
+						"dst.Pix[d+0] = uint8(%s(%sr * %s) >> 8)\n"+
+						"dst.Pix[d+1] = uint8(%s(%sg * %s) >> 8)\n"+
+						"dst.Pix[d+2] = uint8(%s(%sb * %s) >> 8)\n"+
+						"dst.Pix[d+3] = 0xff",
+						args[2], args[3], args[4],
+						args[2], args[3], args[4],
+						args[2], args[3], args[4],
+					)
+				}
 			}
 		}
+
 		return strings.Replace(ret, " * 1)", ")", -1)
 
 	case "srcf", "srcu":
diff --git a/draw/impl.go b/draw/impl.go
index 4ad48ca..5292ced 100644
--- a/draw/impl.go
+++ b/draw/impl.go
@@ -216,10 +216,11 @@
 			pr := uint32(src.Pix[pi+0]) * pa / 0xff
 			pg := uint32(src.Pix[pi+1]) * pa / 0xff
 			pb := uint32(src.Pix[pi+2]) * pa / 0xff
-			dst.Pix[d+0] = uint8(uint32(pr) >> 8)
-			dst.Pix[d+1] = uint8(uint32(pg) >> 8)
-			dst.Pix[d+2] = uint8(uint32(pb) >> 8)
-			dst.Pix[d+3] = uint8(uint32(pa) >> 8)
+			pa1 := (0xffff - uint32(pa)) * 0x101
+			dst.Pix[d+0] = uint8((uint32(dst.Pix[d+0])*pa1/0xffff + uint32(pr)) >> 8)
+			dst.Pix[d+1] = uint8((uint32(dst.Pix[d+1])*pa1/0xffff + uint32(pg)) >> 8)
+			dst.Pix[d+2] = uint8((uint32(dst.Pix[d+2])*pa1/0xffff + uint32(pb)) >> 8)
+			dst.Pix[d+3] = uint8((uint32(dst.Pix[d+3])*pa1/0xffff + uint32(pa)) >> 8)
 		}
 	}
 }
@@ -262,10 +263,11 @@
 			pg := uint32(src.Pix[pi+1]) * 0x101
 			pb := uint32(src.Pix[pi+2]) * 0x101
 			pa := uint32(src.Pix[pi+3]) * 0x101
-			dst.Pix[d+0] = uint8(uint32(pr) >> 8)
-			dst.Pix[d+1] = uint8(uint32(pg) >> 8)
-			dst.Pix[d+2] = uint8(uint32(pb) >> 8)
-			dst.Pix[d+3] = uint8(uint32(pa) >> 8)
+			pa1 := (0xffff - uint32(pa)) * 0x101
+			dst.Pix[d+0] = uint8((uint32(dst.Pix[d+0])*pa1/0xffff + uint32(pr)) >> 8)
+			dst.Pix[d+1] = uint8((uint32(dst.Pix[d+1])*pa1/0xffff + uint32(pg)) >> 8)
+			dst.Pix[d+2] = uint8((uint32(dst.Pix[d+2])*pa1/0xffff + uint32(pb)) >> 8)
+			dst.Pix[d+3] = uint8((uint32(dst.Pix[d+3])*pa1/0xffff + uint32(pa)) >> 8)
 		}
 	}
 }
@@ -476,10 +478,11 @@
 		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
 			sx := (2*uint64(dx) + 1) * sw / dw2
 			pr, pg, pb, pa := src.At(sr.Min.X+int(sx), sr.Min.Y+int(sy)).RGBA()
-			dst.Pix[d+0] = uint8(uint32(pr) >> 8)
-			dst.Pix[d+1] = uint8(uint32(pg) >> 8)
-			dst.Pix[d+2] = uint8(uint32(pb) >> 8)
-			dst.Pix[d+3] = uint8(uint32(pa) >> 8)
+			pa1 := (0xffff - uint32(pa)) * 0x101
+			dst.Pix[d+0] = uint8((uint32(dst.Pix[d+0])*pa1/0xffff + uint32(pr)) >> 8)
+			dst.Pix[d+1] = uint8((uint32(dst.Pix[d+1])*pa1/0xffff + uint32(pg)) >> 8)
+			dst.Pix[d+2] = uint8((uint32(dst.Pix[d+2])*pa1/0xffff + uint32(pb)) >> 8)
+			dst.Pix[d+3] = uint8((uint32(dst.Pix[d+3])*pa1/0xffff + uint32(pa)) >> 8)
 		}
 	}
 }
@@ -515,10 +518,12 @@
 		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx++ {
 			sx := (2*uint64(dx) + 1) * sw / dw2
 			pr, pg, pb, pa := src.At(sr.Min.X+int(sx), sr.Min.Y+int(sy)).RGBA()
-			dstColorRGBA64.R = uint16(pr)
-			dstColorRGBA64.G = uint16(pg)
-			dstColorRGBA64.B = uint16(pb)
-			dstColorRGBA64.A = uint16(pa)
+			qr, qg, qb, qa := dst.At(dr.Min.X+int(dx), dr.Min.Y+int(dy)).RGBA()
+			pa1 := 0xffff - uint32(pa)
+			dstColorRGBA64.R = uint16(qr*pa1/0xffff + uint32(pr))
+			dstColorRGBA64.G = uint16(qg*pa1/0xffff + uint32(pg))
+			dstColorRGBA64.B = uint16(qb*pa1/0xffff + uint32(pb))
+			dstColorRGBA64.A = uint16(qa*pa1/0xffff + uint32(pa))
 			dst.Set(dr.Min.X+int(dx), dr.Min.Y+int(dy), dstColor)
 		}
 	}
@@ -583,10 +588,11 @@
 			pr := uint32(src.Pix[pi+0]) * pa / 0xff
 			pg := uint32(src.Pix[pi+1]) * pa / 0xff
 			pb := uint32(src.Pix[pi+2]) * pa / 0xff
-			dst.Pix[d+0] = uint8(uint32(pr) >> 8)
-			dst.Pix[d+1] = uint8(uint32(pg) >> 8)
-			dst.Pix[d+2] = uint8(uint32(pb) >> 8)
-			dst.Pix[d+3] = uint8(uint32(pa) >> 8)
+			pa1 := (0xffff - uint32(pa)) * 0x101
+			dst.Pix[d+0] = uint8((uint32(dst.Pix[d+0])*pa1/0xffff + uint32(pr)) >> 8)
+			dst.Pix[d+1] = uint8((uint32(dst.Pix[d+1])*pa1/0xffff + uint32(pg)) >> 8)
+			dst.Pix[d+2] = uint8((uint32(dst.Pix[d+2])*pa1/0xffff + uint32(pb)) >> 8)
+			dst.Pix[d+3] = uint8((uint32(dst.Pix[d+3])*pa1/0xffff + uint32(pa)) >> 8)
 		}
 	}
 }
@@ -631,10 +637,11 @@
 			pg := uint32(src.Pix[pi+1]) * 0x101
 			pb := uint32(src.Pix[pi+2]) * 0x101
 			pa := uint32(src.Pix[pi+3]) * 0x101
-			dst.Pix[d+0] = uint8(uint32(pr) >> 8)
-			dst.Pix[d+1] = uint8(uint32(pg) >> 8)
-			dst.Pix[d+2] = uint8(uint32(pb) >> 8)
-			dst.Pix[d+3] = uint8(uint32(pa) >> 8)
+			pa1 := (0xffff - uint32(pa)) * 0x101
+			dst.Pix[d+0] = uint8((uint32(dst.Pix[d+0])*pa1/0xffff + uint32(pr)) >> 8)
+			dst.Pix[d+1] = uint8((uint32(dst.Pix[d+1])*pa1/0xffff + uint32(pg)) >> 8)
+			dst.Pix[d+2] = uint8((uint32(dst.Pix[d+2])*pa1/0xffff + uint32(pb)) >> 8)
+			dst.Pix[d+3] = uint8((uint32(dst.Pix[d+3])*pa1/0xffff + uint32(pa)) >> 8)
 		}
 	}
 }
@@ -851,10 +858,11 @@
 				continue
 			}
 			pr, pg, pb, pa := src.At(sx0, sy0).RGBA()
-			dst.Pix[d+0] = uint8(uint32(pr) >> 8)
-			dst.Pix[d+1] = uint8(uint32(pg) >> 8)
-			dst.Pix[d+2] = uint8(uint32(pb) >> 8)
-			dst.Pix[d+3] = uint8(uint32(pa) >> 8)
+			pa1 := (0xffff - uint32(pa)) * 0x101
+			dst.Pix[d+0] = uint8((uint32(dst.Pix[d+0])*pa1/0xffff + uint32(pr)) >> 8)
+			dst.Pix[d+1] = uint8((uint32(dst.Pix[d+1])*pa1/0xffff + uint32(pg)) >> 8)
+			dst.Pix[d+2] = uint8((uint32(dst.Pix[d+2])*pa1/0xffff + uint32(pb)) >> 8)
+			dst.Pix[d+3] = uint8((uint32(dst.Pix[d+3])*pa1/0xffff + uint32(pa)) >> 8)
 		}
 	}
 }
@@ -892,10 +900,12 @@
 				continue
 			}
 			pr, pg, pb, pa := src.At(sx0, sy0).RGBA()
-			dstColorRGBA64.R = uint16(pr)
-			dstColorRGBA64.G = uint16(pg)
-			dstColorRGBA64.B = uint16(pb)
-			dstColorRGBA64.A = uint16(pa)
+			qr, qg, qb, qa := dst.At(dr.Min.X+int(dx), dr.Min.Y+int(dy)).RGBA()
+			pa1 := 0xffff - uint32(pa)
+			dstColorRGBA64.R = uint16(qr*pa1/0xffff + uint32(pr))
+			dstColorRGBA64.G = uint16(qg*pa1/0xffff + uint32(pg))
+			dstColorRGBA64.B = uint16(qb*pa1/0xffff + uint32(pb))
+			dstColorRGBA64.A = uint16(qa*pa1/0xffff + uint32(pa))
 			dst.Set(dr.Min.X+int(dx), dr.Min.Y+int(dy), dstColor)
 		}
 	}
@@ -1243,10 +1253,11 @@
 			s11g = yFrac1*s10g + yFrac0*s11g
 			s11b = yFrac1*s10b + yFrac0*s11b
 			s11a = yFrac1*s10a + yFrac0*s11a
-			dst.Pix[d+0] = uint8(uint32(s11r) >> 8)
-			dst.Pix[d+1] = uint8(uint32(s11g) >> 8)
-			dst.Pix[d+2] = uint8(uint32(s11b) >> 8)
-			dst.Pix[d+3] = uint8(uint32(s11a) >> 8)
+			s11a1 := (0xffff - uint32(s11a)) * 0x101
+			dst.Pix[d+0] = uint8((uint32(dst.Pix[d+0])*s11a1/0xffff + uint32(s11r)) >> 8)
+			dst.Pix[d+1] = uint8((uint32(dst.Pix[d+1])*s11a1/0xffff + uint32(s11g)) >> 8)
+			dst.Pix[d+2] = uint8((uint32(dst.Pix[d+2])*s11a1/0xffff + uint32(s11b)) >> 8)
+			dst.Pix[d+3] = uint8((uint32(dst.Pix[d+3])*s11a1/0xffff + uint32(s11a)) >> 8)
 		}
 	}
 }
@@ -1433,10 +1444,11 @@
 			s11g = yFrac1*s10g + yFrac0*s11g
 			s11b = yFrac1*s10b + yFrac0*s11b
 			s11a = yFrac1*s10a + yFrac0*s11a
-			dst.Pix[d+0] = uint8(uint32(s11r) >> 8)
-			dst.Pix[d+1] = uint8(uint32(s11g) >> 8)
-			dst.Pix[d+2] = uint8(uint32(s11b) >> 8)
-			dst.Pix[d+3] = uint8(uint32(s11a) >> 8)
+			s11a1 := (0xffff - uint32(s11a)) * 0x101
+			dst.Pix[d+0] = uint8((uint32(dst.Pix[d+0])*s11a1/0xffff + uint32(s11r)) >> 8)
+			dst.Pix[d+1] = uint8((uint32(dst.Pix[d+1])*s11a1/0xffff + uint32(s11g)) >> 8)
+			dst.Pix[d+2] = uint8((uint32(dst.Pix[d+2])*s11a1/0xffff + uint32(s11b)) >> 8)
+			dst.Pix[d+3] = uint8((uint32(dst.Pix[d+3])*s11a1/0xffff + uint32(s11a)) >> 8)
 		}
 	}
 }
@@ -2295,10 +2307,11 @@
 			s11g = yFrac1*s10g + yFrac0*s11g
 			s11b = yFrac1*s10b + yFrac0*s11b
 			s11a = yFrac1*s10a + yFrac0*s11a
-			dst.Pix[d+0] = uint8(uint32(s11r) >> 8)
-			dst.Pix[d+1] = uint8(uint32(s11g) >> 8)
-			dst.Pix[d+2] = uint8(uint32(s11b) >> 8)
-			dst.Pix[d+3] = uint8(uint32(s11a) >> 8)
+			s11a1 := (0xffff - uint32(s11a)) * 0x101
+			dst.Pix[d+0] = uint8((uint32(dst.Pix[d+0])*s11a1/0xffff + uint32(s11r)) >> 8)
+			dst.Pix[d+1] = uint8((uint32(dst.Pix[d+1])*s11a1/0xffff + uint32(s11g)) >> 8)
+			dst.Pix[d+2] = uint8((uint32(dst.Pix[d+2])*s11a1/0xffff + uint32(s11b)) >> 8)
+			dst.Pix[d+3] = uint8((uint32(dst.Pix[d+3])*s11a1/0xffff + uint32(s11a)) >> 8)
 		}
 	}
 }
@@ -2454,10 +2467,12 @@
 			s11g = yFrac1*s10g + yFrac0*s11g
 			s11b = yFrac1*s10b + yFrac0*s11b
 			s11a = yFrac1*s10a + yFrac0*s11a
-			dstColorRGBA64.R = uint16(s11r)
-			dstColorRGBA64.G = uint16(s11g)
-			dstColorRGBA64.B = uint16(s11b)
-			dstColorRGBA64.A = uint16(s11a)
+			qr, qg, qb, qa := dst.At(dr.Min.X+int(dx), dr.Min.Y+int(dy)).RGBA()
+			s11a1 := 0xffff - uint32(s11a)
+			dstColorRGBA64.R = uint16(qr*s11a1/0xffff + uint32(s11r))
+			dstColorRGBA64.G = uint16(qg*s11a1/0xffff + uint32(s11g))
+			dstColorRGBA64.B = uint16(qb*s11a1/0xffff + uint32(s11b))
+			dstColorRGBA64.A = uint16(qa*s11a1/0xffff + uint32(s11a))
 			dst.Set(dr.Min.X+int(dx), dr.Min.Y+int(dy), dstColor)
 		}
 	}
@@ -2696,10 +2711,11 @@
 			s11g = yFrac1*s10g + yFrac0*s11g
 			s11b = yFrac1*s10b + yFrac0*s11b
 			s11a = yFrac1*s10a + yFrac0*s11a
-			dst.Pix[d+0] = uint8(uint32(s11r) >> 8)
-			dst.Pix[d+1] = uint8(uint32(s11g) >> 8)
-			dst.Pix[d+2] = uint8(uint32(s11b) >> 8)
-			dst.Pix[d+3] = uint8(uint32(s11a) >> 8)
+			s11a1 := (0xffff - uint32(s11a)) * 0x101
+			dst.Pix[d+0] = uint8((uint32(dst.Pix[d+0])*s11a1/0xffff + uint32(s11r)) >> 8)
+			dst.Pix[d+1] = uint8((uint32(dst.Pix[d+1])*s11a1/0xffff + uint32(s11g)) >> 8)
+			dst.Pix[d+2] = uint8((uint32(dst.Pix[d+2])*s11a1/0xffff + uint32(s11b)) >> 8)
+			dst.Pix[d+3] = uint8((uint32(dst.Pix[d+3])*s11a1/0xffff + uint32(s11a)) >> 8)
 		}
 	}
 }
@@ -2888,10 +2904,11 @@
 			s11g = yFrac1*s10g + yFrac0*s11g
 			s11b = yFrac1*s10b + yFrac0*s11b
 			s11a = yFrac1*s10a + yFrac0*s11a
-			dst.Pix[d+0] = uint8(uint32(s11r) >> 8)
-			dst.Pix[d+1] = uint8(uint32(s11g) >> 8)
-			dst.Pix[d+2] = uint8(uint32(s11b) >> 8)
-			dst.Pix[d+3] = uint8(uint32(s11a) >> 8)
+			s11a1 := (0xffff - uint32(s11a)) * 0x101
+			dst.Pix[d+0] = uint8((uint32(dst.Pix[d+0])*s11a1/0xffff + uint32(s11r)) >> 8)
+			dst.Pix[d+1] = uint8((uint32(dst.Pix[d+1])*s11a1/0xffff + uint32(s11g)) >> 8)
+			dst.Pix[d+2] = uint8((uint32(dst.Pix[d+2])*s11a1/0xffff + uint32(s11b)) >> 8)
+			dst.Pix[d+3] = uint8((uint32(dst.Pix[d+3])*s11a1/0xffff + uint32(s11a)) >> 8)
 		}
 	}
 }
@@ -3756,10 +3773,11 @@
 			s11g = yFrac1*s10g + yFrac0*s11g
 			s11b = yFrac1*s10b + yFrac0*s11b
 			s11a = yFrac1*s10a + yFrac0*s11a
-			dst.Pix[d+0] = uint8(uint32(s11r) >> 8)
-			dst.Pix[d+1] = uint8(uint32(s11g) >> 8)
-			dst.Pix[d+2] = uint8(uint32(s11b) >> 8)
-			dst.Pix[d+3] = uint8(uint32(s11a) >> 8)
+			s11a1 := (0xffff - uint32(s11a)) * 0x101
+			dst.Pix[d+0] = uint8((uint32(dst.Pix[d+0])*s11a1/0xffff + uint32(s11r)) >> 8)
+			dst.Pix[d+1] = uint8((uint32(dst.Pix[d+1])*s11a1/0xffff + uint32(s11g)) >> 8)
+			dst.Pix[d+2] = uint8((uint32(dst.Pix[d+2])*s11a1/0xffff + uint32(s11b)) >> 8)
+			dst.Pix[d+3] = uint8((uint32(dst.Pix[d+3])*s11a1/0xffff + uint32(s11a)) >> 8)
 		}
 	}
 }
@@ -3917,10 +3935,12 @@
 			s11g = yFrac1*s10g + yFrac0*s11g
 			s11b = yFrac1*s10b + yFrac0*s11b
 			s11a = yFrac1*s10a + yFrac0*s11a
-			dstColorRGBA64.R = uint16(s11r)
-			dstColorRGBA64.G = uint16(s11g)
-			dstColorRGBA64.B = uint16(s11b)
-			dstColorRGBA64.A = uint16(s11a)
+			qr, qg, qb, qa := dst.At(dr.Min.X+int(dx), dr.Min.Y+int(dy)).RGBA()
+			s11a1 := 0xffff - uint32(s11a)
+			dstColorRGBA64.R = uint16(qr*s11a1/0xffff + uint32(s11r))
+			dstColorRGBA64.G = uint16(qg*s11a1/0xffff + uint32(s11g))
+			dstColorRGBA64.B = uint16(qb*s11a1/0xffff + uint32(s11b))
+			dstColorRGBA64.A = uint16(qa*s11a1/0xffff + uint32(s11a))
 			dst.Set(dr.Min.X+int(dx), dr.Min.Y+int(dy), dstColor)
 		}
 	}
@@ -4505,10 +4525,15 @@
 				pb = pa
 			}
 
-			dst.Pix[d+0] = uint8(ftou(pr*s.invTotalWeight) >> 8)
-			dst.Pix[d+1] = uint8(ftou(pg*s.invTotalWeight) >> 8)
-			dst.Pix[d+2] = uint8(ftou(pb*s.invTotalWeight) >> 8)
-			dst.Pix[d+3] = uint8(ftou(pa*s.invTotalWeight) >> 8)
+			pr0 := uint32(ftou(pr * s.invTotalWeight))
+			pg0 := uint32(ftou(pg * s.invTotalWeight))
+			pb0 := uint32(ftou(pb * s.invTotalWeight))
+			pa0 := uint32(ftou(pa * s.invTotalWeight))
+			pa1 := (0xffff - uint32(pa0)) * 0x101
+			dst.Pix[d+0] = uint8((uint32(dst.Pix[d+0])*pa1/0xffff + pr0) >> 8)
+			dst.Pix[d+1] = uint8((uint32(dst.Pix[d+1])*pa1/0xffff + pg0) >> 8)
+			dst.Pix[d+2] = uint8((uint32(dst.Pix[d+2])*pa1/0xffff + pb0) >> 8)
+			dst.Pix[d+3] = uint8((uint32(dst.Pix[d+3])*pa1/0xffff + pa0) >> 8)
 			d += dst.Stride
 		}
 	}
@@ -4570,10 +4595,16 @@
 				pb = pa
 			}
 
-			dstColorRGBA64.R = ftou(pr * s.invTotalWeight)
-			dstColorRGBA64.G = ftou(pg * s.invTotalWeight)
-			dstColorRGBA64.B = ftou(pb * s.invTotalWeight)
-			dstColorRGBA64.A = ftou(pa * s.invTotalWeight)
+			qr, qg, qb, qa := dst.At(dr.Min.X+int(dx), dr.Min.Y+int(adr.Min.Y+dy)).RGBA()
+			pr0 := uint32(ftou(pr * s.invTotalWeight))
+			pg0 := uint32(ftou(pg * s.invTotalWeight))
+			pb0 := uint32(ftou(pb * s.invTotalWeight))
+			pa0 := uint32(ftou(pa * s.invTotalWeight))
+			pa1 := 0xffff - pa0
+			dstColorRGBA64.R = uint16(qr*pa1/0xffff + pr0)
+			dstColorRGBA64.G = uint16(qg*pa1/0xffff + pg0)
+			dstColorRGBA64.B = uint16(qb*pa1/0xffff + pb0)
+			dstColorRGBA64.A = uint16(qa*pa1/0xffff + pa0)
 			dst.Set(dr.Min.X+int(dx), dr.Min.Y+int(adr.Min.Y+dy), dstColor)
 		}
 	}
@@ -4818,10 +4849,15 @@
 				pb = pa
 			}
 
-			dst.Pix[d+0] = uint8(fffftou(pr) >> 8)
-			dst.Pix[d+1] = uint8(fffftou(pg) >> 8)
-			dst.Pix[d+2] = uint8(fffftou(pb) >> 8)
-			dst.Pix[d+3] = uint8(fffftou(pa) >> 8)
+			pr0 := uint32(fffftou(pr))
+			pg0 := uint32(fffftou(pg))
+			pb0 := uint32(fffftou(pb))
+			pa0 := uint32(fffftou(pa))
+			pa1 := (0xffff - uint32(pa0)) * 0x101
+			dst.Pix[d+0] = uint8((uint32(dst.Pix[d+0])*pa1/0xffff + pr0) >> 8)
+			dst.Pix[d+1] = uint8((uint32(dst.Pix[d+1])*pa1/0xffff + pg0) >> 8)
+			dst.Pix[d+2] = uint8((uint32(dst.Pix[d+2])*pa1/0xffff + pb0) >> 8)
+			dst.Pix[d+3] = uint8((uint32(dst.Pix[d+3])*pa1/0xffff + pa0) >> 8)
 		}
 	}
 }
@@ -5048,10 +5084,15 @@
 				pb = pa
 			}
 
-			dst.Pix[d+0] = uint8(fffftou(pr) >> 8)
-			dst.Pix[d+1] = uint8(fffftou(pg) >> 8)
-			dst.Pix[d+2] = uint8(fffftou(pb) >> 8)
-			dst.Pix[d+3] = uint8(fffftou(pa) >> 8)
+			pr0 := uint32(fffftou(pr))
+			pg0 := uint32(fffftou(pg))
+			pb0 := uint32(fffftou(pb))
+			pa0 := uint32(fffftou(pa))
+			pa1 := (0xffff - uint32(pa0)) * 0x101
+			dst.Pix[d+0] = uint8((uint32(dst.Pix[d+0])*pa1/0xffff + pr0) >> 8)
+			dst.Pix[d+1] = uint8((uint32(dst.Pix[d+1])*pa1/0xffff + pg0) >> 8)
+			dst.Pix[d+2] = uint8((uint32(dst.Pix[d+2])*pa1/0xffff + pb0) >> 8)
+			dst.Pix[d+3] = uint8((uint32(dst.Pix[d+3])*pa1/0xffff + pa0) >> 8)
 		}
 	}
 }
@@ -5770,10 +5811,15 @@
 				pb = pa
 			}
 
-			dst.Pix[d+0] = uint8(fffftou(pr) >> 8)
-			dst.Pix[d+1] = uint8(fffftou(pg) >> 8)
-			dst.Pix[d+2] = uint8(fffftou(pb) >> 8)
-			dst.Pix[d+3] = uint8(fffftou(pa) >> 8)
+			pr0 := uint32(fffftou(pr))
+			pg0 := uint32(fffftou(pg))
+			pb0 := uint32(fffftou(pb))
+			pa0 := uint32(fffftou(pa))
+			pa1 := (0xffff - uint32(pa0)) * 0x101
+			dst.Pix[d+0] = uint8((uint32(dst.Pix[d+0])*pa1/0xffff + pr0) >> 8)
+			dst.Pix[d+1] = uint8((uint32(dst.Pix[d+1])*pa1/0xffff + pg0) >> 8)
+			dst.Pix[d+2] = uint8((uint32(dst.Pix[d+2])*pa1/0xffff + pb0) >> 8)
+			dst.Pix[d+3] = uint8((uint32(dst.Pix[d+3])*pa1/0xffff + pa0) >> 8)
 		}
 	}
 }
@@ -5993,10 +6039,16 @@
 				pb = pa
 			}
 
-			dstColorRGBA64.R = fffftou(pr)
-			dstColorRGBA64.G = fffftou(pg)
-			dstColorRGBA64.B = fffftou(pb)
-			dstColorRGBA64.A = fffftou(pa)
+			qr, qg, qb, qa := dst.At(dr.Min.X+int(dx), dr.Min.Y+int(dy)).RGBA()
+			pr0 := uint32(fffftou(pr))
+			pg0 := uint32(fffftou(pg))
+			pb0 := uint32(fffftou(pb))
+			pa0 := uint32(fffftou(pa))
+			pa1 := 0xffff - pa0
+			dstColorRGBA64.R = uint16(qr*pa1/0xffff + pr0)
+			dstColorRGBA64.G = uint16(qg*pa1/0xffff + pg0)
+			dstColorRGBA64.B = uint16(qb*pa1/0xffff + pb0)
+			dstColorRGBA64.A = uint16(qa*pa1/0xffff + pa0)
 			dst.Set(dr.Min.X+int(dx), dr.Min.Y+int(dy), dstColor)
 		}
 	}
diff --git a/draw/scale.go b/draw/scale.go
index 00ef1d3..8207e81 100644
--- a/draw/scale.go
+++ b/draw/scale.go
@@ -381,52 +381,102 @@
 }
 
 func transform_Uniform(dst Image, dr, adr image.Rectangle, d2s *f64.Aff3, src *image.Uniform, sr image.Rectangle, bias image.Point, op Op) {
-	// TODO: implement op == Over.
-	switch dst := dst.(type) {
-	case *image.RGBA:
-		pr, pg, pb, pa := src.C.RGBA()
-		pr8 := uint8(pr >> 8)
-		pg8 := uint8(pg >> 8)
-		pb8 := uint8(pb >> 8)
-		pa8 := uint8(pa >> 8)
+	switch op {
+	case Over:
+		switch dst := dst.(type) {
+		case *image.RGBA:
+			pr, pg, pb, pa := src.C.RGBA()
+			pa1 := (0xffff - pa) * 0x101
 
-		for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
-			dyf := float64(dr.Min.Y+int(dy)) + 0.5
-			d := dst.PixOffset(dr.Min.X+adr.Min.X, dr.Min.Y+int(dy))
-			for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
-				dxf := float64(dr.Min.X+int(dx)) + 0.5
-				sx0 := int(d2s[0]*dxf+d2s[1]*dyf+d2s[2]) + bias.X
-				sy0 := int(d2s[3]*dxf+d2s[4]*dyf+d2s[5]) + bias.Y
-				if !(image.Point{sx0, sy0}).In(sr) {
-					continue
+			for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+				dyf := float64(dr.Min.Y+int(dy)) + 0.5
+				d := dst.PixOffset(dr.Min.X+adr.Min.X, dr.Min.Y+int(dy))
+				for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
+					dxf := float64(dr.Min.X+int(dx)) + 0.5
+					sx0 := int(d2s[0]*dxf+d2s[1]*dyf+d2s[2]) + bias.X
+					sy0 := int(d2s[3]*dxf+d2s[4]*dyf+d2s[5]) + bias.Y
+					if !(image.Point{sx0, sy0}).In(sr) {
+						continue
+					}
+					dst.Pix[d+0] = uint8((uint32(dst.Pix[d+0])*pa1/0xffff + pr) >> 8)
+					dst.Pix[d+1] = uint8((uint32(dst.Pix[d+1])*pa1/0xffff + pg) >> 8)
+					dst.Pix[d+2] = uint8((uint32(dst.Pix[d+2])*pa1/0xffff + pb) >> 8)
+					dst.Pix[d+3] = uint8((uint32(dst.Pix[d+3])*pa1/0xffff + pa) >> 8)
 				}
-				dst.Pix[d+0] = pr8
-				dst.Pix[d+1] = pg8
-				dst.Pix[d+2] = pb8
-				dst.Pix[d+3] = pa8
+			}
+
+		default:
+			pr, pg, pb, pa := src.C.RGBA()
+			pa1 := 0xffff - pa
+			dstColorRGBA64 := &color.RGBA64{}
+			dstColor := color.Color(dstColorRGBA64)
+
+			for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+				dyf := float64(dr.Min.Y+int(dy)) + 0.5
+				for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx++ {
+					dxf := float64(dr.Min.X+int(dx)) + 0.5
+					sx0 := int(d2s[0]*dxf+d2s[1]*dyf+d2s[2]) + bias.X
+					sy0 := int(d2s[3]*dxf+d2s[4]*dyf+d2s[5]) + bias.Y
+					if !(image.Point{sx0, sy0}).In(sr) {
+						continue
+					}
+					qr, qg, qb, qa := dst.At(dr.Min.X+int(dx), dr.Min.Y+int(dy)).RGBA()
+					dstColorRGBA64.R = uint16(qr*pa1/0xffff + pr)
+					dstColorRGBA64.G = uint16(qg*pa1/0xffff + pg)
+					dstColorRGBA64.B = uint16(qb*pa1/0xffff + pb)
+					dstColorRGBA64.A = uint16(qa*pa1/0xffff + pa)
+					dst.Set(dr.Min.X+int(dx), dr.Min.Y+int(dy), dstColor)
+				}
 			}
 		}
 
-	default:
-		pr, pg, pb, pa := src.C.RGBA()
-		dstColorRGBA64 := &color.RGBA64{
-			uint16(pr),
-			uint16(pg),
-			uint16(pb),
-			uint16(pa),
-		}
-		dstColor := color.Color(dstColorRGBA64)
+	case Src:
+		switch dst := dst.(type) {
+		case *image.RGBA:
+			pr, pg, pb, pa := src.C.RGBA()
+			pr8 := uint8(pr >> 8)
+			pg8 := uint8(pg >> 8)
+			pb8 := uint8(pb >> 8)
+			pa8 := uint8(pa >> 8)
 
-		for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
-			dyf := float64(dr.Min.Y+int(dy)) + 0.5
-			for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx++ {
-				dxf := float64(dr.Min.X+int(dx)) + 0.5
-				sx0 := int(d2s[0]*dxf+d2s[1]*dyf+d2s[2]) + bias.X
-				sy0 := int(d2s[3]*dxf+d2s[4]*dyf+d2s[5]) + bias.Y
-				if !(image.Point{sx0, sy0}).In(sr) {
-					continue
+			for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+				dyf := float64(dr.Min.Y+int(dy)) + 0.5
+				d := dst.PixOffset(dr.Min.X+adr.Min.X, dr.Min.Y+int(dy))
+				for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
+					dxf := float64(dr.Min.X+int(dx)) + 0.5
+					sx0 := int(d2s[0]*dxf+d2s[1]*dyf+d2s[2]) + bias.X
+					sy0 := int(d2s[3]*dxf+d2s[4]*dyf+d2s[5]) + bias.Y
+					if !(image.Point{sx0, sy0}).In(sr) {
+						continue
+					}
+					dst.Pix[d+0] = pr8
+					dst.Pix[d+1] = pg8
+					dst.Pix[d+2] = pb8
+					dst.Pix[d+3] = pa8
 				}
-				dst.Set(dr.Min.X+int(dx), dr.Min.Y+int(dy), dstColor)
+			}
+
+		default:
+			pr, pg, pb, pa := src.C.RGBA()
+			dstColorRGBA64 := &color.RGBA64{
+				uint16(pr),
+				uint16(pg),
+				uint16(pb),
+				uint16(pa),
+			}
+			dstColor := color.Color(dstColorRGBA64)
+
+			for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+				dyf := float64(dr.Min.Y+int(dy)) + 0.5
+				for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx++ {
+					dxf := float64(dr.Min.X+int(dx)) + 0.5
+					sx0 := int(d2s[0]*dxf+d2s[1]*dyf+d2s[2]) + bias.X
+					sy0 := int(d2s[3]*dxf+d2s[4]*dyf+d2s[5]) + bias.Y
+					if !(image.Point{sx0, sy0}).In(sr) {
+						continue
+					}
+					dst.Set(dr.Min.X+int(dx), dr.Min.Y+int(dy), dstColor)
+				}
 			}
 		}
 	}
diff --git a/draw/scale_test.go b/draw/scale_test.go
index 47a06bf..a1fb5cc 100644
--- a/draw/scale_test.go
+++ b/draw/scale_test.go
@@ -23,8 +23,8 @@
 
 var genGoldenFiles = flag.Bool("gen_golden_files", false, "whether to generate the TestXxx golden files.")
 
-var transformMatrix = func(tx, ty float64) *f64.Aff3 {
-	const scale, cos30, sin30 = 3.75, 0.866025404, 0.5
+var transformMatrix = func(scale, tx, ty float64) *f64.Aff3 {
+	const cos30, sin30 = 0.866025404, 0.5
 	return &f64.Aff3{
 		+scale * cos30, -scale * sin30, tx,
 		+scale * sin30, +scale * cos30, ty,
@@ -49,8 +49,8 @@
 // algorithm or kernel used by any particular quality setting will obviously
 // change the resultant pixels. In such a case, use the gen_golden_files flag
 // to regenerate the golden files.
-func testInterp(t *testing.T, w int, h int, direction, srcFilename string) {
-	f, err := os.Open("../testdata/go-turns-two-" + srcFilename)
+func testInterp(t *testing.T, w int, h int, direction, prefix, suffix string) {
+	f, err := os.Open("../testdata/" + prefix + suffix)
 	if err != nil {
 		t.Fatalf("Open: %v", err)
 	}
@@ -59,9 +59,16 @@
 	if err != nil {
 		t.Fatalf("Decode: %v", err)
 	}
-	opts := &Options{
-		Op: Src,
+
+	op, scale := Src, 3.75
+	if prefix == "tux" {
+		op, scale = Over, 0.125
 	}
+	opts := &Options{
+		Op: op,
+	}
+	green := image.NewUniform(color.RGBA{0x00, 0x22, 0x11, 0xff})
+
 	testCases := map[string]Interpolator{
 		"nn": NearestNeighbor,
 		"ab": ApproxBiLinear,
@@ -69,11 +76,12 @@
 		"cr": CatmullRom,
 	}
 	for name, q := range testCases {
-		goldenFilename := fmt.Sprintf("../testdata/go-turns-two-%s-%s.png", direction, name)
+		goldenFilename := fmt.Sprintf("../testdata/%s-%s-%s.png", prefix, direction, name)
 
 		got := image.NewRGBA(image.Rect(0, 0, w, h))
+		Copy(got, image.Point{}, green, got.Bounds(), nil)
 		if direction == "rotate" {
-			q.Transform(got, transformMatrix(40, 10), src, src.Bounds(), opts)
+			q.Transform(got, transformMatrix(scale, 40, 10), src, src.Bounds(), opts)
 		} else {
 			q.Scale(got, got.Bounds(), src, src.Bounds(), opts)
 		}
@@ -111,9 +119,31 @@
 	}
 }
 
-func TestScaleDown(t *testing.T) { testInterp(t, 100, 100, "down", "280x360.jpeg") }
-func TestScaleUp(t *testing.T)   { testInterp(t, 75, 100, "up", "14x18.png") }
-func TestTransform(t *testing.T) { testInterp(t, 100, 100, "rotate", "14x18.png") }
+func TestScaleDown(t *testing.T) { testInterp(t, 100, 100, "down", "go-turns-two", "-280x360.jpeg") }
+func TestScaleUp(t *testing.T)   { testInterp(t, 75, 100, "up", "go-turns-two", "-14x18.png") }
+func TestTformSrc(t *testing.T)  { testInterp(t, 100, 100, "rotate", "go-turns-two", "-14x18.png") }
+func TestTformOver(t *testing.T) { testInterp(t, 100, 100, "rotate", "tux", ".png") }
+
+// TestOps scales one translucent red pixel onto opaque blue with each Op.
+func TestOps(t *testing.T) {
+	blue := image.NewUniform(color.RGBA{0x00, 0x00, 0xff, 0xff})
+	src := image.NewRGBA(image.Rect(0, 0, 1, 1)) // loop-invariant: only read by Scale
+	src.SetRGBA(0, 0, color.RGBA{0x7f, 0x00, 0x00, 0x7f})
+	testCases := map[Op]color.RGBA{
+		Over: {0x7f, 0x00, 0x80, 0xff}, // src + dst*(1-srcAlpha)
+		Src:  {0x7f, 0x00, 0x00, 0x7f}, // dst replaced by src
+	}
+	for op, want := range testCases {
+		dst := image.NewRGBA(image.Rect(0, 0, 2, 2))
+		Copy(dst, image.Point{}, blue, dst.Bounds(), nil)
+		NearestNeighbor.Scale(dst, dst.Bounds(), src, src.Bounds(), &Options{Op: op})
+		for i := 0; i < 4; i++ { // all 4 dst pixels sample src's single pixel
+			if got := dst.RGBAAt(i%2, i/2); got != want {
+				t.Errorf("op=%v, pixel (%d, %d): got %v, want %v", op, i%2, i/2, got, want)
+			}
+		}
+	}
+}
 
 // TestNegativeWeights tests that scaling by a kernel that produces negative
 // weights, such as the Catmull-Rom kernel, doesn't produce an invalid color
@@ -183,7 +213,7 @@
 			var interp func(dst *image.RGBA)
 			if transform {
 				interp = func(dst *image.RGBA) {
-					q.Transform(dst, transformMatrix(2, 1), src, src.Bounds(), nil)
+					q.Transform(dst, transformMatrix(3.75, 2, 1), src, src.Bounds(), nil)
 				}
 			} else {
 				interp = func(dst *image.RGBA) {
@@ -256,7 +286,7 @@
 		{-8, +8},
 		{-8, -8},
 	}
-	m00 := transformMatrix(0, 0)
+	m00 := transformMatrix(3.75, 0, 0)
 
 	for _, transform := range []bool{false, true} {
 		for _, q := range qs {
@@ -332,33 +362,37 @@
 		ApproxBiLinear,
 		CatmullRom,
 	}
-	blue := image.NewUniform(color.RGBA{0x11, 0x22, 0x44, 0x7f})
-	opts := &Options{
-		Op: Src,
+	ops := []Op{
+		Over,
+		Src,
 	}
+	blue := image.NewUniform(color.RGBA{0x11, 0x22, 0x44, 0x7f})
 
 	for _, dr := range drs {
 		for _, src := range srcs {
 			for _, sr := range srs {
 				for _, transform := range []bool{false, true} {
 					for _, q := range qs {
-						dst0 := image.NewRGBA(drs[0])
-						dst1 := image.NewRGBA(drs[0])
-						Draw(dst0, dst0.Bounds(), blue, image.Point{}, Src)
-						Draw(dstWrapper{dst1}, dst1.Bounds(), srcWrapper{blue}, image.Point{}, Src)
+						for _, op := range ops {
+							opts := &Options{Op: op}
+							dst0 := image.NewRGBA(drs[0])
+							dst1 := image.NewRGBA(drs[0])
+							Draw(dst0, dst0.Bounds(), blue, image.Point{}, Src)
+							Draw(dstWrapper{dst1}, dst1.Bounds(), srcWrapper{blue}, image.Point{}, Src)
 
-						if transform {
-							m := transformMatrix(2, 1)
-							q.Transform(dst0, m, src, sr, opts)
-							q.Transform(dstWrapper{dst1}, m, srcWrapper{src}, sr, opts)
-						} else {
-							q.Scale(dst0, dr, src, sr, opts)
-							q.Scale(dstWrapper{dst1}, dr, srcWrapper{src}, sr, opts)
-						}
+							if transform {
+								m := transformMatrix(3.75, 2, 1)
+								q.Transform(dst0, m, src, sr, opts)
+								q.Transform(dstWrapper{dst1}, m, srcWrapper{src}, sr, opts)
+							} else {
+								q.Scale(dst0, dr, src, sr, opts)
+								q.Scale(dstWrapper{dst1}, dr, srcWrapper{src}, sr, opts)
+							}
 
-						if !bytes.Equal(dst0.Pix, dst1.Pix) {
-							t.Errorf("pix differ for dr=%v, src=%T, sr=%v, transform=%t, q=%T",
-								dr, src, sr, transform, q)
+							if !bytes.Equal(dst0.Pix, dst1.Pix) {
+								t.Errorf("pix differ for dr=%v, src=%T, sr=%v, transform=%t, q=%T, op=%v",
+									dr, src, sr, transform, q, op)
+							}
 						}
 					}
 				}
@@ -453,7 +487,7 @@
 		b.Fatal(err)
 	}
 	sr := src.Bounds()
-	m := transformMatrix(40, 10)
+	m := transformMatrix(3.75, 40, 10)
 	opts := &Options{
 		Op: op,
 	}
@@ -480,28 +514,54 @@
 func BenchmarkScaleBLUp(b *testing.B) { benchScale(b, 800, 600, Src, srcTux, BiLinear) }
 func BenchmarkScaleCRUp(b *testing.B) { benchScale(b, 800, 600, Src, srcTux, CatmullRom) }
 
-func BenchmarkScaleNNSrcRGBA(b *testing.B)    { benchScale(b, 200, 150, Src, srcRGBA, NearestNeighbor) }
-func BenchmarkScaleNNSrcUniform(b *testing.B) { benchScale(b, 200, 150, Src, srcUnif, NearestNeighbor) }
+func BenchmarkScaleNNSrcRGBA(b *testing.B) { benchScale(b, 200, 150, Src, srcRGBA, NearestNeighbor) }
+func BenchmarkScaleNNSrcUnif(b *testing.B) { benchScale(b, 200, 150, Src, srcUnif, NearestNeighbor) }
 
-func BenchmarkTformNNSrcRGBA(b *testing.B)    { benchTform(b, 200, 150, Src, srcRGBA, NearestNeighbor) }
-func BenchmarkTformNNSrcUniform(b *testing.B) { benchTform(b, 200, 150, Src, srcUnif, NearestNeighbor) }
+func BenchmarkScaleNNOverRGBA(b *testing.B) { benchScale(b, 200, 150, Over, srcRGBA, NearestNeighbor) }
+func BenchmarkScaleNNOverUnif(b *testing.B) { benchScale(b, 200, 150, Over, srcUnif, NearestNeighbor) }
+
+func BenchmarkTformNNSrcRGBA(b *testing.B) { benchTform(b, 200, 150, Src, srcRGBA, NearestNeighbor) }
+func BenchmarkTformNNSrcUnif(b *testing.B) { benchTform(b, 200, 150, Src, srcUnif, NearestNeighbor) }
+
+func BenchmarkTformNNOverRGBA(b *testing.B) { benchTform(b, 200, 150, Over, srcRGBA, NearestNeighbor) }
+func BenchmarkTformNNOverUnif(b *testing.B) { benchTform(b, 200, 150, Over, srcUnif, NearestNeighbor) }
 
 func BenchmarkScaleABSrcGray(b *testing.B)  { benchScale(b, 200, 150, Src, srcGray, ApproxBiLinear) }
 func BenchmarkScaleABSrcNRGBA(b *testing.B) { benchScale(b, 200, 150, Src, srcNRGBA, ApproxBiLinear) }
 func BenchmarkScaleABSrcRGBA(b *testing.B)  { benchScale(b, 200, 150, Src, srcRGBA, ApproxBiLinear) }
 func BenchmarkScaleABSrcYCbCr(b *testing.B) { benchScale(b, 200, 150, Src, srcYCbCr, ApproxBiLinear) }
 
+func BenchmarkScaleABOverGray(b *testing.B)  { benchScale(b, 200, 150, Over, srcGray, ApproxBiLinear) }
+func BenchmarkScaleABOverNRGBA(b *testing.B) { benchScale(b, 200, 150, Over, srcNRGBA, ApproxBiLinear) }
+func BenchmarkScaleABOverRGBA(b *testing.B)  { benchScale(b, 200, 150, Over, srcRGBA, ApproxBiLinear) }
+func BenchmarkScaleABOverYCbCr(b *testing.B) { benchScale(b, 200, 150, Over, srcYCbCr, ApproxBiLinear) }
+
 func BenchmarkTformABSrcGray(b *testing.B)  { benchTform(b, 200, 150, Src, srcGray, ApproxBiLinear) }
 func BenchmarkTformABSrcNRGBA(b *testing.B) { benchTform(b, 200, 150, Src, srcNRGBA, ApproxBiLinear) }
 func BenchmarkTformABSrcRGBA(b *testing.B)  { benchTform(b, 200, 150, Src, srcRGBA, ApproxBiLinear) }
 func BenchmarkTformABSrcYCbCr(b *testing.B) { benchTform(b, 200, 150, Src, srcYCbCr, ApproxBiLinear) }
 
+func BenchmarkTformABOverGray(b *testing.B)  { benchTform(b, 200, 150, Over, srcGray, ApproxBiLinear) }
+func BenchmarkTformABOverNRGBA(b *testing.B) { benchTform(b, 200, 150, Over, srcNRGBA, ApproxBiLinear) }
+func BenchmarkTformABOverRGBA(b *testing.B)  { benchTform(b, 200, 150, Over, srcRGBA, ApproxBiLinear) }
+func BenchmarkTformABOverYCbCr(b *testing.B) { benchTform(b, 200, 150, Over, srcYCbCr, ApproxBiLinear) }
+
 func BenchmarkScaleCRSrcGray(b *testing.B)  { benchScale(b, 200, 150, Src, srcGray, CatmullRom) }
 func BenchmarkScaleCRSrcNRGBA(b *testing.B) { benchScale(b, 200, 150, Src, srcNRGBA, CatmullRom) }
 func BenchmarkScaleCRSrcRGBA(b *testing.B)  { benchScale(b, 200, 150, Src, srcRGBA, CatmullRom) }
 func BenchmarkScaleCRSrcYCbCr(b *testing.B) { benchScale(b, 200, 150, Src, srcYCbCr, CatmullRom) }
 
+func BenchmarkScaleCROverGray(b *testing.B)  { benchScale(b, 200, 150, Over, srcGray, CatmullRom) }
+func BenchmarkScaleCROverNRGBA(b *testing.B) { benchScale(b, 200, 150, Over, srcNRGBA, CatmullRom) }
+func BenchmarkScaleCROverRGBA(b *testing.B)  { benchScale(b, 200, 150, Over, srcRGBA, CatmullRom) }
+func BenchmarkScaleCROverYCbCr(b *testing.B) { benchScale(b, 200, 150, Over, srcYCbCr, CatmullRom) }
+
 func BenchmarkTformCRSrcGray(b *testing.B)  { benchTform(b, 200, 150, Src, srcGray, CatmullRom) }
 func BenchmarkTformCRSrcNRGBA(b *testing.B) { benchTform(b, 200, 150, Src, srcNRGBA, CatmullRom) }
 func BenchmarkTformCRSrcRGBA(b *testing.B)  { benchTform(b, 200, 150, Src, srcRGBA, CatmullRom) }
 func BenchmarkTformCRSrcYCbCr(b *testing.B) { benchTform(b, 200, 150, Src, srcYCbCr, CatmullRom) }
+
+func BenchmarkTformCROverGray(b *testing.B)  { benchTform(b, 200, 150, Over, srcGray, CatmullRom) }
+func BenchmarkTformCROverNRGBA(b *testing.B) { benchTform(b, 200, 150, Over, srcNRGBA, CatmullRom) }
+func BenchmarkTformCROverRGBA(b *testing.B)  { benchTform(b, 200, 150, Over, srcRGBA, CatmullRom) }
+func BenchmarkTformCROverYCbCr(b *testing.B) { benchTform(b, 200, 150, Over, srcYCbCr, CatmullRom) }
diff --git a/testdata/go-turns-two-rotate-ab.png b/testdata/go-turns-two-rotate-ab.png
index b04ab3c..04fceaa 100644
--- a/testdata/go-turns-two-rotate-ab.png
+++ b/testdata/go-turns-two-rotate-ab.png
Binary files differ
diff --git a/testdata/go-turns-two-rotate-bl.png b/testdata/go-turns-two-rotate-bl.png
index b4e1279..c8b717e 100644
--- a/testdata/go-turns-two-rotate-bl.png
+++ b/testdata/go-turns-two-rotate-bl.png
Binary files differ
diff --git a/testdata/go-turns-two-rotate-cr.png b/testdata/go-turns-two-rotate-cr.png
index 0b64d02..7e5cd9f 100644
--- a/testdata/go-turns-two-rotate-cr.png
+++ b/testdata/go-turns-two-rotate-cr.png
Binary files differ
diff --git a/testdata/go-turns-two-rotate-nn.png b/testdata/go-turns-two-rotate-nn.png
index da93978..702c863 100644
--- a/testdata/go-turns-two-rotate-nn.png
+++ b/testdata/go-turns-two-rotate-nn.png
Binary files differ
diff --git a/testdata/tux-rotate-ab.png b/testdata/tux-rotate-ab.png
new file mode 100644
index 0000000..181966c
--- /dev/null
+++ b/testdata/tux-rotate-ab.png
Binary files differ
diff --git a/testdata/tux-rotate-bl.png b/testdata/tux-rotate-bl.png
new file mode 100644
index 0000000..af3f4b0
--- /dev/null
+++ b/testdata/tux-rotate-bl.png
Binary files differ
diff --git a/testdata/tux-rotate-cr.png b/testdata/tux-rotate-cr.png
new file mode 100644
index 0000000..e5cff31
--- /dev/null
+++ b/testdata/tux-rotate-cr.png
Binary files differ
diff --git a/testdata/tux-rotate-nn.png b/testdata/tux-rotate-nn.png
new file mode 100644
index 0000000..c775c61
--- /dev/null
+++ b/testdata/tux-rotate-nn.png
Binary files differ