Merge x/exp/shiny/font to x/image/font.
diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000..d2f212e
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,10 @@
+# Treat all files in this repo as binary, with no git magic updating
+# line endings. Windows users contributing to Go will need to use a
+# modern version of git and editors capable of LF line endings.
+#
+# We'll prevent accidental CRLF line endings from entering the repo
+# via the git-review gofmt checks.
+#
+# See golang.org/issue/9281
+
+* -text
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..8339fd6
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+# Add no patterns to .gitignore except for files generated by the build.
+last-change
diff --git a/AUTHORS b/AUTHORS
new file mode 100644
index 0000000..15167cd
--- /dev/null
+++ b/AUTHORS
@@ -0,0 +1,3 @@
+# This source code refers to The Go Authors for copyright purposes.
+# The master list of authors is in the main Go distribution,
+# visible at http://tip.golang.org/AUTHORS.
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644
index 0000000..88dff59
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1,31 @@
+# Contributing to Go
+
+Go is an open source project.
+
+It is the work of hundreds of contributors. We appreciate your help!
+
+
+## Filing issues
+
+When [filing an issue](https://golang.org/issue/new), make sure to answer these five questions:
+
+1. What version of Go are you using (`go version`)?
+2. What operating system and processor architecture are you using?
+3. What did you do?
+4. What did you expect to see?
+5. What did you see instead?
+
+General questions should go to the [golang-nuts mailing list](https://groups.google.com/group/golang-nuts) instead of the issue tracker.
+The gophers there will answer or ask you to file an issue if you've tripped over a bug.
+
+## Contributing code
+
+Please read the [Contribution Guidelines](https://golang.org/doc/contribute.html)
+before sending patches.
+
+**We do not accept GitHub pull requests**
+(we use [Gerrit](https://code.google.com/p/gerrit/) instead for code review).
+
+Unless otherwise noted, the Go source files are distributed under
+the BSD-style license found in the LICENSE file.
+
diff --git a/CONTRIBUTORS b/CONTRIBUTORS
new file mode 100644
index 0000000..1c4577e
--- /dev/null
+++ b/CONTRIBUTORS
@@ -0,0 +1,3 @@
+# This source code was written by the Go contributors.
+# The master list of contributors is in the main Go distribution,
+# visible at http://tip.golang.org/CONTRIBUTORS.
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..6a66aea
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,27 @@
+Copyright (c) 2009 The Go Authors. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+   * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+   * Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/PATENTS b/PATENTS
new file mode 100644
index 0000000..7330990
--- /dev/null
+++ b/PATENTS
@@ -0,0 +1,22 @@
+Additional IP Rights Grant (Patents)
+
+"This implementation" means the copyrightable works distributed by
+Google as part of the Go project.
+
+Google hereby grants to You a perpetual, worldwide, non-exclusive,
+no-charge, royalty-free, irrevocable (except as stated in this section)
+patent license to make, have made, use, offer to sell, sell, import,
+transfer and otherwise run, modify and propagate the contents of this
+implementation of Go, where such license applies only to those patent
+claims, both currently owned or controlled by Google and acquired in
+the future, licensable by Google that are necessarily infringed by this
+implementation of Go.  This grant does not include claims that would be
+infringed only as a consequence of further modification of this
+implementation.  If you or your agent or exclusive licensee institute or
+order or agree to the institution of patent litigation against any
+entity (including a cross-claim or counterclaim in a lawsuit) alleging
+that this implementation of Go or any code incorporated within this
+implementation of Go constitutes direct or contributory patent
+infringement, or inducement of patent infringement, then any patent
+rights granted to you under this License for this implementation of Go
+shall terminate as of the date such litigation is filed.
diff --git a/README b/README
new file mode 100644
index 0000000..4620380
--- /dev/null
+++ b/README
@@ -0,0 +1,3 @@
+This repository holds supplementary Go image libraries.
+
+To submit changes to this repository, see http://golang.org/doc/contribute.html.
diff --git a/bmp/reader.go b/bmp/reader.go
new file mode 100644
index 0000000..a0f2715
--- /dev/null
+++ b/bmp/reader.go
@@ -0,0 +1,199 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package bmp implements a BMP image decoder and encoder.
+//
+// The BMP specification is at http://www.digicamsoft.com/bmp/bmp.html.
+package bmp // import "golang.org/x/image/bmp"
+
+import (
+	"errors"
+	"image"
+	"image/color"
+	"io"
+)
+
+// ErrUnsupported means that the input BMP image uses a valid but unsupported
+// feature.
+var ErrUnsupported = errors.New("bmp: unsupported BMP image")
+
+func readUint16(b []byte) uint16 {
+	return uint16(b[0]) | uint16(b[1])<<8
+}
+
+func readUint32(b []byte) uint32 {
+	return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
+}
+
+// decodePaletted reads an 8 bit-per-pixel BMP image from r.
+// If topDown is false, the image rows will be read bottom-up.
+func decodePaletted(r io.Reader, c image.Config, topDown bool) (image.Image, error) {
+	paletted := image.NewPaletted(image.Rect(0, 0, c.Width, c.Height), c.ColorModel.(color.Palette))
+	if c.Width == 0 || c.Height == 0 {
+		return paletted, nil
+	}
+	var tmp [4]byte
+	y0, y1, yDelta := c.Height-1, -1, -1
+	if topDown {
+		y0, y1, yDelta = 0, c.Height, +1
+	}
+	for y := y0; y != y1; y += yDelta {
+		p := paletted.Pix[y*paletted.Stride : y*paletted.Stride+c.Width]
+		if _, err := io.ReadFull(r, p); err != nil {
+			return nil, err
+		}
+		// Each row is 4-byte aligned.
+		if c.Width%4 != 0 {
+			_, err := io.ReadFull(r, tmp[:4-c.Width%4])
+			if err != nil {
+				return nil, err
+			}
+		}
+	}
+	return paletted, nil
+}
+
+// decodeRGB reads a 24 bit-per-pixel BMP image from r.
+// If topDown is false, the image rows will be read bottom-up.
+func decodeRGB(r io.Reader, c image.Config, topDown bool) (image.Image, error) {
+	rgba := image.NewRGBA(image.Rect(0, 0, c.Width, c.Height))
+	if c.Width == 0 || c.Height == 0 {
+		return rgba, nil
+	}
+	// There are 3 bytes per pixel, and each row is 4-byte aligned.
+	b := make([]byte, (3*c.Width+3)&^3)
+	y0, y1, yDelta := c.Height-1, -1, -1
+	if topDown {
+		y0, y1, yDelta = 0, c.Height, +1
+	}
+	for y := y0; y != y1; y += yDelta {
+		if _, err := io.ReadFull(r, b); err != nil {
+			return nil, err
+		}
+		p := rgba.Pix[y*rgba.Stride : y*rgba.Stride+c.Width*4]
+		for i, j := 0, 0; i < len(p); i, j = i+4, j+3 {
+			// BMP images are stored in BGR order rather than RGB order.
+			p[i+0] = b[j+2]
+			p[i+1] = b[j+1]
+			p[i+2] = b[j+0]
+			p[i+3] = 0xFF
+		}
+	}
+	return rgba, nil
+}
+
+// decodeNRGBA reads a 32 bit-per-pixel BMP image from r.
+// If topDown is false, the image rows will be read bottom-up.
+func decodeNRGBA(r io.Reader, c image.Config, topDown bool) (image.Image, error) {
+	rgba := image.NewNRGBA(image.Rect(0, 0, c.Width, c.Height))
+	if c.Width == 0 || c.Height == 0 {
+		return rgba, nil
+	}
+	y0, y1, yDelta := c.Height-1, -1, -1
+	if topDown {
+		y0, y1, yDelta = 0, c.Height, +1
+	}
+	for y := y0; y != y1; y += yDelta {
+		p := rgba.Pix[y*rgba.Stride : y*rgba.Stride+c.Width*4]
+		if _, err := io.ReadFull(r, p); err != nil {
+			return nil, err
+		}
+		for i := 0; i < len(p); i += 4 {
+			// BMP images are stored in BGRA order rather than RGBA order.
+			p[i+0], p[i+2] = p[i+2], p[i+0]
+		}
+	}
+	return rgba, nil
+}
+
+// Decode reads a BMP image from r and returns it as an image.Image.
+// Limitation: The file must be 8, 24 or 32 bits per pixel.
+func Decode(r io.Reader) (image.Image, error) {
+	c, bpp, topDown, err := decodeConfig(r)
+	if err != nil {
+		return nil, err
+	}
+	switch bpp {
+	case 8:
+		return decodePaletted(r, c, topDown)
+	case 24:
+		return decodeRGB(r, c, topDown)
+	case 32:
+		return decodeNRGBA(r, c, topDown)
+	}
+	panic("unreachable")
+}
+
+// DecodeConfig returns the color model and dimensions of a BMP image without
+// decoding the entire image.
+// Limitation: The file must be 8, 24 or 32 bits per pixel.
+func DecodeConfig(r io.Reader) (image.Config, error) {
+	config, _, _, err := decodeConfig(r)
+	return config, err
+}
+
+func decodeConfig(r io.Reader) (config image.Config, bitsPerPixel int, topDown bool, err error) {
+	// We only support those BMP images that are a BITMAPFILEHEADER
+	// immediately followed by a BITMAPINFOHEADER.
+	const (
+		fileHeaderLen = 14
+		infoHeaderLen = 40
+	)
+	var b [1024]byte
+	if _, err := io.ReadFull(r, b[:fileHeaderLen+infoHeaderLen]); err != nil {
+		return image.Config{}, 0, false, err
+	}
+	if string(b[:2]) != "BM" {
+		return image.Config{}, 0, false, errors.New("bmp: invalid format")
+	}
+	offset := readUint32(b[10:14])
+	if readUint32(b[14:18]) != infoHeaderLen {
+		return image.Config{}, 0, false, ErrUnsupported
+	}
+	width := int(int32(readUint32(b[18:22])))
+	height := int(int32(readUint32(b[22:26])))
+	if height < 0 {
+		height, topDown = -height, true
+	}
+	if width < 0 || height < 0 {
+		return image.Config{}, 0, false, ErrUnsupported
+	}
+	// We only support 1 plane, 8, 24 or 32 bits per pixel and no compression.
+	planes, bpp, compression := readUint16(b[26:28]), readUint16(b[28:30]), readUint32(b[30:34])
+	if planes != 1 || compression != 0 {
+		return image.Config{}, 0, false, ErrUnsupported
+	}
+	switch bpp {
+	case 8:
+		if offset != fileHeaderLen+infoHeaderLen+256*4 {
+			return image.Config{}, 0, false, ErrUnsupported
+		}
+		_, err = io.ReadFull(r, b[:256*4])
+		if err != nil {
+			return image.Config{}, 0, false, err
+		}
+		pcm := make(color.Palette, 256)
+		for i := range pcm {
+			// BMP images are stored in BGR order rather than RGB order.
+			// Every 4th byte is padding.
+			pcm[i] = color.RGBA{b[4*i+2], b[4*i+1], b[4*i+0], 0xFF}
+		}
+		return image.Config{ColorModel: pcm, Width: width, Height: height}, 8, topDown, nil
+	case 24:
+		if offset != fileHeaderLen+infoHeaderLen {
+			return image.Config{}, 0, false, ErrUnsupported
+		}
+		return image.Config{ColorModel: color.RGBAModel, Width: width, Height: height}, 24, topDown, nil
+	case 32:
+		if offset != fileHeaderLen+infoHeaderLen {
+			return image.Config{}, 0, false, ErrUnsupported
+		}
+		return image.Config{ColorModel: color.RGBAModel, Width: width, Height: height}, 32, topDown, nil
+	}
+	return image.Config{}, 0, false, ErrUnsupported
+}
+
+func init() {
+	image.RegisterFormat("bmp", "BM????\x00\x00\x00\x00", Decode, DecodeConfig)
+}
diff --git a/bmp/reader_test.go b/bmp/reader_test.go
new file mode 100644
index 0000000..fd6ff64
--- /dev/null
+++ b/bmp/reader_test.go
@@ -0,0 +1,75 @@
+// Copyright 2012 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package bmp
+
+import (
+	"fmt"
+	"image"
+	"os"
+	"testing"
+
+	_ "image/png"
+)
+
+const testdataDir = "../testdata/"
+
+func compare(t *testing.T, img0, img1 image.Image) error {
+	b := img1.Bounds()
+	if !b.Eq(img0.Bounds()) {
+		return fmt.Errorf("wrong image size: want %s, got %s", img0.Bounds(), b)
+	}
+	for y := b.Min.Y; y < b.Max.Y; y++ {
+		for x := b.Min.X; x < b.Max.X; x++ {
+			c0 := img0.At(x, y)
+			c1 := img1.At(x, y)
+			r0, g0, b0, a0 := c0.RGBA()
+			r1, g1, b1, a1 := c1.RGBA()
+			if r0 != r1 || g0 != g1 || b0 != b1 || a0 != a1 {
+				return fmt.Errorf("pixel at (%d, %d) has wrong color: want %v, got %v", x, y, c0, c1)
+			}
+		}
+	}
+	return nil
+}
+
+// TestDecode tests that decoding a PNG image and a BMP image result in the
+// same pixel data.
+func TestDecode(t *testing.T) {
+	testCases := []string{
+		"video-001",
+		"yellow_rose-small",
+	}
+
+	for _, tc := range testCases {
+		f0, err := os.Open(testdataDir + tc + ".png")
+		if err != nil {
+			t.Errorf("%s: Open PNG: %v", tc, err)
+			continue
+		}
+		defer f0.Close()
+		img0, _, err := image.Decode(f0)
+		if err != nil {
+			t.Errorf("%s: Decode PNG: %v", tc, err)
+			continue
+		}
+
+		f1, err := os.Open(testdataDir + tc + ".bmp")
+		if err != nil {
+			t.Errorf("%s: Open BMP: %v", tc, err)
+			continue
+		}
+		defer f1.Close()
+		img1, _, err := image.Decode(f1)
+		if err != nil {
+			t.Errorf("%s: Decode BMP: %v", tc, err)
+			continue
+		}
+
+		if err := compare(t, img0, img1); err != nil {
+			t.Errorf("%s: %v", tc, err)
+			continue
+		}
+	}
+}
diff --git a/bmp/writer.go b/bmp/writer.go
new file mode 100644
index 0000000..6947968
--- /dev/null
+++ b/bmp/writer.go
@@ -0,0 +1,166 @@
+// Copyright 2013 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package bmp
+
+import (
+	"encoding/binary"
+	"errors"
+	"image"
+	"io"
+)
+
+type header struct {
+	sigBM           [2]byte   // file signature; Encode sets it to "BM"
+	fileSize        uint32    // total size in bytes: 14 + 40 + palette + pixel data
+	reserved        [2]uint16 // never set by Encode, so written as zero
+	pixOffset       uint32    // offset of the pixel data: 14 + 40 + palette length
+	dibHeaderSize   uint32    // Encode always writes 40
+	width           uint32
+	height          uint32
+	colorPlane      uint16 // Encode always writes 1
+	bpp             uint16 // bits per pixel: 8 for Gray/Paletted, 24 otherwise
+	compression     uint32 // never set by Encode, so written as zero
+	imageSize       uint32 // pixel data size in bytes: height * padded row length
+	xPixelsPerMeter uint32
+	yPixelsPerMeter uint32
+	colorUse        uint32
+	colorImportant  uint32
+}
+
+func encodePaletted(w io.Writer, pix []uint8, dx, dy, stride, step int) error {
+	var padding []byte
+	if dx < step {
+		padding = make([]byte, step-dx)
+	}
+	for y := dy - 1; y >= 0; y-- {
+		min := y*stride + 0
+		max := y*stride + dx
+		if _, err := w.Write(pix[min:max]); err != nil {
+			return err
+		}
+		if padding != nil {
+			if _, err := w.Write(padding); err != nil {
+				return err
+			}
+		}
+	}
+	return nil
+}
+
+func encodeRGBA(w io.Writer, pix []uint8, dx, dy, stride, step int) error {
+	buf := make([]byte, step)
+	for y := dy - 1; y >= 0; y-- {
+		min := y*stride + 0
+		max := y*stride + dx*4
+		off := 0
+		for i := min; i < max; i += 4 {
+			buf[off+2] = pix[i+0]
+			buf[off+1] = pix[i+1]
+			buf[off+0] = pix[i+2]
+			off += 3
+		}
+		if _, err := w.Write(buf); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+func encode(w io.Writer, m image.Image, step int) error {
+	b := m.Bounds()
+	buf := make([]byte, step)
+	for y := b.Max.Y - 1; y >= b.Min.Y; y-- {
+		off := 0
+		for x := b.Min.X; x < b.Max.X; x++ {
+			r, g, b, _ := m.At(x, y).RGBA()
+			buf[off+2] = byte(r >> 8)
+			buf[off+1] = byte(g >> 8)
+			buf[off+0] = byte(b >> 8)
+			off += 3
+		}
+		if _, err := w.Write(buf); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// Encode writes the image m to w in BMP format.
+func Encode(w io.Writer, m image.Image) error {
+	d := m.Bounds().Size()
+	if d.X < 0 || d.Y < 0 {
+		return errors.New("bmp: negative bounds")
+	}
+	h := &header{
+		sigBM:         [2]byte{'B', 'M'},
+		fileSize:      14 + 40,
+		pixOffset:     14 + 40,
+		dibHeaderSize: 40,
+		width:         uint32(d.X),
+		height:        uint32(d.Y),
+		colorPlane:    1,
+	}
+
+	var step int
+	var palette []byte
+	switch m := m.(type) {
+	case *image.Gray:
+		step = (d.X + 3) &^ 3
+		palette = make([]byte, 1024)
+		for i := 0; i < 256; i++ {
+			palette[i*4+0] = uint8(i)
+			palette[i*4+1] = uint8(i)
+			palette[i*4+2] = uint8(i)
+			palette[i*4+3] = 0xFF
+		}
+		h.imageSize = uint32(d.Y * step)
+		h.fileSize += uint32(len(palette)) + h.imageSize
+		h.pixOffset += uint32(len(palette))
+		h.bpp = 8
+
+	case *image.Paletted:
+		step = (d.X + 3) &^ 3
+		palette = make([]byte, 1024)
+		for i := 0; i < len(m.Palette) && i < 256; i++ {
+			r, g, b, _ := m.Palette[i].RGBA()
+			palette[i*4+0] = uint8(b >> 8)
+			palette[i*4+1] = uint8(g >> 8)
+			palette[i*4+2] = uint8(r >> 8)
+			palette[i*4+3] = 0xFF
+		}
+		h.imageSize = uint32(d.Y * step)
+		h.fileSize += uint32(len(palette)) + h.imageSize
+		h.pixOffset += uint32(len(palette))
+		h.bpp = 8
+	default:
+		step = (3*d.X + 3) &^ 3
+		h.imageSize = uint32(d.Y * step)
+		h.fileSize += h.imageSize
+		h.bpp = 24
+	}
+
+	if err := binary.Write(w, binary.LittleEndian, h); err != nil {
+		return err
+	}
+	if palette != nil {
+		if err := binary.Write(w, binary.LittleEndian, palette); err != nil {
+			return err
+		}
+	}
+
+	if d.X == 0 || d.Y == 0 {
+		return nil
+	}
+
+	switch m := m.(type) {
+	case *image.Gray:
+		return encodePaletted(w, m.Pix, d.X, d.Y, m.Stride, step)
+	case *image.Paletted:
+		return encodePaletted(w, m.Pix, d.X, d.Y, m.Stride, step)
+	case *image.RGBA:
+		return encodeRGBA(w, m.Pix, d.X, d.Y, m.Stride, step)
+	}
+	return encode(w, m, step)
+}
diff --git a/bmp/writer_test.go b/bmp/writer_test.go
new file mode 100644
index 0000000..9e5a327
--- /dev/null
+++ b/bmp/writer_test.go
@@ -0,0 +1,91 @@
+// Copyright 2013 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package bmp
+
+import (
+	"bytes"
+	"fmt"
+	"image"
+	"io/ioutil"
+	"os"
+	"testing"
+	"time"
+)
+
+func openImage(filename string) (image.Image, error) {
+	f, err := os.Open(testdataDir + filename)
+	if err != nil {
+		return nil, err
+	}
+	defer f.Close()
+	return Decode(f)
+}
+
+// TestEncode checks that an image survives an Encode/Decode round trip
+// with its bounds and pixel data unchanged.
+func TestEncode(t *testing.T) {
+	img0, err := openImage("video-001.bmp")
+	if err != nil {
+		t.Fatal(err)
+	}
+	buf := new(bytes.Buffer)
+	if err := Encode(buf, img0); err != nil {
+		t.Fatal(err)
+	}
+	img1, err := Decode(buf)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if err := compare(t, img0, img1); err != nil {
+		t.Fatal(err) // compare only reports a mismatch; failing the test is up to us
+	}
+}
+
+// TestZeroWidthVeryLargeHeight tests that encoding and decoding a degenerate
+// image with zero width but over one billion pixels in height is faster than
+// naively calling an io.Reader or io.Writer method once per row.
+func TestZeroWidthVeryLargeHeight(t *testing.T) {
+	c := make(chan error, 1)
+	go func() {
+		b := image.Rect(0, 0, 0, 0x3fffffff)
+		var buf bytes.Buffer
+		if err := Encode(&buf, image.NewRGBA(b)); err != nil {
+			c <- err
+			return
+		}
+		m, err := Decode(&buf)
+		if err != nil {
+			c <- err
+			return
+		}
+		if got := m.Bounds(); got != b {
+			c <- fmt.Errorf("bounds: got %v, want %v", got, b)
+			return
+		}
+		c <- nil
+	}()
+	select {
+	case err := <-c:
+		if err != nil {
+			t.Fatal(err)
+		}
+	case <-time.After(3 * time.Second):
+		t.Fatalf("timed out")
+	}
+}
+
+// BenchmarkEncode benchmarks the encoding of an image.
+func BenchmarkEncode(b *testing.B) {
+	img, err := openImage("video-001.bmp")
+	if err != nil {
+		b.Fatal(err)
+	}
+	s := img.Bounds().Size()
+	b.SetBytes(int64(s.X * s.Y * 4))
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		Encode(ioutil.Discard, img)
+	}
+}
diff --git a/cmd/webp-manual-test/main.go b/cmd/webp-manual-test/main.go
new file mode 100644
index 0000000..c041e21
--- /dev/null
+++ b/cmd/webp-manual-test/main.go
@@ -0,0 +1,201 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build ignore
+//
+// This build tag means that "go install golang.org/x/image/..." doesn't
+// install this manual test. Use "go run main.go" to explicitly run it.
+
+// Program webp-manual-test checks that the Go WEBP library's decodings match
+// the C WEBP library's.
+package main // import "golang.org/x/image/cmd/webp-manual-test"
+
+import (
+	"bytes"
+	"encoding/hex"
+	"flag"
+	"fmt"
+	"image"
+	"io"
+	"log"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"sort"
+	"strings"
+
+	"golang.org/x/image/webp"
+	"golang.org/x/image/webp/nycbcra"
+)
+
+var (
+	dwebp = flag.String("dwebp", "/usr/bin/dwebp", "path to the dwebp program "+
+		"installed from https://developers.google.com/speed/webp/download")
+	testdata = flag.String("testdata", "", "path to the libwebp-test-data directory "+
+		"checked out from https://chromium.googlesource.com/webm/libwebp-test-data")
+)
+
+func main() {
+	flag.Parse()
+	if *dwebp == "" {
+		flag.Usage()
+		log.Fatal("dwebp flag was not specified")
+	}
+	if _, err := os.Stat(*dwebp); err != nil {
+		flag.Usage()
+		log.Fatalf("could not find dwebp program at %q", *dwebp)
+	}
+	if *testdata == "" {
+		flag.Usage()
+		log.Fatal("testdata flag was not specified")
+	}
+
+	f, err := os.Open(*testdata)
+	if err != nil {
+		log.Fatalf("Open: %v", err)
+	}
+	defer f.Close()
+	names, err := f.Readdirnames(-1)
+	if err != nil {
+		log.Fatalf("Readdirnames: %v", err)
+	}
+	sort.Strings(names)
+
+	nFail, nPass := 0, 0
+	for _, name := range names {
+		if !strings.HasSuffix(name, "webp") {
+			continue
+		}
+		if err := test(name); err != nil {
+			fmt.Printf("FAIL\t%s\t%v\n", name, err)
+			nFail++
+		} else {
+			fmt.Printf("PASS\t%s\n", name)
+			nPass++
+		}
+	}
+	fmt.Printf("%d PASS, %d FAIL, %d TOTAL\n", nPass, nFail, nPass+nFail)
+	if nFail != 0 {
+		os.Exit(1)
+	}
+}
+
+// test tests a single WEBP image.
+func test(name string) error {
+	filename := filepath.Join(*testdata, name)
+	f, err := os.Open(filename)
+	if err != nil {
+		return fmt.Errorf("Open: %v", err)
+	}
+	defer f.Close()
+
+	gotImage, err := webp.Decode(f)
+	if err != nil {
+		return fmt.Errorf("Decode: %v", err)
+	}
+	format, encode := "-pgm", encodePGM
+	if _, lossless := gotImage.(*image.NRGBA); lossless {
+		format, encode = "-pam", encodePAM
+	}
+	got, err := encode(gotImage)
+	if err != nil {
+		return fmt.Errorf("encode: %v", err)
+	}
+
+	stdout := new(bytes.Buffer)
+	stderr := new(bytes.Buffer)
+	c := exec.Command(*dwebp, filename, format, "-o", "/dev/stdout")
+	c.Stdout = stdout
+	c.Stderr = stderr
+	if err := c.Run(); err != nil {
+		os.Stderr.Write(stderr.Bytes())
+		return fmt.Errorf("executing dwebp: %v", err)
+	}
+	want := stdout.Bytes()
+
+	if len(got) != len(want) {
+		return fmt.Errorf("encodings have different length: got %d, want %d", len(got), len(want))
+	}
+	for i, g := range got {
+		if w := want[i]; g != w {
+			return fmt.Errorf("encodings differ at position 0x%x: got 0x%02x, want 0x%02x", i, g, w)
+		}
+	}
+	return nil
+}
+
+// encodePAM encodes gotImage in the PAM format.
+func encodePAM(gotImage image.Image) ([]byte, error) {
+	m, ok := gotImage.(*image.NRGBA)
+	if !ok {
+		return nil, fmt.Errorf("lossless image did not decode to an *image.NRGBA")
+	}
+	b := m.Bounds()
+	w, h := b.Dx(), b.Dy()
+	buf := new(bytes.Buffer)
+	fmt.Fprintf(buf, "P7\nWIDTH %d\nHEIGHT %d\nDEPTH 4\nMAXVAL 255\nTUPLTYPE RGB_ALPHA\nENDHDR\n", w, h)
+	for y := b.Min.Y; y < b.Max.Y; y++ {
+		o := m.PixOffset(b.Min.X, y)
+		buf.Write(m.Pix[o : o+4*w])
+	}
+	return buf.Bytes(), nil
+}
+
+// encodePGM encodes gotImage in the PGM format in the IMC4 layout.
+func encodePGM(gotImage image.Image) ([]byte, error) {
+	var (
+		m  *image.YCbCr
+		ma *nycbcra.Image
+	)
+	switch g := gotImage.(type) {
+	case *image.YCbCr:
+		m = g
+	case *nycbcra.Image:
+		m = &g.YCbCr
+		ma = g
+	default:
+		return nil, fmt.Errorf("lossy image did not decode to an *image.YCbCr")
+	}
+	if m.SubsampleRatio != image.YCbCrSubsampleRatio420 {
+		return nil, fmt.Errorf("lossy image did not decode to a 4:2:0 YCbCr")
+	}
+	b := m.Bounds()
+	w, h := b.Dx(), b.Dy()
+	w2, h2 := (w+1)/2, (h+1)/2
+	outW, outH := 2*w2, h+h2
+	if ma != nil {
+		outH += h
+	}
+	buf := new(bytes.Buffer)
+	fmt.Fprintf(buf, "P5\n%d %d\n255\n", outW, outH)
+	for y := b.Min.Y; y < b.Max.Y; y++ {
+		o := m.YOffset(b.Min.X, y)
+		buf.Write(m.Y[o : o+w])
+		if w&1 != 0 {
+			buf.WriteByte(0x00)
+		}
+	}
+	for y := b.Min.Y; y < b.Max.Y; y += 2 {
+		o := m.COffset(b.Min.X, y)
+		buf.Write(m.Cb[o : o+w2])
+		buf.Write(m.Cr[o : o+w2])
+	}
+	if ma != nil {
+		for y := b.Min.Y; y < b.Max.Y; y++ {
+			o := ma.AOffset(b.Min.X, y)
+			buf.Write(ma.A[o : o+w])
+			if w&1 != 0 {
+				buf.WriteByte(0x00)
+			}
+		}
+	}
+	return buf.Bytes(), nil
+}
+
+// dump can be useful for debugging.
+func dump(w io.Writer, b []byte) {
+	h := hex.Dumper(w)
+	h.Write(b)
+	h.Close()
+}
diff --git a/codereview.cfg b/codereview.cfg
new file mode 100644
index 0000000..3f8b14b
--- /dev/null
+++ b/codereview.cfg
@@ -0,0 +1 @@
+issuerepo: golang/go
diff --git a/colornames/colornames.go b/colornames/colornames.go
new file mode 100644
index 0000000..e952be9
--- /dev/null
+++ b/colornames/colornames.go
@@ -0,0 +1,10 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:generate go run gen.go
+
+// Package colornames provides named colors as defined in the SVG 1.1 spec.
+//
+// See http://www.w3.org/TR/SVG/types.html#ColorKeywords
+package colornames
diff --git a/colornames/colornames_test.go b/colornames/colornames_test.go
new file mode 100644
index 0000000..d6d0324
--- /dev/null
+++ b/colornames/colornames_test.go
@@ -0,0 +1,42 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package colornames
+
+import (
+	"image/color"
+	"testing"
+)
+
+func TestColornames(t *testing.T) {
+	if len(Map) != len(Names) {
+		t.Fatalf("Map and Names have different length: %d vs %d", len(Map), len(Names))
+	}
+
+	for name, want := range testCases {
+		got, ok := Map[name]
+		if !ok {
+			t.Errorf("Did not find %s", name)
+			continue
+		}
+		if got != want {
+			t.Errorf("%s:\ngot  %v\nwant %v", name, got, want)
+		}
+	}
+}
+
+var testCases = map[string]color.RGBA{
+	"aliceblue":      color.RGBA{240, 248, 255, 255},
+	"crimson":        color.RGBA{220, 20, 60, 255},
+	"darkorange":     color.RGBA{255, 140, 0, 255},
+	"deepskyblue":    color.RGBA{0, 191, 255, 255},
+	"greenyellow":    color.RGBA{173, 255, 47, 255},
+	"lightgrey":      color.RGBA{211, 211, 211, 255},
+	"lightpink":      color.RGBA{255, 182, 193, 255},
+	"mediumseagreen": color.RGBA{60, 179, 113, 255},
+	"olivedrab":      color.RGBA{107, 142, 35, 255},
+	"purple":         color.RGBA{128, 0, 128, 255},
+	"slategrey":      color.RGBA{112, 128, 144, 255},
+	"yellowgreen":    color.RGBA{154, 205, 50, 255},
+}
diff --git a/colornames/gen.go b/colornames/gen.go
new file mode 100644
index 0000000..035825c
--- /dev/null
+++ b/colornames/gen.go
@@ -0,0 +1,187 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build ignore
+
+// This program generates table.go from
+// http://www.w3.org/TR/SVG/types.html#ColorKeywords
+package main
+
+import (
+	"bytes"
+	"fmt"
+	"go/format"
+	"image/color"
+	"io"
+	"io/ioutil"
+	"log"
+	"net/http"
+	"regexp"
+	"sort"
+	"strconv"
+	"strings"
+
+	"golang.org/x/net/html"
+	"golang.org/x/net/html/atom"
+)
+
+// matchFunc matches HTML nodes.
+type matchFunc func(*html.Node) bool
+
+// appendAll recursively traverses the parse tree rooted under the provided
+// node and appends all nodes matched by the matchFunc to dst.
+func appendAll(dst []*html.Node, n *html.Node, mf matchFunc) []*html.Node {
+	if mf(n) {
+		dst = append(dst, n)
+	}
+	for c := n.FirstChild; c != nil; c = c.NextSibling {
+		dst = appendAll(dst, c, mf)
+	}
+	return dst
+}
+
+// matchAtom returns a matchFunc that matches a Node with the specified Atom.
+func matchAtom(a atom.Atom) matchFunc {
+	return func(n *html.Node) bool {
+		return n.DataAtom == a
+	}
+}
+
+// matchAtomAttr returns a matchFunc that matches a Node with the specified
+// Atom and a html.Attribute's namespace, key and value.
+func matchAtomAttr(a atom.Atom, namespace, key, value string) matchFunc {
+	return func(n *html.Node) bool {
+		return n.DataAtom == a && getAttr(n, namespace, key) == value
+	}
+}
+
+// getAttr fetches the value of a html.Attribute for a given namespace and key.
+func getAttr(n *html.Node, namespace, key string) string {
+	for _, attr := range n.Attr {
+		if attr.Namespace == namespace && attr.Key == key {
+			return attr.Val
+		}
+	}
+	return ""
+}
+
+// re extracts RGB values from strings like "rgb( 0, 223, 128)".
+var re = regexp.MustCompile(`rgb\(\s*([0-9]+),\s*([0-9]+),\s*([0-9]+)\)`)
+
+// parseRGB parses a color from a string like "rgb( 0, 233, 128)". It sets
+// the alpha value of the color to full opacity.
+func parseRGB(s string) (color.RGBA, error) {
+	m := re.FindStringSubmatch(s)
+	if m == nil {
+		return color.RGBA{}, fmt.Errorf("malformed color: %q", s)
+	}
+	var rgb [3]uint8
+	for i, t := range m[1:] {
+		num, err := strconv.ParseUint(t, 10, 8)
+		if err != nil {
+			return color.RGBA{}, fmt.Errorf("malformed value %q in %q: %s", t, s, err)
+		}
+		rgb[i] = uint8(num)
+	}
+	return color.RGBA{rgb[0], rgb[1], rgb[2], 0xFF}, nil
+}
+
+// extractSVGColors extracts named colors from the parse tree of the SVG 1.1
+// spec HTML document "Chapter 4: Basic data types and interfaces".
+func extractSVGColors(tree *html.Node) (map[string]color.RGBA, error) {
+	ret := make(map[string]color.RGBA)
+
+	// Find the tables which store the color keywords in the parse tree.
+	colorTables := appendAll(nil, tree, func(n *html.Node) bool {
+		return n.DataAtom == atom.Table && strings.Contains(getAttr(n, "", "summary"), "color keywords part")
+	})
+
+	for _, table := range colorTables {
+		// Color names and values are stored in TextNodes within spans in each row.
+		for _, tr := range appendAll(nil, table, matchAtom(atom.Tr)) {
+			nameSpan := appendAll(nil, tr, matchAtomAttr(atom.Span, "", "class", "prop-value"))
+			valueSpan := appendAll(nil, tr, matchAtomAttr(atom.Span, "", "class", "color-keyword-value"))
+
+			// Since SVG 1.1 defines an odd number of colors, the last row
+			// in the second table does not have contents. We skip it.
+			if len(nameSpan) != 1 || len(valueSpan) != 1 {
+				continue
+			}
+			n, v := nameSpan[0].FirstChild, valueSpan[0].FirstChild
+			// This sanity checks for the existence of TextNodes under spans.
+			if n == nil || n.Type != html.TextNode || v == nil || v.Type != html.TextNode {
+				return nil, fmt.Errorf("extractSVGColors: couldn't find name/value text nodes")
+			}
+			val, err := parseRGB(v.Data)
+			if err != nil {
+				return nil, fmt.Errorf("extractSVGColors: couldn't parse name/value %q/%q: %s", n.Data, v.Data, err)
+			}
+			ret[n.Data] = val
+		}
+	}
+	return ret, nil
+}
+
+const preamble = `// generated by go generate; DO NOT EDIT.
+
+package colornames
+
+import "image/color"
+
+`
+
+// writeColorNames writes the Go source of table.go for the colors in m to w.
+func writeColorNames(w io.Writer, m map[string]color.RGBA) {
+	keys := make([]string, 0, len(m))
+	for k := range m {
+		keys = append(keys, k)
+	}
+	sort.Strings(keys) // map iteration order is random; sort for stable output
+
+	fmt.Fprint(w, preamble+"\n")
+	fmt.Fprintln(w, "// Map contains named colors defined in the SVG 1.1 spec.")
+	fmt.Fprintln(w, "var Map = map[string]color.RGBA{")
+	for _, k := range keys {
+		fmt.Fprintf(w, "%q:color.RGBA{%#02x, %#02x, %#02x, %#02x}, // rgb(%d, %d, %d)\n",
+			k, m[k].R, m[k].G, m[k].B, m[k].A, m[k].R, m[k].G, m[k].B)
+	}
+	fmt.Fprint(w, "}\n\n") // closing brace plus a blank line before Names
+	fmt.Fprintln(w, "// Names contains the color names defined in the SVG 1.1 spec.")
+	fmt.Fprintln(w, "var Names = []string{")
+	for _, k := range keys {
+		fmt.Fprintf(w, "%q,\n", k)
+	}
+	fmt.Fprintln(w, "}")
+}
+
+const url = "http://www.w3.org/TR/SVG/types.html"
+
+func main() {
+	res, err := http.Get(url)
+	if err != nil {
+		log.Fatalf("Couldn't read from %s: %s\n", url, err)
+	}
+	defer res.Body.Close()
+
+	tree, err := html.Parse(res.Body)
+	if err != nil {
+		log.Fatalf("Couldn't parse %s: %s\n", url, err)
+	}
+
+	colors, err := extractSVGColors(tree)
+	if err != nil {
+		log.Fatalf("Couldn't extract colors: %s\n", err)
+	}
+
+	buf := &bytes.Buffer{}
+	writeColorNames(buf, colors)
+	fmted, err := format.Source(buf.Bytes())
+	if err != nil {
+		log.Fatalf("Error while formatting code: %s\n", err)
+	}
+
+	if err := ioutil.WriteFile("table.go", fmted, 0644); err != nil {
+		log.Fatalf("Error writing table.go: %s\n", err)
+	}
+}
diff --git a/colornames/table.go b/colornames/table.go
new file mode 100644
index 0000000..72ac9fe
--- /dev/null
+++ b/colornames/table.go
@@ -0,0 +1,307 @@
+// generated by go generate; DO NOT EDIT.
+
+package colornames
+
+import "image/color"
+
+// Map contains named colors defined in the SVG 1.1 spec.
+var Map = map[string]color.RGBA{
+	"aliceblue":            color.RGBA{0xf0, 0xf8, 0xff, 0xff}, // rgb(240, 248, 255)
+	"antiquewhite":         color.RGBA{0xfa, 0xeb, 0xd7, 0xff}, // rgb(250, 235, 215)
+	"aqua":                 color.RGBA{0x00, 0xff, 0xff, 0xff}, // rgb(0, 255, 255)
+	"aquamarine":           color.RGBA{0x7f, 0xff, 0xd4, 0xff}, // rgb(127, 255, 212)
+	"azure":                color.RGBA{0xf0, 0xff, 0xff, 0xff}, // rgb(240, 255, 255)
+	"beige":                color.RGBA{0xf5, 0xf5, 0xdc, 0xff}, // rgb(245, 245, 220)
+	"bisque":               color.RGBA{0xff, 0xe4, 0xc4, 0xff}, // rgb(255, 228, 196)
+	"black":                color.RGBA{0x00, 0x00, 0x00, 0xff}, // rgb(0, 0, 0)
+	"blanchedalmond":       color.RGBA{0xff, 0xeb, 0xcd, 0xff}, // rgb(255, 235, 205)
+	"blue":                 color.RGBA{0x00, 0x00, 0xff, 0xff}, // rgb(0, 0, 255)
+	"blueviolet":           color.RGBA{0x8a, 0x2b, 0xe2, 0xff}, // rgb(138, 43, 226)
+	"brown":                color.RGBA{0xa5, 0x2a, 0x2a, 0xff}, // rgb(165, 42, 42)
+	"burlywood":            color.RGBA{0xde, 0xb8, 0x87, 0xff}, // rgb(222, 184, 135)
+	"cadetblue":            color.RGBA{0x5f, 0x9e, 0xa0, 0xff}, // rgb(95, 158, 160)
+	"chartreuse":           color.RGBA{0x7f, 0xff, 0x00, 0xff}, // rgb(127, 255, 0)
+	"chocolate":            color.RGBA{0xd2, 0x69, 0x1e, 0xff}, // rgb(210, 105, 30)
+	"coral":                color.RGBA{0xff, 0x7f, 0x50, 0xff}, // rgb(255, 127, 80)
+	"cornflowerblue":       color.RGBA{0x64, 0x95, 0xed, 0xff}, // rgb(100, 149, 237)
+	"cornsilk":             color.RGBA{0xff, 0xf8, 0xdc, 0xff}, // rgb(255, 248, 220)
+	"crimson":              color.RGBA{0xdc, 0x14, 0x3c, 0xff}, // rgb(220, 20, 60)
+	"cyan":                 color.RGBA{0x00, 0xff, 0xff, 0xff}, // rgb(0, 255, 255)
+	"darkblue":             color.RGBA{0x00, 0x00, 0x8b, 0xff}, // rgb(0, 0, 139)
+	"darkcyan":             color.RGBA{0x00, 0x8b, 0x8b, 0xff}, // rgb(0, 139, 139)
+	"darkgoldenrod":        color.RGBA{0xb8, 0x86, 0x0b, 0xff}, // rgb(184, 134, 11)
+	"darkgray":             color.RGBA{0xa9, 0xa9, 0xa9, 0xff}, // rgb(169, 169, 169)
+	"darkgreen":            color.RGBA{0x00, 0x64, 0x00, 0xff}, // rgb(0, 100, 0)
+	"darkgrey":             color.RGBA{0xa9, 0xa9, 0xa9, 0xff}, // rgb(169, 169, 169)
+	"darkkhaki":            color.RGBA{0xbd, 0xb7, 0x6b, 0xff}, // rgb(189, 183, 107)
+	"darkmagenta":          color.RGBA{0x8b, 0x00, 0x8b, 0xff}, // rgb(139, 0, 139)
+	"darkolivegreen":       color.RGBA{0x55, 0x6b, 0x2f, 0xff}, // rgb(85, 107, 47)
+	"darkorange":           color.RGBA{0xff, 0x8c, 0x00, 0xff}, // rgb(255, 140, 0)
+	"darkorchid":           color.RGBA{0x99, 0x32, 0xcc, 0xff}, // rgb(153, 50, 204)
+	"darkred":              color.RGBA{0x8b, 0x00, 0x00, 0xff}, // rgb(139, 0, 0)
+	"darksalmon":           color.RGBA{0xe9, 0x96, 0x7a, 0xff}, // rgb(233, 150, 122)
+	"darkseagreen":         color.RGBA{0x8f, 0xbc, 0x8f, 0xff}, // rgb(143, 188, 143)
+	"darkslateblue":        color.RGBA{0x48, 0x3d, 0x8b, 0xff}, // rgb(72, 61, 139)
+	"darkslategray":        color.RGBA{0x2f, 0x4f, 0x4f, 0xff}, // rgb(47, 79, 79)
+	"darkslategrey":        color.RGBA{0x2f, 0x4f, 0x4f, 0xff}, // rgb(47, 79, 79)
+	"darkturquoise":        color.RGBA{0x00, 0xce, 0xd1, 0xff}, // rgb(0, 206, 209)
+	"darkviolet":           color.RGBA{0x94, 0x00, 0xd3, 0xff}, // rgb(148, 0, 211)
+	"deeppink":             color.RGBA{0xff, 0x14, 0x93, 0xff}, // rgb(255, 20, 147)
+	"deepskyblue":          color.RGBA{0x00, 0xbf, 0xff, 0xff}, // rgb(0, 191, 255)
+	"dimgray":              color.RGBA{0x69, 0x69, 0x69, 0xff}, // rgb(105, 105, 105)
+	"dimgrey":              color.RGBA{0x69, 0x69, 0x69, 0xff}, // rgb(105, 105, 105)
+	"dodgerblue":           color.RGBA{0x1e, 0x90, 0xff, 0xff}, // rgb(30, 144, 255)
+	"firebrick":            color.RGBA{0xb2, 0x22, 0x22, 0xff}, // rgb(178, 34, 34)
+	"floralwhite":          color.RGBA{0xff, 0xfa, 0xf0, 0xff}, // rgb(255, 250, 240)
+	"forestgreen":          color.RGBA{0x22, 0x8b, 0x22, 0xff}, // rgb(34, 139, 34)
+	"fuchsia":              color.RGBA{0xff, 0x00, 0xff, 0xff}, // rgb(255, 0, 255)
+	"gainsboro":            color.RGBA{0xdc, 0xdc, 0xdc, 0xff}, // rgb(220, 220, 220)
+	"ghostwhite":           color.RGBA{0xf8, 0xf8, 0xff, 0xff}, // rgb(248, 248, 255)
+	"gold":                 color.RGBA{0xff, 0xd7, 0x00, 0xff}, // rgb(255, 215, 0)
+	"goldenrod":            color.RGBA{0xda, 0xa5, 0x20, 0xff}, // rgb(218, 165, 32)
+	"gray":                 color.RGBA{0x80, 0x80, 0x80, 0xff}, // rgb(128, 128, 128)
+	"green":                color.RGBA{0x00, 0x80, 0x00, 0xff}, // rgb(0, 128, 0)
+	"greenyellow":          color.RGBA{0xad, 0xff, 0x2f, 0xff}, // rgb(173, 255, 47)
+	"grey":                 color.RGBA{0x80, 0x80, 0x80, 0xff}, // rgb(128, 128, 128)
+	"honeydew":             color.RGBA{0xf0, 0xff, 0xf0, 0xff}, // rgb(240, 255, 240)
+	"hotpink":              color.RGBA{0xff, 0x69, 0xb4, 0xff}, // rgb(255, 105, 180)
+	"indianred":            color.RGBA{0xcd, 0x5c, 0x5c, 0xff}, // rgb(205, 92, 92)
+	"indigo":               color.RGBA{0x4b, 0x00, 0x82, 0xff}, // rgb(75, 0, 130)
+	"ivory":                color.RGBA{0xff, 0xff, 0xf0, 0xff}, // rgb(255, 255, 240)
+	"khaki":                color.RGBA{0xf0, 0xe6, 0x8c, 0xff}, // rgb(240, 230, 140)
+	"lavender":             color.RGBA{0xe6, 0xe6, 0xfa, 0xff}, // rgb(230, 230, 250)
+	"lavenderblush":        color.RGBA{0xff, 0xf0, 0xf5, 0xff}, // rgb(255, 240, 245)
+	"lawngreen":            color.RGBA{0x7c, 0xfc, 0x00, 0xff}, // rgb(124, 252, 0)
+	"lemonchiffon":         color.RGBA{0xff, 0xfa, 0xcd, 0xff}, // rgb(255, 250, 205)
+	"lightblue":            color.RGBA{0xad, 0xd8, 0xe6, 0xff}, // rgb(173, 216, 230)
+	"lightcoral":           color.RGBA{0xf0, 0x80, 0x80, 0xff}, // rgb(240, 128, 128)
+	"lightcyan":            color.RGBA{0xe0, 0xff, 0xff, 0xff}, // rgb(224, 255, 255)
+	"lightgoldenrodyellow": color.RGBA{0xfa, 0xfa, 0xd2, 0xff}, // rgb(250, 250, 210)
+	"lightgray":            color.RGBA{0xd3, 0xd3, 0xd3, 0xff}, // rgb(211, 211, 211)
+	"lightgreen":           color.RGBA{0x90, 0xee, 0x90, 0xff}, // rgb(144, 238, 144)
+	"lightgrey":            color.RGBA{0xd3, 0xd3, 0xd3, 0xff}, // rgb(211, 211, 211)
+	"lightpink":            color.RGBA{0xff, 0xb6, 0xc1, 0xff}, // rgb(255, 182, 193)
+	"lightsalmon":          color.RGBA{0xff, 0xa0, 0x7a, 0xff}, // rgb(255, 160, 122)
+	"lightseagreen":        color.RGBA{0x20, 0xb2, 0xaa, 0xff}, // rgb(32, 178, 170)
+	"lightskyblue":         color.RGBA{0x87, 0xce, 0xfa, 0xff}, // rgb(135, 206, 250)
+	"lightslategray":       color.RGBA{0x77, 0x88, 0x99, 0xff}, // rgb(119, 136, 153)
+	"lightslategrey":       color.RGBA{0x77, 0x88, 0x99, 0xff}, // rgb(119, 136, 153)
+	"lightsteelblue":       color.RGBA{0xb0, 0xc4, 0xde, 0xff}, // rgb(176, 196, 222)
+	"lightyellow":          color.RGBA{0xff, 0xff, 0xe0, 0xff}, // rgb(255, 255, 224)
+	"lime":                 color.RGBA{0x00, 0xff, 0x00, 0xff}, // rgb(0, 255, 0)
+	"limegreen":            color.RGBA{0x32, 0xcd, 0x32, 0xff}, // rgb(50, 205, 50)
+	"linen":                color.RGBA{0xfa, 0xf0, 0xe6, 0xff}, // rgb(250, 240, 230)
+	"magenta":              color.RGBA{0xff, 0x00, 0xff, 0xff}, // rgb(255, 0, 255)
+	"maroon":               color.RGBA{0x80, 0x00, 0x00, 0xff}, // rgb(128, 0, 0)
+	"mediumaquamarine":     color.RGBA{0x66, 0xcd, 0xaa, 0xff}, // rgb(102, 205, 170)
+	"mediumblue":           color.RGBA{0x00, 0x00, 0xcd, 0xff}, // rgb(0, 0, 205)
+	"mediumorchid":         color.RGBA{0xba, 0x55, 0xd3, 0xff}, // rgb(186, 85, 211)
+	"mediumpurple":         color.RGBA{0x93, 0x70, 0xdb, 0xff}, // rgb(147, 112, 219)
+	"mediumseagreen":       color.RGBA{0x3c, 0xb3, 0x71, 0xff}, // rgb(60, 179, 113)
+	"mediumslateblue":      color.RGBA{0x7b, 0x68, 0xee, 0xff}, // rgb(123, 104, 238)
+	"mediumspringgreen":    color.RGBA{0x00, 0xfa, 0x9a, 0xff}, // rgb(0, 250, 154)
+	"mediumturquoise":      color.RGBA{0x48, 0xd1, 0xcc, 0xff}, // rgb(72, 209, 204)
+	"mediumvioletred":      color.RGBA{0xc7, 0x15, 0x85, 0xff}, // rgb(199, 21, 133)
+	"midnightblue":         color.RGBA{0x19, 0x19, 0x70, 0xff}, // rgb(25, 25, 112)
+	"mintcream":            color.RGBA{0xf5, 0xff, 0xfa, 0xff}, // rgb(245, 255, 250)
+	"mistyrose":            color.RGBA{0xff, 0xe4, 0xe1, 0xff}, // rgb(255, 228, 225)
+	"moccasin":             color.RGBA{0xff, 0xe4, 0xb5, 0xff}, // rgb(255, 228, 181)
+	"navajowhite":          color.RGBA{0xff, 0xde, 0xad, 0xff}, // rgb(255, 222, 173)
+	"navy":                 color.RGBA{0x00, 0x00, 0x80, 0xff}, // rgb(0, 0, 128)
+	"oldlace":              color.RGBA{0xfd, 0xf5, 0xe6, 0xff}, // rgb(253, 245, 230)
+	"olive":                color.RGBA{0x80, 0x80, 0x00, 0xff}, // rgb(128, 128, 0)
+	"olivedrab":            color.RGBA{0x6b, 0x8e, 0x23, 0xff}, // rgb(107, 142, 35)
+	"orange":               color.RGBA{0xff, 0xa5, 0x00, 0xff}, // rgb(255, 165, 0)
+	"orangered":            color.RGBA{0xff, 0x45, 0x00, 0xff}, // rgb(255, 69, 0)
+	"orchid":               color.RGBA{0xda, 0x70, 0xd6, 0xff}, // rgb(218, 112, 214)
+	"palegoldenrod":        color.RGBA{0xee, 0xe8, 0xaa, 0xff}, // rgb(238, 232, 170)
+	"palegreen":            color.RGBA{0x98, 0xfb, 0x98, 0xff}, // rgb(152, 251, 152)
+	"paleturquoise":        color.RGBA{0xaf, 0xee, 0xee, 0xff}, // rgb(175, 238, 238)
+	"palevioletred":        color.RGBA{0xdb, 0x70, 0x93, 0xff}, // rgb(219, 112, 147)
+	"papayawhip":           color.RGBA{0xff, 0xef, 0xd5, 0xff}, // rgb(255, 239, 213)
+	"peachpuff":            color.RGBA{0xff, 0xda, 0xb9, 0xff}, // rgb(255, 218, 185)
+	"peru":                 color.RGBA{0xcd, 0x85, 0x3f, 0xff}, // rgb(205, 133, 63)
+	"pink":                 color.RGBA{0xff, 0xc0, 0xcb, 0xff}, // rgb(255, 192, 203)
+	"plum":                 color.RGBA{0xdd, 0xa0, 0xdd, 0xff}, // rgb(221, 160, 221)
+	"powderblue":           color.RGBA{0xb0, 0xe0, 0xe6, 0xff}, // rgb(176, 224, 230)
+	"purple":               color.RGBA{0x80, 0x00, 0x80, 0xff}, // rgb(128, 0, 128)
+	"red":                  color.RGBA{0xff, 0x00, 0x00, 0xff}, // rgb(255, 0, 0)
+	"rosybrown":            color.RGBA{0xbc, 0x8f, 0x8f, 0xff}, // rgb(188, 143, 143)
+	"royalblue":            color.RGBA{0x41, 0x69, 0xe1, 0xff}, // rgb(65, 105, 225)
+	"saddlebrown":          color.RGBA{0x8b, 0x45, 0x13, 0xff}, // rgb(139, 69, 19)
+	"salmon":               color.RGBA{0xfa, 0x80, 0x72, 0xff}, // rgb(250, 128, 114)
+	"sandybrown":           color.RGBA{0xf4, 0xa4, 0x60, 0xff}, // rgb(244, 164, 96)
+	"seagreen":             color.RGBA{0x2e, 0x8b, 0x57, 0xff}, // rgb(46, 139, 87)
+	"seashell":             color.RGBA{0xff, 0xf5, 0xee, 0xff}, // rgb(255, 245, 238)
+	"sienna":               color.RGBA{0xa0, 0x52, 0x2d, 0xff}, // rgb(160, 82, 45)
+	"silver":               color.RGBA{0xc0, 0xc0, 0xc0, 0xff}, // rgb(192, 192, 192)
+	"skyblue":              color.RGBA{0x87, 0xce, 0xeb, 0xff}, // rgb(135, 206, 235)
+	"slateblue":            color.RGBA{0x6a, 0x5a, 0xcd, 0xff}, // rgb(106, 90, 205)
+	"slategray":            color.RGBA{0x70, 0x80, 0x90, 0xff}, // rgb(112, 128, 144)
+	"slategrey":            color.RGBA{0x70, 0x80, 0x90, 0xff}, // rgb(112, 128, 144)
+	"snow":                 color.RGBA{0xff, 0xfa, 0xfa, 0xff}, // rgb(255, 250, 250)
+	"springgreen":          color.RGBA{0x00, 0xff, 0x7f, 0xff}, // rgb(0, 255, 127)
+	"steelblue":            color.RGBA{0x46, 0x82, 0xb4, 0xff}, // rgb(70, 130, 180)
+	"tan":                  color.RGBA{0xd2, 0xb4, 0x8c, 0xff}, // rgb(210, 180, 140)
+	"teal":                 color.RGBA{0x00, 0x80, 0x80, 0xff}, // rgb(0, 128, 128)
+	"thistle":              color.RGBA{0xd8, 0xbf, 0xd8, 0xff}, // rgb(216, 191, 216)
+	"tomato":               color.RGBA{0xff, 0x63, 0x47, 0xff}, // rgb(255, 99, 71)
+	"turquoise":            color.RGBA{0x40, 0xe0, 0xd0, 0xff}, // rgb(64, 224, 208)
+	"violet":               color.RGBA{0xee, 0x82, 0xee, 0xff}, // rgb(238, 130, 238)
+	"wheat":                color.RGBA{0xf5, 0xde, 0xb3, 0xff}, // rgb(245, 222, 179)
+	"white":                color.RGBA{0xff, 0xff, 0xff, 0xff}, // rgb(255, 255, 255)
+	"whitesmoke":           color.RGBA{0xf5, 0xf5, 0xf5, 0xff}, // rgb(245, 245, 245)
+	"yellow":               color.RGBA{0xff, 0xff, 0x00, 0xff}, // rgb(255, 255, 0)
+	"yellowgreen":          color.RGBA{0x9a, 0xcd, 0x32, 0xff}, // rgb(154, 205, 50)
+}
+
+// Names contains the color names defined in the SVG 1.1 spec.
+var Names = []string{
+	"aliceblue",
+	"antiquewhite",
+	"aqua",
+	"aquamarine",
+	"azure",
+	"beige",
+	"bisque",
+	"black",
+	"blanchedalmond",
+	"blue",
+	"blueviolet",
+	"brown",
+	"burlywood",
+	"cadetblue",
+	"chartreuse",
+	"chocolate",
+	"coral",
+	"cornflowerblue",
+	"cornsilk",
+	"crimson",
+	"cyan",
+	"darkblue",
+	"darkcyan",
+	"darkgoldenrod",
+	"darkgray",
+	"darkgreen",
+	"darkgrey",
+	"darkkhaki",
+	"darkmagenta",
+	"darkolivegreen",
+	"darkorange",
+	"darkorchid",
+	"darkred",
+	"darksalmon",
+	"darkseagreen",
+	"darkslateblue",
+	"darkslategray",
+	"darkslategrey",
+	"darkturquoise",
+	"darkviolet",
+	"deeppink",
+	"deepskyblue",
+	"dimgray",
+	"dimgrey",
+	"dodgerblue",
+	"firebrick",
+	"floralwhite",
+	"forestgreen",
+	"fuchsia",
+	"gainsboro",
+	"ghostwhite",
+	"gold",
+	"goldenrod",
+	"gray",
+	"green",
+	"greenyellow",
+	"grey",
+	"honeydew",
+	"hotpink",
+	"indianred",
+	"indigo",
+	"ivory",
+	"khaki",
+	"lavender",
+	"lavenderblush",
+	"lawngreen",
+	"lemonchiffon",
+	"lightblue",
+	"lightcoral",
+	"lightcyan",
+	"lightgoldenrodyellow",
+	"lightgray",
+	"lightgreen",
+	"lightgrey",
+	"lightpink",
+	"lightsalmon",
+	"lightseagreen",
+	"lightskyblue",
+	"lightslategray",
+	"lightslategrey",
+	"lightsteelblue",
+	"lightyellow",
+	"lime",
+	"limegreen",
+	"linen",
+	"magenta",
+	"maroon",
+	"mediumaquamarine",
+	"mediumblue",
+	"mediumorchid",
+	"mediumpurple",
+	"mediumseagreen",
+	"mediumslateblue",
+	"mediumspringgreen",
+	"mediumturquoise",
+	"mediumvioletred",
+	"midnightblue",
+	"mintcream",
+	"mistyrose",
+	"moccasin",
+	"navajowhite",
+	"navy",
+	"oldlace",
+	"olive",
+	"olivedrab",
+	"orange",
+	"orangered",
+	"orchid",
+	"palegoldenrod",
+	"palegreen",
+	"paleturquoise",
+	"palevioletred",
+	"papayawhip",
+	"peachpuff",
+	"peru",
+	"pink",
+	"plum",
+	"powderblue",
+	"purple",
+	"red",
+	"rosybrown",
+	"royalblue",
+	"saddlebrown",
+	"salmon",
+	"sandybrown",
+	"seagreen",
+	"seashell",
+	"sienna",
+	"silver",
+	"skyblue",
+	"slateblue",
+	"slategray",
+	"slategrey",
+	"snow",
+	"springgreen",
+	"steelblue",
+	"tan",
+	"teal",
+	"thistle",
+	"tomato",
+	"turquoise",
+	"violet",
+	"wheat",
+	"white",
+	"whitesmoke",
+	"yellow",
+	"yellowgreen",
+}
diff --git a/draw/draw.go b/draw/draw.go
new file mode 100644
index 0000000..b92e3c7
--- /dev/null
+++ b/draw/draw.go
@@ -0,0 +1,79 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package draw provides image composition functions.
+//
+// See "The Go image/draw package" for an introduction to this package:
+// http://golang.org/doc/articles/image_draw.html
+//
+// This package is a superset of and a drop-in replacement for the image/draw
+// package in the standard library.
+package draw
+
+// This file just contains the API exported by the image/draw package in the
+// standard library. Other files in this package provide additional features.
+
+import (
+	"image"
+	"image/color"
+	"image/draw"
+)
+
+// Draw calls DrawMask with a nil mask. It delegates to the standard library's draw.Draw.
+func Draw(dst Image, r image.Rectangle, src image.Image, sp image.Point, op Op) {
+	draw.Draw(dst, r, src, sp, draw.Op(op)) // op is converted to the standard library's Op type
+}
+
+// DrawMask aligns r.Min in dst with sp in src and mp in mask and then
+// replaces the rectangle r in dst with the result of a Porter-Duff
+// composition. A nil mask is treated as opaque.
+func DrawMask(dst Image, r image.Rectangle, src image.Image, sp image.Point, mask image.Image, mp image.Point, op Op) {
+	draw.DrawMask(dst, r, src, sp, mask, mp, draw.Op(op)) // delegate to the standard library, converting op to its Op type
+}
+
+// Drawer contains the Draw method. Op and FloydSteinberg both implement it.
+type Drawer interface {
+	// Draw aligns r.Min in dst with sp in src and then replaces the
+	// rectangle r in dst with the result of drawing src on dst.
+	Draw(dst Image, r image.Rectangle, src image.Image, sp image.Point)
+}
+
+// FloydSteinberg is a Drawer that is the Src Op with Floyd-Steinberg error
+// diffusion.
+var FloydSteinberg Drawer = floydSteinberg{}
+
+type floydSteinberg struct{} // stateless adapter exposing the standard library's FloydSteinberg as this package's Drawer
+
+func (floydSteinberg) Draw(dst Image, r image.Rectangle, src image.Image, sp image.Point) {
+	draw.FloydSteinberg.Draw(dst, r, src, sp)
+}
+
+// Image is an image.Image with a Set method to change a single pixel.
+type Image interface {
+	image.Image
+	Set(x, y int, c color.Color) // sets the pixel at (x, y) to c
+}
+
+// Op is a Porter-Duff compositing operator.
+type Op int // values are numerically equal to the standard library's draw.Op constants
+
+const (
+	// Over specifies ``(src in mask) over dst''.
+	Over Op = Op(draw.Over)
+	// Src specifies ``src in mask''.
+	Src Op = Op(draw.Src)
+)
+
+// Draw implements the Drawer interface by calling the Draw function with
+// this Op.
+func (op Op) Draw(dst Image, r image.Rectangle, src image.Image, sp image.Point) {
+	(draw.Op(op)).Draw(dst, r, src, sp) // forwards to the Draw method of the equivalent standard library Op
+}
+
+// Quantizer produces a palette for an image. It mirrors the standard library's draw.Quantizer.
+type Quantizer interface {
+	// Quantize appends up to cap(p) - len(p) colors to p and returns the
+	// updated palette suitable for converting m to a paletted image.
+	Quantize(p color.Palette, m image.Image) color.Palette
+}
diff --git a/draw/example_test.go b/draw/example_test.go
new file mode 100644
index 0000000..bcb4662
--- /dev/null
+++ b/draw/example_test.go
@@ -0,0 +1,118 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package draw_test
+
+import (
+	"fmt"
+	"image"
+	"image/color"
+	"image/png"
+	"log"
+	"math"
+	"os"
+
+	"golang.org/x/image/draw"
+	"golang.org/x/image/math/f64"
+)
+
+func ExampleDraw() {
+	fSrc, err := os.Open("../testdata/blue-purple-pink.png")
+	if err != nil {
+		log.Fatal(err)
+	}
+	defer fSrc.Close()
+	src, err := png.Decode(fSrc)
+	if err != nil {
+		log.Fatal(err)
+	}
+
+	dst := image.NewRGBA(image.Rect(0, 0, 400, 300))
+	green := image.NewUniform(color.RGBA{0x00, 0x1f, 0x00, 0xff})
+	draw.Copy(dst, image.Point{}, green, dst.Bounds(), draw.Src, nil) // background: copy the uniform green source over all of dst
+	qs := []draw.Interpolator{
+		draw.NearestNeighbor,
+		draw.ApproxBiLinear,
+		draw.CatmullRom,
+	}
+	const cos60, sin60 = 0.5, 0.866025404 // cosine and sine of 60 degrees
+	t := f64.Aff3{ // presumably a 2x3 affine matrix: 2x scale, 60-degree rotation, (100, 100) offset; TODO confirm against f64.Aff3
+		+2 * cos60, -2 * sin60, 100,
+		+2 * sin60, +2 * cos60, 100,
+	}
+
+	draw.Copy(dst, image.Point{20, 30}, src, src.Bounds(), draw.Over, nil)
+	for i, q := range qs { // scale src with each interpolator into its own destination rectangle
+		q.Scale(dst, image.Rect(200+10*i, 100*i, 600+10*i, 150+100*i), src, src.Bounds(), draw.Over, nil)
+	}
+	draw.NearestNeighbor.Transform(dst, t, src, src.Bounds(), draw.Over, nil)
+
+	red := image.NewNRGBA(image.Rect(0, 0, 16, 16)) // 16x16 gradient: R grows with x, A grows with y
+	for y := 0; y < 16; y++ {
+		for x := 0; x < 16; x++ {
+			red.SetNRGBA(x, y, color.NRGBA{
+				R: uint8(x * 0x11),
+				A: uint8(y * 0x11),
+			})
+		}
+	}
+	red.SetNRGBA(0, 0, color.NRGBA{0xff, 0xff, 0x00, 0xff}) // mark two opposite corners in opaque yellow
+	red.SetNRGBA(15, 15, color.NRGBA{0xff, 0xff, 0x00, 0xff})
+
+	ops := []draw.Op{
+		draw.Over,
+		draw.Src,
+	}
+	for i, op := range ops { // draw the gradient once per compositing operator, scaled and transformed
+		dr := image.Rect(120+10*i, 150+60*i, 170+10*i, 200+60*i)
+		draw.NearestNeighbor.Scale(dst, dr, red, red.Bounds(), op, nil)
+		t := f64.Aff3{
+			+cos60, -sin60, float64(190 + 10*i),
+			+sin60, +cos60, float64(140 + 50*i),
+		}
+		draw.NearestNeighbor.Transform(dst, t, red, red.Bounds(), op, nil)
+	}
+
+	dr := image.Rect(0, 0, 128, 128)
+	checkerboard := image.NewAlpha(dr)
+	for y := dr.Min.Y; y < dr.Max.Y; y++ {
+		for x := dr.Min.X; x < dr.Max.X; x++ {
+			if (x/20)%2 == (y/20)%2 { // opaque on alternating 20x20 squares
+				checkerboard.SetAlpha(x, y, color.Alpha{0xff})
+			}
+		}
+	}
+	sr := image.Rect(0, 0, 16, 16)
+	circle := image.NewAlpha(sr)
+	for y := sr.Min.Y; y < sr.Max.Y; y++ {
+		for x := sr.Min.X; x < sr.Max.X; x++ {
+			dx, dy := x-10, y-8
+			if d := 32 * math.Sqrt(float64(dx*dx)+float64(dy*dy)); d < 0xff { // alpha falls off with distance from (10, 8)
+				circle.SetAlpha(x, y, color.Alpha{0xff - uint8(d)})
+			}
+		}
+	}
+	cyan := image.NewUniform(color.RGBA{0x00, 0xff, 0xff, 0xff})
+	draw.NearestNeighbor.Scale(dst, dr, cyan, sr, draw.Over, &draw.Options{ // uses a destination mask and a source mask together
+		DstMask: checkerboard,
+		SrcMask: circle,
+	})
+
+	// Change false to true to write the resultant image to disk.
+	if false {
+		fDst, err := os.Create("out.png")
+		if err != nil {
+			log.Fatal(err)
+		}
+		defer fDst.Close()
+		err = png.Encode(fDst, dst)
+		if err != nil {
+			log.Fatal(err)
+		}
+	}
+
+	fmt.Printf("dst has bounds %v.\n", dst.Bounds())
+	// Output:
+	// dst has bounds (0,0)-(400,300).
+}
diff --git a/draw/gen.go b/draw/gen.go
new file mode 100644
index 0000000..0fed474
--- /dev/null
+++ b/draw/gen.go
@@ -0,0 +1,1403 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build ignore
+
+package main
+
+import (
+	"bytes"
+	"flag"
+	"fmt"
+	"go/format"
+	"io/ioutil"
+	"log"
+	"os"
+	"strings"
+)
+
+var debug = flag.Bool("debug", false, "") // when set, main prints the unformatted output to stdout instead of writing impl.go
+
+func main() {
+	flag.Parse()
+
+	w := new(bytes.Buffer) // accumulates the unformatted source of impl.go
+	w.WriteString("// generated by \"go run gen.go\". DO NOT EDIT.\n\n" +
+		"package draw\n\nimport (\n" +
+		"\"image\"\n" +
+		"\"image/color\"\n" +
+		"\"math\"\n" +
+		"\n" +
+		"\"golang.org/x/image/math/f64\"\n" +
+		")\n")
+
+	gen(w, "nnInterpolator", codeNNScaleLeaf, codeNNTransformLeaf)
+	gen(w, "ablInterpolator", codeABLScaleLeaf, codeABLTransformLeaf)
+	genKernel(w)
+
+	if *debug {
+		os.Stdout.Write(w.Bytes()) // debug mode: dump the raw output, skipping format.Source and the file write
+		return
+	}
+	out, err := format.Source(w.Bytes()) // gofmt the generated code; fails if it does not parse
+	if err != nil {
+		log.Fatal(err)
+	}
+	if err := ioutil.WriteFile("impl.go", out, 0660); err != nil {
+		log.Fatal(err)
+	}
+}
+
+var (
+	// dsTypes are the (dst image type, src image type) pairs to generate
+	// scale_DType_SType implementations for. The last element in the slice
+	// should be the fallback pair ("Image", "image.Image").
+	//
+	// TODO: add *image.CMYK src type after Go 1.5 is released.
+	// An *image.CMYK is also alwaysOpaque.
+	dsTypes = []struct{ dType, sType string }{
+		{"*image.RGBA", "*image.Gray"},
+		{"*image.RGBA", "*image.NRGBA"},
+		{"*image.RGBA", "*image.RGBA"},
+		{"*image.RGBA", "*image.YCbCr"},
+		{"*image.RGBA", "image.Image"},
+		{"Image", "image.Image"},
+	}
+	dTypes, sTypes  []string // distinct types from dsTypes in first-seen order; filled in by init
+	sTypesForDType  = map[string][]string{}
+	subsampleRatios = []string{ // expn emits one variant per entry for "*image.YCbCr" sources
+		"444",
+		"422",
+		"420",
+		"440",
+	}
+	ops = []string{"Over", "Src"} // compositing operators that code is generated for
+	// alwaysOpaque are those image.Image implementations that are always
+	// opaque. For these types, Over is equivalent to the faster Src, in the
+	// absence of a source mask.
+	alwaysOpaque = map[string]bool{
+		"*image.Gray":  true,
+		"*image.YCbCr": true,
+	}
+)
+
+func init() { // derives dTypes, sTypes and sTypesForDType from dsTypes
+	dTypesSeen := map[string]bool{}
+	sTypesSeen := map[string]bool{}
+	for _, t := range dsTypes {
+		if !sTypesSeen[t.sType] { // first occurrence only, so sTypes has no duplicates and keeps dsTypes order
+			sTypesSeen[t.sType] = true
+			sTypes = append(sTypes, t.sType)
+		}
+		if !dTypesSeen[t.dType] {
+			dTypesSeen[t.dType] = true
+			dTypes = append(dTypes, t.dType)
+		}
+		sTypesForDType[t.dType] = append(sTypesForDType[t.dType], t.sType)
+	}
+	sTypesForDType["anyDType"] = sTypes // pseudo dType that maps to every source type; "$switchS" passes this key to expnSwitch
+}
+
+type data struct { // values substituted for the "$name" placeholders of a code template
+	dType    string // destination image type, e.g. "*image.RGBA" or "Image"
+	sType    string // source image type, e.g. "*image.Gray" or "image.Image"
+	sratio   string // entry of subsampleRatios; set by expn for "*image.YCbCr" sources
+	receiver string // receiver type name passed to gen, e.g. "nnInterpolator"
+	op       string // entry of ops: "Over" or "Src"
+}
+
+func gen(w *bytes.Buffer, receiver string, codes ...string) { // writes the code for one receiver type to w
+	expn(w, codeRoot, &data{receiver: receiver})
+	for _, code := range codes { // then every leaf template, per (dst, src) pair and operator
+		for _, t := range dsTypes {
+			for _, op := range ops {
+				if op == "Over" && alwaysOpaque[t.sType] {
+					continue // no Over variant is generated for sources that are always opaque
+				}
+				expn(w, code, &data{
+					dType:    t.dType,
+					sType:    t.sType,
+					receiver: receiver,
+					op:       op,
+				})
+			}
+		}
+	}
+}
+
+func genKernel(w *bytes.Buffer) { // writes the kernel code to w: root, X leaves, Y leaves, then transform leaves
+	expn(w, codeKernelRoot, &data{})
+	for _, sType := range sTypes { // the X leaf varies only by source type
+		expn(w, codeKernelScaleLeafX, &data{
+			sType: sType,
+		})
+	}
+	for _, dType := range dTypes { // the Y leaf varies by destination type and operator
+		for _, op := range ops {
+			expn(w, codeKernelScaleLeafY, &data{
+				dType: dType,
+				op:    op,
+			})
+		}
+	}
+	for _, t := range dsTypes {
+		for _, op := range ops {
+			if op == "Over" && alwaysOpaque[t.sType] {
+				continue // no Over variant is generated for sources that are always opaque
+			}
+			expn(w, codeKernelTransformLeaf, &data{
+				dType: t.dType,
+				sType: t.sType,
+				op:    op,
+			})
+		}
+	}
+}
+
+func expn(w *bytes.Buffer, code string, d *data) { // expands the template code using d and appends the result to w
+	if d.sType == "*image.YCbCr" && d.sratio == "" {
+		for _, sratio := range subsampleRatios { // YCbCr sources get one expansion per subsample ratio
+			e := *d
+			e.sratio = sratio
+			expn(w, code, &e)
+		}
+		return
+	}
+
+	for _, line := range strings.Split(code, "\n") {
+		line = expnLine(line, d)
+		if line == ";" { // a lone ";" marks a no-op expansion, so nothing is written for it
+			continue
+		}
+		fmt.Fprintln(w, line)
+	}
+}
+
+func expnLine(line string, d *data) string { // expands the first "$name" in line, repeatedly, until no '$' remains
+	for {
+		i := strings.IndexByte(line, '$')
+		if i < 0 {
+			break
+		}
+		prefix, s := line[:i], line[i+1:]
+
+		i = len(s)
+		for j, c := range s { // the placeholder name is the leading run of ASCII letters after the '$'
+			if !('A' <= c && c <= 'Z' || 'a' <= c && c <= 'z') {
+				i = j
+				break
+			}
+		}
+		dollar, suffix := s[:i], s[i:]
+
+		e := expnDollar(prefix, dollar, suffix, d)
+		if e == "" { // expnDollar signals a failed expansion with the empty string
+			log.Fatalf("couldn't expand %q", line)
+		}
+		line = e
+	}
+	return line
+}
+
+// expnDollar expands a "$foo" fragment in a line of generated code. It returns
+// the empty string if there was a problem. It returns ";" if the generated
+// code is a no-op.
+func expnDollar(prefix, dollar, suffix string, d *data) string {
+	switch dollar {
+	case "dType":
+		return prefix + d.dType + suffix
+	case "dTypeRN":
+		return prefix + relName(d.dType) + suffix
+	case "sratio":
+		return prefix + d.sratio + suffix
+	case "sType":
+		return prefix + d.sType + suffix
+	case "sTypeRN":
+		return prefix + relName(d.sType) + suffix
+	case "receiver":
+		return prefix + d.receiver + suffix
+	case "op":
+		return prefix + d.op + suffix
+
+	case "switch":
+		return expnSwitch("", "", true, suffix)
+	case "switchD":
+		return expnSwitch("", "", false, suffix)
+	case "switchS":
+		return expnSwitch("", "anyDType", false, suffix)
+
+	case "preOuter":
+		switch d.dType {
+		default:
+			return ";"
+		case "Image":
+			s := ""
+			if d.sType == "image.Image" {
+				s = "srcMask, smp := opts.SrcMask, opts.SrcMaskP\n"
+			}
+			return s +
+				"dstMask, dmp := opts.DstMask, opts.DstMaskP\n" +
+				"dstColorRGBA64 := &color.RGBA64{}\n" +
+				"dstColor := color.Color(dstColorRGBA64)"
+		}
+
+	case "preInner":
+		switch d.dType {
+		default:
+			return ";"
+		case "*image.RGBA":
+			return "d := " + pixOffset("dst", "dr.Min.X+adr.Min.X", "dr.Min.Y+int(dy)", "*4", "*dst.Stride")
+		}
+
+	case "preKernelOuter":
+		switch d.sType {
+		default:
+			return ";"
+		case "image.Image":
+			return "srcMask, smp := opts.SrcMask, opts.SrcMaskP"
+		}
+
+	case "preKernelInner":
+		switch d.dType {
+		default:
+			return ";"
+		case "*image.RGBA":
+			return "d := " + pixOffset("dst", "dr.Min.X+int(dx)", "dr.Min.Y+adr.Min.Y", "*4", "*dst.Stride")
+		}
+
+	case "blend":
+		args, _ := splitArgs(suffix)
+		if len(args) != 4 {
+			return ""
+		}
+		switch d.sType {
+		default:
+			return argf(args, ""+
+				"$3r = $0*$1r + $2*$3r\n"+
+				"$3g = $0*$1g + $2*$3g\n"+
+				"$3b = $0*$1b + $2*$3b\n"+
+				"$3a = $0*$1a + $2*$3a",
+			)
+		case "*image.Gray":
+			return argf(args, ""+
+				"$3r = $0*$1r + $2*$3r",
+			)
+		case "*image.YCbCr":
+			return argf(args, ""+
+				"$3r = $0*$1r + $2*$3r\n"+
+				"$3g = $0*$1g + $2*$3g\n"+
+				"$3b = $0*$1b + $2*$3b",
+			)
+		}
+
+	case "clampToAlpha":
+		if alwaysOpaque[d.sType] {
+			return ";"
+		}
+		// Go uses alpha-premultiplied color. The naive computation can lead to
+		// invalid colors, e.g. red > alpha, when some weights are negative.
+		return `
+			if pr > pa {
+				pr = pa
+			}
+			if pg > pa {
+				pg = pa
+			}
+			if pb > pa {
+				pb = pa
+			}
+		`
+
+	case "convFtou":
+		args, _ := splitArgs(suffix)
+		if len(args) != 2 {
+			return ""
+		}
+
+		switch d.sType {
+		default:
+			return argf(args, ""+
+				"$0r := uint32($1r)\n"+
+				"$0g := uint32($1g)\n"+
+				"$0b := uint32($1b)\n"+
+				"$0a := uint32($1a)",
+			)
+		case "*image.Gray":
+			return argf(args, ""+
+				"$0r := uint32($1r)",
+			)
+		case "*image.YCbCr":
+			return argf(args, ""+
+				"$0r := uint32($1r)\n"+
+				"$0g := uint32($1g)\n"+
+				"$0b := uint32($1b)",
+			)
+		}
+
+	case "outputu":
+		args, _ := splitArgs(suffix)
+		if len(args) != 3 {
+			return ""
+		}
+
+		switch d.op {
+		case "Over":
+			switch d.dType {
+			default:
+				log.Fatalf("bad dType %q", d.dType)
+			case "Image":
+				return argf(args, ""+
+					"qr, qg, qb, qa := dst.At($0, $1).RGBA()\n"+
+					"if dstMask != nil {\n"+
+					"	_, _, _, ma := dstMask.At(dmp.X + $0, dmp.Y + $1).RGBA()\n"+
+					"	$2r = $2r * ma / 0xffff\n"+
+					"	$2g = $2g * ma / 0xffff\n"+
+					"	$2b = $2b * ma / 0xffff\n"+
+					"	$2a = $2a * ma / 0xffff\n"+
+					"}\n"+
+					"$2a1 := 0xffff - $2a\n"+
+					"dstColorRGBA64.R = uint16(qr*$2a1/0xffff + $2r)\n"+
+					"dstColorRGBA64.G = uint16(qg*$2a1/0xffff + $2g)\n"+
+					"dstColorRGBA64.B = uint16(qb*$2a1/0xffff + $2b)\n"+
+					"dstColorRGBA64.A = uint16(qa*$2a1/0xffff + $2a)\n"+
+					"dst.Set($0, $1, dstColor)",
+				)
+			case "*image.RGBA":
+				return argf(args, ""+
+					"$2a1 := (0xffff - $2a) * 0x101\n"+
+					"dst.Pix[d+0] = uint8((uint32(dst.Pix[d+0])*$2a1/0xffff + $2r) >> 8)\n"+
+					"dst.Pix[d+1] = uint8((uint32(dst.Pix[d+1])*$2a1/0xffff + $2g) >> 8)\n"+
+					"dst.Pix[d+2] = uint8((uint32(dst.Pix[d+2])*$2a1/0xffff + $2b) >> 8)\n"+
+					"dst.Pix[d+3] = uint8((uint32(dst.Pix[d+3])*$2a1/0xffff + $2a) >> 8)",
+				)
+			}
+
+		case "Src":
+			switch d.dType {
+			default:
+				log.Fatalf("bad dType %q", d.dType)
+			case "Image":
+				return argf(args, ""+
+					"if dstMask != nil {\n"+
+					"	qr, qg, qb, qa := dst.At($0, $1).RGBA()\n"+
+					"	_, _, _, ma := dstMask.At(dmp.X + $0, dmp.Y + $1).RGBA()\n"+
+					"	pr = pr * ma / 0xffff\n"+
+					"	pg = pg * ma / 0xffff\n"+
+					"	pb = pb * ma / 0xffff\n"+
+					"	pa = pa * ma / 0xffff\n"+
+					"	$2a1 := 0xffff - ma\n"+ // Note that this is ma, not $2a.
+					"	dstColorRGBA64.R = uint16(qr*$2a1/0xffff + $2r)\n"+
+					"	dstColorRGBA64.G = uint16(qg*$2a1/0xffff + $2g)\n"+
+					"	dstColorRGBA64.B = uint16(qb*$2a1/0xffff + $2b)\n"+
+					"	dstColorRGBA64.A = uint16(qa*$2a1/0xffff + $2a)\n"+
+					"	dst.Set($0, $1, dstColor)\n"+
+					"} else {\n"+
+					"	dstColorRGBA64.R = uint16($2r)\n"+
+					"	dstColorRGBA64.G = uint16($2g)\n"+
+					"	dstColorRGBA64.B = uint16($2b)\n"+
+					"	dstColorRGBA64.A = uint16($2a)\n"+
+					"	dst.Set($0, $1, dstColor)\n"+
+					"}",
+				)
+			case "*image.RGBA":
+				switch d.sType {
+				default:
+					return argf(args, ""+
+						"dst.Pix[d+0] = uint8($2r >> 8)\n"+
+						"dst.Pix[d+1] = uint8($2g >> 8)\n"+
+						"dst.Pix[d+2] = uint8($2b >> 8)\n"+
+						"dst.Pix[d+3] = uint8($2a >> 8)",
+					)
+				case "*image.Gray":
+					return argf(args, ""+
+						"out := uint8($2r >> 8)\n"+
+						"dst.Pix[d+0] = out\n"+
+						"dst.Pix[d+1] = out\n"+
+						"dst.Pix[d+2] = out\n"+
+						"dst.Pix[d+3] = 0xff",
+					)
+				case "*image.YCbCr":
+					return argf(args, ""+
+						"dst.Pix[d+0] = uint8($2r >> 8)\n"+
+						"dst.Pix[d+1] = uint8($2g >> 8)\n"+
+						"dst.Pix[d+2] = uint8($2b >> 8)\n"+
+						"dst.Pix[d+3] = 0xff",
+					)
+				}
+			}
+		}
+
+	case "outputf":
+		args, _ := splitArgs(suffix)
+		if len(args) != 5 {
+			return ""
+		}
+		ret := ""
+
+		switch d.op {
+		case "Over":
+			switch d.dType {
+			default:
+				log.Fatalf("bad dType %q", d.dType)
+			case "Image":
+				ret = argf(args, ""+
+					"qr, qg, qb, qa := dst.At($0, $1).RGBA()\n"+
+					"$3r0 := uint32($2($3r * $4))\n"+
+					"$3g0 := uint32($2($3g * $4))\n"+
+					"$3b0 := uint32($2($3b * $4))\n"+
+					"$3a0 := uint32($2($3a * $4))\n"+
+					"if dstMask != nil {\n"+
+					"	_, _, _, ma := dstMask.At(dmp.X + $0, dmp.Y + $1).RGBA()\n"+
+					"	$3r0 = $3r0 * ma / 0xffff\n"+
+					"	$3g0 = $3g0 * ma / 0xffff\n"+
+					"	$3b0 = $3b0 * ma / 0xffff\n"+
+					"	$3a0 = $3a0 * ma / 0xffff\n"+
+					"}\n"+
+					"$3a1 := 0xffff - $3a0\n"+
+					"dstColorRGBA64.R = uint16(qr*$3a1/0xffff + $3r0)\n"+
+					"dstColorRGBA64.G = uint16(qg*$3a1/0xffff + $3g0)\n"+
+					"dstColorRGBA64.B = uint16(qb*$3a1/0xffff + $3b0)\n"+
+					"dstColorRGBA64.A = uint16(qa*$3a1/0xffff + $3a0)\n"+
+					"dst.Set($0, $1, dstColor)",
+				)
+			case "*image.RGBA":
+				ret = argf(args, ""+
+					"$3r0 := uint32($2($3r * $4))\n"+
+					"$3g0 := uint32($2($3g * $4))\n"+
+					"$3b0 := uint32($2($3b * $4))\n"+
+					"$3a0 := uint32($2($3a * $4))\n"+
+					"$3a1 := (0xffff - uint32($3a0)) * 0x101\n"+
+					"dst.Pix[d+0] = uint8((uint32(dst.Pix[d+0])*$3a1/0xffff + $3r0) >> 8)\n"+
+					"dst.Pix[d+1] = uint8((uint32(dst.Pix[d+1])*$3a1/0xffff + $3g0) >> 8)\n"+
+					"dst.Pix[d+2] = uint8((uint32(dst.Pix[d+2])*$3a1/0xffff + $3b0) >> 8)\n"+
+					"dst.Pix[d+3] = uint8((uint32(dst.Pix[d+3])*$3a1/0xffff + $3a0) >> 8)",
+				)
+			}
+
+		case "Src":
+			switch d.dType {
+			default:
+				log.Fatalf("bad dType %q", d.dType)
+			case "Image":
+				ret = argf(args, ""+
+					"if dstMask != nil {\n"+
+					"	qr, qg, qb, qa := dst.At($0, $1).RGBA()\n"+
+					"	_, _, _, ma := dstMask.At(dmp.X + $0, dmp.Y + $1).RGBA()\n"+
+					"	pr := uint32($2($3r * $4)) * ma / 0xffff\n"+
+					"	pg := uint32($2($3g * $4)) * ma / 0xffff\n"+
+					"	pb := uint32($2($3b * $4)) * ma / 0xffff\n"+
+					"	pa := uint32($2($3a * $4)) * ma / 0xffff\n"+
+					"	pa1 := 0xffff - ma\n"+ // Note that this is ma, not pa.
+					"	dstColorRGBA64.R = uint16(qr*pa1/0xffff + pr)\n"+
+					"	dstColorRGBA64.G = uint16(qg*pa1/0xffff + pg)\n"+
+					"	dstColorRGBA64.B = uint16(qb*pa1/0xffff + pb)\n"+
+					"	dstColorRGBA64.A = uint16(qa*pa1/0xffff + pa)\n"+
+					"	dst.Set($0, $1, dstColor)\n"+
+					"} else {\n"+
+					"	dstColorRGBA64.R = $2($3r * $4)\n"+
+					"	dstColorRGBA64.G = $2($3g * $4)\n"+
+					"	dstColorRGBA64.B = $2($3b * $4)\n"+
+					"	dstColorRGBA64.A = $2($3a * $4)\n"+
+					"	dst.Set($0, $1, dstColor)\n"+
+					"}",
+				)
+			case "*image.RGBA":
+				switch d.sType {
+				default:
+					ret = argf(args, ""+
+						"dst.Pix[d+0] = uint8($2($3r * $4) >> 8)\n"+
+						"dst.Pix[d+1] = uint8($2($3g * $4) >> 8)\n"+
+						"dst.Pix[d+2] = uint8($2($3b * $4) >> 8)\n"+
+						"dst.Pix[d+3] = uint8($2($3a * $4) >> 8)",
+					)
+				case "*image.Gray":
+					ret = argf(args, ""+
+						"out := uint8($2($3r * $4) >> 8)\n"+
+						"dst.Pix[d+0] = out\n"+
+						"dst.Pix[d+1] = out\n"+
+						"dst.Pix[d+2] = out\n"+
+						"dst.Pix[d+3] = 0xff",
+					)
+				case "*image.YCbCr":
+					ret = argf(args, ""+
+						"dst.Pix[d+0] = uint8($2($3r * $4) >> 8)\n"+
+						"dst.Pix[d+1] = uint8($2($3g * $4) >> 8)\n"+
+						"dst.Pix[d+2] = uint8($2($3b * $4) >> 8)\n"+
+						"dst.Pix[d+3] = 0xff",
+					)
+				}
+			}
+		}
+
+		return strings.Replace(ret, " * 1)", ")", -1)
+
+	case "srcf", "srcu":
+		lhs, eqOp := splitEq(prefix)
+		if lhs == "" {
+			return ""
+		}
+		args, extra := splitArgs(suffix)
+		if len(args) != 2 {
+			return ""
+		}
+
+		tmp := ""
+		if dollar == "srcf" {
+			tmp = "u"
+		}
+
+		// TODO: there's no need to multiply by 0x101 in the switch below if
+		// the next thing we're going to do is shift right by 8.
+
+		buf := new(bytes.Buffer)
+		switch d.sType {
+		default:
+			log.Fatalf("bad sType %q", d.sType)
+		case "image.Image":
+			fmt.Fprintf(buf, ""+
+				"%sr%s, %sg%s, %sb%s, %sa%s := src.At(%s, %s).RGBA()\n",
+				lhs, tmp, lhs, tmp, lhs, tmp, lhs, tmp, args[0], args[1],
+			)
+			if d.dType == "" || d.dType == "Image" {
+				fmt.Fprintf(buf, ""+
+					"if srcMask != nil {\n"+
+					"	_, _, _, ma := srcMask.At(smp.X+%s, smp.Y+%s).RGBA()\n"+
+					"	%sr%s = %sr%s * ma / 0xffff\n"+
+					"	%sg%s = %sg%s * ma / 0xffff\n"+
+					"	%sb%s = %sb%s * ma / 0xffff\n"+
+					"	%sa%s = %sa%s * ma / 0xffff\n"+
+					"}\n",
+					args[0], args[1],
+					lhs, tmp, lhs, tmp,
+					lhs, tmp, lhs, tmp,
+					lhs, tmp, lhs, tmp,
+					lhs, tmp, lhs, tmp,
+				)
+			}
+		case "*image.Gray":
+			fmt.Fprintf(buf, ""+
+				"%si := %s\n"+
+				"%sr%s := uint32(src.Pix[%si]) * 0x101\n",
+				lhs, pixOffset("src", args[0], args[1], "", "*src.Stride"),
+				lhs, tmp, lhs,
+			)
+		case "*image.NRGBA":
+			fmt.Fprintf(buf, ""+
+				"%si := %s\n"+
+				"%sa%s := uint32(src.Pix[%si+3]) * 0x101\n"+
+				"%sr%s := uint32(src.Pix[%si+0]) * %sa%s / 0xff\n"+
+				"%sg%s := uint32(src.Pix[%si+1]) * %sa%s / 0xff\n"+
+				"%sb%s := uint32(src.Pix[%si+2]) * %sa%s / 0xff\n",
+				lhs, pixOffset("src", args[0], args[1], "*4", "*src.Stride"),
+				lhs, tmp, lhs,
+				lhs, tmp, lhs, lhs, tmp,
+				lhs, tmp, lhs, lhs, tmp,
+				lhs, tmp, lhs, lhs, tmp,
+			)
+		case "*image.RGBA":
+			fmt.Fprintf(buf, ""+
+				"%si := %s\n"+
+				"%sr%s := uint32(src.Pix[%si+0]) * 0x101\n"+
+				"%sg%s := uint32(src.Pix[%si+1]) * 0x101\n"+
+				"%sb%s := uint32(src.Pix[%si+2]) * 0x101\n"+
+				"%sa%s := uint32(src.Pix[%si+3]) * 0x101\n",
+				lhs, pixOffset("src", args[0], args[1], "*4", "*src.Stride"),
+				lhs, tmp, lhs,
+				lhs, tmp, lhs,
+				lhs, tmp, lhs,
+				lhs, tmp, lhs,
+			)
+		case "*image.YCbCr":
+			fmt.Fprintf(buf, ""+
+				"%si := %s\n"+
+				"%sj := %s\n"+
+				"%s\n",
+				lhs, pixOffset("src", args[0], args[1], "", "*src.YStride"),
+				lhs, cOffset(args[0], args[1], d.sratio),
+				ycbcrToRGB(lhs, tmp),
+			)
+		}
+
+		if dollar == "srcf" {
+			switch d.sType {
+			default:
+				fmt.Fprintf(buf, ""+
+					"%sr %s float64(%sru)%s\n"+
+					"%sg %s float64(%sgu)%s\n"+
+					"%sb %s float64(%sbu)%s\n"+
+					"%sa %s float64(%sau)%s\n",
+					lhs, eqOp, lhs, extra,
+					lhs, eqOp, lhs, extra,
+					lhs, eqOp, lhs, extra,
+					lhs, eqOp, lhs, extra,
+				)
+			case "*image.Gray":
+				fmt.Fprintf(buf, ""+
+					"%sr %s float64(%sru)%s\n",
+					lhs, eqOp, lhs, extra,
+				)
+			case "*image.YCbCr":
+				fmt.Fprintf(buf, ""+
+					"%sr %s float64(%sru)%s\n"+
+					"%sg %s float64(%sgu)%s\n"+
+					"%sb %s float64(%sbu)%s\n",
+					lhs, eqOp, lhs, extra,
+					lhs, eqOp, lhs, extra,
+					lhs, eqOp, lhs, extra,
+				)
+			}
+		}
+
+		return strings.TrimSpace(buf.String())
+
+	case "tweakD":
+		if d.dType == "*image.RGBA" {
+			return "d += dst.Stride"
+		}
+		return ";"
+
+	case "tweakDx":
+		if d.dType == "*image.RGBA" {
+			return strings.Replace(prefix, "dx++", "dx, d = dx+1, d+4", 1)
+		}
+		return prefix
+
+	case "tweakDy":
+		if d.dType == "*image.RGBA" {
+			return strings.Replace(prefix, "for dy, s", "for _, s", 1)
+		}
+		return prefix
+
+	case "tweakP":
+		switch d.sType {
+		case "*image.Gray":
+			if strings.HasPrefix(strings.TrimSpace(prefix), "pa * ") {
+				return "1,"
+			}
+			return "pr,"
+		case "*image.YCbCr":
+			if strings.HasPrefix(strings.TrimSpace(prefix), "pa * ") {
+				return "1,"
+			}
+		}
+		return prefix
+
+	case "tweakPr":
+		if d.sType == "*image.Gray" {
+			return "pr *= s.invTotalWeightFFFF"
+		}
+		return ";"
+
+	case "tweakVarP":
+		switch d.sType {
+		case "*image.Gray":
+			return strings.Replace(prefix, "var pr, pg, pb, pa", "var pr", 1)
+		case "*image.YCbCr":
+			return strings.Replace(prefix, "var pr, pg, pb, pa", "var pr, pg, pb", 1)
+		}
+		return prefix
+	}
+	return ""
+}
+
+// expnSwitch expands a switch directive into nested Go switch statements that
+// pick, at run time, the specialised code produced by expanding template.
+//
+// The nesting is built recursively:
+//   - with an empty op (and dType other than "anyDType") it emits an outer
+//     "switch op" holding one case per entry of ops;
+//   - with an empty dType it emits a type switch on dst over dTypes, whose
+//     cases either recurse to also switch on src (expandBoth) or expand
+//     template directly;
+//   - otherwise it emits a type switch on src over sTypesForDType[dType].
+//
+// The "Image" (dst) and "image.Image" (src) entries become the default case
+// of their type switch. It returns the generated lines joined by newlines.
+func expnSwitch(op, dType string, expandBoth bool, template string) string {
+	// No operator chosen yet: fan out over every operator first.
+	if op == "" && dType != "anyDType" {
+		out := []string{"switch op {"}
+		for _, o := range ops {
+			out = append(out, fmt.Sprintf("case %s:", o))
+			out = append(out, expnSwitch(o, dType, expandBoth, template))
+		}
+		return strings.Join(append(out, "}"), "\n")
+	}
+
+	// A known dType means the dst type is already fixed and we are now
+	// switching on the src type.
+	onSrc := dType != ""
+	subject, fallback, candidates := "dst", "Image", dTypes
+	if onSrc {
+		subject, fallback, candidates = "src", "image.Image", sTypesForDType[dType]
+	}
+
+	out := []string{fmt.Sprintf("switch %s := %s.(type) {", subject, subject)}
+	for _, typ := range candidates {
+		// For always-opaque source types Over is equivalent to Src, so no
+		// dedicated Over case is generated for them.
+		if onSrc && op == "Over" && alwaysOpaque[typ] {
+			continue
+		}
+
+		label := fmt.Sprintf("case %s:", typ)
+		if typ == fallback {
+			label = "default:"
+		}
+		out = append(out, label)
+
+		switch {
+		case onSrc && typ == "*image.YCbCr":
+			out = append(out, expnSwitchYCbCr(op, dType, template))
+		case onSrc:
+			out = append(out, expnLine(template, &data{dType: dType, sType: typ, op: op}))
+		case expandBoth:
+			out = append(out, expnSwitch(op, typ, false, template))
+		default:
+			out = append(out, expnLine(template, &data{dType: typ, op: op}))
+		}
+	}
+
+	return strings.Join(append(out, "}"), "\n")
+}
+
+// expnSwitchYCbCr emits a "switch src.SubsampleRatio" statement for a
+// *image.YCbCr source. The default case expands template for the generic
+// image.Image source type; each entry of subsampleRatios gets its own case
+// expanding template for *image.YCbCr with that ratio.
+func expnSwitchYCbCr(op, dType, template string) string {
+	var out []string
+	out = append(out,
+		"switch src.SubsampleRatio {",
+		"default:",
+		expnLine(template, &data{dType: dType, sType: "image.Image", op: op}),
+	)
+	for _, ratio := range subsampleRatios {
+		label := fmt.Sprintf("case image.YCbCrSubsampleRatio%s:", ratio)
+		leaf := expnLine(template, &data{dType: dType, sType: "*image.YCbCr", sratio: ratio, op: op})
+		out = append(out, label, leaf)
+	}
+	out = append(out, "}")
+	return strings.Join(out, "\n")
+}
+
+// argf substitutes positional placeholders in s: every occurrence of "$0" is
+// replaced by args[0], "$1" by args[1], and so on, in index order. At most
+// nine arguments are accepted so that placeholders stay single-digit; more
+// than that panics.
+func argf(args []string, s string) string {
+	if len(args) > 9 {
+		panic("too many args")
+	}
+	for i := 0; i < len(args); i++ {
+		placeholder := "$" + fmt.Sprint(i)
+		s = strings.Replace(s, placeholder, args[i], -1)
+	}
+	return s
+}
+
+// pixOffset returns the source text of an expression computing the offset of
+// pixel (x, y) within image variable m: the row distance from m.Rect.Min.Y
+// followed by ystride, plus the column distance from m.Rect.Min.X followed by
+// xstride. xstride and ystride are appended verbatim (e.g. "*4", "*src.Stride",
+// or "").
+func pixOffset(m, x, y, xstride, ystride string) string {
+	return fmt.Sprintf(
+		"(%[3]s-%[1]s.Rect.Min.Y)%[5]s + (%[2]s-%[1]s.Rect.Min.X)%[4]s",
+		m, x, y, xstride, ystride,
+	)
+}
+
+// cOffset returns the source text of an expression indexing the chroma sample
+// of pixel (x, y) in src for the subsample ratio sratio ("444", "422", "420"
+// or "440"). A coordinate is either used at full resolution or halved,
+// depending on the ratio. Any other sratio yields an "unsupported sratio"
+// string instead of an expression.
+func cOffset(x, y, sratio string) string {
+	// Row and column terms, at full and at halved resolution. The padding
+	// inside the full-resolution forms is part of the emitted text.
+	fullRow := "( " + y + "    - src.Rect.Min.Y  )"
+	halfRow := "((" + y + ")/2 - src.Rect.Min.Y/2)"
+	fullCol := "( " + x + "    - src.Rect.Min.X  )"
+	halfCol := "((" + x + ")/2 - src.Rect.Min.X/2)"
+
+	var row, col string
+	switch sratio {
+	case "444":
+		row, col = fullRow, fullCol
+	case "422":
+		row, col = fullRow, halfCol
+	case "420":
+		row, col = halfRow, halfCol
+	case "440":
+		row, col = halfRow, fullCol
+	default:
+		return fmt.Sprintf("unsupported sratio %q", sratio)
+	}
+	return row + "*src.CStride + " + col
+}
+
+// ycbcrToRGB returns generated source that converts the YCbCr sample at
+// src.Y[<lhs>i] / src.Cb[<lhs>j] / src.Cr[<lhs>j] to 16-bit red, green and
+// blue values, each clamped to [0, 0xffff]. Every "$" in the snippet is
+// replaced by lhs (the variable-name prefix) and then every "@" by tmp (the
+// suffix of the result variables).
+func ycbcrToRGB(lhs, tmp string) string {
+	const snippet = `
+		// This is an inline version of image/color/ycbcr.go's YCbCr.RGBA method.
+		$yy1 := int(src.Y[$i]) * 0x10100
+		$cb1 := int(src.Cb[$j]) - 128
+		$cr1 := int(src.Cr[$j]) - 128
+		$r@ := ($yy1 + 91881*$cr1) >> 8
+		$g@ := ($yy1 - 22554*$cb1 - 46802*$cr1) >> 8
+		$b@ := ($yy1 + 116130*$cb1) >> 8
+		if $r@ < 0 {
+			$r@ = 0
+		} else if $r@ > 0xffff {
+			$r@ = 0xffff
+		}
+		if $g@ < 0 {
+			$g@ = 0
+		} else if $g@ > 0xffff {
+			$g@ = 0xffff
+		}
+		if $b@ < 0 {
+			$b@ = 0
+		} else if $b@ > 0xffff {
+			$b@ = 0xffff
+		}
+	`
+	// The prefix is substituted before the suffix, in two separate passes.
+	prefixed := strings.Replace(snippet, "$", lhs, -1)
+	return strings.Replace(prefixed, "@", tmp, -1)
+}
+
+// split cuts s around the first occurrence of sep and returns the text before
+// and after it, each with surrounding white space removed. If sep does not
+// occur in s, both results are empty.
+func split(s, sep string) (string, string) {
+	at := strings.Index(s, sep)
+	if at < 0 {
+		return "", ""
+	}
+	before, after := s[:at], s[at+len(sep):]
+	return strings.TrimSpace(before), strings.TrimSpace(after)
+}
+
+// splitEq extracts the left-hand side of an assignment-like prefix. It looks
+// for ":=" first and "+=" second, and returns the trimmed text before the
+// first operator that has a non-empty left-hand side, together with that
+// operator. If neither matches, both results are empty.
+func splitEq(s string) (lhs, eqOp string) {
+	trimmed := strings.TrimSpace(s)
+	for _, candidate := range [...]string{":=", "+="} {
+		if left, _ := split(trimmed, candidate); left != "" {
+			return left, candidate
+		}
+	}
+	return "", ""
+}
+
+// splitArgs parses a bracketed, comma-separated argument list at the start of
+// s (after trimming white space), e.g. "[a, b] * w". It returns the trimmed
+// arguments and, untrimmed, whatever follows the first closing bracket. If s
+// does not start with '[' or has no ']', it returns (nil, "").
+func splitArgs(s string) (args []string, extra string) {
+	trimmed := strings.TrimSpace(s)
+	if !strings.HasPrefix(trimmed, "[") {
+		return nil, ""
+	}
+	body := trimmed[1:]
+
+	end := strings.IndexByte(body, ']')
+	if end < 0 {
+		return nil, ""
+	}
+
+	args = strings.Split(body[:end], ",")
+	for n, a := range args {
+		args[n] = strings.TrimSpace(a)
+	}
+	return args, body[end+1:]
+}
+
+// relName returns the part of s after its last ".", or s itself when it
+// contains no "." (so "*image.RGBA" becomes "RGBA").
+func relName(s string) string {
+	// LastIndex yields -1 when there is no dot, which makes the slice start
+	// at 0 and return s unchanged.
+	return s[strings.LastIndex(s, ".")+1:]
+}
+
+const (
+	// codeRoot is the template for the exported Scale and Transform methods;
+	// $receiver is a placeholder for the receiver type. Both methods clip the
+	// affected destination rectangle, downgrade Over to Src for opaque,
+	// unmasked sources, and then dispatch:
+	//   - to the generic Image/Image leaf when a mask is set or sr is not
+	//     inside src.Bounds();
+	//   - to a uniform-colour shortcut for *image.Uniform sources;
+	//   - otherwise, through $switch, to a type-specialised leaf.
+	//
+	// Transform's "simplify to a Copy" shortcut applies when s2d is a pure
+	// integer translation (dx, dy). The destination point handed to Copy is
+	// then sr.Min + (dx, dy): X is based on sr.Min.X and Y on sr.Min.Y.
+	//
+	// NOTE(review): draw/impl.go is presumably produced from this text by
+	// "go run gen.go"; regenerate it after changing the template.
+	codeRoot = `
+		func (z $receiver) Scale(dst Image, dr image.Rectangle, src image.Image, sr image.Rectangle, op Op, opts *Options) {
+			// Try to simplify a Scale to a Copy.
+			if dr.Size() == sr.Size() {
+				Copy(dst, dr.Min, src, sr, op, opts)
+				return
+			}
+
+			var o Options
+			if opts != nil {
+				o = *opts
+			}
+
+			// adr is the affected destination pixels.
+			adr := dst.Bounds().Intersect(dr)
+			adr, o.DstMask = clipAffectedDestRect(adr, o.DstMask, o.DstMaskP)
+			if adr.Empty() || sr.Empty() {
+				return
+			}
+			// Make adr relative to dr.Min.
+			adr = adr.Sub(dr.Min)
+			if op == Over && o.SrcMask == nil && opaque(src) {
+				op = Src
+			}
+
+			// sr is the source pixels. If it extends beyond the src bounds,
+			// we cannot use the type-specific fast paths, as they access
+			// the Pix fields directly without bounds checking.
+			//
+			// Similarly, the fast paths assume that the masks are nil.
+			if o.DstMask != nil || o.SrcMask != nil || !sr.In(src.Bounds()) {
+				switch op {
+				case Over:
+					z.scale_Image_Image_Over(dst, dr, adr, src, sr, &o)
+				case Src:
+					z.scale_Image_Image_Src(dst, dr, adr, src, sr, &o)
+				}
+			} else if _, ok := src.(*image.Uniform); ok {
+				Draw(dst, dr, src, src.Bounds().Min, op)
+			} else {
+				$switch z.scale_$dTypeRN_$sTypeRN$sratio_$op(dst, dr, adr, src, sr, &o)
+			}
+		}
+
+		func (z $receiver) Transform(dst Image, s2d f64.Aff3, src image.Image, sr image.Rectangle, op Op, opts *Options) {
+			// Try to simplify a Transform to a Copy.
+			if s2d[0] == 1 && s2d[1] == 0 && s2d[3] == 0 && s2d[4] == 1 {
+				dx := int(s2d[2])
+				dy := int(s2d[5])
+				if float64(dx) == s2d[2] && float64(dy) == s2d[5] {
+					Copy(dst, image.Point{X: sr.Min.X + dx, Y: sr.Min.Y + dy}, src, sr, op, opts)
+					return
+				}
+			}
+
+			var o Options
+			if opts != nil {
+				o = *opts
+			}
+
+			dr := transformRect(&s2d, &sr)
+			// adr is the affected destination pixels.
+			adr := dst.Bounds().Intersect(dr)
+			adr, o.DstMask = clipAffectedDestRect(adr, o.DstMask, o.DstMaskP)
+			if adr.Empty() || sr.Empty() {
+				return
+			}
+			if op == Over && o.SrcMask == nil && opaque(src) {
+				op = Src
+			}
+
+			d2s := invert(&s2d)
+			// bias is a translation of the mapping from dst coordinates to src
+			// coordinates such that the latter temporarily have non-negative X
+			// and Y coordinates. This allows us to write int(f) instead of
+			// int(math.Floor(f)), since "round to zero" and "round down" are
+			// equivalent when f >= 0, but the former is much cheaper. The X--
+			// and Y-- are because the TransformLeaf methods have a "sx -= 0.5"
+			// adjustment.
+			bias := transformRect(&d2s, &adr).Min
+			bias.X--
+			bias.Y--
+			d2s[2] -= float64(bias.X)
+			d2s[5] -= float64(bias.Y)
+			// Make adr relative to dr.Min.
+			adr = adr.Sub(dr.Min)
+			// sr is the source pixels. If it extends beyond the src bounds,
+			// we cannot use the type-specific fast paths, as they access
+			// the Pix fields directly without bounds checking.
+			//
+			// Similarly, the fast paths assume that the masks are nil.
+			if o.DstMask != nil || o.SrcMask != nil || !sr.In(src.Bounds()) {
+				switch op {
+				case Over:
+					z.transform_Image_Image_Over(dst, dr, adr, &d2s, src, sr, bias, &o)
+				case Src:
+					z.transform_Image_Image_Src(dst, dr, adr, &d2s, src, sr, bias, &o)
+				}
+			} else if u, ok := src.(*image.Uniform); ok {
+				transform_Uniform(dst, dr, adr, &d2s, u, sr, bias, op)
+			} else {
+				$switch z.transform_$dTypeRN_$sTypeRN$sratio_$op(dst, dr, adr, &d2s, src, sr, bias, &o)
+			}
+		}
+	`
+
+	// codeNNScaleLeaf is the template for nnInterpolator's scale_* leaf
+	// methods. Every destination pixel in adr is filled from exactly one
+	// source pixel: (sx, sy) is the destination pixel's centre, (2*d+1)/2,
+	// mapped into the sr.Dx() x sr.Dy() grid with integer arithmetic.
+	//
+	// $-prefixed tokens ($dTypeRN, $sTypeRN, $srcu, $outputu, $tweakDx, ...)
+	// are placeholders that the generator substitutes per destination type,
+	// source type and operator.
+	codeNNScaleLeaf = `
+		func (nnInterpolator) scale_$dTypeRN_$sTypeRN$sratio_$op(dst $dType, dr, adr image.Rectangle, src $sType, sr image.Rectangle, opts *Options) {
+			dw2 := uint64(dr.Dx()) * 2
+			dh2 := uint64(dr.Dy()) * 2
+			sw := uint64(sr.Dx())
+			sh := uint64(sr.Dy())
+			$preOuter
+			for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+				sy := (2*uint64(dy) + 1) * sh / dh2
+				$preInner
+				for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx++ { $tweakDx
+					sx := (2*uint64(dx) + 1) * sw / dw2
+					p := $srcu[sr.Min.X + int(sx), sr.Min.Y + int(sy)]
+					$outputu[dr.Min.X + int(dx), dr.Min.Y + int(dy), p]
+				}
+			}
+		}
+	`
+
+	// codeNNTransformLeaf is the template for nnInterpolator's transform_*
+	// leaf methods. The centre of each destination pixel in adr is mapped
+	// through the dst-to-src matrix d2s, truncated to an integer source
+	// coordinate and shifted by bias; destination pixels whose source point
+	// falls outside sr are left untouched, the others receive that single
+	// source pixel.
+	codeNNTransformLeaf = `
+		func (nnInterpolator) transform_$dTypeRN_$sTypeRN$sratio_$op(dst $dType, dr, adr image.Rectangle, d2s *f64.Aff3, src $sType, sr image.Rectangle, bias image.Point, opts *Options) {
+			$preOuter
+			for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+				dyf := float64(dr.Min.Y + int(dy)) + 0.5
+				$preInner
+				for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx++ { $tweakDx
+					dxf := float64(dr.Min.X + int(dx)) + 0.5
+					sx0 := int(d2s[0]*dxf + d2s[1]*dyf + d2s[2]) + bias.X
+					sy0 := int(d2s[3]*dxf + d2s[4]*dyf + d2s[5]) + bias.Y
+					if !(image.Point{sx0, sy0}).In(sr) {
+						continue
+					}
+					p := $srcu[sx0, sy0]
+					$outputu[dr.Min.X + int(dx), dr.Min.Y + int(dy), p]
+				}
+			}
+		}
+	`
+
+	// codeABLScaleLeaf is the template for ablInterpolator's scale_* leaf
+	// methods. For each destination pixel it computes a fractional source
+	// position (sx, sy), takes the two neighbouring source columns (sx0, sx1)
+	// and rows (sy0, sy1) with weights xFrac*/yFrac*, and combines the four
+	// samples s00, s10, s01 and s11 through three $blend steps before
+	// converting and writing the result.
+	//
+	// Positions before the first or past the last source row/column are
+	// clamped to that edge, with the weights set so only the edge sample
+	// contributes.
+	//
+	// NOTE(review): $blend[w0, a, w1, b] presumably stores w0*a + w1*b into b;
+	// confirm against the $blend expansion, which is defined elsewhere.
+	codeABLScaleLeaf = `
+		func (ablInterpolator) scale_$dTypeRN_$sTypeRN$sratio_$op(dst $dType, dr, adr image.Rectangle, src $sType, sr image.Rectangle, opts *Options) {
+			sw := int32(sr.Dx())
+			sh := int32(sr.Dy())
+			yscale := float64(sh) / float64(dr.Dy())
+			xscale := float64(sw) / float64(dr.Dx())
+			swMinus1, shMinus1 := sw - 1, sh - 1
+			$preOuter
+
+			for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+				sy := (float64(dy)+0.5)*yscale - 0.5
+				// If sy < 0, we will clamp sy0 to 0 anyway, so it doesn't matter if
+				// we say int32(sy) instead of int32(math.Floor(sy)). Similarly for
+				// sx, below.
+				sy0 := int32(sy)
+				yFrac0 := sy - float64(sy0)
+				yFrac1 := 1 - yFrac0
+				sy1 := sy0 + 1
+				if sy < 0 {
+					sy0, sy1 = 0, 0
+					yFrac0, yFrac1 = 0, 1
+				} else if sy1 > shMinus1 {
+					sy0, sy1 = shMinus1, shMinus1
+					yFrac0, yFrac1 = 1, 0
+				}
+				$preInner
+
+				for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx++ { $tweakDx
+					sx := (float64(dx)+0.5)*xscale - 0.5
+					sx0 := int32(sx)
+					xFrac0 := sx - float64(sx0)
+					xFrac1 := 1 - xFrac0
+					sx1 := sx0 + 1
+					if sx < 0 {
+						sx0, sx1 = 0, 0
+						xFrac0, xFrac1 = 0, 1
+					} else if sx1 > swMinus1 {
+						sx0, sx1 = swMinus1, swMinus1
+						xFrac0, xFrac1 = 1, 0
+					}
+
+					s00 := $srcf[sr.Min.X + int(sx0), sr.Min.Y + int(sy0)]
+					s10 := $srcf[sr.Min.X + int(sx1), sr.Min.Y + int(sy0)]
+					$blend[xFrac1, s00, xFrac0, s10]
+					s01 := $srcf[sr.Min.X + int(sx0), sr.Min.Y + int(sy1)]
+					s11 := $srcf[sr.Min.X + int(sx1), sr.Min.Y + int(sy1)]
+					$blend[xFrac1, s01, xFrac0, s11]
+					$blend[yFrac1, s10, yFrac0, s11]
+					$convFtou[p, s11]
+					$outputu[dr.Min.X + int(dx), dr.Min.Y + int(dy), p]
+				}
+			}
+		}
+	`
+
+	// codeABLTransformLeaf is the template for ablInterpolator's transform_*
+	// leaf methods. The centre of each destination pixel is mapped through
+	// d2s; pixels whose (biased, truncated) source point lies outside sr are
+	// skipped. Otherwise the position is shifted by -0.5, split into an
+	// integer part (sx0, sy0) and fractional weights, un-biased, clamped to
+	// sr, and the four surrounding samples are combined through three $blend
+	// steps as in the scale leaf.
+	//
+	// The fractional parts are taken before bias is added back, while the
+	// values are still non-negative, so int() truncation is used in place of
+	// a floor.
+	codeABLTransformLeaf = `
+		func (ablInterpolator) transform_$dTypeRN_$sTypeRN$sratio_$op(dst $dType, dr, adr image.Rectangle, d2s *f64.Aff3, src $sType, sr image.Rectangle, bias image.Point, opts *Options) {
+			$preOuter
+			for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+				dyf := float64(dr.Min.Y + int(dy)) + 0.5
+				$preInner
+				for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx++ { $tweakDx
+					dxf := float64(dr.Min.X + int(dx)) + 0.5
+					sx := d2s[0]*dxf + d2s[1]*dyf + d2s[2]
+					sy := d2s[3]*dxf + d2s[4]*dyf + d2s[5]
+					if !(image.Point{int(sx) + bias.X, int(sy) + bias.Y}).In(sr) {
+						continue
+					}
+
+					sx -= 0.5
+					sx0 := int(sx)
+					xFrac0 := sx - float64(sx0)
+					xFrac1 := 1 - xFrac0
+					sx0 += bias.X
+					sx1 := sx0 + 1
+					if sx0 < sr.Min.X {
+						sx0, sx1 = sr.Min.X, sr.Min.X
+						xFrac0, xFrac1 = 0, 1
+					} else if sx1 >= sr.Max.X {
+						sx0, sx1 = sr.Max.X-1, sr.Max.X-1
+						xFrac0, xFrac1 = 1, 0
+					}
+
+					sy -= 0.5
+					sy0 := int(sy)
+					yFrac0 := sy - float64(sy0)
+					yFrac1 := 1 - yFrac0
+					sy0 += bias.Y
+					sy1 := sy0 + 1
+					if sy0 < sr.Min.Y {
+						sy0, sy1 = sr.Min.Y, sr.Min.Y
+						yFrac0, yFrac1 = 0, 1
+					} else if sy1 >= sr.Max.Y {
+						sy0, sy1 = sr.Max.Y-1, sr.Max.Y-1
+						yFrac0, yFrac1 = 1, 0
+					}
+
+					s00 := $srcf[sx0, sy0]
+					s10 := $srcf[sx1, sy0]
+					$blend[xFrac1, s00, xFrac0, s10]
+					s01 := $srcf[sx0, sy1]
+					s11 := $srcf[sx1, sy1]
+					$blend[xFrac1, s01, xFrac0, s11]
+					$blend[yFrac1, s10, yFrac0, s11]
+					$convFtou[p, s11]
+					$outputu[dr.Min.X + int(dx), dr.Min.Y + int(dy), p]
+				}
+			}
+		}
+	`
+
+	// codeKernelRoot is the template for (*kernelScaler).Scale and
+	// (*Kernel).Transform.
+	//
+	// Scale falls back to z.kernel.Scale when the rectangle sizes differ from
+	// the ones recorded in the scaler (z.dw, z.dh, z.sw, z.sh). Otherwise it
+	// runs a horizontal pass from src into a temporary buffer (scaleX_*)
+	// followed by a vertical pass from that buffer into dst (scaleY_*),
+	// taking the buffer from z.pool when the pool has a New function.
+	//
+	// Both methods short-circuit *image.Uniform sources, but only when neither
+	// mask is set and sr lies inside src.Bounds(): Draw and transform_Uniform
+	// are called without the masks, so they must not be used when one is set.
+	//
+	// NOTE(review): draw/impl.go is presumably produced from this text by
+	// "go run gen.go"; regenerate it after changing the template.
+	codeKernelRoot = `
+		func (z *kernelScaler) Scale(dst Image, dr image.Rectangle, src image.Image, sr image.Rectangle, op Op, opts *Options) {
+			if z.dw != int32(dr.Dx()) || z.dh != int32(dr.Dy()) || z.sw != int32(sr.Dx()) || z.sh != int32(sr.Dy()) {
+				z.kernel.Scale(dst, dr, src, sr, op, opts)
+				return
+			}
+
+			var o Options
+			if opts != nil {
+				o = *opts
+			}
+
+			// adr is the affected destination pixels.
+			adr := dst.Bounds().Intersect(dr)
+			adr, o.DstMask = clipAffectedDestRect(adr, o.DstMask, o.DstMaskP)
+			if adr.Empty() || sr.Empty() {
+				return
+			}
+			// Make adr relative to dr.Min.
+			adr = adr.Sub(dr.Min)
+			if op == Over && o.SrcMask == nil && opaque(src) {
+				op = Src
+			}
+
+			if _, ok := src.(*image.Uniform); ok && o.DstMask == nil && o.SrcMask == nil && sr.In(src.Bounds()) {
+				Draw(dst, dr, src, src.Bounds().Min, op)
+				return
+			}
+
+			// Create a temporary buffer:
+			// scaleX distributes the source image's columns over the temporary image.
+			// scaleY distributes the temporary image's rows over the destination image.
+			var tmp [][4]float64
+			if z.pool.New != nil {
+				tmpp := z.pool.Get().(*[][4]float64)
+				defer z.pool.Put(tmpp)
+				tmp = *tmpp
+			} else {
+				tmp = z.makeTmpBuf()
+			}
+
+			// sr is the source pixels. If it extends beyond the src bounds,
+			// we cannot use the type-specific fast paths, as they access
+			// the Pix fields directly without bounds checking.
+			//
+			// Similarly, the fast paths assume that the masks are nil.
+			if o.SrcMask != nil || !sr.In(src.Bounds()) {
+				z.scaleX_Image(tmp, src, sr, &o)
+			} else {
+				$switchS z.scaleX_$sTypeRN$sratio(tmp, src, sr, &o)
+			}
+
+			if o.DstMask != nil {
+				switch op {
+				case Over:
+					z.scaleY_Image_Over(dst, dr, adr, tmp, &o)
+				case Src:
+					z.scaleY_Image_Src(dst, dr, adr, tmp, &o)
+				}
+			} else {
+				$switchD z.scaleY_$dTypeRN_$op(dst, dr, adr, tmp, &o)
+			}
+		}
+
+		func (q *Kernel) Transform(dst Image, s2d f64.Aff3, src image.Image, sr image.Rectangle, op Op, opts *Options) {
+			var o Options
+			if opts != nil {
+				o = *opts
+			}
+
+			dr := transformRect(&s2d, &sr)
+			// adr is the affected destination pixels.
+			adr := dst.Bounds().Intersect(dr)
+			adr, o.DstMask = clipAffectedDestRect(adr, o.DstMask, o.DstMaskP)
+			if adr.Empty() || sr.Empty() {
+				return
+			}
+			if op == Over && o.SrcMask == nil && opaque(src) {
+				op = Src
+			}
+			d2s := invert(&s2d)
+			// bias is a translation of the mapping from dst coordinates to src
+			// coordinates such that the latter temporarily have non-negative X
+			// and Y coordinates. This allows us to write int(f) instead of
+			// int(math.Floor(f)), since "round to zero" and "round down" are
+			// equivalent when f >= 0, but the former is much cheaper. The X--
+			// and Y-- are because the TransformLeaf methods have a "sx -= 0.5"
+			// adjustment.
+			bias := transformRect(&d2s, &adr).Min
+			bias.X--
+			bias.Y--
+			d2s[2] -= float64(bias.X)
+			d2s[5] -= float64(bias.Y)
+			// Make adr relative to dr.Min.
+			adr = adr.Sub(dr.Min)
+
+			if u, ok := src.(*image.Uniform); ok && o.DstMask == nil && o.SrcMask == nil && sr.In(src.Bounds()) {
+				transform_Uniform(dst, dr, adr, &d2s, u, sr, bias, op)
+				return
+			}
+
+			xscale := abs(d2s[0])
+			if s := abs(d2s[1]); xscale < s {
+				xscale = s
+			}
+			yscale := abs(d2s[3])
+			if s := abs(d2s[4]); yscale < s {
+				yscale = s
+			}
+
+			// sr is the source pixels. If it extends beyond the src bounds,
+			// we cannot use the type-specific fast paths, as they access
+			// the Pix fields directly without bounds checking.
+			//
+			// Similarly, the fast paths assume that the masks are nil.
+			if o.DstMask != nil || o.SrcMask != nil || !sr.In(src.Bounds()) {
+				switch op {
+				case Over:
+					q.transform_Image_Image_Over(dst, dr, adr, &d2s, src, sr, bias, xscale, yscale, &o)
+				case Src:
+					q.transform_Image_Image_Src(dst, dr, adr, &d2s, src, sr, bias, xscale, yscale, &o)
+				}
+			} else {
+				$switch q.transform_$dTypeRN_$sTypeRN$sratio_$op(dst, dr, adr, &d2s, src, sr, bias, xscale, yscale, &o)
+			}
+		}
+	`
+
+	// codeKernelScaleLeafX is the template for kernelScaler's horizontal pass.
+	// For every source row y and every entry s of z.horizontal.sources it
+	// accumulates the source pixels listed in z.horizontal.contribs[s.i:s.j],
+	// each multiplied by its weight, and stores the four channels, scaled by
+	// s.invTotalWeightFFFF, in the next slot of tmp.
+	//
+	// $tweakVarP, $tweakPr and $tweakP let the generator adapt the channel
+	// variables for *image.Gray and *image.YCbCr sources, which do not read
+	// all four channels.
+	codeKernelScaleLeafX = `
+		func (z *kernelScaler) scaleX_$sTypeRN$sratio(tmp [][4]float64, src $sType, sr image.Rectangle, opts *Options) {
+			t := 0
+			$preKernelOuter
+			for y := int32(0); y < z.sh; y++ {
+				for _, s := range z.horizontal.sources {
+					var pr, pg, pb, pa float64 $tweakVarP
+					for _, c := range z.horizontal.contribs[s.i:s.j] {
+						p += $srcf[sr.Min.X + int(c.coord), sr.Min.Y + int(y)] * c.weight
+					}
+					$tweakPr
+					tmp[t] = [4]float64{
+						pr * s.invTotalWeightFFFF, $tweakP
+						pg * s.invTotalWeightFFFF, $tweakP
+						pb * s.invTotalWeightFFFF, $tweakP
+						pa * s.invTotalWeightFFFF, $tweakP
+					}
+					t++
+				}
+			}
+		}
+	`
+
+	// codeKernelScaleLeafY is the template for kernelScaler's vertical pass.
+	// For every destination column dx in adr and every entry s of
+	// z.vertical.sources[adr.Min.Y:adr.Max.Y] it accumulates the rows of tmp
+	// listed in z.vertical.contribs[s.i:s.j] (tmp is indexed as
+	// c.coord*z.dw+dx), weighted by c.weight, and writes the result to dst
+	// through $outputf with s.invTotalWeight as the final scale factor.
+	//
+	// For *image.RGBA destinations $tweakDy drops the unused dy loop variable
+	// and $tweakD advances the pixel offset d by dst.Stride after each row.
+	codeKernelScaleLeafY = `
+		func (z *kernelScaler) scaleY_$dTypeRN_$op(dst $dType, dr, adr image.Rectangle, tmp [][4]float64, opts *Options) {
+			$preOuter
+			for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx++ {
+				$preKernelInner
+				for dy, s := range z.vertical.sources[adr.Min.Y:adr.Max.Y] { $tweakDy
+					var pr, pg, pb, pa float64
+					for _, c := range z.vertical.contribs[s.i:s.j] {
+						p := &tmp[c.coord*z.dw+dx]
+						pr += p[0] * c.weight
+						pg += p[1] * c.weight
+						pb += p[2] * c.weight
+						pa += p[3] * c.weight
+					}
+					$clampToAlpha
+					$outputf[dr.Min.X + int(dx), dr.Min.Y + int(adr.Min.Y + dy), ftou, p, s.invTotalWeight]
+					$tweakD
+				}
+			}
+		}
+	`
+
+	// codeKernelTransformLeaf is the template for Kernel's transform_* leaf
+	// methods. For each destination pixel whose mapped source point lies in
+	// sr, it:
+	//   1. finds the source columns [ix, jx) and rows [iy, jy) within the
+	//      kernel's half-width of the sample point, clipped to sr;
+	//   2. evaluates q.At for each of them and normalises the X and Y weights
+	//      so that each set sums to one;
+	//   3. accumulates every source pixel in that window, weighted by the
+	//      product of its X and Y weight (zero weights are skipped);
+	//   4. clamps and writes the result through $clampToAlpha and $outputf.
+	//
+	// xWeights and yWeights are scratch slices allocated once per call and
+	// reused for every destination pixel.
+	codeKernelTransformLeaf = `
+		func (q *Kernel) transform_$dTypeRN_$sTypeRN$sratio_$op(dst $dType, dr, adr image.Rectangle, d2s *f64.Aff3, src $sType, sr image.Rectangle, bias image.Point, xscale, yscale float64, opts *Options) {
+			// When shrinking, broaden the effective kernel support so that we still
+			// visit every source pixel.
+			xHalfWidth, xKernelArgScale := q.Support, 1.0
+			if xscale > 1 {
+				xHalfWidth *= xscale
+				xKernelArgScale = 1 / xscale
+			}
+			yHalfWidth, yKernelArgScale := q.Support, 1.0
+			if yscale > 1 {
+				yHalfWidth *= yscale
+				yKernelArgScale = 1 / yscale
+			}
+
+			xWeights := make([]float64, 1 + 2*int(math.Ceil(xHalfWidth)))
+			yWeights := make([]float64, 1 + 2*int(math.Ceil(yHalfWidth)))
+
+			$preOuter
+			for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+				dyf := float64(dr.Min.Y + int(dy)) + 0.5
+				$preInner
+				for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx++ { $tweakDx
+					dxf := float64(dr.Min.X + int(dx)) + 0.5
+					sx := d2s[0]*dxf + d2s[1]*dyf + d2s[2]
+					sy := d2s[3]*dxf + d2s[4]*dyf + d2s[5]
+					if !(image.Point{int(sx) + bias.X, int(sy) + bias.Y}).In(sr) {
+						continue
+					}
+
+					// TODO: adjust the bias so that we can use int(f) instead
+					// of math.Floor(f) and math.Ceil(f).
+					sx += float64(bias.X)
+					sx -= 0.5
+					ix := int(math.Floor(sx - xHalfWidth))
+					if ix < sr.Min.X {
+						ix = sr.Min.X
+					}
+					jx := int(math.Ceil(sx + xHalfWidth))
+					if jx > sr.Max.X {
+						jx = sr.Max.X
+					}
+
+					totalXWeight := 0.0
+					for kx := ix; kx < jx; kx++ {
+						xWeight := 0.0
+						if t := abs((sx - float64(kx)) * xKernelArgScale); t < q.Support {
+							xWeight = q.At(t)
+						}
+						xWeights[kx - ix] = xWeight
+						totalXWeight += xWeight
+					}
+					for x := range xWeights[:jx-ix] {
+						xWeights[x] /= totalXWeight
+					}
+
+					sy += float64(bias.Y)
+					sy -= 0.5
+					iy := int(math.Floor(sy - yHalfWidth))
+					if iy < sr.Min.Y {
+						iy = sr.Min.Y
+					}
+					jy := int(math.Ceil(sy + yHalfWidth))
+					if jy > sr.Max.Y {
+						jy = sr.Max.Y
+					}
+
+					totalYWeight := 0.0
+					for ky := iy; ky < jy; ky++ {
+						yWeight := 0.0
+						if t := abs((sy - float64(ky)) * yKernelArgScale); t < q.Support {
+							yWeight = q.At(t)
+						}
+						yWeights[ky - iy] = yWeight
+						totalYWeight += yWeight
+					}
+					for y := range yWeights[:jy-iy] {
+						yWeights[y] /= totalYWeight
+					}
+
+					var pr, pg, pb, pa float64 $tweakVarP
+					for ky := iy; ky < jy; ky++ {
+						if yWeight := yWeights[ky - iy]; yWeight != 0 {
+							for kx := ix; kx < jx; kx++ {
+								if w := xWeights[kx - ix] * yWeight; w != 0 {
+									p += $srcf[kx, ky] * w
+								}
+							}
+						}
+					}
+					$clampToAlpha
+					$outputf[dr.Min.X + int(dx), dr.Min.Y + int(dy), fffftou, p, 1]
+				}
+			}
+		}
+	`
+)
diff --git a/draw/impl.go b/draw/impl.go
new file mode 100644
index 0000000..d6484d7
--- /dev/null
+++ b/draw/impl.go
@@ -0,0 +1,6668 @@
+// generated by "go run gen.go". DO NOT EDIT.
+
+package draw
+
+import (
+	"image"
+	"image/color"
+	"math"
+
+	"golang.org/x/image/math/f64"
+)
+
+func (z nnInterpolator) Scale(dst Image, dr image.Rectangle, src image.Image, sr image.Rectangle, op Op, opts *Options) {
+	// Scale draws the sr portion of src onto the dr portion of dst. When both rectangles have the same size, no resampling is needed, so defer to Copy.
+	if dr.Size() == sr.Size() {
+		Copy(dst, dr.Min, src, sr, op, opts)
+		return
+	}
+
+	var o Options // Local copy, so that the caller's *opts is never modified (opts may also be nil).
+	if opts != nil {
+		o = *opts
+	}
+
+	// adr is the affected destination pixels.
+	adr := dst.Bounds().Intersect(dr)
+	adr, o.DstMask = clipAffectedDestRect(adr, o.DstMask, o.DstMaskP)
+	if adr.Empty() || sr.Empty() {
+		return
+	}
+	// Make adr relative to dr.Min.
+	adr = adr.Sub(dr.Min)
+	if op == Over && o.SrcMask == nil && opaque(src) {
+		op = Src // NOTE(review): presumably Over with an unmasked, fully opaque source equals Src; depends on opaque(), defined elsewhere.
+	}
+
+	// sr is the source pixels. If it extends beyond the src bounds,
+	// we cannot use the type-specific fast paths, as they access
+	// the Pix fields directly without bounds checking.
+	//
+	// Similarly, the fast paths assume that the masks are nil.
+	if o.DstMask != nil || o.SrcMask != nil || !sr.In(src.Bounds()) {
+		switch op {
+		case Over:
+			z.scale_Image_Image_Over(dst, dr, adr, src, sr, &o)
+		case Src:
+			z.scale_Image_Image_Src(dst, dr, adr, src, sr, &o)
+		}
+	} else if _, ok := src.(*image.Uniform); ok {
+		Draw(dst, dr, src, src.Bounds().Min, op) // A uniform source needs no sampling: hand dr straight to Draw.
+	} else {
+		switch op {
+		case Over:
+			switch dst := dst.(type) {
+			case *image.RGBA:
+				switch src := src.(type) {
+				case *image.NRGBA:
+					z.scale_RGBA_NRGBA_Over(dst, dr, adr, src, sr, &o)
+				case *image.RGBA:
+					z.scale_RGBA_RGBA_Over(dst, dr, adr, src, sr, &o)
+				default:
+					z.scale_RGBA_Image_Over(dst, dr, adr, src, sr, &o)
+				}
+			default:
+				switch src := src.(type) {
+				default:
+					z.scale_Image_Image_Over(dst, dr, adr, src, sr, &o)
+				}
+			}
+		case Src:
+			switch dst := dst.(type) {
+			case *image.RGBA:
+				switch src := src.(type) {
+				case *image.Gray:
+					z.scale_RGBA_Gray_Src(dst, dr, adr, src, sr, &o)
+				case *image.NRGBA:
+					z.scale_RGBA_NRGBA_Src(dst, dr, adr, src, sr, &o)
+				case *image.RGBA:
+					z.scale_RGBA_RGBA_Src(dst, dr, adr, src, sr, &o)
+				case *image.YCbCr:
+					switch src.SubsampleRatio {
+					default:
+						z.scale_RGBA_Image_Src(dst, dr, adr, src, sr, &o) // No specialized leaf for this subsample ratio: use the generic image.Image source path.
+					case image.YCbCrSubsampleRatio444:
+						z.scale_RGBA_YCbCr444_Src(dst, dr, adr, src, sr, &o)
+					case image.YCbCrSubsampleRatio422:
+						z.scale_RGBA_YCbCr422_Src(dst, dr, adr, src, sr, &o)
+					case image.YCbCrSubsampleRatio420:
+						z.scale_RGBA_YCbCr420_Src(dst, dr, adr, src, sr, &o)
+					case image.YCbCrSubsampleRatio440:
+						z.scale_RGBA_YCbCr440_Src(dst, dr, adr, src, sr, &o)
+					}
+				default:
+					z.scale_RGBA_Image_Src(dst, dr, adr, src, sr, &o)
+				}
+			default:
+				switch src := src.(type) {
+				default:
+					z.scale_Image_Image_Src(dst, dr, adr, src, sr, &o)
+				}
+			}
+		}
+	}
+}
+
+func (z nnInterpolator) Transform(dst Image, s2d f64.Aff3, src image.Image, sr image.Rectangle, op Op, opts *Options) {
+	// Transform draws src's sr onto dst through the affine source-to-dst map s2d. A pure integer translation is simplified to a Copy.
+	if s2d[0] == 1 && s2d[1] == 0 && s2d[3] == 0 && s2d[4] == 1 {
+		dx := int(s2d[2])
+		dy := int(s2d[5])
+		if float64(dx) == s2d[2] && float64(dy) == s2d[5] {
+			Copy(dst, image.Point{X: sr.Min.X + dx, Y: sr.Min.Y + dy}, src, sr, op, opts) // sr.Min translates to sr.Min + (dx, dy), each axis using its own coordinate.
+			return
+		}
+	}
+
+	var o Options // Local copy, so that the caller's *opts is never modified (opts may also be nil).
+	if opts != nil {
+		o = *opts
+	}
+
+	dr := transformRect(&s2d, &sr)
+	// adr is the affected destination pixels.
+	adr := dst.Bounds().Intersect(dr)
+	adr, o.DstMask = clipAffectedDestRect(adr, o.DstMask, o.DstMaskP)
+	if adr.Empty() || sr.Empty() {
+		return
+	}
+	if op == Over && o.SrcMask == nil && opaque(src) {
+		op = Src // NOTE(review): presumably Over with an unmasked, fully opaque source equals Src; depends on opaque(), defined elsewhere.
+	}
+
+	d2s := invert(&s2d)
+	// bias is a translation of the mapping from dst coordinates to src
+	// coordinates such that the latter temporarily have non-negative X
+	// and Y coordinates. This allows us to write int(f) instead of
+	// int(math.Floor(f)), since "round to zero" and "round down" are
+	// equivalent when f >= 0, but the former is much cheaper. The X--
+	// and Y-- are because the TransformLeaf methods have a "sx -= 0.5"
+	// adjustment.
+	bias := transformRect(&d2s, &adr).Min
+	bias.X--
+	bias.Y--
+	d2s[2] -= float64(bias.X) // Fold the bias into d2s's translation terms; the leaf methods add it back after truncating.
+	d2s[5] -= float64(bias.Y)
+	// Make adr relative to dr.Min.
+	adr = adr.Sub(dr.Min)
+	// sr is the source pixels. If it extends beyond the src bounds,
+	// we cannot use the type-specific fast paths, as they access
+	// the Pix fields directly without bounds checking.
+	//
+	// Similarly, the fast paths assume that the masks are nil.
+	if o.DstMask != nil || o.SrcMask != nil || !sr.In(src.Bounds()) {
+		switch op {
+		case Over:
+			z.transform_Image_Image_Over(dst, dr, adr, &d2s, src, sr, bias, &o)
+		case Src:
+			z.transform_Image_Image_Src(dst, dr, adr, &d2s, src, sr, bias, &o)
+		}
+	} else if u, ok := src.(*image.Uniform); ok {
+		transform_Uniform(dst, dr, adr, &d2s, u, sr, bias, op)
+	} else {
+		switch op {
+		case Over:
+			switch dst := dst.(type) {
+			case *image.RGBA:
+				switch src := src.(type) {
+				case *image.NRGBA:
+					z.transform_RGBA_NRGBA_Over(dst, dr, adr, &d2s, src, sr, bias, &o)
+				case *image.RGBA:
+					z.transform_RGBA_RGBA_Over(dst, dr, adr, &d2s, src, sr, bias, &o)
+				default:
+					z.transform_RGBA_Image_Over(dst, dr, adr, &d2s, src, sr, bias, &o)
+				}
+			default:
+				switch src := src.(type) {
+				default:
+					z.transform_Image_Image_Over(dst, dr, adr, &d2s, src, sr, bias, &o)
+				}
+			}
+		case Src:
+			switch dst := dst.(type) {
+			case *image.RGBA:
+				switch src := src.(type) {
+				case *image.Gray:
+					z.transform_RGBA_Gray_Src(dst, dr, adr, &d2s, src, sr, bias, &o)
+				case *image.NRGBA:
+					z.transform_RGBA_NRGBA_Src(dst, dr, adr, &d2s, src, sr, bias, &o)
+				case *image.RGBA:
+					z.transform_RGBA_RGBA_Src(dst, dr, adr, &d2s, src, sr, bias, &o)
+				case *image.YCbCr:
+					switch src.SubsampleRatio {
+					default:
+						z.transform_RGBA_Image_Src(dst, dr, adr, &d2s, src, sr, bias, &o) // No specialized leaf for this subsample ratio: use the generic image.Image source path.
+					case image.YCbCrSubsampleRatio444:
+						z.transform_RGBA_YCbCr444_Src(dst, dr, adr, &d2s, src, sr, bias, &o)
+					case image.YCbCrSubsampleRatio422:
+						z.transform_RGBA_YCbCr422_Src(dst, dr, adr, &d2s, src, sr, bias, &o)
+					case image.YCbCrSubsampleRatio420:
+						z.transform_RGBA_YCbCr420_Src(dst, dr, adr, &d2s, src, sr, bias, &o)
+					case image.YCbCrSubsampleRatio440:
+						z.transform_RGBA_YCbCr440_Src(dst, dr, adr, &d2s, src, sr, bias, &o)
+					}
+				default:
+					z.transform_RGBA_Image_Src(dst, dr, adr, &d2s, src, sr, bias, &o)
+				}
+			default:
+				switch src := src.(type) {
+				default:
+					z.transform_Image_Image_Src(dst, dr, adr, &d2s, src, sr, bias, &o)
+				}
+			}
+		}
+	}
+}
+
+func (nnInterpolator) scale_RGBA_Gray_Src(dst *image.RGBA, dr, adr image.Rectangle, src *image.Gray, sr image.Rectangle, opts *Options) { // Scales a Gray src into an RGBA dst, overwriting dst pixels; adr is relative to dr.Min.
+	dw2 := uint64(dr.Dx()) * 2 // Twice the dst size: the divisor of the "(2*d + 1) * s" pixel-center mapping below.
+	dh2 := uint64(dr.Dy()) * 2
+	sw := uint64(sr.Dx())
+	sh := uint64(sr.Dy())
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		sy := (2*uint64(dy) + 1) * sh / dh2 // floor((dy + 0.5) * sh / dr.Dy()): the src row, relative to sr.Min.Y, under the center of dst row dy.
+		d := (dr.Min.Y+int(dy)-dst.Rect.Min.Y)*dst.Stride + (dr.Min.X+adr.Min.X-dst.Rect.Min.X)*4 // Index into dst.Pix of the first affected pixel on this row (4 bytes per pixel).
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
+			sx := (2*uint64(dx) + 1) * sw / dw2 // Same mapping for columns.
+			pi := (sr.Min.Y+int(sy)-src.Rect.Min.Y)*src.Stride + (sr.Min.X + int(sx) - src.Rect.Min.X) // Index into src.Pix of the sampled pixel (1 byte per pixel).
+			pr := uint32(src.Pix[pi]) * 0x101 // Widen the 8-bit gray value to 16 bits.
+			out := uint8(pr >> 8) // Narrow back to 8 bits for dst.
+			dst.Pix[d+0] = out
+			dst.Pix[d+1] = out
+			dst.Pix[d+2] = out
+			dst.Pix[d+3] = 0xff // Gray carries no alpha: write fully opaque.
+		}
+	}
+}
+
+func (nnInterpolator) scale_RGBA_NRGBA_Over(dst *image.RGBA, dr, adr image.Rectangle, src *image.NRGBA, sr image.Rectangle, opts *Options) { // Scales an NRGBA src and alpha-blends it over an RGBA dst; adr is relative to dr.Min.
+	dw2 := uint64(dr.Dx()) * 2 // Twice the dst size: the divisor of the "(2*d + 1) * s" pixel-center mapping below.
+	dh2 := uint64(dr.Dy()) * 2
+	sw := uint64(sr.Dx())
+	sh := uint64(sr.Dy())
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		sy := (2*uint64(dy) + 1) * sh / dh2 // floor((dy + 0.5) * sh / dr.Dy()): the src row, relative to sr.Min.Y, under the center of dst row dy.
+		d := (dr.Min.Y+int(dy)-dst.Rect.Min.Y)*dst.Stride + (dr.Min.X+adr.Min.X-dst.Rect.Min.X)*4 // Index into dst.Pix of the first affected pixel on this row (4 bytes per pixel).
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
+			sx := (2*uint64(dx) + 1) * sw / dw2 // Same mapping for columns.
+			pi := (sr.Min.Y+int(sy)-src.Rect.Min.Y)*src.Stride + (sr.Min.X+int(sx)-src.Rect.Min.X)*4 // Index into src.Pix of the sampled pixel (4 bytes per pixel).
+			pa := uint32(src.Pix[pi+3]) * 0x101 // Widen the 8-bit alpha to 16 bits.
+			pr := uint32(src.Pix[pi+0]) * pa / 0xff // Premultiply: 8-bit color times 16-bit alpha, divided by 0xff, gives a 16-bit premultiplied value.
+			pg := uint32(src.Pix[pi+1]) * pa / 0xff
+			pb := uint32(src.Pix[pi+2]) * pa / 0xff
+			pa1 := (0xffff - pa) * 0x101 // Inverse source alpha, pre-scaled by 0x101 so the 8-bit dst bytes below are widened to 16 bits by the same multiply.
+			dst.Pix[d+0] = uint8((uint32(dst.Pix[d+0])*pa1/0xffff + pr) >> 8) // dst*(1 - alpha) + src at 16-bit precision, then narrowed to 8 bits.
+			dst.Pix[d+1] = uint8((uint32(dst.Pix[d+1])*pa1/0xffff + pg) >> 8)
+			dst.Pix[d+2] = uint8((uint32(dst.Pix[d+2])*pa1/0xffff + pb) >> 8)
+			dst.Pix[d+3] = uint8((uint32(dst.Pix[d+3])*pa1/0xffff + pa) >> 8)
+		}
+	}
+}
+
+func (nnInterpolator) scale_RGBA_NRGBA_Src(dst *image.RGBA, dr, adr image.Rectangle, src *image.NRGBA, sr image.Rectangle, opts *Options) { // Scales an NRGBA src into an RGBA dst, overwriting dst pixels; adr is relative to dr.Min.
+	dw2 := uint64(dr.Dx()) * 2 // Twice the dst size: the divisor of the "(2*d + 1) * s" pixel-center mapping below.
+	dh2 := uint64(dr.Dy()) * 2
+	sw := uint64(sr.Dx())
+	sh := uint64(sr.Dy())
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		sy := (2*uint64(dy) + 1) * sh / dh2 // floor((dy + 0.5) * sh / dr.Dy()): the src row, relative to sr.Min.Y, under the center of dst row dy.
+		d := (dr.Min.Y+int(dy)-dst.Rect.Min.Y)*dst.Stride + (dr.Min.X+adr.Min.X-dst.Rect.Min.X)*4 // Index into dst.Pix of the first affected pixel on this row (4 bytes per pixel).
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
+			sx := (2*uint64(dx) + 1) * sw / dw2 // Same mapping for columns.
+			pi := (sr.Min.Y+int(sy)-src.Rect.Min.Y)*src.Stride + (sr.Min.X+int(sx)-src.Rect.Min.X)*4 // Index into src.Pix of the sampled pixel (4 bytes per pixel).
+			pa := uint32(src.Pix[pi+3]) * 0x101 // Widen the 8-bit alpha to 16 bits.
+			pr := uint32(src.Pix[pi+0]) * pa / 0xff // Premultiply: 8-bit color times 16-bit alpha, divided by 0xff, gives a 16-bit premultiplied value.
+			pg := uint32(src.Pix[pi+1]) * pa / 0xff
+			pb := uint32(src.Pix[pi+2]) * pa / 0xff
+			dst.Pix[d+0] = uint8(pr >> 8) // Keep the high byte of each 16-bit channel.
+			dst.Pix[d+1] = uint8(pg >> 8)
+			dst.Pix[d+2] = uint8(pb >> 8)
+			dst.Pix[d+3] = uint8(pa >> 8)
+		}
+	}
+}
+
+func (nnInterpolator) scale_RGBA_RGBA_Over(dst *image.RGBA, dr, adr image.Rectangle, src *image.RGBA, sr image.Rectangle, opts *Options) { // Scales an RGBA src and alpha-blends it over an RGBA dst; adr is relative to dr.Min.
+	dw2 := uint64(dr.Dx()) * 2 // Twice the dst size: the divisor of the "(2*d + 1) * s" pixel-center mapping below.
+	dh2 := uint64(dr.Dy()) * 2
+	sw := uint64(sr.Dx())
+	sh := uint64(sr.Dy())
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		sy := (2*uint64(dy) + 1) * sh / dh2 // floor((dy + 0.5) * sh / dr.Dy()): the src row, relative to sr.Min.Y, under the center of dst row dy.
+		d := (dr.Min.Y+int(dy)-dst.Rect.Min.Y)*dst.Stride + (dr.Min.X+adr.Min.X-dst.Rect.Min.X)*4 // Index into dst.Pix of the first affected pixel on this row (4 bytes per pixel).
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
+			sx := (2*uint64(dx) + 1) * sw / dw2 // Same mapping for columns.
+			pi := (sr.Min.Y+int(sy)-src.Rect.Min.Y)*src.Stride + (sr.Min.X+int(sx)-src.Rect.Min.X)*4 // Index into src.Pix of the sampled pixel (4 bytes per pixel).
+			pr := uint32(src.Pix[pi+0]) * 0x101 // Widen each 8-bit channel to 16 bits.
+			pg := uint32(src.Pix[pi+1]) * 0x101
+			pb := uint32(src.Pix[pi+2]) * 0x101
+			pa := uint32(src.Pix[pi+3]) * 0x101
+			pa1 := (0xffff - pa) * 0x101 // Inverse source alpha, pre-scaled by 0x101 so the 8-bit dst bytes below are widened to 16 bits by the same multiply.
+			dst.Pix[d+0] = uint8((uint32(dst.Pix[d+0])*pa1/0xffff + pr) >> 8) // dst*(1 - alpha) + src at 16-bit precision, then narrowed to 8 bits.
+			dst.Pix[d+1] = uint8((uint32(dst.Pix[d+1])*pa1/0xffff + pg) >> 8)
+			dst.Pix[d+2] = uint8((uint32(dst.Pix[d+2])*pa1/0xffff + pb) >> 8)
+			dst.Pix[d+3] = uint8((uint32(dst.Pix[d+3])*pa1/0xffff + pa) >> 8)
+		}
+	}
+}
+
+func (nnInterpolator) scale_RGBA_RGBA_Src(dst *image.RGBA, dr, adr image.Rectangle, src *image.RGBA, sr image.Rectangle, opts *Options) { // Scales an RGBA src into an RGBA dst, overwriting dst pixels; adr is relative to dr.Min.
+	dw2 := uint64(dr.Dx()) * 2 // Twice the dst size: the divisor of the "(2*d + 1) * s" pixel-center mapping below.
+	dh2 := uint64(dr.Dy()) * 2
+	sw := uint64(sr.Dx())
+	sh := uint64(sr.Dy())
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		sy := (2*uint64(dy) + 1) * sh / dh2 // floor((dy + 0.5) * sh / dr.Dy()): the src row, relative to sr.Min.Y, under the center of dst row dy.
+		d := (dr.Min.Y+int(dy)-dst.Rect.Min.Y)*dst.Stride + (dr.Min.X+adr.Min.X-dst.Rect.Min.X)*4 // Index into dst.Pix of the first affected pixel on this row (4 bytes per pixel).
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
+			sx := (2*uint64(dx) + 1) * sw / dw2 // Same mapping for columns.
+			pi := (sr.Min.Y+int(sy)-src.Rect.Min.Y)*src.Stride + (sr.Min.X+int(sx)-src.Rect.Min.X)*4 // Index into src.Pix of the sampled pixel (4 bytes per pixel).
+			pr := uint32(src.Pix[pi+0]) * 0x101 // Widen each 8-bit channel to 16 bits.
+			pg := uint32(src.Pix[pi+1]) * 0x101
+			pb := uint32(src.Pix[pi+2]) * 0x101
+			pa := uint32(src.Pix[pi+3]) * 0x101
+			dst.Pix[d+0] = uint8(pr >> 8) // Keep the high byte of each 16-bit channel.
+			dst.Pix[d+1] = uint8(pg >> 8)
+			dst.Pix[d+2] = uint8(pb >> 8)
+			dst.Pix[d+3] = uint8(pa >> 8)
+		}
+	}
+}
+
+func (nnInterpolator) scale_RGBA_YCbCr444_Src(dst *image.RGBA, dr, adr image.Rectangle, src *image.YCbCr, sr image.Rectangle, opts *Options) { // Scales a YCbCr src (chroma indexed at full resolution) into an RGBA dst, overwriting dst pixels; adr is relative to dr.Min.
+	dw2 := uint64(dr.Dx()) * 2 // Twice the dst size: the divisor of the "(2*d + 1) * s" pixel-center mapping below.
+	dh2 := uint64(dr.Dy()) * 2
+	sw := uint64(sr.Dx())
+	sh := uint64(sr.Dy())
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		sy := (2*uint64(dy) + 1) * sh / dh2 // floor((dy + 0.5) * sh / dr.Dy()): the src row, relative to sr.Min.Y, under the center of dst row dy.
+		d := (dr.Min.Y+int(dy)-dst.Rect.Min.Y)*dst.Stride + (dr.Min.X+adr.Min.X-dst.Rect.Min.X)*4 // Index into dst.Pix of the first affected pixel on this row (4 bytes per pixel).
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
+			sx := (2*uint64(dx) + 1) * sw / dw2 // Same mapping for columns.
+			pi := (sr.Min.Y+int(sy)-src.Rect.Min.Y)*src.YStride + (sr.Min.X + int(sx) - src.Rect.Min.X) // Index into src.Y of the sampled pixel.
+			pj := (sr.Min.Y+int(sy)-src.Rect.Min.Y)*src.CStride + (sr.Min.X + int(sx) - src.Rect.Min.X) // Index into src.Cb and src.Cr: same row and column as luma, but with the chroma stride.
+
+			// This is an inline version of image/color/ycbcr.go's YCbCr.RGBA method.
+			pyy1 := int(src.Y[pi]) * 0x10100
+			pcb1 := int(src.Cb[pj]) - 128 // Center the chroma samples on zero.
+			pcr1 := int(src.Cr[pj]) - 128
+			pr := (pyy1 + 91881*pcr1) >> 8
+			pg := (pyy1 - 22554*pcb1 - 46802*pcr1) >> 8
+			pb := (pyy1 + 116130*pcb1) >> 8
+			if pr < 0 { // Clamp each channel to the 16-bit range [0, 0xffff].
+				pr = 0
+			} else if pr > 0xffff {
+				pr = 0xffff
+			}
+			if pg < 0 {
+				pg = 0
+			} else if pg > 0xffff {
+				pg = 0xffff
+			}
+			if pb < 0 {
+				pb = 0
+			} else if pb > 0xffff {
+				pb = 0xffff
+			}
+			dst.Pix[d+0] = uint8(pr >> 8) // Keep the high byte of each 16-bit channel.
+			dst.Pix[d+1] = uint8(pg >> 8)
+			dst.Pix[d+2] = uint8(pb >> 8)
+			dst.Pix[d+3] = 0xff // YCbCr carries no alpha: write fully opaque.
+		}
+	}
+}
+
+func (nnInterpolator) scale_RGBA_YCbCr422_Src(dst *image.RGBA, dr, adr image.Rectangle, src *image.YCbCr, sr image.Rectangle, opts *Options) { // Scales a YCbCr src (chroma column index halved) into an RGBA dst, overwriting dst pixels; adr is relative to dr.Min.
+	dw2 := uint64(dr.Dx()) * 2 // Twice the dst size: the divisor of the "(2*d + 1) * s" pixel-center mapping below.
+	dh2 := uint64(dr.Dy()) * 2
+	sw := uint64(sr.Dx())
+	sh := uint64(sr.Dy())
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		sy := (2*uint64(dy) + 1) * sh / dh2 // floor((dy + 0.5) * sh / dr.Dy()): the src row, relative to sr.Min.Y, under the center of dst row dy.
+		d := (dr.Min.Y+int(dy)-dst.Rect.Min.Y)*dst.Stride + (dr.Min.X+adr.Min.X-dst.Rect.Min.X)*4 // Index into dst.Pix of the first affected pixel on this row (4 bytes per pixel).
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
+			sx := (2*uint64(dx) + 1) * sw / dw2 // Same mapping for columns.
+			pi := (sr.Min.Y+int(sy)-src.Rect.Min.Y)*src.YStride + (sr.Min.X + int(sx) - src.Rect.Min.X) // Index into src.Y of the sampled pixel.
+			pj := (sr.Min.Y+int(sy)-src.Rect.Min.Y)*src.CStride + ((sr.Min.X+int(sx))/2 - src.Rect.Min.X/2) // Index into src.Cb and src.Cr: full-resolution row, column halved.
+
+			// This is an inline version of image/color/ycbcr.go's YCbCr.RGBA method.
+			pyy1 := int(src.Y[pi]) * 0x10100
+			pcb1 := int(src.Cb[pj]) - 128 // Center the chroma samples on zero.
+			pcr1 := int(src.Cr[pj]) - 128
+			pr := (pyy1 + 91881*pcr1) >> 8
+			pg := (pyy1 - 22554*pcb1 - 46802*pcr1) >> 8
+			pb := (pyy1 + 116130*pcb1) >> 8
+			if pr < 0 { // Clamp each channel to the 16-bit range [0, 0xffff].
+				pr = 0
+			} else if pr > 0xffff {
+				pr = 0xffff
+			}
+			if pg < 0 {
+				pg = 0
+			} else if pg > 0xffff {
+				pg = 0xffff
+			}
+			if pb < 0 {
+				pb = 0
+			} else if pb > 0xffff {
+				pb = 0xffff
+			}
+			dst.Pix[d+0] = uint8(pr >> 8) // Keep the high byte of each 16-bit channel.
+			dst.Pix[d+1] = uint8(pg >> 8)
+			dst.Pix[d+2] = uint8(pb >> 8)
+			dst.Pix[d+3] = 0xff // YCbCr carries no alpha: write fully opaque.
+		}
+	}
+}
+
+func (nnInterpolator) scale_RGBA_YCbCr420_Src(dst *image.RGBA, dr, adr image.Rectangle, src *image.YCbCr, sr image.Rectangle, opts *Options) { // Scales a YCbCr src (chroma row and column indices halved) into an RGBA dst, overwriting dst pixels; adr is relative to dr.Min.
+	dw2 := uint64(dr.Dx()) * 2 // Twice the dst size: the divisor of the "(2*d + 1) * s" pixel-center mapping below.
+	dh2 := uint64(dr.Dy()) * 2
+	sw := uint64(sr.Dx())
+	sh := uint64(sr.Dy())
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		sy := (2*uint64(dy) + 1) * sh / dh2 // floor((dy + 0.5) * sh / dr.Dy()): the src row, relative to sr.Min.Y, under the center of dst row dy.
+		d := (dr.Min.Y+int(dy)-dst.Rect.Min.Y)*dst.Stride + (dr.Min.X+adr.Min.X-dst.Rect.Min.X)*4 // Index into dst.Pix of the first affected pixel on this row (4 bytes per pixel).
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
+			sx := (2*uint64(dx) + 1) * sw / dw2 // Same mapping for columns.
+			pi := (sr.Min.Y+int(sy)-src.Rect.Min.Y)*src.YStride + (sr.Min.X + int(sx) - src.Rect.Min.X) // Index into src.Y of the sampled pixel.
+			pj := ((sr.Min.Y+int(sy))/2-src.Rect.Min.Y/2)*src.CStride + ((sr.Min.X+int(sx))/2 - src.Rect.Min.X/2) // Index into src.Cb and src.Cr: both row and column halved.
+
+			// This is an inline version of image/color/ycbcr.go's YCbCr.RGBA method.
+			pyy1 := int(src.Y[pi]) * 0x10100
+			pcb1 := int(src.Cb[pj]) - 128 // Center the chroma samples on zero.
+			pcr1 := int(src.Cr[pj]) - 128
+			pr := (pyy1 + 91881*pcr1) >> 8
+			pg := (pyy1 - 22554*pcb1 - 46802*pcr1) >> 8
+			pb := (pyy1 + 116130*pcb1) >> 8
+			if pr < 0 { // Clamp each channel to the 16-bit range [0, 0xffff].
+				pr = 0
+			} else if pr > 0xffff {
+				pr = 0xffff
+			}
+			if pg < 0 {
+				pg = 0
+			} else if pg > 0xffff {
+				pg = 0xffff
+			}
+			if pb < 0 {
+				pb = 0
+			} else if pb > 0xffff {
+				pb = 0xffff
+			}
+			dst.Pix[d+0] = uint8(pr >> 8) // Keep the high byte of each 16-bit channel.
+			dst.Pix[d+1] = uint8(pg >> 8)
+			dst.Pix[d+2] = uint8(pb >> 8)
+			dst.Pix[d+3] = 0xff // YCbCr carries no alpha: write fully opaque.
+		}
+	}
+}
+
+func (nnInterpolator) scale_RGBA_YCbCr440_Src(dst *image.RGBA, dr, adr image.Rectangle, src *image.YCbCr, sr image.Rectangle, opts *Options) { // Scales a YCbCr src (chroma row index halved) into an RGBA dst, overwriting dst pixels; adr is relative to dr.Min.
+	dw2 := uint64(dr.Dx()) * 2 // Twice the dst size: the divisor of the "(2*d + 1) * s" pixel-center mapping below.
+	dh2 := uint64(dr.Dy()) * 2
+	sw := uint64(sr.Dx())
+	sh := uint64(sr.Dy())
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		sy := (2*uint64(dy) + 1) * sh / dh2 // floor((dy + 0.5) * sh / dr.Dy()): the src row, relative to sr.Min.Y, under the center of dst row dy.
+		d := (dr.Min.Y+int(dy)-dst.Rect.Min.Y)*dst.Stride + (dr.Min.X+adr.Min.X-dst.Rect.Min.X)*4 // Index into dst.Pix of the first affected pixel on this row (4 bytes per pixel).
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
+			sx := (2*uint64(dx) + 1) * sw / dw2 // Same mapping for columns.
+			pi := (sr.Min.Y+int(sy)-src.Rect.Min.Y)*src.YStride + (sr.Min.X + int(sx) - src.Rect.Min.X) // Index into src.Y of the sampled pixel.
+			pj := ((sr.Min.Y+int(sy))/2-src.Rect.Min.Y/2)*src.CStride + (sr.Min.X + int(sx) - src.Rect.Min.X) // Index into src.Cb and src.Cr: row halved, full-resolution column.
+
+			// This is an inline version of image/color/ycbcr.go's YCbCr.RGBA method.
+			pyy1 := int(src.Y[pi]) * 0x10100
+			pcb1 := int(src.Cb[pj]) - 128 // Center the chroma samples on zero.
+			pcr1 := int(src.Cr[pj]) - 128
+			pr := (pyy1 + 91881*pcr1) >> 8
+			pg := (pyy1 - 22554*pcb1 - 46802*pcr1) >> 8
+			pb := (pyy1 + 116130*pcb1) >> 8
+			if pr < 0 { // Clamp each channel to the 16-bit range [0, 0xffff].
+				pr = 0
+			} else if pr > 0xffff {
+				pr = 0xffff
+			}
+			if pg < 0 {
+				pg = 0
+			} else if pg > 0xffff {
+				pg = 0xffff
+			}
+			if pb < 0 {
+				pb = 0
+			} else if pb > 0xffff {
+				pb = 0xffff
+			}
+			dst.Pix[d+0] = uint8(pr >> 8) // Keep the high byte of each 16-bit channel.
+			dst.Pix[d+1] = uint8(pg >> 8)
+			dst.Pix[d+2] = uint8(pb >> 8)
+			dst.Pix[d+3] = 0xff // YCbCr carries no alpha: write fully opaque.
+		}
+	}
+}
+
+func (nnInterpolator) scale_RGBA_Image_Over(dst *image.RGBA, dr, adr image.Rectangle, src image.Image, sr image.Rectangle, opts *Options) { // Scales any image.Image src and alpha-blends it over an RGBA dst; adr is relative to dr.Min.
+	dw2 := uint64(dr.Dx()) * 2 // Twice the dst size: the divisor of the "(2*d + 1) * s" pixel-center mapping below.
+	dh2 := uint64(dr.Dy()) * 2
+	sw := uint64(sr.Dx())
+	sh := uint64(sr.Dy())
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		sy := (2*uint64(dy) + 1) * sh / dh2 // floor((dy + 0.5) * sh / dr.Dy()): the src row, relative to sr.Min.Y, under the center of dst row dy.
+		d := (dr.Min.Y+int(dy)-dst.Rect.Min.Y)*dst.Stride + (dr.Min.X+adr.Min.X-dst.Rect.Min.X)*4 // Index into dst.Pix of the first affected pixel on this row (4 bytes per pixel).
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
+			sx := (2*uint64(dx) + 1) * sw / dw2 // Same mapping for columns.
+			pr, pg, pb, pa := src.At(sr.Min.X+int(sx), sr.Min.Y+int(sy)).RGBA() // Read the source pixel through the generic interface.
+			pa1 := (0xffff - pa) * 0x101 // Inverse source alpha, pre-scaled by 0x101 so the 8-bit dst bytes below are widened to 16 bits by the same multiply.
+			dst.Pix[d+0] = uint8((uint32(dst.Pix[d+0])*pa1/0xffff + pr) >> 8) // dst*(1 - alpha) + src at 16-bit precision, then narrowed to 8 bits.
+			dst.Pix[d+1] = uint8((uint32(dst.Pix[d+1])*pa1/0xffff + pg) >> 8)
+			dst.Pix[d+2] = uint8((uint32(dst.Pix[d+2])*pa1/0xffff + pb) >> 8)
+			dst.Pix[d+3] = uint8((uint32(dst.Pix[d+3])*pa1/0xffff + pa) >> 8)
+		}
+	}
+}
+
+func (nnInterpolator) scale_RGBA_Image_Src(dst *image.RGBA, dr, adr image.Rectangle, src image.Image, sr image.Rectangle, opts *Options) { // Scales any image.Image src into an RGBA dst, overwriting dst pixels; adr is relative to dr.Min.
+	dw2 := uint64(dr.Dx()) * 2 // Twice the dst size: the divisor of the "(2*d + 1) * s" pixel-center mapping below.
+	dh2 := uint64(dr.Dy()) * 2
+	sw := uint64(sr.Dx())
+	sh := uint64(sr.Dy())
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		sy := (2*uint64(dy) + 1) * sh / dh2 // floor((dy + 0.5) * sh / dr.Dy()): the src row, relative to sr.Min.Y, under the center of dst row dy.
+		d := (dr.Min.Y+int(dy)-dst.Rect.Min.Y)*dst.Stride + (dr.Min.X+adr.Min.X-dst.Rect.Min.X)*4 // Index into dst.Pix of the first affected pixel on this row (4 bytes per pixel).
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
+			sx := (2*uint64(dx) + 1) * sw / dw2 // Same mapping for columns.
+			pr, pg, pb, pa := src.At(sr.Min.X+int(sx), sr.Min.Y+int(sy)).RGBA() // Read the source pixel through the generic interface.
+			dst.Pix[d+0] = uint8(pr >> 8) // Keep the high byte of each 16-bit channel.
+			dst.Pix[d+1] = uint8(pg >> 8)
+			dst.Pix[d+2] = uint8(pb >> 8)
+			dst.Pix[d+3] = uint8(pa >> 8)
+		}
+	}
+}
+
+func (nnInterpolator) scale_Image_Image_Over(dst Image, dr, adr image.Rectangle, src image.Image, sr image.Rectangle, opts *Options) { // Generic path: scales src and blends it over dst via At/Set, honoring the optional source and destination masks; adr is relative to dr.Min.
+	dw2 := uint64(dr.Dx()) * 2 // Twice the dst size: the divisor of the "(2*d + 1) * s" pixel-center mapping below.
+	dh2 := uint64(dr.Dy()) * 2
+	sw := uint64(sr.Dx())
+	sh := uint64(sr.Dy())
+	srcMask, smp := opts.SrcMask, opts.SrcMaskP
+	dstMask, dmp := opts.DstMask, opts.DstMaskP
+	dstColorRGBA64 := &color.RGBA64{} // One scratch color, overwritten for every pixel.
+	dstColor := color.Color(dstColorRGBA64) // Interface value built once, outside the loops, and passed to every dst.Set call.
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		sy := (2*uint64(dy) + 1) * sh / dh2 // floor((dy + 0.5) * sh / dr.Dy()): the src row, relative to sr.Min.Y, under the center of dst row dy.
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx++ {
+			sx := (2*uint64(dx) + 1) * sw / dw2 // Same mapping for columns.
+			pr, pg, pb, pa := src.At(sr.Min.X+int(sx), sr.Min.Y+int(sy)).RGBA()
+			if srcMask != nil {
+				_, _, _, ma := srcMask.At(smp.X+sr.Min.X+int(sx), smp.Y+sr.Min.Y+int(sy)).RGBA() // Mask alpha at the sampled source position, offset by SrcMaskP.
+				pr = pr * ma / 0xffff
+				pg = pg * ma / 0xffff
+				pb = pb * ma / 0xffff
+				pa = pa * ma / 0xffff
+			}
+			qr, qg, qb, qa := dst.At(dr.Min.X+int(dx), dr.Min.Y+int(dy)).RGBA() // Current destination color, needed for the blend.
+			if dstMask != nil {
+				_, _, _, ma := dstMask.At(dmp.X+dr.Min.X+int(dx), dmp.Y+dr.Min.Y+int(dy)).RGBA() // Mask alpha at the destination position, offset by DstMaskP.
+				pr = pr * ma / 0xffff
+				pg = pg * ma / 0xffff
+				pb = pb * ma / 0xffff
+				pa = pa * ma / 0xffff
+			}
+			pa1 := 0xffff - pa // Inverse of the (masked) source alpha.
+			dstColorRGBA64.R = uint16(qr*pa1/0xffff + pr) // dst*(1 - alpha) + src, all in 16 bits.
+			dstColorRGBA64.G = uint16(qg*pa1/0xffff + pg)
+			dstColorRGBA64.B = uint16(qb*pa1/0xffff + pb)
+			dstColorRGBA64.A = uint16(qa*pa1/0xffff + pa)
+			dst.Set(dr.Min.X+int(dx), dr.Min.Y+int(dy), dstColor)
+		}
+	}
+}
+
+func (nnInterpolator) scale_Image_Image_Src(dst Image, dr, adr image.Rectangle, src image.Image, sr image.Rectangle, opts *Options) { // Generic path: scales src into dst via At/Set, honoring the optional source and destination masks; adr is relative to dr.Min.
+	dw2 := uint64(dr.Dx()) * 2 // Twice the dst size: the divisor of the "(2*d + 1) * s" pixel-center mapping below.
+	dh2 := uint64(dr.Dy()) * 2
+	sw := uint64(sr.Dx())
+	sh := uint64(sr.Dy())
+	srcMask, smp := opts.SrcMask, opts.SrcMaskP
+	dstMask, dmp := opts.DstMask, opts.DstMaskP
+	dstColorRGBA64 := &color.RGBA64{} // One scratch color, overwritten for every pixel.
+	dstColor := color.Color(dstColorRGBA64) // Interface value built once, outside the loops, and passed to every dst.Set call.
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		sy := (2*uint64(dy) + 1) * sh / dh2 // floor((dy + 0.5) * sh / dr.Dy()): the src row, relative to sr.Min.Y, under the center of dst row dy.
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx++ {
+			sx := (2*uint64(dx) + 1) * sw / dw2 // Same mapping for columns.
+			pr, pg, pb, pa := src.At(sr.Min.X+int(sx), sr.Min.Y+int(sy)).RGBA()
+			if srcMask != nil {
+				_, _, _, ma := srcMask.At(smp.X+sr.Min.X+int(sx), smp.Y+sr.Min.Y+int(sy)).RGBA() // Mask alpha at the sampled source position, offset by SrcMaskP.
+				pr = pr * ma / 0xffff
+				pg = pg * ma / 0xffff
+				pb = pb * ma / 0xffff
+				pa = pa * ma / 0xffff
+			}
+			if dstMask != nil {
+				qr, qg, qb, qa := dst.At(dr.Min.X+int(dx), dr.Min.Y+int(dy)).RGBA() // The existing dst color is only needed when a dst mask lets part of it show through.
+				_, _, _, ma := dstMask.At(dmp.X+dr.Min.X+int(dx), dmp.Y+dr.Min.Y+int(dy)).RGBA() // Mask alpha at the destination position, offset by DstMaskP.
+				pr = pr * ma / 0xffff
+				pg = pg * ma / 0xffff
+				pb = pb * ma / 0xffff
+				pa = pa * ma / 0xffff
+				pa1 := 0xffff - ma // Inverse of the mask alpha (not the source alpha): the result is dst*(1 - mask) + src*mask.
+				dstColorRGBA64.R = uint16(qr*pa1/0xffff + pr)
+				dstColorRGBA64.G = uint16(qg*pa1/0xffff + pg)
+				dstColorRGBA64.B = uint16(qb*pa1/0xffff + pb)
+				dstColorRGBA64.A = uint16(qa*pa1/0xffff + pa)
+				dst.Set(dr.Min.X+int(dx), dr.Min.Y+int(dy), dstColor)
+			} else {
+				dstColorRGBA64.R = uint16(pr) // No dst mask: the source color replaces the destination outright.
+				dstColorRGBA64.G = uint16(pg)
+				dstColorRGBA64.B = uint16(pb)
+				dstColorRGBA64.A = uint16(pa)
+				dst.Set(dr.Min.X+int(dx), dr.Min.Y+int(dy), dstColor)
+			}
+		}
+	}
+}
+
+func (nnInterpolator) transform_RGBA_Gray_Src(dst *image.RGBA, dr, adr image.Rectangle, d2s *f64.Aff3, src *image.Gray, sr image.Rectangle, bias image.Point, opts *Options) { // Affine-transforms a Gray src into an RGBA dst, overwriting dst pixels; adr is relative to dr.Min.
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		dyf := float64(dr.Min.Y+int(dy)) + 0.5 // Vertical center of this dst row, in absolute dst coordinates.
+		d := (dr.Min.Y+int(dy)-dst.Rect.Min.Y)*dst.Stride + (dr.Min.X+adr.Min.X-dst.Rect.Min.X)*4 // Index into dst.Pix of the first affected pixel on this row (4 bytes per pixel).
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
+			dxf := float64(dr.Min.X+int(dx)) + 0.5 // Horizontal center of this dst pixel.
+			sx0 := int(d2s[0]*dxf+d2s[1]*dyf+d2s[2]) + bias.X // Map the dst pixel center through d2s, truncate, then add bias to get the src coordinate.
+			sy0 := int(d2s[3]*dxf+d2s[4]*dyf+d2s[5]) + bias.Y
+			if !(image.Point{sx0, sy0}).In(sr) {
+				continue // This dst pixel maps outside sr: leave it untouched.
+			}
+			pi := (sy0-src.Rect.Min.Y)*src.Stride + (sx0 - src.Rect.Min.X) // Index into src.Pix of the sampled pixel (1 byte per pixel).
+			pr := uint32(src.Pix[pi]) * 0x101 // Widen the 8-bit gray value to 16 bits.
+			out := uint8(pr >> 8) // Narrow back to 8 bits for dst.
+			dst.Pix[d+0] = out
+			dst.Pix[d+1] = out
+			dst.Pix[d+2] = out
+			dst.Pix[d+3] = 0xff // Gray carries no alpha: write fully opaque.
+		}
+	}
+}
+
+func (nnInterpolator) transform_RGBA_NRGBA_Over(dst *image.RGBA, dr, adr image.Rectangle, d2s *f64.Aff3, src *image.NRGBA, sr image.Rectangle, bias image.Point, opts *Options) { // Affine-transforms an NRGBA src and alpha-blends it over an RGBA dst; adr is relative to dr.Min.
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		dyf := float64(dr.Min.Y+int(dy)) + 0.5 // Vertical center of this dst row, in absolute dst coordinates.
+		d := (dr.Min.Y+int(dy)-dst.Rect.Min.Y)*dst.Stride + (dr.Min.X+adr.Min.X-dst.Rect.Min.X)*4 // Index into dst.Pix of the first affected pixel on this row (4 bytes per pixel).
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
+			dxf := float64(dr.Min.X+int(dx)) + 0.5 // Horizontal center of this dst pixel.
+			sx0 := int(d2s[0]*dxf+d2s[1]*dyf+d2s[2]) + bias.X // Map the dst pixel center through d2s, truncate, then add bias to get the src coordinate.
+			sy0 := int(d2s[3]*dxf+d2s[4]*dyf+d2s[5]) + bias.Y
+			if !(image.Point{sx0, sy0}).In(sr) {
+				continue // This dst pixel maps outside sr: leave it untouched.
+			}
+			pi := (sy0-src.Rect.Min.Y)*src.Stride + (sx0-src.Rect.Min.X)*4 // Index into src.Pix of the sampled pixel (4 bytes per pixel).
+			pa := uint32(src.Pix[pi+3]) * 0x101 // Widen the 8-bit alpha to 16 bits.
+			pr := uint32(src.Pix[pi+0]) * pa / 0xff // Premultiply: 8-bit color times 16-bit alpha, divided by 0xff, gives a 16-bit premultiplied value.
+			pg := uint32(src.Pix[pi+1]) * pa / 0xff
+			pb := uint32(src.Pix[pi+2]) * pa / 0xff
+			pa1 := (0xffff - pa) * 0x101 // Inverse source alpha, pre-scaled by 0x101 so the 8-bit dst bytes below are widened to 16 bits by the same multiply.
+			dst.Pix[d+0] = uint8((uint32(dst.Pix[d+0])*pa1/0xffff + pr) >> 8) // dst*(1 - alpha) + src at 16-bit precision, then narrowed to 8 bits.
+			dst.Pix[d+1] = uint8((uint32(dst.Pix[d+1])*pa1/0xffff + pg) >> 8)
+			dst.Pix[d+2] = uint8((uint32(dst.Pix[d+2])*pa1/0xffff + pb) >> 8)
+			dst.Pix[d+3] = uint8((uint32(dst.Pix[d+3])*pa1/0xffff + pa) >> 8)
+		}
+	}
+}
+
+func (nnInterpolator) transform_RGBA_NRGBA_Src(dst *image.RGBA, dr, adr image.Rectangle, d2s *f64.Aff3, src *image.NRGBA, sr image.Rectangle, bias image.Point, opts *Options) { // Affine-transforms an NRGBA src into an RGBA dst, overwriting dst pixels; adr is relative to dr.Min.
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		dyf := float64(dr.Min.Y+int(dy)) + 0.5 // Vertical center of this dst row, in absolute dst coordinates.
+		d := (dr.Min.Y+int(dy)-dst.Rect.Min.Y)*dst.Stride + (dr.Min.X+adr.Min.X-dst.Rect.Min.X)*4 // Index into dst.Pix of the first affected pixel on this row (4 bytes per pixel).
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
+			dxf := float64(dr.Min.X+int(dx)) + 0.5 // Horizontal center of this dst pixel.
+			sx0 := int(d2s[0]*dxf+d2s[1]*dyf+d2s[2]) + bias.X // Map the dst pixel center through d2s, truncate, then add bias to get the src coordinate.
+			sy0 := int(d2s[3]*dxf+d2s[4]*dyf+d2s[5]) + bias.Y
+			if !(image.Point{sx0, sy0}).In(sr) {
+				continue // This dst pixel maps outside sr: leave it untouched.
+			}
+			pi := (sy0-src.Rect.Min.Y)*src.Stride + (sx0-src.Rect.Min.X)*4 // Index into src.Pix of the sampled pixel (4 bytes per pixel).
+			pa := uint32(src.Pix[pi+3]) * 0x101 // Widen the 8-bit alpha to 16 bits.
+			pr := uint32(src.Pix[pi+0]) * pa / 0xff // Premultiply: 8-bit color times 16-bit alpha, divided by 0xff, gives a 16-bit premultiplied value.
+			pg := uint32(src.Pix[pi+1]) * pa / 0xff
+			pb := uint32(src.Pix[pi+2]) * pa / 0xff
+			dst.Pix[d+0] = uint8(pr >> 8) // Keep the high byte of each 16-bit channel.
+			dst.Pix[d+1] = uint8(pg >> 8)
+			dst.Pix[d+2] = uint8(pb >> 8)
+			dst.Pix[d+3] = uint8(pa >> 8)
+		}
+	}
+}
+
+func (nnInterpolator) transform_RGBA_RGBA_Over(dst *image.RGBA, dr, adr image.Rectangle, d2s *f64.Aff3, src *image.RGBA, sr image.Rectangle, bias image.Point, opts *Options) { // Affine-transforms an RGBA src and alpha-blends it over an RGBA dst; adr is relative to dr.Min.
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		dyf := float64(dr.Min.Y+int(dy)) + 0.5 // Vertical center of this dst row, in absolute dst coordinates.
+		d := (dr.Min.Y+int(dy)-dst.Rect.Min.Y)*dst.Stride + (dr.Min.X+adr.Min.X-dst.Rect.Min.X)*4 // Index into dst.Pix of the first affected pixel on this row (4 bytes per pixel).
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
+			dxf := float64(dr.Min.X+int(dx)) + 0.5 // Horizontal center of this dst pixel.
+			sx0 := int(d2s[0]*dxf+d2s[1]*dyf+d2s[2]) + bias.X // Map the dst pixel center through d2s, truncate, then add bias to get the src coordinate.
+			sy0 := int(d2s[3]*dxf+d2s[4]*dyf+d2s[5]) + bias.Y
+			if !(image.Point{sx0, sy0}).In(sr) {
+				continue // This dst pixel maps outside sr: leave it untouched.
+			}
+			pi := (sy0-src.Rect.Min.Y)*src.Stride + (sx0-src.Rect.Min.X)*4 // Index into src.Pix of the sampled pixel (4 bytes per pixel).
+			pr := uint32(src.Pix[pi+0]) * 0x101 // Widen each 8-bit channel to 16 bits.
+			pg := uint32(src.Pix[pi+1]) * 0x101
+			pb := uint32(src.Pix[pi+2]) * 0x101
+			pa := uint32(src.Pix[pi+3]) * 0x101
+			pa1 := (0xffff - pa) * 0x101 // Inverse source alpha, pre-scaled by 0x101 so the 8-bit dst bytes below are widened to 16 bits by the same multiply.
+			dst.Pix[d+0] = uint8((uint32(dst.Pix[d+0])*pa1/0xffff + pr) >> 8) // dst*(1 - alpha) + src at 16-bit precision, then narrowed to 8 bits.
+			dst.Pix[d+1] = uint8((uint32(dst.Pix[d+1])*pa1/0xffff + pg) >> 8)
+			dst.Pix[d+2] = uint8((uint32(dst.Pix[d+2])*pa1/0xffff + pb) >> 8)
+			dst.Pix[d+3] = uint8((uint32(dst.Pix[d+3])*pa1/0xffff + pa) >> 8)
+		}
+	}
+}
+
+func (nnInterpolator) transform_RGBA_RGBA_Src(dst *image.RGBA, dr, adr image.Rectangle, d2s *f64.Aff3, src *image.RGBA, sr image.Rectangle, bias image.Point, opts *Options) { // Affine-transforms an RGBA src into an RGBA dst, overwriting dst pixels; adr is relative to dr.Min.
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		dyf := float64(dr.Min.Y+int(dy)) + 0.5 // Vertical center of this dst row, in absolute dst coordinates.
+		d := (dr.Min.Y+int(dy)-dst.Rect.Min.Y)*dst.Stride + (dr.Min.X+adr.Min.X-dst.Rect.Min.X)*4 // Index into dst.Pix of the first affected pixel on this row (4 bytes per pixel).
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
+			dxf := float64(dr.Min.X+int(dx)) + 0.5 // Horizontal center of this dst pixel.
+			sx0 := int(d2s[0]*dxf+d2s[1]*dyf+d2s[2]) + bias.X // Map the dst pixel center through d2s, truncate, then add bias to get the src coordinate.
+			sy0 := int(d2s[3]*dxf+d2s[4]*dyf+d2s[5]) + bias.Y
+			if !(image.Point{sx0, sy0}).In(sr) {
+				continue // This dst pixel maps outside sr: leave it untouched.
+			}
+			pi := (sy0-src.Rect.Min.Y)*src.Stride + (sx0-src.Rect.Min.X)*4 // Index into src.Pix of the sampled pixel (4 bytes per pixel).
+			pr := uint32(src.Pix[pi+0]) * 0x101 // Widen each 8-bit channel to 16 bits.
+			pg := uint32(src.Pix[pi+1]) * 0x101
+			pb := uint32(src.Pix[pi+2]) * 0x101
+			pa := uint32(src.Pix[pi+3]) * 0x101
+			dst.Pix[d+0] = uint8(pr >> 8) // Keep the high byte of each 16-bit channel.
+			dst.Pix[d+1] = uint8(pg >> 8)
+			dst.Pix[d+2] = uint8(pb >> 8)
+			dst.Pix[d+3] = uint8(pa >> 8)
+		}
+	}
+}
+
+func (nnInterpolator) transform_RGBA_YCbCr444_Src(dst *image.RGBA, dr, adr image.Rectangle, d2s *f64.Aff3, src *image.YCbCr, sr image.Rectangle, bias image.Point, opts *Options) { // Affine-transforms a YCbCr src (chroma indexed at full resolution) into an RGBA dst, overwriting dst pixels; adr is relative to dr.Min.
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		dyf := float64(dr.Min.Y+int(dy)) + 0.5 // Vertical center of this dst row, in absolute dst coordinates.
+		d := (dr.Min.Y+int(dy)-dst.Rect.Min.Y)*dst.Stride + (dr.Min.X+adr.Min.X-dst.Rect.Min.X)*4 // Index into dst.Pix of the first affected pixel on this row (4 bytes per pixel).
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
+			dxf := float64(dr.Min.X+int(dx)) + 0.5 // Horizontal center of this dst pixel.
+			sx0 := int(d2s[0]*dxf+d2s[1]*dyf+d2s[2]) + bias.X // Map the dst pixel center through d2s, truncate, then add bias to get the src coordinate.
+			sy0 := int(d2s[3]*dxf+d2s[4]*dyf+d2s[5]) + bias.Y
+			if !(image.Point{sx0, sy0}).In(sr) {
+				continue // This dst pixel maps outside sr: leave it untouched.
+			}
+			pi := (sy0-src.Rect.Min.Y)*src.YStride + (sx0 - src.Rect.Min.X) // Index into src.Y of the sampled pixel.
+			pj := (sy0-src.Rect.Min.Y)*src.CStride + (sx0 - src.Rect.Min.X) // Index into src.Cb and src.Cr: same row and column as luma, but with the chroma stride.
+
+			// This is an inline version of image/color/ycbcr.go's YCbCr.RGBA method.
+			pyy1 := int(src.Y[pi]) * 0x10100
+			pcb1 := int(src.Cb[pj]) - 128 // Center the chroma samples on zero.
+			pcr1 := int(src.Cr[pj]) - 128
+			pr := (pyy1 + 91881*pcr1) >> 8
+			pg := (pyy1 - 22554*pcb1 - 46802*pcr1) >> 8
+			pb := (pyy1 + 116130*pcb1) >> 8
+			if pr < 0 { // Clamp each channel to the 16-bit range [0, 0xffff].
+				pr = 0
+			} else if pr > 0xffff {
+				pr = 0xffff
+			}
+			if pg < 0 {
+				pg = 0
+			} else if pg > 0xffff {
+				pg = 0xffff
+			}
+			if pb < 0 {
+				pb = 0
+			} else if pb > 0xffff {
+				pb = 0xffff
+			}
+			dst.Pix[d+0] = uint8(pr >> 8) // Keep the high byte of each 16-bit channel.
+			dst.Pix[d+1] = uint8(pg >> 8)
+			dst.Pix[d+2] = uint8(pb >> 8)
+			dst.Pix[d+3] = 0xff // YCbCr carries no alpha: write fully opaque.
+		}
+	}
+}
+
+func (nnInterpolator) transform_RGBA_YCbCr422_Src(dst *image.RGBA, dr, adr image.Rectangle, d2s *f64.Aff3, src *image.YCbCr, sr image.Rectangle, bias image.Point, opts *Options) { // Affine-transforms a YCbCr src (chroma column index halved) into an RGBA dst, overwriting dst pixels; adr is relative to dr.Min.
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		dyf := float64(dr.Min.Y+int(dy)) + 0.5 // Vertical center of this dst row, in absolute dst coordinates.
+		d := (dr.Min.Y+int(dy)-dst.Rect.Min.Y)*dst.Stride + (dr.Min.X+adr.Min.X-dst.Rect.Min.X)*4 // Index into dst.Pix of the first affected pixel on this row (4 bytes per pixel).
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
+			dxf := float64(dr.Min.X+int(dx)) + 0.5 // Horizontal center of this dst pixel.
+			sx0 := int(d2s[0]*dxf+d2s[1]*dyf+d2s[2]) + bias.X // Map the dst pixel center through d2s, truncate, then add bias to get the src coordinate.
+			sy0 := int(d2s[3]*dxf+d2s[4]*dyf+d2s[5]) + bias.Y
+			if !(image.Point{sx0, sy0}).In(sr) {
+				continue // This dst pixel maps outside sr: leave it untouched.
+			}
+			pi := (sy0-src.Rect.Min.Y)*src.YStride + (sx0 - src.Rect.Min.X) // Index into src.Y of the sampled pixel.
+			pj := (sy0-src.Rect.Min.Y)*src.CStride + ((sx0)/2 - src.Rect.Min.X/2) // Index into src.Cb and src.Cr: full-resolution row, column halved.
+
+			// This is an inline version of image/color/ycbcr.go's YCbCr.RGBA method.
+			pyy1 := int(src.Y[pi]) * 0x10100
+			pcb1 := int(src.Cb[pj]) - 128 // Center the chroma samples on zero.
+			pcr1 := int(src.Cr[pj]) - 128
+			pr := (pyy1 + 91881*pcr1) >> 8
+			pg := (pyy1 - 22554*pcb1 - 46802*pcr1) >> 8
+			pb := (pyy1 + 116130*pcb1) >> 8
+			if pr < 0 { // Clamp each channel to the 16-bit range [0, 0xffff].
+				pr = 0
+			} else if pr > 0xffff {
+				pr = 0xffff
+			}
+			if pg < 0 {
+				pg = 0
+			} else if pg > 0xffff {
+				pg = 0xffff
+			}
+			if pb < 0 {
+				pb = 0
+			} else if pb > 0xffff {
+				pb = 0xffff
+			}
+			dst.Pix[d+0] = uint8(pr >> 8) // Keep the high byte of each 16-bit channel.
+			dst.Pix[d+1] = uint8(pg >> 8)
+			dst.Pix[d+2] = uint8(pb >> 8)
+			dst.Pix[d+3] = 0xff // YCbCr carries no alpha: write fully opaque.
+		}
+	}
+}
+
+func (nnInterpolator) transform_RGBA_YCbCr420_Src(dst *image.RGBA, dr, adr image.Rectangle, d2s *f64.Aff3, src *image.YCbCr, sr image.Rectangle, bias image.Point, opts *Options) { // Affine-transforms a YCbCr src (chroma row and column indices halved) into an RGBA dst, overwriting dst pixels; adr is relative to dr.Min.
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		dyf := float64(dr.Min.Y+int(dy)) + 0.5 // Vertical center of this dst row, in absolute dst coordinates.
+		d := (dr.Min.Y+int(dy)-dst.Rect.Min.Y)*dst.Stride + (dr.Min.X+adr.Min.X-dst.Rect.Min.X)*4 // Index into dst.Pix of the first affected pixel on this row (4 bytes per pixel).
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
+			dxf := float64(dr.Min.X+int(dx)) + 0.5 // Horizontal center of this dst pixel.
+			sx0 := int(d2s[0]*dxf+d2s[1]*dyf+d2s[2]) + bias.X // Map the dst pixel center through d2s, truncate, then add bias to get the src coordinate.
+			sy0 := int(d2s[3]*dxf+d2s[4]*dyf+d2s[5]) + bias.Y
+			if !(image.Point{sx0, sy0}).In(sr) {
+				continue // This dst pixel maps outside sr: leave it untouched.
+			}
+			pi := (sy0-src.Rect.Min.Y)*src.YStride + (sx0 - src.Rect.Min.X) // Index into src.Y of the sampled pixel.
+			pj := ((sy0)/2-src.Rect.Min.Y/2)*src.CStride + ((sx0)/2 - src.Rect.Min.X/2) // Index into src.Cb and src.Cr: both row and column halved.
+
+			// This is an inline version of image/color/ycbcr.go's YCbCr.RGBA method.
+			pyy1 := int(src.Y[pi]) * 0x10100
+			pcb1 := int(src.Cb[pj]) - 128 // Center the chroma samples on zero.
+			pcr1 := int(src.Cr[pj]) - 128
+			pr := (pyy1 + 91881*pcr1) >> 8
+			pg := (pyy1 - 22554*pcb1 - 46802*pcr1) >> 8
+			pb := (pyy1 + 116130*pcb1) >> 8
+			if pr < 0 { // Clamp each channel to the 16-bit range [0, 0xffff].
+				pr = 0
+			} else if pr > 0xffff {
+				pr = 0xffff
+			}
+			if pg < 0 {
+				pg = 0
+			} else if pg > 0xffff {
+				pg = 0xffff
+			}
+			if pb < 0 {
+				pb = 0
+			} else if pb > 0xffff {
+				pb = 0xffff
+			}
+			dst.Pix[d+0] = uint8(pr >> 8) // Keep the high byte of each 16-bit channel.
+			dst.Pix[d+1] = uint8(pg >> 8)
+			dst.Pix[d+2] = uint8(pb >> 8)
+			dst.Pix[d+3] = 0xff // YCbCr carries no alpha: write fully opaque.
+		}
+	}
+}
+
+func (nnInterpolator) transform_RGBA_YCbCr440_Src(dst *image.RGBA, dr, adr image.Rectangle, d2s *f64.Aff3, src *image.YCbCr, sr image.Rectangle, bias image.Point, opts *Options) {
+	// Nearest-neighbor Src: each dst pixel in adr (relative to dr.Min) takes the 4:4:0 YCbCr src pixel that d2s maps it to.
+	for row := int32(adr.Min.Y); row < int32(adr.Max.Y); row++ {
+		fy := float64(dr.Min.Y+int(row)) + 0.5
+		off := (dr.Min.Y+int(row)-dst.Rect.Min.Y)*dst.Stride + (dr.Min.X+adr.Min.X-dst.Rect.Min.X)*4
+		for col := int32(adr.Min.X); col < int32(adr.Max.X); col, off = col+1, off+4 {
+			fx := float64(dr.Min.X+int(col)) + 0.5
+			srcX := int(d2s[0]*fx+d2s[1]*fy+d2s[2]) + bias.X
+			srcY := int(d2s[3]*fx+d2s[4]*fy+d2s[5]) + bias.Y
+			if !image.Pt(srcX, srcY).In(sr) {
+				continue
+			}
+			yi := (srcY-src.Rect.Min.Y)*src.YStride + (srcX - src.Rect.Min.X)
+			ci := (srcY/2-src.Rect.Min.Y/2)*src.CStride + (srcX - src.Rect.Min.X)
+			// Inlined copy of the YCbCr.RGBA conversion in image/color/ycbcr.go; each channel is clamped to 16 bits.
+			luma := int(src.Y[yi]) * 0x10100
+			cb := int(src.Cb[ci]) - 128
+			cr := int(src.Cr[ci]) - 128
+			r := (luma + 91881*cr) >> 8
+			g := (luma - 22554*cb - 46802*cr) >> 8
+			b := (luma + 116130*cb) >> 8
+			if r > 0xffff {
+				r = 0xffff
+			} else if r < 0 {
+				r = 0
+			}
+			if g > 0xffff {
+				g = 0xffff
+			} else if g < 0 {
+				g = 0
+			}
+			if b > 0xffff {
+				b = 0xffff
+			} else if b < 0 {
+				b = 0
+			}
+			dst.Pix[off+0] = uint8(r >> 8)
+			dst.Pix[off+1] = uint8(g >> 8)
+			dst.Pix[off+2] = uint8(b >> 8)
+			dst.Pix[off+3] = 0xff
+		}
+	}
+}
+
+func (nnInterpolator) transform_RGBA_Image_Over(dst *image.RGBA, dr, adr image.Rectangle, d2s *f64.Aff3, src image.Image, sr image.Rectangle, bias image.Point, opts *Options) {
+	// Nearest-neighbor Over: composites onto each dst pixel in adr (relative to dr.Min) the src pixel that d2s maps it to.
+	for row := int32(adr.Min.Y); row < int32(adr.Max.Y); row++ {
+		fy := float64(dr.Min.Y+int(row)) + 0.5
+		off := (dr.Min.Y+int(row)-dst.Rect.Min.Y)*dst.Stride + (dr.Min.X+adr.Min.X-dst.Rect.Min.X)*4
+		for col := int32(adr.Min.X); col < int32(adr.Max.X); col, off = col+1, off+4 {
+			fx := float64(dr.Min.X+int(col)) + 0.5
+			srcX := int(d2s[0]*fx+d2s[1]*fy+d2s[2]) + bias.X
+			srcY := int(d2s[3]*fx+d2s[4]*fy+d2s[5]) + bias.Y
+			if image.Pt(srcX, srcY).In(sr) {
+				r, g, b, a := src.At(srcX, srcY).RGBA()
+				keep := (0xffff - a) * 0x101
+				dst.Pix[off+0] = uint8((uint32(dst.Pix[off+0])*keep/0xffff + r) >> 8)
+				dst.Pix[off+1] = uint8((uint32(dst.Pix[off+1])*keep/0xffff + g) >> 8)
+				dst.Pix[off+2] = uint8((uint32(dst.Pix[off+2])*keep/0xffff + b) >> 8)
+				dst.Pix[off+3] = uint8((uint32(dst.Pix[off+3])*keep/0xffff + a) >> 8)
+			}
+		}
+	}
+}
+
+func (nnInterpolator) transform_RGBA_Image_Src(dst *image.RGBA, dr, adr image.Rectangle, d2s *f64.Aff3, src image.Image, sr image.Rectangle, bias image.Point, opts *Options) {
+	// Nearest-neighbor Src: each dst pixel in adr (relative to dr.Min) is overwritten with the src pixel that d2s maps it to.
+	for row := int32(adr.Min.Y); row < int32(adr.Max.Y); row++ {
+		fy := float64(dr.Min.Y+int(row)) + 0.5
+		off := (dr.Min.Y+int(row)-dst.Rect.Min.Y)*dst.Stride + (dr.Min.X+adr.Min.X-dst.Rect.Min.X)*4
+		for col := int32(adr.Min.X); col < int32(adr.Max.X); col, off = col+1, off+4 {
+			fx := float64(dr.Min.X+int(col)) + 0.5
+			srcX := int(d2s[0]*fx+d2s[1]*fy+d2s[2]) + bias.X
+			srcY := int(d2s[3]*fx+d2s[4]*fy+d2s[5]) + bias.Y
+			if image.Pt(srcX, srcY).In(sr) {
+				r, g, b, a := src.At(srcX, srcY).RGBA()
+				dst.Pix[off+0] = uint8(r >> 8)
+				dst.Pix[off+1] = uint8(g >> 8)
+				dst.Pix[off+2] = uint8(b >> 8)
+				dst.Pix[off+3] = uint8(a >> 8)
+			}
+		}
+	}
+}
+
+func (nnInterpolator) transform_Image_Image_Over(dst Image, dr, adr image.Rectangle, d2s *f64.Aff3, src image.Image, sr image.Rectangle, bias image.Point, opts *Options) {
+	// Generic nearest-neighbor Over path: it only uses the At and Set methods and
+	// honors the optional source and destination masks in opts.
+	srcMask, smp, dstMask, dmp := opts.SrcMask, opts.SrcMaskP, opts.DstMask, opts.DstMaskP
+	// out is reused for every Set call.
+	out := &color.RGBA64{}
+	for row := int32(adr.Min.Y); row < int32(adr.Max.Y); row++ {
+		dstY := dr.Min.Y + int(row)
+		fy := float64(dstY) + 0.5
+		for col := int32(adr.Min.X); col < int32(adr.Max.X); col++ {
+			dstX := dr.Min.X + int(col)
+			fx := float64(dstX) + 0.5
+			srcX := int(d2s[0]*fx+d2s[1]*fy+d2s[2]) + bias.X
+			srcY := int(d2s[3]*fx+d2s[4]*fy+d2s[5]) + bias.Y
+			if !image.Pt(srcX, srcY).In(sr) {
+				continue
+			}
+			// Source sample, attenuated by the alpha of each mask that is present.
+			r, g, b, a := src.At(srcX, srcY).RGBA()
+			if srcMask != nil {
+				_, _, _, m := srcMask.At(smp.X+srcX, smp.Y+srcY).RGBA()
+				r, g, b, a = r*m/0xffff, g*m/0xffff, b*m/0xffff, a*m/0xffff
+			}
+			// Current destination color, read before compositing.
+			qr, qg, qb, qa := dst.At(dstX, dstY).RGBA()
+			if dstMask != nil {
+				_, _, _, m := dstMask.At(dmp.X+dstX, dmp.Y+dstY).RGBA()
+				r, g, b, a = r*m/0xffff, g*m/0xffff, b*m/0xffff, a*m/0xffff
+			}
+			// Over: result = dst*(1-a) + src, on 16-bit alpha-premultiplied channels.
+			keep := 0xffff - a
+			out.R = uint16(qr*keep/0xffff + r)
+			out.G = uint16(qg*keep/0xffff + g)
+			out.B = uint16(qb*keep/0xffff + b)
+			out.A = uint16(qa*keep/0xffff + a)
+			dst.Set(dstX, dstY, out)
+		}
+	}
+}
+
+func (nnInterpolator) transform_Image_Image_Src(dst Image, dr, adr image.Rectangle, d2s *f64.Aff3, src image.Image, sr image.Rectangle, bias image.Point, opts *Options) {
+	// Generic nearest-neighbor Src path: it only uses the At and Set methods and
+	// honors the optional source and destination masks in opts.
+	srcMask, smp, dstMask, dmp := opts.SrcMask, opts.SrcMaskP, opts.DstMask, opts.DstMaskP
+	// out is reused for every Set call.
+	out := &color.RGBA64{}
+	for row := int32(adr.Min.Y); row < int32(adr.Max.Y); row++ {
+		dstY := dr.Min.Y + int(row)
+		fy := float64(dstY) + 0.5
+		for col := int32(adr.Min.X); col < int32(adr.Max.X); col++ {
+			dstX := dr.Min.X + int(col)
+			fx := float64(dstX) + 0.5
+			srcX := int(d2s[0]*fx+d2s[1]*fy+d2s[2]) + bias.X
+			srcY := int(d2s[3]*fx+d2s[4]*fy+d2s[5]) + bias.Y
+			if !image.Pt(srcX, srcY).In(sr) {
+				continue
+			}
+			// Source sample, attenuated by the source mask's alpha when present.
+			r, g, b, a := src.At(srcX, srcY).RGBA()
+			if srcMask != nil {
+				_, _, _, m := srcMask.At(smp.X+srcX, smp.Y+srcY).RGBA()
+				r, g, b, a = r*m/0xffff, g*m/0xffff, b*m/0xffff, a*m/0xffff
+			}
+			if dstMask == nil {
+				// Unmasked Src overwrites the destination pixel outright.
+				out.R = uint16(r)
+				out.G = uint16(g)
+				out.B = uint16(b)
+				out.A = uint16(a)
+				dst.Set(dstX, dstY, out)
+				continue
+			}
+			// Masked Src: blend the masked source with the destination, weighted by the mask alpha m.
+			qr, qg, qb, qa := dst.At(dstX, dstY).RGBA()
+			_, _, _, m := dstMask.At(dmp.X+dstX, dmp.Y+dstY).RGBA()
+			r, g, b, a = r*m/0xffff, g*m/0xffff, b*m/0xffff, a*m/0xffff
+			keep := 0xffff - m
+			out.R = uint16(qr*keep/0xffff + r)
+			out.G = uint16(qg*keep/0xffff + g)
+			out.B = uint16(qb*keep/0xffff + b)
+			out.A = uint16(qa*keep/0xffff + a)
+			dst.Set(dstX, dstY, out)
+		}
+	}
+}
+
+func (z ablInterpolator) Scale(dst Image, dr image.Rectangle, src image.Image, sr image.Rectangle, op Op, opts *Options) {
+	// Scale draws the sr portion of src onto the dr portion of dst, compositing
+	// with op. It prepares the options and the affected rectangle, then hands
+	// off to the most specific scale_* kernel for the concrete image types.
+
+	// Equal sizes need no resampling: delegate to Copy.
+	if dr.Size() == sr.Size() {
+		Copy(dst, dr.Min, src, sr, op, opts)
+		return
+	}
+
+	// Work on a private copy of the options (the zero value when opts is nil).
+	var o Options
+	if opts != nil {
+		o = *opts
+	}
+
+	// adr holds the destination pixels that are actually affected. It is
+	// clipped to dst's bounds and the destination mask, then made relative
+	// to dr.Min, which is the form the kernels expect.
+	adr := dst.Bounds().Intersect(dr)
+	adr, o.DstMask = clipAffectedDestRect(adr, o.DstMask, o.DstMaskP)
+	if adr.Empty() || sr.Empty() {
+		return
+	}
+	adr = adr.Sub(dr.Min)
+
+	// With no source mask and a source that opaque reports as opaque, Over is
+	// downgraded to Src.
+	if op == Over && o.SrcMask == nil && opaque(src) {
+		op = Src
+	}
+
+	// The type-specific kernels index the Pix fields directly, without bounds
+	// checking, and assume nil masks. Masked draws, and an sr that extends
+	// beyond src's bounds, therefore take the generic At/Set kernels.
+	if o.DstMask != nil || o.SrcMask != nil || !sr.In(src.Bounds()) {
+		switch op {
+		case Over:
+			z.scale_Image_Image_Over(dst, dr, adr, src, sr, &o)
+		case Src:
+			z.scale_Image_Image_Src(dst, dr, adr, src, sr, &o)
+		}
+		return
+	}
+	if _, ok := src.(*image.Uniform); ok {
+		Draw(dst, dr, src, src.Bounds().Min, op)
+		return
+	}
+
+	// Only an *image.RGBA destination has specialized kernels.
+	rgba, fast := dst.(*image.RGBA)
+	switch {
+	case op == Over && !fast:
+		z.scale_Image_Image_Over(dst, dr, adr, src, sr, &o)
+	case op == Over:
+		switch src := src.(type) {
+		case *image.NRGBA:
+			z.scale_RGBA_NRGBA_Over(rgba, dr, adr, src, sr, &o)
+		case *image.RGBA:
+			z.scale_RGBA_RGBA_Over(rgba, dr, adr, src, sr, &o)
+		default:
+			z.scale_RGBA_Image_Over(rgba, dr, adr, src, sr, &o)
+		}
+	case op == Src && !fast:
+		z.scale_Image_Image_Src(dst, dr, adr, src, sr, &o)
+	case op == Src:
+		switch src := src.(type) {
+		case *image.Gray:
+			z.scale_RGBA_Gray_Src(rgba, dr, adr, src, sr, &o)
+		case *image.NRGBA:
+			z.scale_RGBA_NRGBA_Src(rgba, dr, adr, src, sr, &o)
+		case *image.RGBA:
+			z.scale_RGBA_RGBA_Src(rgba, dr, adr, src, sr, &o)
+		case *image.YCbCr:
+			switch src.SubsampleRatio {
+			case image.YCbCrSubsampleRatio444:
+				z.scale_RGBA_YCbCr444_Src(rgba, dr, adr, src, sr, &o)
+			case image.YCbCrSubsampleRatio422:
+				z.scale_RGBA_YCbCr422_Src(rgba, dr, adr, src, sr, &o)
+			case image.YCbCrSubsampleRatio420:
+				z.scale_RGBA_YCbCr420_Src(rgba, dr, adr, src, sr, &o)
+			case image.YCbCrSubsampleRatio440:
+				z.scale_RGBA_YCbCr440_Src(rgba, dr, adr, src, sr, &o)
+			default:
+				z.scale_RGBA_Image_Src(rgba, dr, adr, src, sr, &o)
+			}
+		default:
+			z.scale_RGBA_Image_Src(rgba, dr, adr, src, sr, &o)
+		}
+	}
+}
+
+func (z ablInterpolator) Transform(dst Image, s2d f64.Aff3, src image.Image, sr image.Rectangle, op Op, opts *Options) {
+	// Transform draws the sr portion of src onto dst through the affine map
+	// s2d (source to destination coordinates), compositing with op.
+	//
+	// A pure integer translation is simplified to a Copy. Such a map sends
+	// sr.Min to sr.Min + (dx, dy), which is the destination point Copy needs.
+	if s2d[0] == 1 && s2d[1] == 0 && s2d[3] == 0 && s2d[4] == 1 {
+		dx := int(s2d[2])
+		dy := int(s2d[5])
+		if float64(dx) == s2d[2] && float64(dy) == s2d[5] {
+			Copy(dst, image.Point{X: sr.Min.X + dx, Y: sr.Min.Y + dy}, src, sr, op, opts)
+			return
+		}
+	}
+
+	var o Options
+	if opts != nil {
+		o = *opts
+	}
+
+	dr := transformRect(&s2d, &sr)
+	// adr is the affected destination pixels.
+	adr := dst.Bounds().Intersect(dr)
+	adr, o.DstMask = clipAffectedDestRect(adr, o.DstMask, o.DstMaskP)
+	if adr.Empty() || sr.Empty() {
+		return
+	}
+	// With no source mask and a source that opaque reports as opaque, Over is downgraded to Src.
+	if op == Over && o.SrcMask == nil && opaque(src) {
+		op = Src
+	}
+
+	// d2s is the inverse map, from destination to source coordinates.
+	d2s := invert(&s2d)
+	// bias is a translation of the mapping from dst coordinates to src
+	// coordinates such that the latter temporarily have non-negative X
+	// and Y coordinates. This allows us to write int(f) instead of
+	// int(math.Floor(f)), since "round to zero" and "round down" are
+	// equivalent when f >= 0, but the former is much cheaper. The X--
+	// and Y-- are because the TransformLeaf methods have a "sx -= 0.5"
+	// adjustment.
+	bias := transformRect(&d2s, &adr).Min
+	bias.X--
+	bias.Y--
+	d2s[2] -= float64(bias.X)
+	d2s[5] -= float64(bias.Y)
+	// Make adr relative to dr.Min.
+	adr = adr.Sub(dr.Min)
+	// sr is the source pixels. If it extends beyond the src bounds,
+	// we cannot use the type-specific fast paths, as they access
+	// the Pix fields directly without bounds checking.
+	//
+	// Similarly, the fast paths assume that the masks are nil.
+	if o.DstMask != nil || o.SrcMask != nil || !sr.In(src.Bounds()) {
+		switch op {
+		case Over:
+			z.transform_Image_Image_Over(dst, dr, adr, &d2s, src, sr, bias, &o)
+		case Src:
+			z.transform_Image_Image_Src(dst, dr, adr, &d2s, src, sr, bias, &o)
+		}
+	} else if u, ok := src.(*image.Uniform); ok {
+		transform_Uniform(dst, dr, adr, &d2s, u, sr, bias, op)
+	} else {
+		switch op {
+		case Over:
+			switch dst := dst.(type) {
+			case *image.RGBA:
+				switch src := src.(type) {
+				case *image.NRGBA:
+					z.transform_RGBA_NRGBA_Over(dst, dr, adr, &d2s, src, sr, bias, &o)
+				case *image.RGBA:
+					z.transform_RGBA_RGBA_Over(dst, dr, adr, &d2s, src, sr, bias, &o)
+				default:
+					z.transform_RGBA_Image_Over(dst, dr, adr, &d2s, src, sr, bias, &o)
+				}
+			default:
+				z.transform_Image_Image_Over(dst, dr, adr, &d2s, src, sr, bias, &o)
+			}
+		case Src:
+			switch dst := dst.(type) {
+			case *image.RGBA:
+				switch src := src.(type) {
+				case *image.Gray:
+					z.transform_RGBA_Gray_Src(dst, dr, adr, &d2s, src, sr, bias, &o)
+				case *image.NRGBA:
+					z.transform_RGBA_NRGBA_Src(dst, dr, adr, &d2s, src, sr, bias, &o)
+				case *image.RGBA:
+					z.transform_RGBA_RGBA_Src(dst, dr, adr, &d2s, src, sr, bias, &o)
+				case *image.YCbCr:
+					switch src.SubsampleRatio {
+					default:
+						z.transform_RGBA_Image_Src(dst, dr, adr, &d2s, src, sr, bias, &o)
+					case image.YCbCrSubsampleRatio444:
+						z.transform_RGBA_YCbCr444_Src(dst, dr, adr, &d2s, src, sr, bias, &o)
+					case image.YCbCrSubsampleRatio422:
+						z.transform_RGBA_YCbCr422_Src(dst, dr, adr, &d2s, src, sr, bias, &o)
+					case image.YCbCrSubsampleRatio420:
+						z.transform_RGBA_YCbCr420_Src(dst, dr, adr, &d2s, src, sr, bias, &o)
+					case image.YCbCrSubsampleRatio440:
+						z.transform_RGBA_YCbCr440_Src(dst, dr, adr, &d2s, src, sr, bias, &o)
+					}
+				default:
+					z.transform_RGBA_Image_Src(dst, dr, adr, &d2s, src, sr, bias, &o)
+				}
+			default:
+				z.transform_Image_Image_Src(dst, dr, adr, &d2s, src, sr, bias, &o)
+			}
+		}
+	}
+}
+
+func (ablInterpolator) scale_RGBA_Gray_Src(dst *image.RGBA, dr, adr image.Rectangle, src *image.Gray, sr image.Rectangle, opts *Options) { // Bilinear Src scale of a Gray source into opaque RGBA; adr is relative to dr.Min, and sr must lie within src's bounds because src.Pix is indexed directly.
+	sw := int32(sr.Dx())
+	sh := int32(sr.Dy())
+	yscale := float64(sh) / float64(dr.Dy())
+	xscale := float64(sw) / float64(dr.Dx())
+	swMinus1, shMinus1 := sw-1, sh-1
+	// yscale and xscale are source pixels per destination pixel; sy and sx are destination pixel centers in sr-relative source coordinates.
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		sy := (float64(dy)+0.5)*yscale - 0.5
+		// If sy < 0, we will clamp sy0 to 0 anyway, so it doesn't matter if
+		// we say int32(sy) instead of int32(math.Floor(sy)). Similarly for
+		// sx, below.
+		sy0 := int32(sy)
+		yFrac0 := sy - float64(sy0)
+		yFrac1 := 1 - yFrac0
+		sy1 := sy0 + 1
+		if sy < 0 {
+			sy0, sy1 = 0, 0
+			yFrac0, yFrac1 = 0, 1
+		} else if sy1 > shMinus1 {
+			sy0, sy1 = shMinus1, shMinus1
+			yFrac0, yFrac1 = 1, 0
+		}
+		d := (dr.Min.Y+int(dy)-dst.Rect.Min.Y)*dst.Stride + (dr.Min.X+adr.Min.X-dst.Rect.Min.X)*4
+		// d is the dst.Pix offset of the first affected pixel on this row; it advances by 4 bytes per pixel.
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
+			sx := (float64(dx)+0.5)*xscale - 0.5
+			sx0 := int32(sx)
+			xFrac0 := sx - float64(sx0)
+			xFrac1 := 1 - xFrac0
+			sx1 := sx0 + 1
+			if sx < 0 {
+				sx0, sx1 = 0, 0
+				xFrac0, xFrac1 = 0, 1
+			} else if sx1 > swMinus1 {
+				sx0, sx1 = swMinus1, swMinus1
+				xFrac0, xFrac1 = 1, 0
+			}
+			// sXY is the sample at column sxX, row syY: 8-bit gray is widened to 16 bits (* 0x101), blended along x, then along y.
+			s00i := (sr.Min.Y+int(sy0)-src.Rect.Min.Y)*src.Stride + (sr.Min.X + int(sx0) - src.Rect.Min.X)
+			s00ru := uint32(src.Pix[s00i]) * 0x101
+			s00r := float64(s00ru)
+			s10i := (sr.Min.Y+int(sy0)-src.Rect.Min.Y)*src.Stride + (sr.Min.X + int(sx1) - src.Rect.Min.X)
+			s10ru := uint32(src.Pix[s10i]) * 0x101
+			s10r := float64(s10ru)
+			s10r = xFrac1*s00r + xFrac0*s10r
+			s01i := (sr.Min.Y+int(sy1)-src.Rect.Min.Y)*src.Stride + (sr.Min.X + int(sx0) - src.Rect.Min.X)
+			s01ru := uint32(src.Pix[s01i]) * 0x101
+			s01r := float64(s01ru)
+			s11i := (sr.Min.Y+int(sy1)-src.Rect.Min.Y)*src.Stride + (sr.Min.X + int(sx1) - src.Rect.Min.X)
+			s11ru := uint32(src.Pix[s11i]) * 0x101
+			s11r := float64(s11ru)
+			s11r = xFrac1*s01r + xFrac0*s11r
+			s11r = yFrac1*s10r + yFrac0*s11r
+			pr := uint32(s11r)
+			out := uint8(pr >> 8) // back to 8 bits; the same value is stored in R, G and B
+			dst.Pix[d+0] = out
+			dst.Pix[d+1] = out
+			dst.Pix[d+2] = out
+			dst.Pix[d+3] = 0xff
+		}
+	}
+}
+
+func (ablInterpolator) scale_RGBA_NRGBA_Over(dst *image.RGBA, dr, adr image.Rectangle, src *image.NRGBA, sr image.Rectangle, opts *Options) { // Bilinear Over scale of an NRGBA source onto RGBA; adr is relative to dr.Min, and sr must lie within src's bounds because src.Pix is indexed directly.
+	sw := int32(sr.Dx())
+	sh := int32(sr.Dy())
+	yscale := float64(sh) / float64(dr.Dy())
+	xscale := float64(sw) / float64(dr.Dx())
+	swMinus1, shMinus1 := sw-1, sh-1
+	// yscale and xscale are source pixels per destination pixel; sy and sx are destination pixel centers in sr-relative source coordinates.
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		sy := (float64(dy)+0.5)*yscale - 0.5
+		// If sy < 0, we will clamp sy0 to 0 anyway, so it doesn't matter if
+		// we say int32(sy) instead of int32(math.Floor(sy)). Similarly for
+		// sx, below.
+		sy0 := int32(sy)
+		yFrac0 := sy - float64(sy0)
+		yFrac1 := 1 - yFrac0
+		sy1 := sy0 + 1
+		if sy < 0 {
+			sy0, sy1 = 0, 0
+			yFrac0, yFrac1 = 0, 1
+		} else if sy1 > shMinus1 {
+			sy0, sy1 = shMinus1, shMinus1
+			yFrac0, yFrac1 = 1, 0
+		}
+		d := (dr.Min.Y+int(dy)-dst.Rect.Min.Y)*dst.Stride + (dr.Min.X+adr.Min.X-dst.Rect.Min.X)*4
+		// d is the dst.Pix offset of the first affected pixel on this row; it advances by 4 bytes per pixel.
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
+			sx := (float64(dx)+0.5)*xscale - 0.5
+			sx0 := int32(sx)
+			xFrac0 := sx - float64(sx0)
+			xFrac1 := 1 - xFrac0
+			sx1 := sx0 + 1
+			if sx < 0 {
+				sx0, sx1 = 0, 0
+				xFrac0, xFrac1 = 0, 1
+			} else if sx1 > swMinus1 {
+				sx0, sx1 = swMinus1, swMinus1
+				xFrac0, xFrac1 = 1, 0
+			}
+			// sXY is the sample at column sxX, row syY. Each 8-bit color is multiplied by the 16-bit alpha (* 0x101) and divided by 0xff, giving 16-bit premultiplied values that are blended along x, then along y.
+			s00i := (sr.Min.Y+int(sy0)-src.Rect.Min.Y)*src.Stride + (sr.Min.X+int(sx0)-src.Rect.Min.X)*4
+			s00au := uint32(src.Pix[s00i+3]) * 0x101
+			s00ru := uint32(src.Pix[s00i+0]) * s00au / 0xff
+			s00gu := uint32(src.Pix[s00i+1]) * s00au / 0xff
+			s00bu := uint32(src.Pix[s00i+2]) * s00au / 0xff
+			s00r := float64(s00ru)
+			s00g := float64(s00gu)
+			s00b := float64(s00bu)
+			s00a := float64(s00au)
+			s10i := (sr.Min.Y+int(sy0)-src.Rect.Min.Y)*src.Stride + (sr.Min.X+int(sx1)-src.Rect.Min.X)*4
+			s10au := uint32(src.Pix[s10i+3]) * 0x101
+			s10ru := uint32(src.Pix[s10i+0]) * s10au / 0xff
+			s10gu := uint32(src.Pix[s10i+1]) * s10au / 0xff
+			s10bu := uint32(src.Pix[s10i+2]) * s10au / 0xff
+			s10r := float64(s10ru)
+			s10g := float64(s10gu)
+			s10b := float64(s10bu)
+			s10a := float64(s10au)
+			s10r = xFrac1*s00r + xFrac0*s10r
+			s10g = xFrac1*s00g + xFrac0*s10g
+			s10b = xFrac1*s00b + xFrac0*s10b
+			s10a = xFrac1*s00a + xFrac0*s10a
+			s01i := (sr.Min.Y+int(sy1)-src.Rect.Min.Y)*src.Stride + (sr.Min.X+int(sx0)-src.Rect.Min.X)*4
+			s01au := uint32(src.Pix[s01i+3]) * 0x101
+			s01ru := uint32(src.Pix[s01i+0]) * s01au / 0xff
+			s01gu := uint32(src.Pix[s01i+1]) * s01au / 0xff
+			s01bu := uint32(src.Pix[s01i+2]) * s01au / 0xff
+			s01r := float64(s01ru)
+			s01g := float64(s01gu)
+			s01b := float64(s01bu)
+			s01a := float64(s01au)
+			s11i := (sr.Min.Y+int(sy1)-src.Rect.Min.Y)*src.Stride + (sr.Min.X+int(sx1)-src.Rect.Min.X)*4
+			s11au := uint32(src.Pix[s11i+3]) * 0x101
+			s11ru := uint32(src.Pix[s11i+0]) * s11au / 0xff
+			s11gu := uint32(src.Pix[s11i+1]) * s11au / 0xff
+			s11bu := uint32(src.Pix[s11i+2]) * s11au / 0xff
+			s11r := float64(s11ru)
+			s11g := float64(s11gu)
+			s11b := float64(s11bu)
+			s11a := float64(s11au)
+			s11r = xFrac1*s01r + xFrac0*s11r
+			s11g = xFrac1*s01g + xFrac0*s11g
+			s11b = xFrac1*s01b + xFrac0*s11b
+			s11a = xFrac1*s01a + xFrac0*s11a
+			s11r = yFrac1*s10r + yFrac0*s11r
+			s11g = yFrac1*s10g + yFrac0*s11g
+			s11b = yFrac1*s10b + yFrac0*s11b
+			s11a = yFrac1*s10a + yFrac0*s11a
+			pr := uint32(s11r)
+			pg := uint32(s11g)
+			pb := uint32(s11b)
+			pa := uint32(s11a)
+			pa1 := (0xffff - pa) * 0x101 // 1 - source alpha, scaled by 0x101 so that an 8-bit dst value times pa1/0xffff lands in 16 bits
+			dst.Pix[d+0] = uint8((uint32(dst.Pix[d+0])*pa1/0xffff + pr) >> 8)
+			dst.Pix[d+1] = uint8((uint32(dst.Pix[d+1])*pa1/0xffff + pg) >> 8)
+			dst.Pix[d+2] = uint8((uint32(dst.Pix[d+2])*pa1/0xffff + pb) >> 8)
+			dst.Pix[d+3] = uint8((uint32(dst.Pix[d+3])*pa1/0xffff + pa) >> 8)
+		}
+	}
+}
+
+func (ablInterpolator) scale_RGBA_NRGBA_Src(dst *image.RGBA, dr, adr image.Rectangle, src *image.NRGBA, sr image.Rectangle, opts *Options) { // Bilinear Src scale of an NRGBA source into RGBA; adr is relative to dr.Min, and sr must lie within src's bounds because src.Pix is indexed directly.
+	sw := int32(sr.Dx())
+	sh := int32(sr.Dy())
+	yscale := float64(sh) / float64(dr.Dy())
+	xscale := float64(sw) / float64(dr.Dx())
+	swMinus1, shMinus1 := sw-1, sh-1
+	// yscale and xscale are source pixels per destination pixel; sy and sx are destination pixel centers in sr-relative source coordinates.
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		sy := (float64(dy)+0.5)*yscale - 0.5
+		// If sy < 0, we will clamp sy0 to 0 anyway, so it doesn't matter if
+		// we say int32(sy) instead of int32(math.Floor(sy)). Similarly for
+		// sx, below.
+		sy0 := int32(sy)
+		yFrac0 := sy - float64(sy0)
+		yFrac1 := 1 - yFrac0
+		sy1 := sy0 + 1
+		if sy < 0 {
+			sy0, sy1 = 0, 0
+			yFrac0, yFrac1 = 0, 1
+		} else if sy1 > shMinus1 {
+			sy0, sy1 = shMinus1, shMinus1
+			yFrac0, yFrac1 = 1, 0
+		}
+		d := (dr.Min.Y+int(dy)-dst.Rect.Min.Y)*dst.Stride + (dr.Min.X+adr.Min.X-dst.Rect.Min.X)*4
+		// d is the dst.Pix offset of the first affected pixel on this row; it advances by 4 bytes per pixel.
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
+			sx := (float64(dx)+0.5)*xscale - 0.5
+			sx0 := int32(sx)
+			xFrac0 := sx - float64(sx0)
+			xFrac1 := 1 - xFrac0
+			sx1 := sx0 + 1
+			if sx < 0 {
+				sx0, sx1 = 0, 0
+				xFrac0, xFrac1 = 0, 1
+			} else if sx1 > swMinus1 {
+				sx0, sx1 = swMinus1, swMinus1
+				xFrac0, xFrac1 = 1, 0
+			}
+			// sXY is the sample at column sxX, row syY. Each 8-bit color is multiplied by the 16-bit alpha (* 0x101) and divided by 0xff, giving 16-bit premultiplied values that are blended along x, then along y.
+			s00i := (sr.Min.Y+int(sy0)-src.Rect.Min.Y)*src.Stride + (sr.Min.X+int(sx0)-src.Rect.Min.X)*4
+			s00au := uint32(src.Pix[s00i+3]) * 0x101
+			s00ru := uint32(src.Pix[s00i+0]) * s00au / 0xff
+			s00gu := uint32(src.Pix[s00i+1]) * s00au / 0xff
+			s00bu := uint32(src.Pix[s00i+2]) * s00au / 0xff
+			s00r := float64(s00ru)
+			s00g := float64(s00gu)
+			s00b := float64(s00bu)
+			s00a := float64(s00au)
+			s10i := (sr.Min.Y+int(sy0)-src.Rect.Min.Y)*src.Stride + (sr.Min.X+int(sx1)-src.Rect.Min.X)*4
+			s10au := uint32(src.Pix[s10i+3]) * 0x101
+			s10ru := uint32(src.Pix[s10i+0]) * s10au / 0xff
+			s10gu := uint32(src.Pix[s10i+1]) * s10au / 0xff
+			s10bu := uint32(src.Pix[s10i+2]) * s10au / 0xff
+			s10r := float64(s10ru)
+			s10g := float64(s10gu)
+			s10b := float64(s10bu)
+			s10a := float64(s10au)
+			s10r = xFrac1*s00r + xFrac0*s10r
+			s10g = xFrac1*s00g + xFrac0*s10g
+			s10b = xFrac1*s00b + xFrac0*s10b
+			s10a = xFrac1*s00a + xFrac0*s10a
+			s01i := (sr.Min.Y+int(sy1)-src.Rect.Min.Y)*src.Stride + (sr.Min.X+int(sx0)-src.Rect.Min.X)*4
+			s01au := uint32(src.Pix[s01i+3]) * 0x101
+			s01ru := uint32(src.Pix[s01i+0]) * s01au / 0xff
+			s01gu := uint32(src.Pix[s01i+1]) * s01au / 0xff
+			s01bu := uint32(src.Pix[s01i+2]) * s01au / 0xff
+			s01r := float64(s01ru)
+			s01g := float64(s01gu)
+			s01b := float64(s01bu)
+			s01a := float64(s01au)
+			s11i := (sr.Min.Y+int(sy1)-src.Rect.Min.Y)*src.Stride + (sr.Min.X+int(sx1)-src.Rect.Min.X)*4
+			s11au := uint32(src.Pix[s11i+3]) * 0x101
+			s11ru := uint32(src.Pix[s11i+0]) * s11au / 0xff
+			s11gu := uint32(src.Pix[s11i+1]) * s11au / 0xff
+			s11bu := uint32(src.Pix[s11i+2]) * s11au / 0xff
+			s11r := float64(s11ru)
+			s11g := float64(s11gu)
+			s11b := float64(s11bu)
+			s11a := float64(s11au)
+			s11r = xFrac1*s01r + xFrac0*s11r
+			s11g = xFrac1*s01g + xFrac0*s11g
+			s11b = xFrac1*s01b + xFrac0*s11b
+			s11a = xFrac1*s01a + xFrac0*s11a
+			s11r = yFrac1*s10r + yFrac0*s11r
+			s11g = yFrac1*s10g + yFrac0*s11g
+			s11b = yFrac1*s10b + yFrac0*s11b
+			s11a = yFrac1*s10a + yFrac0*s11a
+			pr := uint32(s11r)
+			pg := uint32(s11g)
+			pb := uint32(s11b)
+			pa := uint32(s11a)
+			dst.Pix[d+0] = uint8(pr >> 8)
+			dst.Pix[d+1] = uint8(pg >> 8)
+			dst.Pix[d+2] = uint8(pb >> 8)
+			dst.Pix[d+3] = uint8(pa >> 8)
+		}
+	}
+}
+
+func (ablInterpolator) scale_RGBA_RGBA_Over(dst *image.RGBA, dr, adr image.Rectangle, src *image.RGBA, sr image.Rectangle, opts *Options) { // Bilinear Over scale of an RGBA source onto RGBA; adr is relative to dr.Min, and sr must lie within src's bounds because src.Pix is indexed directly.
+	sw := int32(sr.Dx())
+	sh := int32(sr.Dy())
+	yscale := float64(sh) / float64(dr.Dy())
+	xscale := float64(sw) / float64(dr.Dx())
+	swMinus1, shMinus1 := sw-1, sh-1
+	// yscale and xscale are source pixels per destination pixel; sy and sx are destination pixel centers in sr-relative source coordinates.
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		sy := (float64(dy)+0.5)*yscale - 0.5
+		// If sy < 0, we will clamp sy0 to 0 anyway, so it doesn't matter if
+		// we say int32(sy) instead of int32(math.Floor(sy)). Similarly for
+		// sx, below.
+		sy0 := int32(sy)
+		yFrac0 := sy - float64(sy0)
+		yFrac1 := 1 - yFrac0
+		sy1 := sy0 + 1
+		if sy < 0 {
+			sy0, sy1 = 0, 0
+			yFrac0, yFrac1 = 0, 1
+		} else if sy1 > shMinus1 {
+			sy0, sy1 = shMinus1, shMinus1
+			yFrac0, yFrac1 = 1, 0
+		}
+		d := (dr.Min.Y+int(dy)-dst.Rect.Min.Y)*dst.Stride + (dr.Min.X+adr.Min.X-dst.Rect.Min.X)*4
+		// d is the dst.Pix offset of the first affected pixel on this row; it advances by 4 bytes per pixel.
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
+			sx := (float64(dx)+0.5)*xscale - 0.5
+			sx0 := int32(sx)
+			xFrac0 := sx - float64(sx0)
+			xFrac1 := 1 - xFrac0
+			sx1 := sx0 + 1
+			if sx < 0 {
+				sx0, sx1 = 0, 0
+				xFrac0, xFrac1 = 0, 1
+			} else if sx1 > swMinus1 {
+				sx0, sx1 = swMinus1, swMinus1
+				xFrac0, xFrac1 = 1, 0
+			}
+			// sXY is the sample at column sxX, row syY. Each 8-bit channel is widened to 16 bits (* 0x101), then blended along x and along y.
+			s00i := (sr.Min.Y+int(sy0)-src.Rect.Min.Y)*src.Stride + (sr.Min.X+int(sx0)-src.Rect.Min.X)*4
+			s00ru := uint32(src.Pix[s00i+0]) * 0x101
+			s00gu := uint32(src.Pix[s00i+1]) * 0x101
+			s00bu := uint32(src.Pix[s00i+2]) * 0x101
+			s00au := uint32(src.Pix[s00i+3]) * 0x101
+			s00r := float64(s00ru)
+			s00g := float64(s00gu)
+			s00b := float64(s00bu)
+			s00a := float64(s00au)
+			s10i := (sr.Min.Y+int(sy0)-src.Rect.Min.Y)*src.Stride + (sr.Min.X+int(sx1)-src.Rect.Min.X)*4
+			s10ru := uint32(src.Pix[s10i+0]) * 0x101
+			s10gu := uint32(src.Pix[s10i+1]) * 0x101
+			s10bu := uint32(src.Pix[s10i+2]) * 0x101
+			s10au := uint32(src.Pix[s10i+3]) * 0x101
+			s10r := float64(s10ru)
+			s10g := float64(s10gu)
+			s10b := float64(s10bu)
+			s10a := float64(s10au)
+			s10r = xFrac1*s00r + xFrac0*s10r
+			s10g = xFrac1*s00g + xFrac0*s10g
+			s10b = xFrac1*s00b + xFrac0*s10b
+			s10a = xFrac1*s00a + xFrac0*s10a
+			s01i := (sr.Min.Y+int(sy1)-src.Rect.Min.Y)*src.Stride + (sr.Min.X+int(sx0)-src.Rect.Min.X)*4
+			s01ru := uint32(src.Pix[s01i+0]) * 0x101
+			s01gu := uint32(src.Pix[s01i+1]) * 0x101
+			s01bu := uint32(src.Pix[s01i+2]) * 0x101
+			s01au := uint32(src.Pix[s01i+3]) * 0x101
+			s01r := float64(s01ru)
+			s01g := float64(s01gu)
+			s01b := float64(s01bu)
+			s01a := float64(s01au)
+			s11i := (sr.Min.Y+int(sy1)-src.Rect.Min.Y)*src.Stride + (sr.Min.X+int(sx1)-src.Rect.Min.X)*4
+			s11ru := uint32(src.Pix[s11i+0]) * 0x101
+			s11gu := uint32(src.Pix[s11i+1]) * 0x101
+			s11bu := uint32(src.Pix[s11i+2]) * 0x101
+			s11au := uint32(src.Pix[s11i+3]) * 0x101
+			s11r := float64(s11ru)
+			s11g := float64(s11gu)
+			s11b := float64(s11bu)
+			s11a := float64(s11au)
+			s11r = xFrac1*s01r + xFrac0*s11r
+			s11g = xFrac1*s01g + xFrac0*s11g
+			s11b = xFrac1*s01b + xFrac0*s11b
+			s11a = xFrac1*s01a + xFrac0*s11a
+			s11r = yFrac1*s10r + yFrac0*s11r
+			s11g = yFrac1*s10g + yFrac0*s11g
+			s11b = yFrac1*s10b + yFrac0*s11b
+			s11a = yFrac1*s10a + yFrac0*s11a
+			pr := uint32(s11r)
+			pg := uint32(s11g)
+			pb := uint32(s11b)
+			pa := uint32(s11a)
+			pa1 := (0xffff - pa) * 0x101 // 1 - source alpha, scaled by 0x101 so that an 8-bit dst value times pa1/0xffff lands in 16 bits
+			dst.Pix[d+0] = uint8((uint32(dst.Pix[d+0])*pa1/0xffff + pr) >> 8)
+			dst.Pix[d+1] = uint8((uint32(dst.Pix[d+1])*pa1/0xffff + pg) >> 8)
+			dst.Pix[d+2] = uint8((uint32(dst.Pix[d+2])*pa1/0xffff + pb) >> 8)
+			dst.Pix[d+3] = uint8((uint32(dst.Pix[d+3])*pa1/0xffff + pa) >> 8)
+		}
+	}
+}
+
+func (ablInterpolator) scale_RGBA_RGBA_Src(dst *image.RGBA, dr, adr image.Rectangle, src *image.RGBA, sr image.Rectangle, opts *Options) { // Bilinear Src scale of an RGBA source into RGBA; adr is relative to dr.Min, and sr must lie within src's bounds because src.Pix is indexed directly.
+	sw := int32(sr.Dx())
+	sh := int32(sr.Dy())
+	yscale := float64(sh) / float64(dr.Dy())
+	xscale := float64(sw) / float64(dr.Dx())
+	swMinus1, shMinus1 := sw-1, sh-1
+	// yscale and xscale are source pixels per destination pixel; sy and sx are destination pixel centers in sr-relative source coordinates.
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		sy := (float64(dy)+0.5)*yscale - 0.5
+		// If sy < 0, we will clamp sy0 to 0 anyway, so it doesn't matter if
+		// we say int32(sy) instead of int32(math.Floor(sy)). Similarly for
+		// sx, below.
+		sy0 := int32(sy)
+		yFrac0 := sy - float64(sy0)
+		yFrac1 := 1 - yFrac0
+		sy1 := sy0 + 1
+		if sy < 0 {
+			sy0, sy1 = 0, 0
+			yFrac0, yFrac1 = 0, 1
+		} else if sy1 > shMinus1 {
+			sy0, sy1 = shMinus1, shMinus1
+			yFrac0, yFrac1 = 1, 0
+		}
+		d := (dr.Min.Y+int(dy)-dst.Rect.Min.Y)*dst.Stride + (dr.Min.X+adr.Min.X-dst.Rect.Min.X)*4
+		// d is the dst.Pix offset of the first affected pixel on this row; it advances by 4 bytes per pixel.
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
+			sx := (float64(dx)+0.5)*xscale - 0.5
+			sx0 := int32(sx)
+			xFrac0 := sx - float64(sx0)
+			xFrac1 := 1 - xFrac0
+			sx1 := sx0 + 1
+			if sx < 0 {
+				sx0, sx1 = 0, 0
+				xFrac0, xFrac1 = 0, 1
+			} else if sx1 > swMinus1 {
+				sx0, sx1 = swMinus1, swMinus1
+				xFrac0, xFrac1 = 1, 0
+			}
+			// sXY is the sample at column sxX, row syY. Each 8-bit channel is widened to 16 bits (* 0x101), then blended along x and along y.
+			s00i := (sr.Min.Y+int(sy0)-src.Rect.Min.Y)*src.Stride + (sr.Min.X+int(sx0)-src.Rect.Min.X)*4
+			s00ru := uint32(src.Pix[s00i+0]) * 0x101
+			s00gu := uint32(src.Pix[s00i+1]) * 0x101
+			s00bu := uint32(src.Pix[s00i+2]) * 0x101
+			s00au := uint32(src.Pix[s00i+3]) * 0x101
+			s00r := float64(s00ru)
+			s00g := float64(s00gu)
+			s00b := float64(s00bu)
+			s00a := float64(s00au)
+			s10i := (sr.Min.Y+int(sy0)-src.Rect.Min.Y)*src.Stride + (sr.Min.X+int(sx1)-src.Rect.Min.X)*4
+			s10ru := uint32(src.Pix[s10i+0]) * 0x101
+			s10gu := uint32(src.Pix[s10i+1]) * 0x101
+			s10bu := uint32(src.Pix[s10i+2]) * 0x101
+			s10au := uint32(src.Pix[s10i+3]) * 0x101
+			s10r := float64(s10ru)
+			s10g := float64(s10gu)
+			s10b := float64(s10bu)
+			s10a := float64(s10au)
+			s10r = xFrac1*s00r + xFrac0*s10r
+			s10g = xFrac1*s00g + xFrac0*s10g
+			s10b = xFrac1*s00b + xFrac0*s10b
+			s10a = xFrac1*s00a + xFrac0*s10a
+			s01i := (sr.Min.Y+int(sy1)-src.Rect.Min.Y)*src.Stride + (sr.Min.X+int(sx0)-src.Rect.Min.X)*4
+			s01ru := uint32(src.Pix[s01i+0]) * 0x101
+			s01gu := uint32(src.Pix[s01i+1]) * 0x101
+			s01bu := uint32(src.Pix[s01i+2]) * 0x101
+			s01au := uint32(src.Pix[s01i+3]) * 0x101
+			s01r := float64(s01ru)
+			s01g := float64(s01gu)
+			s01b := float64(s01bu)
+			s01a := float64(s01au)
+			s11i := (sr.Min.Y+int(sy1)-src.Rect.Min.Y)*src.Stride + (sr.Min.X+int(sx1)-src.Rect.Min.X)*4
+			s11ru := uint32(src.Pix[s11i+0]) * 0x101
+			s11gu := uint32(src.Pix[s11i+1]) * 0x101
+			s11bu := uint32(src.Pix[s11i+2]) * 0x101
+			s11au := uint32(src.Pix[s11i+3]) * 0x101
+			s11r := float64(s11ru)
+			s11g := float64(s11gu)
+			s11b := float64(s11bu)
+			s11a := float64(s11au)
+			s11r = xFrac1*s01r + xFrac0*s11r
+			s11g = xFrac1*s01g + xFrac0*s11g
+			s11b = xFrac1*s01b + xFrac0*s11b
+			s11a = xFrac1*s01a + xFrac0*s11a
+			s11r = yFrac1*s10r + yFrac0*s11r
+			s11g = yFrac1*s10g + yFrac0*s11g
+			s11b = yFrac1*s10b + yFrac0*s11b
+			s11a = yFrac1*s10a + yFrac0*s11a
+			pr := uint32(s11r)
+			pg := uint32(s11g)
+			pb := uint32(s11b)
+			pa := uint32(s11a)
+			dst.Pix[d+0] = uint8(pr >> 8)
+			dst.Pix[d+1] = uint8(pg >> 8)
+			dst.Pix[d+2] = uint8(pb >> 8)
+			dst.Pix[d+3] = uint8(pa >> 8)
+		}
+	}
+}
+
+func (ablInterpolator) scale_RGBA_YCbCr444_Src(dst *image.RGBA, dr, adr image.Rectangle, src *image.YCbCr, sr image.Rectangle, opts *Options) { // Bilinear scale of sr (in src) onto dr (in dst), writing only the part of dr covered by adr; chroma is read at the same offsets as luma. opts is not read.
+	sw := int32(sr.Dx())
+	sh := int32(sr.Dy())
+	yscale := float64(sh) / float64(dr.Dy()) // source rows per destination row
+	xscale := float64(sw) / float64(dr.Dx()) // source columns per destination column
+	swMinus1, shMinus1 := sw-1, sh-1
+	// dy and dx are offsets from dr.Min; sy and sx are offsets from sr.Min.
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		sy := (float64(dy)+0.5)*yscale - 0.5
+		// sy is the center of destination row dy, measured in source rows. If
+		// sy < 0, sy0 is clamped to 0 below, so int32(sy) is as good as
+		// int32(math.Floor(sy)). The same holds for sx, further down.
+		sy0 := int32(sy)
+		yFrac0 := sy - float64(sy0) // yFrac0 weights row sy1 and yFrac1 weights row sy0
+		yFrac1 := 1 - yFrac0
+		sy1 := sy0 + 1
+		if sy < 0 { // before the first source row: take row 0 at full weight
+			sy0, sy1 = 0, 0
+			yFrac0, yFrac1 = 0, 1
+		} else if sy1 > shMinus1 { // no row after sy0 inside sr: use the last row for both
+			sy0, sy1 = shMinus1, shMinus1
+			yFrac0, yFrac1 = 1, 0
+		}
+		d := (dr.Min.Y+int(dy)-dst.Rect.Min.Y)*dst.Stride + (dr.Min.X+adr.Min.X-dst.Rect.Min.X)*4 // index into dst.Pix of the first pixel written on this row
+
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
+			sx := (float64(dx)+0.5)*xscale - 0.5 // center of destination column dx, measured in source columns
+			sx0 := int32(sx)
+			xFrac0 := sx - float64(sx0) // xFrac0 weights column sx1 and xFrac1 weights column sx0
+			xFrac1 := 1 - xFrac0
+			sx1 := sx0 + 1
+			if sx < 0 { // before the first source column: take column 0 at full weight
+				sx0, sx1 = 0, 0
+				xFrac0, xFrac1 = 0, 1
+			} else if sx1 > swMinus1 { // no column after sx0 inside sr: use the last column for both
+				sx0, sx1 = swMinus1, swMinus1
+				xFrac0, xFrac1 = 1, 0
+			}
+			// Sample (sx0, sy0). i indexes src.Y; j indexes src.Cb and src.Cr, addressed here at (x, y), the same as Y.
+			s00i := (sr.Min.Y+int(sy0)-src.Rect.Min.Y)*src.YStride + (sr.Min.X + int(sx0) - src.Rect.Min.X)
+			s00j := (sr.Min.Y+int(sy0)-src.Rect.Min.Y)*src.CStride + (sr.Min.X + int(sx0) - src.Rect.Min.X)
+
+			// This is an inline version of image/color/ycbcr.go's YCbCr.RGBA method.
+			s00yy1 := int(src.Y[s00i]) * 0x10100
+			s00cb1 := int(src.Cb[s00j]) - 128
+			s00cr1 := int(src.Cr[s00j]) - 128
+			s00ru := (s00yy1 + 91881*s00cr1) >> 8
+			s00gu := (s00yy1 - 22554*s00cb1 - 46802*s00cr1) >> 8
+			s00bu := (s00yy1 + 116130*s00cb1) >> 8
+			if s00ru < 0 { // each channel is clamped to [0, 0xffff]
+				s00ru = 0
+			} else if s00ru > 0xffff {
+				s00ru = 0xffff
+			}
+			if s00gu < 0 {
+				s00gu = 0
+			} else if s00gu > 0xffff {
+				s00gu = 0xffff
+			}
+			if s00bu < 0 {
+				s00bu = 0
+			} else if s00bu > 0xffff {
+				s00bu = 0xffff
+			}
+
+			s00r := float64(s00ru)
+			s00g := float64(s00gu)
+			s00b := float64(s00bu)
+			s10i := (sr.Min.Y+int(sy0)-src.Rect.Min.Y)*src.YStride + (sr.Min.X + int(sx1) - src.Rect.Min.X) // sample (sx1, sy0)
+			s10j := (sr.Min.Y+int(sy0)-src.Rect.Min.Y)*src.CStride + (sr.Min.X + int(sx1) - src.Rect.Min.X)
+
+			// This is an inline version of image/color/ycbcr.go's YCbCr.RGBA method.
+			s10yy1 := int(src.Y[s10i]) * 0x10100
+			s10cb1 := int(src.Cb[s10j]) - 128
+			s10cr1 := int(src.Cr[s10j]) - 128
+			s10ru := (s10yy1 + 91881*s10cr1) >> 8
+			s10gu := (s10yy1 - 22554*s10cb1 - 46802*s10cr1) >> 8
+			s10bu := (s10yy1 + 116130*s10cb1) >> 8
+			if s10ru < 0 {
+				s10ru = 0
+			} else if s10ru > 0xffff {
+				s10ru = 0xffff
+			}
+			if s10gu < 0 {
+				s10gu = 0
+			} else if s10gu > 0xffff {
+				s10gu = 0xffff
+			}
+			if s10bu < 0 {
+				s10bu = 0
+			} else if s10bu > 0xffff {
+				s10bu = 0xffff
+			}
+
+			s10r := float64(s10ru)
+			s10g := float64(s10gu)
+			s10b := float64(s10bu)
+			s10r = xFrac1*s00r + xFrac0*s10r // blend row sy0 horizontally; s10r, s10g, s10b now hold the result
+			s10g = xFrac1*s00g + xFrac0*s10g
+			s10b = xFrac1*s00b + xFrac0*s10b
+			s01i := (sr.Min.Y+int(sy1)-src.Rect.Min.Y)*src.YStride + (sr.Min.X + int(sx0) - src.Rect.Min.X) // sample (sx0, sy1)
+			s01j := (sr.Min.Y+int(sy1)-src.Rect.Min.Y)*src.CStride + (sr.Min.X + int(sx0) - src.Rect.Min.X)
+
+			// This is an inline version of image/color/ycbcr.go's YCbCr.RGBA method.
+			s01yy1 := int(src.Y[s01i]) * 0x10100
+			s01cb1 := int(src.Cb[s01j]) - 128
+			s01cr1 := int(src.Cr[s01j]) - 128
+			s01ru := (s01yy1 + 91881*s01cr1) >> 8
+			s01gu := (s01yy1 - 22554*s01cb1 - 46802*s01cr1) >> 8
+			s01bu := (s01yy1 + 116130*s01cb1) >> 8
+			if s01ru < 0 {
+				s01ru = 0
+			} else if s01ru > 0xffff {
+				s01ru = 0xffff
+			}
+			if s01gu < 0 {
+				s01gu = 0
+			} else if s01gu > 0xffff {
+				s01gu = 0xffff
+			}
+			if s01bu < 0 {
+				s01bu = 0
+			} else if s01bu > 0xffff {
+				s01bu = 0xffff
+			}
+
+			s01r := float64(s01ru)
+			s01g := float64(s01gu)
+			s01b := float64(s01bu)
+			s11i := (sr.Min.Y+int(sy1)-src.Rect.Min.Y)*src.YStride + (sr.Min.X + int(sx1) - src.Rect.Min.X) // sample (sx1, sy1)
+			s11j := (sr.Min.Y+int(sy1)-src.Rect.Min.Y)*src.CStride + (sr.Min.X + int(sx1) - src.Rect.Min.X)
+
+			// This is an inline version of image/color/ycbcr.go's YCbCr.RGBA method.
+			s11yy1 := int(src.Y[s11i]) * 0x10100
+			s11cb1 := int(src.Cb[s11j]) - 128
+			s11cr1 := int(src.Cr[s11j]) - 128
+			s11ru := (s11yy1 + 91881*s11cr1) >> 8
+			s11gu := (s11yy1 - 22554*s11cb1 - 46802*s11cr1) >> 8
+			s11bu := (s11yy1 + 116130*s11cb1) >> 8
+			if s11ru < 0 {
+				s11ru = 0
+			} else if s11ru > 0xffff {
+				s11ru = 0xffff
+			}
+			if s11gu < 0 {
+				s11gu = 0
+			} else if s11gu > 0xffff {
+				s11gu = 0xffff
+			}
+			if s11bu < 0 {
+				s11bu = 0
+			} else if s11bu > 0xffff {
+				s11bu = 0xffff
+			}
+
+			s11r := float64(s11ru)
+			s11g := float64(s11gu)
+			s11b := float64(s11bu)
+			s11r = xFrac1*s01r + xFrac0*s11r // blend row sy1 horizontally
+			s11g = xFrac1*s01g + xFrac0*s11g
+			s11b = xFrac1*s01b + xFrac0*s11b
+			s11r = yFrac1*s10r + yFrac0*s11r // blend the two rows vertically
+			s11g = yFrac1*s10g + yFrac0*s11g
+			s11b = yFrac1*s10b + yFrac0*s11b
+			pr := uint32(s11r) // the conversion drops the fractional part
+			pg := uint32(s11g)
+			pb := uint32(s11b)
+			dst.Pix[d+0] = uint8(pr >> 8) // 16-bit channel reduced to its top 8 bits
+			dst.Pix[d+1] = uint8(pg >> 8)
+			dst.Pix[d+2] = uint8(pb >> 8)
+			dst.Pix[d+3] = 0xff // output is always opaque
+		}
+	}
+}
+
+func (ablInterpolator) scale_RGBA_YCbCr422_Src(dst *image.RGBA, dr, adr image.Rectangle, src *image.YCbCr, sr image.Rectangle, opts *Options) { // Bilinear scale of sr (in src) onto dr (in dst), writing only the part of dr covered by adr; chroma is read at half the x coordinate. opts is not read.
+	sw := int32(sr.Dx())
+	sh := int32(sr.Dy())
+	yscale := float64(sh) / float64(dr.Dy()) // source rows per destination row
+	xscale := float64(sw) / float64(dr.Dx()) // source columns per destination column
+	swMinus1, shMinus1 := sw-1, sh-1
+	// dy and dx are offsets from dr.Min; sy and sx are offsets from sr.Min.
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		sy := (float64(dy)+0.5)*yscale - 0.5
+		// sy is the center of destination row dy, measured in source rows. If
+		// sy < 0, sy0 is clamped to 0 below, so int32(sy) is as good as
+		// int32(math.Floor(sy)). The same holds for sx, further down.
+		sy0 := int32(sy)
+		yFrac0 := sy - float64(sy0) // yFrac0 weights row sy1 and yFrac1 weights row sy0
+		yFrac1 := 1 - yFrac0
+		sy1 := sy0 + 1
+		if sy < 0 { // before the first source row: take row 0 at full weight
+			sy0, sy1 = 0, 0
+			yFrac0, yFrac1 = 0, 1
+		} else if sy1 > shMinus1 { // no row after sy0 inside sr: use the last row for both
+			sy0, sy1 = shMinus1, shMinus1
+			yFrac0, yFrac1 = 1, 0
+		}
+		d := (dr.Min.Y+int(dy)-dst.Rect.Min.Y)*dst.Stride + (dr.Min.X+adr.Min.X-dst.Rect.Min.X)*4 // index into dst.Pix of the first pixel written on this row
+
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
+			sx := (float64(dx)+0.5)*xscale - 0.5 // center of destination column dx, measured in source columns
+			sx0 := int32(sx)
+			xFrac0 := sx - float64(sx0) // xFrac0 weights column sx1 and xFrac1 weights column sx0
+			xFrac1 := 1 - xFrac0
+			sx1 := sx0 + 1
+			if sx < 0 { // before the first source column: take column 0 at full weight
+				sx0, sx1 = 0, 0
+				xFrac0, xFrac1 = 0, 1
+			} else if sx1 > swMinus1 { // no column after sx0 inside sr: use the last column for both
+				sx0, sx1 = swMinus1, swMinus1
+				xFrac0, xFrac1 = 1, 0
+			}
+			// Sample (sx0, sy0). i indexes src.Y; j indexes src.Cb and src.Cr, addressed here at (x/2, y).
+			s00i := (sr.Min.Y+int(sy0)-src.Rect.Min.Y)*src.YStride + (sr.Min.X + int(sx0) - src.Rect.Min.X)
+			s00j := (sr.Min.Y+int(sy0)-src.Rect.Min.Y)*src.CStride + ((sr.Min.X+int(sx0))/2 - src.Rect.Min.X/2)
+
+			// This is an inline version of image/color/ycbcr.go's YCbCr.RGBA method.
+			s00yy1 := int(src.Y[s00i]) * 0x10100
+			s00cb1 := int(src.Cb[s00j]) - 128
+			s00cr1 := int(src.Cr[s00j]) - 128
+			s00ru := (s00yy1 + 91881*s00cr1) >> 8
+			s00gu := (s00yy1 - 22554*s00cb1 - 46802*s00cr1) >> 8
+			s00bu := (s00yy1 + 116130*s00cb1) >> 8
+			if s00ru < 0 { // each channel is clamped to [0, 0xffff]
+				s00ru = 0
+			} else if s00ru > 0xffff {
+				s00ru = 0xffff
+			}
+			if s00gu < 0 {
+				s00gu = 0
+			} else if s00gu > 0xffff {
+				s00gu = 0xffff
+			}
+			if s00bu < 0 {
+				s00bu = 0
+			} else if s00bu > 0xffff {
+				s00bu = 0xffff
+			}
+
+			s00r := float64(s00ru)
+			s00g := float64(s00gu)
+			s00b := float64(s00bu)
+			s10i := (sr.Min.Y+int(sy0)-src.Rect.Min.Y)*src.YStride + (sr.Min.X + int(sx1) - src.Rect.Min.X) // sample (sx1, sy0)
+			s10j := (sr.Min.Y+int(sy0)-src.Rect.Min.Y)*src.CStride + ((sr.Min.X+int(sx1))/2 - src.Rect.Min.X/2)
+
+			// This is an inline version of image/color/ycbcr.go's YCbCr.RGBA method.
+			s10yy1 := int(src.Y[s10i]) * 0x10100
+			s10cb1 := int(src.Cb[s10j]) - 128
+			s10cr1 := int(src.Cr[s10j]) - 128
+			s10ru := (s10yy1 + 91881*s10cr1) >> 8
+			s10gu := (s10yy1 - 22554*s10cb1 - 46802*s10cr1) >> 8
+			s10bu := (s10yy1 + 116130*s10cb1) >> 8
+			if s10ru < 0 {
+				s10ru = 0
+			} else if s10ru > 0xffff {
+				s10ru = 0xffff
+			}
+			if s10gu < 0 {
+				s10gu = 0
+			} else if s10gu > 0xffff {
+				s10gu = 0xffff
+			}
+			if s10bu < 0 {
+				s10bu = 0
+			} else if s10bu > 0xffff {
+				s10bu = 0xffff
+			}
+
+			s10r := float64(s10ru)
+			s10g := float64(s10gu)
+			s10b := float64(s10bu)
+			s10r = xFrac1*s00r + xFrac0*s10r // blend row sy0 horizontally; s10r, s10g, s10b now hold the result
+			s10g = xFrac1*s00g + xFrac0*s10g
+			s10b = xFrac1*s00b + xFrac0*s10b
+			s01i := (sr.Min.Y+int(sy1)-src.Rect.Min.Y)*src.YStride + (sr.Min.X + int(sx0) - src.Rect.Min.X) // sample (sx0, sy1)
+			s01j := (sr.Min.Y+int(sy1)-src.Rect.Min.Y)*src.CStride + ((sr.Min.X+int(sx0))/2 - src.Rect.Min.X/2)
+
+			// This is an inline version of image/color/ycbcr.go's YCbCr.RGBA method.
+			s01yy1 := int(src.Y[s01i]) * 0x10100
+			s01cb1 := int(src.Cb[s01j]) - 128
+			s01cr1 := int(src.Cr[s01j]) - 128
+			s01ru := (s01yy1 + 91881*s01cr1) >> 8
+			s01gu := (s01yy1 - 22554*s01cb1 - 46802*s01cr1) >> 8
+			s01bu := (s01yy1 + 116130*s01cb1) >> 8
+			if s01ru < 0 {
+				s01ru = 0
+			} else if s01ru > 0xffff {
+				s01ru = 0xffff
+			}
+			if s01gu < 0 {
+				s01gu = 0
+			} else if s01gu > 0xffff {
+				s01gu = 0xffff
+			}
+			if s01bu < 0 {
+				s01bu = 0
+			} else if s01bu > 0xffff {
+				s01bu = 0xffff
+			}
+
+			s01r := float64(s01ru)
+			s01g := float64(s01gu)
+			s01b := float64(s01bu)
+			s11i := (sr.Min.Y+int(sy1)-src.Rect.Min.Y)*src.YStride + (sr.Min.X + int(sx1) - src.Rect.Min.X) // sample (sx1, sy1)
+			s11j := (sr.Min.Y+int(sy1)-src.Rect.Min.Y)*src.CStride + ((sr.Min.X+int(sx1))/2 - src.Rect.Min.X/2)
+
+			// This is an inline version of image/color/ycbcr.go's YCbCr.RGBA method.
+			s11yy1 := int(src.Y[s11i]) * 0x10100
+			s11cb1 := int(src.Cb[s11j]) - 128
+			s11cr1 := int(src.Cr[s11j]) - 128
+			s11ru := (s11yy1 + 91881*s11cr1) >> 8
+			s11gu := (s11yy1 - 22554*s11cb1 - 46802*s11cr1) >> 8
+			s11bu := (s11yy1 + 116130*s11cb1) >> 8
+			if s11ru < 0 {
+				s11ru = 0
+			} else if s11ru > 0xffff {
+				s11ru = 0xffff
+			}
+			if s11gu < 0 {
+				s11gu = 0
+			} else if s11gu > 0xffff {
+				s11gu = 0xffff
+			}
+			if s11bu < 0 {
+				s11bu = 0
+			} else if s11bu > 0xffff {
+				s11bu = 0xffff
+			}
+
+			s11r := float64(s11ru)
+			s11g := float64(s11gu)
+			s11b := float64(s11bu)
+			s11r = xFrac1*s01r + xFrac0*s11r // blend row sy1 horizontally
+			s11g = xFrac1*s01g + xFrac0*s11g
+			s11b = xFrac1*s01b + xFrac0*s11b
+			s11r = yFrac1*s10r + yFrac0*s11r // blend the two rows vertically
+			s11g = yFrac1*s10g + yFrac0*s11g
+			s11b = yFrac1*s10b + yFrac0*s11b
+			pr := uint32(s11r) // the conversion drops the fractional part
+			pg := uint32(s11g)
+			pb := uint32(s11b)
+			dst.Pix[d+0] = uint8(pr >> 8) // 16-bit channel reduced to its top 8 bits
+			dst.Pix[d+1] = uint8(pg >> 8)
+			dst.Pix[d+2] = uint8(pb >> 8)
+			dst.Pix[d+3] = 0xff // output is always opaque
+		}
+	}
+}
+
+func (ablInterpolator) scale_RGBA_YCbCr420_Src(dst *image.RGBA, dr, adr image.Rectangle, src *image.YCbCr, sr image.Rectangle, opts *Options) { // Bilinear scale of sr (in src) onto dr (in dst), writing only the part of dr covered by adr; chroma is read at half the x and half the y coordinate. opts is not read.
+	sw := int32(sr.Dx())
+	sh := int32(sr.Dy())
+	yscale := float64(sh) / float64(dr.Dy()) // source rows per destination row
+	xscale := float64(sw) / float64(dr.Dx()) // source columns per destination column
+	swMinus1, shMinus1 := sw-1, sh-1
+	// dy and dx are offsets from dr.Min; sy and sx are offsets from sr.Min.
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		sy := (float64(dy)+0.5)*yscale - 0.5
+		// sy is the center of destination row dy, measured in source rows. If
+		// sy < 0, sy0 is clamped to 0 below, so int32(sy) is as good as
+		// int32(math.Floor(sy)). The same holds for sx, further down.
+		sy0 := int32(sy)
+		yFrac0 := sy - float64(sy0) // yFrac0 weights row sy1 and yFrac1 weights row sy0
+		yFrac1 := 1 - yFrac0
+		sy1 := sy0 + 1
+		if sy < 0 { // before the first source row: take row 0 at full weight
+			sy0, sy1 = 0, 0
+			yFrac0, yFrac1 = 0, 1
+		} else if sy1 > shMinus1 { // no row after sy0 inside sr: use the last row for both
+			sy0, sy1 = shMinus1, shMinus1
+			yFrac0, yFrac1 = 1, 0
+		}
+		d := (dr.Min.Y+int(dy)-dst.Rect.Min.Y)*dst.Stride + (dr.Min.X+adr.Min.X-dst.Rect.Min.X)*4 // index into dst.Pix of the first pixel written on this row
+
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
+			sx := (float64(dx)+0.5)*xscale - 0.5 // center of destination column dx, measured in source columns
+			sx0 := int32(sx)
+			xFrac0 := sx - float64(sx0) // xFrac0 weights column sx1 and xFrac1 weights column sx0
+			xFrac1 := 1 - xFrac0
+			sx1 := sx0 + 1
+			if sx < 0 { // before the first source column: take column 0 at full weight
+				sx0, sx1 = 0, 0
+				xFrac0, xFrac1 = 0, 1
+			} else if sx1 > swMinus1 { // no column after sx0 inside sr: use the last column for both
+				sx0, sx1 = swMinus1, swMinus1
+				xFrac0, xFrac1 = 1, 0
+			}
+			// Sample (sx0, sy0). i indexes src.Y; j indexes src.Cb and src.Cr, addressed here at (x/2, y/2).
+			s00i := (sr.Min.Y+int(sy0)-src.Rect.Min.Y)*src.YStride + (sr.Min.X + int(sx0) - src.Rect.Min.X)
+			s00j := ((sr.Min.Y+int(sy0))/2-src.Rect.Min.Y/2)*src.CStride + ((sr.Min.X+int(sx0))/2 - src.Rect.Min.X/2)
+
+			// This is an inline version of image/color/ycbcr.go's YCbCr.RGBA method.
+			s00yy1 := int(src.Y[s00i]) * 0x10100
+			s00cb1 := int(src.Cb[s00j]) - 128
+			s00cr1 := int(src.Cr[s00j]) - 128
+			s00ru := (s00yy1 + 91881*s00cr1) >> 8
+			s00gu := (s00yy1 - 22554*s00cb1 - 46802*s00cr1) >> 8
+			s00bu := (s00yy1 + 116130*s00cb1) >> 8
+			if s00ru < 0 { // each channel is clamped to [0, 0xffff]
+				s00ru = 0
+			} else if s00ru > 0xffff {
+				s00ru = 0xffff
+			}
+			if s00gu < 0 {
+				s00gu = 0
+			} else if s00gu > 0xffff {
+				s00gu = 0xffff
+			}
+			if s00bu < 0 {
+				s00bu = 0
+			} else if s00bu > 0xffff {
+				s00bu = 0xffff
+			}
+
+			s00r := float64(s00ru)
+			s00g := float64(s00gu)
+			s00b := float64(s00bu)
+			s10i := (sr.Min.Y+int(sy0)-src.Rect.Min.Y)*src.YStride + (sr.Min.X + int(sx1) - src.Rect.Min.X) // sample (sx1, sy0)
+			s10j := ((sr.Min.Y+int(sy0))/2-src.Rect.Min.Y/2)*src.CStride + ((sr.Min.X+int(sx1))/2 - src.Rect.Min.X/2)
+
+			// This is an inline version of image/color/ycbcr.go's YCbCr.RGBA method.
+			s10yy1 := int(src.Y[s10i]) * 0x10100
+			s10cb1 := int(src.Cb[s10j]) - 128
+			s10cr1 := int(src.Cr[s10j]) - 128
+			s10ru := (s10yy1 + 91881*s10cr1) >> 8
+			s10gu := (s10yy1 - 22554*s10cb1 - 46802*s10cr1) >> 8
+			s10bu := (s10yy1 + 116130*s10cb1) >> 8
+			if s10ru < 0 {
+				s10ru = 0
+			} else if s10ru > 0xffff {
+				s10ru = 0xffff
+			}
+			if s10gu < 0 {
+				s10gu = 0
+			} else if s10gu > 0xffff {
+				s10gu = 0xffff
+			}
+			if s10bu < 0 {
+				s10bu = 0
+			} else if s10bu > 0xffff {
+				s10bu = 0xffff
+			}
+
+			s10r := float64(s10ru)
+			s10g := float64(s10gu)
+			s10b := float64(s10bu)
+			s10r = xFrac1*s00r + xFrac0*s10r // blend row sy0 horizontally; s10r, s10g, s10b now hold the result
+			s10g = xFrac1*s00g + xFrac0*s10g
+			s10b = xFrac1*s00b + xFrac0*s10b
+			s01i := (sr.Min.Y+int(sy1)-src.Rect.Min.Y)*src.YStride + (sr.Min.X + int(sx0) - src.Rect.Min.X) // sample (sx0, sy1)
+			s01j := ((sr.Min.Y+int(sy1))/2-src.Rect.Min.Y/2)*src.CStride + ((sr.Min.X+int(sx0))/2 - src.Rect.Min.X/2)
+
+			// This is an inline version of image/color/ycbcr.go's YCbCr.RGBA method.
+			s01yy1 := int(src.Y[s01i]) * 0x10100
+			s01cb1 := int(src.Cb[s01j]) - 128
+			s01cr1 := int(src.Cr[s01j]) - 128
+			s01ru := (s01yy1 + 91881*s01cr1) >> 8
+			s01gu := (s01yy1 - 22554*s01cb1 - 46802*s01cr1) >> 8
+			s01bu := (s01yy1 + 116130*s01cb1) >> 8
+			if s01ru < 0 {
+				s01ru = 0
+			} else if s01ru > 0xffff {
+				s01ru = 0xffff
+			}
+			if s01gu < 0 {
+				s01gu = 0
+			} else if s01gu > 0xffff {
+				s01gu = 0xffff
+			}
+			if s01bu < 0 {
+				s01bu = 0
+			} else if s01bu > 0xffff {
+				s01bu = 0xffff
+			}
+
+			s01r := float64(s01ru)
+			s01g := float64(s01gu)
+			s01b := float64(s01bu)
+			s11i := (sr.Min.Y+int(sy1)-src.Rect.Min.Y)*src.YStride + (sr.Min.X + int(sx1) - src.Rect.Min.X) // sample (sx1, sy1)
+			s11j := ((sr.Min.Y+int(sy1))/2-src.Rect.Min.Y/2)*src.CStride + ((sr.Min.X+int(sx1))/2 - src.Rect.Min.X/2)
+
+			// This is an inline version of image/color/ycbcr.go's YCbCr.RGBA method.
+			s11yy1 := int(src.Y[s11i]) * 0x10100
+			s11cb1 := int(src.Cb[s11j]) - 128
+			s11cr1 := int(src.Cr[s11j]) - 128
+			s11ru := (s11yy1 + 91881*s11cr1) >> 8
+			s11gu := (s11yy1 - 22554*s11cb1 - 46802*s11cr1) >> 8
+			s11bu := (s11yy1 + 116130*s11cb1) >> 8
+			if s11ru < 0 {
+				s11ru = 0
+			} else if s11ru > 0xffff {
+				s11ru = 0xffff
+			}
+			if s11gu < 0 {
+				s11gu = 0
+			} else if s11gu > 0xffff {
+				s11gu = 0xffff
+			}
+			if s11bu < 0 {
+				s11bu = 0
+			} else if s11bu > 0xffff {
+				s11bu = 0xffff
+			}
+
+			s11r := float64(s11ru)
+			s11g := float64(s11gu)
+			s11b := float64(s11bu)
+			s11r = xFrac1*s01r + xFrac0*s11r // blend row sy1 horizontally
+			s11g = xFrac1*s01g + xFrac0*s11g
+			s11b = xFrac1*s01b + xFrac0*s11b
+			s11r = yFrac1*s10r + yFrac0*s11r // blend the two rows vertically
+			s11g = yFrac1*s10g + yFrac0*s11g
+			s11b = yFrac1*s10b + yFrac0*s11b
+			pr := uint32(s11r) // the conversion drops the fractional part
+			pg := uint32(s11g)
+			pb := uint32(s11b)
+			dst.Pix[d+0] = uint8(pr >> 8) // 16-bit channel reduced to its top 8 bits
+			dst.Pix[d+1] = uint8(pg >> 8)
+			dst.Pix[d+2] = uint8(pb >> 8)
+			dst.Pix[d+3] = 0xff // output is always opaque
+		}
+	}
+}
+
+func (ablInterpolator) scale_RGBA_YCbCr440_Src(dst *image.RGBA, dr, adr image.Rectangle, src *image.YCbCr, sr image.Rectangle, opts *Options) { // Bilinear scale of sr (in src) onto dr (in dst), writing only the part of dr covered by adr; chroma is read at half the y coordinate. opts is not read.
+	sw := int32(sr.Dx())
+	sh := int32(sr.Dy())
+	yscale := float64(sh) / float64(dr.Dy()) // source rows per destination row
+	xscale := float64(sw) / float64(dr.Dx()) // source columns per destination column
+	swMinus1, shMinus1 := sw-1, sh-1
+	// dy and dx are offsets from dr.Min; sy and sx are offsets from sr.Min.
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		sy := (float64(dy)+0.5)*yscale - 0.5
+		// sy is the center of destination row dy, measured in source rows. If
+		// sy < 0, sy0 is clamped to 0 below, so int32(sy) is as good as
+		// int32(math.Floor(sy)). The same holds for sx, further down.
+		sy0 := int32(sy)
+		yFrac0 := sy - float64(sy0) // yFrac0 weights row sy1 and yFrac1 weights row sy0
+		yFrac1 := 1 - yFrac0
+		sy1 := sy0 + 1
+		if sy < 0 { // before the first source row: take row 0 at full weight
+			sy0, sy1 = 0, 0
+			yFrac0, yFrac1 = 0, 1
+		} else if sy1 > shMinus1 { // no row after sy0 inside sr: use the last row for both
+			sy0, sy1 = shMinus1, shMinus1
+			yFrac0, yFrac1 = 1, 0
+		}
+		d := (dr.Min.Y+int(dy)-dst.Rect.Min.Y)*dst.Stride + (dr.Min.X+adr.Min.X-dst.Rect.Min.X)*4 // index into dst.Pix of the first pixel written on this row
+
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
+			sx := (float64(dx)+0.5)*xscale - 0.5 // center of destination column dx, measured in source columns
+			sx0 := int32(sx)
+			xFrac0 := sx - float64(sx0) // xFrac0 weights column sx1 and xFrac1 weights column sx0
+			xFrac1 := 1 - xFrac0
+			sx1 := sx0 + 1
+			if sx < 0 { // before the first source column: take column 0 at full weight
+				sx0, sx1 = 0, 0
+				xFrac0, xFrac1 = 0, 1
+			} else if sx1 > swMinus1 { // no column after sx0 inside sr: use the last column for both
+				sx0, sx1 = swMinus1, swMinus1
+				xFrac0, xFrac1 = 1, 0
+			}
+			// Sample (sx0, sy0). i indexes src.Y; j indexes src.Cb and src.Cr, addressed here at (x, y/2).
+			s00i := (sr.Min.Y+int(sy0)-src.Rect.Min.Y)*src.YStride + (sr.Min.X + int(sx0) - src.Rect.Min.X)
+			s00j := ((sr.Min.Y+int(sy0))/2-src.Rect.Min.Y/2)*src.CStride + (sr.Min.X + int(sx0) - src.Rect.Min.X)
+
+			// This is an inline version of image/color/ycbcr.go's YCbCr.RGBA method.
+			s00yy1 := int(src.Y[s00i]) * 0x10100
+			s00cb1 := int(src.Cb[s00j]) - 128
+			s00cr1 := int(src.Cr[s00j]) - 128
+			s00ru := (s00yy1 + 91881*s00cr1) >> 8
+			s00gu := (s00yy1 - 22554*s00cb1 - 46802*s00cr1) >> 8
+			s00bu := (s00yy1 + 116130*s00cb1) >> 8
+			if s00ru < 0 { // each channel is clamped to [0, 0xffff]
+				s00ru = 0
+			} else if s00ru > 0xffff {
+				s00ru = 0xffff
+			}
+			if s00gu < 0 {
+				s00gu = 0
+			} else if s00gu > 0xffff {
+				s00gu = 0xffff
+			}
+			if s00bu < 0 {
+				s00bu = 0
+			} else if s00bu > 0xffff {
+				s00bu = 0xffff
+			}
+
+			s00r := float64(s00ru)
+			s00g := float64(s00gu)
+			s00b := float64(s00bu)
+			s10i := (sr.Min.Y+int(sy0)-src.Rect.Min.Y)*src.YStride + (sr.Min.X + int(sx1) - src.Rect.Min.X) // sample (sx1, sy0)
+			s10j := ((sr.Min.Y+int(sy0))/2-src.Rect.Min.Y/2)*src.CStride + (sr.Min.X + int(sx1) - src.Rect.Min.X)
+
+			// This is an inline version of image/color/ycbcr.go's YCbCr.RGBA method.
+			s10yy1 := int(src.Y[s10i]) * 0x10100
+			s10cb1 := int(src.Cb[s10j]) - 128
+			s10cr1 := int(src.Cr[s10j]) - 128
+			s10ru := (s10yy1 + 91881*s10cr1) >> 8
+			s10gu := (s10yy1 - 22554*s10cb1 - 46802*s10cr1) >> 8
+			s10bu := (s10yy1 + 116130*s10cb1) >> 8
+			if s10ru < 0 {
+				s10ru = 0
+			} else if s10ru > 0xffff {
+				s10ru = 0xffff
+			}
+			if s10gu < 0 {
+				s10gu = 0
+			} else if s10gu > 0xffff {
+				s10gu = 0xffff
+			}
+			if s10bu < 0 {
+				s10bu = 0
+			} else if s10bu > 0xffff {
+				s10bu = 0xffff
+			}
+
+			s10r := float64(s10ru)
+			s10g := float64(s10gu)
+			s10b := float64(s10bu)
+			s10r = xFrac1*s00r + xFrac0*s10r // blend row sy0 horizontally; s10r, s10g, s10b now hold the result
+			s10g = xFrac1*s00g + xFrac0*s10g
+			s10b = xFrac1*s00b + xFrac0*s10b
+			s01i := (sr.Min.Y+int(sy1)-src.Rect.Min.Y)*src.YStride + (sr.Min.X + int(sx0) - src.Rect.Min.X) // sample (sx0, sy1)
+			s01j := ((sr.Min.Y+int(sy1))/2-src.Rect.Min.Y/2)*src.CStride + (sr.Min.X + int(sx0) - src.Rect.Min.X)
+
+			// This is an inline version of image/color/ycbcr.go's YCbCr.RGBA method.
+			s01yy1 := int(src.Y[s01i]) * 0x10100
+			s01cb1 := int(src.Cb[s01j]) - 128
+			s01cr1 := int(src.Cr[s01j]) - 128
+			s01ru := (s01yy1 + 91881*s01cr1) >> 8
+			s01gu := (s01yy1 - 22554*s01cb1 - 46802*s01cr1) >> 8
+			s01bu := (s01yy1 + 116130*s01cb1) >> 8
+			if s01ru < 0 {
+				s01ru = 0
+			} else if s01ru > 0xffff {
+				s01ru = 0xffff
+			}
+			if s01gu < 0 {
+				s01gu = 0
+			} else if s01gu > 0xffff {
+				s01gu = 0xffff
+			}
+			if s01bu < 0 {
+				s01bu = 0
+			} else if s01bu > 0xffff {
+				s01bu = 0xffff
+			}
+
+			s01r := float64(s01ru)
+			s01g := float64(s01gu)
+			s01b := float64(s01bu)
+			s11i := (sr.Min.Y+int(sy1)-src.Rect.Min.Y)*src.YStride + (sr.Min.X + int(sx1) - src.Rect.Min.X) // sample (sx1, sy1)
+			s11j := ((sr.Min.Y+int(sy1))/2-src.Rect.Min.Y/2)*src.CStride + (sr.Min.X + int(sx1) - src.Rect.Min.X)
+
+			// This is an inline version of image/color/ycbcr.go's YCbCr.RGBA method.
+			s11yy1 := int(src.Y[s11i]) * 0x10100
+			s11cb1 := int(src.Cb[s11j]) - 128
+			s11cr1 := int(src.Cr[s11j]) - 128
+			s11ru := (s11yy1 + 91881*s11cr1) >> 8
+			s11gu := (s11yy1 - 22554*s11cb1 - 46802*s11cr1) >> 8
+			s11bu := (s11yy1 + 116130*s11cb1) >> 8
+			if s11ru < 0 {
+				s11ru = 0
+			} else if s11ru > 0xffff {
+				s11ru = 0xffff
+			}
+			if s11gu < 0 {
+				s11gu = 0
+			} else if s11gu > 0xffff {
+				s11gu = 0xffff
+			}
+			if s11bu < 0 {
+				s11bu = 0
+			} else if s11bu > 0xffff {
+				s11bu = 0xffff
+			}
+
+			s11r := float64(s11ru)
+			s11g := float64(s11gu)
+			s11b := float64(s11bu)
+			s11r = xFrac1*s01r + xFrac0*s11r // blend row sy1 horizontally
+			s11g = xFrac1*s01g + xFrac0*s11g
+			s11b = xFrac1*s01b + xFrac0*s11b
+			s11r = yFrac1*s10r + yFrac0*s11r // blend the two rows vertically
+			s11g = yFrac1*s10g + yFrac0*s11g
+			s11b = yFrac1*s10b + yFrac0*s11b
+			pr := uint32(s11r) // the conversion drops the fractional part
+			pg := uint32(s11g)
+			pb := uint32(s11b)
+			dst.Pix[d+0] = uint8(pr >> 8) // 16-bit channel reduced to its top 8 bits
+			dst.Pix[d+1] = uint8(pg >> 8)
+			dst.Pix[d+2] = uint8(pb >> 8)
+			dst.Pix[d+3] = 0xff // output is always opaque
+		}
+	}
+}
+
+func (ablInterpolator) scale_RGBA_Image_Over(dst *image.RGBA, dr, adr image.Rectangle, src image.Image, sr image.Rectangle, opts *Options) { // Bilinear scale of sr (in src, read through src.At) onto dr (in dst), compositing the result over the existing dst pixels; only the part of dr covered by adr is touched. opts is not read.
+	sw := int32(sr.Dx())
+	sh := int32(sr.Dy())
+	yscale := float64(sh) / float64(dr.Dy()) // source rows per destination row
+	xscale := float64(sw) / float64(dr.Dx()) // source columns per destination column
+	swMinus1, shMinus1 := sw-1, sh-1
+	// dy and dx are offsets from dr.Min; sy and sx are offsets from sr.Min.
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		sy := (float64(dy)+0.5)*yscale - 0.5
+		// sy is the center of destination row dy, measured in source rows. If
+		// sy < 0, sy0 is clamped to 0 below, so int32(sy) is as good as
+		// int32(math.Floor(sy)). The same holds for sx, further down.
+		sy0 := int32(sy)
+		yFrac0 := sy - float64(sy0) // yFrac0 weights row sy1 and yFrac1 weights row sy0
+		yFrac1 := 1 - yFrac0
+		sy1 := sy0 + 1
+		if sy < 0 { // before the first source row: take row 0 at full weight
+			sy0, sy1 = 0, 0
+			yFrac0, yFrac1 = 0, 1
+		} else if sy1 > shMinus1 { // no row after sy0 inside sr: use the last row for both
+			sy0, sy1 = shMinus1, shMinus1
+			yFrac0, yFrac1 = 1, 0
+		}
+		d := (dr.Min.Y+int(dy)-dst.Rect.Min.Y)*dst.Stride + (dr.Min.X+adr.Min.X-dst.Rect.Min.X)*4 // index into dst.Pix of the first pixel touched on this row
+
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
+			sx := (float64(dx)+0.5)*xscale - 0.5 // center of destination column dx, measured in source columns
+			sx0 := int32(sx)
+			xFrac0 := sx - float64(sx0) // xFrac0 weights column sx1 and xFrac1 weights column sx0
+			xFrac1 := 1 - xFrac0
+			sx1 := sx0 + 1
+			if sx < 0 { // before the first source column: take column 0 at full weight
+				sx0, sx1 = 0, 0
+				xFrac0, xFrac1 = 0, 1
+			} else if sx1 > swMinus1 { // no column after sx0 inside sr: use the last column for both
+				sx0, sx1 = swMinus1, swMinus1
+				xFrac0, xFrac1 = 1, 0
+			}
+			// Fetch the four surrounding samples; sNM is the sample at column sxN, row syM.
+			s00ru, s00gu, s00bu, s00au := src.At(sr.Min.X+int(sx0), sr.Min.Y+int(sy0)).RGBA()
+			s00r := float64(s00ru)
+			s00g := float64(s00gu)
+			s00b := float64(s00bu)
+			s00a := float64(s00au)
+			s10ru, s10gu, s10bu, s10au := src.At(sr.Min.X+int(sx1), sr.Min.Y+int(sy0)).RGBA()
+			s10r := float64(s10ru)
+			s10g := float64(s10gu)
+			s10b := float64(s10bu)
+			s10a := float64(s10au)
+			s10r = xFrac1*s00r + xFrac0*s10r // blend row sy0 horizontally; s10r..s10a now hold the result
+			s10g = xFrac1*s00g + xFrac0*s10g
+			s10b = xFrac1*s00b + xFrac0*s10b
+			s10a = xFrac1*s00a + xFrac0*s10a
+			s01ru, s01gu, s01bu, s01au := src.At(sr.Min.X+int(sx0), sr.Min.Y+int(sy1)).RGBA()
+			s01r := float64(s01ru)
+			s01g := float64(s01gu)
+			s01b := float64(s01bu)
+			s01a := float64(s01au)
+			s11ru, s11gu, s11bu, s11au := src.At(sr.Min.X+int(sx1), sr.Min.Y+int(sy1)).RGBA()
+			s11r := float64(s11ru)
+			s11g := float64(s11gu)
+			s11b := float64(s11bu)
+			s11a := float64(s11au)
+			s11r = xFrac1*s01r + xFrac0*s11r // blend row sy1 horizontally
+			s11g = xFrac1*s01g + xFrac0*s11g
+			s11b = xFrac1*s01b + xFrac0*s11b
+			s11a = xFrac1*s01a + xFrac0*s11a
+			s11r = yFrac1*s10r + yFrac0*s11r // blend the two rows vertically
+			s11g = yFrac1*s10g + yFrac0*s11g
+			s11b = yFrac1*s10b + yFrac0*s11b
+			s11a = yFrac1*s10a + yFrac0*s11a
+			pr := uint32(s11r) // the conversion drops the fractional part
+			pg := uint32(s11g)
+			pb := uint32(s11b)
+			pa := uint32(s11a)
+			pa1 := (0xffff - pa) * 0x101 // inverse source alpha; the 0x101 factor widens the 8-bit dst.Pix values to the 16-bit range of pr..pa
+			dst.Pix[d+0] = uint8((uint32(dst.Pix[d+0])*pa1/0xffff + pr) >> 8) // dst scaled by inverse alpha, plus source, reduced to 8 bits
+			dst.Pix[d+1] = uint8((uint32(dst.Pix[d+1])*pa1/0xffff + pg) >> 8)
+			dst.Pix[d+2] = uint8((uint32(dst.Pix[d+2])*pa1/0xffff + pb) >> 8)
+			dst.Pix[d+3] = uint8((uint32(dst.Pix[d+3])*pa1/0xffff + pa) >> 8)
+		}
+	}
+}
+
+func (ablInterpolator) scale_RGBA_Image_Src(dst *image.RGBA, dr, adr image.Rectangle, src image.Image, sr image.Rectangle, opts *Options) { // Bilinear scale of sr (in src, read through src.At) onto dr (in dst), overwriting dst pixels; only the part of dr covered by adr is written. opts is not read.
+	sw := int32(sr.Dx())
+	sh := int32(sr.Dy())
+	yscale := float64(sh) / float64(dr.Dy()) // source rows per destination row
+	xscale := float64(sw) / float64(dr.Dx()) // source columns per destination column
+	swMinus1, shMinus1 := sw-1, sh-1
+	// dy and dx are offsets from dr.Min; sy and sx are offsets from sr.Min.
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		sy := (float64(dy)+0.5)*yscale - 0.5
+		// sy is the center of destination row dy, measured in source rows. If
+		// sy < 0, sy0 is clamped to 0 below, so int32(sy) is as good as
+		// int32(math.Floor(sy)). The same holds for sx, further down.
+		sy0 := int32(sy)
+		yFrac0 := sy - float64(sy0) // yFrac0 weights row sy1 and yFrac1 weights row sy0
+		yFrac1 := 1 - yFrac0
+		sy1 := sy0 + 1
+		if sy < 0 { // before the first source row: take row 0 at full weight
+			sy0, sy1 = 0, 0
+			yFrac0, yFrac1 = 0, 1
+		} else if sy1 > shMinus1 { // no row after sy0 inside sr: use the last row for both
+			sy0, sy1 = shMinus1, shMinus1
+			yFrac0, yFrac1 = 1, 0
+		}
+		d := (dr.Min.Y+int(dy)-dst.Rect.Min.Y)*dst.Stride + (dr.Min.X+adr.Min.X-dst.Rect.Min.X)*4 // index into dst.Pix of the first pixel written on this row
+
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
+			sx := (float64(dx)+0.5)*xscale - 0.5 // center of destination column dx, measured in source columns
+			sx0 := int32(sx)
+			xFrac0 := sx - float64(sx0) // xFrac0 weights column sx1 and xFrac1 weights column sx0
+			xFrac1 := 1 - xFrac0
+			sx1 := sx0 + 1
+			if sx < 0 { // before the first source column: take column 0 at full weight
+				sx0, sx1 = 0, 0
+				xFrac0, xFrac1 = 0, 1
+			} else if sx1 > swMinus1 { // no column after sx0 inside sr: use the last column for both
+				sx0, sx1 = swMinus1, swMinus1
+				xFrac0, xFrac1 = 1, 0
+			}
+			// Fetch the four surrounding samples; sNM is the sample at column sxN, row syM.
+			s00ru, s00gu, s00bu, s00au := src.At(sr.Min.X+int(sx0), sr.Min.Y+int(sy0)).RGBA()
+			s00r := float64(s00ru)
+			s00g := float64(s00gu)
+			s00b := float64(s00bu)
+			s00a := float64(s00au)
+			s10ru, s10gu, s10bu, s10au := src.At(sr.Min.X+int(sx1), sr.Min.Y+int(sy0)).RGBA()
+			s10r := float64(s10ru)
+			s10g := float64(s10gu)
+			s10b := float64(s10bu)
+			s10a := float64(s10au)
+			s10r = xFrac1*s00r + xFrac0*s10r // blend row sy0 horizontally; s10r..s10a now hold the result
+			s10g = xFrac1*s00g + xFrac0*s10g
+			s10b = xFrac1*s00b + xFrac0*s10b
+			s10a = xFrac1*s00a + xFrac0*s10a
+			s01ru, s01gu, s01bu, s01au := src.At(sr.Min.X+int(sx0), sr.Min.Y+int(sy1)).RGBA()
+			s01r := float64(s01ru)
+			s01g := float64(s01gu)
+			s01b := float64(s01bu)
+			s01a := float64(s01au)
+			s11ru, s11gu, s11bu, s11au := src.At(sr.Min.X+int(sx1), sr.Min.Y+int(sy1)).RGBA()
+			s11r := float64(s11ru)
+			s11g := float64(s11gu)
+			s11b := float64(s11bu)
+			s11a := float64(s11au)
+			s11r = xFrac1*s01r + xFrac0*s11r // blend row sy1 horizontally
+			s11g = xFrac1*s01g + xFrac0*s11g
+			s11b = xFrac1*s01b + xFrac0*s11b
+			s11a = xFrac1*s01a + xFrac0*s11a
+			s11r = yFrac1*s10r + yFrac0*s11r // blend the two rows vertically
+			s11g = yFrac1*s10g + yFrac0*s11g
+			s11b = yFrac1*s10b + yFrac0*s11b
+			s11a = yFrac1*s10a + yFrac0*s11a
+			pr := uint32(s11r) // the conversion drops the fractional part
+			pg := uint32(s11g)
+			pb := uint32(s11b)
+			pa := uint32(s11a)
+			dst.Pix[d+0] = uint8(pr >> 8) // 16-bit channel reduced to its top 8 bits
+			dst.Pix[d+1] = uint8(pg >> 8)
+			dst.Pix[d+2] = uint8(pb >> 8)
+			dst.Pix[d+3] = uint8(pa >> 8)
+		}
+	}
+}
+
+func (ablInterpolator) scale_Image_Image_Over(dst Image, dr, adr image.Rectangle, src image.Image, sr image.Rectangle, opts *Options) { // Bilinear scale of sr (in src) onto dr (in dst) through the generic At/Set methods, compositing over the existing dst colors and honoring opts.SrcMask and opts.DstMask when non-nil; only the part of dr covered by adr is touched.
+	sw := int32(sr.Dx())
+	sh := int32(sr.Dy())
+	yscale := float64(sh) / float64(dr.Dy()) // source rows per destination row
+	xscale := float64(sw) / float64(dr.Dx()) // source columns per destination column
+	swMinus1, shMinus1 := sw-1, sh-1
+	srcMask, smp := opts.SrcMask, opts.SrcMaskP
+	dstMask, dmp := opts.DstMask, opts.DstMaskP
+	dstColorRGBA64 := &color.RGBA64{} // scratch color reused for every dst.Set call; wrapped as a color.Color once, on the next line
+	dstColor := color.Color(dstColorRGBA64)
+	// dy and dx are offsets from dr.Min; sy and sx are offsets from sr.Min.
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		sy := (float64(dy)+0.5)*yscale - 0.5
+		// sy is the center of destination row dy, measured in source rows. If
+		// sy < 0, sy0 is clamped to 0 below, so int32(sy) is as good as
+		// int32(math.Floor(sy)). The same holds for sx, further down.
+		sy0 := int32(sy)
+		yFrac0 := sy - float64(sy0) // yFrac0 weights row sy1 and yFrac1 weights row sy0
+		yFrac1 := 1 - yFrac0
+		sy1 := sy0 + 1
+		if sy < 0 { // before the first source row: take row 0 at full weight
+			sy0, sy1 = 0, 0
+			yFrac0, yFrac1 = 0, 1
+		} else if sy1 > shMinus1 { // no row after sy0 inside sr: use the last row for both
+			sy0, sy1 = shMinus1, shMinus1
+			yFrac0, yFrac1 = 1, 0
+		}
+
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx++ {
+			sx := (float64(dx)+0.5)*xscale - 0.5 // center of destination column dx, measured in source columns
+			sx0 := int32(sx)
+			xFrac0 := sx - float64(sx0) // xFrac0 weights column sx1 and xFrac1 weights column sx0
+			xFrac1 := 1 - xFrac0
+			sx1 := sx0 + 1
+			if sx < 0 { // before the first source column: take column 0 at full weight
+				sx0, sx1 = 0, 0
+				xFrac0, xFrac1 = 0, 1
+			} else if sx1 > swMinus1 { // no column after sx0 inside sr: use the last column for both
+				sx0, sx1 = swMinus1, swMinus1
+				xFrac0, xFrac1 = 1, 0
+			}
+			// Fetch the four surrounding samples; sNM is the sample at column sxN, row syM.
+			s00ru, s00gu, s00bu, s00au := src.At(sr.Min.X+int(sx0), sr.Min.Y+int(sy0)).RGBA()
+			if srcMask != nil { // scale the sample by the source mask's alpha at the same point shifted by smp
+				_, _, _, ma := srcMask.At(smp.X+sr.Min.X+int(sx0), smp.Y+sr.Min.Y+int(sy0)).RGBA()
+				s00ru = s00ru * ma / 0xffff
+				s00gu = s00gu * ma / 0xffff
+				s00bu = s00bu * ma / 0xffff
+				s00au = s00au * ma / 0xffff
+			}
+			s00r := float64(s00ru)
+			s00g := float64(s00gu)
+			s00b := float64(s00bu)
+			s00a := float64(s00au)
+			s10ru, s10gu, s10bu, s10au := src.At(sr.Min.X+int(sx1), sr.Min.Y+int(sy0)).RGBA()
+			if srcMask != nil {
+				_, _, _, ma := srcMask.At(smp.X+sr.Min.X+int(sx1), smp.Y+sr.Min.Y+int(sy0)).RGBA()
+				s10ru = s10ru * ma / 0xffff
+				s10gu = s10gu * ma / 0xffff
+				s10bu = s10bu * ma / 0xffff
+				s10au = s10au * ma / 0xffff
+			}
+			s10r := float64(s10ru)
+			s10g := float64(s10gu)
+			s10b := float64(s10bu)
+			s10a := float64(s10au)
+			s10r = xFrac1*s00r + xFrac0*s10r // blend row sy0 horizontally; s10r..s10a now hold the result
+			s10g = xFrac1*s00g + xFrac0*s10g
+			s10b = xFrac1*s00b + xFrac0*s10b
+			s10a = xFrac1*s00a + xFrac0*s10a
+			s01ru, s01gu, s01bu, s01au := src.At(sr.Min.X+int(sx0), sr.Min.Y+int(sy1)).RGBA()
+			if srcMask != nil {
+				_, _, _, ma := srcMask.At(smp.X+sr.Min.X+int(sx0), smp.Y+sr.Min.Y+int(sy1)).RGBA()
+				s01ru = s01ru * ma / 0xffff
+				s01gu = s01gu * ma / 0xffff
+				s01bu = s01bu * ma / 0xffff
+				s01au = s01au * ma / 0xffff
+			}
+			s01r := float64(s01ru)
+			s01g := float64(s01gu)
+			s01b := float64(s01bu)
+			s01a := float64(s01au)
+			s11ru, s11gu, s11bu, s11au := src.At(sr.Min.X+int(sx1), sr.Min.Y+int(sy1)).RGBA()
+			if srcMask != nil {
+				_, _, _, ma := srcMask.At(smp.X+sr.Min.X+int(sx1), smp.Y+sr.Min.Y+int(sy1)).RGBA()
+				s11ru = s11ru * ma / 0xffff
+				s11gu = s11gu * ma / 0xffff
+				s11bu = s11bu * ma / 0xffff
+				s11au = s11au * ma / 0xffff
+			}
+			s11r := float64(s11ru)
+			s11g := float64(s11gu)
+			s11b := float64(s11bu)
+			s11a := float64(s11au)
+			s11r = xFrac1*s01r + xFrac0*s11r // blend row sy1 horizontally
+			s11g = xFrac1*s01g + xFrac0*s11g
+			s11b = xFrac1*s01b + xFrac0*s11b
+			s11a = xFrac1*s01a + xFrac0*s11a
+			s11r = yFrac1*s10r + yFrac0*s11r // blend the two rows vertically
+			s11g = yFrac1*s10g + yFrac0*s11g
+			s11b = yFrac1*s10b + yFrac0*s11b
+			s11a = yFrac1*s10a + yFrac0*s11a
+			pr := uint32(s11r) // the conversion drops the fractional part
+			pg := uint32(s11g)
+			pb := uint32(s11b)
+			pa := uint32(s11a)
+			qr, qg, qb, qa := dst.At(dr.Min.X+int(dx), dr.Min.Y+int(dy)).RGBA() // color currently stored at the destination pixel
+			if dstMask != nil { // scale the blended source by the destination mask's alpha at the destination point shifted by dmp
+				_, _, _, ma := dstMask.At(dmp.X+dr.Min.X+int(dx), dmp.Y+dr.Min.Y+int(dy)).RGBA()
+				pr = pr * ma / 0xffff
+				pg = pg * ma / 0xffff
+				pb = pb * ma / 0xffff
+				pa = pa * ma / 0xffff
+			}
+			pa1 := 0xffff - pa // inverse of the (masked) source alpha
+			dstColorRGBA64.R = uint16(qr*pa1/0xffff + pr)
+			dstColorRGBA64.G = uint16(qg*pa1/0xffff + pg)
+			dstColorRGBA64.B = uint16(qb*pa1/0xffff + pb)
+			dstColorRGBA64.A = uint16(qa*pa1/0xffff + pa)
+			dst.Set(dr.Min.X+int(dx), dr.Min.Y+int(dy), dstColor)
+		}
+	}
+}
+
+func (ablInterpolator) scale_Image_Image_Src(dst Image, dr, adr image.Rectangle, src image.Image, sr image.Rectangle, opts *Options) { // scale_Image_Image_Src bilinearly resamples the sr part of src into dst; adr (relative to dr.Min) selects the pixels written, optionally weighted by opts.SrcMask and opts.DstMask.
+	sw := int32(sr.Dx())
+	sh := int32(sr.Dy())
+	yscale := float64(sh) / float64(dr.Dy()) // source rows per destination row
+	xscale := float64(sw) / float64(dr.Dx())
+	swMinus1, shMinus1 := sw-1, sh-1 // largest valid source offsets, used for edge clamping
+	srcMask, smp := opts.SrcMask, opts.SrcMaskP
+	dstMask, dmp := opts.DstMask, opts.DstMaskP
+	dstColorRGBA64 := &color.RGBA64{} // one scratch color reused for every pixel; dstColor wraps the same pointer
+	dstColor := color.Color(dstColorRGBA64)
+
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		sy := (float64(dy)+0.5)*yscale - 0.5 // center of destination row dy scaled into source space, shifted back by half a pixel
+		// If sy < 0, we will clamp sy0 to 0 anyway, so it doesn't matter if
+		// we say int32(sy) instead of int32(math.Floor(sy)). Similarly for
+		// sx, below.
+		sy0 := int32(sy)
+		yFrac0 := sy - float64(sy0) // weight of row sy1; yFrac1 weights row sy0
+		yFrac1 := 1 - yFrac0
+		sy1 := sy0 + 1
+		if sy < 0 { // before the first source row: sample row 0 only
+			sy0, sy1 = 0, 0
+			yFrac0, yFrac1 = 0, 1
+		} else if sy1 > shMinus1 { // past the last source row: sample the last row only
+			sy0, sy1 = shMinus1, shMinus1
+			yFrac0, yFrac1 = 1, 0
+		}
+
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx++ {
+			sx := (float64(dx)+0.5)*xscale - 0.5 // same mapping as sy, horizontally
+			sx0 := int32(sx)
+			xFrac0 := sx - float64(sx0) // weight of column sx1; xFrac1 weights column sx0
+			xFrac1 := 1 - xFrac0
+			sx1 := sx0 + 1
+			if sx < 0 { // before the first source column: sample column 0 only
+				sx0, sx1 = 0, 0
+				xFrac0, xFrac1 = 0, 1
+			} else if sx1 > swMinus1 { // past the last source column: sample the last column only
+				sx0, sx1 = swMinus1, swMinus1
+				xFrac0, xFrac1 = 1, 0
+			}
+
+			s00ru, s00gu, s00bu, s00au := src.At(sr.Min.X+int(sx0), sr.Min.Y+int(sy0)).RGBA() // sample (sx0, sy0)
+			if srcMask != nil {
+				_, _, _, ma := srcMask.At(smp.X+sr.Min.X+int(sx0), smp.Y+sr.Min.Y+int(sy0)).RGBA() // mask alpha scales every channel of this sample
+				s00ru = s00ru * ma / 0xffff
+				s00gu = s00gu * ma / 0xffff
+				s00bu = s00bu * ma / 0xffff
+				s00au = s00au * ma / 0xffff
+			}
+			s00r := float64(s00ru)
+			s00g := float64(s00gu)
+			s00b := float64(s00bu)
+			s00a := float64(s00au)
+			s10ru, s10gu, s10bu, s10au := src.At(sr.Min.X+int(sx1), sr.Min.Y+int(sy0)).RGBA() // sample (sx1, sy0)
+			if srcMask != nil {
+				_, _, _, ma := srcMask.At(smp.X+sr.Min.X+int(sx1), smp.Y+sr.Min.Y+int(sy0)).RGBA()
+				s10ru = s10ru * ma / 0xffff
+				s10gu = s10gu * ma / 0xffff
+				s10bu = s10bu * ma / 0xffff
+				s10au = s10au * ma / 0xffff
+			}
+			s10r := float64(s10ru)
+			s10g := float64(s10gu)
+			s10b := float64(s10bu)
+			s10a := float64(s10au)
+			s10r = xFrac1*s00r + xFrac0*s10r // horizontal blend of row sy0, stored in s10*
+			s10g = xFrac1*s00g + xFrac0*s10g
+			s10b = xFrac1*s00b + xFrac0*s10b
+			s10a = xFrac1*s00a + xFrac0*s10a
+			s01ru, s01gu, s01bu, s01au := src.At(sr.Min.X+int(sx0), sr.Min.Y+int(sy1)).RGBA() // sample (sx0, sy1)
+			if srcMask != nil {
+				_, _, _, ma := srcMask.At(smp.X+sr.Min.X+int(sx0), smp.Y+sr.Min.Y+int(sy1)).RGBA()
+				s01ru = s01ru * ma / 0xffff
+				s01gu = s01gu * ma / 0xffff
+				s01bu = s01bu * ma / 0xffff
+				s01au = s01au * ma / 0xffff
+			}
+			s01r := float64(s01ru)
+			s01g := float64(s01gu)
+			s01b := float64(s01bu)
+			s01a := float64(s01au)
+			s11ru, s11gu, s11bu, s11au := src.At(sr.Min.X+int(sx1), sr.Min.Y+int(sy1)).RGBA() // sample (sx1, sy1)
+			if srcMask != nil {
+				_, _, _, ma := srcMask.At(smp.X+sr.Min.X+int(sx1), smp.Y+sr.Min.Y+int(sy1)).RGBA()
+				s11ru = s11ru * ma / 0xffff
+				s11gu = s11gu * ma / 0xffff
+				s11bu = s11bu * ma / 0xffff
+				s11au = s11au * ma / 0xffff
+			}
+			s11r := float64(s11ru)
+			s11g := float64(s11gu)
+			s11b := float64(s11bu)
+			s11a := float64(s11au)
+			s11r = xFrac1*s01r + xFrac0*s11r // horizontal blend of row sy1, stored in s11*
+			s11g = xFrac1*s01g + xFrac0*s11g
+			s11b = xFrac1*s01b + xFrac0*s11b
+			s11a = xFrac1*s01a + xFrac0*s11a
+			s11r = yFrac1*s10r + yFrac0*s11r // vertical blend of the two row results
+			s11g = yFrac1*s10g + yFrac0*s11g
+			s11b = yFrac1*s10b + yFrac0*s11b
+			s11a = yFrac1*s10a + yFrac0*s11a
+			pr := uint32(s11r) // truncate the interpolated channels back to integers
+			pg := uint32(s11g)
+			pb := uint32(s11b)
+			pa := uint32(s11a)
+			if dstMask != nil { // masked: the mask alpha ma weights the new value against the existing dst pixel
+				qr, qg, qb, qa := dst.At(dr.Min.X+int(dx), dr.Min.Y+int(dy)).RGBA()
+				_, _, _, ma := dstMask.At(dmp.X+dr.Min.X+int(dx), dmp.Y+dr.Min.Y+int(dy)).RGBA()
+				pr = pr * ma / 0xffff
+				pg = pg * ma / 0xffff
+				pb = pb * ma / 0xffff
+				pa = pa * ma / 0xffff
+				pa1 := 0xffff - ma // remaining weight, applied to the existing dst pixel
+				dstColorRGBA64.R = uint16(qr*pa1/0xffff + pr)
+				dstColorRGBA64.G = uint16(qg*pa1/0xffff + pg)
+				dstColorRGBA64.B = uint16(qb*pa1/0xffff + pb)
+				dstColorRGBA64.A = uint16(qa*pa1/0xffff + pa)
+				dst.Set(dr.Min.X+int(dx), dr.Min.Y+int(dy), dstColor)
+			} else { // unmasked: the interpolated value replaces the dst pixel
+				dstColorRGBA64.R = uint16(pr)
+				dstColorRGBA64.G = uint16(pg)
+				dstColorRGBA64.B = uint16(pb)
+				dstColorRGBA64.A = uint16(pa)
+				dst.Set(dr.Min.X+int(dx), dr.Min.Y+int(dy), dstColor)
+			}
+		}
+	}
+}
+
+func (ablInterpolator) transform_RGBA_Gray_Src(dst *image.RGBA, dr, adr image.Rectangle, d2s *f64.Aff3, src *image.Gray, sr image.Rectangle, bias image.Point, opts *Options) { // transform_RGBA_Gray_Src maps each destination pixel center through d2s into src, bilinearly interpolates the four neighbouring gray samples and stores the result as an opaque RGBA pixel.
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		dyf := float64(dr.Min.Y+int(dy)) + 0.5
+		d := (dr.Min.Y+int(dy)-dst.Rect.Min.Y)*dst.Stride + (dr.Min.X+adr.Min.X-dst.Rect.Min.X)*4 // byte offset into dst.Pix of the first pixel written on this row
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
+			dxf := float64(dr.Min.X+int(dx)) + 0.5
+			sx := d2s[0]*dxf + d2s[1]*dyf + d2s[2] // (sx, sy) is d2s applied to the destination pixel center (dxf, dyf)
+			sy := d2s[3]*dxf + d2s[4]*dyf + d2s[5]
+			if !(image.Point{int(sx) + bias.X, int(sy) + bias.Y}).In(sr) { // skip (leave dst unchanged) when the mapped point lies outside sr
+				continue
+			}
+
+			sx -= 0.5 // shift by half a pixel before splitting into integer part and fraction
+			sx0 := int(sx)
+			xFrac0 := sx - float64(sx0) // weight of column sx1; xFrac1 weights column sx0
+			xFrac1 := 1 - xFrac0
+			sx0 += bias.X // bias is added after the fraction is taken, so it shifts only the integer coordinate
+			sx1 := sx0 + 1
+			if sx0 < sr.Min.X { // left of sr: use the first column only
+				sx0, sx1 = sr.Min.X, sr.Min.X
+				xFrac0, xFrac1 = 0, 1
+			} else if sx1 >= sr.Max.X { // right of sr: use the last column only
+				sx0, sx1 = sr.Max.X-1, sr.Max.X-1
+				xFrac0, xFrac1 = 1, 0
+			}
+
+			sy -= 0.5 // same half-pixel shift and split, vertically
+			sy0 := int(sy)
+			yFrac0 := sy - float64(sy0) // weight of row sy1; yFrac1 weights row sy0
+			yFrac1 := 1 - yFrac0
+			sy0 += bias.Y
+			sy1 := sy0 + 1
+			if sy0 < sr.Min.Y { // above sr: use the first row only
+				sy0, sy1 = sr.Min.Y, sr.Min.Y
+				yFrac0, yFrac1 = 0, 1
+			} else if sy1 >= sr.Max.Y { // below sr: use the last row only
+				sy0, sy1 = sr.Max.Y-1, sr.Max.Y-1
+				yFrac0, yFrac1 = 1, 0
+			}
+
+			s00i := (sy0-src.Rect.Min.Y)*src.Stride + (sx0 - src.Rect.Min.X) // sample (sx0, sy0); each 8-bit gray value is widened to 16 bits by * 0x101
+			s00ru := uint32(src.Pix[s00i]) * 0x101
+			s00r := float64(s00ru)
+			s10i := (sy0-src.Rect.Min.Y)*src.Stride + (sx1 - src.Rect.Min.X) // sample (sx1, sy0)
+			s10ru := uint32(src.Pix[s10i]) * 0x101
+			s10r := float64(s10ru)
+			s10r = xFrac1*s00r + xFrac0*s10r // horizontal blend of row sy0, stored in s10r
+			s01i := (sy1-src.Rect.Min.Y)*src.Stride + (sx0 - src.Rect.Min.X)
+			s01ru := uint32(src.Pix[s01i]) * 0x101
+			s01r := float64(s01ru)
+			s11i := (sy1-src.Rect.Min.Y)*src.Stride + (sx1 - src.Rect.Min.X) // sample (sx1, sy1); s01* above is sample (sx0, sy1)
+			s11ru := uint32(src.Pix[s11i]) * 0x101
+			s11r := float64(s11ru)
+			s11r = xFrac1*s01r + xFrac0*s11r
+			s11r = yFrac1*s10r + yFrac0*s11r // vertical blend of the row sy0 result (s10r) with the row sy1 result (s11r)
+			pr := uint32(s11r)
+			out := uint8(pr >> 8) // keep the high 8 bits of the 16-bit result
+			dst.Pix[d+0] = out
+			dst.Pix[d+1] = out
+			dst.Pix[d+2] = out
+			dst.Pix[d+3] = 0xff // alpha byte is always written as 0xff
+		}
+	}
+}
+
+func (ablInterpolator) transform_RGBA_NRGBA_Over(dst *image.RGBA, dr, adr image.Rectangle, d2s *f64.Aff3, src *image.NRGBA, sr image.Rectangle, bias image.Point, opts *Options) { // transform_RGBA_NRGBA_Over maps each destination pixel center through d2s into src, bilinearly interpolates the four neighbouring samples after multiplying color by alpha, and blends the result onto the existing dst pixel.
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		dyf := float64(dr.Min.Y+int(dy)) + 0.5
+		d := (dr.Min.Y+int(dy)-dst.Rect.Min.Y)*dst.Stride + (dr.Min.X+adr.Min.X-dst.Rect.Min.X)*4 // byte offset into dst.Pix of the first pixel written on this row
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
+			dxf := float64(dr.Min.X+int(dx)) + 0.5
+			sx := d2s[0]*dxf + d2s[1]*dyf + d2s[2] // (sx, sy) is d2s applied to the destination pixel center (dxf, dyf)
+			sy := d2s[3]*dxf + d2s[4]*dyf + d2s[5]
+			if !(image.Point{int(sx) + bias.X, int(sy) + bias.Y}).In(sr) { // skip (leave dst unchanged) when the mapped point lies outside sr
+				continue
+			}
+
+			sx -= 0.5 // shift by half a pixel before splitting into integer part and fraction
+			sx0 := int(sx)
+			xFrac0 := sx - float64(sx0) // weight of column sx1; xFrac1 weights column sx0
+			xFrac1 := 1 - xFrac0
+			sx0 += bias.X // bias is added after the fraction is taken, so it shifts only the integer coordinate
+			sx1 := sx0 + 1
+			if sx0 < sr.Min.X { // left of sr: use the first column only
+				sx0, sx1 = sr.Min.X, sr.Min.X
+				xFrac0, xFrac1 = 0, 1
+			} else if sx1 >= sr.Max.X { // right of sr: use the last column only
+				sx0, sx1 = sr.Max.X-1, sr.Max.X-1
+				xFrac0, xFrac1 = 1, 0
+			}
+
+			sy -= 0.5 // same half-pixel shift and split, vertically
+			sy0 := int(sy)
+			yFrac0 := sy - float64(sy0) // weight of row sy1; yFrac1 weights row sy0
+			yFrac1 := 1 - yFrac0
+			sy0 += bias.Y
+			sy1 := sy0 + 1
+			if sy0 < sr.Min.Y { // above sr: use the first row only
+				sy0, sy1 = sr.Min.Y, sr.Min.Y
+				yFrac0, yFrac1 = 0, 1
+			} else if sy1 >= sr.Max.Y { // below sr: use the last row only
+				sy0, sy1 = sr.Max.Y-1, sr.Max.Y-1
+				yFrac0, yFrac1 = 1, 0
+			}
+
+			s00i := (sy0-src.Rect.Min.Y)*src.Stride + (sx0-src.Rect.Min.X)*4 // sample (sx0, sy0): offset of its 4-byte pixel in src.Pix
+			s00au := uint32(src.Pix[s00i+3]) * 0x101
+			s00ru := uint32(src.Pix[s00i+0]) * s00au / 0xff // 8-bit color times 16-bit alpha, divided by 0xff, stays within 16 bits
+			s00gu := uint32(src.Pix[s00i+1]) * s00au / 0xff
+			s00bu := uint32(src.Pix[s00i+2]) * s00au / 0xff
+			s00r := float64(s00ru)
+			s00g := float64(s00gu)
+			s00b := float64(s00bu)
+			s00a := float64(s00au)
+			s10i := (sy0-src.Rect.Min.Y)*src.Stride + (sx1-src.Rect.Min.X)*4 // sample (sx1, sy0)
+			s10au := uint32(src.Pix[s10i+3]) * 0x101
+			s10ru := uint32(src.Pix[s10i+0]) * s10au / 0xff
+			s10gu := uint32(src.Pix[s10i+1]) * s10au / 0xff
+			s10bu := uint32(src.Pix[s10i+2]) * s10au / 0xff
+			s10r := float64(s10ru)
+			s10g := float64(s10gu)
+			s10b := float64(s10bu)
+			s10a := float64(s10au)
+			s10r = xFrac1*s00r + xFrac0*s10r // horizontal blend of row sy0, stored in s10*
+			s10g = xFrac1*s00g + xFrac0*s10g
+			s10b = xFrac1*s00b + xFrac0*s10b
+			s10a = xFrac1*s00a + xFrac0*s10a
+			s01i := (sy1-src.Rect.Min.Y)*src.Stride + (sx0-src.Rect.Min.X)*4 // sample (sx0, sy1)
+			s01au := uint32(src.Pix[s01i+3]) * 0x101
+			s01ru := uint32(src.Pix[s01i+0]) * s01au / 0xff
+			s01gu := uint32(src.Pix[s01i+1]) * s01au / 0xff
+			s01bu := uint32(src.Pix[s01i+2]) * s01au / 0xff
+			s01r := float64(s01ru)
+			s01g := float64(s01gu)
+			s01b := float64(s01bu)
+			s01a := float64(s01au)
+			s11i := (sy1-src.Rect.Min.Y)*src.Stride + (sx1-src.Rect.Min.X)*4 // sample (sx1, sy1)
+			s11au := uint32(src.Pix[s11i+3]) * 0x101
+			s11ru := uint32(src.Pix[s11i+0]) * s11au / 0xff
+			s11gu := uint32(src.Pix[s11i+1]) * s11au / 0xff
+			s11bu := uint32(src.Pix[s11i+2]) * s11au / 0xff
+			s11r := float64(s11ru)
+			s11g := float64(s11gu)
+			s11b := float64(s11bu)
+			s11a := float64(s11au)
+			s11r = xFrac1*s01r + xFrac0*s11r // horizontal blend of row sy1, stored in s11*
+			s11g = xFrac1*s01g + xFrac0*s11g
+			s11b = xFrac1*s01b + xFrac0*s11b
+			s11a = xFrac1*s01a + xFrac0*s11a
+			s11r = yFrac1*s10r + yFrac0*s11r // vertical blend of the two row results
+			s11g = yFrac1*s10g + yFrac0*s11g
+			s11b = yFrac1*s10b + yFrac0*s11b
+			s11a = yFrac1*s10a + yFrac0*s11a
+			pr := uint32(s11r) // truncate the interpolated channels back to integers
+			pg := uint32(s11g)
+			pb := uint32(s11b)
+			pa := uint32(s11a)
+			pa1 := (0xffff - pa) * 0x101 // weight of the existing dst pixel; the 0x101 factor widens the 8-bit dst bytes to 16 bits in the products below
+			dst.Pix[d+0] = uint8((uint32(dst.Pix[d+0])*pa1/0xffff + pr) >> 8)
+			dst.Pix[d+1] = uint8((uint32(dst.Pix[d+1])*pa1/0xffff + pg) >> 8)
+			dst.Pix[d+2] = uint8((uint32(dst.Pix[d+2])*pa1/0xffff + pb) >> 8)
+			dst.Pix[d+3] = uint8((uint32(dst.Pix[d+3])*pa1/0xffff + pa) >> 8)
+		}
+	}
+}
+
+func (ablInterpolator) transform_RGBA_NRGBA_Src(dst *image.RGBA, dr, adr image.Rectangle, d2s *f64.Aff3, src *image.NRGBA, sr image.Rectangle, bias image.Point, opts *Options) { // transform_RGBA_NRGBA_Src maps each destination pixel center through d2s into src, bilinearly interpolates the four neighbouring samples after multiplying color by alpha, and overwrites the dst pixel with the result.
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		dyf := float64(dr.Min.Y+int(dy)) + 0.5
+		d := (dr.Min.Y+int(dy)-dst.Rect.Min.Y)*dst.Stride + (dr.Min.X+adr.Min.X-dst.Rect.Min.X)*4 // byte offset into dst.Pix of the first pixel written on this row
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
+			dxf := float64(dr.Min.X+int(dx)) + 0.5
+			sx := d2s[0]*dxf + d2s[1]*dyf + d2s[2] // (sx, sy) is d2s applied to the destination pixel center (dxf, dyf)
+			sy := d2s[3]*dxf + d2s[4]*dyf + d2s[5]
+			if !(image.Point{int(sx) + bias.X, int(sy) + bias.Y}).In(sr) { // skip (leave dst unchanged) when the mapped point lies outside sr
+				continue
+			}
+
+			sx -= 0.5 // shift by half a pixel before splitting into integer part and fraction
+			sx0 := int(sx)
+			xFrac0 := sx - float64(sx0) // weight of column sx1; xFrac1 weights column sx0
+			xFrac1 := 1 - xFrac0
+			sx0 += bias.X // bias is added after the fraction is taken, so it shifts only the integer coordinate
+			sx1 := sx0 + 1
+			if sx0 < sr.Min.X { // left of sr: use the first column only
+				sx0, sx1 = sr.Min.X, sr.Min.X
+				xFrac0, xFrac1 = 0, 1
+			} else if sx1 >= sr.Max.X { // right of sr: use the last column only
+				sx0, sx1 = sr.Max.X-1, sr.Max.X-1
+				xFrac0, xFrac1 = 1, 0
+			}
+
+			sy -= 0.5 // same half-pixel shift and split, vertically
+			sy0 := int(sy)
+			yFrac0 := sy - float64(sy0) // weight of row sy1; yFrac1 weights row sy0
+			yFrac1 := 1 - yFrac0
+			sy0 += bias.Y
+			sy1 := sy0 + 1
+			if sy0 < sr.Min.Y { // above sr: use the first row only
+				sy0, sy1 = sr.Min.Y, sr.Min.Y
+				yFrac0, yFrac1 = 0, 1
+			} else if sy1 >= sr.Max.Y { // below sr: use the last row only
+				sy0, sy1 = sr.Max.Y-1, sr.Max.Y-1
+				yFrac0, yFrac1 = 1, 0
+			}
+
+			s00i := (sy0-src.Rect.Min.Y)*src.Stride + (sx0-src.Rect.Min.X)*4 // sample (sx0, sy0): offset of its 4-byte pixel in src.Pix
+			s00au := uint32(src.Pix[s00i+3]) * 0x101
+			s00ru := uint32(src.Pix[s00i+0]) * s00au / 0xff // 8-bit color times 16-bit alpha, divided by 0xff, stays within 16 bits
+			s00gu := uint32(src.Pix[s00i+1]) * s00au / 0xff
+			s00bu := uint32(src.Pix[s00i+2]) * s00au / 0xff
+			s00r := float64(s00ru)
+			s00g := float64(s00gu)
+			s00b := float64(s00bu)
+			s00a := float64(s00au)
+			s10i := (sy0-src.Rect.Min.Y)*src.Stride + (sx1-src.Rect.Min.X)*4 // sample (sx1, sy0)
+			s10au := uint32(src.Pix[s10i+3]) * 0x101
+			s10ru := uint32(src.Pix[s10i+0]) * s10au / 0xff
+			s10gu := uint32(src.Pix[s10i+1]) * s10au / 0xff
+			s10bu := uint32(src.Pix[s10i+2]) * s10au / 0xff
+			s10r := float64(s10ru)
+			s10g := float64(s10gu)
+			s10b := float64(s10bu)
+			s10a := float64(s10au)
+			s10r = xFrac1*s00r + xFrac0*s10r // horizontal blend of row sy0, stored in s10*
+			s10g = xFrac1*s00g + xFrac0*s10g
+			s10b = xFrac1*s00b + xFrac0*s10b
+			s10a = xFrac1*s00a + xFrac0*s10a
+			s01i := (sy1-src.Rect.Min.Y)*src.Stride + (sx0-src.Rect.Min.X)*4 // sample (sx0, sy1)
+			s01au := uint32(src.Pix[s01i+3]) * 0x101
+			s01ru := uint32(src.Pix[s01i+0]) * s01au / 0xff
+			s01gu := uint32(src.Pix[s01i+1]) * s01au / 0xff
+			s01bu := uint32(src.Pix[s01i+2]) * s01au / 0xff
+			s01r := float64(s01ru)
+			s01g := float64(s01gu)
+			s01b := float64(s01bu)
+			s01a := float64(s01au)
+			s11i := (sy1-src.Rect.Min.Y)*src.Stride + (sx1-src.Rect.Min.X)*4 // sample (sx1, sy1)
+			s11au := uint32(src.Pix[s11i+3]) * 0x101
+			s11ru := uint32(src.Pix[s11i+0]) * s11au / 0xff
+			s11gu := uint32(src.Pix[s11i+1]) * s11au / 0xff
+			s11bu := uint32(src.Pix[s11i+2]) * s11au / 0xff
+			s11r := float64(s11ru)
+			s11g := float64(s11gu)
+			s11b := float64(s11bu)
+			s11a := float64(s11au)
+			s11r = xFrac1*s01r + xFrac0*s11r // horizontal blend of row sy1, stored in s11*
+			s11g = xFrac1*s01g + xFrac0*s11g
+			s11b = xFrac1*s01b + xFrac0*s11b
+			s11a = xFrac1*s01a + xFrac0*s11a
+			s11r = yFrac1*s10r + yFrac0*s11r // vertical blend of the two row results
+			s11g = yFrac1*s10g + yFrac0*s11g
+			s11b = yFrac1*s10b + yFrac0*s11b
+			s11a = yFrac1*s10a + yFrac0*s11a
+			pr := uint32(s11r) // truncate the interpolated channels back to integers
+			pg := uint32(s11g)
+			pb := uint32(s11b)
+			pa := uint32(s11a)
+			dst.Pix[d+0] = uint8(pr >> 8) // store the high 8 bits of each 16-bit channel
+			dst.Pix[d+1] = uint8(pg >> 8)
+			dst.Pix[d+2] = uint8(pb >> 8)
+			dst.Pix[d+3] = uint8(pa >> 8)
+		}
+	}
+}
+
+func (ablInterpolator) transform_RGBA_RGBA_Over(dst *image.RGBA, dr, adr image.Rectangle, d2s *f64.Aff3, src *image.RGBA, sr image.Rectangle, bias image.Point, opts *Options) { // transform_RGBA_RGBA_Over maps each destination pixel center through d2s into src, bilinearly interpolates the four neighbouring RGBA samples and blends the result onto the existing dst pixel.
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		dyf := float64(dr.Min.Y+int(dy)) + 0.5
+		d := (dr.Min.Y+int(dy)-dst.Rect.Min.Y)*dst.Stride + (dr.Min.X+adr.Min.X-dst.Rect.Min.X)*4 // byte offset into dst.Pix of the first pixel written on this row
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
+			dxf := float64(dr.Min.X+int(dx)) + 0.5
+			sx := d2s[0]*dxf + d2s[1]*dyf + d2s[2] // (sx, sy) is d2s applied to the destination pixel center (dxf, dyf)
+			sy := d2s[3]*dxf + d2s[4]*dyf + d2s[5]
+			if !(image.Point{int(sx) + bias.X, int(sy) + bias.Y}).In(sr) { // skip (leave dst unchanged) when the mapped point lies outside sr
+				continue
+			}
+
+			sx -= 0.5 // shift by half a pixel before splitting into integer part and fraction
+			sx0 := int(sx)
+			xFrac0 := sx - float64(sx0) // weight of column sx1; xFrac1 weights column sx0
+			xFrac1 := 1 - xFrac0
+			sx0 += bias.X // bias is added after the fraction is taken, so it shifts only the integer coordinate
+			sx1 := sx0 + 1
+			if sx0 < sr.Min.X { // left of sr: use the first column only
+				sx0, sx1 = sr.Min.X, sr.Min.X
+				xFrac0, xFrac1 = 0, 1
+			} else if sx1 >= sr.Max.X { // right of sr: use the last column only
+				sx0, sx1 = sr.Max.X-1, sr.Max.X-1
+				xFrac0, xFrac1 = 1, 0
+			}
+
+			sy -= 0.5 // same half-pixel shift and split, vertically
+			sy0 := int(sy)
+			yFrac0 := sy - float64(sy0) // weight of row sy1; yFrac1 weights row sy0
+			yFrac1 := 1 - yFrac0
+			sy0 += bias.Y
+			sy1 := sy0 + 1
+			if sy0 < sr.Min.Y { // above sr: use the first row only
+				sy0, sy1 = sr.Min.Y, sr.Min.Y
+				yFrac0, yFrac1 = 0, 1
+			} else if sy1 >= sr.Max.Y { // below sr: use the last row only
+				sy0, sy1 = sr.Max.Y-1, sr.Max.Y-1
+				yFrac0, yFrac1 = 1, 0
+			}
+
+			s00i := (sy0-src.Rect.Min.Y)*src.Stride + (sx0-src.Rect.Min.X)*4 // sample (sx0, sy0); each 8-bit channel is widened to 16 bits by * 0x101
+			s00ru := uint32(src.Pix[s00i+0]) * 0x101
+			s00gu := uint32(src.Pix[s00i+1]) * 0x101
+			s00bu := uint32(src.Pix[s00i+2]) * 0x101
+			s00au := uint32(src.Pix[s00i+3]) * 0x101
+			s00r := float64(s00ru)
+			s00g := float64(s00gu)
+			s00b := float64(s00bu)
+			s00a := float64(s00au)
+			s10i := (sy0-src.Rect.Min.Y)*src.Stride + (sx1-src.Rect.Min.X)*4 // sample (sx1, sy0)
+			s10ru := uint32(src.Pix[s10i+0]) * 0x101
+			s10gu := uint32(src.Pix[s10i+1]) * 0x101
+			s10bu := uint32(src.Pix[s10i+2]) * 0x101
+			s10au := uint32(src.Pix[s10i+3]) * 0x101
+			s10r := float64(s10ru)
+			s10g := float64(s10gu)
+			s10b := float64(s10bu)
+			s10a := float64(s10au)
+			s10r = xFrac1*s00r + xFrac0*s10r // horizontal blend of row sy0, stored in s10*
+			s10g = xFrac1*s00g + xFrac0*s10g
+			s10b = xFrac1*s00b + xFrac0*s10b
+			s10a = xFrac1*s00a + xFrac0*s10a
+			s01i := (sy1-src.Rect.Min.Y)*src.Stride + (sx0-src.Rect.Min.X)*4 // sample (sx0, sy1)
+			s01ru := uint32(src.Pix[s01i+0]) * 0x101
+			s01gu := uint32(src.Pix[s01i+1]) * 0x101
+			s01bu := uint32(src.Pix[s01i+2]) * 0x101
+			s01au := uint32(src.Pix[s01i+3]) * 0x101
+			s01r := float64(s01ru)
+			s01g := float64(s01gu)
+			s01b := float64(s01bu)
+			s01a := float64(s01au)
+			s11i := (sy1-src.Rect.Min.Y)*src.Stride + (sx1-src.Rect.Min.X)*4 // sample (sx1, sy1)
+			s11ru := uint32(src.Pix[s11i+0]) * 0x101
+			s11gu := uint32(src.Pix[s11i+1]) * 0x101
+			s11bu := uint32(src.Pix[s11i+2]) * 0x101
+			s11au := uint32(src.Pix[s11i+3]) * 0x101
+			s11r := float64(s11ru)
+			s11g := float64(s11gu)
+			s11b := float64(s11bu)
+			s11a := float64(s11au)
+			s11r = xFrac1*s01r + xFrac0*s11r // horizontal blend of row sy1, stored in s11*
+			s11g = xFrac1*s01g + xFrac0*s11g
+			s11b = xFrac1*s01b + xFrac0*s11b
+			s11a = xFrac1*s01a + xFrac0*s11a
+			s11r = yFrac1*s10r + yFrac0*s11r // vertical blend of the two row results
+			s11g = yFrac1*s10g + yFrac0*s11g
+			s11b = yFrac1*s10b + yFrac0*s11b
+			s11a = yFrac1*s10a + yFrac0*s11a
+			pr := uint32(s11r) // truncate the interpolated channels back to integers
+			pg := uint32(s11g)
+			pb := uint32(s11b)
+			pa := uint32(s11a)
+			pa1 := (0xffff - pa) * 0x101 // weight of the existing dst pixel; the 0x101 factor widens the 8-bit dst bytes to 16 bits in the products below
+			dst.Pix[d+0] = uint8((uint32(dst.Pix[d+0])*pa1/0xffff + pr) >> 8)
+			dst.Pix[d+1] = uint8((uint32(dst.Pix[d+1])*pa1/0xffff + pg) >> 8)
+			dst.Pix[d+2] = uint8((uint32(dst.Pix[d+2])*pa1/0xffff + pb) >> 8)
+			dst.Pix[d+3] = uint8((uint32(dst.Pix[d+3])*pa1/0xffff + pa) >> 8)
+		}
+	}
+}
+
+func (ablInterpolator) transform_RGBA_RGBA_Src(dst *image.RGBA, dr, adr image.Rectangle, d2s *f64.Aff3, src *image.RGBA, sr image.Rectangle, bias image.Point, opts *Options) { // transform_RGBA_RGBA_Src maps each destination pixel center through d2s into src, bilinearly interpolates the four neighbouring RGBA samples and overwrites the dst pixel with the result.
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		dyf := float64(dr.Min.Y+int(dy)) + 0.5
+		d := (dr.Min.Y+int(dy)-dst.Rect.Min.Y)*dst.Stride + (dr.Min.X+adr.Min.X-dst.Rect.Min.X)*4 // byte offset into dst.Pix of the first pixel written on this row
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
+			dxf := float64(dr.Min.X+int(dx)) + 0.5
+			sx := d2s[0]*dxf + d2s[1]*dyf + d2s[2] // (sx, sy) is d2s applied to the destination pixel center (dxf, dyf)
+			sy := d2s[3]*dxf + d2s[4]*dyf + d2s[5]
+			if !(image.Point{int(sx) + bias.X, int(sy) + bias.Y}).In(sr) { // skip (leave dst unchanged) when the mapped point lies outside sr
+				continue
+			}
+
+			sx -= 0.5 // shift by half a pixel before splitting into integer part and fraction
+			sx0 := int(sx)
+			xFrac0 := sx - float64(sx0) // weight of column sx1; xFrac1 weights column sx0
+			xFrac1 := 1 - xFrac0
+			sx0 += bias.X // bias is added after the fraction is taken, so it shifts only the integer coordinate
+			sx1 := sx0 + 1
+			if sx0 < sr.Min.X { // left of sr: use the first column only
+				sx0, sx1 = sr.Min.X, sr.Min.X
+				xFrac0, xFrac1 = 0, 1
+			} else if sx1 >= sr.Max.X { // right of sr: use the last column only
+				sx0, sx1 = sr.Max.X-1, sr.Max.X-1
+				xFrac0, xFrac1 = 1, 0
+			}
+
+			sy -= 0.5 // same half-pixel shift and split, vertically
+			sy0 := int(sy)
+			yFrac0 := sy - float64(sy0) // weight of row sy1; yFrac1 weights row sy0
+			yFrac1 := 1 - yFrac0
+			sy0 += bias.Y
+			sy1 := sy0 + 1
+			if sy0 < sr.Min.Y { // above sr: use the first row only
+				sy0, sy1 = sr.Min.Y, sr.Min.Y
+				yFrac0, yFrac1 = 0, 1
+			} else if sy1 >= sr.Max.Y { // below sr: use the last row only
+				sy0, sy1 = sr.Max.Y-1, sr.Max.Y-1
+				yFrac0, yFrac1 = 1, 0
+			}
+
+			s00i := (sy0-src.Rect.Min.Y)*src.Stride + (sx0-src.Rect.Min.X)*4 // sample (sx0, sy0); each 8-bit channel is widened to 16 bits by * 0x101
+			s00ru := uint32(src.Pix[s00i+0]) * 0x101
+			s00gu := uint32(src.Pix[s00i+1]) * 0x101
+			s00bu := uint32(src.Pix[s00i+2]) * 0x101
+			s00au := uint32(src.Pix[s00i+3]) * 0x101
+			s00r := float64(s00ru)
+			s00g := float64(s00gu)
+			s00b := float64(s00bu)
+			s00a := float64(s00au)
+			s10i := (sy0-src.Rect.Min.Y)*src.Stride + (sx1-src.Rect.Min.X)*4 // sample (sx1, sy0)
+			s10ru := uint32(src.Pix[s10i+0]) * 0x101
+			s10gu := uint32(src.Pix[s10i+1]) * 0x101
+			s10bu := uint32(src.Pix[s10i+2]) * 0x101
+			s10au := uint32(src.Pix[s10i+3]) * 0x101
+			s10r := float64(s10ru)
+			s10g := float64(s10gu)
+			s10b := float64(s10bu)
+			s10a := float64(s10au)
+			s10r = xFrac1*s00r + xFrac0*s10r // horizontal blend of row sy0, stored in s10*
+			s10g = xFrac1*s00g + xFrac0*s10g
+			s10b = xFrac1*s00b + xFrac0*s10b
+			s10a = xFrac1*s00a + xFrac0*s10a
+			s01i := (sy1-src.Rect.Min.Y)*src.Stride + (sx0-src.Rect.Min.X)*4 // sample (sx0, sy1)
+			s01ru := uint32(src.Pix[s01i+0]) * 0x101
+			s01gu := uint32(src.Pix[s01i+1]) * 0x101
+			s01bu := uint32(src.Pix[s01i+2]) * 0x101
+			s01au := uint32(src.Pix[s01i+3]) * 0x101
+			s01r := float64(s01ru)
+			s01g := float64(s01gu)
+			s01b := float64(s01bu)
+			s01a := float64(s01au)
+			s11i := (sy1-src.Rect.Min.Y)*src.Stride + (sx1-src.Rect.Min.X)*4 // sample (sx1, sy1)
+			s11ru := uint32(src.Pix[s11i+0]) * 0x101
+			s11gu := uint32(src.Pix[s11i+1]) * 0x101
+			s11bu := uint32(src.Pix[s11i+2]) * 0x101
+			s11au := uint32(src.Pix[s11i+3]) * 0x101
+			s11r := float64(s11ru)
+			s11g := float64(s11gu)
+			s11b := float64(s11bu)
+			s11a := float64(s11au)
+			s11r = xFrac1*s01r + xFrac0*s11r // horizontal blend of row sy1, stored in s11*
+			s11g = xFrac1*s01g + xFrac0*s11g
+			s11b = xFrac1*s01b + xFrac0*s11b
+			s11a = xFrac1*s01a + xFrac0*s11a
+			s11r = yFrac1*s10r + yFrac0*s11r // vertical blend of the two row results
+			s11g = yFrac1*s10g + yFrac0*s11g
+			s11b = yFrac1*s10b + yFrac0*s11b
+			s11a = yFrac1*s10a + yFrac0*s11a
+			pr := uint32(s11r) // truncate the interpolated channels back to integers
+			pg := uint32(s11g)
+			pb := uint32(s11b)
+			pa := uint32(s11a)
+			dst.Pix[d+0] = uint8(pr >> 8) // store the high 8 bits of each 16-bit channel
+			dst.Pix[d+1] = uint8(pg >> 8)
+			dst.Pix[d+2] = uint8(pb >> 8)
+			dst.Pix[d+3] = uint8(pa >> 8)
+		}
+	}
+}
+
+func (ablInterpolator) transform_RGBA_YCbCr444_Src(dst *image.RGBA, dr, adr image.Rectangle, d2s *f64.Aff3, src *image.YCbCr, sr image.Rectangle, bias image.Point, opts *Options) { // transform_RGBA_YCbCr444_Src maps each destination pixel center through d2s into src, converts the four neighbouring Y/Cb/Cr samples to RGB (chroma indexed per pixel in both axes), bilinearly interpolates them and stores an opaque RGBA pixel.
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		dyf := float64(dr.Min.Y+int(dy)) + 0.5
+		d := (dr.Min.Y+int(dy)-dst.Rect.Min.Y)*dst.Stride + (dr.Min.X+adr.Min.X-dst.Rect.Min.X)*4 // byte offset into dst.Pix of the first pixel written on this row
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
+			dxf := float64(dr.Min.X+int(dx)) + 0.5
+			sx := d2s[0]*dxf + d2s[1]*dyf + d2s[2] // (sx, sy) is d2s applied to the destination pixel center (dxf, dyf)
+			sy := d2s[3]*dxf + d2s[4]*dyf + d2s[5]
+			if !(image.Point{int(sx) + bias.X, int(sy) + bias.Y}).In(sr) { // skip (leave dst unchanged) when the mapped point lies outside sr
+				continue
+			}
+
+			sx -= 0.5 // shift by half a pixel before splitting into integer part and fraction
+			sx0 := int(sx)
+			xFrac0 := sx - float64(sx0) // weight of column sx1; xFrac1 weights column sx0
+			xFrac1 := 1 - xFrac0
+			sx0 += bias.X // bias is added after the fraction is taken, so it shifts only the integer coordinate
+			sx1 := sx0 + 1
+			if sx0 < sr.Min.X { // left of sr: use the first column only
+				sx0, sx1 = sr.Min.X, sr.Min.X
+				xFrac0, xFrac1 = 0, 1
+			} else if sx1 >= sr.Max.X { // right of sr: use the last column only
+				sx0, sx1 = sr.Max.X-1, sr.Max.X-1
+				xFrac0, xFrac1 = 1, 0
+			}
+
+			sy -= 0.5 // same half-pixel shift and split, vertically
+			sy0 := int(sy)
+			yFrac0 := sy - float64(sy0) // weight of row sy1; yFrac1 weights row sy0
+			yFrac1 := 1 - yFrac0
+			sy0 += bias.Y
+			sy1 := sy0 + 1
+			if sy0 < sr.Min.Y { // above sr: use the first row only
+				sy0, sy1 = sr.Min.Y, sr.Min.Y
+				yFrac0, yFrac1 = 0, 1
+			} else if sy1 >= sr.Max.Y { // below sr: use the last row only
+				sy0, sy1 = sr.Max.Y-1, sr.Max.Y-1
+				yFrac0, yFrac1 = 1, 0
+			}
+
+			s00i := (sy0-src.Rect.Min.Y)*src.YStride + (sx0 - src.Rect.Min.X) // sample (sx0, sy0): s00i indexes src.Y, s00j indexes src.Cb and src.Cr
+			s00j := (sy0-src.Rect.Min.Y)*src.CStride + (sx0 - src.Rect.Min.X)
+
+			// This is an inline version of image/color/ycbcr.go's YCbCr.RGBA method.
+			s00yy1 := int(src.Y[s00i]) * 0x10100
+			s00cb1 := int(src.Cb[s00j]) - 128
+			s00cr1 := int(src.Cr[s00j]) - 128
+			s00ru := (s00yy1 + 91881*s00cr1) >> 8 // R, G and B are computed here and each clamped to [0, 0xffff] below
+			s00gu := (s00yy1 - 22554*s00cb1 - 46802*s00cr1) >> 8
+			s00bu := (s00yy1 + 116130*s00cb1) >> 8
+			if s00ru < 0 {
+				s00ru = 0
+			} else if s00ru > 0xffff {
+				s00ru = 0xffff
+			}
+			if s00gu < 0 {
+				s00gu = 0
+			} else if s00gu > 0xffff {
+				s00gu = 0xffff
+			}
+			if s00bu < 0 {
+				s00bu = 0
+			} else if s00bu > 0xffff {
+				s00bu = 0xffff
+			}
+
+			s00r := float64(s00ru)
+			s00g := float64(s00gu)
+			s00b := float64(s00bu)
+			s10i := (sy0-src.Rect.Min.Y)*src.YStride + (sx1 - src.Rect.Min.X) // sample (sx1, sy0)
+			s10j := (sy0-src.Rect.Min.Y)*src.CStride + (sx1 - src.Rect.Min.X)
+
+			// This is an inline version of image/color/ycbcr.go's YCbCr.RGBA method.
+			s10yy1 := int(src.Y[s10i]) * 0x10100
+			s10cb1 := int(src.Cb[s10j]) - 128
+			s10cr1 := int(src.Cr[s10j]) - 128
+			s10ru := (s10yy1 + 91881*s10cr1) >> 8
+			s10gu := (s10yy1 - 22554*s10cb1 - 46802*s10cr1) >> 8
+			s10bu := (s10yy1 + 116130*s10cb1) >> 8
+			if s10ru < 0 {
+				s10ru = 0
+			} else if s10ru > 0xffff {
+				s10ru = 0xffff
+			}
+			if s10gu < 0 {
+				s10gu = 0
+			} else if s10gu > 0xffff {
+				s10gu = 0xffff
+			}
+			if s10bu < 0 {
+				s10bu = 0
+			} else if s10bu > 0xffff {
+				s10bu = 0xffff
+			}
+
+			s10r := float64(s10ru)
+			s10g := float64(s10gu)
+			s10b := float64(s10bu)
+			s10r = xFrac1*s00r + xFrac0*s10r // horizontal blend of row sy0, stored in s10*
+			s10g = xFrac1*s00g + xFrac0*s10g
+			s10b = xFrac1*s00b + xFrac0*s10b
+			s01i := (sy1-src.Rect.Min.Y)*src.YStride + (sx0 - src.Rect.Min.X) // sample (sx0, sy1)
+			s01j := (sy1-src.Rect.Min.Y)*src.CStride + (sx0 - src.Rect.Min.X)
+
+			// This is an inline version of image/color/ycbcr.go's YCbCr.RGBA method.
+			s01yy1 := int(src.Y[s01i]) * 0x10100
+			s01cb1 := int(src.Cb[s01j]) - 128
+			s01cr1 := int(src.Cr[s01j]) - 128
+			s01ru := (s01yy1 + 91881*s01cr1) >> 8
+			s01gu := (s01yy1 - 22554*s01cb1 - 46802*s01cr1) >> 8
+			s01bu := (s01yy1 + 116130*s01cb1) >> 8
+			if s01ru < 0 {
+				s01ru = 0
+			} else if s01ru > 0xffff {
+				s01ru = 0xffff
+			}
+			if s01gu < 0 {
+				s01gu = 0
+			} else if s01gu > 0xffff {
+				s01gu = 0xffff
+			}
+			if s01bu < 0 {
+				s01bu = 0
+			} else if s01bu > 0xffff {
+				s01bu = 0xffff
+			}
+
+			s01r := float64(s01ru)
+			s01g := float64(s01gu)
+			s01b := float64(s01bu)
+			s11i := (sy1-src.Rect.Min.Y)*src.YStride + (sx1 - src.Rect.Min.X) // sample (sx1, sy1)
+			s11j := (sy1-src.Rect.Min.Y)*src.CStride + (sx1 - src.Rect.Min.X)
+
+			// This is an inline version of image/color/ycbcr.go's YCbCr.RGBA method.
+			s11yy1 := int(src.Y[s11i]) * 0x10100
+			s11cb1 := int(src.Cb[s11j]) - 128
+			s11cr1 := int(src.Cr[s11j]) - 128
+			s11ru := (s11yy1 + 91881*s11cr1) >> 8
+			s11gu := (s11yy1 - 22554*s11cb1 - 46802*s11cr1) >> 8
+			s11bu := (s11yy1 + 116130*s11cb1) >> 8
+			if s11ru < 0 {
+				s11ru = 0
+			} else if s11ru > 0xffff {
+				s11ru = 0xffff
+			}
+			if s11gu < 0 {
+				s11gu = 0
+			} else if s11gu > 0xffff {
+				s11gu = 0xffff
+			}
+			if s11bu < 0 {
+				s11bu = 0
+			} else if s11bu > 0xffff {
+				s11bu = 0xffff
+			}
+
+			s11r := float64(s11ru)
+			s11g := float64(s11gu)
+			s11b := float64(s11bu)
+			s11r = xFrac1*s01r + xFrac0*s11r // horizontal blend of row sy1, stored in s11*
+			s11g = xFrac1*s01g + xFrac0*s11g
+			s11b = xFrac1*s01b + xFrac0*s11b
+			s11r = yFrac1*s10r + yFrac0*s11r // vertical blend of the two row results
+			s11g = yFrac1*s10g + yFrac0*s11g
+			s11b = yFrac1*s10b + yFrac0*s11b
+			pr := uint32(s11r) // truncate the interpolated channels back to integers
+			pg := uint32(s11g)
+			pb := uint32(s11b)
+			dst.Pix[d+0] = uint8(pr >> 8) // store the high 8 bits of each 16-bit channel
+			dst.Pix[d+1] = uint8(pg >> 8)
+			dst.Pix[d+2] = uint8(pb >> 8)
+			dst.Pix[d+3] = 0xff // alpha byte is always written as 0xff
+		}
+	}
+}
+
+func (ablInterpolator) transform_RGBA_YCbCr422_Src(dst *image.RGBA, dr, adr image.Rectangle, d2s *f64.Aff3, src *image.YCbCr, sr image.Rectangle, bias image.Point, opts *Options) { // transform_RGBA_YCbCr422_Src maps each destination pixel center through d2s into src, converts the four neighbouring Y/Cb/Cr samples to RGB (chroma indexed at x/2 horizontally, per row vertically), bilinearly interpolates them and stores an opaque RGBA pixel.
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		dyf := float64(dr.Min.Y+int(dy)) + 0.5
+		d := (dr.Min.Y+int(dy)-dst.Rect.Min.Y)*dst.Stride + (dr.Min.X+adr.Min.X-dst.Rect.Min.X)*4 // byte offset into dst.Pix of the first pixel written on this row
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
+			dxf := float64(dr.Min.X+int(dx)) + 0.5
+			sx := d2s[0]*dxf + d2s[1]*dyf + d2s[2] // (sx, sy) is d2s applied to the destination pixel center (dxf, dyf)
+			sy := d2s[3]*dxf + d2s[4]*dyf + d2s[5]
+			if !(image.Point{int(sx) + bias.X, int(sy) + bias.Y}).In(sr) { // skip (leave dst unchanged) when the mapped point lies outside sr
+				continue
+			}
+
+			sx -= 0.5 // shift by half a pixel before splitting into integer part and fraction
+			sx0 := int(sx)
+			xFrac0 := sx - float64(sx0) // weight of column sx1; xFrac1 weights column sx0
+			xFrac1 := 1 - xFrac0
+			sx0 += bias.X // bias is added after the fraction is taken, so it shifts only the integer coordinate
+			sx1 := sx0 + 1
+			if sx0 < sr.Min.X { // left of sr: use the first column only
+				sx0, sx1 = sr.Min.X, sr.Min.X
+				xFrac0, xFrac1 = 0, 1
+			} else if sx1 >= sr.Max.X { // right of sr: use the last column only
+				sx0, sx1 = sr.Max.X-1, sr.Max.X-1
+				xFrac0, xFrac1 = 1, 0
+			}
+
+			sy -= 0.5 // same half-pixel shift and split, vertically
+			sy0 := int(sy)
+			yFrac0 := sy - float64(sy0) // weight of row sy1; yFrac1 weights row sy0
+			yFrac1 := 1 - yFrac0
+			sy0 += bias.Y
+			sy1 := sy0 + 1
+			if sy0 < sr.Min.Y { // above sr: use the first row only
+				sy0, sy1 = sr.Min.Y, sr.Min.Y
+				yFrac0, yFrac1 = 0, 1
+			} else if sy1 >= sr.Max.Y { // below sr: use the last row only
+				sy0, sy1 = sr.Max.Y-1, sr.Max.Y-1
+				yFrac0, yFrac1 = 1, 0
+			}
+
+			s00i := (sy0-src.Rect.Min.Y)*src.YStride + (sx0 - src.Rect.Min.X) // sample (sx0, sy0): s00i indexes src.Y, s00j indexes src.Cb and src.Cr using x/2
+			s00j := (sy0-src.Rect.Min.Y)*src.CStride + ((sx0)/2 - src.Rect.Min.X/2)
+
+			// This is an inline version of image/color/ycbcr.go's YCbCr.RGBA method.
+			s00yy1 := int(src.Y[s00i]) * 0x10100
+			s00cb1 := int(src.Cb[s00j]) - 128
+			s00cr1 := int(src.Cr[s00j]) - 128
+			s00ru := (s00yy1 + 91881*s00cr1) >> 8 // R, G and B are computed here and each clamped to [0, 0xffff] below
+			s00gu := (s00yy1 - 22554*s00cb1 - 46802*s00cr1) >> 8
+			s00bu := (s00yy1 + 116130*s00cb1) >> 8
+			if s00ru < 0 {
+				s00ru = 0
+			} else if s00ru > 0xffff {
+				s00ru = 0xffff
+			}
+			if s00gu < 0 {
+				s00gu = 0
+			} else if s00gu > 0xffff {
+				s00gu = 0xffff
+			}
+			if s00bu < 0 {
+				s00bu = 0
+			} else if s00bu > 0xffff {
+				s00bu = 0xffff
+			}
+
+			s00r := float64(s00ru)
+			s00g := float64(s00gu)
+			s00b := float64(s00bu)
+			s10i := (sy0-src.Rect.Min.Y)*src.YStride + (sx1 - src.Rect.Min.X) // sample (sx1, sy0)
+			s10j := (sy0-src.Rect.Min.Y)*src.CStride + ((sx1)/2 - src.Rect.Min.X/2)
+
+			// This is an inline version of image/color/ycbcr.go's YCbCr.RGBA method.
+			s10yy1 := int(src.Y[s10i]) * 0x10100
+			s10cb1 := int(src.Cb[s10j]) - 128
+			s10cr1 := int(src.Cr[s10j]) - 128
+			s10ru := (s10yy1 + 91881*s10cr1) >> 8
+			s10gu := (s10yy1 - 22554*s10cb1 - 46802*s10cr1) >> 8
+			s10bu := (s10yy1 + 116130*s10cb1) >> 8
+			if s10ru < 0 {
+				s10ru = 0
+			} else if s10ru > 0xffff {
+				s10ru = 0xffff
+			}
+			if s10gu < 0 {
+				s10gu = 0
+			} else if s10gu > 0xffff {
+				s10gu = 0xffff
+			}
+			if s10bu < 0 {
+				s10bu = 0
+			} else if s10bu > 0xffff {
+				s10bu = 0xffff
+			}
+
+			s10r := float64(s10ru)
+			s10g := float64(s10gu)
+			s10b := float64(s10bu)
+			s10r = xFrac1*s00r + xFrac0*s10r // horizontal blend of row sy0, stored in s10*
+			s10g = xFrac1*s00g + xFrac0*s10g
+			s10b = xFrac1*s00b + xFrac0*s10b
+			s01i := (sy1-src.Rect.Min.Y)*src.YStride + (sx0 - src.Rect.Min.X) // sample (sx0, sy1)
+			s01j := (sy1-src.Rect.Min.Y)*src.CStride + ((sx0)/2 - src.Rect.Min.X/2)
+
+			// This is an inline version of image/color/ycbcr.go's YCbCr.RGBA method.
+			s01yy1 := int(src.Y[s01i]) * 0x10100
+			s01cb1 := int(src.Cb[s01j]) - 128
+			s01cr1 := int(src.Cr[s01j]) - 128
+			s01ru := (s01yy1 + 91881*s01cr1) >> 8
+			s01gu := (s01yy1 - 22554*s01cb1 - 46802*s01cr1) >> 8
+			s01bu := (s01yy1 + 116130*s01cb1) >> 8
+			if s01ru < 0 {
+				s01ru = 0
+			} else if s01ru > 0xffff {
+				s01ru = 0xffff
+			}
+			if s01gu < 0 {
+				s01gu = 0
+			} else if s01gu > 0xffff {
+				s01gu = 0xffff
+			}
+			if s01bu < 0 {
+				s01bu = 0
+			} else if s01bu > 0xffff {
+				s01bu = 0xffff
+			}
+
+			s01r := float64(s01ru)
+			s01g := float64(s01gu)
+			s01b := float64(s01bu)
+			s11i := (sy1-src.Rect.Min.Y)*src.YStride + (sx1 - src.Rect.Min.X) // sample (sx1, sy1)
+			s11j := (sy1-src.Rect.Min.Y)*src.CStride + ((sx1)/2 - src.Rect.Min.X/2)
+
+			// This is an inline version of image/color/ycbcr.go's YCbCr.RGBA method.
+			s11yy1 := int(src.Y[s11i]) * 0x10100
+			s11cb1 := int(src.Cb[s11j]) - 128
+			s11cr1 := int(src.Cr[s11j]) - 128
+			s11ru := (s11yy1 + 91881*s11cr1) >> 8
+			s11gu := (s11yy1 - 22554*s11cb1 - 46802*s11cr1) >> 8
+			s11bu := (s11yy1 + 116130*s11cb1) >> 8
+			if s11ru < 0 {
+				s11ru = 0
+			} else if s11ru > 0xffff {
+				s11ru = 0xffff
+			}
+			if s11gu < 0 {
+				s11gu = 0
+			} else if s11gu > 0xffff {
+				s11gu = 0xffff
+			}
+			if s11bu < 0 {
+				s11bu = 0
+			} else if s11bu > 0xffff {
+				s11bu = 0xffff
+			}
+
+			s11r := float64(s11ru)
+			s11g := float64(s11gu)
+			s11b := float64(s11bu)
+			s11r = xFrac1*s01r + xFrac0*s11r // horizontal blend of row sy1, stored in s11*
+			s11g = xFrac1*s01g + xFrac0*s11g
+			s11b = xFrac1*s01b + xFrac0*s11b
+			s11r = yFrac1*s10r + yFrac0*s11r // vertical blend of the two row results
+			s11g = yFrac1*s10g + yFrac0*s11g
+			s11b = yFrac1*s10b + yFrac0*s11b
+			pr := uint32(s11r) // truncate the interpolated channels back to integers
+			pg := uint32(s11g)
+			pb := uint32(s11b)
+			dst.Pix[d+0] = uint8(pr >> 8) // store the high 8 bits of each 16-bit channel
+			dst.Pix[d+1] = uint8(pg >> 8)
+			dst.Pix[d+2] = uint8(pb >> 8)
+			dst.Pix[d+3] = 0xff // alpha byte is always written as 0xff
+		}
+	}
+}
+
+func (ablInterpolator) transform_RGBA_YCbCr420_Src(dst *image.RGBA, dr, adr image.Rectangle, d2s *f64.Aff3, src *image.YCbCr, sr image.Rectangle, bias image.Point, opts *Options) {
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		dyf := float64(dr.Min.Y+int(dy)) + 0.5
+		d := (dr.Min.Y+int(dy)-dst.Rect.Min.Y)*dst.Stride + (dr.Min.X+adr.Min.X-dst.Rect.Min.X)*4
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
+			dxf := float64(dr.Min.X+int(dx)) + 0.5
+			sx := d2s[0]*dxf + d2s[1]*dyf + d2s[2]
+			sy := d2s[3]*dxf + d2s[4]*dyf + d2s[5]
+			if !(image.Point{int(sx) + bias.X, int(sy) + bias.Y}).In(sr) {
+				continue
+			}
+
+			sx -= 0.5
+			sx0 := int(sx)
+			xFrac0 := sx - float64(sx0)
+			xFrac1 := 1 - xFrac0
+			sx0 += bias.X
+			sx1 := sx0 + 1
+			if sx0 < sr.Min.X {
+				sx0, sx1 = sr.Min.X, sr.Min.X
+				xFrac0, xFrac1 = 0, 1
+			} else if sx1 >= sr.Max.X {
+				sx0, sx1 = sr.Max.X-1, sr.Max.X-1
+				xFrac0, xFrac1 = 1, 0
+			}
+
+			sy -= 0.5
+			sy0 := int(sy)
+			yFrac0 := sy - float64(sy0)
+			yFrac1 := 1 - yFrac0
+			sy0 += bias.Y
+			sy1 := sy0 + 1
+			if sy0 < sr.Min.Y {
+				sy0, sy1 = sr.Min.Y, sr.Min.Y
+				yFrac0, yFrac1 = 0, 1
+			} else if sy1 >= sr.Max.Y {
+				sy0, sy1 = sr.Max.Y-1, sr.Max.Y-1
+				yFrac0, yFrac1 = 1, 0
+			}
+
+			s00i := (sy0-src.Rect.Min.Y)*src.YStride + (sx0 - src.Rect.Min.X)
+			s00j := ((sy0)/2-src.Rect.Min.Y/2)*src.CStride + ((sx0)/2 - src.Rect.Min.X/2)
+
+			// This is an inline version of image/color/ycbcr.go's YCbCr.RGBA method.
+			s00yy1 := int(src.Y[s00i]) * 0x10100
+			s00cb1 := int(src.Cb[s00j]) - 128
+			s00cr1 := int(src.Cr[s00j]) - 128
+			s00ru := (s00yy1 + 91881*s00cr1) >> 8
+			s00gu := (s00yy1 - 22554*s00cb1 - 46802*s00cr1) >> 8
+			s00bu := (s00yy1 + 116130*s00cb1) >> 8
+			if s00ru < 0 {
+				s00ru = 0
+			} else if s00ru > 0xffff {
+				s00ru = 0xffff
+			}
+			if s00gu < 0 {
+				s00gu = 0
+			} else if s00gu > 0xffff {
+				s00gu = 0xffff
+			}
+			if s00bu < 0 {
+				s00bu = 0
+			} else if s00bu > 0xffff {
+				s00bu = 0xffff
+			}
+
+			s00r := float64(s00ru)
+			s00g := float64(s00gu)
+			s00b := float64(s00bu)
+			s10i := (sy0-src.Rect.Min.Y)*src.YStride + (sx1 - src.Rect.Min.X)
+			s10j := ((sy0)/2-src.Rect.Min.Y/2)*src.CStride + ((sx1)/2 - src.Rect.Min.X/2)
+
+			// This is an inline version of image/color/ycbcr.go's YCbCr.RGBA method.
+			s10yy1 := int(src.Y[s10i]) * 0x10100
+			s10cb1 := int(src.Cb[s10j]) - 128
+			s10cr1 := int(src.Cr[s10j]) - 128
+			s10ru := (s10yy1 + 91881*s10cr1) >> 8
+			s10gu := (s10yy1 - 22554*s10cb1 - 46802*s10cr1) >> 8
+			s10bu := (s10yy1 + 116130*s10cb1) >> 8
+			if s10ru < 0 {
+				s10ru = 0
+			} else if s10ru > 0xffff {
+				s10ru = 0xffff
+			}
+			if s10gu < 0 {
+				s10gu = 0
+			} else if s10gu > 0xffff {
+				s10gu = 0xffff
+			}
+			if s10bu < 0 {
+				s10bu = 0
+			} else if s10bu > 0xffff {
+				s10bu = 0xffff
+			}
+
+			s10r := float64(s10ru)
+			s10g := float64(s10gu)
+			s10b := float64(s10bu)
+			s10r = xFrac1*s00r + xFrac0*s10r
+			s10g = xFrac1*s00g + xFrac0*s10g
+			s10b = xFrac1*s00b + xFrac0*s10b
+			s01i := (sy1-src.Rect.Min.Y)*src.YStride + (sx0 - src.Rect.Min.X)
+			s01j := ((sy1)/2-src.Rect.Min.Y/2)*src.CStride + ((sx0)/2 - src.Rect.Min.X/2)
+
+			// This is an inline version of image/color/ycbcr.go's YCbCr.RGBA method.
+			s01yy1 := int(src.Y[s01i]) * 0x10100
+			s01cb1 := int(src.Cb[s01j]) - 128
+			s01cr1 := int(src.Cr[s01j]) - 128
+			s01ru := (s01yy1 + 91881*s01cr1) >> 8
+			s01gu := (s01yy1 - 22554*s01cb1 - 46802*s01cr1) >> 8
+			s01bu := (s01yy1 + 116130*s01cb1) >> 8
+			if s01ru < 0 {
+				s01ru = 0
+			} else if s01ru > 0xffff {
+				s01ru = 0xffff
+			}
+			if s01gu < 0 {
+				s01gu = 0
+			} else if s01gu > 0xffff {
+				s01gu = 0xffff
+			}
+			if s01bu < 0 {
+				s01bu = 0
+			} else if s01bu > 0xffff {
+				s01bu = 0xffff
+			}
+
+			s01r := float64(s01ru)
+			s01g := float64(s01gu)
+			s01b := float64(s01bu)
+			s11i := (sy1-src.Rect.Min.Y)*src.YStride + (sx1 - src.Rect.Min.X)
+			s11j := ((sy1)/2-src.Rect.Min.Y/2)*src.CStride + ((sx1)/2 - src.Rect.Min.X/2)
+
+			// This is an inline version of image/color/ycbcr.go's YCbCr.RGBA method.
+			s11yy1 := int(src.Y[s11i]) * 0x10100
+			s11cb1 := int(src.Cb[s11j]) - 128
+			s11cr1 := int(src.Cr[s11j]) - 128
+			s11ru := (s11yy1 + 91881*s11cr1) >> 8
+			s11gu := (s11yy1 - 22554*s11cb1 - 46802*s11cr1) >> 8
+			s11bu := (s11yy1 + 116130*s11cb1) >> 8
+			if s11ru < 0 {
+				s11ru = 0
+			} else if s11ru > 0xffff {
+				s11ru = 0xffff
+			}
+			if s11gu < 0 {
+				s11gu = 0
+			} else if s11gu > 0xffff {
+				s11gu = 0xffff
+			}
+			if s11bu < 0 {
+				s11bu = 0
+			} else if s11bu > 0xffff {
+				s11bu = 0xffff
+			}
+
+			s11r := float64(s11ru)
+			s11g := float64(s11gu)
+			s11b := float64(s11bu)
+			s11r = xFrac1*s01r + xFrac0*s11r
+			s11g = xFrac1*s01g + xFrac0*s11g
+			s11b = xFrac1*s01b + xFrac0*s11b
+			s11r = yFrac1*s10r + yFrac0*s11r
+			s11g = yFrac1*s10g + yFrac0*s11g
+			s11b = yFrac1*s10b + yFrac0*s11b
+			pr := uint32(s11r)
+			pg := uint32(s11g)
+			pb := uint32(s11b)
+			dst.Pix[d+0] = uint8(pr >> 8)
+			dst.Pix[d+1] = uint8(pg >> 8)
+			dst.Pix[d+2] = uint8(pb >> 8)
+			dst.Pix[d+3] = 0xff
+		}
+	}
+}
+
+func (ablInterpolator) transform_RGBA_YCbCr440_Src(dst *image.RGBA, dr, adr image.Rectangle, d2s *f64.Aff3, src *image.YCbCr, sr image.Rectangle, bias image.Point, opts *Options) { // bilinear Src leaf for 4:4:0 YCbCr sources; opts is not used here
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		dyf := float64(dr.Min.Y+int(dy)) + 0.5
+		d := (dr.Min.Y+int(dy)-dst.Rect.Min.Y)*dst.Stride + (dr.Min.X+adr.Min.X-dst.Rect.Min.X)*4 // byte offset in dst.Pix of this row's first affected pixel
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
+			dxf := float64(dr.Min.X+int(dx)) + 0.5 // +0.5 samples at the dst pixel's center
+			sx := d2s[0]*dxf + d2s[1]*dyf + d2s[2]
+			sy := d2s[3]*dxf + d2s[4]*dyf + d2s[5]
+			if !(image.Point{int(sx) + bias.X, int(sy) + bias.Y}).In(sr) {
+				continue // maps outside sr: leave this dst pixel untouched
+			}
+
+			sx -= 0.5 // shift by half a pixel so that int(sx) is the left of the two columns blended
+			sx0 := int(sx)
+			xFrac0 := sx - float64(sx0)
+			xFrac1 := 1 - xFrac0
+			sx0 += bias.X // bias is added only after truncation, giving the real source column
+			sx1 := sx0 + 1
+			if sx0 < sr.Min.X { // clamp both taps to sr's first column
+				sx0, sx1 = sr.Min.X, sr.Min.X
+				xFrac0, xFrac1 = 0, 1
+			} else if sx1 >= sr.Max.X { // clamp both taps to sr's last column
+				sx0, sx1 = sr.Max.X-1, sr.Max.X-1
+				xFrac0, xFrac1 = 1, 0
+			}
+
+			sy -= 0.5 // rows get the same treatment as columns above
+			sy0 := int(sy)
+			yFrac0 := sy - float64(sy0)
+			yFrac1 := 1 - yFrac0
+			sy0 += bias.Y
+			sy1 := sy0 + 1
+			if sy0 < sr.Min.Y {
+				sy0, sy1 = sr.Min.Y, sr.Min.Y
+				yFrac0, yFrac1 = 0, 1
+			} else if sy1 >= sr.Max.Y {
+				sy0, sy1 = sr.Max.Y-1, sr.Max.Y-1
+				yFrac0, yFrac1 = 1, 0
+			}
+
+			s00i := (sy0-src.Rect.Min.Y)*src.YStride + (sx0 - src.Rect.Min.X)
+			s00j := ((sy0)/2-src.Rect.Min.Y/2)*src.CStride + (sx0 - src.Rect.Min.X) // chroma index: the row is halved, the column is not
+
+			// This is an inline version of image/color/ycbcr.go's YCbCr.RGBA method.
+			s00yy1 := int(src.Y[s00i]) * 0x10100
+			s00cb1 := int(src.Cb[s00j]) - 128
+			s00cr1 := int(src.Cr[s00j]) - 128
+			s00ru := (s00yy1 + 91881*s00cr1) >> 8
+			s00gu := (s00yy1 - 22554*s00cb1 - 46802*s00cr1) >> 8
+			s00bu := (s00yy1 + 116130*s00cb1) >> 8
+			if s00ru < 0 { // clamp each channel to [0, 0xffff]
+				s00ru = 0
+			} else if s00ru > 0xffff {
+				s00ru = 0xffff
+			}
+			if s00gu < 0 {
+				s00gu = 0
+			} else if s00gu > 0xffff {
+				s00gu = 0xffff
+			}
+			if s00bu < 0 {
+				s00bu = 0
+			} else if s00bu > 0xffff {
+				s00bu = 0xffff
+			}
+
+			s00r := float64(s00ru)
+			s00g := float64(s00gu)
+			s00b := float64(s00bu)
+			s10i := (sy0-src.Rect.Min.Y)*src.YStride + (sx1 - src.Rect.Min.X)
+			s10j := ((sy0)/2-src.Rect.Min.Y/2)*src.CStride + (sx1 - src.Rect.Min.X)
+
+			// This is an inline version of image/color/ycbcr.go's YCbCr.RGBA method.
+			s10yy1 := int(src.Y[s10i]) * 0x10100
+			s10cb1 := int(src.Cb[s10j]) - 128
+			s10cr1 := int(src.Cr[s10j]) - 128
+			s10ru := (s10yy1 + 91881*s10cr1) >> 8
+			s10gu := (s10yy1 - 22554*s10cb1 - 46802*s10cr1) >> 8
+			s10bu := (s10yy1 + 116130*s10cb1) >> 8
+			if s10ru < 0 {
+				s10ru = 0
+			} else if s10ru > 0xffff {
+				s10ru = 0xffff
+			}
+			if s10gu < 0 {
+				s10gu = 0
+			} else if s10gu > 0xffff {
+				s10gu = 0xffff
+			}
+			if s10bu < 0 {
+				s10bu = 0
+			} else if s10bu > 0xffff {
+				s10bu = 0xffff
+			}
+
+			s10r := float64(s10ru)
+			s10g := float64(s10gu)
+			s10b := float64(s10bu)
+			s10r = xFrac1*s00r + xFrac0*s10r // s10* becomes the horizontal blend along row sy0
+			s10g = xFrac1*s00g + xFrac0*s10g
+			s10b = xFrac1*s00b + xFrac0*s10b
+			s01i := (sy1-src.Rect.Min.Y)*src.YStride + (sx0 - src.Rect.Min.X)
+			s01j := ((sy1)/2-src.Rect.Min.Y/2)*src.CStride + (sx0 - src.Rect.Min.X)
+
+			// This is an inline version of image/color/ycbcr.go's YCbCr.RGBA method.
+			s01yy1 := int(src.Y[s01i]) * 0x10100
+			s01cb1 := int(src.Cb[s01j]) - 128
+			s01cr1 := int(src.Cr[s01j]) - 128
+			s01ru := (s01yy1 + 91881*s01cr1) >> 8
+			s01gu := (s01yy1 - 22554*s01cb1 - 46802*s01cr1) >> 8
+			s01bu := (s01yy1 + 116130*s01cb1) >> 8
+			if s01ru < 0 {
+				s01ru = 0
+			} else if s01ru > 0xffff {
+				s01ru = 0xffff
+			}
+			if s01gu < 0 {
+				s01gu = 0
+			} else if s01gu > 0xffff {
+				s01gu = 0xffff
+			}
+			if s01bu < 0 {
+				s01bu = 0
+			} else if s01bu > 0xffff {
+				s01bu = 0xffff
+			}
+
+			s01r := float64(s01ru)
+			s01g := float64(s01gu)
+			s01b := float64(s01bu)
+			s11i := (sy1-src.Rect.Min.Y)*src.YStride + (sx1 - src.Rect.Min.X)
+			s11j := ((sy1)/2-src.Rect.Min.Y/2)*src.CStride + (sx1 - src.Rect.Min.X)
+
+			// This is an inline version of image/color/ycbcr.go's YCbCr.RGBA method.
+			s11yy1 := int(src.Y[s11i]) * 0x10100
+			s11cb1 := int(src.Cb[s11j]) - 128
+			s11cr1 := int(src.Cr[s11j]) - 128
+			s11ru := (s11yy1 + 91881*s11cr1) >> 8
+			s11gu := (s11yy1 - 22554*s11cb1 - 46802*s11cr1) >> 8
+			s11bu := (s11yy1 + 116130*s11cb1) >> 8
+			if s11ru < 0 {
+				s11ru = 0
+			} else if s11ru > 0xffff {
+				s11ru = 0xffff
+			}
+			if s11gu < 0 {
+				s11gu = 0
+			} else if s11gu > 0xffff {
+				s11gu = 0xffff
+			}
+			if s11bu < 0 {
+				s11bu = 0
+			} else if s11bu > 0xffff {
+				s11bu = 0xffff
+			}
+
+			s11r := float64(s11ru)
+			s11g := float64(s11gu)
+			s11b := float64(s11bu)
+			s11r = xFrac1*s01r + xFrac0*s11r // s11* becomes the horizontal blend along row sy1
+			s11g = xFrac1*s01g + xFrac0*s11g
+			s11b = xFrac1*s01b + xFrac0*s11b
+			s11r = yFrac1*s10r + yFrac0*s11r // then the vertical blend of the two rows
+			s11g = yFrac1*s10g + yFrac0*s11g
+			s11b = yFrac1*s10b + yFrac0*s11b
+			pr := uint32(s11r) // truncate the blended value, which stays within [0, 0xffff]
+			pg := uint32(s11g)
+			pb := uint32(s11b)
+			dst.Pix[d+0] = uint8(pr >> 8) // keep the high 8 bits of each channel
+			dst.Pix[d+1] = uint8(pg >> 8)
+			dst.Pix[d+2] = uint8(pb >> 8)
+			dst.Pix[d+3] = 0xff // alpha is always written as fully opaque
+		}
+	}
+}
+
+func (ablInterpolator) transform_RGBA_Image_Over(dst *image.RGBA, dr, adr image.Rectangle, d2s *f64.Aff3, src image.Image, sr image.Rectangle, bias image.Point, opts *Options) { // bilinear Over leaf: any image.Image source onto an RGBA dst; opts is not used here
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		dyf := float64(dr.Min.Y+int(dy)) + 0.5
+		d := (dr.Min.Y+int(dy)-dst.Rect.Min.Y)*dst.Stride + (dr.Min.X+adr.Min.X-dst.Rect.Min.X)*4 // byte offset in dst.Pix of this row's first affected pixel
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
+			dxf := float64(dr.Min.X+int(dx)) + 0.5 // +0.5 samples at the dst pixel's center
+			sx := d2s[0]*dxf + d2s[1]*dyf + d2s[2]
+			sy := d2s[3]*dxf + d2s[4]*dyf + d2s[5]
+			if !(image.Point{int(sx) + bias.X, int(sy) + bias.Y}).In(sr) {
+				continue // maps outside sr: leave this dst pixel untouched
+			}
+
+			sx -= 0.5 // shift by half a pixel so that int(sx) is the left of the two columns blended
+			sx0 := int(sx)
+			xFrac0 := sx - float64(sx0)
+			xFrac1 := 1 - xFrac0
+			sx0 += bias.X // bias is added only after truncation, giving the real source column
+			sx1 := sx0 + 1
+			if sx0 < sr.Min.X { // clamp both taps to sr's first column
+				sx0, sx1 = sr.Min.X, sr.Min.X
+				xFrac0, xFrac1 = 0, 1
+			} else if sx1 >= sr.Max.X { // clamp both taps to sr's last column
+				sx0, sx1 = sr.Max.X-1, sr.Max.X-1
+				xFrac0, xFrac1 = 1, 0
+			}
+
+			sy -= 0.5 // rows get the same treatment as columns above
+			sy0 := int(sy)
+			yFrac0 := sy - float64(sy0)
+			yFrac1 := 1 - yFrac0
+			sy0 += bias.Y
+			sy1 := sy0 + 1
+			if sy0 < sr.Min.Y {
+				sy0, sy1 = sr.Min.Y, sr.Min.Y
+				yFrac0, yFrac1 = 0, 1
+			} else if sy1 >= sr.Max.Y {
+				sy0, sy1 = sr.Max.Y-1, sr.Max.Y-1
+				yFrac0, yFrac1 = 1, 0
+			}
+
+			s00ru, s00gu, s00bu, s00au := src.At(sx0, sy0).RGBA() // the four taps are (sx0,sy0), (sx1,sy0), (sx0,sy1) and (sx1,sy1)
+			s00r := float64(s00ru)
+			s00g := float64(s00gu)
+			s00b := float64(s00bu)
+			s00a := float64(s00au)
+			s10ru, s10gu, s10bu, s10au := src.At(sx1, sy0).RGBA()
+			s10r := float64(s10ru)
+			s10g := float64(s10gu)
+			s10b := float64(s10bu)
+			s10a := float64(s10au)
+			s10r = xFrac1*s00r + xFrac0*s10r // s10* becomes the horizontal blend along row sy0
+			s10g = xFrac1*s00g + xFrac0*s10g
+			s10b = xFrac1*s00b + xFrac0*s10b
+			s10a = xFrac1*s00a + xFrac0*s10a
+			s01ru, s01gu, s01bu, s01au := src.At(sx0, sy1).RGBA()
+			s01r := float64(s01ru)
+			s01g := float64(s01gu)
+			s01b := float64(s01bu)
+			s01a := float64(s01au)
+			s11ru, s11gu, s11bu, s11au := src.At(sx1, sy1).RGBA()
+			s11r := float64(s11ru)
+			s11g := float64(s11gu)
+			s11b := float64(s11bu)
+			s11a := float64(s11au)
+			s11r = xFrac1*s01r + xFrac0*s11r // s11* becomes the horizontal blend along row sy1
+			s11g = xFrac1*s01g + xFrac0*s11g
+			s11b = xFrac1*s01b + xFrac0*s11b
+			s11a = xFrac1*s01a + xFrac0*s11a
+			s11r = yFrac1*s10r + yFrac0*s11r // then the vertical blend of the two rows
+			s11g = yFrac1*s10g + yFrac0*s11g
+			s11b = yFrac1*s10b + yFrac0*s11b
+			s11a = yFrac1*s10a + yFrac0*s11a
+			pr := uint32(s11r)
+			pg := uint32(s11g)
+			pb := uint32(s11b)
+			pa := uint32(s11a)
+			pa1 := (0xffff - pa) * 0x101 // (0xffff - source alpha), times 0x101 to widen the 8-bit dst bytes it multiplies below
+			dst.Pix[d+0] = uint8((uint32(dst.Pix[d+0])*pa1/0xffff + pr) >> 8)
+			dst.Pix[d+1] = uint8((uint32(dst.Pix[d+1])*pa1/0xffff + pg) >> 8)
+			dst.Pix[d+2] = uint8((uint32(dst.Pix[d+2])*pa1/0xffff + pb) >> 8)
+			dst.Pix[d+3] = uint8((uint32(dst.Pix[d+3])*pa1/0xffff + pa) >> 8)
+		}
+	}
+}
+
+func (ablInterpolator) transform_RGBA_Image_Src(dst *image.RGBA, dr, adr image.Rectangle, d2s *f64.Aff3, src image.Image, sr image.Rectangle, bias image.Point, opts *Options) { // bilinear Src leaf: any image.Image source onto an RGBA dst; opts is not used here
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		dyf := float64(dr.Min.Y+int(dy)) + 0.5
+		d := (dr.Min.Y+int(dy)-dst.Rect.Min.Y)*dst.Stride + (dr.Min.X+adr.Min.X-dst.Rect.Min.X)*4 // byte offset in dst.Pix of this row's first affected pixel
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
+			dxf := float64(dr.Min.X+int(dx)) + 0.5 // +0.5 samples at the dst pixel's center
+			sx := d2s[0]*dxf + d2s[1]*dyf + d2s[2]
+			sy := d2s[3]*dxf + d2s[4]*dyf + d2s[5]
+			if !(image.Point{int(sx) + bias.X, int(sy) + bias.Y}).In(sr) {
+				continue // maps outside sr: leave this dst pixel untouched
+			}
+
+			sx -= 0.5 // shift by half a pixel so that int(sx) is the left of the two columns blended
+			sx0 := int(sx)
+			xFrac0 := sx - float64(sx0)
+			xFrac1 := 1 - xFrac0
+			sx0 += bias.X // bias is added only after truncation, giving the real source column
+			sx1 := sx0 + 1
+			if sx0 < sr.Min.X { // clamp both taps to sr's first column
+				sx0, sx1 = sr.Min.X, sr.Min.X
+				xFrac0, xFrac1 = 0, 1
+			} else if sx1 >= sr.Max.X { // clamp both taps to sr's last column
+				sx0, sx1 = sr.Max.X-1, sr.Max.X-1
+				xFrac0, xFrac1 = 1, 0
+			}
+
+			sy -= 0.5 // rows get the same treatment as columns above
+			sy0 := int(sy)
+			yFrac0 := sy - float64(sy0)
+			yFrac1 := 1 - yFrac0
+			sy0 += bias.Y
+			sy1 := sy0 + 1
+			if sy0 < sr.Min.Y {
+				sy0, sy1 = sr.Min.Y, sr.Min.Y
+				yFrac0, yFrac1 = 0, 1
+			} else if sy1 >= sr.Max.Y {
+				sy0, sy1 = sr.Max.Y-1, sr.Max.Y-1
+				yFrac0, yFrac1 = 1, 0
+			}
+
+			s00ru, s00gu, s00bu, s00au := src.At(sx0, sy0).RGBA() // the four taps are (sx0,sy0), (sx1,sy0), (sx0,sy1) and (sx1,sy1)
+			s00r := float64(s00ru)
+			s00g := float64(s00gu)
+			s00b := float64(s00bu)
+			s00a := float64(s00au)
+			s10ru, s10gu, s10bu, s10au := src.At(sx1, sy0).RGBA()
+			s10r := float64(s10ru)
+			s10g := float64(s10gu)
+			s10b := float64(s10bu)
+			s10a := float64(s10au)
+			s10r = xFrac1*s00r + xFrac0*s10r // s10* becomes the horizontal blend along row sy0
+			s10g = xFrac1*s00g + xFrac0*s10g
+			s10b = xFrac1*s00b + xFrac0*s10b
+			s10a = xFrac1*s00a + xFrac0*s10a
+			s01ru, s01gu, s01bu, s01au := src.At(sx0, sy1).RGBA()
+			s01r := float64(s01ru)
+			s01g := float64(s01gu)
+			s01b := float64(s01bu)
+			s01a := float64(s01au)
+			s11ru, s11gu, s11bu, s11au := src.At(sx1, sy1).RGBA()
+			s11r := float64(s11ru)
+			s11g := float64(s11gu)
+			s11b := float64(s11bu)
+			s11a := float64(s11au)
+			s11r = xFrac1*s01r + xFrac0*s11r // s11* becomes the horizontal blend along row sy1
+			s11g = xFrac1*s01g + xFrac0*s11g
+			s11b = xFrac1*s01b + xFrac0*s11b
+			s11a = xFrac1*s01a + xFrac0*s11a
+			s11r = yFrac1*s10r + yFrac0*s11r // then the vertical blend of the two rows
+			s11g = yFrac1*s10g + yFrac0*s11g
+			s11b = yFrac1*s10b + yFrac0*s11b
+			s11a = yFrac1*s10a + yFrac0*s11a
+			pr := uint32(s11r)
+			pg := uint32(s11g)
+			pb := uint32(s11b)
+			pa := uint32(s11a)
+			dst.Pix[d+0] = uint8(pr >> 8) // Src: overwrite dst with the high 8 bits of each channel
+			dst.Pix[d+1] = uint8(pg >> 8)
+			dst.Pix[d+2] = uint8(pb >> 8)
+			dst.Pix[d+3] = uint8(pa >> 8)
+		}
+	}
+}
+
+func (ablInterpolator) transform_Image_Image_Over(dst Image, dr, adr image.Rectangle, d2s *f64.Aff3, src image.Image, sr image.Rectangle, bias image.Point, opts *Options) { // bilinear Over leaf for arbitrary dst and src types, honoring both optional masks
+	srcMask, smp := opts.SrcMask, opts.SrcMaskP
+	dstMask, dmp := opts.DstMask, opts.DstMaskP
+	dstColorRGBA64 := &color.RGBA64{} // one scratch color, refilled and passed to dst.Set for every pixel
+	dstColor := color.Color(dstColorRGBA64)
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		dyf := float64(dr.Min.Y+int(dy)) + 0.5
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx++ {
+			dxf := float64(dr.Min.X+int(dx)) + 0.5 // +0.5 samples at the dst pixel's center
+			sx := d2s[0]*dxf + d2s[1]*dyf + d2s[2]
+			sy := d2s[3]*dxf + d2s[4]*dyf + d2s[5]
+			if !(image.Point{int(sx) + bias.X, int(sy) + bias.Y}).In(sr) {
+				continue // maps outside sr: leave this dst pixel untouched
+			}
+
+			sx -= 0.5 // shift by half a pixel so that int(sx) is the left of the two columns blended
+			sx0 := int(sx)
+			xFrac0 := sx - float64(sx0)
+			xFrac1 := 1 - xFrac0
+			sx0 += bias.X // bias is added only after truncation, giving the real source column
+			sx1 := sx0 + 1
+			if sx0 < sr.Min.X { // clamp both taps to sr's first column
+				sx0, sx1 = sr.Min.X, sr.Min.X
+				xFrac0, xFrac1 = 0, 1
+			} else if sx1 >= sr.Max.X { // clamp both taps to sr's last column
+				sx0, sx1 = sr.Max.X-1, sr.Max.X-1
+				xFrac0, xFrac1 = 1, 0
+			}
+
+			sy -= 0.5 // rows get the same treatment as columns above
+			sy0 := int(sy)
+			yFrac0 := sy - float64(sy0)
+			yFrac1 := 1 - yFrac0
+			sy0 += bias.Y
+			sy1 := sy0 + 1
+			if sy0 < sr.Min.Y {
+				sy0, sy1 = sr.Min.Y, sr.Min.Y
+				yFrac0, yFrac1 = 0, 1
+			} else if sy1 >= sr.Max.Y {
+				sy0, sy1 = sr.Max.Y-1, sr.Max.Y-1
+				yFrac0, yFrac1 = 1, 0
+			}
+
+			s00ru, s00gu, s00bu, s00au := src.At(sx0, sy0).RGBA() // the four taps are (sx0,sy0), (sx1,sy0), (sx0,sy1) and (sx1,sy1)
+			if srcMask != nil { // scale each tap by the source mask's alpha at that tap, offset by smp
+				_, _, _, ma := srcMask.At(smp.X+sx0, smp.Y+sy0).RGBA()
+				s00ru = s00ru * ma / 0xffff
+				s00gu = s00gu * ma / 0xffff
+				s00bu = s00bu * ma / 0xffff
+				s00au = s00au * ma / 0xffff
+			}
+			s00r := float64(s00ru)
+			s00g := float64(s00gu)
+			s00b := float64(s00bu)
+			s00a := float64(s00au)
+			s10ru, s10gu, s10bu, s10au := src.At(sx1, sy0).RGBA()
+			if srcMask != nil {
+				_, _, _, ma := srcMask.At(smp.X+sx1, smp.Y+sy0).RGBA()
+				s10ru = s10ru * ma / 0xffff
+				s10gu = s10gu * ma / 0xffff
+				s10bu = s10bu * ma / 0xffff
+				s10au = s10au * ma / 0xffff
+			}
+			s10r := float64(s10ru)
+			s10g := float64(s10gu)
+			s10b := float64(s10bu)
+			s10a := float64(s10au)
+			s10r = xFrac1*s00r + xFrac0*s10r // s10* becomes the horizontal blend along row sy0
+			s10g = xFrac1*s00g + xFrac0*s10g
+			s10b = xFrac1*s00b + xFrac0*s10b
+			s10a = xFrac1*s00a + xFrac0*s10a
+			s01ru, s01gu, s01bu, s01au := src.At(sx0, sy1).RGBA()
+			if srcMask != nil {
+				_, _, _, ma := srcMask.At(smp.X+sx0, smp.Y+sy1).RGBA()
+				s01ru = s01ru * ma / 0xffff
+				s01gu = s01gu * ma / 0xffff
+				s01bu = s01bu * ma / 0xffff
+				s01au = s01au * ma / 0xffff
+			}
+			s01r := float64(s01ru)
+			s01g := float64(s01gu)
+			s01b := float64(s01bu)
+			s01a := float64(s01au)
+			s11ru, s11gu, s11bu, s11au := src.At(sx1, sy1).RGBA()
+			if srcMask != nil {
+				_, _, _, ma := srcMask.At(smp.X+sx1, smp.Y+sy1).RGBA()
+				s11ru = s11ru * ma / 0xffff
+				s11gu = s11gu * ma / 0xffff
+				s11bu = s11bu * ma / 0xffff
+				s11au = s11au * ma / 0xffff
+			}
+			s11r := float64(s11ru)
+			s11g := float64(s11gu)
+			s11b := float64(s11bu)
+			s11a := float64(s11au)
+			s11r = xFrac1*s01r + xFrac0*s11r // s11* becomes the horizontal blend along row sy1
+			s11g = xFrac1*s01g + xFrac0*s11g
+			s11b = xFrac1*s01b + xFrac0*s11b
+			s11a = xFrac1*s01a + xFrac0*s11a
+			s11r = yFrac1*s10r + yFrac0*s11r // then the vertical blend of the two rows
+			s11g = yFrac1*s10g + yFrac0*s11g
+			s11b = yFrac1*s10b + yFrac0*s11b
+			s11a = yFrac1*s10a + yFrac0*s11a
+			pr := uint32(s11r)
+			pg := uint32(s11g)
+			pb := uint32(s11b)
+			pa := uint32(s11a)
+			qr, qg, qb, qa := dst.At(dr.Min.X+int(dx), dr.Min.Y+int(dy)).RGBA() // q* is the current dst color to composite onto
+			if dstMask != nil { // scale the interpolated source by the dst mask's alpha at this dst pixel, offset by dmp
+				_, _, _, ma := dstMask.At(dmp.X+dr.Min.X+int(dx), dmp.Y+dr.Min.Y+int(dy)).RGBA()
+				pr = pr * ma / 0xffff
+				pg = pg * ma / 0xffff
+				pb = pb * ma / 0xffff
+				pa = pa * ma / 0xffff
+			}
+			pa1 := 0xffff - pa // weight applied to the existing dst color
+			dstColorRGBA64.R = uint16(qr*pa1/0xffff + pr)
+			dstColorRGBA64.G = uint16(qg*pa1/0xffff + pg)
+			dstColorRGBA64.B = uint16(qb*pa1/0xffff + pb)
+			dstColorRGBA64.A = uint16(qa*pa1/0xffff + pa)
+			dst.Set(dr.Min.X+int(dx), dr.Min.Y+int(dy), dstColor)
+		}
+	}
+}
+
+func (ablInterpolator) transform_Image_Image_Src(dst Image, dr, adr image.Rectangle, d2s *f64.Aff3, src image.Image, sr image.Rectangle, bias image.Point, opts *Options) { // bilinear Src leaf for arbitrary dst and src types, honoring both optional masks
+	srcMask, smp := opts.SrcMask, opts.SrcMaskP
+	dstMask, dmp := opts.DstMask, opts.DstMaskP
+	dstColorRGBA64 := &color.RGBA64{} // one scratch color, refilled and passed to dst.Set for every pixel
+	dstColor := color.Color(dstColorRGBA64)
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		dyf := float64(dr.Min.Y+int(dy)) + 0.5
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx++ {
+			dxf := float64(dr.Min.X+int(dx)) + 0.5 // +0.5 samples at the dst pixel's center
+			sx := d2s[0]*dxf + d2s[1]*dyf + d2s[2]
+			sy := d2s[3]*dxf + d2s[4]*dyf + d2s[5]
+			if !(image.Point{int(sx) + bias.X, int(sy) + bias.Y}).In(sr) {
+				continue // maps outside sr: leave this dst pixel untouched
+			}
+
+			sx -= 0.5 // shift by half a pixel so that int(sx) is the left of the two columns blended
+			sx0 := int(sx)
+			xFrac0 := sx - float64(sx0)
+			xFrac1 := 1 - xFrac0
+			sx0 += bias.X // bias is added only after truncation, giving the real source column
+			sx1 := sx0 + 1
+			if sx0 < sr.Min.X { // clamp both taps to sr's first column
+				sx0, sx1 = sr.Min.X, sr.Min.X
+				xFrac0, xFrac1 = 0, 1
+			} else if sx1 >= sr.Max.X { // clamp both taps to sr's last column
+				sx0, sx1 = sr.Max.X-1, sr.Max.X-1
+				xFrac0, xFrac1 = 1, 0
+			}
+
+			sy -= 0.5 // rows get the same treatment as columns above
+			sy0 := int(sy)
+			yFrac0 := sy - float64(sy0)
+			yFrac1 := 1 - yFrac0
+			sy0 += bias.Y
+			sy1 := sy0 + 1
+			if sy0 < sr.Min.Y {
+				sy0, sy1 = sr.Min.Y, sr.Min.Y
+				yFrac0, yFrac1 = 0, 1
+			} else if sy1 >= sr.Max.Y {
+				sy0, sy1 = sr.Max.Y-1, sr.Max.Y-1
+				yFrac0, yFrac1 = 1, 0
+			}
+
+			s00ru, s00gu, s00bu, s00au := src.At(sx0, sy0).RGBA() // the four taps are (sx0,sy0), (sx1,sy0), (sx0,sy1) and (sx1,sy1)
+			if srcMask != nil { // scale each tap by the source mask's alpha at that tap, offset by smp
+				_, _, _, ma := srcMask.At(smp.X+sx0, smp.Y+sy0).RGBA()
+				s00ru = s00ru * ma / 0xffff
+				s00gu = s00gu * ma / 0xffff
+				s00bu = s00bu * ma / 0xffff
+				s00au = s00au * ma / 0xffff
+			}
+			s00r := float64(s00ru)
+			s00g := float64(s00gu)
+			s00b := float64(s00bu)
+			s00a := float64(s00au)
+			s10ru, s10gu, s10bu, s10au := src.At(sx1, sy0).RGBA()
+			if srcMask != nil {
+				_, _, _, ma := srcMask.At(smp.X+sx1, smp.Y+sy0).RGBA()
+				s10ru = s10ru * ma / 0xffff
+				s10gu = s10gu * ma / 0xffff
+				s10bu = s10bu * ma / 0xffff
+				s10au = s10au * ma / 0xffff
+			}
+			s10r := float64(s10ru)
+			s10g := float64(s10gu)
+			s10b := float64(s10bu)
+			s10a := float64(s10au)
+			s10r = xFrac1*s00r + xFrac0*s10r // s10* becomes the horizontal blend along row sy0
+			s10g = xFrac1*s00g + xFrac0*s10g
+			s10b = xFrac1*s00b + xFrac0*s10b
+			s10a = xFrac1*s00a + xFrac0*s10a
+			s01ru, s01gu, s01bu, s01au := src.At(sx0, sy1).RGBA()
+			if srcMask != nil {
+				_, _, _, ma := srcMask.At(smp.X+sx0, smp.Y+sy1).RGBA()
+				s01ru = s01ru * ma / 0xffff
+				s01gu = s01gu * ma / 0xffff
+				s01bu = s01bu * ma / 0xffff
+				s01au = s01au * ma / 0xffff
+			}
+			s01r := float64(s01ru)
+			s01g := float64(s01gu)
+			s01b := float64(s01bu)
+			s01a := float64(s01au)
+			s11ru, s11gu, s11bu, s11au := src.At(sx1, sy1).RGBA()
+			if srcMask != nil {
+				_, _, _, ma := srcMask.At(smp.X+sx1, smp.Y+sy1).RGBA()
+				s11ru = s11ru * ma / 0xffff
+				s11gu = s11gu * ma / 0xffff
+				s11bu = s11bu * ma / 0xffff
+				s11au = s11au * ma / 0xffff
+			}
+			s11r := float64(s11ru)
+			s11g := float64(s11gu)
+			s11b := float64(s11bu)
+			s11a := float64(s11au)
+			s11r = xFrac1*s01r + xFrac0*s11r // s11* becomes the horizontal blend along row sy1
+			s11g = xFrac1*s01g + xFrac0*s11g
+			s11b = xFrac1*s01b + xFrac0*s11b
+			s11a = xFrac1*s01a + xFrac0*s11a
+			s11r = yFrac1*s10r + yFrac0*s11r // then the vertical blend of the two rows
+			s11g = yFrac1*s10g + yFrac0*s11g
+			s11b = yFrac1*s10b + yFrac0*s11b
+			s11a = yFrac1*s10a + yFrac0*s11a
+			pr := uint32(s11r)
+			pg := uint32(s11g)
+			pb := uint32(s11b)
+			pa := uint32(s11a)
+			if dstMask != nil { // with a dst mask, the old dst color still contributes where the mask is not fully set
+				qr, qg, qb, qa := dst.At(dr.Min.X+int(dx), dr.Min.Y+int(dy)).RGBA()
+				_, _, _, ma := dstMask.At(dmp.X+dr.Min.X+int(dx), dmp.Y+dr.Min.Y+int(dy)).RGBA()
+				pr = pr * ma / 0xffff
+				pg = pg * ma / 0xffff
+				pb = pb * ma / 0xffff
+				pa = pa * ma / 0xffff
+				pa1 := 0xffff - ma // unlike Over, the old dst is weighted by the mask alpha, not the source alpha
+				dstColorRGBA64.R = uint16(qr*pa1/0xffff + pr)
+				dstColorRGBA64.G = uint16(qg*pa1/0xffff + pg)
+				dstColorRGBA64.B = uint16(qb*pa1/0xffff + pb)
+				dstColorRGBA64.A = uint16(qa*pa1/0xffff + pa)
+				dst.Set(dr.Min.X+int(dx), dr.Min.Y+int(dy), dstColor)
+			} else { // no dst mask: the interpolated value replaces dst outright
+				dstColorRGBA64.R = uint16(pr)
+				dstColorRGBA64.G = uint16(pg)
+				dstColorRGBA64.B = uint16(pb)
+				dstColorRGBA64.A = uint16(pa)
+				dst.Set(dr.Min.X+int(dx), dr.Min.Y+int(dy), dstColor)
+			}
+		}
+	}
+}
+
+func (z *kernelScaler) Scale(dst Image, dr image.Rectangle, src image.Image, sr image.Rectangle, op Op, opts *Options) { // Scale resamples src's sr onto dst's dr in two passes: scaleX into tmp, then scaleY into dst
+	if z.dw != int32(dr.Dx()) || z.dh != int32(dr.Dy()) || z.sw != int32(sr.Dx()) || z.sh != int32(sr.Dy()) {
+		z.kernel.Scale(dst, dr, src, sr, op, opts) // sizes differ from z.dw/dh/sw/sh: delegate to the underlying kernel
+		return
+	}
+
+	var o Options
+	if opts != nil {
+		o = *opts // copy: o.DstMask is overwritten below, the caller's Options is not
+	}
+
+	// adr is the affected destination pixels.
+	adr := dst.Bounds().Intersect(dr)
+	adr, o.DstMask = clipAffectedDestRect(adr, o.DstMask, o.DstMaskP)
+	if adr.Empty() || sr.Empty() {
+		return
+	}
+	// Make adr relative to dr.Min.
+	adr = adr.Sub(dr.Min)
+	if op == Over && o.SrcMask == nil && opaque(src) {
+		op = Src // Over with an unmasked source that opaque() accepts is handled as Src
+	}
+
+	if _, ok := src.(*image.Uniform); ok && o.DstMask == nil && o.SrcMask == nil && sr.In(src.Bounds()) {
+		Draw(dst, dr, src, src.Bounds().Min, op) // a uniform, unmasked source is drawn directly, skipping the two passes
+		return
+	}
+
+	// Create a temporary buffer:
+	// scaleX distributes the source image's columns over the temporary image.
+	// scaleY distributes the temporary image's rows over the destination image.
+	var tmp [][4]float64
+	if z.pool.New != nil {
+		tmpp := z.pool.Get().(*[][4]float64)
+		defer z.pool.Put(tmpp) // hand the buffer back to the pool when Scale returns
+		tmp = *tmpp
+	} else {
+		tmp = z.makeTmpBuf() // no pool constructor is set: get a buffer for this call only
+	}
+
+	// sr is the source pixels. If it extends beyond the src bounds,
+	// we cannot use the type-specific fast paths, as they access
+	// the Pix fields directly without bounds checking.
+	//
+	// Similarly, the fast paths assume that the masks are nil.
+	if o.SrcMask != nil || !sr.In(src.Bounds()) {
+		z.scaleX_Image(tmp, src, sr, &o)
+	} else {
+		switch src := src.(type) {
+		case *image.Gray:
+			z.scaleX_Gray(tmp, src, sr, &o)
+		case *image.NRGBA:
+			z.scaleX_NRGBA(tmp, src, sr, &o)
+		case *image.RGBA:
+			z.scaleX_RGBA(tmp, src, sr, &o)
+		case *image.YCbCr:
+			switch src.SubsampleRatio {
+			default: // other subsample ratios have no dedicated fast path
+				z.scaleX_Image(tmp, src, sr, &o)
+			case image.YCbCrSubsampleRatio444:
+				z.scaleX_YCbCr444(tmp, src, sr, &o)
+			case image.YCbCrSubsampleRatio422:
+				z.scaleX_YCbCr422(tmp, src, sr, &o)
+			case image.YCbCrSubsampleRatio420:
+				z.scaleX_YCbCr420(tmp, src, sr, &o)
+			case image.YCbCrSubsampleRatio440:
+				z.scaleX_YCbCr440(tmp, src, sr, &o)
+			}
+		default:
+			z.scaleX_Image(tmp, src, sr, &o)
+		}
+	}
+
+	if o.DstMask != nil { // a dst mask always takes the generic scaleY path, whatever dst's type
+		switch op {
+		case Over:
+			z.scaleY_Image_Over(dst, dr, adr, tmp, &o)
+		case Src:
+			z.scaleY_Image_Src(dst, dr, adr, tmp, &o)
+		}
+	} else {
+		switch op {
+		case Over:
+			switch dst := dst.(type) {
+			case *image.RGBA:
+				z.scaleY_RGBA_Over(dst, dr, adr, tmp, &o)
+			default:
+				z.scaleY_Image_Over(dst, dr, adr, tmp, &o)
+			}
+		case Src:
+			switch dst := dst.(type) {
+			case *image.RGBA:
+				z.scaleY_RGBA_Src(dst, dr, adr, tmp, &o)
+			default:
+				z.scaleY_Image_Src(dst, dr, adr, tmp, &o)
+			}
+		}
+	}
+}
+
+func (q *Kernel) Transform(dst Image, s2d f64.Aff3, src image.Image, sr image.Rectangle, op Op, opts *Options) { // Transform draws src's sr onto dst through the affine map s2d, dispatching to a type-specific leaf
+	var o Options
+	if opts != nil {
+		o = *opts // copy: o.DstMask is overwritten below, the caller's Options is not
+	}
+
+	dr := transformRect(&s2d, &sr)
+	// adr is the affected destination pixels.
+	adr := dst.Bounds().Intersect(dr)
+	adr, o.DstMask = clipAffectedDestRect(adr, o.DstMask, o.DstMaskP)
+	if adr.Empty() || sr.Empty() {
+		return
+	}
+	if op == Over && o.SrcMask == nil && opaque(src) {
+		op = Src // Over with an unmasked source that opaque() accepts is handled as Src
+	}
+	d2s := invert(&s2d)
+	// bias is a translation of the mapping from dst coordinates to src
+	// coordinates such that the latter temporarily have non-negative X
+	// and Y coordinates. This allows us to write int(f) instead of
+	// int(math.Floor(f)), since "round to zero" and "round down" are
+	// equivalent when f >= 0, but the former is much cheaper. The X--
+	// and Y-- are because the TransformLeaf methods have a "sx -= 0.5"
+	// adjustment.
+	bias := transformRect(&d2s, &adr).Min
+	bias.X--
+	bias.Y--
+	d2s[2] -= float64(bias.X)
+	d2s[5] -= float64(bias.Y)
+	// Make adr relative to dr.Min.
+	adr = adr.Sub(dr.Min)
+
+	if u, ok := src.(*image.Uniform); ok && o.DstMask == nil && o.SrcMask == nil && sr.In(src.Bounds()) { // transform_Uniform takes no masks, so it is only valid when both are nil
+		transform_Uniform(dst, dr, adr, &d2s, u, sr, bias, op)
+		return
+	}
+
+	xscale := abs(d2s[0]) // xscale = max(|d2s[0]|, |d2s[1]|)
+	if s := abs(d2s[1]); xscale < s {
+		xscale = s
+	}
+	yscale := abs(d2s[3]) // yscale = max(|d2s[3]|, |d2s[4]|)
+	if s := abs(d2s[4]); yscale < s {
+		yscale = s
+	}
+
+	// sr is the source pixels. If it extends beyond the src bounds,
+	// we cannot use the type-specific fast paths, as they access
+	// the Pix fields directly without bounds checking.
+	//
+	// Similarly, the fast paths assume that the masks are nil.
+	if o.DstMask != nil || o.SrcMask != nil || !sr.In(src.Bounds()) {
+		switch op {
+		case Over:
+			q.transform_Image_Image_Over(dst, dr, adr, &d2s, src, sr, bias, xscale, yscale, &o)
+		case Src:
+			q.transform_Image_Image_Src(dst, dr, adr, &d2s, src, sr, bias, xscale, yscale, &o)
+		}
+	} else {
+		switch op {
+		case Over:
+			switch dst := dst.(type) {
+			case *image.RGBA:
+				switch src := src.(type) {
+				case *image.NRGBA:
+					q.transform_RGBA_NRGBA_Over(dst, dr, adr, &d2s, src, sr, bias, xscale, yscale, &o)
+				case *image.RGBA:
+					q.transform_RGBA_RGBA_Over(dst, dr, adr, &d2s, src, sr, bias, xscale, yscale, &o)
+				default:
+					q.transform_RGBA_Image_Over(dst, dr, adr, &d2s, src, sr, bias, xscale, yscale, &o)
+				}
+			default:
+				switch src := src.(type) {
+				default:
+					q.transform_Image_Image_Over(dst, dr, adr, &d2s, src, sr, bias, xscale, yscale, &o)
+				}
+			}
+		case Src:
+			switch dst := dst.(type) {
+			case *image.RGBA:
+				switch src := src.(type) {
+				case *image.Gray:
+					q.transform_RGBA_Gray_Src(dst, dr, adr, &d2s, src, sr, bias, xscale, yscale, &o)
+				case *image.NRGBA:
+					q.transform_RGBA_NRGBA_Src(dst, dr, adr, &d2s, src, sr, bias, xscale, yscale, &o)
+				case *image.RGBA:
+					q.transform_RGBA_RGBA_Src(dst, dr, adr, &d2s, src, sr, bias, xscale, yscale, &o)
+				case *image.YCbCr:
+					switch src.SubsampleRatio {
+					default: // other subsample ratios have no dedicated fast path
+						q.transform_RGBA_Image_Src(dst, dr, adr, &d2s, src, sr, bias, xscale, yscale, &o)
+					case image.YCbCrSubsampleRatio444:
+						q.transform_RGBA_YCbCr444_Src(dst, dr, adr, &d2s, src, sr, bias, xscale, yscale, &o)
+					case image.YCbCrSubsampleRatio422:
+						q.transform_RGBA_YCbCr422_Src(dst, dr, adr, &d2s, src, sr, bias, xscale, yscale, &o)
+					case image.YCbCrSubsampleRatio420:
+						q.transform_RGBA_YCbCr420_Src(dst, dr, adr, &d2s, src, sr, bias, xscale, yscale, &o)
+					case image.YCbCrSubsampleRatio440:
+						q.transform_RGBA_YCbCr440_Src(dst, dr, adr, &d2s, src, sr, bias, xscale, yscale, &o)
+					}
+				default:
+					q.transform_RGBA_Image_Src(dst, dr, adr, &d2s, src, sr, bias, xscale, yscale, &o)
+				}
+			default:
+				switch src := src.(type) {
+				default:
+					q.transform_Image_Image_Src(dst, dr, adr, &d2s, src, sr, bias, xscale, yscale, &o)
+				}
+			}
+		}
+	}
+}
+
+func (z *kernelScaler) scaleX_Gray(tmp [][4]float64, src *image.Gray, sr image.Rectangle, opts *Options) { // horizontal pass for *image.Gray: fills one tmp entry per z.horizontal.sources element for each of the z.sh rows; opts is not used here
+	t := 0
+	for y := int32(0); y < z.sh; y++ {
+		for _, s := range z.horizontal.sources {
+			var pr float64
+			for _, c := range z.horizontal.contribs[s.i:s.j] {
+				pi := (sr.Min.Y+int(y)-src.Rect.Min.Y)*src.Stride + (sr.Min.X + int(c.coord) - src.Rect.Min.X)
+				pru := uint32(src.Pix[pi]) * 0x101 // widen the 8-bit gray value to the 16-bit range
+				pr += float64(pru) * c.weight
+			}
+			pr *= s.invTotalWeightFFFF // NOTE(review): presumably 1 / (total weight * 0xffff), normalizing to [0, 1] - confirm where sources is built
+			tmp[t] = [4]float64{
+				pr,
+				pr,
+				pr,
+				1, // alpha is the constant 1 for Gray
+			}
+			t++
+		}
+	}
+}
+
+func (z *kernelScaler) scaleX_NRGBA(tmp [][4]float64, src *image.NRGBA, sr image.Rectangle, opts *Options) { // horizontal pass for *image.NRGBA: fills one tmp entry per z.horizontal.sources element for each of the z.sh rows; opts is not used here
+	t := 0
+	for y := int32(0); y < z.sh; y++ {
+		for _, s := range z.horizontal.sources {
+			var pr, pg, pb, pa float64
+			for _, c := range z.horizontal.contribs[s.i:s.j] {
+				pi := (sr.Min.Y+int(y)-src.Rect.Min.Y)*src.Stride + (sr.Min.X+int(c.coord)-src.Rect.Min.X)*4
+				pau := uint32(src.Pix[pi+3]) * 0x101 // widen the 8-bit alpha to the 16-bit range
+				pru := uint32(src.Pix[pi+0]) * pau / 0xff
+				pgu := uint32(src.Pix[pi+1]) * pau / 0xff // 8-bit color * 16-bit alpha / 0xff: alpha-premultiplied, 16-bit range
+				pbu := uint32(src.Pix[pi+2]) * pau / 0xff
+				pr += float64(pru) * c.weight
+				pg += float64(pgu) * c.weight
+				pb += float64(pbu) * c.weight
+				pa += float64(pau) * c.weight
+			}
+			tmp[t] = [4]float64{
+				pr * s.invTotalWeightFFFF, // NOTE(review): presumably 1 / (total weight * 0xffff), normalizing to [0, 1] - confirm where sources is built
+				pg * s.invTotalWeightFFFF,
+				pb * s.invTotalWeightFFFF,
+				pa * s.invTotalWeightFFFF,
+			}
+			t++
+		}
+	}
+}
+
+func (z *kernelScaler) scaleX_RGBA(tmp [][4]float64, src *image.RGBA, sr image.Rectangle, opts *Options) { // horizontal pass for *image.RGBA: fills one tmp entry per z.horizontal.sources element for each of the z.sh rows; opts is not used here
+	t := 0
+	for y := int32(0); y < z.sh; y++ {
+		for _, s := range z.horizontal.sources {
+			var pr, pg, pb, pa float64
+			for _, c := range z.horizontal.contribs[s.i:s.j] {
+				pi := (sr.Min.Y+int(y)-src.Rect.Min.Y)*src.Stride + (sr.Min.X+int(c.coord)-src.Rect.Min.X)*4
+				pru := uint32(src.Pix[pi+0]) * 0x101 // widen each 8-bit channel to the 16-bit range
+				pgu := uint32(src.Pix[pi+1]) * 0x101
+				pbu := uint32(src.Pix[pi+2]) * 0x101
+				pau := uint32(src.Pix[pi+3]) * 0x101
+				pr += float64(pru) * c.weight
+				pg += float64(pgu) * c.weight
+				pb += float64(pbu) * c.weight
+				pa += float64(pau) * c.weight
+			}
+			tmp[t] = [4]float64{
+				pr * s.invTotalWeightFFFF, // NOTE(review): presumably 1 / (total weight * 0xffff), normalizing to [0, 1] - confirm where sources is built
+				pg * s.invTotalWeightFFFF,
+				pb * s.invTotalWeightFFFF,
+				pa * s.invTotalWeightFFFF,
+			}
+			t++
+		}
+	}
+}
+
+func (z *kernelScaler) scaleX_YCbCr444(tmp [][4]float64, src *image.YCbCr, sr image.Rectangle, opts *Options) { // horizontal pass for 4:4:4 YCbCr: fills one tmp entry per z.horizontal.sources element for each of the z.sh rows; opts is not used here
+	t := 0
+	for y := int32(0); y < z.sh; y++ {
+		for _, s := range z.horizontal.sources {
+			var pr, pg, pb float64
+			for _, c := range z.horizontal.contribs[s.i:s.j] {
+				pi := (sr.Min.Y+int(y)-src.Rect.Min.Y)*src.YStride + (sr.Min.X + int(c.coord) - src.Rect.Min.X)
+				pj := (sr.Min.Y+int(y)-src.Rect.Min.Y)*src.CStride + (sr.Min.X + int(c.coord) - src.Rect.Min.X) // chroma index: same row and column as luma, only the stride differs
+
+				// This is an inline version of image/color/ycbcr.go's YCbCr.RGBA method.
+				pyy1 := int(src.Y[pi]) * 0x10100
+				pcb1 := int(src.Cb[pj]) - 128
+				pcr1 := int(src.Cr[pj]) - 128
+				pru := (pyy1 + 91881*pcr1) >> 8
+				pgu := (pyy1 - 22554*pcb1 - 46802*pcr1) >> 8
+				pbu := (pyy1 + 116130*pcb1) >> 8
+				if pru < 0 { // clamp each channel to [0, 0xffff]
+					pru = 0
+				} else if pru > 0xffff {
+					pru = 0xffff
+				}
+				if pgu < 0 {
+					pgu = 0
+				} else if pgu > 0xffff {
+					pgu = 0xffff
+				}
+				if pbu < 0 {
+					pbu = 0
+				} else if pbu > 0xffff {
+					pbu = 0xffff
+				}
+
+				pr += float64(pru) * c.weight
+				pg += float64(pgu) * c.weight
+				pb += float64(pbu) * c.weight
+			}
+			tmp[t] = [4]float64{
+				pr * s.invTotalWeightFFFF, // NOTE(review): presumably 1 / (total weight * 0xffff), normalizing to [0, 1] - confirm where sources is built
+				pg * s.invTotalWeightFFFF,
+				pb * s.invTotalWeightFFFF,
+				1, // alpha is the constant 1: no alpha is read from src here
+			}
+			t++
+		}
+	}
+}
+
+// scaleX_YCbCr422 is the horizontal pass for a YCbCr source whose chroma x index is halved. For each source row
+// and horizontal source it stores in tmp the weighted RGB sum scaled by s.invTotalWeightFFFF, alpha 1. opts is unused.
+func (z *kernelScaler) scaleX_YCbCr422(tmp [][4]float64, src *image.YCbCr, sr image.Rectangle, opts *Options) {
+	t := 0
+	for y := int32(0); y < z.sh; y++ {
+		for _, s := range z.horizontal.sources {
+			var pr, pg, pb float64
+			for _, c := range z.horizontal.contribs[s.i:s.j] {
+				pi := (sr.Min.Y+int(y)-src.Rect.Min.Y)*src.YStride + (sr.Min.X + int(c.coord) - src.Rect.Min.X)
+				pj := (sr.Min.Y+int(y)-src.Rect.Min.Y)*src.CStride + ((sr.Min.X+int(c.coord))/2 - src.Rect.Min.X/2) // chroma index: same row as luma, x halved
+				// This is an inline version of image/color/ycbcr.go's YCbCr.RGBA method.
+				pyy1 := int(src.Y[pi]) * 0x10100
+				pcb1 := int(src.Cb[pj]) - 128
+				pcr1 := int(src.Cr[pj]) - 128
+				pru := (pyy1 + 91881*pcr1) >> 8
+				pgu := (pyy1 - 22554*pcb1 - 46802*pcr1) >> 8
+				pbu := (pyy1 + 116130*pcb1) >> 8
+				if pru < 0 {
+					pru = 0
+				} else if pru > 0xffff {
+					pru = 0xffff
+				}
+				if pgu < 0 {
+					pgu = 0
+				} else if pgu > 0xffff {
+					pgu = 0xffff
+				}
+				if pbu < 0 {
+					pbu = 0
+				} else if pbu > 0xffff {
+					pbu = 0xffff
+				}
+				pr += float64(pru) * c.weight
+				pg += float64(pgu) * c.weight
+				pb += float64(pbu) * c.weight
+			}
+			tmp[t] = [4]float64{
+				pr * s.invTotalWeightFFFF,
+				pg * s.invTotalWeightFFFF,
+				pb * s.invTotalWeightFFFF,
+				1,
+			}
+			t++
+		}
+	}
+}
+
+// scaleX_YCbCr420 is the horizontal pass for a YCbCr source whose chroma x and y indices are halved. For each source
+// row and horizontal source it stores in tmp the weighted RGB sum scaled by s.invTotalWeightFFFF, alpha 1. opts is unused.
+func (z *kernelScaler) scaleX_YCbCr420(tmp [][4]float64, src *image.YCbCr, sr image.Rectangle, opts *Options) {
+	t := 0
+	for y := int32(0); y < z.sh; y++ {
+		for _, s := range z.horizontal.sources {
+			var pr, pg, pb float64
+			for _, c := range z.horizontal.contribs[s.i:s.j] {
+				pi := (sr.Min.Y+int(y)-src.Rect.Min.Y)*src.YStride + (sr.Min.X + int(c.coord) - src.Rect.Min.X)
+				pj := ((sr.Min.Y+int(y))/2-src.Rect.Min.Y/2)*src.CStride + ((sr.Min.X+int(c.coord))/2 - src.Rect.Min.X/2) // chroma index: x and y both halved
+				// This is an inline version of image/color/ycbcr.go's YCbCr.RGBA method.
+				pyy1 := int(src.Y[pi]) * 0x10100
+				pcb1 := int(src.Cb[pj]) - 128
+				pcr1 := int(src.Cr[pj]) - 128
+				pru := (pyy1 + 91881*pcr1) >> 8
+				pgu := (pyy1 - 22554*pcb1 - 46802*pcr1) >> 8
+				pbu := (pyy1 + 116130*pcb1) >> 8
+				if pru < 0 {
+					pru = 0
+				} else if pru > 0xffff {
+					pru = 0xffff
+				}
+				if pgu < 0 {
+					pgu = 0
+				} else if pgu > 0xffff {
+					pgu = 0xffff
+				}
+				if pbu < 0 {
+					pbu = 0
+				} else if pbu > 0xffff {
+					pbu = 0xffff
+				}
+				pr += float64(pru) * c.weight
+				pg += float64(pgu) * c.weight
+				pb += float64(pbu) * c.weight
+			}
+			tmp[t] = [4]float64{
+				pr * s.invTotalWeightFFFF,
+				pg * s.invTotalWeightFFFF,
+				pb * s.invTotalWeightFFFF,
+				1,
+			}
+			t++
+		}
+	}
+}
+
+// scaleX_YCbCr440 is the horizontal pass for a YCbCr source whose chroma y index is halved. For each source row
+// and horizontal source it stores in tmp the weighted RGB sum scaled by s.invTotalWeightFFFF, alpha 1. opts is unused.
+func (z *kernelScaler) scaleX_YCbCr440(tmp [][4]float64, src *image.YCbCr, sr image.Rectangle, opts *Options) {
+	t := 0
+	for y := int32(0); y < z.sh; y++ {
+		for _, s := range z.horizontal.sources {
+			var pr, pg, pb float64
+			for _, c := range z.horizontal.contribs[s.i:s.j] {
+				pi := (sr.Min.Y+int(y)-src.Rect.Min.Y)*src.YStride + (sr.Min.X + int(c.coord) - src.Rect.Min.X)
+				pj := ((sr.Min.Y+int(y))/2-src.Rect.Min.Y/2)*src.CStride + (sr.Min.X + int(c.coord) - src.Rect.Min.X) // chroma index: y halved, same x as luma
+				// This is an inline version of image/color/ycbcr.go's YCbCr.RGBA method.
+				pyy1 := int(src.Y[pi]) * 0x10100
+				pcb1 := int(src.Cb[pj]) - 128
+				pcr1 := int(src.Cr[pj]) - 128
+				pru := (pyy1 + 91881*pcr1) >> 8
+				pgu := (pyy1 - 22554*pcb1 - 46802*pcr1) >> 8
+				pbu := (pyy1 + 116130*pcb1) >> 8
+				if pru < 0 {
+					pru = 0
+				} else if pru > 0xffff {
+					pru = 0xffff
+				}
+				if pgu < 0 {
+					pgu = 0
+				} else if pgu > 0xffff {
+					pgu = 0xffff
+				}
+				if pbu < 0 {
+					pbu = 0
+				} else if pbu > 0xffff {
+					pbu = 0xffff
+				}
+				pr += float64(pru) * c.weight
+				pg += float64(pgu) * c.weight
+				pb += float64(pbu) * c.weight
+			}
+			tmp[t] = [4]float64{
+				pr * s.invTotalWeightFFFF,
+				pg * s.invTotalWeightFFFF,
+				pb * s.invTotalWeightFFFF,
+				1,
+			}
+			t++
+		}
+	}
+}
+
+func (z *kernelScaler) scaleX_Image(tmp [][4]float64, src image.Image, sr image.Rectangle, opts *Options) { // horizontal pass for any image.Image source, read through src.At
+	t := 0
+	srcMask, smp := opts.SrcMask, opts.SrcMaskP
+	for y := int32(0); y < z.sh; y++ {
+		for _, s := range z.horizontal.sources { // each horizontal source produces one tmp entry
+			var pr, pg, pb, pa float64
+			for _, c := range z.horizontal.contribs[s.i:s.j] {
+				pru, pgu, pbu, pau := src.At(sr.Min.X+int(c.coord), sr.Min.Y+int(y)).RGBA()
+				if srcMask != nil { // scale all four channels by the mask alpha at the same point offset by smp
+					_, _, _, ma := srcMask.At(smp.X+sr.Min.X+int(c.coord), smp.Y+sr.Min.Y+int(y)).RGBA()
+					pru = pru * ma / 0xffff
+					pgu = pgu * ma / 0xffff
+					pbu = pbu * ma / 0xffff
+					pau = pau * ma / 0xffff
+				}
+				pr += float64(pru) * c.weight
+				pg += float64(pgu) * c.weight
+				pb += float64(pbu) * c.weight
+				pa += float64(pau) * c.weight
+			}
+			tmp[t] = [4]float64{ // weighted sums, scaled by s.invTotalWeightFFFF
+				pr * s.invTotalWeightFFFF,
+				pg * s.invTotalWeightFFFF,
+				pb * s.invTotalWeightFFFF,
+				pa * s.invTotalWeightFFFF,
+			}
+			t++
+		}
+	}
+}
+
+// scaleY_RGBA_Over is the vertical pass into an RGBA dst. For each column of adr (offsets from dr.Min) it weights
+// tmp's entries (row width z.dw) by the vertical contributions and composites the result over dst. opts is unused.
+func (z *kernelScaler) scaleY_RGBA_Over(dst *image.RGBA, dr, adr image.Rectangle, tmp [][4]float64, opts *Options) {
+	for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx++ {
+		d := (dr.Min.Y+adr.Min.Y-dst.Rect.Min.Y)*dst.Stride + (dr.Min.X+int(dx)-dst.Rect.Min.X)*4 // dst.Pix offset of this column's first pixel
+		for _, s := range z.vertical.sources[adr.Min.Y:adr.Max.Y] {
+			var pr, pg, pb, pa float64
+			for _, c := range z.vertical.contribs[s.i:s.j] {
+				p := &tmp[c.coord*z.dw+dx]
+				pr += p[0] * c.weight
+				pg += p[1] * c.weight
+				pb += p[2] * c.weight
+				pa += p[3] * c.weight
+			}
+			if pr > pa { // cap color sums at the alpha sum (presumably to keep a valid premultiplied color)
+				pr = pa
+			}
+			if pg > pa {
+				pg = pa
+			}
+			if pb > pa {
+				pb = pa
+			}
+			pr0 := uint32(ftou(pr * s.invTotalWeight))
+			pg0 := uint32(ftou(pg * s.invTotalWeight))
+			pb0 := uint32(ftou(pb * s.invTotalWeight))
+			pa0 := uint32(ftou(pa * s.invTotalWeight))
+			pa1 := (0xffff - uint32(pa0)) * 0x101 // (0xffff - new alpha), times 0x101 to widen dst's 8-bit channels
+			dst.Pix[d+0] = uint8((uint32(dst.Pix[d+0])*pa1/0xffff + pr0) >> 8)
+			dst.Pix[d+1] = uint8((uint32(dst.Pix[d+1])*pa1/0xffff + pg0) >> 8)
+			dst.Pix[d+2] = uint8((uint32(dst.Pix[d+2])*pa1/0xffff + pb0) >> 8)
+			dst.Pix[d+3] = uint8((uint32(dst.Pix[d+3])*pa1/0xffff + pa0) >> 8)
+			d += dst.Stride
+		}
+	}
+}
+
+// scaleY_RGBA_Src is the vertical pass into an RGBA dst. For each column of adr (offsets from dr.Min) it weights
+// tmp's entries (row width z.dw) by the vertical contributions and overwrites dst's pixels. opts is unused.
+func (z *kernelScaler) scaleY_RGBA_Src(dst *image.RGBA, dr, adr image.Rectangle, tmp [][4]float64, opts *Options) {
+	for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx++ {
+		d := (dr.Min.Y+adr.Min.Y-dst.Rect.Min.Y)*dst.Stride + (dr.Min.X+int(dx)-dst.Rect.Min.X)*4 // dst.Pix offset of this column's first pixel
+		for _, s := range z.vertical.sources[adr.Min.Y:adr.Max.Y] {
+			var pr, pg, pb, pa float64
+			for _, c := range z.vertical.contribs[s.i:s.j] {
+				p := &tmp[c.coord*z.dw+dx]
+				pr += p[0] * c.weight
+				pg += p[1] * c.weight
+				pb += p[2] * c.weight
+				pa += p[3] * c.weight
+			}
+			if pr > pa { // cap color sums at the alpha sum (presumably to keep a valid premultiplied color)
+				pr = pa
+			}
+			if pg > pa {
+				pg = pa
+			}
+			if pb > pa {
+				pb = pa
+			}
+			dst.Pix[d+0] = uint8(ftou(pr*s.invTotalWeight) >> 8)
+			dst.Pix[d+1] = uint8(ftou(pg*s.invTotalWeight) >> 8)
+			dst.Pix[d+2] = uint8(ftou(pb*s.invTotalWeight) >> 8)
+			dst.Pix[d+3] = uint8(ftou(pa*s.invTotalWeight) >> 8)
+			d += dst.Stride
+		}
+	}
+}
+
+// scaleY_Image_Over is the vertical pass into a generic dst, using dst.At and dst.Set. For each pixel of adr (offsets
+// from dr.Min) it weights tmp's entries (row width z.dw) and composites the result, scaled by opts.DstMask if set, over dst.
+func (z *kernelScaler) scaleY_Image_Over(dst Image, dr, adr image.Rectangle, tmp [][4]float64, opts *Options) {
+	dstMask, dmp := opts.DstMask, opts.DstMaskP
+	dstColorRGBA64 := &color.RGBA64{}
+	dstColor := color.Color(dstColorRGBA64) // wraps the pointer once; every dst.Set below passes it after the fields are updated
+	for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx++ {
+		for dy, s := range z.vertical.sources[adr.Min.Y:adr.Max.Y] {
+			var pr, pg, pb, pa float64
+			for _, c := range z.vertical.contribs[s.i:s.j] {
+				p := &tmp[c.coord*z.dw+dx]
+				pr += p[0] * c.weight
+				pg += p[1] * c.weight
+				pb += p[2] * c.weight
+				pa += p[3] * c.weight
+			}
+			if pr > pa { // cap color sums at the alpha sum (presumably to keep a valid premultiplied color)
+				pr = pa
+			}
+			if pg > pa {
+				pg = pa
+			}
+			if pb > pa {
+				pb = pa
+			}
+			qr, qg, qb, qa := dst.At(dr.Min.X+int(dx), dr.Min.Y+int(adr.Min.Y+dy)).RGBA()
+			pr0 := uint32(ftou(pr * s.invTotalWeight))
+			pg0 := uint32(ftou(pg * s.invTotalWeight))
+			pb0 := uint32(ftou(pb * s.invTotalWeight))
+			pa0 := uint32(ftou(pa * s.invTotalWeight))
+			if dstMask != nil { // scale the new color by the mask alpha at the dst point offset by dmp
+				_, _, _, ma := dstMask.At(dmp.X+dr.Min.X+int(dx), dmp.Y+dr.Min.Y+int(adr.Min.Y+dy)).RGBA()
+				pr0 = pr0 * ma / 0xffff
+				pg0 = pg0 * ma / 0xffff
+				pb0 = pb0 * ma / 0xffff
+				pa0 = pa0 * ma / 0xffff
+			}
+			pa1 := 0xffff - pa0 // fraction (out of 0xffff) of the existing dst color that is kept
+			dstColorRGBA64.R = uint16(qr*pa1/0xffff + pr0)
+			dstColorRGBA64.G = uint16(qg*pa1/0xffff + pg0)
+			dstColorRGBA64.B = uint16(qb*pa1/0xffff + pb0)
+			dstColorRGBA64.A = uint16(qa*pa1/0xffff + pa0)
+			dst.Set(dr.Min.X+int(dx), dr.Min.Y+int(adr.Min.Y+dy), dstColor)
+		}
+	}
+}
+
+// scaleY_Image_Src is the vertical pass into a generic dst, written through dst.Set. For each pixel of adr (offsets from
+// dr.Min) it weights tmp's entries (row width z.dw) and stores the result, blended by opts.DstMask's alpha if a mask is set.
+func (z *kernelScaler) scaleY_Image_Src(dst Image, dr, adr image.Rectangle, tmp [][4]float64, opts *Options) {
+	dstMask, dmp := opts.DstMask, opts.DstMaskP
+	dstColorRGBA64 := &color.RGBA64{}
+	dstColor := color.Color(dstColorRGBA64) // wraps the pointer once; every dst.Set below passes it after the fields are updated
+	for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx++ {
+		for dy, s := range z.vertical.sources[adr.Min.Y:adr.Max.Y] {
+			var pr, pg, pb, pa float64
+			for _, c := range z.vertical.contribs[s.i:s.j] {
+				p := &tmp[c.coord*z.dw+dx]
+				pr += p[0] * c.weight
+				pg += p[1] * c.weight
+				pb += p[2] * c.weight
+				pa += p[3] * c.weight
+			}
+			if pr > pa { // cap color sums at the alpha sum (presumably to keep a valid premultiplied color)
+				pr = pa
+			}
+			if pg > pa {
+				pg = pa
+			}
+			if pb > pa {
+				pb = pa
+			}
+			if dstMask != nil { // masked: mix the new color and the existing dst color by the mask alpha ma
+				qr, qg, qb, qa := dst.At(dr.Min.X+int(dx), dr.Min.Y+int(adr.Min.Y+dy)).RGBA()
+				_, _, _, ma := dstMask.At(dmp.X+dr.Min.X+int(dx), dmp.Y+dr.Min.Y+int(adr.Min.Y+dy)).RGBA()
+				pr := uint32(ftou(pr*s.invTotalWeight)) * ma / 0xffff // these := shadow the float64 sums; each right-hand side still reads the outer value
+				pg := uint32(ftou(pg*s.invTotalWeight)) * ma / 0xffff
+				pb := uint32(ftou(pb*s.invTotalWeight)) * ma / 0xffff
+				pa := uint32(ftou(pa*s.invTotalWeight)) * ma / 0xffff
+				pa1 := 0xffff - ma // fraction (out of 0xffff) of the existing dst color that is kept
+				dstColorRGBA64.R = uint16(qr*pa1/0xffff + pr)
+				dstColorRGBA64.G = uint16(qg*pa1/0xffff + pg)
+				dstColorRGBA64.B = uint16(qb*pa1/0xffff + pb)
+				dstColorRGBA64.A = uint16(qa*pa1/0xffff + pa)
+				dst.Set(dr.Min.X+int(dx), dr.Min.Y+int(adr.Min.Y+dy), dstColor)
+			} else { // unmasked: the new color replaces the dst pixel outright
+				dstColorRGBA64.R = ftou(pr * s.invTotalWeight)
+				dstColorRGBA64.G = ftou(pg * s.invTotalWeight)
+				dstColorRGBA64.B = ftou(pb * s.invTotalWeight)
+				dstColorRGBA64.A = ftou(pa * s.invTotalWeight)
+				dst.Set(dr.Min.X+int(dx), dr.Min.Y+int(adr.Min.Y+dy), dstColor)
+			}
+		}
+	}
+}
+
+// transform_RGBA_Gray_Src maps each dst pixel center in adr (offsets from dr.Min) through d2s into src and overwrites
+// that pixel with the q-weighted average of nearby gray samples; points mapping outside sr are skipped. opts is unused.
+func (q *Kernel) transform_RGBA_Gray_Src(dst *image.RGBA, dr, adr image.Rectangle, d2s *f64.Aff3, src *image.Gray, sr image.Rectangle, bias image.Point, xscale, yscale float64, opts *Options) {
+	// When shrinking, broaden the effective kernel support so that we still
+	// visit every source pixel.
+	xHalfWidth, xKernelArgScale := q.Support, 1.0
+	if xscale > 1 {
+		xHalfWidth *= xscale
+		xKernelArgScale = 1 / xscale
+	}
+	yHalfWidth, yKernelArgScale := q.Support, 1.0
+	if yscale > 1 {
+		yHalfWidth *= yscale
+		yKernelArgScale = 1 / yscale
+	}
+	// Scratch weight buffers, reused for every pixel; 1+2*ceil(halfWidth) entries hold the widest window.
+	xWeights := make([]float64, 1+2*int(math.Ceil(xHalfWidth)))
+	yWeights := make([]float64, 1+2*int(math.Ceil(yHalfWidth)))
+
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		dyf := float64(dr.Min.Y+int(dy)) + 0.5
+		d := (dr.Min.Y+int(dy)-dst.Rect.Min.Y)*dst.Stride + (dr.Min.X+adr.Min.X-dst.Rect.Min.X)*4
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
+			dxf := float64(dr.Min.X+int(dx)) + 0.5
+			sx := d2s[0]*dxf + d2s[1]*dyf + d2s[2]
+			sy := d2s[3]*dxf + d2s[4]*dyf + d2s[5]
+			if !(image.Point{int(sx) + bias.X, int(sy) + bias.Y}).In(sr) {
+				continue
+			}
+			// TODO: adjust the bias so that we can use int(f) instead
+			// of math.Floor(f) and math.Ceil(f).
+			sx += float64(bias.X)
+			sx -= 0.5
+			ix := int(math.Floor(sx - xHalfWidth))
+			if ix < sr.Min.X {
+				ix = sr.Min.X
+			}
+			jx := int(math.Ceil(sx + xHalfWidth))
+			if jx > sr.Max.X {
+				jx = sr.Max.X
+			}
+			// Kernel weights for source columns [ix, jx), then divided by their total so they sum to 1.
+			totalXWeight := 0.0
+			for kx := ix; kx < jx; kx++ {
+				xWeight := 0.0
+				if t := abs((sx - float64(kx)) * xKernelArgScale); t < q.Support {
+					xWeight = q.At(t)
+				}
+				xWeights[kx-ix] = xWeight
+				totalXWeight += xWeight
+			}
+			for x := range xWeights[:jx-ix] {
+				xWeights[x] /= totalXWeight
+			}
+			// Same window (clamped to sr) and normalized weights for source rows [iy, jy).
+			sy += float64(bias.Y)
+			sy -= 0.5
+			iy := int(math.Floor(sy - yHalfWidth))
+			if iy < sr.Min.Y {
+				iy = sr.Min.Y
+			}
+			jy := int(math.Ceil(sy + yHalfWidth))
+			if jy > sr.Max.Y {
+				jy = sr.Max.Y
+			}
+			totalYWeight := 0.0
+			for ky := iy; ky < jy; ky++ {
+				yWeight := 0.0
+				if t := abs((sy - float64(ky)) * yKernelArgScale); t < q.Support {
+					yWeight = q.At(t)
+				}
+				yWeights[ky-iy] = yWeight
+				totalYWeight += yWeight
+			}
+			for y := range yWeights[:jy-iy] {
+				yWeights[y] /= totalYWeight
+			}
+			// Accumulate the weighted samples (8-bit gray widened by 0x101), skipping zero weights.
+			var pr float64
+			for ky := iy; ky < jy; ky++ {
+				if yWeight := yWeights[ky-iy]; yWeight != 0 {
+					for kx := ix; kx < jx; kx++ {
+						if w := xWeights[kx-ix] * yWeight; w != 0 {
+							pi := (ky-src.Rect.Min.Y)*src.Stride + (kx - src.Rect.Min.X)
+							pru := uint32(src.Pix[pi]) * 0x101
+							pr += float64(pru) * w
+						}
+					}
+				}
+			}
+			out := uint8(fffftou(pr) >> 8)
+			dst.Pix[d+0] = out
+			dst.Pix[d+1] = out
+			dst.Pix[d+2] = out
+			dst.Pix[d+3] = 0xff // no alpha is read from the Gray source: always written fully opaque
+		}
+	}
+}
+
+// transform_RGBA_NRGBA_Over maps each dst pixel center in adr (offsets from dr.Min) through d2s into src and composites
+// the q-weighted average of nearby premultiplied samples over that pixel; points outside sr are skipped. opts is unused.
+func (q *Kernel) transform_RGBA_NRGBA_Over(dst *image.RGBA, dr, adr image.Rectangle, d2s *f64.Aff3, src *image.NRGBA, sr image.Rectangle, bias image.Point, xscale, yscale float64, opts *Options) {
+	// When shrinking, broaden the effective kernel support so that we still
+	// visit every source pixel.
+	xHalfWidth, xKernelArgScale := q.Support, 1.0
+	if xscale > 1 {
+		xHalfWidth *= xscale
+		xKernelArgScale = 1 / xscale
+	}
+	yHalfWidth, yKernelArgScale := q.Support, 1.0
+	if yscale > 1 {
+		yHalfWidth *= yscale
+		yKernelArgScale = 1 / yscale
+	}
+	// Scratch weight buffers, reused for every pixel; 1+2*ceil(halfWidth) entries hold the widest window.
+	xWeights := make([]float64, 1+2*int(math.Ceil(xHalfWidth)))
+	yWeights := make([]float64, 1+2*int(math.Ceil(yHalfWidth)))
+
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		dyf := float64(dr.Min.Y+int(dy)) + 0.5
+		d := (dr.Min.Y+int(dy)-dst.Rect.Min.Y)*dst.Stride + (dr.Min.X+adr.Min.X-dst.Rect.Min.X)*4
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
+			dxf := float64(dr.Min.X+int(dx)) + 0.5
+			sx := d2s[0]*dxf + d2s[1]*dyf + d2s[2]
+			sy := d2s[3]*dxf + d2s[4]*dyf + d2s[5]
+			if !(image.Point{int(sx) + bias.X, int(sy) + bias.Y}).In(sr) {
+				continue
+			}
+			// TODO: adjust the bias so that we can use int(f) instead
+			// of math.Floor(f) and math.Ceil(f).
+			sx += float64(bias.X)
+			sx -= 0.5
+			ix := int(math.Floor(sx - xHalfWidth))
+			if ix < sr.Min.X {
+				ix = sr.Min.X
+			}
+			jx := int(math.Ceil(sx + xHalfWidth))
+			if jx > sr.Max.X {
+				jx = sr.Max.X
+			}
+			// Kernel weights for source columns [ix, jx), then divided by their total so they sum to 1.
+			totalXWeight := 0.0
+			for kx := ix; kx < jx; kx++ {
+				xWeight := 0.0
+				if t := abs((sx - float64(kx)) * xKernelArgScale); t < q.Support {
+					xWeight = q.At(t)
+				}
+				xWeights[kx-ix] = xWeight
+				totalXWeight += xWeight
+			}
+			for x := range xWeights[:jx-ix] {
+				xWeights[x] /= totalXWeight
+			}
+			// Same window (clamped to sr) and normalized weights for source rows [iy, jy).
+			sy += float64(bias.Y)
+			sy -= 0.5
+			iy := int(math.Floor(sy - yHalfWidth))
+			if iy < sr.Min.Y {
+				iy = sr.Min.Y
+			}
+			jy := int(math.Ceil(sy + yHalfWidth))
+			if jy > sr.Max.Y {
+				jy = sr.Max.Y
+			}
+			totalYWeight := 0.0
+			for ky := iy; ky < jy; ky++ {
+				yWeight := 0.0
+				if t := abs((sy - float64(ky)) * yKernelArgScale); t < q.Support {
+					yWeight = q.At(t)
+				}
+				yWeights[ky-iy] = yWeight
+				totalYWeight += yWeight
+			}
+			for y := range yWeights[:jy-iy] {
+				yWeights[y] /= totalYWeight
+			}
+			// Accumulate the weighted samples, skipping zero weights.
+			var pr, pg, pb, pa float64
+			for ky := iy; ky < jy; ky++ {
+				if yWeight := yWeights[ky-iy]; yWeight != 0 {
+					for kx := ix; kx < jx; kx++ {
+						if w := xWeights[kx-ix] * yWeight; w != 0 {
+							pi := (ky-src.Rect.Min.Y)*src.Stride + (kx-src.Rect.Min.X)*4
+							pau := uint32(src.Pix[pi+3]) * 0x101 // alpha widened to 16 bits; each 8-bit color below is multiplied by it and divided by 0xff
+							pru := uint32(src.Pix[pi+0]) * pau / 0xff
+							pgu := uint32(src.Pix[pi+1]) * pau / 0xff
+							pbu := uint32(src.Pix[pi+2]) * pau / 0xff
+							pr += float64(pru) * w
+							pg += float64(pgu) * w
+							pb += float64(pbu) * w
+							pa += float64(pau) * w
+						}
+					}
+				}
+			}
+			// Cap each color sum at the alpha sum (presumably to keep a valid premultiplied color).
+			if pr > pa {
+				pr = pa
+			}
+			if pg > pa {
+				pg = pa
+			}
+			if pb > pa {
+				pb = pa
+			}
+			// Composite over dst: existing channel scaled by (0xffff - new alpha), plus the new value, narrowed to 8 bits.
+			pr0 := uint32(fffftou(pr))
+			pg0 := uint32(fffftou(pg))
+			pb0 := uint32(fffftou(pb))
+			pa0 := uint32(fffftou(pa))
+			pa1 := (0xffff - uint32(pa0)) * 0x101
+			dst.Pix[d+0] = uint8((uint32(dst.Pix[d+0])*pa1/0xffff + pr0) >> 8)
+			dst.Pix[d+1] = uint8((uint32(dst.Pix[d+1])*pa1/0xffff + pg0) >> 8)
+			dst.Pix[d+2] = uint8((uint32(dst.Pix[d+2])*pa1/0xffff + pb0) >> 8)
+			dst.Pix[d+3] = uint8((uint32(dst.Pix[d+3])*pa1/0xffff + pa0) >> 8)
+		}
+	}
+}
+
+// transform_RGBA_NRGBA_Src maps each dst pixel center in adr (offsets from dr.Min) through d2s into src and overwrites
+// that pixel with the q-weighted average of nearby premultiplied samples; points outside sr are skipped. opts is unused.
+func (q *Kernel) transform_RGBA_NRGBA_Src(dst *image.RGBA, dr, adr image.Rectangle, d2s *f64.Aff3, src *image.NRGBA, sr image.Rectangle, bias image.Point, xscale, yscale float64, opts *Options) {
+	// When shrinking, broaden the effective kernel support so that we still
+	// visit every source pixel.
+	xHalfWidth, xKernelArgScale := q.Support, 1.0
+	if xscale > 1 {
+		xHalfWidth *= xscale
+		xKernelArgScale = 1 / xscale
+	}
+	yHalfWidth, yKernelArgScale := q.Support, 1.0
+	if yscale > 1 {
+		yHalfWidth *= yscale
+		yKernelArgScale = 1 / yscale
+	}
+	// Scratch weight buffers, reused for every pixel; 1+2*ceil(halfWidth) entries hold the widest window.
+	xWeights := make([]float64, 1+2*int(math.Ceil(xHalfWidth)))
+	yWeights := make([]float64, 1+2*int(math.Ceil(yHalfWidth)))
+
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		dyf := float64(dr.Min.Y+int(dy)) + 0.5
+		d := (dr.Min.Y+int(dy)-dst.Rect.Min.Y)*dst.Stride + (dr.Min.X+adr.Min.X-dst.Rect.Min.X)*4
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
+			dxf := float64(dr.Min.X+int(dx)) + 0.5
+			sx := d2s[0]*dxf + d2s[1]*dyf + d2s[2]
+			sy := d2s[3]*dxf + d2s[4]*dyf + d2s[5]
+			if !(image.Point{int(sx) + bias.X, int(sy) + bias.Y}).In(sr) {
+				continue
+			}
+			// TODO: adjust the bias so that we can use int(f) instead
+			// of math.Floor(f) and math.Ceil(f).
+			sx += float64(bias.X)
+			sx -= 0.5
+			ix := int(math.Floor(sx - xHalfWidth))
+			if ix < sr.Min.X {
+				ix = sr.Min.X
+			}
+			jx := int(math.Ceil(sx + xHalfWidth))
+			if jx > sr.Max.X {
+				jx = sr.Max.X
+			}
+			// Kernel weights for source columns [ix, jx), then divided by their total so they sum to 1.
+			totalXWeight := 0.0
+			for kx := ix; kx < jx; kx++ {
+				xWeight := 0.0
+				if t := abs((sx - float64(kx)) * xKernelArgScale); t < q.Support {
+					xWeight = q.At(t)
+				}
+				xWeights[kx-ix] = xWeight
+				totalXWeight += xWeight
+			}
+			for x := range xWeights[:jx-ix] {
+				xWeights[x] /= totalXWeight
+			}
+			// Same window (clamped to sr) and normalized weights for source rows [iy, jy).
+			sy += float64(bias.Y)
+			sy -= 0.5
+			iy := int(math.Floor(sy - yHalfWidth))
+			if iy < sr.Min.Y {
+				iy = sr.Min.Y
+			}
+			jy := int(math.Ceil(sy + yHalfWidth))
+			if jy > sr.Max.Y {
+				jy = sr.Max.Y
+			}
+			totalYWeight := 0.0
+			for ky := iy; ky < jy; ky++ {
+				yWeight := 0.0
+				if t := abs((sy - float64(ky)) * yKernelArgScale); t < q.Support {
+					yWeight = q.At(t)
+				}
+				yWeights[ky-iy] = yWeight
+				totalYWeight += yWeight
+			}
+			for y := range yWeights[:jy-iy] {
+				yWeights[y] /= totalYWeight
+			}
+			// Accumulate the weighted samples, skipping zero weights.
+			var pr, pg, pb, pa float64
+			for ky := iy; ky < jy; ky++ {
+				if yWeight := yWeights[ky-iy]; yWeight != 0 {
+					for kx := ix; kx < jx; kx++ {
+						if w := xWeights[kx-ix] * yWeight; w != 0 {
+							pi := (ky-src.Rect.Min.Y)*src.Stride + (kx-src.Rect.Min.X)*4
+							pau := uint32(src.Pix[pi+3]) * 0x101 // alpha widened to 16 bits; each 8-bit color below is multiplied by it and divided by 0xff
+							pru := uint32(src.Pix[pi+0]) * pau / 0xff
+							pgu := uint32(src.Pix[pi+1]) * pau / 0xff
+							pbu := uint32(src.Pix[pi+2]) * pau / 0xff
+							pr += float64(pru) * w
+							pg += float64(pgu) * w
+							pb += float64(pbu) * w
+							pa += float64(pau) * w
+						}
+					}
+				}
+			}
+			// Cap each color sum at the alpha sum (presumably to keep a valid premultiplied color).
+			if pr > pa {
+				pr = pa
+			}
+			if pg > pa {
+				pg = pa
+			}
+			if pb > pa {
+				pb = pa
+			}
+			// Overwrite the dst pixel, narrowing each channel to 8 bits.
+			dst.Pix[d+0] = uint8(fffftou(pr) >> 8)
+			dst.Pix[d+1] = uint8(fffftou(pg) >> 8)
+			dst.Pix[d+2] = uint8(fffftou(pb) >> 8)
+			dst.Pix[d+3] = uint8(fffftou(pa) >> 8)
+		}
+	}
+}
+
+// transform_RGBA_RGBA_Over maps each dst pixel center in adr (offsets from dr.Min) through d2s into src and composites
+// the q-weighted average of nearby RGBA samples over that pixel; points mapping outside sr are skipped. opts is unused.
+func (q *Kernel) transform_RGBA_RGBA_Over(dst *image.RGBA, dr, adr image.Rectangle, d2s *f64.Aff3, src *image.RGBA, sr image.Rectangle, bias image.Point, xscale, yscale float64, opts *Options) {
+	// When shrinking, broaden the effective kernel support so that we still
+	// visit every source pixel.
+	xHalfWidth, xKernelArgScale := q.Support, 1.0
+	if xscale > 1 {
+		xHalfWidth *= xscale
+		xKernelArgScale = 1 / xscale
+	}
+	yHalfWidth, yKernelArgScale := q.Support, 1.0
+	if yscale > 1 {
+		yHalfWidth *= yscale
+		yKernelArgScale = 1 / yscale
+	}
+	// Scratch weight buffers, reused for every pixel; 1+2*ceil(halfWidth) entries hold the widest window.
+	xWeights := make([]float64, 1+2*int(math.Ceil(xHalfWidth)))
+	yWeights := make([]float64, 1+2*int(math.Ceil(yHalfWidth)))
+
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		dyf := float64(dr.Min.Y+int(dy)) + 0.5
+		d := (dr.Min.Y+int(dy)-dst.Rect.Min.Y)*dst.Stride + (dr.Min.X+adr.Min.X-dst.Rect.Min.X)*4
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
+			dxf := float64(dr.Min.X+int(dx)) + 0.5
+			sx := d2s[0]*dxf + d2s[1]*dyf + d2s[2]
+			sy := d2s[3]*dxf + d2s[4]*dyf + d2s[5]
+			if !(image.Point{int(sx) + bias.X, int(sy) + bias.Y}).In(sr) {
+				continue
+			}
+			// TODO: adjust the bias so that we can use int(f) instead
+			// of math.Floor(f) and math.Ceil(f).
+			sx += float64(bias.X)
+			sx -= 0.5
+			ix := int(math.Floor(sx - xHalfWidth))
+			if ix < sr.Min.X {
+				ix = sr.Min.X
+			}
+			jx := int(math.Ceil(sx + xHalfWidth))
+			if jx > sr.Max.X {
+				jx = sr.Max.X
+			}
+			// Kernel weights for source columns [ix, jx), then divided by their total so they sum to 1.
+			totalXWeight := 0.0
+			for kx := ix; kx < jx; kx++ {
+				xWeight := 0.0
+				if t := abs((sx - float64(kx)) * xKernelArgScale); t < q.Support {
+					xWeight = q.At(t)
+				}
+				xWeights[kx-ix] = xWeight
+				totalXWeight += xWeight
+			}
+			for x := range xWeights[:jx-ix] {
+				xWeights[x] /= totalXWeight
+			}
+			// Same window (clamped to sr) and normalized weights for source rows [iy, jy).
+			sy += float64(bias.Y)
+			sy -= 0.5
+			iy := int(math.Floor(sy - yHalfWidth))
+			if iy < sr.Min.Y {
+				iy = sr.Min.Y
+			}
+			jy := int(math.Ceil(sy + yHalfWidth))
+			if jy > sr.Max.Y {
+				jy = sr.Max.Y
+			}
+			totalYWeight := 0.0
+			for ky := iy; ky < jy; ky++ {
+				yWeight := 0.0
+				if t := abs((sy - float64(ky)) * yKernelArgScale); t < q.Support {
+					yWeight = q.At(t)
+				}
+				yWeights[ky-iy] = yWeight
+				totalYWeight += yWeight
+			}
+			for y := range yWeights[:jy-iy] {
+				yWeights[y] /= totalYWeight
+			}
+			// Accumulate the weighted samples (8-bit channels widened by 0x101), skipping zero weights.
+			var pr, pg, pb, pa float64
+			for ky := iy; ky < jy; ky++ {
+				if yWeight := yWeights[ky-iy]; yWeight != 0 {
+					for kx := ix; kx < jx; kx++ {
+						if w := xWeights[kx-ix] * yWeight; w != 0 {
+							pi := (ky-src.Rect.Min.Y)*src.Stride + (kx-src.Rect.Min.X)*4
+							pru := uint32(src.Pix[pi+0]) * 0x101
+							pgu := uint32(src.Pix[pi+1]) * 0x101
+							pbu := uint32(src.Pix[pi+2]) * 0x101
+							pau := uint32(src.Pix[pi+3]) * 0x101
+							pr += float64(pru) * w
+							pg += float64(pgu) * w
+							pb += float64(pbu) * w
+							pa += float64(pau) * w
+						}
+					}
+				}
+			}
+			// Cap each color sum at the alpha sum (presumably to keep a valid premultiplied color).
+			if pr > pa {
+				pr = pa
+			}
+			if pg > pa {
+				pg = pa
+			}
+			if pb > pa {
+				pb = pa
+			}
+			// Composite over dst: existing channel scaled by (0xffff - new alpha), plus the new value, narrowed to 8 bits.
+			pr0 := uint32(fffftou(pr))
+			pg0 := uint32(fffftou(pg))
+			pb0 := uint32(fffftou(pb))
+			pa0 := uint32(fffftou(pa))
+			pa1 := (0xffff - uint32(pa0)) * 0x101
+			dst.Pix[d+0] = uint8((uint32(dst.Pix[d+0])*pa1/0xffff + pr0) >> 8)
+			dst.Pix[d+1] = uint8((uint32(dst.Pix[d+1])*pa1/0xffff + pg0) >> 8)
+			dst.Pix[d+2] = uint8((uint32(dst.Pix[d+2])*pa1/0xffff + pb0) >> 8)
+			dst.Pix[d+3] = uint8((uint32(dst.Pix[d+3])*pa1/0xffff + pa0) >> 8)
+		}
+	}
+}
+
+// transform_RGBA_RGBA_Src maps each dst pixel center in adr (offsets from dr.Min) through d2s into src and overwrites
+// that pixel with the q-weighted average of nearby RGBA samples; points mapping outside sr are skipped. opts is unused.
+func (q *Kernel) transform_RGBA_RGBA_Src(dst *image.RGBA, dr, adr image.Rectangle, d2s *f64.Aff3, src *image.RGBA, sr image.Rectangle, bias image.Point, xscale, yscale float64, opts *Options) {
+	// When shrinking, broaden the effective kernel support so that we still
+	// visit every source pixel.
+	xHalfWidth, xKernelArgScale := q.Support, 1.0
+	if xscale > 1 {
+		xHalfWidth *= xscale
+		xKernelArgScale = 1 / xscale
+	}
+	yHalfWidth, yKernelArgScale := q.Support, 1.0
+	if yscale > 1 {
+		yHalfWidth *= yscale
+		yKernelArgScale = 1 / yscale
+	}
+	// Scratch weight buffers, reused for every pixel; 1+2*ceil(halfWidth) entries hold the widest window.
+	xWeights := make([]float64, 1+2*int(math.Ceil(xHalfWidth)))
+	yWeights := make([]float64, 1+2*int(math.Ceil(yHalfWidth)))
+
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		dyf := float64(dr.Min.Y+int(dy)) + 0.5
+		d := (dr.Min.Y+int(dy)-dst.Rect.Min.Y)*dst.Stride + (dr.Min.X+adr.Min.X-dst.Rect.Min.X)*4
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
+			dxf := float64(dr.Min.X+int(dx)) + 0.5
+			sx := d2s[0]*dxf + d2s[1]*dyf + d2s[2]
+			sy := d2s[3]*dxf + d2s[4]*dyf + d2s[5]
+			if !(image.Point{int(sx) + bias.X, int(sy) + bias.Y}).In(sr) {
+				continue
+			}
+			// TODO: adjust the bias so that we can use int(f) instead
+			// of math.Floor(f) and math.Ceil(f).
+			sx += float64(bias.X)
+			sx -= 0.5
+			ix := int(math.Floor(sx - xHalfWidth))
+			if ix < sr.Min.X {
+				ix = sr.Min.X
+			}
+			jx := int(math.Ceil(sx + xHalfWidth))
+			if jx > sr.Max.X {
+				jx = sr.Max.X
+			}
+			// Kernel weights for source columns [ix, jx), then divided by their total so they sum to 1.
+			totalXWeight := 0.0
+			for kx := ix; kx < jx; kx++ {
+				xWeight := 0.0
+				if t := abs((sx - float64(kx)) * xKernelArgScale); t < q.Support {
+					xWeight = q.At(t)
+				}
+				xWeights[kx-ix] = xWeight
+				totalXWeight += xWeight
+			}
+			for x := range xWeights[:jx-ix] {
+				xWeights[x] /= totalXWeight
+			}
+			// Same window (clamped to sr) and normalized weights for source rows [iy, jy).
+			sy += float64(bias.Y)
+			sy -= 0.5
+			iy := int(math.Floor(sy - yHalfWidth))
+			if iy < sr.Min.Y {
+				iy = sr.Min.Y
+			}
+			jy := int(math.Ceil(sy + yHalfWidth))
+			if jy > sr.Max.Y {
+				jy = sr.Max.Y
+			}
+			totalYWeight := 0.0
+			for ky := iy; ky < jy; ky++ {
+				yWeight := 0.0
+				if t := abs((sy - float64(ky)) * yKernelArgScale); t < q.Support {
+					yWeight = q.At(t)
+				}
+				yWeights[ky-iy] = yWeight
+				totalYWeight += yWeight
+			}
+			for y := range yWeights[:jy-iy] {
+				yWeights[y] /= totalYWeight
+			}
+			// Accumulate the weighted samples (8-bit channels widened by 0x101), skipping zero weights.
+			var pr, pg, pb, pa float64
+			for ky := iy; ky < jy; ky++ {
+				if yWeight := yWeights[ky-iy]; yWeight != 0 {
+					for kx := ix; kx < jx; kx++ {
+						if w := xWeights[kx-ix] * yWeight; w != 0 {
+							pi := (ky-src.Rect.Min.Y)*src.Stride + (kx-src.Rect.Min.X)*4
+							pru := uint32(src.Pix[pi+0]) * 0x101
+							pgu := uint32(src.Pix[pi+1]) * 0x101
+							pbu := uint32(src.Pix[pi+2]) * 0x101
+							pau := uint32(src.Pix[pi+3]) * 0x101
+							pr += float64(pru) * w
+							pg += float64(pgu) * w
+							pb += float64(pbu) * w
+							pa += float64(pau) * w
+						}
+					}
+				}
+			}
+			// Cap each color sum at the alpha sum (presumably to keep a valid premultiplied color).
+			if pr > pa {
+				pr = pa
+			}
+			if pg > pa {
+				pg = pa
+			}
+			if pb > pa {
+				pb = pa
+			}
+			// Overwrite the dst pixel, narrowing each channel to 8 bits.
+			dst.Pix[d+0] = uint8(fffftou(pr) >> 8)
+			dst.Pix[d+1] = uint8(fffftou(pg) >> 8)
+			dst.Pix[d+2] = uint8(fffftou(pb) >> 8)
+			dst.Pix[d+3] = uint8(fffftou(pa) >> 8)
+		}
+	}
+}
+
+// transform_RGBA_YCbCr444_Src maps each dst pixel center in adr (offsets from dr.Min) through d2s into src and overwrites
+// that pixel with the q-weighted average of nearby samples converted to RGB, alpha 0xff; points outside sr are skipped. opts is unused.
+func (q *Kernel) transform_RGBA_YCbCr444_Src(dst *image.RGBA, dr, adr image.Rectangle, d2s *f64.Aff3, src *image.YCbCr, sr image.Rectangle, bias image.Point, xscale, yscale float64, opts *Options) {
+	// When shrinking, broaden the effective kernel support so that we still
+	// visit every source pixel.
+	xHalfWidth, xKernelArgScale := q.Support, 1.0
+	if xscale > 1 {
+		xHalfWidth *= xscale
+		xKernelArgScale = 1 / xscale
+	}
+	yHalfWidth, yKernelArgScale := q.Support, 1.0
+	if yscale > 1 {
+		yHalfWidth *= yscale
+		yKernelArgScale = 1 / yscale
+	}
+	// Scratch weight buffers, reused for every pixel; 1+2*ceil(halfWidth) entries hold the widest window.
+	xWeights := make([]float64, 1+2*int(math.Ceil(xHalfWidth)))
+	yWeights := make([]float64, 1+2*int(math.Ceil(yHalfWidth)))
+
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		dyf := float64(dr.Min.Y+int(dy)) + 0.5
+		d := (dr.Min.Y+int(dy)-dst.Rect.Min.Y)*dst.Stride + (dr.Min.X+adr.Min.X-dst.Rect.Min.X)*4
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
+			dxf := float64(dr.Min.X+int(dx)) + 0.5
+			sx := d2s[0]*dxf + d2s[1]*dyf + d2s[2]
+			sy := d2s[3]*dxf + d2s[4]*dyf + d2s[5]
+			if !(image.Point{int(sx) + bias.X, int(sy) + bias.Y}).In(sr) {
+				continue
+			}
+			// TODO: adjust the bias so that we can use int(f) instead
+			// of math.Floor(f) and math.Ceil(f).
+			sx += float64(bias.X)
+			sx -= 0.5
+			ix := int(math.Floor(sx - xHalfWidth))
+			if ix < sr.Min.X {
+				ix = sr.Min.X
+			}
+			jx := int(math.Ceil(sx + xHalfWidth))
+			if jx > sr.Max.X {
+				jx = sr.Max.X
+			}
+			// Kernel weights for source columns [ix, jx), then divided by their total so they sum to 1.
+			totalXWeight := 0.0
+			for kx := ix; kx < jx; kx++ {
+				xWeight := 0.0
+				if t := abs((sx - float64(kx)) * xKernelArgScale); t < q.Support {
+					xWeight = q.At(t)
+				}
+				xWeights[kx-ix] = xWeight
+				totalXWeight += xWeight
+			}
+			for x := range xWeights[:jx-ix] {
+				xWeights[x] /= totalXWeight
+			}
+			// Same window (clamped to sr) and normalized weights for source rows [iy, jy).
+			sy += float64(bias.Y)
+			sy -= 0.5
+			iy := int(math.Floor(sy - yHalfWidth))
+			if iy < sr.Min.Y {
+				iy = sr.Min.Y
+			}
+			jy := int(math.Ceil(sy + yHalfWidth))
+			if jy > sr.Max.Y {
+				jy = sr.Max.Y
+			}
+			totalYWeight := 0.0
+			for ky := iy; ky < jy; ky++ {
+				yWeight := 0.0
+				if t := abs((sy - float64(ky)) * yKernelArgScale); t < q.Support {
+					yWeight = q.At(t)
+				}
+				yWeights[ky-iy] = yWeight
+				totalYWeight += yWeight
+			}
+			for y := range yWeights[:jy-iy] {
+				yWeights[y] /= totalYWeight
+			}
+			// Accumulate the weighted samples after converting each to 16-bit RGB, skipping zero weights.
+			var pr, pg, pb float64
+			for ky := iy; ky < jy; ky++ {
+				if yWeight := yWeights[ky-iy]; yWeight != 0 {
+					for kx := ix; kx < jx; kx++ {
+						if w := xWeights[kx-ix] * yWeight; w != 0 {
+							pi := (ky-src.Rect.Min.Y)*src.YStride + (kx - src.Rect.Min.X)
+							pj := (ky-src.Rect.Min.Y)*src.CStride + (kx - src.Rect.Min.X) // chroma index: same x and y as luma
+
+							// This is an inline version of image/color/ycbcr.go's YCbCr.RGBA method.
+							pyy1 := int(src.Y[pi]) * 0x10100
+							pcb1 := int(src.Cb[pj]) - 128
+							pcr1 := int(src.Cr[pj]) - 128
+							pru := (pyy1 + 91881*pcr1) >> 8
+							pgu := (pyy1 - 22554*pcb1 - 46802*pcr1) >> 8
+							pbu := (pyy1 + 116130*pcb1) >> 8
+							if pru < 0 {
+								pru = 0
+							} else if pru > 0xffff {
+								pru = 0xffff
+							}
+							if pgu < 0 {
+								pgu = 0
+							} else if pgu > 0xffff {
+								pgu = 0xffff
+							}
+							if pbu < 0 {
+								pbu = 0
+							} else if pbu > 0xffff {
+								pbu = 0xffff
+							}
+
+							pr += float64(pru) * w
+							pg += float64(pgu) * w
+							pb += float64(pbu) * w
+						}
+					}
+				}
+			}
+			dst.Pix[d+0] = uint8(fffftou(pr) >> 8)
+			dst.Pix[d+1] = uint8(fffftou(pg) >> 8)
+			dst.Pix[d+2] = uint8(fffftou(pb) >> 8)
+			dst.Pix[d+3] = 0xff // no alpha is read from the YCbCr source: always written fully opaque
+		}
+	}
+}
+
+func (q *Kernel) transform_RGBA_YCbCr422_Src(dst *image.RGBA, dr, adr image.Rectangle, d2s *f64.Aff3, src *image.YCbCr, sr image.Rectangle, bias image.Point, xscale, yscale float64, opts *Options) {
+	// When shrinking (scale > 1), widen the kernel's half-width and compress
+	// its argument by the same factor, so that we still visit every source pixel.
+	xHalfWidth, xKernelArgScale := q.Support, 1.0
+	if xscale > 1 {
+		xHalfWidth *= xscale
+		xKernelArgScale = 1 / xscale
+	}
+	yHalfWidth, yKernelArgScale := q.Support, 1.0
+	if yscale > 1 {
+		yHalfWidth *= yscale
+		yKernelArgScale = 1 / yscale
+	}
+
+	xWeights := make([]float64, 1+2*int(math.Ceil(xHalfWidth))) // scratch, refilled for every dst pixel
+	yWeights := make([]float64, 1+2*int(math.Ceil(yHalfWidth)))
+
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		dyf := float64(dr.Min.Y+int(dy)) + 0.5
+		d := (dr.Min.Y+int(dy)-dst.Rect.Min.Y)*dst.Stride + (dr.Min.X+adr.Min.X-dst.Rect.Min.X)*4 // index into dst.Pix, 4 bytes per pixel
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
+			dxf := float64(dr.Min.X+int(dx)) + 0.5
+			sx := d2s[0]*dxf + d2s[1]*dyf + d2s[2]
+			sy := d2s[3]*dxf + d2s[4]*dyf + d2s[5]
+			if !(image.Point{int(sx) + bias.X, int(sy) + bias.Y}).In(sr) {
+				continue // the pixel center maps outside sr: leave dst untouched
+			}
+
+			// TODO: adjust the bias so that we can use int(f) instead
+			// of math.Floor(f) and math.Ceil(f).
+			sx += float64(bias.X)
+			sx -= 0.5
+			ix := int(math.Floor(sx - xHalfWidth)) // [ix, jx) is the kernel footprint, clamped to sr
+			if ix < sr.Min.X {
+				ix = sr.Min.X
+			}
+			jx := int(math.Ceil(sx + xHalfWidth))
+			if jx > sr.Max.X {
+				jx = sr.Max.X
+			}
+
+			totalXWeight := 0.0
+			for kx := ix; kx < jx; kx++ {
+				xWeight := 0.0
+				if t := abs((sx - float64(kx)) * xKernelArgScale); t < q.Support {
+					xWeight = q.At(t)
+				}
+				xWeights[kx-ix] = xWeight
+				totalXWeight += xWeight
+			}
+			for x := range xWeights[:jx-ix] {
+				xWeights[x] /= totalXWeight // normalize so that the weights sum to 1
+			}
+
+			sy += float64(bias.Y)
+			sy -= 0.5
+			iy := int(math.Floor(sy - yHalfWidth)) // [iy, jy) is the kernel footprint, clamped to sr
+			if iy < sr.Min.Y {
+				iy = sr.Min.Y
+			}
+			jy := int(math.Ceil(sy + yHalfWidth))
+			if jy > sr.Max.Y {
+				jy = sr.Max.Y
+			}
+
+			totalYWeight := 0.0
+			for ky := iy; ky < jy; ky++ {
+				yWeight := 0.0
+				if t := abs((sy - float64(ky)) * yKernelArgScale); t < q.Support {
+					yWeight = q.At(t)
+				}
+				yWeights[ky-iy] = yWeight
+				totalYWeight += yWeight
+			}
+			for y := range yWeights[:jy-iy] {
+				yWeights[y] /= totalYWeight // normalize so that the weights sum to 1
+			}
+
+			var pr, pg, pb float64 // weighted sums of 16-bit color components
+			for ky := iy; ky < jy; ky++ {
+				if yWeight := yWeights[ky-iy]; yWeight != 0 {
+					for kx := ix; kx < jx; kx++ {
+						if w := xWeights[kx-ix] * yWeight; w != 0 {
+							pi := (ky-src.Rect.Min.Y)*src.YStride + (kx - src.Rect.Min.X)
+							pj := (ky-src.Rect.Min.Y)*src.CStride + ((kx)/2 - src.Rect.Min.X/2) // 4:2:2: chroma is halved horizontally only
+
+							// This is an inline version of image/color/ycbcr.go's YCbCr.RGBA method.
+							pyy1 := int(src.Y[pi]) * 0x10100
+							pcb1 := int(src.Cb[pj]) - 128
+							pcr1 := int(src.Cr[pj]) - 128
+							pru := (pyy1 + 91881*pcr1) >> 8
+							pgu := (pyy1 - 22554*pcb1 - 46802*pcr1) >> 8
+							pbu := (pyy1 + 116130*pcb1) >> 8
+							if pru < 0 {
+								pru = 0
+							} else if pru > 0xffff {
+								pru = 0xffff
+							}
+							if pgu < 0 {
+								pgu = 0
+							} else if pgu > 0xffff {
+								pgu = 0xffff
+							}
+							if pbu < 0 {
+								pbu = 0
+							} else if pbu > 0xffff {
+								pbu = 0xffff
+							}
+
+							pr += float64(pru) * w
+							pg += float64(pgu) * w
+							pb += float64(pbu) * w
+						}
+					}
+				}
+			}
+			dst.Pix[d+0] = uint8(fffftou(pr) >> 8)
+			dst.Pix[d+1] = uint8(fffftou(pg) >> 8)
+			dst.Pix[d+2] = uint8(fffftou(pb) >> 8)
+			dst.Pix[d+3] = 0xff // alpha is always written as fully opaque
+		}
+	}
+}
+
+func (q *Kernel) transform_RGBA_YCbCr420_Src(dst *image.RGBA, dr, adr image.Rectangle, d2s *f64.Aff3, src *image.YCbCr, sr image.Rectangle, bias image.Point, xscale, yscale float64, opts *Options) {
+	// When shrinking (scale > 1), widen the kernel's half-width and compress
+	// its argument by the same factor, so that we still visit every source pixel.
+	xHalfWidth, xKernelArgScale := q.Support, 1.0
+	if xscale > 1 {
+		xHalfWidth *= xscale
+		xKernelArgScale = 1 / xscale
+	}
+	yHalfWidth, yKernelArgScale := q.Support, 1.0
+	if yscale > 1 {
+		yHalfWidth *= yscale
+		yKernelArgScale = 1 / yscale
+	}
+
+	xWeights := make([]float64, 1+2*int(math.Ceil(xHalfWidth))) // scratch, refilled for every dst pixel
+	yWeights := make([]float64, 1+2*int(math.Ceil(yHalfWidth)))
+
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		dyf := float64(dr.Min.Y+int(dy)) + 0.5
+		d := (dr.Min.Y+int(dy)-dst.Rect.Min.Y)*dst.Stride + (dr.Min.X+adr.Min.X-dst.Rect.Min.X)*4 // index into dst.Pix, 4 bytes per pixel
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
+			dxf := float64(dr.Min.X+int(dx)) + 0.5
+			sx := d2s[0]*dxf + d2s[1]*dyf + d2s[2]
+			sy := d2s[3]*dxf + d2s[4]*dyf + d2s[5]
+			if !(image.Point{int(sx) + bias.X, int(sy) + bias.Y}).In(sr) {
+				continue // the pixel center maps outside sr: leave dst untouched
+			}
+
+			// TODO: adjust the bias so that we can use int(f) instead
+			// of math.Floor(f) and math.Ceil(f).
+			sx += float64(bias.X)
+			sx -= 0.5
+			ix := int(math.Floor(sx - xHalfWidth)) // [ix, jx) is the kernel footprint, clamped to sr
+			if ix < sr.Min.X {
+				ix = sr.Min.X
+			}
+			jx := int(math.Ceil(sx + xHalfWidth))
+			if jx > sr.Max.X {
+				jx = sr.Max.X
+			}
+
+			totalXWeight := 0.0
+			for kx := ix; kx < jx; kx++ {
+				xWeight := 0.0
+				if t := abs((sx - float64(kx)) * xKernelArgScale); t < q.Support {
+					xWeight = q.At(t)
+				}
+				xWeights[kx-ix] = xWeight
+				totalXWeight += xWeight
+			}
+			for x := range xWeights[:jx-ix] {
+				xWeights[x] /= totalXWeight // normalize so that the weights sum to 1
+			}
+
+			sy += float64(bias.Y)
+			sy -= 0.5
+			iy := int(math.Floor(sy - yHalfWidth)) // [iy, jy) is the kernel footprint, clamped to sr
+			if iy < sr.Min.Y {
+				iy = sr.Min.Y
+			}
+			jy := int(math.Ceil(sy + yHalfWidth))
+			if jy > sr.Max.Y {
+				jy = sr.Max.Y
+			}
+
+			totalYWeight := 0.0
+			for ky := iy; ky < jy; ky++ {
+				yWeight := 0.0
+				if t := abs((sy - float64(ky)) * yKernelArgScale); t < q.Support {
+					yWeight = q.At(t)
+				}
+				yWeights[ky-iy] = yWeight
+				totalYWeight += yWeight
+			}
+			for y := range yWeights[:jy-iy] {
+				yWeights[y] /= totalYWeight // normalize so that the weights sum to 1
+			}
+
+			var pr, pg, pb float64 // weighted sums of 16-bit color components
+			for ky := iy; ky < jy; ky++ {
+				if yWeight := yWeights[ky-iy]; yWeight != 0 {
+					for kx := ix; kx < jx; kx++ {
+						if w := xWeights[kx-ix] * yWeight; w != 0 {
+							pi := (ky-src.Rect.Min.Y)*src.YStride + (kx - src.Rect.Min.X)
+							pj := ((ky)/2-src.Rect.Min.Y/2)*src.CStride + ((kx)/2 - src.Rect.Min.X/2) // 4:2:0: chroma is halved in both directions
+
+							// This is an inline version of image/color/ycbcr.go's YCbCr.RGBA method.
+							pyy1 := int(src.Y[pi]) * 0x10100
+							pcb1 := int(src.Cb[pj]) - 128
+							pcr1 := int(src.Cr[pj]) - 128
+							pru := (pyy1 + 91881*pcr1) >> 8
+							pgu := (pyy1 - 22554*pcb1 - 46802*pcr1) >> 8
+							pbu := (pyy1 + 116130*pcb1) >> 8
+							if pru < 0 {
+								pru = 0
+							} else if pru > 0xffff {
+								pru = 0xffff
+							}
+							if pgu < 0 {
+								pgu = 0
+							} else if pgu > 0xffff {
+								pgu = 0xffff
+							}
+							if pbu < 0 {
+								pbu = 0
+							} else if pbu > 0xffff {
+								pbu = 0xffff
+							}
+
+							pr += float64(pru) * w
+							pg += float64(pgu) * w
+							pb += float64(pbu) * w
+						}
+					}
+				}
+			}
+			dst.Pix[d+0] = uint8(fffftou(pr) >> 8)
+			dst.Pix[d+1] = uint8(fffftou(pg) >> 8)
+			dst.Pix[d+2] = uint8(fffftou(pb) >> 8)
+			dst.Pix[d+3] = 0xff // alpha is always written as fully opaque
+		}
+	}
+}
+
+func (q *Kernel) transform_RGBA_YCbCr440_Src(dst *image.RGBA, dr, adr image.Rectangle, d2s *f64.Aff3, src *image.YCbCr, sr image.Rectangle, bias image.Point, xscale, yscale float64, opts *Options) {
+	// When shrinking (scale > 1), widen the kernel's half-width and compress
+	// its argument by the same factor, so that we still visit every source pixel.
+	xHalfWidth, xKernelArgScale := q.Support, 1.0
+	if xscale > 1 {
+		xHalfWidth *= xscale
+		xKernelArgScale = 1 / xscale
+	}
+	yHalfWidth, yKernelArgScale := q.Support, 1.0
+	if yscale > 1 {
+		yHalfWidth *= yscale
+		yKernelArgScale = 1 / yscale
+	}
+
+	xWeights := make([]float64, 1+2*int(math.Ceil(xHalfWidth))) // scratch, refilled for every dst pixel
+	yWeights := make([]float64, 1+2*int(math.Ceil(yHalfWidth)))
+
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		dyf := float64(dr.Min.Y+int(dy)) + 0.5
+		d := (dr.Min.Y+int(dy)-dst.Rect.Min.Y)*dst.Stride + (dr.Min.X+adr.Min.X-dst.Rect.Min.X)*4 // index into dst.Pix, 4 bytes per pixel
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
+			dxf := float64(dr.Min.X+int(dx)) + 0.5
+			sx := d2s[0]*dxf + d2s[1]*dyf + d2s[2]
+			sy := d2s[3]*dxf + d2s[4]*dyf + d2s[5]
+			if !(image.Point{int(sx) + bias.X, int(sy) + bias.Y}).In(sr) {
+				continue // the pixel center maps outside sr: leave dst untouched
+			}
+
+			// TODO: adjust the bias so that we can use int(f) instead
+			// of math.Floor(f) and math.Ceil(f).
+			sx += float64(bias.X)
+			sx -= 0.5
+			ix := int(math.Floor(sx - xHalfWidth)) // [ix, jx) is the kernel footprint, clamped to sr
+			if ix < sr.Min.X {
+				ix = sr.Min.X
+			}
+			jx := int(math.Ceil(sx + xHalfWidth))
+			if jx > sr.Max.X {
+				jx = sr.Max.X
+			}
+
+			totalXWeight := 0.0
+			for kx := ix; kx < jx; kx++ {
+				xWeight := 0.0
+				if t := abs((sx - float64(kx)) * xKernelArgScale); t < q.Support {
+					xWeight = q.At(t)
+				}
+				xWeights[kx-ix] = xWeight
+				totalXWeight += xWeight
+			}
+			for x := range xWeights[:jx-ix] {
+				xWeights[x] /= totalXWeight // normalize so that the weights sum to 1
+			}
+
+			sy += float64(bias.Y)
+			sy -= 0.5
+			iy := int(math.Floor(sy - yHalfWidth)) // [iy, jy) is the kernel footprint, clamped to sr
+			if iy < sr.Min.Y {
+				iy = sr.Min.Y
+			}
+			jy := int(math.Ceil(sy + yHalfWidth))
+			if jy > sr.Max.Y {
+				jy = sr.Max.Y
+			}
+
+			totalYWeight := 0.0
+			for ky := iy; ky < jy; ky++ {
+				yWeight := 0.0
+				if t := abs((sy - float64(ky)) * yKernelArgScale); t < q.Support {
+					yWeight = q.At(t)
+				}
+				yWeights[ky-iy] = yWeight
+				totalYWeight += yWeight
+			}
+			for y := range yWeights[:jy-iy] {
+				yWeights[y] /= totalYWeight // normalize so that the weights sum to 1
+			}
+
+			var pr, pg, pb float64 // weighted sums of 16-bit color components
+			for ky := iy; ky < jy; ky++ {
+				if yWeight := yWeights[ky-iy]; yWeight != 0 {
+					for kx := ix; kx < jx; kx++ {
+						if w := xWeights[kx-ix] * yWeight; w != 0 {
+							pi := (ky-src.Rect.Min.Y)*src.YStride + (kx - src.Rect.Min.X)
+							pj := ((ky)/2-src.Rect.Min.Y/2)*src.CStride + (kx - src.Rect.Min.X) // 4:4:0: chroma is halved vertically only
+
+							// This is an inline version of image/color/ycbcr.go's YCbCr.RGBA method.
+							pyy1 := int(src.Y[pi]) * 0x10100
+							pcb1 := int(src.Cb[pj]) - 128
+							pcr1 := int(src.Cr[pj]) - 128
+							pru := (pyy1 + 91881*pcr1) >> 8
+							pgu := (pyy1 - 22554*pcb1 - 46802*pcr1) >> 8
+							pbu := (pyy1 + 116130*pcb1) >> 8
+							if pru < 0 {
+								pru = 0
+							} else if pru > 0xffff {
+								pru = 0xffff
+							}
+							if pgu < 0 {
+								pgu = 0
+							} else if pgu > 0xffff {
+								pgu = 0xffff
+							}
+							if pbu < 0 {
+								pbu = 0
+							} else if pbu > 0xffff {
+								pbu = 0xffff
+							}
+
+							pr += float64(pru) * w
+							pg += float64(pgu) * w
+							pb += float64(pbu) * w
+						}
+					}
+				}
+			}
+			dst.Pix[d+0] = uint8(fffftou(pr) >> 8)
+			dst.Pix[d+1] = uint8(fffftou(pg) >> 8)
+			dst.Pix[d+2] = uint8(fffftou(pb) >> 8)
+			dst.Pix[d+3] = 0xff // alpha is always written as fully opaque
+		}
+	}
+}
+
+func (q *Kernel) transform_RGBA_Image_Over(dst *image.RGBA, dr, adr image.Rectangle, d2s *f64.Aff3, src image.Image, sr image.Rectangle, bias image.Point, xscale, yscale float64, opts *Options) {
+	// When shrinking (scale > 1), widen the kernel's half-width and compress
+	// its argument by the same factor, so that we still visit every source pixel.
+	xHalfWidth, xKernelArgScale := q.Support, 1.0
+	if xscale > 1 {
+		xHalfWidth *= xscale
+		xKernelArgScale = 1 / xscale
+	}
+	yHalfWidth, yKernelArgScale := q.Support, 1.0
+	if yscale > 1 {
+		yHalfWidth *= yscale
+		yKernelArgScale = 1 / yscale
+	}
+
+	xWeights := make([]float64, 1+2*int(math.Ceil(xHalfWidth))) // scratch, refilled for every dst pixel
+	yWeights := make([]float64, 1+2*int(math.Ceil(yHalfWidth)))
+
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		dyf := float64(dr.Min.Y+int(dy)) + 0.5
+		d := (dr.Min.Y+int(dy)-dst.Rect.Min.Y)*dst.Stride + (dr.Min.X+adr.Min.X-dst.Rect.Min.X)*4 // index into dst.Pix, 4 bytes per pixel
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
+			dxf := float64(dr.Min.X+int(dx)) + 0.5
+			sx := d2s[0]*dxf + d2s[1]*dyf + d2s[2]
+			sy := d2s[3]*dxf + d2s[4]*dyf + d2s[5]
+			if !(image.Point{int(sx) + bias.X, int(sy) + bias.Y}).In(sr) {
+				continue // the pixel center maps outside sr: leave dst untouched
+			}
+
+			// TODO: adjust the bias so that we can use int(f) instead
+			// of math.Floor(f) and math.Ceil(f).
+			sx += float64(bias.X)
+			sx -= 0.5
+			ix := int(math.Floor(sx - xHalfWidth)) // [ix, jx) is the kernel footprint, clamped to sr
+			if ix < sr.Min.X {
+				ix = sr.Min.X
+			}
+			jx := int(math.Ceil(sx + xHalfWidth))
+			if jx > sr.Max.X {
+				jx = sr.Max.X
+			}
+
+			totalXWeight := 0.0
+			for kx := ix; kx < jx; kx++ {
+				xWeight := 0.0
+				if t := abs((sx - float64(kx)) * xKernelArgScale); t < q.Support {
+					xWeight = q.At(t)
+				}
+				xWeights[kx-ix] = xWeight
+				totalXWeight += xWeight
+			}
+			for x := range xWeights[:jx-ix] {
+				xWeights[x] /= totalXWeight // normalize so that the weights sum to 1
+			}
+
+			sy += float64(bias.Y)
+			sy -= 0.5
+			iy := int(math.Floor(sy - yHalfWidth)) // [iy, jy) is the kernel footprint, clamped to sr
+			if iy < sr.Min.Y {
+				iy = sr.Min.Y
+			}
+			jy := int(math.Ceil(sy + yHalfWidth))
+			if jy > sr.Max.Y {
+				jy = sr.Max.Y
+			}
+
+			totalYWeight := 0.0
+			for ky := iy; ky < jy; ky++ {
+				yWeight := 0.0
+				if t := abs((sy - float64(ky)) * yKernelArgScale); t < q.Support {
+					yWeight = q.At(t)
+				}
+				yWeights[ky-iy] = yWeight
+				totalYWeight += yWeight
+			}
+			for y := range yWeights[:jy-iy] {
+				yWeights[y] /= totalYWeight // normalize so that the weights sum to 1
+			}
+
+			var pr, pg, pb, pa float64 // weighted sums of alpha-premultiplied 16-bit components
+			for ky := iy; ky < jy; ky++ {
+				if yWeight := yWeights[ky-iy]; yWeight != 0 {
+					for kx := ix; kx < jx; kx++ {
+						if w := xWeights[kx-ix] * yWeight; w != 0 {
+							pru, pgu, pbu, pau := src.At(kx, ky).RGBA()
+							pr += float64(pru) * w
+							pg += float64(pgu) * w
+							pb += float64(pbu) * w
+							pa += float64(pau) * w
+						}
+					}
+				}
+			}
+
+			if pr > pa {
+				pr = pa // a premultiplied color component must not exceed alpha
+			}
+			if pg > pa {
+				pg = pa
+			}
+			if pb > pa {
+				pb = pa
+			}
+
+			pr0 := uint32(fffftou(pr))
+			pg0 := uint32(fffftou(pg))
+			pb0 := uint32(fffftou(pb))
+			pa0 := uint32(fffftou(pa))
+			pa1 := (0xffff - uint32(pa0)) * 0x101 // 1 - src alpha; the 0x101 widens 8-bit dst values to 16 bits
+			dst.Pix[d+0] = uint8((uint32(dst.Pix[d+0])*pa1/0xffff + pr0) >> 8)
+			dst.Pix[d+1] = uint8((uint32(dst.Pix[d+1])*pa1/0xffff + pg0) >> 8)
+			dst.Pix[d+2] = uint8((uint32(dst.Pix[d+2])*pa1/0xffff + pb0) >> 8)
+			dst.Pix[d+3] = uint8((uint32(dst.Pix[d+3])*pa1/0xffff + pa0) >> 8)
+		}
+	}
+}
+
+func (q *Kernel) transform_RGBA_Image_Src(dst *image.RGBA, dr, adr image.Rectangle, d2s *f64.Aff3, src image.Image, sr image.Rectangle, bias image.Point, xscale, yscale float64, opts *Options) {
+	// When shrinking (scale > 1), widen the kernel's half-width and compress
+	// its argument by the same factor, so that we still visit every source pixel.
+	xHalfWidth, xKernelArgScale := q.Support, 1.0
+	if xscale > 1 {
+		xHalfWidth *= xscale
+		xKernelArgScale = 1 / xscale
+	}
+	yHalfWidth, yKernelArgScale := q.Support, 1.0
+	if yscale > 1 {
+		yHalfWidth *= yscale
+		yKernelArgScale = 1 / yscale
+	}
+
+	xWeights := make([]float64, 1+2*int(math.Ceil(xHalfWidth))) // scratch, refilled for every dst pixel
+	yWeights := make([]float64, 1+2*int(math.Ceil(yHalfWidth)))
+
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		dyf := float64(dr.Min.Y+int(dy)) + 0.5
+		d := (dr.Min.Y+int(dy)-dst.Rect.Min.Y)*dst.Stride + (dr.Min.X+adr.Min.X-dst.Rect.Min.X)*4 // index into dst.Pix, 4 bytes per pixel
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
+			dxf := float64(dr.Min.X+int(dx)) + 0.5
+			sx := d2s[0]*dxf + d2s[1]*dyf + d2s[2]
+			sy := d2s[3]*dxf + d2s[4]*dyf + d2s[5]
+			if !(image.Point{int(sx) + bias.X, int(sy) + bias.Y}).In(sr) {
+				continue // the pixel center maps outside sr: leave dst untouched
+			}
+
+			// TODO: adjust the bias so that we can use int(f) instead
+			// of math.Floor(f) and math.Ceil(f).
+			sx += float64(bias.X)
+			sx -= 0.5
+			ix := int(math.Floor(sx - xHalfWidth)) // [ix, jx) is the kernel footprint, clamped to sr
+			if ix < sr.Min.X {
+				ix = sr.Min.X
+			}
+			jx := int(math.Ceil(sx + xHalfWidth))
+			if jx > sr.Max.X {
+				jx = sr.Max.X
+			}
+
+			totalXWeight := 0.0
+			for kx := ix; kx < jx; kx++ {
+				xWeight := 0.0
+				if t := abs((sx - float64(kx)) * xKernelArgScale); t < q.Support {
+					xWeight = q.At(t)
+				}
+				xWeights[kx-ix] = xWeight
+				totalXWeight += xWeight
+			}
+			for x := range xWeights[:jx-ix] {
+				xWeights[x] /= totalXWeight // normalize so that the weights sum to 1
+			}
+
+			sy += float64(bias.Y)
+			sy -= 0.5
+			iy := int(math.Floor(sy - yHalfWidth)) // [iy, jy) is the kernel footprint, clamped to sr
+			if iy < sr.Min.Y {
+				iy = sr.Min.Y
+			}
+			jy := int(math.Ceil(sy + yHalfWidth))
+			if jy > sr.Max.Y {
+				jy = sr.Max.Y
+			}
+
+			totalYWeight := 0.0
+			for ky := iy; ky < jy; ky++ {
+				yWeight := 0.0
+				if t := abs((sy - float64(ky)) * yKernelArgScale); t < q.Support {
+					yWeight = q.At(t)
+				}
+				yWeights[ky-iy] = yWeight
+				totalYWeight += yWeight
+			}
+			for y := range yWeights[:jy-iy] {
+				yWeights[y] /= totalYWeight // normalize so that the weights sum to 1
+			}
+
+			var pr, pg, pb, pa float64 // weighted sums of alpha-premultiplied 16-bit components
+			for ky := iy; ky < jy; ky++ {
+				if yWeight := yWeights[ky-iy]; yWeight != 0 {
+					for kx := ix; kx < jx; kx++ {
+						if w := xWeights[kx-ix] * yWeight; w != 0 {
+							pru, pgu, pbu, pau := src.At(kx, ky).RGBA()
+							pr += float64(pru) * w
+							pg += float64(pgu) * w
+							pb += float64(pbu) * w
+							pa += float64(pau) * w
+						}
+					}
+				}
+			}
+
+			if pr > pa {
+				pr = pa // a premultiplied color component must not exceed alpha
+			}
+			if pg > pa {
+				pg = pa
+			}
+			if pb > pa {
+				pb = pa
+			}
+
+			dst.Pix[d+0] = uint8(fffftou(pr) >> 8)
+			dst.Pix[d+1] = uint8(fffftou(pg) >> 8)
+			dst.Pix[d+2] = uint8(fffftou(pb) >> 8)
+			dst.Pix[d+3] = uint8(fffftou(pa) >> 8)
+		}
+	}
+}
+
+func (q *Kernel) transform_Image_Image_Over(dst Image, dr, adr image.Rectangle, d2s *f64.Aff3, src image.Image, sr image.Rectangle, bias image.Point, xscale, yscale float64, opts *Options) {
+	// When shrinking (scale > 1), widen the kernel's half-width and compress
+	// its argument by the same factor, so that we still visit every source pixel.
+	xHalfWidth, xKernelArgScale := q.Support, 1.0
+	if xscale > 1 {
+		xHalfWidth *= xscale
+		xKernelArgScale = 1 / xscale
+	}
+	yHalfWidth, yKernelArgScale := q.Support, 1.0
+	if yscale > 1 {
+		yHalfWidth *= yscale
+		yKernelArgScale = 1 / yscale
+	}
+
+	xWeights := make([]float64, 1+2*int(math.Ceil(xHalfWidth))) // scratch, refilled for every dst pixel
+	yWeights := make([]float64, 1+2*int(math.Ceil(yHalfWidth)))
+
+	srcMask, smp := opts.SrcMask, opts.SrcMaskP
+	dstMask, dmp := opts.DstMask, opts.DstMaskP
+	dstColorRGBA64 := &color.RGBA64{}
+	dstColor := color.Color(dstColorRGBA64)
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		dyf := float64(dr.Min.Y+int(dy)) + 0.5 // the +0.5 samples at the dst pixel's center
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx++ {
+			dxf := float64(dr.Min.X+int(dx)) + 0.5
+			sx := d2s[0]*dxf + d2s[1]*dyf + d2s[2]
+			sy := d2s[3]*dxf + d2s[4]*dyf + d2s[5]
+			if !(image.Point{int(sx) + bias.X, int(sy) + bias.Y}).In(sr) {
+				continue // the pixel center maps outside sr: leave dst untouched
+			}
+
+			// TODO: adjust the bias so that we can use int(f) instead
+			// of math.Floor(f) and math.Ceil(f).
+			sx += float64(bias.X)
+			sx -= 0.5
+			ix := int(math.Floor(sx - xHalfWidth)) // [ix, jx) is the kernel footprint, clamped to sr
+			if ix < sr.Min.X {
+				ix = sr.Min.X
+			}
+			jx := int(math.Ceil(sx + xHalfWidth))
+			if jx > sr.Max.X {
+				jx = sr.Max.X
+			}
+
+			totalXWeight := 0.0
+			for kx := ix; kx < jx; kx++ {
+				xWeight := 0.0
+				if t := abs((sx - float64(kx)) * xKernelArgScale); t < q.Support {
+					xWeight = q.At(t)
+				}
+				xWeights[kx-ix] = xWeight
+				totalXWeight += xWeight
+			}
+			for x := range xWeights[:jx-ix] {
+				xWeights[x] /= totalXWeight // normalize so that the weights sum to 1
+			}
+
+			sy += float64(bias.Y)
+			sy -= 0.5
+			iy := int(math.Floor(sy - yHalfWidth)) // [iy, jy) is the kernel footprint, clamped to sr
+			if iy < sr.Min.Y {
+				iy = sr.Min.Y
+			}
+			jy := int(math.Ceil(sy + yHalfWidth))
+			if jy > sr.Max.Y {
+				jy = sr.Max.Y
+			}
+
+			totalYWeight := 0.0
+			for ky := iy; ky < jy; ky++ {
+				yWeight := 0.0
+				if t := abs((sy - float64(ky)) * yKernelArgScale); t < q.Support {
+					yWeight = q.At(t)
+				}
+				yWeights[ky-iy] = yWeight
+				totalYWeight += yWeight
+			}
+			for y := range yWeights[:jy-iy] {
+				yWeights[y] /= totalYWeight // normalize so that the weights sum to 1
+			}
+
+			var pr, pg, pb, pa float64 // weighted sums of alpha-premultiplied 16-bit components
+			for ky := iy; ky < jy; ky++ {
+				if yWeight := yWeights[ky-iy]; yWeight != 0 {
+					for kx := ix; kx < jx; kx++ {
+						if w := xWeights[kx-ix] * yWeight; w != 0 {
+							pru, pgu, pbu, pau := src.At(kx, ky).RGBA()
+							if srcMask != nil {
+								_, _, _, ma := srcMask.At(smp.X+kx, smp.Y+ky).RGBA() // only the mask's alpha is used
+								pru = pru * ma / 0xffff
+								pgu = pgu * ma / 0xffff
+								pbu = pbu * ma / 0xffff
+								pau = pau * ma / 0xffff
+							}
+							pr += float64(pru) * w
+							pg += float64(pgu) * w
+							pb += float64(pbu) * w
+							pa += float64(pau) * w
+						}
+					}
+				}
+			}
+
+			if pr > pa {
+				pr = pa // a premultiplied color component must not exceed alpha
+			}
+			if pg > pa {
+				pg = pa
+			}
+			if pb > pa {
+				pb = pa
+			}
+
+			qr, qg, qb, qa := dst.At(dr.Min.X+int(dx), dr.Min.Y+int(dy)).RGBA()
+			pr0 := uint32(fffftou(pr))
+			pg0 := uint32(fffftou(pg))
+			pb0 := uint32(fffftou(pb))
+			pa0 := uint32(fffftou(pa))
+			if dstMask != nil {
+				_, _, _, ma := dstMask.At(dmp.X+dr.Min.X+int(dx), dmp.Y+dr.Min.Y+int(dy)).RGBA()
+				pr0 = pr0 * ma / 0xffff
+				pg0 = pg0 * ma / 0xffff
+				pb0 = pb0 * ma / 0xffff
+				pa0 = pa0 * ma / 0xffff
+			}
+			pa1 := 0xffff - pa0 // 1 - (masked) src alpha: the share of the old dst that remains
+			dstColorRGBA64.R = uint16(qr*pa1/0xffff + pr0)
+			dstColorRGBA64.G = uint16(qg*pa1/0xffff + pg0)
+			dstColorRGBA64.B = uint16(qb*pa1/0xffff + pb0)
+			dstColorRGBA64.A = uint16(qa*pa1/0xffff + pa0)
+			dst.Set(dr.Min.X+int(dx), dr.Min.Y+int(dy), dstColor)
+		}
+	}
+}
+
+func (q *Kernel) transform_Image_Image_Src(dst Image, dr, adr image.Rectangle, d2s *f64.Aff3, src image.Image, sr image.Rectangle, bias image.Point, xscale, yscale float64, opts *Options) {
+	// When shrinking (scale > 1), widen the kernel's half-width and compress
+	// its argument by the same factor, so that we still visit every source pixel.
+	xHalfWidth, xKernelArgScale := q.Support, 1.0
+	if xscale > 1 {
+		xHalfWidth *= xscale
+		xKernelArgScale = 1 / xscale
+	}
+	yHalfWidth, yKernelArgScale := q.Support, 1.0
+	if yscale > 1 {
+		yHalfWidth *= yscale
+		yKernelArgScale = 1 / yscale
+	}
+
+	xWeights := make([]float64, 1+2*int(math.Ceil(xHalfWidth))) // scratch, refilled for every dst pixel
+	yWeights := make([]float64, 1+2*int(math.Ceil(yHalfWidth)))
+
+	srcMask, smp := opts.SrcMask, opts.SrcMaskP
+	dstMask, dmp := opts.DstMask, opts.DstMaskP
+	dstColorRGBA64 := &color.RGBA64{}
+	dstColor := color.Color(dstColorRGBA64)
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		dyf := float64(dr.Min.Y+int(dy)) + 0.5 // the +0.5 samples at the dst pixel's center
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx++ {
+			dxf := float64(dr.Min.X+int(dx)) + 0.5
+			sx := d2s[0]*dxf + d2s[1]*dyf + d2s[2]
+			sy := d2s[3]*dxf + d2s[4]*dyf + d2s[5]
+			if !(image.Point{int(sx) + bias.X, int(sy) + bias.Y}).In(sr) {
+				continue // the pixel center maps outside sr: leave dst untouched
+			}
+
+			// TODO: adjust the bias so that we can use int(f) instead
+			// of math.Floor(f) and math.Ceil(f).
+			sx += float64(bias.X)
+			sx -= 0.5
+			ix := int(math.Floor(sx - xHalfWidth)) // [ix, jx) is the kernel footprint, clamped to sr
+			if ix < sr.Min.X {
+				ix = sr.Min.X
+			}
+			jx := int(math.Ceil(sx + xHalfWidth))
+			if jx > sr.Max.X {
+				jx = sr.Max.X
+			}
+
+			totalXWeight := 0.0
+			for kx := ix; kx < jx; kx++ {
+				xWeight := 0.0
+				if t := abs((sx - float64(kx)) * xKernelArgScale); t < q.Support {
+					xWeight = q.At(t)
+				}
+				xWeights[kx-ix] = xWeight
+				totalXWeight += xWeight
+			}
+			for x := range xWeights[:jx-ix] {
+				xWeights[x] /= totalXWeight // normalize so that the weights sum to 1
+			}
+
+			sy += float64(bias.Y)
+			sy -= 0.5
+			iy := int(math.Floor(sy - yHalfWidth)) // [iy, jy) is the kernel footprint, clamped to sr
+			if iy < sr.Min.Y {
+				iy = sr.Min.Y
+			}
+			jy := int(math.Ceil(sy + yHalfWidth))
+			if jy > sr.Max.Y {
+				jy = sr.Max.Y
+			}
+
+			totalYWeight := 0.0
+			for ky := iy; ky < jy; ky++ {
+				yWeight := 0.0
+				if t := abs((sy - float64(ky)) * yKernelArgScale); t < q.Support {
+					yWeight = q.At(t)
+				}
+				yWeights[ky-iy] = yWeight
+				totalYWeight += yWeight
+			}
+			for y := range yWeights[:jy-iy] {
+				yWeights[y] /= totalYWeight // normalize so that the weights sum to 1
+			}
+
+			var pr, pg, pb, pa float64 // weighted sums of alpha-premultiplied 16-bit components
+			for ky := iy; ky < jy; ky++ {
+				if yWeight := yWeights[ky-iy]; yWeight != 0 {
+					for kx := ix; kx < jx; kx++ {
+						if w := xWeights[kx-ix] * yWeight; w != 0 {
+							pru, pgu, pbu, pau := src.At(kx, ky).RGBA()
+							if srcMask != nil {
+								_, _, _, ma := srcMask.At(smp.X+kx, smp.Y+ky).RGBA() // only the mask's alpha is used
+								pru = pru * ma / 0xffff
+								pgu = pgu * ma / 0xffff
+								pbu = pbu * ma / 0xffff
+								pau = pau * ma / 0xffff
+							}
+							pr += float64(pru) * w
+							pg += float64(pgu) * w
+							pb += float64(pbu) * w
+							pa += float64(pau) * w
+						}
+					}
+				}
+			}
+
+			if pr > pa {
+				pr = pa // a premultiplied color component must not exceed alpha
+			}
+			if pg > pa {
+				pg = pa
+			}
+			if pb > pa {
+				pb = pa
+			}
+
+			if dstMask != nil {
+				qr, qg, qb, qa := dst.At(dr.Min.X+int(dx), dr.Min.Y+int(dy)).RGBA()
+				_, _, _, ma := dstMask.At(dmp.X+dr.Min.X+int(dx), dmp.Y+dr.Min.Y+int(dy)).RGBA()
+				pr := uint32(fffftou(pr)) * ma / 0xffff
+				pg := uint32(fffftou(pg)) * ma / 0xffff
+				pb := uint32(fffftou(pb)) * ma / 0xffff
+				pa := uint32(fffftou(pa)) * ma / 0xffff
+				pa1 := 0xffff - ma // Src op: the mask alone decides how much of the old dst remains
+				dstColorRGBA64.R = uint16(qr*pa1/0xffff + pr)
+				dstColorRGBA64.G = uint16(qg*pa1/0xffff + pg)
+				dstColorRGBA64.B = uint16(qb*pa1/0xffff + pb)
+				dstColorRGBA64.A = uint16(qa*pa1/0xffff + pa)
+				dst.Set(dr.Min.X+int(dx), dr.Min.Y+int(dy), dstColor)
+			} else {
+				dstColorRGBA64.R = fffftou(pr)
+				dstColorRGBA64.G = fffftou(pg)
+				dstColorRGBA64.B = fffftou(pb)
+				dstColorRGBA64.A = fffftou(pa)
+				dst.Set(dr.Min.X+int(dx), dr.Min.Y+int(dy), dstColor)
+			}
+		}
+	}
+}
diff --git a/draw/scale.go b/draw/scale.go
new file mode 100644
index 0000000..98ab404
--- /dev/null
+++ b/draw/scale.go
@@ -0,0 +1,527 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:generate go run gen.go
+
+package draw
+
+import (
+	"image"
+	"image/color"
+	"math"
+	"sync"
+
+	"golang.org/x/image/math/f64"
+)
+
+// Copy copies the part of the source image defined by src and sr and writes
+// the result of a Porter-Duff composition to the part of the destination image
+// defined by dst and the translation of sr so that sr.Min translates to dp.
+func Copy(dst Image, dp image.Point, src image.Image, sr image.Rectangle, op Op, opts *Options) {
+	var o Options
+	if opts != nil {
+		o = *opts
+	}
+	dr := sr.Add(dp.Sub(sr.Min)) // sr translated so that its Min lands on dp
+	if o.DstMask == nil {
+		DrawMask(dst, dr, src, sr.Min, o.SrcMask, o.SrcMaskP.Add(sr.Min), op)
+	} else {
+		NearestNeighbor.Scale(dst, dr, src, sr, op, opts) // opts is non-nil here; dr and sr have the same size
+	}
+}
+
+// Scaler scales the part of the source image defined by src and sr and writes
+// the result of a Porter-Duff composition to the part of the destination image
+// defined by dst and dr.
+//
+// A Scaler is safe to use concurrently.
+type Scaler interface {
+	Scale(dst Image, dr image.Rectangle, src image.Image, sr image.Rectangle, op Op, opts *Options)
+}
+
+// Transformer transforms the part of the source image defined by src and sr
+// and writes the result of a Porter-Duff composition to the part of the
+// destination image defined by dst and the affine transform m applied to sr.
+//
+// For example, if m is the matrix
+//
+//	m00 m01 m02
+//	m10 m11 m12
+//
+// then the src-space point (sx, sy) maps to the dst-space point
+// (m00*sx + m01*sy + m02, m10*sx + m11*sy + m12).
+//
+// A Transformer is safe to use concurrently.
+type Transformer interface {
+	Transform(dst Image, m f64.Aff3, src image.Image, sr image.Rectangle, op Op, opts *Options)
+}
+
+// Options are optional parameters to Copy, Scale and Transform.
+//
+// A nil *Options means to use the default (zero) values of each field.
+type Options struct {
+	// Masks limit what parts of the dst image are drawn to and what parts of
+	// the src image are drawn from.
+	//
+	// A dst or src mask image having a zero alpha (transparent) pixel value in
+	// the respective coordinate space means that that dst pixel is entirely
+	// unaffected or that src pixel is considered transparent black. A full
+	// alpha (opaque) value means that the dst pixel is maximally affected or
+	// the src pixel contributes maximally. The default values, nil, are
+	// equivalent to fully opaque, infinitely large mask images.
+	//
+	// The DstMask is otherwise known as a clip mask, and its pixels map 1:1 to
+	// the dst image's pixels. DstMaskP in DstMask space corresponds to
+	// image.Point{X:0, Y:0} in dst space. For example, when limiting
+	// repainting to a 'dirty rectangle', use that image.Rectangle and a zero
+	// image.Point as the DstMask and DstMaskP.
+	//
+	// The SrcMask's pixels map 1:1 to the src image's pixels. SrcMaskP in
+	// SrcMask space corresponds to image.Point{X:0, Y:0} in src space. For
+	// example, when drawing font glyphs in a uniform color, use an
+	// *image.Uniform as the src, and use the glyph atlas image and the
+	// per-glyph offset as SrcMask and SrcMaskP:
+	//	Copy(dst, dp, image.NewUniform(color), image.Rect(0, 0, glyphWidth, glyphHeight), Over, &Options{
+	//		SrcMask:  glyphAtlas,
+	//		SrcMaskP: glyphOffset,
+	//	})
+	DstMask  image.Image
+	DstMaskP image.Point
+	SrcMask  image.Image
+	SrcMaskP image.Point
+
+	// TODO: a smooth vs sharp edges option, for arbitrary rotations?
+}
+
+// Interpolator is an interpolation algorithm, when dst and src pixels don't
+// have a 1:1 correspondence.
+//
+// Of the interpolators provided by this package:
+//	- NearestNeighbor is fast but usually looks worst.
+//	- CatmullRom is slow but usually looks best.
+//	- ApproxBiLinear has reasonable speed and quality.
+//
+// The time taken depends on the size of dr. For kernel interpolators, the
+// time taken also depends on the size of sr, so they are often slower than
+// non-kernel interpolators, especially when scaling down.
+type Interpolator interface {
+	Scaler
+	Transformer
+}
+
+// Kernel is an interpolator that blends source pixels weighted by a symmetric
+// kernel function.
+type Kernel struct {
+	// Support is the kernel support and must be >= 0. At(t) is assumed to be
+	// zero when t >= Support.
+	Support float64
+	// At is the kernel function. It will only be called with t in the
+	// range [0, Support).
+	At func(t float64) float64
+}
+
+// Scale implements the Scaler interface.
+func (q *Kernel) Scale(dst Image, dr image.Rectangle, src image.Image, sr image.Rectangle, op Op, opts *Options) {
+	q.newScaler(dr.Dx(), dr.Dy(), sr.Dx(), sr.Dy(), false).Scale(dst, dr, src, sr, op, opts) // false: a one-off scaler gets no buffer pool
+}
+
+// NewScaler returns a Scaler that is optimized for scaling multiple times with
+// the same fixed destination and source width and height.
+func (q *Kernel) NewScaler(dw, dh, sw, sh int) Scaler {
+	return q.newScaler(dw, dh, sw, sh, true) // true: temporary buffers come from a sync.Pool
+}
+
+func (q *Kernel) newScaler(dw, dh, sw, sh int, usePool bool) Scaler {
+	z := &kernelScaler{
+		kernel:     q,
+		dw:         int32(dw),
+		dh:         int32(dh),
+		sw:         int32(sw),
+		sh:         int32(sh),
+		horizontal: newDistrib(q, int32(dw), int32(sw)),
+		vertical:   newDistrib(q, int32(dh), int32(sh)),
+	}
+	if usePool { // otherwise z.pool.New stays nil
+		z.pool.New = func() interface{} {
+			tmp := z.makeTmpBuf()
+			return &tmp // the pool holds *[][4]float64 values
+		}
+	}
+	return z
+}
+
+var (
+	// NearestNeighbor is the nearest neighbor interpolator. It is very fast,
+	// but usually gives very low quality results. When scaling up, the result
+	// will look 'blocky'.
+	NearestNeighbor = Interpolator(nnInterpolator{})
+
+	// ApproxBiLinear is a mixture of the nearest neighbor and bi-linear
+	// interpolators. It is fast, but usually gives medium quality results.
+	//
+	// It implements bi-linear interpolation when upscaling and a bi-linear
+	// blend of the 4 nearest neighbor pixels when downscaling. This yields
+	// nicer quality than nearest neighbor interpolation when upscaling, but
+	// the time taken is independent of the number of source pixels, unlike the
+	// bi-linear interpolator. When downscaling a large image, the performance
+	// difference can be significant.
+	ApproxBiLinear = Interpolator(ablInterpolator{})
+
+	// BiLinear is the tent kernel. It is slow, but usually gives high quality
+	// results.
+	BiLinear = &Kernel{1, func(t float64) float64 {
+		return 1 - t
+	}}
+
+	// CatmullRom is the Catmull-Rom kernel. It is very slow, but usually gives
+	// very high quality results.
+	//
+	// It is an instance of the more general cubic BC-spline kernel with parameters
+	// B=0 and C=0.5. See Mitchell and Netravali, "Reconstruction Filters in
+	// Computer Graphics", Computer Graphics, Vol. 22, No. 4, pp. 221-228.
+	CatmullRom = &Kernel{2, func(t float64) float64 {
+		if t < 1 {
+			return (1.5*t-2.5)*t*t + 1
+		}
+		return ((-0.5*t+2.5)*t-4)*t + 2
+	}}
+
+	// TODO: a Kaiser-Bessel kernel?
+)
+
+type nnInterpolator struct{} // the type behind NearestNeighbor
+
+type ablInterpolator struct{} // the type behind ApproxBiLinear
+
+type kernelScaler struct {
+	kernel               *Kernel
+	dw, dh, sw, sh       int32
+	horizontal, vertical distrib
+	pool                 sync.Pool // of *[][4]float64; New is set only when pooling is requested
+}
+
+func (z *kernelScaler) makeTmpBuf() [][4]float64 {
+	return make([][4]float64, int(z.dw)*int(z.sh)) // multiply as int: the int32 product can wrap around
+}
+
+// source is a range of contribs, their inverse total weight, and that ITW
+// divided by 0xffff.
+type source struct {
+	i, j               int32 // the range is distrib.contribs[i:j]
+	invTotalWeight     float64
+	invTotalWeightFFFF float64
+}
+
+// contrib is the weight of the source column or row at coord.
+type contrib struct {
+	coord  int32
+	weight float64
+}
+
+// distrib measures how source pixels are distributed over destination pixels.
+type distrib struct {
+	// sources are what contribs each column or row in the destination image
+	// owns, and the inverse total weight of those contribs.
+	sources []source
+	// contribs are the contributions indexed by sources[s].i and sources[s].j.
+	contribs []contrib
+}
+
+// newDistrib returns a distrib that distributes sw source columns (or rows)
+// over dw destination columns (or rows).
+func newDistrib(q *Kernel, dw, sw int32) distrib {
+	scale := float64(sw) / float64(dw)
+	halfWidth, kernelArgScale := q.Support, 1.0
+	// When shrinking, broaden the effective kernel support so that we still
+	// visit every source pixel.
+	if scale > 1 {
+		halfWidth *= scale
+		kernelArgScale = 1 / scale
+	}
+
+	// Make the sources slice, one source for each destination column or row,
+	// and temporarily appropriate its elements' fields so that invTotalWeight
+	// is the center of that destination column or row in source coordinates,
+	// and i and j are the lower and upper bounds (clamped to [0, sw]) of the
+	// range of source columns or rows that may contribute to it.
+	n, sources := int32(0), make([]source, dw)
+	for x := range sources {
+		center := (float64(x)+0.5)*scale - 0.5
+		i := int32(math.Floor(center - halfWidth))
+		if i < 0 {
+			i = 0
+		}
+		j := int32(math.Ceil(center + halfWidth))
+		if j > sw {
+			j = sw
+			if j < i {
+				j = i
+			}
+		}
+		sources[x] = source{i: i, j: j, invTotalWeight: center}
+		n += j - i
+	}
+
+	contribs := make([]contrib, 0, n)
+	for k, b := range sources {
+		totalWeight := 0.0
+		l := int32(len(contribs))
+		for coord := b.i; coord < b.j; coord++ {
+			t := abs((b.invTotalWeight - float64(coord)) * kernelArgScale)
+			if t >= q.Support {
+				continue
+			}
+			weight := q.At(t)
+			if weight == 0 {
+				continue
+			}
+			totalWeight += weight
+			contribs = append(contribs, contrib{coord, weight})
+		}
+		totalWeight = 1 / totalWeight
+		sources[k] = source{
+			i:                  l,
+			j:                  int32(len(contribs)),
+			invTotalWeight:     totalWeight,
+			invTotalWeightFFFF: totalWeight / 0xffff,
+		}
+	}
+
+	return distrib{sources, contribs}
+}
+
+// abs is like math.Abs, but it doesn't care about negative zero, infinities or
+// NaNs.
+func abs(f float64) float64 {
+	if f < 0 {
+		f = -f
+	}
+	return f
+}
+
+// ftou converts the range [0.0, 1.0] to [0, 0xffff], rounding and clamping.
+func ftou(f float64) uint16 {
+	i := int32(0xffff*f + 0.5)
+	if i > 0xffff {
+		return 0xffff
+	}
+	if i > 0 {
+		return uint16(i)
+	}
+	return 0
+}
+
+// fffftou converts the range [0.0, 65535.0] to [0, 0xffff], rounding and clamping.
+func fffftou(f float64) uint16 {
+	i := int32(f + 0.5)
+	if i > 0xffff {
+		return 0xffff
+	}
+	if i > 0 {
+		return uint16(i)
+	}
+	return 0
+}
+
+// invert returns the inverse of m.
+//
+// TODO: move this into the f64 package, once we work out the convention for
+// matrix methods in that package: do they modify the receiver, take a dst
+// pointer argument, or return a new value?
+func invert(m *f64.Aff3) f64.Aff3 {
+	m00 := +m[3*1+1]
+	m01 := -m[3*0+1]
+	m02 := +m[3*1+2]*m[3*0+1] - m[3*1+1]*m[3*0+2]
+	m10 := -m[3*1+0]
+	m11 := +m[3*0+0]
+	m12 := +m[3*1+0]*m[3*0+2] - m[3*1+2]*m[3*0+0]
+
+	det := m00*m11 - m10*m01 // determinant of m's 2x2 linear part; not checked for zero
+
+	return f64.Aff3{
+		m00 / det,
+		m01 / det,
+		m02 / det,
+		m10 / det,
+		m11 / det,
+		m12 / det,
+	}
+}
+
+func matMul(p, q *f64.Aff3) f64.Aff3 {
+	return f64.Aff3{
+		p[3*0+0]*q[3*0+0] + p[3*0+1]*q[3*1+0],
+		p[3*0+0]*q[3*0+1] + p[3*0+1]*q[3*1+1],
+		p[3*0+0]*q[3*0+2] + p[3*0+1]*q[3*1+2] + p[3*0+2],
+		p[3*1+0]*q[3*0+0] + p[3*1+1]*q[3*1+0],
+		p[3*1+0]*q[3*0+1] + p[3*1+1]*q[3*1+1],
+		p[3*1+0]*q[3*0+2] + p[3*1+1]*q[3*1+2] + p[3*1+2],
+	}
+}
+
+// transformRect returns a rectangle dr that contains sr transformed by s2d.
+func transformRect(s2d *f64.Aff3, sr *image.Rectangle) (dr image.Rectangle) {
+	ps := [...]image.Point{
+		{sr.Min.X, sr.Min.Y},
+		{sr.Max.X, sr.Min.Y},
+		{sr.Min.X, sr.Max.Y},
+		{sr.Max.X, sr.Max.Y},
+	}
+	for i, p := range ps {
+		sxf := float64(p.X)
+		syf := float64(p.Y)
+		dx := int(math.Floor(s2d[0]*sxf + s2d[1]*syf + s2d[2]))
+		dy := int(math.Floor(s2d[3]*sxf + s2d[4]*syf + s2d[5]))
+
+		// The +1 adjustments below are because an image.Rectangle is inclusive
+		// on the low end but exclusive on the high end.
+
+		if i == 0 {
+			dr = image.Rectangle{
+				Min: image.Point{dx + 0, dy + 0},
+				Max: image.Point{dx + 1, dy + 1},
+			}
+			continue
+		}
+
+		if dr.Min.X > dx {
+			dr.Min.X = dx
+		}
+		dx++
+		if dr.Max.X < dx {
+			dr.Max.X = dx
+		}
+
+		if dr.Min.Y > dy {
+			dr.Min.Y = dy
+		}
+		dy++
+		if dr.Max.Y < dy {
+			dr.Max.Y = dy
+		}
+	}
+	return dr
+}
+
+func clipAffectedDestRect(adr image.Rectangle, dstMask image.Image, dstMaskP image.Point) (image.Rectangle, image.Image) {
+	if dstMask == nil {
+		return adr, nil
+	}
+	// TODO: enable this fast path once Go 1.5 is released, where an
+	// image.Rectangle implements image.Image.
+	// if r, ok := dstMask.(image.Rectangle); ok {
+	// 	return adr.Intersect(r.Sub(dstMaskP)), nil
+	// }
+	// TODO: clip to dstMask.Bounds() if the color model implies that out-of-bounds means 0 alpha?
+	return adr, dstMask
+}
+
+func transform_Uniform(dst Image, dr, adr image.Rectangle, d2s *f64.Aff3, src *image.Uniform, sr image.Rectangle, bias image.Point, op Op) {
+	switch op {
+	case Over:
+		switch dst := dst.(type) {
+		case *image.RGBA:
+			pr, pg, pb, pa := src.C.RGBA()
+			pa1 := (0xffff - pa) * 0x101
+
+			for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+				dyf := float64(dr.Min.Y+int(dy)) + 0.5
+				d := dst.PixOffset(dr.Min.X+adr.Min.X, dr.Min.Y+int(dy))
+				for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
+					dxf := float64(dr.Min.X+int(dx)) + 0.5
+					sx0 := int(d2s[0]*dxf+d2s[1]*dyf+d2s[2]) + bias.X
+					sy0 := int(d2s[3]*dxf+d2s[4]*dyf+d2s[5]) + bias.Y
+					if !(image.Point{sx0, sy0}).In(sr) {
+						continue
+					}
+					dst.Pix[d+0] = uint8((uint32(dst.Pix[d+0])*pa1/0xffff + pr) >> 8)
+					dst.Pix[d+1] = uint8((uint32(dst.Pix[d+1])*pa1/0xffff + pg) >> 8)
+					dst.Pix[d+2] = uint8((uint32(dst.Pix[d+2])*pa1/0xffff + pb) >> 8)
+					dst.Pix[d+3] = uint8((uint32(dst.Pix[d+3])*pa1/0xffff + pa) >> 8)
+				}
+			}
+
+		default:
+			pr, pg, pb, pa := src.C.RGBA()
+			pa1 := 0xffff - pa
+			dstColorRGBA64 := &color.RGBA64{}
+			dstColor := color.Color(dstColorRGBA64)
+
+			for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+				dyf := float64(dr.Min.Y+int(dy)) + 0.5
+				for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx++ {
+					dxf := float64(dr.Min.X+int(dx)) + 0.5
+					sx0 := int(d2s[0]*dxf+d2s[1]*dyf+d2s[2]) + bias.X
+					sy0 := int(d2s[3]*dxf+d2s[4]*dyf+d2s[5]) + bias.Y
+					if !(image.Point{sx0, sy0}).In(sr) {
+						continue
+					}
+					qr, qg, qb, qa := dst.At(dr.Min.X+int(dx), dr.Min.Y+int(dy)).RGBA()
+					dstColorRGBA64.R = uint16(qr*pa1/0xffff + pr)
+					dstColorRGBA64.G = uint16(qg*pa1/0xffff + pg)
+					dstColorRGBA64.B = uint16(qb*pa1/0xffff + pb)
+					dstColorRGBA64.A = uint16(qa*pa1/0xffff + pa)
+					dst.Set(dr.Min.X+int(dx), dr.Min.Y+int(dy), dstColor)
+				}
+			}
+		}
+
+	case Src:
+		switch dst := dst.(type) {
+		case *image.RGBA:
+			pr, pg, pb, pa := src.C.RGBA()
+			pr8 := uint8(pr >> 8)
+			pg8 := uint8(pg >> 8)
+			pb8 := uint8(pb >> 8)
+			pa8 := uint8(pa >> 8)
+
+			for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+				dyf := float64(dr.Min.Y+int(dy)) + 0.5
+				d := dst.PixOffset(dr.Min.X+adr.Min.X, dr.Min.Y+int(dy))
+				for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
+					dxf := float64(dr.Min.X+int(dx)) + 0.5
+					sx0 := int(d2s[0]*dxf+d2s[1]*dyf+d2s[2]) + bias.X
+					sy0 := int(d2s[3]*dxf+d2s[4]*dyf+d2s[5]) + bias.Y
+					if !(image.Point{sx0, sy0}).In(sr) {
+						continue
+					}
+					dst.Pix[d+0] = pr8
+					dst.Pix[d+1] = pg8
+					dst.Pix[d+2] = pb8
+					dst.Pix[d+3] = pa8
+				}
+			}
+
+		default:
+			pr, pg, pb, pa := src.C.RGBA()
+			dstColorRGBA64 := &color.RGBA64{
+				uint16(pr),
+				uint16(pg),
+				uint16(pb),
+				uint16(pa),
+			}
+			dstColor := color.Color(dstColorRGBA64)
+
+			for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+				dyf := float64(dr.Min.Y+int(dy)) + 0.5
+				for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx++ {
+					dxf := float64(dr.Min.X+int(dx)) + 0.5
+					sx0 := int(d2s[0]*dxf+d2s[1]*dyf+d2s[2]) + bias.X
+					sy0 := int(d2s[3]*dxf+d2s[4]*dyf+d2s[5]) + bias.Y
+					if !(image.Point{sx0, sy0}).In(sr) {
+						continue
+					}
+					dst.Set(dr.Min.X+int(dx), dr.Min.Y+int(dy), dstColor)
+				}
+			}
+		}
+	}
+}
+
+func opaque(m image.Image) bool {
+	o, ok := m.(interface {
+		Opaque() bool
+	})
+	return ok && o.Opaque()
+}
diff --git a/draw/scale_test.go b/draw/scale_test.go
new file mode 100644
index 0000000..5e184c2
--- /dev/null
+++ b/draw/scale_test.go
@@ -0,0 +1,731 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package draw
+
+import (
+	"bytes"
+	"flag"
+	"fmt"
+	"image"
+	"image/color"
+	"image/png"
+	"math/rand"
+	"os"
+	"reflect"
+	"testing"
+
+	"golang.org/x/image/math/f64"
+
+	_ "image/jpeg"
+)
+
+var genGoldenFiles = flag.Bool("gen_golden_files", false, "whether to generate the TestXxx golden files.")
+
+var transformMatrix = func(scale, tx, ty float64) f64.Aff3 {
+	const cos30, sin30 = 0.866025404, 0.5
+	return f64.Aff3{
+		+scale * cos30, -scale * sin30, tx,
+		+scale * sin30, +scale * cos30, ty,
+	}
+}
+
+func encode(filename string, m image.Image) error {
+	f, err := os.Create(filename)
+	if err != nil {
+		return fmt.Errorf("Create: %v", err)
+	}
+	defer f.Close()
+	if err := png.Encode(f, m); err != nil {
+		return fmt.Errorf("Encode: %v", err)
+	}
+	return nil
+}
+
+// testInterp tests that interpolating the source image gives the exact
+// destination image. This is to ensure that any refactoring or optimization of
+// the interpolation code doesn't change the behavior. Changing the actual
+// algorithm or kernel used by any particular quality setting will obviously
+// change the resultant pixels. In such a case, use the gen_golden_files flag
+// to regenerate the golden files.
+func testInterp(t *testing.T, w int, h int, direction, prefix, suffix string) {
+	f, err := os.Open("../testdata/" + prefix + suffix)
+	if err != nil {
+		t.Fatalf("Open: %v", err)
+	}
+	defer f.Close()
+	src, _, err := image.Decode(f)
+	if err != nil {
+		t.Fatalf("Decode: %v", err)
+	}
+
+	op, scale := Src, 3.75
+	if prefix == "tux" {
+		op, scale = Over, 0.125
+	}
+	green := image.NewUniform(color.RGBA{0x00, 0x22, 0x11, 0xff})
+
+	testCases := map[string]Interpolator{
+		"nn": NearestNeighbor,
+		"ab": ApproxBiLinear,
+		"bl": BiLinear,
+		"cr": CatmullRom,
+	}
+	for name, q := range testCases {
+		goldenFilename := fmt.Sprintf("../testdata/%s-%s-%s.png", prefix, direction, name)
+
+		got := image.NewRGBA(image.Rect(0, 0, w, h))
+		Copy(got, image.Point{}, green, got.Bounds(), Src, nil)
+		if direction == "rotate" {
+			q.Transform(got, transformMatrix(scale, 40, 10), src, src.Bounds(), op, nil)
+		} else {
+			q.Scale(got, got.Bounds(), src, src.Bounds(), op, nil)
+		}
+
+		if *genGoldenFiles {
+			if err := encode(goldenFilename, got); err != nil {
+				t.Error(err)
+			}
+			continue
+		}
+
+		g, err := os.Open(goldenFilename)
+		if err != nil {
+			t.Errorf("Open: %v", err)
+			continue
+		}
+		wantRaw, err := png.Decode(g)
+		g.Close() // close now: a defer in this loop would keep every golden file open until return
+		if err != nil {
+			t.Errorf("Decode: %v", err)
+			continue
+		}
+		// convert wantRaw to RGBA.
+		want, ok := wantRaw.(*image.RGBA)
+		if !ok {
+			b := wantRaw.Bounds()
+			want = image.NewRGBA(b)
+			Draw(want, b, wantRaw, b.Min, Src)
+		}
+
+		if !reflect.DeepEqual(got, want) {
+			t.Errorf("%s: actual image differs from golden image", goldenFilename)
+			continue
+		}
+	}
+}
+
+func TestScaleDown(t *testing.T) { testInterp(t, 100, 100, "down", "go-turns-two", "-280x360.jpeg") }
+func TestScaleUp(t *testing.T)   { testInterp(t, 75, 100, "up", "go-turns-two", "-14x18.png") }
+func TestTformSrc(t *testing.T)  { testInterp(t, 100, 100, "rotate", "go-turns-two", "-14x18.png") }
+func TestTformOver(t *testing.T) { testInterp(t, 100, 100, "rotate", "tux", ".png") }
+
+// TestSimpleTransforms tests Scale and Transform calls that simplify to Copy
+// or Scale calls.
+func TestSimpleTransforms(t *testing.T) {
+	f, err := os.Open("../testdata/testpattern.png") // A 100x100 image.
+	if err != nil {
+		t.Fatalf("Open: %v", err)
+	}
+	defer f.Close()
+	src, _, err := image.Decode(f)
+	if err != nil {
+		t.Fatalf("Decode: %v", err)
+	}
+
+	dst0 := image.NewRGBA(image.Rect(0, 0, 120, 150))
+	dst1 := image.NewRGBA(image.Rect(0, 0, 120, 150))
+	for _, op := range []string{"scale/copy", "tform/copy", "tform/scale"} {
+		for _, epsilon := range []float64{0, 1e-50, 1e-1} {
+			Copy(dst0, image.Point{}, image.Transparent, dst0.Bounds(), Src, nil)
+			Copy(dst1, image.Point{}, image.Transparent, dst1.Bounds(), Src, nil)
+
+			switch op {
+			case "scale/copy":
+				dr := image.Rect(10, 30, 10+100, 30+100)
+				if epsilon > 1e-10 {
+					dr.Max.X++
+				}
+				Copy(dst0, image.Point{10, 30}, src, src.Bounds(), Src, nil)
+				ApproxBiLinear.Scale(dst1, dr, src, src.Bounds(), Src, nil)
+			case "tform/copy":
+				Copy(dst0, image.Point{10, 30}, src, src.Bounds(), Src, nil)
+				ApproxBiLinear.Transform(dst1, f64.Aff3{
+					1, 0 + epsilon, 10,
+					0, 1, 30,
+				}, src, src.Bounds(), Src, nil)
+			case "tform/scale":
+				ApproxBiLinear.Scale(dst0, image.Rect(10, 50, 10+50, 50+50), src, src.Bounds(), Src, nil)
+				ApproxBiLinear.Transform(dst1, f64.Aff3{
+					0.5, 0.0 + epsilon, 10,
+					0.0, 0.5, 50,
+				}, src, src.Bounds(), Src, nil)
+			}
+
+			differ := !bytes.Equal(dst0.Pix, dst1.Pix)
+			if epsilon > 1e-10 {
+				if !differ {
+					t.Errorf("%s yielded same pixels, want different pixels: epsilon=%v", op, epsilon)
+				}
+			} else {
+				if differ {
+					t.Errorf("%s yielded different pixels, want same pixels: epsilon=%v", op, epsilon)
+				}
+			}
+		}
+	}
+}
+
+func BenchmarkSimpleScaleCopy(b *testing.B) {
+	dst := image.NewRGBA(image.Rect(0, 0, 640, 480))
+	src := image.NewRGBA(image.Rect(0, 0, 400, 300))
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		ApproxBiLinear.Scale(dst, image.Rect(10, 20, 10+400, 20+300), src, src.Bounds(), Src, nil)
+	}
+}
+
+func BenchmarkSimpleTransformCopy(b *testing.B) {
+	dst := image.NewRGBA(image.Rect(0, 0, 640, 480))
+	src := image.NewRGBA(image.Rect(0, 0, 400, 300))
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		ApproxBiLinear.Transform(dst, f64.Aff3{
+			1, 0, 10,
+			0, 1, 20,
+		}, src, src.Bounds(), Src, nil)
+	}
+}
+
+func BenchmarkSimpleTransformScale(b *testing.B) {
+	dst := image.NewRGBA(image.Rect(0, 0, 640, 480))
+	src := image.NewRGBA(image.Rect(0, 0, 400, 300))
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		ApproxBiLinear.Transform(dst, f64.Aff3{
+			0.5, 0.0, 10,
+			0.0, 0.5, 20,
+		}, src, src.Bounds(), Src, nil)
+	}
+}
+
+func TestOps(t *testing.T) {
+	blue := image.NewUniform(color.RGBA{0x00, 0x00, 0xff, 0xff})
+	testCases := map[Op]color.RGBA{
+		Over: color.RGBA{0x7f, 0x00, 0x80, 0xff},
+		Src:  color.RGBA{0x7f, 0x00, 0x00, 0x7f},
+	}
+	for op, want := range testCases {
+		dst := image.NewRGBA(image.Rect(0, 0, 2, 2))
+		Copy(dst, image.Point{}, blue, dst.Bounds(), Src, nil)
+
+		src := image.NewRGBA(image.Rect(0, 0, 1, 1))
+		src.SetRGBA(0, 0, color.RGBA{0x7f, 0x00, 0x00, 0x7f})
+
+		NearestNeighbor.Scale(dst, dst.Bounds(), src, src.Bounds(), op, nil)
+
+		if got := dst.RGBAAt(0, 0); got != want {
+			t.Errorf("op=%v: got %v, want %v", op, got, want)
+		}
+	}
+}
+
+// TestNegativeWeights tests that scaling by a kernel that produces negative
+// weights, such as the Catmull-Rom kernel, doesn't produce an invalid color
+// according to Go's alpha-premultiplied model.
+func TestNegativeWeights(t *testing.T) {
+	check := func(m *image.RGBA) error {
+		b := m.Bounds()
+		for y := b.Min.Y; y < b.Max.Y; y++ {
+			for x := b.Min.X; x < b.Max.X; x++ {
+				if c := m.RGBAAt(x, y); c.R > c.A || c.G > c.A || c.B > c.A {
+					return fmt.Errorf("invalid color.RGBA at (%d, %d): %v", x, y, c)
+				}
+			}
+		}
+		return nil
+	}
+
+	src := image.NewRGBA(image.Rect(0, 0, 16, 16))
+	for y := 0; y < 16; y++ {
+		for x := 0; x < 16; x++ {
+			a := y * 0x11
+			src.Set(x, y, color.RGBA{
+				R: uint8(x * 0x11 * a / 0xff),
+				A: uint8(a),
+			})
+		}
+	}
+	if err := check(src); err != nil {
+		t.Fatalf("src image: %v", err)
+	}
+
+	dst := image.NewRGBA(image.Rect(0, 0, 32, 32))
+	CatmullRom.Scale(dst, dst.Bounds(), src, src.Bounds(), Over, nil)
+	if err := check(dst); err != nil {
+		t.Fatalf("dst image: %v", err)
+	}
+}
+
+func fillPix(r *rand.Rand, pixs ...[]byte) {
+	for _, pix := range pixs {
+		for i := range pix {
+			pix[i] = uint8(r.Intn(256))
+		}
+	}
+}
+
+func TestInterpClipCommute(t *testing.T) {
+	src := image.NewNRGBA(image.Rect(0, 0, 20, 20))
+	fillPix(rand.New(rand.NewSource(0)), src.Pix)
+
+	outer := image.Rect(1, 1, 8, 5)
+	inner := image.Rect(2, 3, 6, 5)
+	qs := []Interpolator{
+		NearestNeighbor,
+		ApproxBiLinear,
+		CatmullRom,
+	}
+	for _, transform := range []bool{false, true} {
+		for _, q := range qs {
+			dst0 := image.NewRGBA(image.Rect(1, 1, 10, 10))
+			dst1 := image.NewRGBA(image.Rect(1, 1, 10, 10))
+			for i := range dst0.Pix {
+				dst0.Pix[i] = uint8(i / 4)
+				dst1.Pix[i] = uint8(i / 4)
+			}
+
+			var interp func(dst *image.RGBA)
+			if transform {
+				interp = func(dst *image.RGBA) {
+					q.Transform(dst, transformMatrix(3.75, 2, 1), src, src.Bounds(), Over, nil)
+				}
+			} else {
+				interp = func(dst *image.RGBA) {
+					q.Scale(dst, outer, src, src.Bounds(), Over, nil)
+				}
+			}
+
+			// Interpolate then clip.
+			interp(dst0)
+			dst0 = dst0.SubImage(inner).(*image.RGBA)
+
+			// Clip then interpolate.
+			dst1 = dst1.SubImage(inner).(*image.RGBA)
+			interp(dst1)
+
+		loop:
+			for y := inner.Min.Y; y < inner.Max.Y; y++ {
+				for x := inner.Min.X; x < inner.Max.X; x++ {
+					if c0, c1 := dst0.RGBAAt(x, y), dst1.RGBAAt(x, y); c0 != c1 {
+						t.Errorf("q=%T: at (%d, %d): c0=%v, c1=%v", q, x, y, c0, c1)
+						break loop
+					}
+				}
+			}
+		}
+	}
+}
+
+// translatedImage is an image m translated by t.
+type translatedImage struct {
+	m image.Image
+	t image.Point
+}
+
+func (t *translatedImage) At(x, y int) color.Color { return t.m.At(x-t.t.X, y-t.t.Y) }
+func (t *translatedImage) Bounds() image.Rectangle { return t.m.Bounds().Add(t.t) }
+func (t *translatedImage) ColorModel() color.Model { return t.m.ColorModel() }
+
+// TestSrcTranslationInvariance tests that Scale and Transform are invariant
+// under src translations. Specifically, when some source pixels are not in the
+// bottom-right quadrant of src coordinate space, we consistently round down,
+// not round towards zero.
+func TestSrcTranslationInvariance(t *testing.T) {
+	f, err := os.Open("../testdata/testpattern.png")
+	if err != nil {
+		t.Fatalf("Open: %v", err)
+	}
+	defer f.Close()
+	src, _, err := image.Decode(f)
+	if err != nil {
+		t.Fatalf("Decode: %v", err)
+	}
+	sr := image.Rect(2, 3, 16, 12)
+	if !sr.In(src.Bounds()) {
+		t.Fatalf("src bounds too small: got %v", src.Bounds())
+	}
+	qs := []Interpolator{
+		NearestNeighbor,
+		ApproxBiLinear,
+		CatmullRom,
+	}
+	deltas := []image.Point{
+		{+0, +0},
+		{+0, +5},
+		{+0, -5},
+		{+5, +0},
+		{-5, +0},
+		{+8, +8},
+		{+8, -8},
+		{-8, +8},
+		{-8, -8},
+	}
+	m00 := transformMatrix(3.75, 0, 0)
+
+	for _, transform := range []bool{false, true} {
+		for _, q := range qs {
+			want := image.NewRGBA(image.Rect(0, 0, 20, 20))
+			if transform {
+				q.Transform(want, m00, src, sr, Over, nil)
+			} else {
+				q.Scale(want, want.Bounds(), src, sr, Over, nil)
+			}
+			for _, delta := range deltas {
+				tsrc := &translatedImage{src, delta}
+				got := image.NewRGBA(image.Rect(0, 0, 20, 20))
+				if transform {
+					m := matMul(&m00, &f64.Aff3{
+						1, 0, -float64(delta.X),
+						0, 1, -float64(delta.Y),
+					})
+					q.Transform(got, m, tsrc, sr.Add(delta), Over, nil)
+				} else {
+					q.Scale(got, got.Bounds(), tsrc, sr.Add(delta), Over, nil)
+				}
+				if !bytes.Equal(got.Pix, want.Pix) {
+					t.Errorf("pix differ for delta=%v, transform=%t, q=%T", delta, transform, q)
+				}
+			}
+		}
+	}
+}
+
+func TestSrcMask(t *testing.T) {
+	srcMask := image.NewRGBA(image.Rect(0, 0, 23, 1))
+	srcMask.SetRGBA(19, 0, color.RGBA{0x00, 0x00, 0x00, 0x7f})
+	srcMask.SetRGBA(20, 0, color.RGBA{0x00, 0x00, 0x00, 0xff})
+	srcMask.SetRGBA(21, 0, color.RGBA{0x00, 0x00, 0x00, 0x3f})
+	srcMask.SetRGBA(22, 0, color.RGBA{0x00, 0x00, 0x00, 0x00})
+	red := image.NewUniform(color.RGBA{0xff, 0x00, 0x00, 0xff})
+	blue := image.NewUniform(color.RGBA{0x00, 0x00, 0xff, 0xff})
+	dst := image.NewRGBA(image.Rect(0, 0, 6, 1))
+	Copy(dst, image.Point{}, blue, dst.Bounds(), Src, nil)
+	NearestNeighbor.Scale(dst, dst.Bounds(), red, image.Rect(0, 0, 3, 1), Over, &Options{
+		SrcMask:  srcMask,
+		SrcMaskP: image.Point{20, 0},
+	})
+	got := [6]color.RGBA{
+		dst.RGBAAt(0, 0),
+		dst.RGBAAt(1, 0),
+		dst.RGBAAt(2, 0),
+		dst.RGBAAt(3, 0),
+		dst.RGBAAt(4, 0),
+		dst.RGBAAt(5, 0),
+	}
+	want := [6]color.RGBA{
+		{0xff, 0x00, 0x00, 0xff},
+		{0xff, 0x00, 0x00, 0xff},
+		{0x3f, 0x00, 0xc0, 0xff},
+		{0x3f, 0x00, 0xc0, 0xff},
+		{0x00, 0x00, 0xff, 0xff},
+		{0x00, 0x00, 0xff, 0xff},
+	}
+	if got != want {
+		t.Errorf("\ngot  %v\nwant %v", got, want)
+	}
+}
+
+func TestDstMask(t *testing.T) {
+	dstMask := image.NewRGBA(image.Rect(0, 0, 23, 1))
+	dstMask.SetRGBA(19, 0, color.RGBA{0x00, 0x00, 0x00, 0x7f})
+	dstMask.SetRGBA(20, 0, color.RGBA{0x00, 0x00, 0x00, 0xff})
+	dstMask.SetRGBA(21, 0, color.RGBA{0x00, 0x00, 0x00, 0x3f})
+	dstMask.SetRGBA(22, 0, color.RGBA{0x00, 0x00, 0x00, 0x00})
+	red := image.NewRGBA(image.Rect(0, 0, 1, 1))
+	red.SetRGBA(0, 0, color.RGBA{0xff, 0x00, 0x00, 0xff})
+	blue := image.NewUniform(color.RGBA{0x00, 0x00, 0xff, 0xff})
+	qs := []Interpolator{
+		NearestNeighbor,
+		ApproxBiLinear,
+		CatmullRom,
+	}
+	for _, q := range qs {
+		dst := image.NewRGBA(image.Rect(0, 0, 3, 1))
+		Copy(dst, image.Point{}, blue, dst.Bounds(), Src, nil)
+		q.Scale(dst, dst.Bounds(), red, red.Bounds(), Over, &Options{
+			DstMask:  dstMask,
+			DstMaskP: image.Point{20, 0},
+		})
+		got := [3]color.RGBA{
+			dst.RGBAAt(0, 0),
+			dst.RGBAAt(1, 0),
+			dst.RGBAAt(2, 0),
+		}
+		want := [3]color.RGBA{
+			{0xff, 0x00, 0x00, 0xff},
+			{0x3f, 0x00, 0xc0, 0xff},
+			{0x00, 0x00, 0xff, 0xff},
+		}
+		if got != want {
+			t.Errorf("q=%T:\ngot  %v\nwant %v", q, got, want)
+		}
+	}
+}
+
+func TestRectDstMask(t *testing.T) {
+	f, err := os.Open("../testdata/testpattern.png")
+	if err != nil {
+		t.Fatalf("Open: %v", err)
+	}
+	defer f.Close()
+	src, _, err := image.Decode(f)
+	if err != nil {
+		t.Fatalf("Decode: %v", err)
+	}
+	m00 := transformMatrix(1, 0, 0)
+
+	bounds := image.Rect(0, 0, 50, 50)
+	dstOutside := image.NewRGBA(bounds)
+	for y := bounds.Min.Y; y < bounds.Max.Y; y++ {
+		for x := bounds.Min.X; x < bounds.Max.X; x++ {
+			dstOutside.SetRGBA(x, y, color.RGBA{uint8(5 * x), uint8(5 * y), 0x00, 0xff})
+		}
+	}
+
+	mk := func(q Transformer, dstMask image.Image, dstMaskP image.Point) *image.RGBA {
+		m := image.NewRGBA(bounds)
+		Copy(m, bounds.Min, dstOutside, bounds, Src, nil)
+		q.Transform(m, m00, src, src.Bounds(), Over, &Options{
+			DstMask:  dstMask,
+			DstMaskP: dstMaskP,
+		})
+		return m
+	}
+
+	qs := []Interpolator{
+		NearestNeighbor,
+		ApproxBiLinear,
+		CatmullRom,
+	}
+	dstMaskPs := []image.Point{
+		{0, 0},
+		{5, 7},
+		{-3, 0},
+	}
+	rect := image.Rect(10, 10, 30, 40)
+	for _, q := range qs {
+		for _, dstMaskP := range dstMaskPs {
+			dstInside := mk(q, nil, image.Point{})
+			for _, wrap := range []bool{false, true} {
+				// TODO: replace "rectImage(rect)" with "rect" once Go 1.5 is
+				// released, where an image.Rectangle implements image.Image.
+				dstMask := image.Image(rectImage(rect))
+				if wrap {
+					dstMask = srcWrapper{dstMask}
+				}
+				dst := mk(q, dstMask, dstMaskP)
+
+				nError := 0
+			loop:
+				for y := bounds.Min.Y; y < bounds.Max.Y; y++ {
+					for x := bounds.Min.X; x < bounds.Max.X; x++ {
+						which := dstOutside
+						if (image.Point{x, y}).Add(dstMaskP).In(rect) {
+							which = dstInside
+						}
+						if got, want := dst.RGBAAt(x, y), which.RGBAAt(x, y); got != want {
+							if nError == 10 {
+								t.Errorf("q=%T dmp=%v wrap=%v: ...and more errors", q, dstMaskP, wrap)
+								break loop
+							}
+							nError++
+							t.Errorf("q=%T dmp=%v wrap=%v: x=%3d y=%3d: got %v, want %v",
+								q, dstMaskP, wrap, x, y, got, want)
+						}
+					}
+				}
+			}
+		}
+	}
+}
+
+// TODO: delete this wrapper type once Go 1.5 is released, where an
+// image.Rectangle implements image.Image.
+type rectImage image.Rectangle
+
+func (r rectImage) ColorModel() color.Model { return color.Alpha16Model }
+func (r rectImage) Bounds() image.Rectangle { return image.Rectangle(r) }
+func (r rectImage) At(x, y int) color.Color {
+	if (image.Point{x, y}).In(image.Rectangle(r)) {
+		return color.Opaque
+	}
+	return color.Transparent
+}
+
+// The fooWrapper types wrap the dst or src image to avoid triggering the
+// type-specific fast path implementations.
+type (
+	dstWrapper struct{ Image }
+	srcWrapper struct{ image.Image }
+)
+
+func srcGray(boundsHint image.Rectangle) (image.Image, error) {
+	m := image.NewGray(boundsHint)
+	fillPix(rand.New(rand.NewSource(0)), m.Pix)
+	return m, nil
+}
+
+func srcNRGBA(boundsHint image.Rectangle) (image.Image, error) {
+	m := image.NewNRGBA(boundsHint)
+	fillPix(rand.New(rand.NewSource(1)), m.Pix)
+	return m, nil
+}
+
+func srcRGBA(boundsHint image.Rectangle) (image.Image, error) {
+	m := image.NewRGBA(boundsHint)
+	fillPix(rand.New(rand.NewSource(2)), m.Pix)
+	// RGBA is alpha-premultiplied, so the R, G and B values should
+	// be <= the A values.
+	for i := 0; i < len(m.Pix); i += 4 {
+		m.Pix[i+0] = uint8(uint32(m.Pix[i+0]) * uint32(m.Pix[i+3]) / 0xff)
+		m.Pix[i+1] = uint8(uint32(m.Pix[i+1]) * uint32(m.Pix[i+3]) / 0xff)
+		m.Pix[i+2] = uint8(uint32(m.Pix[i+2]) * uint32(m.Pix[i+3]) / 0xff)
+	}
+	return m, nil
+}
+
+func srcUnif(boundsHint image.Rectangle) (image.Image, error) {
+	return image.NewUniform(color.RGBA64{0x1234, 0x5555, 0x9181, 0xbeef}), nil
+}
+
+func srcYCbCr(boundsHint image.Rectangle) (image.Image, error) {
+	m := image.NewYCbCr(boundsHint, image.YCbCrSubsampleRatio420)
+	fillPix(rand.New(rand.NewSource(3)), m.Y, m.Cb, m.Cr)
+	return m, nil
+}
+
+func srcLarge(boundsHint image.Rectangle) (image.Image, error) {
+	// 3072 x 2304 is over 7 million pixels at 4:3, comparable to a
+	// 2015 smart-phone camera's output.
+	return srcYCbCr(image.Rect(0, 0, 3072, 2304))
+}
+
+func srcTux(boundsHint image.Rectangle) (image.Image, error) {
+	// tux.png is a 386 x 395 image.
+	f, err := os.Open("../testdata/tux.png")
+	if err != nil {
+		return nil, fmt.Errorf("Open: %v", err)
+	}
+	defer f.Close()
+	src, err := png.Decode(f)
+	if err != nil {
+		return nil, fmt.Errorf("Decode: %v", err)
+	}
+	return src, nil
+}
+
+func benchScale(b *testing.B, w int, h int, op Op, srcf func(image.Rectangle) (image.Image, error), q Interpolator) {
+	dst := image.NewRGBA(image.Rect(0, 0, w, h))
+	src, err := srcf(image.Rect(0, 0, 1024, 768))
+	if err != nil {
+		b.Fatal(err)
+	}
+	dr, sr := dst.Bounds(), src.Bounds()
+	scaler := Scaler(q)
+	if n, ok := q.(interface {
+		NewScaler(int, int, int, int) Scaler
+	}); ok {
+		scaler = n.NewScaler(dr.Dx(), dr.Dy(), sr.Dx(), sr.Dy())
+	}
+
+	b.ReportAllocs()
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		scaler.Scale(dst, dr, src, sr, op, nil)
+	}
+}
+
+func benchTform(b *testing.B, w int, h int, op Op, srcf func(image.Rectangle) (image.Image, error), q Interpolator) {
+	dst := image.NewRGBA(image.Rect(0, 0, w, h))
+	src, err := srcf(image.Rect(0, 0, 1024, 768))
+	if err != nil {
+		b.Fatal(err)
+	}
+	sr := src.Bounds()
+	m := transformMatrix(3.75, 40, 10)
+
+	b.ReportAllocs()
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		q.Transform(dst, m, src, sr, op, nil)
+	}
+}
+
+func BenchmarkScaleNNLargeDown(b *testing.B) { benchScale(b, 200, 150, Src, srcLarge, NearestNeighbor) }
+func BenchmarkScaleABLargeDown(b *testing.B) { benchScale(b, 200, 150, Src, srcLarge, ApproxBiLinear) }
+func BenchmarkScaleBLLargeDown(b *testing.B) { benchScale(b, 200, 150, Src, srcLarge, BiLinear) }
+func BenchmarkScaleCRLargeDown(b *testing.B) { benchScale(b, 200, 150, Src, srcLarge, CatmullRom) }
+
+func BenchmarkScaleNNDown(b *testing.B) { benchScale(b, 120, 80, Src, srcTux, NearestNeighbor) }
+func BenchmarkScaleABDown(b *testing.B) { benchScale(b, 120, 80, Src, srcTux, ApproxBiLinear) }
+func BenchmarkScaleBLDown(b *testing.B) { benchScale(b, 120, 80, Src, srcTux, BiLinear) }
+func BenchmarkScaleCRDown(b *testing.B) { benchScale(b, 120, 80, Src, srcTux, CatmullRom) }
+
+func BenchmarkScaleNNUp(b *testing.B) { benchScale(b, 800, 600, Src, srcTux, NearestNeighbor) }
+func BenchmarkScaleABUp(b *testing.B) { benchScale(b, 800, 600, Src, srcTux, ApproxBiLinear) }
+func BenchmarkScaleBLUp(b *testing.B) { benchScale(b, 800, 600, Src, srcTux, BiLinear) }
+func BenchmarkScaleCRUp(b *testing.B) { benchScale(b, 800, 600, Src, srcTux, CatmullRom) }
+
+func BenchmarkScaleNNSrcRGBA(b *testing.B) { benchScale(b, 200, 150, Src, srcRGBA, NearestNeighbor) }
+func BenchmarkScaleNNSrcUnif(b *testing.B) { benchScale(b, 200, 150, Src, srcUnif, NearestNeighbor) }
+
+func BenchmarkScaleNNOverRGBA(b *testing.B) { benchScale(b, 200, 150, Over, srcRGBA, NearestNeighbor) }
+func BenchmarkScaleNNOverUnif(b *testing.B) { benchScale(b, 200, 150, Over, srcUnif, NearestNeighbor) }
+
+func BenchmarkTformNNSrcRGBA(b *testing.B) { benchTform(b, 200, 150, Src, srcRGBA, NearestNeighbor) }
+func BenchmarkTformNNSrcUnif(b *testing.B) { benchTform(b, 200, 150, Src, srcUnif, NearestNeighbor) }
+
+func BenchmarkTformNNOverRGBA(b *testing.B) { benchTform(b, 200, 150, Over, srcRGBA, NearestNeighbor) }
+func BenchmarkTformNNOverUnif(b *testing.B) { benchTform(b, 200, 150, Over, srcUnif, NearestNeighbor) }
+
+func BenchmarkScaleABSrcGray(b *testing.B)  { benchScale(b, 200, 150, Src, srcGray, ApproxBiLinear) }
+func BenchmarkScaleABSrcNRGBA(b *testing.B) { benchScale(b, 200, 150, Src, srcNRGBA, ApproxBiLinear) }
+func BenchmarkScaleABSrcRGBA(b *testing.B)  { benchScale(b, 200, 150, Src, srcRGBA, ApproxBiLinear) }
+func BenchmarkScaleABSrcYCbCr(b *testing.B) { benchScale(b, 200, 150, Src, srcYCbCr, ApproxBiLinear) }
+
+func BenchmarkScaleABOverGray(b *testing.B)  { benchScale(b, 200, 150, Over, srcGray, ApproxBiLinear) }
+func BenchmarkScaleABOverNRGBA(b *testing.B) { benchScale(b, 200, 150, Over, srcNRGBA, ApproxBiLinear) }
+func BenchmarkScaleABOverRGBA(b *testing.B)  { benchScale(b, 200, 150, Over, srcRGBA, ApproxBiLinear) }
+func BenchmarkScaleABOverYCbCr(b *testing.B) { benchScale(b, 200, 150, Over, srcYCbCr, ApproxBiLinear) }
+
+func BenchmarkTformABSrcGray(b *testing.B)  { benchTform(b, 200, 150, Src, srcGray, ApproxBiLinear) }
+func BenchmarkTformABSrcNRGBA(b *testing.B) { benchTform(b, 200, 150, Src, srcNRGBA, ApproxBiLinear) }
+func BenchmarkTformABSrcRGBA(b *testing.B)  { benchTform(b, 200, 150, Src, srcRGBA, ApproxBiLinear) }
+func BenchmarkTformABSrcYCbCr(b *testing.B) { benchTform(b, 200, 150, Src, srcYCbCr, ApproxBiLinear) }
+
+func BenchmarkTformABOverGray(b *testing.B)  { benchTform(b, 200, 150, Over, srcGray, ApproxBiLinear) }
+func BenchmarkTformABOverNRGBA(b *testing.B) { benchTform(b, 200, 150, Over, srcNRGBA, ApproxBiLinear) }
+func BenchmarkTformABOverRGBA(b *testing.B)  { benchTform(b, 200, 150, Over, srcRGBA, ApproxBiLinear) }
+func BenchmarkTformABOverYCbCr(b *testing.B) { benchTform(b, 200, 150, Over, srcYCbCr, ApproxBiLinear) }
+
+func BenchmarkScaleCRSrcGray(b *testing.B)  { benchScale(b, 200, 150, Src, srcGray, CatmullRom) }
+func BenchmarkScaleCRSrcNRGBA(b *testing.B) { benchScale(b, 200, 150, Src, srcNRGBA, CatmullRom) }
+func BenchmarkScaleCRSrcRGBA(b *testing.B)  { benchScale(b, 200, 150, Src, srcRGBA, CatmullRom) }
+func BenchmarkScaleCRSrcYCbCr(b *testing.B) { benchScale(b, 200, 150, Src, srcYCbCr, CatmullRom) }
+
+func BenchmarkScaleCROverGray(b *testing.B)  { benchScale(b, 200, 150, Over, srcGray, CatmullRom) }
+func BenchmarkScaleCROverNRGBA(b *testing.B) { benchScale(b, 200, 150, Over, srcNRGBA, CatmullRom) }
+func BenchmarkScaleCROverRGBA(b *testing.B)  { benchScale(b, 200, 150, Over, srcRGBA, CatmullRom) }
+func BenchmarkScaleCROverYCbCr(b *testing.B) { benchScale(b, 200, 150, Over, srcYCbCr, CatmullRom) }
+
+func BenchmarkTformCRSrcGray(b *testing.B)  { benchTform(b, 200, 150, Src, srcGray, CatmullRom) }
+func BenchmarkTformCRSrcNRGBA(b *testing.B) { benchTform(b, 200, 150, Src, srcNRGBA, CatmullRom) }
+func BenchmarkTformCRSrcRGBA(b *testing.B)  { benchTform(b, 200, 150, Src, srcRGBA, CatmullRom) }
+func BenchmarkTformCRSrcYCbCr(b *testing.B) { benchTform(b, 200, 150, Src, srcYCbCr, CatmullRom) }
+
+func BenchmarkTformCROverGray(b *testing.B)  { benchTform(b, 200, 150, Over, srcGray, CatmullRom) }
+func BenchmarkTformCROverNRGBA(b *testing.B) { benchTform(b, 200, 150, Over, srcNRGBA, CatmullRom) }
+func BenchmarkTformCROverRGBA(b *testing.B)  { benchTform(b, 200, 150, Over, srcRGBA, CatmullRom) }
+func BenchmarkTformCROverYCbCr(b *testing.B) { benchTform(b, 200, 150, Over, srcYCbCr, CatmullRom) }
diff --git a/draw/stdlib_test.go b/draw/stdlib_test.go
new file mode 100644
index 0000000..c45f78c
--- /dev/null
+++ b/draw/stdlib_test.go
@@ -0,0 +1,96 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build go1.5
+
+package draw
+
+// This file contains tests that depend on the exact behavior of the
+// image/color package in the standard library. The color conversion formula
+// from YCbCr to RGBA changed between Go 1.4 and Go 1.5, so this file's tests
+// are only enabled for Go 1.5 and above.
+
+import (
+	"bytes"
+	"image"
+	"image/color"
+	"testing"
+)
+
+// TestFastPaths tests that the fast path implementations produce identical
+// results to the generic implementation.
+func TestFastPaths(t *testing.T) {
+	drs := []image.Rectangle{
+		image.Rect(0, 0, 10, 10),   // The dst bounds.
+		image.Rect(3, 4, 8, 6),     // A strict subset of the dst bounds.
+		image.Rect(-3, -5, 2, 4),   // Partial out-of-bounds #0.
+		image.Rect(4, -2, 6, 12),   // Partial out-of-bounds #1.
+		image.Rect(12, 14, 23, 45), // Complete out-of-bounds.
+		image.Rect(5, 5, 5, 5),     // Empty.
+	}
+	srs := []image.Rectangle{
+		image.Rect(0, 0, 12, 9),    // The src bounds.
+		image.Rect(2, 2, 10, 8),    // A strict subset of the src bounds.
+		image.Rect(10, 5, 20, 20),  // Partial out-of-bounds #0.
+		image.Rect(-40, 0, 40, 8),  // Partial out-of-bounds #1.
+		image.Rect(-8, -8, -4, -4), // Complete out-of-bounds.
+		image.Rect(5, 5, 5, 5),     // Empty.
+	}
+	srcfs := []func(image.Rectangle) (image.Image, error){
+		srcGray,
+		srcNRGBA,
+		srcRGBA,
+		srcUnif,
+		srcYCbCr,
+	}
+	var srcs []image.Image
+	for _, srcf := range srcfs {
+		src, err := srcf(srs[0])
+		if err != nil {
+			t.Fatal(err)
+		}
+		srcs = append(srcs, src)
+	}
+	qs := []Interpolator{
+		NearestNeighbor,
+		ApproxBiLinear,
+		CatmullRom,
+	}
+	ops := []Op{
+		Over,
+		Src,
+	}
+	blue := image.NewUniform(color.RGBA{0x11, 0x22, 0x44, 0x7f})
+
+	for _, dr := range drs {
+		for _, src := range srcs {
+			for _, sr := range srs {
+				for _, transform := range []bool{false, true} {
+					for _, q := range qs {
+						for _, op := range ops {
+							dst0 := image.NewRGBA(drs[0])
+							dst1 := image.NewRGBA(drs[0])
+							Draw(dst0, dst0.Bounds(), blue, image.Point{}, Src)
+							Draw(dstWrapper{dst1}, dst1.Bounds(), srcWrapper{blue}, image.Point{}, Src)
+
+							if transform {
+								m := transformMatrix(3.75, 2, 1)
+								q.Transform(dst0, m, src, sr, op, nil)
+								q.Transform(dstWrapper{dst1}, m, srcWrapper{src}, sr, op, nil)
+							} else {
+								q.Scale(dst0, dr, src, sr, op, nil)
+								q.Scale(dstWrapper{dst1}, dr, srcWrapper{src}, sr, op, nil)
+							}
+
+							if !bytes.Equal(dst0.Pix, dst1.Pix) {
+								t.Errorf("pix differ for dr=%v, src=%T, sr=%v, transform=%t, q=%T, op=%v",
+									dr, src, sr, transform, q, op) // op included: Over and Src take different paths
+							}
+						}
+					}
+				}
+			}
+		}
+	}
+}
diff --git a/math/f32/f32.go b/math/f32/f32.go
new file mode 100644
index 0000000..4ca1eb4
--- /dev/null
+++ b/math/f32/f32.go
@@ -0,0 +1,37 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package f32 implements float32 vector and matrix types.
+package f32 // import "golang.org/x/image/math/f32"
+
+// Vec2 is a 2-element vector.
+type Vec2 [2]float32
+
+// Vec3 is a 3-element vector.
+type Vec3 [3]float32
+
+// Vec4 is a 4-element vector.
+type Vec4 [4]float32
+
+// Mat3 is a 3x3 matrix in row major order.
+//
+// m[3*r + c] is the element in the r'th row and c'th column.
+type Mat3 [9]float32
+
+// Mat4 is a 4x4 matrix in row major order.
+//
+// m[4*r + c] is the element in the r'th row and c'th column.
+type Mat4 [16]float32
+
+// Aff3 is a 3x3 affine transformation matrix in row major order, where the
+// bottom row is implicitly [0 0 1].
+//
+// m[3*r + c] is the element in the r'th row and c'th column.
+type Aff3 [6]float32
+
+// Aff4 is a 4x4 affine transformation matrix in row major order, where the
+// bottom row is implicitly [0 0 0 1].
+//
+// m[4*r + c] is the element in the r'th row and c'th column.
+type Aff4 [12]float32
diff --git a/math/f64/f64.go b/math/f64/f64.go
new file mode 100644
index 0000000..a1f7fc0
--- /dev/null
+++ b/math/f64/f64.go
@@ -0,0 +1,37 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package f64 implements float64 vector and matrix types.
+package f64 // import "golang.org/x/image/math/f64"
+
+// Vec2 is a 2-element vector.
+type Vec2 [2]float64
+
+// Vec3 is a 3-element vector.
+type Vec3 [3]float64
+
+// Vec4 is a 4-element vector.
+type Vec4 [4]float64
+
+// Mat3 is a 3x3 matrix in row major order.
+//
+// m[3*r + c] is the element in the r'th row and c'th column.
+type Mat3 [9]float64
+
+// Mat4 is a 4x4 matrix in row major order.
+//
+// m[4*r + c] is the element in the r'th row and c'th column.
+type Mat4 [16]float64
+
+// Aff3 is a 3x3 affine transformation matrix in row major order, where the
+// bottom row is implicitly [0 0 1].
+//
+// m[3*r + c] is the element in the r'th row and c'th column.
+type Aff3 [6]float64
+
+// Aff4 is a 4x4 affine transformation matrix in row major order, where the
+// bottom row is implicitly [0 0 0 1].
+//
+// m[4*r + c] is the element in the r'th row and c'th column.
+type Aff4 [12]float64
diff --git a/math/fixed/fixed.go b/math/fixed/fixed.go
new file mode 100644
index 0000000..2f25d0e
--- /dev/null
+++ b/math/fixed/fixed.go
@@ -0,0 +1,172 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package fixed implements fixed-point integer types.
+package fixed // import "golang.org/x/image/math/fixed"
+
+import (
+	"fmt"
+)
+
+// TODO: implement fmt.Formatter for %f and %g.
+
+// I returns the integer value i as an Int26_6.
+//
+// For example, passing the integer value 2 yields Int26_6(128).
+func I(i int) Int26_6 {
+	return Int26_6(i << 6)
+}
+
+// Int26_6 is a signed 26.6 fixed-point number.
+//
+// The integer part ranges from -33554432 to 33554431, inclusive. The
+// fractional part has 6 bits of precision.
+//
+// For example, the number one-and-a-quarter is Int26_6(1<<6 + 1<<4).
+type Int26_6 int32
+
+// String returns a human-readable representation of a 26.6 fixed-point number.
+//
+// For example, the number one-and-a-quarter becomes "1:16".
+func (x Int26_6) String() string {
+	const shift, mask = 6, 1<<6 - 1
+	if x >= 0 {
+		return fmt.Sprintf("%d:%02d", int32(x>>shift), int32(x&mask))
+	}
+	x = -x
+	if x >= 0 {
+		return fmt.Sprintf("-%d:%02d", int32(x>>shift), int32(x&mask))
+	}
+	return "-33554432:00" // The minimum value is -(1<<25).
+}
+
+// Int52_12 is a signed 52.12 fixed-point number.
+//
+// The integer part ranges from -2251799813685248 to 2251799813685247,
+// inclusive. The fractional part has 12 bits of precision.
+//
+// For example, the number one-and-a-quarter is Int52_12(1<<12 + 1<<10).
+type Int52_12 int64
+
+// String returns a human-readable representation of a 52.12 fixed-point
+// number.
+//
+// For example, the number one-and-a-quarter becomes "1:1024".
+func (x Int52_12) String() string {
+	const shift, mask = 12, 1<<12 - 1
+	if x >= 0 {
+		return fmt.Sprintf("%d:%04d", int64(x>>shift), int64(x&mask))
+	}
+	x = -x
+	if x >= 0 {
+		return fmt.Sprintf("-%d:%04d", int64(x>>shift), int64(x&mask))
+	}
+	return "-2251799813685248:0000" // The minimum value is -(1<<51).
+}
+
+// P returns the integer values x and y as a Point26_6.
+//
+// For example, passing the integer values (2, -3) yields Point26_6{128, -192}.
+func P(x, y int) Point26_6 {
+	return Point26_6{Int26_6(x << 6), Int26_6(y << 6)}
+}
+
+// Point26_6 is a 26.6 fixed-point coordinate pair.
+//
+// It is analogous to the image.Point type in the standard library.
+type Point26_6 struct {
+	X, Y Int26_6
+}
+
+// Add returns the vector p+q.
+func (p Point26_6) Add(q Point26_6) Point26_6 {
+	return Point26_6{p.X + q.X, p.Y + q.Y}
+}
+
+// Sub returns the vector p-q.
+func (p Point26_6) Sub(q Point26_6) Point26_6 {
+	return Point26_6{p.X - q.X, p.Y - q.Y}
+}
+
+// Mul returns the vector p*k. The product is formed in 64 bits before rescaling.
+func (p Point26_6) Mul(k Int26_6) Point26_6 {
+	return Point26_6{Int26_6(int64(p.X) * int64(k) / 64), Int26_6(int64(p.Y) * int64(k) / 64)}
+}
+
+// Div returns the vector p/k. The scaled numerator is formed in 64 bits.
+func (p Point26_6) Div(k Int26_6) Point26_6 {
+	return Point26_6{Int26_6(int64(p.X) * 64 / int64(k)), Int26_6(int64(p.Y) * 64 / int64(k))}
+}
+
+// Point52_12 is a 52.12 fixed-point coordinate pair.
+//
+// It is analogous to the image.Point type in the standard library.
+type Point52_12 struct {
+	X, Y Int52_12
+}
+
+// Add returns the vector p+q.
+func (p Point52_12) Add(q Point52_12) Point52_12 {
+	return Point52_12{p.X + q.X, p.Y + q.Y}
+}
+
+// Sub returns the vector p-q.
+func (p Point52_12) Sub(q Point52_12) Point52_12 {
+	return Point52_12{p.X - q.X, p.Y - q.Y}
+}
+
+// Mul returns the vector p*k.
+func (p Point52_12) Mul(k Int52_12) Point52_12 {
+	return Point52_12{p.X * k / 4096, p.Y * k / 4096}
+}
+
+// Div returns the vector p/k.
+func (p Point52_12) Div(k Int52_12) Point52_12 {
+	return Point52_12{p.X * 4096 / k, p.Y * 4096 / k}
+}
+
+// R returns the integer values minX, minY, maxX, maxY as a Rectangle26_6.
+//
+// For example, passing the integer values (0, 1, 2, 3) yields
+// Rectangle26_6{Point26_6{0, 64}, Point26_6{128, 192}}.
+//
+// Like the image.Rect function in the standard library, the returned rectangle
+// has minimum and maximum coordinates swapped if necessary so that it is
+// well-formed.
+func R(minX, minY, maxX, maxY int) Rectangle26_6 {
+	if minX > maxX {
+		minX, maxX = maxX, minX
+	}
+	if minY > maxY {
+		minY, maxY = maxY, minY
+	}
+	return Rectangle26_6{
+		Point26_6{
+			Int26_6(minX << 6),
+			Int26_6(minY << 6),
+		},
+		Point26_6{
+			Int26_6(maxX << 6),
+			Int26_6(maxY << 6),
+		},
+	}
+}
+
+// Rectangle26_6 is a 26.6 fixed-point coordinate rectangle. The Min bound is
+// inclusive and the Max bound is exclusive. It is well-formed if Min.X <=
+// Max.X and likewise for Y.
+//
+// It is analogous to the image.Rectangle type in the standard library.
+type Rectangle26_6 struct {
+	Min, Max Point26_6
+}
+
+// Rectangle52_12 is a 52.12 fixed-point coordinate rectangle. The Min bound is
+// inclusive and the Max bound is exclusive. It is well-formed if Min.X <=
+// Max.X and likewise for Y.
+//
+// It is analogous to the image.Rectangle type in the standard library.
+type Rectangle52_12 struct {
+	Min, Max Point52_12
+}
diff --git a/math/fixed/fixed_test.go b/math/fixed/fixed_test.go
new file mode 100644
index 0000000..e252de7
--- /dev/null
+++ b/math/fixed/fixed_test.go
@@ -0,0 +1,25 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package fixed
+
+import (
+	"testing"
+)
+
+func TestInt26_6(t *testing.T) {
+	got := Int26_6(1<<6 + 1<<4).String()
+	want := "1:16"
+	if got != want {
+		t.Fatalf("got %q, want %q", got, want)
+	}
+}
+
+func TestInt52_12(t *testing.T) {
+	got := Int52_12(1<<12 + 1<<10).String()
+	want := "1:1024"
+	if got != want {
+		t.Fatalf("got %q, want %q", got, want)
+	}
+}
diff --git a/riff/example_test.go b/riff/example_test.go
new file mode 100644
index 0000000..93c72b0
--- /dev/null
+++ b/riff/example_test.go
@@ -0,0 +1,113 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package riff_test
+
+import (
+	"fmt"
+	"io"
+	"io/ioutil"
+	"log"
+	"strings"
+
+	"golang.org/x/image/riff"
+)
+
+func ExampleReader() {
+	formType, r, err := riff.NewReader(strings.NewReader(data))
+	if err != nil {
+		log.Fatal(err)
+	}
+	fmt.Printf("RIFF(%s)\n", formType)
+	if err := dump(r, ".\t"); err != nil {
+		log.Fatal(err)
+	}
+	// Output:
+	// RIFF(ROOT)
+	// .	ZERO ""
+	// .	ONE  "a"
+	// .	LIST(META)
+	// .	.	LIST(GOOD)
+	// .	.	.	ONE  "a"
+	// .	.	.	FIVE "klmno"
+	// .	.	ZERO ""
+	// .	.	LIST(BAD )
+	// .	.	.	THRE "def"
+	// .	TWO  "bc"
+	// .	LIST(UGLY)
+	// .	.	FOUR "ghij"
+	// .	.	SIX  "pqrstu"
+}
+
+func dump(r *riff.Reader, indent string) error {
+	for {
+		chunkID, chunkLen, chunkData, err := r.Next()
+		if err == io.EOF {
+			return nil
+		}
+		if err != nil {
+			return err
+		}
+		if chunkID == riff.LIST {
+			listType, list, err := riff.NewListReader(chunkLen, chunkData)
+			if err != nil {
+				return err
+			}
+			fmt.Printf("%sLIST(%s)\n", indent, listType)
+			if err := dump(list, indent+".\t"); err != nil {
+				return err
+			}
+			continue
+		}
+		b, err := ioutil.ReadAll(chunkData)
+		if err != nil {
+			return err
+		}
+		fmt.Printf("%s%s %q\n", indent, chunkID, b)
+	}
+}
+
+func encodeU32(u uint32) string {
+	return string([]byte{
+		byte(u >> 0),
+		byte(u >> 8),
+		byte(u >> 16),
+		byte(u >> 24),
+	})
+}
+
+func encode(chunkID, contents string) string {
+	n := len(contents)
+	if n&1 == 1 {
+		contents += "\x00"
+	}
+	return chunkID + encodeU32(uint32(n)) + contents
+}
+
+func encodeMulti(typ0, typ1 string, chunks ...string) string {
+	n := 4
+	for _, c := range chunks {
+		n += len(c)
+	}
+	s := typ0 + encodeU32(uint32(n)) + typ1
+	for _, c := range chunks {
+		s += c
+	}
+	return s
+}
+
+var (
+	d0   = encode("ZERO", "")
+	d1   = encode("ONE ", "a")
+	d2   = encode("TWO ", "bc")
+	d3   = encode("THRE", "def")
+	d4   = encode("FOUR", "ghij")
+	d5   = encode("FIVE", "klmno")
+	d6   = encode("SIX ", "pqrstu")
+	l0   = encodeMulti("LIST", "GOOD", d1, d5)
+	l1   = encodeMulti("LIST", "BAD ", d3)
+	l2   = encodeMulti("LIST", "UGLY", d4, d6)
+	l01  = encodeMulti("LIST", "META", l0, d0, l1)
+	data = encodeMulti("RIFF", "ROOT", d0, d1, l01, d2, l2)
+)
diff --git a/riff/riff.go b/riff/riff.go
new file mode 100644
index 0000000..9b9f71d
--- /dev/null
+++ b/riff/riff.go
@@ -0,0 +1,179 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package riff implements the Resource Interchange File Format, used by media
+// formats such as AVI, WAVE and WEBP.
+//
+// A RIFF stream contains a sequence of chunks. Each chunk consists of an 8-byte
+// header (containing a 4-byte chunk type and a 4-byte chunk length), the chunk
+// data (presented as an io.Reader), and some padding bytes.
+//
+// A detailed description of the format is at
+// http://www.tactilemedia.com/info/MCI_Control_Info.html
+package riff // import "golang.org/x/image/riff"
+
+import (
+	"errors"
+	"io"
+	"io/ioutil"
+	"math"
+)
+
+var (
+	errMissingPaddingByte     = errors.New("riff: missing padding byte")
+	errMissingRIFFChunkHeader = errors.New("riff: missing RIFF chunk header")
+	errShortChunkData         = errors.New("riff: short chunk data")
+	errShortChunkHeader       = errors.New("riff: short chunk header")
+	errStaleReader            = errors.New("riff: stale reader")
+)
+
+// u32 decodes the first four bytes of b as a little-endian integer.
+func u32(b []byte) uint32 {
+	return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
+}
+
+const chunkHeaderSize = 8
+
+// FourCC is a four character code.
+type FourCC [4]byte
+
+// LIST is the "LIST" FourCC.
+var LIST = FourCC{'L', 'I', 'S', 'T'}
+
+// NewReader returns the RIFF stream's form type, such as "AVI " or "WAVE", and
+// its chunks as a *Reader.
+func NewReader(r io.Reader) (formType FourCC, data *Reader, err error) {
+	var buf [chunkHeaderSize]byte
+	if _, err := io.ReadFull(r, buf[:]); err != nil {
+		if err == io.EOF || err == io.ErrUnexpectedEOF {
+			err = errMissingRIFFChunkHeader
+		}
+		return FourCC{}, nil, err
+	}
+	if buf[0] != 'R' || buf[1] != 'I' || buf[2] != 'F' || buf[3] != 'F' {
+		return FourCC{}, nil, errMissingRIFFChunkHeader
+	}
+	return NewListReader(u32(buf[4:]), r)
+}
+
+// NewListReader returns a LIST chunk's list type, such as "movi" or "wavl",
+// and its chunks as a *Reader.
+func NewListReader(chunkLen uint32, chunkData io.Reader) (listType FourCC, data *Reader, err error) {
+	if chunkLen < 4 {
+		return FourCC{}, nil, errShortChunkData
+	}
+	z := &Reader{r: chunkData}
+	if _, err := io.ReadFull(chunkData, z.buf[:4]); err != nil {
+		if err == io.EOF || err == io.ErrUnexpectedEOF {
+			err = errShortChunkData
+		}
+		return FourCC{}, nil, err
+	}
+	z.totalLen = chunkLen - 4
+	return FourCC{z.buf[0], z.buf[1], z.buf[2], z.buf[3]}, z, nil
+}
+
+// Reader reads chunks from an underlying io.Reader.
+type Reader struct {
+	r   io.Reader
+	err error
+
+	totalLen uint32
+	chunkLen uint32
+
+	chunkReader *chunkReader
+	buf         [chunkHeaderSize]byte
+	padded      bool
+}
+
+// Next returns the next chunk's ID, length and data. It returns io.EOF if there
+// are no more chunks. The io.Reader returned becomes stale after the next Next
+// call, and should no longer be used. Any error, io.EOF included, is sticky.
+//
+// It is valid to call Next even if all of the previous chunk's data has not
+// been read; the unread remainder is discarded.
+func (z *Reader) Next() (chunkID FourCC, chunkLen uint32, chunkData io.Reader, err error) {
+	if z.err != nil {
+		return FourCC{}, 0, nil, z.err
+	}
+
+	// Drain the rest of the previous chunk.
+	if z.chunkLen != 0 {
+		_, z.err = io.Copy(ioutil.Discard, z.chunkReader)
+		if z.err != nil {
+			return FourCC{}, 0, nil, z.err
+		}
+	}
+	z.chunkReader = nil
+	if z.padded {
+		_, z.err = io.ReadFull(z.r, z.buf[:1])
+		if z.err != nil {
+			if z.err == io.EOF {
+				z.err = errMissingPaddingByte
+			}
+			return FourCC{}, 0, nil, z.err
+		}
+		z.totalLen--
+	}
+
+	// We are done if we have no more data.
+	if z.totalLen == 0 {
+		z.err = io.EOF
+		return FourCC{}, 0, nil, z.err
+	}
+
+	// Read the next chunk header, keeping any read error in z.err.
+	if z.totalLen < chunkHeaderSize {
+		z.err = errShortChunkHeader
+		return FourCC{}, 0, nil, z.err
+	}
+	z.totalLen -= chunkHeaderSize
+	if _, z.err = io.ReadFull(z.r, z.buf[:chunkHeaderSize]); z.err != nil {
+		if z.err == io.EOF || z.err == io.ErrUnexpectedEOF {
+			z.err = errShortChunkHeader
+		}
+		return FourCC{}, 0, nil, z.err
+	}
+	chunkID = FourCC{z.buf[0], z.buf[1], z.buf[2], z.buf[3]}
+	z.chunkLen = u32(z.buf[4:])
+	z.padded = z.chunkLen&1 == 1
+	z.chunkReader = &chunkReader{z}
+	return chunkID, z.chunkLen, z.chunkReader, nil
+}
+
+type chunkReader struct {
+	z *Reader
+}
+
+func (c *chunkReader) Read(p []byte) (int, error) {
+	if c != c.z.chunkReader {
+		return 0, errStaleReader
+	}
+	z := c.z
+	if z.err != nil {
+		if z.err == io.EOF {
+			return 0, errStaleReader
+		}
+		return 0, z.err
+	}
+
+	n := int(z.chunkLen)
+	if n == 0 {
+		return 0, io.EOF
+	}
+	if n < 0 {
+		// Converting uint32 to int overflowed.
+		n = math.MaxInt32
+	}
+	if n > len(p) {
+		n = len(p)
+	}
+	n, err := z.r.Read(p[:n])
+	z.totalLen -= uint32(n)
+	z.chunkLen -= uint32(n)
+	if err != io.EOF {
+		z.err = err
+	}
+	return n, err
+}
diff --git a/testdata/blue-purple-pink-large.lossless.webp b/testdata/blue-purple-pink-large.lossless.webp
new file mode 100644
index 0000000..d00c81f
--- /dev/null
+++ b/testdata/blue-purple-pink-large.lossless.webp
Binary files differ
diff --git a/testdata/blue-purple-pink-large.no-filter.lossy.webp b/testdata/blue-purple-pink-large.no-filter.lossy.webp
new file mode 100644
index 0000000..9067f4d
--- /dev/null
+++ b/testdata/blue-purple-pink-large.no-filter.lossy.webp
Binary files differ
diff --git a/testdata/blue-purple-pink-large.no-filter.lossy.webp.ycbcr.png b/testdata/blue-purple-pink-large.no-filter.lossy.webp.ycbcr.png
new file mode 100644
index 0000000..2e32c28
--- /dev/null
+++ b/testdata/blue-purple-pink-large.no-filter.lossy.webp.ycbcr.png
Binary files differ
diff --git a/testdata/blue-purple-pink-large.normal-filter.lossy.webp b/testdata/blue-purple-pink-large.normal-filter.lossy.webp
new file mode 100644
index 0000000..a4ccc1a
--- /dev/null
+++ b/testdata/blue-purple-pink-large.normal-filter.lossy.webp
Binary files differ
diff --git a/testdata/blue-purple-pink-large.normal-filter.lossy.webp.ycbcr.png b/testdata/blue-purple-pink-large.normal-filter.lossy.webp.ycbcr.png
new file mode 100644
index 0000000..5f7ec42
--- /dev/null
+++ b/testdata/blue-purple-pink-large.normal-filter.lossy.webp.ycbcr.png
Binary files differ
diff --git a/testdata/blue-purple-pink-large.png b/testdata/blue-purple-pink-large.png
new file mode 100644
index 0000000..9755505
--- /dev/null
+++ b/testdata/blue-purple-pink-large.png
Binary files differ
diff --git a/testdata/blue-purple-pink-large.simple-filter.lossy.webp b/testdata/blue-purple-pink-large.simple-filter.lossy.webp
new file mode 100644
index 0000000..09fdb94
--- /dev/null
+++ b/testdata/blue-purple-pink-large.simple-filter.lossy.webp
Binary files differ
diff --git a/testdata/blue-purple-pink-large.simple-filter.lossy.webp.ycbcr.png b/testdata/blue-purple-pink-large.simple-filter.lossy.webp.ycbcr.png
new file mode 100644
index 0000000..946b3af
--- /dev/null
+++ b/testdata/blue-purple-pink-large.simple-filter.lossy.webp.ycbcr.png
Binary files differ
diff --git a/testdata/blue-purple-pink.lossless.webp b/testdata/blue-purple-pink.lossless.webp
new file mode 100644
index 0000000..b16a50d
--- /dev/null
+++ b/testdata/blue-purple-pink.lossless.webp
Binary files differ
diff --git a/testdata/blue-purple-pink.lossy.webp b/testdata/blue-purple-pink.lossy.webp
new file mode 100644
index 0000000..d5143c0
--- /dev/null
+++ b/testdata/blue-purple-pink.lossy.webp
Binary files differ
diff --git a/testdata/blue-purple-pink.lossy.webp.ycbcr.png b/testdata/blue-purple-pink.lossy.webp.ycbcr.png
new file mode 100644
index 0000000..eb51560
--- /dev/null
+++ b/testdata/blue-purple-pink.lossy.webp.ycbcr.png
Binary files differ
diff --git a/testdata/blue-purple-pink.lzwcompressed.tiff b/testdata/blue-purple-pink.lzwcompressed.tiff
new file mode 100644
index 0000000..5978f7a
--- /dev/null
+++ b/testdata/blue-purple-pink.lzwcompressed.tiff
Binary files differ
diff --git a/testdata/blue-purple-pink.png b/testdata/blue-purple-pink.png
new file mode 100644
index 0000000..d4fbf6b
--- /dev/null
+++ b/testdata/blue-purple-pink.png
Binary files differ
diff --git a/testdata/bw-deflate.tiff b/testdata/bw-deflate.tiff
new file mode 100644
index 0000000..137a0c3
--- /dev/null
+++ b/testdata/bw-deflate.tiff
Binary files differ
diff --git a/testdata/bw-packbits.tiff b/testdata/bw-packbits.tiff
new file mode 100644
index 0000000..d59fa4a
--- /dev/null
+++ b/testdata/bw-packbits.tiff
Binary files differ
diff --git a/testdata/bw-uncompressed.tiff b/testdata/bw-uncompressed.tiff
new file mode 100644
index 0000000..8390f11
--- /dev/null
+++ b/testdata/bw-uncompressed.tiff
Binary files differ
diff --git a/testdata/go-turns-two-14x18.png b/testdata/go-turns-two-14x18.png
new file mode 100644
index 0000000..b6494b6
--- /dev/null
+++ b/testdata/go-turns-two-14x18.png
Binary files differ
diff --git a/testdata/go-turns-two-280x360.jpeg b/testdata/go-turns-two-280x360.jpeg
new file mode 100644
index 0000000..b56e492
--- /dev/null
+++ b/testdata/go-turns-two-280x360.jpeg
Binary files differ
diff --git a/testdata/go-turns-two-down-ab.png b/testdata/go-turns-two-down-ab.png
new file mode 100644
index 0000000..317c3af
--- /dev/null
+++ b/testdata/go-turns-two-down-ab.png
Binary files differ
diff --git a/testdata/go-turns-two-down-bl.png b/testdata/go-turns-two-down-bl.png
new file mode 100644
index 0000000..597d362
--- /dev/null
+++ b/testdata/go-turns-two-down-bl.png
Binary files differ
diff --git a/testdata/go-turns-two-down-cr.png b/testdata/go-turns-two-down-cr.png
new file mode 100644
index 0000000..ad1c20a
--- /dev/null
+++ b/testdata/go-turns-two-down-cr.png
Binary files differ
diff --git a/testdata/go-turns-two-down-nn.png b/testdata/go-turns-two-down-nn.png
new file mode 100644
index 0000000..166841a
--- /dev/null
+++ b/testdata/go-turns-two-down-nn.png
Binary files differ
diff --git a/testdata/go-turns-two-rotate-ab.png b/testdata/go-turns-two-rotate-ab.png
new file mode 100644
index 0000000..04fceaa
--- /dev/null
+++ b/testdata/go-turns-two-rotate-ab.png
Binary files differ
diff --git a/testdata/go-turns-two-rotate-bl.png b/testdata/go-turns-two-rotate-bl.png
new file mode 100644
index 0000000..c8b717e
--- /dev/null
+++ b/testdata/go-turns-two-rotate-bl.png
Binary files differ
diff --git a/testdata/go-turns-two-rotate-cr.png b/testdata/go-turns-two-rotate-cr.png
new file mode 100644
index 0000000..7e5cd9f
--- /dev/null
+++ b/testdata/go-turns-two-rotate-cr.png
Binary files differ
diff --git a/testdata/go-turns-two-rotate-nn.png b/testdata/go-turns-two-rotate-nn.png
new file mode 100644
index 0000000..702c863
--- /dev/null
+++ b/testdata/go-turns-two-rotate-nn.png
Binary files differ
diff --git a/testdata/go-turns-two-up-ab.png b/testdata/go-turns-two-up-ab.png
new file mode 100644
index 0000000..072446d
--- /dev/null
+++ b/testdata/go-turns-two-up-ab.png
Binary files differ
diff --git a/testdata/go-turns-two-up-bl.png b/testdata/go-turns-two-up-bl.png
new file mode 100644
index 0000000..c1bf630
--- /dev/null
+++ b/testdata/go-turns-two-up-bl.png
Binary files differ
diff --git a/testdata/go-turns-two-up-cr.png b/testdata/go-turns-two-up-cr.png
new file mode 100644
index 0000000..0ac8300
--- /dev/null
+++ b/testdata/go-turns-two-up-cr.png
Binary files differ
diff --git a/testdata/go-turns-two-up-nn.png b/testdata/go-turns-two-up-nn.png
new file mode 100644
index 0000000..eb63cb9
--- /dev/null
+++ b/testdata/go-turns-two-up-nn.png
Binary files differ
diff --git a/testdata/gopher-doc.1bpp.lossless.webp b/testdata/gopher-doc.1bpp.lossless.webp
new file mode 100644
index 0000000..fcca028
--- /dev/null
+++ b/testdata/gopher-doc.1bpp.lossless.webp
Binary files differ
diff --git a/testdata/gopher-doc.1bpp.png b/testdata/gopher-doc.1bpp.png
new file mode 100644
index 0000000..9c5bb64
--- /dev/null
+++ b/testdata/gopher-doc.1bpp.png
Binary files differ
diff --git a/testdata/gopher-doc.2bpp.lossless.webp b/testdata/gopher-doc.2bpp.lossless.webp
new file mode 100644
index 0000000..d683d47
--- /dev/null
+++ b/testdata/gopher-doc.2bpp.lossless.webp
Binary files differ
diff --git a/testdata/gopher-doc.2bpp.png b/testdata/gopher-doc.2bpp.png
new file mode 100644
index 0000000..af96769
--- /dev/null
+++ b/testdata/gopher-doc.2bpp.png
Binary files differ
diff --git a/testdata/gopher-doc.4bpp.lossless.webp b/testdata/gopher-doc.4bpp.lossless.webp
new file mode 100644
index 0000000..11d8ef1
--- /dev/null
+++ b/testdata/gopher-doc.4bpp.lossless.webp
Binary files differ
diff --git a/testdata/gopher-doc.4bpp.png b/testdata/gopher-doc.4bpp.png
new file mode 100644
index 0000000..fc18137
--- /dev/null
+++ b/testdata/gopher-doc.4bpp.png
Binary files differ
diff --git a/testdata/gopher-doc.8bpp.lossless.webp b/testdata/gopher-doc.8bpp.lossless.webp
new file mode 100644
index 0000000..b6468e9
--- /dev/null
+++ b/testdata/gopher-doc.8bpp.lossless.webp
Binary files differ
diff --git a/testdata/gopher-doc.8bpp.png b/testdata/gopher-doc.8bpp.png
new file mode 100644
index 0000000..b877c54
--- /dev/null
+++ b/testdata/gopher-doc.8bpp.png
Binary files differ
diff --git a/testdata/no_compress.tiff b/testdata/no_compress.tiff
new file mode 100644
index 0000000..3f72b29
--- /dev/null
+++ b/testdata/no_compress.tiff
Binary files differ
diff --git a/testdata/no_rps.tiff b/testdata/no_rps.tiff
new file mode 100644
index 0000000..3280cf8
--- /dev/null
+++ b/testdata/no_rps.tiff
Binary files differ
diff --git a/testdata/testpattern.png b/testdata/testpattern.png
new file mode 100644
index 0000000..ec87bb5
--- /dev/null
+++ b/testdata/testpattern.png
Binary files differ
diff --git a/testdata/tux-rotate-ab.png b/testdata/tux-rotate-ab.png
new file mode 100644
index 0000000..181966c
--- /dev/null
+++ b/testdata/tux-rotate-ab.png
Binary files differ
diff --git a/testdata/tux-rotate-bl.png b/testdata/tux-rotate-bl.png
new file mode 100644
index 0000000..af3f4b0
--- /dev/null
+++ b/testdata/tux-rotate-bl.png
Binary files differ
diff --git a/testdata/tux-rotate-cr.png b/testdata/tux-rotate-cr.png
new file mode 100644
index 0000000..e5cff31
--- /dev/null
+++ b/testdata/tux-rotate-cr.png
Binary files differ
diff --git a/testdata/tux-rotate-nn.png b/testdata/tux-rotate-nn.png
new file mode 100644
index 0000000..c775c61
--- /dev/null
+++ b/testdata/tux-rotate-nn.png
Binary files differ
diff --git a/testdata/tux.lossless.webp b/testdata/tux.lossless.webp
new file mode 100644
index 0000000..3b32c02
--- /dev/null
+++ b/testdata/tux.lossless.webp
Binary files differ
diff --git a/testdata/tux.png b/testdata/tux.png
new file mode 100644
index 0000000..2567fe7
--- /dev/null
+++ b/testdata/tux.png
Binary files differ
diff --git a/testdata/video-001-16bit.tiff b/testdata/video-001-16bit.tiff
new file mode 100644
index 0000000..3b05ef0
--- /dev/null
+++ b/testdata/video-001-16bit.tiff
Binary files differ
diff --git a/testdata/video-001-gray-16bit.tiff b/testdata/video-001-gray-16bit.tiff
new file mode 100644
index 0000000..356882a
--- /dev/null
+++ b/testdata/video-001-gray-16bit.tiff
Binary files differ
diff --git a/testdata/video-001-gray.tiff b/testdata/video-001-gray.tiff
new file mode 100644
index 0000000..38fc9d2
--- /dev/null
+++ b/testdata/video-001-gray.tiff
Binary files differ
diff --git a/testdata/video-001-paletted.tiff b/testdata/video-001-paletted.tiff
new file mode 100644
index 0000000..5db84bc
--- /dev/null
+++ b/testdata/video-001-paletted.tiff
Binary files differ
diff --git a/testdata/video-001-strip-64.tiff b/testdata/video-001-strip-64.tiff
new file mode 100644
index 0000000..9cf6c32
--- /dev/null
+++ b/testdata/video-001-strip-64.tiff
Binary files differ
diff --git a/testdata/video-001-tile-64x64.tiff b/testdata/video-001-tile-64x64.tiff
new file mode 100644
index 0000000..fa56713
--- /dev/null
+++ b/testdata/video-001-tile-64x64.tiff
Binary files differ
diff --git a/testdata/video-001-uncompressed.tiff b/testdata/video-001-uncompressed.tiff
new file mode 100644
index 0000000..fad1471
--- /dev/null
+++ b/testdata/video-001-uncompressed.tiff
Binary files differ
diff --git a/testdata/video-001.bmp b/testdata/video-001.bmp
new file mode 100644
index 0000000..ca3dd42
--- /dev/null
+++ b/testdata/video-001.bmp
Binary files differ
diff --git a/testdata/video-001.lossy.webp b/testdata/video-001.lossy.webp
new file mode 100644
index 0000000..302198e
--- /dev/null
+++ b/testdata/video-001.lossy.webp
Binary files differ
diff --git a/testdata/video-001.lossy.webp.ycbcr.png b/testdata/video-001.lossy.webp.ycbcr.png
new file mode 100644
index 0000000..dc5f8cf
--- /dev/null
+++ b/testdata/video-001.lossy.webp.ycbcr.png
Binary files differ
diff --git a/testdata/video-001.png b/testdata/video-001.png
new file mode 100644
index 0000000..d3468bb
--- /dev/null
+++ b/testdata/video-001.png
Binary files differ
diff --git a/testdata/video-001.tiff b/testdata/video-001.tiff
new file mode 100644
index 0000000..0dd6cd9
--- /dev/null
+++ b/testdata/video-001.tiff
Binary files differ
diff --git a/testdata/yellow_rose-small.bmp b/testdata/yellow_rose-small.bmp
new file mode 100644
index 0000000..866fc7a
--- /dev/null
+++ b/testdata/yellow_rose-small.bmp
Binary files differ
diff --git a/testdata/yellow_rose-small.png b/testdata/yellow_rose-small.png
new file mode 100644
index 0000000..772c239
--- /dev/null
+++ b/testdata/yellow_rose-small.png
Binary files differ
diff --git a/testdata/yellow_rose.lossless.webp b/testdata/yellow_rose.lossless.webp
new file mode 100644
index 0000000..0c028f4
--- /dev/null
+++ b/testdata/yellow_rose.lossless.webp
Binary files differ
diff --git a/testdata/yellow_rose.lossy-with-alpha.webp b/testdata/yellow_rose.lossy-with-alpha.webp
new file mode 100644
index 0000000..64d3b5d
--- /dev/null
+++ b/testdata/yellow_rose.lossy-with-alpha.webp
Binary files differ
diff --git a/testdata/yellow_rose.lossy-with-alpha.webp.nycbcra.png b/testdata/yellow_rose.lossy-with-alpha.webp.nycbcra.png
new file mode 100644
index 0000000..4445315
--- /dev/null
+++ b/testdata/yellow_rose.lossy-with-alpha.webp.nycbcra.png
Binary files differ
diff --git a/testdata/yellow_rose.lossy.webp b/testdata/yellow_rose.lossy.webp
new file mode 100644
index 0000000..57a845e
--- /dev/null
+++ b/testdata/yellow_rose.lossy.webp
Binary files differ
diff --git a/testdata/yellow_rose.lossy.webp.ycbcr.png b/testdata/yellow_rose.lossy.webp.ycbcr.png
new file mode 100644
index 0000000..5e3bcd8
--- /dev/null
+++ b/testdata/yellow_rose.lossy.webp.ycbcr.png
Binary files differ
diff --git a/testdata/yellow_rose.png b/testdata/yellow_rose.png
new file mode 100644
index 0000000..bbaefa8
--- /dev/null
+++ b/testdata/yellow_rose.png
Binary files differ
diff --git a/tiff/buffer.go b/tiff/buffer.go
new file mode 100644
index 0000000..d1801be
--- /dev/null
+++ b/tiff/buffer.go
@@ -0,0 +1,93 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package tiff
+
+import "io"
+
+// buffer buffers an io.Reader to satisfy io.ReaderAt.
+// Every byte read from r is kept in buf, so offsets passed to ReadAt and
+// Slice are absolute positions in the stream.
+type buffer struct {
+	r   io.Reader
+	buf []byte
+}
+
+// fill reads data from b.r until the buffer contains at least end bytes.
+// If b.r cannot supply them, the buffer is truncated to the bytes that were
+// actually read and the error from io.ReadFull is returned.
+func (b *buffer) fill(end int) error {
+	m := len(b.buf)
+	if end > m {
+		if end > cap(b.buf) {
+			newcap := 1024
+			for newcap < end {
+				newcap *= 2
+			}
+			newbuf := make([]byte, end, newcap)
+			copy(newbuf, b.buf)
+			b.buf = newbuf
+		} else {
+			b.buf = b.buf[:end]
+		}
+		if n, err := io.ReadFull(b.r, b.buf[m:end]); err != nil {
+			end = m + n
+			b.buf = b.buf[:end]
+			return err
+		}
+	}
+	return nil
+}
+
+// ReadAt implements io.ReaderAt. It buffers the stream up to off+len(p) and
+// copies the requested range into p. If the stream ends early, only the
+// bytes that are actually available are copied and counted, and the error
+// from fill is returned with them.
+func (b *buffer) ReadAt(p []byte, off int64) (int, error) {
+	o := int(off)
+	end := o + len(p)
+	// Reject negative offsets and ranges that do not fit in an int.
+	if off < 0 || end < o || int64(end) != off+int64(len(p)) {
+		return 0, io.ErrUnexpectedEOF
+	}
+
+	err := b.fill(end)
+	// After a short read len(b.buf) < end. Clamp the range so that stale
+	// bytes past the end of the stream are never reported as data.
+	if end > len(b.buf) {
+		end = len(b.buf)
+	}
+	if o > end {
+		o = end
+	}
+	return copy(p, b.buf[o:end]), err
+}
+
+// Slice returns a slice of the underlying buffer. The slice contains
+// n bytes starting at offset off. The result shares memory with the
+// buffer; it is not a copy.
+func (b *buffer) Slice(off, n int) ([]byte, error) {
+	end := off + n
+	// Reject negative arguments and overflow instead of panicking below.
+	if off < 0 || n < 0 || end < off {
+		return nil, io.ErrUnexpectedEOF
+	}
+	if err := b.fill(end); err != nil {
+		return nil, err
+	}
+	return b.buf[off:end], nil
+}
+
+// newReaderAt converts an io.Reader into an io.ReaderAt.
+// A reader that already implements io.ReaderAt is returned unchanged;
+// any other reader is wrapped in a buffer.
+func newReaderAt(r io.Reader) io.ReaderAt {
+	if ra, ok := r.(io.ReaderAt); ok {
+		return ra
+	}
+	return &buffer{
+		r:   r,
+		buf: make([]byte, 0, 1024),
+	}
+}
diff --git a/tiff/buffer_test.go b/tiff/buffer_test.go
new file mode 100644
index 0000000..e13afb3
--- /dev/null
+++ b/tiff/buffer_test.go
@@ -0,0 +1,36 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package tiff
+
+import (
+	"io"
+	"strings"
+	"testing"
+)
+
+var readAtTests = []struct {
+	n   int
+	off int64
+	s   string
+	err error
+}{
+	{2, 0, "ab", nil},
+	{6, 0, "abcdef", nil},
+	{3, 3, "def", nil},
+	{3, 5, "f", io.EOF},
+	{3, 6, "", io.EOF},
+}
+
+func TestReadAt(t *testing.T) {
+	r := newReaderAt(strings.NewReader("abcdef"))
+	b := make([]byte, 10)
+	for _, test := range readAtTests {
+		n, err := r.ReadAt(b[:test.n], test.off)
+		s := string(b[:n])
+		if s != test.s || err != test.err {
+			t.Errorf("buffer.ReadAt(<%v bytes>, %v): got %v, %q; want %v, %q", test.n, test.off, err, s, test.err, test.s)
+		}
+	}
+}
diff --git a/tiff/compress.go b/tiff/compress.go
new file mode 100644
index 0000000..3f176f0
--- /dev/null
+++ b/tiff/compress.go
@@ -0,0 +1,58 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package tiff
+
+import (
+	"bufio"
+	"io"
+)
+
+type byteReader interface {
+	io.Reader
+	io.ByteReader
+}
+
+// unpackBits decodes the PackBits-compressed data read from r and returns
+// the uncompressed data.
+//
+// The PackBits compression format is described in section 9 (p. 42)
+// of the TIFF spec.
+func unpackBits(r io.Reader) ([]byte, error) {
+	buf := make([]byte, 128)
+	dst := make([]byte, 0, 1024)
+	br, ok := r.(byteReader)
+	if !ok {
+		br = bufio.NewReader(r)
+	}
+
+	for {
+		b, err := br.ReadByte()
+		if err != nil {
+			if err == io.EOF {
+				return dst, nil
+			}
+			return nil, err
+		}
+		code := int(int8(b))
+		switch {
+		case code >= 0:
+			n, err := io.ReadFull(br, buf[:code+1])
+			if err != nil {
+				return nil, err
+			}
+			dst = append(dst, buf[:n]...)
+		case code == -128:
+			// No-op.
+		default:
+			if b, err = br.ReadByte(); err != nil {
+				return nil, err
+			}
+			for j := 0; j < 1-code; j++ {
+				buf[j] = b
+			}
+			dst = append(dst, buf[:1-code]...)
+		}
+	}
+}
diff --git a/tiff/consts.go b/tiff/consts.go
new file mode 100644
index 0000000..3c51a70
--- /dev/null
+++ b/tiff/consts.go
@@ -0,0 +1,133 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package tiff
+
+// A tiff image file contains one or more images. The metadata
+// of each image is contained in an Image File Directory (IFD),
+// which contains entries of 12 bytes each and is described
+// on pages 14-16 of the specification. An IFD entry consists of
+//
+//  - a tag, which describes the meaning of the entry,
+//  - the data type and length of the entry,
+//  - the data itself or a pointer to it if it is more than 4 bytes.
+//
+// The presence of a length means that each IFD is effectively an array.
+
+const (
+	leHeader = "II\x2A\x00" // Header for little-endian files.
+	beHeader = "MM\x00\x2A" // Header for big-endian files.
+
+	ifdLen = 12 // Length of an IFD entry in bytes.
+)
+
+// Data types (p. 14-16 of the spec).
+const (
+	dtByte     = 1
+	dtASCII    = 2
+	dtShort    = 3
+	dtLong     = 4
+	dtRational = 5
+)
+
+// The length of one instance of each data type in bytes.
+var lengths = [...]uint32{0, 1, 1, 2, 4, 8}
+
+// Tags (see p. 28-41 of the spec).
+const (
+	tImageWidth                = 256
+	tImageLength               = 257
+	tBitsPerSample             = 258
+	tCompression               = 259
+	tPhotometricInterpretation = 262
+
+	tStripOffsets    = 273
+	tSamplesPerPixel = 277
+	tRowsPerStrip    = 278
+	tStripByteCounts = 279
+
+	tTileWidth      = 322
+	tTileLength     = 323
+	tTileOffsets    = 324
+	tTileByteCounts = 325
+
+	tXResolution    = 282
+	tYResolution    = 283
+	tResolutionUnit = 296
+
+	tPredictor    = 317
+	tColorMap     = 320
+	tExtraSamples = 338
+	tSampleFormat = 339
+)
+
+// Compression types (defined in various places in the spec and supplements).
+const (
+	cNone       = 1
+	cCCITT      = 2
+	cG3         = 3 // Group 3 Fax.
+	cG4         = 4 // Group 4 Fax.
+	cLZW        = 5
+	cJPEGOld    = 6 // Superseded by cJPEG.
+	cJPEG       = 7
+	cDeflate    = 8 // zlib compression.
+	cPackBits   = 32773
+	cDeflateOld = 32946 // Superseded by cDeflate.
+)
+
+// Photometric interpretation values (see p. 37 of the spec).
+const (
+	pWhiteIsZero = 0
+	pBlackIsZero = 1
+	pRGB         = 2
+	pPaletted    = 3
+	pTransMask   = 4 // transparency mask
+	pCMYK        = 5
+	pYCbCr       = 6
+	pCIELab      = 8
+)
+
+// Values for the tPredictor tag (page 64-65 of the spec).
+const (
+	prNone       = 1
+	prHorizontal = 2
+)
+
+// Values for the tResolutionUnit tag (page 18).
+const (
+	resNone    = 1
+	resPerInch = 2 // Dots per inch.
+	resPerCM   = 3 // Dots per centimeter.
+)
+
+// imageMode represents the mode of the image.
+type imageMode int
+
+const (
+	mBilevel imageMode = iota
+	mPaletted
+	mGray
+	mGrayInvert
+	mRGB
+	mRGBA
+	mNRGBA
+)
+
+// CompressionType describes the type of compression used in Options.
+type CompressionType int
+
+const (
+	Uncompressed CompressionType = iota
+	Deflate
+)
+
+// specValue returns the compression type constant from the TIFF spec that
+// is equivalent to c.
+func (c CompressionType) specValue() uint32 {
+	switch c {
+	case Deflate:
+		return cDeflate
+	}
+	return cNone
+}
diff --git a/tiff/lzw/reader.go b/tiff/lzw/reader.go
new file mode 100644
index 0000000..ad35819
--- /dev/null
+++ b/tiff/lzw/reader.go
@@ -0,0 +1,277 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package lzw implements the Lempel-Ziv-Welch compressed data format,
+// described in T. A. Welch, ``A Technique for High-Performance Data
+// Compression'', Computer, 17(6) (June 1984), pp 8-19.
+//
+// In particular, it implements LZW as used by the TIFF file format, including
+// an "off by one" algorithmic difference when compared to standard LZW.
+package lzw // import "golang.org/x/image/tiff/lzw"
+
+/*
+This file was branched from src/pkg/compress/lzw/reader.go in the
+standard library. Differences from the original are marked with "NOTE".
+
+The tif_lzw.c file in the libtiff C library has this comment:
+
+----
+The 5.0 spec describes a different algorithm than Aldus
+implements.  Specifically, Aldus does code length transitions
+one code earlier than should be done (for real LZW).
+Earlier versions of this library implemented the correct
+LZW algorithm, but emitted codes in a bit order opposite
+to the TIFF spec.  Thus, to maintain compatibility w/ Aldus
+we interpret MSB-LSB ordered codes to be images written w/
+old versions of this library, but otherwise adhere to the
+Aldus "off by one" algorithm.
+----
+
+The Go code doesn't read (invalid) TIFF files written by old versions of
+libtiff, but the LZW algorithm in this package still differs from the one in
+Go's standard package library to accommodate this "off by one" in valid TIFFs.
+*/
+
+import (
+	"bufio"
+	"errors"
+	"fmt"
+	"io"
+)
+
+// Order specifies the bit ordering in an LZW data stream.
+type Order int
+
+const (
+	// LSB means Least Significant Bits first, as used in the GIF file format.
+	LSB Order = iota
+	// MSB means Most Significant Bits first, as used in the TIFF and PDF
+	// file formats.
+	MSB
+)
+
+const (
+	maxWidth           = 12
+	decoderInvalidCode = 0xffff
+	flushBuffer        = 1 << maxWidth
+)
+
+// decoder is the state from which the readXxx method converts a byte
+// stream into a code stream.
+type decoder struct {
+	r        io.ByteReader
+	bits     uint32
+	nBits    uint
+	width    uint
+	read     func(*decoder) (uint16, error) // readLSB or readMSB
+	litWidth int                            // width in bits of literal codes
+	err      error
+
+	// The first 1<<litWidth codes are literal codes.
+	// The next two codes mean clear and EOF.
+	// Other valid codes are in the range [lo, hi] where lo := clear + 2,
+	// with the upper bound incrementing on each code seen.
+	// overflow is the code at which hi overflows the code width. NOTE: TIFF's LZW is "off by one".
+	// last is the most recently seen code, or decoderInvalidCode.
+	clear, eof, hi, overflow, last uint16
+
+	// Each code c in [lo, hi] expands to two or more bytes. For c != hi:
+	//   suffix[c] is the last of these bytes.
+	//   prefix[c] is the code for all but the last byte.
+	//   This code can either be a literal code or another code in [lo, c).
+	// The c == hi case is a special case.
+	suffix [1 << maxWidth]uint8
+	prefix [1 << maxWidth]uint16
+
+	// output is the temporary output buffer.
+	// Literal codes are accumulated from the start of the buffer.
+	// Non-literal codes decode to a sequence of suffixes that are first
+	// written right-to-left from the end of the buffer before being copied
+	// to the start of the buffer.
+	// It is flushed when it contains >= 1<<maxWidth bytes,
+	// so that there is always room to decode an entire code.
+	output [2 * 1 << maxWidth]byte
+	o      int    // write index into output
+	toRead []byte // bytes to return from Read
+}
+
+// readLSB returns the next code for "Least Significant Bits first" data.
+func (d *decoder) readLSB() (uint16, error) {
+	for d.nBits < d.width {
+		x, err := d.r.ReadByte()
+		if err != nil {
+			return 0, err
+		}
+		d.bits |= uint32(x) << d.nBits
+		d.nBits += 8
+	}
+	code := uint16(d.bits & (1<<d.width - 1))
+	d.bits >>= d.width
+	d.nBits -= d.width
+	return code, nil
+}
+
+// readMSB returns the next code for "Most Significant Bits first" data.
+func (d *decoder) readMSB() (uint16, error) {
+	for d.nBits < d.width {
+		x, err := d.r.ReadByte()
+		if err != nil {
+			return 0, err
+		}
+		d.bits |= uint32(x) << (24 - d.nBits)
+		d.nBits += 8
+	}
+	code := uint16(d.bits >> (32 - d.width))
+	d.bits <<= d.width
+	d.nBits -= d.width
+	return code, nil
+}
+
+func (d *decoder) Read(b []byte) (int, error) {
+	for {
+		if len(d.toRead) > 0 {
+			n := copy(b, d.toRead)
+			d.toRead = d.toRead[n:]
+			return n, nil
+		}
+		if d.err != nil {
+			return 0, d.err
+		}
+		d.decode()
+	}
+}
+
+// decode decompresses bytes from d.r and leaves them in d.toRead.
+// It returns once the output has been flushed: when at least flushBuffer
+// bytes are pending, or when d.err is set (EOF code, read error, bad code).
+func (d *decoder) decode() {
+	// Loop over the code stream, converting codes into decompressed bytes.
+	for {
+		code, err := d.read(d)
+		if err != nil {
+			if err == io.EOF {
+				err = io.ErrUnexpectedEOF
+			}
+			d.err = err
+			d.flush()
+			return
+		}
+		switch {
+		case code < d.clear:
+			// We have a literal code.
+			d.output[d.o] = uint8(code)
+			d.o++
+			if d.last != decoderInvalidCode {
+				// Save what the hi code expands to.
+				d.suffix[d.hi] = uint8(code)
+				d.prefix[d.hi] = d.last
+			}
+		case code == d.clear:
+			d.width = 1 + uint(d.litWidth)
+			d.hi = d.eof
+			d.overflow = 1 << d.width
+			d.last = decoderInvalidCode
+			continue
+		case code == d.eof:
+			d.flush()
+			d.err = io.EOF
+			return
+		case code <= d.hi:
+			c, i := code, len(d.output)-1
+			if code == d.hi && d.last != decoderInvalidCode {
+				// code == hi expands to the last expansion followed by its head,
+				// found by walking the prefix chain down to a literal code. With no
+				// valid last code there is no chain to walk (it would index 0xffff).
+				c = d.last
+				for c >= d.clear {
+					c = d.prefix[c]
+				}
+				d.output[i] = uint8(c)
+				i--
+				c = d.last
+			}
+			// Copy the suffix chain into output and then move it to position d.o.
+			for c >= d.clear {
+				d.output[i] = d.suffix[c]
+				i--
+				c = d.prefix[c]
+			}
+			d.output[i] = uint8(c)
+			d.o += copy(d.output[d.o:], d.output[i:])
+			if d.last != decoderInvalidCode {
+				// Save what the hi code expands to.
+				d.suffix[d.hi] = uint8(c)
+				d.prefix[d.hi] = d.last
+			}
+		default:
+			d.err = errors.New("lzw: invalid code")
+			d.flush()
+			return
+		}
+		d.last, d.hi = code, d.hi+1
+		if d.hi+1 >= d.overflow { // NOTE: the "+1" is where TIFF's LZW differs from the standard algorithm.
+			if d.width == maxWidth {
+				d.last = decoderInvalidCode
+			} else {
+				d.width++
+				d.overflow <<= 1
+			}
+		}
+		if d.o >= flushBuffer {
+			d.flush()
+			return
+		}
+	}
+}
+
+func (d *decoder) flush() {
+	d.toRead = d.output[:d.o]
+	d.o = 0
+}
+
+var errClosed = errors.New("lzw: reader/writer is closed")
+
+func (d *decoder) Close() error {
+	d.err = errClosed // in case any Reads come along
+	return nil
+}
+
+// NewReader creates a new io.ReadCloser.
+// Reads from the returned io.ReadCloser read and decompress data from r.
+// If r does not also implement io.ByteReader,
+// the decompressor may read more data than necessary from r.
+// It is the caller's responsibility to call Close on the ReadCloser when
+// finished reading.
+// The number of bits to use for literal codes, litWidth, must be in the
+// range [2,8] and is typically 8. It must equal the litWidth
+// used during compression.
+func NewReader(r io.Reader, order Order, litWidth int) io.ReadCloser {
+	d := new(decoder)
+	switch order {
+	case LSB:
+		d.read = (*decoder).readLSB
+	case MSB:
+		d.read = (*decoder).readMSB
+	default:
+		d.err = errors.New("lzw: unknown order")
+		return d
+	}
+	if litWidth < 2 || 8 < litWidth {
+		d.err = fmt.Errorf("lzw: litWidth %d out of range", litWidth)
+		return d
+	}
+	if br, ok := r.(io.ByteReader); ok {
+		d.r = br
+	} else {
+		d.r = bufio.NewReader(r)
+	}
+	d.litWidth = litWidth
+	d.width = 1 + uint(litWidth)
+	d.clear = uint16(1) << uint(litWidth)
+	d.eof, d.hi = d.clear+1, d.clear+1
+	d.overflow = uint16(1) << d.width
+	d.last = decoderInvalidCode
+
+	return d
+}
diff --git a/tiff/reader.go b/tiff/reader.go
new file mode 100644
index 0000000..df39e82
--- /dev/null
+++ b/tiff/reader.go
@@ -0,0 +1,681 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package tiff implements a TIFF image decoder and encoder.
+//
+// The TIFF specification is at http://partners.adobe.com/public/developer/en/tiff/TIFF6.pdf
+package tiff // import "golang.org/x/image/tiff"
+
+import (
+	"compress/zlib"
+	"encoding/binary"
+	"fmt"
+	"image"
+	"image/color"
+	"io"
+	"io/ioutil"
+	"math"
+
+	"golang.org/x/image/tiff/lzw"
+)
+
+// A FormatError reports that the input is not a valid TIFF image.
+type FormatError string
+
+func (e FormatError) Error() string {
+	return "tiff: invalid format: " + string(e)
+}
+
+// An UnsupportedError reports that the input uses a valid but
+// unimplemented feature.
+type UnsupportedError string
+
+func (e UnsupportedError) Error() string {
+	return "tiff: unsupported feature: " + string(e)
+}
+
+// An InternalError reports that an internal error was encountered.
+type InternalError string
+
+func (e InternalError) Error() string {
+	return "tiff: internal error: " + string(e)
+}
+
+var errNoPixels = FormatError("not enough pixel data")
+
+type decoder struct {
+	r         io.ReaderAt
+	byteOrder binary.ByteOrder
+	config    image.Config
+	mode      imageMode
+	bpp       uint
+	features  map[int][]uint
+	palette   []color.Color
+
+	buf   []byte
+	off   int    // Current offset in buf.
+	v     uint32 // Buffer value for reading with arbitrary bit depths.
+	nbits uint   // Remaining number of bits in v.
+}
+
+// firstVal returns the first uint of the features entry with the given tag,
+// or 0 if the tag does not exist.
+func (d *decoder) firstVal(tag int) uint {
+	f := d.features[tag]
+	if len(f) == 0 {
+		return 0
+	}
+	return f[0]
+}
+
+// ifdUint decodes the IFD entry in p, which must be of the Byte, Short
+// or Long type, and returns the decoded uint values.
+func (d *decoder) ifdUint(p []byte) (u []uint, err error) {
+	var raw []byte
+	if len(p) < ifdLen {
+		return nil, FormatError("bad IFD entry")
+	}
+
+	datatype := d.byteOrder.Uint16(p[2:4])
+	if dt := int(datatype); dt <= 0 || dt >= len(lengths) {
+		return nil, UnsupportedError("IFD entry datatype")
+	}
+
+	count := d.byteOrder.Uint32(p[4:8])
+	if count > math.MaxInt32/lengths[datatype] {
+		return nil, FormatError("IFD data too large")
+	}
+	if datalen := lengths[datatype] * count; datalen > 4 {
+		// The IFD contains a pointer to the real value.
+		raw = make([]byte, datalen)
+		_, err = d.r.ReadAt(raw, int64(d.byteOrder.Uint32(p[8:12])))
+	} else {
+		raw = p[8 : 8+datalen]
+	}
+	if err != nil {
+		return nil, err
+	}
+
+	u = make([]uint, count)
+	switch datatype {
+	case dtByte:
+		for i := uint32(0); i < count; i++ {
+			u[i] = uint(raw[i])
+		}
+	case dtShort:
+		for i := uint32(0); i < count; i++ {
+			u[i] = uint(d.byteOrder.Uint16(raw[2*i : 2*(i+1)]))
+		}
+	case dtLong:
+		for i := uint32(0); i < count; i++ {
+			u[i] = uint(d.byteOrder.Uint32(raw[4*i : 4*(i+1)]))
+		}
+	default:
+		return nil, UnsupportedError("data type")
+	}
+	return u, nil
+}
+
+// parseIFD decides whether the IFD entry in p is "interesting" and
+// stows away the data in the decoder.
+func (d *decoder) parseIFD(p []byte) error {
+	tag := d.byteOrder.Uint16(p[0:2])
+	switch tag {
+	case tBitsPerSample,
+		tExtraSamples,
+		tPhotometricInterpretation,
+		tCompression,
+		tPredictor,
+		tStripOffsets,
+		tStripByteCounts,
+		tRowsPerStrip,
+		tTileWidth,
+		tTileLength,
+		tTileOffsets,
+		tTileByteCounts,
+		tImageLength,
+		tImageWidth:
+		val, err := d.ifdUint(p)
+		if err != nil {
+			return err
+		}
+		d.features[int(tag)] = val
+	case tColorMap:
+		val, err := d.ifdUint(p)
+		if err != nil {
+			return err
+		}
+		numcolors := len(val) / 3
+		if len(val)%3 != 0 || numcolors <= 0 || numcolors > 256 {
+			return FormatError("bad ColorMap length")
+		}
+		d.palette = make([]color.Color, numcolors)
+		for i := 0; i < numcolors; i++ {
+			d.palette[i] = color.RGBA64{
+				uint16(val[i]),
+				uint16(val[i+numcolors]),
+				uint16(val[i+2*numcolors]),
+				0xffff,
+			}
+		}
+	case tSampleFormat:
+		// Page 27 of the spec: If the SampleFormat is present and
+		// the value is not 1 [= unsigned integer data], a Baseline
+		// TIFF reader that cannot handle the SampleFormat value
+		// must terminate the import process gracefully.
+		val, err := d.ifdUint(p)
+		if err != nil {
+			return err
+		}
+		for _, v := range val {
+			if v != 1 {
+				return UnsupportedError("sample format")
+			}
+		}
+	}
+	return nil
+}
+
+// readBits reads n bits from the internal buffer starting at the current offset.
+func (d *decoder) readBits(n uint) (v uint32, ok bool) {
+	for d.nbits < n {
+		d.v <<= 8
+		if d.off >= len(d.buf) {
+			return 0, false
+		}
+		d.v |= uint32(d.buf[d.off])
+		d.off++
+		d.nbits += 8
+	}
+	d.nbits -= n
+	rv := d.v >> d.nbits
+	d.v &^= rv << d.nbits
+	return rv, true
+}
+
+// flushBits discards the unread bits in the buffer used by readBits.
+// It is used at the end of a line.
+func (d *decoder) flushBits() {
+	d.v = 0
+	d.nbits = 0
+}
+
+// minInt returns the smaller of a or b.
+func minInt(a, b int) int {
+	if a <= b {
+		return a
+	}
+	return b
+}
+
+// decode decodes the raw data of an image.
+// It reads from d.buf and writes the strip or tile into dst.
+func (d *decoder) decode(dst image.Image, xmin, ymin, xmax, ymax int) error {
+	d.off = 0
+
+	// Apply horizontal predictor if necessary.
+	// In this case, d.buf contains the color difference to the preceding pixel.
+	// See page 64-65 of the spec.
+	if d.firstVal(tPredictor) == prHorizontal {
+		switch d.bpp {
+		case 16:
+			var off int
+			n := 2 * len(d.features[tBitsPerSample]) // bytes per sample times samples per pixel
+			for y := ymin; y < ymax; y++ {
+				off += n
+				for x := 0; x < (xmax-xmin-1)*n; x += 2 {
+					if off+2 > len(d.buf) {
+						return errNoPixels
+					}
+					v0 := d.byteOrder.Uint16(d.buf[off-n : off-n+2])
+					v1 := d.byteOrder.Uint16(d.buf[off : off+2])
+					d.byteOrder.PutUint16(d.buf[off:off+2], v1+v0)
+					off += 2
+				}
+			}
+		case 8:
+			var off int
+			n := 1 * len(d.features[tBitsPerSample]) // bytes per sample times samples per pixel
+			for y := ymin; y < ymax; y++ {
+				off += n
+				for x := 0; x < (xmax-xmin-1)*n; x++ {
+					if off >= len(d.buf) {
+						return errNoPixels
+					}
+					d.buf[off] += d.buf[off-n]
+					off++
+				}
+			}
+		case 1:
+			return UnsupportedError("horizontal predictor with 1 BitsPerSample")
+		}
+	}
+
+	rMaxX := minInt(xmax, dst.Bounds().Max.X)
+	rMaxY := minInt(ymax, dst.Bounds().Max.Y)
+	switch d.mode {
+	case mGray, mGrayInvert:
+		if d.bpp == 16 {
+			img := dst.(*image.Gray16)
+			for y := ymin; y < rMaxY; y++ {
+				for x := xmin; x < rMaxX; x++ {
+					if d.off+2 > len(d.buf) {
+						return errNoPixels
+					}
+					v := d.byteOrder.Uint16(d.buf[d.off : d.off+2])
+					d.off += 2
+					if d.mode == mGrayInvert {
+						v = 0xffff - v
+					}
+					img.SetGray16(x, y, color.Gray16{v})
+				}
+			}
+		} else {
+			img := dst.(*image.Gray)
+			max := uint32((1 << d.bpp) - 1)
+			for y := ymin; y < rMaxY; y++ {
+				for x := xmin; x < rMaxX; x++ {
+					v, ok := d.readBits(d.bpp)
+					if !ok {
+						return errNoPixels
+					}
+					v = v * 0xff / max
+					if d.mode == mGrayInvert {
+						v = 0xff - v
+					}
+					img.SetGray(x, y, color.Gray{uint8(v)})
+				}
+				d.flushBits()
+			}
+		}
+	case mPaletted:
+		img := dst.(*image.Paletted)
+		for y := ymin; y < rMaxY; y++ {
+			for x := xmin; x < rMaxX; x++ {
+				v, ok := d.readBits(d.bpp)
+				if !ok {
+					return errNoPixels
+				}
+				img.SetColorIndex(x, y, uint8(v))
+			}
+			d.flushBits()
+		}
+	case mRGB:
+		if d.bpp == 16 {
+			img := dst.(*image.RGBA64)
+			for y := ymin; y < rMaxY; y++ {
+				for x := xmin; x < rMaxX; x++ {
+					if d.off+6 > len(d.buf) {
+						return errNoPixels
+					}
+					r := d.byteOrder.Uint16(d.buf[d.off+0 : d.off+2])
+					g := d.byteOrder.Uint16(d.buf[d.off+2 : d.off+4])
+					b := d.byteOrder.Uint16(d.buf[d.off+4 : d.off+6])
+					d.off += 6
+					img.SetRGBA64(x, y, color.RGBA64{r, g, b, 0xffff})
+				}
+			}
+		} else {
+			img := dst.(*image.RGBA)
+			for y := ymin; y < rMaxY; y++ {
+				min := img.PixOffset(xmin, y)
+				max := img.PixOffset(rMaxX, y)
+				off := (y - ymin) * (xmax - xmin) * 3
+				for i := min; i < max; i += 4 {
+					if off+3 > len(d.buf) {
+						return errNoPixels
+					}
+					img.Pix[i+0] = d.buf[off+0]
+					img.Pix[i+1] = d.buf[off+1]
+					img.Pix[i+2] = d.buf[off+2]
+					img.Pix[i+3] = 0xff
+					off += 3
+				}
+			}
+		}
+	case mNRGBA:
+		if d.bpp == 16 {
+			img := dst.(*image.NRGBA64)
+			for y := ymin; y < rMaxY; y++ {
+				for x := xmin; x < rMaxX; x++ {
+					if d.off+8 > len(d.buf) {
+						return errNoPixels
+					}
+					r := d.byteOrder.Uint16(d.buf[d.off+0 : d.off+2])
+					g := d.byteOrder.Uint16(d.buf[d.off+2 : d.off+4])
+					b := d.byteOrder.Uint16(d.buf[d.off+4 : d.off+6])
+					a := d.byteOrder.Uint16(d.buf[d.off+6 : d.off+8])
+					d.off += 8
+					img.SetNRGBA64(x, y, color.NRGBA64{r, g, b, a})
+				}
+			}
+		} else {
+			img := dst.(*image.NRGBA)
+			for y := ymin; y < rMaxY; y++ {
+				min := img.PixOffset(xmin, y)
+				max := img.PixOffset(rMaxX, y)
+				i0, i1 := (y-ymin)*(xmax-xmin)*4, (y-ymin+1)*(xmax-xmin)*4
+				if i1 > len(d.buf) {
+					return errNoPixels
+				}
+				copy(img.Pix[min:max], d.buf[i0:i1])
+			}
+		}
+	case mRGBA:
+		if d.bpp == 16 {
+			img := dst.(*image.RGBA64)
+			for y := ymin; y < rMaxY; y++ {
+				for x := xmin; x < rMaxX; x++ {
+					if d.off+8 > len(d.buf) {
+						return errNoPixels
+					}
+					r := d.byteOrder.Uint16(d.buf[d.off+0 : d.off+2])
+					g := d.byteOrder.Uint16(d.buf[d.off+2 : d.off+4])
+					b := d.byteOrder.Uint16(d.buf[d.off+4 : d.off+6])
+					a := d.byteOrder.Uint16(d.buf[d.off+6 : d.off+8])
+					d.off += 8
+					img.SetRGBA64(x, y, color.RGBA64{r, g, b, a})
+				}
+			}
+		} else {
+			img := dst.(*image.RGBA)
+			for y := ymin; y < rMaxY; y++ {
+				min := img.PixOffset(xmin, y)
+				max := img.PixOffset(rMaxX, y)
+				i0, i1 := (y-ymin)*(xmax-xmin)*4, (y-ymin+1)*(xmax-xmin)*4
+				if i1 > len(d.buf) {
+					return errNoPixels
+				}
+				copy(img.Pix[min:max], d.buf[i0:i1])
+			}
+		}
+	}
+
+	return nil
+}
+
+func newDecoder(r io.Reader) (*decoder, error) {
+	d := &decoder{
+		r:        newReaderAt(r),
+		features: make(map[int][]uint),
+	}
+
+	p := make([]byte, 8)
+	if _, err := d.r.ReadAt(p, 0); err != nil {
+		return nil, err
+	}
+	switch string(p[0:4]) {
+	case leHeader:
+		d.byteOrder = binary.LittleEndian
+	case beHeader:
+		d.byteOrder = binary.BigEndian
+	default:
+		return nil, FormatError("malformed header")
+	}
+
+	ifdOffset := int64(d.byteOrder.Uint32(p[4:8]))
+
+	// The first two bytes contain the number of entries (12 bytes each).
+	if _, err := d.r.ReadAt(p[0:2], ifdOffset); err != nil {
+		return nil, err
+	}
+	numItems := int(d.byteOrder.Uint16(p[0:2]))
+
+	// All IFD entries are read in one chunk.
+	p = make([]byte, ifdLen*numItems)
+	if _, err := d.r.ReadAt(p, ifdOffset+2); err != nil {
+		return nil, err
+	}
+
+	for i := 0; i < len(p); i += ifdLen {
+		if err := d.parseIFD(p[i : i+ifdLen]); err != nil {
+			return nil, err
+		}
+	}
+
+	d.config.Width = int(d.firstVal(tImageWidth))
+	d.config.Height = int(d.firstVal(tImageLength))
+
+	if _, ok := d.features[tBitsPerSample]; !ok {
+		return nil, FormatError("BitsPerSample tag missing")
+	}
+	d.bpp = d.firstVal(tBitsPerSample)
+	switch d.bpp {
+	case 0:
+		return nil, FormatError("BitsPerSample must not be 0")
+	case 1, 8, 16:
+		// Nothing to do, these are accepted by this implementation.
+	default:
+		return nil, UnsupportedError(fmt.Sprintf("BitsPerSample of %v", d.bpp))
+	}
+
+	// Determine the image mode.
+	switch d.firstVal(tPhotometricInterpretation) {
+	case pRGB:
+		if d.bpp == 16 {
+			for _, b := range d.features[tBitsPerSample] {
+				if b != 16 {
+					return nil, FormatError("wrong number of samples for 16bit RGB")
+				}
+			}
+		} else {
+			for _, b := range d.features[tBitsPerSample] {
+				if b != 8 {
+					return nil, FormatError("wrong number of samples for 8bit RGB")
+				}
+			}
+		}
+		// RGB images normally have 3 samples per pixel.
+		// If there are more, ExtraSamples (p. 31-32 of the spec)
+		// gives their meaning (usually an alpha channel).
+		//
+		// This implementation does not support extra samples
+		// of an unspecified type.
+		switch len(d.features[tBitsPerSample]) {
+		case 3:
+			d.mode = mRGB
+			if d.bpp == 16 {
+				d.config.ColorModel = color.RGBA64Model
+			} else {
+				d.config.ColorModel = color.RGBAModel
+			}
+		case 4:
+			switch d.firstVal(tExtraSamples) {
+			case 1:
+				d.mode = mRGBA
+				if d.bpp == 16 {
+					d.config.ColorModel = color.RGBA64Model
+				} else {
+					d.config.ColorModel = color.RGBAModel
+				}
+			case 2:
+				d.mode = mNRGBA
+				if d.bpp == 16 {
+					d.config.ColorModel = color.NRGBA64Model
+				} else {
+					d.config.ColorModel = color.NRGBAModel
+				}
+			default:
+				return nil, FormatError("wrong number of samples for RGB")
+			}
+		default:
+			return nil, FormatError("wrong number of samples for RGB")
+		}
+	case pPaletted:
+		d.mode = mPaletted
+		d.config.ColorModel = color.Palette(d.palette)
+	case pWhiteIsZero:
+		d.mode = mGrayInvert
+		if d.bpp == 16 {
+			d.config.ColorModel = color.Gray16Model
+		} else {
+			d.config.ColorModel = color.GrayModel
+		}
+	case pBlackIsZero:
+		d.mode = mGray
+		if d.bpp == 16 {
+			d.config.ColorModel = color.Gray16Model
+		} else {
+			d.config.ColorModel = color.GrayModel
+		}
+	default:
+		return nil, UnsupportedError("color model")
+	}
+
+	return d, nil
+}
+
+// DecodeConfig returns the color model and dimensions of a TIFF image without
+// decoding the entire image.
+func DecodeConfig(r io.Reader) (image.Config, error) {
+	d, err := newDecoder(r)
+	if err != nil {
+		return image.Config{}, err
+	}
+	return d.config, nil
+}
+
+// Decode reads a TIFF image from r and returns it as an image.Image.
+// The type of Image returned depends on the contents of the TIFF.
+func Decode(r io.Reader) (img image.Image, err error) {
+	d, err := newDecoder(r)
+	if err != nil {
+		return
+	}
+
+	blockPadding := false
+	blockWidth := d.config.Width
+	blockHeight := d.config.Height
+	blocksAcross := 1
+	blocksDown := 1
+
+	if d.config.Width == 0 {
+		blocksAcross = 0
+	}
+	if d.config.Height == 0 {
+		blocksDown = 0
+	}
+
+	var blockOffsets, blockCounts []uint
+
+	if int(d.firstVal(tTileWidth)) != 0 {
+		blockPadding = true
+
+		blockWidth = int(d.firstVal(tTileWidth))
+		blockHeight = int(d.firstVal(tTileLength))
+
+		if blockWidth != 0 {
+			blocksAcross = (d.config.Width + blockWidth - 1) / blockWidth
+		}
+		if blockHeight != 0 {
+			blocksDown = (d.config.Height + blockHeight - 1) / blockHeight
+		}
+
+		blockCounts = d.features[tTileByteCounts]
+		blockOffsets = d.features[tTileOffsets]
+
+	} else {
+		if int(d.firstVal(tRowsPerStrip)) != 0 {
+			blockHeight = int(d.firstVal(tRowsPerStrip))
+		}
+
+		if blockHeight != 0 {
+			blocksDown = (d.config.Height + blockHeight - 1) / blockHeight
+		}
+
+		blockOffsets = d.features[tStripOffsets]
+		blockCounts = d.features[tStripByteCounts]
+	}
+
+	// Check if we have the right number of strips/tiles, offsets and counts.
+	if n := blocksAcross * blocksDown; len(blockOffsets) < n || len(blockCounts) < n {
+		return nil, FormatError("inconsistent header")
+	}
+
+	imgRect := image.Rect(0, 0, d.config.Width, d.config.Height)
+	switch d.mode {
+	case mGray, mGrayInvert:
+		if d.bpp == 16 {
+			img = image.NewGray16(imgRect)
+		} else {
+			img = image.NewGray(imgRect)
+		}
+	case mPaletted:
+		img = image.NewPaletted(imgRect, d.palette)
+	case mNRGBA:
+		if d.bpp == 16 {
+			img = image.NewNRGBA64(imgRect)
+		} else {
+			img = image.NewNRGBA(imgRect)
+		}
+	case mRGB, mRGBA:
+		if d.bpp == 16 {
+			img = image.NewRGBA64(imgRect)
+		} else {
+			img = image.NewRGBA(imgRect)
+		}
+	}
+
+	for i := 0; i < blocksAcross; i++ {
+		blkW := blockWidth
+		if !blockPadding && i == blocksAcross-1 && d.config.Width%blockWidth != 0 {
+			blkW = d.config.Width % blockWidth
+		}
+		for j := 0; j < blocksDown; j++ {
+			blkH := blockHeight
+			if !blockPadding && j == blocksDown-1 && d.config.Height%blockHeight != 0 {
+				blkH = d.config.Height % blockHeight
+			}
+			offset := int64(blockOffsets[j*blocksAcross+i])
+			n := int64(blockCounts[j*blocksAcross+i])
+			switch d.firstVal(tCompression) {
+
+			// According to the spec, Compression does not have a default value,
+			// but some tools interpret a missing Compression value as none so we do
+			// the same.
+			case cNone, 0:
+				if b, ok := d.r.(*buffer); ok {
+					d.buf, err = b.Slice(int(offset), int(n))
+				} else {
+					d.buf = make([]byte, n)
+					_, err = d.r.ReadAt(d.buf, offset)
+				}
+			case cLZW:
+				r := lzw.NewReader(io.NewSectionReader(d.r, offset, n), lzw.MSB, 8)
+				d.buf, err = ioutil.ReadAll(r)
+				r.Close()
+			case cDeflate, cDeflateOld:
+				var r io.ReadCloser
+				r, err = zlib.NewReader(io.NewSectionReader(d.r, offset, n))
+				if err != nil {
+					return nil, err
+				}
+				d.buf, err = ioutil.ReadAll(r)
+				r.Close()
+			case cPackBits:
+				d.buf, err = unpackBits(io.NewSectionReader(d.r, offset, n))
+			default:
+				err = UnsupportedError(fmt.Sprintf("compression value %d", d.firstVal(tCompression)))
+			}
+			if err != nil {
+				return nil, err
+			}
+
+			xmin := i * blockWidth
+			ymin := j * blockHeight
+			xmax := xmin + blkW
+			ymax := ymin + blkH
+			err = d.decode(img, xmin, ymin, xmax, ymax)
+			if err != nil {
+				return nil, err
+			}
+		}
+	}
+	return
+}
+
+func init() {
+	image.RegisterFormat("tiff", leHeader, Decode, DecodeConfig)
+	image.RegisterFormat("tiff", beHeader, Decode, DecodeConfig)
+}
diff --git a/tiff/reader_test.go b/tiff/reader_test.go
new file mode 100644
index 0000000..f5c02e6
--- /dev/null
+++ b/tiff/reader_test.go
@@ -0,0 +1,377 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package tiff
+
+import (
+	"bytes"
+	"encoding/binary"
+	"encoding/hex"
+	"errors"
+	"image"
+	"io/ioutil"
+	"os"
+	"strings"
+	"testing"
+
+	_ "image/png"
+)
+
+const testdataDir = "../testdata/"
+
+// Read makes *buffer implement io.Reader, so that we can pass one to Decode.
+func (*buffer) Read([]byte) (int, error) {
+	panic("unimplemented")
+}
+
+func load(name string) (image.Image, error) {
+	file, openErr := os.Open(testdataDir + name)
+	if openErr != nil {
+		return nil, openErr
+	}
+	defer file.Close()
+	m, _, decodeErr := image.Decode(file) // the detected format name is not needed
+	if decodeErr != nil {
+		return nil, decodeErr
+	}
+	return m, nil
+}
+
+// TestNoRPS tests decoding an image that has no RowsPerStrip tag. The tag is
+// mandatory according to the spec but some software omits it in the case of a
+// single strip.
+func TestNoRPS(t *testing.T) {
+	_, err := load("no_rps.tiff")
+	if err != nil {
+		t.Fatal(err)
+	}
+}
+
+// TestNoCompression tests decoding an image that has no Compression tag. This
+// tag is mandatory, but most tools interpret a missing value as no
+// compression.
+func TestNoCompression(t *testing.T) {
+	_, err := load("no_compress.tiff")
+	if err != nil {
+		t.Fatal(err)
+	}
+}
+
+// TestUnpackBits tests the decoding of PackBits-encoded data.
+func TestUnpackBits(t *testing.T) {
+	var unpackBitsTests = []struct {
+		compressed   string
+		uncompressed string
+	}{{
+		// Example data from Wikipedia.
+		"\xfe\xaa\x02\x80\x00\x2a\xfd\xaa\x03\x80\x00\x2a\x22\xf7\xaa",
+		"\xaa\xaa\xaa\x80\x00\x2a\xaa\xaa\xaa\xaa\x80\x00\x2a\x22\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa",
+	}}
+	for _, u := range unpackBitsTests {
+		buf, err := unpackBits(strings.NewReader(u.compressed))
+		if err != nil {
+			t.Fatal(err)
+		}
+		if string(buf) != u.uncompressed {
+			t.Fatalf("unpackBits: want %x, got %x", u.uncompressed, buf)
+		}
+	}
+}
+
+func TestShortBlockData(t *testing.T) {
+	b, err := ioutil.ReadFile(testdataDir + "bw-uncompressed.tiff")
+	if err != nil {
+		t.Fatal(err)
+	}
+	// The bw-uncompressed.tiff image is a 153x55 bi-level image. This is 1 bit
+	// per pixel, or 20 bytes per row, times 55 rows, or 1100 bytes of pixel
+	// data. 1100 in hex is 0x44c, or "\x4c\x04" in little-endian. We replace
+	// that byte count (StripByteCounts-tagged data) by something less than
+	// that, so that there is not enough pixel data.
+	want := []byte{0x4c, 0x04}
+	short := []byte{0x01, 0x01}
+	i := bytes.Index(b, want)
+	if i < 0 {
+		t.Fatal(`could not find "\x4c\x04" byte count`)
+	}
+	if bytes.Contains(b[i+len(want):], want) {
+		t.Fatal(`too many occurrences of "\x4c\x04"`)
+	}
+	b[i+0] = short[0]
+	b[i+1] = short[1]
+	if _, err = Decode(bytes.NewReader(b)); err == nil {
+		t.Fatal("got nil error, want non-nil")
+	}
+}
+
+func TestDecodeInvalidDataType(t *testing.T) {
+	b, err := ioutil.ReadFile(testdataDir + "bw-uncompressed.tiff")
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// off is the offset of the ImageWidth tag. It is the offset of the overall
+	// IFD block (0x00000454), plus 2 for the uint16 number of IFD entries, plus 12
+	// to skip the first entry.
+	const off = 0x00000454 + 2 + 12*1
+
+	if v := binary.LittleEndian.Uint16(b[off : off+2]); v != tImageWidth {
+		t.Fatal(`could not find ImageWidth tag`)
+	}
+	binary.LittleEndian.PutUint16(b[off+2:], uint16(len(lengths))) // out of range for the lengths table: an invalid datatype
+
+	if _, err = Decode(bytes.NewReader(b)); err == nil {
+		t.Fatal("got nil error, want non-nil")
+	}
+}
+
+func compare(t *testing.T, img0, img1 image.Image) {
+	want := img0.Bounds()
+	got := img1.Bounds()
+	if want.Dx() != got.Dx() || want.Dy() != got.Dy() {
+		t.Fatalf("wrong image size: want %s, got %s", want, got)
+	}
+	dx := got.Min.X - want.Min.X // translation from img0 coordinates to img1 coordinates
+	dy := got.Min.Y - want.Min.Y
+	for y := want.Min.Y; y < want.Max.Y; y++ {
+		for x := want.Min.X; x < want.Max.X; x++ {
+			c0 := img0.At(x, y)
+			c1 := img1.At(x+dx, y+dy)
+			r0, g0, b0, a0 := c0.RGBA()
+			r1, g1, b1, a1 := c1.RGBA()
+			if r0 != r1 || g0 != g1 || b0 != b1 || a0 != a1 {
+				t.Fatalf("pixel at (%d, %d) has wrong color: want %v, got %v", x, y, c0, c1)
+			}
+		}
+	}
+}
+
+// TestDecode checks that a PNG image and several TIFF encodings of the same
+// picture decode to identical pixel data.
+func TestDecode(t *testing.T) {
+	ref, err := load("video-001.png")
+	if err != nil {
+		t.Fatal(err)
+	}
+	plain, err := load("video-001.tiff")
+	if err != nil {
+		t.Fatal(err)
+	}
+	strips, err := load("video-001-strip-64.tiff")
+	if err != nil {
+		t.Fatal(err)
+	}
+	tiles, err := load("video-001-tile-64x64.tiff")
+	if err != nil {
+		t.Fatal(err)
+	}
+	deep, err := load("video-001-16bit.tiff")
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	compare(t, ref, plain)
+	compare(t, ref, strips)
+	compare(t, ref, tiles)
+	compare(t, ref, deep)
+}
+
+// TestDecodeLZW checks that a PNG image and an LZW-compressed TIFF of the
+// same picture decode to identical pixel data.
+func TestDecodeLZW(t *testing.T) {
+	ref, err := load("blue-purple-pink.png")
+	if err != nil {
+		t.Fatal(err)
+	}
+	lzwImg, err := load("blue-purple-pink.lzwcompressed.tiff")
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	compare(t, ref, lzwImg)
+}
+
+// TestDecompress checks that TIFF files holding the same picture under
+// different compression schemes all decode to identical pixel data.
+func TestDecompress(t *testing.T) {
+	names := []string{
+		"bw-uncompressed.tiff",
+		"bw-deflate.tiff",
+		"bw-packbits.tiff",
+	}
+	var ref image.Image
+	for _, name := range names {
+		img, err := load(name)
+		if err != nil {
+			t.Fatalf("decoding %s: %v", name, err)
+		}
+		if ref == nil {
+			ref = img // the first file is the reference for the others
+			continue
+		}
+		compare(t, ref, img)
+	}
+}
+
+func replace(src []byte, find, repl string) ([]byte, error) { // find and repl are hex strings; spaces in them are ignored
+	removeSpaces := func(r rune) rune {
+		if r != ' ' {
+			return r
+		}
+		return -1 // a negative result makes strings.Map drop the rune
+	}
+
+	f, err := hex.DecodeString(strings.Map(removeSpaces, find))
+	if err != nil {
+		return nil, err
+	}
+	r, err := hex.DecodeString(strings.Map(removeSpaces, repl))
+	if err != nil {
+		return nil, err
+	}
+	dst := bytes.Replace(src, f, r, 1) // only the first occurrence is replaced
+	if bytes.Equal(dst, src) {
+		return nil, errors.New("replacement failed") // find was absent, or equal to repl
+	}
+	return dst, nil
+}
+
+// TestZeroBitsPerSample checks that an IFD whose bitsPerSample is 0 makes
+// Decode fail with an error rather than crash.
+// Issue 10711.
+func TestZeroBitsPerSample(t *testing.T) {
+	orig, err := ioutil.ReadFile(testdataDir + "bw-deflate.tiff")
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// Patch the BitsPerSample IFD entry of the loaded file:
+	// 02 01: tag number (tBitsPerSample)
+	// 03 00: data type (short, or uint16)
+	// 01 00 00 00: count
+	// ?? 00 00 00: value (1 -> 0)
+	broken, err := replace(orig,
+		"02 01 03 00 01 00 00 00 01 00 00 00",
+		"02 01 03 00 01 00 00 00 00 00 00 00",
+	)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	_, err = Decode(bytes.NewReader(broken))
+	if err == nil {
+		t.Fatal("Decode with 0 bits per sample: got nil error, want non-nil")
+	}
+}
+
+// TestTileTooBig checks that Decode returns an error instead of panicking
+// when a tile claims to be bigger than the data available for it.
+// Issue 10712.
+func TestTileTooBig(t *testing.T) {
+	orig, err := ioutil.ReadFile(testdataDir + "video-001-tile-64x64.tiff")
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// Widen the tiles by patching the TileWidth IFD entry.
+	//
+	// 42 01: tag number (tTileWidth)
+	// 03 00: data type (short, or uint16)
+	// 01 00 00 00: count
+	// xx 00 00 00: value (0x40 -> 0x44: a wider tile consumes more data
+	// than is available)
+	wide, err := replace(orig,
+		"42 01 03 00 01 00 00 00 40 00 00 00",
+		"42 01 03 00 01 00 00 00 44 00 00 00",
+	)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// Also switch the predictor off. With it left on, decoding runs out of
+	// data too early and never reaches the code that originally panicked;
+	// with it off, the defective spot is reachable.
+	// The patched IFD entry is:
+	//
+	// 3d 01: tag number (tPredictor)
+	// 03 00: data type (short, or uint16)
+	// 01 00 00 00: count
+	// xx 00 00 00: value (2 -> 1: 2 = horizontal, 1 = none)
+	noPred, err := replace(wide,
+		"3d 01 03 00 01 00 00 00 02 00 00 00",
+		"3d 01 03 00 01 00 00 00 01 00 00 00",
+	)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	_, err = Decode(bytes.NewReader(noPred))
+	if err == nil {
+		t.Fatal("did not expect nil error")
+	}
+}
+
+// TestZeroSizedImages checks that images with a zero width or height (and,
+// for comparison, a 1x1 image) survive an Encode followed by a Decode.
+// Issue 10393.
+func TestZeroSizedImages(t *testing.T) {
+	sizes := []struct {
+		w, h int
+	}{
+		{0, 0},
+		{1, 0},
+		{0, 1},
+		{1, 1},
+	}
+	for _, sz := range sizes {
+		src := image.NewRGBA(image.Rect(0, 0, sz.w, sz.h))
+		var encoded bytes.Buffer
+		if err := Encode(&encoded, src, nil); err != nil {
+			t.Errorf("encode w=%d h=%d: %v", sz.w, sz.h, err)
+			continue
+		}
+		if _, err := Decode(&encoded); err != nil {
+			t.Errorf("decode w=%d h=%d: %v", sz.w, sz.h, err)
+		}
+	}
+}
+
+// TestLargeIFDEntry tests that a large IFD entry does not cause Decode to
+// panic; Decode is expected to report an error instead.
+// Issue 10596.
+func TestLargeIFDEntry(t *testing.T) {
+	testdata := "II*\x00\x08\x00\x00\x00\f\x000000000000" +
+		"00000000000000000000" +
+		"00000000000000000000" +
+		"00000000000000000000" +
+		"00000000000000\x17\x01\x04\x00\x01\x00" +
+		"\x00\xc0000000000000000000" +
+		"00000000000000000000" +
+		"00000000000000000000" +
+		"000000"
+	_, err := Decode(strings.NewReader(testdata))
+	if err == nil {
+		t.Fatal("Decode with large IFD entry: got nil error, want non-nil")
+	}
+}
+
+// benchmarkDecode measures how long Decode takes on the named testdata file.
+func benchmarkDecode(b *testing.B, filename string) {
+	b.StopTimer()
+	data, readErr := ioutil.ReadFile(testdataDir + filename)
+	if readErr != nil {
+		b.Fatal(readErr)
+	}
+	src := &buffer{buf: data}
+	b.StartTimer()
+	for n := 0; n < b.N; n++ {
+		_, decErr := Decode(src)
+		if decErr != nil {
+			b.Fatal("Decode:", decErr)
+		}
+	}
+}
+
+func BenchmarkDecodeCompressed(b *testing.B)   { benchmarkDecode(b, "video-001.tiff") }
+func BenchmarkDecodeUncompressed(b *testing.B) { benchmarkDecode(b, "video-001-uncompressed.tiff") }
diff --git a/tiff/writer.go b/tiff/writer.go
new file mode 100644
index 0000000..c8a01ce
--- /dev/null
+++ b/tiff/writer.go
@@ -0,0 +1,438 @@
+// Copyright 2012 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package tiff
+
+import (
+	"bytes"
+	"compress/zlib"
+	"encoding/binary"
+	"image"
+	"io"
+	"sort"
+)
+
+// The TIFF format allows the order of the different elements to be chosen freely.
+// The basic structure of a TIFF file written by this package is:
+//
+//   1. Header (8 bytes).
+//   2. Image data.
+//   3. Image File Directory (IFD).
+//   4. "Pointer area" for larger entries in the IFD.
+
+// We only write little-endian TIFF files.
+var enc = binary.LittleEndian
+
+// An ifdEntry is a single entry in an Image File Directory.
+// A value of type dtRational is composed of two 32-bit values,
+// thus data contains two uints (numerator and denominator) for a single number.
+type ifdEntry struct {
+	tag      int      // written as a 16-bit value by writeIFD
+	datatype int      // indexes the lengths table; also written as 16 bits
+	data     []uint32 // one element per value, narrowed by putData as datatype requires
+}
+
+func (e ifdEntry) putData(p []byte) { // p must have room for every element of e.data
+	for _, d := range e.data {
+		switch e.datatype { // any datatype not listed below writes nothing
+		case dtByte, dtASCII:
+			p[0] = byte(d)
+			p = p[1:]
+		case dtShort:
+			enc.PutUint16(p, uint16(d))
+			p = p[2:]
+		case dtLong, dtRational: // a rational is stored as two consecutive 32-bit words
+			enc.PutUint32(p, uint32(d))
+			p = p[4:]
+		}
+	}
+}
+
+type byTag []ifdEntry // implements sort.Interface, ordering entries by ascending tag
+
+func (d byTag) Len() int           { return len(d) }
+func (d byTag) Less(i, j int) bool { return d[i].tag < d[j].tag }
+func (d byTag) Swap(i, j int)      { d[i], d[j] = d[j], d[i] }
+
+func encodeGray(w io.Writer, pix []uint8, dx, dy, stride int, predictor bool) error {
+	if !predictor {
+		return writePix(w, pix, dy, dx, stride) // rows are copied through unchanged
+	}
+	line := make([]byte, dx)
+	for row := 0; row < dy; row++ {
+		first := row * stride
+		last := first + dx
+		n := 0
+		var prev uint8 // the predictor restarts at zero on every row
+		for i := first; i < last; i++ {
+			cur := pix[i]
+			line[n] = cur - prev
+			prev = cur
+			n++
+		}
+		if _, err := w.Write(line); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+func encodeGray16(w io.Writer, pix []uint8, dx, dy, stride int, predictor bool) error {
+	line := make([]byte, dx*2)
+	for row := 0; row < dy; row++ {
+		first := row * stride
+		last := first + dx*2
+		n := 0
+		var prev uint16 // the predictor restarts at zero on every row
+		for i := first; i < last; i += 2 {
+			// image.Gray16 keeps each sample big-endian in Pix.
+			sample := uint16(pix[i])<<8 | uint16(pix[i+1])
+			if predictor {
+				prev, sample = sample, sample-prev
+			}
+			// The file is little-endian, so the low byte goes first.
+			line[n+0] = byte(sample)
+			line[n+1] = byte(sample >> 8)
+			n += 2
+		}
+		if _, err := w.Write(line); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+func encodeRGBA(w io.Writer, pix []uint8, dx, dy, stride int, predictor bool) error {
+	if !predictor {
+		return writePix(w, pix, dy, dx*4, stride) // rows are copied through unchanged
+	}
+	line := make([]byte, dx*4)
+	for row := 0; row < dy; row++ {
+		first := row * stride
+		last := first + dx*4
+		n := 0
+		var pr, pg, pb, pa uint8 // previous pixel; zero at the start of every row
+		for i := first; i < last; i += 4 {
+			r, g, b, a := pix[i+0], pix[i+1], pix[i+2], pix[i+3]
+			line[n+0] = r - pr
+			line[n+1] = g - pg
+			line[n+2] = b - pb
+			line[n+3] = a - pa
+			n += 4
+			pr, pg, pb, pa = r, g, b, a
+		}
+		if _, err := w.Write(line); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+func encodeRGBA64(w io.Writer, pix []uint8, dx, dy, stride int, predictor bool) error {
+	line := make([]byte, dx*8)
+	for row := 0; row < dy; row++ {
+		first := row * stride
+		last := first + dx*8
+		n := 0
+		var pr, pg, pb, pa uint16 // previous pixel; zero at the start of every row
+		for i := first; i < last; i += 8 {
+			// image.RGBA64 keeps each sample big-endian in Pix.
+			r := uint16(pix[i+0])<<8 | uint16(pix[i+1])
+			g := uint16(pix[i+2])<<8 | uint16(pix[i+3])
+			b := uint16(pix[i+4])<<8 | uint16(pix[i+5])
+			a := uint16(pix[i+6])<<8 | uint16(pix[i+7])
+			if predictor {
+				pr, r = r, r-pr
+				pg, g = g, g-pg
+				pb, b = b, b-pb
+				pa, a = a, a-pa
+			}
+			// The file is little-endian, so each low byte goes first.
+			line[n+0] = byte(r)
+			line[n+1] = byte(r >> 8)
+			line[n+2] = byte(g)
+			line[n+3] = byte(g >> 8)
+			line[n+4] = byte(b)
+			line[n+5] = byte(b >> 8)
+			line[n+6] = byte(a)
+			line[n+7] = byte(a >> 8)
+			n += 8
+		}
+		if _, err := w.Write(line); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+func encode(w io.Writer, m image.Image, predictor bool) error {
+	rect := m.Bounds()
+	line := make([]byte, 4*rect.Dx())
+	for y := rect.Min.Y; y < rect.Max.Y; y++ {
+		n := 0
+		if predictor {
+			var pr, pg, pb, pa uint8 // previous pixel; zero at the start of every row
+			for x := rect.Min.X; x < rect.Max.X; x++ {
+				r16, g16, b16, a16 := m.At(x, y).RGBA()
+				r := uint8(r16 >> 8)
+				g := uint8(g16 >> 8)
+				b := uint8(b16 >> 8)
+				a := uint8(a16 >> 8)
+				line[n+0] = r - pr
+				line[n+1] = g - pg
+				line[n+2] = b - pb
+				line[n+3] = a - pa
+				n += 4
+				pr, pg, pb, pa = r, g, b, a
+			}
+		} else {
+			for x := rect.Min.X; x < rect.Max.X; x++ {
+				r16, g16, b16, a16 := m.At(x, y).RGBA()
+				line[n+0] = uint8(r16 >> 8)
+				line[n+1] = uint8(g16 >> 8)
+				line[n+2] = uint8(b16 >> 8)
+				line[n+3] = uint8(a16 >> 8)
+				n += 4
+			}
+		}
+		if _, err := w.Write(line); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// writePix writes the internal byte array of an image to w. It is less general
+// but much faster than encode. writePix is used when pix directly corresponds
+// to one of the TIFF image types. Rows are sliced by index so that a final
+// row shorter than stride (as in a sub-image) does not slice out of range.
+func writePix(w io.Writer, pix []byte, nrows, length, stride int) error {
+	if length == stride {
+		_, err := w.Write(pix[:nrows*length])
+		return err
+	}
+	for y := 0; y < nrows; y++ {
+		if _, err := w.Write(pix[y*stride : y*stride+length]); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// writeIFD writes d, sorted by tag, as the IFD at file offset ifdOffset, then its pointer area.
+func writeIFD(w io.Writer, ifdOffset int, d []ifdEntry) error {
+	// The "pointer area" holds IFD entry data longer than 4 bytes. It follows
+	// the 2-byte entry count, the entries and the 4-byte next-IFD offset.
+	parea := make([]byte, 1024)
+	pstart := ifdOffset + ifdLen*len(d) + 6
+	var o int // Current offset in parea.
+
+	sort.Sort(byTag(d)) // The IFD must list its tags in ascending order; d is reordered in place.
+
+	// Write the number of entries in this IFD.
+	if err := binary.Write(w, enc, uint16(len(d))); err != nil {
+		return err
+	}
+	for _, ent := range d {
+		var buf [ifdLen]byte // Zeroed per entry: nothing leaks from the previous one.
+		enc.PutUint16(buf[0:2], uint16(ent.tag))
+		enc.PutUint16(buf[2:4], uint16(ent.datatype))
+		count := uint32(len(ent.data))
+		if ent.datatype == dtRational {
+			count /= 2
+		}
+		enc.PutUint32(buf[4:8], count)
+		datalen := int(count * lengths[ent.datatype])
+		if datalen <= 4 {
+			ent.putData(buf[8:12])
+		} else {
+			if (o + datalen) > len(parea) {
+				newlen := len(parea) + 1024
+				for (o + datalen) > newlen {
+					newlen += 1024
+				}
+				newarea := make([]byte, newlen)
+				copy(newarea, parea)
+				parea = newarea
+			}
+			ent.putData(parea[o : o+datalen])
+			enc.PutUint32(buf[8:12], uint32(pstart+o))
+			o += datalen
+		}
+		if _, err := w.Write(buf[:]); err != nil {
+			return err
+		}
+	}
+	// The IFD ends with the offset of the next IFD in the file,
+	// or zero if it is the last one (page 14).
+	if err := binary.Write(w, enc, uint32(0)); err != nil {
+		return err
+	}
+	_, err := w.Write(parea[:o])
+	return err
+}
+
+// Options are the encoding parameters.
+type Options struct {
+	// Compression is the type of compression used.
+	Compression CompressionType
+	// Predictor determines whether a differencing predictor is used;
+	// if true, instead of each pixel's color, the color difference to the
+	// preceding one is saved. This improves the compression for certain
+	// types of images and compressors. NOTE(review): Encode only honors it
+	// when the compression is LZW, so it looks unused with Deflate — confirm.
+	Predictor bool
+}
+
+// Encode writes the image m to w. opt determines the options used for
+// encoding, such as the compression type. If opt is nil, an uncompressed
+// image is written.
+func Encode(w io.Writer, m image.Image, opt *Options) error {
+	d := m.Bounds().Size()
+
+	compression := uint32(cNone)
+	predictor := false
+	if opt != nil {
+		compression = opt.Compression.specValue()
+		// The predictor field is only used with LZW. See page 64 of the spec.
+		predictor = opt.Predictor && compression == cLZW // NOTE(review): no LZW writer is set up below — is this ever true?
+	}
+
+	_, err := io.WriteString(w, leHeader)
+	if err != nil {
+		return err
+	}
+
+	// Compressed data is written into a buffer first, so that we
+	// know the compressed size.
+	var buf bytes.Buffer
+	// dst holds the destination for the pixel data of the image --
+	// either w or a writer to buf.
+	var dst io.Writer
+	// imageLen is the length of the pixel data in bytes.
+	// The offset of the IFD is imageLen + 8 header bytes.
+	var imageLen int
+
+	switch compression {
+	case cNone:
+		dst = w
+		// The uncompressed size is known up front, so the IFD offset can be written now.
+		switch m.(type) {
+		case *image.Paletted:
+			imageLen = d.X * d.Y * 1
+		case *image.Gray:
+			imageLen = d.X * d.Y * 1
+		case *image.Gray16:
+			imageLen = d.X * d.Y * 2
+		case *image.RGBA64:
+			imageLen = d.X * d.Y * 8
+		case *image.NRGBA64:
+			imageLen = d.X * d.Y * 8
+		default:
+			imageLen = d.X * d.Y * 4
+		}
+		err = binary.Write(w, enc, uint32(imageLen+8))
+		if err != nil {
+			return err
+		}
+	case cDeflate:
+		dst = zlib.NewWriter(&buf) // closed further down, once all pixel data has been written
+	}
+
+	pr := uint32(prNone)
+	photometricInterpretation := uint32(pRGB)
+	samplesPerPixel := uint32(4)
+	bitsPerSample := []uint32{8, 8, 8, 8}
+	extraSamples := uint32(0)
+	colorMap := []uint32{}
+
+	if predictor {
+		pr = prHorizontal
+	}
+	switch m := m.(type) {
+	case *image.Paletted:
+		photometricInterpretation = pPaletted
+		samplesPerPixel = 1
+		bitsPerSample = []uint32{8}
+		colorMap = make([]uint32, 256*3) // 256 reds, then 256 greens, then 256 blues
+		for i := 0; i < 256 && i < len(m.Palette); i++ {
+			r, g, b, _ := m.Palette[i].RGBA()
+			colorMap[i+0*256] = uint32(r)
+			colorMap[i+1*256] = uint32(g)
+			colorMap[i+2*256] = uint32(b)
+		}
+		err = encodeGray(dst, m.Pix, d.X, d.Y, m.Stride, predictor)
+	case *image.Gray:
+		photometricInterpretation = pBlackIsZero
+		samplesPerPixel = 1
+		bitsPerSample = []uint32{8}
+		err = encodeGray(dst, m.Pix, d.X, d.Y, m.Stride, predictor)
+	case *image.Gray16:
+		photometricInterpretation = pBlackIsZero
+		samplesPerPixel = 1
+		bitsPerSample = []uint32{16}
+		err = encodeGray16(dst, m.Pix, d.X, d.Y, m.Stride, predictor)
+	case *image.NRGBA:
+		extraSamples = 2 // Unassociated alpha.
+		err = encodeRGBA(dst, m.Pix, d.X, d.Y, m.Stride, predictor)
+	case *image.NRGBA64:
+		extraSamples = 2 // Unassociated alpha.
+		bitsPerSample = []uint32{16, 16, 16, 16}
+		err = encodeRGBA64(dst, m.Pix, d.X, d.Y, m.Stride, predictor)
+	case *image.RGBA:
+		extraSamples = 1 // Associated alpha.
+		err = encodeRGBA(dst, m.Pix, d.X, d.Y, m.Stride, predictor)
+	case *image.RGBA64:
+		extraSamples = 1 // Associated alpha.
+		bitsPerSample = []uint32{16, 16, 16, 16}
+		err = encodeRGBA64(dst, m.Pix, d.X, d.Y, m.Stride, predictor)
+	default:
+		extraSamples = 1 // Associated alpha.
+		err = encode(dst, m, predictor)
+	}
+	if err != nil {
+		return err
+	}
+
+	if compression != cNone {
+		if err = dst.(io.Closer).Close(); err != nil { // finishes the compressed stream held in buf
+			return err
+		}
+		imageLen = buf.Len()
+		if err = binary.Write(w, enc, uint32(imageLen+8)); err != nil {
+			return err
+		}
+		if _, err = buf.WriteTo(w); err != nil {
+			return err
+		}
+	}
+
+	ifd := []ifdEntry{
+		{tImageWidth, dtShort, []uint32{uint32(d.X)}},
+		{tImageLength, dtShort, []uint32{uint32(d.Y)}},
+		{tBitsPerSample, dtShort, bitsPerSample},
+		{tCompression, dtShort, []uint32{compression}},
+		{tPhotometricInterpretation, dtShort, []uint32{photometricInterpretation}},
+		{tStripOffsets, dtLong, []uint32{8}}, // the single strip starts right after the 8 header bytes
+		{tSamplesPerPixel, dtShort, []uint32{samplesPerPixel}},
+		{tRowsPerStrip, dtShort, []uint32{uint32(d.Y)}},
+		{tStripByteCounts, dtLong, []uint32{uint32(imageLen)}},
+		// There is currently no support for storing the image
+		// resolution, so give a bogus value of 72x72 dpi.
+		{tXResolution, dtRational, []uint32{72, 1}},
+		{tYResolution, dtRational, []uint32{72, 1}},
+		{tResolutionUnit, dtShort, []uint32{resPerInch}},
+	}
+	if pr != prNone {
+		ifd = append(ifd, ifdEntry{tPredictor, dtShort, []uint32{pr}})
+	}
+	if len(colorMap) != 0 {
+		ifd = append(ifd, ifdEntry{tColorMap, dtShort, colorMap})
+	}
+	if extraSamples > 0 {
+		ifd = append(ifd, ifdEntry{tExtraSamples, dtShort, []uint32{extraSamples}})
+	}
+
+	return writeIFD(w, imageLen+8, ifd)
+}
diff --git a/tiff/writer_test.go b/tiff/writer_test.go
new file mode 100644
index 0000000..c8fb7bf
--- /dev/null
+++ b/tiff/writer_test.go
@@ -0,0 +1,95 @@
+// Copyright 2012 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package tiff
+
+import (
+	"bytes"
+	"image"
+	"io/ioutil"
+	"os"
+	"testing"
+)
+
+var roundtripTests = []struct { // cases for TestRoundtrip; opts is passed to Encode as-is (may be nil)
+	filename string
+	opts     *Options
+}{
+	{"video-001.tiff", nil},
+	{"video-001-16bit.tiff", nil},
+	{"video-001-gray.tiff", nil},
+	{"video-001-gray-16bit.tiff", nil},
+	{"video-001-paletted.tiff", nil},
+	{"bw-packbits.tiff", nil},
+	{"video-001.tiff", &Options{Predictor: true}},
+	{"video-001.tiff", &Options{Compression: Deflate}},
+	{"video-001.tiff", &Options{Predictor: true, Compression: Deflate}},
+}
+
+func openImage(filename string) (image.Image, error) {
+	file, openErr := os.Open(testdataDir + filename)
+	if openErr != nil {
+		return nil, openErr
+	}
+	defer file.Close()
+	return Decode(file) // decoded with this package directly, not via image.Decode
+}
+
+func TestRoundtrip(t *testing.T) {
+	for _, rt := range roundtripTests {
+		img, err := openImage(rt.filename)
+		if err != nil {
+			t.Fatalf("%s: decoding original: %v", rt.filename, err)
+		}
+		out := new(bytes.Buffer)
+		err = Encode(out, img, rt.opts)
+		if err != nil {
+			t.Fatalf("%s: Encode with %+v: %v", rt.filename, rt.opts, err)
+		}
+		// Decode the freshly encoded bytes and check that the pixels survived.
+		img2, err := Decode(&buffer{buf: out.Bytes()})
+		if err != nil {
+			t.Fatalf("%s: decoding re-encoded image: %v", rt.filename, err)
+		}
+		compare(t, img, img2)
+	}
+}
+
+// TestRoundtrip2 checks that an image whose origin is not (0, 0) comes back
+// with the same pixels after being encoded and decoded.
+func TestRoundtrip2(t *testing.T) {
+	src := image.NewRGBA(image.Rect(3, 4, 9, 8))
+	for i := range src.Pix {
+		src.Pix[i] = byte(i)
+	}
+	encoded := new(bytes.Buffer)
+	if err := Encode(encoded, src, nil); err != nil {
+		t.Fatal(err)
+	}
+	dst, err := Decode(&buffer{buf: encoded.Bytes()})
+	if err != nil {
+		t.Fatal(err)
+	}
+	compare(t, src, dst)
+}
+
+func benchmarkEncode(b *testing.B, name string, pixelSize int) {
+	img, err := openImage(name)
+	if err != nil {
+		b.Fatal(err)
+	}
+	s := img.Bounds().Size()
+	b.SetBytes(int64(s.X * s.Y * pixelSize)) // pixelSize is the caller-supplied number of bytes per pixel
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		Encode(ioutil.Discard, img, nil) // NOTE(review): the returned error is discarded
+	}
+}
+
+func BenchmarkEncode(b *testing.B)         { benchmarkEncode(b, "video-001.tiff", 4) }
+func BenchmarkEncodePaletted(b *testing.B) { benchmarkEncode(b, "video-001-paletted.tiff", 1) }
+func BenchmarkEncodeGray(b *testing.B)     { benchmarkEncode(b, "video-001-gray.tiff", 1) }
+func BenchmarkEncodeGray16(b *testing.B)   { benchmarkEncode(b, "video-001-gray-16bit.tiff", 2) }
+func BenchmarkEncodeRGBA(b *testing.B)     { benchmarkEncode(b, "video-001.tiff", 4) }
+func BenchmarkEncodeRGBA64(b *testing.B)   { benchmarkEncode(b, "video-001-16bit.tiff", 8) }
diff --git a/vp8/decode.go b/vp8/decode.go
new file mode 100644
index 0000000..1bb5028
--- /dev/null
+++ b/vp8/decode.go
@@ -0,0 +1,403 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package vp8 implements a decoder for the VP8 lossy image format.
+//
+// The VP8 specification is RFC 6386.
+package vp8 // import "golang.org/x/image/vp8"
+
+// This file implements the top-level decoding algorithm.
+
+import (
+	"errors"
+	"image"
+	"io"
+)
+
+// limitReader wraps an io.Reader, allowing at most n more bytes to be read.
+type limitReader struct {
+	r io.Reader
+	n int
+}
+
+// ReadFull reads exactly len(p) bytes into p, refusing reads beyond the limit.
+func (r *limitReader) ReadFull(p []byte) error {
+	if r.n < len(p) {
+		return io.ErrUnexpectedEOF
+	}
+	got, err := io.ReadFull(r.r, p)
+	r.n -= got
+	return err
+}
+
+// FrameHeader is a frame header, as specified in section 9.1.
+type FrameHeader struct {
+	KeyFrame          bool   // true when bit 0 of the first header byte is 0
+	VersionNumber     uint8  // 3 bits
+	ShowFrame         bool   // 1 bit
+	FirstPartitionLen uint32 // 19 bits
+	Width             int    // 14 bits; only updated by key frames
+	Height            int    // 14 bits; only updated by key frames
+	XScale            uint8  // 2 bits; only updated by key frames
+	YScale            uint8  // 2 bits; only updated by key frames
+}
+
+const (
+	nSegment     = 4
+	nSegmentProb = 3
+)
+
+// segmentHeader holds segment-related header information.
+type segmentHeader struct {
+	useSegment     bool                // first bit read by parseSegmentHeader
+	updateMap      bool                // cleared whenever useSegment is false
+	relativeDelta  bool                // makes parseFilterHeader add filterHeader.level to filterStrength
+	quantizer      [nSegment]int8      // each read with readOptionalInt(uniformProb, 7)
+	filterStrength [nSegment]int8      // each read with readOptionalInt(uniformProb, 6)
+	prob           [nSegmentProb]uint8 // 8 bits each; 0xff when not transmitted
+}
+
+const (
+	nRefLFDelta  = 4
+	nModeLFDelta = 4
+)
+
+// filterHeader holds filter-related header information.
+type filterHeader struct {
+	simple          bool
+	level           int8  // read with readUint(uniformProb, 6)
+	sharpness       uint8 // read with readUint(uniformProb, 3)
+	useLFDelta      bool
+	refLFDelta      [nRefLFDelta]int8
+	modeLFDelta     [nModeLFDelta]int8
+	perSegmentLevel [nSegment]int8 // filled in by parseFilterHeader when level != 0
+}
+
+// mb is the per-macroblock decode state. A decoder maintains mbw+1 of these
+// as it is decoding macroblocks left-to-right and top-to-bottom: mbw for the
+// macroblocks in the row above (upMB), and one for the one to the left (leftMB).
+type mb struct {
+	// pred is the predictor mode for the 4 bottom or right 4x4 luma regions.
+	pred [4]uint8
+	// nzMask is a mask of 8 bits: 4 for the bottom or right 4x4 luma regions,
+	// and 2 + 2 for the bottom or right 4x4 chroma regions. A 1 bit indicates
+	// that the corresponding region has non-zero coefficients.
+	nzMask uint8
+	// nzY16 is a 0/1 value that is 1 if the macroblock used Y16 prediction and
+	// had non-zero coefficients.
+	nzY16 uint8
+}
+
+// Decoder decodes VP8 bitstreams into frames. Decoding one frame consists of
+// calling Init, DecodeFrameHeader and then DecodeFrame in that order.
+// A Decoder can be re-used to decode multiple frames.
+type Decoder struct {
+	// r is the input bitstream.
+	r limitReader
+	// scratch is a scratch buffer.
+	scratch [8]byte
+	// img is the YCbCr image to decode into.
+	img *image.YCbCr
+	// mbw and mbh are the number of 16x16 macroblocks wide and high the image is.
+	mbw, mbh int
+	// frameHeader is the frame header. When decoding multiple frames,
+	// frames that aren't key frames will inherit the Width, Height,
+	// XScale and YScale of the most recent key frame.
+	frameHeader FrameHeader
+	// Other headers.
+	segmentHeader segmentHeader
+	filterHeader  filterHeader
+	// The image data is divided into a number of independent partitions.
+	// There is 1 "first partition" and between 1 and 8 "other partitions"
+	// for coefficient data.
+	fp  partition
+	op  [8]partition
+	nOP int
+	// Quantization factors.
+	quant [nSegment]quant
+	// DCT/WHT coefficient decoding probabilities.
+	tokenProb   [nPlane][nBand][nContext][nProb]uint8
+	useSkipProb bool
+	skipProb    uint8
+	// Loop filter parameters.
+	filterParams      [nSegment][2]filterParam
+	perMBFilterParams []filterParam
+
+	// The nine fields below relate to the current macroblock being decoded.
+	//
+	// Segment-based adjustments.
+	segment int
+	// Per-macroblock state for the macroblock immediately left of and those
+	// macroblocks immediately above the current macroblock.
+	leftMB mb
+	upMB   []mb
+	// Bitmasks for which 4x4 regions of coeff contain non-zero coefficients.
+	nzDCMask, nzACMask uint32
+	// Predictor modes.
+	usePredY16 bool // The libwebp C code calls this !is_i4x4_.
+	predY16    uint8
+	predC8     uint8
+	predY4     [4][4]uint8
+
+	// The two fields below form a workspace for reconstructing a macroblock.
+	// Their specific sizes are documented in reconstruct.go.
+	coeff [1*16*16 + 2*8*8 + 1*4*4]int16
+	ybr   [1 + 16 + 1 + 8][32]uint8
+}
+
+// NewDecoder allocates a Decoder in its zero state; call Init before decoding.
+func NewDecoder() *Decoder {
+	return new(Decoder)
+}
+
+// Init points the decoder at r, from which at most n bytes will be read.
+func (d *Decoder) Init(r io.Reader, n int) {
+	d.r = limitReader{r: r, n: n}
+}
+
+// DecodeFrameHeader decodes the frame header; key frames also reset segment and token-probability state.
+func (d *Decoder) DecodeFrameHeader() (fh FrameHeader, err error) {
+	// All frame headers are at least 3 bytes long.
+	b := d.scratch[:3]
+	if err = d.r.ReadFull(b); err != nil {
+		return
+	}
+	d.frameHeader.KeyFrame = (b[0] & 1) == 0
+	d.frameHeader.VersionNumber = (b[0] >> 1) & 7
+	d.frameHeader.ShowFrame = (b[0]>>4)&1 == 1
+	d.frameHeader.FirstPartitionLen = uint32(b[0])>>5 | uint32(b[1])<<3 | uint32(b[2])<<11
+	if !d.frameHeader.KeyFrame { // Width, Height, XScale and YScale keep their previous values
+		return d.frameHeader, nil
+	}
+	// Frame headers for key frames are an additional 7 bytes long.
+	b = d.scratch[:7]
+	if err = d.r.ReadFull(b); err != nil {
+		return
+	}
+	// Check the magic sync code.
+	if b[0] != 0x9d || b[1] != 0x01 || b[2] != 0x2a {
+		err = errors.New("vp8: invalid format")
+		return
+	}
+	d.frameHeader.Width = int(b[4]&0x3f)<<8 | int(b[3])
+	d.frameHeader.Height = int(b[6]&0x3f)<<8 | int(b[5])
+	d.frameHeader.XScale = b[4] >> 6
+	d.frameHeader.YScale = b[6] >> 6
+	d.mbw = (d.frameHeader.Width + 0x0f) >> 4 // round up to a whole number of 16-pixel macroblocks
+	d.mbh = (d.frameHeader.Height + 0x0f) >> 4
+	d.segmentHeader = segmentHeader{
+		prob: [3]uint8{0xff, 0xff, 0xff},
+	}
+	d.tokenProb = defaultTokenProb
+	d.segment = 0
+	return d.frameHeader, nil
+}
+
+// ensureImg ensures that d.img is large enough to hold the decoded frame, reallocating if not.
+func (d *Decoder) ensureImg() {
+	if d.img != nil {
+		p0, p1 := d.img.Rect.Min, d.img.Rect.Max
+		if p0.X == 0 && p0.Y == 0 && p1.X >= 16*d.mbw && p1.Y >= 16*d.mbh {
+			return // the existing image, and the slices sized below, are kept as they are
+		}
+	}
+	m := image.NewYCbCr(image.Rect(0, 0, 16*d.mbw, 16*d.mbh), image.YCbCrSubsampleRatio420)
+	d.img = m.SubImage(image.Rect(0, 0, d.frameHeader.Width, d.frameHeader.Height)).(*image.YCbCr) // crop to the frame size
+	d.perMBFilterParams = make([]filterParam, d.mbw*d.mbh)
+	d.upMB = make([]mb, d.mbw)
+}
+
+// parseSegmentHeader parses the segment header, as specified in section 9.3.
+func (d *Decoder) parseSegmentHeader() {
+	d.segmentHeader.useSegment = d.fp.readBit(uniformProb)
+	if !d.segmentHeader.useSegment {
+		d.segmentHeader.updateMap = false
+		return
+	}
+	d.segmentHeader.updateMap = d.fp.readBit(uniformProb)
+	if d.fp.readBit(uniformProb) { // this bit announces new quantizer and filterStrength values
+		d.segmentHeader.relativeDelta = !d.fp.readBit(uniformProb)
+		for i := range d.segmentHeader.quantizer {
+			d.segmentHeader.quantizer[i] = int8(d.fp.readOptionalInt(uniformProb, 7))
+		}
+		for i := range d.segmentHeader.filterStrength {
+			d.segmentHeader.filterStrength[i] = int8(d.fp.readOptionalInt(uniformProb, 6))
+		}
+	}
+	if !d.segmentHeader.updateMap {
+		return
+	}
+	for i := range d.segmentHeader.prob {
+		if d.fp.readBit(uniformProb) {
+			d.segmentHeader.prob[i] = uint8(d.fp.readUint(uniformProb, 8))
+		} else {
+			d.segmentHeader.prob[i] = 0xff // the same value DecodeFrameHeader resets prob to on key frames
+		}
+	}
+}
+
+// parseFilterHeader parses the filter header, as specified in section 9.4. It fills in d.filterHeader from the first partition and, when the frame-level filter level is non-zero, derives the per-segment levels and calls computeFilterParams.
+func (d *Decoder) parseFilterHeader() {
+	d.filterHeader.simple = d.fp.readBit(uniformProb)
+	d.filterHeader.level = int8(d.fp.readUint(uniformProb, 6))
+	d.filterHeader.sharpness = uint8(d.fp.readUint(uniformProb, 3))
+	d.filterHeader.useLFDelta = d.fp.readBit(uniformProb)
+	if d.filterHeader.useLFDelta && d.fp.readBit(uniformProb) { // The second bit is only read when useLFDelta is set.
+		for i := range d.filterHeader.refLFDelta {
+			d.filterHeader.refLFDelta[i] = int8(d.fp.readOptionalInt(uniformProb, 6))
+		}
+		for i := range d.filterHeader.modeLFDelta {
+			d.filterHeader.modeLFDelta[i] = int8(d.fp.readOptionalInt(uniformProb, 6))
+		}
+	}
+	if d.filterHeader.level == 0 { // Nothing further to derive: DecodeFrame skips the loop filter when the level is 0.
+		return
+	}
+	if d.segmentHeader.useSegment {
+		for i := range d.filterHeader.perSegmentLevel {
+			strength := d.segmentHeader.filterStrength[i]
+			if d.segmentHeader.relativeDelta {
+				strength += d.filterHeader.level // The segment value is an offset from the frame-level value.
+			}
+			d.filterHeader.perSegmentLevel[i] = strength
+		}
+	} else {
+		d.filterHeader.perSegmentLevel[0] = d.filterHeader.level
+	}
+	d.computeFilterParams()
+}
+
+// parseOtherPartitions parses the other partitions, as specified in section 9.5. It reads the partition length table and all remaining chunk data from d.r, then initializes d.op[0:d.nOP] with consecutive sub-slices of that data.
+func (d *Decoder) parseOtherPartitions() error {
+	const maxNOP = 1 << 3
+	var partLens [maxNOP]int
+	d.nOP = 1 << d.fp.readUint(uniformProb, 2) // A 2-bit exponent, so d.nOP is 1, 2, 4 or 8.
+
+	// The final partition length is implied by the remaining chunk data
+	// (d.r.n) and the other d.nOP-1 partition lengths. Those d.nOP-1 partition
+	// lengths are stored as 24-bit uints, i.e. up to 16 MiB per partition.
+	n := 3 * (d.nOP - 1)
+	partLens[d.nOP-1] = d.r.n - n
+	if partLens[d.nOP-1] < 0 {
+		return io.ErrUnexpectedEOF
+	}
+	if n > 0 {
+		buf := make([]byte, n)
+		if err := d.r.ReadFull(buf); err != nil {
+			return err
+		}
+		for i := 0; i < d.nOP-1; i++ {
+			pl := int(buf[3*i+0]) | int(buf[3*i+1])<<8 | int(buf[3*i+2])<<16 // Little-endian 24-bit length.
+			if pl > partLens[d.nOP-1] {
+				return io.ErrUnexpectedEOF
+			}
+			partLens[i] = pl
+			partLens[d.nOP-1] -= pl // Whatever is left over belongs to the final partition.
+		}
+	}
+
+	// We check if the final partition length can also fit into a 24-bit uint.
+	// Strictly speaking, this isn't part of the spec, but it guards against a
+	// malicious WEBP image that is too large to ReadFull the encoded DCT
+	// coefficients into memory, whether that's because the actual WEBP file is
+	// too large, or whether its RIFF metadata lists too large a chunk.
+	if 1<<24 <= partLens[d.nOP-1] {
+		return errors.New("vp8: too much data to decode")
+	}
+
+	buf := make([]byte, d.r.n)
+	if err := d.r.ReadFull(buf); err != nil {
+		return err
+	}
+	for i, pl := range partLens {
+		if i == d.nOP {
+			break
+		}
+		d.op[i].init(buf[:pl])
+		buf = buf[pl:]
+	}
+	return nil
+}
+
+// parseOtherHeaders parses header information other than the frame header: it reads the first partition from d.r, then parses the segment, filter, partition, quantizer and token probability headers. It returns an error for non-key frames, which are not implemented.
+func (d *Decoder) parseOtherHeaders() error {
+	// Initialize and parse the first partition.
+	firstPartition := make([]byte, d.frameHeader.FirstPartitionLen)
+	if err := d.r.ReadFull(firstPartition); err != nil {
+		return err
+	}
+	d.fp.init(firstPartition)
+	if d.frameHeader.KeyFrame {
+		// Read and ignore the color space and pixel clamp values. They are
+		// specified in section 9.2, but are unimplemented.
+		d.fp.readBit(uniformProb)
+		d.fp.readBit(uniformProb)
+	}
+	d.parseSegmentHeader()
+	d.parseFilterHeader()
+	if err := d.parseOtherPartitions(); err != nil {
+		return err
+	}
+	d.parseQuant()
+	if !d.frameHeader.KeyFrame {
+		// Golden and AltRef frames are specified in section 9.7.
+		// TODO(nigeltao): implement. Note that they are only used for video, not still images.
+		return errors.New("vp8: Golden / AltRef frames are not implemented")
+	}
+	// Read and ignore the refreshLastFrameBuffer bit, specified in section 9.8.
+	// It applies only to video, and not still images.
+	d.fp.readBit(uniformProb)
+	d.parseTokenProb()
+	d.useSkipProb = d.fp.readBit(uniformProb)
+	if d.useSkipProb {
+		d.skipProb = uint8(d.fp.readUint(uniformProb, 8))
+	}
+	if d.fp.unexpectedEOF { // Reads past the end of the first partition are latched by readBit and reported here.
+		return io.ErrUnexpectedEOF
+	}
+	return nil
+}
+
+// DecodeFrame decodes the frame and returns it as a YCbCr image. It parses the remaining headers, reconstructs every macroblock row by row and then applies the loop filter.
+// The image's contents are valid up until the next call to Decoder.Init.
+func (d *Decoder) DecodeFrame() (*image.YCbCr, error) {
+	d.ensureImg()
+	if err := d.parseOtherHeaders(); err != nil {
+		return nil, err
+	}
+	// Reconstruct the rows.
+	for mbx := 0; mbx < d.mbw; mbx++ {
+		d.upMB[mbx] = mb{}
+	}
+	for mby := 0; mby < d.mbh; mby++ {
+		d.leftMB = mb{} // Each row starts with a zeroed left-neighbor context.
+		for mbx := 0; mbx < d.mbw; mbx++ {
+			skip := d.reconstruct(mbx, mby)
+			fs := d.filterParams[d.segment][btou(!d.usePredY16)]
+			fs.inner = fs.inner || !skip // A macroblock that was not skipped always needs its inner edges filtered.
+			d.perMBFilterParams[d.mbw*mby+mbx] = fs
+		}
+	}
+	if d.fp.unexpectedEOF {
+		return nil, io.ErrUnexpectedEOF
+	}
+	for i := 0; i < d.nOP; i++ {
+		if d.op[i].unexpectedEOF {
+			return nil, io.ErrUnexpectedEOF
+		}
+	}
+	// Apply the loop filter.
+	//
+	// Even if we are using per-segment levels, section 15 says that "loop
+	// filtering must be skipped entirely if loop_filter_level at either the
+	// frame header level or macroblock override level is 0".
+	if d.filterHeader.level != 0 {
+		if d.filterHeader.simple {
+			d.simpleFilter()
+		} else {
+			d.normalFilter()
+		}
+	}
+	return d.img, nil
+}
diff --git a/vp8/filter.go b/vp8/filter.go
new file mode 100644
index 0000000..e34a811
--- /dev/null
+++ b/vp8/filter.go
@@ -0,0 +1,273 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package vp8
+
+// filter2 modifies a 2-pixel wide or 2-pixel high band along an edge. It visits 16 positions starting at pix[index], advancing by iStep along the edge; jStep is the stride across the edge, so p1, p0 lie before index and q0, q1 at and after it.
+func filter2(pix []byte, level, index, iStep, jStep int) {
+	for n := 16; n > 0; n, index = n-1, index+iStep {
+		p1 := int(pix[index-2*jStep])
+		p0 := int(pix[index-1*jStep])
+		q0 := int(pix[index+0*jStep])
+		q1 := int(pix[index+1*jStep])
+		if abs(p0-q0)<<1+abs(p1-q1)>>1 > level { // Leave this position untouched when the step across the edge exceeds level.
+			continue
+		}
+		a := 3*(q0-p0) + clamp127(p1-q1)
+		a1 := clamp15((a + 4) >> 3)
+		a2 := clamp15((a + 3) >> 3)
+		pix[index-1*jStep] = clamp255(p0 + a2) // Only p0 and q0 are adjusted, moving towards each other.
+		pix[index+0*jStep] = clamp255(q0 - a1)
+	}
+}
+
+// filter246 modifies a 2-, 4- or 6-pixel wide or high band along an edge. It visits n positions starting at pix[index] (iStep along the edge, jStep across it); level and ilevel decide whether a position is filtered at all, hlevel selects the 2-pixel filter, and otherwise fourNotSix picks the 4-pixel filter over the 6-pixel one.
+func filter246(pix []byte, n, level, ilevel, hlevel, index, iStep, jStep int, fourNotSix bool) {
+	for ; n > 0; n, index = n-1, index+iStep {
+		p3 := int(pix[index-4*jStep])
+		p2 := int(pix[index-3*jStep])
+		p1 := int(pix[index-2*jStep])
+		p0 := int(pix[index-1*jStep])
+		q0 := int(pix[index+0*jStep])
+		q1 := int(pix[index+1*jStep])
+		q2 := int(pix[index+2*jStep])
+		q3 := int(pix[index+3*jStep])
+		if abs(p0-q0)<<1+abs(p1-q1)>>1 > level { // Same edge threshold test as filter2.
+			continue
+		}
+		if abs(p3-p2) > ilevel ||
+			abs(p2-p1) > ilevel ||
+			abs(p1-p0) > ilevel ||
+			abs(q1-q0) > ilevel ||
+			abs(q2-q1) > ilevel ||
+			abs(q3-q2) > ilevel {
+			continue
+		}
+		if abs(p1-p0) > hlevel || abs(q1-q0) > hlevel {
+			// Filter 2 pixels: only p0 and q0 change.
+			a := 3*(q0-p0) + clamp127(p1-q1)
+			a1 := clamp15((a + 4) >> 3)
+			a2 := clamp15((a + 3) >> 3)
+			pix[index-1*jStep] = clamp255(p0 + a2)
+			pix[index+0*jStep] = clamp255(q0 - a1)
+		} else if fourNotSix {
+			// Filter 4 pixels: p1, p0, q0 and q1 change.
+			a := 3 * (q0 - p0)
+			a1 := clamp15((a + 4) >> 3)
+			a2 := clamp15((a + 3) >> 3)
+			a3 := (a1 + 1) >> 1
+			pix[index-2*jStep] = clamp255(p1 + a3)
+			pix[index-1*jStep] = clamp255(p0 + a2)
+			pix[index+0*jStep] = clamp255(q0 - a1)
+			pix[index+1*jStep] = clamp255(q1 - a3)
+		} else {
+			// Filter 6 pixels: p2 through q2 change, with weights 27, 18 and 9 (out of 128) falling off away from the edge.
+			a := clamp127(3*(q0-p0) + clamp127(p1-q1))
+			a1 := (27*a + 63) >> 7
+			a2 := (18*a + 63) >> 7
+			a3 := (9*a + 63) >> 7
+			pix[index-3*jStep] = clamp255(p2 + a3)
+			pix[index-2*jStep] = clamp255(p1 + a2)
+			pix[index-1*jStep] = clamp255(p0 + a1)
+			pix[index+0*jStep] = clamp255(q0 - a1)
+			pix[index+1*jStep] = clamp255(q1 - a2)
+			pix[index+2*jStep] = clamp255(q2 - a3)
+		}
+	}
+}
+
+// simpleFilter implements the simple filter, as specified in section 15.2. It touches only the luma plane (d.img.Y), one macroblock at a time, using the parameters saved in d.perMBFilterParams.
+func (d *Decoder) simpleFilter() {
+	for mby := 0; mby < d.mbh; mby++ {
+		for mbx := 0; mbx < d.mbw; mbx++ {
+			f := d.perMBFilterParams[d.mbw*mby+mbx]
+			if f.level == 0 {
+				continue
+			}
+			l := int(f.level)
+			yIndex := (mby*d.img.YStride + mbx) * 16
+			if mbx > 0 { // Left macroblock edge: filtered with the larger threshold l+4.
+				filter2(d.img.Y, l+4, yIndex, d.img.YStride, 1)
+			}
+			if f.inner { // Interior vertical edges at x offsets 4, 8 and 12.
+				filter2(d.img.Y, l, yIndex+0x4, d.img.YStride, 1)
+				filter2(d.img.Y, l, yIndex+0x8, d.img.YStride, 1)
+				filter2(d.img.Y, l, yIndex+0xc, d.img.YStride, 1)
+			}
+			if mby > 0 { // Top macroblock edge: the step arguments are swapped to walk along a row.
+				filter2(d.img.Y, l+4, yIndex, 1, d.img.YStride)
+			}
+			if f.inner { // Interior horizontal edges at y offsets 4, 8 and 12.
+				filter2(d.img.Y, l, yIndex+d.img.YStride*0x4, 1, d.img.YStride)
+				filter2(d.img.Y, l, yIndex+d.img.YStride*0x8, 1, d.img.YStride)
+				filter2(d.img.Y, l, yIndex+d.img.YStride*0xc, 1, d.img.YStride)
+			}
+		}
+	}
+}
+
+// normalFilter implements the normal filter, as specified in section 15.3. Unlike simpleFilter it also filters the Cb and Cr planes; macroblock edges use the 6-pixel variant of filter246 and interior edges the 4-pixel variant.
+func (d *Decoder) normalFilter() {
+	for mby := 0; mby < d.mbh; mby++ {
+		for mbx := 0; mbx < d.mbw; mbx++ {
+			f := d.perMBFilterParams[d.mbw*mby+mbx]
+			if f.level == 0 {
+				continue
+			}
+			l, il, hl := int(f.level), int(f.ilevel), int(f.hlevel)
+			yIndex := (mby*d.img.YStride + mbx) * 16
+			cIndex := (mby*d.img.CStride + mbx) * 8 // Chroma is addressed in 8x8 units per macroblock, luma in 16x16.
+			if mbx > 0 {
+				filter246(d.img.Y, 16, l+4, il, hl, yIndex, d.img.YStride, 1, false)
+				filter246(d.img.Cb, 8, l+4, il, hl, cIndex, d.img.CStride, 1, false)
+				filter246(d.img.Cr, 8, l+4, il, hl, cIndex, d.img.CStride, 1, false)
+			}
+			if f.inner { // Interior vertical edges: three for luma, one (offset 4) for each chroma plane.
+				filter246(d.img.Y, 16, l, il, hl, yIndex+0x4, d.img.YStride, 1, true)
+				filter246(d.img.Y, 16, l, il, hl, yIndex+0x8, d.img.YStride, 1, true)
+				filter246(d.img.Y, 16, l, il, hl, yIndex+0xc, d.img.YStride, 1, true)
+				filter246(d.img.Cb, 8, l, il, hl, cIndex+0x4, d.img.CStride, 1, true)
+				filter246(d.img.Cr, 8, l, il, hl, cIndex+0x4, d.img.CStride, 1, true)
+			}
+			if mby > 0 {
+				filter246(d.img.Y, 16, l+4, il, hl, yIndex, 1, d.img.YStride, false)
+				filter246(d.img.Cb, 8, l+4, il, hl, cIndex, 1, d.img.CStride, false)
+				filter246(d.img.Cr, 8, l+4, il, hl, cIndex, 1, d.img.CStride, false)
+			}
+			if f.inner { // Interior horizontal edges, mirroring the vertical case above.
+				filter246(d.img.Y, 16, l, il, hl, yIndex+d.img.YStride*0x4, 1, d.img.YStride, true)
+				filter246(d.img.Y, 16, l, il, hl, yIndex+d.img.YStride*0x8, 1, d.img.YStride, true)
+				filter246(d.img.Y, 16, l, il, hl, yIndex+d.img.YStride*0xc, 1, d.img.YStride, true)
+				filter246(d.img.Cb, 8, l, il, hl, cIndex+d.img.CStride*0x4, 1, d.img.CStride, true)
+				filter246(d.img.Cr, 8, l, il, hl, cIndex+d.img.CStride*0x4, 1, d.img.CStride, true)
+			}
+		}
+	}
+}
+
+// filterParam holds the loop filter parameters for a macroblock. computeFilterParams fills in the values and DecodeFrame stores one per macroblock in d.perMBFilterParams.
+type filterParam struct {
+	// The first three fields are thresholds used by the loop filter to smooth
+	// over the edges and interior of a macroblock. level is used by both the
+	// simple and normal filters. The inner level and high edge variance level
+	// are only used by the normal filter. A level of 0 means "do not filter".
+	level, ilevel, hlevel uint8
+	// inner is whether the inner loop filter cannot be optimized out as a
+	// no-op for this particular macroblock.
+	inner bool
+}
+
+// computeFilterParams computes the loop filter parameters, as specified in
+// section 15.4. It fills d.filterParams[i][j] for every segment i; DecodeFrame selects j with btou(!d.usePredY16).
+func (d *Decoder) computeFilterParams() {
+	for i := range d.filterParams {
+		baseLevel := d.filterHeader.level
+		if d.segmentHeader.useSegment {
+			baseLevel = d.segmentHeader.filterStrength[i]
+			if d.segmentHeader.relativeDelta {
+				baseLevel += d.filterHeader.level
+			}
+		}
+
+		for j := range d.filterParams[i] {
+			p := &d.filterParams[i][j]
+			p.inner = j != 0
+			level := baseLevel
+			if d.filterHeader.useLFDelta {
+				// The libwebp C code has a "TODO: only CURRENT is handled for now."
+				level += d.filterHeader.refLFDelta[0]
+				if j != 0 {
+					level += d.filterHeader.modeLFDelta[0]
+				}
+			}
+			if level <= 0 { // A zero level disables filtering; ilevel and hlevel are left as they were.
+				p.level = 0
+				continue
+			}
+			if level > 63 {
+				level = 63
+			}
+			ilevel := level
+			if d.filterHeader.sharpness > 0 { // Higher sharpness lowers the interior threshold: shift it down, then cap it at 9-sharpness.
+				if d.filterHeader.sharpness > 4 {
+					ilevel >>= 2
+				} else {
+					ilevel >>= 1
+				}
+				if x := int8(9 - d.filterHeader.sharpness); ilevel > x {
+					ilevel = x
+				}
+			}
+			if ilevel < 1 {
+				ilevel = 1
+			}
+			p.ilevel = uint8(ilevel)
+			p.level = uint8(2*level + ilevel) // At most 2*63+63 = 189: the int8 sum may wrap, but the uint8 conversion keeps the intended low 8 bits.
+			if d.frameHeader.KeyFrame {
+				if level < 15 {
+					p.hlevel = 0
+				} else if level < 40 {
+					p.hlevel = 1
+				} else {
+					p.hlevel = 2
+				}
+			} else {
+				if level < 15 {
+					p.hlevel = 0
+				} else if level < 20 {
+					p.hlevel = 1
+				} else if level < 40 {
+					p.hlevel = 2
+				} else {
+					p.hlevel = 3
+				}
+			}
+		}
+	}
+}
+
+// intSize is the width of an int in bits, either 32 or 64: ^uint(0)>>63 is 0 when uint is 32 bits wide and 1 when it is 64 bits wide.
+const intSize = 32 << (^uint(0) >> 63)
+
+func abs(x int) int { // abs returns the absolute value of x, computed without a branch.
+	// m := -1 if x < 0. m := 0 otherwise.
+	m := x >> (intSize - 1)
+
+	// In two's complement representation, the negative number
+	// of any number (except the smallest one) can be computed
+	// by flipping all the bits and adding 1. This is faster than
+	// code with a branch.
+	// See Hacker's Delight, section 2-4.
+	return (x ^ m) - m
+}
+
+func clamp15(x int) int { // clamp15 limits x to the range [-16, 15].
+	if x < -16 {
+		return -16
+	}
+	if x > 15 {
+		return 15
+	}
+	return x
+}
+
+func clamp127(x int) int { // clamp127 limits x to the range [-128, 127].
+	if x < -128 {
+		return -128
+	}
+	if x > 127 {
+		return 127
+	}
+	return x
+}
+
+func clamp255(x int) uint8 { // clamp255 limits x to the range [0, 255] and returns it as a uint8.
+	if x < 0 {
+		return 0
+	}
+	if x > 255 {
+		return 255
+	}
+	return uint8(x)
+}
diff --git a/vp8/idct.go b/vp8/idct.go
new file mode 100644
index 0000000..929af2c
--- /dev/null
+++ b/vp8/idct.go
@@ -0,0 +1,98 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package vp8
+
+// This file implements the inverse Discrete Cosine Transform and the inverse
+// Walsh Hadamard Transform (WHT), as specified in sections 14.3 and 14.4.
+
+func clip8(i int32) uint8 { // clip8 limits i to the range [0, 255] and returns it as a uint8.
+	if i < 0 {
+		return 0
+	}
+	if i > 255 {
+		return 255
+	}
+	return uint8(i)
+}
+
+func (z *Decoder) inverseDCT4(y, x, coeffBase int) { // inverseDCT4 inverse-transforms the 16 coefficients at z.coeff[coeffBase:] and adds the result to the 4x4 region of z.ybr at row y, column x.
+	const (
+		c1 = 85627 // 65536 * cos(pi/8) * sqrt(2).
+		c2 = 35468 // 65536 * sin(pi/8) * sqrt(2).
+	)
+	var m [4][4]int32
+	for i := 0; i < 4; i++ { // First pass: combine coefficients i, i+4, i+8 and i+12 into m[i].
+		a := int32(z.coeff[coeffBase+0]) + int32(z.coeff[coeffBase+8])
+		b := int32(z.coeff[coeffBase+0]) - int32(z.coeff[coeffBase+8])
+		c := (int32(z.coeff[coeffBase+4])*c2)>>16 - (int32(z.coeff[coeffBase+12])*c1)>>16
+		d := (int32(z.coeff[coeffBase+4])*c1)>>16 + (int32(z.coeff[coeffBase+12])*c2)>>16
+		m[i][0] = a + d
+		m[i][1] = b + c
+		m[i][2] = b - c
+		m[i][3] = a - d
+		coeffBase++
+	}
+	for j := 0; j < 4; j++ { // Second pass: transform across m, round (+4, >>3) and accumulate into output row y+j.
+		dc := m[0][j] + 4
+		a := dc + m[2][j]
+		b := dc - m[2][j]
+		c := (m[1][j]*c2)>>16 - (m[3][j]*c1)>>16
+		d := (m[1][j]*c1)>>16 + (m[3][j]*c2)>>16
+		z.ybr[y+j][x+0] = clip8(int32(z.ybr[y+j][x+0]) + (a+d)>>3)
+		z.ybr[y+j][x+1] = clip8(int32(z.ybr[y+j][x+1]) + (b+c)>>3)
+		z.ybr[y+j][x+2] = clip8(int32(z.ybr[y+j][x+2]) + (b-c)>>3)
+		z.ybr[y+j][x+3] = clip8(int32(z.ybr[y+j][x+3]) + (a-d)>>3)
+	}
+}
+
+func (z *Decoder) inverseDCT4DCOnly(y, x, coeffBase int) { // inverseDCT4DCOnly uses only the first coefficient: it adds the same rounded value to all 16 samples of the 4x4 region.
+	dc := (int32(z.coeff[coeffBase+0]) + 4) >> 3
+	for j := 0; j < 4; j++ {
+		for i := 0; i < 4; i++ {
+			z.ybr[y+j][x+i] = clip8(int32(z.ybr[y+j][x+i]) + dc)
+		}
+	}
+}
+
+func (z *Decoder) inverseDCT8(y, x, coeffBase int) { // inverseDCT8 applies inverseDCT4 to the four 4x4 quadrants of an 8x8 region, each with its own run of 16 coefficients.
+	z.inverseDCT4(y+0, x+0, coeffBase+0*16)
+	z.inverseDCT4(y+0, x+4, coeffBase+1*16)
+	z.inverseDCT4(y+4, x+0, coeffBase+2*16)
+	z.inverseDCT4(y+4, x+4, coeffBase+3*16)
+}
+
+func (z *Decoder) inverseDCT8DCOnly(y, x, coeffBase int) { // inverseDCT8DCOnly applies inverseDCT4DCOnly to the four 4x4 quadrants of an 8x8 region.
+	z.inverseDCT4DCOnly(y+0, x+0, coeffBase+0*16)
+	z.inverseDCT4DCOnly(y+0, x+4, coeffBase+1*16)
+	z.inverseDCT4DCOnly(y+4, x+0, coeffBase+2*16)
+	z.inverseDCT4DCOnly(y+4, x+4, coeffBase+3*16)
+}
+
+func (d *Decoder) inverseWHT16() { // inverseWHT16 transforms the 16 values at d.coeff[384:400] and writes the results to d.coeff[0], d.coeff[16], ..., d.coeff[240].
+	var m [16]int32
+	for i := 0; i < 4; i++ { // First pass: combine inputs i, i+4, i+8 and i+12.
+		a0 := int32(d.coeff[384+0+i]) + int32(d.coeff[384+12+i])
+		a1 := int32(d.coeff[384+4+i]) + int32(d.coeff[384+8+i])
+		a2 := int32(d.coeff[384+4+i]) - int32(d.coeff[384+8+i])
+		a3 := int32(d.coeff[384+0+i]) - int32(d.coeff[384+12+i])
+		m[0+i] = a0 + a1
+		m[8+i] = a0 - a1
+		m[4+i] = a3 + a2
+		m[12+i] = a3 - a2
+	}
+	out := 0
+	for i := 0; i < 4; i++ { // Second pass: combine m[4*i:4*i+4], rounding with +3 and >>3.
+		dc := m[0+i*4] + 3
+		a0 := dc + m[3+i*4]
+		a1 := m[1+i*4] + m[2+i*4]
+		a2 := m[1+i*4] - m[2+i*4]
+		a3 := dc - m[3+i*4]
+		d.coeff[out+0] = int16((a0 + a1) >> 3)
+		d.coeff[out+16] = int16((a3 + a2) >> 3)
+		d.coeff[out+32] = int16((a0 - a1) >> 3)
+		d.coeff[out+48] = int16((a3 - a2) >> 3)
+		out += 64
+	}
+}
diff --git a/vp8/partition.go b/vp8/partition.go
new file mode 100644
index 0000000..72288bd
--- /dev/null
+++ b/vp8/partition.go
@@ -0,0 +1,129 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package vp8
+
+// Each VP8 frame consists of between 2 and 9 bitstream partitions.
+// Each partition is byte-aligned and is independently arithmetic-encoded.
+//
+// This file implements decoding a partition's bitstream, as specified in
+// chapter 7. The implementation follows libwebp's approach instead of the
+// specification's reference C implementation. For example, we use a look-up
+// table instead of a for loop to recalibrate the encoded range.
+
+var (
+	lutShift = [127]uint8{ // lutShift[r] is how many bits readBit shifts out when rangeM1 has dropped to r < 127.
+		7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
+		3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+		2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+		2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	}
+	lutRangeM1 = [127]uint8{ // lutRangeM1[r] is the value readBit assigns to rangeM1 after that shift; every entry is at least 127.
+		127,
+		127, 191,
+		127, 159, 191, 223,
+		127, 143, 159, 175, 191, 207, 223, 239,
+		127, 135, 143, 151, 159, 167, 175, 183, 191, 199, 207, 215, 223, 231, 239, 247,
+		127, 131, 135, 139, 143, 147, 151, 155, 159, 163, 167, 171, 175, 179, 183, 187,
+		191, 195, 199, 203, 207, 211, 215, 219, 223, 227, 231, 235, 239, 243, 247, 251,
+		127, 129, 131, 133, 135, 137, 139, 141, 143, 145, 147, 149, 151, 153, 155, 157,
+		159, 161, 163, 165, 167, 169, 171, 173, 175, 177, 179, 181, 183, 185, 187, 189,
+		191, 193, 195, 197, 199, 201, 203, 205, 207, 209, 211, 213, 215, 217, 219, 221,
+		223, 225, 227, 229, 231, 233, 235, 237, 239, 241, 243, 245, 247, 249, 251, 253,
+	}
+)
+
+// uniformProb represents a 50% probability that the next bit is 0; readBit scales its prob argument by 1/256, so 128 is one half.
+const uniformProb = 128
+
+// partition holds arithmetic-coded bits together with the decoder state needed to read them one at a time; call init before the first read.
+type partition struct {
+	// buf is the input bytes.
+	buf []byte
+	// r is how many of buf's bytes have been consumed.
+	r int
+	// rangeM1 is range minus 1, where range is in the arithmetic coding sense,
+	// not the Go language sense.
+	rangeM1 uint32
+	// bits and nBits hold those bits shifted out of buf but not yet consumed.
+	bits  uint32
+	nBits uint8
+	// unexpectedEOF tells whether we tried to read past buf. Once set it stays set until the next init.
+	unexpectedEOF bool
+}
+
+// init initializes the partition to read from the start of buf, resetting the range to 254, clearing any buffered bits and clearing unexpectedEOF.
+func (p *partition) init(buf []byte) {
+	p.buf = buf
+	p.r = 0
+	p.rangeM1 = 254
+	p.bits = 0
+	p.nBits = 0
+	p.unexpectedEOF = false
+}
+
+// readBit returns the next bit, where prob/256 is the probability that the bit is 0 (false). If the input is exhausted it sets p.unexpectedEOF and returns false.
+func (p *partition) readBit(prob uint8) bool {
+	if p.nBits < 8 { // Fewer than 8 unconsumed bits are buffered: pull in one more byte from buf.
+		if p.r >= len(p.buf) {
+			p.unexpectedEOF = true
+			return false
+		}
+		// Expression split for 386 compiler.
+		x := uint32(p.buf[p.r])
+		p.bits |= x << (8 - p.nBits)
+		p.r++
+		p.nBits += 8
+	}
+	split := (p.rangeM1*uint32(prob))>>8 + 1 // split divides the current range in proportion to prob.
+	bit := p.bits >= split<<8
+	if bit {
+		p.rangeM1 -= split
+		p.bits -= split << 8
+	} else {
+		p.rangeM1 = split - 1
+	}
+	if p.rangeM1 < 127 { // Renormalize with the look-up tables, consuming shift of the buffered bits.
+		shift := lutShift[p.rangeM1]
+		p.rangeM1 = uint32(lutRangeM1[p.rangeM1])
+		p.bits <<= shift
+		p.nBits -= shift
+	}
+	return bit
+}
+
+// readUint returns the next n-bit unsigned integer, read most significant bit first with every bit decoded using prob.
+func (p *partition) readUint(prob, n uint8) uint32 {
+	var u uint32
+	for n > 0 {
+		n--
+		if p.readBit(prob) {
+			u |= 1 << n
+		}
+	}
+	return u
+}
+
+// readInt returns the next n-bit signed integer: an n-bit magnitude followed by one sign bit, where a set sign bit negates the value.
+func (p *partition) readInt(prob, n uint8) int32 {
+	u := p.readUint(prob, n)
+	b := p.readBit(prob)
+	if b {
+		return -int32(u)
+	}
+	return int32(u)
+}
+
+// readOptionalInt returns the next n-bit signed integer in an encoding
+// where the likely result is zero: a leading presence bit is read first, and readInt is only called when it is set.
+func (p *partition) readOptionalInt(prob, n uint8) int32 {
+	if !p.readBit(prob) {
+		return 0
+	}
+	return p.readInt(prob, n)
+}
diff --git a/vp8/pred.go b/vp8/pred.go
new file mode 100644
index 0000000..58c2689
--- /dev/null
+++ b/vp8/pred.go
@@ -0,0 +1,201 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package vp8
+
+// This file implements parsing the predictor modes, as specified in chapter
+// 11.
+
+func (d *Decoder) parsePredModeY16(mbx int) { // parsePredModeY16 reads the 16x16 luma predictor mode from d.fp using fixed probabilities and records it in d.predY16 and in all four entries of d.upMB[mbx].pred and d.leftMB.pred.
+	var p uint8
+	if !d.fp.readBit(156) {
+		if !d.fp.readBit(163) {
+			p = predDC
+		} else {
+			p = predVE
+		}
+	} else if !d.fp.readBit(128) {
+		p = predHE
+	} else {
+		p = predTM
+	}
+	for i := 0; i < 4; i++ { // Neighboring macroblocks see this mode as the context for each of their 4x4 sub-blocks.
+		d.upMB[mbx].pred[i] = p
+		d.leftMB.pred[i] = p
+	}
+	d.predY16 = p
+}
+
+func (d *Decoder) parsePredModeC8() { // parsePredModeC8 reads the 8x8 chroma predictor mode from d.fp using fixed probabilities and stores it in d.predC8.
+	if !d.fp.readBit(142) {
+		d.predC8 = predDC
+	} else if !d.fp.readBit(114) {
+		d.predC8 = predVE
+	} else if !d.fp.readBit(183) {
+		d.predC8 = predHE
+	} else {
+		d.predC8 = predTM
+	}
+}
+
+func (d *Decoder) parsePredModeY4(mbx int) { // parsePredModeY4 reads the 16 per-sub-block luma predictor modes into d.predY4[j][i] (row j, column i), updating the above and left contexts as it goes.
+	for j := 0; j < 4; j++ {
+		p := d.leftMB.pred[j]
+		for i := 0; i < 4; i++ {
+			prob := &predProb[d.upMB[mbx].pred[i]][p] // Context: the mode above this sub-block and the mode to its left.
+			if !d.fp.readBit(prob[0]) {
+				p = predDC
+			} else if !d.fp.readBit(prob[1]) {
+				p = predTM
+			} else if !d.fp.readBit(prob[2]) {
+				p = predVE
+			} else if !d.fp.readBit(prob[3]) {
+				if !d.fp.readBit(prob[4]) {
+					p = predHE
+				} else if !d.fp.readBit(prob[5]) {
+					p = predRD
+				} else {
+					p = predVR
+				}
+			} else if !d.fp.readBit(prob[6]) {
+				p = predLD
+			} else if !d.fp.readBit(prob[7]) {
+				p = predVL
+			} else if !d.fp.readBit(prob[8]) {
+				p = predHD
+			} else {
+				p = predHU
+			}
+			d.predY4[j][i] = p
+			d.upMB[mbx].pred[i] = p // This sub-block becomes the "above" context for the next row.
+		}
+		d.leftMB.pred[j] = p
+	}
+}
+
+// predProb are the probabilities to decode a 4x4 region's predictor mode given
+// the predictor modes of the regions above and left of it, indexed as predProb[above][left] by parsePredModeY4.
+// These values are specified in section 11.5.
+var predProb = [nPred][nPred][9]uint8{
+	{
+		{231, 120, 48, 89, 115, 113, 120, 152, 112},
+		{152, 179, 64, 126, 170, 118, 46, 70, 95},
+		{175, 69, 143, 80, 85, 82, 72, 155, 103},
+		{56, 58, 10, 171, 218, 189, 17, 13, 152},
+		{114, 26, 17, 163, 44, 195, 21, 10, 173},
+		{121, 24, 80, 195, 26, 62, 44, 64, 85},
+		{144, 71, 10, 38, 171, 213, 144, 34, 26},
+		{170, 46, 55, 19, 136, 160, 33, 206, 71},
+		{63, 20, 8, 114, 114, 208, 12, 9, 226},
+		{81, 40, 11, 96, 182, 84, 29, 16, 36},
+	},
+	{
+		{134, 183, 89, 137, 98, 101, 106, 165, 148},
+		{72, 187, 100, 130, 157, 111, 32, 75, 80},
+		{66, 102, 167, 99, 74, 62, 40, 234, 128},
+		{41, 53, 9, 178, 241, 141, 26, 8, 107},
+		{74, 43, 26, 146, 73, 166, 49, 23, 157},
+		{65, 38, 105, 160, 51, 52, 31, 115, 128},
+		{104, 79, 12, 27, 217, 255, 87, 17, 7},
+		{87, 68, 71, 44, 114, 51, 15, 186, 23},
+		{47, 41, 14, 110, 182, 183, 21, 17, 194},
+		{66, 45, 25, 102, 197, 189, 23, 18, 22},
+	},
+	{
+		{88, 88, 147, 150, 42, 46, 45, 196, 205},
+		{43, 97, 183, 117, 85, 38, 35, 179, 61},
+		{39, 53, 200, 87, 26, 21, 43, 232, 171},
+		{56, 34, 51, 104, 114, 102, 29, 93, 77},
+		{39, 28, 85, 171, 58, 165, 90, 98, 64},
+		{34, 22, 116, 206, 23, 34, 43, 166, 73},
+		{107, 54, 32, 26, 51, 1, 81, 43, 31},
+		{68, 25, 106, 22, 64, 171, 36, 225, 114},
+		{34, 19, 21, 102, 132, 188, 16, 76, 124},
+		{62, 18, 78, 95, 85, 57, 50, 48, 51},
+	},
+	{
+		{193, 101, 35, 159, 215, 111, 89, 46, 111},
+		{60, 148, 31, 172, 219, 228, 21, 18, 111},
+		{112, 113, 77, 85, 179, 255, 38, 120, 114},
+		{40, 42, 1, 196, 245, 209, 10, 25, 109},
+		{88, 43, 29, 140, 166, 213, 37, 43, 154},
+		{61, 63, 30, 155, 67, 45, 68, 1, 209},
+		{100, 80, 8, 43, 154, 1, 51, 26, 71},
+		{142, 78, 78, 16, 255, 128, 34, 197, 171},
+		{41, 40, 5, 102, 211, 183, 4, 1, 221},
+		{51, 50, 17, 168, 209, 192, 23, 25, 82},
+	},
+	{
+		{138, 31, 36, 171, 27, 166, 38, 44, 229},
+		{67, 87, 58, 169, 82, 115, 26, 59, 179},
+		{63, 59, 90, 180, 59, 166, 93, 73, 154},
+		{40, 40, 21, 116, 143, 209, 34, 39, 175},
+		{47, 15, 16, 183, 34, 223, 49, 45, 183},
+		{46, 17, 33, 183, 6, 98, 15, 32, 183},
+		{57, 46, 22, 24, 128, 1, 54, 17, 37},
+		{65, 32, 73, 115, 28, 128, 23, 128, 205},
+		{40, 3, 9, 115, 51, 192, 18, 6, 223},
+		{87, 37, 9, 115, 59, 77, 64, 21, 47},
+	},
+	{
+		{104, 55, 44, 218, 9, 54, 53, 130, 226},
+		{64, 90, 70, 205, 40, 41, 23, 26, 57},
+		{54, 57, 112, 184, 5, 41, 38, 166, 213},
+		{30, 34, 26, 133, 152, 116, 10, 32, 134},
+		{39, 19, 53, 221, 26, 114, 32, 73, 255},
+		{31, 9, 65, 234, 2, 15, 1, 118, 73},
+		{75, 32, 12, 51, 192, 255, 160, 43, 51},
+		{88, 31, 35, 67, 102, 85, 55, 186, 85},
+		{56, 21, 23, 111, 59, 205, 45, 37, 192},
+		{55, 38, 70, 124, 73, 102, 1, 34, 98},
+	},
+	{
+		{125, 98, 42, 88, 104, 85, 117, 175, 82},
+		{95, 84, 53, 89, 128, 100, 113, 101, 45},
+		{75, 79, 123, 47, 51, 128, 81, 171, 1},
+		{57, 17, 5, 71, 102, 57, 53, 41, 49},
+		{38, 33, 13, 121, 57, 73, 26, 1, 85},
+		{41, 10, 67, 138, 77, 110, 90, 47, 114},
+		{115, 21, 2, 10, 102, 255, 166, 23, 6},
+		{101, 29, 16, 10, 85, 128, 101, 196, 26},
+		{57, 18, 10, 102, 102, 213, 34, 20, 43},
+		{117, 20, 15, 36, 163, 128, 68, 1, 26},
+	},
+	{
+		{102, 61, 71, 37, 34, 53, 31, 243, 192},
+		{69, 60, 71, 38, 73, 119, 28, 222, 37},
+		{68, 45, 128, 34, 1, 47, 11, 245, 171},
+		{62, 17, 19, 70, 146, 85, 55, 62, 70},
+		{37, 43, 37, 154, 100, 163, 85, 160, 1},
+		{63, 9, 92, 136, 28, 64, 32, 201, 85},
+		{75, 15, 9, 9, 64, 255, 184, 119, 16},
+		{86, 6, 28, 5, 64, 255, 25, 248, 1},
+		{56, 8, 17, 132, 137, 255, 55, 116, 128},
+		{58, 15, 20, 82, 135, 57, 26, 121, 40},
+	},
+	{
+		{164, 50, 31, 137, 154, 133, 25, 35, 218},
+		{51, 103, 44, 131, 131, 123, 31, 6, 158},
+		{86, 40, 64, 135, 148, 224, 45, 183, 128},
+		{22, 26, 17, 131, 240, 154, 14, 1, 209},
+		{45, 16, 21, 91, 64, 222, 7, 1, 197},
+		{56, 21, 39, 155, 60, 138, 23, 102, 213},
+		{83, 12, 13, 54, 192, 255, 68, 47, 28},
+		{85, 26, 85, 85, 128, 128, 32, 146, 171},
+		{18, 11, 7, 63, 144, 171, 4, 4, 246},
+		{35, 27, 10, 146, 174, 171, 12, 26, 128},
+	},
+	{
+		{190, 80, 35, 99, 180, 80, 126, 54, 45},
+		{85, 126, 47, 87, 176, 51, 41, 20, 32},
+		{101, 75, 128, 139, 118, 146, 116, 128, 85},
+		{56, 41, 15, 176, 236, 85, 37, 9, 62},
+		{71, 30, 17, 119, 118, 255, 17, 18, 138},
+		{101, 38, 60, 138, 55, 70, 43, 26, 142},
+		{146, 36, 19, 30, 171, 255, 97, 27, 20},
+		{138, 45, 61, 62, 219, 1, 81, 188, 64},
+		{32, 41, 20, 117, 151, 142, 20, 21, 163},
+		{112, 19, 12, 61, 195, 128, 48, 4, 24},
+	},
+}
diff --git a/vp8/predfunc.go b/vp8/predfunc.go
new file mode 100644
index 0000000..f899958
--- /dev/null
+++ b/vp8/predfunc.go
@@ -0,0 +1,553 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package vp8
+
+// This file implements the prediction functions, as specified in chapter 12.
+//
+// For each macroblock (of 1x16x16 luma and 2x8x8 chroma coefficients), the
+// luma values are either predicted as one large 16x16 region or 16 separate
+// 4x4 regions. The chroma values are always predicted as one 8x8 region.
+//
+// For 4x4 regions, the target block's predicted values (Xs) are a function of
+// its previously-decoded top and left border values, as well as a number of
+// pixels from the top-right:
+//
+//	a b c d e f g h
+//	p X X X X
+//	q X X X X
+//	r X X X X
+//	s X X X X
+//
+// The predictor modes are:
+//	- DC: all Xs = (b + c + d + e + p + q + r + s + 4) / 8.
+//	- TM: the first X = (b + p - a), the second X = (c + p - a), and so on.
+//	- VE: each X = the weighted average of its column's top value and that
+//	      value's neighbors, i.e. averages of abc, bcd, cde or def.
+//	- HE: similar to VE except rows instead of columns, and the final row is
+//	      an average of r, s and s.
+//	- RD, VR, LD, VL, HD, HU: these diagonal modes ("Right Down", "Vertical
+//	      Right", etc) are more complicated and are described in section 12.3.
+// All Xs are clipped to the range [0, 255].
+//
+// For 8x8 and 16x16 regions, the target block's predicted values are a
+// function of the top and left border values without the top-right overhang,
+// i.e. without the 8x8 or 16x16 equivalent of f, g and h. Furthermore:
+//	- There are no diagonal predictor modes, only DC, TM, VE and HE.
+//	- The DC mode has variants for macroblocks in the top row and/or left
+//	  column, i.e. for macroblocks with mby == 0 || mbx == 0.
+//	- The VE and HE modes take only the column top or row left values; they do
+//	  not smooth that top/left value with its neighbors.
+
+// nPred is the number of predictor modes, not including the Top/Left versions
+// of the DC predictor mode.
+const nPred = 10
+
+const ( // predictor mode numbers; they index predFunc4, predFunc8 and predFunc16
+	predDC = iota
+	predTM
+	predVE
+	predHE
+	predRD
+	predVR
+	predLD
+	predVL
+	predHD
+	predHU
+	predDCTop
+	predDCLeft
+	predDCTopLeft
+)
+
+func checkTopLeftPred(mbx, mby int, p uint8) uint8 { // maps predDC to its border variant; every other mode is returned unchanged
+	if p != predDC {
+		return p
+	}
+	if mbx == 0 { // macroblock in the left column
+		if mby == 0 {
+			return predDCTopLeft
+		}
+		return predDCLeft
+	}
+	if mby == 0 { // macroblock in the top row, but not in the left column
+		return predDCTop
+	}
+	return predDC
+}
+
+var predFunc4 = [...]func(*Decoder, int, int){ // indexed by predictor mode; nil for the border-specific DC modes, which have no 4x4 variant
+	predFunc4DC,
+	predFunc4TM,
+	predFunc4VE,
+	predFunc4HE,
+	predFunc4RD,
+	predFunc4VR,
+	predFunc4LD,
+	predFunc4VL,
+	predFunc4HD,
+	predFunc4HU,
+	nil,
+	nil,
+	nil,
+}
+
+var predFunc8 = [...]func(*Decoder, int, int){ // indexed by predictor mode; nil for the diagonal modes predRD..predHU, which have no 8x8 variant
+	predFunc8DC,
+	predFunc8TM,
+	predFunc8VE,
+	predFunc8HE,
+	nil,
+	nil,
+	nil,
+	nil,
+	nil,
+	nil,
+	predFunc8DCTop,
+	predFunc8DCLeft,
+	predFunc8DCTopLeft,
+}
+
+var predFunc16 = [...]func(*Decoder, int, int){ // indexed by predictor mode; nil for the diagonal modes predRD..predHU, which have no 16x16 variant
+	predFunc16DC,
+	predFunc16TM,
+	predFunc16VE,
+	predFunc16HE,
+	nil,
+	nil,
+	nil,
+	nil,
+	nil,
+	nil,
+	predFunc16DCTop,
+	predFunc16DCLeft,
+	predFunc16DCTopLeft,
+}
+
+func predFunc4DC(z *Decoder, y, x int) { // DC: fills the 4x4 block whose top-left sample is z.ybr[y][x] with a single average
+	sum := uint32(4)
+	for i := 0; i < 4; i++ {
+		sum += uint32(z.ybr[y-1][x+i]) // the four samples directly above the block
+	}
+	for j := 0; j < 4; j++ {
+		sum += uint32(z.ybr[y+j][x-1]) // the four samples directly left of the block
+	}
+	avg := uint8(sum / 8) // sum started at 4 (half the divisor), so this rounds to nearest
+	for j := 0; j < 4; j++ {
+		for i := 0; i < 4; i++ {
+			z.ybr[y+j][x+i] = avg
+		}
+	}
+}
+
+func predFunc4TM(z *Decoder, y, x int) { // TM: each sample of the 4x4 block is (above + left - corner), clipped to [0, 255]
+	delta0 := -int32(z.ybr[y-1][x-1]) // negated top-left corner sample
+	for j := 0; j < 4; j++ {
+		delta1 := delta0 + int32(z.ybr[y+j][x-1])
+		for i := 0; i < 4; i++ {
+			delta2 := delta1 + int32(z.ybr[y-1][x+i])
+			z.ybr[y+j][x+i] = uint8(clip(delta2, 0, 255))
+		}
+	}
+}
+
+func predFunc4VE(z *Decoder, y, x int) { // VE: every row of the 4x4 block gets the same smoothed copy of the row above
+	a := int32(z.ybr[y-1][x-1]) // a..f run along the row above, from the top-left corner to one sample past the block's right edge
+	b := int32(z.ybr[y-1][x+0])
+	c := int32(z.ybr[y-1][x+1])
+	d := int32(z.ybr[y-1][x+2])
+	e := int32(z.ybr[y-1][x+3])
+	f := int32(z.ybr[y-1][x+4])
+	abc := uint8((a + 2*b + c + 2) / 4) // (1, 2, 1)/4 weighted average, rounded to nearest
+	bcd := uint8((b + 2*c + d + 2) / 4)
+	cde := uint8((c + 2*d + e + 2) / 4)
+	def := uint8((d + 2*e + f + 2) / 4)
+	for j := 0; j < 4; j++ {
+		z.ybr[y+j][x+0] = abc
+		z.ybr[y+j][x+1] = bcd
+		z.ybr[y+j][x+2] = cde
+		z.ybr[y+j][x+3] = def
+	}
+}
+
+func predFunc4HE(z *Decoder, y, x int) { // HE: every column of the 4x4 block gets the same smoothed copy of the left border
+	s := int32(z.ybr[y+3][x-1]) // p, q, r, s are the left-border samples from top to bottom; a is the top-left corner
+	r := int32(z.ybr[y+2][x-1])
+	q := int32(z.ybr[y+1][x-1])
+	p := int32(z.ybr[y+0][x-1])
+	a := int32(z.ybr[y-1][x-1])
+	ssr := uint8((s + 2*s + r + 2) / 4) // bottom row: s is used twice because no sample below s is read
+	srq := uint8((s + 2*r + q + 2) / 4)
+	rqp := uint8((r + 2*q + p + 2) / 4)
+	apq := uint8((a + 2*p + q + 2) / 4)
+	for i := 0; i < 4; i++ {
+		z.ybr[y+0][x+i] = apq
+		z.ybr[y+1][x+i] = rqp
+		z.ybr[y+2][x+i] = srq
+		z.ybr[y+3][x+i] = ssr
+	}
+}
+
+func predFunc4RD(z *Decoder, y, x int) { // RD: the predicted value is constant along each down-right diagonal of the 4x4 block
+	s := int32(z.ybr[y+3][x-1]) // s, r, q, p: left border, bottom to top; a: top-left corner; b..e: row above, left to right
+	r := int32(z.ybr[y+2][x-1])
+	q := int32(z.ybr[y+1][x-1])
+	p := int32(z.ybr[y+0][x-1])
+	a := int32(z.ybr[y-1][x-1])
+	b := int32(z.ybr[y-1][x+0])
+	c := int32(z.ybr[y-1][x+1])
+	d := int32(z.ybr[y-1][x+2])
+	e := int32(z.ybr[y-1][x+3])
+	srq := uint8((s + 2*r + q + 2) / 4) // (1, 2, 1)/4 averages of three consecutive border samples, rounded to nearest
+	rqp := uint8((r + 2*q + p + 2) / 4)
+	qpa := uint8((q + 2*p + a + 2) / 4)
+	pab := uint8((p + 2*a + b + 2) / 4)
+	abc := uint8((a + 2*b + c + 2) / 4)
+	bcd := uint8((b + 2*c + d + 2) / 4)
+	cde := uint8((c + 2*d + e + 2) / 4)
+	z.ybr[y+0][x+0] = pab // each row below is the previous row shifted right by one, with a new value entering on the left
+	z.ybr[y+0][x+1] = abc
+	z.ybr[y+0][x+2] = bcd
+	z.ybr[y+0][x+3] = cde
+	z.ybr[y+1][x+0] = qpa
+	z.ybr[y+1][x+1] = pab
+	z.ybr[y+1][x+2] = abc
+	z.ybr[y+1][x+3] = bcd
+	z.ybr[y+2][x+0] = rqp
+	z.ybr[y+2][x+1] = qpa
+	z.ybr[y+2][x+2] = pab
+	z.ybr[y+2][x+3] = abc
+	z.ybr[y+3][x+0] = srq
+	z.ybr[y+3][x+1] = rqp
+	z.ybr[y+3][x+2] = qpa
+	z.ybr[y+3][x+3] = pab
+}
+
+func predFunc4VR(z *Decoder, y, x int) { // VR: rows 0 and 1 are 2-tap and 3-tap averages of the row above; rows 2 and 3 repeat them shifted right by one
+	r := int32(z.ybr[y+2][x-1]) // r, q, p: left border, bottom to top (the lowest left sample is not read); a: corner; b..e: row above
+	q := int32(z.ybr[y+1][x-1])
+	p := int32(z.ybr[y+0][x-1])
+	a := int32(z.ybr[y-1][x-1])
+	b := int32(z.ybr[y-1][x+0])
+	c := int32(z.ybr[y-1][x+1])
+	d := int32(z.ybr[y-1][x+2])
+	e := int32(z.ybr[y-1][x+3])
+	ab := uint8((a + b + 1) / 2) // 2-tap average, rounded to nearest
+	bc := uint8((b + c + 1) / 2)
+	cd := uint8((c + d + 1) / 2)
+	de := uint8((d + e + 1) / 2)
+	rqp := uint8((r + 2*q + p + 2) / 4) // (1, 2, 1)/4 average, rounded to nearest
+	qpa := uint8((q + 2*p + a + 2) / 4)
+	pab := uint8((p + 2*a + b + 2) / 4)
+	abc := uint8((a + 2*b + c + 2) / 4)
+	bcd := uint8((b + 2*c + d + 2) / 4)
+	cde := uint8((c + 2*d + e + 2) / 4)
+	z.ybr[y+0][x+0] = ab
+	z.ybr[y+0][x+1] = bc
+	z.ybr[y+0][x+2] = cd
+	z.ybr[y+0][x+3] = de
+	z.ybr[y+1][x+0] = pab
+	z.ybr[y+1][x+1] = abc
+	z.ybr[y+1][x+2] = bcd
+	z.ybr[y+1][x+3] = cde
+	z.ybr[y+2][x+0] = qpa // the left-border averages qpa and rqp fill the column vacated by the shift
+	z.ybr[y+2][x+1] = ab
+	z.ybr[y+2][x+2] = bc
+	z.ybr[y+2][x+3] = cd
+	z.ybr[y+3][x+0] = rqp
+	z.ybr[y+3][x+1] = pab
+	z.ybr[y+3][x+2] = abc
+	z.ybr[y+3][x+3] = bcd
+}
+
+func predFunc4LD(z *Decoder, y, x int) { // LD: the predicted value is constant along each down-left diagonal; only the row above is read
+	a := int32(z.ybr[y-1][x+0]) // here a..h are z.ybr[y-1][x+0..x+7]: a is directly above the block, not the corner as in the diagram at the top of this file
+	b := int32(z.ybr[y-1][x+1])
+	c := int32(z.ybr[y-1][x+2])
+	d := int32(z.ybr[y-1][x+3])
+	e := int32(z.ybr[y-1][x+4])
+	f := int32(z.ybr[y-1][x+5])
+	g := int32(z.ybr[y-1][x+6])
+	h := int32(z.ybr[y-1][x+7])
+	abc := uint8((a + 2*b + c + 2) / 4) // (1, 2, 1)/4 average, rounded to nearest
+	bcd := uint8((b + 2*c + d + 2) / 4)
+	cde := uint8((c + 2*d + e + 2) / 4)
+	def := uint8((d + 2*e + f + 2) / 4)
+	efg := uint8((e + 2*f + g + 2) / 4)
+	fgh := uint8((f + 2*g + h + 2) / 4)
+	ghh := uint8((g + 2*h + h + 2) / 4)
+	z.ybr[y+0][x+0] = abc // each row below is the previous row shifted left by one, with a new value entering on the right
+	z.ybr[y+0][x+1] = bcd
+	z.ybr[y+0][x+2] = cde
+	z.ybr[y+0][x+3] = def
+	z.ybr[y+1][x+0] = bcd
+	z.ybr[y+1][x+1] = cde
+	z.ybr[y+1][x+2] = def
+	z.ybr[y+1][x+3] = efg
+	z.ybr[y+2][x+0] = cde
+	z.ybr[y+2][x+1] = def
+	z.ybr[y+2][x+2] = efg
+	z.ybr[y+2][x+3] = fgh
+	z.ybr[y+3][x+0] = def
+	z.ybr[y+3][x+1] = efg
+	z.ybr[y+3][x+2] = fgh
+	z.ybr[y+3][x+3] = ghh
+}
+
+func predFunc4VL(z *Decoder, y, x int) { // VL: rows 0 and 1 are 2-tap and 3-tap averages of the row above; rows 2 and 3 repeat them shifted left by one
+	a := int32(z.ybr[y-1][x+0]) // here a..h are z.ybr[y-1][x+0..x+7]: a is directly above the block, not the corner as in the diagram at the top of this file
+	b := int32(z.ybr[y-1][x+1])
+	c := int32(z.ybr[y-1][x+2])
+	d := int32(z.ybr[y-1][x+3])
+	e := int32(z.ybr[y-1][x+4])
+	f := int32(z.ybr[y-1][x+5])
+	g := int32(z.ybr[y-1][x+6])
+	h := int32(z.ybr[y-1][x+7])
+	ab := uint8((a + b + 1) / 2) // 2-tap average, rounded to nearest
+	bc := uint8((b + c + 1) / 2)
+	cd := uint8((c + d + 1) / 2)
+	de := uint8((d + e + 1) / 2)
+	abc := uint8((a + 2*b + c + 2) / 4) // (1, 2, 1)/4 average, rounded to nearest
+	bcd := uint8((b + 2*c + d + 2) / 4)
+	cde := uint8((c + 2*d + e + 2) / 4)
+	def := uint8((d + 2*e + f + 2) / 4)
+	efg := uint8((e + 2*f + g + 2) / 4)
+	fgh := uint8((f + 2*g + h + 2) / 4)
+	z.ybr[y+0][x+0] = ab
+	z.ybr[y+0][x+1] = bc
+	z.ybr[y+0][x+2] = cd
+	z.ybr[y+0][x+3] = de
+	z.ybr[y+1][x+0] = abc
+	z.ybr[y+1][x+1] = bcd
+	z.ybr[y+1][x+2] = cde
+	z.ybr[y+1][x+3] = def
+	z.ybr[y+2][x+0] = bc
+	z.ybr[y+2][x+1] = cd
+	z.ybr[y+2][x+2] = de
+	z.ybr[y+2][x+3] = efg // the right-hand column of rows 2 and 3 takes the 3-tap values efg and fgh
+	z.ybr[y+3][x+0] = bcd
+	z.ybr[y+3][x+1] = cde
+	z.ybr[y+3][x+2] = def
+	z.ybr[y+3][x+3] = fgh
+}
+
+func predFunc4HD(z *Decoder, y, x int) { // HD: each row is the row above shifted right by two, with a 2-tap and a 3-tap left-border average entering on the left
+	s := int32(z.ybr[y+3][x-1]) // s, r, q, p: left border, bottom to top; a: top-left corner; b, c, d: first three samples of the row above
+	r := int32(z.ybr[y+2][x-1])
+	q := int32(z.ybr[y+1][x-1])
+	p := int32(z.ybr[y+0][x-1])
+	a := int32(z.ybr[y-1][x-1])
+	b := int32(z.ybr[y-1][x+0])
+	c := int32(z.ybr[y-1][x+1])
+	d := int32(z.ybr[y-1][x+2])
+	sr := uint8((s + r + 1) / 2) // 2-tap average, rounded to nearest
+	rq := uint8((r + q + 1) / 2)
+	qp := uint8((q + p + 1) / 2)
+	pa := uint8((p + a + 1) / 2)
+	srq := uint8((s + 2*r + q + 2) / 4) // (1, 2, 1)/4 average, rounded to nearest
+	rqp := uint8((r + 2*q + p + 2) / 4)
+	qpa := uint8((q + 2*p + a + 2) / 4)
+	pab := uint8((p + 2*a + b + 2) / 4)
+	abc := uint8((a + 2*b + c + 2) / 4)
+	bcd := uint8((b + 2*c + d + 2) / 4)
+	z.ybr[y+0][x+0] = pa
+	z.ybr[y+0][x+1] = pab
+	z.ybr[y+0][x+2] = abc
+	z.ybr[y+0][x+3] = bcd
+	z.ybr[y+1][x+0] = qp
+	z.ybr[y+1][x+1] = qpa
+	z.ybr[y+1][x+2] = pa
+	z.ybr[y+1][x+3] = pab
+	z.ybr[y+2][x+0] = rq
+	z.ybr[y+2][x+1] = rqp
+	z.ybr[y+2][x+2] = qp
+	z.ybr[y+2][x+3] = qpa
+	z.ybr[y+3][x+0] = sr
+	z.ybr[y+3][x+1] = srq
+	z.ybr[y+3][x+2] = rq
+	z.ybr[y+3][x+3] = rqp
+}
+
+func predFunc4HU(z *Decoder, y, x int) { // HU: reads only the left border; each row is the row above shifted left by two, padded on the right with s
+	s := int32(z.ybr[y+3][x-1]) // p, q, r, s are the left-border samples from top to bottom
+	r := int32(z.ybr[y+2][x-1])
+	q := int32(z.ybr[y+1][x-1])
+	p := int32(z.ybr[y+0][x-1])
+	pq := uint8((p + q + 1) / 2) // 2-tap average, rounded to nearest
+	qr := uint8((q + r + 1) / 2)
+	rs := uint8((r + s + 1) / 2)
+	pqr := uint8((p + 2*q + r + 2) / 4) // (1, 2, 1)/4 average, rounded to nearest
+	qrs := uint8((q + 2*r + s + 2) / 4)
+	rss := uint8((r + 2*s + s + 2) / 4)
+	sss := uint8(s)
+	z.ybr[y+0][x+0] = pq
+	z.ybr[y+0][x+1] = pqr
+	z.ybr[y+0][x+2] = qr
+	z.ybr[y+0][x+3] = qrs
+	z.ybr[y+1][x+0] = qr
+	z.ybr[y+1][x+1] = qrs
+	z.ybr[y+1][x+2] = rs
+	z.ybr[y+1][x+3] = rss
+	z.ybr[y+2][x+0] = rs
+	z.ybr[y+2][x+1] = rss
+	z.ybr[y+2][x+2] = sss // from here on every position is the plain bottom-left sample s
+	z.ybr[y+2][x+3] = sss
+	z.ybr[y+3][x+0] = sss
+	z.ybr[y+3][x+1] = sss
+	z.ybr[y+3][x+2] = sss
+	z.ybr[y+3][x+3] = sss
+}
+
+func predFunc8DC(z *Decoder, y, x int) { // DC: fills the 8x8 block whose top-left sample is z.ybr[y][x] with a single average
+	sum := uint32(8)
+	for i := 0; i < 8; i++ {
+		sum += uint32(z.ybr[y-1][x+i]) // the eight samples directly above the block
+	}
+	for j := 0; j < 8; j++ {
+		sum += uint32(z.ybr[y+j][x-1]) // the eight samples directly left of the block
+	}
+	avg := uint8(sum / 16) // sum started at 8 (half the divisor), so this rounds to nearest
+	for j := 0; j < 8; j++ {
+		for i := 0; i < 8; i++ {
+			z.ybr[y+j][x+i] = avg
+		}
+	}
+}
+
+func predFunc8TM(z *Decoder, y, x int) { // TM: each sample of the 8x8 block is (above + left - corner), clipped to [0, 255]
+	delta0 := -int32(z.ybr[y-1][x-1]) // negated top-left corner sample
+	for j := 0; j < 8; j++ {
+		delta1 := delta0 + int32(z.ybr[y+j][x-1])
+		for i := 0; i < 8; i++ {
+			delta2 := delta1 + int32(z.ybr[y-1][x+i])
+			z.ybr[y+j][x+i] = uint8(clip(delta2, 0, 255))
+		}
+	}
+}
+
+func predFunc8VE(z *Decoder, y, x int) { // VE: copies the row above into each of the 8 rows, without smoothing
+	for j := 0; j < 8; j++ {
+		for i := 0; i < 8; i++ {
+			z.ybr[y+j][x+i] = z.ybr[y-1][x+i]
+		}
+	}
+}
+
+func predFunc8HE(z *Decoder, y, x int) { // HE: copies each row's left-border sample across that row, without smoothing
+	for j := 0; j < 8; j++ {
+		for i := 0; i < 8; i++ {
+			z.ybr[y+j][x+i] = z.ybr[y+j][x-1]
+		}
+	}
+}
+
+func predFunc8DCTop(z *Decoder, y, x int) { // DC variant selected by checkTopLeftPred for the top macroblock row: averages only the left border
+	sum := uint32(4)
+	for j := 0; j < 8; j++ {
+		sum += uint32(z.ybr[y+j][x-1]) // the eight samples directly left of the block
+	}
+	avg := uint8(sum / 8) // sum started at 4 (half the divisor), so this rounds to nearest
+	for j := 0; j < 8; j++ {
+		for i := 0; i < 8; i++ {
+			z.ybr[y+j][x+i] = avg
+		}
+	}
+}
+
+func predFunc8DCLeft(z *Decoder, y, x int) { // DC variant selected by checkTopLeftPred for the left macroblock column: averages only the row above
+	sum := uint32(4)
+	for i := 0; i < 8; i++ {
+		sum += uint32(z.ybr[y-1][x+i]) // the eight samples directly above the block
+	}
+	avg := uint8(sum / 8) // sum started at 4 (half the divisor), so this rounds to nearest
+	for j := 0; j < 8; j++ {
+		for i := 0; i < 8; i++ {
+			z.ybr[y+j][x+i] = avg
+		}
+	}
+}
+
+func predFunc8DCTopLeft(z *Decoder, y, x int) { // DC variant selected by checkTopLeftPred for the top-left macroblock: reads no border, fills with 0x80
+	for j := 0; j < 8; j++ {
+		for i := 0; i < 8; i++ {
+			z.ybr[y+j][x+i] = 0x80
+		}
+	}
+}
+
+func predFunc16DC(z *Decoder, y, x int) { // DC: fills the 16x16 block whose top-left sample is z.ybr[y][x] with a single average
+	sum := uint32(16)
+	for i := 0; i < 16; i++ {
+		sum += uint32(z.ybr[y-1][x+i]) // the sixteen samples directly above the block
+	}
+	for j := 0; j < 16; j++ {
+		sum += uint32(z.ybr[y+j][x-1]) // the sixteen samples directly left of the block
+	}
+	avg := uint8(sum / 32) // sum started at 16 (half the divisor), so this rounds to nearest
+	for j := 0; j < 16; j++ {
+		for i := 0; i < 16; i++ {
+			z.ybr[y+j][x+i] = avg
+		}
+	}
+}
+
+func predFunc16TM(z *Decoder, y, x int) { // TM: each sample of the 16x16 block is (above + left - corner), clipped to [0, 255]
+	delta0 := -int32(z.ybr[y-1][x-1]) // negated top-left corner sample
+	for j := 0; j < 16; j++ {
+		delta1 := delta0 + int32(z.ybr[y+j][x-1])
+		for i := 0; i < 16; i++ {
+			delta2 := delta1 + int32(z.ybr[y-1][x+i])
+			z.ybr[y+j][x+i] = uint8(clip(delta2, 0, 255))
+		}
+	}
+}
+
+func predFunc16VE(z *Decoder, y, x int) { // VE: copies the row above into each of the 16 rows, without smoothing
+	for j := 0; j < 16; j++ {
+		for i := 0; i < 16; i++ {
+			z.ybr[y+j][x+i] = z.ybr[y-1][x+i]
+		}
+	}
+}
+
+func predFunc16HE(z *Decoder, y, x int) { // HE: copies each row's left-border sample across that row, without smoothing
+	for j := 0; j < 16; j++ {
+		for i := 0; i < 16; i++ {
+			z.ybr[y+j][x+i] = z.ybr[y+j][x-1]
+		}
+	}
+}
+
+func predFunc16DCTop(z *Decoder, y, x int) { // DC variant selected by checkTopLeftPred for the top macroblock row: averages only the left border
+	sum := uint32(8)
+	for j := 0; j < 16; j++ {
+		sum += uint32(z.ybr[y+j][x-1]) // the sixteen samples directly left of the block
+	}
+	avg := uint8(sum / 16) // sum started at 8 (half the divisor), so this rounds to nearest
+	for j := 0; j < 16; j++ {
+		for i := 0; i < 16; i++ {
+			z.ybr[y+j][x+i] = avg
+		}
+	}
+}
+
+func predFunc16DCLeft(z *Decoder, y, x int) { // DC variant selected by checkTopLeftPred for the left macroblock column: averages only the row above
+	sum := uint32(8)
+	for i := 0; i < 16; i++ {
+		sum += uint32(z.ybr[y-1][x+i]) // the sixteen samples directly above the block
+	}
+	avg := uint8(sum / 16) // sum started at 8 (half the divisor), so this rounds to nearest
+	for j := 0; j < 16; j++ {
+		for i := 0; i < 16; i++ {
+			z.ybr[y+j][x+i] = avg
+		}
+	}
+}
+
+func predFunc16DCTopLeft(z *Decoder, y, x int) { // DC variant selected by checkTopLeftPred for the top-left macroblock: reads no border, fills with 0x80
+	for j := 0; j < 16; j++ {
+		for i := 0; i < 16; i++ {
+			z.ybr[y+j][x+i] = 0x80
+		}
+	}
+}
diff --git a/vp8/quant.go b/vp8/quant.go
new file mode 100644
index 0000000..da43616
--- /dev/null
+++ b/vp8/quant.go
@@ -0,0 +1,98 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package vp8
+
+// This file implements parsing the quantization factors.
+
+// quant are DC/AC quantization factors. In each pair, parseQuant stores the DC factor at index 0 and the AC factor at index 1.
+type quant struct {
+	y1 [2]uint16
+	y2 [2]uint16
+	uv [2]uint16
+}
+
+// clip clips x to the range [min, max] inclusive: it returns min if x < min, max if x > max, and x otherwise.
+func clip(x, min, max int32) int32 {
+	if x < min {
+		return min
+	}
+	if x > max {
+		return max
+	}
+	return x
+}
+
+// parseQuant parses the quantization factors, as specified in section 9.6. It reads a base index and five deltas from d.fp and fills d.quant for each of the nSegment segments.
+func (d *Decoder) parseQuant() {
+	baseQ0 := d.fp.readUint(uniformProb, 7)
+	dqy1DC := d.fp.readOptionalInt(uniformProb, 4)
+	const dqy1AC = 0 // the only delta that is not read from d.fp
+	dqy2DC := d.fp.readOptionalInt(uniformProb, 4)
+	dqy2AC := d.fp.readOptionalInt(uniformProb, 4)
+	dquvDC := d.fp.readOptionalInt(uniformProb, 4)
+	dquvAC := d.fp.readOptionalInt(uniformProb, 4)
+	for i := 0; i < nSegment; i++ {
+		q := int32(baseQ0)
+		if d.segmentHeader.useSegment {
+			if d.segmentHeader.relativeDelta {
+				q += int32(d.segmentHeader.quantizer[i]) // the per-segment value is an offset from baseQ0
+			} else {
+				q = int32(d.segmentHeader.quantizer[i]) // the per-segment value replaces baseQ0
+			}
+		}
+		d.quant[i].y1[0] = dequantTableDC[clip(q+dqy1DC, 0, 127)] // every table index is clipped so that it stays within the 128-entry tables
+		d.quant[i].y1[1] = dequantTableAC[clip(q+dqy1AC, 0, 127)]
+		d.quant[i].y2[0] = dequantTableDC[clip(q+dqy2DC, 0, 127)] * 2
+		d.quant[i].y2[1] = dequantTableAC[clip(q+dqy2AC, 0, 127)] * 155 / 100
+		if d.quant[i].y2[1] < 8 { // the scaled y2 AC factor has a floor of 8
+			d.quant[i].y2[1] = 8
+		}
+		// The 117 is not a typo. The dequant_init function in the spec's Reference
+		// Decoder Source Code (http://tools.ietf.org/html/rfc6386#section-9.6 Page 145)
+		// says to clamp the LHS value at 132, which is equal to dequantTableDC[117].
+		d.quant[i].uv[0] = dequantTableDC[clip(q+dquvDC, 0, 117)]
+		d.quant[i].uv[1] = dequantTableAC[clip(q+dquvAC, 0, 127)]
+	}
+}
+
+// The dequantization tables are specified in section 14.1. parseQuant indexes them with a quantizer index clipped to [0, 127] (or to [0, 117] for the chroma DC factor).
+var (
+	dequantTableDC = [128]uint16{
+		4, 5, 6, 7, 8, 9, 10, 10,
+		11, 12, 13, 14, 15, 16, 17, 17,
+		18, 19, 20, 20, 21, 21, 22, 22,
+		23, 23, 24, 25, 25, 26, 27, 28,
+		29, 30, 31, 32, 33, 34, 35, 36,
+		37, 37, 38, 39, 40, 41, 42, 43,
+		44, 45, 46, 46, 47, 48, 49, 50,
+		51, 52, 53, 54, 55, 56, 57, 58,
+		59, 60, 61, 62, 63, 64, 65, 66,
+		67, 68, 69, 70, 71, 72, 73, 74,
+		75, 76, 76, 77, 78, 79, 80, 81,
+		82, 83, 84, 85, 86, 87, 88, 89,
+		91, 93, 95, 96, 98, 100, 101, 102,
+		104, 106, 108, 110, 112, 114, 116, 118,
+		122, 124, 126, 128, 130, 132, 134, 136,
+		138, 140, 143, 145, 148, 151, 154, 157,
+	}
+	dequantTableAC = [128]uint16{
+		4, 5, 6, 7, 8, 9, 10, 11,
+		12, 13, 14, 15, 16, 17, 18, 19,
+		20, 21, 22, 23, 24, 25, 26, 27,
+		28, 29, 30, 31, 32, 33, 34, 35,
+		36, 37, 38, 39, 40, 41, 42, 43,
+		44, 45, 46, 47, 48, 49, 50, 51,
+		52, 53, 54, 55, 56, 57, 58, 60,
+		62, 64, 66, 68, 70, 72, 74, 76,
+		78, 80, 82, 84, 86, 88, 90, 92,
+		94, 96, 98, 100, 102, 104, 106, 108,
+		110, 112, 114, 116, 119, 122, 125, 128,
+		131, 134, 137, 140, 143, 146, 149, 152,
+		155, 158, 161, 164, 167, 170, 173, 177,
+		181, 185, 189, 193, 197, 201, 205, 209,
+		213, 217, 221, 225, 229, 234, 239, 245,
+		249, 254, 259, 264, 269, 274, 279, 284,
+	}
+)
diff --git a/vp8/reconstruct.go b/vp8/reconstruct.go
new file mode 100644
index 0000000..c1cc4b5
--- /dev/null
+++ b/vp8/reconstruct.go
@@ -0,0 +1,442 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package vp8
+
+// This file implements decoding DCT/WHT residual coefficients and
+// reconstructing YCbCr data equal to predicted values plus residuals.
+//
+// There are 1*16*16 + 2*8*8 + 1*4*4 coefficients per macroblock:
+//	- 1*16*16 luma DCT coefficients,
+//	- 2*8*8 chroma DCT coefficients, and
+//	- 1*4*4 luma WHT coefficients.
+// Coefficients are read in lots of 16, and the later coefficients in each lot
+// are often zero.
+//
+// The YCbCr data consists of 1*16*16 luma values and 2*8*8 chroma values,
+// plus previously decoded values along the top and left borders. The combined
+// values are laid out as a [1+16+1+8][32]uint8 so that vertically adjacent
+// samples are 32 bytes apart. In detail, the layout is:
+//
+//	0 1 2 3 4 5 6 7  8 9 0 1 2 3 4 5  6 7 8 9 0 1 2 3  4 5 6 7 8 9 0 1
+//	. . . . . . . a  b b b b b b b b  b b b b b b b b  c c c c . . . .	0
+//	. . . . . . . d  Y Y Y Y Y Y Y Y  Y Y Y Y Y Y Y Y  . . . . . . . .	1
+//	. . . . . . . d  Y Y Y Y Y Y Y Y  Y Y Y Y Y Y Y Y  . . . . . . . .	2
+//	. . . . . . . d  Y Y Y Y Y Y Y Y  Y Y Y Y Y Y Y Y  . . . . . . . .	3
+//	. . . . . . . d  Y Y Y Y Y Y Y Y  Y Y Y Y Y Y Y Y  c c c c . . . .	4
+//	. . . . . . . d  Y Y Y Y Y Y Y Y  Y Y Y Y Y Y Y Y  . . . . . . . .	5
+//	. . . . . . . d  Y Y Y Y Y Y Y Y  Y Y Y Y Y Y Y Y  . . . . . . . .	6
+//	. . . . . . . d  Y Y Y Y Y Y Y Y  Y Y Y Y Y Y Y Y  . . . . . . . .	7
+//	. . . . . . . d  Y Y Y Y Y Y Y Y  Y Y Y Y Y Y Y Y  c c c c . . . .	8
+//	. . . . . . . d  Y Y Y Y Y Y Y Y  Y Y Y Y Y Y Y Y  . . . . . . . .	9
+//	. . . . . . . d  Y Y Y Y Y Y Y Y  Y Y Y Y Y Y Y Y  . . . . . . . .	10
+//	. . . . . . . d  Y Y Y Y Y Y Y Y  Y Y Y Y Y Y Y Y  . . . . . . . .	11
+//	. . . . . . . d  Y Y Y Y Y Y Y Y  Y Y Y Y Y Y Y Y  c c c c . . . .	12
+//	. . . . . . . d  Y Y Y Y Y Y Y Y  Y Y Y Y Y Y Y Y  . . . . . . . .	13
+//	. . . . . . . d  Y Y Y Y Y Y Y Y  Y Y Y Y Y Y Y Y  . . . . . . . .	14
+//	. . . . . . . d  Y Y Y Y Y Y Y Y  Y Y Y Y Y Y Y Y  . . . . . . . .	15
+//	. . . . . . . d  Y Y Y Y Y Y Y Y  Y Y Y Y Y Y Y Y  . . . . . . . .	16
+//	. . . . . . . e  f f f f f f f f  . . . . . . . g  h h h h h h h h	17
+//	. . . . . . . i  B B B B B B B B  . . . . . . . j  R R R R R R R R	18
+//	. . . . . . . i  B B B B B B B B  . . . . . . . j  R R R R R R R R	19
+//	. . . . . . . i  B B B B B B B B  . . . . . . . j  R R R R R R R R	20
+//	. . . . . . . i  B B B B B B B B  . . . . . . . j  R R R R R R R R	21
+//	. . . . . . . i  B B B B B B B B  . . . . . . . j  R R R R R R R R	22
+//	. . . . . . . i  B B B B B B B B  . . . . . . . j  R R R R R R R R	23
+//	. . . . . . . i  B B B B B B B B  . . . . . . . j  R R R R R R R R	24
+//	. . . . . . . i  B B B B B B B B  . . . . . . . j  R R R R R R R R	25
+//
+// Y, B and R are the reconstructed luma (Y) and chroma (B, R) values.
+// The Y values are predicted (either as one 16x16 region or 16 4x4 regions)
+// based on the row above's Y values (some combination of {abc} or {dYc}) and
+// the column left's Y values (either {ad} or {bY}). Similarly, B and R values
+// are predicted on the row above and column left of their respective 8x8
+// region: {efi} for B, {ghj} for R.
+//
+// For uppermost macroblocks (i.e. those with mby == 0), the {abcefgh} values
+// are initialized to 0x7f. Otherwise, they are copied from the bottom row of
+// the macroblock above. The {c} values are then duplicated from row 0 to rows
+// 4, 8 and 12 of the ybr workspace.
+// Similarly, for leftmost macroblocks (i.e. those with mbx == 0), the {adeigj}
+// values are initialized to 0x81. Otherwise, they are copied from the right
+// column of the macroblock to the left.
+// For the top-left macroblock (with mby == 0 && mbx == 0), {aeg} is 0x7f.
+//
+// When moving from one macroblock to the next horizontally, the {adeigj}
+// values can simply be copied from the workspace to itself, shifted by 8 or
+// 16 columns. When moving from one macroblock to the next vertically,
+// filtering can occur and hence the row values have to be copied from the
+// post-filtered image instead of the pre-filtered workspace.
+
+const ( // offsets into d.coeff of the first chroma plane (B), second chroma plane (R) and WHT coefficients; the luma coefficients start at 0
+	bCoeffBase   = 1*16*16 + 0*8*8
+	rCoeffBase   = 1*16*16 + 1*8*8
+	whtCoeffBase = 1*16*16 + 2*8*8
+)
+
+const ( // column (X suffix) and row (Y suffix) in ybr of the top-left sample of the Y, B and R regions
+	ybrYX = 8
+	ybrYY = 1
+	ybrBX = 8
+	ybrBY = 18
+	ybrRX = 24
+	ybrRY = 18
+)
+
+// prepareYBR prepares the {abcdefghij} elements of ybr, i.e. the top and left border samples of the workspace for the macroblock at (mbx, mby).
+func (d *Decoder) prepareYBR(mbx, mby int) {
+	if mbx == 0 { // leftmost macroblock: the left-border columns (7, plus 23 for the chroma rows) get the constant 0x81
+		for y := 0; y < 17; y++ {
+			d.ybr[y][7] = 0x81
+		}
+		for y := 17; y < 26; y++ {
+			d.ybr[y][7] = 0x81
+			d.ybr[y][23] = 0x81
+		}
+	} else { // otherwise each left border is copied from the rightmost column of the same region, 16 (luma) or 8 (chroma) columns over
+		for y := 0; y < 17; y++ {
+			d.ybr[y][7] = d.ybr[y][7+16]
+		}
+		for y := 17; y < 26; y++ {
+			d.ybr[y][7] = d.ybr[y][15]
+			d.ybr[y][23] = d.ybr[y][31]
+		}
+	}
+	if mby == 0 { // uppermost macroblock: the top-border rows get 0x7f, which also overwrites the corner samples written above
+		for x := 7; x < 28; x++ {
+			d.ybr[0][x] = 0x7f
+		}
+		for x := 7; x < 16; x++ {
+			d.ybr[17][x] = 0x7f
+		}
+		for x := 23; x < 32; x++ {
+			d.ybr[17][x] = 0x7f
+		}
+	} else { // otherwise the top borders are copied from the image row just above this macroblock; the corner samples are left as set above
+		for i := 0; i < 16; i++ {
+			d.ybr[0][8+i] = d.img.Y[(16*mby-1)*d.img.YStride+16*mbx+i]
+		}
+		for i := 0; i < 8; i++ {
+			d.ybr[17][8+i] = d.img.Cb[(8*mby-1)*d.img.CStride+8*mbx+i]
+		}
+		for i := 0; i < 8; i++ {
+			d.ybr[17][24+i] = d.img.Cr[(8*mby-1)*d.img.CStride+8*mbx+i]
+		}
+		if mbx == d.mbw-1 { // presumably the last macroblock column (d.mbw looks like the width in macroblocks): repeat the last luma sample above
+			for i := 16; i < 20; i++ {
+				d.ybr[0][8+i] = d.img.Y[(16*mby-1)*d.img.YStride+16*mbx+15]
+			}
+		} else { // otherwise read the four luma samples just past this macroblock's top-right corner
+			for i := 16; i < 20; i++ {
+				d.ybr[0][8+i] = d.img.Y[(16*mby-1)*d.img.YStride+16*mbx+i]
+			}
+		}
+	}
+	for y := 4; y < 16; y += 4 { // duplicate the four top-right samples of row 0 into rows 4, 8 and 12
+		d.ybr[y][24] = d.ybr[0][24]
+		d.ybr[y][25] = d.ybr[0][25]
+		d.ybr[y][26] = d.ybr[0][26]
+		d.ybr[y][27] = d.ybr[0][27]
+	}
+}
+
+// btou converts a bool to a 0/1 value: true becomes 1 and false becomes 0.
+func btou(b bool) uint8 {
+	if b {
+		return 1
+	}
+	return 0
+}
+
+// pack packs four 0/1 values into four bits of a uint32: x[i] lands at bit position shift+i.
+func pack(x [4]uint8, shift int) uint32 {
+	u := uint32(x[0])<<0 | uint32(x[1])<<1 | uint32(x[2])<<2 | uint32(x[3])<<3
+	return u << uint(shift)
+}
+
+// unpack unpacks four 0/1 values from a four-bit value: unpack[v][i] is bit i of v, the inverse of pack with a zero shift.
+var unpack = [16][4]uint8{
+	{0, 0, 0, 0},
+	{1, 0, 0, 0},
+	{0, 1, 0, 0},
+	{1, 1, 0, 0},
+	{0, 0, 1, 0},
+	{1, 0, 1, 0},
+	{0, 1, 1, 0},
+	{1, 1, 1, 0},
+	{0, 0, 0, 1},
+	{1, 0, 0, 1},
+	{0, 1, 0, 1},
+	{1, 1, 0, 1},
+	{0, 0, 1, 1},
+	{1, 0, 1, 1},
+	{0, 1, 1, 1},
+	{1, 1, 1, 1},
+}
+
+var (
+	// The mapping from 4x4 region position to band is specified in section 13.3.
+	bands = [17]uint8{0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7, 0}
+	// Category probabilities are specified in section 13.2.
+	// Decoding categories 1 and 2 are done inline.
+	cat3456 = [4][12]uint8{
+		{173, 148, 140, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+		{176, 155, 140, 135, 0, 0, 0, 0, 0, 0, 0, 0},
+		{180, 157, 141, 134, 130, 0, 0, 0, 0, 0, 0, 0},
+		{254, 254, 243, 230, 196, 177, 153, 140, 133, 130, 129, 0},
+	}
+	// The zigzag order is:
+	//	0  1  5  6
+	//	2  4  7 12
+	//	3  8 11 13
+	//	9 10 14 15
+	zigzag = [16]uint8{0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15}
+)
+
+// parseResiduals4 parses a 4x4 region of residual coefficients, as specified
+// in section 13.3, and returns a 0/1 value indicating whether there was at
+// least one non-zero coefficient.
+// r is the partition to read bits from.
+// plane and context describe which token probability table to use. context is
+// either 0, 1 or 2, and equals how many of the macroblock left and macroblock
+// above have non-zero coefficients.
+// quant are the DC/AC quantization factors.
+// skipFirstCoeff is whether the DC coefficient has already been parsed.
+// coeffBase is the base index of d.coeff to write to.
+func (d *Decoder) parseResiduals4(r *partition, plane int, context uint8, quant [2]uint16, skipFirstCoeff bool, coeffBase int) uint8 {
+	prob, n := &d.tokenProb[plane], 0
+	if skipFirstCoeff {
+		n = 1
+	}
+	p := prob[bands[n]][context]
+	if !r.readBit(p[0]) { // a 0 bit here means that nothing is coded for this region
+		return 0
+	}
+	for n != 16 {
+		n++
+		if !r.readBit(p[1]) { // zero coefficient: nothing is stored, and the next token uses context 0
+			p = prob[bands[n]][0]
+			continue
+		}
+		var v uint32
+		if !r.readBit(p[2]) { // magnitude 1: the next token uses context 1
+			v = 1
+			p = prob[bands[n]][1]
+		} else {
+			if !r.readBit(p[3]) {
+				if !r.readBit(p[4]) {
+					v = 2
+				} else {
+					v = 3 + r.readUint(p[5], 1)
+				}
+			} else if !r.readBit(p[6]) {
+				if !r.readBit(p[7]) {
+					// Category 1.
+					v = 5 + r.readUint(159, 1)
+				} else {
+					// Category 2.
+					v = 7 + 2*r.readUint(165, 1) + r.readUint(145, 1)
+				}
+			} else {
+				// Categories 3, 4, 5 or 6.
+				b1 := r.readUint(p[8], 1)
+				b0 := r.readUint(p[9+b1], 1)
+				cat := 2*b1 + b0
+				tab := &cat3456[cat]
+				v = 0
+				for i := 0; tab[i] != 0; i++ { // each row of cat3456 ends with a 0 entry, which stops this loop
+					v *= 2
+					v += r.readUint(tab[i], 1)
+				}
+				v += 3 + (8 << cat)
+			}
+			p = prob[bands[n]][2] // magnitude 2 or more: the next token uses context 2
+		}
+		z := zigzag[n-1] // n was incremented above, so n-1 is the zigzag index of the coefficient just decoded
+		c := int32(v) * int32(quant[btou(z > 0)])
+		if r.readBit(uniformProb) { // sign bit
+			c = -c
+		}
+		d.coeff[coeffBase+int(z)] = int16(c)
+		if n == 16 || !r.readBit(p[0]) { // stop after the 16th position, or when a 0 bit ends the region
+			return 1
+		}
+	}
+	return 1
+}
+
+// parseResiduals parses the residuals and returns whether inner loop filtering
+// should be skipped for this macroblock.
+func (d *Decoder) parseResiduals(mbx, mby int) (skip bool) {
+	partition := &d.op[mby&(d.nOP-1)] // partition is selected by macroblock row; the mask presumably relies on d.nOP being a power of two
+	plane := planeY1SansY2
+	quant := &d.quant[d.segment]
+
+	// Parse the DC coefficient of each 4x4 luma region.
+	if d.usePredY16 {
+		nz := d.parseResiduals4(partition, planeY2, d.leftMB.nzY16+d.upMB[mbx].nzY16, quant.y2, false, whtCoeffBase)
+		d.leftMB.nzY16 = nz
+		d.upMB[mbx].nzY16 = nz
+		d.inverseWHT16()
+		plane = planeY1WithY2 // the luma calls below then pass skipFirstCoeff == true
+	}
+
+	var (
+		nzDC, nzAC         [4]uint8
+		nzDCMask, nzACMask uint32
+		coeffBase          int
+	)
+
+	// Parse the luma coefficients.
+	lnz := unpack[d.leftMB.nzMask&0x0f] // low nibble holds the luma flags; lnz and unz are array copies, so updating them leaves unpack intact
+	unz := unpack[d.upMB[mbx].nzMask&0x0f]
+	for y := 0; y < 4; y++ {
+		nz := lnz[y]
+		for x := 0; x < 4; x++ {
+			nz = d.parseResiduals4(partition, plane, nz+unz[x], quant.y1, d.usePredY16, coeffBase)
+			unz[x] = nz
+			nzAC[x] = nz
+			nzDC[x] = btou(d.coeff[coeffBase] != 0) // tests position 0 of this region, which the call above skips when d.usePredY16 is set
+			coeffBase += 16
+		}
+		lnz[y] = nz
+		nzDCMask |= pack(nzDC, y*4) // bit 4*y+x of each mask describes the luma region in column x, row y
+		nzACMask |= pack(nzAC, y*4)
+	}
+	lnzMask := pack(lnz, 0)
+	unzMask := pack(unz, 0)
+
+	// Parse the chroma coefficients.
+	lnz = unpack[d.leftMB.nzMask>>4] // high nibble holds the chroma flags
+	unz = unpack[d.upMB[mbx].nzMask>>4]
+	for c := 0; c < 4; c += 2 { // c == 0 and c == 2 are the two chroma planes, each with 2x2 regions; coeffBase is at bCoeffBase here
+		for y := 0; y < 2; y++ {
+			nz := lnz[y+c]
+			for x := 0; x < 2; x++ {
+				nz = d.parseResiduals4(partition, planeUV, nz+unz[x+c], quant.uv, false, coeffBase)
+				unz[x+c] = nz
+				nzAC[y*2+x] = nz
+				nzDC[y*2+x] = btou(d.coeff[coeffBase] != 0)
+				coeffBase += 16
+			}
+			lnz[y+c] = nz
+		}
+		nzDCMask |= pack(nzDC, 16+c*2) // bits 16-19 describe the first chroma plane, bits 20-23 the second
+		nzACMask |= pack(nzAC, 16+c*2)
+	}
+	lnzMask |= pack(lnz, 4)
+	unzMask |= pack(unz, 4)
+
+	// Save decoder state.
+	d.leftMB.nzMask = uint8(lnzMask)
+	d.upMB[mbx].nzMask = uint8(unzMask)
+	d.nzDCMask = nzDCMask
+	d.nzACMask = nzACMask
+
+	// Section 15.1 of the spec says that "Steps 2 and 4 [of the loop filter]
+	// are skipped... [if] there is no DCT coefficient coded for the whole
+	// macroblock."
+	return nzDCMask == 0 && nzACMask == 0
+}
+
+// reconstructMacroblock applies the predictor functions and adds the inverse-
+// DCT transformed residuals to recover the YCbCr data.
+func (d *Decoder) reconstructMacroblock(mbx, mby int) {
+	if d.usePredY16 {
+		p := checkTopLeftPred(mbx, mby, d.predY16)
+		predFunc16[p](d, 1, 8) // (1, 8) is the luma origin in ybr, the same values as (ybrYY, ybrYX)
+		for j := 0; j < 4; j++ {
+			for i := 0; i < 4; i++ {
+				n := 4*j + i
+				y := 4*j + 1
+				x := 4*i + 8
+				mask := uint32(1) << uint(n)
+				if d.nzACMask&mask != 0 { // parseResiduals4 reported coefficients for region n: full inverse DCT
+					d.inverseDCT4(y, x, 16*n)
+				} else if d.nzDCMask&mask != 0 { // only position 0 of region n is non-zero: DC-only variant
+					d.inverseDCT4DCOnly(y, x, 16*n)
+				}
+			}
+		}
+	} else {
+		for j := 0; j < 4; j++ {
+			for i := 0; i < 4; i++ {
+				n := 4*j + i
+				y := 4*j + 1
+				x := 4*i + 8
+				predFunc4[d.predY4[j][i]](d, y, x) // 4x4 modes: each region is predicted and then has its residual added before the next one
+				mask := uint32(1) << uint(n)
+				if d.nzACMask&mask != 0 {
+					d.inverseDCT4(y, x, 16*n)
+				} else if d.nzDCMask&mask != 0 {
+					d.inverseDCT4DCOnly(y, x, 16*n)
+				}
+			}
+		}
+	}
+	p := checkTopLeftPred(mbx, mby, d.predC8) // one chroma mode is shared by both chroma planes
+	predFunc8[p](d, ybrBY, ybrBX)
+	if d.nzACMask&0x0f0000 != 0 { // bits 16-19: the four regions of the first chroma plane
+		d.inverseDCT8(ybrBY, ybrBX, bCoeffBase)
+	} else if d.nzDCMask&0x0f0000 != 0 {
+		d.inverseDCT8DCOnly(ybrBY, ybrBX, bCoeffBase)
+	}
+	predFunc8[p](d, ybrRY, ybrRX)
+	if d.nzACMask&0xf00000 != 0 { // bits 20-23: the four regions of the second chroma plane
+		d.inverseDCT8(ybrRY, ybrRX, rCoeffBase)
+	} else if d.nzDCMask&0xf00000 != 0 {
+		d.inverseDCT8DCOnly(ybrRY, ybrRX, rCoeffBase)
+	}
+}
+
+// reconstruct reconstructs one macroblock and returns whether inner loop
+// filtering should be skipped for it.
+func (d *Decoder) reconstruct(mbx, mby int) (skip bool) {
+	if d.segmentHeader.updateMap { // the segment number is read as a two-level tree: one branch bit, then one more value
+		if !d.fp.readBit(d.segmentHeader.prob[0]) {
+			d.segment = int(d.fp.readUint(d.segmentHeader.prob[1], 1)) // segment 0 or 1, assuming readUint(p, 1) yields a single bit
+		} else {
+			d.segment = int(d.fp.readUint(d.segmentHeader.prob[2], 1)) + 2 // segment 2 or 3, under the same assumption
+		}
+	}
+	if d.useSkipProb {
+		skip = d.fp.readBit(d.skipProb)
+	}
+	// Prepare the workspace.
+	for i := range d.coeff {
+		d.coeff[i] = 0
+	}
+	d.prepareYBR(mbx, mby)
+	// Parse the predictor modes.
+	d.usePredY16 = d.fp.readBit(145)
+	if d.usePredY16 {
+		d.parsePredModeY16(mbx)
+	} else {
+		d.parsePredModeY4(mbx)
+	}
+	d.parsePredModeC8()
+	// Parse the residuals.
+	if !skip {
+		skip = d.parseResiduals(mbx, mby)
+	} else { // no residuals are coded: clear the non-zero flags that parseResiduals would otherwise have set
+		if d.usePredY16 {
+			d.leftMB.nzY16 = 0
+			d.upMB[mbx].nzY16 = 0
+		}
+		d.leftMB.nzMask = 0
+		d.upMB[mbx].nzMask = 0
+		d.nzDCMask = 0
+		d.nzACMask = 0
+	}
+	// Reconstruct the YCbCr data and copy it to the image.
+	d.reconstructMacroblock(mbx, mby)
+	for i, y := (mby*d.img.YStride+mbx)*16, 0; y < 16; i, y = i+d.img.YStride, y+1 { // i is the offset in d.img.Y of row y of this macroblock
+		copy(d.img.Y[i:i+16], d.ybr[ybrYY+y][ybrYX:ybrYX+16])
+	}
+	for i, y := (mby*d.img.CStride+mbx)*8, 0; y < 8; i, y = i+d.img.CStride, y+1 { // the same offset is used for both chroma planes
+		copy(d.img.Cb[i:i+8], d.ybr[ybrBY+y][ybrBX:ybrBX+8])
+		copy(d.img.Cr[i:i+8], d.ybr[ybrRY+y][ybrRX:ybrRX+8])
+	}
+	return skip
+}
diff --git a/vp8/token.go b/vp8/token.go
new file mode 100644
index 0000000..da99cf0
--- /dev/null
+++ b/vp8/token.go
@@ -0,0 +1,381 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package vp8
+
+// This file contains token probabilities for decoding DCT/WHT coefficients, as
+// specified in chapter 13.
+
+func (d *Decoder) parseTokenProb() {
+	for plane := range d.tokenProb {
+		for band := range d.tokenProb[plane] {
+			for ctx := range d.tokenProb[plane][band] {
+				for n := range d.tokenProb[plane][band][ctx] {
+					if d.fp.readBit(tokenProbUpdateProb[plane][band][ctx][n]) {
+						d.tokenProb[plane][band][ctx][n] = uint8(d.fp.readUint(uniformProb, 8))
+					}
+				}
+			}
+		}
+	}
+}
+
+// The plane enumeration is specified in section 13.3.
+const (
+	planeY1WithY2 = iota
+	planeY2
+	planeUV
+	planeY1SansY2
+	nPlane
+)
+
+const (
+	nBand    = 8
+	nContext = 3
+	nProb    = 11
+)
+
+// tokenProbUpdateProb holds, indexed by [plane][band][context][prob], the probability that each token probability is updated, as specified in section 13.4.
+var tokenProbUpdateProb = [nPlane][nBand][nContext][nProb]uint8{
+	{
+		{
+			{255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255},
+			{255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255},
+			{255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255},
+		},
+		{
+			{176, 246, 255, 255, 255, 255, 255, 255, 255, 255, 255},
+			{223, 241, 252, 255, 255, 255, 255, 255, 255, 255, 255},
+			{249, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255},
+		},
+		{
+			{255, 244, 252, 255, 255, 255, 255, 255, 255, 255, 255},
+			{234, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255},
+			{253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255},
+		},
+		{
+			{255, 246, 254, 255, 255, 255, 255, 255, 255, 255, 255},
+			{239, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255},
+			{254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255},
+		},
+		{
+			{255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255},
+			{251, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255},
+			{255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255},
+		},
+		{
+			{255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255},
+			{251, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255},
+			{254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255},
+		},
+		{
+			{255, 254, 253, 255, 254, 255, 255, 255, 255, 255, 255},
+			{250, 255, 254, 255, 254, 255, 255, 255, 255, 255, 255},
+			{254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255},
+		},
+		{
+			{255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255},
+			{255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255},
+			{255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255},
+		},
+	},
+	{
+		{
+			{217, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255},
+			{225, 252, 241, 253, 255, 255, 254, 255, 255, 255, 255},
+			{234, 250, 241, 250, 253, 255, 253, 254, 255, 255, 255},
+		},
+		{
+			{255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255},
+			{223, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255},
+			{238, 253, 254, 254, 255, 255, 255, 255, 255, 255, 255},
+		},
+		{
+			{255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255},
+			{249, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255},
+			{255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255},
+		},
+		{
+			{255, 253, 255, 255, 255, 255, 255, 255, 255, 255, 255},
+			{247, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255},
+			{255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255},
+		},
+		{
+			{255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255},
+			{252, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255},
+			{255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255},
+		},
+		{
+			{255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255},
+			{253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255},
+			{255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255},
+		},
+		{
+			{255, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255},
+			{250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255},
+			{254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255},
+		},
+		{
+			{255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255},
+			{255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255},
+			{255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255},
+		},
+	},
+	{
+		{
+			{186, 251, 250, 255, 255, 255, 255, 255, 255, 255, 255},
+			{234, 251, 244, 254, 255, 255, 255, 255, 255, 255, 255},
+			{251, 251, 243, 253, 254, 255, 254, 255, 255, 255, 255},
+		},
+		{
+			{255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255},
+			{236, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255},
+			{251, 253, 253, 254, 254, 255, 255, 255, 255, 255, 255},
+		},
+		{
+			{255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255},
+			{254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255},
+			{255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255},
+		},
+		{
+			{255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255},
+			{254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255},
+			{254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255},
+		},
+		{
+			{255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255},
+			{254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255},
+			{255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255},
+		},
+		{
+			{255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255},
+			{255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255},
+			{255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255},
+		},
+		{
+			{255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255},
+			{255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255},
+			{255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255},
+		},
+		{
+			{255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255},
+			{255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255},
+			{255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255},
+		},
+	},
+	{
+		{
+			{248, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255},
+			{250, 254, 252, 254, 255, 255, 255, 255, 255, 255, 255},
+			{248, 254, 249, 253, 255, 255, 255, 255, 255, 255, 255},
+		},
+		{
+			{255, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255},
+			{246, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255},
+			{252, 254, 251, 254, 254, 255, 255, 255, 255, 255, 255},
+		},
+		{
+			{255, 254, 252, 255, 255, 255, 255, 255, 255, 255, 255},
+			{248, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255},
+			{253, 255, 254, 254, 255, 255, 255, 255, 255, 255, 255},
+		},
+		{
+			{255, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255},
+			{245, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255},
+			{253, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255},
+		},
+		{
+			{255, 251, 253, 255, 255, 255, 255, 255, 255, 255, 255},
+			{252, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255},
+			{255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255},
+		},
+		{
+			{255, 252, 255, 255, 255, 255, 255, 255, 255, 255, 255},
+			{249, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255},
+			{255, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255},
+		},
+		{
+			{255, 255, 253, 255, 255, 255, 255, 255, 255, 255, 255},
+			{250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255},
+			{255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255},
+		},
+		{
+			{255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255},
+			{254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255},
+			{255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255},
+		},
+	},
+}
+
+// defaultTokenProb holds the default token probabilities, indexed by [plane][band][context][prob], as specified in section 13.5.
+var defaultTokenProb = [nPlane][nBand][nContext][nProb]uint8{
+	{
+		{
+			{128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+			{128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+			{128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+		},
+		{
+			{253, 136, 254, 255, 228, 219, 128, 128, 128, 128, 128},
+			{189, 129, 242, 255, 227, 213, 255, 219, 128, 128, 128},
+			{106, 126, 227, 252, 214, 209, 255, 255, 128, 128, 128},
+		},
+		{
+			{1, 98, 248, 255, 236, 226, 255, 255, 128, 128, 128},
+			{181, 133, 238, 254, 221, 234, 255, 154, 128, 128, 128},
+			{78, 134, 202, 247, 198, 180, 255, 219, 128, 128, 128},
+		},
+		{
+			{1, 185, 249, 255, 243, 255, 128, 128, 128, 128, 128},
+			{184, 150, 247, 255, 236, 224, 128, 128, 128, 128, 128},
+			{77, 110, 216, 255, 236, 230, 128, 128, 128, 128, 128},
+		},
+		{
+			{1, 101, 251, 255, 241, 255, 128, 128, 128, 128, 128},
+			{170, 139, 241, 252, 236, 209, 255, 255, 128, 128, 128},
+			{37, 116, 196, 243, 228, 255, 255, 255, 128, 128, 128},
+		},
+		{
+			{1, 204, 254, 255, 245, 255, 128, 128, 128, 128, 128},
+			{207, 160, 250, 255, 238, 128, 128, 128, 128, 128, 128},
+			{102, 103, 231, 255, 211, 171, 128, 128, 128, 128, 128},
+		},
+		{
+			{1, 152, 252, 255, 240, 255, 128, 128, 128, 128, 128},
+			{177, 135, 243, 255, 234, 225, 128, 128, 128, 128, 128},
+			{80, 129, 211, 255, 194, 224, 128, 128, 128, 128, 128},
+		},
+		{
+			{1, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128},
+			{246, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128},
+			{255, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+		},
+	},
+	{
+		{
+			{198, 35, 237, 223, 193, 187, 162, 160, 145, 155, 62},
+			{131, 45, 198, 221, 172, 176, 220, 157, 252, 221, 1},
+			{68, 47, 146, 208, 149, 167, 221, 162, 255, 223, 128},
+		},
+		{
+			{1, 149, 241, 255, 221, 224, 255, 255, 128, 128, 128},
+			{184, 141, 234, 253, 222, 220, 255, 199, 128, 128, 128},
+			{81, 99, 181, 242, 176, 190, 249, 202, 255, 255, 128},
+		},
+		{
+			{1, 129, 232, 253, 214, 197, 242, 196, 255, 255, 128},
+			{99, 121, 210, 250, 201, 198, 255, 202, 128, 128, 128},
+			{23, 91, 163, 242, 170, 187, 247, 210, 255, 255, 128},
+		},
+		{
+			{1, 200, 246, 255, 234, 255, 128, 128, 128, 128, 128},
+			{109, 178, 241, 255, 231, 245, 255, 255, 128, 128, 128},
+			{44, 130, 201, 253, 205, 192, 255, 255, 128, 128, 128},
+		},
+		{
+			{1, 132, 239, 251, 219, 209, 255, 165, 128, 128, 128},
+			{94, 136, 225, 251, 218, 190, 255, 255, 128, 128, 128},
+			{22, 100, 174, 245, 186, 161, 255, 199, 128, 128, 128},
+		},
+		{
+			{1, 182, 249, 255, 232, 235, 128, 128, 128, 128, 128},
+			{124, 143, 241, 255, 227, 234, 128, 128, 128, 128, 128},
+			{35, 77, 181, 251, 193, 211, 255, 205, 128, 128, 128},
+		},
+		{
+			{1, 157, 247, 255, 236, 231, 255, 255, 128, 128, 128},
+			{121, 141, 235, 255, 225, 227, 255, 255, 128, 128, 128},
+			{45, 99, 188, 251, 195, 217, 255, 224, 128, 128, 128},
+		},
+		{
+			{1, 1, 251, 255, 213, 255, 128, 128, 128, 128, 128},
+			{203, 1, 248, 255, 255, 128, 128, 128, 128, 128, 128},
+			{137, 1, 177, 255, 224, 255, 128, 128, 128, 128, 128},
+		},
+	},
+	{
+		{
+			{253, 9, 248, 251, 207, 208, 255, 192, 128, 128, 128},
+			{175, 13, 224, 243, 193, 185, 249, 198, 255, 255, 128},
+			{73, 17, 171, 221, 161, 179, 236, 167, 255, 234, 128},
+		},
+		{
+			{1, 95, 247, 253, 212, 183, 255, 255, 128, 128, 128},
+			{239, 90, 244, 250, 211, 209, 255, 255, 128, 128, 128},
+			{155, 77, 195, 248, 188, 195, 255, 255, 128, 128, 128},
+		},
+		{
+			{1, 24, 239, 251, 218, 219, 255, 205, 128, 128, 128},
+			{201, 51, 219, 255, 196, 186, 128, 128, 128, 128, 128},
+			{69, 46, 190, 239, 201, 218, 255, 228, 128, 128, 128},
+		},
+		{
+			{1, 191, 251, 255, 255, 128, 128, 128, 128, 128, 128},
+			{223, 165, 249, 255, 213, 255, 128, 128, 128, 128, 128},
+			{141, 124, 248, 255, 255, 128, 128, 128, 128, 128, 128},
+		},
+		{
+			{1, 16, 248, 255, 255, 128, 128, 128, 128, 128, 128},
+			{190, 36, 230, 255, 236, 255, 128, 128, 128, 128, 128},
+			{149, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128},
+		},
+		{
+			{1, 226, 255, 128, 128, 128, 128, 128, 128, 128, 128},
+			{247, 192, 255, 128, 128, 128, 128, 128, 128, 128, 128},
+			{240, 128, 255, 128, 128, 128, 128, 128, 128, 128, 128},
+		},
+		{
+			{1, 134, 252, 255, 255, 128, 128, 128, 128, 128, 128},
+			{213, 62, 250, 255, 255, 128, 128, 128, 128, 128, 128},
+			{55, 93, 255, 128, 128, 128, 128, 128, 128, 128, 128},
+		},
+		{
+			{128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+			{128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+			{128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+		},
+	},
+	{
+		{
+			{202, 24, 213, 235, 186, 191, 220, 160, 240, 175, 255},
+			{126, 38, 182, 232, 169, 184, 228, 174, 255, 187, 128},
+			{61, 46, 138, 219, 151, 178, 240, 170, 255, 216, 128},
+		},
+		{
+			{1, 112, 230, 250, 199, 191, 247, 159, 255, 255, 128},
+			{166, 109, 228, 252, 211, 215, 255, 174, 128, 128, 128},
+			{39, 77, 162, 232, 172, 180, 245, 178, 255, 255, 128},
+		},
+		{
+			{1, 52, 220, 246, 198, 199, 249, 220, 255, 255, 128},
+			{124, 74, 191, 243, 183, 193, 250, 221, 255, 255, 128},
+			{24, 71, 130, 219, 154, 170, 243, 182, 255, 255, 128},
+		},
+		{
+			{1, 182, 225, 249, 219, 240, 255, 224, 128, 128, 128},
+			{149, 150, 226, 252, 216, 205, 255, 171, 128, 128, 128},
+			{28, 108, 170, 242, 183, 194, 254, 223, 255, 255, 128},
+		},
+		{
+			{1, 81, 230, 252, 204, 203, 255, 192, 128, 128, 128},
+			{123, 102, 209, 247, 188, 196, 255, 233, 128, 128, 128},
+			{20, 95, 153, 243, 164, 173, 255, 203, 128, 128, 128},
+		},
+		{
+			{1, 222, 248, 255, 216, 213, 128, 128, 128, 128, 128},
+			{168, 175, 246, 252, 235, 205, 255, 255, 128, 128, 128},
+			{47, 116, 215, 255, 211, 212, 255, 255, 128, 128, 128},
+		},
+		{
+			{1, 121, 236, 253, 212, 214, 255, 255, 128, 128, 128},
+			{141, 84, 213, 252, 201, 202, 255, 219, 128, 128, 128},
+			{42, 80, 160, 240, 162, 185, 255, 205, 128, 128, 128},
+		},
+		{
+			{1, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128},
+			{244, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128},
+			{238, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128},
+		},
+	},
+}
diff --git a/vp8l/decode.go b/vp8l/decode.go
new file mode 100644
index 0000000..4319487
--- /dev/null
+++ b/vp8l/decode.go
@@ -0,0 +1,603 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package vp8l implements a decoder for the VP8L lossless image format.
+//
+// The VP8L specification is at:
+// https://developers.google.com/speed/webp/docs/webp_lossless_bitstream_specification
+package vp8l // import "golang.org/x/image/vp8l"
+
+import (
+	"bufio"
+	"errors"
+	"image"
+	"image/color"
+	"io"
+)
+
+var (
+	errInvalidCodeLengths = errors.New("vp8l: invalid code lengths")
+	errInvalidHuffmanTree = errors.New("vp8l: invalid Huffman tree")
+)
+
+// colorCacheMultiplier is the multiplier used for the color cache hash
+// function, specified in section 4.2.3.
+const colorCacheMultiplier = 0x1e35a7bd
+
+// distanceMapTable is the look-up table for distanceMap.
+var distanceMapTable = [120]uint8{
+	0x18, 0x07, 0x17, 0x19, 0x28, 0x06, 0x27, 0x29, 0x16, 0x1a,
+	0x26, 0x2a, 0x38, 0x05, 0x37, 0x39, 0x15, 0x1b, 0x36, 0x3a,
+	0x25, 0x2b, 0x48, 0x04, 0x47, 0x49, 0x14, 0x1c, 0x35, 0x3b,
+	0x46, 0x4a, 0x24, 0x2c, 0x58, 0x45, 0x4b, 0x34, 0x3c, 0x03,
+	0x57, 0x59, 0x13, 0x1d, 0x56, 0x5a, 0x23, 0x2d, 0x44, 0x4c,
+	0x55, 0x5b, 0x33, 0x3d, 0x68, 0x02, 0x67, 0x69, 0x12, 0x1e,
+	0x66, 0x6a, 0x22, 0x2e, 0x54, 0x5c, 0x43, 0x4d, 0x65, 0x6b,
+	0x32, 0x3e, 0x78, 0x01, 0x77, 0x79, 0x53, 0x5d, 0x11, 0x1f,
+	0x64, 0x6c, 0x42, 0x4e, 0x76, 0x7a, 0x21, 0x2f, 0x75, 0x7b,
+	0x31, 0x3f, 0x63, 0x6d, 0x52, 0x5e, 0x00, 0x74, 0x7c, 0x41,
+	0x4f, 0x10, 0x20, 0x62, 0x6e, 0x30, 0x73, 0x7d, 0x51, 0x5f,
+	0x40, 0x72, 0x7e, 0x61, 0x6f, 0x50, 0x71, 0x7f, 0x60, 0x70,
+}
+
+// distanceMap maps an LZ77 backwards reference distance code to a distance in
+// pixels for an image that is w pixels wide, as specified in section 4.2.2.
+func distanceMap(w int32, code uint32) int32 {
+	if c, n := int32(code), int32(len(distanceMapTable)); c > n {
+		return c - n
+	}
+	entry := int32(distanceMapTable[code-1])
+	dy := entry >> 4
+	dx := 8 - entry&0xf
+	if dist := dy*w + dx; dist > 0 {
+		return dist
+	}
+	return 1
+}
+
+// decoder holds the bit-stream for a VP8L image.
+type decoder struct {
+	r     io.ByteReader // source of the encoded bytes
+	bits  uint32        // bits read from r but not yet consumed, least significant first
+	nBits uint32        // number of valid bits held in bits
+}
+
+// read returns the next n bits of the decoder's bit-stream, low bits first.
+func (d *decoder) read(n uint32) (uint32, error) {
+	for d.nBits < n {
+		b, err := d.r.ReadByte()
+		if err == io.EOF {
+			return 0, io.ErrUnexpectedEOF
+		}
+		if err != nil {
+			return 0, err
+		}
+		d.bits |= uint32(b) << d.nBits
+		d.nBits += 8
+	}
+	v := d.bits & (1<<n - 1)
+	d.bits >>= n
+	d.nBits -= n
+	return v, nil
+}
+
+// decodeTransform decodes the next transform for a w by h image. It returns the
+// transform and the width of the image after transformation (or equivalently,
+// before inverse transformation), as specified in section 3.
+func (d *decoder) decodeTransform(w int32, h int32) (t transform, newWidth int32, err error) {
+	t.oldWidth = w
+	if t.transformType, err = d.read(2); err != nil {
+		return transform{}, 0, err
+	}
+	switch t.transformType {
+	case transformTypePredictor, transformTypeCrossColor:
+		// The transform data is itself pixel data, nTiles(w) by nTiles(h) in size.
+		if t.bits, err = d.read(3); err != nil {
+			return transform{}, 0, err
+		}
+		t.bits += 2
+		t.pix, err = d.decodePix(nTiles(w, t.bits), nTiles(h, t.bits), 0, false)
+		if err != nil {
+			return transform{}, 0, err
+		}
+	case transformTypeSubtractGreen:
+		// No-op.
+	case transformTypeColorIndexing:
+		nColors, err := d.read(8)
+		if err != nil {
+			return transform{}, 0, err
+		}
+		nColors++
+		t.bits = 0
+		switch {
+		case nColors <= 2:
+			t.bits = 3
+		case nColors <= 4:
+			t.bits = 2
+		case nColors <= 16:
+			t.bits = 1
+		}
+		w = nTiles(w, t.bits)
+		palette, err := d.decodePix(int32(nColors), 1, 4*256, false)
+		if err != nil {
+			return transform{}, 0, err
+		}
+		// Each color is stored as a per-channel delta from the previous one.
+		for i := 4; i < len(palette); i += 4 {
+			palette[i+0] += palette[i-4]
+			palette[i+1] += palette[i-3]
+			palette[i+2] += palette[i-2]
+			palette[i+3] += palette[i-1]
+		}
+		// The spec says that "if the index is equal or larger than color_table_size,
+		// the argb color value should be set to 0x00000000 (transparent black)."
+		// The capacity requested above lets us re-slice up to 256 4-byte pixels.
+		t.pix = palette[:4*256]
+	}
+	return t, w, nil
+}
+
+// repeatsCodeLength is the minimum code length for repeated codes.
+const repeatsCodeLength = 16
+
+// These magic numbers are specified at the end of section 5.2.2.
+// The 3-length arrays apply to code lengths >= repeatsCodeLength.
+var (
+	codeLengthCodeOrder = [19]uint8{
+		17, 18, 0, 1, 2, 3, 4, 5, 16, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+	}
+	repeatBits    = [3]uint8{2, 3, 7}
+	repeatOffsets = [3]uint8{3, 3, 11}
+)
+
+// decodeCodeLengths decodes a Huffman tree's code lengths which are themselves
+// encoded via a Huffman tree, specified in section 5.2.2.
+func (d *decoder) decodeCodeLengths(dst []uint32, codeLengthCodeLengths []uint32) error {
+	h := hTree{}
+	if err := h.build(codeLengthCodeLengths); err != nil {
+		return err
+	}
+
+	maxSymbol := len(dst)
+	useLength, err := d.read(1)
+	if err != nil {
+		return err
+	}
+	if useLength != 0 {
+		n, err := d.read(3)
+		if err != nil {
+			return err
+		}
+		n = 2 + 2*n
+		ms, err := d.read(n)
+		if err != nil {
+			return err
+		}
+		maxSymbol = int(ms) + 2
+		if maxSymbol > len(dst) {
+			return errInvalidCodeLengths
+		}
+	}
+
+	// The spec says that "if code 16 [meaning repeat] is used before
+	// a non-zero value has been emitted, a value of 8 is repeated."
+	prevCodeLength := uint32(8)
+
+	for symbol := 0; symbol < len(dst); {
+		if maxSymbol == 0 {
+			break
+		}
+		maxSymbol--
+		codeLength, err := h.next(d)
+		if err != nil {
+			return err
+		}
+		if codeLength < repeatsCodeLength {
+			dst[symbol] = codeLength
+			symbol++
+			if codeLength != 0 {
+				prevCodeLength = codeLength
+			}
+			continue
+		}
+
+		repeat, err := d.read(uint32(repeatBits[codeLength-repeatsCodeLength]))
+		if err != nil {
+			return err
+		}
+		repeat += uint32(repeatOffsets[codeLength-repeatsCodeLength])
+		if symbol+int(repeat) > len(dst) {
+			return errInvalidCodeLengths
+		}
+		// A code length of 16 repeats the previous non-zero code.
+		// A code length of 17 or 18 repeats zeroes.
+		cl := uint32(0)
+		if codeLength == 16 {
+			cl = prevCodeLength
+		}
+		for ; repeat > 0; repeat-- {
+			dst[symbol] = cl
+			symbol++
+		}
+	}
+	return nil
+}
+
+// decodeHuffmanTree decodes a Huffman tree over alphabetSize symbols into h. The
+// tree is stored either as one or two explicit symbols or as code lengths.
+func (d *decoder) decodeHuffmanTree(h *hTree, alphabetSize uint32) error {
+	simple, err := d.read(1)
+	if err != nil {
+		return err
+	}
+	if simple != 0 {
+		// Simple code: one or two symbols stored directly.
+		count, err := d.read(1)
+		if err != nil {
+			return err
+		}
+		count++
+		// The first symbol is 1 or 8 bits wide; the second is always 8 bits.
+		wide, err := d.read(1)
+		if err != nil {
+			return err
+		}
+		var syms [2]uint32
+		if syms[0], err = d.read(7*wide + 1); err != nil {
+			return err
+		}
+		if count == 2 {
+			if syms[1], err = d.read(8); err != nil {
+				return err
+			}
+		}
+		return h.buildSimple(count, syms, alphabetSize)
+	}
+
+	// Normal code: read the code length code lengths in codeLengthCodeOrder.
+	n, err := d.read(4)
+	if err != nil {
+		return err
+	}
+	n += 4
+	if int(n) > len(codeLengthCodeOrder) {
+		return errInvalidHuffmanTree
+	}
+	var clcl [len(codeLengthCodeOrder)]uint32
+	for _, sym := range codeLengthCodeOrder[:n] {
+		if clcl[sym], err = d.read(3); err != nil {
+			return err
+		}
+	}
+	lengths := make([]uint32, alphabetSize)
+	if err := d.decodeCodeLengths(lengths, clcl[:]); err != nil {
+		return err
+	}
+	return h.build(lengths)
+}
+
+const (
+	huffGreen    = 0
+	huffRed      = 1
+	huffBlue     = 2
+	huffAlpha    = 3
+	huffDistance = 4
+	nHuff        = 5
+)
+
+// hGroup is an array of 5 Huffman trees.
+type hGroup [nHuff]hTree
+
+// decodeHuffmanGroups decodes the one or more hGroups used to decode the pixel
+// data. When a single hGroup serves the entire image, hPix and hBits are zero.
+// Otherwise hPix is the meta-image that maps each tile to an hGroup index, and
+// hBits is the log-2 tile size. Only a top-level image may have a meta-image.
+func (d *decoder) decodeHuffmanGroups(w int32, h int32, topLevel bool, ccBits uint32) (
+	hGroups []hGroup, hPix []byte, hBits uint32, err error) {
+
+	nGroups := 1
+	if topLevel {
+		meta, err := d.read(1)
+		if err != nil {
+			return nil, nil, 0, err
+		}
+		if meta != 0 {
+			if hBits, err = d.read(3); err != nil {
+				return nil, nil, 0, err
+			}
+			hBits += 2
+			hPix, err = d.decodePix(nTiles(w, hBits), nTiles(h, hBits), 0, false)
+			if err != nil {
+				return nil, nil, 0, err
+			}
+			// Each meta-image pixel holds a group index in its first two bytes.
+			for p := 0; p < len(hPix); p += 4 {
+				if n := 1 + (int(hPix[p])<<8 | int(hPix[p+1])); nGroups < n {
+					nGroups = n
+				}
+			}
+		}
+	}
+	hGroups = make([]hGroup, nGroups)
+	// The green alphabet also carries the color cache indices, if any.
+	for g := range hGroups {
+		for k, size := range alphabetSizes {
+			if k == huffGreen && ccBits > 0 {
+				size += 1 << ccBits
+			}
+			if err := d.decodeHuffmanTree(&hGroups[g][k], size); err != nil {
+				return nil, nil, 0, err
+			}
+		}
+	}
+	return hGroups, hPix, hBits, nil
+}
+
+const (
+	nLiteralCodes  = 256
+	nLengthCodes   = 24
+	nDistanceCodes = 40
+)
+
+var alphabetSizes = [nHuff]uint32{
+	nLiteralCodes + nLengthCodes,
+	nLiteralCodes,
+	nLiteralCodes,
+	nLiteralCodes,
+	nDistanceCodes,
+}
+
+// decodePix decodes w by h pixels, specified in section 5.2.2. The result holds 4 bytes (R, G, B, A) per pixel and has capacity of at least minCap bytes.
+func (d *decoder) decodePix(w int32, h int32, minCap int32, topLevel bool) ([]byte, error) {
+	// Decode the color cache parameters. ccBits is the log-2 number of cache entries.
+	ccBits, ccShift, ccEntries := uint32(0), uint32(0), ([]uint32)(nil)
+	useColorCache, err := d.read(1)
+	if err != nil {
+		return nil, err
+	}
+	if useColorCache != 0 {
+		ccBits, err = d.read(4)
+		if err != nil {
+			return nil, err
+		}
+		if ccBits < 1 || 11 < ccBits {
+			return nil, errors.New("vp8l: invalid color cache parameters")
+		}
+		ccShift = 32 - ccBits // the cache index is the top ccBits bits of the hash
+		ccEntries = make([]uint32, 1<<ccBits)
+	}
+
+	// Decode the Huffman groups and, if present, the meta-image that selects among them.
+	hGroups, hPix, hBits, err := d.decodeHuffmanGroups(w, h, topLevel, ccBits)
+	if err != nil {
+		return nil, err
+	}
+	hMask, tilesPerRow := int32(0), int32(0)
+	if hBits != 0 {
+		hMask, tilesPerRow = 1<<hBits-1, nTiles(w, hBits)
+	}
+
+	// Decode the pixels. p is the byte offset of the next pixel, (x, y) is its position, and pixels before cachedP are already in the color cache.
+	if minCap < 4*w*h {
+		minCap = 4 * w * h
+	}
+	pix := make([]byte, 4*w*h, minCap)
+	p, cachedP := 0, 0
+	x, y := int32(0), int32(0)
+	hg, lookupHG := &hGroups[0], hMask != 0
+	for p < len(pix) {
+		if lookupHG {
+			i := 4 * (tilesPerRow*(y>>hBits) + (x >> hBits))
+			hg = &hGroups[uint32(hPix[i])<<8|uint32(hPix[i+1])]
+		}
+
+		green, err := hg[huffGreen].next(d)
+		if err != nil {
+			return nil, err
+		}
+		switch {
+		case green < nLiteralCodes:
+			// We have a literal pixel: the green symbol is the green value itself.
+			red, err := hg[huffRed].next(d)
+			if err != nil {
+				return nil, err
+			}
+			blue, err := hg[huffBlue].next(d)
+			if err != nil {
+				return nil, err
+			}
+			alpha, err := hg[huffAlpha].next(d)
+			if err != nil {
+				return nil, err
+			}
+			pix[p+0] = uint8(red)
+			pix[p+1] = uint8(green)
+			pix[p+2] = uint8(blue)
+			pix[p+3] = uint8(alpha)
+			p += 4
+
+			x++
+			if x == w {
+				x, y = 0, y+1
+			}
+			lookupHG = hMask != 0 && x&hMask == 0 // re-select the group when x enters a new tile column
+
+		case green < nLiteralCodes+nLengthCodes:
+			// We have an LZ77 backwards reference: copy length pixels from dist pixels back.
+			length, err := d.lz77Param(green - nLiteralCodes)
+			if err != nil {
+				return nil, err
+			}
+			distSym, err := hg[huffDistance].next(d)
+			if err != nil {
+				return nil, err
+			}
+			distCode, err := d.lz77Param(distSym)
+			if err != nil {
+				return nil, err
+			}
+			dist := distanceMap(w, distCode)
+			pEnd := p + 4*int(length)
+			q := p - 4*int(dist)
+			qEnd := pEnd - 4*int(dist)
+			if p < 0 || len(pix) < pEnd || q < 0 || len(pix) < qEnd {
+				return nil, errors.New("vp8l: invalid LZ77 parameters")
+			}
+			for ; p < pEnd; p, q = p+1, q+1 { // copies byte by byte; the two ranges may overlap
+				pix[p] = pix[q]
+			}
+
+			x += int32(length)
+			for x >= w {
+				x, y = x-w, y+1
+			}
+			lookupHG = hMask != 0
+
+		default:
+			// We have a color cache lookup. First, insert previous pixels
+			// into the cache. Note that VP8L assumes ARGB order, but the
+			// Go image.NRGBA type that Decode returns is in RGBA order.
+			for ; cachedP < p; cachedP += 4 {
+				argb := uint32(pix[cachedP+0])<<16 |
+					uint32(pix[cachedP+1])<<8 |
+					uint32(pix[cachedP+2])<<0 |
+					uint32(pix[cachedP+3])<<24
+				ccEntries[(argb*colorCacheMultiplier)>>ccShift] = argb
+			}
+			green -= nLiteralCodes + nLengthCodes
+			if int(green) >= len(ccEntries) {
+				return nil, errors.New("vp8l: invalid color cache index")
+			}
+			argb := ccEntries[green]
+			pix[p+0] = uint8(argb >> 16)
+			pix[p+1] = uint8(argb >> 8)
+			pix[p+2] = uint8(argb >> 0)
+			pix[p+3] = uint8(argb >> 24)
+			p += 4
+
+			x++
+			if x == w {
+				x, y = 0, y+1
+			}
+			lookupHG = hMask != 0 && x&hMask == 0 // re-select the group when x enters a new tile column
+		}
+	}
+	return pix, nil
+}
+
+// lz77Param decodes an LZ77 length or distance from its prefix symbol, reading
+// any extra bits from the bit-stream, as specified in section 4.2.2.
+func (d *decoder) lz77Param(symbol uint32) (uint32, error) {
+	if symbol < 4 {
+		return symbol + 1, nil
+	}
+	nExtra := (symbol - 2) >> 1
+	base := (2 + symbol&1) << nExtra
+	extra, err := d.read(nExtra)
+	if err != nil {
+		return 0, err
+	}
+	return base + extra + 1, nil
+}
+
+// decodeHeader decodes the VP8L header from r. It returns a decoder positioned
+// just after the header, together with the image width and height in pixels.
+func decodeHeader(r io.Reader) (d *decoder, w int32, h int32, err error) {
+	br, ok := r.(io.ByteReader)
+	if !ok {
+		br = bufio.NewReader(r)
+	}
+	d = &decoder{r: br}
+	// The header starts with a one-byte signature.
+	switch magic, err := d.read(8); {
+	case err != nil:
+		return nil, 0, 0, err
+	case magic != 0x2f:
+		return nil, 0, 0, errors.New("vp8l: invalid header")
+	}
+	// The width and height are each stored as a 14-bit value minus one.
+	width, err := d.read(14)
+	if err != nil {
+		return nil, 0, 0, err
+	}
+	width++
+	height, err := d.read(14)
+	if err != nil {
+		return nil, 0, 0, err
+	}
+	height++
+	if _, err = d.read(1); err != nil { // Read and ignore the hasAlpha hint.
+		return nil, 0, 0, err
+	}
+	switch version, err := d.read(3); {
+	case err != nil:
+		return nil, 0, 0, err
+	case version != 0:
+		return nil, 0, 0, errors.New("vp8l: invalid version")
+	}
+	return d, int32(width), int32(height), nil
+}
+
+// DecodeConfig decodes the color model and dimensions of a VP8L image from r.
+// Only the header is decoded, and the color model is always color.NRGBAModel.
+func DecodeConfig(r io.Reader) (image.Config, error) {
+	_, w, h, err := decodeHeader(r)
+	if err != nil {
+		return image.Config{}, err
+	}
+	cfg := image.Config{ColorModel: color.NRGBAModel}
+	cfg.Width = int(w)
+	cfg.Height = int(h)
+	return cfg, nil
+}
+
+// Decode decodes a VP8L image from r. The returned image is an *image.NRGBA.
+func Decode(r io.Reader) (image.Image, error) {
+	d, w, h, err := decodeHeader(r)
+	if err != nil {
+		return nil, err
+	}
+	// Decode the transforms. Each transform type may appear at most once, so
+	// the list can never outgrow its fixed-size backing array.
+	var (
+		seen  [nTransformTypes]bool
+		store [nTransformTypes]transform
+		list  = store[:0]
+		fullW = w
+	)
+	for {
+		more, err := d.read(1)
+		if err != nil {
+			return nil, err
+		}
+		if more == 0 {
+			break
+		}
+		var t transform
+		if t, w, err = d.decodeTransform(w, h); err != nil {
+			return nil, err
+		}
+		if seen[t.transformType] {
+			return nil, errors.New("vp8l: repeated transform")
+		}
+		seen[t.transformType] = true
+		list = append(list, t)
+	}
+	// Decode the transformed pixels.
+	pix, err := d.decodePix(w, h, 0, true)
+	if err != nil {
+		return nil, err
+	}
+	// Apply the inverse transformations, last decoded first.
+	for i := len(list) - 1; i >= 0; i-- {
+		t := &list[i]
+		pix = inverseTransforms[t.transformType](t, pix, h)
+	}
+	img := &image.NRGBA{
+		Pix:    pix,
+		Stride: 4 * int(fullW),
+		Rect:   image.Rect(0, 0, int(fullW), int(h)),
+	}
+	return img, nil
+}
diff --git a/vp8l/huffman.go b/vp8l/huffman.go
new file mode 100644
index 0000000..36368a8
--- /dev/null
+++ b/vp8l/huffman.go
@@ -0,0 +1,245 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package vp8l
+
+import (
+	"io"
+)
+
+// reverseBits reverses the bits in a byte.
+var reverseBits = [256]uint8{
+	0x00, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0, 0x10, 0x90, 0x50, 0xd0, 0x30, 0xb0, 0x70, 0xf0,
+	0x08, 0x88, 0x48, 0xc8, 0x28, 0xa8, 0x68, 0xe8, 0x18, 0x98, 0x58, 0xd8, 0x38, 0xb8, 0x78, 0xf8,
+	0x04, 0x84, 0x44, 0xc4, 0x24, 0xa4, 0x64, 0xe4, 0x14, 0x94, 0x54, 0xd4, 0x34, 0xb4, 0x74, 0xf4,
+	0x0c, 0x8c, 0x4c, 0xcc, 0x2c, 0xac, 0x6c, 0xec, 0x1c, 0x9c, 0x5c, 0xdc, 0x3c, 0xbc, 0x7c, 0xfc,
+	0x02, 0x82, 0x42, 0xc2, 0x22, 0xa2, 0x62, 0xe2, 0x12, 0x92, 0x52, 0xd2, 0x32, 0xb2, 0x72, 0xf2,
+	0x0a, 0x8a, 0x4a, 0xca, 0x2a, 0xaa, 0x6a, 0xea, 0x1a, 0x9a, 0x5a, 0xda, 0x3a, 0xba, 0x7a, 0xfa,
+	0x06, 0x86, 0x46, 0xc6, 0x26, 0xa6, 0x66, 0xe6, 0x16, 0x96, 0x56, 0xd6, 0x36, 0xb6, 0x76, 0xf6,
+	0x0e, 0x8e, 0x4e, 0xce, 0x2e, 0xae, 0x6e, 0xee, 0x1e, 0x9e, 0x5e, 0xde, 0x3e, 0xbe, 0x7e, 0xfe,
+	0x01, 0x81, 0x41, 0xc1, 0x21, 0xa1, 0x61, 0xe1, 0x11, 0x91, 0x51, 0xd1, 0x31, 0xb1, 0x71, 0xf1,
+	0x09, 0x89, 0x49, 0xc9, 0x29, 0xa9, 0x69, 0xe9, 0x19, 0x99, 0x59, 0xd9, 0x39, 0xb9, 0x79, 0xf9,
+	0x05, 0x85, 0x45, 0xc5, 0x25, 0xa5, 0x65, 0xe5, 0x15, 0x95, 0x55, 0xd5, 0x35, 0xb5, 0x75, 0xf5,
+	0x0d, 0x8d, 0x4d, 0xcd, 0x2d, 0xad, 0x6d, 0xed, 0x1d, 0x9d, 0x5d, 0xdd, 0x3d, 0xbd, 0x7d, 0xfd,
+	0x03, 0x83, 0x43, 0xc3, 0x23, 0xa3, 0x63, 0xe3, 0x13, 0x93, 0x53, 0xd3, 0x33, 0xb3, 0x73, 0xf3,
+	0x0b, 0x8b, 0x4b, 0xcb, 0x2b, 0xab, 0x6b, 0xeb, 0x1b, 0x9b, 0x5b, 0xdb, 0x3b, 0xbb, 0x7b, 0xfb,
+	0x07, 0x87, 0x47, 0xc7, 0x27, 0xa7, 0x67, 0xe7, 0x17, 0x97, 0x57, 0xd7, 0x37, 0xb7, 0x77, 0xf7,
+	0x0f, 0x8f, 0x4f, 0xcf, 0x2f, 0xaf, 0x6f, 0xef, 0x1f, 0x9f, 0x5f, 0xdf, 0x3f, 0xbf, 0x7f, 0xff,
+}
+
+// hNode is a node in a Huffman tree.
+type hNode struct {
+	// symbol is the symbol held by this node.
+	symbol uint32
+	// children, if positive, is the hTree.nodes index of the first of
+	// this node's two children. Zero means an uninitialized node,
+	// and -1 means a leaf node.
+	children int32
+}
+
+const leafNode = -1
+
+// lutSize is the log-2 size of an hTree's look-up table.
+const lutSize, lutMask = 7, 1<<7 - 1
+
+// hTree is a Huffman tree.
+type hTree struct {
+	// nodes are the nodes of the Huffman tree. During construction,
+	// len(nodes) grows from 1 up to cap(nodes) by steps of two.
+	// After construction, len(nodes) == cap(nodes), and both equal
+	// 2*theNumberOfSymbols - 1.
+	nodes []hNode
+	// lut is a look-up table for walking the nodes. The x in lut[x] is
+	// the next lutSize bits in the bit-stream. The low 8 bits of lut[x]
+	// equal 1 plus the number of bits in the next code, or 0 if the
+	// next code requires more than lutSize bits. The high 24 bits are:
+	//   - the symbol, if the code requires lutSize or fewer bits, or
+	//   - the hTree.nodes index to start the tree traversal from, if
+	//     the next code requires more than lutSize bits.
+	lut [1 << lutSize]uint32
+}
+
+// insert adds symbol, encoded as the least significant codeLength bits of
+// code, to the hTree. It returns errInvalidHuffmanTree if that is not possible.
+func (h *hTree) insert(symbol uint32, code uint32, codeLength uint32) error {
+	if symbol > 0xffff || codeLength > 0xfe {
+		return errInvalidHuffmanTree
+	}
+	baseCode := uint32(0)
+	if codeLength > lutSize {
+		baseCode = uint32(reverseBits[(code>>(codeLength-lutSize))&0xff]) >> (8 - lutSize)
+	} else {
+		baseCode = uint32(reverseBits[code&0xff]) >> (8 - codeLength)
+		for i := 0; i < 1<<(lutSize-codeLength); i++ {
+			h.lut[baseCode|uint32(i)<<codeLength] = symbol<<8 | (codeLength + 1)
+		}
+	}
+
+	n := uint32(0)
+	for jump := lutSize; codeLength > 0; {
+		codeLength--
+		if int(n) >= len(h.nodes) { // h.nodes[n] is read next, so n == len is also invalid.
+			return errInvalidHuffmanTree
+		}
+		switch h.nodes[n].children {
+		case leafNode:
+			return errInvalidHuffmanTree
+		case 0:
+			if len(h.nodes) == cap(h.nodes) {
+				return errInvalidHuffmanTree
+			}
+			// Create two empty child nodes.
+			h.nodes[n].children = int32(len(h.nodes))
+			h.nodes = h.nodes[:len(h.nodes)+2]
+		}
+		n = uint32(h.nodes[n].children) + 1&(code>>codeLength)
+		jump--
+		if jump == 0 && h.lut[baseCode] == 0 { // Record where traversal resumes after lutSize bits.
+			h.lut[baseCode] = n << 8
+		}
+	}
+
+	switch h.nodes[n].children {
+	case leafNode:
+		// No-op.
+	case 0:
+		// Turn the uninitialized node into a leaf.
+		h.nodes[n].children = leafNode
+	default:
+		return errInvalidHuffmanTree
+	}
+	h.nodes[n].symbol = symbol
+	return nil
+}
+
+// codeLengthsToCodes returns the canonical Huffman codes implied by the
+// sequence of code lengths.
+func codeLengthsToCodes(codeLengths []uint32) ([]uint32, error) {
+	maxCodeLength := uint32(0)
+	for _, cl := range codeLengths {
+		if maxCodeLength < cl {
+			maxCodeLength = cl
+		}
+	}
+	const maxAllowedCodeLength = 15
+	if len(codeLengths) == 0 || maxCodeLength > maxAllowedCodeLength {
+		return nil, errInvalidHuffmanTree
+	}
+	histogram := [maxAllowedCodeLength + 1]uint32{}
+	for _, cl := range codeLengths {
+		histogram[cl]++
+	}
+	currCode, nextCodes := uint32(0), [maxAllowedCodeLength + 1]uint32{}
+	for cl := 1; cl < len(nextCodes); cl++ {
+		currCode = (currCode + histogram[cl-1]) << 1
+		nextCodes[cl] = currCode
+	}
+	codes := make([]uint32, len(codeLengths))
+	for symbol, cl := range codeLengths {
+		if cl > 0 {
+			codes[symbol] = nextCodes[cl]
+			nextCodes[cl]++
+		}
+	}
+	return codes, nil
+}
+
+// build builds a canonical Huffman tree from the given code lengths.
+func (h *hTree) build(codeLengths []uint32) error {
+	// Calculate the number of symbols.
+	var nSymbols, lastSymbol uint32
+	for symbol, cl := range codeLengths {
+		if cl != 0 {
+			nSymbols++
+			lastSymbol = uint32(symbol)
+		}
+	}
+	if nSymbols == 0 {
+		return errInvalidHuffmanTree
+	}
+	h.nodes = make([]hNode, 1, 2*nSymbols-1)
+	// Handle the trivial case.
+	if nSymbols == 1 {
+		if len(codeLengths) <= int(lastSymbol) {
+			return errInvalidHuffmanTree
+		}
+		return h.insert(lastSymbol, 0, 0)
+	}
+	// Handle the non-trivial case.
+	codes, err := codeLengthsToCodes(codeLengths)
+	if err != nil {
+		return err
+	}
+	for symbol, cl := range codeLengths {
+		if cl > 0 {
+			if err := h.insert(uint32(symbol), codes[symbol], cl); err != nil {
+				return err
+			}
+		}
+	}
+	return nil
+}
+
+// buildSimple builds a Huffman tree with 1 or 2 symbols.
+func (h *hTree) buildSimple(nSymbols uint32, symbols [2]uint32, alphabetSize uint32) error {
+	h.nodes = make([]hNode, 1, 2*nSymbols-1)
+	for i := uint32(0); i < nSymbols; i++ {
+		if symbols[i] >= alphabetSize {
+			return errInvalidHuffmanTree
+		}
+		if err := h.insert(symbols[i], i, nSymbols-1); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// next returns the next Huffman-encoded symbol from the bit-stream d.
+func (h *hTree) next(d *decoder) (uint32, error) {
+	var n uint32
+	// Read enough bits so that we can use the look-up table.
+	if d.nBits < lutSize {
+		c, err := d.r.ReadByte()
+		if err != nil {
+			if err == io.EOF {
+				// There are no more bytes of data, but we may still be able
+				// to read the next symbol out of the previously read bits.
+				goto slowPath
+			}
+			return 0, err
+		}
+		d.bits |= uint32(c) << d.nBits
+		d.nBits += 8
+	}
+	// Use the look-up table.
+	n = h.lut[d.bits&lutMask]
+	if b := n & 0xff; b != 0 {
+		b--
+		d.bits >>= b
+		d.nBits -= b
+		return n >> 8, nil
+	}
+	n >>= 8
+	d.bits >>= lutSize
+	d.nBits -= lutSize
+
+slowPath:
+	for h.nodes[n].children != leafNode {
+		if d.nBits == 0 {
+			c, err := d.r.ReadByte()
+			if err != nil {
+				if err == io.EOF {
+					err = io.ErrUnexpectedEOF
+				}
+				return 0, err
+			}
+			d.bits = uint32(c)
+			d.nBits = 8
+		}
+		n = uint32(h.nodes[n].children) + 1&d.bits
+		d.bits >>= 1
+		d.nBits--
+	}
+	return h.nodes[n].symbol, nil
+}
diff --git a/vp8l/transform.go b/vp8l/transform.go
new file mode 100644
index 0000000..06543da
--- /dev/null
+++ b/vp8l/transform.go
@@ -0,0 +1,299 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package vp8l
+
+// This file deals with image transforms, specified in section 3.
+
+// nTiles returns the number of tiles needed to cover size pixels, where each
+// tile's side is 1<<bits pixels long.
+func nTiles(size int32, bits uint32) int32 {
+	return (size + 1<<bits - 1) >> bits
+}
+
+const (
+	transformTypePredictor     = 0
+	transformTypeCrossColor    = 1
+	transformTypeSubtractGreen = 2
+	transformTypeColorIndexing = 3
+	nTransformTypes            = 4
+)
+
+// transform holds the parameters for an invertible transform.
+type transform struct {
+	// transformType is the type of the transform.
+	transformType uint32
+	// oldWidth is the width of the image before transformation (or
+	// equivalently, after inverse transformation). The color-indexing
+	// transform can reduce the width. For example, a 50-pixel-wide
+	// image that only needs 4 bits (half a byte) per color index can
+	// be transformed into a 25-pixel-wide image.
+	oldWidth int32
+	// bits is the log-2 size of the transform's tiles, for the predictor
+	// and cross-color transforms. 8>>bits is the number of bits per
+	// color index, for the color-index transform.
+	bits uint32
+	// pix is the tile values, for the predictor and cross-color
+	// transforms, and the color palette, for the color-index transform.
+	pix []byte
+}
+
+var inverseTransforms = [nTransformTypes]func(*transform, []byte, int32) []byte{
+	transformTypePredictor:     inversePredictor,
+	transformTypeCrossColor:    inverseCrossColor,
+	transformTypeSubtractGreen: inverseSubtractGreen,
+	transformTypeColorIndexing: inverseColorIndexing,
+}
+
+func inversePredictor(t *transform, pix []byte, h int32) []byte {
+	if t.oldWidth == 0 || h == 0 {
+		return pix
+	}
+	// The first pixel's predictor is mode 0 (opaque black).
+	pix[3] += 0xff
+	p, mask := int32(4), int32(1)<<t.bits-1
+	for x := int32(1); x < t.oldWidth; x++ {
+		// The rest of the first row's predictor is mode 1 (L).
+		pix[p+0] += pix[p-4]
+		pix[p+1] += pix[p-3]
+		pix[p+2] += pix[p-2]
+		pix[p+3] += pix[p-1]
+		p += 4
+	}
+	top, tilesPerRow := 0, nTiles(t.oldWidth, t.bits)
+	for y := int32(1); y < h; y++ {
+		// The first column's predictor is mode 2 (T).
+		pix[p+0] += pix[top+0]
+		pix[p+1] += pix[top+1]
+		pix[p+2] += pix[top+2]
+		pix[p+3] += pix[top+3]
+		p, top = p+4, top+4
+
+		q := 4 * (y >> t.bits) * tilesPerRow
+		predictorMode := t.pix[q+1] & 0x0f
+		q += 4
+		for x := int32(1); x < t.oldWidth; x++ {
+			if x&mask == 0 {
+				predictorMode = t.pix[q+1] & 0x0f
+				q += 4
+			}
+			switch predictorMode {
+			case 0: // Opaque black.
+				pix[p+3] += 0xff
+
+			case 1: // L.
+				pix[p+0] += pix[p-4]
+				pix[p+1] += pix[p-3]
+				pix[p+2] += pix[p-2]
+				pix[p+3] += pix[p-1]
+
+			case 2: // T.
+				pix[p+0] += pix[top+0]
+				pix[p+1] += pix[top+1]
+				pix[p+2] += pix[top+2]
+				pix[p+3] += pix[top+3]
+
+			case 3: // TR.
+				pix[p+0] += pix[top+4]
+				pix[p+1] += pix[top+5]
+				pix[p+2] += pix[top+6]
+				pix[p+3] += pix[top+7]
+
+			case 4: // TL.
+				pix[p+0] += pix[top-4]
+				pix[p+1] += pix[top-3]
+				pix[p+2] += pix[top-2]
+				pix[p+3] += pix[top-1]
+
+			case 5: // Average2(Average2(L, TR), T).
+				pix[p+0] += avg2(avg2(pix[p-4], pix[top+4]), pix[top+0])
+				pix[p+1] += avg2(avg2(pix[p-3], pix[top+5]), pix[top+1])
+				pix[p+2] += avg2(avg2(pix[p-2], pix[top+6]), pix[top+2])
+				pix[p+3] += avg2(avg2(pix[p-1], pix[top+7]), pix[top+3])
+
+			case 6: // Average2(L, TL).
+				pix[p+0] += avg2(pix[p-4], pix[top-4])
+				pix[p+1] += avg2(pix[p-3], pix[top-3])
+				pix[p+2] += avg2(pix[p-2], pix[top-2])
+				pix[p+3] += avg2(pix[p-1], pix[top-1])
+
+			case 7: // Average2(L, T).
+				pix[p+0] += avg2(pix[p-4], pix[top+0])
+				pix[p+1] += avg2(pix[p-3], pix[top+1])
+				pix[p+2] += avg2(pix[p-2], pix[top+2])
+				pix[p+3] += avg2(pix[p-1], pix[top+3])
+
+			case 8: // Average2(TL, T).
+				pix[p+0] += avg2(pix[top-4], pix[top+0])
+				pix[p+1] += avg2(pix[top-3], pix[top+1])
+				pix[p+2] += avg2(pix[top-2], pix[top+2])
+				pix[p+3] += avg2(pix[top-1], pix[top+3])
+
+			case 9: // Average2(T, TR).
+				pix[p+0] += avg2(pix[top+0], pix[top+4])
+				pix[p+1] += avg2(pix[top+1], pix[top+5])
+				pix[p+2] += avg2(pix[top+2], pix[top+6])
+				pix[p+3] += avg2(pix[top+3], pix[top+7])
+
+			case 10: // Average2(Average2(L, TL), Average2(T, TR)).
+				pix[p+0] += avg2(avg2(pix[p-4], pix[top-4]), avg2(pix[top+0], pix[top+4]))
+				pix[p+1] += avg2(avg2(pix[p-3], pix[top-3]), avg2(pix[top+1], pix[top+5]))
+				pix[p+2] += avg2(avg2(pix[p-2], pix[top-2]), avg2(pix[top+2], pix[top+6]))
+				pix[p+3] += avg2(avg2(pix[p-1], pix[top-1]), avg2(pix[top+3], pix[top+7]))
+
+			case 11: // Select(L, T, TL).
+				l0 := int32(pix[p-4])
+				l1 := int32(pix[p-3])
+				l2 := int32(pix[p-2])
+				l3 := int32(pix[p-1])
+				c0 := int32(pix[top-4])
+				c1 := int32(pix[top-3])
+				c2 := int32(pix[top-2])
+				c3 := int32(pix[top-1])
+				t0 := int32(pix[top+0])
+				t1 := int32(pix[top+1])
+				t2 := int32(pix[top+2])
+				t3 := int32(pix[top+3])
+				l := abs(c0-t0) + abs(c1-t1) + abs(c2-t2) + abs(c3-t3)
+				t := abs(c0-l0) + abs(c1-l1) + abs(c2-l2) + abs(c3-l3)
+				if l < t {
+					pix[p+0] += uint8(l0)
+					pix[p+1] += uint8(l1)
+					pix[p+2] += uint8(l2)
+					pix[p+3] += uint8(l3)
+				} else {
+					pix[p+0] += uint8(t0)
+					pix[p+1] += uint8(t1)
+					pix[p+2] += uint8(t2)
+					pix[p+3] += uint8(t3)
+				}
+
+			case 12: // ClampAddSubtractFull(L, T, TL).
+				pix[p+0] += clampAddSubtractFull(pix[p-4], pix[top+0], pix[top-4])
+				pix[p+1] += clampAddSubtractFull(pix[p-3], pix[top+1], pix[top-3])
+				pix[p+2] += clampAddSubtractFull(pix[p-2], pix[top+2], pix[top-2])
+				pix[p+3] += clampAddSubtractFull(pix[p-1], pix[top+3], pix[top-1])
+
+			case 13: // ClampAddSubtractHalf(Average2(L, T), TL).
+				pix[p+0] += clampAddSubtractHalf(avg2(pix[p-4], pix[top+0]), pix[top-4])
+				pix[p+1] += clampAddSubtractHalf(avg2(pix[p-3], pix[top+1]), pix[top-3])
+				pix[p+2] += clampAddSubtractHalf(avg2(pix[p-2], pix[top+2]), pix[top-2])
+				pix[p+3] += clampAddSubtractHalf(avg2(pix[p-1], pix[top+3]), pix[top-1])
+			}
+			p, top = p+4, top+4
+		}
+	}
+	return pix
+}
+
+func inverseCrossColor(t *transform, pix []byte, h int32) []byte {
+	var greenToRed, greenToBlue, redToBlue int32
+	p, mask, tilesPerRow := int32(0), int32(1)<<t.bits-1, nTiles(t.oldWidth, t.bits)
+	for y := int32(0); y < h; y++ {
+		q := 4 * (y >> t.bits) * tilesPerRow
+		for x := int32(0); x < t.oldWidth; x++ {
+			if x&mask == 0 {
+				redToBlue = int32(int8(t.pix[q+0]))
+				greenToBlue = int32(int8(t.pix[q+1]))
+				greenToRed = int32(int8(t.pix[q+2]))
+				q += 4
+			}
+			red := pix[p+0]
+			green := pix[p+1]
+			blue := pix[p+2]
+			red += uint8(uint32(greenToRed*int32(int8(green))) >> 5)
+			blue += uint8(uint32(greenToBlue*int32(int8(green))) >> 5)
+			blue += uint8(uint32(redToBlue*int32(int8(red))) >> 5)
+			pix[p+0] = red
+			pix[p+2] = blue
+			p += 4
+		}
+	}
+	return pix
+}
+
+func inverseSubtractGreen(t *transform, pix []byte, h int32) []byte {
+	for p := 0; p < len(pix); p += 4 {
+		green := pix[p+1]
+		pix[p+0] += green
+		pix[p+2] += green
+	}
+	return pix
+}
+
+func inverseColorIndexing(t *transform, pix []byte, h int32) []byte {
+	if t.bits == 0 {
+		for p := 0; p < len(pix); p += 4 {
+			i := 4 * uint32(pix[p+1])
+			pix[p+0] = t.pix[i+0]
+			pix[p+1] = t.pix[i+1]
+			pix[p+2] = t.pix[i+2]
+			pix[p+3] = t.pix[i+3]
+		}
+		return pix
+	}
+
+	vMask, xMask, bitsPerPixel := uint32(0), int32(0), uint32(8>>t.bits)
+	switch t.bits {
+	case 1:
+		vMask, xMask = 0x0f, 0x01
+	case 2:
+		vMask, xMask = 0x03, 0x03
+	case 3:
+		vMask, xMask = 0x01, 0x07
+	}
+
+	d, p, v, dst := 0, 0, uint32(0), make([]byte, 4*t.oldWidth*h)
+	for y := int32(0); y < h; y++ {
+		for x := int32(0); x < t.oldWidth; x++ {
+			if x&xMask == 0 {
+				v = uint32(pix[p+1])
+				p += 4
+			}
+
+			i := 4 * (v & vMask)
+			dst[d+0] = t.pix[i+0]
+			dst[d+1] = t.pix[i+1]
+			dst[d+2] = t.pix[i+2]
+			dst[d+3] = t.pix[i+3]
+			d += 4
+
+			v >>= bitsPerPixel
+		}
+	}
+	return dst
+}
+
+func abs(x int32) int32 {
+	if x < 0 {
+		return -x
+	}
+	return x
+}
+
+func avg2(a, b uint8) uint8 {
+	return uint8((int32(a) + int32(b)) / 2)
+}
+
+func clampAddSubtractFull(a, b, c uint8) uint8 {
+	x := int32(a) + int32(b) - int32(c)
+	if x < 0 {
+		return 0
+	}
+	if x > 255 {
+		return 255
+	}
+	return uint8(x)
+}
+
+func clampAddSubtractHalf(a, b uint8) uint8 {
+	x := int32(a) + (int32(a)-int32(b))/2
+	if x < 0 {
+		return 0
+	}
+	if x > 255 {
+		return 255
+	}
+	return uint8(x)
+}
diff --git a/webp/decode.go b/webp/decode.go
new file mode 100644
index 0000000..60fb556
--- /dev/null
+++ b/webp/decode.go
@@ -0,0 +1,275 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package webp implements a decoder for WEBP images.
+//
+// WEBP is defined at:
+// https://developers.google.com/speed/webp/docs/riff_container
+package webp // import "golang.org/x/image/webp"
+
+import (
+	"bytes"
+	"errors"
+	"image"
+	"image/color"
+	"io"
+
+	"golang.org/x/image/riff"
+	"golang.org/x/image/vp8"
+	"golang.org/x/image/vp8l"
+	"golang.org/x/image/webp/nycbcra"
+)
+
+var errInvalidFormat = errors.New("webp: invalid format")
+
+var (
+	fccALPH = riff.FourCC{'A', 'L', 'P', 'H'}
+	fccVP8  = riff.FourCC{'V', 'P', '8', ' '}
+	fccVP8L = riff.FourCC{'V', 'P', '8', 'L'}
+	fccVP8X = riff.FourCC{'V', 'P', '8', 'X'}
+	fccWEBP = riff.FourCC{'W', 'E', 'B', 'P'}
+)
+
+func decode(r io.Reader, configOnly bool) (image.Image, image.Config, error) {
+	formType, riffReader, err := riff.NewReader(r)
+	if err != nil {
+		return nil, image.Config{}, err
+	}
+	if formType != fccWEBP {
+		return nil, image.Config{}, errInvalidFormat
+	}
+
+	var (
+		alpha          []byte
+		alphaStride    int
+		wantAlpha      bool
+		widthMinusOne  uint32
+		heightMinusOne uint32
+		buf            [10]byte
+	)
+	for {
+		chunkID, chunkLen, chunkData, err := riffReader.Next()
+		if err == io.EOF {
+			err = errInvalidFormat
+		}
+		if err != nil {
+			return nil, image.Config{}, err
+		}
+
+		switch chunkID {
+		case fccALPH:
+			if !wantAlpha {
+				return nil, image.Config{}, errInvalidFormat
+			}
+			wantAlpha = false
+			// Read the Pre-processing | Filter | Compression byte.
+			if _, err := io.ReadFull(chunkData, buf[:1]); err != nil {
+				if err == io.EOF {
+					err = errInvalidFormat
+				}
+				return nil, image.Config{}, err
+			}
+			alpha, alphaStride, err = readAlpha(chunkData, widthMinusOne, heightMinusOne, buf[0]&0x03)
+			if err != nil {
+				return nil, image.Config{}, err
+			}
+			unfilterAlpha(alpha, alphaStride, (buf[0]>>2)&0x03)
+
+		case fccVP8:
+			if wantAlpha || int32(chunkLen) < 0 {
+				return nil, image.Config{}, errInvalidFormat
+			}
+			d := vp8.NewDecoder()
+			d.Init(chunkData, int(chunkLen))
+			fh, err := d.DecodeFrameHeader()
+			if err != nil {
+				return nil, image.Config{}, err
+			}
+			if configOnly {
+				return nil, image.Config{
+					ColorModel: color.YCbCrModel,
+					Width:      fh.Width,
+					Height:     fh.Height,
+				}, nil
+			}
+			m, err := d.DecodeFrame()
+			if err != nil {
+				return nil, image.Config{}, err
+			}
+			if alpha != nil {
+				return &nycbcra.Image{
+					YCbCr:   *m,
+					A:       alpha,
+					AStride: alphaStride,
+				}, image.Config{}, nil
+			}
+			return m, image.Config{}, nil
+
+		case fccVP8L:
+			if wantAlpha || alpha != nil {
+				return nil, image.Config{}, errInvalidFormat
+			}
+			if configOnly {
+				c, err := vp8l.DecodeConfig(chunkData)
+				return nil, c, err
+			}
+			m, err := vp8l.Decode(chunkData)
+			return m, image.Config{}, err
+
+		case fccVP8X:
+			if chunkLen != 10 {
+				return nil, image.Config{}, errInvalidFormat
+			}
+			if _, err := io.ReadFull(chunkData, buf[:10]); err != nil {
+				return nil, image.Config{}, err
+			}
+			const (
+				animationBit    = 1 << 1
+				xmpMetadataBit  = 1 << 2
+				exifMetadataBit = 1 << 3
+				alphaBit        = 1 << 4
+				iccProfileBit   = 1 << 5
+			)
+			if buf[0] != alphaBit {
+				return nil, image.Config{}, errors.New("webp: non-Alpha VP8X is not implemented")
+			}
+			widthMinusOne = uint32(buf[4]) | uint32(buf[5])<<8 | uint32(buf[6])<<16
+			heightMinusOne = uint32(buf[7]) | uint32(buf[8])<<8 | uint32(buf[9])<<16
+			if configOnly {
+				return nil, image.Config{
+					ColorModel: nycbcra.ColorModel,
+					Width:      int(widthMinusOne) + 1,
+					Height:     int(heightMinusOne) + 1,
+				}, nil
+			}
+			wantAlpha = true
+
+		default:
+			return nil, image.Config{}, errInvalidFormat
+		}
+	}
+}
+
+func readAlpha(chunkData io.Reader, widthMinusOne, heightMinusOne uint32, compression byte) (
+	alpha []byte, alphaStride int, err error) {
+
+	switch compression {
+	case 0: // Uncompressed: one alpha byte per pixel, read straight from the chunk.
+		w := int(widthMinusOne) + 1
+		h := int(heightMinusOne) + 1
+		alpha = make([]byte, w*h)
+		if _, err := io.ReadFull(chunkData, alpha); err != nil {
+			return nil, 0, err
+		}
+		return alpha, w, nil
+
+	case 1:
+		// Read the VP8L-compressed alpha values. First, synthesize a 5-byte VP8L header:
+		// a 1-byte magic number, a 14-bit widthMinusOne, a 14-bit heightMinusOne,
+		// a 1-bit (ignored, zero) alphaIsUsed and a 3-bit (zero) version.
+		// TODO(nigeltao): be more efficient than decoding an *image.NRGBA just to
+		// extract the green values to a separately allocated []byte. Fixing this
+		// will require changes to the vp8l package's API.
+		if widthMinusOne > 0x3fff || heightMinusOne > 0x3fff {
+			return nil, 0, errInvalidFormat
+		}
+		alphaImage, err := vp8l.Decode(io.MultiReader(
+			bytes.NewReader([]byte{
+				0x2f, // VP8L magic number.
+				uint8(widthMinusOne),
+				uint8(widthMinusOne>>8) | uint8(heightMinusOne<<6),
+				uint8(heightMinusOne >> 2),
+				uint8(heightMinusOne >> 10),
+			}),
+			chunkData,
+		))
+		if err != nil {
+			return nil, 0, err
+		}
+		// The green values of the inner NRGBA image are the alpha values of the
+		// outer NYCbCrA image.
+		pix := alphaImage.(*image.NRGBA).Pix
+		alpha = make([]byte, len(pix)/4)
+		for i := range alpha {
+			alpha[i] = pix[4*i+1]
+		}
+		return alpha, int(widthMinusOne) + 1, nil
+	}
+	return nil, 0, errInvalidFormat
+}
+
+func unfilterAlpha(alpha []byte, alphaStride int, filter byte) {
+	if len(alpha) == 0 || alphaStride == 0 {
+		return
+	}
+	switch filter {
+	case 1: // Horizontal filter.
+		for i := 1; i < alphaStride; i++ {
+			alpha[i] += alpha[i-1]
+		}
+		for i := alphaStride; i < len(alpha); i += alphaStride {
+			// The first column is equivalent to the vertical filter.
+			alpha[i] += alpha[i-alphaStride]
+
+			for j := 1; j < alphaStride; j++ {
+				alpha[i+j] += alpha[i+j-1]
+			}
+		}
+
+	case 2: // Vertical filter.
+		// The first row is equivalent to the horizontal filter.
+		for i := 1; i < alphaStride; i++ {
+			alpha[i] += alpha[i-1]
+		}
+
+		for i := alphaStride; i < len(alpha); i++ {
+			alpha[i] += alpha[i-alphaStride]
+		}
+
+	case 3: // Gradient filter.
+		// The first row is equivalent to the horizontal filter.
+		for i := 1; i < alphaStride; i++ {
+			alpha[i] += alpha[i-1]
+		}
+
+		for i := alphaStride; i < len(alpha); i += alphaStride {
+			// The first column is equivalent to the vertical filter.
+			alpha[i] += alpha[i-alphaStride]
+
+			// The interior is predicted on the three top/left pixels.
+			for j := 1; j < alphaStride; j++ {
+				c := int(alpha[i+j-alphaStride-1])
+				b := int(alpha[i+j-alphaStride])
+				a := int(alpha[i+j-1])
+				x := a + b - c
+				if x < 0 {
+					x = 0
+				} else if x > 255 {
+					x = 255
+				}
+				alpha[i+j] += uint8(x)
+			}
+		}
+	}
+}
+
+// Decode reads a WEBP image from r and returns it as an image.Image.
+func Decode(r io.Reader) (image.Image, error) {
+	m, _, err := decode(r, false)
+	if err != nil {
+		return nil, err
+	}
+	return m, err
+}
+
+// DecodeConfig returns the color model and dimensions of a WEBP image without
+// decoding the entire image.
+func DecodeConfig(r io.Reader) (image.Config, error) {
+	_, c, err := decode(r, true)
+	return c, err
+}
+
+func init() {
+	image.RegisterFormat("webp", "RIFF????WEBPVP8", Decode, DecodeConfig)
+}
diff --git a/webp/decode_test.go b/webp/decode_test.go
new file mode 100644
index 0000000..4b69f90
--- /dev/null
+++ b/webp/decode_test.go
@@ -0,0 +1,296 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package webp
+
+import (
+	"bytes"
+	"fmt"
+	"image"
+	"image/png"
+	"io/ioutil"
+	"os"
+	"strings"
+	"testing"
+
+	"golang.org/x/image/webp/nycbcra"
+)
+
+// hex is like fmt.Sprintf("% x", x) but also inserts dots every 16 bytes, to
+// delineate VP8 macroblock boundaries.
+func hex(x []byte) string {
+	buf := new(bytes.Buffer)
+	for len(x) > 0 {
+		n := len(x)
+		if n > 16 {
+			n = 16
+		}
+		fmt.Fprintf(buf, " . % x", x[:n])
+		x = x[n:]
+	}
+	return buf.String()
+}
+
+func testDecodeLossy(t *testing.T, tc string, withAlpha bool) {
+	webpFilename := "../testdata/" + tc + ".lossy.webp"
+	pngFilename := webpFilename + ".ycbcr.png"
+	if withAlpha {
+		webpFilename = "../testdata/" + tc + ".lossy-with-alpha.webp"
+		pngFilename = webpFilename + ".nycbcra.png"
+	}
+
+	f0, err := os.Open(webpFilename)
+	if err != nil {
+		t.Errorf("%s: Open WEBP: %v", tc, err)
+		return
+	}
+	defer f0.Close()
+	img0, err := Decode(f0)
+	if err != nil {
+		t.Errorf("%s: Decode WEBP: %v", tc, err)
+		return
+	}
+
+	var (
+		m0 *image.YCbCr
+		a0 *nycbcra.Image
+		ok bool
+	)
+	if withAlpha {
+		a0, ok = img0.(*nycbcra.Image)
+		if ok {
+			m0 = &a0.YCbCr
+		}
+	} else {
+		m0, ok = img0.(*image.YCbCr)
+	}
+	if !ok || m0.SubsampleRatio != image.YCbCrSubsampleRatio420 {
+		t.Errorf("%s: decoded WEBP image is not a 4:2:0 YCbCr or 4:2:0 NYCbCrA", tc)
+		return
+	}
+	// w2 and h2 are the half-width and half-height, rounded up.
+	w, h := m0.Bounds().Dx(), m0.Bounds().Dy()
+	w2, h2 := int((w+1)/2), int((h+1)/2)
+
+	f1, err := os.Open(pngFilename)
+	if err != nil {
+		t.Errorf("%s: Open PNG: %v", tc, err)
+		return
+	}
+	defer f1.Close()
+	img1, err := png.Decode(f1)
+	if err != nil {
+		t.Errorf("%s: Decode PNG: %v", tc, err)
+		return
+	}
+
+	// The split-into-YCbCr-planes golden image is a 2*w2 wide and h+h2 high
+	// (or 2*h+h2 high, if with Alpha) gray image arranged in IMC4 format:
+	//   YYYY
+	//   YYYY
+	//   BBRR
+	//   AAAA
+	// See http://www.fourcc.org/yuv.php#IMC4
+	pngW, pngH := 2*w2, h+h2
+	if withAlpha {
+		pngH += h
+	}
+	if got, want := img1.Bounds(), image.Rect(0, 0, pngW, pngH); got != want {
+		t.Errorf("%s: bounds: got %v, want %v", tc, got, want)
+		return
+	}
+	m1, ok := img1.(*image.Gray)
+	if !ok {
+		t.Errorf("%s: decoded PNG image is not a Gray", tc)
+		return
+	}
+
+	type plane struct {
+		name     string
+		m0Pix    []uint8
+		m0Stride int
+		m1Rect   image.Rectangle
+	}
+	planes := []plane{
+		{"Y", m0.Y, m0.YStride, image.Rect(0, 0, w, h)},
+		{"Cb", m0.Cb, m0.CStride, image.Rect(0*w2, h, 1*w2, h+h2)},
+		{"Cr", m0.Cr, m0.CStride, image.Rect(1*w2, h, 2*w2, h+h2)},
+	}
+	if withAlpha {
+		planes = append(planes, plane{
+			"A", a0.A, a0.AStride, image.Rect(0, h+h2, w, 2*h+h2),
+		})
+	}
+
+	for _, plane := range planes {
+		dx := plane.m1Rect.Dx()
+		nDiff, diff := 0, make([]byte, dx)
+		for j, y := 0, plane.m1Rect.Min.Y; y < plane.m1Rect.Max.Y; j, y = j+1, y+1 {
+			got := plane.m0Pix[j*plane.m0Stride:][:dx]
+			want := m1.Pix[y*m1.Stride+plane.m1Rect.Min.X:][:dx]
+			if bytes.Equal(got, want) {
+				continue
+			}
+			nDiff++
+			if nDiff > 10 {
+				t.Errorf("%s: %s plane: more rows differ", tc, plane.name)
+				break
+			}
+			for i := range got {
+				diff[i] = got[i] - want[i]
+			}
+			t.Errorf("%s: %s plane: m0 row %d, m1 row %d\ngot %s\nwant%s\ndiff%s",
+				tc, plane.name, j, y, hex(got), hex(want), hex(diff))
+		}
+	}
+}
+
+func TestDecodeVP8(t *testing.T) {
+	testCases := []string{
+		"blue-purple-pink",
+		"blue-purple-pink-large.no-filter",
+		"blue-purple-pink-large.simple-filter",
+		"blue-purple-pink-large.normal-filter",
+		"video-001",
+		"yellow_rose",
+	}
+
+	for _, tc := range testCases {
+		testDecodeLossy(t, tc, false)
+	}
+}
+
+func TestDecodeVP8XAlpha(t *testing.T) {
+	testCases := []string{
+		"yellow_rose",
+	}
+
+	for _, tc := range testCases {
+		testDecodeLossy(t, tc, true)
+	}
+}
+
+func TestDecodeVP8L(t *testing.T) {
+	testCases := []string{
+		"blue-purple-pink",
+		"blue-purple-pink-large",
+		"gopher-doc.1bpp",
+		"gopher-doc.2bpp",
+		"gopher-doc.4bpp",
+		"gopher-doc.8bpp",
+		"tux",
+		"yellow_rose",
+	}
+
+loop:
+	for _, tc := range testCases {
+		f0, err := os.Open("../testdata/" + tc + ".lossless.webp")
+		if err != nil {
+			t.Errorf("%s: Open WEBP: %v", tc, err)
+			continue
+		}
+		defer f0.Close()
+		img0, err := Decode(f0)
+		if err != nil {
+			t.Errorf("%s: Decode WEBP: %v", tc, err)
+			continue
+		}
+		m0, ok := img0.(*image.NRGBA)
+		if !ok {
+			t.Errorf("%s: WEBP image is %T, want *image.NRGBA", tc, img0)
+			continue
+		}
+
+		f1, err := os.Open("../testdata/" + tc + ".png")
+		if err != nil {
+			t.Errorf("%s: Open PNG: %v", tc, err)
+			continue
+		}
+		defer f1.Close()
+		img1, err := png.Decode(f1)
+		if err != nil {
+			t.Errorf("%s: Decode PNG: %v", tc, err)
+			continue
+		}
+		m1, ok := img1.(*image.NRGBA)
+		if !ok {
+			rgba1, ok := img1.(*image.RGBA)
+			if !ok {
+				t.Errorf("%s: PNG image is %T, want *image.NRGBA", tc, img1)
+				continue
+			}
+			if !rgba1.Opaque() {
+				t.Errorf("%s: PNG image is non-opaque *image.RGBA, want *image.NRGBA", tc)
+				continue
+			}
+			// The image is fully opaque, so we can re-interpret the RGBA pixels
+			// as NRGBA pixels.
+			m1 = &image.NRGBA{
+				Pix:    rgba1.Pix,
+				Stride: rgba1.Stride,
+				Rect:   rgba1.Rect,
+			}
+		}
+
+		b0, b1 := m0.Bounds(), m1.Bounds()
+		if b0 != b1 {
+			t.Errorf("%s: bounds: got %v, want %v", tc, b0, b1)
+			continue
+		}
+		for i := range m0.Pix {
+			if m0.Pix[i] != m1.Pix[i] {
+				y := i / m0.Stride
+				x := (i - y*m0.Stride) / 4
+				i = y*m0.Stride + 4*x // Offset of the first byte of the mismatching pixel.
+				t.Errorf("%s: at (%d, %d):\ngot  %02x %02x %02x %02x\nwant %02x %02x %02x %02x",
+					tc, x, y,
+					m0.Pix[i+0], m0.Pix[i+1], m0.Pix[i+2], m0.Pix[i+3],
+					m1.Pix[i+0], m1.Pix[i+1], m1.Pix[i+2], m1.Pix[i+3],
+				)
+				continue loop
+			}
+		}
+	}
+}
+
+// TestDecodePartitionTooLarge tests that decoding a malformed WEBP image
+// doesn't try to allocate an unreasonable amount of memory. This WEBP image
+// claims a RIFF chunk length of 0x12345678 bytes (291 MiB) compressed,
+// independent of the actual image size (0 pixels wide * 0 pixels high).
+//
+// This is based on golang.org/issue/10790.
+func TestDecodePartitionTooLarge(t *testing.T) {
+	data := "RIFF\xff\xff\xff\x7fWEBPVP8 " +
+		"\x78\x56\x34\x12" + // RIFF chunk length.
+		"\xbd\x01\x00\x14\x00\x00\xb2\x34\x0a\x9d\x01\x2a\x96\x00\x67\x00"
+	_, err := Decode(strings.NewReader(data))
+	if err == nil {
+		t.Fatal("got nil error, want non-nil")
+	}
+	if got, want := err.Error(), "too much data"; !strings.Contains(got, want) {
+		t.Fatalf("got error %q, want something containing %q", got, want)
+	}
+}
+
+func benchmarkDecode(b *testing.B, filename string) {
+	data, err := ioutil.ReadFile("../testdata/blue-purple-pink-large." + filename + ".webp")
+	if err != nil {
+		b.Fatal(err)
+	}
+	s := string(data)
+	cfg, err := DecodeConfig(strings.NewReader(s))
+	if err != nil {
+		b.Fatal(err)
+	}
+	b.SetBytes(int64(cfg.Width * cfg.Height * 4))
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		Decode(strings.NewReader(s))
+	}
+}
+
+func BenchmarkDecodeVP8NoFilter(b *testing.B)     { benchmarkDecode(b, "no-filter.lossy") }
+func BenchmarkDecodeVP8SimpleFilter(b *testing.B) { benchmarkDecode(b, "simple-filter.lossy") }
+func BenchmarkDecodeVP8NormalFilter(b *testing.B) { benchmarkDecode(b, "normal-filter.lossy") }
+func BenchmarkDecodeVP8L(b *testing.B)            { benchmarkDecode(b, "lossless") }
diff --git a/webp/nycbcra/nycbcra.go b/webp/nycbcra/nycbcra.go
new file mode 100644
index 0000000..2e1fe05
--- /dev/null
+++ b/webp/nycbcra/nycbcra.go
@@ -0,0 +1,186 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package nycbcra provides non-alpha-premultiplied Y'CbCr-with-alpha image and
+// color types.
+package nycbcra // import "golang.org/x/image/webp/nycbcra"
+
+import (
+	"image"
+	"image/color"
+)
+
+// TODO: move this to the standard image and image/color packages, so that the
+// image/draw package can have fast-path code. Moving would rename:
+//	nycbcra.Color      to color.NYCbCrA
+//	nycbcra.ColorModel to color.NYCbCrAModel
+//	nycbcra.Image      to image.NYCbCrA
+
+// Color represents a non-alpha-premultiplied Y'CbCr-with-alpha color, having
+// 8 bits each for one luma, two chroma and one alpha component.
+type Color struct {
+	color.YCbCr
+	A uint8 // Alpha, from 0x00 (transparent) to 0xff (opaque).
+}
+
+// RGBA returns the alpha-premultiplied red, green, blue and alpha values for
+// c. Each value is in the range [0, 0xffff].
+func (c Color) RGBA() (r, g, b, a uint32) {
+	red, green, blue := color.YCbCrToRGB(c.Y, c.Cb, c.Cr)
+	a = uint32(c.A) * 0x101
+	r, g, b = uint32(red)*0x101, uint32(green)*0x101, uint32(blue)*0x101
+	return r * a / 0xffff, g * a / 0xffff, b * a / 0xffff, a
+}
+
+// ColorModel is the color.Model that converts colors to this package's Color.
+var ColorModel color.Model = color.ModelFunc(nYCbCrAModel)
+
+// nYCbCrAModel converts any color.Color to a Color. Color values pass through
+// unchanged and color.YCbCr values become fully opaque (A = 0xff).
+func nYCbCrAModel(c color.Color) color.Color {
+	if nc, ok := c.(Color); ok {
+		return nc
+	}
+	if yc, ok := c.(color.YCbCr); ok {
+		return Color{YCbCr: yc, A: 0xff}
+	}
+	red, green, blue, alpha := c.RGBA()
+	if alpha != 0 {
+		// Undo the alpha-premultiplication that RGBA applies.
+		red = red * 0xffff / alpha
+		green = green * 0xffff / alpha
+		blue = blue * 0xffff / alpha
+	}
+	yy, cb, cr := color.RGBToYCbCr(uint8(red>>8), uint8(green>>8), uint8(blue>>8))
+	return Color{YCbCr: color.YCbCr{Y: yy, Cb: cb, Cr: cr}, A: uint8(alpha >> 8)}
+}
+
+// Image is an in-memory image of non-alpha-premultiplied Y'CbCr-with-alpha
+// colors. A and AStride are analogous to the Y and YStride fields of the
+// embedded YCbCr.
+type Image struct {
+	image.YCbCr
+	A       []uint8 // Alpha samples, one per pixel (alpha is not subsampled).
+	AStride int     // Index step in A between vertically adjacent pixels.
+}
+
+// ColorModel returns the package's ColorModel, overriding the method promoted
+// from the embedded image.YCbCr.
+func (p *Image) ColorModel() color.Model { return ColorModel }
+
+// At returns the pixel at (x, y) as a Color, so that its alpha is included.
+// It overrides the At method promoted from the embedded image.YCbCr.
+func (p *Image) At(x, y int) color.Color { return p.NYCbCrAAt(x, y) }
+
+// NYCbCrAAt returns the Color of the pixel at (x, y). It returns the zero
+// Color if (x, y) is outside p.Rect.
+func (p *Image) NYCbCrAAt(x, y int) Color {
+	pt := image.Point{X: x, Y: y}
+	if !pt.In(p.Rect) {
+		return Color{}
+	}
+	// Cb and Cr are indexed by the same offset; Y and A have their own.
+	lumaIdx := p.YOffset(x, y)
+	chromaIdx := p.COffset(x, y)
+	alphaIdx := p.AOffset(x, y)
+	return Color{
+		YCbCr: color.YCbCr{Y: p.Y[lumaIdx], Cb: p.Cb[chromaIdx], Cr: p.Cr[chromaIdx]},
+		A:     p.A[alphaIdx],
+	}
+}
+
+// AOffset returns the index of the element of A for the pixel at (x, y).
+func (p *Image) AOffset(x, y int) int {
+	row, col := y-p.Rect.Min.Y, x-p.Rect.Min.X
+	return row*p.AStride + col
+}
+
+// SubImage returns an image representing the part of p that is visible
+// through r. The result shares its Y, Cb, Cr and A samples with p. If r does
+// not overlap p.Rect, the result is empty and keeps only p's SubsampleRatio.
+func (p *Image) SubImage(r image.Rectangle) image.Image {
+	// TODO: share code with the image package's YCbCr.SubImage when this type
+	// moves into the standard image package.
+	sub := &Image{}
+	sub.SubsampleRatio = p.SubsampleRatio
+	clip := r.Intersect(p.Rect)
+	// An empty intersection is not guaranteed to lie inside either rectangle,
+	// so stop here: computing offsets from it and slicing the sample planes
+	// below could panic.
+	if clip.Empty() {
+		return sub
+	}
+
+	// The sub-image's planes start at clip's top-left pixel.
+	x0, y0 := clip.Min.X, clip.Min.Y
+	yi := p.YOffset(x0, y0)
+	ci := p.COffset(x0, y0)
+	ai := p.AOffset(x0, y0)
+	sub.Y = p.Y[yi:]
+	sub.Cb = p.Cb[ci:]
+	sub.Cr = p.Cr[ci:]
+	sub.A = p.A[ai:]
+
+	// The strides carry over unchanged because the planes are shared.
+	sub.Rect = clip
+	sub.YStride = p.YStride
+	sub.CStride = p.CStride
+	sub.AStride = p.AStride
+	return sub
+}
+
+// Opaque reports whether every pixel in p.Rect has an alpha value of 0xff,
+// scanning row by row. An image with an empty Rect is reported as opaque.
+func (p *Image) Opaque() bool {
+	if p.Rect.Empty() {
+		return true
+	}
+	start, width := 0, p.Rect.Dx()
+	for y := p.Rect.Min.Y; y < p.Rect.Max.Y; y++ {
+		for _, alpha := range p.A[start : start+width] {
+			if alpha != 0xff {
+				return false
+			}
+		}
+		start += p.AStride
+	}
+	return true
+}
+
+// New returns a new Image with the given bounds and subsample ratio. Its Y,
+// Cb, Cr and A planes share one allocation, each capped at its own length.
+func New(r image.Rectangle, subsampleRatio image.YCbCrSubsampleRatio) *Image {
+	// TODO: share code with image.NewYCbCr when this type moves into the
+	// standard image package.
+	w, h, cw, ch := r.Dx(), r.Dy(), 0, 0
+	switch subsampleRatio {
+	case image.YCbCrSubsampleRatio422:
+		cw = (r.Max.X+1)/2 - r.Min.X/2
+		ch = h
+	case image.YCbCrSubsampleRatio420:
+		cw = (r.Max.X+1)/2 - r.Min.X/2
+		ch = (r.Max.Y+1)/2 - r.Min.Y/2
+	case image.YCbCrSubsampleRatio440:
+		cw = w
+		ch = (r.Max.Y+1)/2 - r.Min.Y/2
+	default: // Default to 4:4:4 subsampling.
+		cw = w
+		ch = h
+	}
+	i0, i1, i2, i3 := w*h, w*h+cw*ch, w*h+2*cw*ch, 2*w*h+2*cw*ch
+	b := make([]byte, i3) // Laid out as Y | Cb | Cr | A, ending at i0, i1, i2 and i3.
+	return &Image{
+		YCbCr: image.YCbCr{
+			Y:              b[:i0:i0],
+			Cb:             b[i0:i1:i1],
+			Cr:             b[i1:i2:i2],
+			SubsampleRatio: subsampleRatio,
+			YStride:        w,
+			CStride:        cw,
+			Rect:           r,
+		},
+		A:       b[i2:i3:i3],
+		AStride: w,
+	}
+}