go.image/vp8l: new package.

The blue-purple-pink image comes from
http://blog.golang.org/gophercon

The tux and yellow_rose images come from
https://developers.google.com/speed/webp/gallery2 and according to
that page, those images are in the public domain.

The gopher-doc images are derived from http://golang.org/doc/gopher/doc.png
by quantizing its palette to 2/4/16/256 colors.

LGTM=r
R=r
CC=golang-codereviews
https://golang.org/cl/109010043
diff --git a/testdata/blue-purple-pink.lossless.webp b/testdata/blue-purple-pink.lossless.webp
new file mode 100644
index 0000000..b16a50d
--- /dev/null
+++ b/testdata/blue-purple-pink.lossless.webp
Binary files differ
diff --git a/testdata/gopher-doc.1bpp.lossless.webp b/testdata/gopher-doc.1bpp.lossless.webp
new file mode 100644
index 0000000..fcca028
--- /dev/null
+++ b/testdata/gopher-doc.1bpp.lossless.webp
Binary files differ
diff --git a/testdata/gopher-doc.1bpp.png b/testdata/gopher-doc.1bpp.png
new file mode 100644
index 0000000..9c5bb64
--- /dev/null
+++ b/testdata/gopher-doc.1bpp.png
Binary files differ
diff --git a/testdata/gopher-doc.2bpp.lossless.webp b/testdata/gopher-doc.2bpp.lossless.webp
new file mode 100644
index 0000000..d683d47
--- /dev/null
+++ b/testdata/gopher-doc.2bpp.lossless.webp
Binary files differ
diff --git a/testdata/gopher-doc.2bpp.png b/testdata/gopher-doc.2bpp.png
new file mode 100644
index 0000000..af96769
--- /dev/null
+++ b/testdata/gopher-doc.2bpp.png
Binary files differ
diff --git a/testdata/gopher-doc.4bpp.lossless.webp b/testdata/gopher-doc.4bpp.lossless.webp
new file mode 100644
index 0000000..11d8ef1
--- /dev/null
+++ b/testdata/gopher-doc.4bpp.lossless.webp
Binary files differ
diff --git a/testdata/gopher-doc.4bpp.png b/testdata/gopher-doc.4bpp.png
new file mode 100644
index 0000000..fc18137
--- /dev/null
+++ b/testdata/gopher-doc.4bpp.png
Binary files differ
diff --git a/testdata/gopher-doc.8bpp.lossless.webp b/testdata/gopher-doc.8bpp.lossless.webp
new file mode 100644
index 0000000..b6468e9
--- /dev/null
+++ b/testdata/gopher-doc.8bpp.lossless.webp
Binary files differ
diff --git a/testdata/gopher-doc.8bpp.png b/testdata/gopher-doc.8bpp.png
new file mode 100644
index 0000000..b877c54
--- /dev/null
+++ b/testdata/gopher-doc.8bpp.png
Binary files differ
diff --git a/testdata/tux.lossless.webp b/testdata/tux.lossless.webp
new file mode 100644
index 0000000..3b32c02
--- /dev/null
+++ b/testdata/tux.lossless.webp
Binary files differ
diff --git a/testdata/tux.png b/testdata/tux.png
new file mode 100644
index 0000000..2567fe7
--- /dev/null
+++ b/testdata/tux.png
Binary files differ
diff --git a/testdata/yellow_rose.lossless.webp b/testdata/yellow_rose.lossless.webp
new file mode 100644
index 0000000..0c028f4
--- /dev/null
+++ b/testdata/yellow_rose.lossless.webp
Binary files differ
diff --git a/testdata/yellow_rose.lossy.webp b/testdata/yellow_rose.lossy.webp
new file mode 100644
index 0000000..57a845e
--- /dev/null
+++ b/testdata/yellow_rose.lossy.webp
Binary files differ
diff --git a/testdata/yellow_rose.png b/testdata/yellow_rose.png
new file mode 100644
index 0000000..bbaefa8
--- /dev/null
+++ b/testdata/yellow_rose.png
Binary files differ
diff --git a/vp8/decode.go b/vp8/decode.go
index 825fb00..ab1115c 100644
--- a/vp8/decode.go
+++ b/vp8/decode.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// Package vp8 implements a vp8 image and video decoder.
+// Package vp8 implements a decoder for the VP8 lossy image format.
 //
 // The VP8 specification is RFC 6386.
 package vp8
diff --git a/vp8l/decode.go b/vp8l/decode.go
new file mode 100644
index 0000000..13fa6d9
--- /dev/null
+++ b/vp8l/decode.go
@@ -0,0 +1,599 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package vp8l implements a decoder for the VP8L lossless image format.
+//
+// The VP8L specification is at:
+// https://developers.google.com/speed/webp/docs/webp_lossless_bitstream_specification
+package vp8l
+
+import (
+	"bufio"
+	"errors"
+	"image"
+	"image/color"
+	"io"
+)
+
+var (
+	errInvalidCodeLengths = errors.New("vp8l: invalid code lengths")
+	errInvalidHuffmanTree = errors.New("vp8l: invalid Huffman tree")
+)
+
+// colorCacheMultiplier is the multiplier used for the color cache hash
+// function, specified in section 4.2.3.
+const colorCacheMultiplier = 0x1e35a7bd
+
+// distanceMapTable is the look-up table for distanceMap.
+var distanceMapTable = [120]uint8{
+	0x18, 0x07, 0x17, 0x19, 0x28, 0x06, 0x27, 0x29, 0x16, 0x1a,
+	0x26, 0x2a, 0x38, 0x05, 0x37, 0x39, 0x15, 0x1b, 0x36, 0x3a,
+	0x25, 0x2b, 0x48, 0x04, 0x47, 0x49, 0x14, 0x1c, 0x35, 0x3b,
+	0x46, 0x4a, 0x24, 0x2c, 0x58, 0x45, 0x4b, 0x34, 0x3c, 0x03,
+	0x57, 0x59, 0x13, 0x1d, 0x56, 0x5a, 0x23, 0x2d, 0x44, 0x4c,
+	0x55, 0x5b, 0x33, 0x3d, 0x68, 0x02, 0x67, 0x69, 0x12, 0x1e,
+	0x66, 0x6a, 0x22, 0x2e, 0x54, 0x5c, 0x43, 0x4d, 0x65, 0x6b,
+	0x32, 0x3e, 0x78, 0x01, 0x77, 0x79, 0x53, 0x5d, 0x11, 0x1f,
+	0x64, 0x6c, 0x42, 0x4e, 0x76, 0x7a, 0x21, 0x2f, 0x75, 0x7b,
+	0x31, 0x3f, 0x63, 0x6d, 0x52, 0x5e, 0x00, 0x74, 0x7c, 0x41,
+	0x4f, 0x10, 0x20, 0x62, 0x6e, 0x30, 0x73, 0x7d, 0x51, 0x5f,
+	0x40, 0x72, 0x7e, 0x61, 0x6f, 0x50, 0x71, 0x7f, 0x60, 0x70,
+}
+
+// distanceMap maps a LZ77 backwards reference distance to a two-dimensional
+// pixel offset, specified in section 4.2.2.
+func distanceMap(w int32, code uint32) int32 {
+	if int32(code) > int32(len(distanceMapTable)) {
+		return int32(code) - int32(len(distanceMapTable))
+	}
+	distCode := int32(distanceMapTable[code-1])
+	yOffset := distCode >> 4
+	xOffset := 8 - distCode&0xf
+	if d := yOffset*w + xOffset; d >= 1 {
+		return d
+	}
+	return 1
+}
+
+// decoder holds the bit-stream for a VP8L image.
+type decoder struct {
+	r     io.ByteReader
+	bits  uint32
+	nBits uint32
+}
+
+// read reads the next n bits from the decoder's bit-stream.
+func (d *decoder) read(n uint32) (uint32, error) {
+	for d.nBits < n {
+		c, err := d.r.ReadByte()
+		if err != nil {
+			if err == io.EOF {
+				err = io.ErrUnexpectedEOF
+			}
+			return 0, err
+		}
+		d.bits |= uint32(c) << d.nBits
+		d.nBits += 8
+	}
+	u := d.bits & (1<<n - 1)
+	d.bits >>= n
+	d.nBits -= n
+	return u, nil
+}
+
+// decodeTransform decodes the next transform and the width of the image after
+// transformation (or equivalently, before inverse transformation), specified
+// in section 3.
+func (d *decoder) decodeTransform(w int32, h int32) (t transform, newWidth int32, err error) {
+	t.oldWidth = w
+	t.transformType, err = d.read(2)
+	if err != nil {
+		return transform{}, 0, err
+	}
+	switch t.transformType {
+	case transformTypePredictor, transformTypeCrossColor:
+		t.bits, err = d.read(3)
+		if err != nil {
+			return transform{}, 0, err
+		}
+		t.bits += 2
+		t.pix, err = d.decodePix(nTiles(w, t.bits), nTiles(h, t.bits), 0, false)
+		if err != nil {
+			return transform{}, 0, err
+		}
+	case transformTypeSubtractGreen:
+		// No-op.
+	case transformTypeColorIndexing:
+		nColors, err := d.read(8)
+		if err != nil {
+			return transform{}, 0, err
+		}
+		nColors++
+		t.bits = 0
+		switch {
+		case nColors <= 2:
+			t.bits = 3
+		case nColors <= 4:
+			t.bits = 2
+		case nColors <= 16:
+			t.bits = 1
+		}
+		w = nTiles(w, t.bits)
+		pix, err := d.decodePix(int32(nColors), 1, 4*256, false)
+		if err != nil {
+			return transform{}, 0, err
+		}
+		for p := 4; p < len(pix); p += 4 {
+			pix[p+0] += pix[p-4]
+			pix[p+1] += pix[p-3]
+			pix[p+2] += pix[p-2]
+			pix[p+3] += pix[p-1]
+		}
+		// The C code fills in palette entries past the nColors upper limit as
+		// transparent black. In Go, we re-slice up to 256 4-byte pixels.
+		t.pix = pix[:4*256]
+	}
+	return t, w, nil
+}
+
+// repeatsCodeLength is the minimum code length for repeated codes.
+const repeatsCodeLength = 16
+
+// These magic numbers are specified at the end of section 5.2.2.
+// The 3-length arrays apply to code lengths >= repeatsCodeLength.
+var (
+	codeLengthCodeOrder = [19]uint8{
+		17, 18, 0, 1, 2, 3, 4, 5, 16, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+	}
+	repeatBits    = [3]uint8{2, 3, 7}
+	repeatOffsets = [3]uint8{3, 3, 11}
+)
+
+// decodeCodeLengths decodes a Huffman tree's code lengths which are themselves
+// encoded via a Huffman tree, specified in section 5.2.2.
+func (d *decoder) decodeCodeLengths(dst []uint32, codeLengthCodeLengths []uint32) error {
+	h := hTree{}
+	if err := h.build(codeLengthCodeLengths); err != nil {
+		return err
+	}
+
+	maxSymbol := len(dst)
+	useLength, err := d.read(1)
+	if err != nil {
+		return err
+	}
+	if useLength != 0 {
+		n, err := d.read(3)
+		if err != nil {
+			return err
+		}
+		n = 2 + 2*n
+		ms, err := d.read(n)
+		if err != nil {
+			return err
+		}
+		maxSymbol = int(ms) + 2
+		if maxSymbol > len(dst) {
+			return errInvalidCodeLengths
+		}
+	}
+
+	prevCodeLength := uint32(0)
+	for symbol := 0; symbol < len(dst); {
+		if maxSymbol == 0 {
+			break
+		}
+		maxSymbol--
+		codeLength, err := h.next(d)
+		if err != nil {
+			return err
+		}
+		if codeLength < repeatsCodeLength {
+			dst[symbol] = codeLength
+			symbol++
+			if codeLength != 0 {
+				prevCodeLength = codeLength
+			}
+			continue
+		}
+
+		repeat, err := d.read(uint32(repeatBits[codeLength-repeatsCodeLength]))
+		if err != nil {
+			return err
+		}
+		repeat += uint32(repeatOffsets[codeLength-repeatsCodeLength])
+		if symbol+int(repeat) > len(dst) {
+			return errInvalidCodeLengths
+		}
+		// A code length of 16 repeats the previous non-zero code.
+		// A code length of 17 or 18 repeats zeroes.
+		cl := uint32(0)
+		if codeLength == 16 {
+			cl = prevCodeLength
+		}
+		for ; repeat > 0; repeat-- {
+			dst[symbol] = cl
+			symbol++
+		}
+	}
+	return nil
+}
+
+// decodeHuffmanTree decodes a Huffman tree into h.
+func (d *decoder) decodeHuffmanTree(h *hTree, alphabetSize uint32) error {
+	useSimple, err := d.read(1)
+	if err != nil {
+		return err
+	}
+	if useSimple != 0 {
+		nSymbols, err := d.read(1)
+		if err != nil {
+			return err
+		}
+		nSymbols++
+		firstSymbolLengthCode, err := d.read(1)
+		if err != nil {
+			return err
+		}
+		firstSymbolLengthCode = 7*firstSymbolLengthCode + 1
+		var symbols [2]uint32
+		symbols[0], err = d.read(firstSymbolLengthCode)
+		if err != nil {
+			return err
+		}
+		if nSymbols == 2 {
+			symbols[1], err = d.read(8)
+			if err != nil {
+				return err
+			}
+		}
+		return h.buildSimple(nSymbols, symbols, alphabetSize)
+	}
+
+	nCodes, err := d.read(4)
+	if err != nil {
+		return err
+	}
+	nCodes += 4
+	if int(nCodes) > len(codeLengthCodeOrder) {
+		return errInvalidHuffmanTree
+	}
+	codeLengthCodeLengths := [len(codeLengthCodeOrder)]uint32{}
+	for i := uint32(0); i < nCodes; i++ {
+		codeLengthCodeLengths[codeLengthCodeOrder[i]], err = d.read(3)
+		if err != nil {
+			return err
+		}
+	}
+	codeLengths := make([]uint32, alphabetSize)
+	if err = d.decodeCodeLengths(codeLengths, codeLengthCodeLengths[:]); err != nil {
+		return err
+	}
+	return h.build(codeLengths)
+}
+
+const (
+	huffGreen    = 0
+	huffRed      = 1
+	huffBlue     = 2
+	huffAlpha    = 3
+	huffDistance = 4
+	nHuff        = 5
+)
+
+// hGroup is an array of 5 Huffman trees.
+type hGroup [nHuff]hTree
+
+// decodeHuffmanGroups decodes the one or more hGroups used to decode the pixel
+// data. If one hGroup is used for the entire image, then hPix and hBits will
+// be zero. If more than one hGroup is used, then hPix contains the meta-image
+// that maps tiles to hGroup index, and hBits contains the log-2 tile size.
+func (d *decoder) decodeHuffmanGroups(w int32, h int32, topLevel bool, ccBits uint32) (
+	hGroups []hGroup, hPix []byte, hBits uint32, err error) {
+
+	maxHGroupIndex := 0
+	if topLevel {
+		useMeta, err := d.read(1)
+		if err != nil {
+			return nil, nil, 0, err
+		}
+		if useMeta != 0 {
+			hBits, err = d.read(3)
+			if err != nil {
+				return nil, nil, 0, err
+			}
+			hBits += 2
+			hPix, err = d.decodePix(nTiles(w, hBits), nTiles(h, hBits), 0, false)
+			if err != nil {
+				return nil, nil, 0, err
+			}
+			for p := 0; p < len(hPix); p += 4 {
+				i := int(hPix[p])<<8 | int(hPix[p+1])
+				if maxHGroupIndex < i {
+					maxHGroupIndex = i
+				}
+			}
+		}
+	}
+	hGroups = make([]hGroup, maxHGroupIndex+1)
+	for i := range hGroups {
+		for j, alphabetSize := range alphabetSizes {
+			if j == 0 && ccBits > 0 {
+				alphabetSize += 1 << ccBits
+			}
+			if err := d.decodeHuffmanTree(&hGroups[i][j], alphabetSize); err != nil {
+				return nil, nil, 0, err
+			}
+		}
+	}
+	return hGroups, hPix, hBits, nil
+}
+
+const (
+	nLiteralCodes  = 256
+	nLengthCodes   = 24
+	nDistanceCodes = 40
+)
+
+var alphabetSizes = [nHuff]uint32{
+	nLiteralCodes + nLengthCodes,
+	nLiteralCodes,
+	nLiteralCodes,
+	nLiteralCodes,
+	nDistanceCodes,
+}
+
+// decodePix decodes pixel data, specified in section 5.2.2.
+func (d *decoder) decodePix(w int32, h int32, minCap int32, topLevel bool) ([]byte, error) {
+	// Decode the color cache parameters.
+	ccBits, ccShift, ccEntries := uint32(0), uint32(0), ([]uint32)(nil)
+	useColorCache, err := d.read(1)
+	if err != nil {
+		return nil, err
+	}
+	if useColorCache != 0 {
+		ccBits, err = d.read(4)
+		if err != nil {
+			return nil, err
+		}
+		if ccBits < 1 || 11 < ccBits {
+			return nil, errors.New("vp8l: invalid color cache parameters")
+		}
+		ccShift = 32 - ccBits
+		ccEntries = make([]uint32, 1<<ccBits)
+	}
+
+	// Decode the Huffman groups.
+	hGroups, hPix, hBits, err := d.decodeHuffmanGroups(w, h, topLevel, ccBits)
+	if err != nil {
+		return nil, err
+	}
+	hMask, tilesPerRow := int32(0), int32(0)
+	if hBits != 0 {
+		hMask, tilesPerRow = 1<<hBits-1, nTiles(w, hBits)
+	}
+
+	// Decode the pixels.
+	if minCap < 4*w*h {
+		minCap = 4 * w * h
+	}
+	pix := make([]byte, 4*w*h, minCap)
+	p, cachedP := 0, 0
+	x, y := int32(0), int32(0)
+	hg, lookupHG := &hGroups[0], hMask != 0
+	for p < len(pix) {
+		if lookupHG {
+			i := 4 * (tilesPerRow*(y>>hBits) + (x >> hBits))
+			hg = &hGroups[uint32(hPix[i])<<8|uint32(hPix[i+1])]
+		}
+
+		green, err := hg[huffGreen].next(d)
+		if err != nil {
+			return nil, err
+		}
+		switch {
+		case green < nLiteralCodes:
+			// We have a literal pixel.
+			red, err := hg[huffRed].next(d)
+			if err != nil {
+				return nil, err
+			}
+			blue, err := hg[huffBlue].next(d)
+			if err != nil {
+				return nil, err
+			}
+			alpha, err := hg[huffAlpha].next(d)
+			if err != nil {
+				return nil, err
+			}
+			pix[p+0] = uint8(red)
+			pix[p+1] = uint8(green)
+			pix[p+2] = uint8(blue)
+			pix[p+3] = uint8(alpha)
+			p += 4
+
+			x++
+			if x == w {
+				x, y = 0, y+1
+			}
+			lookupHG = hMask != 0 && x&hMask == 0
+
+		case green < nLiteralCodes+nLengthCodes:
+			// We have a LZ77 backwards reference.
+			length, err := d.lz77Param(green - nLiteralCodes)
+			if err != nil {
+				return nil, err
+			}
+			distSym, err := hg[huffDistance].next(d)
+			if err != nil {
+				return nil, err
+			}
+			distCode, err := d.lz77Param(distSym)
+			if err != nil {
+				return nil, err
+			}
+			dist := distanceMap(w, distCode)
+			pEnd := p + 4*int(length)
+			q := p - 4*int(dist)
+			qEnd := pEnd - 4*int(dist)
+			if p < 0 || len(pix) < pEnd || q < 0 || len(pix) < qEnd {
+				return nil, errors.New("vp8l: invalid LZ77 parameters")
+			}
+			for ; p < pEnd; p, q = p+1, q+1 {
+				pix[p] = pix[q]
+			}
+
+			x += int32(length)
+			for x >= w {
+				x, y = x-w, y+1
+			}
+			lookupHG = hMask != 0
+
+		default:
+			// We have a color cache lookup. First, insert previous pixels
+			// into the cache. Note that VP8L assumes ARGB order, but the
+			// Go image.RGBA type is in RGBA order.
+			for ; cachedP < p; cachedP += 4 {
+				argb := uint32(pix[cachedP+0])<<16 |
+					uint32(pix[cachedP+1])<<8 |
+					uint32(pix[cachedP+2])<<0 |
+					uint32(pix[cachedP+3])<<24
+				ccEntries[(argb*colorCacheMultiplier)>>ccShift] = argb
+			}
+			green -= nLiteralCodes + nLengthCodes
+			if int(green) >= len(ccEntries) {
+				return nil, errors.New("vp8l: invalid color cache index")
+			}
+			argb := ccEntries[green]
+			pix[p+0] = uint8(argb >> 16)
+			pix[p+1] = uint8(argb >> 8)
+			pix[p+2] = uint8(argb >> 0)
+			pix[p+3] = uint8(argb >> 24)
+			p += 4
+
+			x++
+			if x == w {
+				x, y = 0, y+1
+			}
+			lookupHG = hMask != 0 && x&hMask == 0
+		}
+	}
+	return pix, nil
+}
+
+// lz77Param decodes an LZ77 length or distance from its prefix symbol plus any
+// extra bits read from the bit-stream (section 4.2.2); read errors propagate.
+func (d *decoder) lz77Param(symbol uint32) (uint32, error) {
+	if symbol < 4 {
+		return symbol + 1, nil
+	}
+	extraBits := (symbol - 2) >> 1
+	offset := (2 + symbol&1) << extraBits
+	n, err := d.read(extraBits)
+	if err != nil {
+		return 0, err
+	}
+	return offset + n + 1, nil
+}
+
+// decodeHeader decodes the VP8L header from r.
+func decodeHeader(r io.Reader) (d *decoder, w int32, h int32, err error) {
+	rr, ok := r.(io.ByteReader)
+	if !ok {
+		rr = bufio.NewReader(r)
+	}
+	d = &decoder{r: rr}
+	magic, err := d.read(8)
+	if err != nil {
+		return nil, 0, 0, err
+	}
+	if magic != 0x2f {
+		return nil, 0, 0, errors.New("vp8l: invalid header")
+	}
+	width, err := d.read(14)
+	if err != nil {
+		return nil, 0, 0, err
+	}
+	width++
+	height, err := d.read(14)
+	if err != nil {
+		return nil, 0, 0, err
+	}
+	height++
+	_, err = d.read(1) // Read and ignore the hasAlpha hint.
+	if err != nil {
+		return nil, 0, 0, err
+	}
+	version, err := d.read(3)
+	if err != nil {
+		return nil, 0, 0, err
+	}
+	if version != 0 {
+		return nil, 0, 0, errors.New("vp8l: unsupported version")
+	}
+	return d, int32(width), int32(height), nil
+}
+
+// DecodeConfig decodes the color model and dimensions of a VP8L image from r.
+func DecodeConfig(r io.Reader) (image.Config, error) {
+	_, w, h, err := decodeHeader(r)
+	if err != nil {
+		return image.Config{}, err
+	}
+	return image.Config{
+		ColorModel: color.NRGBAModel,
+		Width:      int(w),
+		Height:     int(h),
+	}, nil
+}
+
+// Decode decodes a VP8L image from r.
+func Decode(r io.Reader) (image.Image, error) {
+	d, w, h, err := decodeHeader(r)
+	if err != nil {
+		return nil, err
+	}
+	// Decode the transforms.
+	var (
+		nTransforms    int
+		transforms     [nTransformTypes]transform
+		transformsSeen [nTransformTypes]bool
+		originalW      = w
+	)
+	for {
+		more, err := d.read(1)
+		if err != nil {
+			return nil, err
+		}
+		if more == 0 {
+			break
+		}
+		var t transform
+		t, w, err = d.decodeTransform(w, h)
+		if err != nil {
+			return nil, err
+		}
+		if transformsSeen[t.transformType] {
+			return nil, errors.New("vp8l: repeated transform")
+		}
+		transformsSeen[t.transformType] = true
+		transforms[nTransforms] = t
+		nTransforms++
+	}
+	// Decode the transformed pixels.
+	pix, err := d.decodePix(w, h, 0, true)
+	if err != nil {
+		return nil, err
+	}
+	// Apply the inverse transformations.
+	for i := nTransforms - 1; i >= 0; i-- {
+		t := &transforms[i]
+		pix = inverseTransforms[t.transformType](t, pix, h)
+	}
+	return &image.NRGBA{
+		Pix:    pix,
+		Stride: 4 * int(originalW),
+		Rect:   image.Rect(0, 0, int(originalW), int(h)),
+	}, nil
+}
diff --git a/vp8l/huffman.go b/vp8l/huffman.go
new file mode 100644
index 0000000..36368a8
--- /dev/null
+++ b/vp8l/huffman.go
@@ -0,0 +1,245 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package vp8l
+
+import (
+	"io"
+)
+
+// reverseBits reverses the bits in a byte.
+var reverseBits = [256]uint8{
+	0x00, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0, 0x10, 0x90, 0x50, 0xd0, 0x30, 0xb0, 0x70, 0xf0,
+	0x08, 0x88, 0x48, 0xc8, 0x28, 0xa8, 0x68, 0xe8, 0x18, 0x98, 0x58, 0xd8, 0x38, 0xb8, 0x78, 0xf8,
+	0x04, 0x84, 0x44, 0xc4, 0x24, 0xa4, 0x64, 0xe4, 0x14, 0x94, 0x54, 0xd4, 0x34, 0xb4, 0x74, 0xf4,
+	0x0c, 0x8c, 0x4c, 0xcc, 0x2c, 0xac, 0x6c, 0xec, 0x1c, 0x9c, 0x5c, 0xdc, 0x3c, 0xbc, 0x7c, 0xfc,
+	0x02, 0x82, 0x42, 0xc2, 0x22, 0xa2, 0x62, 0xe2, 0x12, 0x92, 0x52, 0xd2, 0x32, 0xb2, 0x72, 0xf2,
+	0x0a, 0x8a, 0x4a, 0xca, 0x2a, 0xaa, 0x6a, 0xea, 0x1a, 0x9a, 0x5a, 0xda, 0x3a, 0xba, 0x7a, 0xfa,
+	0x06, 0x86, 0x46, 0xc6, 0x26, 0xa6, 0x66, 0xe6, 0x16, 0x96, 0x56, 0xd6, 0x36, 0xb6, 0x76, 0xf6,
+	0x0e, 0x8e, 0x4e, 0xce, 0x2e, 0xae, 0x6e, 0xee, 0x1e, 0x9e, 0x5e, 0xde, 0x3e, 0xbe, 0x7e, 0xfe,
+	0x01, 0x81, 0x41, 0xc1, 0x21, 0xa1, 0x61, 0xe1, 0x11, 0x91, 0x51, 0xd1, 0x31, 0xb1, 0x71, 0xf1,
+	0x09, 0x89, 0x49, 0xc9, 0x29, 0xa9, 0x69, 0xe9, 0x19, 0x99, 0x59, 0xd9, 0x39, 0xb9, 0x79, 0xf9,
+	0x05, 0x85, 0x45, 0xc5, 0x25, 0xa5, 0x65, 0xe5, 0x15, 0x95, 0x55, 0xd5, 0x35, 0xb5, 0x75, 0xf5,
+	0x0d, 0x8d, 0x4d, 0xcd, 0x2d, 0xad, 0x6d, 0xed, 0x1d, 0x9d, 0x5d, 0xdd, 0x3d, 0xbd, 0x7d, 0xfd,
+	0x03, 0x83, 0x43, 0xc3, 0x23, 0xa3, 0x63, 0xe3, 0x13, 0x93, 0x53, 0xd3, 0x33, 0xb3, 0x73, 0xf3,
+	0x0b, 0x8b, 0x4b, 0xcb, 0x2b, 0xab, 0x6b, 0xeb, 0x1b, 0x9b, 0x5b, 0xdb, 0x3b, 0xbb, 0x7b, 0xfb,
+	0x07, 0x87, 0x47, 0xc7, 0x27, 0xa7, 0x67, 0xe7, 0x17, 0x97, 0x57, 0xd7, 0x37, 0xb7, 0x77, 0xf7,
+	0x0f, 0x8f, 0x4f, 0xcf, 0x2f, 0xaf, 0x6f, 0xef, 0x1f, 0x9f, 0x5f, 0xdf, 0x3f, 0xbf, 0x7f, 0xff,
+}
+
+// hNode is a node in a Huffman tree.
+type hNode struct {
+	// symbol is the symbol held by this node.
+	symbol uint32
+	// children, if positive, is the hTree.nodes index of the first of
+	// this node's two children. Zero means an uninitialized node,
+	// and -1 means a leaf node.
+	children int32
+}
+
+const leafNode = -1
+
+// lutSize is the log-2 size of an hTree's look-up table.
+const lutSize, lutMask = 7, 1<<7 - 1
+
+// hTree is a Huffman tree.
+type hTree struct {
+	// nodes are the nodes of the Huffman tree. During construction,
+	// len(nodes) grows from 1 up to cap(nodes) by steps of two.
+	// After construction, len(nodes) == cap(nodes), and both equal
+	// 2*theNumberOfSymbols - 1.
+	nodes []hNode
+	// lut is a look-up table for walking the nodes. The x in lut[x] is
+	// the next lutSize bits in the bit-stream. The low 8 bits of lut[x]
+	// equal 1 plus the number of bits in the next code, or 0 if the
+	// next code requires more than lutSize bits. The high 24 bits are:
+	//   - the symbol, if the code requires lutSize or fewer bits, or
+	//   - the hTree.nodes index to start the tree traversal from, if
+	//     the next code requires more than lutSize bits.
+	lut [1 << lutSize]uint32
+}
+
+// insert adds symbol, encoded as the low codeLength bits of code, to the tree.
+// It returns errInvalidHuffmanTree if symbol or code is invalid for the tree.
+func (h *hTree) insert(symbol uint32, code uint32, codeLength uint32) error {
+	if symbol > 0xffff || codeLength > 0xfe {
+		return errInvalidHuffmanTree
+	}
+	baseCode := uint32(0)
+	if codeLength > lutSize {
+		baseCode = uint32(reverseBits[(code>>(codeLength-lutSize))&0xff]) >> (8 - lutSize)
+	} else {
+		baseCode = uint32(reverseBits[code&0xff]) >> (8 - codeLength)
+		for i := 0; i < 1<<(lutSize-codeLength); i++ {
+			h.lut[baseCode|uint32(i)<<codeLength] = symbol<<8 | (codeLength + 1)
+		}
+	}
+
+	n := uint32(0)
+	for jump := lutSize; codeLength > 0; {
+		codeLength--
+		if int(n) >= len(h.nodes) {
+			return errInvalidHuffmanTree
+		}
+		switch h.nodes[n].children {
+		case leafNode:
+			return errInvalidHuffmanTree
+		case 0:
+			if len(h.nodes) == cap(h.nodes) {
+				return errInvalidHuffmanTree
+			}
+			// Create two empty child nodes.
+			h.nodes[n].children = int32(len(h.nodes))
+			h.nodes = h.nodes[:len(h.nodes)+2]
+		}
+		n = uint32(h.nodes[n].children) + 1&(code>>codeLength)
+		jump--
+		if jump == 0 && h.lut[baseCode] == 0 {
+			h.lut[baseCode] = n << 8
+		}
+	}
+
+	switch h.nodes[n].children {
+	case leafNode:
+		// No-op.
+	case 0:
+		// Turn the uninitialized node into a leaf.
+		h.nodes[n].children = leafNode
+	default:
+		return errInvalidHuffmanTree
+	}
+	h.nodes[n].symbol = symbol
+	return nil
+}
+
+// codeLengthsToCodes returns the canonical Huffman codes implied by the
+// sequence of code lengths.
+func codeLengthsToCodes(codeLengths []uint32) ([]uint32, error) {
+	maxCodeLength := uint32(0)
+	for _, cl := range codeLengths {
+		if maxCodeLength < cl {
+			maxCodeLength = cl
+		}
+	}
+	const maxAllowedCodeLength = 15
+	if len(codeLengths) == 0 || maxCodeLength > maxAllowedCodeLength {
+		return nil, errInvalidHuffmanTree
+	}
+	histogram := [maxAllowedCodeLength + 1]uint32{}
+	for _, cl := range codeLengths {
+		histogram[cl]++
+	}
+	currCode, nextCodes := uint32(0), [maxAllowedCodeLength + 1]uint32{}
+	for cl := 1; cl < len(nextCodes); cl++ {
+		currCode = (currCode + histogram[cl-1]) << 1
+		nextCodes[cl] = currCode
+	}
+	codes := make([]uint32, len(codeLengths))
+	for symbol, cl := range codeLengths {
+		if cl > 0 {
+			codes[symbol] = nextCodes[cl]
+			nextCodes[cl]++
+		}
+	}
+	return codes, nil
+}
+
+// build builds a canonical Huffman tree from the given code lengths.
+func (h *hTree) build(codeLengths []uint32) error {
+	// Calculate the number of symbols.
+	var nSymbols, lastSymbol uint32
+	for symbol, cl := range codeLengths {
+		if cl != 0 {
+			nSymbols++
+			lastSymbol = uint32(symbol)
+		}
+	}
+	if nSymbols == 0 {
+		return errInvalidHuffmanTree
+	}
+	h.nodes = make([]hNode, 1, 2*nSymbols-1)
+	// Handle the trivial case.
+	if nSymbols == 1 {
+		if len(codeLengths) <= int(lastSymbol) {
+			return errInvalidHuffmanTree
+		}
+		return h.insert(lastSymbol, 0, 0)
+	}
+	// Handle the non-trivial case.
+	codes, err := codeLengthsToCodes(codeLengths)
+	if err != nil {
+		return err
+	}
+	for symbol, cl := range codeLengths {
+		if cl > 0 {
+			if err := h.insert(uint32(symbol), codes[symbol], cl); err != nil {
+				return err
+			}
+		}
+	}
+	return nil
+}
+
+// buildSimple builds a Huffman tree with 1 or 2 symbols.
+func (h *hTree) buildSimple(nSymbols uint32, symbols [2]uint32, alphabetSize uint32) error {
+	h.nodes = make([]hNode, 1, 2*nSymbols-1)
+	for i := uint32(0); i < nSymbols; i++ {
+		if symbols[i] >= alphabetSize {
+			return errInvalidHuffmanTree
+		}
+		if err := h.insert(symbols[i], i, nSymbols-1); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// next returns the next Huffman-encoded symbol from the bit-stream d.
+func (h *hTree) next(d *decoder) (uint32, error) {
+	var n uint32
+	// Read enough bits so that we can use the look-up table.
+	if d.nBits < lutSize {
+		c, err := d.r.ReadByte()
+		if err != nil {
+			if err == io.EOF {
+				// There are no more bytes of data, but we may still be able
+				// to read the next symbol out of the previously read bits.
+				goto slowPath
+			}
+			return 0, err
+		}
+		d.bits |= uint32(c) << d.nBits
+		d.nBits += 8
+	}
+	// Use the look-up table.
+	n = h.lut[d.bits&lutMask]
+	if b := n & 0xff; b != 0 {
+		b--
+		d.bits >>= b
+		d.nBits -= b
+		return n >> 8, nil
+	}
+	n >>= 8
+	d.bits >>= lutSize
+	d.nBits -= lutSize
+
+slowPath:
+	for h.nodes[n].children != leafNode {
+		if d.nBits == 0 {
+			c, err := d.r.ReadByte()
+			if err != nil {
+				if err == io.EOF {
+					err = io.ErrUnexpectedEOF
+				}
+				return 0, err
+			}
+			d.bits = uint32(c)
+			d.nBits = 8
+		}
+		n = uint32(h.nodes[n].children) + 1&d.bits
+		d.bits >>= 1
+		d.nBits--
+	}
+	return h.nodes[n].symbol, nil
+}
diff --git a/vp8l/transform.go b/vp8l/transform.go
new file mode 100644
index 0000000..f79431c
--- /dev/null
+++ b/vp8l/transform.go
@@ -0,0 +1,299 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package vp8l
+
+// This file deals with image transforms, specified in section 3.
+
+// nTiles returns how many tiles, each 1<<bits pixels along a side, are needed
+// to cover size pixels. That is size divided by the tile side, rounded up.
+func nTiles(size int32, bits uint32) int32 {
+	return (size + (int32(1)<<bits - 1)) >> bits
+}
+
+const (
+	transformTypePredictor     = 0 // Undone by inversePredictor.
+	transformTypeCrossColor    = 1 // Undone by inverseCrossColor.
+	transformTypeSubtractGreen = 2 // Undone by inverseSubtractGreen.
+	transformTypeColorIndexing = 3 // Undone by inverseColorIndexing.
+	nTransformTypes            = 4 // Number of transform types; the length of inverseTransforms.
+)
+
+// transform holds the parameters for an invertible transform.
+type transform struct {
+	// transformType is the type of the transform (presumably one of the transformTypeXxx constants).
+	transformType uint32
+	// oldWidth is the width of the image before transformation (or
+	// equivalently, after inverse transformation). The color-indexing
+	// transform can reduce the width. For example, a 50-pixel-wide
+	// image that only needs 4 bits (half a byte) per color index can
+	// be transformed into a 25-pixel-wide image.
+	oldWidth int32
+	// bits is the log-2 size of the transform's tiles, for the predictor
+	// and cross-color transforms. 8>>bits is the number of bits per
+	// color index, for the color-index transform.
+	bits uint32
+	// pix holds 4 bytes per tile, for the predictor and cross-color
+	// transforms, or 4 bytes per palette entry, for the color-index transform.
+	pix []byte
+}
+
+var inverseTransforms = [nTransformTypes]func(*transform, []byte, int32) []byte{
+	transformTypePredictor:     inversePredictor,     // Modifies and returns its pix argument.
+	transformTypeCrossColor:    inverseCrossColor,    // Modifies and returns its pix argument.
+	transformTypeSubtractGreen: inverseSubtractGreen, // Modifies and returns its pix argument.
+	transformTypeColorIndexing: inverseColorIndexing, // Returns a new slice unless t.bits is 0.
+}
+
+func inversePredictor(t *transform, pix []byte, h int32) []byte { // Adds each pixel's prediction back onto pix, in place.
+	if t.oldWidth == 0 || h == 0 {
+		return pix
+	}
+	// The first pixel's predictor is mode 0 (opaque black).
+	pix[3] += 0xff
+	p, mask := int32(4), int32(1)<<t.bits-1 // p is the byte offset of the current pixel; x&mask == 0 on a tile's first column.
+	for x := int32(1); x < t.oldWidth; x++ {
+		// The rest of the first row's predictor is mode 1 (L).
+		pix[p+0] += pix[p-4]
+		pix[p+1] += pix[p-3]
+		pix[p+2] += pix[p-2]
+		pix[p+3] += pix[p-1]
+		p += 4
+	}
+	top, tilesPerRow := 0, nTiles(t.oldWidth, t.bits) // top is the byte offset of the pixel directly above p.
+	for y := int32(1); y < h; y++ {
+		// The first column's predictor is mode 2 (T).
+		pix[p+0] += pix[top+0]
+		pix[p+1] += pix[top+1]
+		pix[p+2] += pix[top+2]
+		pix[p+3] += pix[top+3]
+		p, top = p+4, top+4
+		// q steps through this tile row's 4-byte entries in t.pix; the low 4 bits of an entry's second byte are its mode.
+		q := 4 * (y >> t.bits) * tilesPerRow
+		predictorMode := t.pix[q+1] & 0x0f
+		q += 4
+		for x := int32(1); x < t.oldWidth; x++ {
+			if x&mask == 0 {
+				predictorMode = t.pix[q+1] & 0x0f
+				q += 4
+			}
+			switch predictorMode {
+			case 0: // Opaque black.
+				pix[p+3] += 0xff
+
+			case 1: // L.
+				pix[p+0] += pix[p-4]
+				pix[p+1] += pix[p-3]
+				pix[p+2] += pix[p-2]
+				pix[p+3] += pix[p-1]
+
+			case 2: // T.
+				pix[p+0] += pix[top+0]
+				pix[p+1] += pix[top+1]
+				pix[p+2] += pix[top+2]
+				pix[p+3] += pix[top+3]
+
+			case 3: // TR. For a row's last pixel, top+4 is the first pixel of the current row.
+				pix[p+0] += pix[top+4]
+				pix[p+1] += pix[top+5]
+				pix[p+2] += pix[top+6]
+				pix[p+3] += pix[top+7]
+
+			case 4: // TL.
+				pix[p+0] += pix[top-4]
+				pix[p+1] += pix[top-3]
+				pix[p+2] += pix[top-2]
+				pix[p+3] += pix[top-1]
+
+			case 5: // Average2(Average2(L, TR), T).
+				pix[p+0] += avg2(avg2(pix[p-4], pix[top+4]), pix[top+0])
+				pix[p+1] += avg2(avg2(pix[p-3], pix[top+5]), pix[top+1])
+				pix[p+2] += avg2(avg2(pix[p-2], pix[top+6]), pix[top+2])
+				pix[p+3] += avg2(avg2(pix[p-1], pix[top+7]), pix[top+3])
+
+			case 6: // Average2(L, TL).
+				pix[p+0] += avg2(pix[p-4], pix[top-4])
+				pix[p+1] += avg2(pix[p-3], pix[top-3])
+				pix[p+2] += avg2(pix[p-2], pix[top-2])
+				pix[p+3] += avg2(pix[p-1], pix[top-1])
+
+			case 7: // Average2(L, T).
+				pix[p+0] += avg2(pix[p-4], pix[top+0])
+				pix[p+1] += avg2(pix[p-3], pix[top+1])
+				pix[p+2] += avg2(pix[p-2], pix[top+2])
+				pix[p+3] += avg2(pix[p-1], pix[top+3])
+
+			case 8: // Average2(TL, T).
+				pix[p+0] += avg2(pix[top-4], pix[top+0])
+				pix[p+1] += avg2(pix[top-3], pix[top+1])
+				pix[p+2] += avg2(pix[top-2], pix[top+2])
+				pix[p+3] += avg2(pix[top-1], pix[top+3])
+
+			case 9: // Average2(T, TR).
+				pix[p+0] += avg2(pix[top+0], pix[top+4])
+				pix[p+1] += avg2(pix[top+1], pix[top+5])
+				pix[p+2] += avg2(pix[top+2], pix[top+6])
+				pix[p+3] += avg2(pix[top+3], pix[top+7])
+
+			case 10: // Average2(Average2(L, TL), Average2(T, TR)).
+				pix[p+0] += avg2(avg2(pix[p-4], pix[top-4]), avg2(pix[top+0], pix[top+4]))
+				pix[p+1] += avg2(avg2(pix[p-3], pix[top-3]), avg2(pix[top+1], pix[top+5]))
+				pix[p+2] += avg2(avg2(pix[p-2], pix[top-2]), avg2(pix[top+2], pix[top+6]))
+				pix[p+3] += avg2(avg2(pix[p-1], pix[top-1]), avg2(pix[top+3], pix[top+7]))
+
+			case 11: // Select(T, L, TL): T if TL is at least as close to L as to T, else L.
+				l0 := int32(pix[p-4])
+				l1 := int32(pix[p-3])
+				l2 := int32(pix[p-2])
+				l3 := int32(pix[p-1])
+				c0 := int32(pix[top-4])
+				c1 := int32(pix[top-3])
+				c2 := int32(pix[top-2])
+				c3 := int32(pix[top-1])
+				t0 := int32(pix[top+0])
+				t1 := int32(pix[top+1])
+				t2 := int32(pix[top+2])
+				t3 := int32(pix[top+3])
+				t := abs(c0-l0) + abs(c1-l1) + abs(c2-l2) + abs(c3-l3) // Distance from TL to L; this t shadows the *transform.
+				l := abs(c0-t0) + abs(c1-t1) + abs(c2-t2) + abs(c3-t3) // Distance from TL to T.
+				if t <= l {
+					pix[p+0] += uint8(t0)
+					pix[p+1] += uint8(t1)
+					pix[p+2] += uint8(t2)
+					pix[p+3] += uint8(t3)
+				} else {
+					pix[p+0] += uint8(l0)
+					pix[p+1] += uint8(l1)
+					pix[p+2] += uint8(l2)
+					pix[p+3] += uint8(l3)
+				}
+
+			case 12: // ClampAddSubtractFull(L, T, TL).
+				pix[p+0] += clampAddSubtractFull(pix[p-4], pix[top+0], pix[top-4])
+				pix[p+1] += clampAddSubtractFull(pix[p-3], pix[top+1], pix[top-3])
+				pix[p+2] += clampAddSubtractFull(pix[p-2], pix[top+2], pix[top-2])
+				pix[p+3] += clampAddSubtractFull(pix[p-1], pix[top+3], pix[top-1])
+
+			case 13: // ClampAddSubtractHalf(Average2(L, T), TL).
+				pix[p+0] += clampAddSubtractHalf(avg2(pix[p-4], pix[top+0]), pix[top-4])
+				pix[p+1] += clampAddSubtractHalf(avg2(pix[p-3], pix[top+1]), pix[top-3])
+				pix[p+2] += clampAddSubtractHalf(avg2(pix[p-2], pix[top+2]), pix[top-2])
+				pix[p+3] += clampAddSubtractHalf(avg2(pix[p-1], pix[top+3]), pix[top-1])
+			} // Modes 14 and 15 match no case and leave the pixel unchanged.
+			p, top = p+4, top+4
+		}
+	}
+	return pix
+}
+
+// inverseCrossColor undoes the cross-color transform on pix, in place. Each
+// tile of t.pix holds three signed multipliers, in the order red-to-blue,
+// green-to-blue, green-to-red; each product is shifted right by 5 bits.
+func inverseCrossColor(t *transform, pix []byte, h int32) []byte {
+	var g2r, g2b, r2b int32
+	tileMask, tilesPerRow := int32(1)<<t.bits-1, nTiles(t.oldWidth, t.bits)
+	for p, y := int32(0), int32(0); y < h; y++ {
+		q := 4 * (y >> t.bits) * tilesPerRow
+		for x := int32(0); x < t.oldWidth; x, p = x+1, p+4 {
+			if x&tileMask == 0 {
+				// First column of a tile: load that tile's multipliers.
+				r2b = int32(int8(t.pix[q+0]))
+				g2b = int32(int8(t.pix[q+1]))
+				g2r = int32(int8(t.pix[q+2]))
+				q += 4
+			}
+			r, g := pix[p+0], int32(int8(pix[p+1]))
+			r += uint8(uint32(g2r*g) >> 5)
+			// The red-to-blue term uses the red value corrected just above.
+			b := pix[p+2] + uint8(uint32(g2b*g)>>5) + uint8(uint32(r2b*int32(int8(r)))>>5)
+			pix[p+0], pix[p+2] = r, b
+		}
+	}
+	return pix
+}
+
+// inverseSubtractGreen adds each pixel's green byte back onto its red and blue bytes, in place.
+func inverseSubtractGreen(t *transform, pix []byte, h int32) []byte {
+	for g := 1; g <= len(pix); g += 4 {
+		pix[g-1] += pix[g]
+		pix[g+1] += pix[g]
+	}
+	return pix
+}
+
+// inverseColorIndexing undoes the color-indexing transform: every color index
+// held in the second byte of a pix pixel is replaced by its 4-byte entry in
+// the palette t.pix. If t.bits is non-zero, that byte packs several indices,
+// lowest bits first, and the result is a newly allocated oldWidth-by-h image.
+func inverseColorIndexing(t *transform, pix []byte, h int32) []byte {
+	if t.bits == 0 {
+		// One index per pixel: substitute the palette entry in place.
+		for p := 0; p < len(pix); p += 4 {
+			i := 4 * uint32(pix[p+1])
+			pix[p+0] = t.pix[i+0]
+			pix[p+1] = t.pix[i+1]
+			pix[p+2] = t.pix[i+2]
+			pix[p+3] = t.pix[i+3]
+		}
+		return pix
+	}
+
+	// Only bits of 1, 2 or 3 set the masks; any other value leaves them zero.
+	indexBits, indexMask, packMask := uint32(8>>t.bits), uint32(0), int32(0)
+	if 1 <= t.bits && t.bits <= 3 {
+		indexMask, packMask = uint32(1)<<indexBits-1, int32(1)<<t.bits-1
+	}
+
+	// Unpack into a fresh buffer, since the output is wider than the input.
+	out := make([]byte, 4*t.oldWidth*h)
+	src, dst, packed := 0, 0, uint32(0)
+	for y := int32(0); y < h; y++ {
+		for x := int32(0); x < t.oldWidth; x, dst = x+1, dst+4 {
+			if x&packMask == 0 {
+				// Load the next packed byte; every row starts on a fresh one.
+				packed, src = uint32(pix[src+1]), src+4
+			}
+			i := 4 * (packed & indexMask)
+			packed >>= indexBits
+			out[dst+0] = t.pix[i+0]
+			out[dst+1] = t.pix[i+1]
+			out[dst+2] = t.pix[i+2]
+			out[dst+3] = t.pix[i+3]
+		}
+	}
+	return out
+}
+
+// abs returns the absolute value of x.
+func abs(x int32) int32 {
+	// sign is -1 if x is negative and 0 otherwise, so (x^sign)-sign negates x only when it is negative.
+	sign := x >> 31
+	return (x ^ sign) - sign
+}
+
+// avg2 returns the average of a and b, rounded down. The sum is computed in a
+// wider type so that it cannot overflow.
+func avg2(a, b uint8) uint8 { return uint8((uint16(a) + uint16(b)) >> 1) }
+
+// clampAddSubtractFull returns a + b - c, clamped to the range [0, 255].
+func clampAddSubtractFull(a, b, c uint8) uint8 {
+	x := int32(a) + int32(b) - int32(c)
+	if x < 0 {
+		x = 0
+	} else if x > 255 {
+		x = 255
+	}
+	return uint8(x)
+}
+
+// clampAddSubtractHalf returns a + (a-b)/2, clamped to the range [0, 255].
+func clampAddSubtractHalf(a, b uint8) uint8 {
+	x := int32(a) + (int32(a)-int32(b))/2 // The quotient truncates toward zero.
+	if x < 0 {
+		x = 0
+	} else if x > 255 {
+		x = 255
+	}
+	return uint8(x)
+}
diff --git a/webp/decode.go b/webp/decode.go
index e58cb53..2ed7f36 100644
--- a/webp/decode.go
+++ b/webp/decode.go
@@ -4,8 +4,8 @@
 
 // Package webp implements a decoder for WEBP images.
 //
-// WEBP is defined in the VP8 specification at:
-// http://datatracker.ietf.org/doc/rfc6386/
+// WEBP is defined at:
+// https://developers.google.com/speed/webp/docs/riff_container
 package webp
 
 import (
@@ -15,61 +15,81 @@
 	"io"
 
 	"code.google.com/p/go.image/vp8"
+	"code.google.com/p/go.image/vp8l"
 )
 
-func decode(r io.Reader) (d *vp8.Decoder, fh vp8.FrameHeader, err error) {
+const (
+	formatVP8  = 1 // The chunk header is "VP8 ": decoded by the vp8 package.
+	formatVP8L = 2 // The chunk header is "VP8L": decoded by the vp8l package.
+)
+
+func decode(r io.Reader, configOnly bool) (image.Image, image.Config, error) { // Returns the decoded image or, if configOnly, only its config.
 	var b [20]byte
-	if _, err = io.ReadFull(r, b[:]); err != nil {
-		return
+	if _, err := io.ReadFull(r, b[:]); err != nil {
+		return nil, image.Config{}, err
 	}
-	if string(b[0:4]) != "RIFF" || string(b[8:16]) != "WEBPVP8 " {
-		err = errors.New("webp: invalid format")
-		return
+	format := 0 // Stays 0 when bytes 8-15 name neither supported payload.
+	switch string(b[8:16]) {
+	case "WEBPVP8 ":
+		format = formatVP8
+	case "WEBPVP8L":
+		format = formatVP8L
+	}
+	if string(b[:4]) != "RIFF" || format == 0 {
+		return nil, image.Config{}, errors.New("webp: invalid format")
 	}
 	riffLen := uint32(b[4]) | uint32(b[5])<<8 | uint32(b[6])<<16 | uint32(b[7])<<24
 	dataLen := uint32(b[16]) | uint32(b[17])<<8 | uint32(b[18])<<16 | uint32(b[19])<<24
 	if riffLen < dataLen+12 {
-		err = errors.New("webp: invalid format")
-		return
+		return nil, image.Config{}, errors.New("webp: invalid format")
 	}
 	if dataLen >= 1<<31 {
-		err = errors.New("webp: invalid format")
-		return
+		return nil, image.Config{}, errors.New("webp: invalid format")
 	}
-	d = vp8.NewDecoder()
-	d.Init(r, int(dataLen))
-	fh, err = d.DecodeFrameHeader()
-	if err != nil {
-		d, fh = nil, vp8.FrameHeader{}
-		return
+
+	if format == formatVP8 { // Lossy payload: handled by the vp8 package.
+		d := vp8.NewDecoder()
+		d.Init(r, int(dataLen))
+		fh, err := d.DecodeFrameHeader()
+		if err != nil {
+			return nil, image.Config{}, err
+		}
+		if configOnly {
+			return nil, image.Config{
+				ColorModel: color.YCbCrModel,
+				Width:      fh.Width,
+				Height:     fh.Height,
+			}, nil
+		}
+		m, err := d.DecodeFrame()
+		return m, image.Config{}, err // Report a frame decoding failure instead of dropping it.
 	}
-	return
+
+	r = &io.LimitedReader{R: r, N: int64(dataLen)} // Lossless payload: vp8l may read at most dataLen bytes.
+	if configOnly {
+		c, err := vp8l.DecodeConfig(r)
+		return nil, c, err
+	}
+	m, err := vp8l.Decode(r)
+	return m, image.Config{}, err
 }
 
 // Decode reads a WEBP image from r and returns it as an image.Image.
 func Decode(r io.Reader) (image.Image, error) {
-	d, _, err := decode(r)
+	img, _, err := decode(r, false)
 	if err != nil {
 		return nil, err
 	}
-	return d.DecodeFrame()
+	return img, nil
 }
 
 // DecodeConfig returns the color model and dimensions of a WEBP image without
 // decoding the entire image.
 func DecodeConfig(r io.Reader) (image.Config, error) {
-	_, fh, err := decode(r)
-	if err != nil {
-		return image.Config{}, err
-	}
-	c := image.Config{
-		ColorModel: color.YCbCrModel,
-		Width:      fh.Width,
-		Height:     fh.Height,
-	}
-	return c, nil
+	_, cfg, err := decode(r, true)
+	return cfg, err
 }
 
 func init() {
-	image.RegisterFormat("webp", "RIFF????WEBPVP8 ", Decode, DecodeConfig)
+	image.RegisterFormat("webp", "RIFF????WEBPVP8", Decode, DecodeConfig) // The magic stops after "VP8" so that both "VP8 " and "VP8L" match.
 }
diff --git a/webp/decode_test.go b/webp/decode_test.go
index 516ff15..37d6913 100644
--- a/webp/decode_test.go
+++ b/webp/decode_test.go
@@ -9,7 +9,9 @@
 	"fmt"
 	"image"
 	"image/png"
+	"io/ioutil"
 	"os"
+	"strings"
 	"testing"
 )
 
@@ -108,3 +110,105 @@
 		}
 	}
 }
+
+// TestDecodeVP8L checks that each lossless WEBP file decodes to the same pixels as its PNG counterpart.
+func TestDecodeVP8L(t *testing.T) {
+	testCases := []string{
+		"blue-purple-pink",
+		"gopher-doc.1bpp",
+		"gopher-doc.2bpp",
+		"gopher-doc.4bpp",
+		"gopher-doc.8bpp",
+		"tux",
+		"yellow_rose",
+	}
+
+loop:
+	for _, tc := range testCases {
+		f0, err := os.Open("../testdata/" + tc + ".lossless.webp")
+		if err != nil {
+			t.Errorf("%s: Open WEBP: %v", tc, err)
+			continue
+		}
+		defer f0.Close()
+		img0, err := Decode(f0)
+		if err != nil {
+			t.Errorf("%s: Decode WEBP: %v", tc, err)
+			continue
+		}
+		m0, ok := img0.(*image.NRGBA)
+		if !ok {
+			t.Errorf("%s: WEBP image is %T, want *image.NRGBA", tc, img0)
+			continue
+		}
+
+		f1, err := os.Open("../testdata/" + tc + ".png")
+		if err != nil {
+			t.Errorf("%s: Open PNG: %v", tc, err)
+			continue
+		}
+		defer f1.Close()
+		img1, err := png.Decode(f1)
+		if err != nil {
+			t.Errorf("%s: Decode PNG: %v", tc, err)
+			continue
+		}
+		m1, ok := img1.(*image.NRGBA)
+		if !ok {
+			rgba1, ok := img1.(*image.RGBA)
+			if !ok {
+				t.Errorf("%s: PNG image is %T, want *image.NRGBA", tc, img1)
+				continue
+			}
+			if !rgba1.Opaque() {
+				t.Errorf("%s: PNG image is non-opaque *image.RGBA, want *image.NRGBA", tc)
+				continue
+			}
+			// The image is fully opaque, so we can re-interpret the RGBA pixels
+			// as NRGBA pixels.
+			m1 = &image.NRGBA{
+				Pix:    rgba1.Pix,
+				Stride: rgba1.Stride,
+				Rect:   rgba1.Rect,
+			}
+		}
+
+		if b0, b1 := m0.Bounds(), m1.Bounds(); b0 != b1 {
+			t.Errorf("%s: bounds: got %v, want %v", tc, b0, b1)
+			continue
+		}
+		for i := range m0.Pix {
+			if m0.Pix[i] != m1.Pix[i] {
+				y := i / m0.Stride
+				x := (i - y*m0.Stride) / 4
+				i = y*m0.Stride + 4*x // Byte offset of pixel (x, y): Stride is already measured in bytes.
+				t.Errorf("%s: at (%d, %d):\ngot  %02x %02x %02x %02x\nwant %02x %02x %02x %02x",
+					tc, x, y,
+					m0.Pix[i+0], m0.Pix[i+1], m0.Pix[i+2], m0.Pix[i+3],
+					m1.Pix[i+0], m1.Pix[i+1], m1.Pix[i+2], m1.Pix[i+3],
+				)
+				continue loop
+			}
+		}
+	}
+}
+
+func benchmarkDecode(b *testing.B, filename string) { // Times Decode on ../testdata/<filename>.webp.
+	raw, err := ioutil.ReadFile("../testdata/" + filename + ".webp")
+	if err != nil {
+		b.Fatal(err)
+	}
+	encoded := string(raw)
+	config, err := DecodeConfig(strings.NewReader(encoded))
+	if err != nil {
+		b.Fatal(err)
+	}
+	b.SetBytes(int64(config.Width * config.Height * 4)) // Throughput counts 4 bytes per decoded pixel.
+	b.ResetTimer()
+	for n := b.N; n > 0; n-- {
+		Decode(strings.NewReader(encoded))
+	}
+}
+
+func BenchmarkDecodeVP8(b *testing.B)  { benchmarkDecode(b, "yellow_rose.lossy") }    // Decodes ../testdata/yellow_rose.lossy.webp.
+func BenchmarkDecodeVP8L(b *testing.B) { benchmarkDecode(b, "yellow_rose.lossless") } // Decodes ../testdata/yellow_rose.lossless.webp.