go.image/vp8: implement the loop filter (simple filter only; normal filtering is still a TODO).

The testdata files were created by:
cwebp video-001.png -o video-001.webp
dwebp -pgm video-001.webp -o tmp.pgm
convert tmp.pgm video-001.webp.ycbcr.png
rm tmp.pgm

LGTM=r
R=r
CC=chaishushan, golang-codereviews
https://golang.org/cl/91700044
diff --git a/testdata/video-001.webp b/testdata/video-001.webp
new file mode 100644
index 0000000..302198e
--- /dev/null
+++ b/testdata/video-001.webp
Binary files differ
diff --git a/testdata/video-001.webp.ycbcr.png b/testdata/video-001.webp.ycbcr.png
new file mode 100644
index 0000000..dc5f8cf
--- /dev/null
+++ b/testdata/video-001.webp.ycbcr.png
Binary files differ
diff --git a/vp8/decode.go b/vp8/decode.go
index 9cb2a71..825fb00 100644
--- a/vp8/decode.go
+++ b/vp8/decode.go
@@ -4,8 +4,7 @@
 
 // Package vp8 implements a vp8 image and video decoder.
 //
-// The VP8 specification is at:
-// http://datatracker.ietf.org/doc/rfc6386/
+// The VP8 specification is RFC 6386.
 package vp8
 
 // This file implements the top-level decoding algorithm.
@@ -121,6 +120,9 @@
 	tokenProb   [nPlane][nBand][nContext][nProb]uint8
 	useSkipProb bool
 	skipProb    uint8
+	// Loop filter parameters.
+	filterParams      [nSegment][2]filterParam
+	perMBFilterParams []filterParam
 
 	// The eight fields below relate to the current macroblock being decoded.
 	//
@@ -133,7 +135,7 @@
 	// Bitmasks for which 4x4 regions of coeff contain non-zero coefficients.
 	nzDCMask, nzACMask uint32
 	// Predictor modes.
-	usePredY16 bool
+	usePredY16 bool // The libwebp C code calls this !is_i4x4_.
 	predY16    uint8
 	predC8     uint8
 	predY4     [4][4]uint8
@@ -202,6 +204,7 @@
 	}
 	m := image.NewYCbCr(image.Rect(0, 0, 16*d.mbw, 16*d.mbh), image.YCbCrSubsampleRatio420)
 	d.img = m.SubImage(image.Rect(0, 0, d.frameHeader.Width, d.frameHeader.Height)).(*image.YCbCr)
+	d.perMBFilterParams = make([]filterParam, d.mbw*d.mbh)
 	d.upMB = make([]mb, d.mbw)
 }
 
@@ -262,6 +265,7 @@
 	} else {
 		d.filterHeader.perSegmentLevel[0] = d.filterHeader.level
 	}
+	d.computeFilterParams()
 }
 
 // parseOtherPartitions parses the other partitions, as specified in section 9.5.
@@ -334,15 +338,18 @@
 	if err := d.parseOtherHeaders(); err != nil {
 		return nil, err
 	}
+	// Reconstruct the rows.
 	for mbx := 0; mbx < d.mbw; mbx++ {
 		d.upMB[mbx] = mb{}
 	}
 	for mby := 0; mby < d.mbh; mby++ {
 		d.leftMB = mb{}
 		for mbx := 0; mbx < d.mbw; mbx++ {
-			d.reconstruct(mbx, mby)
+			skip := d.reconstruct(mbx, mby)
+			fs := d.filterParams[d.segment][btou(!d.usePredY16)]
+			fs.inner = fs.inner || !skip
+			d.perMBFilterParams[d.mbw*mby+mbx] = fs
 		}
-		// TODO(nigeltao): filter, as specified in chapter 15.
 	}
 	if d.fp.unexpectedEOF {
 		return nil, io.ErrUnexpectedEOF
@@ -352,5 +359,17 @@
 			return nil, io.ErrUnexpectedEOF
 		}
 	}
+	// Apply the loop filter.
+	//
+	// Even if we are using per-segment levels, section 15 says that "loop
+	// filtering must be skipped entirely if loop_filter_level at either the
+	// frame header level or macroblock override level is 0".
+	if d.filterHeader.level != 0 {
+		if d.filterHeader.simple {
+			d.simpleFilter()
+		} else {
+			// TODO(nigeltao): normal filtering.
+		}
+	}
 	return d.img, nil
 }
diff --git a/vp8/filter.go b/vp8/filter.go
new file mode 100644
index 0000000..aa7e119
--- /dev/null
+++ b/vp8/filter.go
@@ -0,0 +1,368 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package vp8
+
+// filter2 modifies a 2-pixel-wide or 2-pixel-high band along an edge: at each of 16 positions (index advancing by iStep) it reads p1, p0, q0, q1 across the edge (jStep apart) and rewrites only p0 and q0.
+func filter2(pix []byte, thresh int, index int, iStep int, jStep int) {
+	for i := 0; i < 16; i, index = i+1, index+iStep {
+		p1 := int(pix[index-2*jStep])
+		p0 := int(pix[index-1*jStep])
+		q0 := int(pix[index+0*jStep])
+		q1 := int(pix[index+1*jStep])
+		if int(lutAbs[lutAbsBase+p0-q0])<<1+int(lutAbs[lutAbsBase+p1-q1])>>1 > thresh { // 2*|p0-q0| + |p1-q1|/2 is over the threshold: leave this position unchanged
+			continue
+		}
+		a := 3*(q0-p0) + int(lutClamp127[lutClamp127Base+p1-q1])
+		a1 := int(lutClamp15[lutClamp15Base+((a+4)>>3)]) // adjustment subtracted from q0
+		a2 := int(lutClamp15[lutClamp15Base+((a+3)>>3)]) // adjustment added to p0
+		pix[index-1*jStep] = lutClamp255[lutClamp255Base+p0+a2]
+		pix[index+0*jStep] = lutClamp255[lutClamp255Base+q0-a1]
+	}
+}
+
+// TODO(nigeltao): filter4 and filter6 functions, for normal filtering.
+
+// simpleFilter implements the simple filter, as specified in section 15.2. Only d.img.Y is filtered: macroblock edges use threshold limit+4, interior edges use limit.
+func (d *Decoder) simpleFilter() {
+	for mby := 0; mby < d.mbh; mby++ {
+		for mbx := 0; mbx < d.mbw; mbx++ {
+			f := d.perMBFilterParams[d.mbw*mby+mbx]
+			if f.limit == 0 { // a zero limit means this macroblock is not filtered
+				continue
+			}
+			index := (mby*d.img.YStride + mbx) * 16 // offset in d.img.Y of the macroblock's top-left pixel
+			if mbx > 0 {
+				filter2(d.img.Y, int(f.limit)+4, index, d.img.YStride, 1) // left macroblock edge; the first column has none
+			}
+			if f.inner { // vertical edges 4, 8 and 12 pixels into the macroblock
+				filter2(d.img.Y, int(f.limit), index+4, d.img.YStride, 1)
+				filter2(d.img.Y, int(f.limit), index+8, d.img.YStride, 1)
+				filter2(d.img.Y, int(f.limit), index+12, d.img.YStride, 1)
+			}
+			if mby > 0 {
+				filter2(d.img.Y, int(f.limit)+4, index, 1, d.img.YStride) // top macroblock edge; the first row has none
+			}
+			if f.inner { // horizontal edges 4, 8 and 12 pixels into the macroblock
+				filter2(d.img.Y, int(f.limit), index+d.img.YStride*4, 1, d.img.YStride)
+				filter2(d.img.Y, int(f.limit), index+d.img.YStride*8, 1, d.img.YStride)
+				filter2(d.img.Y, int(f.limit), index+d.img.YStride*12, 1, d.img.YStride)
+			}
+		}
+	}
+}
+
+// filterParam holds the loop filter parameters for a macroblock.
+type filterParam struct {
+	limit      uint8 // edge threshold, 2*level+ilevel as set by computeFilterParams; 0 means no filtering
+	inner      bool  // whether the macroblock's interior edges are filtered too
+	innerLevel uint8 // interior level (ilevel); not read by simpleFilter
+	hevThresh  uint8 // 0 to 3, picked from level and KeyFrame; not read by simpleFilter
+}
+
+// computeFilterParams computes the loop filter parameters, as specified in
+// section 15.4. It fills d.filterParams[i][j] for every segment i and for j in {0, 1}.
+func (d *Decoder) computeFilterParams() {
+	for i := range d.filterParams { // i is the segment index
+		baseLevel := d.filterHeader.level
+		if d.segmentHeader.useSegment { // the per-segment strength replaces the frame-level value...
+			baseLevel = d.segmentHeader.filterStrength[i]
+			if d.segmentHeader.relativeDelta { // ...or is added to it when it is a relative delta
+				baseLevel += d.filterHeader.level
+			}
+		}
+
+		for j := range d.filterParams[i] {
+			p := &d.filterParams[i][j]
+			p.inner = j != 0 // the j == 1 entry starts with interior filtering enabled
+			level := baseLevel
+			if d.filterHeader.useLFDelta {
+				// The libwebp C code has a "TODO: only CURRENT is handled for now."
+				level += d.filterHeader.refLFDelta[0] // NOTE(review): level is int8 here (see x below); confirm the header value ranges cannot overflow it
+				if j != 0 {
+					level += d.filterHeader.modeLFDelta[0]
+				}
+			}
+			if level <= 0 { // non-positive level: mark the entry as unfiltered and leave its other fields alone
+				p.limit = 0
+				continue
+			}
+			if level > 63 {
+				level = 63
+			}
+			ilevel := level
+			if d.filterHeader.sharpness > 0 { // a non-zero sharpness lowers the interior level
+				if d.filterHeader.sharpness > 4 {
+					ilevel >>= 2
+				} else {
+					ilevel >>= 1
+				}
+				if x := int8(9 - d.filterHeader.sharpness); ilevel > x { // cap the interior level at 9-sharpness
+					ilevel = x
+				}
+			}
+			if ilevel < 1 {
+				ilevel = 1
+			}
+			p.innerLevel = uint8(ilevel)
+			p.limit = uint8(2*level + ilevel) // the int8 sum may wrap past 127; the uint8 conversion still yields 2*level+ilevel (at most 189)
+			if d.frameHeader.KeyFrame {
+				if level < 15 {
+					p.hevThresh = 0
+				} else if level < 40 {
+					p.hevThresh = 1
+				} else {
+					p.hevThresh = 2
+				}
+			} else { // non-key frames add a fourth step, at level 20
+				if level < 15 {
+					p.hevThresh = 0
+				} else if level < 20 {
+					p.hevThresh = 1
+				} else if level < 40 {
+					p.hevThresh = 2
+				} else {
+					p.hevThresh = 3
+				}
+			}
+		}
+	}
+}
+
+// Look-up tables.
+//
+// TODO(nigeltao): are these actually worth it in Go, as opposed to C,
+// due to the bounds checking?
+
+const lutAbsBase = 255 // index of the x == 0 entry in lutAbs
+
+// lutAbs[lutAbsBase+x] is equal to abs(x), for x in [-255, 255].
+var lutAbs = [255 + 1 + 255]uint8{
+	0xff, 0xfe, 0xfd, 0xfc, 0xfb, 0xfa, 0xf9, 0xf8, 0xf7, 0xf6, 0xf5, 0xf4, 0xf3, 0xf2, 0xf1, 0xf0,
+	0xef, 0xee, 0xed, 0xec, 0xeb, 0xea, 0xe9, 0xe8, 0xe7, 0xe6, 0xe5, 0xe4, 0xe3, 0xe2, 0xe1, 0xe0,
+	0xdf, 0xde, 0xdd, 0xdc, 0xdb, 0xda, 0xd9, 0xd8, 0xd7, 0xd6, 0xd5, 0xd4, 0xd3, 0xd2, 0xd1, 0xd0,
+	0xcf, 0xce, 0xcd, 0xcc, 0xcb, 0xca, 0xc9, 0xc8, 0xc7, 0xc6, 0xc5, 0xc4, 0xc3, 0xc2, 0xc1, 0xc0,
+	0xbf, 0xbe, 0xbd, 0xbc, 0xbb, 0xba, 0xb9, 0xb8, 0xb7, 0xb6, 0xb5, 0xb4, 0xb3, 0xb2, 0xb1, 0xb0,
+	0xaf, 0xae, 0xad, 0xac, 0xab, 0xaa, 0xa9, 0xa8, 0xa7, 0xa6, 0xa5, 0xa4, 0xa3, 0xa2, 0xa1, 0xa0,
+	0x9f, 0x9e, 0x9d, 0x9c, 0x9b, 0x9a, 0x99, 0x98, 0x97, 0x96, 0x95, 0x94, 0x93, 0x92, 0x91, 0x90,
+	0x8f, 0x8e, 0x8d, 0x8c, 0x8b, 0x8a, 0x89, 0x88, 0x87, 0x86, 0x85, 0x84, 0x83, 0x82, 0x81, 0x80,
+	0x7f, 0x7e, 0x7d, 0x7c, 0x7b, 0x7a, 0x79, 0x78, 0x77, 0x76, 0x75, 0x74, 0x73, 0x72, 0x71, 0x70,
+	0x6f, 0x6e, 0x6d, 0x6c, 0x6b, 0x6a, 0x69, 0x68, 0x67, 0x66, 0x65, 0x64, 0x63, 0x62, 0x61, 0x60,
+	0x5f, 0x5e, 0x5d, 0x5c, 0x5b, 0x5a, 0x59, 0x58, 0x57, 0x56, 0x55, 0x54, 0x53, 0x52, 0x51, 0x50,
+	0x4f, 0x4e, 0x4d, 0x4c, 0x4b, 0x4a, 0x49, 0x48, 0x47, 0x46, 0x45, 0x44, 0x43, 0x42, 0x41, 0x40,
+	0x3f, 0x3e, 0x3d, 0x3c, 0x3b, 0x3a, 0x39, 0x38, 0x37, 0x36, 0x35, 0x34, 0x33, 0x32, 0x31, 0x30,
+	0x2f, 0x2e, 0x2d, 0x2c, 0x2b, 0x2a, 0x29, 0x28, 0x27, 0x26, 0x25, 0x24, 0x23, 0x22, 0x21, 0x20,
+	0x1f, 0x1e, 0x1d, 0x1c, 0x1b, 0x1a, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10,
+	0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00,
+	0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10,
+	0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
+	0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
+	0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, 0x40,
+	0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50,
+	0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, 0x60,
+	0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70,
+	0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x80,
+	0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x90,
+	0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, 0xa0,
+	0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, 0xb0,
+	0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, 0xc0,
+	0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0,
+	0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, 0xe0,
+	0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0,
+	0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff,
+}
+
+const lutClamp15Base = 255 // index of the x == 0 entry in lutClamp15
+
+// lutClamp15[lutClamp15Base+x] is equal to clamp(x, -16, +15), for x in [-255, 255].
+var lutClamp15 = [255 + 1 + 255]int8{
+	-0x10, -0x10, -0x10, -0x10, -0x10, -0x10, -0x10, -0x10,
+	-0x10, -0x10, -0x10, -0x10, -0x10, -0x10, -0x10, -0x10,
+	-0x10, -0x10, -0x10, -0x10, -0x10, -0x10, -0x10, -0x10,
+	-0x10, -0x10, -0x10, -0x10, -0x10, -0x10, -0x10, -0x10,
+	-0x10, -0x10, -0x10, -0x10, -0x10, -0x10, -0x10, -0x10,
+	-0x10, -0x10, -0x10, -0x10, -0x10, -0x10, -0x10, -0x10,
+	-0x10, -0x10, -0x10, -0x10, -0x10, -0x10, -0x10, -0x10,
+	-0x10, -0x10, -0x10, -0x10, -0x10, -0x10, -0x10, -0x10,
+	-0x10, -0x10, -0x10, -0x10, -0x10, -0x10, -0x10, -0x10,
+	-0x10, -0x10, -0x10, -0x10, -0x10, -0x10, -0x10, -0x10,
+	-0x10, -0x10, -0x10, -0x10, -0x10, -0x10, -0x10, -0x10,
+	-0x10, -0x10, -0x10, -0x10, -0x10, -0x10, -0x10, -0x10,
+	-0x10, -0x10, -0x10, -0x10, -0x10, -0x10, -0x10, -0x10,
+	-0x10, -0x10, -0x10, -0x10, -0x10, -0x10, -0x10, -0x10,
+	-0x10, -0x10, -0x10, -0x10, -0x10, -0x10, -0x10, -0x10,
+	-0x10, -0x10, -0x10, -0x10, -0x10, -0x10, -0x10, -0x10,
+	-0x10, -0x10, -0x10, -0x10, -0x10, -0x10, -0x10, -0x10,
+	-0x10, -0x10, -0x10, -0x10, -0x10, -0x10, -0x10, -0x10,
+	-0x10, -0x10, -0x10, -0x10, -0x10, -0x10, -0x10, -0x10,
+	-0x10, -0x10, -0x10, -0x10, -0x10, -0x10, -0x10, -0x10,
+	-0x10, -0x10, -0x10, -0x10, -0x10, -0x10, -0x10, -0x10,
+	-0x10, -0x10, -0x10, -0x10, -0x10, -0x10, -0x10, -0x10,
+	-0x10, -0x10, -0x10, -0x10, -0x10, -0x10, -0x10, -0x10,
+	-0x10, -0x10, -0x10, -0x10, -0x10, -0x10, -0x10, -0x10,
+	-0x10, -0x10, -0x10, -0x10, -0x10, -0x10, -0x10, -0x10,
+	-0x10, -0x10, -0x10, -0x10, -0x10, -0x10, -0x10, -0x10,
+	-0x10, -0x10, -0x10, -0x10, -0x10, -0x10, -0x10, -0x10,
+	-0x10, -0x10, -0x10, -0x10, -0x10, -0x10, -0x10, -0x10,
+	-0x10, -0x10, -0x10, -0x10, -0x10, -0x10, -0x10, -0x10,
+	-0x10, -0x10, -0x10, -0x10, -0x10, -0x10, -0x10, -0x10,
+	-0x0f, -0x0e, -0x0d, -0x0c, -0x0b, -0x0a, -0x09, -0x08,
+	-0x07, -0x06, -0x05, -0x04, -0x03, -0x02, -0x01, +0x00,
+	+0x01, +0x02, +0x03, +0x04, +0x05, +0x06, +0x07, +0x08,
+	+0x09, +0x0a, +0x0b, +0x0c, +0x0d, +0x0e, +0x0f, +0x0f,
+	+0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f,
+	+0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f,
+	+0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f,
+	+0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f,
+	+0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f,
+	+0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f,
+	+0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f,
+	+0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f,
+	+0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f,
+	+0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f,
+	+0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f,
+	+0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f,
+	+0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f,
+	+0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f,
+	+0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f,
+	+0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f,
+	+0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f,
+	+0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f,
+	+0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f,
+	+0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f,
+	+0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f,
+	+0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f,
+	+0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f,
+	+0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f,
+	+0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f,
+	+0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f,
+	+0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f,
+	+0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f,
+	+0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f,
+	+0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f, +0x0f,
+}
+
+const lutClamp127Base = 255 // index of the x == 0 entry in lutClamp127
+
+// lutClamp127[lutClamp127Base+x] is equal to clamp(x, -128, +127), for x in [-255, 255].
+var lutClamp127 = [255 + 1 + 255]int8{
+	-0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
+	-0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
+	-0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
+	-0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
+	-0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
+	-0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
+	-0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
+	-0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
+	-0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
+	-0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
+	-0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
+	-0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
+	-0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
+	-0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
+	-0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
+	-0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
+	-0x7f, -0x7e, -0x7d, -0x7c, -0x7b, -0x7a, -0x79, -0x78,
+	-0x77, -0x76, -0x75, -0x74, -0x73, -0x72, -0x71, -0x70,
+	-0x6f, -0x6e, -0x6d, -0x6c, -0x6b, -0x6a, -0x69, -0x68,
+	-0x67, -0x66, -0x65, -0x64, -0x63, -0x62, -0x61, -0x60,
+	-0x5f, -0x5e, -0x5d, -0x5c, -0x5b, -0x5a, -0x59, -0x58,
+	-0x57, -0x56, -0x55, -0x54, -0x53, -0x52, -0x51, -0x50,
+	-0x4f, -0x4e, -0x4d, -0x4c, -0x4b, -0x4a, -0x49, -0x48,
+	-0x47, -0x46, -0x45, -0x44, -0x43, -0x42, -0x41, -0x40,
+	-0x3f, -0x3e, -0x3d, -0x3c, -0x3b, -0x3a, -0x39, -0x38,
+	-0x37, -0x36, -0x35, -0x34, -0x33, -0x32, -0x31, -0x30,
+	-0x2f, -0x2e, -0x2d, -0x2c, -0x2b, -0x2a, -0x29, -0x28,
+	-0x27, -0x26, -0x25, -0x24, -0x23, -0x22, -0x21, -0x20,
+	-0x1f, -0x1e, -0x1d, -0x1c, -0x1b, -0x1a, -0x19, -0x18,
+	-0x17, -0x16, -0x15, -0x14, -0x13, -0x12, -0x11, -0x10,
+	-0x0f, -0x0e, -0x0d, -0x0c, -0x0b, -0x0a, -0x09, -0x08,
+	-0x07, -0x06, -0x05, -0x04, -0x03, -0x02, -0x01, +0x00,
+	+0x01, +0x02, +0x03, +0x04, +0x05, +0x06, +0x07, +0x08,
+	+0x09, +0x0a, +0x0b, +0x0c, +0x0d, +0x0e, +0x0f, +0x10,
+	+0x11, +0x12, +0x13, +0x14, +0x15, +0x16, +0x17, +0x18,
+	+0x19, +0x1a, +0x1b, +0x1c, +0x1d, +0x1e, +0x1f, +0x20,
+	+0x21, +0x22, +0x23, +0x24, +0x25, +0x26, +0x27, +0x28,
+	+0x29, +0x2a, +0x2b, +0x2c, +0x2d, +0x2e, +0x2f, +0x30,
+	+0x31, +0x32, +0x33, +0x34, +0x35, +0x36, +0x37, +0x38,
+	+0x39, +0x3a, +0x3b, +0x3c, +0x3d, +0x3e, +0x3f, +0x40,
+	+0x41, +0x42, +0x43, +0x44, +0x45, +0x46, +0x47, +0x48,
+	+0x49, +0x4a, +0x4b, +0x4c, +0x4d, +0x4e, +0x4f, +0x50,
+	+0x51, +0x52, +0x53, +0x54, +0x55, +0x56, +0x57, +0x58,
+	+0x59, +0x5a, +0x5b, +0x5c, +0x5d, +0x5e, +0x5f, +0x60,
+	+0x61, +0x62, +0x63, +0x64, +0x65, +0x66, +0x67, +0x68,
+	+0x69, +0x6a, +0x6b, +0x6c, +0x6d, +0x6e, +0x6f, +0x70,
+	+0x71, +0x72, +0x73, +0x74, +0x75, +0x76, +0x77, +0x78,
+	+0x79, +0x7a, +0x7b, +0x7c, +0x7d, +0x7e, +0x7f, +0x7f,
+	+0x7f, +0x7f, +0x7f, +0x7f, +0x7f, +0x7f, +0x7f, +0x7f,
+	+0x7f, +0x7f, +0x7f, +0x7f, +0x7f, +0x7f, +0x7f, +0x7f,
+	+0x7f, +0x7f, +0x7f, +0x7f, +0x7f, +0x7f, +0x7f, +0x7f,
+	+0x7f, +0x7f, +0x7f, +0x7f, +0x7f, +0x7f, +0x7f, +0x7f,
+	+0x7f, +0x7f, +0x7f, +0x7f, +0x7f, +0x7f, +0x7f, +0x7f,
+	+0x7f, +0x7f, +0x7f, +0x7f, +0x7f, +0x7f, +0x7f, +0x7f,
+	+0x7f, +0x7f, +0x7f, +0x7f, +0x7f, +0x7f, +0x7f, +0x7f,
+	+0x7f, +0x7f, +0x7f, +0x7f, +0x7f, +0x7f, +0x7f, +0x7f,
+	+0x7f, +0x7f, +0x7f, +0x7f, +0x7f, +0x7f, +0x7f, +0x7f,
+	+0x7f, +0x7f, +0x7f, +0x7f, +0x7f, +0x7f, +0x7f, +0x7f,
+	+0x7f, +0x7f, +0x7f, +0x7f, +0x7f, +0x7f, +0x7f, +0x7f,
+	+0x7f, +0x7f, +0x7f, +0x7f, +0x7f, +0x7f, +0x7f, +0x7f,
+	+0x7f, +0x7f, +0x7f, +0x7f, +0x7f, +0x7f, +0x7f, +0x7f,
+	+0x7f, +0x7f, +0x7f, +0x7f, +0x7f, +0x7f, +0x7f, +0x7f,
+	+0x7f, +0x7f, +0x7f, +0x7f, +0x7f, +0x7f, +0x7f, +0x7f,
+	+0x7f, +0x7f, +0x7f, +0x7f, +0x7f, +0x7f, +0x7f,
+}
+
+const lutClamp255Base = 255 // index of the x == 0 entry in lutClamp255
+
+// lutClamp255[lutClamp255Base+x] is equal to clamp(x, 0, 255), for x in [-255, 510].
+var lutClamp255 = [255 + 1 + 510]uint8{
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10,
+	0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
+	0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
+	0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, 0x40,
+	0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50,
+	0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, 0x60,
+	0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70,
+	0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x80,
+	0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x90,
+	0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, 0xa0,
+	0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, 0xb0,
+	0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, 0xc0,
+	0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0,
+	0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, 0xe0,
+	0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0,
+	0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, 0xff,
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+}
diff --git a/vp8/reconstruct.go b/vp8/reconstruct.go
index 525e442..309d1c1 100644
--- a/vp8/reconstruct.go
+++ b/vp8/reconstruct.go
@@ -385,7 +385,7 @@
 }
 
 // reconstruct reconstructs one macroblock.
-func (d *Decoder) reconstruct(mbx, mby int) {
+func (d *Decoder) reconstruct(mbx, mby int) (skip bool) {
 	if d.segmentHeader.updateMap {
 		if !d.fp.readBit(d.segmentHeader.prob[0]) {
 			d.segment = int(d.fp.readUint(d.segmentHeader.prob[1], 1))
@@ -393,7 +393,6 @@
 			d.segment = int(d.fp.readUint(d.segmentHeader.prob[2], 1)) + 2
 		}
 	}
-	skip := false
 	if d.useSkipProb {
 		skip = d.fp.readBit(d.skipProb)
 	}
@@ -412,6 +411,8 @@
 	d.parsePredModeC8()
 	// Parse the residuals.
 	if !skip {
+		// TODO(nigeltao): make d.parseResiduals return a bool, and change this to
+		// skip = d.parseResiduals(mbx, mby)
 		d.parseResiduals(mbx, mby)
 	} else {
 		if d.usePredY16 {
@@ -432,4 +433,5 @@
 		copy(d.img.Cb[i:i+8], d.ybr[ybrBY+y][ybrBX:ybrBX+8])
 		copy(d.img.Cr[i:i+8], d.ybr[ybrRY+y][ybrRX:ybrRX+8])
 	}
+	return skip
 }
diff --git a/webp/decode_test.go b/webp/decode_test.go
new file mode 100644
index 0000000..516ff15
--- /dev/null
+++ b/webp/decode_test.go
@@ -0,0 +1,110 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package webp
+
+import (
+	"bytes"
+	"fmt"
+	"image"
+	"image/png"
+	"os"
+	"testing"
+)
+
+// hex is like fmt.Sprintf("% x", x) but also inserts a dot before every group
+// of up to 16 bytes, to delineate VP8 macroblock boundaries.
+func hex(x []byte) string {
+	buf := new(bytes.Buffer)
+	for len(x) > 0 { // consume x in groups of up to 16 bytes
+		n := len(x)
+		if n > 16 {
+			n = 16
+		}
+		fmt.Fprintf(buf, " . % x", x[:n]) // every group, including the first, is preceded by " . "
+		x = x[n:]
+	}
+	return buf.String()
+}
+
+func TestDecodeVP8(t *testing.T) { // checks each decoded plane of video-001.webp against the golden gray PNG, row by row
+	// The original video-001.png image is 150x103.
+	const w, h = 150, 103
+	// w2 and h2 are the half-width and half-height, rounded up.
+	const w2, h2 = int((w + 1) / 2), int((h + 1) / 2)
+
+	f0, err := os.Open("../testdata/video-001.webp.ycbcr.png")
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer f0.Close()
+	img0, err := png.Decode(f0)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// The split-into-YCbCr-planes golden image is a 2*w2 wide and h+h2 high
+	// gray image arranged in IMC4 format:
+	//   YYYY
+	//   YYYY
+	//   BBRR
+	// See http://www.fourcc.org/yuv.php#IMC4
+	if got, want := img0.Bounds(), image.Rect(0, 0, 2*w2, h+h2); got != want {
+		t.Fatalf("bounds0: got %v, want %v", got, want)
+	}
+	m0, ok := img0.(*image.Gray)
+	if !ok {
+		t.Fatal("decoded PNG image is not a Gray")
+	}
+
+	f1, err := os.Open("../testdata/video-001.webp")
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer f1.Close()
+	img1, err := Decode(f1)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if got, want := img1.Bounds(), image.Rect(0, 0, w, h); got != want {
+		t.Fatalf("bounds1: got %v, want %v", got, want)
+	}
+	m1, ok := img1.(*image.YCbCr)
+	if !ok || m1.SubsampleRatio != image.YCbCrSubsampleRatio420 {
+		t.Fatal("decoded WEBP image is not a 4:2:0 YCbCr")
+	}
+
+	planes := []struct {
+		name     string
+		m1Pix    []uint8
+		m1Stride int
+		m0Rect   image.Rectangle
+	}{
+		{"Y", m1.Y, m1.YStride, image.Rect(0, 0, w, h)},
+		{"Cb", m1.Cb, m1.CStride, image.Rect(0*w2, h, 1*w2, h+h2)},
+		{"Cr", m1.Cr, m1.CStride, image.Rect(1*w2, h, 2*w2, h+h2)},
+	}
+	for _, plane := range planes {
+		dx := plane.m0Rect.Dx()
+		nDiff, diff := 0, make([]byte, dx) // nDiff counts differing rows; diff is scratch space for one row of got-want
+		for j, y := 0, plane.m0Rect.Min.Y; y < plane.m0Rect.Max.Y; j, y = j+1, y+1 {
+			got := plane.m1Pix[j*plane.m1Stride:][:dx]
+			want := m0.Pix[y*m0.Stride+plane.m0Rect.Min.X:][:dx]
+			if bytes.Equal(got, want) {
+				continue
+			}
+			nDiff++
+			if nDiff > 10 { // after 10 detailed reports, note that more rows differ and stop checking this plane
+				t.Errorf("%s plane: more rows differ", plane.name)
+				break
+			}
+			for i := range got {
+				diff[i] = got[i] - want[i] // uint8 subtraction, so negative differences wrap modulo 256
+			}
+			t.Errorf("%s plane: m0 row %d, m1 row %d\ngot %s\nwant%s\ndiff%s",
+				plane.name, y, j, hex(got), hex(want), hex(diff))
+		}
+	}
+}