src/compress/bzip2/bzip2.go - go.git - Git at Google

 // Copyright 2011 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

 // Package bzip2 implements bzip2 decompression.
 package bzip2

 import "io"

 // There's no RFC for bzip2. I used the Wikipedia page for reference and a lot
 // of guessing: https://en.wikipedia.org/wiki/Bzip2
 // The source code to pyflate was useful for debugging:
 // http://www.paul.sladen.org/projects/pyflate

 // StructuralError reports that the bzip2 input is syntactically invalid.
 // Its string value is the specific reason the data was rejected.
 type StructuralError string

 // Error implements the error interface. The reason is prefixed with a
 // fixed message identifying bzip2 as the source of the failure.
 func (e StructuralError) Error() string {
 	const prefix = "bzip2 data invalid: "
 	return prefix + string(e)
 }

 // A reader decompresses bzip2 compressed data.
 //
 // Besides the bit-level input it keeps the running checksums and the
 // per-block state that lets Read hand out decompressed bytes a few at a
 // time instead of expanding a whole block up front.
 type reader struct {
 	br           bitReader // bit-level reader over the compressed input.
 	fileCRC      uint32    // rotate-and-XOR accumulation of each block's stored CRC; compared with the CRC in the stream trailer.
 	blockCRC     uint32    // CRC of the bytes output so far from the current block.
 	wantBlockCRC uint32    // CRC stored in the header of the current block.
 	setupDone    bool      // true if we have parsed the bzip2 header.
 	eof          bool      // true once the input ended right after a stream trailer.
 	blockSize    int       // blockSize in bytes: 100 * 1000 * level, e.g. 900 * 1000 for level 9.
 	c            [256]uint // the ``C'' array for the inverse BWT.
 	tt           []uint32  // mirrors the ``tt'' array in the bzip2 source and contains the P array in the upper 24 bits.
 	tPos         uint32    // Index of the next output byte in tt.

 	preRLE      []uint32 // the decoded block (a prefix of tt) that still has to be run-length decoded.
 	preRLEUsed  int      // number of entries of preRLE used.
 	lastByte    int      // the last byte value seen, or -1 if there is none.
 	byteRepeats uint     // the number of repeats of lastByte seen; once it reaches 3 the next entry is a run length.
 	repeats     uint     // the number of copies of lastByte still to output.
 }

 // NewReader wraps r in an io.Reader that yields the decompressed form of
 // the bzip2 data read from r.
 // If r does not also implement [io.ByteReader],
 // the decompressor may read more data than necessary from r.
 func NewReader(r io.Reader) io.Reader {
 	return &reader{br: newBitReader(r)}
 }

 // Magic numbers that mark the parts of a bzip2 stream.
 const (
 	bzip2FileMagic  = 0x425a         // "BZ", the first two bytes of a stream
 	bzip2BlockMagic = 0x314159265359 // 48-bit marker in front of a compressed block
 	bzip2FinalMagic = 0x177245385090 // 48-bit marker in front of the stream trailer
 )

 // setup reads the stream header from the bit reader: the two "BZ" magic
 // bytes when needMagic is set, then the 'h' marker and a level digit in
 // the range '1' to '9'. On success it resets the file CRC, derives the
 // block size from the level, and grows tt if it cannot hold a full block.
 // A header that does not match is reported as a StructuralError.
 func (bz2 *reader) setup(needMagic bool) error {
 	bits := &bz2.br

 	if needMagic && bits.ReadBits(16) != bzip2FileMagic {
 		return StructuralError("bad magic value")
 	}

 	if marker := bits.ReadBits(8); marker != 'h' {
 		return StructuralError("non-Huffman entropy encoding")
 	}

 	digit := bits.ReadBits(8)
 	if digit < '1' || '9' < digit {
 		return StructuralError("invalid compression level")
 	}

 	bz2.fileCRC = 0
 	bz2.blockSize = (digit - '0') * 100 * 1000
 	if len(bz2.tt) < bz2.blockSize {
 		// Only reallocate when the existing buffer is too small.
 		bz2.tt = make([]uint32, bz2.blockSize)
 	}
 	return nil
 }

 // Read implements io.Reader. It decompresses into buf and returns the
 // number of bytes produced. The stream header is parsed on the first call,
 // and once the end of the input has been seen every later call returns
 // io.EOF. An error recorded by the bit reader takes precedence over any
 // other error.
 func (bz2 *reader) Read(buf []byte) (n int, err error) {
 	if bz2.eof {
 		return 0, io.EOF
 	}

 	if !bz2.setupDone {
 		err = bz2.setup(true)
 		if readerErr := bz2.br.Err(); readerErr != nil {
 			err = readerErr
 		}
 		if err != nil {
 			return 0, err
 		}
 		bz2.setupDone = true
 	}

 	n, err = bz2.read(buf)
 	if readerErr := bz2.br.Err(); readerErr != nil {
 		err = readerErr
 	}
 	return n, err
 }

 // readFromBlock copies as many bytes of the current block as fit into buf,
 // undoing the block's initial run-length encoding as it goes, and returns
 // the number of bytes written. It returns 0 when buf is empty or when the
 // block has nothing left to give.
 func (bz2 *reader) readFromBlock(buf []byte) int {
 	// bzip2 works on whole blocks, which lets the fixed-size buffers be
 	// allocated once and reused. The run-length preprocessing step is the
 	// exception: expanding it eagerly would need buffers large enough for
 	// the worst case, so it is undone lazily here, driven by the state
 	// kept in the reader.
 	//
 	// The encoding itself: four equal bytes in a row are followed by a
 	// length byte giving the number of further copies of that byte
 	// (possibly zero).
 	written := 0
 	for written < len(buf) {
 		if bz2.repeats > 0 {
 			// A run is in progress: emit one more copy.
 			buf[written] = byte(bz2.lastByte)
 			written++
 			bz2.repeats--
 			if bz2.repeats == 0 {
 				bz2.lastByte = -1
 			}
 			continue
 		}
 		if bz2.preRLEUsed >= len(bz2.preRLE) {
 			// Nothing pending and the block is used up.
 			break
 		}

 		// Follow the link stored in the upper bits to the next entry; the
 		// low byte is the data.
 		entry := bz2.preRLE[bz2.tPos]
 		sym := byte(entry)
 		bz2.tPos = entry >> 8
 		bz2.preRLEUsed++

 		if bz2.byteRepeats == 3 {
 			// Four equal bytes were just seen, so this one is a count.
 			bz2.repeats = uint(sym)
 			bz2.byteRepeats = 0
 			continue
 		}

 		if int(sym) == bz2.lastByte {
 			bz2.byteRepeats++
 		} else {
 			bz2.byteRepeats = 0
 		}
 		bz2.lastByte = int(sym)

 		buf[written] = sym
 		written++
 	}

 	return written
 }

 // read fills buf with decompressed bytes from the current block, moving on
 // to the next block, or to a concatenated stream, whenever the current
 // block has nothing left. It returns io.EOF once the input ends directly
 // after a stream trailer.
 func (bz2 *reader) read(buf []byte) (int, error) {
 	br := &bz2.br
 	for {
 		if n := bz2.readFromBlock(buf); n > 0 || len(buf) == 0 {
 			bz2.blockCRC = updateCRC(bz2.blockCRC, buf[:n])
 			return n, nil
 		}

 		// The block is exhausted, so its checksum can now be verified.
 		if bz2.wantBlockCRC != bz2.blockCRC {
 			br.err = StructuralError("block checksum mismatch")
 			return 0, br.err
 		}

 		// A 48-bit magic number announces what comes next.
 		magic := br.ReadBits64(48)
 		if magic == bzip2BlockMagic {
 			// Another block: decode it and go round again.
 			if err := bz2.readBlock(); err != nil {
 				return 0, err
 			}
 			continue
 		}
 		if magic != bzip2FinalMagic {
 			return 0, StructuralError("bad magic value found")
 		}

 		// Stream trailer: it carries the checksum of the whole file.
 		wantFileCRC := uint32(br.ReadBits64(32))
 		if br.err != nil {
 			return 0, br.err
 		}
 		if wantFileCRC != bz2.fileCRC {
 			br.err = StructuralError("file checksum mismatch")
 			return 0, br.err
 		}

 		// Drop the bits left before the next byte boundary, then look for
 		// the "BZ" that would begin a concatenated stream.
 		if pad := br.bits % 8; pad != 0 {
 			br.ReadBits(pad)
 		}
 		first, err := br.r.ReadByte()
 		if err == io.EOF {
 			// Clean end of input.
 			br.err = io.EOF
 			bz2.eof = true
 			return 0, io.EOF
 		}
 		if err != nil {
 			br.err = err
 			return 0, err
 		}
 		second, err := br.r.ReadByte()
 		if err != nil {
 			// Input that stops after a single byte is truncated.
 			if err == io.EOF {
 				err = io.ErrUnexpectedEOF
 			}
 			br.err = err
 			return 0, err
 		}
 		if first != 'B' || second != 'Z' {
 			return 0, StructuralError("bad magic value in continuation file")
 		}
 		if err := bz2.setup(false); err != nil {
 			return 0, err
 		}
 	}
 }

 // readBlock reads a bzip2 block. The magic number should already have been consumed.
 //
 // It parses the block header (stored CRC, randomization flag, origPtr), the
 // bitmap of symbols in use, the Huffman tree selectors and the code lengths
 // of every tree. It then decodes the Huffman, move-to-front and run-length
 // coded data into bz2.tt and runs the inverse BWT, leaving the reader ready
 // for readFromBlock. Malformed input is reported as a StructuralError;
 // an error from newHuffmanTree is returned unchanged.
 func (bz2 *reader) readBlock() (err error) {
 	br := &bz2.br
 	bz2.wantBlockCRC = uint32(br.ReadBits64(32)) // CRC stored for this block; read compares it with blockCRC once the block is exhausted.
 	bz2.blockCRC = 0
 	// Fold the stored block CRC into the file CRC: rotate left one bit, then XOR.
 	bz2.fileCRC = (bz2.fileCRC<<1 | bz2.fileCRC>>31) ^ bz2.wantBlockCRC
 	randomized := br.ReadBits(1)
 	if randomized != 0 {
 		return StructuralError("deprecated randomized files")
 	}
 	// origPtr is handed to inverseBWT below to locate the first output byte.
 	origPtr := uint(br.ReadBits(24))

 	// If not every byte value is used in the block (i.e., it's text) then
 	// the symbol set is reduced. The symbols used are stored as a
 	// two-level, 16x16 bitmap.
 	symbolRangeUsedBitmap := br.ReadBits(16)
 	symbolPresent := make([]bool, 256)
 	numSymbols := 0
 	for symRange := uint(0); symRange < 16; symRange++ {
 		if symbolRangeUsedBitmap&(1<<(15-symRange)) != 0 {
 			bits := br.ReadBits(16)
 			for symbol := uint(0); symbol < 16; symbol++ {
 				if bits&(1<<(15-symbol)) != 0 {
 					symbolPresent[16*symRange+symbol] = true
 					numSymbols++
 				}
 			}
 		}
 	}

 	if numSymbols == 0 {
 		// There must be an EOF symbol.
 		return StructuralError("no symbols in input")
 	}

 	// A block uses between two and six different Huffman trees.
 	numHuffmanTrees := br.ReadBits(3)
 	if numHuffmanTrees < 2 || numHuffmanTrees > 6 {
 		return StructuralError("invalid number of Huffman trees")
 	}

 	// The Huffman tree can switch every 50 symbols so there's a list of
 	// tree indexes telling us which tree to use for each 50 symbol block.
 	numSelectors := br.ReadBits(15)
 	treeIndexes := make([]uint8, numSelectors)

 	// The tree indexes are move-to-front transformed and stored as unary
 	// numbers.
 	mtfTreeDecoder := newMTFDecoderWithRange(numHuffmanTrees)
 	for i := range treeIndexes {
 		c := 0
 		for {
 			inc := br.ReadBits(1)
 			if inc == 0 {
 				break
 			}
 			c++
 		}
 		if c >= numHuffmanTrees {
 			return StructuralError("tree index too large")
 		}
 		treeIndexes[i] = mtfTreeDecoder.Decode(c)
 	}

 	// The list of symbols for the move-to-front transform is taken from
 	// the previously decoded symbol bitmap.
 	symbols := make([]byte, numSymbols)
 	nextSymbol := 0
 	for i := 0; i < 256; i++ {
 		if symbolPresent[i] {
 			symbols[nextSymbol] = byte(i)
 			nextSymbol++
 		}
 	}
 	mtf := newMTFDecoder(symbols)

 	numSymbols += 2 // to account for RUNA and RUNB symbols
 	huffmanTrees := make([]huffmanTree, numHuffmanTrees)

 	// Now we decode the arrays of code-lengths for each tree.
 	// The lengths slice is reused for every tree.
 	lengths := make([]uint8, numSymbols)
 	for i := range huffmanTrees {
 		// The code lengths are delta encoded from a 5-bit base value.
 		length := br.ReadBits(5)
 		for j := range lengths {
 			for {
 				if length < 1 || length > 20 {
 					return StructuralError("Huffman length out of range")
 				}
 				// A 0 bit ends the adjustment of this length; a 1 bit is
 				// followed by a bit choosing decrement (1) or increment (0).
 				if !br.ReadBit() {
 					break
 				}
 				if br.ReadBit() {
 					length--
 				} else {
 					length++
 				}
 			}
 			lengths[j] = uint8(length)
 		}
 		huffmanTrees[i], err = newHuffmanTree(lengths)
 		if err != nil {
 			return err
 		}
 	}

 	selectorIndex := 1 // the next tree index to use
 	if len(treeIndexes) == 0 {
 		return StructuralError("no tree selectors given")
 	}
 	if int(treeIndexes[0]) >= len(huffmanTrees) {
 		return StructuralError("tree selector out of range")
 	}
 	currentHuffmanTree := huffmanTrees[treeIndexes[0]]
 	bufIndex := 0 // indexes bz2.tt, which receives the decoded block.
 	// The output of the move-to-front transform is run-length encoded and
 	// we merge the decoding into the Huffman parsing loop. These two
 	// variables accumulate the repeat count. See the Wikipedia page for
 	// details.
 	repeat := 0
 	repeatPower := 0

 	// The `C' array (used by the inverse BWT) needs to be zero initialized.
 	clear(bz2.c[:])

 	decoded := 0 // counts the number of symbols decoded by the current tree.
 	for {
 		if decoded == 50 {
 			// Time to switch to the tree named by the next selector.
 			if selectorIndex >= numSelectors {
 				return StructuralError("insufficient selector indices for number of symbols")
 			}
 			if int(treeIndexes[selectorIndex]) >= len(huffmanTrees) {
 				return StructuralError("tree selector out of range")
 			}
 			currentHuffmanTree = huffmanTrees[treeIndexes[selectorIndex]]
 			selectorIndex++
 			decoded = 0
 		}

 		v := currentHuffmanTree.Decode(br)
 		decoded++

 		if v < 2 {
 			// This is either the RUNA or RUNB symbol.
 			if repeat == 0 {
 				repeatPower = 1
 			}
 			repeat += repeatPower << v
 			repeatPower <<= 1

 			// This limit of 2 million comes from the bzip2 source
 			// code. It prevents repeat from overflowing.
 			if repeat > 2*1024*1024 {
 				return StructuralError("repeat count too large")
 			}
 			continue
 		}

 		if repeat > 0 {
 			// We have decoded a complete run-length so we need to
 			// replicate the last output symbol.
 			if repeat > bz2.blockSize-bufIndex {
 				return StructuralError("repeats past end of block")
 			}
 			for i := 0; i < repeat; i++ {
 				b := mtf.First()
 				bz2.tt[bufIndex] = uint32(b)
 				bz2.c[b]++
 				bufIndex++
 			}
 			repeat = 0
 		}

 		if int(v) == numSymbols-1 {
 			// This is the EOF symbol. Because it's always at the
 			// end of the move-to-front list, and never gets moved
 			// to the front, it has this unique value.
 			break
 		}

 		// Since two metasymbols (RUNA and RUNB) have values 0 and 1,
 		// one would expect |v-2| to be passed to the MTF decoder.
 		// However, the front of the MTF list is never referenced as 0,
 		// it's always referenced with a run-length of 1. Thus 0
 		// doesn't need to be encoded and we have |v-1| in the next
 		// line.
 		b := mtf.Decode(int(v - 1))
 		if bufIndex >= bz2.blockSize {
 			return StructuralError("data exceeds block size")
 		}
 		bz2.tt[bufIndex] = uint32(b)
 		bz2.c[b]++
 		bufIndex++
 	}

 	// inverseBWT reads tt[origPtr], so origPtr has to lie inside the
 	// decoded data.
 	if origPtr >= uint(bufIndex) {
 		return StructuralError("origPtr out of bounds")
 	}

 	// We have completed the entropy decoding. Now we can perform the
 	// inverse BWT and setup the RLE buffer.
 	bz2.preRLE = bz2.tt[:bufIndex]
 	bz2.preRLEUsed = 0
 	bz2.tPos = inverseBWT(bz2.preRLE, origPtr, bz2.c[:])
 	bz2.lastByte = -1
 	bz2.byteRepeats = 0
 	bz2.repeats = 0

 	return nil
 }

 // inverseBWT performs the inverse Burrows-Wheeler transform described in
 // http://www.hpl.hp.com/techreports/Compaq-DEC/SRC-RR-124.pdf, section 4.2.
 // origPtr corresponds to “I” in that paper, and c to the “C” array as it
 // stands after the first pass over the data; the caller supplies c because
 // that pass is folded into the Huffman decoding.
 //
 // The function follows the “single array” scheme of the bzip2 source: on
 // return the low 8 bits of each tt entry still hold the (shuffled) data
 // byte and the upper 24 bits hold the index of the entry that follows it in
 // output order. The result is the index of the first byte. c is overwritten
 // along the way.
 func inverseBWT(tt []uint32, origPtr uint, c []uint) uint32 {
 	// Turn the per-value counts into starting offsets: afterwards c[v] is
 	// the number of entries whose byte value is smaller than v.
 	var offset uint
 	for v := 0; v < 256; v++ {
 		count := c[v]
 		c[v] = offset
 		offset += count
 	}

 	// Store each entry's index in the upper bits of the slot its byte value
 	// maps to, using up one slot per occurrence.
 	for i, entry := range tt {
 		v := byte(entry)
 		slot := c[v]
 		tt[slot] |= uint32(i) << 8
 		c[v] = slot + 1
 	}

 	return tt[origPtr] >> 8
 }

 // The checksum is a CRC-32 like the one in hash/crc32, except that every
 // shift runs in the opposite direction, so the bits of the input are
 // processed in the reverse of the usual order.

 // crctab is the byte-indexed lookup table for the checksum; init fills it.
 var crctab [256]uint32

 // init computes crctab from the generator polynomial, one bit at a time.
 func init() {
 	const (
 		poly   = 0x04C11DB7
 		topBit = 0x80000000
 	)
 	for i := range crctab {
 		rem := uint32(i) << 24
 		for bit := 8; bit > 0; bit-- {
 			carry := rem&topBit != 0
 			rem <<= 1
 			if carry {
 				rem ^= poly
 			}
 		}
 		crctab[i] = rem
 	}
 }

 // updateCRC folds the bytes of b into the running checksum val and returns
 // the new checksum. A fresh checksum starts from val == 0.
 func updateCRC(val uint32, b []byte) uint32 {
 	// The table loop works on the bitwise complement of the value seen by
 	// callers.
 	state := ^val
 	for i := 0; i < len(b); i++ {
 		idx := byte(state>>24) ^ b[i]
 		state = state<<8 ^ crctab[idx]
 	}
 	return ^state
 }
	// Copyright 2011 The Go Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style
	// license that can be found in the LICENSE file.

	// Package bzip2 implements bzip2 decompression.
	package bzip2

	import "io"

	// There's no RFC for bzip2. I used the Wikipedia page for reference and a lot
	// of guessing: https://en.wikipedia.org/wiki/Bzip2
	// The source code to pyflate was useful for debugging:
	// http://www.paul.sladen.org/projects/pyflate

	// StructuralError reports that the bzip2 input is syntactically invalid.
	// Its string value is the specific reason the data was rejected.
	type StructuralError string

	// Error implements the error interface. The reason is prefixed with a
	// fixed message identifying bzip2 as the source of the failure.
	func (e StructuralError) Error() string {
		const prefix = "bzip2 data invalid: "
		return prefix + string(e)
	}

	// A reader decompresses bzip2 compressed data.
	//
	// Besides the bit-level input it keeps the running checksums and the
	// per-block state that lets Read hand out decompressed bytes a few at a
	// time instead of expanding a whole block up front.
	type reader struct {
	br bitReader // bit-level reader over the compressed input.
	fileCRC uint32 // rotate-and-XOR accumulation of each block's stored CRC; compared with the CRC in the stream trailer.
	blockCRC uint32 // CRC of the bytes output so far from the current block.
	wantBlockCRC uint32 // CRC stored in the header of the current block.
	setupDone bool // true if we have parsed the bzip2 header.
	eof bool // true once the input ended right after a stream trailer.
	blockSize int // blockSize in bytes: 100 * 1000 * level, e.g. 900 * 1000 for level 9.
	c [256]uint // the ``C'' array for the inverse BWT.
	tt []uint32 // mirrors the ``tt'' array in the bzip2 source and contains the P array in the upper 24 bits.
	tPos uint32 // Index of the next output byte in tt.

	preRLE []uint32 // the decoded block (a prefix of tt) that still has to be run-length decoded.
	preRLEUsed int // number of entries of preRLE used.
	lastByte int // the last byte value seen, or -1 if there is none.
	byteRepeats uint // the number of repeats of lastByte seen; once it reaches 3 the next entry is a run length.
	repeats uint // the number of copies of lastByte still to output.
	}

	// NewReader wraps r in an io.Reader that yields the decompressed form of
	// the bzip2 data read from r.
	// If r does not also implement [io.ByteReader],
	// the decompressor may read more data than necessary from r.
	func NewReader(r io.Reader) io.Reader {
		return &reader{br: newBitReader(r)}
	}

	// Magic numbers that mark the parts of a bzip2 stream.
	const (
		bzip2FileMagic  = 0x425a         // "BZ", the first two bytes of a stream
		bzip2BlockMagic = 0x314159265359 // 48-bit marker in front of a compressed block
		bzip2FinalMagic = 0x177245385090 // 48-bit marker in front of the stream trailer
	)

	// setup parses the bzip2 header.
	func (bz2 *reader) setup(needMagic bool) error {
	br := &bz2.br

	if needMagic {
	magic := br.ReadBits(16)
	if magic != bzip2FileMagic {
	return StructuralError("bad magic value")
	}
	}

	t := br.ReadBits(8)
	if t != 'h' {
	return StructuralError("non-Huffman entropy encoding")
	}

	level := br.ReadBits(8)
	if level < '1' \|\| level > '9' {
	return StructuralError("invalid compression level")
	}

	bz2.fileCRC = 0
	bz2.blockSize = 100 * 1000 * (level - '0')
	if bz2.blockSize > len(bz2.tt) {
	bz2.tt = make([]uint32, bz2.blockSize)
	}
	return nil
	}

	// Read implements io.Reader. It decompresses into buf and returns the
	// number of bytes produced. The stream header is parsed on the first call,
	// and once the end of the input has been seen every later call returns
	// io.EOF. An error recorded by the bit reader takes precedence over any
	// other error.
	func (bz2 *reader) Read(buf []byte) (n int, err error) {
		if bz2.eof {
			return 0, io.EOF
		}

		if !bz2.setupDone {
			err = bz2.setup(true)
			if readerErr := bz2.br.Err(); readerErr != nil {
				err = readerErr
			}
			if err != nil {
				return 0, err
			}
			bz2.setupDone = true
		}

		n, err = bz2.read(buf)
		if readerErr := bz2.br.Err(); readerErr != nil {
			err = readerErr
		}
		return n, err
	}

	func (bz2 *reader) readFromBlock(buf []byte) int {
	// bzip2 is a block based compressor, except that it has a run-length
	// preprocessing step. The block based nature means that we can
	// preallocate fixed-size buffers and reuse them. However, the RLE
	// preprocessing would require allocating huge buffers to store the
	// maximum expansion. Thus we process blocks all at once, except for
	// the RLE which we decompress as required.
	n := 0
	for (bz2.repeats > 0 \|\| bz2.preRLEUsed < len(bz2.preRLE)) && n < len(buf) {
	// We have RLE data pending.

	// The run-length encoding works like this:
	// Any sequence of four equal bytes is followed by a length
	// byte which contains the number of repeats of that byte to
	// include. (The number of repeats can be zero.) Because we are
	// decompressing on-demand our state is kept in the reader
	// object.

	if bz2.repeats > 0 {
	buf[n] = byte(bz2.lastByte)
	n++
	bz2.repeats--
	if bz2.repeats == 0 {
	bz2.lastByte = -1
	}
	continue
	}

	bz2.tPos = bz2.preRLE[bz2.tPos]
	b := byte(bz2.tPos)
	bz2.tPos >>= 8
	bz2.preRLEUsed++

	if bz2.byteRepeats == 3 {
	bz2.repeats = uint(b)
	bz2.byteRepeats = 0
	continue
	}

	if bz2.lastByte == int(b) {
	bz2.byteRepeats++
	} else {
	bz2.byteRepeats = 0
	}
	bz2.lastByte = int(b)

	buf[n] = b
	n++
	}

	return n
	}

	func (bz2 *reader) read(buf []byte) (int, error) {
	for {
	n := bz2.readFromBlock(buf)
	if n > 0 \|\| len(buf) == 0 {
	bz2.blockCRC = updateCRC(bz2.blockCRC, buf[:n])
	return n, nil
	}

	// End of block. Check CRC.
	if bz2.blockCRC != bz2.wantBlockCRC {
	bz2.br.err = StructuralError("block checksum mismatch")
	return 0, bz2.br.err
	}

	// Find next block.
	br := &bz2.br
	switch br.ReadBits64(48) {
	default:
	return 0, StructuralError("bad magic value found")

	case bzip2BlockMagic:
	// Start of block.
	err := bz2.readBlock()
	if err != nil {
	return 0, err
	}

	case bzip2FinalMagic:
	// Check end-of-file CRC.
	wantFileCRC := uint32(br.ReadBits64(32))
	if br.err != nil {
	return 0, br.err
	}
	if bz2.fileCRC != wantFileCRC {
	br.err = StructuralError("file checksum mismatch")
	return 0, br.err
	}

	// Skip ahead to byte boundary.
	// Is there a file concatenated to this one?
	// It would start with BZ.
	if br.bits%8 != 0 {
	br.ReadBits(br.bits % 8)
	}
	b, err := br.r.ReadByte()
	if err == io.EOF {
	br.err = io.EOF
	bz2.eof = true
	return 0, io.EOF
	}
	if err != nil {
	br.err = err
	return 0, err
	}
	z, err := br.r.ReadByte()
	if err != nil {
	if err == io.EOF {
	err = io.ErrUnexpectedEOF
	}
	br.err = err
	return 0, err
	}
	if b != 'B' \|\| z != 'Z' {
	return 0, StructuralError("bad magic value in continuation file")
	}
	if err := bz2.setup(false); err != nil {
	return 0, err
	}
	}
	}
	}

	// readBlock reads a bzip2 block. The magic number should already have been consumed.
	func (bz2 *reader) readBlock() (err error) {
	br := &bz2.br
	bz2.wantBlockCRC = uint32(br.ReadBits64(32)) // skip checksum. TODO: check it if we can figure out what it is.
	bz2.blockCRC = 0
	bz2.fileCRC = (bz2.fileCRC<<1 \| bz2.fileCRC>>31) ^ bz2.wantBlockCRC
	randomized := br.ReadBits(1)
	if randomized != 0 {
	return StructuralError("deprecated randomized files")
	}
	origPtr := uint(br.ReadBits(24))

	// If not every byte value is used in the block (i.e., it's text) then
	// the symbol set is reduced. The symbols used are stored as a
	// two-level, 16x16 bitmap.
	symbolRangeUsedBitmap := br.ReadBits(16)
	symbolPresent := make([]bool, 256)
	numSymbols := 0
	for symRange := uint(0); symRange < 16; symRange++ {
	if symbolRangeUsedBitmap&(1<<(15-symRange)) != 0 {
	bits := br.ReadBits(16)
	for symbol := uint(0); symbol < 16; symbol++ {
	if bits&(1<<(15-symbol)) != 0 {
	symbolPresent[16*symRange+symbol] = true
	numSymbols++
	}
	}
	}
	}

	if numSymbols == 0 {
	// There must be an EOF symbol.
	return StructuralError("no symbols in input")
	}

	// A block uses between two and six different Huffman trees.
	numHuffmanTrees := br.ReadBits(3)
	if numHuffmanTrees < 2 \|\| numHuffmanTrees > 6 {
	return StructuralError("invalid number of Huffman trees")
	}

	// The Huffman tree can switch every 50 symbols so there's a list of
	// tree indexes telling us which tree to use for each 50 symbol block.
	numSelectors := br.ReadBits(15)
	treeIndexes := make([]uint8, numSelectors)

	// The tree indexes are move-to-front transformed and stored as unary
	// numbers.
	mtfTreeDecoder := newMTFDecoderWithRange(numHuffmanTrees)
	for i := range treeIndexes {
	c := 0
	for {
	inc := br.ReadBits(1)
	if inc == 0 {
	break
	}
	c++
	}
	if c >= numHuffmanTrees {
	return StructuralError("tree index too large")
	}
	treeIndexes[i] = mtfTreeDecoder.Decode(c)
	}

	// The list of symbols for the move-to-front transform is taken from
	// the previously decoded symbol bitmap.
	symbols := make([]byte, numSymbols)
	nextSymbol := 0
	for i := 0; i < 256; i++ {
	if symbolPresent[i] {
	symbols[nextSymbol] = byte(i)
	nextSymbol++
	}
	}
	mtf := newMTFDecoder(symbols)

	numSymbols += 2 // to account for RUNA and RUNB symbols
	huffmanTrees := make([]huffmanTree, numHuffmanTrees)

	// Now we decode the arrays of code-lengths for each tree.
	lengths := make([]uint8, numSymbols)
	for i := range huffmanTrees {
	// The code lengths are delta encoded from a 5-bit base value.
	length := br.ReadBits(5)
	for j := range lengths {
	for {
	if length < 1 \|\| length > 20 {
	return StructuralError("Huffman length out of range")
	}
	if !br.ReadBit() {
	break
	}
	if br.ReadBit() {
	length--
	} else {
	length++
	}
	}
	lengths[j] = uint8(length)
	}
	huffmanTrees[i], err = newHuffmanTree(lengths)
	if err != nil {
	return err
	}
	}

	selectorIndex := 1 // the next tree index to use
	if len(treeIndexes) == 0 {
	return StructuralError("no tree selectors given")
	}
	if int(treeIndexes[0]) >= len(huffmanTrees) {
	return StructuralError("tree selector out of range")
	}
	currentHuffmanTree := huffmanTrees[treeIndexes[0]]
	bufIndex := 0 // indexes bz2.buf, the output buffer.
	// The output of the move-to-front transform is run-length encoded and
	// we merge the decoding into the Huffman parsing loop. These two
	// variables accumulate the repeat count. See the Wikipedia page for
	// details.
	repeat := 0
	repeatPower := 0

	// The `C' array (used by the inverse BWT) needs to be zero initialized.
	clear(bz2.c[:])

	decoded := 0 // counts the number of symbols decoded by the current tree.
	for {
	if decoded == 50 {
	if selectorIndex >= numSelectors {
	return StructuralError("insufficient selector indices for number of symbols")
	}
	if int(treeIndexes[selectorIndex]) >= len(huffmanTrees) {
	return StructuralError("tree selector out of range")
	}
	currentHuffmanTree = huffmanTrees[treeIndexes[selectorIndex]]
	selectorIndex++
	decoded = 0
	}

	v := currentHuffmanTree.Decode(br)
	decoded++

	if v < 2 {
	// This is either the RUNA or RUNB symbol.
	if repeat == 0 {
	repeatPower = 1
	}
	repeat += repeatPower << v
	repeatPower <<= 1

	// This limit of 2 million comes from the bzip2 source
	// code. It prevents repeat from overflowing.
	if repeat > 210241024 {
	return StructuralError("repeat count too large")
	}
	continue
	}

	if repeat > 0 {
	// We have decoded a complete run-length so we need to
	// replicate the last output symbol.
	if repeat > bz2.blockSize-bufIndex {
	return StructuralError("repeats past end of block")
	}
	for i := 0; i < repeat; i++ {
	b := mtf.First()
	bz2.tt[bufIndex] = uint32(b)
	bz2.c[b]++
	bufIndex++
	}
	repeat = 0
	}

	if int(v) == numSymbols-1 {
	// This is the EOF symbol. Because it's always at the
	// end of the move-to-front list, and never gets moved
	// to the front, it has this unique value.
	break
	}

	// Since two metasymbols (RUNA and RUNB) have values 0 and 1,
	// one would expect \|v-2\| to be passed to the MTF decoder.
	// However, the front of the MTF list is never referenced as 0,
	// it's always referenced with a run-length of 1. Thus 0
	// doesn't need to be encoded and we have \|v-1\| in the next
	// line.
	b := mtf.Decode(int(v - 1))
	if bufIndex >= bz2.blockSize {
	return StructuralError("data exceeds block size")
	}
	bz2.tt[bufIndex] = uint32(b)
	bz2.c[b]++
	bufIndex++
	}

	if origPtr >= uint(bufIndex) {
	return StructuralError("origPtr out of bounds")
	}

	// We have completed the entropy decoding. Now we can perform the
	// inverse BWT and setup the RLE buffer.
	bz2.preRLE = bz2.tt[:bufIndex]
	bz2.preRLEUsed = 0
	bz2.tPos = inverseBWT(bz2.preRLE, origPtr, bz2.c[:])
	bz2.lastByte = -1
	bz2.byteRepeats = 0
	bz2.repeats = 0

	return nil
	}

	// inverseBWT implements the inverse Burrows-Wheeler transform as described in
	// http://www.hpl.hp.com/techreports/Compaq-DEC/SRC-RR-124.pdf, section 4.2.
	// In that document, origPtr is called “I” and c is the “C” array after the
	// first pass over the data. It's an argument here because we merge the first
	// pass with the Huffman decoding.
	//
	// This also implements the “single array” method from the bzip2 source code
	// which leaves the output, still shuffled, in the bottom 8 bits of tt with the
	// index of the next byte in the top 24-bits. The index of the first byte is
	// returned. c is overwritten along the way.
	func inverseBWT(tt []uint32, origPtr uint, c []uint) uint32 {
		// Turn the per-value counts into starting offsets: afterwards c[i] is
		// the number of entries whose byte value is smaller than i.
		sum := uint(0)
		for i := 0; i < 256; i++ {
			sum += c[i]
			c[i] = sum - c[i]
		}

		// Store each entry's index in the upper bits of the slot its byte
		// value maps to, using up one slot per occurrence.
		for i := range tt {
			b := tt[i] & 0xff
			tt[c[b]] |= uint32(i) << 8
			c[b]++
		}

		return tt[origPtr] >> 8
	}

	// The checksum is a CRC-32 like the one in hash/crc32, except that every
	// shift runs in the opposite direction, so the bits of the input are
	// processed in the reverse of the usual order.

	// crctab is the byte-indexed lookup table for the checksum; init fills it.
	var crctab [256]uint32

	// init computes crctab from the generator polynomial, one bit at a time.
	func init() {
		const (
			poly   = 0x04C11DB7
			topBit = 0x80000000
		)
		for i := range crctab {
			rem := uint32(i) << 24
			for bit := 8; bit > 0; bit-- {
				carry := rem&topBit != 0
				rem <<= 1
				if carry {
					rem ^= poly
				}
			}
			crctab[i] = rem
		}
	}

	// updateCRC folds the bytes of b into the running checksum val and returns
	// the new checksum. A fresh checksum starts from val == 0.
	func updateCRC(val uint32, b []byte) uint32 {
		// The table loop works on the bitwise complement of the value seen by
		// callers.
		state := ^val
		for i := 0; i < len(b); i++ {
			idx := byte(state>>24) ^ b[i]
			state = state<<8 ^ crctab[idx]
		}
		return ^state
	}