| // Copyright 2011 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| // Package bzip2 implements bzip2 decompression. |
| package bzip2 |
| |
| import "io" |
| |
| // There's no RFC for bzip2. I used the Wikipedia page for reference and a lot |
| // of guessing: http://en.wikipedia.org/wiki/Bzip2 |
| // The source code to pyflate was useful for debugging: |
| // http://www.paul.sladen.org/projects/pyflate |
| |
| // A StructuralError is returned when the bzip2 data is found to be |
| // syntactically invalid. |
| type StructuralError string |
| |
| func (s StructuralError) Error() string { |
| return "bzip2 data invalid: " + string(s) |
| } |
| |
| // A reader decompresses bzip2 compressed data. |
| type reader struct { |
| br bitReader |
| fileCRC uint32 |
| blockCRC uint32 |
| wantBlockCRC uint32 |
| setupDone bool // true if we have parsed the bzip2 header. |
| blockSize int // blockSize in bytes, i.e. 900 * 1000. |
| eof bool |
| c [256]uint // the `C' array for the inverse BWT. |
| tt []uint32 // mirrors the `tt' array in the bzip2 source and contains the P array in the upper 24 bits. |
| tPos uint32 // Index of the next output byte in tt. |
| |
| preRLE []uint32 // contains the RLE data still to be processed. |
| preRLEUsed int // number of entries of preRLE used. |
| lastByte int // the last byte value seen. |
| byteRepeats uint // the number of repeats of lastByte seen. |
| repeats uint // the number of copies of lastByte to output. |
| } |
| |
| // NewReader returns an io.Reader which decompresses bzip2 data from r. |
| // If r does not also implement io.ByteReader, |
| // the decompressor may read more data than necessary from r. |
| func NewReader(r io.Reader) io.Reader { |
| bz2 := new(reader) |
| bz2.br = newBitReader(r) |
| return bz2 |
| } |
| |
| const bzip2FileMagic = 0x425a // "BZ" |
| const bzip2BlockMagic = 0x314159265359 |
| const bzip2FinalMagic = 0x177245385090 |
| |
| // setup parses the bzip2 header. |
| func (bz2 *reader) setup(needMagic bool) error { |
| br := &bz2.br |
| |
| if needMagic { |
| magic := br.ReadBits(16) |
| if magic != bzip2FileMagic { |
| return StructuralError("bad magic value") |
| } |
| } |
| |
| t := br.ReadBits(8) |
| if t != 'h' { |
| return StructuralError("non-Huffman entropy encoding") |
| } |
| |
| level := br.ReadBits(8) |
| if level < '1' || level > '9' { |
| return StructuralError("invalid compression level") |
| } |
| |
| bz2.fileCRC = 0 |
| bz2.blockSize = 100 * 1000 * (level - '0') |
| if bz2.blockSize > len(bz2.tt) { |
| bz2.tt = make([]uint32, bz2.blockSize) |
| } |
| return nil |
| } |
| |
| func (bz2 *reader) Read(buf []byte) (n int, err error) { |
| if bz2.eof { |
| return 0, io.EOF |
| } |
| |
| if !bz2.setupDone { |
| err = bz2.setup(true) |
| brErr := bz2.br.Err() |
| if brErr != nil { |
| err = brErr |
| } |
| if err != nil { |
| return 0, err |
| } |
| bz2.setupDone = true |
| } |
| |
| n, err = bz2.read(buf) |
| brErr := bz2.br.Err() |
| if brErr != nil { |
| err = brErr |
| } |
| return |
| } |
| |
| func (bz2 *reader) readFromBlock(buf []byte) int { |
| // bzip2 is a block based compressor, except that it has a run-length |
| // preprocessing step. The block based nature means that we can |
| // preallocate fixed-size buffers and reuse them. However, the RLE |
| // preprocessing would require allocating huge buffers to store the |
| // maximum expansion. Thus we process blocks all at once, except for |
| // the RLE which we decompress as required. |
| n := 0 |
| for (bz2.repeats > 0 || bz2.preRLEUsed < len(bz2.preRLE)) && n < len(buf) { |
| // We have RLE data pending. |
| |
| // The run-length encoding works like this: |
| // Any sequence of four equal bytes is followed by a length |
| // byte which contains the number of repeats of that byte to |
| // include. (The number of repeats can be zero.) Because we are |
| // decompressing on-demand our state is kept in the reader |
| // object. |
| |
| if bz2.repeats > 0 { |
| buf[n] = byte(bz2.lastByte) |
| n++ |
| bz2.repeats-- |
| if bz2.repeats == 0 { |
| bz2.lastByte = -1 |
| } |
| continue |
| } |
| |
| bz2.tPos = bz2.preRLE[bz2.tPos] |
| b := byte(bz2.tPos) |
| bz2.tPos >>= 8 |
| bz2.preRLEUsed++ |
| |
| if bz2.byteRepeats == 3 { |
| bz2.repeats = uint(b) |
| bz2.byteRepeats = 0 |
| continue |
| } |
| |
| if bz2.lastByte == int(b) { |
| bz2.byteRepeats++ |
| } else { |
| bz2.byteRepeats = 0 |
| } |
| bz2.lastByte = int(b) |
| |
| buf[n] = b |
| n++ |
| } |
| |
| return n |
| } |
| |
| func (bz2 *reader) read(buf []byte) (int, error) { |
| for { |
| n := bz2.readFromBlock(buf) |
| if n > 0 { |
| bz2.blockCRC = updateCRC(bz2.blockCRC, buf[:n]) |
| return n, nil |
| } |
| |
| // End of block. Check CRC. |
| if bz2.blockCRC != bz2.wantBlockCRC { |
| bz2.br.err = StructuralError("block checksum mismatch") |
| return 0, bz2.br.err |
| } |
| |
| // Find next block. |
| br := &bz2.br |
| switch br.ReadBits64(48) { |
| default: |
| return 0, StructuralError("bad magic value found") |
| |
| case bzip2BlockMagic: |
| // Start of block. |
| err := bz2.readBlock() |
| if err != nil { |
| return 0, err |
| } |
| |
| case bzip2FinalMagic: |
| // Check end-of-file CRC. |
| wantFileCRC := uint32(br.ReadBits64(32)) |
| if br.err != nil { |
| return 0, br.err |
| } |
| if bz2.fileCRC != wantFileCRC { |
| br.err = StructuralError("file checksum mismatch") |
| return 0, br.err |
| } |
| |
| // Skip ahead to byte boundary. |
| // Is there a file concatenated to this one? |
| // It would start with BZ. |
| if br.bits%8 != 0 { |
| br.ReadBits(br.bits % 8) |
| } |
| b, err := br.r.ReadByte() |
| if err == io.EOF { |
| br.err = io.EOF |
| bz2.eof = true |
| return 0, io.EOF |
| } |
| if err != nil { |
| br.err = err |
| return 0, err |
| } |
| z, err := br.r.ReadByte() |
| if err != nil { |
| if err == io.EOF { |
| err = io.ErrUnexpectedEOF |
| } |
| br.err = err |
| return 0, err |
| } |
| if b != 'B' || z != 'Z' { |
| return 0, StructuralError("bad magic value in continuation file") |
| } |
| if err := bz2.setup(false); err != nil { |
| return 0, err |
| } |
| } |
| } |
| } |
| |
| // readBlock reads a bzip2 block. The magic number should already have been consumed. |
| func (bz2 *reader) readBlock() (err error) { |
| br := &bz2.br |
| bz2.wantBlockCRC = uint32(br.ReadBits64(32)) // skip checksum. TODO: check it if we can figure out what it is. |
| bz2.blockCRC = 0 |
| bz2.fileCRC = (bz2.fileCRC<<1 | bz2.fileCRC>>31) ^ bz2.wantBlockCRC |
| randomized := br.ReadBits(1) |
| if randomized != 0 { |
| return StructuralError("deprecated randomized files") |
| } |
| origPtr := uint(br.ReadBits(24)) |
| |
| // If not every byte value is used in the block (i.e., it's text) then |
| // the symbol set is reduced. The symbols used are stored as a |
| // two-level, 16x16 bitmap. |
| symbolRangeUsedBitmap := br.ReadBits(16) |
| symbolPresent := make([]bool, 256) |
| numSymbols := 0 |
| for symRange := uint(0); symRange < 16; symRange++ { |
| if symbolRangeUsedBitmap&(1<<(15-symRange)) != 0 { |
| bits := br.ReadBits(16) |
| for symbol := uint(0); symbol < 16; symbol++ { |
| if bits&(1<<(15-symbol)) != 0 { |
| symbolPresent[16*symRange+symbol] = true |
| numSymbols++ |
| } |
| } |
| } |
| } |
| |
| if numSymbols == 0 { |
| // There must be an EOF symbol. |
| return StructuralError("no symbols in input") |
| } |
| |
| // A block uses between two and six different Huffman trees. |
| numHuffmanTrees := br.ReadBits(3) |
| if numHuffmanTrees < 2 || numHuffmanTrees > 6 { |
| return StructuralError("invalid number of Huffman trees") |
| } |
| |
| // The Huffman tree can switch every 50 symbols so there's a list of |
| // tree indexes telling us which tree to use for each 50 symbol block. |
| numSelectors := br.ReadBits(15) |
| treeIndexes := make([]uint8, numSelectors) |
| |
| // The tree indexes are move-to-front transformed and stored as unary |
| // numbers. |
| mtfTreeDecoder := newMTFDecoderWithRange(numHuffmanTrees) |
| for i := range treeIndexes { |
| c := 0 |
| for { |
| inc := br.ReadBits(1) |
| if inc == 0 { |
| break |
| } |
| c++ |
| } |
| if c >= numHuffmanTrees { |
| return StructuralError("tree index too large") |
| } |
| treeIndexes[i] = mtfTreeDecoder.Decode(c) |
| } |
| |
| // The list of symbols for the move-to-front transform is taken from |
| // the previously decoded symbol bitmap. |
| symbols := make([]byte, numSymbols) |
| nextSymbol := 0 |
| for i := 0; i < 256; i++ { |
| if symbolPresent[i] { |
| symbols[nextSymbol] = byte(i) |
| nextSymbol++ |
| } |
| } |
| mtf := newMTFDecoder(symbols) |
| |
| numSymbols += 2 // to account for RUNA and RUNB symbols |
| huffmanTrees := make([]huffmanTree, numHuffmanTrees) |
| |
| // Now we decode the arrays of code-lengths for each tree. |
| lengths := make([]uint8, numSymbols) |
| for i := range huffmanTrees { |
| // The code lengths are delta encoded from a 5-bit base value. |
| length := br.ReadBits(5) |
| for j := range lengths { |
| for { |
| if length < 1 || length > 20 { |
| return StructuralError("Huffman length out of range") |
| } |
| if !br.ReadBit() { |
| break |
| } |
| if br.ReadBit() { |
| length-- |
| } else { |
| length++ |
| } |
| } |
| lengths[j] = uint8(length) |
| } |
| huffmanTrees[i], err = newHuffmanTree(lengths) |
| if err != nil { |
| return err |
| } |
| } |
| |
| selectorIndex := 1 // the next tree index to use |
| if len(treeIndexes) == 0 { |
| return StructuralError("no tree selectors given") |
| } |
| if int(treeIndexes[0]) >= len(huffmanTrees) { |
| return StructuralError("tree selector out of range") |
| } |
| currentHuffmanTree := huffmanTrees[treeIndexes[0]] |
| bufIndex := 0 // indexes bz2.buf, the output buffer. |
| // The output of the move-to-front transform is run-length encoded and |
| // we merge the decoding into the Huffman parsing loop. These two |
| // variables accumulate the repeat count. See the Wikipedia page for |
| // details. |
| repeat := 0 |
| repeatPower := 0 |
| |
| // The `C' array (used by the inverse BWT) needs to be zero initialized. |
| for i := range bz2.c { |
| bz2.c[i] = 0 |
| } |
| |
| decoded := 0 // counts the number of symbols decoded by the current tree. |
| for { |
| if decoded == 50 { |
| if selectorIndex >= numSelectors { |
| return StructuralError("insufficient selector indices for number of symbols") |
| } |
| if int(treeIndexes[selectorIndex]) >= len(huffmanTrees) { |
| return StructuralError("tree selector out of range") |
| } |
| currentHuffmanTree = huffmanTrees[treeIndexes[selectorIndex]] |
| selectorIndex++ |
| decoded = 0 |
| } |
| |
| v := currentHuffmanTree.Decode(br) |
| decoded++ |
| |
| if v < 2 { |
| // This is either the RUNA or RUNB symbol. |
| if repeat == 0 { |
| repeatPower = 1 |
| } |
| repeat += repeatPower << v |
| repeatPower <<= 1 |
| |
| // This limit of 2 million comes from the bzip2 source |
| // code. It prevents repeat from overflowing. |
| if repeat > 2*1024*1024 { |
| return StructuralError("repeat count too large") |
| } |
| continue |
| } |
| |
| if repeat > 0 { |
| // We have decoded a complete run-length so we need to |
| // replicate the last output symbol. |
| if repeat > bz2.blockSize-bufIndex { |
| return StructuralError("repeats past end of block") |
| } |
| for i := 0; i < repeat; i++ { |
| b := mtf.First() |
| bz2.tt[bufIndex] = uint32(b) |
| bz2.c[b]++ |
| bufIndex++ |
| } |
| repeat = 0 |
| } |
| |
| if int(v) == numSymbols-1 { |
| // This is the EOF symbol. Because it's always at the |
| // end of the move-to-front list, and never gets moved |
| // to the front, it has this unique value. |
| break |
| } |
| |
| // Since two metasymbols (RUNA and RUNB) have values 0 and 1, |
| // one would expect |v-2| to be passed to the MTF decoder. |
| // However, the front of the MTF list is never referenced as 0, |
| // it's always referenced with a run-length of 1. Thus 0 |
| // doesn't need to be encoded and we have |v-1| in the next |
| // line. |
| b := mtf.Decode(int(v - 1)) |
| if bufIndex >= bz2.blockSize { |
| return StructuralError("data exceeds block size") |
| } |
| bz2.tt[bufIndex] = uint32(b) |
| bz2.c[b]++ |
| bufIndex++ |
| } |
| |
| if origPtr >= uint(bufIndex) { |
| return StructuralError("origPtr out of bounds") |
| } |
| |
| // We have completed the entropy decoding. Now we can perform the |
| // inverse BWT and setup the RLE buffer. |
| bz2.preRLE = bz2.tt[:bufIndex] |
| bz2.preRLEUsed = 0 |
| bz2.tPos = inverseBWT(bz2.preRLE, origPtr, bz2.c[:]) |
| bz2.lastByte = -1 |
| bz2.byteRepeats = 0 |
| bz2.repeats = 0 |
| |
| return nil |
| } |
| |
| // inverseBWT implements the inverse Burrows-Wheeler transform as described in |
| // http://www.hpl.hp.com/techreports/Compaq-DEC/SRC-RR-124.pdf, section 4.2. |
| // In that document, origPtr is called `I' and c is the `C' array after the |
| // first pass over the data. It's an argument here because we merge the first |
| // pass with the Huffman decoding. |
| // |
| // This also implements the `single array' method from the bzip2 source code |
| // which leaves the output, still shuffled, in the bottom 8 bits of tt with the |
| // index of the next byte in the top 24-bits. The index of the first byte is |
| // returned. |
| func inverseBWT(tt []uint32, origPtr uint, c []uint) uint32 { |
| sum := uint(0) |
| for i := 0; i < 256; i++ { |
| sum += c[i] |
| c[i] = sum - c[i] |
| } |
| |
| for i := range tt { |
| b := tt[i] & 0xff |
| tt[c[b]] |= uint32(i) << 8 |
| c[b]++ |
| } |
| |
| return tt[origPtr] >> 8 |
| } |
| |
| // This is a standard CRC32 like in hash/crc32 except that all the shifts are reversed, |
| // causing the bits in the input to be processed in the reverse of the usual order. |
| |
| var crctab [256]uint32 |
| |
| func init() { |
| const poly = 0x04C11DB7 |
| for i := range crctab { |
| crc := uint32(i) << 24 |
| for j := 0; j < 8; j++ { |
| if crc&0x80000000 != 0 { |
| crc = (crc << 1) ^ poly |
| } else { |
| crc <<= 1 |
| } |
| } |
| crctab[i] = crc |
| } |
| } |
| |
| // updateCRC updates the crc value to incorporate the data in b. |
| // The initial value is 0. |
| func updateCRC(val uint32, b []byte) uint32 { |
| crc := ^val |
| for _, v := range b { |
| crc = crctab[byte(crc>>24)^v] ^ (crc << 8) |
| } |
| return ^crc |
| } |