src/compress/bzip2/huffman.go - go - Git at Google

 // Copyright 2011 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

 package bzip2

 import "sort"

 // A huffmanTree is a binary tree which is navigated, bit-by-bit to reach a
 // symbol. Leaf symbols are not stored as separate nodes; they are held in the
 // leftValue/rightValue fields of their parent huffmanNode.
 type huffmanTree struct {
 	// nodes contains all the non-leaf nodes in the tree. nodes[0] is the
 	// root of the tree and nextNode contains the index of the next element
 	// of nodes to use when the tree is being constructed.
 	nodes    []huffmanNode
 	nextNode int
 }

 // A huffmanNode is a node in the tree. left and right contain indexes into the
 // nodes slice of the tree. If left or right is invalidNodeValue then the child
 // is a leaf node and its value is in leftValue/rightValue.
 //
 // The symbols are uint16s because bzip2 encodes not only MTF indexes in the
 // tree, but also two magic values for run-length encoding and an EOF symbol.
 // Thus there are more than 256 possible symbols.
 type huffmanNode struct {
 	left, right           uint16
 	leftValue, rightValue uint16
 }

 // invalidNodeValue is an invalid index which marks a leaf node in the tree:
 // when a huffmanNode's left or right field holds it, that child is a symbol
 // stored in leftValue/rightValue rather than another entry of nodes.
 const invalidNodeValue = 0xffff

 // Decode walks the tree from the root, consuming one bit from br per level,
 // and returns the symbol stored at the first leaf it reaches.
 func (t *huffmanTree) Decode(br *bitReader) (v uint16) {
 	// The walk always starts at nodes[0], the root.
 	var idx uint16

 	for {
 		cur := &t.nodes[idx]

 		// When TryReadBit reports failure, fall back to ReadBit. bit keeps
 		// whatever TryReadBit returned (presumably 0) unless ReadBit
 		// reports a set bit.
 		bit, ok := br.TryReadBit()
 		if !ok && br.ReadBit() {
 			bit = 1
 		}

 		// bzip2 encodes left as a true bit, so default to the right child
 		// and switch to the left one when the bit is set.
 		next, symbol := cur.right, cur.rightValue
 		if bit != 0 {
 			next, symbol = cur.left, cur.leftValue
 		}

 		if next == invalidNodeValue {
 			// The chosen child is a leaf; its symbol is the result.
 			return symbol
 		}
 		idx = next
 	}
 }

 // newHuffmanTree builds a Huffman tree from a slice containing the code
 // lengths of each symbol. The maximum code length is 32 bits.
 //
 // lengths[i] is the code length of symbol i. The function panics if fewer
 // than two lengths are supplied; otherwise it returns the tree together with
 // whatever error buildHuffmanNode reports while assembling it.
 func newHuffmanTree(lengths []uint8) (huffmanTree, error) {
 	// There are many possible trees that assign the same code length to
 	// each symbol (consider reflecting a tree down the middle, for
 	// example). Since the code length assignments determine the
 	// efficiency of the tree, each of these trees is equally good. In
 	// order to minimize the amount of information needed to build a tree
 	// bzip2 uses a canonical tree so that it can be reconstructed given
 	// only the code length assignments.

 	if len(lengths) < 2 {
 		panic("newHuffmanTree: too few symbols")
 	}

 	var t huffmanTree

 	// First we sort the code length assignments by ascending code length,
 	// using the symbol value to break ties.
 	pairs := huffmanSymbolLengthPairs(make([]huffmanSymbolLengthPair, len(lengths)))
 	for i, length := range lengths {
 		pairs[i].value = uint16(i)
 		pairs[i].length = length
 	}

 	sort.Sort(pairs)

 	// Now we assign codes to the symbols, starting with the longest code.
 	// We keep the codes packed into a uint32, at the most-significant end.
 	// So branches are taken from the MSB downwards. This makes it easy to
 	// sort them later.
 	code := uint32(0)
 	length := uint8(32)

 	codes := huffmanCodes(make([]huffmanCode, len(lengths)))
 	for i := len(pairs) - 1; i >= 0; i-- {
 		// pairs is sorted by ascending length and walked backwards, so the
 		// tracked length only ever shrinks. The running code itself is
 		// left untouched when that happens. (Two shifts that used to be
 		// applied to length here were dead stores: length was overwritten
 		// by the very next statement.)
 		if length > pairs[i].length {
 			length = pairs[i].length
 		}
 		codes[i].code = code
 		codes[i].codeLen = length
 		codes[i].value = pairs[i].value
 		// We need to 'increment' the code, which means treating |code|
 		// like a |length| bit number.
 		code += 1 << (32 - length)
 	}

 	// Now we can sort by the code so that the left half of each branch are
 	// grouped together, recursively.
 	sort.Sort(codes)

 	t.nodes = make([]huffmanNode, len(codes))
 	_, err := buildHuffmanNode(&t, codes, 0)
 	return t, err
 }

 // huffmanSymbolLengthPair couples a symbol with the length of its code.
 type huffmanSymbolLengthPair struct {
 	value  uint16
 	length uint8
 }

 // huffmanSymbolLengthPairs implements sort.Interface, ordering pairs by
 // ascending code length and breaking ties by ascending symbol value.
 type huffmanSymbolLengthPairs []huffmanSymbolLengthPair

 // Len reports the number of pairs.
 func (h huffmanSymbolLengthPairs) Len() int { return len(h) }

 // Less reports whether pair i sorts before pair j: shorter lengths first,
 // then smaller symbol values.
 func (h huffmanSymbolLengthPairs) Less(i, j int) bool {
 	a, b := h[i], h[j]
 	if a.length != b.length {
 		return a.length < b.length
 	}
 	return a.value < b.value
 }

 // Swap exchanges the pairs at positions i and j.
 func (h huffmanSymbolLengthPairs) Swap(i, j int) {
 	tmp := h[i]
 	h[i] = h[j]
 	h[j] = tmp
 }

 // huffmanCode bundles a symbol with its assigned code and that code's length.
 type huffmanCode struct {
 	code    uint32
 	codeLen uint8
 	value   uint16
 }

 // huffmanCodes implements sort.Interface, ordering entries by ascending code.
 type huffmanCodes []huffmanCode

 // Len reports the number of codes.
 func (n huffmanCodes) Len() int { return len(n) }

 // Less reports whether the code at i is numerically smaller than the one at j.
 func (n huffmanCodes) Less(i, j int) bool {
 	a, b := n[i].code, n[j].code
 	return a < b
 }

 // Swap exchanges the entries at positions i and j.
 func (n huffmanCodes) Swap(i, j int) {
 	tmp := n[i]
 	n[i] = n[j]
 	n[j] = tmp
 }

 // buildHuffmanNode takes a slice of sorted huffmanCodes and builds a node in
 // the Huffman tree at the given level. It returns the index of the newly
 // constructed node.
 //
 // level is the number of leading code bits already consumed; codes are split
 // on bit (31 - level). A StructuralError is returned when fewer than two
 // codes are supplied or when two codes turn out to be identical.
 func buildHuffmanNode(t *huffmanTree, codes []huffmanCode, level uint32) (nodeIndex uint16, err error) {
 	// For level > 31 the unsigned shift count wraps around and test becomes
 	// zero, so every code lands on the left side; the termination guard
 	// below turns that situation into an error.
 	test := uint32(1) << (31 - level)

 	// We have to search the list of codes to find the divide between the left and right sides.
 	firstRightIndex := len(codes)
 	for i, code := range codes {
 		if code.code&test != 0 {
 			firstRightIndex = i
 			break
 		}
 	}

 	left := codes[:firstRightIndex]
 	right := codes[firstRightIndex:]

 	if len(left) == 0 || len(right) == 0 {
 		// There is a superfluous level in the Huffman tree indicating
 		// a bug in the encoder. However, this bug has been observed in
 		// the wild so we handle it.

 		// If this function was called recursively then we know that
 		// len(codes) >= 2 because, otherwise, we would have hit the
 		// "leaf node" case, below, and not recursed.
 		//
 		// However, for the initial call it's possible that len(codes)
 		// is zero or one. Both cases are invalid because a zero length
 		// tree cannot encode anything and a length-1 tree can only
 		// encode EOF and so is superfluous. We reject both.
 		if len(codes) < 2 {
 			return 0, StructuralError("empty Huffman tree")
 		}

 		// In this case the recursion doesn't always reduce the length
 		// of codes so we need to ensure termination via another
 		// mechanism.
 		//
 		// Since len(codes) >= 2 the only way that the values can match
 		// at all 32 bits is if they are equal, which is invalid. The
 		// comparison is >= rather than == because a genuine split at
 		// level 31 can hand two equal codes to level 32; with == that
 		// case would recurse without ever terminating.
 		if level >= 31 {
 			return 0, StructuralError("equal symbols in Huffman tree")
 		}

 		if len(left) == 0 {
 			return buildHuffmanNode(t, right, level+1)
 		}
 		return buildHuffmanNode(t, left, level+1)
 	}

 	// Claim the next free slot before recursing so that a parent always
 	// has a lower index than its children.
 	nodeIndex = uint16(t.nextNode)
 	node := &t.nodes[t.nextNode]
 	t.nextNode++

 	if len(left) == 1 {
 		// leaf node
 		node.left = invalidNodeValue
 		node.leftValue = left[0].value
 	} else {
 		node.left, err = buildHuffmanNode(t, left, level+1)
 	}

 	if err != nil {
 		return nodeIndex, err
 	}

 	if len(right) == 1 {
 		// leaf node
 		node.right = invalidNodeValue
 		node.rightValue = right[0].value
 	} else {
 		node.right, err = buildHuffmanNode(t, right, level+1)
 	}

 	return nodeIndex, err
 }
	// Copyright 2011 The Go Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style
	// license that can be found in the LICENSE file.

	package bzip2

	import "sort"

	// A huffmanTree is a binary tree which is navigated, bit-by-bit to reach a
	// symbol. Leaf symbols are not stored as separate nodes; they are held in the
	// leftValue/rightValue fields of their parent huffmanNode.
	type huffmanTree struct {
	// nodes contains all the non-leaf nodes in the tree. nodes[0] is the
	// root of the tree and nextNode contains the index of the next element
	// of nodes to use when the tree is being constructed.
	nodes []huffmanNode
	nextNode int
	}

	// A huffmanNode is a node in the tree. left and right contain indexes into the
	// nodes slice of the tree. If left or right is invalidNodeValue then the child
	// is a leaf node and its value is in leftValue/rightValue.
	//
	// The symbols are uint16s because bzip2 encodes not only MTF indexes in the
	// tree, but also two magic values for run-length encoding and an EOF symbol.
	// Thus there are more than 256 possible symbols.
	type huffmanNode struct {
	left, right uint16
	leftValue, rightValue uint16
	}

	// invalidNodeValue is an invalid index which marks a leaf node in the tree:
	// when a huffmanNode's left or right field holds it, that child is a symbol
	// stored in leftValue/rightValue rather than another entry of nodes.
	const invalidNodeValue = 0xffff

	// Decode reads bits from the given bitReader and navigates the tree until a
	// symbol is found.
	//
	// br is taken by pointer so that the bits consumed here advance the caller's
	// reader; passing it by value would read from a private copy.
	func (t *huffmanTree) Decode(br *bitReader) (v uint16) {
		nodeIndex := uint16(0) // node 0 is the root of the tree.

		for {
			node := &t.nodes[nodeIndex]

			// When TryReadBit reports failure, fall back to ReadBit. bit
			// keeps whatever TryReadBit returned (presumably 0) unless
			// ReadBit reports a set bit.
			bit, ok := br.TryReadBit()
			if !ok && br.ReadBit() {
				bit = 1
			}

			// bzip2 encodes left as a true bit.
			if bit != 0 {
				// left
				if node.left == invalidNodeValue {
					return node.leftValue
				}
				nodeIndex = node.left
			} else {
				// right
				if node.right == invalidNodeValue {
					return node.rightValue
				}
				nodeIndex = node.right
			}
		}
	}

	// newHuffmanTree builds a Huffman tree from a slice containing the code
	// lengths of each symbol. The maximum code length is 32 bits.
	//
	// lengths[i] is the code length of symbol i. The function panics if fewer
	// than two lengths are supplied; otherwise it returns the tree together with
	// whatever error buildHuffmanNode reports while assembling it.
	func newHuffmanTree(lengths []uint8) (huffmanTree, error) {
		// There are many possible trees that assign the same code length to
		// each symbol (consider reflecting a tree down the middle, for
		// example). Since the code length assignments determine the
		// efficiency of the tree, each of these trees is equally good. In
		// order to minimize the amount of information needed to build a tree
		// bzip2 uses a canonical tree so that it can be reconstructed given
		// only the code length assignments.

		if len(lengths) < 2 {
			panic("newHuffmanTree: too few symbols")
		}

		var t huffmanTree

		// First we sort the code length assignments by ascending code length,
		// using the symbol value to break ties.
		pairs := huffmanSymbolLengthPairs(make([]huffmanSymbolLengthPair, len(lengths)))
		for i, length := range lengths {
			pairs[i].value = uint16(i)
			pairs[i].length = length
		}

		sort.Sort(pairs)

		// Now we assign codes to the symbols, starting with the longest code.
		// We keep the codes packed into a uint32, at the most-significant end.
		// So branches are taken from the MSB downwards. This makes it easy to
		// sort them later.
		code := uint32(0)
		length := uint8(32)

		codes := huffmanCodes(make([]huffmanCode, len(lengths)))
		for i := len(pairs) - 1; i >= 0; i-- {
			// pairs is sorted by ascending length and walked backwards, so
			// the tracked length only ever shrinks. The running code itself
			// is left untouched when that happens. (Two shifts that used to
			// be applied to length here were dead stores: length was
			// overwritten by the very next statement.)
			if length > pairs[i].length {
				length = pairs[i].length
			}
			codes[i].code = code
			codes[i].codeLen = length
			codes[i].value = pairs[i].value
			// We need to 'increment' the code, which means treating |code|
			// like a |length| bit number.
			code += 1 << (32 - length)
		}

		// Now we can sort by the code so that the left half of each branch are
		// grouped together, recursively.
		sort.Sort(codes)

		t.nodes = make([]huffmanNode, len(codes))
		_, err := buildHuffmanNode(&t, codes, 0)
		return t, err
	}

	// huffmanSymbolLengthPair couples a symbol with the length of its code.
	type huffmanSymbolLengthPair struct {
		value  uint16
		length uint8
	}

	// huffmanSymbolLengthPairs implements sort.Interface, ordering pairs by
	// ascending code length and breaking ties by ascending symbol value.
	type huffmanSymbolLengthPairs []huffmanSymbolLengthPair

	// Len reports the number of pairs.
	func (h huffmanSymbolLengthPairs) Len() int { return len(h) }

	// Less reports whether pair i sorts before pair j: shorter lengths first,
	// then smaller symbol values.
	func (h huffmanSymbolLengthPairs) Less(i, j int) bool {
		a, b := h[i], h[j]
		if a.length != b.length {
			return a.length < b.length
		}
		return a.value < b.value
	}

	// Swap exchanges the pairs at positions i and j.
	func (h huffmanSymbolLengthPairs) Swap(i, j int) {
		tmp := h[i]
		h[i] = h[j]
		h[j] = tmp
	}

	// huffmanCode bundles a symbol with its assigned code and that code's length.
	type huffmanCode struct {
		code    uint32
		codeLen uint8
		value   uint16
	}

	// huffmanCodes implements sort.Interface, ordering entries by ascending code.
	type huffmanCodes []huffmanCode

	// Len reports the number of codes.
	func (n huffmanCodes) Len() int { return len(n) }

	// Less reports whether the code at i is numerically smaller than the one at j.
	func (n huffmanCodes) Less(i, j int) bool {
		a, b := n[i].code, n[j].code
		return a < b
	}

	// Swap exchanges the entries at positions i and j.
	func (n huffmanCodes) Swap(i, j int) {
		tmp := n[i]
		n[i] = n[j]
		n[j] = tmp
	}

	// buildHuffmanNode takes a slice of sorted huffmanCodes and builds a node in
	// the Huffman tree at the given level. It returns the index of the newly
	// constructed node.
	func buildHuffmanNode(t *huffmanTree, codes []huffmanCode, level uint32) (nodeIndex uint16, err error) {
	test := uint32(1) << (31 - level)

	// We have to search the list of codes to find the divide between the left and right sides.
	firstRightIndex := len(codes)
	for i, code := range codes {
	if code.code&test != 0 {
	firstRightIndex = i
	break
	}
	}

	left := codes[:firstRightIndex]
	right := codes[firstRightIndex:]

	if len(left) == 0 \|\| len(right) == 0 {
	// There is a superfluous level in the Huffman tree indicating
	// a bug in the encoder. However, this bug has been observed in
	// the wild so we handle it.

	// If this function was called recursively then we know that
	// len(codes) >= 2 because, otherwise, we would have hit the
	// "leaf node" case, below, and not recursed.
	//
	// However, for the initial call it's possible that len(codes)
	// is zero or one. Both cases are invalid because a zero length
	// tree cannot encode anything and a length-1 tree can only
	// encode EOF and so is superfluous. We reject both.
	if len(codes) < 2 {
	return 0, StructuralError("empty Huffman tree")
	}

	// In this case the recursion doesn't always reduce the length
	// of codes so we need to ensure termination via another
	// mechanism.
	if level == 31 {
	// Since len(codes) >= 2 the only way that the values
	// can match at all 32 bits is if they are equal, which
	// is invalid. This ensures that we never enter
	// infinite recursion.
	return 0, StructuralError("equal symbols in Huffman tree")
	}

	if len(left) == 0 {
	return buildHuffmanNode(t, right, level+1)
	}
	return buildHuffmanNode(t, left, level+1)
	}

	nodeIndex = uint16(t.nextNode)
	node := &t.nodes[t.nextNode]
	t.nextNode++

	if len(left) == 1 {
	// leaf node
	node.left = invalidNodeValue
	node.leftValue = left[0].value
	} else {
	node.left, err = buildHuffmanNode(t, left, level+1)
	}

	if err != nil {
	return
	}

	if len(right) == 1 {
	// leaf node
	node.right = invalidNodeValue
	node.rightValue = right[0].value
	} else {
	node.right, err = buildHuffmanNode(t, right, level+1)
	}

	return
	}