internal/triegen/triegen.go - text - Git at Google

 // Copyright 2014 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

 // Package triegen implements a code generator for a trie for associating
 // unsigned integer values with UTF-8 encoded runes.
 //
 // Many of the go.text packages use tries for storing per-rune information.  A
 // trie is especially useful if many of the runes have the same value. If this
 // is the case, many blocks can be expected to be shared allowing for
 // information on many runes to be stored in little space.
 //
 // As most of the lookups are done directly on []byte slices, the tries use the
 // UTF-8 bytes directly for the lookup. This saves a conversion from UTF-8 to
 // runes and contributes a little bit to better performance. It also naturally
 // provides a fast path for ASCII.
 //
 // Space is also an issue. There are many code points defined in Unicode and as
 // a result tables can get quite large. So every byte counts. The triegen
 // package automatically chooses the smallest integer values to represent the
 // tables. Compacters allow further compression of the trie by allowing for
 // alternative representations of individual trie blocks.
 //
 // triegen allows generating multiple tries as a single structure. This is
 // useful when, for example, one wants to generate tries for several languages
 // that have a lot of values in common. Some existing libraries for
 // internationalization store all per-language data as a dynamically loadable
 // chunk. The go.text packages are designed with the assumption that the user
 // typically wants to compile in support for all supported languages, in line
 // with the approach common to Go to create a single standalone binary. The
 // multi-root trie approach can give significant storage savings in this
 // scenario.
 //
 // triegen generates both tables and code. The code is optimized to use the
 // automatically chosen data types. The following code is generated for a Trie
 // or multiple Tries named "foo":
 //	- type fooTrie
 //		The trie type.
 //
 //	- func newFooTrie(x int) *fooTrie
 //		Trie constructor, where x is the index of the trie passed to Gen.
 //
 //	- func (t *fooTrie) lookup(s []byte) (v uintX, sz int)
 //		The lookup method, where uintX is automatically chosen.
 //
 //	- func lookupString, lookupUnsafe and lookupStringUnsafe
 //		Variants of the above.
 //
 //	- var fooValues and fooIndex and any tables generated by Compacters.
 //		The core trie data.
 //
 //	- var fooTrieHandles
 //		Indexes of starter blocks in case of multiple trie roots.
 //
 // It is recommended that users test the generated trie by checking the returned
 // value for every rune. Such exhaustive tests are possible as the number of
 // runes in Unicode is limited.
 package triegen // import "golang.org/x/text/internal/triegen"

 // TODO: Arguably, the internally optimized data types would not have to be
 // exposed in the generated API. We could also investigate not generating the
 // code, but using it through a package. We would have to investigate the impact
 // on performance of making such change, though. For packages like unicode/norm,
 // small changes like this could tank performance.

 import (
 	"encoding/binary"
 	"fmt"
 	"hash/crc64"
 	"io"
 	"log"
 	"unicode/utf8"
 )

 // builder builds a set of tries for associating values with runes. The set of
 // tries can share common index and value blocks.
 type builder struct {
 	// Name is the name passed to Gen. Gen also uses it as the prefix of the
 	// default value-lookup handler expression (name + "Values[...]").
 	Name string

 	// ValueType is the type of the trie values looked up.
 	ValueType string

 	// ValueSize is the byte size of the ValueType.
 	ValueSize int

 	// IndexType is the type of trie index values used for all UTF-8 bytes of
 	// a rune except the last one.
 	IndexType string

 	// IndexSize is the byte size of the IndexType.
 	IndexSize int

 	// SourceType is used when generating the lookup functions. If the user
 	// requests StringSupport, all lookup functions will be generated for
 	// string input as well.
 	SourceType string

 	// Trie holds the tries passed to Gen, in the order given.
 	Trie []*Trie

 	// IndexBlocks lists the internal nodes in allocation order; Gen seeds it
 	// with three empty placeholder nodes. ValueBlocks holds the plain value
 	// blocks, starting with the ASCII halves appended by buildTrie.
 	// Compactions has one entry per Compacter: entry 0 is the default
 	// simpleCompacter and Compact appends the rest. Checksum is the sum of
 	// the per-trie checksums accumulated by build.
 	IndexBlocks []*node
 	ValueBlocks [][]uint64
 	Compactions []compaction
 	Checksum    uint64

 	// NOTE(review): ASCIIBlock and StarterBlock are not written anywhere in
 	// the code visible here; presumably they are filled in for the template.
 	ASCIIBlock   string
 	StarterBlock string

 	// Deduplication tables keyed by block hash. indexBlockIdx maps to the
 	// index assigned to an internal node, valueBlockIdx to the nodeIndex of a
 	// stored value block, and asciiBlockIdx to the position in ValueBlocks of
 	// the first of a root's two ASCII blocks.
 	indexBlockIdx map[uint64]int
 	valueBlockIdx map[uint64]nodeIndex
 	asciiBlockIdx map[uint64]int

 	// Stats are used to fill out the template.
 	Stats struct {
 		NValueEntries int
 		NValueBytes   int
 		NIndexEntries int
 		NIndexBytes   int
 		NHandleBytes  int
 	}

 	// err is the first error recorded through setError; Gen returns it after
 	// build completes.
 	err error
 }

 // A nodeIndex encodes the index of a node, which is defined by the compaction
 // which stores it and an index within the compaction. For internal nodes, the
 // compaction is always 0.
 //
 // compaction is a position in builder.Compactions. For value blocks, index is
 // the handle returned by that Compacter's Store method; for internal nodes it
 // is the node's position in builder.IndexBlocks minus blockOffset. The type is
 // constructed with positional literals, so the field order must not change.
 type nodeIndex struct {
 	compaction int
 	index      int
 }

 // compaction keeps track of stats used for the compaction.
 type compaction struct {
 	// c is the Compacter doing the storing. maxHandle is the largest handle
 	// c.Store has returned so far and totalSize is the accumulated size of the
 	// blocks handed to it; both are updated by computeOffsets.
 	// NOTE(review): blocks is not referenced in the code visible here.
 	c         Compacter
 	blocks    []*node
 	maxHandle uint32
 	totalSize int

 	// Used by template-based generator and thus exported.
 	// build assigns Offset and Cutoff cumulatively over all compactions:
 	// Offset is the running total before this entry and Cutoff is
 	// Offset + maxHandle + 1. Handler is the Go expression text used to look
 	// up a value for this compaction.
 	Cutoff  uint32
 	Offset  uint32
 	Handler string
 }

 // setError remembers err as the builder's error unless one has already been
 // recorded, so that only the first reported error is kept.
 func (b *builder) setError(err error) {
 	if b.err != nil {
 		return
 	}
 	b.err = err
 }

 // An Option can be passed to Gen. Gen applies options in order to its builder
 // before building; a non-nil error from an Option aborts Gen and is returned
 // to the caller.
 type Option func(b *builder) error

 // Compact returns an Option that registers c as an additional Compacter with
 // the trie generator. The lookup expression recorded for it is c.Handler()
 // followed by the argument list "(n, b)".
 func Compact(c Compacter) Option {
 	return func(b *builder) error {
 		entry := compaction{c: c}
 		entry.Handler = c.Handler() + "(n, b)"
 		b.Compactions = append(b.Compactions, entry)
 		return nil
 	}
 }

 // Gen writes Go code for a shared trie lookup structure to w for the given
 // Tries. The generated trie type will be called nameTrie and newNameTrie(x)
 // returns the *nameTrie for tries[x]; values are retrieved through the lookup
 // methods defined on nameTrie. Gen returns the table size of the generated
 // trie, or 0 and an error if an option, the build or the printing fails.
 func Gen(w io.Writer, name string, tries []*Trie, opts ...Option) (sz int, err error) {
 	// Compaction 0 always represents the plain, uncompacted value blocks.
 	plain := compaction{Handler: name + "Values[n<<6+uint32(b)]"}

 	// The index starts with two dummy blocks followed by the zero block. This
 	// places the zero block at offset 0x80, so that its offset as seen from a
 	// continuation byte is 0.
 	b := &builder{
 		Name:        name,
 		Trie:        tries,
 		IndexBlocks: []*node{{}, {}, {}},
 		Compactions: []compaction{plain},
 		// Key 0 is the hash of the zero block in both block maps.
 		indexBlockIdx: map[uint64]int{0: 0},
 		valueBlockIdx: map[uint64]nodeIndex{0: {}},
 		asciiBlockIdx: map[uint64]int{},
 	}
 	b.Compactions[0].c = (*simpleCompacter)(b)

 	for _, apply := range opts {
 		if optErr := apply(b); optErr != nil {
 			return 0, optErr
 		}
 	}

 	b.build()
 	if b.err != nil {
 		return 0, b.err
 	}
 	if printErr := b.print(w); printErr != nil {
 		return 0, printErr
 	}
 	return b.Size(), nil
 }

 // A Trie represents a single root node of a trie. A builder may build several
 // overlapping tries at once.
 //
 // NewTrie constructs this struct with a positional literal, so the order of
 // the fields must be kept.
 type Trie struct {
 	// root is the root node; its values slice covers the ASCII range.
 	root *node

 	hiddenTrie
 }

 // hiddenTrie contains values we want to be visible to the template generator,
 // but hidden from the API documentation.
 //
 // Name is the name given to NewTrie. The remaining fields are filled in by
 // buildTrie: Checksum is the hash computed for the root, ASCIIIndex is the
 // position in the builder's ValueBlocks of the first of the root's two ASCII
 // blocks, and StarterIndex is the root's index adjusted for starter bytes.
 type hiddenTrie struct {
 	Name         string
 	Checksum     uint64
 	ASCIIIndex   int
 	StarterIndex int
 }

 // NewTrie returns a new, empty trie root carrying the given name. The root
 // node is created with a full block of (nil) children and a value slot for
 // every ASCII byte.
 func NewTrie(name string) *Trie {
 	root := &node{
 		children: make([]*node, blockSize),
 		values:   make([]uint64, utf8.RuneSelf),
 	}
 	t := &Trie{root: root}
 	t.Name = name
 	return t
 }

 // Gen is a convenience wrapper around the package-level Gen: it generates
 // code for t alone, under the name that was passed to NewTrie. It returns the
 // size of the generated tables.
 func (t *Trie) Gen(w io.Writer, opts ...Option) (sz int, err error) {
 	only := []*Trie{t}
 	return Gen(w, t.Name, only, opts...)
 }

 // node is a node of the intermediate trie structure.
 type node struct {
 	// children holds this node's children. It is always of length 64.
 	// A child node may be nil.
 	children []*node

 	// values contains the values of this node. If it is non-nil, this node is
 	// either a root or leaf node:
 	// For root nodes, len(values) == 128 and it maps the bytes in [0x00, 0x7F].
 	// For leaf nodes, len(values) ==  64 and it maps the bytes in [0x80, 0xBF].
 	values []uint64

 	// index is the location assigned to this node by computeOffsets; it is
 	// the zero nodeIndex until then.
 	index nodeIndex
 }

 // Insert associates value with the given rune. Zero values are never stored,
 // so inserting 0 is a no-op for any rune, valid or not. Insert panics if a
 // non-zero value is passed for an invalid rune, and terminates the program
 // via log.Fatalf if the trie structure is found to be inconsistent.
 func (t *Trie) Insert(r rune, value uint64) {
 	if value == 0 {
 		// Accepting (and ignoring) zero for illegal runes lets callers loop
 		// over [0..MaxRune] without excluding surrogates themselves; the UCD
 		// tables effectively assign surrogates a zero value anyway.
 		return
 	}
 	enc := string(r)
 	if decoded := []rune(enc); decoded[0] != r {
 		// The rune did not survive the round trip through UTF-8.
 		panic(fmt.Sprintf("triegen: non-zero value for invalid rune %U", r))
 	}
 	if len(enc) == 1 {
 		// ASCII values live directly in the root node.
 		t.root.values[enc[0]] = value
 		return
 	}

 	// Descend one level per leading byte, creating nodes on demand, until
 	// only the final byte of the encoding is left.
 	cur := t.root
 	for len(enc) > 1 {
 		if cur.children == nil {
 			cur.children = make([]*node, blockSize)
 		}
 		slot := enc[0] % blockSize
 		next := cur.children[slot]
 		if next == nil {
 			next = &node{}
 			cur.children[slot] = next
 		}
 		if len(enc) > 2 && next.values != nil {
 			log.Fatalf("triegen: insert(%U): found internal node with values", r)
 		}
 		cur = next
 		enc = enc[1:]
 	}

 	if cur.values == nil {
 		cur.values = make([]uint64, blockSize)
 	}
 	if cur.children != nil {
 		log.Fatalf("triegen: insert(%U): found leaf node that also has child nodes", r)
 	}
 	// The last byte is a continuation byte; rebase it to the start of the block.
 	cur.values[enc[0]-0x80] = value
 }

 // Size returns the number of bytes the generated trie will take to store. It
 // is exported because the templates call it.
 func (b *builder) Size() int {
 	indexBytes := len(b.IndexBlocks) * blockSize * b.IndexSize

 	// The plain value blocks are counted from ValueBlocks rather than from the
 	// first compaction's totalSize, because the latter does not include the
 	// separately managed ASCII blocks.
 	valueBytes := len(b.ValueBlocks) * blockSize * b.ValueSize

 	total := indexBytes + valueBytes
 	extras := b.Compactions[1:]
 	for i := range extras {
 		total += extras[i].totalSize
 	}

 	// TODO: the fixed overhead of using a compaction (code or data) is not
 	// accounted for. The data overhead is typically only a few bytes (2 bytes
 	// for cases), and a compaction should save substantially more than that
 	// to be worth using at all.

 	// Multi-root tries additionally store handles for each root.
 	if len(b.Trie) > 1 {
 		total += 2 * b.IndexSize * len(b.Trie)
 	}
 	return total
 }

 // build computes everything needed to print the tries: the value type, the
 // block allocation for every trie, the handle ranges of the compactions and
 // finally the index type.
 func (b *builder) build() {
 	// The value type is the smallest one able to hold the largest value.
 	var largest uint64
 	for _, t := range b.Trie {
 		largest = maxValue(t.root, largest)
 	}
 	b.ValueType, b.ValueSize = getIntType(largest)

 	// Allocate blocks for all tries.
 	// TODO: first compute the ASCII blocks for all tries and then the other
 	// nodes. ASCII blocks are more restricted in placement, as they require two
 	// blocks to be placed consecutively. Processing them first may improve
 	// sharing (at least one zero block can be expected to be saved.)
 	for _, t := range b.Trie {
 		b.Checksum += b.buildTrie(t)
 	}

 	// Lay the compactions' handle ranges out back to back.
 	var next uint32
 	for i := range b.Compactions {
 		c := &b.Compactions[i]
 		c.Offset = next
 		next += c.maxHandle + 1
 		c.Cutoff = next
 	}

 	// The index type must hold the final cutoff and every index block index.
 	// TODO: different byte positions could have different sizes. So far we have
 	// not found a case where this is beneficial.
 	last := &b.Compactions[len(b.Compactions)-1]
 	imax := uint64(last.Cutoff)
 	for _, blk := range b.IndexBlocks {
 		if idx := uint64(blk.index.index); imax < idx {
 			imax = idx
 		}
 	}
 	b.IndexType, b.IndexSize = getIntType(imax)
 }

 // maxValue returns the largest of max and all values stored in the subtree
 // rooted at n. A nil n leaves max unchanged.
 func maxValue(n *node, max uint64) uint64 {
 	if n == nil {
 		return max
 	}
 	for i := range n.children {
 		max = maxValue(n.children[i], max)
 	}
 	for i := range n.values {
 		if v := n.values[i]; v > max {
 			max = v
 		}
 	}
 	return max
 }

 // getIntType returns the name and the size in bytes of the smallest unsigned
 // integer type that can represent v.
 func getIntType(v uint64) (string, int) {
 	if v < 1<<8 {
 		return "uint8", 1
 	}
 	if v < 1<<16 {
 		return "uint16", 2
 	}
 	if v < 1<<32 {
 		return "uint32", 4
 	}
 	return "uint64", 8
 }

 const (
 	// blockSize is the number of entries in a trie block: node.children
 	// always has this length, as does the values slice of a leaf node.
 	blockSize = 64

 	// Subtract two blocks to offset 0x80, the first continuation byte.
 	blockOffset = 2

 	// Subtract three blocks to offset 0xC0, the first non-ASCII starter.
 	rootBlockOffset = 3
 )

 // crcTable is the CRC-64 table (ISO polynomial) used to hash blocks so that
 // identical blocks can be detected and shared.
 var crcTable = crc64.MakeTable(crc64.ISO)

 // buildTrie allocates the blocks for trie t within b, sharing blocks already
 // allocated for earlier tries where their hashes match. It fills in
 // t.ASCIIIndex, t.Checksum and t.StarterIndex and returns the checksum.
 func (b *builder) buildTrie(t *Trie) uint64 {
 	root := t.root

 	// Identify the ASCII block pair by the hash of the root values. For the
 	// first trie, the ASCII block will be at position 0.
 	h := crc64.New(crcTable)
 	binary.Write(h, binary.BigEndian, root.values)
 	key := h.Sum64()

 	ascii, seen := b.asciiBlockIdx[key]
 	if !seen {
 		ascii = len(b.ValueBlocks)
 		b.asciiBlockIdx[key] = ascii

 		lower, upper := root.values[:blockSize], root.values[blockSize:]
 		b.ValueBlocks = append(b.ValueBlocks, lower, upper)
 		if ascii == 0 {
 			// Add the zero block at position 2 so that it will be assigned a
 			// zero reference in the lookup blocks.
 			// TODO: always do this? This would allow us to remove a check from
 			// the trie lookup, but at the expense of extra space. Analyze
 			// performance for unicode/norm.
 			b.ValueBlocks = append(b.ValueBlocks, make([]uint64, blockSize))
 		}
 	}
 	t.ASCIIIndex = ascii

 	// Allocate all remaining blocks.
 	sum := b.computeOffsets(root, true)
 	t.Checksum = sum

 	// The root index already has the regular blockOffset subtracted; remove
 	// the additional distance that applies to starter bytes.
 	t.StarterIndex = root.index.index - (rootBlockOffset - blockOffset)
 	return sum
 }

 // computeOffsets assigns an index to n and, recursively, to every node below
 // it, and returns a hash of the subtree rooted at n. Nodes whose hash has been
 // seen before reuse the index of the earlier node, which is how blocks are
 // shared. A node with neither children nor values hashes to 0, the key under
 // which Gen pre-registers the zero block.
 //
 // NOTE(review): the root parameter is not read anywhere in this function.
 func (b *builder) computeOffsets(n *node, root bool) uint64 {
 	// For the first trie, the root lookup block will be at position 3, which is
 	// the offset for UTF-8 non-ASCII starter bytes.
 	// first is only true while IndexBlocks still holds just the placeholder
 	// blocks set up by Gen; the append below makes it false for all
 	// recursive calls.
 	first := len(b.IndexBlocks) == rootBlockOffset
 	if first {
 		b.IndexBlocks = append(b.IndexBlocks, n)
 	}

 	// We special-case the cases where all values recursively are 0. This allows
 	// for the use of a zero block to which all such values can be directed.
 	hash := uint64(0)
 	if n.children != nil || n.values != nil {
 		// The hash covers the hashes of all children (0 for nil children),
 		// in order, followed by the node's own values.
 		hasher := crc64.New(crcTable)
 		for _, c := range n.children {
 			var v uint64
 			if c != nil {
 				v = b.computeOffsets(c, false)
 			}
 			binary.Write(hasher, binary.BigEndian, v)
 		}
 		binary.Write(hasher, binary.BigEndian, n.values)
 		hash = hasher.Sum64()
 	}

 	if first {
 		// Register the block appended above under its hash so that the lookup
 		// below finds it instead of appending it a second time.
 		b.indexBlockIdx[hash] = rootBlockOffset - blockOffset
 	}

 	// Compacters don't apply to internal nodes.
 	if n.children != nil {
 		v, ok := b.indexBlockIdx[hash]
 		if !ok {
 			// Unseen internal node: give it the next index block.
 			v = len(b.IndexBlocks) - blockOffset
 			b.IndexBlocks = append(b.IndexBlocks, n)
 			b.indexBlockIdx[hash] = v
 		}
 		n.index = nodeIndex{0, v}
 	} else {
 		h, ok := b.valueBlockIdx[hash]
 		if !ok {
 			// Unseen value block: pick the compaction reporting the smallest
 			// size, falling back to compaction 0 (a plain block of
 			// blockSize values) when none is strictly smaller.
 			bestI, bestSize := 0, blockSize*b.ValueSize
 			for i, c := range b.Compactions[1:] {
 				if sz, ok := c.c.Size(n.values); ok && bestSize > sz {
 					bestI, bestSize = i+1, sz
 				}
 			}
 			c := &b.Compactions[bestI]
 			c.totalSize += bestSize
 			v := c.c.Store(n.values)
 			if c.maxHandle < v {
 				c.maxHandle = v
 			}
 			h = nodeIndex{bestI, int(v)}
 			b.valueBlockIdx[hash] = h
 		}
 		n.index = h
 	}
 	return hash
 }
	// Copyright 2014 The Go Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style
	// license that can be found in the LICENSE file.

	// Package triegen implements a code generator for a trie for associating
	// unsigned integer values with UTF-8 encoded runes.
	//
	// Many of the go.text packages use tries for storing per-rune information. A
	// trie is especially useful if many of the runes have the same value. If this
	// is the case, many blocks can be expected to be shared allowing for
	// information on many runes to be stored in little space.
	//
	// As most of the lookups are done directly on []byte slices, the tries use the
	// UTF-8 bytes directly for the lookup. This saves a conversion from UTF-8 to
	// runes and contributes a little bit to better performance. It also naturally
	// provides a fast path for ASCII.
	//
	// Space is also an issue. There are many code points defined in Unicode and as
	// a result tables can get quite large. So every byte counts. The triegen
	// package automatically chooses the smallest integer values to represent the
	// tables. Compacters allow further compression of the trie by allowing for
	// alternative representations of individual trie blocks.
	//
	// triegen allows generating multiple tries as a single structure. This is
	// useful when, for example, one wants to generate tries for several languages
	// that have a lot of values in common. Some existing libraries for
	// internationalization store all per-language data as a dynamically loadable
	// chunk. The go.text packages are designed with the assumption that the user
	// typically wants to compile in support for all supported languages, in line
	// with the approach common to Go to create a single standalone binary. The
	// multi-root trie approach can give significant storage savings in this
	// scenario.
	//
	// triegen generates both tables and code. The code is optimized to use the
	// automatically chosen data types. The following code is generated for a Trie
	// or multiple Tries named "foo":
	// - type fooTrie
	// The trie type.
	//
	// - func newFooTrie(x int) *fooTrie
	// Trie constructor, where x is the index of the trie passed to Gen.
	//
	// - func (t *fooTrie) lookup(s []byte) (v uintX, sz int)
	// The lookup method, where uintX is automatically chosen.
	//
	// - func lookupString, lookupUnsafe and lookupStringUnsafe
	// Variants of the above.
	//
	// - var fooValues and fooIndex and any tables generated by Compacters.
	// The core trie data.
	//
	// - var fooTrieHandles
	// Indexes of starter blocks in case of multiple trie roots.
	//
	// It is recommended that users test the generated trie by checking the returned
	// value for every rune. Such exhaustive tests are possible as the number of
	// runes in Unicode is limited.
	package triegen // import "golang.org/x/text/internal/triegen"

	// TODO: Arguably, the internally optimized data types would not have to be
	// exposed in the generated API. We could also investigate not generating the
	// code, but using it through a package. We would have to investigate the impact
	// on performance of making such change, though. For packages like unicode/norm,
	// small changes like this could tank performance.

	import (
	"encoding/binary"
	"fmt"
	"hash/crc64"
	"io"
	"log"
	"unicode/utf8"
	)

	// builder builds a set of tries for associating values with runes. The set of
	// tries can share common index and value blocks.
	type builder struct {
	// Name is the name passed to Gen. Gen also uses it as the prefix of the
	// default value-lookup handler expression (name + "Values[...]").
	Name string

	// ValueType is the type of the trie values looked up.
	ValueType string

	// ValueSize is the byte size of the ValueType.
	ValueSize int

	// IndexType is the type of trie index values used for all UTF-8 bytes of
	// a rune except the last one.
	IndexType string

	// IndexSize is the byte size of the IndexType.
	IndexSize int

	// SourceType is used when generating the lookup functions. If the user
	// requests StringSupport, all lookup functions will be generated for
	// string input as well.
	SourceType string

	// Trie holds the tries passed to Gen, in the order given.
	Trie []*Trie

	// IndexBlocks lists the internal nodes in allocation order; Gen seeds it
	// with three empty placeholder nodes. ValueBlocks holds the plain value
	// blocks, starting with the ASCII halves appended by buildTrie.
	// Compactions has one entry per Compacter: entry 0 is the default
	// simpleCompacter and Compact appends the rest. Checksum is the sum of
	// the per-trie checksums accumulated by build.
	IndexBlocks []*node
	ValueBlocks [][]uint64
	Compactions []compaction
	Checksum uint64

	// NOTE(review): ASCIIBlock and StarterBlock are not written anywhere in
	// the code visible here; presumably they are filled in for the template.
	ASCIIBlock string
	StarterBlock string

	// Deduplication tables keyed by block hash. indexBlockIdx maps to the
	// index assigned to an internal node, valueBlockIdx to the nodeIndex of a
	// stored value block, and asciiBlockIdx to the position in ValueBlocks of
	// the first of a root's two ASCII blocks.
	indexBlockIdx map[uint64]int
	valueBlockIdx map[uint64]nodeIndex
	asciiBlockIdx map[uint64]int

	// Stats are used to fill out the template.
	Stats struct {
	NValueEntries int
	NValueBytes int
	NIndexEntries int
	NIndexBytes int
	NHandleBytes int
	}

	// err is the first error recorded through setError; Gen returns it after
	// build completes.
	err error
	}

	// A nodeIndex encodes the index of a node, which is defined by the compaction
	// which stores it and an index within the compaction. For internal nodes, the
	// compaction is always 0.
	//
	// compaction is a position in builder.Compactions. For value blocks, index is
	// the handle returned by that Compacter's Store method; for internal nodes it
	// is the node's position in builder.IndexBlocks minus blockOffset. The type is
	// constructed with positional literals, so the field order must not change.
	type nodeIndex struct {
	compaction int
	index int
	}

	// compaction keeps track of stats used for the compaction.
	type compaction struct {
	// c is the Compacter doing the storing. maxHandle is the largest handle
	// c.Store has returned so far and totalSize is the accumulated size of the
	// blocks handed to it; both are updated by computeOffsets.
	// NOTE(review): blocks is not referenced in the code visible here.
	c Compacter
	blocks []*node
	maxHandle uint32
	totalSize int

	// Used by template-based generator and thus exported.
	// build assigns Offset and Cutoff cumulatively over all compactions:
	// Offset is the running total before this entry and Cutoff is
	// Offset + maxHandle + 1. Handler is the Go expression text used to look
	// up a value for this compaction.
	Cutoff uint32
	Offset uint32
	Handler string
	}

	// setError remembers err as the builder's error unless one has already been
	// recorded, so that only the first reported error is kept.
	func (b *builder) setError(err error) {
		if b.err != nil {
			return
		}
		b.err = err
	}

	// An Option can be passed to Gen. Gen applies options in order to its builder
	// before building; a non-nil error from an Option aborts Gen and is returned
	// to the caller.
	type Option func(b *builder) error

	// Compact returns an Option that registers c as an additional Compacter with
	// the trie generator. The lookup expression recorded for it is c.Handler()
	// followed by the argument list "(n, b)".
	func Compact(c Compacter) Option {
		return func(b *builder) error {
			entry := compaction{c: c}
			entry.Handler = c.Handler() + "(n, b)"
			b.Compactions = append(b.Compactions, entry)
			return nil
		}
	}

	// Gen writes Go code for a shared trie lookup structure to w for the given
	// Tries. The generated trie type will be called nameTrie and newNameTrie(x)
	// returns the *nameTrie for tries[x]; values are retrieved through the lookup
	// methods defined on nameTrie. Gen returns the table size of the generated
	// trie, or 0 and an error if an option, the build or the printing fails.
	func Gen(w io.Writer, name string, tries []*Trie, opts ...Option) (sz int, err error) {
		// Compaction 0 always represents the plain, uncompacted value blocks.
		plain := compaction{Handler: name + "Values[n<<6+uint32(b)]"}

		// The index starts with two dummy blocks followed by the zero block. This
		// places the zero block at offset 0x80, so that its offset as seen from a
		// continuation byte is 0.
		b := &builder{
			Name:        name,
			Trie:        tries,
			IndexBlocks: []*node{{}, {}, {}},
			Compactions: []compaction{plain},
			// Key 0 is the hash of the zero block in both block maps.
			indexBlockIdx: map[uint64]int{0: 0},
			valueBlockIdx: map[uint64]nodeIndex{0: {}},
			asciiBlockIdx: map[uint64]int{},
		}
		b.Compactions[0].c = (*simpleCompacter)(b)

		for _, apply := range opts {
			if optErr := apply(b); optErr != nil {
				return 0, optErr
			}
		}

		b.build()
		if b.err != nil {
			return 0, b.err
		}
		if printErr := b.print(w); printErr != nil {
			return 0, printErr
		}
		return b.Size(), nil
	}

	// A Trie represents a single root node of a trie. A builder may build several
	// overlapping tries at once.
	//
	// NewTrie constructs this struct with a positional literal, so the order of
	// the fields must be kept.
	type Trie struct {
	// root is the root node; its values slice covers the ASCII range.
	root *node

	hiddenTrie
	}

	// hiddenTrie contains values we want to be visible to the template generator,
	// but hidden from the API documentation.
	//
	// Name is the name given to NewTrie. The remaining fields are filled in by
	// buildTrie: Checksum is the hash computed for the root, ASCIIIndex is the
	// position in the builder's ValueBlocks of the first of the root's two ASCII
	// blocks, and StarterIndex is the root's index adjusted for starter bytes.
	type hiddenTrie struct {
	Name string
	Checksum uint64
	ASCIIIndex int
	StarterIndex int
	}

	// NewTrie returns a new, empty trie root carrying the given name. The root
	// node is created with a full block of (nil) children and a value slot for
	// every ASCII byte.
	func NewTrie(name string) *Trie {
		root := &node{
			children: make([]*node, blockSize),
			values:   make([]uint64, utf8.RuneSelf),
		}
		t := &Trie{root: root}
		t.Name = name
		return t
	}

	// Gen is a convenience wrapper around the package-level Gen: it generates
	// code for t alone, under the name that was passed to NewTrie. It returns the
	// size of the generated tables.
	func (t *Trie) Gen(w io.Writer, opts ...Option) (sz int, err error) {
		only := []*Trie{t}
		return Gen(w, t.Name, only, opts...)
	}

	// node is a node of the intermediate trie structure.
	type node struct {
	// children holds this node's children. It is always of length 64.
	// A child node may be nil.
	children []*node

	// values contains the values of this node. If it is non-nil, this node is
	// either a root or leaf node:
	// For root nodes, len(values) == 128 and it maps the bytes in [0x00, 0x7F].
	// For leaf nodes, len(values) == 64 and it maps the bytes in [0x80, 0xBF].
	values []uint64

	// index is the location assigned to this node by computeOffsets; it is
	// the zero nodeIndex until then.
	index nodeIndex
	}

	// Insert associates value with the given rune. Zero values are never stored,
	// so inserting 0 is a no-op for any rune, valid or not. Insert panics if a
	// non-zero value is passed for an invalid rune, and terminates the program
	// via log.Fatalf if the trie structure is found to be inconsistent.
	func (t *Trie) Insert(r rune, value uint64) {
		if value == 0 {
			// Accepting (and ignoring) zero for illegal runes lets callers loop
			// over [0..MaxRune] without excluding surrogates themselves; the UCD
			// tables effectively assign surrogates a zero value anyway.
			return
		}
		enc := string(r)
		if decoded := []rune(enc); decoded[0] != r {
			// The rune did not survive the round trip through UTF-8.
			panic(fmt.Sprintf("triegen: non-zero value for invalid rune %U", r))
		}
		if len(enc) == 1 {
			// ASCII values live directly in the root node.
			t.root.values[enc[0]] = value
			return
		}

		// Descend one level per leading byte, creating nodes on demand, until
		// only the final byte of the encoding is left.
		cur := t.root
		for len(enc) > 1 {
			if cur.children == nil {
				cur.children = make([]*node, blockSize)
			}
			slot := enc[0] % blockSize
			next := cur.children[slot]
			if next == nil {
				next = &node{}
				cur.children[slot] = next
			}
			if len(enc) > 2 && next.values != nil {
				log.Fatalf("triegen: insert(%U): found internal node with values", r)
			}
			cur = next
			enc = enc[1:]
		}

		if cur.values == nil {
			cur.values = make([]uint64, blockSize)
		}
		if cur.children != nil {
			log.Fatalf("triegen: insert(%U): found leaf node that also has child nodes", r)
		}
		// The last byte is a continuation byte; rebase it to the start of the block.
		cur.values[enc[0]-0x80] = value
	}

	// Size returns the number of bytes the generated trie will take to store. It
	// is exported because the templates call it.
	func (b *builder) Size() int {
		indexBytes := len(b.IndexBlocks) * blockSize * b.IndexSize

		// The plain value blocks are counted from ValueBlocks rather than from the
		// first compaction's totalSize, because the latter does not include the
		// separately managed ASCII blocks.
		valueBytes := len(b.ValueBlocks) * blockSize * b.ValueSize

		total := indexBytes + valueBytes
		extras := b.Compactions[1:]
		for i := range extras {
			total += extras[i].totalSize
		}

		// TODO: the fixed overhead of using a compaction (code or data) is not
		// accounted for. The data overhead is typically only a few bytes (2 bytes
		// for cases), and a compaction should save substantially more than that
		// to be worth using at all.

		// Multi-root tries additionally store handles for each root.
		if len(b.Trie) > 1 {
			total += 2 * b.IndexSize * len(b.Trie)
		}
		return total
	}

	// build computes everything needed to print the tries: the value type, the
	// block allocation for every trie, the handle ranges of the compactions and
	// finally the index type.
	func (b *builder) build() {
		// The value type is the smallest one able to hold the largest value.
		var largest uint64
		for _, t := range b.Trie {
			largest = maxValue(t.root, largest)
		}
		b.ValueType, b.ValueSize = getIntType(largest)

		// Allocate blocks for all tries.
		// TODO: first compute the ASCII blocks for all tries and then the other
		// nodes. ASCII blocks are more restricted in placement, as they require two
		// blocks to be placed consecutively. Processing them first may improve
		// sharing (at least one zero block can be expected to be saved.)
		for _, t := range b.Trie {
			b.Checksum += b.buildTrie(t)
		}

		// Lay the compactions' handle ranges out back to back.
		var next uint32
		for i := range b.Compactions {
			c := &b.Compactions[i]
			c.Offset = next
			next += c.maxHandle + 1
			c.Cutoff = next
		}

		// The index type must hold the final cutoff and every index block index.
		// TODO: different byte positions could have different sizes. So far we have
		// not found a case where this is beneficial.
		last := &b.Compactions[len(b.Compactions)-1]
		imax := uint64(last.Cutoff)
		for _, blk := range b.IndexBlocks {
			if idx := uint64(blk.index.index); imax < idx {
				imax = idx
			}
		}
		b.IndexType, b.IndexSize = getIntType(imax)
	}

	// maxValue returns the largest of max and all values stored in the subtree
	// rooted at n. A nil n leaves max unchanged.
	func maxValue(n *node, max uint64) uint64 {
		if n == nil {
			return max
		}
		for i := range n.children {
			max = maxValue(n.children[i], max)
		}
		for i := range n.values {
			if v := n.values[i]; v > max {
				max = v
			}
		}
		return max
	}

	// getIntType returns the name and the size in bytes of the smallest unsigned
	// integer type that can represent v.
	func getIntType(v uint64) (string, int) {
		if v < 1<<8 {
			return "uint8", 1
		}
		if v < 1<<16 {
			return "uint16", 2
		}
		if v < 1<<32 {
			return "uint32", 4
		}
		return "uint64", 8
	}

	const (
	// blockSize is the number of entries in a trie block: node.children
	// always has this length, as does the values slice of a leaf node.
	blockSize = 64

	// Subtract two blocks to offset 0x80, the first continuation byte.
	blockOffset = 2

	// Subtract three blocks to offset 0xC0, the first non-ASCII starter.
	rootBlockOffset = 3
	)

	// crcTable is the CRC-64 table (ISO polynomial) used to hash blocks so that
	// identical blocks can be detected and shared.
	var crcTable = crc64.MakeTable(crc64.ISO)

	// buildTrie allocates the blocks for trie t within b, sharing blocks already
	// allocated for earlier tries where their hashes match. It fills in
	// t.ASCIIIndex, t.Checksum and t.StarterIndex and returns the checksum.
	//
	// Both the receiver and t must be pointers: the function appends to
	// b.ValueBlocks and writes fields of t, and those updates would be lost on
	// copies if either were passed by value.
	func (b *builder) buildTrie(t *Trie) uint64 {
		n := t.root

		// Get the ASCII offset. For the first trie, the ASCII block will be at
		// position 0.
		hasher := crc64.New(crcTable)
		binary.Write(hasher, binary.BigEndian, n.values)
		hash := hasher.Sum64()

		v, ok := b.asciiBlockIdx[hash]
		if !ok {
			v = len(b.ValueBlocks)
			b.asciiBlockIdx[hash] = v

			// The 128 root values are stored as two consecutive blocks.
			b.ValueBlocks = append(b.ValueBlocks, n.values[:blockSize], n.values[blockSize:])
			if v == 0 {
				// Add the zero block at position 2 so that it will be assigned a
				// zero reference in the lookup blocks.
				// TODO: always do this? This would allow us to remove a check from
				// the trie lookup, but at the expense of extra space. Analyze
				// performance for unicode/norm.
				b.ValueBlocks = append(b.ValueBlocks, make([]uint64, blockSize))
			}
		}
		t.ASCIIIndex = v

		// Compute remaining offsets.
		t.Checksum = b.computeOffsets(n, true)
		// We already subtracted the normal blockOffset from the index. Subtract the
		// difference for starter bytes.
		t.StarterIndex = n.index.index - (rootBlockOffset - blockOffset)
		return t.Checksum
	}

	func (b builder) computeOffsets(n node, root bool) uint64 {
	// For the first trie, the root lookup block will be at position 3, which is
	// the offset for UTF-8 non-ASCII starter bytes.
	first := len(b.IndexBlocks) == rootBlockOffset
	if first {
	b.IndexBlocks = append(b.IndexBlocks, n)
	}

	// We special-case the cases where all values recursively are 0. This allows
	// for the use of a zero block to which all such values can be directed.
	hash := uint64(0)
	if n.children != nil \|\| n.values != nil {
	hasher := crc64.New(crcTable)
	for _, c := range n.children {
	var v uint64
	if c != nil {
	v = b.computeOffsets(c, false)
	}
	binary.Write(hasher, binary.BigEndian, v)
	}
	binary.Write(hasher, binary.BigEndian, n.values)
	hash = hasher.Sum64()
	}

	if first {
	b.indexBlockIdx[hash] = rootBlockOffset - blockOffset
	}

	// Compacters don't apply to internal nodes.
	if n.children != nil {
	v, ok := b.indexBlockIdx[hash]
	if !ok {
	v = len(b.IndexBlocks) - blockOffset
	b.IndexBlocks = append(b.IndexBlocks, n)
	b.indexBlockIdx[hash] = v
	}
	n.index = nodeIndex{0, v}
	} else {
	h, ok := b.valueBlockIdx[hash]
	if !ok {
	bestI, bestSize := 0, blockSize*b.ValueSize
	for i, c := range b.Compactions[1:] {
	if sz, ok := c.c.Size(n.values); ok && bestSize > sz {
	bestI, bestSize = i+1, sz
	}
	}
	c := &b.Compactions[bestI]
	c.totalSize += bestSize
	v := c.c.Store(n.values)
	if c.maxHandle < v {
	c.maxHandle = v
	}
	h = nodeIndex{bestI, int(v)}
	b.valueBlockIdx[hash] = h
	}
	n.index = h
	}
	return hash
	}