| // Copyright 2014 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| // Package triegen implements a code generator for a trie for associating |
| // unsigned integer values with UTF-8 encoded runes. |
| // |
| // Many of the go.text packages use tries for storing per-rune information. A |
| // trie is especially useful if many of the runes have the same value. If this |
| // is the case, many blocks can be expected to be shared allowing for |
| // information on many runes to be stored in little space. |
| // |
| // As most of the lookups are done directly on []byte slices, the tries use the |
| // UTF-8 bytes directly for the lookup. This saves a conversion from UTF-8 to |
| // runes and contributes a little bit to better performance. It also naturally |
| // provides a fast path for ASCII. |
| // |
| // Space is also an issue. There are many code points defined in Unicode and as |
| // a result tables can get quite large. So every byte counts. The triegen |
| // package automatically chooses the smallest integer values to represent the |
| // tables. Compacters allow further compression of the trie by allowing for |
| // alternative representations of individual trie blocks. |
| // |
| // triegen allows generating multiple tries as a single structure. This is |
| // useful when, for example, one wants to generate tries for several languages |
| // that have a lot of values in common. Some existing libraries for |
| // internationalization store all per-language data as a dynamically loadable |
| // chunk. The go.text packages are designed with the assumption that the user |
| // typically wants to compile in support for all supported languages, in line |
| // with the approach common to Go to create a single standalone binary. The |
| // multi-root trie approach can give significant storage savings in this |
| // scenario. |
| // |
| // triegen generates both tables and code. The code is optimized to use the |
| // automatically chosen data types. The following code is generated for a Trie |
| // or multiple Tries named "foo": |
| // |
//   - type fooTrie
//     The trie type.
//
//   - func newFooTrie(x int) *fooTrie
//     Trie constructor, where x is the index of the trie passed to Gen.
//
//   - func (t *fooTrie) lookup(s []byte) (v uintX, sz int)
//     The lookup method, where uintX is automatically chosen.
//
//   - func lookupString, lookupUnsafe and lookupStringUnsafe
//     Variants of the above.
//
//   - var fooValues and fooIndex and any tables generated by Compacters.
//     The core trie data.
//
//   - var fooTrieHandles
//     Indexes of starter blocks in case of multiple trie roots.
| // |
| // It is recommended that users test the generated trie by checking the returned |
| // value for every rune. Such exhaustive tests are possible as the number of |
| // runes in Unicode is limited. |
| package triegen // import "golang.org/x/text/internal/triegen" |
| |
// TODO: Arguably, the internally optimized data types would not have to be
// exposed in the generated API. We could also investigate not generating the
// code, but using it through a package. We would have to investigate the
// performance impact of making such a change, though. For packages like
// unicode/norm, small changes like this could tank performance.
| |
| import ( |
| "encoding/binary" |
| "fmt" |
| "hash/crc64" |
| "io" |
| "log" |
| "unicode/utf8" |
| ) |
| |
| // builder builds a set of tries for associating values with runes. The set of |
| // tries can share common index and value blocks. |
| type builder struct { |
| Name string |
| |
| // ValueType is the type of the trie values looked up. |
| ValueType string |
| |
| // ValueSize is the byte size of the ValueType. |
| ValueSize int |
| |
| // IndexType is the type of trie index values used for all UTF-8 bytes of |
| // a rune except the last one. |
| IndexType string |
| |
| // IndexSize is the byte size of the IndexType. |
| IndexSize int |
| |
| // SourceType is used when generating the lookup functions. If the user |
| // requests StringSupport, all lookup functions will be generated for |
| // string input as well. |
| SourceType string |
| |
| Trie []*Trie |
| |
| IndexBlocks []*node |
| ValueBlocks [][]uint64 |
| Compactions []compaction |
| Checksum uint64 |
| |
| ASCIIBlock string |
| StarterBlock string |
| |
| indexBlockIdx map[uint64]int |
| valueBlockIdx map[uint64]nodeIndex |
| asciiBlockIdx map[uint64]int |
| |
| // Stats are used to fill out the template. |
| Stats struct { |
| NValueEntries int |
| NValueBytes int |
| NIndexEntries int |
| NIndexBytes int |
| NHandleBytes int |
| } |
| |
| err error |
| } |
| |
// A nodeIndex identifies where a node is stored: which compaction holds it
// and at which position inside that compaction. Internal nodes always use
// compaction 0.
type nodeIndex struct {
	// compaction is the position of the storing compaction in
	// builder.Compactions.
	compaction int

	// index is the position of the node within that compaction.
	index int
}
| |
| // compaction keeps track of stats used for the compaction. |
| type compaction struct { |
| c Compacter |
| blocks []*node |
| maxHandle uint32 |
| totalSize int |
| |
| // Used by template-based generator and thus exported. |
| Cutoff uint32 |
| Offset uint32 |
| Handler string |
| } |
| |
| func (b *builder) setError(err error) { |
| if b.err == nil { |
| b.err = err |
| } |
| } |
| |
| // An Option can be passed to Gen. |
| type Option func(b *builder) error |
| |
| // Compact configures the trie generator to use the given Compacter. |
| func Compact(c Compacter) Option { |
| return func(b *builder) error { |
| b.Compactions = append(b.Compactions, compaction{ |
| c: c, |
| Handler: c.Handler() + "(n, b)"}) |
| return nil |
| } |
| } |
| |
| // Gen writes Go code for a shared trie lookup structure to w for the given |
| // Tries. The generated trie type will be called nameTrie. newNameTrie(x) will |
| // return the *nameTrie for tries[x]. A value can be looked up by using one of |
| // the various lookup methods defined on nameTrie. It returns the table size of |
| // the generated trie. |
| func Gen(w io.Writer, name string, tries []*Trie, opts ...Option) (sz int, err error) { |
| // The index contains two dummy blocks, followed by the zero block. The zero |
| // block is at offset 0x80, so that the offset for the zero block for |
| // continuation bytes is 0. |
| b := &builder{ |
| Name: name, |
| Trie: tries, |
| IndexBlocks: []*node{{}, {}, {}}, |
| Compactions: []compaction{{ |
| Handler: name + "Values[n<<6+uint32(b)]", |
| }}, |
| // The 0 key in indexBlockIdx and valueBlockIdx is the hash of the zero |
| // block. |
| indexBlockIdx: map[uint64]int{0: 0}, |
| valueBlockIdx: map[uint64]nodeIndex{0: {}}, |
| asciiBlockIdx: map[uint64]int{}, |
| } |
| b.Compactions[0].c = (*simpleCompacter)(b) |
| |
| for _, f := range opts { |
| if err := f(b); err != nil { |
| return 0, err |
| } |
| } |
| b.build() |
| if b.err != nil { |
| return 0, b.err |
| } |
| if err = b.print(w); err != nil { |
| return 0, err |
| } |
| return b.Size(), nil |
| } |
| |
| // A Trie represents a single root node of a trie. A builder may build several |
| // overlapping tries at once. |
| type Trie struct { |
| root *node |
| |
| hiddenTrie |
| } |
| |
// hiddenTrie groups the per-trie values that must be visible to the template
// generator but should stay out of the API documentation.
type hiddenTrie struct {
	// Name is the name given to NewTrie.
	Name string

	// Checksum is the hash computed for the trie when it is built.
	Checksum uint64

	// ASCIIIndex is the position of the trie's ASCII blocks among the value
	// blocks; StarterIndex is the index of its root lookup block, adjusted
	// for starter bytes.
	ASCIIIndex, StarterIndex int
}
| |
| // NewTrie returns a new trie root. |
| func NewTrie(name string) *Trie { |
| return &Trie{ |
| &node{ |
| children: make([]*node, blockSize), |
| values: make([]uint64, utf8.RuneSelf), |
| }, |
| hiddenTrie{Name: name}, |
| } |
| } |
| |
| // Gen is a convenience wrapper around the Gen func passing t as the only trie |
| // and uses the name passed to NewTrie. It returns the size of the generated |
| // tables. |
| func (t *Trie) Gen(w io.Writer, opts ...Option) (sz int, err error) { |
| return Gen(w, t.Name, []*Trie{t}, opts...) |
| } |
| |
| // node is a node of the intermediate trie structure. |
| type node struct { |
| // children holds this node's children. It is always of length 64. |
| // A child node may be nil. |
| children []*node |
| |
| // values contains the values of this node. If it is non-nil, this node is |
| // either a root or leaf node: |
| // For root nodes, len(values) == 128 and it maps the bytes in [0x00, 0x7F]. |
| // For leaf nodes, len(values) == 64 and it maps the bytes in [0x80, 0xBF]. |
| values []uint64 |
| |
| index nodeIndex |
| } |
| |
| // Insert associates value with the given rune. Insert will panic if a non-zero |
| // value is passed for an invalid rune. |
| func (t *Trie) Insert(r rune, value uint64) { |
| if value == 0 { |
| return |
| } |
| s := string(r) |
| if []rune(s)[0] != r && value != 0 { |
| // Note: The UCD tables will always assign what amounts to a zero value |
| // to a surrogate. Allowing a zero value for an illegal rune allows |
| // users to iterate over [0..MaxRune] without having to explicitly |
| // exclude surrogates, which would be tedious. |
| panic(fmt.Sprintf("triegen: non-zero value for invalid rune %U", r)) |
| } |
| if len(s) == 1 { |
| // It is a root node value (ASCII). |
| t.root.values[s[0]] = value |
| return |
| } |
| |
| n := t.root |
| for ; len(s) > 1; s = s[1:] { |
| if n.children == nil { |
| n.children = make([]*node, blockSize) |
| } |
| p := s[0] % blockSize |
| c := n.children[p] |
| if c == nil { |
| c = &node{} |
| n.children[p] = c |
| } |
| if len(s) > 2 && c.values != nil { |
| log.Fatalf("triegen: insert(%U): found internal node with values", r) |
| } |
| n = c |
| } |
| if n.values == nil { |
| n.values = make([]uint64, blockSize) |
| } |
| if n.children != nil { |
| log.Fatalf("triegen: insert(%U): found leaf node that also has child nodes", r) |
| } |
| n.values[s[0]-0x80] = value |
| } |
| |
| // Size returns the number of bytes the generated trie will take to store. It |
| // needs to be exported as it is used in the templates. |
| func (b *builder) Size() int { |
| // Index blocks. |
| sz := len(b.IndexBlocks) * blockSize * b.IndexSize |
| |
| // Skip the first compaction, which represents the normal value blocks, as |
| // its totalSize does not account for the ASCII blocks, which are managed |
| // separately. |
| sz += len(b.ValueBlocks) * blockSize * b.ValueSize |
| for _, c := range b.Compactions[1:] { |
| sz += c.totalSize |
| } |
| |
| // TODO: this computation does not account for the fixed overhead of a using |
| // a compaction, either code or data. As for data, though, the typical |
| // overhead of data is in the order of bytes (2 bytes for cases). Further, |
| // the savings of using a compaction should anyway be substantial for it to |
| // be worth it. |
| |
| // For multi-root tries, we also need to account for the handles. |
| if len(b.Trie) > 1 { |
| sz += 2 * b.IndexSize * len(b.Trie) |
| } |
| return sz |
| } |
| |
| func (b *builder) build() { |
| // Compute the sizes of the values. |
| var vmax uint64 |
| for _, t := range b.Trie { |
| vmax = maxValue(t.root, vmax) |
| } |
| b.ValueType, b.ValueSize = getIntType(vmax) |
| |
| // Compute all block allocations. |
| // TODO: first compute the ASCII blocks for all tries and then the other |
| // nodes. ASCII blocks are more restricted in placement, as they require two |
| // blocks to be placed consecutively. Processing them first may improve |
| // sharing (at least one zero block can be expected to be saved.) |
| for _, t := range b.Trie { |
| b.Checksum += b.buildTrie(t) |
| } |
| |
| // Compute the offsets for all the Compacters. |
| offset := uint32(0) |
| for i := range b.Compactions { |
| c := &b.Compactions[i] |
| c.Offset = offset |
| offset += c.maxHandle + 1 |
| c.Cutoff = offset |
| } |
| |
| // Compute the sizes of indexes. |
| // TODO: different byte positions could have different sizes. So far we have |
| // not found a case where this is beneficial. |
| imax := uint64(b.Compactions[len(b.Compactions)-1].Cutoff) |
| for _, ib := range b.IndexBlocks { |
| if x := uint64(ib.index.index); x > imax { |
| imax = x |
| } |
| } |
| b.IndexType, b.IndexSize = getIntType(imax) |
| } |
| |
| func maxValue(n *node, max uint64) uint64 { |
| if n == nil { |
| return max |
| } |
| for _, c := range n.children { |
| max = maxValue(c, max) |
| } |
| for _, v := range n.values { |
| if max < v { |
| max = v |
| } |
| } |
| return max |
| } |
| |
// getIntType returns the name and the size in bytes of the smallest unsigned
// integer type that can represent v.
func getIntType(v uint64) (string, int) {
	if v>>8 == 0 {
		return "uint8", 1
	}
	if v>>16 == 0 {
		return "uint16", 2
	}
	if v>>32 == 0 {
		return "uint32", 4
	}
	return "uint64", 8
}
| |
const (
	// blockSize is the number of entries in a single trie block.
	blockSize = 64

	// blockOffset is the number of blocks subtracted from an index so that
	// it is relative to 0x80, the first continuation byte.
	blockOffset = 2

	// rootBlockOffset is the number of blocks subtracted so that an index is
	// relative to 0xC0, the first non-ASCII starter byte.
	rootBlockOffset = 3
)

// crcTable is the CRC-64 table (ISO polynomial) used for hashing blocks.
var crcTable = crc64.MakeTable(crc64.ISO)
| |
| func (b *builder) buildTrie(t *Trie) uint64 { |
| n := t.root |
| |
| // Get the ASCII offset. For the first trie, the ASCII block will be at |
| // position 0. |
| hasher := crc64.New(crcTable) |
| binary.Write(hasher, binary.BigEndian, n.values) |
| hash := hasher.Sum64() |
| |
| v, ok := b.asciiBlockIdx[hash] |
| if !ok { |
| v = len(b.ValueBlocks) |
| b.asciiBlockIdx[hash] = v |
| |
| b.ValueBlocks = append(b.ValueBlocks, n.values[:blockSize], n.values[blockSize:]) |
| if v == 0 { |
| // Add the zero block at position 2 so that it will be assigned a |
| // zero reference in the lookup blocks. |
| // TODO: always do this? This would allow us to remove a check from |
| // the trie lookup, but at the expense of extra space. Analyze |
| // performance for unicode/norm. |
| b.ValueBlocks = append(b.ValueBlocks, make([]uint64, blockSize)) |
| } |
| } |
| t.ASCIIIndex = v |
| |
| // Compute remaining offsets. |
| t.Checksum = b.computeOffsets(n, true) |
| // We already subtracted the normal blockOffset from the index. Subtract the |
| // difference for starter bytes. |
| t.StarterIndex = n.index.index - (rootBlockOffset - blockOffset) |
| return t.Checksum |
| } |
| |
| func (b *builder) computeOffsets(n *node, root bool) uint64 { |
| // For the first trie, the root lookup block will be at position 3, which is |
| // the offset for UTF-8 non-ASCII starter bytes. |
| first := len(b.IndexBlocks) == rootBlockOffset |
| if first { |
| b.IndexBlocks = append(b.IndexBlocks, n) |
| } |
| |
| // We special-case the cases where all values recursively are 0. This allows |
| // for the use of a zero block to which all such values can be directed. |
| hash := uint64(0) |
| if n.children != nil || n.values != nil { |
| hasher := crc64.New(crcTable) |
| for _, c := range n.children { |
| var v uint64 |
| if c != nil { |
| v = b.computeOffsets(c, false) |
| } |
| binary.Write(hasher, binary.BigEndian, v) |
| } |
| binary.Write(hasher, binary.BigEndian, n.values) |
| hash = hasher.Sum64() |
| } |
| |
| if first { |
| b.indexBlockIdx[hash] = rootBlockOffset - blockOffset |
| } |
| |
| // Compacters don't apply to internal nodes. |
| if n.children != nil { |
| v, ok := b.indexBlockIdx[hash] |
| if !ok { |
| v = len(b.IndexBlocks) - blockOffset |
| b.IndexBlocks = append(b.IndexBlocks, n) |
| b.indexBlockIdx[hash] = v |
| } |
| n.index = nodeIndex{0, v} |
| } else { |
| h, ok := b.valueBlockIdx[hash] |
| if !ok { |
| bestI, bestSize := 0, blockSize*b.ValueSize |
| for i, c := range b.Compactions[1:] { |
| if sz, ok := c.c.Size(n.values); ok && bestSize > sz { |
| bestI, bestSize = i+1, sz |
| } |
| } |
| c := &b.Compactions[bestI] |
| c.totalSize += bestSize |
| v := c.c.Store(n.values) |
| if c.maxHandle < v { |
| c.maxHandle = v |
| } |
| h = nodeIndex{bestI, int(v)} |
| b.valueBlockIdx[hash] = h |
| } |
| n.index = h |
| } |
| return hash |
| } |