| // Copyright 2012 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| // The trie in this file is used to associate the first full character |
| // in a UTF-8 string to a collation element. |
| // All but the last byte in a UTF-8 byte sequence are |
| // used to look up offsets in the index table to be used for the next byte. |
| // The last byte is used to index into a table of collation elements. |
| // This file contains the code for the generation of the trie. |
| |
| package build |
| |
| import ( |
| "fmt" |
| "hash/fnv" |
| "io" |
| "reflect" |
| ) |
| |
| const ( |
| blockSize = 64 |
| blockOffset = 2 // Subtract 2 blocks to compensate for the 0x80 added to continuation bytes. |
| ) |
| |
| type trieHandle struct { |
| lookupStart uint16 // offset in table for first byte |
| valueStart uint16 // offset in table for first byte |
| } |
| |
| type trie struct { |
| index []uint16 |
| values []uint32 |
| } |
| |
| // trieNode is the intermediate trie structure used for generating a trie. |
| type trieNode struct { |
| index []*trieNode |
| value []uint32 |
| b byte |
| refValue uint16 |
| refIndex uint16 |
| } |
| |
| func newNode() *trieNode { |
| return &trieNode{ |
| index: make([]*trieNode, 64), |
| value: make([]uint32, 128), // root node size is 128 instead of 64 |
| } |
| } |
| |
| func (n *trieNode) isInternal() bool { |
| return n.value != nil |
| } |
| |
| func (n *trieNode) insert(r rune, value uint32) { |
| const maskx = 0x3F // mask out two most-significant bits |
| str := string(r) |
| if len(str) == 1 { |
| n.value[str[0]] = value |
| return |
| } |
| for i := 0; i < len(str)-1; i++ { |
| b := str[i] & maskx |
| if n.index == nil { |
| n.index = make([]*trieNode, blockSize) |
| } |
| nn := n.index[b] |
| if nn == nil { |
| nn = &trieNode{} |
| nn.b = b |
| n.index[b] = nn |
| } |
| n = nn |
| } |
| if n.value == nil { |
| n.value = make([]uint32, blockSize) |
| } |
| b := str[len(str)-1] & maskx |
| n.value[b] = value |
| } |
| |
| type trieBuilder struct { |
| t *trie |
| |
| roots []*trieHandle |
| |
| lookupBlocks []*trieNode |
| valueBlocks []*trieNode |
| |
| lookupBlockIdx map[uint32]*trieNode |
| valueBlockIdx map[uint32]*trieNode |
| } |
| |
| func newTrieBuilder() *trieBuilder { |
| index := &trieBuilder{} |
| index.lookupBlocks = make([]*trieNode, 0) |
| index.valueBlocks = make([]*trieNode, 0) |
| index.lookupBlockIdx = make(map[uint32]*trieNode) |
| index.valueBlockIdx = make(map[uint32]*trieNode) |
| // The third nil is the default null block. The other two blocks |
| // are used to guarantee an offset of at least 3 for each block. |
| index.lookupBlocks = append(index.lookupBlocks, nil, nil, nil) |
| index.t = &trie{} |
| return index |
| } |
| |
| func (b *trieBuilder) computeOffsets(n *trieNode) *trieNode { |
| hasher := fnv.New32() |
| if n.index != nil { |
| for i, nn := range n.index { |
| var vi, vv uint16 |
| if nn != nil { |
| nn = b.computeOffsets(nn) |
| n.index[i] = nn |
| vi = nn.refIndex |
| vv = nn.refValue |
| } |
| hasher.Write([]byte{byte(vi >> 8), byte(vi)}) |
| hasher.Write([]byte{byte(vv >> 8), byte(vv)}) |
| } |
| h := hasher.Sum32() |
| nn, ok := b.lookupBlockIdx[h] |
| if !ok { |
| n.refIndex = uint16(len(b.lookupBlocks)) - blockOffset |
| b.lookupBlocks = append(b.lookupBlocks, n) |
| b.lookupBlockIdx[h] = n |
| } else { |
| n = nn |
| } |
| } else { |
| for _, v := range n.value { |
| hasher.Write([]byte{byte(v >> 24), byte(v >> 16), byte(v >> 8), byte(v)}) |
| } |
| h := hasher.Sum32() |
| nn, ok := b.valueBlockIdx[h] |
| if !ok { |
| n.refValue = uint16(len(b.valueBlocks)) - blockOffset |
| n.refIndex = n.refValue |
| b.valueBlocks = append(b.valueBlocks, n) |
| b.valueBlockIdx[h] = n |
| } else { |
| n = nn |
| } |
| } |
| return n |
| } |
| |
| func (b *trieBuilder) addStartValueBlock(n *trieNode) uint16 { |
| hasher := fnv.New32() |
| for _, v := range n.value[:2*blockSize] { |
| hasher.Write([]byte{byte(v >> 24), byte(v >> 16), byte(v >> 8), byte(v)}) |
| } |
| h := hasher.Sum32() |
| nn, ok := b.valueBlockIdx[h] |
| if !ok { |
| n.refValue = uint16(len(b.valueBlocks)) |
| n.refIndex = n.refValue |
| b.valueBlocks = append(b.valueBlocks, n) |
| // Add a dummy block to accommodate the double block size. |
| b.valueBlocks = append(b.valueBlocks, nil) |
| b.valueBlockIdx[h] = n |
| } else { |
| n = nn |
| } |
| return n.refValue |
| } |
| |
| func genValueBlock(t *trie, n *trieNode) { |
| if n != nil { |
| for _, v := range n.value { |
| t.values = append(t.values, v) |
| } |
| } |
| } |
| |
| func genLookupBlock(t *trie, n *trieNode) { |
| for _, nn := range n.index { |
| v := uint16(0) |
| if nn != nil { |
| if n.index != nil { |
| v = nn.refIndex |
| } else { |
| v = nn.refValue |
| } |
| } |
| t.index = append(t.index, v) |
| } |
| } |
| |
| func (b *trieBuilder) addTrie(n *trieNode) *trieHandle { |
| h := &trieHandle{} |
| b.roots = append(b.roots, h) |
| h.valueStart = b.addStartValueBlock(n) |
| if len(b.roots) == 1 { |
| // We insert a null block after the first start value block. |
| // This ensures that continuation bytes UTF-8 sequences of length |
| // greater than 2 will automatically hit a null block if there |
| // was an undefined entry. |
| b.valueBlocks = append(b.valueBlocks, nil) |
| } |
| n = b.computeOffsets(n) |
| // Offset by one extra block as the first byte starts at 0xC0 instead of 0x80. |
| h.lookupStart = n.refIndex - 1 |
| return h |
| } |
| |
| // generate generates and returns the trie for n. |
| func (b *trieBuilder) generate() (t *trie, err error) { |
| t = b.t |
| if len(b.valueBlocks) >= 1<<16 { |
| return nil, fmt.Errorf("maximum number of value blocks exceeded (%d > %d)", len(b.valueBlocks), 1<<16) |
| } |
| if len(b.lookupBlocks) >= 1<<16 { |
| return nil, fmt.Errorf("maximum number of lookup blocks exceeded (%d > %d)", len(b.lookupBlocks), 1<<16) |
| } |
| genValueBlock(t, b.valueBlocks[0]) |
| genValueBlock(t, &trieNode{value: make([]uint32, 64)}) |
| for i := 2; i < len(b.valueBlocks); i++ { |
| genValueBlock(t, b.valueBlocks[i]) |
| } |
| n := &trieNode{index: make([]*trieNode, 64)} |
| genLookupBlock(t, n) |
| genLookupBlock(t, n) |
| genLookupBlock(t, n) |
| for i := 3; i < len(b.lookupBlocks); i++ { |
| genLookupBlock(t, b.lookupBlocks[i]) |
| } |
| return b.t, nil |
| } |
| |
| func (t *trie) printArrays(w io.Writer, name string) (n, size int, err error) { |
| p := func(f string, a ...interface{}) { |
| nn, e := fmt.Fprintf(w, f, a...) |
| n += nn |
| if err == nil { |
| err = e |
| } |
| } |
| nv := len(t.values) |
| p("// %sValues: %d entries, %d bytes\n", name, nv, nv*4) |
| p("// Block 2 is the null block.\n") |
| p("var %sValues = [%d]uint32 {", name, nv) |
| var printnewline bool |
| for i, v := range t.values { |
| if i%blockSize == 0 { |
| p("\n\t// Block %#x, offset %#x", i/blockSize, i) |
| } |
| if i%4 == 0 { |
| printnewline = true |
| } |
| if v != 0 { |
| if printnewline { |
| p("\n\t") |
| printnewline = false |
| } |
| p("%#04x:%#08x, ", i, v) |
| } |
| } |
| p("\n}\n\n") |
| ni := len(t.index) |
| p("// %sLookup: %d entries, %d bytes\n", name, ni, ni*2) |
| p("// Block 0 is the null block.\n") |
| p("var %sLookup = [%d]uint16 {", name, ni) |
| printnewline = false |
| for i, v := range t.index { |
| if i%blockSize == 0 { |
| p("\n\t// Block %#x, offset %#x", i/blockSize, i) |
| } |
| if i%8 == 0 { |
| printnewline = true |
| } |
| if v != 0 { |
| if printnewline { |
| p("\n\t") |
| printnewline = false |
| } |
| p("%#03x:%#02x, ", i, v) |
| } |
| } |
| p("\n}\n\n") |
| return n, nv*4 + ni*2, err |
| } |
| |
| func (t *trie) printStruct(w io.Writer, handle *trieHandle, name string) (n, sz int, err error) { |
| const msg = "trie{ %sLookup[%d:], %sValues[%d:], %sLookup[:], %sValues[:]}" |
| n, err = fmt.Fprintf(w, msg, name, handle.lookupStart*blockSize, name, handle.valueStart*blockSize, name, name) |
| sz += int(reflect.TypeOf(trie{}).Size()) |
| return |
| } |