// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build ignore
// +build ignore

// This program generates the trie for idna operations. The IDNA mapping and
// validation algorithm requires the lookup of various properties and mappings
// for each rune. The table generated by this generator combines several of the
// most frequently used of these into a single trie so that they can be
// accessed with a single lookup.
package main

import (
	"fmt"
	"io"
	"log"
	"unicode"
	"unicode/utf8"

	"golang.org/x/text/internal/gen"
	"golang.org/x/text/internal/triegen"
	"golang.org/x/text/internal/ucd"
	"golang.org/x/text/unicode/bidi"
)

func main() {
	gen.Init()
	genTables()
	gen.Repackage("gen_trieval.go", "trieval.go", "idna")
	gen.Repackage("gen_common.go", "common_test.go", "idna")
}

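// runes accumulates the per-rune property bits collected from the UCD files
// below; genTables merges these bits into the trie entries.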
var runes = map[rune]info{}

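// genTables parses the normalization, general Unicode, joining type, and IDNA
// mapping data files, combines the collected properties and mappings into a
// single trie, and writes the generated tables to tables.go.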
func genTables() {
	t := triegen.NewTrie("idna")

	ucd.Parse(gen.OpenUCDFile("DerivedNormalizationProps.txt"), func(p *ucd.Parser) {
		r := p.Rune(0)
		if p.String(1) == "NFC_QC" { // p.String(2) is "N" or "M"
			runes[r] = mayNeedNorm
		}
	})
	ucd.Parse(gen.OpenUCDFile("UnicodeData.txt"), func(p *ucd.Parser) {
		r := p.Rune(0)

		const cccVirama = 9
		if p.Int(ucd.CanonicalCombiningClass) == cccVirama {
			runes[r] = viramaModifier
		}
		switch {
		case unicode.In(r, unicode.Mark):
			runes[r] |= modifier | mayNeedNorm
		}
		// TODO: by using UnicodeData.txt we don't mark undefined codepoints
		// that are earmarked as RTL properly. However, an undefined cp will
		// always fail, so there is no need to store this info.
		switch p, _ := bidi.LookupRune(r); p.Class() {
		case bidi.R, bidi.AL, bidi.AN:
			if x := runes[r]; x != 0 && x != mayNeedNorm {
				log.Fatalf("%U: rune both modifier and RTL letter/number", r)
			}
			runes[r] = rtl
		}
	})

	ucd.Parse(gen.OpenUCDFile("extracted/DerivedJoiningType.txt"), func(p *ucd.Parser) {
		switch v := p.String(1); v {
		case "L", "D", "T", "R":
			runes[p.Rune(0)] |= joinType[v] << joinShift
		}
	})

	ucd.Parse(gen.OpenUnicodeFile("idna", "", "IdnaMappingTable.txt"), func(p *ucd.Parser) {
		r := p.Rune(0)

		// The mappings table explicitly defines surrogates as invalid.
		if !utf8.ValidRune(r) {
			return
		}

		cat := catFromEntry(p)
		isMapped := cat == mapped || cat == disallowedSTD3Mapped || cat == deviation
		if !isMapped {
			// Only include additional category information for non-mapped
			// runes. The additional information is only used after mapping and
			// the bits would clash with mapping information.
			// TODO: it would be possible to inline this data and avoid
			// additional lookups. This is quite tedious, though, so let's first
			// see if we need this.
			cat |= category(runes[r])
		}

		s := string(p.Runes(2))
		if s != "" && !isMapped {
			log.Fatalf("%U: Mapping with non-mapping category %d", r, cat)
		}
		t.Insert(r, uint64(makeEntry(r, s))+uint64(cat))
	})

	w := gen.NewCodeWriter()
	defer w.WriteVersionedGoFile("tables.go", "idna")

	gen.WriteUnicodeVersion(w)

	w.WriteVar("mappings", string(mappings))
	w.WriteVar("mappingIndex", mappingIndex)
	w.WriteVar("xorData", string(xorData))

	sz, err := t.Gen(w, triegen.Compact(&normCompacter{}))
	if err != nil {
		log.Fatal(err)
	}
	w.Size += sz
}

var (
	// mappings contains replacement strings for mapped runes.
	mappings = []byte{}

	// mappingIndex contains offsets into mappings marking the start of each
	// mapping; the next entry in mappingIndex points just past the end of the
	// string.
	mappingIndex = []uint16{0}
	mapCache     = map[string]int{}

	// xorData is like mappings, except that it contains byte-wise XOR data.
	// We split these two tables so that the indexes stored in a trie entry do
	// not overflow the bits reserved for them.
	xorData  = []byte{}
	xorCache = map[string]int{}
)
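
// lookupMapping is an illustrative sketch of how an index produced by
// makeEntry for a length-changing mapping resolves to its replacement string:
// consecutive entries of mappingIndex delimit the mapping within mappings.
// It is not used by the generator and only documents the encoding; the idna
// package performs the corresponding lookup at run time.
func lookupMapping(index int) string {
	return string(mappings[mappingIndex[index]:mappingIndex[index+1]])
}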

// makeEntry creates a trie entry.
func makeEntry(r rune, mapped string) info {
	orig := string(r)

	if len(orig) != len(mapped) {
		// Store the mapped value as is in the mappings table.
		index := len(mappingIndex) - 1
		if x, ok := mapCache[mapped]; ok {
			index = x
		} else {
			mapCache[mapped] = index
			mappings = append(mappings, mapped...)
			mappingIndex = append(mappingIndex, uint16(len(mappings)))
		}
		return info(index) << indexShift
	}

	// Create per-byte XOR mask.
	var b []byte
	for i := 0; i < len(orig); i++ {
		b = append(b, orig[i]^mapped[i])
	}

	// Remove leading 0 bytes, but keep at least one byte.
	for ; len(b) > 1 && b[0] == 0; b = b[1:] {
	}

	if len(b) == 1 {
		return xorBit | inlineXOR | info(b[0])<<indexShift
	}
	mapped = string(b)

	// Store the XOR data, prefixed with its length, in the xorData table.
	index := len(xorData)
	if x, ok := xorCache[mapped]; ok {
		index = x
	} else {
		xorCache[mapped] = index
		xorData = append(xorData, byte(len(mapped)))
		xorData = append(xorData, mapped...)
	}
	return xorBit | info(index)<<indexShift
}
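
// applyXORMapping is an illustrative counterpart to the XOR branch of
// makeEntry above: given the original rune's UTF-8 encoding and an index into
// xorData, it reconstructs the mapped string by XOR-ing the stored,
// length-prefixed mask into the trailing bytes of the original (makeEntry
// strips leading zero bytes, so the mask is right-aligned). It is not used by
// the generator and only documents the encoding.
func applyXORMapping(orig string, index int) string {
	n := int(xorData[index]) // length of the XOR mask
	mask := xorData[index+1 : index+1+n]
	b := []byte(orig)
	off := len(b) - n // the mask applies to the last n bytes
	for i, m := range mask {
		b[off+i] ^= m
	}
	return string(b)
}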

// The following code implements a triegen.Compacter that was originally
// designed for normalization. The IDNA table has some similarities with the
// norm table. Using this compacter, together with the XOR pattern approach,
// reduces the table size by roughly 100K. It can probably be compressed further
// by also including elements of the compacter used by cases, but for now it is
// good enough.

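// maxSparseEntries is the maximum number of value ranges for which the sparse
// block encoding is still considered worthwhile; see Size below.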
const maxSparseEntries = 16

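// normCompacter is a triegen.Compacter that encodes trie blocks as sparse
// lists of valueRange entries, reusing the scheme originally designed for the
// normalization tables.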
type normCompacter struct {
	sparseBlocks [][]uint64 // trie blocks handed to Store, written out by Print
	sparseOffset []uint16   // start of each block in idnaSparseValues
	sparseCount  int        // valueRange entries so far, including block headers
}

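// mostFrequentStride returns the difference between consecutive values that
// occurs most frequently in a, considering only non-negative strides that
// follow a non-zero value. Ties are broken in favor of the smaller stride.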
func mostFrequentStride(a []uint64) int {
	counts := make(map[int]int)
	var v int
	for _, x := range a {
		if stride := int(x) - v; v != 0 && stride >= 0 {
			counts[stride]++
		}
		v = int(x)
	}
	var maxs, maxc int
	for stride, cnt := range counts {
		if cnt > maxc || (cnt == maxc && stride < maxs) {
			maxs, maxc = stride, cnt
		}
	}
	return maxs
}

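// countSparseEntries returns the number of non-zero values in a that do not
// follow their predecessor by the block's most frequent stride, which is the
// number of valueRange entries needed to encode the block sparsely.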
func countSparseEntries(a []uint64) int {
	stride := mostFrequentStride(a)
	var v, count int
	for _, tv := range a {
		if int(tv)-v != stride {
			if tv != 0 {
				count++
			}
		}
		v = int(tv)
	}
	return count
}

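// Size implements triegen.Compacter. It reports the cost of encoding block v
// sparsely: four bytes per valueRange (including one header range) plus a
// two-byte offset. Blocks needing more than maxSparseEntries ranges are
// rejected.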
func (c *normCompacter) Size(v []uint64) (sz int, ok bool) {
	if n := countSparseEntries(v); n <= maxSparseEntries {
		return (n+1)*4 + 2, true
	}
	return 0, false
}

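// Store implements triegen.Compacter. It records block v for printing and
// returns a handle that indexes the block's entry in idnaSparseOffset.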
func (c *normCompacter) Store(v []uint64) uint32 {
	h := uint32(len(c.sparseOffset))
	c.sparseBlocks = append(c.sparseBlocks, v)
	c.sparseOffset = append(c.sparseOffset, uint16(c.sparseCount))
	c.sparseCount += countSparseEntries(v) + 1
	return h
}

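// Handler implements triegen.Compacter. It returns the expression the
// generated trie uses to look up values in the sparse blocks.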
func (c *normCompacter) Handler() string {
	return "idnaSparse.lookup"
}

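// Print implements triegen.Compacter. It writes the idnaSparseOffset and
// idnaSparseValues tables. Each block starts with a header valueRange holding
// the block's stride and number of ranges, followed by one valueRange per run
// of values with that stride.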
func (c *normCompacter) Print(w io.Writer) (retErr error) {
	p := func(f string, x ...interface{}) {
		if _, err := fmt.Fprintf(w, f, x...); retErr == nil && err != nil {
			retErr = err
		}
	}

	ls := len(c.sparseBlocks)
	p("// idnaSparseOffset: %d entries, %d bytes\n", ls, ls*2)
	p("var idnaSparseOffset = %#v\n\n", c.sparseOffset)

	ns := c.sparseCount
	p("// idnaSparseValues: %d entries, %d bytes\n", ns, ns*4)
	p("var idnaSparseValues = [%d]valueRange {", ns)
	for i, b := range c.sparseBlocks {
		p("\n// Block %#x, offset %#x", i, c.sparseOffset[i])
		var v int
		stride := mostFrequentStride(b)
		n := countSparseEntries(b)
		p("\n{value:%#04x,lo:%#02x},", stride, uint8(n))
		for i, nv := range b {
			if int(nv)-v != stride {
				if v != 0 {
					p(",hi:%#02x},", 0x80+i-1)
				}
				if nv != 0 {
					p("\n{value:%#04x,lo:%#02x", nv, 0x80+i)
				}
			}
			v = int(nv)
		}
		if v != 0 {
			p(",hi:%#02x},", 0x80+len(b)-1)
		}
	}
	p("\n}\n\n")
	return
}