// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build ignore
// +build ignore

// This program generates the trie for idna operations. The idna package
// requires the lookup of various properties and mappings for each rune. The
// table generated by this generator combines several of the most frequently
// used of these into a single trie so that they can be accessed with a single
// lookup.
package main

import (
"fmt"
"io"
"log"
"unicode"
"unicode/utf8"
"golang.org/x/text/internal/gen"
"golang.org/x/text/internal/triegen"
"golang.org/x/text/internal/ucd"
"golang.org/x/text/unicode/bidi"
)
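
// main generates the IDNA tables and copies the shared gen_trieval.go and
// gen_common.go sources into the idna package as trieval.go and common_test.go.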
func main() {
gen.Init()
genTables()
gen.Repackage("gen_trieval.go", "trieval.go", "idna")
gen.Repackage("gen_common.go", "common_test.go", "idna")
}
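
// runes accumulates the per-rune property bits collected from the Unicode
// data files before they are folded into the trie entries.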
var runes = map[rune]info{}
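
// genTables parses the Unicode data files and the IDNA mapping table, builds
// the trie, and writes the resulting tables to tables.go.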
func genTables() {
t := triegen.NewTrie("idna")
ucd.Parse(gen.OpenUCDFile("DerivedNormalizationProps.txt"), func(p *ucd.Parser) {
r := p.Rune(0)
if p.String(1) == "NFC_QC" { // p.String(2) is "N" or "M"
runes[r] = mayNeedNorm
}
})
ucd.Parse(gen.OpenUCDFile("UnicodeData.txt"), func(p *ucd.Parser) {
r := p.Rune(0)
const cccVirama = 9
if p.Int(ucd.CanonicalCombiningClass) == cccVirama {
runes[p.Rune(0)] = viramaModifier
}
switch {
case unicode.In(r, unicode.Mark):
runes[r] |= modifier | mayNeedNorm
}
// TODO: by using UnicodeData.txt we don't mark undefined codepoints
// that are earmarked as RTL properly. However, an undefined cp will
// always fail, so there is no need to store this info.
switch p, _ := bidi.LookupRune(r); p.Class() {
case bidi.R, bidi.AL, bidi.AN:
if x := runes[r]; x != 0 && x != mayNeedNorm {
log.Fatalf("%U: rune both modifier and RTL letter/number", r)
}
runes[r] = rtl
}
})
ucd.Parse(gen.OpenUCDFile("extracted/DerivedJoiningType.txt"), func(p *ucd.Parser) {
switch v := p.String(1); v {
case "L", "D", "T", "R":
runes[p.Rune(0)] |= joinType[v] << joinShift
}
})
ucd.Parse(gen.OpenUnicodeFile("idna", "", "IdnaMappingTable.txt"), func(p *ucd.Parser) {
r := p.Rune(0)
// The mappings table explicitly defines surrogates as invalid.
if !utf8.ValidRune(r) {
return
}
cat := catFromEntry(p)
isMapped := cat == mapped || cat == disallowedSTD3Mapped || cat == deviation
if !isMapped {
// Only include additional category information for non-mapped
// runes. The additional information is only used after mapping and
// the bits would clash with mapping information.
// TODO: it would be possible to inline this data and avoid
// additional lookups. This is quite tedious, though, so let's first
// see if we need this.
cat |= category(runes[r])
}
s := string(p.Runes(2))
if s != "" && !isMapped {
log.Fatalf("%U: Mapping with non-mapping category %d", r, cat)
}
t.Insert(r, uint64(makeEntry(r, s))+uint64(cat))
})
w := gen.NewCodeWriter()
defer w.WriteVersionedGoFile("tables.go", "idna")
gen.WriteUnicodeVersion(w)
w.WriteVar("mappings", string(mappings))
w.WriteVar("xorData", string(xorData))
sz, err := t.Gen(w, triegen.Compact(&normCompacter{}))
if err != nil {
log.Fatal(err)
}
w.Size += sz
}

var (
// mappings contains replacement strings for mapped runes, each prefixed
// with a byte containing the length of the following string.
mappings = []byte{}
mapCache = map[string]int{}
// xorData is like mappings, except that it contains XOR data.
// We split these two tables so that we don't get an overflow.
xorData = []byte{}
xorCache = map[string]int{}
)

// makeEntry creates the trie entry for r mapping to the string mapped. If the
// mapped value differs in length from r's UTF-8 encoding, the replacement is
// stored in the mappings table; otherwise a per-byte XOR mask is stored,
// inline in the entry when it fits in a single byte.
func makeEntry(r rune, mapped string) info {
orig := string(r)
if len(orig) != len(mapped) {
// Store the mapped value as is in the mappings table.
index := len(mappings)
if x, ok := mapCache[mapped]; ok {
index = x
} else {
mapCache[mapped] = index
mappings = append(mappings, byte(len(mapped)))
mappings = append(mappings, mapped...)
}
return info(index) << indexShift
}
// Create per-byte XOR mask.
var b []byte
for i := 0; i < len(orig); i++ {
b = append(b, orig[i]^mapped[i])
}
// Remove leading 0 bytes, but keep at least one byte.
for ; len(b) > 1 && b[0] == 0; b = b[1:] {
}
if len(b) == 1 {
return xorBit | inlineXOR | info(b[0])<<indexShift
}
mapped = string(b)
// Store the XOR data as is in the xorData table.
index := len(xorData)
if x, ok := xorCache[mapped]; ok {
index = x
} else {
xorCache[mapped] = index
xorData = append(xorData, byte(len(mapped)))
xorData = append(xorData, mapped...)
}
return xorBit | info(index)<<indexShift
}

// The following code implements a triegen.Compacter that was originally
// designed for normalization. The IDNA table has some similarities with the
// norm table. Using this compacter, together with the XOR pattern approach,
// reduces the table size by roughly 100K. It can probably be compressed further
// by also including elements of the compacter used by cases, but for now it is
// good enough.
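
// maxSparseEntries is the maximum number of value ranges for which a block is
// still stored as a sparse block.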
const maxSparseEntries = 16
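
// normCompacter encodes trie blocks as sparse lists of value ranges, which
// suits the largely empty blocks of the IDNA trie.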
type normCompacter struct {
sparseBlocks [][]uint64
sparseOffset []uint16
sparseCount int
}
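
// mostFrequentStride returns the stride (difference between consecutive
// values) that occurs most often in a, preferring the smallest stride on a
// tie. Negative strides and strides starting from a zero value are ignored.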
func mostFrequentStride(a []uint64) int {
counts := make(map[int]int)
var v int
for _, x := range a {
if stride := int(x) - v; v != 0 && stride >= 0 {
counts[stride]++
}
v = int(x)
}
var maxs, maxc int
for stride, cnt := range counts {
if cnt > maxc || (cnt == maxc && stride < maxs) {
maxs, maxc = stride, cnt
}
}
return maxs
}
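
// countSparseEntries returns the number of value ranges needed to represent a
// sparsely: each nonzero value that does not continue the most frequent
// stride starts a new range.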
func countSparseEntries(a []uint64) int {
stride := mostFrequentStride(a)
var v, count int
for _, tv := range a {
if int(tv)-v != stride {
if tv != 0 {
count++
}
}
v = int(tv)
}
return count
}
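
// Size is part of the triegen.Compacter interface. It reports the encoded
// size of block v and whether v is sparse enough (at most maxSparseEntries
// ranges) for this compacter to handle it.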
func (c *normCompacter) Size(v []uint64) (sz int, ok bool) {
if n := countSparseEntries(v); n <= maxSparseEntries {
return (n+1)*4 + 2, true
}
return 0, false
}
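
// Store records v as a sparse block and returns its block index, which the
// generated lookup code combines with idnaSparseOffset to locate the block's
// value ranges.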
func (c *normCompacter) Store(v []uint64) uint32 {
h := uint32(len(c.sparseOffset))
c.sparseBlocks = append(c.sparseBlocks, v)
c.sparseOffset = append(c.sparseOffset, uint16(c.sparseCount))
c.sparseCount += countSparseEntries(v) + 1
return h
}
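
// Handler returns the name of the function the generated trie code calls to
// look up values in blocks stored by this compacter.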
func (c *normCompacter) Handler() string {
return "idnaSparse.lookup"
}
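
// Print writes the idnaSparseOffset and idnaSparseValues tables that back the
// sparse blocks to w, returning the first write error encountered.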
func (c *normCompacter) Print(w io.Writer) (retErr error) {
p := func(f string, x ...interface{}) {
if _, err := fmt.Fprintf(w, f, x...); retErr == nil && err != nil {
retErr = err
}
}
ls := len(c.sparseBlocks)
p("// idnaSparseOffset: %d entries, %d bytes\n", ls, ls*2)
p("var idnaSparseOffset = %#v\n\n", c.sparseOffset)
ns := c.sparseCount
p("// idnaSparseValues: %d entries, %d bytes\n", ns, ns*4)
p("var idnaSparseValues = [%d]valueRange {", ns)
for i, b := range c.sparseBlocks {
p("\n// Block %#x, offset %#x", i, c.sparseOffset[i])
var v int
stride := mostFrequentStride(b)
n := countSparseEntries(b)
p("\n{value:%#04x,lo:%#02x},", stride, uint8(n))
for i, nv := range b {
if int(nv)-v != stride {
if v != 0 {
p(",hi:%#02x},", 0x80+i-1)
}
if nv != 0 {
p("\n{value:%#04x,lo:%#02x", nv, 0x80+i)
}
}
v = int(nv)
}
if v != 0 {
p(",hi:%#02x},", 0x80+len(b)-1)
}
}
p("\n}\n\n")
return
}