| // Copyright 2015 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| // +build ignore |
| |
| // This program generates the trie for width operations. The generated table |
| // includes width category information as well as the normalization mappings. |
| package main |
| |
| import ( |
| "bytes" |
| "fmt" |
| "io" |
| "log" |
| "math" |
| "unicode/utf8" |
| |
| "golang.org/x/text/internal/gen" |
| "golang.org/x/text/internal/triegen" |
| ) |
| |
| // See gen_common.go for flags. |
| |
| func main() { |
| gen.Init() |
| genTables() |
| genTests() |
| gen.Repackage("gen_trieval.go", "trieval.go", "width") |
| gen.Repackage("gen_common.go", "common_test.go", "width") |
| } |
| |
| func genTables() { |
| t := triegen.NewTrie("width") |
| // fold and inverse mappings. See mapComment for a description of the format |
| // of each entry. Add dummy value to make an index of 0 mean no mapping. |
| inverse := [][4]byte{{}} |
| mapping := map[[4]byte]int{[4]byte{}: 0} |
| |
| getWidthData(func(r rune, tag elem, alt rune) { |
| idx := 0 |
| if alt != 0 { |
| var buf [4]byte |
| buf[0] = byte(utf8.EncodeRune(buf[1:], alt)) |
| s := string(r) |
| buf[buf[0]] ^= s[len(s)-1] |
| var ok bool |
| if idx, ok = mapping[buf]; !ok { |
| idx = len(mapping) |
| if idx > math.MaxUint8 { |
| log.Fatalf("Index %d does not fit in a byte.", idx) |
| } |
| mapping[buf] = idx |
| inverse = append(inverse, buf) |
| } |
| } |
| t.Insert(r, uint64(tag|elem(idx))) |
| }) |
| |
| w := &bytes.Buffer{} |
| gen.WriteUnicodeVersion(w) |
| |
| sz, err := t.Gen(w) |
| if err != nil { |
| log.Fatal(err) |
| } |
| |
| sz += writeMappings(w, inverse) |
| |
| fmt.Fprintf(w, "// Total table size %d bytes (%dKiB)\n", sz, sz/1024) |
| |
| gen.WriteVersionedGoFile(*outputFile, "width", w.Bytes()) |
| } |
| |
// inverseDataComment is the comment block that writeMappings emits directly
// in front of the generated inverseData table. It documents the layout of the
// 4-byte entries and walks through an example using the fullwidth Latin
// capitals U+FF21 and U+FF22.
const inverseDataComment = `
// inverseData contains 4-byte entries of the following format:
// <length> <modified UTF-8-encoded rune> <0 padding>
// The last byte of the UTF-8-encoded rune is xor-ed with the last byte of the
// UTF-8 encoding of the original rune. Mappings often have the following
// pattern:
// Ａ -> A (U+FF21 -> U+0041)
// Ｂ -> B (U+FF22 -> U+0042)
// ...
// By xor-ing the last byte the same entry can be shared by many mappings. This
// reduces the total number of distinct entries by about two thirds.
// The resulting entry for the aforementioned mappings is
// { 0x01, 0xE0, 0x00, 0x00 }
// Using this entry to map U+FF21 (UTF-8 [EF BC A1]), we get
// E0 ^ A1 = 41.
// Similarly, for U+FF22 (UTF-8 [EF BC A2]), we get
// E0 ^ A2 = 42.
// Note that because of the xor-ing, the byte sequence stored in the entry is
// not valid UTF-8.`
| |
| func writeMappings(w io.Writer, data [][4]byte) int { |
| fmt.Fprintln(w, inverseDataComment) |
| fmt.Fprintf(w, "var inverseData = [%d][4]byte{\n", len(data)) |
| for _, x := range data { |
| fmt.Fprintf(w, "{ 0x%02x, 0x%02x, 0x%02x, 0x%02x },\n", x[0], x[1], x[2], x[3]) |
| } |
| fmt.Fprintln(w, "}") |
| return len(data) * 4 |
| } |
| |
| func genTests() { |
| w := &bytes.Buffer{} |
| fmt.Fprintf(w, "\nvar mapRunes = map[rune]struct{r rune; e elem}{\n") |
| getWidthData(func(r rune, tag elem, alt rune) { |
| if alt != 0 { |
| fmt.Fprintf(w, "\t0x%X: {0x%X, 0x%X},\n", r, alt, tag) |
| } |
| }) |
| fmt.Fprintln(w, "}") |
| gen.WriteGoFile("runes_test.go", "width", w.Bytes()) |
| } |