| // Copyright 2015 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| // Unicode table generator. |
| // Data read from the web. |
| |
| // +build ignore |
| |
| package main |
| |
| import ( |
| "flag" |
| "log" |
| "unicode" |
| "unicode/utf8" |
| |
| "golang.org/x/text/internal/gen" |
| "golang.org/x/text/internal/triegen" |
| "golang.org/x/text/internal/ucd" |
| "golang.org/x/text/unicode/norm" |
| "golang.org/x/text/unicode/rangetable" |
| ) |
| |
| var outputFile = flag.String("output", "tables.go", "output file for generated tables; default tables.go") |
| |
| var assigned, disallowedRunes *unicode.RangeTable |
| |
| var runeCategory = map[rune]category{} |
| |
| var overrides = map[category]category{ |
| viramaModifier: viramaJoinT, |
| greek: greekJoinT, |
| hebrew: hebrewJoinT, |
| } |
| |
| func setCategory(r rune, cat category) { |
| if c, ok := runeCategory[r]; ok { |
| if override, ok := overrides[c]; cat == joiningT && ok { |
| cat = override |
| } else { |
| log.Fatalf("%U: multiple categories for rune (%v and %v)", r, c, cat) |
| } |
| } |
| runeCategory[r] = cat |
| } |
| |
| func init() { |
| if numCategories > 1<<propShift { |
| log.Fatalf("Number of categories is %d; may at most be %d", numCategories, 1<<propShift) |
| } |
| } |
| |
| func main() { |
| gen.Init() |
| |
| // Load data |
| runes := []rune{} |
| // PrecisIgnorableProperties: https://tools.ietf.org/html/rfc7564#section-9.13 |
| ucd.Parse(gen.OpenUCDFile("DerivedCoreProperties.txt"), func(p *ucd.Parser) { |
| if p.String(1) == "Default_Ignorable_Code_Point" { |
| runes = append(runes, p.Rune(0)) |
| } |
| }) |
| ucd.Parse(gen.OpenUCDFile("PropList.txt"), func(p *ucd.Parser) { |
| switch p.String(1) { |
| case "Noncharacter_Code_Point": |
| runes = append(runes, p.Rune(0)) |
| } |
| }) |
| // OldHangulJamo: https://tools.ietf.org/html/rfc5892#section-2.9 |
| ucd.Parse(gen.OpenUCDFile("HangulSyllableType.txt"), func(p *ucd.Parser) { |
| switch p.String(1) { |
| case "L", "V", "T": |
| runes = append(runes, p.Rune(0)) |
| } |
| }) |
| |
| disallowedRunes = rangetable.New(runes...) |
| assigned = rangetable.Assigned(unicode.Version) |
| |
| // Load category data. |
| runeCategory['l'] = latinSmallL |
| ucd.Parse(gen.OpenUCDFile("UnicodeData.txt"), func(p *ucd.Parser) { |
| const cccVirama = 9 |
| if p.Int(ucd.CanonicalCombiningClass) == cccVirama { |
| setCategory(p.Rune(0), viramaModifier) |
| } |
| }) |
| ucd.Parse(gen.OpenUCDFile("Scripts.txt"), func(p *ucd.Parser) { |
| switch p.String(1) { |
| case "Greek": |
| setCategory(p.Rune(0), greek) |
| case "Hebrew": |
| setCategory(p.Rune(0), hebrew) |
| case "Hiragana", "Katakana", "Han": |
| setCategory(p.Rune(0), japanese) |
| } |
| }) |
| |
| // Set the rule categories associated with exceptions. This overrides any |
| // previously set categories. The original categories are manually |
| // reintroduced in the categoryTransitions table. |
| for r, e := range exceptions { |
| if e.cat != 0 { |
| runeCategory[r] = e.cat |
| } |
| } |
| cat := map[string]category{ |
| "L": joiningL, |
| "D": joiningD, |
| "T": joiningT, |
| |
| "R": joiningR, |
| } |
| ucd.Parse(gen.OpenUCDFile("extracted/DerivedJoiningType.txt"), func(p *ucd.Parser) { |
| switch v := p.String(1); v { |
| case "L", "D", "T", "R": |
| setCategory(p.Rune(0), cat[v]) |
| } |
| }) |
| |
| writeTables() |
| gen.Repackage("gen_trieval.go", "trieval.go", "precis") |
| } |
| |
// exception is the value type of the exceptions map: the treatment given to
// a single rune listed there.
type exception struct {
	prop property // property used for the rune instead of a derived one
	cat  category // rule category for the rune; zero when it has none
}
| |
| func init() { |
| // Programmatically add the Arabic and Indic digits to the exceptions map. |
| // See comment in the exceptions map below why these are marked disallowed. |
| for i := rune(0); i <= 9; i++ { |
| exceptions[0x0660+i] = exception{ |
| prop: disallowed, |
| cat: arabicIndicDigit, |
| } |
| exceptions[0x06F0+i] = exception{ |
| prop: disallowed, |
| cat: extendedArabicIndicDigit, |
| } |
| } |
| } |
| |
| // The Exceptions class as defined in RFC 5892 |
| // https://tools.ietf.org/html/rfc5892#section-2.6 |
| var exceptions = map[rune]exception{ |
| 0x00DF: {prop: pValid}, |
| 0x03C2: {prop: pValid}, |
| 0x06FD: {prop: pValid}, |
| 0x06FE: {prop: pValid}, |
| 0x0F0B: {prop: pValid}, |
| 0x3007: {prop: pValid}, |
| |
| // ContextO|J rules are marked as disallowed, taking a "guilty until proven |
| // innocent" approach. The main reason for this is that the check for |
| // whether a context rule should be applied can be moved to the logic for |
| // handing disallowed runes, taken it off the common path. The exception to |
| // this rule is for katakanaMiddleDot, as the rule logic is handled without |
| // using a rule function. |
| |
| // ContextJ (Join control) |
| 0x200C: {prop: disallowed, cat: zeroWidthNonJoiner}, |
| 0x200D: {prop: disallowed, cat: zeroWidthJoiner}, |
| |
| // ContextO |
| 0x00B7: {prop: disallowed, cat: middleDot}, |
| 0x0375: {prop: disallowed, cat: greekLowerNumeralSign}, |
| 0x05F3: {prop: disallowed, cat: hebrewPreceding}, // punctuation Geresh |
| 0x05F4: {prop: disallowed, cat: hebrewPreceding}, // punctuation Gershayim |
| 0x30FB: {prop: pValid, cat: katakanaMiddleDot}, |
| |
| // These are officially ContextO, but the implementation does not require |
| // special treatment of these, so we simply mark them as valid. |
| 0x0660: {prop: pValid}, |
| 0x0661: {prop: pValid}, |
| 0x0662: {prop: pValid}, |
| 0x0663: {prop: pValid}, |
| 0x0664: {prop: pValid}, |
| 0x0665: {prop: pValid}, |
| 0x0666: {prop: pValid}, |
| 0x0667: {prop: pValid}, |
| 0x0668: {prop: pValid}, |
| 0x0669: {prop: pValid}, |
| 0x06F0: {prop: pValid}, |
| 0x06F1: {prop: pValid}, |
| 0x06F2: {prop: pValid}, |
| 0x06F3: {prop: pValid}, |
| 0x06F4: {prop: pValid}, |
| 0x06F5: {prop: pValid}, |
| 0x06F6: {prop: pValid}, |
| 0x06F7: {prop: pValid}, |
| 0x06F8: {prop: pValid}, |
| 0x06F9: {prop: pValid}, |
| |
| 0x0640: {prop: disallowed}, |
| 0x07FA: {prop: disallowed}, |
| 0x302E: {prop: disallowed}, |
| 0x302F: {prop: disallowed}, |
| 0x3031: {prop: disallowed}, |
| 0x3032: {prop: disallowed}, |
| 0x3033: {prop: disallowed}, |
| 0x3034: {prop: disallowed}, |
| 0x3035: {prop: disallowed}, |
| 0x303B: {prop: disallowed}, |
| } |
| |
// isLetterDigits reports whether r belongs to the LetterDigits class of
// https://tools.ietf.org/html/rfc5892#section-2.1, that is, whether its
// general category is one of Ll, Lu, Lo, Nd, Lm, Mn or Mc.
func isLetterDigits(r rune) bool {
	letterDigits := [...]*unicode.RangeTable{
		// Letters
		unicode.Ll, unicode.Lu, unicode.Lm, unicode.Lo,
		// Marks
		unicode.Mn, unicode.Mc,
		// Digits
		unicode.Nd,
	}
	for _, table := range letterDigits {
		if unicode.Is(table, r) {
			return true
		}
	}
	return false
}
| |
// isIdDisAndFreePVal reports whether r belongs to one of the RFC 7564 classes
// OtherLetterDigits, Spaces, Symbols or Punctuation, judged by its general
// category alone.
func isIdDisAndFreePVal(r rune) bool {
	switch {
	// OtherLetterDigits: https://tools.ietf.org/html/rfc7564#section-9.18
	// r in {Lt, Nl, No, Me}
	case unicode.In(r, unicode.Lt, unicode.Nl, unicode.No, unicode.Me):
		return true

	// Spaces: https://tools.ietf.org/html/rfc7564#section-9.14
	// r in {Zs}
	case unicode.Is(unicode.Zs, r):
		return true

	// Symbols: https://tools.ietf.org/html/rfc7564#section-9.15
	// r in {Sm, Sc, Sk, So}
	case unicode.In(r, unicode.Sm, unicode.Sc, unicode.Sk, unicode.So):
		return true

	// Punctuation: https://tools.ietf.org/html/rfc7564#section-9.16
	// r in {Pc, Pd, Ps, Pe, Pi, Pf, Po}
	case unicode.In(r, unicode.Pc, unicode.Pd, unicode.Ps, unicode.Pe, unicode.Pi, unicode.Pf, unicode.Po):
		return true
	}
	return false
}
| |
| // HasCompat: https://tools.ietf.org/html/rfc7564#section-9.17 |
| func hasCompat(r rune) bool { |
| return !norm.NFKC.IsNormalString(string(r)) |
| } |
| |
// From https://tools.ietf.org/html/rfc7564#section-8:
| // |
| // If .cp. .in. Exceptions Then Exceptions(cp); |
| // Else If .cp. .in. BackwardCompatible Then BackwardCompatible(cp); |
| // Else If .cp. .in. Unassigned Then UNASSIGNED; |
| // Else If .cp. .in. ASCII7 Then PVALID; |
| // Else If .cp. .in. JoinControl Then CONTEXTJ; |
| // Else If .cp. .in. OldHangulJamo Then DISALLOWED; |
| // Else If .cp. .in. PrecisIgnorableProperties Then DISALLOWED; |
| // Else If .cp. .in. Controls Then DISALLOWED; |
| // Else If .cp. .in. HasCompat Then ID_DIS or FREE_PVAL; |
| // Else If .cp. .in. LetterDigits Then PVALID; |
| // Else If .cp. .in. OtherLetterDigits Then ID_DIS or FREE_PVAL; |
| // Else If .cp. .in. Spaces Then ID_DIS or FREE_PVAL; |
| // Else If .cp. .in. Symbols Then ID_DIS or FREE_PVAL; |
| // Else If .cp. .in. Punctuation Then ID_DIS or FREE_PVAL; |
| // Else DISALLOWED; |
| |
| func writeTables() { |
| propTrie := triegen.NewTrie("derivedProperties") |
| w := gen.NewCodeWriter() |
| defer w.WriteVersionedGoFile(*outputFile, "precis") |
| gen.WriteUnicodeVersion(w) |
| |
| // Iterate over all the runes... |
| for i := rune(0); i < unicode.MaxRune; i++ { |
| r := rune(i) |
| |
| if !utf8.ValidRune(r) { |
| continue |
| } |
| |
| e, ok := exceptions[i] |
| p := e.prop |
| switch { |
| case ok: |
| case !unicode.In(r, assigned): |
| p = unassigned |
| case r >= 0x0021 && r <= 0x007e: // Is ASCII 7 |
| p = pValid |
| case unicode.In(r, disallowedRunes, unicode.Cc): |
| p = disallowed |
| case hasCompat(r): |
| p = idDisOrFreePVal |
| case isLetterDigits(r): |
| p = pValid |
| case isIdDisAndFreePVal(r): |
| p = idDisOrFreePVal |
| default: |
| p = disallowed |
| } |
| cat := runeCategory[r] |
| // Don't set category for runes that are disallowed. |
| if p == disallowed { |
| cat = exceptions[r].cat |
| } |
| propTrie.Insert(r, uint64(p)|uint64(cat)) |
| } |
| sz, err := propTrie.Gen(w) |
| if err != nil { |
| log.Fatal(err) |
| } |
| w.Size += sz |
| } |