secure/precis/gen.go - text - Git at Google

 // Copyright 2015 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

 // Unicode table generator.
 // Data read from the web.

 // +build ignore

 package main

 import (
 	"flag"
 	"log"
 	"unicode"
 	"unicode/utf8"

 	"golang.org/x/text/internal/gen"
 	"golang.org/x/text/internal/triegen"
 	"golang.org/x/text/internal/ucd"
 	"golang.org/x/text/unicode/norm"
 	"golang.org/x/text/unicode/rangetable"
 )

 // Package-level state shared by main, setCategory and writeTables.
 var (
 	// outputFile is the path of the Go file that receives the generated tables.
 	outputFile = flag.String("output", "tables.go", "output file for generated tables; default tables.go")

 	// assigned and disallowedRunes are filled in by main before writeTables
 	// consults them.
 	assigned        *unicode.RangeTable
 	disallowedRunes *unicode.RangeTable

 	// runeCategory holds the category recorded for each rune so far.
 	runeCategory = make(map[rune]category)

 	// overrides maps a category that was already recorded for a rune to the
 	// combined category used when that same rune is later reported as
 	// joiningT (see setCategory).
 	overrides = map[category]category{
 		viramaModifier: viramaJoinT,
 		greek:          greekJoinT,
 		hebrew:         hebrewJoinT,
 	}
 )

 // setCategory records cat as the category of r.
 //
 // A rune may receive a second category only when that second category is
 // joiningT and the category already recorded has an entry in overrides; the
 // override value is then stored instead. Any other repeated assignment
 // aborts the generator.
 func setCategory(r rune, cat category) {
 	prev, seen := runeCategory[r]
 	if !seen {
 		runeCategory[r] = cat
 		return
 	}

 	combined, canCombine := overrides[prev]
 	if cat != joiningT || !canCombine {
 		log.Fatalf("%U: multiple categories for rune (%v and %v)", r, prev, cat)
 	}
 	runeCategory[r] = combined
 }

 // init aborts the generator when there are more categories than
 // 1<<propShift.
 func init() {
 	if numCategories <= 1<<propShift {
 		return
 	}
 	log.Fatalf("Number of categories is %d; may at most be %d", numCategories, 1<<propShift)
 }

 // main reads the Unicode Character Database files, fills in the
 // package-level range tables and the rune-to-category map, and then writes
 // the generated tables.
 func main() {
 	gen.Init()

 	// Load data: gather every rune whose property value (field 1) in the
 	// given UCD file matches one of props.
 	var banned []rune
 	collect := func(file string, props ...string) {
 		ucd.Parse(gen.OpenUCDFile(file), func(p *ucd.Parser) {
 			value := p.String(1)
 			for _, want := range props {
 				if value == want {
 					banned = append(banned, p.Rune(0))
 					return
 				}
 			}
 		})
 	}
 	// PrecisIgnorableProperties: https://tools.ietf.org/html/rfc7564#section-9.13
 	collect("DerivedCoreProperties.txt", "Default_Ignorable_Code_Point")
 	collect("PropList.txt", "Noncharacter_Code_Point")
 	// OldHangulJamo: https://tools.ietf.org/html/rfc5892#section-2.9
 	collect("HangulSyllableType.txt", "L", "V", "T")

 	disallowedRunes = rangetable.New(banned...)
 	assigned = rangetable.Assigned(unicode.Version)

 	// Load category data. The order of the steps below matters: setCategory
 	// only tolerates a second assignment when it is joiningT.
 	runeCategory['l'] = latinSmallL

 	const cccVirama = 9
 	ucd.Parse(gen.OpenUCDFile("UnicodeData.txt"), func(p *ucd.Parser) {
 		if p.Int(ucd.CanonicalCombiningClass) != cccVirama {
 			return
 		}
 		setCategory(p.Rune(0), viramaModifier)
 	})

 	scriptCategories := map[string]category{
 		"Greek":    greek,
 		"Hebrew":   hebrew,
 		"Hiragana": japanese,
 		"Katakana": japanese,
 		"Han":      japanese,
 	}
 	ucd.Parse(gen.OpenUCDFile("Scripts.txt"), func(p *ucd.Parser) {
 		if c, ok := scriptCategories[p.String(1)]; ok {
 			setCategory(p.Rune(0), c)
 		}
 	})

 	// Set the rule categories associated with exceptions. This overrides any
 	// previously set categories. The original categories are manually
 	// reintroduced in the categoryTransitions table.
 	for r, e := range exceptions {
 		if e.cat == 0 {
 			continue
 		}
 		runeCategory[r] = e.cat
 	}

 	joiningCategories := map[string]category{
 		"L": joiningL,
 		"D": joiningD,
 		"T": joiningT,
 		"R": joiningR,
 	}
 	ucd.Parse(gen.OpenUCDFile("extracted/DerivedJoiningType.txt"), func(p *ucd.Parser) {
 		if c, ok := joiningCategories[p.String(1)]; ok {
 			setCategory(p.Rune(0), c)
 		}
 	})

 	writeTables()
 	gen.Repackage("gen_trieval.go", "trieval.go", "precis")
 }

 // exception is one entry of the exceptions map: a manually assigned property
 // for a rune together with an optional rule category.
 type exception struct {
 	// prop is the property writeTables uses for the rune instead of deriving
 	// one.
 	prop property
 	// cat is the rule category for the rune. main copies it into runeCategory
 	// only when it is non-zero.
 	cat  category
 }

 // init programmatically adds the digits U+0660..U+0669 and U+06F0..U+06F9
 // to the exceptions map, replacing any entries the map literal already has
 // for those runes. See the ContextO comment in the exceptions map for why
 // these are marked disallowed.
 func init() {
 	const (
 		arabicIndicZero         = rune(0x0660)
 		extendedArabicIndicZero = rune(0x06F0)
 	)
 	for digit := rune(0); digit < 10; digit++ {
 		exceptions[arabicIndicZero+digit] = exception{prop: disallowed, cat: arabicIndicDigit}
 		exceptions[extendedArabicIndicZero+digit] = exception{prop: disallowed, cat: extendedArabicIndicDigit}
 	}
 }

 // exceptions is the Exceptions class as defined in RFC 5892
 // https://tools.ietf.org/html/rfc5892#section-2.6
 //
 // writeTables uses the prop of an entry verbatim instead of deriving a
 // property for the rune.
 var exceptions = map[rune]exception{
 	0x00DF: {prop: pValid},
 	0x03C2: {prop: pValid},
 	0x06FD: {prop: pValid},
 	0x06FE: {prop: pValid},
 	0x0F0B: {prop: pValid},
 	0x3007: {prop: pValid},

 	// ContextO|J rules are marked as disallowed, taking a "guilty until proven
 	// innocent" approach. The main reason for this is that the check for
 	// whether a context rule should be applied can be moved to the logic for
 	// handling disallowed runes, taking it off the common path. The exception
 	// to this rule is for katakanaMiddleDot, as the rule logic is handled
 	// without using a rule function.

 	// ContextJ (Join control)
 	0x200C: {prop: disallowed, cat: zeroWidthNonJoiner},
 	0x200D: {prop: disallowed, cat: zeroWidthJoiner},

 	// ContextO
 	0x00B7: {prop: disallowed, cat: middleDot},
 	0x0375: {prop: disallowed, cat: greekLowerNumeralSign},
 	0x05F3: {prop: disallowed, cat: hebrewPreceding}, // punctuation Geresh
 	0x05F4: {prop: disallowed, cat: hebrewPreceding}, // punctuation Gershayim
 	0x30FB: {prop: pValid, cat: katakanaMiddleDot},

 	// These digits are officially ContextO.
 	// NOTE: the init function in this file overwrites every entry in the two
 	// ranges below (U+0660..U+0669 and U+06F0..U+06F9) with prop disallowed
 	// and a digit category, so the pValid values listed here are not the ones
 	// that end up in the map at run time.
 	0x0660: {prop: pValid},
 	0x0661: {prop: pValid},
 	0x0662: {prop: pValid},
 	0x0663: {prop: pValid},
 	0x0664: {prop: pValid},
 	0x0665: {prop: pValid},
 	0x0666: {prop: pValid},
 	0x0667: {prop: pValid},
 	0x0668: {prop: pValid},
 	0x0669: {prop: pValid},
 	0x06F0: {prop: pValid},
 	0x06F1: {prop: pValid},
 	0x06F2: {prop: pValid},
 	0x06F3: {prop: pValid},
 	0x06F4: {prop: pValid},
 	0x06F5: {prop: pValid},
 	0x06F6: {prop: pValid},
 	0x06F7: {prop: pValid},
 	0x06F8: {prop: pValid},
 	0x06F9: {prop: pValid},

 	// Runes that are disallowed outright, without a rule category.
 	0x0640: {prop: disallowed},
 	0x07FA: {prop: disallowed},
 	0x302E: {prop: disallowed},
 	0x302F: {prop: disallowed},
 	0x3031: {prop: disallowed},
 	0x3032: {prop: disallowed},
 	0x3033: {prop: disallowed},
 	0x3034: {prop: disallowed},
 	0x3035: {prop: disallowed},
 	0x303B: {prop: disallowed},
 }

 // isLetterDigits reports whether r is in the LetterDigits class
 // (https://tools.ietf.org/html/rfc5892#section-2.1), that is, whether its
 // general category is one of {Ll, Lu, Lo, Nd, Lm, Mn, Mc}.
 func isLetterDigits(r rune) bool {
 	isLetter := unicode.In(r, unicode.Ll, unicode.Lu, unicode.Lm, unicode.Lo)
 	isMark := unicode.In(r, unicode.Mn, unicode.Mc)
 	isDigit := unicode.Is(unicode.Nd, r)
 	return isLetter || isMark || isDigit
 }

 // isIdDisAndFreePVal reports whether r belongs to one of the general-category
 // classes that the derived-property algorithm maps to "ID_DIS or FREE_PVAL":
 // OtherLetterDigits, Spaces, Symbols or Punctuation.
 func isIdDisAndFreePVal(r rune) bool {
 	switch {
 	// OtherLetterDigits: https://tools.ietf.org/html/rfc7564#section-9.18
 	// r in {Lt, Nl, No, Me}
 	case unicode.In(r, unicode.Lt, unicode.Nl, unicode.No, unicode.Me):
 		return true

 	// Spaces: https://tools.ietf.org/html/rfc7564#section-9.14
 	// r in {Zs}
 	case unicode.Is(unicode.Zs, r):
 		return true

 	// Symbols: https://tools.ietf.org/html/rfc7564#section-9.15
 	// r in {Sm, Sc, Sk, So}
 	case unicode.In(r, unicode.Sm, unicode.Sc, unicode.Sk, unicode.So):
 		return true

 	// Punctuation: https://tools.ietf.org/html/rfc7564#section-9.16
 	// r in {Pc, Pd, Ps, Pe, Pi, Pf, Po}
 	case unicode.In(r, unicode.Pc, unicode.Pd, unicode.Ps, unicode.Pe,
 		unicode.Pi, unicode.Pf, unicode.Po):
 		return true
 	}
 	return false
 }

 // hasCompat reports whether r is in the HasCompat class
 // (https://tools.ietf.org/html/rfc7564#section-9.17). It is decided by
 // checking that the one-rune string for r is not reported as NFKC-normal.
 func hasCompat(r rune) bool {
 	single := string(r)
 	alreadyNormal := norm.NFKC.IsNormalString(single)
 	return !alreadyNormal
 }

 // From https://tools.ietf.org/html/rfc7564#section-8:
 //
 // If .cp. .in. Exceptions Then Exceptions(cp);
 //   Else If .cp. .in. BackwardCompatible Then BackwardCompatible(cp);
 //   Else If .cp. .in. Unassigned Then UNASSIGNED;
 //   Else If .cp. .in. ASCII7 Then PVALID;
 //   Else If .cp. .in. JoinControl Then CONTEXTJ;
 //   Else If .cp. .in. OldHangulJamo Then DISALLOWED;
 //   Else If .cp. .in. PrecisIgnorableProperties Then DISALLOWED;
 //   Else If .cp. .in. Controls Then DISALLOWED;
 //   Else If .cp. .in. HasCompat Then ID_DIS or FREE_PVAL;
 //   Else If .cp. .in. LetterDigits Then PVALID;
 //   Else If .cp. .in. OtherLetterDigits Then ID_DIS or FREE_PVAL;
 //   Else If .cp. .in. Spaces Then ID_DIS or FREE_PVAL;
 //   Else If .cp. .in. Symbols Then ID_DIS or FREE_PVAL;
 //   Else If .cp. .in. Punctuation Then ID_DIS or FREE_PVAL;
 //   Else DISALLOWED;

 // writeTables computes a property and a category for every valid rune,
 // stores their combination in the "derivedProperties" trie, and emits the
 // trie to the output file.
 func writeTables() {
 	trie := triegen.NewTrie("derivedProperties")
 	w := gen.NewCodeWriter()
 	defer w.WriteVersionedGoFile(*outputFile, "precis")
 	gen.WriteUnicodeVersion(w)

 	// Iterate over all the runes...
 	// NOTE(review): the bound excludes unicode.MaxRune itself, so that rune is
 	// never inserted; confirm that leaving it at the trie default is intended.
 	for r := rune(0); r < unicode.MaxRune; r++ {
 		if !utf8.ValidRune(r) {
 			continue
 		}

 		// An entry in exceptions takes precedence over every derived rule.
 		exc, isException := exceptions[r]
 		prop := exc.prop
 		if !isException {
 			switch {
 			case !unicode.In(r, assigned):
 				prop = unassigned
 			case 0x0021 <= r && r <= 0x007e: // Is ASCII 7
 				prop = pValid
 			case unicode.In(r, disallowedRunes, unicode.Cc):
 				prop = disallowed
 			case hasCompat(r):
 				prop = idDisOrFreePVal
 			case isLetterDigits(r):
 				prop = pValid
 			case isIdDisAndFreePVal(r):
 				prop = idDisOrFreePVal
 			default:
 				prop = disallowed
 			}
 		}

 		// A disallowed rune keeps only the category of its exceptions entry
 		// (zero when it has none); other runes use the recorded category.
 		cat := runeCategory[r]
 		if prop == disallowed {
 			cat = exc.cat
 		}
 		trie.Insert(r, uint64(prop)|uint64(cat))
 	}

 	size, err := trie.Gen(w)
 	if err != nil {
 		log.Fatal(err)
 	}
 	w.Size += size
 }
	// Copyright 2015 The Go Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style
	// license that can be found in the LICENSE file.

	// Unicode table generator.
	// Data read from the web.

	// +build ignore

	package main

	import (
	"flag"
	"log"
	"unicode"
	"unicode/utf8"

	"golang.org/x/text/internal/gen"
	"golang.org/x/text/internal/triegen"
	"golang.org/x/text/internal/ucd"
	"golang.org/x/text/unicode/norm"
	"golang.org/x/text/unicode/rangetable"
	)

	// Package-level state shared by main, setCategory and writeTables.
	var (
		// outputFile is the path of the Go file that receives the generated tables.
		outputFile = flag.String("output", "tables.go", "output file for generated tables; default tables.go")

		// assigned and disallowedRunes are filled in by main before writeTables
		// consults them.
		assigned        *unicode.RangeTable
		disallowedRunes *unicode.RangeTable

		// runeCategory holds the category recorded for each rune so far.
		runeCategory = make(map[rune]category)

		// overrides maps a category that was already recorded for a rune to the
		// combined category used when that same rune is later reported as
		// joiningT (see setCategory).
		overrides = map[category]category{
			viramaModifier: viramaJoinT,
			greek:          greekJoinT,
			hebrew:         hebrewJoinT,
		}
	)

	// setCategory records cat as the category of r.
	//
	// A rune may receive a second category only when that second category is
	// joiningT and the category already recorded has an entry in overrides; the
	// override value is then stored instead. Any other repeated assignment
	// aborts the generator.
	func setCategory(r rune, cat category) {
		prev, seen := runeCategory[r]
		if !seen {
			runeCategory[r] = cat
			return
		}

		combined, canCombine := overrides[prev]
		if cat != joiningT || !canCombine {
			log.Fatalf("%U: multiple categories for rune (%v and %v)", r, prev, cat)
		}
		runeCategory[r] = combined
	}

	// init aborts the generator when there are more categories than
	// 1<<propShift.
	func init() {
		if numCategories <= 1<<propShift {
			return
		}
		log.Fatalf("Number of categories is %d; may at most be %d", numCategories, 1<<propShift)
	}

	// main reads the Unicode Character Database files, fills in the
	// package-level range tables and the rune-to-category map, and then writes
	// the generated tables.
	func main() {
		gen.Init()

		// Load data: gather every rune whose property value (field 1) in the
		// given UCD file matches one of props.
		var banned []rune
		collect := func(file string, props ...string) {
			ucd.Parse(gen.OpenUCDFile(file), func(p *ucd.Parser) {
				value := p.String(1)
				for _, want := range props {
					if value == want {
						banned = append(banned, p.Rune(0))
						return
					}
				}
			})
		}
		// PrecisIgnorableProperties: https://tools.ietf.org/html/rfc7564#section-9.13
		collect("DerivedCoreProperties.txt", "Default_Ignorable_Code_Point")
		collect("PropList.txt", "Noncharacter_Code_Point")
		// OldHangulJamo: https://tools.ietf.org/html/rfc5892#section-2.9
		collect("HangulSyllableType.txt", "L", "V", "T")

		disallowedRunes = rangetable.New(banned...)
		assigned = rangetable.Assigned(unicode.Version)

		// Load category data. The order of the steps below matters: setCategory
		// only tolerates a second assignment when it is joiningT.
		runeCategory['l'] = latinSmallL

		const cccVirama = 9
		ucd.Parse(gen.OpenUCDFile("UnicodeData.txt"), func(p *ucd.Parser) {
			if p.Int(ucd.CanonicalCombiningClass) != cccVirama {
				return
			}
			setCategory(p.Rune(0), viramaModifier)
		})

		scriptCategories := map[string]category{
			"Greek":    greek,
			"Hebrew":   hebrew,
			"Hiragana": japanese,
			"Katakana": japanese,
			"Han":      japanese,
		}
		ucd.Parse(gen.OpenUCDFile("Scripts.txt"), func(p *ucd.Parser) {
			if c, ok := scriptCategories[p.String(1)]; ok {
				setCategory(p.Rune(0), c)
			}
		})

		// Set the rule categories associated with exceptions. This overrides any
		// previously set categories. The original categories are manually
		// reintroduced in the categoryTransitions table.
		for r, e := range exceptions {
			if e.cat == 0 {
				continue
			}
			runeCategory[r] = e.cat
		}

		joiningCategories := map[string]category{
			"L": joiningL,
			"D": joiningD,
			"T": joiningT,
			"R": joiningR,
		}
		ucd.Parse(gen.OpenUCDFile("extracted/DerivedJoiningType.txt"), func(p *ucd.Parser) {
			if c, ok := joiningCategories[p.String(1)]; ok {
				setCategory(p.Rune(0), c)
			}
		})

		writeTables()
		gen.Repackage("gen_trieval.go", "trieval.go", "precis")
	}

	// exception is one entry of the exceptions map: a manually assigned property
	// for a rune together with an optional rule category.
	type exception struct {
	// prop is the property writeTables uses for the rune instead of deriving
	// one.
	prop property
	// cat is the rule category for the rune. main copies it into runeCategory
	// only when it is non-zero.
	cat category
	}

	// init programmatically adds the digits U+0660..U+0669 and U+06F0..U+06F9
	// to the exceptions map, replacing any entries the map literal already has
	// for those runes. See the ContextO comment in the exceptions map for why
	// these are marked disallowed.
	func init() {
		const (
			arabicIndicZero         = rune(0x0660)
			extendedArabicIndicZero = rune(0x06F0)
		)
		for digit := rune(0); digit < 10; digit++ {
			exceptions[arabicIndicZero+digit] = exception{prop: disallowed, cat: arabicIndicDigit}
			exceptions[extendedArabicIndicZero+digit] = exception{prop: disallowed, cat: extendedArabicIndicDigit}
		}
	}

	// exceptions is the Exceptions class as defined in RFC 5892
	// https://tools.ietf.org/html/rfc5892#section-2.6
	//
	// writeTables uses the prop of an entry verbatim instead of deriving a
	// property for the rune.
	var exceptions = map[rune]exception{
	0x00DF: {prop: pValid},
	0x03C2: {prop: pValid},
	0x06FD: {prop: pValid},
	0x06FE: {prop: pValid},
	0x0F0B: {prop: pValid},
	0x3007: {prop: pValid},

	// ContextO|J rules are marked as disallowed, taking a "guilty until proven
	// innocent" approach. The main reason for this is that the check for
	// whether a context rule should be applied can be moved to the logic for
	// handling disallowed runes, taking it off the common path. The exception
	// to this rule is for katakanaMiddleDot, as the rule logic is handled
	// without using a rule function.

	// ContextJ (Join control)
	0x200C: {prop: disallowed, cat: zeroWidthNonJoiner},
	0x200D: {prop: disallowed, cat: zeroWidthJoiner},

	// ContextO
	0x00B7: {prop: disallowed, cat: middleDot},
	0x0375: {prop: disallowed, cat: greekLowerNumeralSign},
	0x05F3: {prop: disallowed, cat: hebrewPreceding}, // punctuation Geresh
	0x05F4: {prop: disallowed, cat: hebrewPreceding}, // punctuation Gershayim
	0x30FB: {prop: pValid, cat: katakanaMiddleDot},

	// These digits are officially ContextO.
	// NOTE: the init function in this file overwrites every entry in the two
	// ranges below (U+0660..U+0669 and U+06F0..U+06F9) with prop disallowed
	// and a digit category, so the pValid values listed here are not the ones
	// that end up in the map at run time.
	0x0660: {prop: pValid},
	0x0661: {prop: pValid},
	0x0662: {prop: pValid},
	0x0663: {prop: pValid},
	0x0664: {prop: pValid},
	0x0665: {prop: pValid},
	0x0666: {prop: pValid},
	0x0667: {prop: pValid},
	0x0668: {prop: pValid},
	0x0669: {prop: pValid},
	0x06F0: {prop: pValid},
	0x06F1: {prop: pValid},
	0x06F2: {prop: pValid},
	0x06F3: {prop: pValid},
	0x06F4: {prop: pValid},
	0x06F5: {prop: pValid},
	0x06F6: {prop: pValid},
	0x06F7: {prop: pValid},
	0x06F8: {prop: pValid},
	0x06F9: {prop: pValid},

	// Runes that are disallowed outright, without a rule category.
	0x0640: {prop: disallowed},
	0x07FA: {prop: disallowed},
	0x302E: {prop: disallowed},
	0x302F: {prop: disallowed},
	0x3031: {prop: disallowed},
	0x3032: {prop: disallowed},
	0x3033: {prop: disallowed},
	0x3034: {prop: disallowed},
	0x3035: {prop: disallowed},
	0x303B: {prop: disallowed},
	}

	// isLetterDigits reports whether r is in the LetterDigits class
	// (https://tools.ietf.org/html/rfc5892#section-2.1), that is, whether its
	// general category is one of {Ll, Lu, Lo, Nd, Lm, Mn, Mc}.
	func isLetterDigits(r rune) bool {
		isLetter := unicode.In(r, unicode.Ll, unicode.Lu, unicode.Lm, unicode.Lo)
		isMark := unicode.In(r, unicode.Mn, unicode.Mc)
		isDigit := unicode.Is(unicode.Nd, r)
		return isLetter || isMark || isDigit
	}

	// isIdDisAndFreePVal reports whether r belongs to one of the general-category
	// classes that the derived-property algorithm maps to "ID_DIS or FREE_PVAL":
	// OtherLetterDigits, Spaces, Symbols or Punctuation.
	func isIdDisAndFreePVal(r rune) bool {
		switch {
		// OtherLetterDigits: https://tools.ietf.org/html/rfc7564#section-9.18
		// r in {Lt, Nl, No, Me}
		case unicode.In(r, unicode.Lt, unicode.Nl, unicode.No, unicode.Me):
			return true

		// Spaces: https://tools.ietf.org/html/rfc7564#section-9.14
		// r in {Zs}
		case unicode.Is(unicode.Zs, r):
			return true

		// Symbols: https://tools.ietf.org/html/rfc7564#section-9.15
		// r in {Sm, Sc, Sk, So}
		case unicode.In(r, unicode.Sm, unicode.Sc, unicode.Sk, unicode.So):
			return true

		// Punctuation: https://tools.ietf.org/html/rfc7564#section-9.16
		// r in {Pc, Pd, Ps, Pe, Pi, Pf, Po}
		case unicode.In(r, unicode.Pc, unicode.Pd, unicode.Ps, unicode.Pe,
			unicode.Pi, unicode.Pf, unicode.Po):
			return true
		}
		return false
	}

	// hasCompat reports whether r is in the HasCompat class
	// (https://tools.ietf.org/html/rfc7564#section-9.17). It is decided by
	// checking that the one-rune string for r is not reported as NFKC-normal.
	func hasCompat(r rune) bool {
		single := string(r)
		alreadyNormal := norm.NFKC.IsNormalString(single)
		return !alreadyNormal
	}

	// From https://tools.ietf.org/html/rfc7564#section-8:
	//
	// If .cp. .in. Exceptions Then Exceptions(cp);
	// Else If .cp. .in. BackwardCompatible Then BackwardCompatible(cp);
	// Else If .cp. .in. Unassigned Then UNASSIGNED;
	// Else If .cp. .in. ASCII7 Then PVALID;
	// Else If .cp. .in. JoinControl Then CONTEXTJ;
	// Else If .cp. .in. OldHangulJamo Then DISALLOWED;
	// Else If .cp. .in. PrecisIgnorableProperties Then DISALLOWED;
	// Else If .cp. .in. Controls Then DISALLOWED;
	// Else If .cp. .in. HasCompat Then ID_DIS or FREE_PVAL;
	// Else If .cp. .in. LetterDigits Then PVALID;
	// Else If .cp. .in. OtherLetterDigits Then ID_DIS or FREE_PVAL;
	// Else If .cp. .in. Spaces Then ID_DIS or FREE_PVAL;
	// Else If .cp. .in. Symbols Then ID_DIS or FREE_PVAL;
	// Else If .cp. .in. Punctuation Then ID_DIS or FREE_PVAL;
	// Else DISALLOWED;

	func writeTables() {
	propTrie := triegen.NewTrie("derivedProperties")
	w := gen.NewCodeWriter()
	defer w.WriteVersionedGoFile(*outputFile, "precis")
	gen.WriteUnicodeVersion(w)

	// Iterate over all the runes...
	for i := rune(0); i < unicode.MaxRune; i++ {
	r := rune(i)

	if !utf8.ValidRune(r) {
	continue
	}

	e, ok := exceptions[i]
	p := e.prop
	switch {
	case ok:
	case !unicode.In(r, assigned):
	p = unassigned
	case r >= 0x0021 && r <= 0x007e: // Is ASCII 7
	p = pValid
	case unicode.In(r, disallowedRunes, unicode.Cc):
	p = disallowed
	case hasCompat(r):
	p = idDisOrFreePVal
	case isLetterDigits(r):
	p = pValid
	case isIdDisAndFreePVal(r):
	p = idDisOrFreePVal
	default:
	p = disallowed
	}
	cat := runeCategory[r]
	// Don't set category for runes that are disallowed.
	if p == disallowed {
	cat = exceptions[r].cat
	}
	propTrie.Insert(r, uint64(p)\|uint64(cat))
	}
	sz, err := propTrie.Gen(w)
	if err != nil {
	log.Fatal(err)
	}
	w.Size += sz
	}