| // Copyright 2012 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| // +build ignore |
| |
| // Collation table generator. |
| // Data read from the web. |
| |
| package main |
| |
| import ( |
| "archive/zip" |
| "bufio" |
| "bytes" |
| "encoding/xml" |
| "flag" |
| "fmt" |
| "io" |
| "io/ioutil" |
| "log" |
| "net/http" |
| "os" |
| "path" |
| "regexp" |
| "sort" |
| "strconv" |
| "strings" |
| "unicode" |
| "unicode/utf8" |
| |
| "code.google.com/p/go.exp/locale/collate" |
| "code.google.com/p/go.exp/locale/collate/build" |
| "code.google.com/p/go.exp/locale/collate/colltab" |
| ) |
| |
| var ( |
| root = flag.String("root", |
| "http://unicode.org/Public/UCA/"+unicode.Version+"/CollationAuxiliary.zip", |
| `URL of the Default Unicode Collation Element Table (DUCET). This can be a zip |
| file containing the file allkeys_CLDR.txt or an allkeys.txt file.`) |
| cldr = flag.String("cldr", |
| "http://www.unicode.org/Public/cldr/22/core.zip", |
| "URL of CLDR archive.") |
| test = flag.Bool("test", false, |
| "test existing tables; can be used to compare web data with package data.") |
| localFiles = flag.Bool("local", false, |
| "data files have been copied to the current directory; for debugging only.") |
| short = flag.Bool("short", false, `Use "short" alternatives, when available.`) |
| draft = flag.Bool("draft", false, `Use draft versions, when available.`) |
| tags = flag.String("tags", "", "build tags to be included after +build directive") |
| pkg = flag.String("package", "collate", |
| "the name of the package in which the generated file is to be included") |
| |
| tables = flagStringSetAllowAll("tables", "collate", "collate,chars", |
| "comma-spearated list of tables to generate.") |
| exclude = flagStringSet("exclude", "zh2", "", |
| "comma-separated list of languages to exclude.") |
| include = flagStringSet("include", "", "", |
| "comma-separated list of languages to include. Include trumps exclude.") |
| types = flagStringSetAllowAll("types", "", "", |
| "comma-separated list of types that should be included in addition to the standard type.") |
| ) |
| |
| // stringSet implements an ordered set based on a list. It implements flag.Value |
| // to allow a set to be specified as a comma-separated list. |
| type stringSet struct { |
| s []string |
| allowed *stringSet |
| dirty bool // needs compaction if true |
| all bool |
| allowAll bool |
| } |
| |
| func flagStringSet(name, def, allowed, usage string) *stringSet { |
| ss := &stringSet{} |
| if allowed != "" { |
| usage += fmt.Sprintf(" (allowed values: any of %s)", allowed) |
| ss.allowed = &stringSet{} |
| failOnError(ss.allowed.Set(allowed)) |
| } |
| ss.Set(def) |
| flag.Var(ss, name, usage) |
| return ss |
| } |
| |
| func flagStringSetAllowAll(name, def, allowed, usage string) *stringSet { |
| ss := &stringSet{allowAll: true} |
| if allowed == "" { |
| flag.Var(ss, name, usage+fmt.Sprintf(` Use "all" to select all.`)) |
| } else { |
| ss.allowed = &stringSet{} |
| failOnError(ss.allowed.Set(allowed)) |
| flag.Var(ss, name, usage+fmt.Sprintf(` (allowed values: "all" or any of %s)`, allowed)) |
| } |
| ss.Set(def) |
| return ss |
| } |
| |
| func (ss stringSet) Len() int { |
| return len(ss.s) |
| } |
| |
| func (ss stringSet) String() string { |
| return strings.Join(ss.s, ",") |
| } |
| |
| func (ss *stringSet) Set(s string) error { |
| if ss.allowAll && s == "all" { |
| ss.s = nil |
| ss.all = true |
| return nil |
| } |
| ss.s = ss.s[:0] |
| for _, s := range strings.Split(s, ",") { |
| if s := strings.TrimSpace(s); s != "" { |
| if ss.allowed != nil && !ss.allowed.contains(s) { |
| return fmt.Errorf("unsupported value %q; must be one of %s", s, ss.allowed) |
| } |
| ss.add(s) |
| } |
| } |
| ss.compact() |
| return nil |
| } |
| |
| func (ss *stringSet) add(s string) { |
| ss.s = append(ss.s, s) |
| ss.dirty = true |
| } |
| |
| func (ss *stringSet) values() []string { |
| ss.compact() |
| return ss.s |
| } |
| |
| func (ss *stringSet) contains(s string) bool { |
| if ss.all { |
| return true |
| } |
| for _, v := range ss.s { |
| if v == s { |
| return true |
| } |
| } |
| return false |
| } |
| |
| func (ss *stringSet) compact() { |
| if !ss.dirty { |
| return |
| } |
| a := ss.s |
| sort.Strings(a) |
| k := 0 |
| for i := 1; i < len(a); i++ { |
| if a[k] != a[i] { |
| a[k+1] = a[i] |
| k++ |
| } |
| } |
| ss.s = a[:k+1] |
| ss.dirty = false |
| } |
| |
| func skipLang(l string) bool { |
| if include.Len() > 0 { |
| return !include.contains(l) |
| } |
| return exclude.contains(l) |
| } |
| |
| func skipAlt(a string) bool { |
| if *draft && a == "proposed" { |
| return false |
| } |
| if *short && a == "short" { |
| return false |
| } |
| return true |
| } |
| |
// failOnError logs e and panics if e is not nil. It does nothing otherwise.
func failOnError(e error) {
	if e == nil {
		return
	}
	log.Panic(e)
}
| |
| // openReader opens the URL or file given by url and returns it as an io.ReadCloser |
| // or nil on error. |
| func openReader(url *string) (io.ReadCloser, error) { |
| if *localFiles { |
| pwd, _ := os.Getwd() |
| *url = "file://" + path.Join(pwd, path.Base(*url)) |
| } |
| t := &http.Transport{} |
| t.RegisterProtocol("file", http.NewFileTransport(http.Dir("/"))) |
| c := &http.Client{Transport: t} |
| resp, err := c.Get(*url) |
| if err != nil { |
| return nil, err |
| } |
| if resp.StatusCode != 200 { |
| return nil, fmt.Errorf(`bad GET status for "%s": %s`, *url, resp.Status) |
| } |
| return resp.Body, nil |
| } |
| |
| func openArchive(url *string) *zip.Reader { |
| f, err := openReader(url) |
| failOnError(err) |
| buffer, err := ioutil.ReadAll(f) |
| f.Close() |
| failOnError(err) |
| archive, err := zip.NewReader(bytes.NewReader(buffer), int64(len(buffer))) |
| failOnError(err) |
| return archive |
| } |
| |
| // parseUCA parses a Default Unicode Collation Element Table of the format |
| // specified in http://www.unicode.org/reports/tr10/#File_Format. |
| // It returns the variable top. |
| func parseUCA(builder *build.Builder) { |
| var r io.ReadCloser |
| var err error |
| if strings.HasSuffix(*root, ".zip") { |
| for _, f := range openArchive(root).File { |
| if strings.HasSuffix(f.Name, "allkeys_CLDR.txt") { |
| r, err = f.Open() |
| } |
| } |
| if r == nil { |
| err = fmt.Errorf("file allkeys_CLDR.txt not found in archive %q", *root) |
| } |
| } else { |
| r, err = openReader(root) |
| } |
| failOnError(err) |
| defer r.Close() |
| scanner := bufio.NewScanner(r) |
| colelem := regexp.MustCompile(`\[([.*])([0-9A-F.]+)\]`) |
| for i := 1; scanner.Scan(); i++ { |
| line := scanner.Text() |
| if len(line) == 0 || line[0] == '#' { |
| continue |
| } |
| if line[0] == '@' { |
| // parse properties |
| switch { |
| case strings.HasPrefix(line[1:], "version "): |
| a := strings.Split(line[1:], " ") |
| if a[1] != unicode.Version { |
| log.Fatalf("incompatible version %s; want %s", a[1], unicode.Version) |
| } |
| case strings.HasPrefix(line[1:], "backwards "): |
| log.Fatalf("%d: unsupported option backwards", i) |
| default: |
| log.Printf("%d: unknown option %s", i, line[1:]) |
| } |
| } else { |
| // parse entries |
| part := strings.Split(line, " ; ") |
| if len(part) != 2 { |
| log.Fatalf("%d: production rule without ';': %v", i, line) |
| } |
| lhs := []rune{} |
| for _, v := range strings.Split(part[0], " ") { |
| if v == "" { |
| continue |
| } |
| lhs = append(lhs, rune(convHex(i, v))) |
| } |
| var n int |
| var vars []int |
| rhs := [][]int{} |
| for i, m := range colelem.FindAllStringSubmatch(part[1], -1) { |
| n += len(m[0]) |
| elem := []int{} |
| for _, h := range strings.Split(m[2], ".") { |
| elem = append(elem, convHex(i, h)) |
| } |
| if m[1] == "*" { |
| vars = append(vars, i) |
| } |
| rhs = append(rhs, elem) |
| } |
| if len(part[1]) < n+3 || part[1][n+1] != '#' { |
| log.Fatalf("%d: expected comment; found %s", i, part[1][n:]) |
| } |
| if *test { |
| testInput.add(string(lhs)) |
| } |
| failOnError(builder.Add(lhs, rhs, vars)) |
| } |
| } |
| if scanner.Err() != nil { |
| log.Fatal(scanner.Err()) |
| } |
| } |
| |
// convHex parses s as a hexadecimal number that fits in 32 bits and returns
// its value. If s cannot be parsed, the error is logged together with the
// given line number and the program exits.
func convHex(line int, s string) int {
	v, err := strconv.ParseInt(s, 16, 32)
	if err == nil {
		return int(v)
	}
	log.Fatalf("%d: %v", line, err)
	return int(v)
}
| |
| var testInput = stringSet{} |
| |
| // LDML holds all collation information parsed from an LDML XML file. |
| // The format of these files is defined in http://unicode.org/reports/tr35/. |
| type LDML struct { |
| XMLName xml.Name `xml:"ldml"` |
| Language Attr `xml:"identity>language"` |
| Territory Attr `xml:"identity>territory"` |
| Chars *struct { |
| ExemplarCharacters []AttrValue `xml:"exemplarCharacters"` |
| MoreInformaton string `xml:"moreInformation,omitempty"` |
| } `xml:"characters"` |
| Default Attr `xml:"collations>default"` |
| Collations []Collation `xml:"collations>collation"` |
| } |
| |
// Attr represents an XML element of which only the "type" attribute is used.
type Attr struct {
	XMLName xml.Name
	Attr    string `xml:"type,attr"`
}

// String returns the value of the "type" attribute.
func (a Attr) String() string {
	return a.Attr
}
| |
// AttrValue represents an XML element with optional "type", "key" and "draft"
// attributes. Value holds the raw, undecoded XML found inside the element.
type AttrValue struct {
	Type  string `xml:"type,attr"`
	Key   string `xml:"key,attr,omitempty"`
	Draft string `xml:"draft,attr,omitempty"`
	Value string `xml:",innerxml"`
}
| |
| type Collation struct { |
| Type string `xml:"type,attr"` |
| Alt string `xml:"alt,attr"` |
| SuppressContraction string `xml:"suppress_contractions,omitempty"` |
| Settings *Settings `xml:"settings"` |
| Optimize string `xml:"optimize"` |
| Rules Rules `xml:"rules"` |
| } |
| |
// Optimize represents an <optimize> element. Data is the character data
// found directly inside the element.
type Optimize struct {
	XMLName xml.Name `xml:"optimize"`
	// The leading comma selects the element's character data; without it
	// the tag would name a child element called "chardata".
	Data string `xml:",chardata"`
}
| |
// Suppression represents a <suppress_contractions> element. Data is the
// character data found directly inside the element.
type Suppression struct {
	XMLName xml.Name `xml:"suppress_contractions"`
	// The leading comma selects the element's character data; without it
	// the tag would name a child element called "chardata".
	Data string `xml:",chardata"`
}
| |
// Settings holds the attributes of a <settings> element. Every field is the
// raw attribute value and is empty if the attribute is absent.
type Settings struct {
	Strength      string `xml:"strength,attr,omitempty"`
	Backwards     string `xml:"backwards,attr,omitempty"`
	Normalization string `xml:"normalization,attr,omitempty"`
	CaseLevel     string `xml:"caseLevel,attr,omitempty"`
	CaseFirst     string `xml:"caseFirst,attr,omitempty"`
	// The field name keeps its historical spelling; the tag carries the
	// attribute name as LDML spells it.
	HiraganaQuarternary string `xml:"hiraganaQuaternary,attr,omitempty"`
	Numeric             string `xml:"numeric,attr,omitempty"`
	VariableTop         string `xml:"variableTop,attr,omitempty"`
}
| |
| type Rules struct { |
| XMLName xml.Name `xml:"rules"` |
| Any []RuleElem `xml:",any"` |
| } |
| |
| type RuleElem struct { |
| XMLName xml.Name |
| Value string `xml:",innerxml"` |
| Before string `xml:"before,attr"` |
| Any []RuleElem `xml:",any"` // for <x> elements |
| } |
| |
| var charRe = regexp.MustCompile(`&#x([0-9A-F]*);`) |
| var tagRe = regexp.MustCompile(`<([a-z_]*) */>`) |
| |
| func (r *RuleElem) rewrite() { |
| // Convert hexadecimal Unicode codepoint notation to a string. |
| if m := charRe.FindAllStringSubmatch(r.Value, -1); m != nil { |
| runes := []rune{} |
| for _, sa := range m { |
| runes = append(runes, rune(convHex(-1, sa[1]))) |
| } |
| r.Value = string(runes) |
| } |
| // Strip spaces from reset positions. |
| if m := tagRe.FindStringSubmatch(r.Value); m != nil { |
| r.Value = fmt.Sprintf("<%s/>", m[1]) |
| } |
| for _, rr := range r.Any { |
| rr.rewrite() |
| } |
| } |
| |
| func decodeXML(f *zip.File) *LDML { |
| r, err := f.Open() |
| failOnError(err) |
| d := xml.NewDecoder(r) |
| var x LDML |
| err = d.Decode(&x) |
| failOnError(err) |
| return &x |
| } |
| |
// mainLocales lists the locales that have an entry in localeChars, in the
// order in which parseMain first recorded them.
var mainLocales []string
| |
// charSets holds a list of exemplar characters per category.
type charSets map[string][]string

// fprint writes p to w as the body of an [exN]string composite literal. Each
// non-empty category is written as an entry keyed by the category's position
// in a fixed list, with its elements joined by spaces.
func (p charSets) fprint(w io.Writer) {
	categories := [...]string{"", "contractions", "punctuation", "auxiliary", "currencySymbol", "index"}
	fmt.Fprintln(w, "[exN]string{")
	for idx, cat := range categories {
		chars := p[cat]
		if len(chars) == 0 {
			continue
		}
		fmt.Fprintf(w, "\t\t%d: %q,\n", idx, strings.Join(chars, " "))
	}
	fmt.Fprintln(w, "\t},")
}
| |
| var localeChars = make(map[string]charSets) |
| |
// exemplarHeader is the Go source that printExemplarCharacters writes ahead
// of the exemplar table. It declares the exemplarType constants; their order
// corresponds to the category order used when printing a charSets value.
const exemplarHeader = `
type exemplarType int
const (
exCharacters exemplarType = iota
exContractions
exPunctuation
exAuxiliary
exCurrency
exIndex
exN
)
`
| |
| func printExemplarCharacters(w io.Writer) { |
| fmt.Fprintln(w, exemplarHeader) |
| fmt.Fprintln(w, "var exemplarCharacters = map[string][exN]string{") |
| for _, loc := range mainLocales { |
| fmt.Fprintf(w, "\t%q: ", loc) |
| localeChars[loc].fprint(w) |
| } |
| fmt.Fprintln(w, "}") |
| } |
| |
| var mainRe = regexp.MustCompile(`.*/main/(.*)\.xml`) |
| |
| // parseMain parses XML files in the main directory of the CLDR core.zip file. |
| func parseMain() { |
| for _, f := range openArchive(cldr).File { |
| if m := mainRe.FindStringSubmatch(f.Name); m != nil { |
| locale := m[1] |
| x := decodeXML(f) |
| if skipLang(x.Language.Attr) { |
| continue |
| } |
| if x.Chars != nil { |
| for _, ec := range x.Chars.ExemplarCharacters { |
| if ec.Draft != "" { |
| continue |
| } |
| if _, ok := localeChars[locale]; !ok { |
| mainLocales = append(mainLocales, locale) |
| localeChars[locale] = make(charSets) |
| } |
| localeChars[locale][ec.Type] = parseCharacters(ec.Value) |
| } |
| } |
| } |
| } |
| } |
| |
| func parseCharacters(chars string) []string { |
| parseSingle := func(s string) (r rune, tail string, escaped bool) { |
| if s[0] == '\\' { |
| if s[1] == 'u' || s[1] == 'U' { |
| r, _, tail, err := strconv.UnquoteChar(s, 0) |
| failOnError(err) |
| return r, tail, false |
| } else if strings.HasPrefix(s[1:], "&") { |
| return '&', s[6:], false |
| } |
| return rune(s[1]), s[2:], true |
| } else if strings.HasPrefix(s, """) { |
| return '"', s[6:], false |
| } |
| r, sz := utf8.DecodeRuneInString(s) |
| return r, s[sz:], false |
| } |
| chars = strings.Trim(chars, "[ ]") |
| list := []string{} |
| var r, last, end rune |
| for len(chars) > 0 { |
| if chars[0] == '{' { // character sequence |
| buf := []rune{} |
| for chars = chars[1:]; len(chars) > 0; { |
| r, chars, _ = parseSingle(chars) |
| if r == '}' { |
| break |
| } |
| if r == ' ' { |
| log.Fatalf("space not supported in sequence %q", chars) |
| } |
| buf = append(buf, r) |
| } |
| list = append(list, string(buf)) |
| last = 0 |
| } else { // single character |
| escaped := false |
| r, chars, escaped = parseSingle(chars) |
| if r != ' ' { |
| if r == '-' && !escaped { |
| if last == 0 { |
| log.Fatal("'-' should be preceded by a character") |
| } |
| end, chars, _ = parseSingle(chars) |
| for ; last <= end; last++ { |
| list = append(list, string(last)) |
| } |
| last = 0 |
| } else { |
| list = append(list, string(r)) |
| last = r |
| } |
| } |
| } |
| } |
| return list |
| } |
| |
| var fileRe = regexp.MustCompile(`.*/collation/(.*)\.xml`) |
| |
| // parseCollation parses XML files in the collation directory of the CLDR core.zip file. |
| func parseCollation(b *build.Builder) { |
| for _, f := range openArchive(cldr).File { |
| if m := fileRe.FindStringSubmatch(f.Name); m != nil { |
| lang := m[1] |
| x := decodeXML(f) |
| if skipLang(x.Language.Attr) { |
| continue |
| } |
| def := "standard" |
| if x.Default.Attr != "" { |
| def = x.Default.Attr |
| } |
| todo := make(map[string]Collation) |
| for _, c := range x.Collations { |
| if c.Type != def && !types.contains(c.Type) { |
| continue |
| } |
| if c.Alt != "" && skipAlt(c.Alt) { |
| continue |
| } |
| for j := range c.Rules.Any { |
| c.Rules.Any[j].rewrite() |
| } |
| locale := lang |
| if c.Type != def { |
| locale += "_u_co_" + c.Type |
| } |
| _, exists := todo[locale] |
| if c.Alt != "" || !exists { |
| todo[locale] = c |
| } |
| } |
| for _, c := range x.Collations { |
| locale := lang |
| if c.Type != def { |
| locale += "_u_co_" + c.Type |
| } |
| if d, ok := todo[locale]; ok && d.Alt == c.Alt { |
| insertCollation(b, locale, &c) |
| } |
| } |
| } |
| } |
| } |
| |
| var lmap = map[byte]colltab.Level{ |
| 'p': colltab.Primary, |
| 's': colltab.Secondary, |
| 't': colltab.Tertiary, |
| 'i': colltab.Identity, |
| } |
| |
// cldrIndex is a Unicode-reserved code point that CLDR uses as a sentinel
// value. Any rule whose value starts with this rune is ignored.
// See http://unicode.org/reports/tr35/#Collation_Elements for details.
const cldrIndex = 0xFDD0
| |
| func insertTailoring(t *build.Tailoring, r RuleElem, context, extend string) { |
| switch l := r.XMLName.Local; l { |
| case "p", "s", "t", "i": |
| if []rune(r.Value)[0] != cldrIndex { |
| str := context + r.Value |
| if *test { |
| testInput.add(str) |
| } |
| err := t.Insert(lmap[l[0]], str, context+extend) |
| failOnError(err) |
| } |
| case "pc", "sc", "tc", "ic": |
| level := lmap[l[0]] |
| for _, s := range r.Value { |
| str := context + string(s) |
| if *test { |
| testInput.add(str) |
| } |
| err := t.Insert(level, str, context+extend) |
| failOnError(err) |
| } |
| default: |
| log.Fatalf("unsupported tag: %q", l) |
| } |
| } |
| |
| func insertCollation(builder *build.Builder, locale string, c *Collation) { |
| t := builder.Tailoring(locale) |
| for _, r := range c.Rules.Any { |
| switch r.XMLName.Local { |
| case "reset": |
| if r.Before == "" { |
| failOnError(t.SetAnchor(r.Value)) |
| } else { |
| failOnError(t.SetAnchorBefore(r.Value)) |
| } |
| case "x": |
| var context, extend string |
| for _, r1 := range r.Any { |
| switch r1.XMLName.Local { |
| case "context": |
| context = r1.Value |
| case "extend": |
| extend = r1.Value |
| } |
| } |
| for _, r1 := range r.Any { |
| if t := r1.XMLName.Local; t == "context" || t == "extend" { |
| continue |
| } |
| insertTailoring(t, r1, context, extend) |
| } |
| default: |
| insertTailoring(t, r, "", "") |
| } |
| } |
| } |
| |
| func testCollator(c *collate.Collator) { |
| c0 := collate.New("") |
| |
| // iterator over all characters for all locales and check |
| // whether Key is equal. |
| buf := collate.Buffer{} |
| |
| // Add all common and not too uncommon runes to the test set. |
| for i := rune(0); i < 0x30000; i++ { |
| testInput.add(string(i)) |
| } |
| for i := rune(0xE0000); i < 0xF0000; i++ { |
| testInput.add(string(i)) |
| } |
| for _, str := range testInput.values() { |
| k0 := c0.KeyFromString(&buf, str) |
| k := c.KeyFromString(&buf, str) |
| if !bytes.Equal(k0, k) { |
| failOnError(fmt.Errorf("test:%U: keys differ (%x vs %x)", []rune(str), k0, k)) |
| } |
| buf.Reset() |
| } |
| fmt.Println("PASS") |
| } |
| |
| func main() { |
| flag.Parse() |
| b := build.NewBuilder() |
| if *root != "" { |
| parseUCA(b) |
| } |
| if *cldr != "" { |
| if tables.contains("chars") { |
| parseMain() |
| } |
| parseCollation(b) |
| } |
| |
| c, err := b.Build() |
| failOnError(err) |
| |
| if *test { |
| testCollator(collate.NewFromTable(c)) |
| } else { |
| fmt.Println("// Generated by running") |
| fmt.Printf("// maketables -root=%s -cldr=%s\n", *root, *cldr) |
| fmt.Println("// DO NOT EDIT") |
| fmt.Println("// TODO: implement more compact representation for sparse blocks.") |
| if *tags != "" { |
| fmt.Printf("// +build %s\n", *tags) |
| } |
| fmt.Println("") |
| fmt.Printf("package %s\n", *pkg) |
| if tables.contains("collate") { |
| fmt.Println("") |
| _, err = b.Print(os.Stdout) |
| failOnError(err) |
| } |
| if tables.contains("chars") { |
| printExemplarCharacters(os.Stdout) |
| } |
| } |
| } |