| // Copyright 2013 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| // +build ignore |
| |
| // Locale identifier table generator. |
| // Data read from the web. |
| |
| package main |
| |
| import ( |
| "bufio" |
| "code.google.com/p/go.exp/locale/cldr" |
| "flag" |
| "fmt" |
| "hash" |
| "hash/fnv" |
| "io" |
| "log" |
| "net/http" |
| "os" |
| "path" |
| "reflect" |
| "sort" |
| "strconv" |
| "strings" |
| ) |
| |
// url is the location of the CLDR archive the tables are generated from.
var url = flag.String(
	"cldr",
	"http://www.unicode.org/Public/cldr/22/core.zip",
	"URL of CLDR archive.",
)

// test requests checking of the existing tables against freshly read data.
var test = flag.Bool(
	"test",
	false,
	"test existing tables; can be used to compare web data with package data.",
)

// localFiles makes newBuilder read the archive from the current directory
// instead of from the network.
var localFiles = flag.Bool(
	"local",
	false,
	"data files have been copied to the current directory; for debugging only.",
)
| |
// comment holds the documentation emitted above each generated table. The
// first word of every entry is used as the lookup key (see the init function
// that fills commentIndex), so it must be spelled exactly like the name of the
// generated identifier it documents.
var comment = []string{
	`
lang holds an alphabetically sorted list of bcp47 language identifiers.
All entries are 4 bytes. The index of the identifier (divided by 4) is the language ID.
For 2-byte language identifiers, the two successive bytes have the following meaning:
- if the first letter of the 2- and 3-letter ISO codes are the same:
the second and third letter of the 3-letter ISO code.
- otherwise: a 0 and a by 2 bits right-shifted index into mappedLang.
For 3-byte language identifiers the 4th byte is 0.`,
	`
mappedLang holds an alphabetically sorted list of non-canonical language
identifiers (by definition of BCP47 or CLDR) with a mapping to their canonical
equivalents. Each entry is 4 bytes. The first 3 bytes are the language code.
(May be a 2-letter code followed by a space.) The 4th byte is one of the following values:
- [a-z]: The canonical code is the first letter of the non-canonical code plus
this character. The majority of mappings can be expressed this way.
- [0-'a']: Index into mappedLangID, an array of language ids.`,
	`
mappedLangID holds a list of language IDs, which correspond to the 4-byte index
into lang. A negative index indicates a mapping to a tag.`,
	`
tagAlias holds a mapping from legacy and grandfathered tags to their locale ID.`,
	`
script is an alphabetically sorted list of ISO 15924 codes. The index
of the script in the string, divided by 4, is the internal script ID.`,
	`
isoRegionOffset needs to be added to the index of regionISO to obtain the regionID
for 2-letter ISO codes. (The first isoRegionOffset regionIDs are reserved for
the UN.M49 codes used for groups.)`,
	`
regionISO holds a list of alphabetically sorted 2-letter ISO region codes.
Each 2-letter code is followed by two bytes with the following meaning:
- [A-Z]{2}: the first letter of the 2-letter code plus these two
letters form the 3-letter ISO code.
- 0, n: index into altRegionISO3.`,
	`
m49 maps regionIDs to UN.M49 codes. The first isoRegionOffset entries are
codes indicating collections of regions.`,
	`
altRegionISO3 holds a list of 3-letter region codes that cannot be
mapped to 2-letter codes using the default algorithm. This is a short list.`,
	`
altRegionIDs holds a list of regionIDs the positions of which match those
of the 3-letter ISO codes in altRegionISO3.`,
	`
currency holds an alphabetically sorted list of canonical 3-letter currency identifiers.
Each identifier is followed by a byte of which the 6 most significant bits
indicate the rounding and the 2 least significant bits indicate the
number of decimal positions.`,
}
| |
| // TODO: consider changing some of these structures to tries. This can reduce |
| // memory, but may increase the need for memory allocations. This could be |
| // mitigated if we can piggyback on locale strings for common cases. |
| |
// failOnError logs e and panics when e is non-nil; a nil error is a no-op.
func failOnError(e error) {
	if e == nil {
		return
	}
	log.Panic(e)
}
| |
// setType identifies the way a stringSet has been accessed. It is recorded by
// stringSet.setType so that conflicting uses of one set can be detected.
type setType int

const (
	Indexed setType = 1 + iota // all elements must be of same size
	Linear
)

// stringSet is a collection of strings that is sorted and de-duplicated on
// demand (see compact).
type stringSet struct {
	s              []string
	sorted, frozen bool

	// We often need to update values after the creation of an index is completed.
	// We include a convenience map for keeping track of this.
	update map[string]string
	typ    setType // used for checking.
}

// clone returns a copy of ss that has its own backing slice. The update map
// is not duplicated: the copy refers to the same map as ss.
func (ss *stringSet) clone() stringSet {
	c := *ss
	c.s = append([]string(nil), c.s...)
	return c
}

// setType records t as the access type of ss. It panics if ss was previously
// given a different type.
func (ss *stringSet) setType(t setType) {
	if ss.typ != t && ss.typ != 0 {
		log.Panicf("type %d cannot be assigned as it was already %d", t, ss.typ)
	}
	// Remember the type; without this the check above could never trigger.
	ss.typ = t
}

// parse parses a whitespace-separated string and initializes ss with its
// components.
func (ss *stringSet) parse(s string) {
	scan := bufio.NewScanner(strings.NewReader(s))
	scan.Split(bufio.ScanWords)
	for scan.Scan() {
		ss.add(scan.Text())
	}
	// A scanner stops silently on, for instance, an over-long token; do not
	// continue with a truncated set.
	if err := scan.Err(); err != nil {
		log.Panic(err)
	}
}

// assertChangeable panics if ss has been frozen.
func (ss *stringSet) assertChangeable() {
	if ss.frozen {
		log.Panic("attempt to modify a frozen stringSet")
	}
}

// add appends s to the set. Sorting and removal of duplicates is deferred
// until the next call to compact.
func (ss *stringSet) add(s string) {
	ss.assertChangeable()
	ss.s = append(ss.s, s)
	ss.sorted = ss.frozen
}

// freeze compacts the set and marks it as frozen, after which add and remove
// panic.
func (ss *stringSet) freeze() {
	ss.compact()
	ss.frozen = true
}

// compact sorts the elements and removes duplicates in place. It does nothing
// if the set is already marked as sorted or has no elements.
func (ss *stringSet) compact() {
	if ss.sorted {
		return
	}
	a := ss.s
	if len(a) == 0 {
		// Nothing to sort. Re-slicing to a[:1] below would panic on a nil
		// slice or resurrect a removed element from spare capacity.
		return
	}
	sort.Strings(a)
	k := 0
	for i := 1; i < len(a); i++ {
		if a[k] != a[i] {
			a[k+1] = a[i]
			k++
		}
	}
	ss.s = a[:k+1]
	// The order is only trusted once the set is frozen, as callers modify
	// elements of ss.s directly.
	ss.sorted = ss.frozen
}

// remove deletes s from the set if it is present.
func (ss *stringSet) remove(s string) {
	ss.assertChangeable()
	if i, ok := ss.find(s); ok {
		copy(ss.s[i:], ss.s[i+1:])
		ss.s = ss.s[:len(ss.s)-1]
	}
}

// replace substitutes nu for ol. It panics (through index) if ol is not in
// the set.
func (ss *stringSet) replace(ol, nu string) {
	ss.s[ss.index(ol)] = nu
	ss.sorted = ss.frozen
}

// index returns the position of s in the compacted set. It logs the contents
// of the set and panics if s is not present.
func (ss *stringSet) index(s string) int {
	ss.setType(Indexed)
	i, ok := ss.find(s)
	if !ok {
		log.Println(ss.s)
		if i < len(ss.s) {
			log.Panicf("find: item %q is not in list. Closest match is %q.", s, ss.s[i])
		}
		log.Panicf("find: item %q is not in list", s)

	}
	return i
}

// find compacts the set and reports the position at which s is, or would be
// inserted, together with whether s is actually present.
func (ss *stringSet) find(s string) (int, bool) {
	ss.compact()
	i := sort.SearchStrings(ss.s, s)
	return i, i != len(ss.s) && ss.s[i] == s
}

// slice returns the compacted elements. The returned slice is the set's own
// storage, not a copy.
func (ss *stringSet) slice() []string {
	ss.compact()
	return ss.s
}

// updateLater records key under v in the update map, allocating the map on
// first use.
func (ss *stringSet) updateLater(v, key string) {
	if ss.update == nil {
		ss.update = map[string]string{}
	}
	ss.update[v] = key
}

// join joins the string and ensures that all entries are of the same length.
// A terminating entry consisting of 0xff bytes is appended; note that it is
// appended to the set itself as well as to the result. join panics if the set
// is empty or if the entries differ in length.
func (ss *stringSet) join() string {
	ss.setType(Indexed)
	if len(ss.s) == 0 {
		log.Panic("join: cannot join an empty stringSet")
	}
	n := len(ss.s[0])
	for _, s := range ss.s {
		if len(s) != n {
			log.Panic("join: not all entries are of the same length")
		}
	}
	ss.s = append(ss.s, strings.Repeat("\xff", n))
	return strings.Join(ss.s, "")
}
| |
| type builder struct { |
| w io.Writer // multi writer |
| out io.Writer // set to Stdout |
| hash32 hash.Hash32 // for checking whether tables have changed. |
| size int |
| data *cldr.CLDR |
| supp *cldr.SupplementalData |
| |
| // indices |
| locale stringSet // common locales |
| lang stringSet // canonical language ids (2 or 3 letter ISO codes) |
| script stringSet // 4-letter ISO codes |
| region stringSet // 2-letter ISO or 3-digit UN M49 codes |
| currency stringSet // 3-letter ISO currency codes |
| } |
| |
| func newBuilder(url *string) *builder { |
| if *localFiles { |
| pwd, _ := os.Getwd() |
| *url = "file://" + path.Join(pwd, path.Base(*url)) |
| } |
| t := &http.Transport{} |
| t.RegisterProtocol("file", http.NewFileTransport(http.Dir("/"))) |
| c := &http.Client{Transport: t} |
| resp, err := c.Get(*url) |
| failOnError(err) |
| defer resp.Body.Close() |
| if resp.StatusCode != 200 { |
| log.Fatalf(`bad GET status for "%s": %s`, *url, resp.Status) |
| } |
| d := &cldr.Decoder{} |
| d.SetDirFilter("supplemental") |
| data, err := d.DecodeZip(resp.Body) |
| failOnError(err) |
| b := builder{ |
| out: os.Stdout, |
| data: data, |
| supp: data.Supplemental(), |
| hash32: fnv.New32(), |
| } |
| b.w = io.MultiWriter(b.out, b.hash32) |
| return &b |
| } |
| |
| var commentIndex = make(map[string]string) |
| |
| func init() { |
| for _, s := range comment { |
| key := strings.TrimSpace(strings.SplitN(s, " ", 2)[0]) |
| commentIndex[key] = strings.Replace(s, "\n", "\n// ", -1) |
| } |
| } |
| |
| func (b *builder) comment(name string) { |
| fmt.Fprintln(b.out, commentIndex[name]) |
| } |
| |
| func (b *builder) pf(f string, x ...interface{}) { |
| fmt.Fprintf(b.w, f, x...) |
| fmt.Fprint(b.w, "\n") |
| } |
| |
| func (b *builder) p(x ...interface{}) { |
| fmt.Fprintln(b.w, x...) |
| } |
| |
| func (b *builder) addSize(s int) { |
| b.size += s |
| b.pf("// Size: %d bytes", s) |
| } |
| |
| func (b *builder) addArraySize(s, n int) { |
| b.size += s |
| b.pf("// Size: %d bytes, %d elements", s, n) |
| } |
| |
| func (b *builder) writeConst(name string, x interface{}) { |
| b.comment(name) |
| b.pf("const %s = %v", name, x) |
| } |
| |
| func (b *builder) writeSlice(name string, ss interface{}) { |
| b.comment(name) |
| v := reflect.ValueOf(ss) |
| t := v.Type().Elem() |
| b.addArraySize(v.Len()*int(t.Size()), v.Len()) |
| fmt.Fprintf(b.w, `var %s = [%d]%s{`, name, v.Len(), t) |
| for i := 0; i < v.Len(); i++ { |
| if i%12 == 0 { |
| fmt.Fprintf(b.w, "\n\t") |
| } |
| fmt.Fprintf(b.w, "%+v, ", v.Index(i).Interface()) |
| } |
| b.p("\n}") |
| } |
| |
| // writeStringSlice writes a slice of strings. This produces a lot |
| // of overhead. It should typically only be used for debugging. |
| // TODO: remove |
| func (b *builder) writeStringSlice(name string, ss []string) { |
| b.comment(name) |
| t := reflect.TypeOf(ss).Elem() |
| sz := len(ss) * int(t.Size()) |
| for _, s := range ss { |
| sz += len(s) |
| } |
| b.addArraySize(sz, len(ss)) |
| b.pf(`var %s = [%d]%s{`, name, len(ss), t) |
| for i := 0; i < len(ss); i++ { |
| b.pf("\t%q,", ss[i]) |
| } |
| b.p("}") |
| } |
| |
| func (b *builder) writeString(name, s string) { |
| b.comment(name) |
| b.addSize(len(s) + int(reflect.TypeOf(s).Size())) |
| if len(s) < 40 { |
| b.pf(`var %s string = %q`, name, s) |
| return |
| } |
| const cpl = 60 |
| b.pf(`var %s string = "" +`, name) |
| for { |
| n := cpl |
| if n > len(s) { |
| n = len(s) |
| } |
| var q string |
| for { |
| q = strconv.Quote(s[:n]) |
| if len(q) <= cpl+2 { |
| break |
| } |
| n-- |
| } |
| if n < len(s) { |
| b.pf(` %s +`, q) |
| s = s[n:] |
| } else { |
| b.pf(` %s`, q) |
| break |
| } |
| } |
| } |
| |
| // TODO: convert this type into a list or two-stage trie. |
| func (b *builder) writeMapFunc(name string, m map[string]string, f func(string) uint16) { |
| b.comment(name) |
| v := reflect.ValueOf(m) |
| sz := v.Len() * (2 + int(v.Type().Key().Size())) |
| for _, k := range m { |
| sz += len(k) |
| } |
| b.addSize(sz) |
| keys := []string{} |
| b.pf(`var %s = map[string]uint16{`, name) |
| for k := range m { |
| keys = append(keys, k) |
| } |
| sort.Strings(keys) |
| for _, k := range keys { |
| b.pf("\t%q: %v,", k, f(m[k])) |
| } |
| b.p("}") |
| } |
| |
| func (ss *stringSet) parseKeyed(slice interface{}, key, value string) { |
| v := reflect.ValueOf(slice) |
| for i := 0; i < v.Len(); i++ { |
| if v.Index(i).Elem().FieldByName(key).String() == value { |
| ss.parse(v.Index(i).Interface().(cldr.Elem).GetCommon().Data()) |
| break |
| } |
| } |
| } |
| |
| func (b *builder) parseIndices() { |
| meta := b.supp.Metadata |
| |
| // canonical language codes |
| b.lang.parseKeyed(meta.Validity.Variable, "Id", "$language") |
| for _, a := range meta.Alias.LanguageAlias { |
| if r := a.Replacement; len(r) >= 2 && len(r) <= 3 { |
| b.lang.add(r) |
| } |
| if a.Reason == "macrolanguage" { |
| b.lang.add(a.Type) |
| } |
| remove := a.Reason == "overlong" || a.Reason == "deprecated" |
| if remove { |
| b.lang.remove(a.Type) |
| } |
| } |
| b.lang.remove("root") |
| |
| // script codes |
| b.script.parseKeyed(meta.Validity.Variable, "Id", "$script") |
| |
| // canonical regions codes |
| for _, g := range b.supp.TerritoryContainment.Group { |
| if len(g.Type) == 3 { // UN M49 code |
| b.region.add(g.Type) |
| } |
| } |
| for _, tc := range b.supp.CodeMappings.TerritoryCodes { |
| b.region.add(tc.Type) |
| } |
| |
| // currency codes |
| b.currency.parseKeyed(meta.Validity.Variable, "Id", "$currency") |
| |
| // common locales |
| b.locale.parse(meta.DefaultContent.Locales) |
| } |
| |
| // writeLanguage generates all tables needed for language canonicalization. |
| func (b *builder) writeLanguage() { |
| meta := b.supp.Metadata |
| |
| b.writeConst("unknownLang", b.lang.index("und")) |
| |
| // Get language codes that need to be mapped (overlong 3-letter codes, deprecated |
| // 2-letter codes and grandfathered tags. |
| mappedLang := stringSet{} |
| |
| // langSpecial maps from non-canonical to canonical ISO language codes. |
| // TODO: Map to Locale id, instead of language. This allows sh and bhs to be |
| // mapped to sr_Latn. |
| langSpecial := stringSet{} |
| |
| // legacyTag maps from tag to language code. |
| legacyTag := make(map[string]string) |
| |
| lang := b.lang.clone() |
| for _, a := range meta.Alias.LanguageAlias { |
| if a.Replacement == "" { |
| a.Replacement = "und" |
| } |
| if len(a.Type) <= 3 { |
| code := fmt.Sprintf("%-3s", a.Type) |
| if len(a.Replacement) != 2 || a.Type[0] != a.Replacement[0] { |
| langSpecial.add(a.Replacement) |
| mappedLang.updateLater(code, a.Replacement) |
| mappedLang.add(code) |
| } else if a.Reason != "overlong" || len(a.Type) != 3 { |
| code += a.Replacement[1:] |
| mappedLang.add(code) |
| } |
| if a.Reason == "overlong" && len(a.Type) == 3 && len(a.Replacement) == 2 { |
| lang.updateLater(a.Replacement, a.Type) |
| } |
| } else { |
| legacyTag[strings.Replace(a.Type, "_", "-", -1)] = a.Replacement |
| } |
| } |
| |
| // Complete canonialized language tags. |
| lang.freeze() |
| for i, v := range lang.s { |
| // We can avoid these manual entries by using the IANI registry directly. |
| // Seems easier to update the list manually, as changes are rare. |
| // The panic in this loop will trigger if we miss an entry. |
| lang.update["no"] = "nor" |
| lang.update["sh"] = "scr" |
| lang.update["tl"] = "tgl" |
| lang.update["tw"] = "twi" |
| // Fix CLDR ambiguities. |
| lang.update["nb"] = "nob" |
| lang.update["ak"] = "aka" |
| add := "" |
| if s, ok := lang.update[v]; ok { |
| if s[0] == v[0] { |
| add = s[1:] |
| } else { |
| add = string([]byte{0, byte(mappedLang.index(s))}) |
| } |
| } else if len(v) == 3 { |
| add = "\x00" |
| } else { |
| log.Panicf("no data for long form of %q", v) |
| } |
| lang.s[i] += add |
| } |
| b.writeString("lang", lang.join()) |
| |
| // Generate tables for non-canonicalized tags. |
| mappedLang.freeze() |
| mappedLangID := []int16{} |
| altTag := "" |
| for _, v := range langSpecial.slice() { |
| i := 0 |
| if len(v) <= 3 { |
| i = b.lang.index(v) |
| } else { |
| i = -1 - len(altTag) |
| altTag += v |
| } |
| mappedLangID = append(mappedLangID, int16(i)) |
| } |
| |
| for k, v := range mappedLang.update { |
| i := mappedLang.index(k) |
| mappedLang.s[i] += string(langSpecial.index(v)) |
| } |
| b.writeString("mappedLang", mappedLang.join()) |
| b.writeSlice("mappedLangID", mappedLangID) |
| b.writeString("altTag", altTag) |
| b.writeMapFunc("tagAlias", legacyTag, func(s string) uint16 { |
| return uint16(b.lang.index(s)) |
| }) |
| } |
| |
| func (b *builder) writeScript() { |
| b.writeConst("unknownScript", b.script.index("Zzzz")) |
| b.writeString("script", b.script.join()) |
| } |
| |
| func parseM49(s string) uint16 { |
| if len(s) == 0 { |
| return 0 |
| } |
| v, err := strconv.ParseUint(s, 10, 10) |
| failOnError(err) |
| return uint16(v) |
| } |
| |
| func (b *builder) writeRegion() { |
| b.writeConst("unknownRegion", b.region.index("ZZ")) |
| |
| isoOffset := b.region.index("AA") |
| m49map := make([]uint16, len(b.region.slice())) |
| altRegionISO3 := "" |
| altRegionIDs := []uint16{} |
| |
| b.writeConst("isoRegionOffset", isoOffset) |
| |
| // 2-letter region lookup and mapping to numeric codes. |
| regionISO := b.region.clone() |
| regionISO.s = regionISO.s[isoOffset:] |
| regionISO.sorted = false |
| for i, tc := range b.supp.CodeMappings.TerritoryCodes { |
| if tc.Type != regionISO.s[i] { |
| log.Panicf("writeRegion: found %q; want %q", regionISO.s[i], tc.Type) |
| } |
| if len(tc.Alpha3) == 3 { |
| if tc.Alpha3[0] == tc.Type[0] { |
| regionISO.s[i] += tc.Alpha3[1:] |
| } else { |
| regionISO.s[i] += string([]byte{0, byte(len(altRegionISO3))}) |
| altRegionISO3 += tc.Alpha3 |
| altRegionIDs = append(altRegionIDs, uint16(isoOffset+i)) |
| } |
| } else { |
| regionISO.s[i] += " " |
| } |
| if d := m49map[isoOffset+i]; d != 0 { |
| log.Panicf("%s found as a duplicate UN.M49 code of %03d", tc.Numeric, d) |
| } |
| m49map[isoOffset+i] = parseM49(tc.Numeric) |
| } |
| b.writeString("regionISO", regionISO.join()) |
| b.writeString("altRegionISO3", altRegionISO3) |
| b.writeSlice("altRegionIDs", altRegionIDs) |
| |
| // 3-digit region lookup, groupings. |
| for i := 0; i < isoOffset; i++ { |
| m49map[i] = parseM49(b.region.s[i]) |
| } |
| b.writeSlice("m49", m49map) |
| } |
| |
| func (b *builder) writeLocale() { |
| b.writeStringSlice("locale", b.locale.slice()) |
| } |
| |
| func (b *builder) writeLanguageInfo() { |
| } |
| |
| func (b *builder) writeCurrencies() { |
| unknown := b.currency.index("XXX") |
| digits := map[string]uint64{} |
| rounding := map[string]uint64{} |
| for _, info := range b.supp.CurrencyData.Fractions[0].Info { |
| var err error |
| digits[info.Iso4217], err = strconv.ParseUint(info.Digits, 10, 2) |
| failOnError(err) |
| rounding[info.Iso4217], err = strconv.ParseUint(info.Rounding, 10, 6) |
| failOnError(err) |
| } |
| for i, cur := range b.currency.slice() { |
| d := uint64(2) // default number of decimal positions |
| if dd, ok := digits[cur]; ok { |
| d = dd |
| } |
| var r uint64 |
| if r = rounding[cur]; r == 0 { |
| r = 1 // default rounding increment in units 10^{-digits) |
| } |
| b.currency.s[i] += string([]byte{byte(r<<2 + d)}) |
| } |
| b.writeString("currency", b.currency.join()) |
| // Hack alert: gofmt indents a trailing comment after an indented string. |
| // Write this constant after currency to force a proper indentation of |
| // the final comment. |
| b.writeConst("unknownCurrency", unknown) |
| } |
| |
| var header = `// Generated by running |
| // maketables -url=%s |
| // DO NOT EDIT |
| |
| package locale |
| ` |
| |
| func main() { |
| flag.Parse() |
| b := newBuilder(url) |
| fmt.Fprintf(b.out, header, *url) |
| |
| b.parseIndices() |
| b.writeLanguage() |
| b.writeScript() |
| b.writeRegion() |
| // TODO: b.writeLocale() |
| b.writeCurrencies() |
| |
| fmt.Fprintf(b.out, "\n// Size: %.1fK (%d bytes); Check: %X\n", float32(b.size)/1024, b.size, b.hash32.Sum32()) |
| } |