blob: c2695e7b41f8bda7f6c2e485d5d6c15e4cc6b8be [file] [log] [blame]
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build ignore
// Locale identifier table generator.
// Data read from the web.
package main
import (
"bufio"
"code.google.com/p/go.exp/locale/cldr"
"flag"
"fmt"
"hash"
"hash/fnv"
"io"
"log"
"net/http"
"os"
"path"
"reflect"
"sort"
"strconv"
"strings"
)
// Command-line flags.

// url is the location of the CLDR archive to read.
var url = flag.String(
	"cldr",
	"http://www.unicode.org/Public/cldr/22/core.zip",
	"URL of CLDR archive.",
)

// test requests a comparison of existing tables against freshly read data.
// NOTE(review): not referenced elsewhere in the visible part of this file.
var test = flag.Bool(
	"test",
	false,
	"test existing tables; can be used to compare web data with package data.",
)

// localFiles makes newBuilder read the archive from the current directory
// through a file:// URL instead of fetching it over the network.
var localFiles = flag.Bool(
	"local",
	false,
	"data files have been copied to the current directory; for debugging only.",
)
// comment holds the documentation that is emitted above each generated
// table. Every entry starts with a newline followed by the name of the table
// it documents; init uses that first word as the lookup key, so it must match
// the name passed to the write* methods exactly.
var comment = []string{
	`
lang holds an alphabetically sorted list of bcp47 language identifiers.
All entries are 4 bytes. The index of the identifier (divided by 4) is the language ID.
For 2-byte language identifiers, the two successive bytes have the following meaning:
- if the first letter of the 2- and 3-letter ISO codes are the same:
the second and third letter of the 3-letter ISO code.
- otherwise: a 0 and a by 2 bits right-shifted index into mappedLang.
For 3-byte language identifiers the 4th byte is 0.`,
	`
mappedLang holds an alphabetically sorted list of non-canonical language
identifiers (by definition of BCP47 or CLDR) with a mapping to their canonical
equivalents. Each entry is 4 bytes. The first 3 bytes are the language code.
(May be a 2-letter code followed by a space.) The 4th byte is one of the following values:
- [a-z]: The canonical code is the first letter of the non-canonical code plus
this character. The majority of mappings can be expressed this way.
- [0-'a']: Index into mappedLangID, an array of language ids.`,
	`
mappedLangID holds a list of language IDs, which correspond to the 4-byte index
into lang. A negative index indicates a mapping to a tag.`,
	`
tagAlias holds a mapping from legacy and grandfathered tags to their locale ID.`,
	`
script is an alphabetically sorted list of ISO 15924 codes. The index
of the script in the string, divided by 4, is the internal script ID.`,
	`
isoRegionOffset needs to be added to the index of regionISO to obtain the regionID
for 2-letter ISO codes. (The first isoRegionOffset regionIDs are reserved for
the UN.M49 codes used for groups.)`,
	`
regionISO holds a list of alphabetically sorted 2-letter ISO region codes.
Each 2-letter code is followed by two bytes with the following meaning:
- [A-Z]{2}: the first letter of the 2-letter code plus these two
letters form the 3-letter ISO code.
- 0, n: index into altRegionISO3.`,
	`
m49 maps regionIDs to UN.M49 codes. The first isoRegionOffset entries are
codes indicating collections of regions.`,
	`
altRegionISO3 holds a list of 3-letter region codes that cannot be
mapped to 2-letter codes using the default algorithm. This is a short list.`,
	`
altRegionIDs holds a list of regionIDs the positions of which match those
of the 3-letter ISO codes in altRegionISO3.`,
	`
currency holds an alphabetically sorted list of canonical 3-letter currency identifiers.
Each identifier is followed by a byte of which the 6 most significant bits
indicate the rounding and the 2 least significant bits indicate the
number of decimal positions.`,
}
// TODO: consider changing some of these structures to tries. This can reduce
// memory, but may increase the need for memory allocations. This could be
// mitigated if we can piggyback on locale strings for common cases.
// failOnError aborts the generator through log.Panic when e is non-nil.
// A nil error is a no-op.
func failOnError(e error) {
	if e == nil {
		return
	}
	log.Panic(e)
}
// setType records the way a stringSet has been used so that conflicting
// uses can be detected.
type setType int

const (
	Indexed setType = 1 + iota // all elements must be of same size
	Linear
)

// stringSet is a collection of strings that is sorted and de-duplicated
// lazily: elements are appended as they come and compact brings the slice
// into canonical form right before a lookup.
type stringSet struct {
	s []string

	// sorted is only ever assigned the value of frozen, so an unfrozen set
	// is re-sorted on every lookup; frozen rejects add and remove.
	sorted bool
	frozen bool

	// We often need to update values after the creation of an index is completed.
	// We include a convenience map for keeping track of this.
	update map[string]string

	typ setType // used for checking.
}

// clone returns a copy of ss with its own element slice. The update map is
// not duplicated: if it is non-nil both sets refer to the same map.
func (ss *stringSet) clone() stringSet {
	c := *ss
	c.s = append([]string(nil), c.s...)
	return c
}

// setType records that ss is used as type t and panics if it was previously
// used as a different type.
func (ss *stringSet) setType(t setType) {
	if ss.typ != t && ss.typ != 0 {
		log.Panicf("type %d cannot be assigned as it was already %d", t, ss.typ)
	}
	// Remember the type; without this the check above could never trigger.
	ss.typ = t
}

// parse parses a whitespace-separated string and initializes ss with its
// components.
func (ss *stringSet) parse(s string) {
	scan := bufio.NewScanner(strings.NewReader(s))
	scan.Split(bufio.ScanWords)
	for scan.Scan() {
		ss.add(scan.Text())
	}
	// A scanner error (e.g. an over-long token) would otherwise silently
	// truncate the set.
	if err := scan.Err(); err != nil {
		log.Panic(err)
	}
}

// assertChangeable panics if ss has been frozen.
func (ss *stringSet) assertChangeable() {
	if ss.frozen {
		log.Panic("attempt to modify a frozen stringSet")
	}
}

// add appends s to the set. Duplicates are removed by the next compact.
func (ss *stringSet) add(s string) {
	ss.assertChangeable()
	ss.s = append(ss.s, s)
	ss.sorted = ss.frozen
}

// freeze compacts the set and disallows further calls to add and remove.
func (ss *stringSet) freeze() {
	ss.compact()
	ss.frozen = true
}

// compact sorts the elements and drops duplicates. It is a no-op if the set
// is already marked as sorted. An empty set is left empty.
func (ss *stringSet) compact() {
	if ss.sorted {
		return
	}
	a := ss.s
	sort.Strings(a)
	if len(a) > 0 {
		// In-place de-duplication of the sorted slice.
		k := 0
		for i := 1; i < len(a); i++ {
			if a[k] != a[i] {
				a[k+1] = a[i]
				k++
			}
		}
		ss.s = a[:k+1]
	}
	ss.sorted = ss.frozen
}

// remove deletes s from the set if it is present.
func (ss *stringSet) remove(s string) {
	ss.assertChangeable()
	if i, ok := ss.find(s); ok {
		copy(ss.s[i:], ss.s[i+1:])
		ss.s = ss.s[:len(ss.s)-1]
	}
}

// replace overwrites element ol with nu. It panics (through index) if ol is
// not in the set.
func (ss *stringSet) replace(ol, nu string) {
	ss.s[ss.index(ol)] = nu
	ss.sorted = ss.frozen
}

// index returns the position of s in the compacted set and marks the set as
// Indexed. It logs the set and panics if s is not present.
func (ss *stringSet) index(s string) int {
	ss.setType(Indexed)
	i, ok := ss.find(s)
	if !ok {
		log.Println(ss.s)
		if i < len(ss.s) {
			log.Panicf("find: item %q is not in list. Closest match is %q.", s, ss.s[i])
		}
		log.Panicf("find: item %q is not in list", s)
	}
	return i
}

// find compacts the set and reports the position at which s is, or would be
// inserted, and whether s is present.
func (ss *stringSet) find(s string) (int, bool) {
	ss.compact()
	i := sort.SearchStrings(ss.s, s)
	return i, i != len(ss.s) && ss.s[i] == s
}

// slice returns the compacted elements. The result aliases the set's storage.
func (ss *stringSet) slice() []string {
	ss.compact()
	return ss.s
}

// updateLater records in the update map that element v is associated with key.
func (ss *stringSet) updateLater(v, key string) {
	if ss.update == nil {
		ss.update = map[string]string{}
	}
	ss.update[v] = key
}

// join joins the string and ensures that all entries are of the same length.
// As a side effect it appends a sentinel entry consisting of 0xff bytes to
// the set; the sentinel is part of the returned string. It panics if the set
// is empty or if the entries differ in length.
func (ss *stringSet) join() string {
	ss.setType(Indexed)
	if len(ss.s) == 0 {
		log.Panic("join: cannot join an empty set")
	}
	n := len(ss.s[0])
	for _, s := range ss.s {
		if len(s) != n {
			log.Panic("join: not all entries are of the same length")
		}
	}
	ss.s = append(ss.s, strings.Repeat("\xff", n))
	return strings.Join(ss.s, "")
}
// builder carries the decoded CLDR data, the indices derived from it and the
// output state used while writing the generated tables.
type builder struct {
// w receives all table output; newBuilder sets it to a writer that
// duplicates everything to out and hash32.
w io.Writer // multi writer
out io.Writer // set to Stdout
hash32 hash.Hash32 // for checking whether tables have changed.
// size accumulates the byte sizes reported via addSize and addArraySize.
size int
// data is the decoded CLDR archive and supp its supplemental data.
data *cldr.CLDR
supp *cldr.SupplementalData
// indices
locale stringSet // common locales
lang stringSet // canonical language ids (2 or 3 letter ISO codes)
script stringSet // 4-letter ISO codes
region stringSet // 2-letter ISO or 3-digit UN M49 codes
currency stringSet // 3-letter ISO currency codes
}
// newBuilder fetches the CLDR archive at *url, decodes its "supplemental"
// directory and returns a builder that writes to standard output.
//
// If the -local flag is set, *url is rewritten in place to a file:// URL that
// points at a file with the same base name in the current directory. Any
// failure (working directory lookup, fetch, decode) aborts the program.
func newBuilder(url *string) *builder {
	if *localFiles {
		// Do not silently build a bogus path if the working directory
		// cannot be determined.
		pwd, err := os.Getwd()
		failOnError(err)
		*url = "file://" + path.Join(pwd, path.Base(*url))
	}

	// Register a file transport so that both http:// and file:// URLs can be
	// fetched with the same client.
	t := &http.Transport{}
	t.RegisterProtocol("file", http.NewFileTransport(http.Dir("/")))
	c := &http.Client{Transport: t}

	resp, err := c.Get(*url)
	failOnError(err)
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		log.Fatalf(`bad GET status for "%s": %s`, *url, resp.Status)
	}

	d := &cldr.Decoder{}
	d.SetDirFilter("supplemental")
	data, err := d.DecodeZip(resp.Body)
	failOnError(err)

	b := builder{
		out:    os.Stdout,
		data:   data,
		supp:   data.Supplemental(),
		hash32: fnv.New32(),
	}
	// Everything written through w is also fed to the checksum.
	b.w = io.MultiWriter(b.out, b.hash32)
	return &b
}
// commentIndex maps a table name to its documentation, already formatted as
// Go line comments.
var commentIndex = map[string]string{}

// init fills commentIndex from comment. The key of an entry is the text in
// front of its first space, stripped of surrounding white space; the value is
// the entry with every newline turned into the start of a "// " comment line.
func init() {
	for _, text := range comment {
		firstWord := strings.SplitN(text, " ", 2)[0]
		name := strings.TrimSpace(firstWord)
		commentIndex[name] = strings.Replace(text, "\n", "\n// ", -1)
	}
}
// comment writes the documentation registered for name, followed by a
// newline, to b.out. It bypasses b.w, so the text is not part of the checksum.
// An unknown name yields an empty line.
func (b *builder) comment(name string) {
	text := commentIndex[name]
	fmt.Fprintln(b.out, text)
}
// pf formats x according to f and writes the result plus a trailing newline
// to b.w.
func (b *builder) pf(f string, x ...interface{}) {
	line := fmt.Sprintf(f, x...)
	fmt.Fprint(b.w, line, "\n")
}
// p writes x to b.w in fmt.Println style: operands separated by spaces and
// terminated by a newline.
func (b *builder) p(x ...interface{}) {
	line := fmt.Sprintln(x...)
	fmt.Fprint(b.w, line)
}
// addSize emits a "// Size" comment for a table of s bytes and adds s to the
// running total in b.size.
func (b *builder) addSize(s int) {
	b.pf("// Size: %d bytes", s)
	b.size = b.size + s
}
// addArraySize emits a "// Size" comment for an array of n elements that
// occupies s bytes and adds s to the running total in b.size.
func (b *builder) addArraySize(s, n int) {
	b.pf("// Size: %d bytes, %d elements", s, n)
	b.size = b.size + s
}
// writeConst emits the documentation for name followed by a constant
// declaration "const name = x", with x formatted using %v.
func (b *builder) writeConst(name string, x interface{}) {
	const decl = "const %s = %v"
	b.comment(name)
	b.pf(decl, name, x)
}
// writeSlice emits the slice ss as a Go array variable called name, twelve
// elements per line, preceded by its documentation and a size comment. The
// reported size is the number of elements times the in-memory size of the
// element type. ss must be a slice (or array); reflection panics otherwise.
func (b *builder) writeSlice(name string, ss interface{}) {
	b.comment(name)

	val := reflect.ValueOf(ss)
	elemType := val.Type().Elem()
	n := val.Len()
	b.addArraySize(n*int(elemType.Size()), n)

	fmt.Fprintf(b.w, `var %s = [%d]%s{`, name, n, elemType)
	for i := 0; i < n; i++ {
		// Start a new, tab-indented line every 12 elements.
		if i%12 == 0 {
			fmt.Fprint(b.w, "\n\t")
		}
		fmt.Fprintf(b.w, "%+v, ", val.Index(i).Interface())
	}
	b.p("\n}")
}
// writeStringSlice writes a slice of strings as a Go array variable, one
// quoted element per line. The reported size is one string header per
// element plus the bytes of every element. This produces a lot of overhead.
// It should typically only be used for debugging.
// TODO: remove
func (b *builder) writeStringSlice(name string, ss []string) {
	b.comment(name)

	elemType := reflect.TypeOf(ss).Elem()
	total := len(ss) * int(elemType.Size())
	for i := range ss {
		total += len(ss[i])
	}
	b.addArraySize(total, len(ss))

	b.pf(`var %s = [%d]%s{`, name, len(ss), elemType)
	for _, s := range ss {
		b.pf("\t%q,", s)
	}
	b.p("}")
}
// writeString emits s as a string variable called name, preceded by its
// documentation and a size comment (length of s plus one string header).
//
// Strings shorter than 40 bytes are written as a single quoted literal.
// Longer ones are written as a concatenation of quoted pieces, each piece
// covering at most 60 bytes of s and shrunk further until its quoted form,
// including the two quote characters, does not exceed 62 characters.
func (b *builder) writeString(name, s string) {
	b.comment(name)
	b.addSize(len(s) + int(reflect.TypeOf(s).Size()))

	if len(s) < 40 {
		b.pf(`var %s string = %q`, name, s)
		return
	}

	const cpl = 60 // maximum number of characters per line, excluding quotes
	b.pf(`var %s string = "" +`, name)
	for {
		// Take as many bytes as fit on one line once escaped.
		n := len(s)
		if n > cpl {
			n = cpl
		}
		q := strconv.Quote(s[:n])
		for len(q) > cpl+2 {
			n--
			q = strconv.Quote(s[:n])
		}

		if n == len(s) {
			// Final piece: no trailing concatenation operator.
			b.pf(` %s`, q)
			return
		}
		b.pf(` %s +`, q)
		s = s[n:]
	}
}
// writeMapFunc emits m as a map[string]uint16 variable called name. Entries
// are written in sorted key order and every value of m is converted to its
// uint16 representation with f.
//
// The reported size is an estimate: per entry one string header for the key
// and two bytes for the uint16 value, plus the bytes of all keys.
//
// TODO: convert this type into a list or two-stage trie.
func (b *builder) writeMapFunc(name string, m map[string]string, f func(string) uint16) {
	b.comment(name)

	v := reflect.ValueOf(m)
	sz := v.Len() * (2 + int(v.Type().Key().Size()))
	keys := make([]string, 0, len(m))
	for k := range m {
		// The generated map stores the keys, so count key bytes (the
		// values are replaced by fixed-size uint16s).
		sz += len(k)
		keys = append(keys, k)
	}
	b.addSize(sz)

	b.pf(`var %s = map[string]uint16{`, name)
	// Map iteration order is random; sort for reproducible output.
	sort.Strings(keys)
	for _, k := range keys {
		b.pf("\t%q: %v,", k, f(m[k]))
	}
	b.p("}")
}
// parseKeyed looks through slice for the first element whose string field
// named key equals value and adds the whitespace-separated words of that
// element's character data to ss. Nothing is added if no element matches.
//
// slice is inspected with reflection: its elements are dereferenced with
// Elem before the field lookup and must implement cldr.Elem.
func (ss *stringSet) parseKeyed(slice interface{}, key, value string) {
	list := reflect.ValueOf(slice)
	for i, n := 0, list.Len(); i < n; i++ {
		item := list.Index(i)
		if item.Elem().FieldByName(key).String() != value {
			continue
		}
		elem := item.Interface().(cldr.Elem)
		ss.parse(elem.GetCommon().Data())
		return
	}
}
// parseIndices fills the language, script, region, currency and locale
// indices of b from the supplemental CLDR data.
func (b *builder) parseIndices() {
	meta := b.supp.Metadata
	vars := meta.Validity.Variable

	// canonical language codes
	b.lang.parseKeyed(vars, "Id", "$language")
	for _, alias := range meta.Alias.LanguageAlias {
		// A 2- or 3-letter replacement is a language code in its own right.
		if n := len(alias.Replacement); 2 <= n && n <= 3 {
			b.lang.add(alias.Replacement)
		}
		switch alias.Reason {
		case "macrolanguage":
			b.lang.add(alias.Type)
		case "overlong", "deprecated":
			b.lang.remove(alias.Type)
		}
	}
	b.lang.remove("root")

	// script codes
	b.script.parseKeyed(vars, "Id", "$script")

	// canonical regions codes
	for _, group := range b.supp.TerritoryContainment.Group {
		if len(group.Type) != 3 {
			continue
		}
		b.region.add(group.Type) // UN M49 code
	}
	for _, codes := range b.supp.CodeMappings.TerritoryCodes {
		b.region.add(codes.Type)
	}

	// currency codes
	b.currency.parseKeyed(vars, "Id", "$currency")

	// common locales
	b.locale.parse(meta.DefaultContent.Locales)
}
// writeLanguage generates all tables needed for language canonicalization:
// the unknownLang constant, the lang and mappedLang strings, the
// mappedLangID array, the altTag string and the tagAlias map.
func (b *builder) writeLanguage() {
	meta := b.supp.Metadata

	b.writeConst("unknownLang", b.lang.index("und"))

	// Get language codes that need to be mapped (overlong 3-letter codes,
	// deprecated 2-letter codes and grandfathered tags).
	mappedLang := stringSet{}
	// langSpecial maps from non-canonical to canonical ISO language codes.
	// TODO: Map to Locale id, instead of language. This allows sh and bhs to be
	// mapped to sr_Latn.
	langSpecial := stringSet{}
	// legacyTag maps from tag to language code.
	legacyTag := make(map[string]string)

	lang := b.lang.clone()
	for _, a := range meta.Alias.LanguageAlias {
		if a.Replacement == "" {
			a.Replacement = "und"
		}
		if len(a.Type) <= 3 {
			// Pad 2-letter codes with a space so every code is 3 bytes.
			code := fmt.Sprintf("%-3s", a.Type)
			if len(a.Replacement) != 2 || a.Type[0] != a.Replacement[0] {
				// The 4th byte (an index into mappedLangID) is only known
				// once langSpecial is complete; it is filled in below.
				langSpecial.add(a.Replacement)
				mappedLang.updateLater(code, a.Replacement)
				mappedLang.add(code)
			} else if a.Reason != "overlong" || len(a.Type) != 3 {
				// Same first letter: store the second letter of the
				// 2-letter replacement directly in the 4th byte.
				code += a.Replacement[1:]
				mappedLang.add(code)
			}
			if a.Reason == "overlong" && len(a.Type) == 3 && len(a.Replacement) == 2 {
				lang.updateLater(a.Replacement, a.Type)
			}
		} else {
			legacyTag[strings.Replace(a.Type, "_", "-", -1)] = a.Replacement
		}
	}

	// Complete canonicalized language tags.
	lang.freeze()

	// We can avoid these manual entries by using the IANA registry directly.
	// Seems easier to update the list manually, as changes are rare.
	// The panic in the loop below will trigger if we miss an entry.
	if lang.update == nil {
		lang.update = map[string]string{}
	}
	lang.update["no"] = "nor"
	lang.update["sh"] = "scr"
	lang.update["tl"] = "tgl"
	lang.update["tw"] = "twi"
	// Fix CLDR ambiguities.
	lang.update["nb"] = "nob"
	lang.update["ak"] = "aka"

	for i, v := range lang.s {
		add := ""
		if s, ok := lang.update[v]; ok {
			if s[0] == v[0] {
				add = s[1:]
			} else {
				add = string([]byte{0, byte(mappedLang.index(s))})
			}
		} else if len(v) == 3 {
			add = "\x00"
		} else {
			log.Panicf("no data for long form of %q", v)
		}
		lang.s[i] += add
	}
	b.writeString("lang", lang.join())

	// Generate tables for non-canonicalized tags.
	mappedLang.freeze()
	mappedLangID := []int16{}
	altTag := ""
	for _, v := range langSpecial.slice() {
		i := 0
		if len(v) <= 3 {
			i = b.lang.index(v)
		} else {
			// Replacements that are full tags are stored in altTag and
			// referenced by a negative offset.
			i = -1 - len(altTag)
			altTag += v
		}
		mappedLangID = append(mappedLangID, int16(i))
	}
	for k, v := range mappedLang.update {
		i := mappedLang.index(k)
		// The index is appended as a single code point. An index of 128 or
		// more would encode as several bytes and be rejected by join below.
		mappedLang.s[i] += string(rune(langSpecial.index(v)))
	}
	b.writeString("mappedLang", mappedLang.join())
	b.writeSlice("mappedLangID", mappedLangID)
	b.writeString("altTag", altTag)

	b.writeMapFunc("tagAlias", legacyTag, func(s string) uint16 {
		return uint16(b.lang.index(s))
	})
}
// writeScript emits the unknownScript constant (the index of "Zzzz" in the
// script set) followed by the joined script table.
func (b *builder) writeScript() {
	unknown := b.script.index("Zzzz")
	b.writeConst("unknownScript", unknown)

	table := b.script.join()
	b.writeString("script", table)
}
// parseM49 converts the decimal string s to a numeric code. The empty string
// yields 0. A string that is not a decimal number, or that does not fit in
// 10 bits, is reported through failOnError.
func parseM49(s string) uint16 {
	if s == "" {
		return 0
	}
	code, err := strconv.ParseUint(s, 10, 10)
	failOnError(err)
	return uint16(code)
}
// writeRegion emits the region tables: the unknownRegion and isoRegionOffset
// constants, the regionISO and altRegionISO3 strings and the altRegionIDs and
// m49 arrays.
//
// Region IDs are positions in the sorted region set. The entries before "AA"
// are 3-digit UN.M49 group codes; the 2-letter ISO codes follow and are
// expected to line up one-to-one, in order, with the territory code mappings.
func (b *builder) writeRegion() {
	b.writeConst("unknownRegion", b.region.index("ZZ"))

	isoOffset := b.region.index("AA")
	m49map := make([]uint16, len(b.region.slice()))
	altRegionISO3 := ""
	altRegionIDs := []uint16{}

	b.writeConst("isoRegionOffset", isoOffset)

	// 2-letter region lookup and mapping to numeric codes.
	regionISO := b.region.clone()
	regionISO.s = regionISO.s[isoOffset:]
	regionISO.sorted = false
	for i, tc := range b.supp.CodeMappings.TerritoryCodes {
		if tc.Type != regionISO.s[i] {
			log.Panicf("writeRegion: found %q; want %q", regionISO.s[i], tc.Type)
		}
		// Every entry is extended by exactly two bytes so that join can
		// produce a table of fixed-width, 4-byte records.
		if len(tc.Alpha3) == 3 {
			if tc.Alpha3[0] == tc.Type[0] {
				regionISO.s[i] += tc.Alpha3[1:]
			} else {
				// The 3-letter code cannot be derived from the 2-letter
				// code: store a 0 and the offset into altRegionISO3.
				regionISO.s[i] += string([]byte{0, byte(len(altRegionISO3))})
				altRegionISO3 += tc.Alpha3
				altRegionIDs = append(altRegionIDs, uint16(isoOffset+i))
			}
		} else {
			// No 3-letter code: pad with two spaces. A single space would
			// make this entry shorter than the others and cause join to
			// panic.
			regionISO.s[i] += "  "
		}
		if d := m49map[isoOffset+i]; d != 0 {
			log.Panicf("%s found as a duplicate UN.M49 code of %03d", tc.Numeric, d)
		}
		m49map[isoOffset+i] = parseM49(tc.Numeric)
	}
	b.writeString("regionISO", regionISO.join())
	b.writeString("altRegionISO3", altRegionISO3)
	b.writeSlice("altRegionIDs", altRegionIDs)

	// 3-digit region lookup, groupings.
	for i := 0; i < isoOffset; i++ {
		m49map[i] = parseM49(b.region.s[i])
	}
	b.writeSlice("m49", m49map)
}
// writeLocale emits the compacted list of common locales as a string array
// called "locale".
func (b *builder) writeLocale() {
	locales := b.locale.slice()
	b.writeStringSlice("locale", locales)
}
// writeLanguageInfo is a placeholder: it currently writes nothing.
func (b *builder) writeLanguageInfo() {}
// writeCurrencies emits the currency table and the unknownCurrency constant.
//
// Every 3-letter currency code is extended by one byte that packs the
// rounding increment (upper 6 bits) and the number of decimal positions
// (lower 2 bits). Currencies without explicit fraction data get 2 decimal
// positions; a rounding of 0 is stored as 1.
func (b *builder) writeCurrencies() {
	// Resolve the index before the entries are extended below.
	unknown := b.currency.index("XXX")

	digits := map[string]uint64{}
	rounding := map[string]uint64{}
	for _, info := range b.supp.CurrencyData.Fractions[0].Info {
		d, err := strconv.ParseUint(info.Digits, 10, 2)
		failOnError(err)
		digits[info.Iso4217] = d

		r, err := strconv.ParseUint(info.Rounding, 10, 6)
		failOnError(err)
		rounding[info.Iso4217] = r
	}

	for i, cur := range b.currency.slice() {
		d, ok := digits[cur]
		if !ok {
			d = 2 // default number of decimal positions
		}
		r := rounding[cur]
		if r == 0 {
			r = 1 // default rounding increment in units 10^{-digits}
		}
		packed := byte(r<<2 + d)
		b.currency.s[i] += string([]byte{packed})
	}
	b.writeString("currency", b.currency.join())

	// Hack alert: gofmt indents a trailing comment after an indented string.
	// Write this constant after currency to force a proper indentation of
	// the final comment.
	b.writeConst("unknownCurrency", unknown)
}
// header is the preamble of the generated file. Its single %s verb receives
// the URL of the CLDR archive; the flag named in the command line must match
// the one registered for that URL ("cldr").
var header = `// Generated by running
// maketables -cldr=%s
// DO NOT EDIT
package locale
`
// main reads the CLDR data and writes the generated tables to the builder's
// output, followed by a summary line with the accumulated table size and a
// checksum of everything written through b.w.
func main() {
	flag.Parse()

	b := newBuilder(url)
	fmt.Fprintf(b.out, header, *url)

	b.parseIndices()
	b.writeLanguage()
	b.writeScript()
	b.writeRegion()
	// TODO: b.writeLocale()
	b.writeCurrencies()

	kib := float32(b.size) / 1024
	check := b.hash32.Sum32()
	fmt.Fprintf(b.out, "\n// Size: %.1fK (%d bytes); Check: %X\n", kib, b.size, check)
}