| // Copyright 2017 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| // Package cldrtree builds and generates a CLDR index file, including all |
| // inheritance. |
| package cldrtree |
| |
| //go:generate go test -gen |
| |
| // cldrtree stores CLDR data in a tree-like structure called Tree. In the CLDR |
| // data each branch in the tree is indicated by either an element name or an |
| // attribute value. A Tree does not distinguish between these two cases, but |
| // rather assumes that all branches can be accessed by an enum with a compact |
| // range of positive integer values starting from 0. |
| // |
| // Each Tree consists of three parts: |
| // - a slice mapping compact language identifiers to an offset into a set of |
| // indices, |
| // - a set of indices, stored as a large blob of uint16 values that encode |
| // the actual tree structure of data, and |
| // - a set of buckets that each holds a collection of strings. |
| // each of which is explained in more detail below. |
| // |
| // |
| // Tree lookup |
| // A tree lookup is done by providing a locale and a "path", which is a |
| // sequence of enum values. The search starts with getting the index for the |
| // given locale and then incrementally jumping into the index using the path |
| // values. If an element cannot be found in the index, the search starts anew |
| // for the locale's parent locale. The path may change during lookup by means |
| // of aliasing, described below. |
| // |
| // Buckets |
| // Buckets hold the actual string data of the leaf values of the CLDR tree. |
| // This data is stored in buckets, rather than one large string, for multiple |
| // reasons: |
// - it allows representing leaf values more compactly, by storing all leaf
//   values in a single bucket and then needing only a uint16 to index
//   into this bucket for all leaf values,
| // - (TBD) allow multiple trees to share subsets of buckets, mostly to allow |
| // linking in a smaller amount of data if only a subset of the buckets is |
| // needed, |
| // - to be nice to go fmt and the compiler. |
| // |
| // indices |
| // An index is a slice of uint16 for which the values are interpreted in one of |
| // two ways: as a node or a set of leaf values. |
| // A set of leaf values has the following form: |
| // <max_size>, <bucket>, <offset>... |
| // max_size indicates the maximum enum value for which an offset is defined. |
| // An offset value of 0xFFFF (missingValue) also indicates an undefined value. |
// If defined, offset indicates the offset within the given bucket of the string.
| // A node value has the following form: |
| // <max_size>, <offset_or_alias>... |
| // max_size indicates the maximum value for which an offset is defined. |
| // A missing offset may also be indicated with 0. If the high bit (0x8000, or |
| // inheritMask) is not set, the offset points to the offset within the index |
| // for the current locale. |
| // An offset with high bit set is an alias. In this case the uint16 has the form |
| // bits: |
| // 15: 1 |
| // 14-12: negative offset into path relative to current position |
| // 0-11: new enum value for path element. |
| // On encountering an alias, the path is modified accordingly and the lookup is |
| // restarted for the given locale. |
| |
| import ( |
| "fmt" |
| "reflect" |
| "regexp" |
| "strings" |
| "unicode/utf8" |
| |
| "golang.org/x/text/internal/gen" |
| "golang.org/x/text/language" |
| "golang.org/x/text/unicode/cldr" |
| ) |
| |
| // TODO: |
| // - allow two Trees to share the same set of buckets. |
| |
| // A Builder allows storing CLDR data in compact form. |
| type Builder struct { |
| table []string |
| |
| rootMeta *metaData |
| locales []locale |
| strToBucket map[string]stringInfo |
| buckets [][]byte |
| enums []*enum |
| err error |
| |
| // Stats |
| size int |
| sizeAll int |
| bucketWaste int |
| } |
| |
// Limits that apply to the string storage.
const (
	// maxBucketSize is the size (8K) at which addString stops appending to
	// the current bucket and starts a new one.
	maxBucketSize = 8 << 10

	// maxStrlen is the longest string that can be stored; the length must
	// fit in a single byte while leaving 0xFF free as a sentinel.
	maxStrlen = 254
)
| |
| func (b *Builder) setError(err error) { |
| if b.err == nil { |
| b.err = err |
| } |
| } |
| |
| func (b *Builder) addString(data string) stringInfo { |
| data = b.makeString(data) |
| info, ok := b.strToBucket[data] |
| if !ok { |
| b.size += len(data) |
| x := len(b.buckets) - 1 |
| bucket := b.buckets[x] |
| if len(bucket)+len(data) < maxBucketSize { |
| info.bucket = uint16(x) |
| info.bucketPos = uint16(len(bucket)) |
| b.buckets[x] = append(bucket, data...) |
| } else { |
| info.bucket = uint16(len(b.buckets)) |
| info.bucketPos = 0 |
| b.buckets = append(b.buckets, []byte(data)) |
| } |
| b.strToBucket[data] = info |
| } |
| return info |
| } |
| |
| func (b *Builder) addStringToBucket(data string, bucket uint16) stringInfo { |
| data = b.makeString(data) |
| info, ok := b.strToBucket[data] |
| if !ok || info.bucket != bucket { |
| if ok { |
| b.bucketWaste += len(data) |
| } |
| b.size += len(data) |
| bk := b.buckets[bucket] |
| info.bucket = bucket |
| info.bucketPos = uint16(len(bk)) |
| b.buckets[bucket] = append(bk, data...) |
| b.strToBucket[data] = info |
| } |
| return info |
| } |
| |
| func (b *Builder) makeString(data string) string { |
| if len(data) > maxStrlen { |
| b.setError(fmt.Errorf("string %q exceeds maximum length of %d", data, maxStrlen)) |
| data = data[:maxStrlen] |
| for i := len(data) - 1; i > len(data)-4; i-- { |
| if utf8.RuneStart(data[i]) { |
| data = data[:i] |
| break |
| } |
| } |
| } |
| data = string([]byte{byte(len(data))}) + data |
| b.sizeAll += len(data) |
| return data |
| } |
| |
// stringInfo records where a length-prefixed string has been stored.
type stringInfo struct {
	// NOTE(review): bufferPos is not written by the visible code; confirm
	// whether it is still in use.
	bufferPos uint32
	// bucket is the index of the bucket holding the string.
	bucket uint16
	// bucketPos is the byte offset of the string within that bucket.
	bucketPos uint16
}
| |
| // New creates a new Builder. |
| func New(tableName string) *Builder { |
| b := &Builder{ |
| strToBucket: map[string]stringInfo{}, |
| buckets: [][]byte{nil}, // initialize with first bucket. |
| } |
| b.rootMeta = &metaData{ |
| b: b, |
| typeInfo: &typeInfo{}, |
| } |
| return b |
| } |
| |
| // Gen writes all the tables and types for the collected data. |
| func (b *Builder) Gen(w *gen.CodeWriter) error { |
| t, err := build(b) |
| if err != nil { |
| return err |
| } |
| return generate(b, t, w) |
| } |
| |
| // GenTestData generates tables useful for testing data generated with Gen. |
| func (b *Builder) GenTestData(w *gen.CodeWriter) error { |
| return generateTestData(b, w) |
| } |
| |
| type locale struct { |
| tag language.Tag |
| root *Index |
| } |
| |
| // Locale creates an index for the given locale. |
| func (b *Builder) Locale(t language.Tag) *Index { |
| index := &Index{ |
| meta: b.rootMeta, |
| } |
| b.locales = append(b.locales, locale{tag: t, root: index}) |
| return index |
| } |
| |
| // An Index holds a map of either leaf values or other indices. |
| type Index struct { |
| meta *metaData |
| |
| subIndex []*Index |
| values []keyValue |
| } |
| |
| func (i *Index) setError(err error) { i.meta.b.setError(err) } |
| |
| type keyValue struct { |
| key enumIndex |
| value stringInfo |
| } |
| |
| // Element is a CLDR XML element. |
| type Element interface { |
| GetCommon() *cldr.Common |
| } |
| |
| // Index creates a subindex where the type and enum values are not shared |
| // with siblings by default. The name is derived from the elem. If elem is |
| // an alias reference, the alias will be resolved and linked. If elem is nil |
| // Index returns nil. |
| func (i *Index) Index(elem Element, opt ...Option) *Index { |
| if elem == nil || reflect.ValueOf(elem).IsNil() { |
| return nil |
| } |
| c := elem.GetCommon() |
| o := &options{ |
| parent: i, |
| name: c.GetCommon().Element(), |
| } |
| o.fill(opt) |
| o.setAlias(elem) |
| return i.subIndexForKey(o) |
| } |
| |
| // IndexWithName is like Section but derives the name from the given name. |
| func (i *Index) IndexWithName(name string, opt ...Option) *Index { |
| o := &options{parent: i, name: name} |
| o.fill(opt) |
| return i.subIndexForKey(o) |
| } |
| |
| // IndexFromType creates a subindex the value of tye type attribute as key. It |
| // will also configure the Index to share the enumeration values with all |
| // sibling values. If elem is an alias, it will be resolved and linked. |
| func (i *Index) IndexFromType(elem Element, opts ...Option) *Index { |
| o := &options{ |
| parent: i, |
| name: elem.GetCommon().Type, |
| } |
| o.fill(opts) |
| o.setAlias(elem) |
| useSharedType()(o) |
| return i.subIndexForKey(o) |
| } |
| |
| // IndexFromAlt creates a subindex the value of tye alt attribute as key. It |
| // will also configure the Index to share the enumeration values with all |
| // sibling values. If elem is an alias, it will be resolved and linked. |
| func (i *Index) IndexFromAlt(elem Element, opts ...Option) *Index { |
| o := &options{ |
| parent: i, |
| name: elem.GetCommon().Alt, |
| } |
| o.fill(opts) |
| o.setAlias(elem) |
| useSharedType()(o) |
| return i.subIndexForKey(o) |
| } |
| |
| func (i *Index) subIndexForKey(opts *options) *Index { |
| key := opts.name |
| if len(i.values) > 0 { |
| panic(fmt.Errorf("cldrtree: adding Index for %q when value already exists", key)) |
| } |
| meta := i.meta.sub(key, opts) |
| for _, x := range i.subIndex { |
| if x.meta == meta { |
| return x |
| } |
| } |
| if alias := opts.alias; alias != nil { |
| if a := alias.GetCommon().Alias; a != nil { |
| if a.Source != "locale" { |
| i.setError(fmt.Errorf("cldrtree: non-locale alias not supported %v", a.Path)) |
| } |
| if meta.inheritOffset < 0 { |
| i.setError(fmt.Errorf("cldrtree: alias was already set %v", a.Path)) |
| } |
| path := a.Path |
| for ; strings.HasPrefix(path, "../"); path = path[len("../"):] { |
| meta.inheritOffset-- |
| } |
| m := aliasRe.FindStringSubmatch(path) |
| if m == nil { |
| i.setError(fmt.Errorf("cldrtree: could not parse alias %q", a.Path)) |
| } else { |
| key := m[4] |
| if key == "" { |
| key = m[1] |
| } |
| meta.inheritIndex = key |
| } |
| } |
| } |
| x := &Index{meta: meta} |
| i.subIndex = append(i.subIndex, x) |
| return x |
| } |
| |
// aliasRe matches the first segment of an alias path: an element name,
// optionally followed by a single [@attr='value'] selector. Submatch 1 is the
// element name, submatch 3 the attribute name and submatch 4 the attribute
// value.
var aliasRe = regexp.MustCompile(`^([a-zA-Z]+)(\[@([a-zA-Z-]+)='([a-zA-Z-]+)'\])?`)
| |
| // SetValue sets the value, the data from a CLDR XML element, for the given key. |
| func (i *Index) SetValue(key string, value Element, opt ...Option) { |
| if len(i.subIndex) > 0 { |
| panic(fmt.Errorf("adding value for key %q when index already exists", key)) |
| } |
| o := &options{parent: i} |
| o.fill(opt) |
| c := value.GetCommon() |
| if c.Alias != nil { |
| i.setError(fmt.Errorf("cldrtree: alias not supported for SetValue %v", c.Alias.Path)) |
| } |
| i.setValue(key, c.Data(), o) |
| } |
| |
| func (i *Index) setValue(key, data string, o *options) { |
| index, _ := i.meta.typeInfo.lookupSubtype(key, o) |
| kv := keyValue{key: index} |
| if len(i.values) > 0 { |
| // Add string to the same bucket as the other values. |
| bucket := i.values[0].value.bucket |
| kv.value = i.meta.b.addStringToBucket(data, bucket) |
| } else { |
| kv.value = i.meta.b.addString(data) |
| } |
| i.values = append(i.values, kv) |
| } |