blob: a6cbcc6d64c13f2bd9a3e203bbd9555495426495 [file] [log] [blame]
Marcel van Lohuizenb8e57db2015-12-05 12:05:49 +01001// Copyright 2014 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
Russ Cox8f690f22021-02-19 18:54:44 -05005//go:build ignore
Marcel van Lohuizenb8e57db2015-12-05 12:05:49 +01006// +build ignore
7
8// Generator for display name tables.
9
10package main
11
12import (
13 "bytes"
14 "flag"
15 "fmt"
16 "log"
17 "reflect"
18 "sort"
19 "strings"
20
Marcel van Lohuizenb8e57db2015-12-05 12:05:49 +010021 "golang.org/x/text/internal/gen"
22 "golang.org/x/text/language"
Marcel van Lohuizen51beaed2015-12-07 10:14:37 +010023 "golang.org/x/text/unicode/cldr"
Marcel van Lohuizenb8e57db2015-12-05 12:05:49 +010024)
25
26var (
27 test = flag.Bool("test", false,
28 "test existing tables; can be used to compare web data with package data.")
29 outputFile = flag.String("output", "tables.go", "output file")
30
31 stats = flag.Bool("stats", false, "prints statistics to stderr")
32
33 short = flag.Bool("short", false, `Use "short" alternatives, when available.`)
34 draft = flag.String("draft",
35 "contributed",
36 `Minimal draft requirements (approved, contributed, provisional, unconfirmed).`)
37 pkg = flag.String("package",
38 "display",
39 "the name of the package in which the generated file is to be included")
40
41 tags = newTagSet("tags",
42 []language.Tag{},
43 "space-separated list of tags to include or empty for all")
44 dict = newTagSet("dict",
45 dictTags(),
46 "space-separated list or tags for which to include a Dictionary. "+
47 `"" means the common list from go.text/language.`)
48)
49
50func dictTags() (tag []language.Tag) {
51 // TODO: replace with language.Common.Tags() once supported.
52 const str = "af am ar ar-001 az bg bn ca cs da de el en en-US en-GB " +
53 "es es-ES es-419 et fa fi fil fr fr-CA gu he hi hr hu hy id is it ja " +
54 "ka kk km kn ko ky lo lt lv mk ml mn mr ms my ne nl no pa pl pt pt-BR " +
55 "pt-PT ro ru si sk sl sq sr sr-Latn sv sw ta te th tr uk ur uz vi " +
56 "zh zh-Hans zh-Hant zu"
57
58 for _, s := range strings.Split(str, " ") {
59 tag = append(tag, language.MustParse(s))
60 }
61 return tag
62}
63
64func main() {
65 gen.Init()
66
67 // Read the CLDR zip file.
68 r := gen.OpenCLDRCoreZip()
69 defer r.Close()
70
71 d := &cldr.Decoder{}
72 d.SetDirFilter("main", "supplemental")
73 d.SetSectionFilter("localeDisplayNames")
74 data, err := d.DecodeZip(r)
75 if err != nil {
76 log.Fatalf("DecodeZip: %v", err)
77 }
78
79 w := gen.NewCodeWriter()
80 defer w.WriteGoFile(*outputFile, "display")
81
82 gen.WriteCLDRVersion(w)
83
84 b := builder{
85 w: w,
86 data: data,
87 group: make(map[string]*group),
88 }
89 b.generate()
90}
91
// tagForm is the language package setting through which this generator parses
// every locale and language identifier (see the tagForm.Parse and
// tagForm.MustParse calls).
const tagForm = language.All
93
// tagSet is used to parse command line flags of tags. It implements the
// flag.Value interface. A tag is a member of the set if it maps to true.
type tagSet map[language.Tag]bool
97
98func newTagSet(name string, tags []language.Tag, usage string) tagSet {
99 f := tagSet(make(map[language.Tag]bool))
100 for _, t := range tags {
101 f[t] = true
102 }
103 flag.Var(f, name, usage)
104 return f
105}
106
107// String implements the String method of the flag.Value interface.
108func (f tagSet) String() string {
109 tags := []string{}
110 for t := range f {
111 tags = append(tags, t.String())
112 }
113 sort.Strings(tags)
114 return strings.Join(tags, " ")
115}
116
117// Set implements Set from the flag.Value interface.
118func (f tagSet) Set(s string) error {
119 if s != "" {
120 for _, s := range strings.Split(s, " ") {
121 if s != "" {
122 tag, err := tagForm.Parse(s)
123 if err != nil {
124 return err
125 }
126 f[tag] = true
127 }
128 }
129 }
130 return nil
131}
132
133func (f tagSet) contains(t language.Tag) bool {
134 if len(f) == 0 {
135 return true
136 }
137 return f[t]
138}
139
// builder is used to create all tables with display name information.
type builder struct {
	// w receives all generated code.
	w *gen.CodeWriter

	// data is the decoded CLDR data the tables are built from.
	data *cldr.CLDR

	// fromLocs is reset by setData.
	fromLocs []string

	// destination tags for the current locale.
	// Both fields are reset by setData.
	toTags     []string
	toTagIndex map[string]int

	// list of supported tags
	supported []language.Tag

	// key-value pairs per group
	// The map is keyed by group name, e.g. "lang", "script", "region", "self".
	group map[string]*group

	// statistics
	sizeIndex int // total size of all indexes of headers
	sizeData  int // total size of all data of headers
	totalSize int
}
163
// group holds the names collected for one kind of display name together with
// the table layout derived from them by writeGroup.
type group struct {
	// Maps from a given language to the Namer data for this language.
	lang map[language.Tag]keyValues
	// headers has one entry per supported tag; it is filled by writeGroup.
	headers []header

	// toTags is the sorted, de-duplicated list of all keys that occur in
	// lang; it is filled by writeGroup.
	toTags []string
	// NOTE(review): threeStart and fourPlusStart are not referenced in the
	// visible part of this file.
	threeStart    int
	fourPlusStart int
}
173
174// set sets the typ to the name for locale loc.
175func (g *group) set(t language.Tag, typ, name string) {
176 kv := g.lang[t]
177 if kv == nil {
178 kv = make(keyValues)
179 g.lang[t] = kv
180 }
181 if kv[typ] == "" {
182 kv[typ] = name
183 }
184}
185
// keyValues maps a key (the string form of a language tag, script or region)
// to its display name.
type keyValues map[string]string
187
// header is the table data of one supported tag within a group.
type header struct {
	// tag is the supported tag this header belongs to.
	tag language.Tag
	// data is the concatenation of the display names, in the order of the
	// group's toTags.
	data string
	// index holds, per key, the offset into data at which its name starts,
	// followed by a closing offset; trailing equal offsets are trimmed by
	// writeGroup.
	index []uint16
}
193
// versionInfo is the format string for the declaration of the deprecated
// Version constant in the generated file; generate fills in cldr.Version.
var versionInfo = `// Version is deprecated. Use CLDRVersion.
const Version = %#v

`

// self is the tag ("mul") under which generate stores the names of languages
// in their own language (the "self" group).
var self = language.MustParse("mul")
200
201// generate builds and writes all tables.
202func (b *builder) generate() {
203 fmt.Fprintf(b.w, versionInfo, cldr.Version)
204
205 b.filter()
206 b.setData("lang", func(g *group, loc language.Tag, ldn *cldr.LocaleDisplayNames) {
207 if ldn.Languages != nil {
208 for _, v := range ldn.Languages.Language {
Marcel van Lohuizenacd49d42017-09-06 11:14:54 +0200209 lang := v.Type
210 if lang == "root" {
211 // We prefer the data from "und"
212 // TODO: allow both the data for root and und somehow.
213 continue
214 }
215 tag := tagForm.MustParse(lang)
Marcel van Lohuizenb8e57db2015-12-05 12:05:49 +0100216 if tags.contains(tag) {
217 g.set(loc, tag.String(), v.Data())
218 }
219 }
220 }
221 })
222 b.setData("script", func(g *group, loc language.Tag, ldn *cldr.LocaleDisplayNames) {
223 if ldn.Scripts != nil {
224 for _, v := range ldn.Scripts.Script {
Marcel van Lohuizenf79ed802015-12-14 18:19:08 +0100225 code := language.MustParseScript(v.Type)
226 if code.IsPrivateUse() { // Qaaa..Qabx
227 // TODO: data currently appears to be very meager.
228 // Reconsider if we have data for English.
229 if loc == language.English {
230 log.Fatal("Consider including data for private use scripts.")
231 }
232 continue
233 }
234 g.set(loc, code.String(), v.Data())
Marcel van Lohuizenb8e57db2015-12-05 12:05:49 +0100235 }
236 }
237 })
238 b.setData("region", func(g *group, loc language.Tag, ldn *cldr.LocaleDisplayNames) {
239 if ldn.Territories != nil {
240 for _, v := range ldn.Territories.Territory {
241 g.set(loc, language.MustParseRegion(v.Type).String(), v.Data())
242 }
243 }
244 })
245
246 b.makeSupported()
247
248 b.writeParents()
249
250 b.writeGroup("lang")
251 b.writeGroup("script")
252 b.writeGroup("region")
253
254 b.w.WriteConst("numSupported", len(b.supported))
255 buf := bytes.Buffer{}
256 for _, tag := range b.supported {
257 fmt.Fprint(&buf, tag.String(), "|")
258 }
259 b.w.WriteConst("supported", buf.String())
260
261 b.writeDictionaries()
262
263 b.supported = []language.Tag{self}
264
265 // Compute the names of locales in their own language. Some of these names
266 // may be specified in their parent locales. We iterate the maximum depth
267 // of the parent three times to match successive parents of tags until a
268 // possible match is found.
269 for i := 0; i < 4; i++ {
270 b.setData("self", func(g *group, tag language.Tag, ldn *cldr.LocaleDisplayNames) {
271 parent := tag
272 if b, s, r := tag.Raw(); i > 0 && (s != language.Script{} && r == language.Region{}) {
273 parent, _ = language.Raw.Compose(b)
274 }
275 if ldn.Languages != nil {
276 for _, v := range ldn.Languages.Language {
277 key := tagForm.MustParse(v.Type)
278 saved := key
279 if key == parent {
280 g.set(self, tag.String(), v.Data())
281 }
282 for k := 0; k < i; k++ {
283 key = key.Parent()
284 }
285 if key == tag {
286 g.set(self, saved.String(), v.Data()) // set does not overwrite a value.
287 }
288 }
289 }
290 })
291 }
292
293 b.writeGroup("self")
294}
295
296func (b *builder) setData(name string, f func(*group, language.Tag, *cldr.LocaleDisplayNames)) {
297 b.sizeIndex = 0
298 b.sizeData = 0
299 b.toTags = nil
300 b.fromLocs = nil
301 b.toTagIndex = make(map[string]int)
302
303 g := b.group[name]
304 if g == nil {
305 g = &group{lang: make(map[language.Tag]keyValues)}
306 b.group[name] = g
307 }
308 for _, loc := range b.data.Locales() {
309 // We use RawLDML instead of LDML as we are managing our own inheritance
310 // in this implementation.
311 ldml := b.data.RawLDML(loc)
312
313 // We do not support the POSIX variant (it is not a supported BCP 47
314 // variant). This locale also doesn't happen to contain any data, so
315 // we'll skip it by checking for this.
316 tag, err := tagForm.Parse(loc)
317 if err != nil {
318 if ldml.LocaleDisplayNames != nil {
319 log.Fatalf("setData: %v", err)
320 }
321 continue
322 }
323 if ldml.LocaleDisplayNames != nil && tags.contains(tag) {
324 f(g, tag, ldml.LocaleDisplayNames)
325 }
326 }
327}
328
329func (b *builder) filter() {
330 filter := func(s *cldr.Slice) {
331 if *short {
332 s.SelectOnePerGroup("alt", []string{"short", ""})
333 } else {
334 s.SelectOnePerGroup("alt", []string{"stand-alone", ""})
335 }
336 d, err := cldr.ParseDraft(*draft)
337 if err != nil {
338 log.Fatalf("filter: %v", err)
339 }
340 s.SelectDraft(d)
341 }
342 for _, loc := range b.data.Locales() {
343 if ldn := b.data.RawLDML(loc).LocaleDisplayNames; ldn != nil {
344 if ldn.Languages != nil {
345 s := cldr.MakeSlice(&ldn.Languages.Language)
346 if filter(&s); len(ldn.Languages.Language) == 0 {
347 ldn.Languages = nil
348 }
349 }
350 if ldn.Scripts != nil {
351 s := cldr.MakeSlice(&ldn.Scripts.Script)
352 if filter(&s); len(ldn.Scripts.Script) == 0 {
353 ldn.Scripts = nil
354 }
355 }
356 if ldn.Territories != nil {
357 s := cldr.MakeSlice(&ldn.Territories.Territory)
358 if filter(&s); len(ldn.Territories.Territory) == 0 {
359 ldn.Territories = nil
360 }
361 }
362 }
363 }
364}
365
366// makeSupported creates a list of all supported locales.
367func (b *builder) makeSupported() {
368 // tags across groups
369 for _, g := range b.group {
370 for t, _ := range g.lang {
371 b.supported = append(b.supported, t)
372 }
373 }
374 b.supported = b.supported[:unique(tagsSorter(b.supported))]
375
376}
377
378type tagsSorter []language.Tag
379
380func (a tagsSorter) Len() int { return len(a) }
381func (a tagsSorter) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
382func (a tagsSorter) Less(i, j int) bool { return a[i].String() < a[j].String() }
383
384func (b *builder) writeGroup(name string) {
385 g := b.group[name]
386
387 for _, kv := range g.lang {
388 for t, _ := range kv {
389 g.toTags = append(g.toTags, t)
390 }
391 }
392 g.toTags = g.toTags[:unique(tagsBySize(g.toTags))]
393
394 // Allocate header per supported value.
395 g.headers = make([]header, len(b.supported))
396 for i, sup := range b.supported {
397 kv, ok := g.lang[sup]
398 if !ok {
399 g.headers[i].tag = sup
400 continue
401 }
402 data := []byte{}
403 index := make([]uint16, len(g.toTags), len(g.toTags)+1)
404 for j, t := range g.toTags {
405 index[j] = uint16(len(data))
406 data = append(data, kv[t]...)
407 }
408 index = append(index, uint16(len(data)))
409
410 // Trim the tail of the index.
411 // TODO: indexes can be reduced in size quite a bit more.
412 n := len(index)
413 for ; n >= 2 && index[n-2] == index[n-1]; n-- {
414 }
415 index = index[:n]
416
417 // Workaround for a bug in CLDR 26.
Kevin Burke647d7ef2018-08-04 08:55:54 -0700418 // See https://unicode.org/cldr/trac/ticket/8042.
Marcel van Lohuizenb8e57db2015-12-05 12:05:49 +0100419 if cldr.Version == "26" && sup.String() == "hsb" {
420 data = bytes.Replace(data, []byte{'"'}, nil, 1)
421 }
422 g.headers[i] = header{sup, string(data), index}
423 }
424 g.writeTable(b.w, name)
425}
426
// tagsBySize implements sort.Interface for a list of keys. Keys of at most
// four bytes are ordered by length first; everything else is ordered
// alphabetically.
type tagsBySize []string

// Len returns the number of keys.
func (l tagsBySize) Len() int {
	return len(l)
}

// Swap exchanges the keys at positions i and j.
func (l tagsBySize) Swap(i, j int) {
	l[i], l[j] = l[j], l[i]
}

// Less reports whether key i sorts before key j.
func (l tagsBySize) Less(i, j int) bool {
	x, y := l[i], l[j]
	// Sort single-tag entries based on size first. Otherwise alphabetic.
	anyShort := len(x) <= 4 || len(y) <= 4
	if anyShort && len(x) != len(y) {
		return len(x) < len(y)
	}
	return x < y
}
439
440// parentIndices returns slice a of len(tags) where tags[a[i]] is the parent
441// of tags[i].
442func parentIndices(tags []language.Tag) []int16 {
443 index := make(map[language.Tag]int16)
444 for i, t := range tags {
445 index[t] = int16(i)
446 }
447
448 // Construct default parents.
449 parents := make([]int16, len(tags))
450 for i, t := range tags {
451 parents[i] = -1
452 for t = t.Parent(); t != language.Und; t = t.Parent() {
453 if j, ok := index[t]; ok {
454 parents[i] = j
455 break
456 }
457 }
458 }
459 return parents
460}
461
462func (b *builder) writeParents() {
463 parents := parentIndices(b.supported)
464 fmt.Fprintf(b.w, "var parents = ")
465 b.w.WriteArray(parents)
466}
467
468// writeKeys writes keys to a special index used by the display package.
469// tags are assumed to be sorted by length.
470func writeKeys(w *gen.CodeWriter, name string, keys []string) {
471 w.Size += int(3 * reflect.TypeOf("").Size())
472 w.WriteComment("Number of keys: %d", len(keys))
473 fmt.Fprintf(w, "var (\n\t%sIndex = tagIndex{\n", name)
474 for i := 2; i <= 4; i++ {
475 sub := []string{}
476 for _, t := range keys {
477 if len(t) != i {
478 break
479 }
480 sub = append(sub, t)
481 }
482 s := strings.Join(sub, "")
483 w.WriteString(s)
484 fmt.Fprintf(w, ",\n")
485 keys = keys[len(sub):]
486 }
487 fmt.Fprintln(w, "\t}")
488 if len(keys) > 0 {
489 w.Size += int(reflect.TypeOf([]string{}).Size())
490 fmt.Fprintf(w, "\t%sTagsLong = ", name)
491 w.WriteSlice(keys)
492 }
493 fmt.Fprintln(w, ")\n")
494}
495
496// identifier creates an identifier from the given tag.
497func identifier(t language.Tag) string {
498 return strings.Replace(t.String(), "-", "", -1)
499}
500
501func (h *header) writeEntry(w *gen.CodeWriter, name string) {
502 if len(dict) > 0 && dict.contains(h.tag) {
503 fmt.Fprintf(w, "\t{ // %s\n", h.tag)
504 fmt.Fprintf(w, "\t\t%[1]s%[2]sStr,\n\t\t%[1]s%[2]sIdx,\n", identifier(h.tag), name)
505 fmt.Fprintln(w, "\t},")
506 } else if len(h.data) == 0 {
507 fmt.Fprintln(w, "\t\t{}, //", h.tag)
508 } else {
509 fmt.Fprintf(w, "\t{ // %s\n", h.tag)
510 w.WriteString(h.data)
511 fmt.Fprintln(w, ",")
512 w.WriteSlice(h.index)
513 fmt.Fprintln(w, ",\n\t},")
514 }
515}
516
517// write the data for the given header as single entries. The size for this data
518// was already accounted for in writeEntry.
519func (h *header) writeSingle(w *gen.CodeWriter, name string) {
520 if len(dict) > 0 && dict.contains(h.tag) {
521 tag := identifier(h.tag)
522 w.WriteConst(tag+name+"Str", h.data)
523
524 // Note that we create a slice instead of an array. If we use an array
525 // we need to refer to it as a[:] in other tables, which will cause the
526 // array to always be included by the linker. See Issue 7651.
527 w.WriteVar(tag+name+"Idx", h.index)
528 }
529}
530
531// WriteTable writes an entry for a single Namer.
532func (g *group) writeTable(w *gen.CodeWriter, name string) {
533 start := w.Size
534 writeKeys(w, name, g.toTags)
535 w.Size += len(g.headers) * int(reflect.ValueOf(g.headers[0]).Type().Size())
536
537 fmt.Fprintf(w, "var %sHeaders = [%d]header{\n", name, len(g.headers))
538
539 title := strings.Title(name)
540 for _, h := range g.headers {
541 h.writeEntry(w, title)
542 }
543 fmt.Fprintln(w, "}\n")
544
545 for _, h := range g.headers {
546 h.writeSingle(w, title)
547 }
548 n := w.Size - start
549 fmt.Fprintf(w, "// Total size for %s: %d bytes (%d KB)\n\n", name, n, n/1000)
550}
551
552func (b *builder) writeDictionaries() {
553 fmt.Fprintln(b.w, "// Dictionary entries of frequent languages")
554 fmt.Fprintln(b.w, "var (")
555 parents := parentIndices(b.supported)
556
557 for i, t := range b.supported {
558 if dict.contains(t) {
559 ident := identifier(t)
560 fmt.Fprintf(b.w, "\t%s = Dictionary{ // %s\n", ident, t)
561 if p := parents[i]; p == -1 {
562 fmt.Fprintln(b.w, "\t\tnil,")
563 } else {
564 fmt.Fprintf(b.w, "\t\t&%s,\n", identifier(b.supported[p]))
565 }
566 fmt.Fprintf(b.w, "\t\theader{%[1]sLangStr, %[1]sLangIdx},\n", ident)
567 fmt.Fprintf(b.w, "\t\theader{%[1]sScriptStr, %[1]sScriptIdx},\n", ident)
568 fmt.Fprintf(b.w, "\t\theader{%[1]sRegionStr, %[1]sRegionIdx},\n", ident)
569 fmt.Fprintln(b.w, "\t}")
570 }
571 }
572 fmt.Fprintln(b.w, ")")
573
574 var s string
575 var a []uint16
576 sz := reflect.TypeOf(s).Size()
577 sz += reflect.TypeOf(a).Size()
578 sz *= 3
579 sz += reflect.TypeOf(&a).Size()
580 n := int(sz) * len(dict)
581 fmt.Fprintf(b.w, "// Total size for %d entries: %d bytes (%d KB)\n\n", len(dict), n, n/1000)
582
583 b.w.Size += n
584}
585
// unique sorts the given list and removes duplicate entries by swapping them
// past position k, where k is the number of unique values. It returns k. Two
// entries count as duplicates if neither is Less than the other.
func unique(a sort.Interface) int {
	n := a.Len()
	if n == 0 {
		return 0
	}
	sort.Sort(a)
	// last is the position of the most recently placed unique entry.
	last := 0
	for i := 1; i < n; i++ {
		if !a.Less(last, i) {
			continue // entry i duplicates the entry at last
		}
		last++
		if last != i {
			a.Swap(last, i)
		}
	}
	return last + 1
}