| // Copyright 2013 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| package language |
| |
| import ( |
| "bytes" |
| "fmt" |
| "sort" |
| "strconv" |
| |
| "golang.org/x/text/internal/tag" |
| ) |
| |
| // findIndex tries to find the given tag in idx and returns a standardized error |
| // if it could not be found. |
| func findIndex(idx tag.Index, key []byte, form string) (index int, err error) { |
| if !tag.FixCase(form, key) { |
| return 0, ErrSyntax |
| } |
| i := idx.Index(key) |
| if i == -1 { |
| return 0, NewValueError(key) |
| } |
| return i, nil |
| } |
| |
// searchUint returns the smallest index i for which imap[i] >= key, or
// len(imap) if no element satisfies that. imap must be sorted in ascending
// order for the binary search to be meaningful.
func searchUint(imap []uint16, key uint16) int {
	atLeastKey := func(i int) bool { return key <= imap[i] }
	return sort.Search(len(imap), atLeastKey)
}
| |
// Language is a compact numeric identifier for a language subtag.
// The zero value is rendered as "und" by String.
type Language uint16
| |
| // getLangID returns the langID of s if s is a canonical subtag |
| // or langUnknown if s is not a canonical subtag. |
| func getLangID(s []byte) (Language, error) { |
| if len(s) == 2 { |
| return getLangISO2(s) |
| } |
| return getLangISO3(s) |
| } |
| |
| // TODO language normalization as well as the AliasMaps could be moved to the |
| // higher level package, but it is a bit tricky to separate the generation. |
| |
| func (id Language) Canonicalize() (Language, AliasType) { |
| return normLang(id) |
| } |
| |
| // mapLang returns the mapped langID of id according to mapping m. |
| func normLang(id Language) (Language, AliasType) { |
| k := sort.Search(len(AliasMap), func(i int) bool { |
| return AliasMap[i].From >= uint16(id) |
| }) |
| if k < len(AliasMap) && AliasMap[k].From == uint16(id) { |
| return Language(AliasMap[k].To), AliasTypes[k] |
| } |
| return id, AliasTypeUnknown |
| } |
| |
| // getLangISO2 returns the langID for the given 2-letter ISO language code |
| // or unknownLang if this does not exist. |
| func getLangISO2(s []byte) (Language, error) { |
| if !tag.FixCase("zz", s) { |
| return 0, ErrSyntax |
| } |
| if i := lang.Index(s); i != -1 && lang.Elem(i)[3] != 0 { |
| return Language(i), nil |
| } |
| return 0, NewValueError(s) |
| } |
| |
// base is the number of letters in 'a'..'z'; it is the radix shared by
// strToInt and intToStr.
const base = 'z' - 'a' + 1

// strToInt reads s as a big-endian number in radix base, where each byte
// contributes the digit s[i]-'a'. The subtraction is done in byte arithmetic,
// so callers are expected to pass lower-case ASCII letters only.
func strToInt(s []byte) uint {
	var n uint
	for _, c := range s {
		n = n*base + uint(c-'a')
	}
	return n
}

// intToStr is the inverse of strToInt: it fills s, from the last byte to the
// first, with the radix-base digits of v mapped back onto 'a'..'z'.
// len(s) must match the number of characters originally passed to strToInt.
func intToStr(v uint, s []byte) {
	for pos := len(s); pos > 0; pos-- {
		s[pos-1] = 'a' + byte(v%base)
		v /= base
	}
}
| |
| // getLangISO3 returns the langID for the given 3-letter ISO language code |
| // or unknownLang if this does not exist. |
| func getLangISO3(s []byte) (Language, error) { |
| if tag.FixCase("und", s) { |
| // first try to match canonical 3-letter entries |
| for i := lang.Index(s[:2]); i != -1; i = lang.Next(s[:2], i) { |
| if e := lang.Elem(i); e[3] == 0 && e[2] == s[2] { |
| // We treat "und" as special and always translate it to "unspecified". |
| // Note that ZZ and Zzzz are private use and are not treated as |
| // unspecified by default. |
| id := Language(i) |
| if id == nonCanonicalUnd { |
| return 0, nil |
| } |
| return id, nil |
| } |
| } |
| if i := altLangISO3.Index(s); i != -1 { |
| return Language(altLangIndex[altLangISO3.Elem(i)[3]]), nil |
| } |
| n := strToInt(s) |
| if langNoIndex[n/8]&(1<<(n%8)) != 0 { |
| return Language(n) + langNoIndexOffset, nil |
| } |
| // Check for non-canonical uses of ISO3. |
| for i := lang.Index(s[:1]); i != -1; i = lang.Next(s[:1], i) { |
| if e := lang.Elem(i); e[2] == s[1] && e[3] == s[2] { |
| return Language(i), nil |
| } |
| } |
| return 0, NewValueError(s) |
| } |
| return 0, ErrSyntax |
| } |
| |
| // StringToBuf writes the string to b and returns the number of bytes |
| // written. cap(b) must be >= 3. |
| func (id Language) StringToBuf(b []byte) int { |
| if id >= langNoIndexOffset { |
| intToStr(uint(id)-langNoIndexOffset, b[:3]) |
| return 3 |
| } else if id == 0 { |
| return copy(b, "und") |
| } |
| l := lang[id<<2:] |
| if l[3] == 0 { |
| return copy(b, l[:3]) |
| } |
| return copy(b, l[:2]) |
| } |
| |
| // String returns the BCP 47 representation of the langID. |
| // Use b as variable name, instead of id, to ensure the variable |
| // used is consistent with that of Base in which this type is embedded. |
| func (b Language) String() string { |
| if b == 0 { |
| return "und" |
| } else if b >= langNoIndexOffset { |
| b -= langNoIndexOffset |
| buf := [3]byte{} |
| intToStr(uint(b), buf[:]) |
| return string(buf[:]) |
| } |
| l := lang.Elem(int(b)) |
| if l[3] == 0 { |
| return l[:3] |
| } |
| return l[:2] |
| } |
| |
| // ISO3 returns the ISO 639-3 language code. |
| func (b Language) ISO3() string { |
| if b == 0 || b >= langNoIndexOffset { |
| return b.String() |
| } |
| l := lang.Elem(int(b)) |
| if l[3] == 0 { |
| return l[:3] |
| } else if l[2] == 0 { |
| return altLangISO3.Elem(int(l[3]))[:3] |
| } |
| // This allocation will only happen for 3-letter ISO codes |
| // that are non-canonical BCP 47 language identifiers. |
| return l[0:1] + l[2:4] |
| } |
| |
| // IsPrivateUse reports whether this language code is reserved for private use. |
| func (b Language) IsPrivateUse() bool { |
| return langPrivateStart <= b && b <= langPrivateEnd |
| } |
| |
| // SuppressScript returns the script marked as SuppressScript in the IANA |
| // language tag repository, or 0 if there is no such script. |
| func (b Language) SuppressScript() Script { |
| if b < langNoIndexOffset { |
| return Script(suppressScript[b]) |
| } |
| return 0 |
| } |
| |
// Region is a compact numeric identifier for a region.
// The zero value is rendered as "ZZ" by String.
type Region uint16
| |
| // getRegionID returns the region id for s if s is a valid 2-letter region code |
| // or unknownRegion. |
| func getRegionID(s []byte) (Region, error) { |
| if len(s) == 3 { |
| if isAlpha(s[0]) { |
| return getRegionISO3(s) |
| } |
| if i, err := strconv.ParseUint(string(s), 10, 10); err == nil { |
| return getRegionM49(int(i)) |
| } |
| } |
| return getRegionISO2(s) |
| } |
| |
| // getRegionISO2 returns the regionID for the given 2-letter ISO country code |
| // or unknownRegion if this does not exist. |
| func getRegionISO2(s []byte) (Region, error) { |
| i, err := findIndex(regionISO, s, "ZZ") |
| if err != nil { |
| return 0, err |
| } |
| return Region(i) + isoRegionOffset, nil |
| } |
| |
| // getRegionISO3 returns the regionID for the given 3-letter ISO country code |
| // or unknownRegion if this does not exist. |
| func getRegionISO3(s []byte) (Region, error) { |
| if tag.FixCase("ZZZ", s) { |
| for i := regionISO.Index(s[:1]); i != -1; i = regionISO.Next(s[:1], i) { |
| if e := regionISO.Elem(i); e[2] == s[1] && e[3] == s[2] { |
| return Region(i) + isoRegionOffset, nil |
| } |
| } |
| for i := 0; i < len(altRegionISO3); i += 3 { |
| if tag.Compare(altRegionISO3[i:i+3], s) == 0 { |
| return Region(altRegionIDs[i/3]), nil |
| } |
| } |
| return 0, NewValueError(s) |
| } |
| return 0, ErrSyntax |
| } |
| |
| func getRegionM49(n int) (Region, error) { |
| if 0 < n && n <= 999 { |
| const ( |
| searchBits = 7 |
| regionBits = 9 |
| regionMask = 1<<regionBits - 1 |
| ) |
| idx := n >> searchBits |
| buf := fromM49[m49Index[idx]:m49Index[idx+1]] |
| val := uint16(n) << regionBits // we rely on bits shifting out |
| i := sort.Search(len(buf), func(i int) bool { |
| return buf[i] >= val |
| }) |
| if r := fromM49[int(m49Index[idx])+i]; r&^regionMask == val { |
| return Region(r & regionMask), nil |
| } |
| } |
| var e ValueError |
| fmt.Fprint(bytes.NewBuffer([]byte(e.v[:])), n) |
| return 0, e |
| } |
| |
| // normRegion returns a region if r is deprecated or 0 otherwise. |
| // TODO: consider supporting BYS (-> BLR), CSK (-> 200 or CZ), PHI (-> PHL) and AFI (-> DJ). |
| // TODO: consider mapping split up regions to new most populous one (like CLDR). |
| func normRegion(r Region) Region { |
| m := regionOldMap |
| k := sort.Search(len(m), func(i int) bool { |
| return m[i].From >= uint16(r) |
| }) |
| if k < len(m) && m[k].From == uint16(r) { |
| return Region(m[k].To) |
| } |
| return 0 |
| } |
| |
// Bit flags for the per-region type byte returned by Region.typ. The values
// are 1, 2 and 4, so they can be combined in a single byte.
const (
	iso3166UserAssigned = 1 << iota // tested by Region.IsPrivateUse
	ccTLD
	bcp47Region
)
| |
| func (r Region) typ() byte { |
| return regionTypes[r] |
| } |
| |
| // String returns the BCP 47 representation for the region. |
| // It returns "ZZ" for an unspecified region. |
| func (r Region) String() string { |
| if r < isoRegionOffset { |
| if r == 0 { |
| return "ZZ" |
| } |
| return fmt.Sprintf("%03d", r.M49()) |
| } |
| r -= isoRegionOffset |
| return regionISO.Elem(int(r))[:2] |
| } |
| |
| // ISO3 returns the 3-letter ISO code of r. |
| // Note that not all regions have a 3-letter ISO code. |
| // In such cases this method returns "ZZZ". |
| func (r Region) ISO3() string { |
| if r < isoRegionOffset { |
| return "ZZZ" |
| } |
| r -= isoRegionOffset |
| reg := regionISO.Elem(int(r)) |
| switch reg[2] { |
| case 0: |
| return altRegionISO3[reg[3]:][:3] |
| case ' ': |
| return "ZZZ" |
| } |
| return reg[0:1] + reg[2:4] |
| } |
| |
| // M49 returns the UN M.49 encoding of r, or 0 if this encoding |
| // is not defined for r. |
| func (r Region) M49() int { |
| return int(m49[r]) |
| } |
| |
| // IsPrivateUse reports whether r has the ISO 3166 User-assigned status. This |
| // may include private-use tags that are assigned by CLDR and used in this |
| // implementation. So IsPrivateUse and IsCountry can be simultaneously true. |
| func (r Region) IsPrivateUse() bool { |
| return r.typ()&iso3166UserAssigned != 0 |
| } |
| |
// Script is a compact numeric identifier for a script.
// The zero value is rendered as "Zzzz" by String.
type Script uint8
| |
| // getScriptID returns the script id for string s. It assumes that s |
| // is of the format [A-Z][a-z]{3}. |
| func getScriptID(idx tag.Index, s []byte) (Script, error) { |
| i, err := findIndex(idx, s, "Zzzz") |
| return Script(i), err |
| } |
| |
| // String returns the script code in title case. |
| // It returns "Zzzz" for an unspecified script. |
| func (s Script) String() string { |
| if s == 0 { |
| return "Zzzz" |
| } |
| return script.Elem(int(s)) |
| } |
| |
| // IsPrivateUse reports whether this script code is reserved for private use. |
| func (s Script) IsPrivateUse() bool { |
| return _Qaaa <= s && s <= _Qabx |
| } |
| |
const (
	// maxAltTaglen is the length of the longest key in grandfatheredMap.
	maxAltTaglen = len("en-US-POSIX")
	// maxLen is the size of the fixed-width byte array used as the key type
	// of grandfatheredMap.
	maxLen = maxAltTaglen
)
| |
var (
	// grandfatheredMap holds a mapping from legacy and grandfathered tags to
	// their base language or index to more elaborate tag.
	//
	// Keys are the lower-cased tag, zero-padded to maxLen bytes. A
	// non-negative value is a language id. A negative value -k selects the
	// k-th (1-based) replacement tag in altTags, see grandfathered.
	grandfatheredMap = map[[maxLen]byte]int16{
		[maxLen]byte{'a', 'r', 't', '-', 'l', 'o', 'j', 'b', 'a', 'n'}: _jbo, // art-lojban
		[maxLen]byte{'i', '-', 'a', 'm', 'i'}: _ami, // i-ami
		[maxLen]byte{'i', '-', 'b', 'n', 'n'}: _bnn, // i-bnn
		[maxLen]byte{'i', '-', 'h', 'a', 'k'}: _hak, // i-hak
		[maxLen]byte{'i', '-', 'k', 'l', 'i', 'n', 'g', 'o', 'n'}: _tlh, // i-klingon
		[maxLen]byte{'i', '-', 'l', 'u', 'x'}: _lb, // i-lux
		[maxLen]byte{'i', '-', 'n', 'a', 'v', 'a', 'j', 'o'}: _nv, // i-navajo
		[maxLen]byte{'i', '-', 'p', 'w', 'n'}: _pwn, // i-pwn
		[maxLen]byte{'i', '-', 't', 'a', 'o'}: _tao, // i-tao
		[maxLen]byte{'i', '-', 't', 'a', 'y'}: _tay, // i-tay
		[maxLen]byte{'i', '-', 't', 's', 'u'}: _tsu, // i-tsu
		[maxLen]byte{'n', 'o', '-', 'b', 'o', 'k'}: _nb, // no-bok
		[maxLen]byte{'n', 'o', '-', 'n', 'y', 'n'}: _nn, // no-nyn
		[maxLen]byte{'s', 'g', 'n', '-', 'b', 'e', '-', 'f', 'r'}: _sfb, // sgn-BE-FR
		[maxLen]byte{'s', 'g', 'n', '-', 'b', 'e', '-', 'n', 'l'}: _vgt, // sgn-BE-NL
		[maxLen]byte{'s', 'g', 'n', '-', 'c', 'h', '-', 'd', 'e'}: _sgg, // sgn-CH-DE
		[maxLen]byte{'z', 'h', '-', 'g', 'u', 'o', 'y', 'u'}: _cmn, // zh-guoyu
		[maxLen]byte{'z', 'h', '-', 'h', 'a', 'k', 'k', 'a'}: _hak, // zh-hakka
		[maxLen]byte{'z', 'h', '-', 'm', 'i', 'n', '-', 'n', 'a', 'n'}: _nan, // zh-min-nan
		[maxLen]byte{'z', 'h', '-', 'x', 'i', 'a', 'n', 'g'}: _hsn, // zh-xiang

		// Grandfathered tags with no modern replacement will be converted as
		// follows:
		[maxLen]byte{'c', 'e', 'l', '-', 'g', 'a', 'u', 'l', 'i', 's', 'h'}: -1, // cel-gaulish
		[maxLen]byte{'e', 'n', '-', 'g', 'b', '-', 'o', 'e', 'd'}: -2, // en-GB-oed
		[maxLen]byte{'i', '-', 'd', 'e', 'f', 'a', 'u', 'l', 't'}: -3, // i-default
		[maxLen]byte{'i', '-', 'e', 'n', 'o', 'c', 'h', 'i', 'a', 'n'}: -4, // i-enochian
		[maxLen]byte{'i', '-', 'm', 'i', 'n', 'g', 'o'}: -5, // i-mingo
		[maxLen]byte{'z', 'h', '-', 'm', 'i', 'n'}: -6, // zh-min

		// CLDR-specific tag.
		[maxLen]byte{'r', 'o', 'o', 't'}: 0, // root
		[maxLen]byte{'e', 'n', '-', 'u', 's', '-', 'p', 'o', 's', 'i', 'x'}: -7, // en_US_POSIX
	}

	// altTagIndex holds the byte offsets of the replacement tags packed in
	// altTags: the k-th tag (0-based) is altTags[altTagIndex[k]:altTagIndex[k+1]].
	altTagIndex = [...]uint8{0, 17, 31, 45, 61, 74, 86, 102}

	// altTags is the concatenation of the replacement tags selected by the
	// negative values in grandfatheredMap, in order -1, -2, ..., -7.
	altTags = "xtg-x-cel-gaulishen-GB-oxendicten-x-i-defaultund-x-i-enochiansee-x-i-mingonan-x-zh-minen-US-u-va-posix"
)
| |
| func grandfathered(s [maxAltTaglen]byte) (t Tag, ok bool) { |
| if v, ok := grandfatheredMap[s]; ok { |
| if v < 0 { |
| return Make(altTags[altTagIndex[-v-1]:altTagIndex[-v]]), true |
| } |
| t.LangID = Language(v) |
| return t, true |
| } |
| return t, false |
| } |