language: implement paradigm locales
This allows deleting the parent distance tie-breaking rule.
It also completes support for letting the user
order dialects non-contiguously.
Change-Id: I09a8d21a6c6e18edc24db136b280ba960eb72370
Reviewed-on: https://go-review.googlesource.com/55911
Run-TryBot: Marcel van Lohuizen <mpvl@golang.org>
Reviewed-by: Nigel Tao <nigeltao@golang.org>
diff --git a/language/gen.go b/language/gen.go
index 50f772d..7c260e5 100644
--- a/language/gen.go
+++ b/language/gen.go
@@ -1417,6 +1417,22 @@
}
b.writeSlice("regionToGroups", regionToGroups)
+ // paradigmLocales maps a language ID to its in- and out-of-group regions.
+ paradigmLocales := [][3]uint16{}
+ locales := strings.Split(lm[0].ParadigmLocales[0].Locales, " ")
+ for i := 0; i < len(locales); i += 2 {
+ x := [3]uint16{}
+ for j := 0; j < 2; j++ {
+ pc := strings.SplitN(locales[i+j], "-", 2)
+ x[0] = b.langIndex(pc[0])
+ if len(pc) == 2 {
+ x[1+j] = uint16(b.region.index(pc[1]))
+ }
+ }
+ paradigmLocales = append(paradigmLocales, x)
+ }
+ b.writeSlice("paradigmLocales", paradigmLocales)
+
b.writeType(mutualIntelligibility{})
b.writeType(scriptIntelligibility{})
b.writeType(regionIntelligibility{})
diff --git a/language/match.go b/language/match.go
index 2f14436..b7779ed 100644
--- a/language/match.go
+++ b/language/match.go
@@ -559,9 +559,6 @@
}
}
- // TODO: include alt script.
- // - don't replace regions, but allow regions to be made more specific.
-
// update is used to add indexes in the map for equivalent languages.
// update will only add entries to original indexes, thus not computing any
// transitive relations.
@@ -687,16 +684,18 @@
// bestMatch accumulates the best match so far.
type bestMatch struct {
- have *haveTag
- want Tag
- conf Confidence
- pinLanguage bool
+ have *haveTag
+ want Tag
+ conf Confidence
+ pinnedRegion regionID
+ pinLanguage bool
+ sameRegionGroup bool
// Cached results from applying tie-breaking rules.
origLang bool
origReg bool
+ paradigmReg bool
regGroupDist uint8
origScript bool
- parentDist uint8 // 255 if have is not an ancestor of want tag.
}
// update updates the existing best match if the new pair is considered to be a
@@ -723,12 +722,20 @@
if m.pinLanguage && tag.lang != m.want.lang {
return
}
- if c == Exact && have.tag.script == tag.script {
+ // Pin the region group if we are comparing tags for the same language.
+ if tag.lang == m.want.lang && m.sameRegionGroup {
+ _, sameGroup := regionGroupDist(m.pinnedRegion, have.maxRegion, have.maxScript, m.want.lang)
+ if !sameGroup {
+ return
+ }
+ }
+ if c == Exact && have.maxScript == maxScript {
+ // If there is another language and then another entry of this language,
+ // don't pin anything, otherwise pin the language.
m.pinLanguage = pin
}
if have.tag.equalsRest(tag) {
} else if have.maxScript != maxScript {
- // fmt.Println("FFFFF", maxScript, have.maxScript)
// There is usually very little comprehension between different scripts.
// In a few cases there may still be Low comprehension. This possibility
// is pre-computed and stored in have.altScript.
@@ -737,9 +744,8 @@
}
c = Low
} else if have.maxRegion != maxRegion {
- // There is usually a small difference between languages across regions.
- // We use the region distance (below) to disambiguate between equal matches.
if High < c {
+ // There is usually a small difference between languages across regions.
c = High
}
}
@@ -766,7 +772,16 @@
beaten = true
}
- regGroupDist := regionGroupDist(have.maxRegion, maxRegion, maxScript, tag.lang)
+ // We prefer if the pre-maximized region was specified and identical.
+ origReg := have.tag.region == tag.region && tag.region != 0
+ if !beaten && m.origReg != origReg {
+ if m.origReg {
+ return
+ }
+ beaten = true
+ }
+
+ regGroupDist, sameGroup := regionGroupDist(have.maxRegion, maxRegion, maxScript, tag.lang)
if !beaten && m.regGroupDist != regGroupDist {
if regGroupDist > m.regGroupDist {
return
@@ -774,10 +789,9 @@
beaten = true
}
- // We prefer if the pre-maximized region was specified and identical.
- origReg := have.tag.region == tag.region && tag.region != 0
- if !beaten && m.origReg != origReg {
- if m.origReg {
+ paradigmReg := isParadigmLocale(tag.lang, have.maxRegion)
+ if !beaten && m.paradigmReg != paradigmReg {
+ if !paradigmReg {
return
}
beaten = true
@@ -792,48 +806,35 @@
beaten = true
}
- // TODO: remove parent distance once primary locales are implemented.
- parentDist := parentDistance(have.tag.region, tag)
- if !beaten && m.parentDist != parentDist {
- if parentDist > m.parentDist {
- return
- }
- beaten = true
- }
-
// Update m to the newly found best match.
if beaten {
m.have = have
m.want = tag
m.conf = c
+ m.pinnedRegion = maxRegion
+ m.sameRegionGroup = sameGroup
m.origLang = origLang
m.origReg = origReg
+ m.paradigmReg = paradigmReg
m.origScript = origScript
m.regGroupDist = regGroupDist
- m.parentDist = parentDist
}
}
-// parentDistance returns the number of times Parent must be called before the
-// regions match. It is assumed that it has already been checked that lang and
-// script are identical. If haveRegion does not occur in the ancestor chain of
-// tag, it returns 255.
-func parentDistance(haveRegion regionID, tag Tag) uint8 {
- p := tag.Parent()
- d := uint8(1)
- for haveRegion != p.region {
- if p.region == 0 {
- return 255
+func isParadigmLocale(lang langID, r regionID) bool {
+ for _, e := range paradigmLocales {
+ if langID(e[0]) == lang && (r == regionID(e[1]) || r == regionID(e[2])) {
+ return true
}
- p = p.Parent()
- d++
}
- return d
+ return false
}
// regionGroupDist computes the distance between two regions based on their
// CLDR grouping.
-func regionGroupDist(a, b regionID, script scriptID, lang langID) uint8 {
+func regionGroupDist(a, b regionID, script scriptID, lang langID) (dist uint8, same bool) {
+ const defaultDistance = 4
+
aGroup := uint(regionToGroups[a]) << 1
bGroup := uint(regionToGroups[b]) << 1
for _, ri := range matchRegion {
@@ -841,17 +842,16 @@
group := uint(1 << (ri.group &^ 0x80))
if 0x80&ri.group == 0 {
if aGroup&bGroup&group != 0 { // Both regions are in the group.
- return ri.distance
+ return ri.distance, ri.distance == defaultDistance
}
} else {
if (aGroup|bGroup)&group == 0 { // Both regions are not in the group.
- return ri.distance
+ return ri.distance, ri.distance == defaultDistance
}
}
}
}
- const defaultDistance = 4
- return defaultDistance
+ return defaultDistance, true
}
func (t Tag) variants() string {
@@ -898,4 +898,14 @@
notEquivalent = append(notEquivalent, langID(lm.from))
}
}
+ // Maximize undefined regions of paradigm locales.
+ for i, v := range paradigmLocales {
+ max, _ := addTags(Tag{lang: langID(v[0])})
+ if v[1] == 0 {
+ paradigmLocales[i][1] = uint16(max.region)
+ }
+ if v[2] == 0 {
+ paradigmLocales[i][2] = uint16(max.region)
+ }
+ }
}
diff --git a/language/match_test.go b/language/match_test.go
index ca77115..f1b9010 100644
--- a/language/match_test.go
+++ b/language/match_test.go
@@ -64,15 +64,6 @@
// combined: got "en-GB-u-ca-buddhist-nu-arab"; want "en-GB-fonipa-t-m0-iso-i0-pinyin-u-ca-buddhist-nu-arab"
"und,en-GB-u-sd-gbsct/en-fonipa-u-nu-Arab-ca-buddhist-t-m0-iso-i0-pinyin": true,
- // Go prefers exact matches over less exact preferred ones.
- // Preferring desired ones might be better.
- // NOTE: allow users to distinguish languages is a good solution.
- // the remaining cases are due to preferred locale rules.
- "pt-PT,pt-BR,es,es-419/pt-US,pt-PT": true, // match: got "pt-PT"; want "pt-BR"
- "pt-PT,pt,es,es-419/pt-US,pt-PT,pt": true, // match: got "pt-PT"; want "pt"
- // TODO: implement prefer primary locales.
- "und,en,en-GU,en-IN,en-GB/en-ZA": true, // match: got "en-IN"; want "en-GB"
-
// Inconsistencies with Mark Davis' implementation where it is not clear
// which is better.
@@ -259,6 +250,12 @@
}{
{"zh-TW", "zh-HK", 5},
{"zh-MO", "zh-HK", 4},
+ {"es-ES", "es-AR", 5},
+ {"es-ES", "es", 4},
+ {"es-419", "es-MX", 4},
+ {"es-AR", "es-MX", 4},
+ {"es-ES", "es-MX", 5},
+ {"es-PT", "es-MX", 5},
}
for _, tc := range testCases {
a := MustParse(tc.a)
@@ -270,33 +267,27 @@
t.Errorf("scripts differ: %q vs %q", aScript, bScript)
continue
}
- d := regionGroupDist(a.region, b.region, aScript.scriptID, a.lang)
+ d, _ := regionGroupDist(a.region, b.region, aScript.scriptID, a.lang)
if d != tc.distance {
t.Errorf("got %q; want %q", d, tc.distance)
}
}
}
-func TestParentDistance(t *testing.T) {
- tests := []struct {
- parent string
- tag string
- d uint8
- }{
- {"en-001", "en-AU", 1},
- {"pt-PT", "pt-AO", 1},
- {"pt", "pt-AO", 2},
- {"en-AU", "en-GB", 255},
- {"en-NL", "en-AU", 255},
- // Note that pt-BR and en-US are not automatically minimized.
- {"pt-BR", "pt-AO", 255},
- {"en-US", "en-AU", 255},
+func TestIsParadigmLocale(t *testing.T) {
+ testCases := map[string]bool{
+ "en-US": true,
+ "en-GB": true,
+ "en-VI": false,
+ "es-GB": false,
+ "es-ES": true,
+ "es-419": true,
}
- for _, tt := range tests {
- r := Raw.MustParse(tt.parent).region
- tag := Raw.MustParse(tt.tag)
- if d := parentDistance(r, tag); d != tt.d {
- t.Errorf("d(%s, %s) was %d; want %d", r, tag, d, tt.d)
+ for str, want := range testCases {
+ tag := Make(str)
+ got := isParadigmLocale(tag.lang, tag.region)
+ if got != want {
+ t.Errorf("isPL(%q) = %v; want %v", str, got, want)
}
}
}
diff --git a/language/tables.go b/language/tables.go
index a5e59d3..ec17f97 100644
--- a/language/tables.go
+++ b/language/tables.go
@@ -3335,6 +3335,13 @@
0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
}
+// Size: 18 bytes, 3 elements
+var paradigmLocales = [3][3]uint16{
+ 0: [3]uint16{0x138, 0x0, 0x7b},
+ 1: [3]uint16{0x13d, 0x0, 0x1f},
+ 2: [3]uint16{0x3be, 0x41, 0xee},
+}
+
type mutualIntelligibility struct {
want uint16
have uint16
@@ -3665,4 +3672,4 @@
4: {lang: 0x527, script: 0x38, maxScript: 0x38, toRegion: 0x8d, fromRegion: []uint16{0xc6}},
}
-// Total table size 27157 bytes (26KiB); checksum: F21EE307
+// Total table size 27175 bytes (26KiB); checksum: 569649CD