language: change semantics of CompactIndex This is the first CL in a sequence to enhance both the performance and functionality of language Tags. First step is to have CompactIndex implement the semantics that it should have: rather than returning Und when there is no match, it should return the closest matching compact index. CompactIndex is often used this way, and when it was not, it was typically a bug. This fixes that. Next steps are to have the fast-path internal encoding of tag be a compact index. The step after that is to keep two compact indexes, allowing a tag to represent both the language and locale parts, like: en-US-va-posix and en-GB-rg-gbsct The result will allow for dual-mode tags resulting from matching, simplifying the life of an API user. (In Java the API users will get two tags from a match and will have to decide on a case-by-case basis which tag to use for which purpose.) This design also allows for fast conversion to compact indices, making it unnecessary for message.Printers to extract this index. Change-Id: I019ac6df4a3e3c7e6c758173fc942667df65ff36 Reviewed-on: https://go-review.googlesource.com/95815 Run-TryBot: Marcel van Lohuizen <mpvl@golang.org> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Nigel Tao <nigeltao@golang.org>

commit: ccf29a08cb92872b297cad16c53ca7bbede5bd1d [log] [tgz]
author: Marcel van Lohuizen <mpvl@golang.org> Tue Feb 13 14:47:17 2018 +0100
committer: Marcel van Lohuizen <mpvl@golang.org> Fri Feb 23 17:23:06 2018 +0000
tree: 65e71444d2c93f6c1c50ecf699ea6a6036ea745e
parent: 9e2b64d659da1afe07ce1c9c1dfefc09d188f21e [diff]
diff --git a/internal/number/format.go b/internal/number/format.go
index 910bdeb..ce1a762 100644
--- a/internal/number/format.go
+++ b/internal/number/format.go

@@ -39,12 +39,8 @@
 
 func (f *Formatter) init(t language.Tag, index []uint8) {
 	f.Info = InfoFromTag(t)
-	for ; ; t = t.Parent() {
-		if ci, ok := language.CompactIndex(t); ok {
-			f.Pattern = formats[index[ci]]
-			break
-		}
-	}
+	ci, _ := language.CompactIndex(t)
+	f.Pattern = formats[index[ci]]
 }
 
 // InitPattern initializes a Formatter for the given Pattern.

diff --git a/internal/number/number.go b/internal/number/number.go
index 2a21f07..70c0836 100644
--- a/internal/number/number.go
+++ b/internal/number/number.go

@@ -100,12 +100,8 @@
 
 // InfoFromTag returns a Info for the given language tag.
 func InfoFromTag(t language.Tag) Info {
-	for {
-		if index, ok := language.CompactIndex(t); ok {
-			return InfoFromLangID(index, t.TypeForKey("nu"))
-		}
-		t = t.Parent()
-	}
+	index, _ := language.CompactIndex(t)
+	return InfoFromLangID(index, t.TypeForKey("nu"))
 }
 
 // IsDecimal reports if the numbering system can convert decimal to native
@@ -148,9 +144,6 @@
 }
 
 func formatForLang(t language.Tag, index []byte) *Pattern {
-	for ; ; t = t.Parent() {
-		if x, ok := language.CompactIndex(t); ok {
-			return &formats[index[x]]
-		}
-	}
+	x, _ := language.CompactIndex(t)
+	return &formats[index[x]]
 }

diff --git a/language/language.go b/language/language.go
index b65e213..f81f976 100644
--- a/language/language.go
+++ b/language/language.go

@@ -704,49 +704,61 @@
 
 // CompactIndex returns an index, where 0 <= index < NumCompactTags, for tags
 // for which data exists in the text repository. The index will change over time
-// and should not be stored in persistent storage. Extensions, except for the
-// 'va' type of the 'u' extension, are ignored. It will return 0, false if no
-// compact tag exists, where 0 is the index for the root language (Und).
-func CompactIndex(t Tag) (index int, ok bool) {
+// and should not be stored in persistent storage. If t does not match a compact
+// index, exact will be false and the compact index will be returned for the
+// first match after repeatedly taking the Parent of t.
+func CompactIndex(t Tag) (index int, exact bool) {
 	// TODO: perhaps give more frequent tags a lower index.
 	// TODO: we could make the indexes stable. This will excluded some
 	//       possibilities for optimization, so don't do this quite yet.
+	exact = true
+
 	b, s, r := t.Raw()
-	if len(t.str) > 0 {
+	switch {
+	case len(t.str) > 0:
 		if strings.HasPrefix(t.str, "x-") {
 			// We have no entries for user-defined tags.
 			return 0, false
 		}
 		if uint16(t.pVariant) != t.pExt {
-			// There are no tags with variants and an u-va type.
-			if t.TypeForKey("va") != "" {
-				return 0, false
+			if int(t.pExt) < len(t.str) {
+				exact = false
+				t, _ = Raw.Compose(b, s, r, t.Variants())
 			}
-			t, _ = Raw.Compose(b, s, r, t.Variants())
 		} else if _, ok := t.Extension('u'); ok {
+			// TODO: va may mean something else. Consider not considering it.
 			// Strip all but the 'va' entry.
+			old := t
 			variant := t.TypeForKey("va")
 			t, _ = Raw.Compose(b, s, r)
-			t, _ = t.SetTypeForKey("va", variant)
+			if variant != "" {
+				t, _ = t.SetTypeForKey("va", variant)
+			}
+			exact = old == t
 		}
 		if len(t.str) > 0 {
 			// We have some variants.
 			for i, s := range specialTags {
 				if s == t {
-					return i + 1, true
+					return i + 1, exact
 				}
 			}
-			return 0, false
+			exact = false
 		}
 	}
-	// No variants specified: just compare core components.
-	// The key has the form lllssrrr, where l, s, and r are nibbles for
-	// respectively the langID, scriptID, and regionID.
-	key := uint32(b.langID) << (8 + 12)
-	key |= uint32(s.scriptID) << 12
-	key |= uint32(r.regionID)
-	x, ok := coreTags[key]
-	return int(x), ok
+	for ; t != Und; t = t.Parent() {
+		// No variants specified: just compare core components.
+		// The key has the form lllssrrr, where l, s, and r are nibbles for
+		// respectively the langID, scriptID, and regionID.
+		key := uint32(b.langID) << (8 + 12)
+		key |= uint32(s.scriptID) << 12
+		key |= uint32(r.regionID)
+		if x, ok := coreTags[key]; ok {
+			return int(x), exact
+		}
+		exact = false
+	}
+	return int(0), exact
 }
 
 // Base is an ISO 639 language code, used for encoding the base language

diff --git a/language/language_test.go b/language/language_test.go
index 9e42d15..14bc8a7 100644
--- a/language/language_test.go
+++ b/language/language_test.go

@@ -87,14 +87,14 @@
 		// will be solved if we decide to fix the indexes.
 		{"und", 0, true},
 		{"ca-ES-valencia", 1, true},
-		{"ca-ES-valencia-u-va-posix", 0, false},
-		{"ca-ES-valencia-u-co-phonebk", 1, true},
-		{"ca-ES-valencia-u-co-phonebk-va-posix", 0, false},
+		{"ca-ES-valencia-u-va-posix", 1, false},
+		{"ca-ES-valencia-u-co-phonebk", 1, false},
+		{"ca-ES-valencia-u-co-phonebk-va-posix", 1, false},
 		{"x-klingon", 0, false},
 		{"en-US", 232, true},
 		{"en-US-u-va-posix", 2, true},
 		{"en", 136, true},
-		{"en-u-co-phonebk", 136, true},
+		{"en-u-co-phonebk", 136, false},
 		{"en-001", 137, true},
 		{"sh", 0, false}, // We don't normalize.
 	}
commit	ccf29a08cb92872b297cad16c53ca7bbede5bd1d	[log] [tgz]
author	Marcel van Lohuizen <mpvl@golang.org>	Tue Feb 13 14:47:17 2018 +0100
committer	Marcel van Lohuizen <mpvl@golang.org>	Fri Feb 23 17:23:06 2018 +0000
tree	65e71444d2c93f6c1c50ecf699ea6a6036ea745e
parent	9e2b64d659da1afe07ce1c9c1dfefc09d188f21e [diff]