language: change semantics of CompactIndex

This is the first CL in a sequence to enhance both the
performance and functionality of language Tags.

First step is to have CompactIndex implement the
semantics that it should have: rather than returning
Und when there is no match, it should return the closest
matching compact index. CompactIndex is often used
this way, and when it is not, it is typically a bug. This
fixes that.

Next steps are to have the fast-path internal encoding
of tag be a compact index.

The step after that is to keep two compact indexes,
which allows representing both the language and locale
part of a tag like:
    en-US-va-posix
and
    en-GB-rg-gbsct

The result will allow for dual-mode tags resulting from
matching, simplifying the life of an API user.
(In Java the API users will get two tags from a match
and will have to decide on a case-by-case basis which
tag to use for which purpose.)
This design also allows for fast conversion to compact
indices, making it unnecessary for message.Printers to
extract this index.

Change-Id: I019ac6df4a3e3c7e6c758173fc942667df65ff36
Reviewed-on: https://go-review.googlesource.com/95815
Run-TryBot: Marcel van Lohuizen <mpvl@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Nigel Tao <nigeltao@golang.org>
diff --git a/internal/number/format.go b/internal/number/format.go
index 910bdeb..ce1a762 100644
--- a/internal/number/format.go
+++ b/internal/number/format.go
@@ -39,12 +39,8 @@
 
 func (f *Formatter) init(t language.Tag, index []uint8) {
 	f.Info = InfoFromTag(t)
-	for ; ; t = t.Parent() {
-		if ci, ok := language.CompactIndex(t); ok {
-			f.Pattern = formats[index[ci]]
-			break
-		}
-	}
+	ci, _ := language.CompactIndex(t)
+	f.Pattern = formats[index[ci]]
 }
 
 // InitPattern initializes a Formatter for the given Pattern.
diff --git a/internal/number/number.go b/internal/number/number.go
index 2a21f07..70c0836 100644
--- a/internal/number/number.go
+++ b/internal/number/number.go
@@ -100,12 +100,8 @@
 
 // InfoFromTag returns a Info for the given language tag.
 func InfoFromTag(t language.Tag) Info {
-	for {
-		if index, ok := language.CompactIndex(t); ok {
-			return InfoFromLangID(index, t.TypeForKey("nu"))
-		}
-		t = t.Parent()
-	}
+	index, _ := language.CompactIndex(t)
+	return InfoFromLangID(index, t.TypeForKey("nu"))
 }
 
 // IsDecimal reports if the numbering system can convert decimal to native
@@ -148,9 +144,6 @@
 }
 
 func formatForLang(t language.Tag, index []byte) *Pattern {
-	for ; ; t = t.Parent() {
-		if x, ok := language.CompactIndex(t); ok {
-			return &formats[index[x]]
-		}
-	}
+	x, _ := language.CompactIndex(t)
+	return &formats[index[x]]
 }
diff --git a/language/language.go b/language/language.go
index b65e213..f81f976 100644
--- a/language/language.go
+++ b/language/language.go
@@ -704,49 +704,61 @@
 
 // CompactIndex returns an index, where 0 <= index < NumCompactTags, for tags
 // for which data exists in the text repository. The index will change over time
-// and should not be stored in persistent storage. Extensions, except for the
-// 'va' type of the 'u' extension, are ignored. It will return 0, false if no
-// compact tag exists, where 0 is the index for the root language (Und).
-func CompactIndex(t Tag) (index int, ok bool) {
+// and should not be stored in persistent storage. If t does not match a compact
+// index, exact will be false and the compact index will be returned for the
+// first match after repeatedly taking the Parent of t.
+func CompactIndex(t Tag) (index int, exact bool) {
 	// TODO: perhaps give more frequent tags a lower index.
 	// TODO: we could make the indexes stable. This will excluded some
 	//       possibilities for optimization, so don't do this quite yet.
+	exact = true
+
 	b, s, r := t.Raw()
-	if len(t.str) > 0 {
+	switch {
+	case len(t.str) > 0:
 		if strings.HasPrefix(t.str, "x-") {
 			// We have no entries for user-defined tags.
 			return 0, false
 		}
 		if uint16(t.pVariant) != t.pExt {
-			// There are no tags with variants and an u-va type.
-			if t.TypeForKey("va") != "" {
-				return 0, false
+			if int(t.pExt) < len(t.str) {
+				exact = false
+				t, _ = Raw.Compose(b, s, r, t.Variants())
 			}
-			t, _ = Raw.Compose(b, s, r, t.Variants())
 		} else if _, ok := t.Extension('u'); ok {
+			// TODO: va may mean something else. Consider not considering it.
 			// Strip all but the 'va' entry.
+			old := t
 			variant := t.TypeForKey("va")
 			t, _ = Raw.Compose(b, s, r)
-			t, _ = t.SetTypeForKey("va", variant)
+			if variant != "" {
+				t, _ = t.SetTypeForKey("va", variant)
+			}
+			exact = old == t
 		}
 		if len(t.str) > 0 {
 			// We have some variants.
 			for i, s := range specialTags {
 				if s == t {
-					return i + 1, true
+					return i + 1, exact
 				}
 			}
-			return 0, false
+			exact = false
 		}
 	}
-	// No variants specified: just compare core components.
-	// The key has the form lllssrrr, where l, s, and r are nibbles for
-	// respectively the langID, scriptID, and regionID.
-	key := uint32(b.langID) << (8 + 12)
-	key |= uint32(s.scriptID) << 12
-	key |= uint32(r.regionID)
-	x, ok := coreTags[key]
-	return int(x), ok
+	for ; t != Und; t = t.Parent() {
+		// No variants specified: just compare core components.
+		// The key has the form lllssrrr, where l, s, and r are nibbles for
+		// respectively the langID, scriptID, and regionID.
+		key := uint32(b.langID) << (8 + 12)
+		key |= uint32(s.scriptID) << 12
+		key |= uint32(r.regionID)
+		if x, ok := coreTags[key]; ok {
+			return int(x), exact
+		}
+		exact = false
+	}
+	return int(0), exact
 }
 
 // Base is an ISO 639 language code, used for encoding the base language
diff --git a/language/language_test.go b/language/language_test.go
index 9e42d15..14bc8a7 100644
--- a/language/language_test.go
+++ b/language/language_test.go
@@ -87,14 +87,14 @@
 		// will be solved if we decide to fix the indexes.
 		{"und", 0, true},
 		{"ca-ES-valencia", 1, true},
-		{"ca-ES-valencia-u-va-posix", 0, false},
-		{"ca-ES-valencia-u-co-phonebk", 1, true},
-		{"ca-ES-valencia-u-co-phonebk-va-posix", 0, false},
+		{"ca-ES-valencia-u-va-posix", 1, false},
+		{"ca-ES-valencia-u-co-phonebk", 1, false},
+		{"ca-ES-valencia-u-co-phonebk-va-posix", 1, false},
 		{"x-klingon", 0, false},
 		{"en-US", 232, true},
 		{"en-US-u-va-posix", 2, true},
 		{"en", 136, true},
-		{"en-u-co-phonebk", 136, true},
+		{"en-u-co-phonebk", 136, false},
 		{"en-001", 137, true},
 		{"sh", 0, false}, // We don't normalize.
 	}