language: distinguish regional variant in matching

Change-Id: I1d997d625726e4b8152a7d039fc761dfcad45b7a
Reviewed-on: https://go-review.googlesource.com/95832
Run-TryBot: Marcel van Lohuizen <mpvl@golang.org>
Reviewed-by: Nigel Tao <nigeltao@golang.org>
diff --git a/language/httpexample_test.go b/language/httpexample_test.go
index 40d0663..03c0ab9 100644
--- a/language/httpexample_test.go
+++ b/language/httpexample_test.go
@@ -24,7 +24,7 @@
 	t, q, err := language.ParseAcceptLanguage(r.Header.Get("Accept-Language"))
 	// We ignore the error: the default language will be selected for t == nil.
 	tag, _, _ := matcher.Match(t...)
-	fmt.Printf("%5v (t: %6v; q: %3v; err: %v)\n", tag, t, q, err)
+	fmt.Printf("%17v (t: %6v; q: %3v; err: %v)\n", tag, t, q, err)
 }
 
 func ExampleParseAcceptLanguage() {
@@ -41,8 +41,8 @@
 	}
 
 	// Output:
-	// en-GB (t: [    en  en-US     nn]; q: [  1 0.8 0.3]; err: <nil>)
-	// en-GB (t: [   gsw  en-US     en]; q: [  1 0.8 0.7]; err: <nil>)
-	//    de (t: [   gsw     nl     da]; q: [  1   1   1]; err: <nil>)
-	// en-GB (t: []; q: []; err: language: tag is not well-formed)
+	//             en-GB (t: [    en  en-US     nn]; q: [  1 0.8 0.3]; err: <nil>)
+	// en-GB-u-rg-uszzzz (t: [   gsw  en-US     en]; q: [  1 0.8 0.7]; err: <nil>)
+	//                de (t: [   gsw     nl     da]; q: [  1   1   1]; err: <nil>)
+	//             en-GB (t: []; q: []; err: language: tag is not well-formed)
 }
diff --git a/language/language.go b/language/language.go
index 9ddff22..b254794 100644
--- a/language/language.go
+++ b/language/language.go
@@ -30,10 +30,11 @@
 }
 
 func makeTag(t language.Tag) (tag Tag) {
-	if region := t.TypeForKey("rg"); len(region) > 2 {
+	if region := t.TypeForKey("rg"); len(region) == 6 && region[2:] == "zzzz" {
 		if r, err := language.ParseRegion(region[:2]); err == nil {
 			tFull := t
 			t, _ = t.SetTypeForKey("rg", "")
+			// TODO: should we not consider "va" for the language tag?
 			var exact1, exact2 bool
 			tag.language, exact1 = compactIndex(t)
 			t.RegionID = r
@@ -60,7 +61,7 @@
 	tag := t.language.tag()
 	if t.language != t.locale {
 		loc := t.locale.tag()
-		tag.SetTypeForKey("rg", strings.ToLower(loc.RegionID.String())+"zzzz")
+		tag, _ = tag.SetTypeForKey("rg", strings.ToLower(loc.RegionID.String())+"zzzz")
 	}
 	return tag
 }
@@ -483,12 +484,56 @@
 }
 
 // CompactIndex returns an index, where 0 <= index < NumCompactTags, for tags
-// for which data exists in the text repository. The index will change over time
+// for which data exists in the text repository. The index will change over time
 // and should not be stored in persistent storage. If t does not match a compact
 // index, exact will be false and the compact index will be returned for the
 // first match after repeatedly taking the Parent of t.
 func CompactIndex(t Tag) (index int, exact bool) {
-	return int(t.locale), t.language == t.locale && t.full == nil
+	return int(t.language), t.full == nil
+}
+
+// TODO: make these functions and methods public once we settle on the API.
+//
+
+// regionalCompactIndex returns the CompactIndex for the regional variant of
+// this tag. This index is used to indicate region-specific overrides, such as
+// default currency, default calendar and week data, default time cycle, and
+// default measurement system and unit preferences.
+//
+// For instance, the tag en-GB-u-rg-uszzzz specifies British English with US
+// settings for currency, number formatting, etc. The CompactIndex for this tag
+// will be that for en-GB, while the regionalCompactIndex will be the one
+// corresponding to en-US.
+func regionalCompactIndex(t Tag) (index int, exact bool) {
+	return int(t.locale), t.full == nil
+}
+
+// languageTag returns t stripped of regional variant indicators.
+//
+// At the moment this means that the "rg" and "va" keys are removed from the
+// "u" extension before the tag is rebuilt with makeTag.
+func (t Tag) languageTag() Tag {
+	if t.full == nil {
+		return Tag{language: t.language, locale: t.language}
+	}
+	tt := t.tag()
+	tt, _ = tt.SetTypeForKey("rg", "") // returns the updated tag; errors ignored as in makeTag
+	tt, _ = tt.SetTypeForKey("va", "") // must be reassigned, or the key is never removed
+	return makeTag(tt)
+}
+
+// regionalTag returns the regional variant of the tag.
+//
+// At the moment this means that the region is set from the regional subtag
+// "rg" in the "u" extension.
+func (t Tag) regionalTag() Tag {
+	rt := Tag{language: t.locale, locale: t.locale}
+	if t.full == nil {
+		return rt
+	}
+	t, _ = Raw.Compose(rt, t.Variants(), t.Extensions())
+	t, _ = t.SetTypeForKey("rg", "")
+	return t
 }
 
 func compactIndex(t language.Tag) (index compactID, exact bool) {
diff --git a/language/language_test.go b/language/language_test.go
index 273f61f..168af38 100644
--- a/language/language_test.go
+++ b/language/language_test.go
@@ -46,35 +46,60 @@
 	}
 }
 
-func TestCompactIndex(t *testing.T) {
-	tests := []struct {
-		tag   string
-		index compactID
-		ok    bool
-	}{
-		// TODO: these values will change with each CLDR update. This issue
-		// will be solved if we decide to fix the indexes.
-		{"und", undIndex, true},
-		{"ca-ES-valencia", caESvalenciaIndex, true},
-		{"ca-ES-valencia-u-va-posix", caESvalenciaIndex, false},
-		{"ca-ES-valencia-u-co-phonebk", caESvalenciaIndex, false},
-		{"ca-ES-valencia-u-co-phonebk-va-posix", caESvalenciaIndex, false},
-		{"x-klingon", 0, false},
-		{"en-US", enUSIndex, true},
-		{"en-US-u-va-posix", enUSuvaposixIndex, true},
-		{"en", enIndex, true},
-		{"en-u-co-phonebk", enIndex, false},
-		{"en-001", en001Index, true},
-		{"zh-Hant-HK", zhHantHKIndex, true},
-		{"zh-HK", zhHantHKIndex, false}, // maximized to zh-Hant-HK
-		{"nl-Beng", 0, false},           // parent skips script
-		{"nl-NO", nlIndex, false},       // region is ignored
-		{"nl-Latn-NO", nlIndex, false},
-		{"nl-Latn-NO-u-co-phonebk", nlIndex, false},
-		{"nl-Latn-NO-valencia", nlIndex, false},
-		{"nl-Latn-NO-oxendict", nlIndex, false},
-		{"sh", shIndex, true}, // From plural rules.
+func TestString(t *testing.T) {
+	tests := []string{
+		"no-u-rg-dkzzzz",
 	}
+	for i, s := range tests {
+		tag := Make(s)
+		if tag.String() != s {
+			t.Errorf("%d:%s: got %s: want %s (%#v)", i, s, tag.String(), s, tag)
+		}
+	}
+}
+
+type compactTest struct {
+	tag   string
+	index compactID
+	ok    bool
+}
+
+var compactTests = []compactTest{
+	// TODO: these values will change with each CLDR update. This issue
+	// will be solved if we decide to fix the indexes.
+	{"und", undIndex, true},
+	{"ca-ES-valencia", caESvalenciaIndex, true},
+	{"ca-ES-valencia-u-va-posix", caESvalenciaIndex, false},
+	{"ca-ES-valencia-u-co-phonebk", caESvalenciaIndex, false},
+	{"ca-ES-valencia-u-co-phonebk-va-posix", caESvalenciaIndex, false},
+	{"x-klingon", 0, false},
+	{"en-US", enUSIndex, true},
+	{"en-US-u-va-posix", enUSuvaposixIndex, true},
+	{"en", enIndex, true},
+	{"en-u-co-phonebk", enIndex, false},
+	{"en-001", en001Index, true},
+	{"zh-Hant-HK", zhHantHKIndex, true},
+	{"zh-HK", zhHantHKIndex, false}, // maximized to zh-Hant-HK
+	{"nl-Beng", 0, false},           // parent skips script
+	{"nl-NO", nlIndex, false},       // region is ignored
+	{"nl-Latn-NO", nlIndex, false},
+	{"nl-Latn-NO-u-co-phonebk", nlIndex, false},
+	{"nl-Latn-NO-valencia", nlIndex, false},
+	{"nl-Latn-NO-oxendict", nlIndex, false},
+	{"sh", shIndex, true}, // From plural rules.
+}
+
+func TestCompactIndex(t *testing.T) {
+	tests := append(compactTests, []compactTest{
+		{"en-GB", enGBIndex, true},
+		{"en-GB-u-rg-uszzzz", enGBIndex, true},
+		{"en-GB-u-rg-USZZZZ", enGBIndex, true},
+		{"en-GB-u-rg-uszzzz-va-posix", enGBIndex, false},
+		{"en-GB-u-co-phonebk-rg-uszzzz", enGBIndex, false},
+		// Invalid region specifications are ignored.
+		{"en-GB-u-rg-usz-va-posix", enGBIndex, false},
+		{"en-GB-u-co-phonebk-rg-usz", enGBIndex, false},
+	}...)
 	for _, tt := range tests {
 		x, ok := CompactIndex(Raw.MustParse(tt.tag))
 		if compactID(x) != tt.index || ok != tt.ok {
@@ -83,6 +108,27 @@
 	}
 }
 
+func TestRegionalCompactIndex(t *testing.T) {
+	tests := append(compactTests, []compactTest{
+		{"en-GB", enGBIndex, true},
+		{"en-GB-u-rg-uszzzz", enUSIndex, true},
+		{"en-GB-u-rg-USZZZZ", enUSIndex, true},
+		// TODO: use different exact values for language and regional tag?
+		{"en-GB-u-rg-uszzzz-va-posix", enUSuvaposixIndex, false},
+		{"en-GB-u-co-phonebk-rg-uszzzz-va-posix", enUSuvaposixIndex, false},
+		{"en-GB-u-co-phonebk-rg-uszzzz", enUSIndex, false},
+		// Invalid region specifications are ignored.
+		{"en-GB-u-rg-usz-va-posix", enGBIndex, false},
+		{"en-GB-u-co-phonebk-rg-usz", enGBIndex, false},
+	}...)
+	for _, tt := range tests {
+		x, ok := regionalCompactIndex(Raw.MustParse(tt.tag))
+		if compactID(x) != tt.index || ok != tt.ok {
+			t.Errorf("%s: got %d, %v; want %d %v", tt.tag, x, ok, tt.index, tt.ok)
+		}
+	}
+}
+
 func TestMarshal(t *testing.T) {
 	testCases := []string{
 		// TODO: these values will change with each CLDR update. This issue
@@ -99,6 +145,12 @@
 		"en-u-co-phonebk",
 		"en-001",
 		"sh",
+
+		"en-GB-u-rg-uszzzz",
+		"en-GB-u-rg-uszzzz-va-posix",
+		"en-GB-u-co-phonebk-rg-uszzzz",
+		// Invalid tags should also roundtrip.
+		"en-GB-u-co-phonebk-rg-uszz",
 	}
 	for _, tc := range testCases {
 		var tag Tag
@@ -532,6 +584,16 @@
 		{"und-Qaai", "und-Zinh", DeprecatedScript},
 		{"und-Qaai", "und-Qaai", DeprecatedBase},
 		{"drh", "mn", All}, // drh -> khk -> mn
+
+		{"en-GB-u-rg-uszzzz", "en-GB-u-rg-uszzzz", Raw},
+		{"en-GB-u-rg-USZZZZ", "en-GB-u-rg-uszzzz", Raw},
+		// TODO: use different exact values for language and regional tag?
+		{"en-GB-u-rg-uszzzz-va-posix", "en-GB-u-rg-uszzzz-va-posix", Raw},
+		{"en-GB-u-rg-uszzzz-co-phonebk", "en-GB-u-co-phonebk-rg-uszzzz", Raw},
+		// Invalid region specifications are left as is.
+		{"en-GB-u-rg-usz", "en-GB-u-rg-usz", Raw},
+		{"en-GB-u-rg-usz-va-posix", "en-GB-u-rg-usz-va-posix", Raw},
+		{"en-GB-u-rg-usz-co-phonebk", "en-GB-u-co-phonebk-rg-usz", Raw},
 	}
 	for i, tt := range tests {
 		in, _ := Raw.Parse(tt.in)
@@ -558,6 +620,8 @@
 		{"co", "en-u-co-phonebk", "phonebk"},
 		{"co", "en-u-co-phonebk-cu-aud", "phonebk"},
 		{"co", "x-foo-u-co-phonebk", ""},
+		{"va", "en-US-u-va-posix", "posix"},
+		{"rg", "en-u-rg-gbzzzz", "gbzzzz"},
 		{"nu", "en-u-co-phonebk-nu-arabic", "arabic"},
 		{"kc", "cmn-u-co-stroke", ""},
 	}
@@ -656,6 +720,19 @@
 		{"pt-MZ", "pt-PT"},
 		{"pt-ST", "pt-PT"},
 		{"pt-TL", "pt-PT"},
+
+		{"en-GB-u-co-phonebk-rg-uszzzz", "en-GB"},
+		{"en-GB-u-rg-uszzzz", "en-GB"},
+		{"en-US-u-va-posix", "en-US"},
+
+		// Difference between language and regional tag.
+		{"ca-ES-valencia", "ca-ES"},
+		{"ca-ES-valencia-u-rg-ptzzzz", "ca-ES"},
+		{"en-US-u-va-variant", "en-US"},
+		{"en-u-va-variant", "en"},
+		{"en-u-rg-gbzzzz", "en"},
+		{"en-US-u-rg-gbzzzz", "en-US"},
+		{"nl-US-u-rg-gbzzzz", "nl-US"},
 	}
 	for _, tt := range tests {
 		tag := Raw.MustParse(tt.in)
diff --git a/language/match.go b/language/match.go
index d84e972..0edbc9d 100644
--- a/language/match.go
+++ b/language/match.go
@@ -6,6 +6,7 @@
 
 import (
 	"errors"
+	"strings"
 
 	"golang.org/x/text/language/internal"
 )
@@ -104,9 +105,14 @@
 		}
 		// TODO: select first language tag based on script.
 	}
-	if w.RegionID != 0 && tt.RegionID != 0 && tt.RegionID.Contains(w.RegionID) {
-		tt.RegionID = w.RegionID
-		tt.RemakeString()
+	if w.RegionID != tt.RegionID && w.RegionID != 0 {
+		if w.RegionID != 0 && tt.RegionID != 0 && tt.RegionID.Contains(w.RegionID) {
+			tt.RegionID = w.RegionID
+			tt.RemakeString()
+		} else if r := w.RegionID.String(); len(r) == 2 {
+			// TODO: also filter macro and deprecated.
+			tt, _ = tt.SetTypeForKey("rg", strings.ToLower(r)+"zzzz")
+		}
 	}
 	// Copy options from the user-provided tag into the result tag. This is hard
 	// to do after the fact, so we do it here.
diff --git a/language/match_test.go b/language/match_test.go
index 3bfefff..c21b863 100644
--- a/language/match_test.go
+++ b/language/match_test.go
@@ -40,13 +40,15 @@
 				gotCombined, index, conf := NewMatcher(supported).Match(desired...)
 
 				gotMatch := supported[index]
-				wantMatch := mk(p.String(2))
+				wantMatch := Raw.Make(p.String(2)) // wantMatch may be null
 				if gotMatch != wantMatch {
 					t.Fatalf("match: got %q; want %q (%v)", gotMatch, wantMatch, conf)
 				}
-				wantCombined, err := Raw.Parse(p.String(3))
-				if err == nil && gotCombined != wantCombined {
-					t.Errorf("combined: got %q; want %q (%v)", gotCombined, wantCombined, conf)
+				if tag := strings.TrimSpace(p.String(3)); tag != "" {
+					wantCombined := Raw.MustParse(tag)
+					if err == nil && gotCombined != wantCombined {
+						t.Errorf("combined: got %q; want %q (%v)", gotCombined, wantCombined, conf)
+					}
 				}
 			})
 		})
@@ -68,7 +70,7 @@
 	// which is better.
 
 	// Inconsistencies in combined. I think the Go approach is more appropriate.
-	// We could use -u-rg- and -u-va- as alternative.
+	// We could use -u-rg- as alternative.
 	"und,fr/fr-BE-fonipa":              true, // combined: got "fr"; want "fr-BE-fonipa"
 	"und,fr-CA/fr-BE-fonipa":           true, // combined: got "fr-CA"; want "fr-BE-fonipa"
 	"und,fr-fonupa/fr-BE-fonipa":       true, // combined: got "fr-fonupa"; want "fr-BE-fonipa"
diff --git a/language/parse.go b/language/parse.go
index e54a0a4..f0e0b64 100644
--- a/language/parse.go
+++ b/language/parse.go
@@ -74,10 +74,10 @@
 // than once, the latter will overwrite the former. Variants and Extensions are
 // accumulated, but if two extensions of the same type are passed, the latter
 // will replace the former. For -u extensions, though, the key-type pairs are
-// added, where later values overwrite older ones. A Tag overwrites all former values and typically
-// only makes sense as the first argument. The resulting tag is returned after
-// canonicalizing using CanonType c. If one or more errors are encountered,
-// one of the errors is returned.
+// added, where later values overwrite older ones. A Tag overwrites all former
+// values and typically only makes sense as the first argument. The resulting
+// tag is returned after canonicalizing using CanonType c. If one or more errors
+// are encountered, one of the errors is returned.
 func (c CanonType) Compose(part ...interface{}) (t Tag, err error) {
 	var b language.Builder
 	if err = update(&b, part...); err != nil {
diff --git a/language/parse_test.go b/language/parse_test.go
index 7a5b54b..3ac1642 100644
--- a/language/parse_test.go
+++ b/language/parse_test.go
@@ -36,6 +36,11 @@
 		{in: "root", lang: "und"},
 		{in: "und", lang: "und"},
 		{in: "en", lang: "en"},
+
+		{in: "en-US-u-va-posix", lang: "en", region: "US", ext: "u-va-posix"},
+		{in: "ca-ES-valencia", lang: "ca", region: "ES", variants: "valencia"},
+		{in: "en-US-u-rg-gbzzzz", lang: "en", region: "US", ext: "u-rg-gbzzzz"},
+
 		{in: "xy", lang: "und", invalid: true},
 		{in: "en-ZY", lang: "en", invalid: true},
 		{in: "gsw", lang: "gsw"},
diff --git a/language/testdata/GoLocaleMatcherTest.txt b/language/testdata/GoLocaleMatcherTest.txt
index 4f4c609..32a649f 100644
--- a/language/testdata/GoLocaleMatcherTest.txt
+++ b/language/testdata/GoLocaleMatcherTest.txt
@@ -9,7 +9,7 @@
 zh-CN, zh-TW, iw ; 	zh ; 	zh-CN
 zh-CN, zh-TW, iw ; 	zh-Hans-CN ; 	zh-CN
 zh-CN, zh-TW, iw ; 	zh-Hant-HK ; 	zh-TW
-zh-CN, zh-TW, iw ; 	he-IT ; 	iw ; iw
+zh-CN, zh-TW, iw ; 	he-IT ; 	iw ; iw-u-rg-itzzzz
 
 # language-specific script fallbacks 1
 en, sr, nl ; 	sr-Latn ; 	sr
@@ -125,7 +125,7 @@
 pt, pt-PT ; 	pt-ES ; 	pt-PT
 
 # if no preferred locale specified, pick top language, not regional
-en, fr, fr-CA, fr-CH ; 	fr-US ; 	fr #TODO: ; fr-u-rg-US
+en, fr, fr-CA, fr-CH ; 	fr-US ; 	fr  ; fr-u-rg-uszzzz
 
 # region distance German
 de-AT, de-DE, de-CH ; 	de ; 	de-DE
@@ -218,9 +218,14 @@
 en, en-US, en-GB, es, es-419, pt, pt-BR, pt-PT, zh,  zh-Hant, zh-Hant-HK ; 	pt-TL ; 	pt-PT
 
 # preserve extensions
-en, de, sl-nedis ; 	de-FR-u-co-phonebk ; 	de ; de-u-co-phonebk
+en, de, sl-nedis ; 	de-FR-u-co-phonebk ; 	de ; de-u-co-phonebk-rg-frzzzz
 en, de, sl-nedis ; 	sl-nedis-u-cu-eur ; 	sl-nedis ; sl-nedis-u-cu-eur
 en, de, sl-nedis ; 	sl-u-cu-eur ; 	sl-nedis ; sl-nedis-u-cu-eur
-en, de, sl-nedis ; 	sl-HR-nedis-u-cu-eur ; 	sl-nedis ; sl-nedis-u-cu-eur
+en, de, sl-nedis ; 	sl-HR-nedis-u-cu-eur ; 	sl-nedis ; sl-nedis-u-cu-eur-rg-hrzzzz
 en, de, sl-nedis ; 	de-t-m0-iso-i0-pinyin ; 	de ; de-t-m0-iso-i0-pinyin
 
+und, nl ; 	nl-BE-fonipa ; 	nl ; 	nl-u-rg-bezzzz
+und, nl-CA ;	nl-BE-fonipa ; 	nl-CA ; 	nl-CA-u-rg-bezzzz
+und, nl-fonupa ; 	nl-BE-fonipa ; 	nl-fonupa ; 	nl-fonupa-u-rg-bezzzz
+und, no ; 	nn-DK-fonipa ; 	no ; 	no-u-rg-dkzzzz
+und, en-GB-u-sd-usca ; 	en-US-fonipa-u-nu-Arab-ca-buddhist-sd-usdc-t-m0-iso-i0-pinyin ; 	en-GB-u-sd-usca ; 	en-GB-t-m0-iso-i0-pinyin-u-ca-buddhist-nu-Arab-rg-uszzzz-sd-usca
\ No newline at end of file
diff --git a/message/catalog/catalog_test.go b/message/catalog/catalog_test.go
index 08bfdc7..3de4c52 100644
--- a/message/catalog/catalog_test.go
+++ b/message/catalog/catalog_test.go
@@ -63,7 +63,7 @@
 	},
 	match: []string{
 		"gr -> en",
-		"en-US -> en",
+		"en-US -> en-u-rg-uszzzz",
 	},
 	tags: langs("en"),
 }, {
diff --git a/message/catalog_test.go b/message/catalog_test.go
index 7a2301c..ff409a8 100644
--- a/message/catalog_test.go
+++ b/message/catalog_test.go
@@ -23,7 +23,7 @@
 		want string
 	}{{
 		args: "de-CH",
-		want: "de",
+		want: "de-u-rg-chzzzz",
 	}, {
 		args: "bn-u-nu-latn|en-US,en;q=0.9,de;q=0.8,nl;q=0.7",
 		want: "bn-u-nu-latn",