language: allow variable number of types per key in -u- extension

This also fixes CVE-2020-28851. This was an off-by one
error, but is fixed by handling all cases according to the spec.

These valid case seem to be not used in practice much,
if at all, but the main benefit is that it makes all valid BCP 47
language tags also valid -u extensions. Fixing the code
to handle BCP 47 results in cleaner and seemingly more
robust code.

The main difference is as follows. The old impementation
assumed a -u- extension of the form:

    <tag> "-u"  { "-" <attr> } { "-" <key> "-" <type> } [ <otherExtensions> ]

where <attr> and <type> are of length 3-8 and a <key> is of length 2.

According to the spec, though, the format is

    <tag> "-u"  { "-" <attr> } { "-" <key> { "-" <type> } } [ <otherExtensions> ]

So every key may be associated with zero or more types, instead of
exactly one.

The new code now handles this.

The language.Tag.TypeForKey method is now defined to only
return the first entry or nothing at all. This is for backwards
compatibilty reasons.

Fixes golang/go#42535

Change-Id: I23aec4e1c4d8807fc2ffc0eb3a08de2d8150219f
Reviewed-on: https://go-review.googlesource.com/c/text/+/293549
Trust: Marcel van Lohuizen <mpvl@golang.org>
Run-TryBot: Marcel van Lohuizen <mpvl@golang.org>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Russ Cox <rsc@golang.org>
diff --git a/internal/language/language.go b/internal/language/language.go
index 1e74d1a..f41aedc 100644
--- a/internal/language/language.go
+++ b/internal/language/language.go
@@ -303,9 +303,17 @@
 // are of the allowed values defined for the Unicode locale extension ('u') in
 // https://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
 // TypeForKey will traverse the inheritance chain to get the correct value.
+//
+// If there are multiple types associated with a key, only the first will be
+// returned. If there is no type associated with a key, it returns the empty
+// string.
 func (t Tag) TypeForKey(key string) string {
-	if start, end, _ := t.findTypeForKey(key); end != start {
-		return t.str[start:end]
+	if _, start, end, _ := t.findTypeForKey(key); end != start {
+		s := t.str[start:end]
+		if p := strings.IndexByte(s, '-'); p >= 0 {
+			s = s[:p]
+		}
+		return s
 	}
 	return ""
 }
@@ -329,13 +337,13 @@
 
 	// Remove the setting if value is "".
 	if value == "" {
-		start, end, _ := t.findTypeForKey(key)
-		if start != end {
-			// Remove key tag and leading '-'.
-			start -= 4
-
+		start, sep, end, _ := t.findTypeForKey(key)
+		if start != sep {
 			// Remove a possible empty extension.
-			if (end == len(t.str) || t.str[end+2] == '-') && t.str[start-2] == '-' {
+			switch {
+			case t.str[start-2] != '-': // has previous elements.
+			case end == len(t.str), // end of string
+				end+2 < len(t.str) && t.str[end+2] == '-': // end of extension
 				start -= 2
 			}
 			if start == int(t.pVariant) && end == len(t.str) {
@@ -381,14 +389,14 @@
 		t.str = string(buf[:uStart+len(b)])
 	} else {
 		s := t.str
-		start, end, hasExt := t.findTypeForKey(key)
-		if start == end {
+		start, sep, end, hasExt := t.findTypeForKey(key)
+		if start == sep {
 			if hasExt {
 				b = b[2:]
 			}
-			t.str = fmt.Sprintf("%s-%s%s", s[:start], b, s[end:])
+			t.str = fmt.Sprintf("%s-%s%s", s[:sep], b, s[end:])
 		} else {
-			t.str = fmt.Sprintf("%s%s%s", s[:start], value, s[end:])
+			t.str = fmt.Sprintf("%s-%s%s", s[:start+3], value, s[end:])
 		}
 	}
 	return t, nil
@@ -399,10 +407,10 @@
 // wasn't found. The hasExt return value reports whether an -u extension was present.
 // Note: the extensions are typically very small and are likely to contain
 // only one key-type pair.
-func (t Tag) findTypeForKey(key string) (start, end int, hasExt bool) {
+func (t Tag) findTypeForKey(key string) (start, sep, end int, hasExt bool) {
 	p := int(t.pExt)
 	if len(key) != 2 || p == len(t.str) || p == 0 {
-		return p, p, false
+		return p, p, p, false
 	}
 	s := t.str
 
@@ -410,10 +418,10 @@
 	for p++; s[p] != 'u'; p++ {
 		if s[p] > 'u' {
 			p--
-			return p, p, false
+			return p, p, p, false
 		}
 		if p = nextExtension(s, p); p == len(s) {
-			return len(s), len(s), false
+			return len(s), len(s), len(s), false
 		}
 	}
 	// Proceed to the hyphen following the extension name.
@@ -424,40 +432,28 @@
 
 	// Iterate over keys until we get the end of a section.
 	for {
-		// p points to the hyphen preceding the current token.
-		if p3 := p + 3; s[p3] == '-' {
-			// Found a key.
-			// Check whether we just processed the key that was requested.
-			if curKey == key {
-				return start, p, true
+		end = p
+		for p++; p < len(s) && s[p] != '-'; p++ {
+		}
+		n := p - end - 1
+		if n <= 2 && curKey == key {
+			if sep < end {
+				sep++
 			}
-			// Set to the next key and continue scanning type tokens.
-			curKey = s[p+1 : p3]
+			return start, sep, end, true
+		}
+		switch n {
+		case 0, // invalid string
+			1: // next extension
+			return end, end, end, true
+		case 2:
+			// next key
+			curKey = s[end+1 : p]
 			if curKey > key {
-				return p, p, true
+				return end, end, end, true
 			}
-			// Start of the type token sequence.
-			start = p + 4
-			// A type is at least 3 characters long.
-			p += 7 // 4 + 3
-		} else {
-			// Attribute or type, which is at least 3 characters long.
-			p += 4
-		}
-		// p points past the third character of a type or attribute.
-		max := p + 5 // maximum length of token plus hyphen.
-		if len(s) < max {
-			max = len(s)
-		}
-		for ; p < max && s[p] != '-'; p++ {
-		}
-		// Bail if we have exhausted all tokens or if the next token starts
-		// a new extension.
-		if p == len(s) || s[p+2] == '-' {
-			if curKey == key {
-				return start, p, true
-			}
-			return p, p, true
+			start = end
+			sep = p
 		}
 	}
 }
diff --git a/internal/language/language_test.go b/internal/language/language_test.go
index 6c7c108..8244c1c 100644
--- a/internal/language/language_test.go
+++ b/internal/language/language_test.go
@@ -432,7 +432,9 @@
 		{"co", "pinyin", "en-u-co-phonebk-cu-xau", "en-u-co-pinyin-cu-xau", false},
 		{"co", "pinyin", "en-u-co-phonebk-v-xx", "en-u-co-pinyin-v-xx", false},
 		{"co", "pinyin", "en-u-co-phonebk-x-x", "en-u-co-pinyin-x-x", false},
+		{"co", "pinyin", "en-u-co-x-x", "en-u-co-pinyin-x-x", false},
 		{"nu", "arabic", "en-u-co-phonebk-nu-vaai", "en-u-co-phonebk-nu-arabic", false},
+		{"nu", "arabic", "en-u-co-phonebk-nu", "en-u-co-phonebk-nu-arabic", false},
 		// add to existing -u extension
 		{"co", "pinyin", "en-u-ca-gregory", "en-u-ca-gregory-co-pinyin", false},
 		{"co", "pinyin", "en-u-ca-gregory-nu-vaai", "en-u-ca-gregory-co-pinyin-nu-vaai", false},
@@ -441,8 +443,12 @@
 		{"ca", "gregory", "en-u-co-pinyin", "en-u-ca-gregory-co-pinyin", false},
 		// remove pair
 		{"co", "", "en-u-co-phonebk", "en", false},
+		{"co", "", "en-u-co", "en", false},
+		{"co", "", "en-u-co-v", "en", false},
+		{"co", "", "en-u-co-v-", "en", false},
 		{"co", "", "en-u-ca-gregory-co-phonebk", "en-u-ca-gregory", false},
 		{"co", "", "en-u-co-phonebk-nu-arabic", "en-u-nu-arabic", false},
+		{"co", "", "en-u-co-nu-arabic", "en-u-nu-arabic", false},
 		{"co", "", "en", "en", false},
 		// add -u extension
 		{"co", "pinyin", "en", "en-u-co-pinyin", false},
@@ -504,6 +510,8 @@
 		{"cu", false, "en-a-va-v-va", "en-a-va"},
 		{"cu", false, "en-x-a", "en"},
 		// Tags with the -u extension.
+		{"nu", true, "en-u-cu-nu", "en-u-cu"},
+		{"cu", true, "en-u-cu-nu", "en-u"},
 		{"co", true, "en-u-co-standard", "standard"},
 		{"co", true, "yue-u-co-pinyin", "pinyin"},
 		{"co", true, "en-u-co-abc", "abc"},
@@ -519,9 +527,9 @@
 		{"cu", true, "en-u-co-abc-def-nu-arabic", "en-u-co-abc-def"},
 	}
 	for i, tt := range tests {
-		start, end, hasExt := Make(tt.in).findTypeForKey(tt.key)
-		if start != end {
-			res := tt.in[start:end]
+		start, sep, end, hasExt := Make(tt.in).findTypeForKey(tt.key)
+		if sep != end {
+			res := tt.in[sep:end]
 			if res != tt.out {
 				t.Errorf("%d:%s: was %q; want %q", i, tt.in, res, tt.out)
 			}
diff --git a/internal/language/parse.go b/internal/language/parse.go
index a2fdad8..c696fd0 100644
--- a/internal/language/parse.go
+++ b/internal/language/parse.go
@@ -138,7 +138,7 @@
 			b = make([]byte, n)
 			copy(b, s.b[:oldStart])
 		} else {
-			b = s.b[:n:n]
+			b = s.b[:n]
 		}
 		copy(b[end:], s.b[oldEnd:])
 		s.b = b
@@ -483,7 +483,7 @@
 func parseExtension(scan *scanner) int {
 	start, end := scan.start, scan.end
 	switch scan.token[0] {
-	case 'u':
+	case 'u': // https://www.ietf.org/rfc/rfc6067.txt
 		attrStart := end
 		scan.scan()
 		for last := []byte{}; len(scan.token) > 2; scan.scan() {
@@ -503,27 +503,29 @@
 			last = scan.token
 			end = scan.end
 		}
+		// Scan key-type sequences. A key is of length 2 and may be followed
+		// by 0 or more "type" subtags from 3 to the maximum of 8 letters.
 		var last, key []byte
 		for attrEnd := end; len(scan.token) == 2; last = key {
 			key = scan.token
-			keyEnd := scan.end
-			end = scan.acceptMinSize(3)
+			end = scan.end
+			for scan.scan(); end < scan.end && len(scan.token) > 2; scan.scan() {
+				end = scan.end
+			}
 			// TODO: check key value validity
-			if keyEnd == end || bytes.Compare(key, last) != 1 {
+			if bytes.Compare(key, last) != 1 || scan.err != nil {
 				// We have an invalid key or the keys are not sorted.
 				// Start scanning keys from scratch and reorder.
 				p := attrEnd + 1
 				scan.next = p
 				keys := [][]byte{}
 				for scan.scan(); len(scan.token) == 2; {
-					keyStart, keyEnd := scan.start, scan.end
-					end = scan.acceptMinSize(3)
-					if keyEnd != end {
-						keys = append(keys, scan.b[keyStart:end])
-					} else {
-						scan.setError(ErrSyntax)
-						end = keyStart
+					keyStart := scan.start
+					end = scan.end
+					for scan.scan(); end < scan.end && len(scan.token) > 2; scan.scan() {
+						end = scan.end
 					}
+					keys = append(keys, scan.b[keyStart:end])
 				}
 				sort.Stable(bytesSort{keys, 2})
 				if n := len(keys); n > 0 {
@@ -547,7 +549,7 @@
 				break
 			}
 		}
-	case 't':
+	case 't': // https://www.ietf.org/rfc/rfc6497.txt
 		scan.scan()
 		if n := len(scan.token); n >= 2 && n <= 3 && isAlpha(scan.token[1]) {
 			_, end = parseTag(scan)
diff --git a/internal/language/parse_test.go b/internal/language/parse_test.go
index 0cc97d7..e1d428a 100644
--- a/internal/language/parse_test.go
+++ b/internal/language/parse_test.go
@@ -164,13 +164,13 @@
 		{in: "en-9-aa-0-aa-z-bb-x-a", lang: "en", extList: []string{"0-aa", "9-aa", "z-bb", "x-a"}, changed: true},
 		{in: "en-u-c", lang: "en", ext: "", invalid: true},
 		{in: "en-u-co-phonebk", lang: "en", ext: "u-co-phonebk"},
-		{in: "en-u-co-phonebk-ca", lang: "en", ext: "u-co-phonebk", invalid: true},
-		{in: "en-u-nu-arabic-co-phonebk-ca", lang: "en", ext: "u-co-phonebk-nu-arabic", invalid: true, changed: true},
-		{in: "en-u-nu-arabic-co-phonebk-ca-x", lang: "en", ext: "u-co-phonebk-nu-arabic", invalid: true, changed: true},
-		{in: "en-u-nu-arabic-co-phonebk-ca-s", lang: "en", ext: "u-co-phonebk-nu-arabic", invalid: true, changed: true},
-		{in: "en-u-nu-arabic-co-phonebk-ca-a12345678", lang: "en", ext: "u-co-phonebk-nu-arabic", invalid: true, changed: true},
-		{in: "en-u-co-phonebook", lang: "en", ext: "", invalid: true},
-		{in: "en-u-co-phonebook-cu-xau", lang: "en", ext: "u-cu-xau", invalid: true, changed: true},
+		{in: "en-u-co-phonebk-ca", lang: "en", ext: "u-ca-co-phonebk", changed: true},
+		{in: "en-u-nu-arabic-co-phonebk-ca", lang: "en", ext: "u-ca-co-phonebk-nu-arabic", changed: true},
+		{in: "en-u-nu-arabic-co-phonebk-ca-x", lang: "en", ext: "u-ca-co-phonebk-nu-arabic", invalid: true, changed: true},
+		{in: "en-u-nu-arabic-co-phonebk-ca-s", lang: "en", ext: "u-ca-co-phonebk-nu-arabic", invalid: true, changed: true},
+		{in: "en-u-nu-arabic-co-phonebk-ca-a12345678", lang: "en", ext: "u-ca-co-phonebk-nu-arabic", invalid: true, changed: true},
+		{in: "en-u-co-phonebook", lang: "en", ext: "u-co", invalid: true},
+		{in: "en-u-co-phonebook-cu-xau", lang: "en", ext: "u-co-cu-xau", invalid: true, changed: true},
 		{in: "en-Cyrl-u-co-phonebk", lang: "en", script: "Cyrl", ext: "u-co-phonebk"},
 		{in: "en-US-u-co-phonebk", lang: "en", region: "US", ext: "u-co-phonebk"},
 		{in: "en-US-u-co-phonebk-cu-xau", lang: "en", region: "US", ext: "u-co-phonebk-cu-xau"},
@@ -179,9 +179,8 @@
 		{in: "en-u-def-abc-cu-xua-co-phonebk", lang: "en", ext: "u-abc-def-co-phonebk-cu-xua", changed: true},
 		{in: "en-u-def-abc", lang: "en", ext: "u-abc-def", changed: true},
 		{in: "en-u-cu-xua-co-phonebk-a-cd", lang: "en", extList: []string{"a-cd", "u-co-phonebk-cu-xua"}, changed: true},
-		// Invalid "u" extension. Drop invalid parts.
-		{in: "en-u-cu-co-phonebk", lang: "en", extList: []string{"u-co-phonebk"}, invalid: true, changed: true},
-		{in: "en-u-cu-xau-co", lang: "en", extList: []string{"u-cu-xau"}, invalid: true},
+		{in: "en-u-cu-co-phonebk", lang: "en", extList: []string{"u-co-phonebk-cu"}, changed: true},
+		{in: "en-u-cu-xau-co", lang: "en", extList: []string{"u-co-cu-xau"}, changed: true},
 		// LDML spec is not specific about it, but remove duplicates and return an error if the values differ.
 		{in: "en-u-cu-xau-co-phonebk-cu-xau", lang: "en", ext: "u-co-phonebk-cu-xau", changed: true},
 		// No change as the result is a substring of the original!
@@ -351,8 +350,8 @@
 		{"aa-AB", mkInvalid("AB")},
 		// ill-formed wins over invalid.
 		{"ac-u", ErrSyntax},
-		{"ac-u-ca", ErrSyntax},
-		{"ac-u-ca-co-pinyin", ErrSyntax},
+		{"ac-u-ca", mkInvalid("ac")},
+		{"ac-u-ca-co-pinyin", mkInvalid("ac")},
 		{"noob", ErrSyntax},
 	}
 	for _, tt := range tests {
diff --git a/language/language.go b/language/language.go
index abfa17f..289b3a3 100644
--- a/language/language.go
+++ b/language/language.go
@@ -412,6 +412,10 @@
 // are of the allowed values defined for the Unicode locale extension ('u') in
 // https://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
 // TypeForKey will traverse the inheritance chain to get the correct value.
+//
+// If there are multiple types associated with a key, only the first will be
+// returned. If there is no type associated with a key, it returns the empty
+// string.
 func (t Tag) TypeForKey(key string) string {
 	if !compact.Tag(t).MayHaveExtensions() {
 		if key != "rg" && key != "va" {
diff --git a/language/language_test.go b/language/language_test.go
index f7711ba..b2e3ce3 100644
--- a/language/language_test.go
+++ b/language/language_test.go
@@ -523,6 +523,13 @@
 		{"en-GB-u-rg-usz", "en-GB-u-rg-usz", Raw},
 		{"en-GB-u-rg-usz-va-posix", "en-GB-u-rg-usz-va-posix", Raw},
 		{"en-GB-u-rg-usz-co-phonebk", "en-GB-u-co-phonebk-rg-usz", Raw},
+
+		// CVE-2020-28851
+		// invalid key-value pair of -u- extension.
+		{"ES-u-000-00", "es-u-000-00", Raw},
+		{"ES-u-000-00-v-00", "es-u-000-00-v-00", Raw},
+		// reordered and unknown extension.
+		{"ES-v-00-u-000-00", "es-u-000-00-v-00", Raw},
 	}
 	for i, tt := range tests {
 		in, _ := Raw.Parse(tt.in)
@@ -553,6 +560,12 @@
 		{"rg", "en-u-rg-gbzzzz", "gbzzzz"},
 		{"nu", "en-u-co-phonebk-nu-arabic", "arabic"},
 		{"kc", "cmn-u-co-stroke", ""},
+		{"rg", "cmn-u-rg", ""},
+		{"rg", "cmn-u-rg-co-stroke", ""},
+		{"co", "cmn-u-rg-co-stroke", "stroke"},
+		{"co", "cmn-u-co-rg-gbzzzz", ""},
+		{"rg", "cmn-u-co-rg-gbzzzz", "gbzzzz"},
+		{"rg", "cmn-u-rg-gbzzzz-nlzzzz", "gbzzzz"},
 	}
 	for _, tt := range tests {
 		if v := Make(tt.in).TypeForKey(tt.key); v != tt.out {
diff --git a/language/parse_test.go b/language/parse_test.go
index 041660c..4b7e64d 100644
--- a/language/parse_test.go
+++ b/language/parse_test.go
@@ -101,13 +101,13 @@
 		{in: "en-9-aa-0-aa-z-bb-x-a", lang: "en", extList: []string{"0-aa", "9-aa", "z-bb", "x-a"}, changed: true},
 		{in: "en-u-c", lang: "en", ext: "", invalid: true},
 		{in: "en-u-co-phonebk", lang: "en", ext: "u-co-phonebk"},
-		{in: "en-u-co-phonebk-ca", lang: "en", ext: "u-co-phonebk", invalid: true},
-		{in: "en-u-nu-arabic-co-phonebk-ca", lang: "en", ext: "u-co-phonebk-nu-arabic", invalid: true, changed: true},
-		{in: "en-u-nu-arabic-co-phonebk-ca-x", lang: "en", ext: "u-co-phonebk-nu-arabic", invalid: true, changed: true},
-		{in: "en-u-nu-arabic-co-phonebk-ca-s", lang: "en", ext: "u-co-phonebk-nu-arabic", invalid: true, changed: true},
-		{in: "en-u-nu-arabic-co-phonebk-ca-a12345678", lang: "en", ext: "u-co-phonebk-nu-arabic", invalid: true, changed: true},
-		{in: "en-u-co-phonebook", lang: "en", ext: "", invalid: true},
-		{in: "en-u-co-phonebook-cu-xau", lang: "en", ext: "u-cu-xau", invalid: true, changed: true},
+		{in: "en-u-co-phonebk-ca", lang: "en", ext: "u-ca-co-phonebk", invalid: true},
+		{in: "en-u-nu-arabic-co-phonebk-ca", lang: "en", ext: "u-ca-co-phonebk-nu-arabic", invalid: true, changed: true},
+		{in: "en-u-nu-arabic-co-phonebk-ca-x", lang: "en", ext: "u-ca-co-phonebk-nu-arabic", invalid: true, changed: true},
+		{in: "en-u-nu-arabic-co-phonebk-ca-s", lang: "en", ext: "u-ca-co-phonebk-nu-arabic", invalid: true, changed: true},
+		{in: "en-u-nu-arabic-co-phonebk-ca-a12345678", lang: "en", ext: "u-ca-co-phonebk-nu-arabic", invalid: true, changed: true},
+		{in: "en-u-co-phonebook", lang: "en", ext: "u-co", invalid: true},
+		{in: "en-u-co-phonebook-cu-xau", lang: "en", ext: "u-co-cu-xau", invalid: true, changed: true},
 		{in: "en-Cyrl-u-co-phonebk", lang: "en", script: "Cyrl", ext: "u-co-phonebk"},
 		{in: "en-US-u-co-phonebk", lang: "en", region: "US", ext: "u-co-phonebk"},
 		{in: "en-US-u-co-phonebk-cu-xau", lang: "en", region: "US", ext: "u-co-phonebk-cu-xau"},
@@ -117,8 +117,8 @@
 		{in: "en-u-def-abc", lang: "en", ext: "u-abc-def", changed: true},
 		{in: "en-u-cu-xua-co-phonebk-a-cd", lang: "en", extList: []string{"a-cd", "u-co-phonebk-cu-xua"}, changed: true},
 		// Invalid "u" extension. Drop invalid parts.
-		{in: "en-u-cu-co-phonebk", lang: "en", extList: []string{"u-co-phonebk"}, invalid: true, changed: true},
-		{in: "en-u-cu-xau-co", lang: "en", extList: []string{"u-cu-xau"}, invalid: true},
+		{in: "en-u-cu-co-phonebk", lang: "en", extList: []string{"u-co-phonebk-cu"}, invalid: true, changed: true},
+		{in: "en-u-cu-xau-co", lang: "en", extList: []string{"u-co-cu-xau"}, invalid: true},
 		// We allow duplicate keys as the LDML spec does not explicitly prohibit it.
 		// TODO: Consider eliminating duplicates and returning an error.
 		{in: "en-u-cu-xau-co-phonebk-cu-xau", lang: "en", ext: "u-co-phonebk-cu-xau", changed: true},
@@ -219,8 +219,8 @@
 		{"aa-AB", mkInvalid("AB")},
 		// ill-formed wins over invalid.
 		{"ac-u", errSyntax},
-		{"ac-u-ca", errSyntax},
-		{"ac-u-ca-co-pinyin", errSyntax},
+		{"ac-u-ca", mkInvalid("ac")},
+		{"ac-u-ca-co-pinyin", mkInvalid("ac")},
 		{"noob", errSyntax},
 	}
 	for _, tt := range tests {