internal/language: fix canonicalization of extlang

parseTag replaces <lang>-<extlang> with <extlang>, but <extlang> itself
may in turn be replaced by its canonical form, which can differ in
length from the original <extlang> (for example, "fr-est" becomes "et").
The existing implementation assumes that the canonical form is 3 bytes
long and leaves the scanner positions in an incorrect state when it is
not.

parseTag now takes a doNorm argument so that this rewrite is applied to
the main tag only and not to the tag embedded in a -t- extension.

Fixes golang/go#41617

Change-Id: Ie0da320530e2545f9b521e7b8cf503d854c50b45
Reviewed-on: https://go-review.googlesource.com/c/text/+/260177
Run-TryBot: Alberto Donizetti <alb.donizetti@gmail.com>
TryBot-Result: Go Bot <gobot@golang.org>
Trust: Alberto Donizetti <alb.donizetti@gmail.com>
Trust: Cherry Mui <cherryyz@google.com>
Trust: Marcel van Lohuizen <mpvl@golang.org>
Reviewed-by: Marcel van Lohuizen <mpvl@golang.org>
diff --git a/internal/language/compact/parse_test.go b/internal/language/compact/parse_test.go
index abe3a58..2db200b 100644
--- a/internal/language/compact/parse_test.go
+++ b/internal/language/compact/parse_test.go
@@ -122,6 +122,11 @@
 		{in: "en-t-nl-abcd", lang: "en", ext: "t-nl", invalid: true},
 		{in: "en-t-nl-latn", lang: "en", ext: "t-nl-latn"},
 		{in: "en-t-t0-abcd-x-a", lang: "en", extList: []string{"t-t0-abcd", "x-a"}},
+		{in: "en_t_pt_MLt", lang: "en", ext: "t-pt-mlt", changed: true},
+		{in: "en-t-fr-est", lang: "en", ext: "t-fr-est", changed: false},
+		{in: "fr-est", lang: "et", changed: true},
+		{in: "fr-est-t-fr-est", lang: "et", ext: "t-fr-est", changed: true},
+		{in: "fr-est-Cyrl", lang: "et", script: "Cyrl", changed: true},
 		// invalid
 		{in: "", lang: "und", invalid: true},
 		{in: "-", lang: "und", invalid: true},
diff --git a/internal/language/language_test.go b/internal/language/language_test.go
index 8244c1c..668034d 100644
--- a/internal/language/language_test.go
+++ b/internal/language/language_test.go
@@ -681,6 +681,8 @@
 		"en-t-t0-abcd",
 		"en-t-nl-latn",
 		"en-t-t0-abcd-x-a",
+		"en_t_pt_MLt",
+		"en-t-fr-est",
 	}
 	// Change, but not memory allocation required.
 	benchSimpleChange = []string{
diff --git a/internal/language/parse.go b/internal/language/parse.go
index 47ee0fe..aad1e0a 100644
--- a/internal/language/parse.go
+++ b/internal/language/parse.go
@@ -270,7 +270,7 @@
 	} else if n >= 4 {
 		return Und, ErrSyntax
 	} else { // the usual case
-		t, end = parseTag(scan)
+		t, end = parseTag(scan, true)
 		if n := len(scan.token); n == 1 {
 			t.pExt = uint16(end)
 			end = parseExtensions(scan)
@@ -296,7 +296,8 @@
 
 // parseTag parses language, script, region and variants.
 // It returns a Tag and the end position in the input that was parsed.
-func parseTag(scan *scanner) (t Tag, end int) {
+// If doNorm is true, then <lang>-<extlang> will be normalized to <extlang>.
+func parseTag(scan *scanner, doNorm bool) (t Tag, end int) {
 	var e error
 	// TODO: set an error if an unknown lang, script or region is encountered.
 	t.LangID, e = getLangID(scan.token)
@@ -307,14 +308,17 @@
 	for len(scan.token) == 3 && isAlpha(scan.token[0]) {
 		// From http://tools.ietf.org/html/bcp47, <lang>-<extlang> tags are equivalent
 		// to a tag of the form <extlang>.
-		lang, e := getLangID(scan.token)
-		if lang != 0 {
-			t.LangID = lang
-			copy(scan.b[langStart:], lang.String())
-			scan.b[langStart+3] = '-'
-			scan.start = langStart + 4
+		if doNorm {
+			lang, e := getLangID(scan.token)
+			if lang != 0 {
+				t.LangID = lang
+				langStr := lang.String()
+				copy(scan.b[langStart:], langStr)
+				scan.b[langStart+len(langStr)] = '-'
+				scan.start = langStart + len(langStr) + 1
+			}
+			scan.gobble(e)
 		}
-		scan.gobble(e)
 		end = scan.scan()
 	}
 	if len(scan.token) == 4 && isAlpha(scan.token[0]) {
@@ -559,7 +563,7 @@
 	case 't': // https://www.ietf.org/rfc/rfc6497.txt
 		scan.scan()
 		if n := len(scan.token); n >= 2 && n <= 3 && isAlpha(scan.token[1]) {
-			_, end = parseTag(scan)
+			_, end = parseTag(scan, false)
 			scan.toLower(start, end)
 		}
 		for len(scan.token) == 2 && !isAlpha(scan.token[1]) {
diff --git a/internal/language/parse_test.go b/internal/language/parse_test.go
index e1d428a..0af9e8a 100644
--- a/internal/language/parse_test.go
+++ b/internal/language/parse_test.go
@@ -192,6 +192,14 @@
 		{in: "en-t-nl-abcd", lang: "en", ext: "t-nl", invalid: true},
 		{in: "en-t-nl-latn", lang: "en", ext: "t-nl-latn"},
 		{in: "en-t-t0-abcd-x-a", lang: "en", extList: []string{"t-t0-abcd", "x-a"}},
+		{in: "en_t_pt_MLt", lang: "en", ext: "t-pt-mlt", changed: true},
+		{in: "en-t-fr-est", lang: "en", ext: "t-fr-est", changed: false},
+		{in: "fr-est", lang: "et", changed: false},
+		{in: "fr-est-Cyrl", lang: "et", script: "Cyrl", changed: false},
+		// The same input is used by both TestParse and TestParseExtensions.
+		// changed should be true for it in TestParse but false in TestParseExtensions,
+		// because the input as a whole is reformatted while its extension part is not.
+		// {in: "fr-est-t-fr-est", lang: "et", ext: "t-fr-est", changed: true},
 		// invalid
 		{in: "", lang: "und", invalid: true},
 		{in: "-", lang: "und", invalid: true},
@@ -299,7 +307,7 @@
 			return Tag{}, true
 		}
 		scan := makeScannerString(tt.in)
-		id, end := parseTag(&scan)
+		id, end := parseTag(&scan, true)
 		id.str = string(scan.b[:end])
 		tt.ext = ""
 		tt.extList = []string{}
diff --git a/language/language_test.go b/language/language_test.go
index b2e3ce3..d45706c 100644
--- a/language/language_test.go
+++ b/language/language_test.go
@@ -723,6 +723,8 @@
 		"en-t-t0-abcd",
 		"en-t-nl-latn",
 		"en-t-t0-abcd-x-a",
+		"en_t_pt_MLt",
+		"en-t-fr-est",
 	}
 	// Change, but not memory allocation required.
 	benchSimpleChange = []string{
diff --git a/language/parse_test.go b/language/parse_test.go
index 4b7e64d..e1e5653 100644
--- a/language/parse_test.go
+++ b/language/parse_test.go
@@ -129,6 +129,11 @@
 		{in: "en-t-nl-abcd", lang: "en", ext: "t-nl", invalid: true},
 		{in: "en-t-nl-latn", lang: "en", ext: "t-nl-latn"},
 		{in: "en-t-t0-abcd-x-a", lang: "en", extList: []string{"t-t0-abcd", "x-a"}},
+		{in: "en_t_pt_MLt", lang: "en", ext: "t-pt-mlt", changed: true},
+		{in: "en-t-fr-est", lang: "en", ext: "t-fr-est", changed: false},
+		{in: "fr-est", lang: "et", changed: true},
+		{in: "fr-est-t-fr-est", lang: "et", ext: "t-fr-est", changed: true},
+		{in: "fr-est-Cyrl", lang: "et", script: "Cyrl", changed: true},
 		// invalid
 		{in: "", lang: "und", invalid: true},
 		{in: "-", lang: "und", invalid: true},