internal/language: fix canonicalization of extlang
parseTag tries to replace <lang>-<extlang> with <extlang>, but <extlang>
itself can also be replaced with its canonical form, which can have a
different length than the original <extlang> (for example, "fr-est" is
canonicalized to "et"). The existing implementation assumes that the
canonical form is always 3 bytes long and leaves the scanner positions
in an incorrect state when it is not.
Fixes golang/go#41617
Change-Id: Ie0da320530e2545f9b521e7b8cf503d854c50b45
Reviewed-on: https://go-review.googlesource.com/c/text/+/260177
Run-TryBot: Alberto Donizetti <alb.donizetti@gmail.com>
TryBot-Result: Go Bot <gobot@golang.org>
Trust: Alberto Donizetti <alb.donizetti@gmail.com>
Trust: Cherry Mui <cherryyz@google.com>
Trust: Marcel van Lohuizen <mpvl@golang.org>
Reviewed-by: Marcel van Lohuizen <mpvl@golang.org>
diff --git a/internal/language/compact/parse_test.go b/internal/language/compact/parse_test.go
index abe3a58..2db200b 100644
--- a/internal/language/compact/parse_test.go
+++ b/internal/language/compact/parse_test.go
@@ -122,6 +122,11 @@
{in: "en-t-nl-abcd", lang: "en", ext: "t-nl", invalid: true},
{in: "en-t-nl-latn", lang: "en", ext: "t-nl-latn"},
{in: "en-t-t0-abcd-x-a", lang: "en", extList: []string{"t-t0-abcd", "x-a"}},
+ {in: "en_t_pt_MLt", lang: "en", ext: "t-pt-mlt", changed: true},
+ {in: "en-t-fr-est", lang: "en", ext: "t-fr-est", changed: false},
+ {in: "fr-est", lang: "et", changed: true},
+ {in: "fr-est-t-fr-est", lang: "et", ext: "t-fr-est", changed: true},
+ {in: "fr-est-Cyrl", lang: "et", script: "Cyrl", changed: true},
// invalid
{in: "", lang: "und", invalid: true},
{in: "-", lang: "und", invalid: true},
diff --git a/internal/language/language_test.go b/internal/language/language_test.go
index 8244c1c..668034d 100644
--- a/internal/language/language_test.go
+++ b/internal/language/language_test.go
@@ -681,6 +681,8 @@
"en-t-t0-abcd",
"en-t-nl-latn",
"en-t-t0-abcd-x-a",
+ "en_t_pt_MLt",
+ "en-t-fr-est",
}
// Change, but not memory allocation required.
benchSimpleChange = []string{
diff --git a/internal/language/parse.go b/internal/language/parse.go
index 47ee0fe..aad1e0a 100644
--- a/internal/language/parse.go
+++ b/internal/language/parse.go
@@ -270,7 +270,7 @@
} else if n >= 4 {
return Und, ErrSyntax
} else { // the usual case
- t, end = parseTag(scan)
+ t, end = parseTag(scan, true)
if n := len(scan.token); n == 1 {
t.pExt = uint16(end)
end = parseExtensions(scan)
@@ -296,7 +296,8 @@
// parseTag parses language, script, region and variants.
// It returns a Tag and the end position in the input that was parsed.
-func parseTag(scan *scanner) (t Tag, end int) {
+// If doNorm is true, then <lang>-<extlang> will be normalized to <extlang>.
+func parseTag(scan *scanner, doNorm bool) (t Tag, end int) {
var e error
// TODO: set an error if an unknown lang, script or region is encountered.
t.LangID, e = getLangID(scan.token)
@@ -307,14 +308,17 @@
for len(scan.token) == 3 && isAlpha(scan.token[0]) {
// From http://tools.ietf.org/html/bcp47, <lang>-<extlang> tags are equivalent
// to a tag of the form <extlang>.
- lang, e := getLangID(scan.token)
- if lang != 0 {
- t.LangID = lang
- copy(scan.b[langStart:], lang.String())
- scan.b[langStart+3] = '-'
- scan.start = langStart + 4
+ if doNorm {
+ lang, e := getLangID(scan.token)
+ if lang != 0 {
+ t.LangID = lang
+ langStr := lang.String()
+ copy(scan.b[langStart:], langStr)
+ scan.b[langStart+len(langStr)] = '-'
+ scan.start = langStart + len(langStr) + 1
+ }
+ scan.gobble(e)
}
- scan.gobble(e)
end = scan.scan()
}
if len(scan.token) == 4 && isAlpha(scan.token[0]) {
@@ -559,7 +563,7 @@
case 't': // https://www.ietf.org/rfc/rfc6497.txt
scan.scan()
if n := len(scan.token); n >= 2 && n <= 3 && isAlpha(scan.token[1]) {
- _, end = parseTag(scan)
+ _, end = parseTag(scan, false)
scan.toLower(start, end)
}
for len(scan.token) == 2 && !isAlpha(scan.token[1]) {
diff --git a/internal/language/parse_test.go b/internal/language/parse_test.go
index e1d428a..0af9e8a 100644
--- a/internal/language/parse_test.go
+++ b/internal/language/parse_test.go
@@ -192,6 +192,14 @@
{in: "en-t-nl-abcd", lang: "en", ext: "t-nl", invalid: true},
{in: "en-t-nl-latn", lang: "en", ext: "t-nl-latn"},
{in: "en-t-t0-abcd-x-a", lang: "en", extList: []string{"t-t0-abcd", "x-a"}},
+ {in: "en_t_pt_MLt", lang: "en", ext: "t-pt-mlt", changed: true},
+ {in: "en-t-fr-est", lang: "en", ext: "t-fr-est", changed: false},
+ {in: "fr-est", lang: "et", changed: false},
+ {in: "fr-est-Cyrl", lang: "et", script: "Cyrl", changed: false},
+ // This table is shared by TestParse and TestParseExtensions, so the case
+ // below is left disabled: TestParse expects changed to be true because the
+ // input as a whole is reformatted, whereas TestParseExtensions expects
+ // changed to be false because the extension part itself is not.
+ // {in: "fr-est-t-fr-est", lang: "et", ext: "t-fr-est", changed: true},
// invalid
{in: "", lang: "und", invalid: true},
{in: "-", lang: "und", invalid: true},
@@ -299,7 +307,7 @@
return Tag{}, true
}
scan := makeScannerString(tt.in)
- id, end := parseTag(&scan)
+ id, end := parseTag(&scan, true)
id.str = string(scan.b[:end])
tt.ext = ""
tt.extList = []string{}
diff --git a/language/language_test.go b/language/language_test.go
index b2e3ce3..d45706c 100644
--- a/language/language_test.go
+++ b/language/language_test.go
@@ -723,6 +723,8 @@
"en-t-t0-abcd",
"en-t-nl-latn",
"en-t-t0-abcd-x-a",
+ "en_t_pt_MLt",
+ "en-t-fr-est",
}
// Change, but not memory allocation required.
benchSimpleChange = []string{
diff --git a/language/parse_test.go b/language/parse_test.go
index 4b7e64d..e1e5653 100644
--- a/language/parse_test.go
+++ b/language/parse_test.go
@@ -129,6 +129,11 @@
{in: "en-t-nl-abcd", lang: "en", ext: "t-nl", invalid: true},
{in: "en-t-nl-latn", lang: "en", ext: "t-nl-latn"},
{in: "en-t-t0-abcd-x-a", lang: "en", extList: []string{"t-t0-abcd", "x-a"}},
+ {in: "en_t_pt_MLt", lang: "en", ext: "t-pt-mlt", changed: true},
+ {in: "en-t-fr-est", lang: "en", ext: "t-fr-est", changed: false},
+ {in: "fr-est", lang: "et", changed: true},
+ {in: "fr-est-t-fr-est", lang: "et", ext: "t-fr-est", changed: true},
+ {in: "fr-est-Cyrl", lang: "et", script: "Cyrl", changed: true},
// invalid
{in: "", lang: "und", invalid: true},
{in: "-", lang: "und", invalid: true},