language/internal: finalize Builder and refine -u semantics
This is to aid the Compose implementation in language.
This also treats key-value pairs in -u extensions as individual
entities, allowing them to be merged and overwritten.
Multiple values of the same key are now disallowed; the
first value is taken. It is not considered an error, though,
allowing tags with different values to be composed.
Change-Id: I65f1e2a2645d15289a973356b9cae33d66068c2b
Reviewed-on: https://go-review.googlesource.com/95831
Run-TryBot: Marcel van Lohuizen <mpvl@golang.org>
Reviewed-by: Nigel Tao <nigeltao@golang.org>
diff --git a/language/examples_test.go b/language/examples_test.go
index d5e8176..68caa3f 100644
--- a/language/examples_test.go
+++ b/language/examples_test.go
@@ -162,7 +162,7 @@
// ja-US <nil>
// nl-US-u-nu-arabic <nil>
// nl-1901-u-co-phonebk <nil>
- // nl-1901-u-nu-arabic <nil>
+ // nl-1901-u-co-phonebk-nu-arabic <nil>
// und-1901-u-co-phonebk <nil>
// de-u-co-phonebk <nil>
// de-1901 <nil>
diff --git a/language/internal/compose.go b/language/internal/compose.go
index 772c3d4..4ae78e0 100644
--- a/language/internal/compose.go
+++ b/language/internal/compose.go
@@ -9,64 +9,130 @@
"strings"
)
+// A Builder allows constructing a Tag from individual components.
+// Its main user is Compose in the top-level language package.
type Builder struct {
Tag Tag
- Private string // the x extension
- Ext []string
- Variant []string
-
- Err error
+ private string // the x extension
+ variants []string
+ extensions []string
}
+// Make returns a new Tag from the current settings.
func (b *Builder) Make() Tag {
t := b.Tag
- if len(b.Ext) > 0 || len(b.Variant) > 0 {
- sort.Sort(sortVariants(b.Variant))
- sort.Strings(b.Ext)
- if b.Private != "" {
- b.Ext = append(b.Ext, b.Private)
+ if len(b.extensions) > 0 || len(b.variants) > 0 {
+ sort.Sort(sortVariants(b.variants))
+ sort.Strings(b.extensions)
+
+ if b.private != "" {
+ b.extensions = append(b.extensions, b.private)
}
- n := maxCoreSize + tokenLen(b.Variant...) + tokenLen(b.Ext...)
+ n := maxCoreSize + tokenLen(b.variants...) + tokenLen(b.extensions...)
buf := make([]byte, n)
p := t.genCoreBytes(buf)
t.pVariant = byte(p)
- p += appendTokens(buf[p:], b.Variant...)
+ p += appendTokens(buf[p:], b.variants...)
t.pExt = uint16(p)
- p += appendTokens(buf[p:], b.Ext...)
+ p += appendTokens(buf[p:], b.extensions...)
t.str = string(buf[:p])
- } else if b.Private != "" {
- t.str = b.Private
+ // We may not always need to remake the string, but when or when not
+ // to do so is rather tricky.
+ scan := makeScanner(buf[:p])
+ t, _ = parse(&scan, "")
+ return t
+
+ } else if b.private != "" {
+ t.str = b.private
t.RemakeString()
}
return t
}
+// SetTag copies all the settings from a given Tag. Any previously set values
+// are discarded.
func (b *Builder) SetTag(t Tag) {
b.Tag.LangID = t.LangID
b.Tag.RegionID = t.RegionID
b.Tag.ScriptID = t.ScriptID
// TODO: optimize
- b.Variant = b.Variant[:0]
+ b.variants = b.variants[:0]
if variants := t.Variants(); variants != "" {
for _, vr := range strings.Split(variants[1:], "-") {
- b.Variant = append(b.Variant, vr)
+ b.variants = append(b.variants, vr)
}
}
- b.Ext, b.Private = b.Ext[:0], ""
+ b.extensions, b.private = b.extensions[:0], ""
for _, e := range t.Extensions() {
b.AddExt(e)
}
}
+// AddExt adds extension e to the tag. e must be a valid extension as returned
+// by Tag.Extension. If the extension already exists, it will be discarded,
+// except for a -u extension, where non-existing key-type pairs will added.
func (b *Builder) AddExt(e string) {
- if e == "" {
- } else if e[0] == 'x' {
- b.Private = e
- } else {
- b.Ext = append(b.Ext, e)
+ if e[0] == 'x' {
+ if b.private == "" {
+ b.private = e
+ }
+ return
}
+ for i, s := range b.extensions {
+ if s[0] == e[0] {
+ if e[0] == 'u' {
+ b.extensions[i] += e[1:]
+ }
+ return
+ }
+ }
+ b.extensions = append(b.extensions, e)
+}
+
+// SetExt sets the extension e to the tag. e must be a valid extension as
+// returned by Tag.Extension. If the extension already exists, it will be
+// overwritten, except for a -u extension, where the individual key-type pairs
+// will be set.
+func (b *Builder) SetExt(e string) {
+ if e[0] == 'x' {
+ b.private = e
+ return
+ }
+ for i, s := range b.extensions {
+ if s[0] == e[0] {
+ if e[0] == 'u' {
+ b.extensions[i] = e + s[1:]
+ } else {
+ b.extensions[i] = e
+ }
+ return
+ }
+ }
+ b.extensions = append(b.extensions, e)
+}
+
+// AddVariant adds any number of variants.
+func (b *Builder) AddVariant(v ...string) {
+ for _, v := range v {
+ if v != "" {
+ b.variants = append(b.variants, v)
+ }
+ }
+}
+
+// ClearVariants removes any variants previously added, including those
+// copied from a Tag in SetTag.
+func (b *Builder) ClearVariants() {
+ b.variants = b.variants[:0]
+}
+
+// ClearExtensions removes any extensions previously added, including those
+// copied from a Tag in SetTag.
+func (b *Builder) ClearExtensions() {
+ b.private = ""
+ b.extensions = b.extensions[:0]
}
func tokenLen(token ...string) (n int) {
diff --git a/language/internal/compose_test.go b/language/internal/compose_test.go
new file mode 100644
index 0000000..1930d3d
--- /dev/null
+++ b/language/internal/compose_test.go
@@ -0,0 +1,67 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package language
+
+import (
+ "strings"
+ "testing"
+)
+
+func parseBase(s string) Language {
+ if s == "" {
+ return 0
+ }
+ return MustParseBase(s)
+}
+
+func parseScript(s string) Script {
+ if s == "" {
+ return 0
+ }
+ return MustParseScript(s)
+}
+
+func parseRegion(s string) Region {
+ if s == "" {
+ return 0
+ }
+ return MustParseRegion(s)
+}
+
+func TestBuilder(t *testing.T) {
+ partChecks(t, func(t *testing.T, tt *parseTest) (id Tag, skip bool) {
+ tag := Make(tt.in)
+ b := Builder{}
+ b.SetTag(Tag{
+ LangID: parseBase(tt.lang),
+ ScriptID: parseScript(tt.script),
+ RegionID: parseRegion(tt.region),
+ })
+ if tt.variants != "" {
+ b.AddVariant(strings.Split(tt.variants, "-")...)
+ }
+ for _, e := range tag.Extensions() {
+ b.AddExt(e)
+ }
+ got := b.Make()
+ if got != tag {
+ t.Errorf("%s: got %v; want %v", tt.in, got, tag)
+ }
+ return got, false
+ })
+}
+
+func TestSetTag(t *testing.T) {
+ partChecks(t, func(t *testing.T, tt *parseTest) (id Tag, skip bool) {
+ tag := Make(tt.in)
+ b := Builder{}
+ b.SetTag(tag)
+ got := b.Make()
+ if got != tag {
+ t.Errorf("%s: got %v; want %v", tt.in, got, tag)
+ }
+ return got, false
+ })
+}
diff --git a/language/internal/parse.go b/language/internal/parse.go
index 9953dac..eca4663 100644
--- a/language/internal/parse.go
+++ b/language/internal/parse.go
@@ -34,6 +34,10 @@
// TODO: return the position at which the syntax error occurred?
var ErrSyntax = errors.New("language: tag is not well-formed")
+// ErrDuplicateKey is returned when a tag contains the same key twice with
+// different values in the -u section.
+var ErrDuplicateKey = errors.New("language: different values for same key in -u extension")
+
// ValueError is returned by any of the parsing functions when the
// input is well-formed but the respective subtag is not recognized
// as a valid value.
@@ -164,7 +168,6 @@
// deleteRange removes the given range from s.b before the current token.
func (s *scanner) deleteRange(start, end int) {
- s.setError(ErrSyntax)
s.b = s.b[:start+copy(s.b[start:], s.b[end:])]
diff := end - start
s.next -= diff
@@ -411,18 +414,27 @@
return s.i[i] < s.i[j]
}
-type bytesSort [][]byte
+type bytesSort struct {
+ b [][]byte
+ n int // first n bytes to compare
+}
func (b bytesSort) Len() int {
- return len(b)
+ return len(b.b)
}
func (b bytesSort) Swap(i, j int) {
- b[i], b[j] = b[j], b[i]
+ b.b[i], b.b[j] = b.b[j], b.b[i]
}
func (b bytesSort) Less(i, j int) bool {
- return bytes.Compare(b[i], b[j]) == -1
+ for k := 0; k < b.n; k++ {
+ if b.b[i][k] == b.b[j][k] {
+ continue
+ }
+ return b.b[i][k] < b.b[j][k]
+ }
+ return false
}
// parseExtensions parses and normalizes the extensions in the buffer.
@@ -451,7 +463,7 @@
}
exts = append(exts, extension)
}
- sort.Sort(bytesSort(exts))
+ sort.Sort(bytesSort{exts, 1})
if len(private) > 0 {
exts = append(exts, private)
}
@@ -483,7 +495,7 @@
attrs = append(attrs, scan.token)
end = scan.end
}
- sort.Sort(bytesSort(attrs))
+ sort.Sort(bytesSort{attrs, 3})
copy(scan.b[p:], bytes.Join(attrs, separator))
break
}
@@ -512,13 +524,25 @@
end = keyStart
}
}
- sort.Sort(bytesSort(keys))
+ sort.Stable(bytesSort{keys, 2})
+ if n := len(keys); n > 0 {
+ k := 0
+ for i := 1; i < n; i++ {
+ if !bytes.Equal(keys[k][:2], keys[i][:2]) {
+ k++
+ keys[k] = keys[i]
+ } else if !bytes.Equal(keys[k], keys[i]) {
+ scan.setError(ErrDuplicateKey)
+ }
+ }
+ keys = keys[:k+1]
+ }
reordered := bytes.Join(keys, separator)
if e := p + len(reordered); e < end {
scan.deleteRange(e, end)
end = e
}
- copy(scan.b[p:], bytes.Join(keys, separator))
+ copy(scan.b[p:], reordered)
break
}
}
diff --git a/language/internal/parse_test.go b/language/internal/parse_test.go
index f0d6b48..0cc97d7 100644
--- a/language/internal/parse_test.go
+++ b/language/internal/parse_test.go
@@ -182,9 +182,10 @@
// Invalid "u" extension. Drop invalid parts.
{in: "en-u-cu-co-phonebk", lang: "en", extList: []string{"u-co-phonebk"}, invalid: true, changed: true},
{in: "en-u-cu-xau-co", lang: "en", extList: []string{"u-cu-xau"}, invalid: true},
- // We allow duplicate keys as the LDML spec does not explicitly prohibit it.
- // TODO: Consider eliminating duplicates and returning an error.
- {in: "en-u-cu-xau-co-phonebk-cu-xau", lang: "en", ext: "u-co-phonebk-cu-xau-cu-xau", changed: true},
+ // LDML spec is not specific about it, but remove duplicates and return an error if the values differ.
+ {in: "en-u-cu-xau-co-phonebk-cu-xau", lang: "en", ext: "u-co-phonebk-cu-xau", changed: true},
+ // No change as the result is a substring of the original!
+ {in: "en-US-u-cu-xau-cu-eur", lang: "en", region: "US", ext: "u-cu-xau", invalid: true, changed: false},
{in: "en-t-en-Cyrl-NL-fonipa", lang: "en", ext: "t-en-cyrl-nl-fonipa", changed: true},
{in: "en-t-en-Cyrl-NL-fonipa-t0-abc-def", lang: "en", ext: "t-en-cyrl-nl-fonipa-t0-abc-def", changed: true},
{in: "en-t-t0-abcd", lang: "en", ext: "t-t0-abcd"},
diff --git a/language/language.go b/language/language.go
index 74b939d..9ddff22 100644
--- a/language/language.go
+++ b/language/language.go
@@ -508,8 +508,8 @@
if t.HasVariants() {
if t.HasExtensions() {
build := language.Builder{}
- build.SetTag(t)
- build.Private, build.Ext = "", nil
+ build.SetTag(language.Tag{LangID: b, ScriptID: s, RegionID: r})
+ build.AddVariant(t.Variants())
exact = false
t = build.Make()
}
diff --git a/language/parse.go b/language/parse.go
index 78122f6..e54a0a4 100644
--- a/language/parse.go
+++ b/language/parse.go
@@ -59,10 +59,11 @@
// Base, Script or Region or slice of type Variant or Extension is passed more
// than once, the latter will overwrite the former. Variants and Extensions are
// accumulated, but if two extensions of the same type are passed, the latter
-// will replace the former. A Tag overwrites all former values and typically
-// only makes sense as the first argument. The resulting tag is returned after
-// canonicalizing using the Default CanonType. If one or more errors are
-// encountered, one of the errors is returned.
+// will replace the former. For -u extensions, though, the key-type pairs are
+// added, where later values overwrite older ones. A Tag overwrites all former
+// values and typically only makes sense as the first argument. The resulting
+// tag is returned after canonicalizing using the Default CanonType. If one or
+// more errors are encountered, one of the errors is returned.
func Compose(part ...interface{}) (t Tag, err error) {
return Default.Compose(part...)
}
@@ -72,7 +73,8 @@
// Base, Script or Region or slice of type Variant or Extension is passed more
// than once, the latter will overwrite the former. Variants and Extensions are
// accumulated, but if two extensions of the same type are passed, the latter
-// will replace the former. A Tag overwrites all former values and typically
+// will replace the former. For -u extensions, though, the key-type pairs are
+// added, where later values overwrite older ones. A Tag overwrites all former values and typically
// only makes sense as the first argument. The resulting tag is returned after
// canonicalizing using CanonType c. If one or more errors are encountered,
// one of the errors is returned.
@@ -88,19 +90,6 @@
var errInvalidArgument = errors.New("invalid Extension or Variant")
func update(b *language.Builder, part ...interface{}) (err error) {
- replace := func(l *[]string, s string, eq func(a, b string) bool) bool {
- if s == "" {
- b.Err = errInvalidArgument
- return true
- }
- for i, v := range *l {
- if eq(v, s) {
- (*l)[i] = s
- return true
- }
- }
- return false
- }
for _, x := range part {
switch v := x.(type) {
case Tag:
@@ -112,26 +101,32 @@
case Region:
b.Tag.RegionID = v.regionID
case Variant:
- if !replace(&b.Variant, v.variant, func(a, b string) bool { return a == b }) {
- b.Variant = append(b.Variant, v.variant)
+ if v.variant == "" {
+ err = errInvalidArgument
+ break
}
+ b.AddVariant(v.variant)
case Extension:
- if !replace(&b.Ext, v.s, func(a, b string) bool { return a[0] == b[0] }) {
- b.AddExt(v.s)
+ if v.s == "" {
+ err = errInvalidArgument
+ break
}
+ b.SetExt(v.s)
case []Variant:
- b.Variant = nil
- for _, x := range v {
- update(b, x)
+ b.ClearVariants()
+ for _, v := range v {
+ b.AddVariant(v.variant)
}
case []Extension:
- b.Ext, b.Private = nil, ""
+ b.ClearExtensions()
for _, e := range v {
- update(b, e)
+ b.SetExt(e.s)
}
// TODO: support parsing of raw strings based on morphology or just extensions?
case error:
- err = v
+ if v != nil {
+ err = v
+ }
}
}
return
diff --git a/language/parse_test.go b/language/parse_test.go
index 0982d92..7a5b54b 100644
--- a/language/parse_test.go
+++ b/language/parse_test.go
@@ -116,7 +116,7 @@
{in: "en-u-cu-xau-co", lang: "en", extList: []string{"u-cu-xau"}, invalid: true},
// We allow duplicate keys as the LDML spec does not explicitly prohibit it.
// TODO: Consider eliminating duplicates and returning an error.
- {in: "en-u-cu-xau-co-phonebk-cu-xau", lang: "en", ext: "u-co-phonebk-cu-xau-cu-xau", changed: true},
+ {in: "en-u-cu-xau-co-phonebk-cu-xau", lang: "en", ext: "u-co-phonebk-cu-xau", changed: true},
{in: "en-t-en-Cyrl-NL-fonipa", lang: "en", ext: "t-en-cyrl-nl-fonipa", changed: true},
{in: "en-t-en-Cyrl-NL-fonipa-t0-abc-def", lang: "en", ext: "t-en-cyrl-nl-fonipa-t0-abc-def", changed: true},
{in: "en-t-t0-abcd", lang: "en", ext: "t-t0-abcd"},
@@ -253,8 +253,10 @@
r, _ := ParseRegion(tt.region)
p := []interface{}{l, s, r, s, r, l}
for _, x := range strings.Split(tt.variants, "-") {
- v, _ := ParseVariant(x)
- p = append(p, v)
+ if x != "" {
+ v, _ := ParseVariant(x)
+ p = append(p, v)
+ }
}
for _, x := range tt.extList {
e, _ := ParseExtension(x)