language/internal: finalize Builder and refine -u semantics

This is to aid the Compose implementation in language.

This also treats key-value pairs in -u extensions as individual
entities, allowing them to be merged and overwritten.

Multiple values of the same key are now disallowed; the
first value is taken. It is not considered an error, though,
allowing tags with different values to be composed.

Change-Id: I65f1e2a2645d15289a973356b9cae33d66068c2b
Reviewed-on: https://go-review.googlesource.com/95831
Run-TryBot: Marcel van Lohuizen <mpvl@golang.org>
Reviewed-by: Nigel Tao <nigeltao@golang.org>
diff --git a/language/examples_test.go b/language/examples_test.go
index d5e8176..68caa3f 100644
--- a/language/examples_test.go
+++ b/language/examples_test.go
@@ -162,7 +162,7 @@
 	// ja-US <nil>
 	// nl-US-u-nu-arabic <nil>
 	// nl-1901-u-co-phonebk <nil>
-	// nl-1901-u-nu-arabic <nil>
+	// nl-1901-u-co-phonebk-nu-arabic <nil>
 	// und-1901-u-co-phonebk <nil>
 	// de-u-co-phonebk <nil>
 	// de-1901 <nil>
diff --git a/language/internal/compose.go b/language/internal/compose.go
index 772c3d4..4ae78e0 100644
--- a/language/internal/compose.go
+++ b/language/internal/compose.go
@@ -9,64 +9,130 @@
 	"strings"
 )
 
+// A Builder allows constructing a Tag from individual components.
+// Its main user is Compose in the top-level language package.
 type Builder struct {
 	Tag Tag
 
-	Private string // the x extension
-	Ext     []string
-	Variant []string
-
-	Err error
+	private    string // the x extension
+	variants   []string
+	extensions []string
 }
 
+// Make returns a new Tag from the current settings.
 func (b *Builder) Make() Tag {
 	t := b.Tag
 
-	if len(b.Ext) > 0 || len(b.Variant) > 0 {
-		sort.Sort(sortVariants(b.Variant))
-		sort.Strings(b.Ext)
-		if b.Private != "" {
-			b.Ext = append(b.Ext, b.Private)
+	if len(b.extensions) > 0 || len(b.variants) > 0 {
+		sort.Sort(sortVariants(b.variants))
+		sort.Strings(b.extensions)
+
+		if b.private != "" {
+			b.extensions = append(b.extensions, b.private)
 		}
-		n := maxCoreSize + tokenLen(b.Variant...) + tokenLen(b.Ext...)
+		n := maxCoreSize + tokenLen(b.variants...) + tokenLen(b.extensions...)
 		buf := make([]byte, n)
 		p := t.genCoreBytes(buf)
 		t.pVariant = byte(p)
-		p += appendTokens(buf[p:], b.Variant...)
+		p += appendTokens(buf[p:], b.variants...)
 		t.pExt = uint16(p)
-		p += appendTokens(buf[p:], b.Ext...)
+		p += appendTokens(buf[p:], b.extensions...)
 		t.str = string(buf[:p])
-	} else if b.Private != "" {
-		t.str = b.Private
+		// We may not always need to remake the string, but when or when not
+		// to do so is rather tricky.
+		scan := makeScanner(buf[:p])
+		t, _ = parse(&scan, "")
+		return t
+
+	} else if b.private != "" {
+		t.str = b.private
 		t.RemakeString()
 	}
 	return t
 }
 
+// SetTag copies all the settings from a given Tag. Any previously set values
+// are discarded.
 func (b *Builder) SetTag(t Tag) {
 	b.Tag.LangID = t.LangID
 	b.Tag.RegionID = t.RegionID
 	b.Tag.ScriptID = t.ScriptID
 	// TODO: optimize
-	b.Variant = b.Variant[:0]
+	b.variants = b.variants[:0]
 	if variants := t.Variants(); variants != "" {
 		for _, vr := range strings.Split(variants[1:], "-") {
-			b.Variant = append(b.Variant, vr)
+			b.variants = append(b.variants, vr)
 		}
 	}
-	b.Ext, b.Private = b.Ext[:0], ""
+	b.extensions, b.private = b.extensions[:0], ""
 	for _, e := range t.Extensions() {
 		b.AddExt(e)
 	}
 }
 
+// AddExt adds extension e to the tag. e must be a valid extension as returned
+// by Tag.Extension. If the extension already exists, it will be discarded,
+// except for a -u extension, where non-existing key-type pairs will added.
 func (b *Builder) AddExt(e string) {
-	if e == "" {
-	} else if e[0] == 'x' {
-		b.Private = e
-	} else {
-		b.Ext = append(b.Ext, e)
+	if e[0] == 'x' {
+		if b.private == "" {
+			b.private = e
+		}
+		return
 	}
+	for i, s := range b.extensions {
+		if s[0] == e[0] {
+			if e[0] == 'u' {
+				b.extensions[i] += e[1:]
+			}
+			return
+		}
+	}
+	b.extensions = append(b.extensions, e)
+}
+
+// SetExt sets the extension e to the tag. e must be a valid extension as
+// returned by Tag.Extension. If the extension already exists, it will be
+// overwritten, except for a -u extension, where the individual key-type pairs
+// will be set.
+func (b *Builder) SetExt(e string) {
+	if e[0] == 'x' {
+		b.private = e
+		return
+	}
+	for i, s := range b.extensions {
+		if s[0] == e[0] {
+			if e[0] == 'u' {
+				b.extensions[i] = e + s[1:]
+			} else {
+				b.extensions[i] = e
+			}
+			return
+		}
+	}
+	b.extensions = append(b.extensions, e)
+}
+
+// AddVariant adds any number of variants.
+func (b *Builder) AddVariant(v ...string) {
+	for _, v := range v {
+		if v != "" {
+			b.variants = append(b.variants, v)
+		}
+	}
+}
+
+// ClearVariants removes any variants previously added, including those
+// copied from a Tag in SetTag.
+func (b *Builder) ClearVariants() {
+	b.variants = b.variants[:0]
+}
+
+// ClearExtensions removes any extensions previously added, including those
+// copied from a Tag in SetTag.
+func (b *Builder) ClearExtensions() {
+	b.private = ""
+	b.extensions = b.extensions[:0]
 }
 
 func tokenLen(token ...string) (n int) {
diff --git a/language/internal/compose_test.go b/language/internal/compose_test.go
new file mode 100644
index 0000000..1930d3d
--- /dev/null
+++ b/language/internal/compose_test.go
@@ -0,0 +1,67 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package language
+
+import (
+	"strings"
+	"testing"
+)
+
+func parseBase(s string) Language {
+	if s == "" {
+		return 0
+	}
+	return MustParseBase(s)
+}
+
+func parseScript(s string) Script {
+	if s == "" {
+		return 0
+	}
+	return MustParseScript(s)
+}
+
+func parseRegion(s string) Region {
+	if s == "" {
+		return 0
+	}
+	return MustParseRegion(s)
+}
+
+func TestBuilder(t *testing.T) {
+	partChecks(t, func(t *testing.T, tt *parseTest) (id Tag, skip bool) {
+		tag := Make(tt.in)
+		b := Builder{}
+		b.SetTag(Tag{
+			LangID:   parseBase(tt.lang),
+			ScriptID: parseScript(tt.script),
+			RegionID: parseRegion(tt.region),
+		})
+		if tt.variants != "" {
+			b.AddVariant(strings.Split(tt.variants, "-")...)
+		}
+		for _, e := range tag.Extensions() {
+			b.AddExt(e)
+		}
+		got := b.Make()
+		if got != tag {
+			t.Errorf("%s: got %v; want %v", tt.in, got, tag)
+		}
+		return got, false
+	})
+}
+
+func TestSetTag(t *testing.T) {
+	partChecks(t, func(t *testing.T, tt *parseTest) (id Tag, skip bool) {
+		tag := Make(tt.in)
+		b := Builder{}
+		b.SetTag(tag)
+		got := b.Make()
+		if got != tag {
+			t.Errorf("%s: got %v; want %v", tt.in, got, tag)
+		}
+		return got, false
+	})
+}
diff --git a/language/internal/parse.go b/language/internal/parse.go
index 9953dac..eca4663 100644
--- a/language/internal/parse.go
+++ b/language/internal/parse.go
@@ -34,6 +34,10 @@
 // TODO: return the position at which the syntax error occurred?
 var ErrSyntax = errors.New("language: tag is not well-formed")
 
+// ErrDuplicateKey is returned when a tag contains the same key twice with
+// different values in the -u section.
+var ErrDuplicateKey = errors.New("language: different values for same key in -u extension")
+
 // ValueError is returned by any of the parsing functions when the
 // input is well-formed but the respective subtag is not recognized
 // as a valid value.
@@ -164,7 +168,6 @@
 
 // deleteRange removes the given range from s.b before the current token.
 func (s *scanner) deleteRange(start, end int) {
-	s.setError(ErrSyntax)
 	s.b = s.b[:start+copy(s.b[start:], s.b[end:])]
 	diff := end - start
 	s.next -= diff
@@ -411,18 +414,27 @@
 	return s.i[i] < s.i[j]
 }
 
-type bytesSort [][]byte
+type bytesSort struct {
+	b [][]byte
+	n int // first n bytes to compare
+}
 
 func (b bytesSort) Len() int {
-	return len(b)
+	return len(b.b)
 }
 
 func (b bytesSort) Swap(i, j int) {
-	b[i], b[j] = b[j], b[i]
+	b.b[i], b.b[j] = b.b[j], b.b[i]
 }
 
 func (b bytesSort) Less(i, j int) bool {
-	return bytes.Compare(b[i], b[j]) == -1
+	for k := 0; k < b.n; k++ {
+		if b.b[i][k] == b.b[j][k] {
+			continue
+		}
+		return b.b[i][k] < b.b[j][k]
+	}
+	return false
 }
 
 // parseExtensions parses and normalizes the extensions in the buffer.
@@ -451,7 +463,7 @@
 		}
 		exts = append(exts, extension)
 	}
-	sort.Sort(bytesSort(exts))
+	sort.Sort(bytesSort{exts, 1})
 	if len(private) > 0 {
 		exts = append(exts, private)
 	}
@@ -483,7 +495,7 @@
 					attrs = append(attrs, scan.token)
 					end = scan.end
 				}
-				sort.Sort(bytesSort(attrs))
+				sort.Sort(bytesSort{attrs, 3})
 				copy(scan.b[p:], bytes.Join(attrs, separator))
 				break
 			}
@@ -512,13 +524,25 @@
 						end = keyStart
 					}
 				}
-				sort.Sort(bytesSort(keys))
+				sort.Stable(bytesSort{keys, 2})
+				if n := len(keys); n > 0 {
+					k := 0
+					for i := 1; i < n; i++ {
+						if !bytes.Equal(keys[k][:2], keys[i][:2]) {
+							k++
+							keys[k] = keys[i]
+						} else if !bytes.Equal(keys[k], keys[i]) {
+							scan.setError(ErrDuplicateKey)
+						}
+					}
+					keys = keys[:k+1]
+				}
 				reordered := bytes.Join(keys, separator)
 				if e := p + len(reordered); e < end {
 					scan.deleteRange(e, end)
 					end = e
 				}
-				copy(scan.b[p:], bytes.Join(keys, separator))
+				copy(scan.b[p:], reordered)
 				break
 			}
 		}
diff --git a/language/internal/parse_test.go b/language/internal/parse_test.go
index f0d6b48..0cc97d7 100644
--- a/language/internal/parse_test.go
+++ b/language/internal/parse_test.go
@@ -182,9 +182,10 @@
 		// Invalid "u" extension. Drop invalid parts.
 		{in: "en-u-cu-co-phonebk", lang: "en", extList: []string{"u-co-phonebk"}, invalid: true, changed: true},
 		{in: "en-u-cu-xau-co", lang: "en", extList: []string{"u-cu-xau"}, invalid: true},
-		// We allow duplicate keys as the LDML spec does not explicitly prohibit it.
-		// TODO: Consider eliminating duplicates and returning an error.
-		{in: "en-u-cu-xau-co-phonebk-cu-xau", lang: "en", ext: "u-co-phonebk-cu-xau-cu-xau", changed: true},
+		// LDML spec is not specific about it, but remove duplicates and return an error if the values differ.
+		{in: "en-u-cu-xau-co-phonebk-cu-xau", lang: "en", ext: "u-co-phonebk-cu-xau", changed: true},
+		// No change as the result is a substring of the original!
+		{in: "en-US-u-cu-xau-cu-eur", lang: "en", region: "US", ext: "u-cu-xau", invalid: true, changed: false},
 		{in: "en-t-en-Cyrl-NL-fonipa", lang: "en", ext: "t-en-cyrl-nl-fonipa", changed: true},
 		{in: "en-t-en-Cyrl-NL-fonipa-t0-abc-def", lang: "en", ext: "t-en-cyrl-nl-fonipa-t0-abc-def", changed: true},
 		{in: "en-t-t0-abcd", lang: "en", ext: "t-t0-abcd"},
diff --git a/language/language.go b/language/language.go
index 74b939d..9ddff22 100644
--- a/language/language.go
+++ b/language/language.go
@@ -508,8 +508,8 @@
 		if t.HasVariants() {
 			if t.HasExtensions() {
 				build := language.Builder{}
-				build.SetTag(t)
-				build.Private, build.Ext = "", nil
+				build.SetTag(language.Tag{LangID: b, ScriptID: s, RegionID: r})
+				build.AddVariant(t.Variants())
 				exact = false
 				t = build.Make()
 			}
diff --git a/language/parse.go b/language/parse.go
index 78122f6..e54a0a4 100644
--- a/language/parse.go
+++ b/language/parse.go
@@ -59,10 +59,11 @@
 // Base, Script or Region or slice of type Variant or Extension is passed more
 // than once, the latter will overwrite the former. Variants and Extensions are
 // accumulated, but if two extensions of the same type are passed, the latter
-// will replace the former. A Tag overwrites all former values and typically
-// only makes sense as the first argument. The resulting tag is returned after
-// canonicalizing using the Default CanonType. If one or more errors are
-// encountered, one of the errors is returned.
+// will replace the former. For -u extensions, though, the key-type pairs are
+// added, where later values overwrite older ones. A Tag overwrites all former
+// values and typically only makes sense as the first argument. The resulting
+// tag is returned after canonicalizing using the Default CanonType. If one or
+// more errors are encountered, one of the errors is returned.
 func Compose(part ...interface{}) (t Tag, err error) {
 	return Default.Compose(part...)
 }
@@ -72,7 +73,8 @@
 // Base, Script or Region or slice of type Variant or Extension is passed more
 // than once, the latter will overwrite the former. Variants and Extensions are
 // accumulated, but if two extensions of the same type are passed, the latter
-// will replace the former. A Tag overwrites all former values and typically
+// will replace the former. For -u extensions, though, the key-type pairs are
+// added, where later values overwrite older ones. A Tag overwrites all former values and typically
 // only makes sense as the first argument. The resulting tag is returned after
 // canonicalizing using CanonType c. If one or more errors are encountered,
 // one of the errors is returned.
@@ -88,19 +90,6 @@
 var errInvalidArgument = errors.New("invalid Extension or Variant")
 
 func update(b *language.Builder, part ...interface{}) (err error) {
-	replace := func(l *[]string, s string, eq func(a, b string) bool) bool {
-		if s == "" {
-			b.Err = errInvalidArgument
-			return true
-		}
-		for i, v := range *l {
-			if eq(v, s) {
-				(*l)[i] = s
-				return true
-			}
-		}
-		return false
-	}
 	for _, x := range part {
 		switch v := x.(type) {
 		case Tag:
@@ -112,26 +101,32 @@
 		case Region:
 			b.Tag.RegionID = v.regionID
 		case Variant:
-			if !replace(&b.Variant, v.variant, func(a, b string) bool { return a == b }) {
-				b.Variant = append(b.Variant, v.variant)
+			if v.variant == "" {
+				err = errInvalidArgument
+				break
 			}
+			b.AddVariant(v.variant)
 		case Extension:
-			if !replace(&b.Ext, v.s, func(a, b string) bool { return a[0] == b[0] }) {
-				b.AddExt(v.s)
+			if v.s == "" {
+				err = errInvalidArgument
+				break
 			}
+			b.SetExt(v.s)
 		case []Variant:
-			b.Variant = nil
-			for _, x := range v {
-				update(b, x)
+			b.ClearVariants()
+			for _, v := range v {
+				b.AddVariant(v.variant)
 			}
 		case []Extension:
-			b.Ext, b.Private = nil, ""
+			b.ClearExtensions()
 			for _, e := range v {
-				update(b, e)
+				b.SetExt(e.s)
 			}
 		// TODO: support parsing of raw strings based on morphology or just extensions?
 		case error:
-			err = v
+			if v != nil {
+				err = v
+			}
 		}
 	}
 	return
diff --git a/language/parse_test.go b/language/parse_test.go
index 0982d92..7a5b54b 100644
--- a/language/parse_test.go
+++ b/language/parse_test.go
@@ -116,7 +116,7 @@
 		{in: "en-u-cu-xau-co", lang: "en", extList: []string{"u-cu-xau"}, invalid: true},
 		// We allow duplicate keys as the LDML spec does not explicitly prohibit it.
 		// TODO: Consider eliminating duplicates and returning an error.
-		{in: "en-u-cu-xau-co-phonebk-cu-xau", lang: "en", ext: "u-co-phonebk-cu-xau-cu-xau", changed: true},
+		{in: "en-u-cu-xau-co-phonebk-cu-xau", lang: "en", ext: "u-co-phonebk-cu-xau", changed: true},
 		{in: "en-t-en-Cyrl-NL-fonipa", lang: "en", ext: "t-en-cyrl-nl-fonipa", changed: true},
 		{in: "en-t-en-Cyrl-NL-fonipa-t0-abc-def", lang: "en", ext: "t-en-cyrl-nl-fonipa-t0-abc-def", changed: true},
 		{in: "en-t-t0-abcd", lang: "en", ext: "t-t0-abcd"},
@@ -253,8 +253,10 @@
 		r, _ := ParseRegion(tt.region)
 		p := []interface{}{l, s, r, s, r, l}
 		for _, x := range strings.Split(tt.variants, "-") {
-			v, _ := ParseVariant(x)
-			p = append(p, v)
+			if x != "" {
+				v, _ := ParseVariant(x)
+				p = append(p, v)
+			}
 		}
 		for _, x := range tt.extList {
 			e, _ := ParseExtension(x)