collate: move iter from collate to internal/colltab This allows it also to be used by the search package to implement search on non-normalized strings. Change-Id: I1265bd49ccca2c3fc22a7bb3d4f11f33375c3045 Reviewed-on: https://go-review.googlesource.com/10571 Reviewed-by: Nigel Tao <nigeltao@golang.org>

commit: 5c66c9ff778784a956f9eae4e20308396ecbd11d [log] [tgz]
author: Marcel van Lohuizen <mpvl@golang.org> Mon Jun 01 20:47:02 2015 +0200
committer: Marcel van Lohuizen <mpvl@golang.org> Tue Jun 02 16:32:54 2015 +0000
tree: 955d11eb03ca0d104ca7dd2d1815d8be4d3f6369
parent: 043fdd192e9613ace925f509247e9bbee0b23f42 [diff]
diff --git a/collate/collate.go b/collate/collate.go
index 530491b..a0f04ff 100644
--- a/collate/collate.go
+++ b/collate/collate.go

@@ -107,8 +107,8 @@
 func (c *Collator) Compare(a, b []byte) int {
 	// TODO: skip identical prefixes once we have a fast way to detect if a rune is
 	// part of a contraction. This would lead to roughly a 10% speedup for the colcmp regtest.
-	c.iter(0).setInput(a)
-	c.iter(1).setInput(b)
+	c.iter(0).SetInput(a)
+	c.iter(1).SetInput(b)
 	if res := c.compare(); res != 0 {
 		return res
 	}
@@ -123,8 +123,8 @@
 func (c *Collator) CompareString(a, b string) int {
 	// TODO: skip identical prefixes once we have a fast way to detect if a rune is
 	// part of a contraction. This would lead to roughly a 10% speedup for the colcmp regtest.
-	c.iter(0).setInputString(a)
-	c.iter(1).setInputString(b)
+	c.iter(0).SetInputString(a)
+	c.iter(1).SetInputString(b)
 	if res := c.compare(); res != 0 {
 		return res
 	}
@@ -219,166 +219,41 @@
 
 func (c *Collator) getColElems(str []byte) []colltab.Elem {
 	i := c.iter(0)
-	i.setInput(str)
-	for i.next() {
+	i.SetInput(str)
+	for i.Next() {
 	}
-	return i.ce
+	return i.Elems
 }
 
 func (c *Collator) getColElemsString(str string) []colltab.Elem {
 	i := c.iter(0)
-	i.setInputString(str)
-	for i.next() {
+	i.SetInputString(str)
+	for i.Next() {
 	}
-	return i.ce
+	return i.Elems
 }
 
 type iter struct {
-	bytes []byte
-	str   string
+	wa [512]colltab.Elem
 
-	wa  [512]colltab.Elem
-	ce  []colltab.Elem
+	newcolltab.Iter
 	pce int
-	nce int // nce <= len(nce)
-
-	prevCCC  uint8
-	pStarter int
-
-	t colltab.Weighter
 }
 
 func (i *iter) init(c *Collator) {
-	i.t = c.t
-	i.ce = i.wa[:0]
-}
-
-func (i *iter) reset() {
-	i.ce = i.ce[:0]
-	i.nce = 0
-	i.prevCCC = 0
-	i.pStarter = 0
-}
-
-func (i *iter) setInput(s []byte) *iter {
-	i.bytes = s
-	i.str = ""
-	i.reset()
-	return i
-}
-
-func (i *iter) setInputString(s string) *iter {
-	i.str = s
-	i.bytes = nil
-	i.reset()
-	return i
-}
-
-func (i *iter) done() bool {
-	return len(i.str) == 0 && len(i.bytes) == 0
-}
-
-func (i *iter) tail(n int) {
-	if i.bytes == nil {
-		i.str = i.str[n:]
-	} else {
-		i.bytes = i.bytes[n:]
-	}
-}
-
-func (i *iter) appendNext() int {
-	var sz int
-	if i.bytes == nil {
-		i.ce, sz = i.t.AppendNextString(i.ce, i.str)
-	} else {
-		i.ce, sz = i.t.AppendNext(i.ce, i.bytes)
-	}
-	return sz
-}
-
-// next appends Elems to the internal array until it adds an element with CCC=0.
-// In the majority of cases, a Elem with a primary value > 0 will have
-// a CCC of 0. The CCC values of colation elements are also used to detect if the
-// input string was not normalized and to adjust the result accordingly.
-func (i *iter) next() bool {
-	for !i.done() {
-		p0 := len(i.ce)
-		sz := i.appendNext()
-		i.tail(sz)
-		last := len(i.ce) - 1
-		if ccc := i.ce[last].CCC(); ccc == 0 {
-			i.nce = len(i.ce)
-			i.pStarter = last
-			i.prevCCC = 0
-			return true
-		} else if p0 < last && i.ce[p0].CCC() == 0 {
-			// set i.nce to only cover part of i.ce for which ccc == 0 and
-			// use rest the next call to next.
-			for p0++; p0 < last && i.ce[p0].CCC() == 0; p0++ {
-			}
-			i.nce = p0
-			i.pStarter = p0 - 1
-			i.prevCCC = ccc
-			return true
-		} else if ccc < i.prevCCC {
-			i.doNorm(p0, ccc) // should be rare for most common cases
-		} else {
-			i.prevCCC = ccc
-		}
-	}
-	if len(i.ce) != i.nce {
-		i.nce = len(i.ce)
-		return true
-	}
-	return false
-}
-
-// nextPlain is the same as next, but does not "normalize" the collation
-// elements.
-// TODO: remove this function. Using this instead of next does not seem
-// to improve performance in any significant way. We retain this until
-// later for evaluation purposes.
-func (i *iter) nextPlain() bool {
-	if i.done() {
-		return false
-	}
-	sz := i.appendNext()
-	i.tail(sz)
-	i.nce = len(i.ce)
-	return true
-}
-
-const maxCombiningCharacters = 30
-
-// doNorm reorders the collation elements in i.ce.
-// It assumes that blocks of collation elements added with appendNext
-// either start and end with the same CCC or start with CCC == 0.
-// This allows for a single insertion point for the entire block.
-// The correctness of this assumption is verified in builder.go.
-func (i *iter) doNorm(p int, ccc uint8) {
-	if p-i.pStarter > maxCombiningCharacters {
-		i.prevCCC = i.ce[len(i.ce)-1].CCC()
-		i.pStarter = len(i.ce) - 1
-		return
-	}
-	n := len(i.ce)
-	k := p
-	for p--; p > i.pStarter && ccc < i.ce[p-1].CCC(); p-- {
-	}
-	i.ce = append(i.ce, i.ce[p:k]...)
-	copy(i.ce[p:], i.ce[k:])
-	i.ce = i.ce[:n]
+	i.Weighter = c.t
+	i.Elems = i.wa[:0]
 }
 
 func (i *iter) nextPrimary() int {
 	for {
-		for ; i.pce < i.nce; i.pce++ {
-			if v := i.ce[i.pce].Primary(); v != 0 {
+		for ; i.pce < i.N; i.pce++ {
+			if v := i.Elems[i.pce].Primary(); v != 0 {
 				i.pce++
 				return v
 			}
 		}
-		if !i.next() {
+		if !i.Next() {
 			return 0
 		}
 	}
@@ -386,8 +261,8 @@
 }
 
 func (i *iter) nextSecondary() int {
-	for ; i.pce < len(i.ce); i.pce++ {
-		if v := i.ce[i.pce].Secondary(); v != 0 {
+	for ; i.pce < len(i.Elems); i.pce++ {
+		if v := i.Elems[i.pce].Secondary(); v != 0 {
 			i.pce++
 			return v
 		}
@@ -396,8 +271,8 @@
 }
 
 func (i *iter) prevSecondary() int {
-	for ; i.pce < len(i.ce); i.pce++ {
-		if v := i.ce[len(i.ce)-i.pce-1].Secondary(); v != 0 {
+	for ; i.pce < len(i.Elems); i.pce++ {
+		if v := i.Elems[len(i.Elems)-i.pce-1].Secondary(); v != 0 {
 			i.pce++
 			return v
 		}
@@ -406,8 +281,8 @@
 }
 
 func (i *iter) nextTertiary() int {
-	for ; i.pce < len(i.ce); i.pce++ {
-		if v := i.ce[i.pce].Tertiary(); v != 0 {
+	for ; i.pce < len(i.Elems); i.pce++ {
+		if v := i.Elems[i.pce].Tertiary(); v != 0 {
 			i.pce++
 			return int(v)
 		}
@@ -416,8 +291,8 @@
 }
 
 func (i *iter) nextQuaternary() int {
-	for ; i.pce < len(i.ce); i.pce++ {
-		if v := i.ce[i.pce].Quaternary(); v != 0 {
+	for ; i.pce < len(i.Elems); i.pce++ {
+		if v := i.Elems[i.pce].Quaternary(); v != 0 {
 			i.pce++
 			return v
 		}

diff --git a/collate/collate_test.go b/collate/collate_test.go
index dbb8c43..a79ef4b 100644
--- a/collate/collate_test.go
+++ b/collate/collate_test.go

@@ -449,80 +449,6 @@
 	}
 }
 
-func TestDoNorm(t *testing.T) {
-	const div = -1 // The insertion point of the next block.
-	tests := []struct {
-		in, out []int
-	}{
-		{in: []int{4, div, 3},
-			out: []int{3, 4},
-		},
-		{in: []int{4, div, 3, 3, 3},
-			out: []int{3, 3, 3, 4},
-		},
-		{in: []int{0, 4, div, 3},
-			out: []int{0, 3, 4},
-		},
-		{in: []int{0, 0, 4, 5, div, 3, 3},
-			out: []int{0, 0, 3, 3, 4, 5},
-		},
-		{in: []int{0, 0, 1, 4, 5, div, 3, 3},
-			out: []int{0, 0, 1, 3, 3, 4, 5},
-		},
-		{in: []int{0, 0, 1, 4, 5, div, 4, 4},
-			out: []int{0, 0, 1, 4, 4, 4, 5},
-		},
-	}
-	for j, tt := range tests {
-		i := iter{}
-		var w, p, s int
-		for k, cc := range tt.in {
-			if cc == 0 {
-				s = 0
-			}
-			if cc == div {
-				w = 100
-				p = k
-				i.pStarter = s
-				continue
-			}
-			i.ce = append(i.ce, makeCE([]int{w, defaultSecondary, 2, cc}))
-		}
-		i.prevCCC = i.ce[p-1].CCC()
-		i.doNorm(p, i.ce[p].CCC())
-		if len(i.ce) != len(tt.out) {
-			t.Errorf("%d: length was %d; want %d", j, len(i.ce), len(tt.out))
-		}
-		prevCCC := uint8(0)
-		for k, ce := range i.ce {
-			if int(ce.CCC()) != tt.out[k] {
-				t.Errorf("%d:%d: unexpected CCC. Was %d; want %d", j, k, ce.CCC(), tt.out[k])
-			}
-			if k > 0 && ce.CCC() == prevCCC && i.ce[k-1].Primary() > ce.Primary() {
-				t.Errorf("%d:%d: normalization crossed across CCC boundary.", j, k)
-			}
-		}
-	}
-	// test cutoff of large sequence of combining characters.
-	result := []uint8{8, 8, 8, 5, 5}
-	for o := -2; o <= 2; o++ {
-		i := iter{pStarter: 2, prevCCC: 8}
-		n := maxCombiningCharacters + 1 + o
-		for j := 1; j < n+i.pStarter; j++ {
-			i.ce = append(i.ce, makeCE([]int{100, defaultSecondary, 2, 8}))
-		}
-		p := len(i.ce)
-		i.ce = append(i.ce, makeCE([]int{0, defaultSecondary, 2, 5}))
-		i.doNorm(p, 5)
-		if i.prevCCC != result[o+2] {
-			t.Errorf("%d: i.prevCCC was %d; want %d", n, i.prevCCC, result[o+2])
-		}
-		if result[o+2] == 5 && i.pStarter != p {
-			t.Errorf("%d: i.pStarter was %d; want %d", n, i.pStarter, p)
-		}
-	}
-}
-
 func TestNumeric(t *testing.T) {
 	c := New(language.English, Loose, Numeric)
 

diff --git a/internal/colltab/iter.go b/internal/colltab/iter.go
new file mode 100644
index 0000000..88f83a4
--- /dev/null
+++ b/internal/colltab/iter.go

@@ -0,0 +1,143 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package colltab
+
+import (
+	"golang.org/x/text/collate/colltab"
+)
+
+// An Iter incrementally converts chunks of the input text to collation
+// elements, while ensuring that the collation elements are in normalized order
+// (that is, they are in the order as if the input text were normalized first).
+type Iter struct {
+	Weighter colltab.Weighter
+	Elems    []colltab.Elem
+	// N is the number of elements in Elems that will not be reordered on
+	// subsequent iterations, N <= len(Elems).
+	N int
+
+	bytes []byte
+	str   string
+
+	prevCCC  uint8
+	pStarter int
+}
+
+func (i *Iter) reset() {
+	i.Elems = i.Elems[:0]
+	i.N = 0
+	i.prevCCC = 0
+	i.pStarter = 0
+}
+
+// SetInput resets i to input s.
+func (i *Iter) SetInput(s []byte) {
+	i.bytes = s
+	i.str = ""
+	i.reset()
+}
+
+// SetInputString resets i to input s.
+func (i *Iter) SetInputString(s string) {
+	i.str = s
+	i.bytes = nil
+	i.reset()
+}
+
+func (i *Iter) done() bool {
+	return len(i.str) == 0 && len(i.bytes) == 0
+}
+
+func (i *Iter) tail(n int) {
+	if i.bytes == nil {
+		i.str = i.str[n:]
+	} else {
+		i.bytes = i.bytes[n:]
+	}
+}
+
+func (i *Iter) appendNext() int {
+	var sz int
+	if i.bytes == nil {
+		i.Elems, sz = i.Weighter.AppendNextString(i.Elems, i.str)
+	} else {
+		i.Elems, sz = i.Weighter.AppendNext(i.Elems, i.bytes)
+	}
+	return sz
+}
+
+// Next appends Elems to the internal array until it adds an element with CCC=0.
+// In the majority of cases, a Elem with a primary value > 0 will have a CCC of
+// 0. The CCC values of collation elements are also used to detect if the input
+// string was not normalized and to adjust the result accordingly.
+func (i *Iter) Next() bool {
+	for !i.done() {
+		p0 := len(i.Elems)
+		sz := i.appendNext()
+		i.tail(sz)
+		last := len(i.Elems) - 1
+		if ccc := i.Elems[last].CCC(); ccc == 0 {
+			i.N = len(i.Elems)
+			i.pStarter = last
+			i.prevCCC = 0
+			return true
+		} else if p0 < last && i.Elems[p0].CCC() == 0 {
+			// set i.N to only cover part of i.Elems for which ccc == 0 and
+			// use rest for the next call to next.
+			for p0++; p0 < last && i.Elems[p0].CCC() == 0; p0++ {
+			}
+			i.N = p0
+			i.pStarter = p0 - 1
+			i.prevCCC = ccc
+			return true
+		} else if ccc < i.prevCCC {
+			i.doNorm(p0, ccc) // should be rare, never occurs for NFD and FCC.
+		} else {
+			i.prevCCC = ccc
+		}
+	}
+	if len(i.Elems) != i.N {
+		i.N = len(i.Elems)
+		return true
+	}
+	return false
+}
+
+// nextNoNorm is the same as next, but does not "normalize" the collation
+// elements.
+func (i *Iter) nextNoNorm() bool {
+	// TODO: remove this function. Using this instead of next does not seem
+	// to improve performance in any significant way. We retain this until
+	// later for evaluation purposes.
+	if i.done() {
+		return false
+	}
+	sz := i.appendNext()
+	i.tail(sz)
+	i.N = len(i.Elems)
+	return true
+}
+
+const maxCombiningCharacters = 30
+
+// doNorm reorders the collation elements in i.Elems.
+// It assumes that blocks of collation elements added with appendNext
+// either start and end with the same CCC or start with CCC == 0.
+// This allows for a single insertion point for the entire block.
+// The correctness of this assumption is verified in builder.go.
+func (i *Iter) doNorm(p int, ccc uint8) {
+	if p-i.pStarter > maxCombiningCharacters {
+		i.prevCCC = i.Elems[len(i.Elems)-1].CCC()
+		i.pStarter = len(i.Elems) - 1
+		return
+	}
+	n := len(i.Elems)
+	k := p
+	for p--; p > i.pStarter && ccc < i.Elems[p-1].CCC(); p-- {
+	}
+	i.Elems = append(i.Elems, i.Elems[p:k]...)
+	copy(i.Elems[p:], i.Elems[k:])
+	i.Elems = i.Elems[:n]
+}

diff --git a/internal/colltab/iter_test.go b/internal/colltab/iter_test.go
new file mode 100644
index 0000000..fbafbe5
--- /dev/null
+++ b/internal/colltab/iter_test.go

@@ -0,0 +1,97 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package colltab
+
+import (
+	"testing"
+
+	"golang.org/x/text/collate/colltab"
+)
+
+const (
+	defaultSecondary = 0x20
+)
+
+func makeCE(w []int) colltab.Elem {
+	ce, err := colltab.MakeElem(w[0], w[1], w[2], uint8(w[3]))
+	if err != nil {
+		panic(err)
+	}
+	return ce
+}
+
+func TestDoNorm(t *testing.T) {
+	const div = -1 // The insertion point of the next block.
+	tests := []struct {
+		in, out []int
+	}{{
+		in:  []int{4, div, 3},
+		out: []int{3, 4},
+	}, {
+		in:  []int{4, div, 3, 3, 3},
+		out: []int{3, 3, 3, 4},
+	}, {
+		in:  []int{0, 4, div, 3},
+		out: []int{0, 3, 4},
+	}, {
+		in:  []int{0, 0, 4, 5, div, 3, 3},
+		out: []int{0, 0, 3, 3, 4, 5},
+	}, {
+		in:  []int{0, 0, 1, 4, 5, div, 3, 3},
+		out: []int{0, 0, 1, 3, 3, 4, 5},
+	}, {
+		in:  []int{0, 0, 1, 4, 5, div, 4, 4},
+		out: []int{0, 0, 1, 4, 4, 4, 5},
+	},
+	}
+	for j, tt := range tests {
+		i := Iter{}
+		var w, p, s int
+		for k, cc := range tt.in {
+			if cc == 0 {
+				s = 0
+			}
+			if cc == div {
+				w = 100
+				p = k
+				i.pStarter = s
+				continue
+			}
+			i.Elems = append(i.Elems, makeCE([]int{w, defaultSecondary, 2, cc}))
+		}
+		i.prevCCC = i.Elems[p-1].CCC()
+		i.doNorm(p, i.Elems[p].CCC())
+		if len(i.Elems) != len(tt.out) {
+			t.Errorf("%d: length was %d; want %d", j, len(i.Elems), len(tt.out))
+		}
+		prevCCC := uint8(0)
+		for k, ce := range i.Elems {
+			if int(ce.CCC()) != tt.out[k] {
+				t.Errorf("%d:%d: unexpected CCC. Was %d; want %d", j, k, ce.CCC(), tt.out[k])
+			}
+			if k > 0 && ce.CCC() == prevCCC && i.Elems[k-1].Primary() > ce.Primary() {
+				t.Errorf("%d:%d: normalization crossed across CCC boundary.", j, k)
+			}
+		}
+	}
+	// test cutoff of large sequence of combining characters.
+	result := []uint8{8, 8, 8, 5, 5}
+	for o := -2; o <= 2; o++ {
+		i := Iter{pStarter: 2, prevCCC: 8}
+		n := maxCombiningCharacters + 1 + o
+		for j := 1; j < n+i.pStarter; j++ {
+			i.Elems = append(i.Elems, makeCE([]int{100, defaultSecondary, 2, 8}))
+		}
+		p := len(i.Elems)
+		i.Elems = append(i.Elems, makeCE([]int{0, defaultSecondary, 2, 5}))
+		i.doNorm(p, 5)
+		if i.prevCCC != result[o+2] {
+			t.Errorf("%d: i.prevCCC was %d; want %d", n, i.prevCCC, result[o+2])
+		}
+		if result[o+2] == 5 && i.pStarter != p {
+			t.Errorf("%d: i.pStarter was %d; want %d", n, i.pStarter, p)
+		}
+	}
+}
commit	5c66c9ff778784a956f9eae4e20308396ecbd11d	[log] [tgz]
author	Marcel van Lohuizen <mpvl@golang.org>	Mon Jun 01 20:47:02 2015 +0200
committer	Marcel van Lohuizen <mpvl@golang.org>	Tue Jun 02 16:32:54 2015 +0000
tree	955d11eb03ca0d104ca7dd2d1815d8be4d3f6369
parent	043fdd192e9613ace925f509247e9bbee0b23f42 [diff]