collate: move iter from collate to internal/colltab
This allows it also to be used by the search package to
implement search on non-normalized strings.
Change-Id: I1265bd49ccca2c3fc22a7bb3d4f11f33375c3045
Reviewed-on: https://go-review.googlesource.com/10571
Reviewed-by: Nigel Tao <nigeltao@golang.org>
diff --git a/collate/collate.go b/collate/collate.go
index 530491b..a0f04ff 100644
--- a/collate/collate.go
+++ b/collate/collate.go
@@ -107,8 +107,8 @@
func (c *Collator) Compare(a, b []byte) int {
// TODO: skip identical prefixes once we have a fast way to detect if a rune is
// part of a contraction. This would lead to roughly a 10% speedup for the colcmp regtest.
- c.iter(0).setInput(a)
- c.iter(1).setInput(b)
+ c.iter(0).SetInput(a)
+ c.iter(1).SetInput(b)
if res := c.compare(); res != 0 {
return res
}
@@ -123,8 +123,8 @@
func (c *Collator) CompareString(a, b string) int {
// TODO: skip identical prefixes once we have a fast way to detect if a rune is
// part of a contraction. This would lead to roughly a 10% speedup for the colcmp regtest.
- c.iter(0).setInputString(a)
- c.iter(1).setInputString(b)
+ c.iter(0).SetInputString(a)
+ c.iter(1).SetInputString(b)
if res := c.compare(); res != 0 {
return res
}
@@ -219,166 +219,41 @@
func (c *Collator) getColElems(str []byte) []colltab.Elem {
i := c.iter(0)
- i.setInput(str)
- for i.next() {
+ i.SetInput(str)
+ for i.Next() {
}
- return i.ce
+ return i.Elems
}
func (c *Collator) getColElemsString(str string) []colltab.Elem {
i := c.iter(0)
- i.setInputString(str)
- for i.next() {
+ i.SetInputString(str)
+ for i.Next() {
}
- return i.ce
+ return i.Elems
}
type iter struct {
- bytes []byte
- str string
+ wa [512]colltab.Elem
- wa [512]colltab.Elem
- ce []colltab.Elem
+ newcolltab.Iter
pce int
- nce int // nce <= len(nce)
-
- prevCCC uint8
- pStarter int
-
- t colltab.Weighter
}
func (i *iter) init(c *Collator) {
- i.t = c.t
- i.ce = i.wa[:0]
-}
-
-func (i *iter) reset() {
- i.ce = i.ce[:0]
- i.nce = 0
- i.prevCCC = 0
- i.pStarter = 0
-}
-
-func (i *iter) setInput(s []byte) *iter {
- i.bytes = s
- i.str = ""
- i.reset()
- return i
-}
-
-func (i *iter) setInputString(s string) *iter {
- i.str = s
- i.bytes = nil
- i.reset()
- return i
-}
-
-func (i *iter) done() bool {
- return len(i.str) == 0 && len(i.bytes) == 0
-}
-
-func (i *iter) tail(n int) {
- if i.bytes == nil {
- i.str = i.str[n:]
- } else {
- i.bytes = i.bytes[n:]
- }
-}
-
-func (i *iter) appendNext() int {
- var sz int
- if i.bytes == nil {
- i.ce, sz = i.t.AppendNextString(i.ce, i.str)
- } else {
- i.ce, sz = i.t.AppendNext(i.ce, i.bytes)
- }
- return sz
-}
-
-// next appends Elems to the internal array until it adds an element with CCC=0.
-// In the majority of cases, a Elem with a primary value > 0 will have
-// a CCC of 0. The CCC values of colation elements are also used to detect if the
-// input string was not normalized and to adjust the result accordingly.
-func (i *iter) next() bool {
- for !i.done() {
- p0 := len(i.ce)
- sz := i.appendNext()
- i.tail(sz)
- last := len(i.ce) - 1
- if ccc := i.ce[last].CCC(); ccc == 0 {
- i.nce = len(i.ce)
- i.pStarter = last
- i.prevCCC = 0
- return true
- } else if p0 < last && i.ce[p0].CCC() == 0 {
- // set i.nce to only cover part of i.ce for which ccc == 0 and
- // use rest the next call to next.
- for p0++; p0 < last && i.ce[p0].CCC() == 0; p0++ {
- }
- i.nce = p0
- i.pStarter = p0 - 1
- i.prevCCC = ccc
- return true
- } else if ccc < i.prevCCC {
- i.doNorm(p0, ccc) // should be rare for most common cases
- } else {
- i.prevCCC = ccc
- }
- }
- if len(i.ce) != i.nce {
- i.nce = len(i.ce)
- return true
- }
- return false
-}
-
-// nextPlain is the same as next, but does not "normalize" the collation
-// elements.
-// TODO: remove this function. Using this instead of next does not seem
-// to improve performance in any significant way. We retain this until
-// later for evaluation purposes.
-func (i *iter) nextPlain() bool {
- if i.done() {
- return false
- }
- sz := i.appendNext()
- i.tail(sz)
- i.nce = len(i.ce)
- return true
-}
-
-const maxCombiningCharacters = 30
-
-// doNorm reorders the collation elements in i.ce.
-// It assumes that blocks of collation elements added with appendNext
-// either start and end with the same CCC or start with CCC == 0.
-// This allows for a single insertion point for the entire block.
-// The correctness of this assumption is verified in builder.go.
-func (i *iter) doNorm(p int, ccc uint8) {
- if p-i.pStarter > maxCombiningCharacters {
- i.prevCCC = i.ce[len(i.ce)-1].CCC()
- i.pStarter = len(i.ce) - 1
- return
- }
- n := len(i.ce)
- k := p
- for p--; p > i.pStarter && ccc < i.ce[p-1].CCC(); p-- {
- }
- i.ce = append(i.ce, i.ce[p:k]...)
- copy(i.ce[p:], i.ce[k:])
- i.ce = i.ce[:n]
+ i.Weighter = c.t
+ i.Elems = i.wa[:0]
}
func (i *iter) nextPrimary() int {
for {
- for ; i.pce < i.nce; i.pce++ {
- if v := i.ce[i.pce].Primary(); v != 0 {
+ for ; i.pce < i.N; i.pce++ {
+ if v := i.Elems[i.pce].Primary(); v != 0 {
i.pce++
return v
}
}
- if !i.next() {
+ if !i.Next() {
return 0
}
}
@@ -386,8 +261,8 @@
}
func (i *iter) nextSecondary() int {
- for ; i.pce < len(i.ce); i.pce++ {
- if v := i.ce[i.pce].Secondary(); v != 0 {
+ for ; i.pce < len(i.Elems); i.pce++ {
+ if v := i.Elems[i.pce].Secondary(); v != 0 {
i.pce++
return v
}
@@ -396,8 +271,8 @@
}
func (i *iter) prevSecondary() int {
- for ; i.pce < len(i.ce); i.pce++ {
- if v := i.ce[len(i.ce)-i.pce-1].Secondary(); v != 0 {
+ for ; i.pce < len(i.Elems); i.pce++ {
+ if v := i.Elems[len(i.Elems)-i.pce-1].Secondary(); v != 0 {
i.pce++
return v
}
@@ -406,8 +281,8 @@
}
func (i *iter) nextTertiary() int {
- for ; i.pce < len(i.ce); i.pce++ {
- if v := i.ce[i.pce].Tertiary(); v != 0 {
+ for ; i.pce < len(i.Elems); i.pce++ {
+ if v := i.Elems[i.pce].Tertiary(); v != 0 {
i.pce++
return int(v)
}
@@ -416,8 +291,8 @@
}
func (i *iter) nextQuaternary() int {
- for ; i.pce < len(i.ce); i.pce++ {
- if v := i.ce[i.pce].Quaternary(); v != 0 {
+ for ; i.pce < len(i.Elems); i.pce++ {
+ if v := i.Elems[i.pce].Quaternary(); v != 0 {
i.pce++
return v
}
diff --git a/collate/collate_test.go b/collate/collate_test.go
index dbb8c43..a79ef4b 100644
--- a/collate/collate_test.go
+++ b/collate/collate_test.go
@@ -449,80 +449,6 @@
}
}
-func TestDoNorm(t *testing.T) {
- const div = -1 // The insertion point of the next block.
- tests := []struct {
- in, out []int
- }{
- {in: []int{4, div, 3},
- out: []int{3, 4},
- },
- {in: []int{4, div, 3, 3, 3},
- out: []int{3, 3, 3, 4},
- },
- {in: []int{0, 4, div, 3},
- out: []int{0, 3, 4},
- },
- {in: []int{0, 0, 4, 5, div, 3, 3},
- out: []int{0, 0, 3, 3, 4, 5},
- },
- {in: []int{0, 0, 1, 4, 5, div, 3, 3},
- out: []int{0, 0, 1, 3, 3, 4, 5},
- },
- {in: []int{0, 0, 1, 4, 5, div, 4, 4},
- out: []int{0, 0, 1, 4, 4, 4, 5},
- },
- }
- for j, tt := range tests {
- i := iter{}
- var w, p, s int
- for k, cc := range tt.in {
- if cc == 0 {
- s = 0
- }
- if cc == div {
- w = 100
- p = k
- i.pStarter = s
- continue
- }
- i.ce = append(i.ce, makeCE([]int{w, defaultSecondary, 2, cc}))
- }
- i.prevCCC = i.ce[p-1].CCC()
- i.doNorm(p, i.ce[p].CCC())
- if len(i.ce) != len(tt.out) {
- t.Errorf("%d: length was %d; want %d", j, len(i.ce), len(tt.out))
- }
- prevCCC := uint8(0)
- for k, ce := range i.ce {
- if int(ce.CCC()) != tt.out[k] {
- t.Errorf("%d:%d: unexpected CCC. Was %d; want %d", j, k, ce.CCC(), tt.out[k])
- }
- if k > 0 && ce.CCC() == prevCCC && i.ce[k-1].Primary() > ce.Primary() {
- t.Errorf("%d:%d: normalization crossed across CCC boundary.", j, k)
- }
- }
- }
- // test cutoff of large sequence of combining characters.
- result := []uint8{8, 8, 8, 5, 5}
- for o := -2; o <= 2; o++ {
- i := iter{pStarter: 2, prevCCC: 8}
- n := maxCombiningCharacters + 1 + o
- for j := 1; j < n+i.pStarter; j++ {
- i.ce = append(i.ce, makeCE([]int{100, defaultSecondary, 2, 8}))
- }
- p := len(i.ce)
- i.ce = append(i.ce, makeCE([]int{0, defaultSecondary, 2, 5}))
- i.doNorm(p, 5)
- if i.prevCCC != result[o+2] {
- t.Errorf("%d: i.prevCCC was %d; want %d", n, i.prevCCC, result[o+2])
- }
- if result[o+2] == 5 && i.pStarter != p {
- t.Errorf("%d: i.pStarter was %d; want %d", n, i.pStarter, p)
- }
- }
-}
-
func TestNumeric(t *testing.T) {
c := New(language.English, Loose, Numeric)
diff --git a/internal/colltab/iter.go b/internal/colltab/iter.go
new file mode 100644
index 0000000..88f83a4
--- /dev/null
+++ b/internal/colltab/iter.go
@@ -0,0 +1,143 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package colltab
+
+import (
+ "golang.org/x/text/collate/colltab"
+)
+
+// An Iter incrementally converts chunks of the input text to collation
+// elements, while ensuring that the collation elements are in normalized order
+// (that is, they are in the order as if the input text were normalized first).
+type Iter struct {
+ Weighter colltab.Weighter
+ Elems []colltab.Elem
+ // N is the number of elements in Elems that will not be reordered on
+ // subsequent iterations, N <= len(Elems).
+ N int
+
+ bytes []byte
+ str string
+
+ prevCCC uint8
+ pStarter int
+}
+
+func (i *Iter) reset() {
+ i.Elems = i.Elems[:0]
+ i.N = 0
+ i.prevCCC = 0
+ i.pStarter = 0
+}
+
+// SetInput resets i to input s.
+func (i *Iter) SetInput(s []byte) {
+ i.bytes = s
+ i.str = ""
+ i.reset()
+}
+
+// SetInputString resets i to input s.
+func (i *Iter) SetInputString(s string) {
+ i.str = s
+ i.bytes = nil
+ i.reset()
+}
+
+func (i *Iter) done() bool {
+ return len(i.str) == 0 && len(i.bytes) == 0
+}
+
+func (i *Iter) tail(n int) {
+ if i.bytes == nil {
+ i.str = i.str[n:]
+ } else {
+ i.bytes = i.bytes[n:]
+ }
+}
+
+func (i *Iter) appendNext() int {
+ var sz int
+ if i.bytes == nil {
+ i.Elems, sz = i.Weighter.AppendNextString(i.Elems, i.str)
+ } else {
+ i.Elems, sz = i.Weighter.AppendNext(i.Elems, i.bytes)
+ }
+ return sz
+}
+
+// Next appends Elems to the internal array until it adds an element with CCC=0.
+// In the majority of cases, a Elem with a primary value > 0 will have a CCC of
+// 0. The CCC values of collation elements are also used to detect if the input
+// string was not normalized and to adjust the result accordingly.
+func (i *Iter) Next() bool {
+ for !i.done() {
+ p0 := len(i.Elems)
+ sz := i.appendNext()
+ i.tail(sz)
+ last := len(i.Elems) - 1
+ if ccc := i.Elems[last].CCC(); ccc == 0 {
+ i.N = len(i.Elems)
+ i.pStarter = last
+ i.prevCCC = 0
+ return true
+ } else if p0 < last && i.Elems[p0].CCC() == 0 {
+ // set i.N to only cover part of i.Elems for which ccc == 0 and
+ // use rest for the next call to next.
+ for p0++; p0 < last && i.Elems[p0].CCC() == 0; p0++ {
+ }
+ i.N = p0
+ i.pStarter = p0 - 1
+ i.prevCCC = ccc
+ return true
+ } else if ccc < i.prevCCC {
+ i.doNorm(p0, ccc) // should be rare, never occurs for NFD and FCC.
+ } else {
+ i.prevCCC = ccc
+ }
+ }
+ if len(i.Elems) != i.N {
+ i.N = len(i.Elems)
+ return true
+ }
+ return false
+}
+
+// nextNoNorm is the same as next, but does not "normalize" the collation
+// elements.
+func (i *Iter) nextNoNorm() bool {
+ // TODO: remove this function. Using this instead of next does not seem
+ // to improve performance in any significant way. We retain this until
+ // later for evaluation purposes.
+ if i.done() {
+ return false
+ }
+ sz := i.appendNext()
+ i.tail(sz)
+ i.N = len(i.Elems)
+ return true
+}
+
+const maxCombiningCharacters = 30
+
+// doNorm reorders the collation elements in i.Elems.
+// It assumes that blocks of collation elements added with appendNext
+// either start and end with the same CCC or start with CCC == 0.
+// This allows for a single insertion point for the entire block.
+// The correctness of this assumption is verified in builder.go.
+func (i *Iter) doNorm(p int, ccc uint8) {
+ if p-i.pStarter > maxCombiningCharacters {
+ i.prevCCC = i.Elems[len(i.Elems)-1].CCC()
+ i.pStarter = len(i.Elems) - 1
+ return
+ }
+ n := len(i.Elems)
+ k := p
+ for p--; p > i.pStarter && ccc < i.Elems[p-1].CCC(); p-- {
+ }
+ i.Elems = append(i.Elems, i.Elems[p:k]...)
+ copy(i.Elems[p:], i.Elems[k:])
+ i.Elems = i.Elems[:n]
+}
diff --git a/internal/colltab/iter_test.go b/internal/colltab/iter_test.go
new file mode 100644
index 0000000..fbafbe5
--- /dev/null
+++ b/internal/colltab/iter_test.go
@@ -0,0 +1,97 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package colltab
+
+import (
+ "testing"
+
+ "golang.org/x/text/collate/colltab"
+)
+
+const (
+ defaultSecondary = 0x20
+)
+
+func makeCE(w []int) colltab.Elem {
+ ce, err := colltab.MakeElem(w[0], w[1], w[2], uint8(w[3]))
+ if err != nil {
+ panic(err)
+ }
+ return ce
+}
+
+func TestDoNorm(t *testing.T) {
+ const div = -1 // The insertion point of the next block.
+ tests := []struct {
+ in, out []int
+ }{{
+ in: []int{4, div, 3},
+ out: []int{3, 4},
+ }, {
+ in: []int{4, div, 3, 3, 3},
+ out: []int{3, 3, 3, 4},
+ }, {
+ in: []int{0, 4, div, 3},
+ out: []int{0, 3, 4},
+ }, {
+ in: []int{0, 0, 4, 5, div, 3, 3},
+ out: []int{0, 0, 3, 3, 4, 5},
+ }, {
+ in: []int{0, 0, 1, 4, 5, div, 3, 3},
+ out: []int{0, 0, 1, 3, 3, 4, 5},
+ }, {
+ in: []int{0, 0, 1, 4, 5, div, 4, 4},
+ out: []int{0, 0, 1, 4, 4, 4, 5},
+ },
+ }
+ for j, tt := range tests {
+ i := Iter{}
+ var w, p, s int
+ for k, cc := range tt.in {
+ if cc == 0 {
+ s = 0
+ }
+ if cc == div {
+ w = 100
+ p = k
+ i.pStarter = s
+ continue
+ }
+ i.Elems = append(i.Elems, makeCE([]int{w, defaultSecondary, 2, cc}))
+ }
+ i.prevCCC = i.Elems[p-1].CCC()
+ i.doNorm(p, i.Elems[p].CCC())
+ if len(i.Elems) != len(tt.out) {
+ t.Errorf("%d: length was %d; want %d", j, len(i.Elems), len(tt.out))
+ }
+ prevCCC := uint8(0)
+ for k, ce := range i.Elems {
+ if int(ce.CCC()) != tt.out[k] {
+ t.Errorf("%d:%d: unexpected CCC. Was %d; want %d", j, k, ce.CCC(), tt.out[k])
+ }
+ if k > 0 && ce.CCC() == prevCCC && i.Elems[k-1].Primary() > ce.Primary() {
+ t.Errorf("%d:%d: normalization crossed across CCC boundary.", j, k)
+ }
+ }
+ }
+ // test cutoff of large sequence of combining characters.
+ result := []uint8{8, 8, 8, 5, 5}
+ for o := -2; o <= 2; o++ {
+ i := Iter{pStarter: 2, prevCCC: 8}
+ n := maxCombiningCharacters + 1 + o
+ for j := 1; j < n+i.pStarter; j++ {
+ i.Elems = append(i.Elems, makeCE([]int{100, defaultSecondary, 2, 8}))
+ }
+ p := len(i.Elems)
+ i.Elems = append(i.Elems, makeCE([]int{0, defaultSecondary, 2, 5}))
+ i.doNorm(p, 5)
+ if i.prevCCC != result[o+2] {
+ t.Errorf("%d: i.prevCCC was %d; want %d", n, i.prevCCC, result[o+2])
+ }
+ if result[o+2] == 5 && i.pStarter != p {
+ t.Errorf("%d: i.pStarter was %d; want %d", n, i.pStarter, p)
+ }
+ }
+}