search: added first implementation of search Includes various ignore options and forward anchored search. Change-Id: I28cd51b56a3d36685ad3cc3c8e273f0c3860763d Reviewed-on: https://go-review.googlesource.com/9507 Reviewed-by: Nigel Tao <nigeltao@golang.org>

commit: f81fc0e5cf83ef95ead8dbc735eeb4d0af274211 [log] [tgz]
author: Marcel van Lohuizen <mpvl@golang.org> Fri May 01 10:54:58 2015 +0200
committer: Marcel van Lohuizen <mpvl@golang.org> Fri May 29 14:24:33 2015 +0000
tree: 41f9cccdf70f6367fd36295f4af960a907693954
parent: c93e7c9fff19fb9139b5ab04ce041833add0134e [diff]
diff --git a/search/pattern.go b/search/pattern.go
new file mode 100644
index 0000000..ea2f596
--- /dev/null
+++ b/search/pattern.go

@@ -0,0 +1,204 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package search
+
+import (
+	"golang.org/x/text/collate/colltab"
+)
+
+// TODO: handle variable primary weights?
+
+type source struct {
+	m *Matcher
+	b []byte
+	s string
+}
+
+func (s *source) appendNext(a []colltab.Elem, p int) ([]colltab.Elem, int) {
+	if s.b != nil {
+		return s.m.w.AppendNext(a, s.b[p:])
+	}
+	return s.m.w.AppendNextString(a, s.s[p:])
+}
+
+func (s *source) len() int {
+	if s.b != nil {
+		return len(s.b)
+	}
+	return len(s.s)
+}
+
+// compile compiles and returns a pattern that can be used for faster searching.
+func compile(src source) *Pattern {
+	p := &Pattern{m: src.m}
+
+	// Convert the full input to collation elements.
+	for m, i, nSrc := 0, 0, src.len(); i < nSrc; i += m {
+		p.ce, m = src.appendNext(p.ce, i)
+	}
+
+	// Remove empty elements.
+	k := 0
+	for _, e := range p.ce {
+		if !isIgnorable(p.m, e) {
+			p.ce[k] = e
+			k++
+		}
+	}
+	p.ce = p.ce[:k]
+
+	return p
+}
+
+func isIgnorable(m *Matcher, e colltab.Elem) bool {
+	if e.Primary() > 0 {
+		return false
+	}
+	if e.Secondary() > 0 {
+		if !m.ignoreDiacritics {
+			return false
+		}
+		// Primary value is 0 and ignoreDiacritics is true. In this case we
+		// ignore the tertiary element, as it only pertains to the modifier.
+		return true
+	}
+	// TODO: further distinguish once we have the new implementation.
+	if !(m.ignoreWidth || m.ignoreCase) && e.Tertiary() > 0 {
+		return false
+	}
+	// TODO: we ignore the Quaternary level for now.
+	return true
+}
+
+type searcher struct {
+	source
+	*Pattern
+	seg []colltab.Elem // current segment of collation elements
+}
+
+// TODO: Use a Boyer-Moore-like algorithm (probably Sunday) for searching.
+
+func (s *searcher) forwardSearch() (start, end int) {
+	// Pick a large enough buffer such that we likely do not need to allocate
+	// and small enough to not cause too much overhead initializing.
+	var buf [8]colltab.Elem
+
+	for m, i, nText := 0, 0, s.len(); i < nText; i += m {
+		s.seg, m = s.appendNext(buf[:0], i)
+		if end := s.searchOnce(i, m); end != -1 {
+			return i, end
+		}
+	}
+	return -1, -1
+}
+
+func (s *searcher) anchoredForwardSearch() (start, end int) {
+	var buf [8]colltab.Elem
+	s.seg, end = s.appendNext(buf[:0], 0)
+	if end := s.searchOnce(0, end); end != -1 {
+		return 0, end
+	}
+	return -1, -1
+}
+
+// next advances to the next weight in a pattern. f must return one of the
+// weights of a collation element. next will advance to the first non-zero
+// weight and return this weight and true if it exists, or 0, false otherwise.
+func (p *Pattern) next(i *int, f func(colltab.Elem) int) (weight int, ok bool) {
+	for *i < len(p.ce) {
+		v := f(p.ce[*i])
+		*i++
+		if v != 0 {
+			return v, true
+		}
+	}
+	return 0, false
+}
+
+func tertiary(e colltab.Elem) int {
+	return int(e.Tertiary())
+}
+
+// searchOnce tries to match the pattern s.p at the text position i. s.buf needs
+// to be filled with collation elements of the first segment, where n is the
+// number of source bytes consumed for this segment. It will return the end
+// position of the match or -1.
+func (s *searcher) searchOnce(i, n int) (end int) {
+	var pLevel [4]int
+
+	// TODO: patch non-normalized strings (see collate.go).
+
+	m := s.Pattern.m
+	nSrc := s.len()
+	for {
+		k := 0
+		for ; k < len(s.seg); k++ {
+			if v := s.seg[k].Primary(); v > 0 {
+				if w, ok := s.next(&pLevel[0], colltab.Elem.Primary); !ok || v != w {
+					return -1
+				}
+			}
+
+			if !m.ignoreDiacritics {
+				if v := s.seg[k].Secondary(); v > 0 {
+					if w, ok := s.next(&pLevel[1], colltab.Elem.Secondary); !ok || v != w {
+						return -1
+					}
+				}
+			} else if s.seg[k].Primary() == 0 {
+				// We ignore tertiary values of collation elements of the
+				// secondary level.
+				continue
+			}
+
+			// TODO: distinguish between case and width. This will be easier to
+			// implement after we moved to the new collation implementation.
+			if !m.ignoreWidth && !m.ignoreCase {
+				if v := s.seg[k].Tertiary(); v > 0 {
+					if w, ok := s.next(&pLevel[2], tertiary); !ok || int(v) != w {
+						return -1
+					}
+				}
+			}
+			// TODO: check quaternary weight
+		}
+		i += n
+
+		// Check for completion.
+		switch {
+		// If any of these cases match, we are not at the end.
+		case pLevel[0] < len(s.ce):
+		case !m.ignoreDiacritics && pLevel[1] < len(s.ce):
+		case !(m.ignoreWidth || m.ignoreCase) && pLevel[2] < len(s.ce):
+		default:
+			// At this point, both the segment and pattern has matched fully.
+			// However, appendNext does not guarantee a segment is on a grapheme
+			// boundary. The proper way to check this is whether the text is
+			// followed by a modifier. We inspecting the properties of the
+			// collation element as an alternative to using unicode/norm or
+			// range tables.
+			// TODO: verify correctness of this algorithm and verify space/time
+			// trade-offs of the other approaches.
+			for ; i < nSrc; i += n {
+				// TODO: implement different behavior for WholeGrapheme(false).
+				s.seg, n = s.appendNext(s.seg[:0], i)
+				if s.seg[0].Primary() != 0 {
+					break
+				}
+				if !m.ignoreDiacritics {
+					return -1
+				}
+			}
+			return i
+		}
+
+		if i >= nSrc {
+			return -1
+		}
+
+		// Fill the buffer with the next batch of collation elements.
+		s.seg, n = s.appendNext(s.seg[:0], i)
+	}
+}

diff --git a/search/pattern_test.go b/search/pattern_test.go
new file mode 100644
index 0000000..64f4037
--- /dev/null
+++ b/search/pattern_test.go

@@ -0,0 +1,305 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package search
+
+import (
+	"testing"
+
+	"golang.org/x/text/language"
+)
+
+func TestCompile(t *testing.T) {
+	for i, tc := range []struct {
+		desc    string
+		pattern string
+		options []Option
+		n       int
+	}{{
+		desc:    "empty",
+		pattern: "",
+		n:       0,
+	}, {
+		desc:    "single",
+		pattern: "a",
+		n:       1,
+	}, {
+		desc:    "keep modifier",
+		pattern: "a\u0300", // U+0300: COMBINING GRAVE ACCENT
+		n:       2,
+	}, {
+		desc:    "remove modifier",
+		pattern: "a\u0300", // U+0300: COMBINING GRAVE ACCENT
+		options: []Option{IgnoreDiacritics},
+		n:       1,
+	}, {
+		desc:    "single with double collation element",
+		pattern: "ä",
+		n:       2,
+	}, {
+		desc:    "leading variable",
+		pattern: " a",
+		n:       2,
+	}, {
+		desc:    "trailing variable",
+		pattern: "aa ",
+		n:       3,
+	}, {
+		desc:    "leading and trailing variable",
+		pattern: " äb ",
+		n:       5,
+	}, {
+		desc:    "keep interior variable",
+		pattern: " ä b ",
+		n:       6,
+	}, {
+		desc:    "keep interior variables",
+		pattern: " b  ä ",
+		n:       7,
+	}, {
+		desc:    "remove ignoreables (zero-weights across the board)",
+		pattern: "\u009Db\u009Dä\u009D", // U+009D: OPERATING SYSTEM COMMAND
+		n:       3,
+	}} {
+		m := New(language.Und, tc.options...)
+		p := m.CompileString(tc.pattern)
+		if len(p.ce) != tc.n {
+			t.Errorf("%d:%s: Compile(%+q): got %d; want %d", i, tc.desc, tc.pattern, len(p.ce), tc.n)
+		}
+	}
+}
+
+func TestForwardSearch(t *testing.T) {
+	for i, tc := range []struct {
+		desc    string
+		tag     string
+		options []Option
+		pattern string
+		text    string
+		want    []int
+	}{{
+		// The semantics of an empty search is to match nothing.
+		// TODO: change this to be in line with strings.Index? It is quite a
+		// different beast, so not sure yet.
+
+		desc:    "empty pattern and text",
+		tag:     "und",
+		pattern: "",
+		text:    "",
+		want:    nil, // TODO: consider: []int{0, 0},
+	}, {
+		desc:    "non-empty pattern and empty text",
+		tag:     "und",
+		pattern: " ",
+		text:    "",
+		want:    nil,
+	}, {
+		desc:    "empty pattern and non-empty text",
+		tag:     "und",
+		pattern: "",
+		text:    "abc",
+		want:    nil, // TODO: consider: []int{0, 0, 1, 1, 2, 2, 3, 3},
+	}, {
+		// Variable-only patterns. We don't support variables at the moment,
+		// but verify that, given this, the behavior is indeed as expected.
+
+		desc:    "exact match of variable",
+		tag:     "und",
+		pattern: " ",
+		text:    " ",
+		want:    []int{0, 1},
+	}, {
+		desc:    "variables not handled by default",
+		tag:     "und",
+		pattern: "- ",
+		text:    " -",
+		want:    nil, // Would be (1, 2) for a median match with variable}.
+	}, {
+		desc:    "multiple subsequent identical variables",
+		tag:     "und",
+		pattern: " ",
+		text:    "    ",
+		want:    []int{0, 1, 1, 2, 2, 3, 3, 4},
+	}, {
+		desc:    "text with variables",
+		tag:     "und",
+		options: []Option{IgnoreDiacritics},
+		pattern: "abc",
+		text:    "3 abc 3",
+		want:    []int{2, 5},
+	}, {
+		desc:    "pattern with interior variables",
+		tag:     "und",
+		options: []Option{IgnoreDiacritics},
+		pattern: "a b c",
+		text:    "3 a b c abc a  b  c 3",
+		want:    []int{2, 7}, // Would have 3 matches using variable.
+
+		// TODO: Different variable handling settings.
+	}, {
+		// Options.
+
+		desc:    "match all levels",
+		tag:     "und",
+		pattern: "Abc",
+		text:    "abcAbcABCÁbcábc",
+		want:    []int{3, 6},
+	}, {
+		desc:    "ignore diacritics in text",
+		tag:     "und",
+		options: []Option{IgnoreDiacritics},
+		pattern: "Abc",
+		text:    "Ábc",
+		want:    []int{0, 4},
+	}, {
+		desc:    "ignore diacritics in pattern",
+		tag:     "und",
+		options: []Option{IgnoreDiacritics},
+		pattern: "Ábc",
+		text:    "Abc",
+		want:    []int{0, 3},
+	}, {
+		desc:    "ignore diacritics",
+		tag:     "und",
+		options: []Option{IgnoreDiacritics},
+		pattern: "Abc",
+		text:    "abcAbcABCÁbcábc",
+		want:    []int{3, 6, 9, 13},
+	}, {
+		desc:    "ignore case",
+		tag:     "und",
+		options: []Option{IgnoreCase},
+		pattern: "Abc",
+		text:    "abcAbcABCÁbcábc",
+		want:    []int{0, 3, 3, 6, 6, 9},
+	}, {
+		desc:    "ignore case and diacritics",
+		tag:     "und",
+		options: []Option{IgnoreCase, IgnoreDiacritics},
+		pattern: "Abc",
+		text:    "abcAbcABCÁbcábc",
+		want:    []int{0, 3, 3, 6, 6, 9, 9, 13, 13, 17},
+	}, {
+		desc:    "ignore width to fullwidth",
+		tag:     "und",
+		options: []Option{IgnoreWidth},
+		pattern: "abc",
+		text:    "123 \uFF41\uFF42\uFF43 123", // U+FF41-3: FULLWIDTH LATIN SMALL LETTER A-C
+		want:    []int{4, 13},
+	}, {
+		// TODO: distinguish between case and width.
+		desc:    "don't ignore width to fullwidth, ignoring only case",
+		tag:     "und",
+		options: []Option{IgnoreCase},
+		pattern: "abc",
+		text:    "123 \uFF41\uFF42\uFF43 123", // U+FF41-3: FULLWIDTH LATIN SMALL LETTER A-C
+		want:    []int{4, 13},
+	}, {
+		desc:    "ignore width to fullwidth and diacritics",
+		tag:     "und",
+		options: []Option{IgnoreWidth, IgnoreDiacritics},
+		pattern: "abc",
+		text:    "123 \uFF41\uFF42\uFF43 123", // U+FF41-3: FULLWIDTH LATIN SMALL LETTER A-C
+		want:    []int{4, 13},
+	}, {
+		desc:    "whole grapheme, single rune",
+		tag:     "und",
+		pattern: "eee",
+		text:    "123 eeé 123",
+		want:    nil,
+	}, {
+		// Note: rules on when to apply contractions may, for certain languages,
+		// differ between search and collation. For example, "ch" is not
+		// considered a contraction for the purpose of searching in Spanish.
+		// Therefore, be careful picking this test.
+		desc:    "whole grapheme, contractions",
+		tag:     "da",
+		pattern: "aba",
+		// Fails at the primary level, because "aa" is a contraction.
+		text: "123 abaa 123",
+		want: []int{},
+	}, {
+		desc:    "whole grapheme, trailing modifier",
+		tag:     "und",
+		pattern: "eee",
+		text:    "123 eee\u0300 123", // U+0300: COMBINING GRAVE ACCENT
+		want:    nil,
+	}, {
+		// Language-specific matching.
+
+		desc:    "",
+		tag:     "da",
+		options: []Option{IgnoreCase},
+		pattern: "Århus",
+		text:    "AarhusÅrhus  Århus  ",
+		want:    []int{0, 6, 6, 12, 14, 20},
+	}, {
+		desc:    "",
+		tag:     "da",
+		options: []Option{IgnoreCase},
+		pattern: "Aarhus",
+		text:    "Århus Aarhus",
+		want:    []int{0, 6, 7, 13},
+	}, {
+		desc:    "",
+		tag:     "en", // Å does not match A for English.
+		options: []Option{IgnoreCase},
+		pattern: "Aarhus",
+		text:    "Århus",
+		want:    nil,
+	}, {
+		desc:    "ignore modifier in text",
+		options: []Option{IgnoreDiacritics},
+		tag:     "und",
+		pattern: "eee",
+		text:    "123 eee\u0300 123", // U+0300: COMBINING GRAVE ACCENT
+		want:    []int{4, 9},         // Matches on grapheme boundary.
+	}, {
+		desc:    "ignore multiple modifiers in text",
+		options: []Option{IgnoreDiacritics},
+		tag:     "und",
+		pattern: "eee",
+		text:    "123 eee\u0300\u0300 123", // U+0300: COMBINING GRAVE ACCENT
+		want:    []int{4, 11},              // Matches on grapheme boundary.
+	}, {
+		desc:    "ignore modifier in pattern",
+		options: []Option{IgnoreDiacritics},
+		tag:     "und",
+		pattern: "eee\u0300", // U+0300: COMBINING GRAVE ACCENT
+		text:    "123 eee 123",
+		want:    []int{4, 7},
+	}, {
+		desc:    "ignore multiple modifiers in pattern",
+		options: []Option{IgnoreDiacritics},
+		tag:     "und",
+		pattern: "eee\u0300\u0300", // U+0300: COMBINING GRAVE ACCENT
+		text:    "123 eee 123",
+		want:    []int{4, 7},
+	}} {
+		m := New(language.MustParse(tc.tag), tc.options...)
+		p := m.CompileString(tc.pattern)
+		for j := 0; j < len(tc.text); {
+			start, end := p.IndexString(tc.text[j:])
+			if start == -1 && end == -1 {
+				j++
+				continue
+			}
+			start += j
+			end += j
+			j = end
+			if len(tc.want) == 0 {
+				t.Errorf("%d:%s: found unexpected result [%d %d]", i, tc.desc, start, end)
+				break
+			}
+			if tc.want[0] != start || tc.want[1] != end {
+				t.Errorf("%d:%s: got [%d %d]; want %v", i, tc.desc, start, end, tc.want[:2])
+				break
+			}
+			tc.want = tc.want[2:]
+		}
+		if len(tc.want) != 0 {
+			t.Errorf("%d:%s: %d extra results", i, tc.desc, len(tc.want)/2)
+		}
+	}
+}

diff --git a/search/search.go b/search/search.go
index 258be59..f72ca5f 100644
--- a/search/search.go
+++ b/search/search.go

@@ -36,18 +36,27 @@
 	Exact Option = nil
 
 	// Loose causes case, diacritics and width to be ignored.
-	Loose Option = nil //
+	Loose Option = loose
 
 	// IgnoreCase enables case-insensitive search.
-	IgnoreCase Option = nil
+	IgnoreCase Option = ignoreCase
 
 	// IgnoreDiacritics causes diacritics to be ignored ("ö" == "o").
-	IgnoreDiacritics Option = nil
+	IgnoreDiacritics Option = ignoreDiacritics
 
-	// IgnoreWidth equates fullwidth with halfwidth variants.
-	IgnoreWidth Option = nil
+	// IgnoreWidth equates narrow with wide variants.
+	IgnoreWidth Option = ignoreWidth
 )
 
+func ignoreDiacritics(m *Matcher) { m.ignoreDiacritics = true }
+func ignoreCase(m *Matcher)       { m.ignoreCase = true }
+func ignoreWidth(m *Matcher)      { m.ignoreWidth = true }
+func loose(m *Matcher) {
+	ignoreDiacritics(m)
+	ignoreCase(m)
+	ignoreWidth(m)
+}
+
 var (
 	// Supported lists the languages for which search differs from its parent.
 	Supported language.Coverage
@@ -77,7 +86,10 @@
 
 // A Matcher implements language-specific string matching.
 type Matcher struct {
-	w colltab.Weighter
+	w                colltab.Weighter
+	ignoreCase       bool
+	ignoreWidth      bool
+	ignoreDiacritics bool
 }
 
 // An IndexOption specifies how the Index methods of Pattern or Matcher should
@@ -87,28 +99,13 @@
 const (
 	// Anchor restricts the search to the start (or end for Backwards) of the
 	// text.
-	Anchor IndexOption = iota
+	Anchor IndexOption = 1 << iota
 
 	// Backwards starts the search from the end of the text.
 	Backwards
-)
 
-// Design note (TODO remove):
-// We use IndexOption, instead of having Index, IndexString, IndexLast,
-// IndexLastString, IndexPrefix, IndexPrefixString, ....
-// (Note: HasPrefix would have reduced utility compared to those in the  strings
-// and bytes packages as the matched prefix in the searched strings may be of
-// different lengths, so we need to return an additional index.)
-// Advantage:
-// - Avoid combinatorial explosion of method calls (now 2 Index variants,
-//   instead of 8, or 16 if we have All variants).
-// - Compared to an API where these options are set on Matcher or Pattern, it
-//   will be clearer when Index*() is invoked with certain options.
-// - Small API and still normal Index() call for the by far most common case.
-// Disadvantage:
-// - Slightly different from analogous packages in the core library (even though
-//   there things are not entirely consistent anyway.)
-// - Little bit of overhead on each Index call (one branch for the common case.)
+	anchorBackwards = Anchor | Backwards
+)
 
 // Index reports the start and end position of the first occurrence of pat in b
 // or -1, -1 if pat is not present.
@@ -138,17 +135,19 @@
 
 // Compile compiles and returns a pattern that can be used for faster searching.
 func (m *Matcher) Compile(b []byte) *Pattern {
-	panic("TODO: implement")
+	return compile(source{m: m, b: b})
 }
 
 // CompileString compiles and returns a pattern that can be used for faster
 // searching.
-func (m *Matcher) CompileString(str string) *Pattern {
-	panic("TODO: implement")
+func (m *Matcher) CompileString(s string) *Pattern {
+	return compile(source{m: m, s: s})
 }
 
 // A Pattern is a compiled search string. It is safe for concurrent use.
 type Pattern struct {
+	m  *Matcher
+	ce []colltab.Elem
 }
 
 // Design note (TODO remove):
@@ -159,13 +158,51 @@
 // Index reports the start and end position of the first occurrence of p in b
 // or -1, -1 if p is not present.
 func (p *Pattern) Index(b []byte, opts ...IndexOption) (start, end int) {
-	panic("TODO: implement")
+	sr := searcher{
+		source:  source{m: p.m, b: b},
+		Pattern: p,
+	}
+
+	var optMask IndexOption
+	for _, o := range opts {
+		optMask |= o
+	}
+
+	switch optMask {
+	case 0:
+		return sr.forwardSearch()
+	case Anchor:
+		return sr.anchoredForwardSearch()
+	case Backwards, anchorBackwards:
+		panic("TODO: implement")
+	default:
+		panic("unrecognized option")
+	}
 }
 
 // IndexString reports the start and end position of the first occurrence of p
 // in s or -1, -1 if p is not present.
 func (p *Pattern) IndexString(s string, opts ...IndexOption) (start, end int) {
-	panic("TODO: implement")
+	sr := searcher{
+		source:  source{m: p.m, s: s},
+		Pattern: p,
+	}
+
+	var optMask IndexOption
+	for _, o := range opts {
+		optMask |= o
+	}
+
+	switch optMask {
+	case 0:
+		return sr.forwardSearch()
+	case Anchor:
+		return sr.anchoredForwardSearch()
+	case Backwards, anchorBackwards:
+		panic("TODO: implement")
+	default:
+		panic("unrecognized option")
+	}
 }
 
 // TODO:
commit	f81fc0e5cf83ef95ead8dbc735eeb4d0af274211	[log] [tgz]
author	Marcel van Lohuizen <mpvl@golang.org>	Fri May 01 10:54:58 2015 +0200
committer	Marcel van Lohuizen <mpvl@golang.org>	Fri May 29 14:24:33 2015 +0000
tree	41f9cccdf70f6367fd36295f4af960a907693954
parent	c93e7c9fff19fb9139b5ab04ce041833add0134e [diff]