| // Copyright 2015 The Go Authors. All rights reserved. | 
 | // Use of this source code is governed by a BSD-style | 
 | // license that can be found in the LICENSE file. | 
 |  | 
 | package search | 
 |  | 
 | import ( | 
 | 	"reflect" | 
 | 	"strings" | 
 | 	"testing" | 
 |  | 
 | 	"golang.org/x/text/language" | 
 | ) | 
 |  | 
 | func TestCompile(t *testing.T) { | 
 | 	for i, tc := range []struct { | 
 | 		desc    string | 
 | 		pattern string | 
 | 		options []Option | 
 | 		n       int | 
 | 	}{{ | 
 | 		desc:    "empty", | 
 | 		pattern: "", | 
 | 		n:       0, | 
 | 	}, { | 
 | 		desc:    "single", | 
 | 		pattern: "a", | 
 | 		n:       1, | 
 | 	}, { | 
 | 		desc:    "keep modifier", | 
 | 		pattern: "a\u0300", // U+0300: COMBINING GRAVE ACCENT | 
 | 		n:       2, | 
 | 	}, { | 
 | 		desc:    "remove modifier", | 
 | 		pattern: "a\u0300", // U+0300: COMBINING GRAVE ACCENT | 
 | 		options: []Option{IgnoreDiacritics}, | 
 | 		n:       1, | 
 | 	}, { | 
 | 		desc:    "single with double collation element", | 
 | 		pattern: "ä", | 
 | 		n:       2, | 
 | 	}, { | 
 | 		desc:    "leading variable", | 
 | 		pattern: " a", | 
 | 		n:       2, | 
 | 	}, { | 
 | 		desc:    "trailing variable", | 
 | 		pattern: "aa ", | 
 | 		n:       3, | 
 | 	}, { | 
 | 		desc:    "leading and trailing variable", | 
 | 		pattern: " äb ", | 
 | 		n:       5, | 
 | 	}, { | 
 | 		desc:    "keep interior variable", | 
 | 		pattern: " ä b ", | 
 | 		n:       6, | 
 | 	}, { | 
 | 		desc:    "keep interior variables", | 
 | 		pattern: " b  ä ", | 
 | 		n:       7, | 
 | 	}, { | 
 | 		desc:    "remove ignoreables (zero-weights across the board)", | 
 | 		pattern: "\u009Db\u009Dä\u009D", // U+009D: OPERATING SYSTEM COMMAND | 
 | 		n:       3, | 
 | 	}} { | 
 | 		m := New(language.Und, tc.options...) | 
 | 		p := m.CompileString(tc.pattern) | 
 | 		if len(p.ce) != tc.n { | 
 | 			t.Errorf("%d:%s: Compile(%+q): got %d; want %d", i, tc.desc, tc.pattern, len(p.ce), tc.n) | 
 | 		} | 
 | 	} | 
 | } | 
 |  | 
 | func TestNorm(t *testing.T) { | 
 | 	// U+0300: COMBINING GRAVE ACCENT (CCC=230) | 
 | 	// U+031B: COMBINING HORN (CCC=216) | 
 | 	for _, tc := range []struct { | 
 | 		desc string | 
 | 		a    string | 
 | 		b    string | 
 | 		want bool // a and b compile into the same pattern? | 
 | 	}{{ | 
 | 		"simple", | 
 | 		"eee\u0300\u031b", | 
 | 		"eee\u031b\u0300", | 
 | 		true, | 
 | 	}, { | 
 | 		"large number of modifiers in pattern", | 
 | 		strings.Repeat("\u0300", 29) + "\u0318", | 
 | 		"\u0318" + strings.Repeat("\u0300", 29), | 
 | 		true, | 
 | 	}, { | 
 | 		"modifier overflow in pattern", | 
 | 		strings.Repeat("\u0300", 30) + "\u0318", | 
 | 		"\u0318" + strings.Repeat("\u0300", 30), | 
 | 		false, | 
 | 	}} { | 
 | 		m := New(language.Und) | 
 | 		a := m.CompileString(tc.a) | 
 | 		b := m.CompileString(tc.b) | 
 | 		if got := reflect.DeepEqual(a, b); got != tc.want { | 
 | 			t.Errorf("Compile(a) == Compile(b) == %v; want %v", got, tc.want) | 
 | 		} | 
 | 	} | 
 | } | 
 |  | 
 | func TestForwardSearch(t *testing.T) { | 
 | 	for i, tc := range []struct { | 
 | 		desc    string | 
 | 		tag     string | 
 | 		options []Option | 
 | 		pattern string | 
 | 		text    string | 
 | 		want    []int | 
 | 	}{{ | 
 | 		// The semantics of an empty search is to match nothing. | 
 | 		// TODO: change this to be in line with strings.Index? It is quite a | 
 | 		// different beast, so not sure yet. | 
 |  | 
 | 		desc:    "empty pattern and text", | 
 | 		tag:     "und", | 
 | 		pattern: "", | 
 | 		text:    "", | 
 | 		want:    nil, // TODO: consider: []int{0, 0}, | 
 | 	}, { | 
 | 		desc:    "non-empty pattern and empty text", | 
 | 		tag:     "und", | 
 | 		pattern: " ", | 
 | 		text:    "", | 
 | 		want:    nil, | 
 | 	}, { | 
 | 		desc:    "empty pattern and non-empty text", | 
 | 		tag:     "und", | 
 | 		pattern: "", | 
 | 		text:    "abc", | 
 | 		want:    nil, // TODO: consider: []int{0, 0, 1, 1, 2, 2, 3, 3}, | 
 | 	}, { | 
 | 		// Variable-only patterns. We don't support variables at the moment, | 
 | 		// but verify that, given this, the behavior is indeed as expected. | 
 |  | 
 | 		desc:    "exact match of variable", | 
 | 		tag:     "und", | 
 | 		pattern: " ", | 
 | 		text:    " ", | 
 | 		want:    []int{0, 1}, | 
 | 	}, { | 
 | 		desc:    "variables not handled by default", | 
 | 		tag:     "und", | 
 | 		pattern: "- ", | 
 | 		text:    " -", | 
 | 		want:    nil, // Would be (1, 2) for a median match with variable}. | 
 | 	}, { | 
 | 		desc:    "multiple subsequent identical variables", | 
 | 		tag:     "und", | 
 | 		pattern: " ", | 
 | 		text:    "    ", | 
 | 		want:    []int{0, 1, 1, 2, 2, 3, 3, 4}, | 
 | 	}, { | 
 | 		desc:    "text with variables", | 
 | 		tag:     "und", | 
 | 		options: []Option{IgnoreDiacritics}, | 
 | 		pattern: "abc", | 
 | 		text:    "3 abc 3", | 
 | 		want:    []int{2, 5}, | 
 | 	}, { | 
 | 		desc:    "pattern with interior variables", | 
 | 		tag:     "und", | 
 | 		options: []Option{IgnoreDiacritics}, | 
 | 		pattern: "a b c", | 
 | 		text:    "3 a b c abc a  b  c 3", | 
 | 		want:    []int{2, 7}, // Would have 3 matches using variable. | 
 |  | 
 | 		// TODO: Different variable handling settings. | 
 | 	}, { | 
 | 		// Options. | 
 |  | 
 | 		desc:    "match all levels", | 
 | 		tag:     "und", | 
 | 		pattern: "Abc", | 
 | 		text:    "abcAbcABCÁbcábc", | 
 | 		want:    []int{3, 6}, | 
 | 	}, { | 
 | 		desc:    "ignore diacritics in text", | 
 | 		tag:     "und", | 
 | 		options: []Option{IgnoreDiacritics}, | 
 | 		pattern: "Abc", | 
 | 		text:    "Ábc", | 
 | 		want:    []int{0, 4}, | 
 | 	}, { | 
 | 		desc:    "ignore diacritics in pattern", | 
 | 		tag:     "und", | 
 | 		options: []Option{IgnoreDiacritics}, | 
 | 		pattern: "Ábc", | 
 | 		text:    "Abc", | 
 | 		want:    []int{0, 3}, | 
 | 	}, { | 
 | 		desc:    "ignore diacritics", | 
 | 		tag:     "und", | 
 | 		options: []Option{IgnoreDiacritics}, | 
 | 		pattern: "Abc", | 
 | 		text:    "abcAbcABCÁbcábc", | 
 | 		want:    []int{3, 6, 9, 13}, | 
 | 	}, { | 
 | 		desc:    "ignore case", | 
 | 		tag:     "und", | 
 | 		options: []Option{IgnoreCase}, | 
 | 		pattern: "Abc", | 
 | 		text:    "abcAbcABCÁbcábc", | 
 | 		want:    []int{0, 3, 3, 6, 6, 9}, | 
 | 	}, { | 
 | 		desc:    "ignore case and diacritics", | 
 | 		tag:     "und", | 
 | 		options: []Option{IgnoreCase, IgnoreDiacritics}, | 
 | 		pattern: "Abc", | 
 | 		text:    "abcAbcABCÁbcábc", | 
 | 		want:    []int{0, 3, 3, 6, 6, 9, 9, 13, 13, 17}, | 
 | 	}, { | 
 | 		desc:    "ignore width to fullwidth", | 
 | 		tag:     "und", | 
 | 		options: []Option{IgnoreWidth}, | 
 | 		pattern: "abc", | 
 | 		text:    "123 \uFF41\uFF42\uFF43 123", // U+FF41-3: FULLWIDTH LATIN SMALL LETTER A-C | 
 | 		want:    []int{4, 13}, | 
 | 	}, { | 
 | 		// TODO: distinguish between case and width. | 
 | 		desc:    "don't ignore width to fullwidth, ignoring only case", | 
 | 		tag:     "und", | 
 | 		options: []Option{IgnoreCase}, | 
 | 		pattern: "abc", | 
 | 		text:    "123 \uFF41\uFF42\uFF43 123", // U+FF41-3: FULLWIDTH LATIN SMALL LETTER A-C | 
 | 		want:    []int{4, 13}, | 
 | 	}, { | 
 | 		desc:    "ignore width to fullwidth and diacritics", | 
 | 		tag:     "und", | 
 | 		options: []Option{IgnoreWidth, IgnoreDiacritics}, | 
 | 		pattern: "abc", | 
 | 		text:    "123 \uFF41\uFF42\uFF43 123", // U+FF41-3: FULLWIDTH LATIN SMALL LETTER A-C | 
 | 		want:    []int{4, 13}, | 
 | 	}, { | 
 | 		desc:    "whole grapheme, single rune", | 
 | 		tag:     "und", | 
 | 		pattern: "eee", | 
 | 		text:    "123 eeé 123", | 
 | 		want:    nil, | 
 | 	}, { | 
 | 		// Note: rules on when to apply contractions may, for certain languages, | 
 | 		// differ between search and collation. For example, "ch" is not | 
 | 		// considered a contraction for the purpose of searching in Spanish. | 
 | 		// Therefore, be careful picking this test. | 
 | 		desc:    "whole grapheme, contractions", | 
 | 		tag:     "da", | 
 | 		pattern: "aba", | 
 | 		// Fails at the primary level, because "aa" is a contraction. | 
 | 		text: "123 abaa 123", | 
 | 		want: []int{}, | 
 | 	}, { | 
 | 		desc:    "whole grapheme, trailing modifier", | 
 | 		tag:     "und", | 
 | 		pattern: "eee", | 
 | 		text:    "123 eee\u0300 123", // U+0300: COMBINING GRAVE ACCENT | 
 | 		want:    nil, | 
 | 	}, { | 
 | 		// Language-specific matching. | 
 |  | 
 | 		desc:    "", | 
 | 		tag:     "da", | 
 | 		options: []Option{IgnoreCase}, | 
 | 		pattern: "Århus", | 
 | 		text:    "AarhusÅrhus  Århus  ", | 
 | 		want:    []int{0, 6, 6, 12, 14, 20}, | 
 | 	}, { | 
 | 		desc:    "", | 
 | 		tag:     "da", | 
 | 		options: []Option{IgnoreCase}, | 
 | 		pattern: "Aarhus", | 
 | 		text:    "Århus Aarhus", | 
 | 		want:    []int{0, 6, 7, 13}, | 
 | 	}, { | 
 | 		desc:    "", | 
 | 		tag:     "en", // Å does not match A for English. | 
 | 		options: []Option{IgnoreCase}, | 
 | 		pattern: "Aarhus", | 
 | 		text:    "Århus", | 
 | 		want:    nil, | 
 | 	}, { | 
 | 		desc:    "ignore modifier in text", | 
 | 		options: []Option{IgnoreDiacritics}, | 
 | 		tag:     "und", | 
 | 		pattern: "eee", | 
 | 		text:    "123 eee\u0300 123", // U+0300: COMBINING GRAVE ACCENT | 
 | 		want:    []int{4, 9},         // Matches on grapheme boundary. | 
 | 	}, { | 
 | 		desc:    "ignore multiple modifiers in text", | 
 | 		options: []Option{IgnoreDiacritics}, | 
 | 		tag:     "und", | 
 | 		pattern: "eee", | 
 | 		text:    "123 eee\u0300\u0300 123", // U+0300: COMBINING GRAVE ACCENT | 
 | 		want:    []int{4, 11},              // Matches on grapheme boundary. | 
 | 	}, { | 
 | 		desc:    "ignore modifier in pattern", | 
 | 		options: []Option{IgnoreDiacritics}, | 
 | 		tag:     "und", | 
 | 		pattern: "eee\u0300", // U+0300: COMBINING GRAVE ACCENT | 
 | 		text:    "123 eee 123", | 
 | 		want:    []int{4, 7}, | 
 | 	}, { | 
 | 		desc:    "ignore multiple modifiers in pattern", | 
 | 		options: []Option{IgnoreDiacritics}, | 
 | 		tag:     "und", | 
 | 		pattern: "eee\u0300\u0300", // U+0300: COMBINING GRAVE ACCENT | 
 | 		text:    "123 eee 123", | 
 | 		want:    []int{4, 7}, | 
 | 	}, { | 
 | 		desc: "match non-normalized pattern", | 
 | 		tag:  "und", | 
 | 		// U+0300: COMBINING GRAVE ACCENT (CCC=230) | 
 | 		// U+031B: COMBINING HORN (CCC=216) | 
 | 		pattern: "eee\u0300\u031b", | 
 | 		text:    "123 eee\u031b\u0300 123", | 
 | 		want:    []int{4, 11}, | 
 | 	}, { | 
 | 		desc: "match non-normalized text", | 
 | 		tag:  "und", | 
 | 		// U+0300: COMBINING GRAVE ACCENT (CCC=230) | 
 | 		// U+031B: COMBINING HORN (CCC=216) | 
 | 		pattern: "eee\u031b\u0300", | 
 | 		text:    "123 eee\u0300\u031b 123", | 
 | 		want:    []int{4, 11}, | 
 | 	}} { | 
 | 		m := New(language.MustParse(tc.tag), tc.options...) | 
 | 		p := m.CompileString(tc.pattern) | 
 | 		for j := 0; j < len(tc.text); { | 
 | 			start, end := p.IndexString(tc.text[j:]) | 
 | 			if start == -1 && end == -1 { | 
 | 				j++ | 
 | 				continue | 
 | 			} | 
 | 			start += j | 
 | 			end += j | 
 | 			j = end | 
 | 			if len(tc.want) == 0 { | 
 | 				t.Errorf("%d:%s: found unexpected result [%d %d]", i, tc.desc, start, end) | 
 | 				break | 
 | 			} | 
 | 			if tc.want[0] != start || tc.want[1] != end { | 
 | 				t.Errorf("%d:%s: got [%d %d]; want %v", i, tc.desc, start, end, tc.want[:2]) | 
 | 				tc.want = tc.want[2:] | 
 | 				break | 
 | 			} | 
 | 			tc.want = tc.want[2:] | 
 | 		} | 
 | 		if len(tc.want) != 0 { | 
 | 			t.Errorf("%d:%s: %d extra results", i, tc.desc, len(tc.want)/2) | 
 | 		} | 
 | 	} | 
 | } |