| // Copyright 2015 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| package search |
| |
| import ( |
| "reflect" |
| "strings" |
| "testing" |
| |
| "golang.org/x/text/language" |
| ) |
| |
| func TestCompile(t *testing.T) { |
| for i, tc := range []struct { |
| desc string |
| pattern string |
| options []Option |
| n int |
| }{{ |
| desc: "empty", |
| pattern: "", |
| n: 0, |
| }, { |
| desc: "single", |
| pattern: "a", |
| n: 1, |
| }, { |
| desc: "keep modifier", |
| pattern: "a\u0300", // U+0300: COMBINING GRAVE ACCENT |
| n: 2, |
| }, { |
| desc: "remove modifier", |
| pattern: "a\u0300", // U+0300: COMBINING GRAVE ACCENT |
| options: []Option{IgnoreDiacritics}, |
| n: 1, |
| }, { |
| desc: "single with double collation element", |
| pattern: "ä", |
| n: 2, |
| }, { |
| desc: "leading variable", |
| pattern: " a", |
| n: 2, |
| }, { |
| desc: "trailing variable", |
| pattern: "aa ", |
| n: 3, |
| }, { |
| desc: "leading and trailing variable", |
| pattern: " äb ", |
| n: 5, |
| }, { |
| desc: "keep interior variable", |
| pattern: " ä b ", |
| n: 6, |
| }, { |
| desc: "keep interior variables", |
| pattern: " b ä ", |
| n: 7, |
| }, { |
| desc: "remove ignoreables (zero-weights across the board)", |
| pattern: "\u009Db\u009Dä\u009D", // U+009D: OPERATING SYSTEM COMMAND |
| n: 3, |
| }} { |
| m := New(language.Und, tc.options...) |
| p := m.CompileString(tc.pattern) |
| if len(p.ce) != tc.n { |
| t.Errorf("%d:%s: Compile(%+q): got %d; want %d", i, tc.desc, tc.pattern, len(p.ce), tc.n) |
| } |
| } |
| } |
| |
| func TestNorm(t *testing.T) { |
| // U+0300: COMBINING GRAVE ACCENT (CCC=230) |
| // U+031B: COMBINING HORN (CCC=216) |
| for _, tc := range []struct { |
| desc string |
| a string |
| b string |
| want bool // a and b compile into the same pattern? |
| }{{ |
| "simple", |
| "eee\u0300\u031b", |
| "eee\u031b\u0300", |
| true, |
| }, { |
| "large number of modifiers in pattern", |
| strings.Repeat("\u0300", 29) + "\u0318", |
| "\u0318" + strings.Repeat("\u0300", 29), |
| true, |
| }, { |
| "modifier overflow in pattern", |
| strings.Repeat("\u0300", 30) + "\u0318", |
| "\u0318" + strings.Repeat("\u0300", 30), |
| false, |
| }} { |
| m := New(language.Und) |
| a := m.CompileString(tc.a) |
| b := m.CompileString(tc.b) |
| if got := reflect.DeepEqual(a, b); got != tc.want { |
| t.Errorf("Compile(a) == Compile(b) == %v; want %v", got, tc.want) |
| } |
| } |
| } |
| |
| func TestForwardSearch(t *testing.T) { |
| for i, tc := range []struct { |
| desc string |
| tag string |
| options []Option |
| pattern string |
| text string |
| want []int |
| }{{ |
| // The semantics of an empty search is to match nothing. |
| // TODO: change this to be in line with strings.Index? It is quite a |
| // different beast, so not sure yet. |
| |
| desc: "empty pattern and text", |
| tag: "und", |
| pattern: "", |
| text: "", |
| want: nil, // TODO: consider: []int{0, 0}, |
| }, { |
| desc: "non-empty pattern and empty text", |
| tag: "und", |
| pattern: " ", |
| text: "", |
| want: nil, |
| }, { |
| desc: "empty pattern and non-empty text", |
| tag: "und", |
| pattern: "", |
| text: "abc", |
| want: nil, // TODO: consider: []int{0, 0, 1, 1, 2, 2, 3, 3}, |
| }, { |
| // Variable-only patterns. We don't support variables at the moment, |
| // but verify that, given this, the behavior is indeed as expected. |
| |
| desc: "exact match of variable", |
| tag: "und", |
| pattern: " ", |
| text: " ", |
| want: []int{0, 1}, |
| }, { |
| desc: "variables not handled by default", |
| tag: "und", |
| pattern: "- ", |
| text: " -", |
| want: nil, // Would be (1, 2) for a median match with variable}. |
| }, { |
| desc: "multiple subsequent identical variables", |
| tag: "und", |
| pattern: " ", |
| text: " ", |
| want: []int{0, 1, 1, 2, 2, 3, 3, 4}, |
| }, { |
| desc: "text with variables", |
| tag: "und", |
| options: []Option{IgnoreDiacritics}, |
| pattern: "abc", |
| text: "3 abc 3", |
| want: []int{2, 5}, |
| }, { |
| desc: "pattern with interior variables", |
| tag: "und", |
| options: []Option{IgnoreDiacritics}, |
| pattern: "a b c", |
| text: "3 a b c abc a b c 3", |
| want: []int{2, 7}, // Would have 3 matches using variable. |
| |
| // TODO: Different variable handling settings. |
| }, { |
| // Options. |
| |
| desc: "match all levels", |
| tag: "und", |
| pattern: "Abc", |
| text: "abcAbcABCÁbcábc", |
| want: []int{3, 6}, |
| }, { |
| desc: "ignore diacritics in text", |
| tag: "und", |
| options: []Option{IgnoreDiacritics}, |
| pattern: "Abc", |
| text: "Ábc", |
| want: []int{0, 4}, |
| }, { |
| desc: "ignore diacritics in pattern", |
| tag: "und", |
| options: []Option{IgnoreDiacritics}, |
| pattern: "Ábc", |
| text: "Abc", |
| want: []int{0, 3}, |
| }, { |
| desc: "ignore diacritics", |
| tag: "und", |
| options: []Option{IgnoreDiacritics}, |
| pattern: "Abc", |
| text: "abcAbcABCÁbcábc", |
| want: []int{3, 6, 9, 13}, |
| }, { |
| desc: "ignore case", |
| tag: "und", |
| options: []Option{IgnoreCase}, |
| pattern: "Abc", |
| text: "abcAbcABCÁbcábc", |
| want: []int{0, 3, 3, 6, 6, 9}, |
| }, { |
| desc: "ignore case and diacritics", |
| tag: "und", |
| options: []Option{IgnoreCase, IgnoreDiacritics}, |
| pattern: "Abc", |
| text: "abcAbcABCÁbcábc", |
| want: []int{0, 3, 3, 6, 6, 9, 9, 13, 13, 17}, |
| }, { |
| desc: "ignore width to fullwidth", |
| tag: "und", |
| options: []Option{IgnoreWidth}, |
| pattern: "abc", |
| text: "123 \uFF41\uFF42\uFF43 123", // U+FF41-3: FULLWIDTH LATIN SMALL LETTER A-C |
| want: []int{4, 13}, |
| }, { |
| // TODO: distinguish between case and width. |
| desc: "don't ignore width to fullwidth, ignoring only case", |
| tag: "und", |
| options: []Option{IgnoreCase}, |
| pattern: "abc", |
| text: "123 \uFF41\uFF42\uFF43 123", // U+FF41-3: FULLWIDTH LATIN SMALL LETTER A-C |
| want: []int{4, 13}, |
| }, { |
| desc: "ignore width to fullwidth and diacritics", |
| tag: "und", |
| options: []Option{IgnoreWidth, IgnoreDiacritics}, |
| pattern: "abc", |
| text: "123 \uFF41\uFF42\uFF43 123", // U+FF41-3: FULLWIDTH LATIN SMALL LETTER A-C |
| want: []int{4, 13}, |
| }, { |
| desc: "whole grapheme, single rune", |
| tag: "und", |
| pattern: "eee", |
| text: "123 eeé 123", |
| want: nil, |
| }, { |
| // Note: rules on when to apply contractions may, for certain languages, |
| // differ between search and collation. For example, "ch" is not |
| // considered a contraction for the purpose of searching in Spanish. |
| // Therefore, be careful picking this test. |
| desc: "whole grapheme, contractions", |
| tag: "da", |
| pattern: "aba", |
| // Fails at the primary level, because "aa" is a contraction. |
| text: "123 abaa 123", |
| want: []int{}, |
| }, { |
| desc: "whole grapheme, trailing modifier", |
| tag: "und", |
| pattern: "eee", |
| text: "123 eee\u0300 123", // U+0300: COMBINING GRAVE ACCENT |
| want: nil, |
| }, { |
| // Language-specific matching. |
| |
| desc: "", |
| tag: "da", |
| options: []Option{IgnoreCase}, |
| pattern: "Århus", |
| text: "AarhusÅrhus Århus ", |
| want: []int{0, 6, 6, 12, 14, 20}, |
| }, { |
| desc: "", |
| tag: "da", |
| options: []Option{IgnoreCase}, |
| pattern: "Aarhus", |
| text: "Århus Aarhus", |
| want: []int{0, 6, 7, 13}, |
| }, { |
| desc: "", |
| tag: "en", // Å does not match A for English. |
| options: []Option{IgnoreCase}, |
| pattern: "Aarhus", |
| text: "Århus", |
| want: nil, |
| }, { |
| desc: "ignore modifier in text", |
| options: []Option{IgnoreDiacritics}, |
| tag: "und", |
| pattern: "eee", |
| text: "123 eee\u0300 123", // U+0300: COMBINING GRAVE ACCENT |
| want: []int{4, 9}, // Matches on grapheme boundary. |
| }, { |
| desc: "ignore multiple modifiers in text", |
| options: []Option{IgnoreDiacritics}, |
| tag: "und", |
| pattern: "eee", |
| text: "123 eee\u0300\u0300 123", // U+0300: COMBINING GRAVE ACCENT |
| want: []int{4, 11}, // Matches on grapheme boundary. |
| }, { |
| desc: "ignore modifier in pattern", |
| options: []Option{IgnoreDiacritics}, |
| tag: "und", |
| pattern: "eee\u0300", // U+0300: COMBINING GRAVE ACCENT |
| text: "123 eee 123", |
| want: []int{4, 7}, |
| }, { |
| desc: "ignore multiple modifiers in pattern", |
| options: []Option{IgnoreDiacritics}, |
| tag: "und", |
| pattern: "eee\u0300\u0300", // U+0300: COMBINING GRAVE ACCENT |
| text: "123 eee 123", |
| want: []int{4, 7}, |
| }, { |
| desc: "match non-normalized pattern", |
| tag: "und", |
| // U+0300: COMBINING GRAVE ACCENT (CCC=230) |
| // U+031B: COMBINING HORN (CCC=216) |
| pattern: "eee\u0300\u031b", |
| text: "123 eee\u031b\u0300 123", |
| want: []int{4, 11}, |
| }, { |
| desc: "match non-normalized text", |
| tag: "und", |
| // U+0300: COMBINING GRAVE ACCENT (CCC=230) |
| // U+031B: COMBINING HORN (CCC=216) |
| pattern: "eee\u031b\u0300", |
| text: "123 eee\u0300\u031b 123", |
| want: []int{4, 11}, |
| }} { |
| m := New(language.MustParse(tc.tag), tc.options...) |
| p := m.CompileString(tc.pattern) |
| for j := 0; j < len(tc.text); { |
| start, end := p.IndexString(tc.text[j:]) |
| if start == -1 && end == -1 { |
| j++ |
| continue |
| } |
| start += j |
| end += j |
| j = end |
| if len(tc.want) == 0 { |
| t.Errorf("%d:%s: found unexpected result [%d %d]", i, tc.desc, start, end) |
| break |
| } |
| if tc.want[0] != start || tc.want[1] != end { |
| t.Errorf("%d:%s: got [%d %d]; want %v", i, tc.desc, start, end, tc.want[:2]) |
| tc.want = tc.want[2:] |
| break |
| } |
| tc.want = tc.want[2:] |
| } |
| if len(tc.want) != 0 { |
| t.Errorf("%d:%s: %d extra results", i, tc.desc, len(tc.want)/2) |
| } |
| } |
| } |