| // Copyright 2015 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| //go:generate go run ../collate/maketables.go -cldr=23 -unicode=6.2.0 -types=search,searchjl -package=search |
| |
| // Package search provides language-specific search and string matching. |
| // |
| // Natural language matching can be intricate. For example, Danish will insist |
| // "Århus" and "Aarhus" are the same name and Turkish will match I to ı (note |
| // the lack of a dot) in a case-insensitive match. This package handles such |
| // language-specific details. |
| // |
| // Text passed to any of the calls in this message does not need to be |
| // normalized. |
| package search // import "golang.org/x/text/search" |
| |
| import ( |
| "strings" |
| |
| "golang.org/x/text/internal/colltab" |
| "golang.org/x/text/language" |
| ) |
| |
| // An Option configures a Matcher. |
| type Option func(*Matcher) |
| |
| var ( |
| // WholeWord restricts matches to complete words. The default is to match at |
| // the character level. |
| WholeWord Option = nil |
| |
| // Exact requires that two strings are their exact equivalent. For example |
| // å would not match aa in Danish. It overrides any of the ignore options. |
| Exact Option = nil |
| |
| // Loose causes case, diacritics and width to be ignored. |
| Loose Option = loose |
| |
| // IgnoreCase enables case-insensitive search. |
| IgnoreCase Option = ignoreCase |
| |
| // IgnoreDiacritics causes diacritics to be ignored ("ö" == "o"). |
| IgnoreDiacritics Option = ignoreDiacritics |
| |
| // IgnoreWidth equates narrow with wide variants. |
| IgnoreWidth Option = ignoreWidth |
| ) |
| |
| func ignoreDiacritics(m *Matcher) { m.ignoreDiacritics = true } |
| func ignoreCase(m *Matcher) { m.ignoreCase = true } |
| func ignoreWidth(m *Matcher) { m.ignoreWidth = true } |
| func loose(m *Matcher) { |
| ignoreDiacritics(m) |
| ignoreCase(m) |
| ignoreWidth(m) |
| } |
| |
| var ( |
| // Supported lists the languages for which search differs from its parent. |
| Supported language.Coverage |
| |
| tags []language.Tag |
| ) |
| |
| func init() { |
| ids := strings.Split(availableLocales, ",") |
| tags = make([]language.Tag, len(ids)) |
| for i, s := range ids { |
| tags[i] = language.Raw.MustParse(s) |
| } |
| Supported = language.NewCoverage(tags) |
| } |
| |
| // New returns a new Matcher for the given language and options. |
| func New(t language.Tag, opts ...Option) *Matcher { |
| m := &Matcher{ |
| w: getTable(locales[colltab.MatchLang(t, tags)]), |
| } |
| for _, f := range opts { |
| f(m) |
| } |
| return m |
| } |
| |
| // A Matcher implements language-specific string matching. |
| type Matcher struct { |
| w colltab.Weighter |
| ignoreCase bool |
| ignoreWidth bool |
| ignoreDiacritics bool |
| } |
| |
| // An IndexOption specifies how the Index methods of Pattern or Matcher should |
| // match the input. |
| type IndexOption byte |
| |
| const ( |
| // Anchor restricts the search to the start (or end for Backwards) of the |
| // text. |
| Anchor IndexOption = 1 << iota |
| |
| // Backwards starts the search from the end of the text. |
| Backwards |
| |
| anchorBackwards = Anchor | Backwards |
| ) |
| |
| // Index reports the start and end position of the first occurrence of pat in b |
| // or -1, -1 if pat is not present. |
| func (m *Matcher) Index(b, pat []byte, opts ...IndexOption) (start, end int) { |
| // TODO: implement optimized version that does not use a pattern. |
| return m.Compile(pat).Index(b, opts...) |
| } |
| |
| // IndexString reports the start and end position of the first occurrence of pat |
| // in s or -1, -1 if pat is not present. |
| func (m *Matcher) IndexString(s, pat string, opts ...IndexOption) (start, end int) { |
| // TODO: implement optimized version that does not use a pattern. |
| return m.CompileString(pat).IndexString(s, opts...) |
| } |
| |
| // Equal reports whether a and b are equivalent. |
| func (m *Matcher) Equal(a, b []byte) bool { |
| _, end := m.Index(a, b, Anchor) |
| return end == len(a) |
| } |
| |
| // EqualString reports whether a and b are equivalent. |
| func (m *Matcher) EqualString(a, b string) bool { |
| _, end := m.IndexString(a, b, Anchor) |
| return end == len(a) |
| } |
| |
| // Compile compiles and returns a pattern that can be used for faster searching. |
| func (m *Matcher) Compile(b []byte) *Pattern { |
| p := &Pattern{m: m} |
| iter := colltab.Iter{Weighter: m.w} |
| for iter.SetInput(b); iter.Next(); { |
| } |
| p.ce = iter.Elems |
| p.deleteEmptyElements() |
| return p |
| } |
| |
| // CompileString compiles and returns a pattern that can be used for faster |
| // searching. |
| func (m *Matcher) CompileString(s string) *Pattern { |
| p := &Pattern{m: m} |
| iter := colltab.Iter{Weighter: m.w} |
| for iter.SetInputString(s); iter.Next(); { |
| } |
| p.ce = iter.Elems |
| p.deleteEmptyElements() |
| return p |
| } |
| |
| // A Pattern is a compiled search string. It is safe for concurrent use. |
| type Pattern struct { |
| m *Matcher |
| ce []colltab.Elem |
| } |
| |
| // Design note (TODO remove): |
| // The cost of retrieving collation elements for each rune, which is used for |
| // search as well, is not trivial. Also, algorithms like Boyer-Moore and |
| // Sunday require some additional precomputing. |
| |
| // Index reports the start and end position of the first occurrence of p in b |
| // or -1, -1 if p is not present. |
| func (p *Pattern) Index(b []byte, opts ...IndexOption) (start, end int) { |
| // Pick a large enough buffer such that we likely do not need to allocate |
| // and small enough to not cause too much overhead initializing. |
| var buf [8]colltab.Elem |
| |
| it := &colltab.Iter{ |
| Weighter: p.m.w, |
| Elems: buf[:0], |
| } |
| it.SetInput(b) |
| |
| var optMask IndexOption |
| for _, o := range opts { |
| optMask |= o |
| } |
| |
| switch optMask { |
| case 0: |
| return p.forwardSearch(it) |
| case Anchor: |
| return p.anchoredForwardSearch(it) |
| case Backwards, anchorBackwards: |
| panic("TODO: implement") |
| default: |
| panic("unrecognized option") |
| } |
| } |
| |
| // IndexString reports the start and end position of the first occurrence of p |
| // in s or -1, -1 if p is not present. |
| func (p *Pattern) IndexString(s string, opts ...IndexOption) (start, end int) { |
| // Pick a large enough buffer such that we likely do not need to allocate |
| // and small enough to not cause too much overhead initializing. |
| var buf [8]colltab.Elem |
| |
| it := &colltab.Iter{ |
| Weighter: p.m.w, |
| Elems: buf[:0], |
| } |
| it.SetInputString(s) |
| |
| var optMask IndexOption |
| for _, o := range opts { |
| optMask |= o |
| } |
| |
| switch optMask { |
| case 0: |
| return p.forwardSearch(it) |
| case Anchor: |
| return p.anchoredForwardSearch(it) |
| case Backwards, anchorBackwards: |
| panic("TODO: implement") |
| default: |
| panic("unrecognized option") |
| } |
| } |
| |
| // TODO: |
| // - Maybe IndexAll methods (probably not necessary). |
| // - Some way to match patterns in a Reader (a bit tricky). |
| // - Some fold transformer that folds text to comparable text, based on the |
| // search options. This is a common technique, though very different from the |
| // collation-based design of this package. It has a somewhat different use |
| // case, so probably makes sense to support both. Should probably be in a |
| // different package, though, as it uses completely different kind of tables |
| // (based on norm, cases, width and range tables.) |