| // Copyright 2010 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| // The suffixarray package implements substring search in logarithmic time |
| // using an in-memory suffix array. |
| // |
| // Example use: |
| // |
| // // create index for some data |
| // index := suffixarray.New(data) |
| // |
| // // lookup byte slice s |
| // offsets1 := index.Lookup(s, -1) // the list of all indices where s occurs in data |
| // offsets2 := index.Lookup(s, 3) // the list of at most 3 indices where s occurs in data |
| // |
| package suffixarray |
| |
| import ( |
| "bytes" |
| "container/vector" |
| "sort" |
| ) |
| |
| // BUG(gri): For larger data (10MB) which contains very long (say 100000) |
| // contiguous sequences of identical bytes, index creation time will be extremely slow. |
| |
| // TODO(gri): Use a more sophisticated algorithm to create the suffix array. |
| |
| |
| // Index implements a suffix array for fast substring search. |
| type Index struct { |
| data []byte |
| sa []int // suffix array for data |
| } |
| |
| |
| // New creates a new Index for data. |
| // Index creation time is approximately O(N*log(N)) for N = len(data). |
| // |
| func New(data []byte) *Index { |
| sa := make([]int, len(data)) |
| for i, _ := range sa { |
| sa[i] = i |
| } |
| x := &index{data, sa} |
| sort.Sort(x) |
| return (*Index)(x) |
| } |
| |
| |
| func (x *Index) at(i int) []byte { |
| return x.data[x.sa[i]:] |
| } |
| |
| |
| // Binary search according to "A Method of Programming", E.W. Dijkstra. |
| func (x *Index) search(s []byte) int { |
| i, j := 0, len(x.sa) |
| // i < j for non-empty x |
| for i+1 < j { |
| // 0 <= i < j <= len(x.sa) && (x.at(i) <= s < x.at(j) || (s is not in x)) |
| h := i + (j-i)/2 // i < h < j |
| if bytes.Compare(x.at(h), s) <= 0 { |
| i = h |
| } else { // s < x.at(h) |
| j = h |
| } |
| } |
| // i+1 == j for non-empty x |
| return i |
| } |
| |
| |
| // Lookup returns an unsorted list of at most n indices where the byte string s |
| // occurs in the indexed data. If n < 0, all occurrences are returned. |
| // The result is nil if s is empty, s is not found, or n == 0. |
| // Lookup time is O((log(N) + len(res))*len(s)) where N is the |
| // size of the indexed data. |
| // |
| func (x *Index) Lookup(s []byte, n int) []int { |
| var res vector.IntVector |
| |
| if len(s) > 0 && n != 0 { |
| // find matching suffix index i |
| i := x.search(s) |
| // x.at(i) <= s < x.at(i+1) |
| |
| // ignore the first suffix if it is < s |
| if i < len(x.sa) && bytes.Compare(x.at(i), s) < 0 { |
| i++ |
| } |
| |
| // collect the following suffixes with matching prefixes |
| for (n < 0 || len(res) < n) && i < len(x.sa) && bytes.HasPrefix(x.at(i), s) { |
| res.Push(x.sa[i]) |
| i++ |
| } |
| } |
| |
| return res |
| } |
| |
| |
| // index is like Index; it is only used to hide the sort.Interface methods |
| type index struct { |
| data []byte |
| sa []int |
| } |
| |
| |
| // index implements sort.Interface |
| |
| func (x *index) Len() int { return len(x.sa) } |
| func (x *index) Less(i, j int) bool { return bytes.Compare(x.at(i), x.at(j)) < 0 } |
| func (x *index) Swap(i, j int) { x.sa[i], x.sa[j] = x.sa[j], x.sa[i] } |
| func (a *index) at(i int) []byte { return a.data[a.sa[i]:] } |