Robert Griesemer | 22974fb | 2010-09-21 23:12:57 -0700 | [diff] [blame] | 1 | // Copyright 2010 The Go Authors. All rights reserved. |
| 2 | // Use of this source code is governed by a BSD-style |
| 3 | // license that can be found in the LICENSE file. |
| 4 | |
| 5 | // The suffixarray package implements substring search in logarithmic time |
| 6 | // using an in-memory suffix array. |
| 7 | // |
| 8 | // Example use: |
| 9 | // |
| 10 | // // create index for some data |
| 11 | // index := suffixarray.New(data) |
| 12 | // |
| 13 | // // lookup byte slice s |
| 14 | // offsets1 := index.Lookup(s, -1) // the list of all indices where s occurs in data |
| 15 | // offsets2 := index.Lookup(s, 3) // the list of at most 3 indices where s occurs in data |
| 16 | // |
| 17 | package suffixarray |
| 18 | |
| 19 | import ( |
| 20 | "bytes" |
| 21 | "container/vector" |
| 22 | "sort" |
| 23 | ) |
| 24 | |
| 25 | // BUG(gri): For larger data (10MB) which contains very long (say 100000) |
| 26 | // contiguous sequences of identical bytes, index creation time will be extremely slow. |
| 27 | |
| 28 | // TODO(gri): Use a more sophisticated algorithm to create the suffix array. |
| 29 | |
| 30 | |
| 31 | // Index implements a suffix array for fast substring search. |
| 32 | type Index struct { |
| 33 | data []byte |
| 34 | sa []int // suffix array for data |
| 35 | } |
| 36 | |
| 37 | |
| 38 | // New creates a new Index for data. |
| 39 | // Index creation time is approximately O(N*log(N)) for N = len(data). |
| 40 | // |
| 41 | func New(data []byte) *Index { |
| 42 | sa := make([]int, len(data)) |
Ryan Hitchman | 062406b | 2010-12-08 21:36:56 -0800 | [diff] [blame] | 43 | for i := range sa { |
Robert Griesemer | 22974fb | 2010-09-21 23:12:57 -0700 | [diff] [blame] | 44 | sa[i] = i |
| 45 | } |
Robert Griesemer | 3487495 | 2010-09-22 11:03:57 -0700 | [diff] [blame] | 46 | x := &Index{data, sa} |
| 47 | sort.Sort((*index)(x)) |
| 48 | return x |
Robert Griesemer | 22974fb | 2010-09-21 23:12:57 -0700 | [diff] [blame] | 49 | } |
| 50 | |
| 51 | |
Robert Griesemer | 52c9fb6 | 2010-12-13 17:08:01 -0800 | [diff] [blame^] | 52 | // Data returns the data over which the index was created. |
| 53 | // It must not be modified. |
| 54 | // |
| 55 | func (x *Index) Data() []byte { |
| 56 | return x.data |
| 57 | } |
| 58 | |
| 59 | |
Robert Griesemer | 22974fb | 2010-09-21 23:12:57 -0700 | [diff] [blame] | 60 | func (x *Index) at(i int) []byte { |
| 61 | return x.data[x.sa[i]:] |
| 62 | } |
| 63 | |
| 64 | |
Robert Griesemer | 22974fb | 2010-09-21 23:12:57 -0700 | [diff] [blame] | 65 | func (x *Index) search(s []byte) int { |
Russ Cox | b51f0c5 | 2010-11-19 16:04:25 -0500 | [diff] [blame] | 66 | return sort.Search(len(x.sa), func(i int) bool { return bytes.Compare(x.at(i), s) >= 0 }) |
Robert Griesemer | 22974fb | 2010-09-21 23:12:57 -0700 | [diff] [blame] | 67 | } |
| 68 | |
| 69 | |
| 70 | // Lookup returns an unsorted list of at most n indices where the byte string s |
| 71 | // occurs in the indexed data. If n < 0, all occurrences are returned. |
| 72 | // The result is nil if s is empty, s is not found, or n == 0. |
Robert Griesemer | 3487495 | 2010-09-22 11:03:57 -0700 | [diff] [blame] | 73 | // Lookup time is O((log(N) + len(result))*len(s)) where N is the |
Robert Griesemer | 22974fb | 2010-09-21 23:12:57 -0700 | [diff] [blame] | 74 | // size of the indexed data. |
| 75 | // |
| 76 | func (x *Index) Lookup(s []byte, n int) []int { |
| 77 | var res vector.IntVector |
| 78 | |
| 79 | if len(s) > 0 && n != 0 { |
| 80 | // find matching suffix index i |
| 81 | i := x.search(s) |
Russ Cox | b51f0c5 | 2010-11-19 16:04:25 -0500 | [diff] [blame] | 82 | // x.at(i-1) < s <= x.at(i) |
Robert Griesemer | 22974fb | 2010-09-21 23:12:57 -0700 | [diff] [blame] | 83 | |
| 84 | // collect the following suffixes with matching prefixes |
| 85 | for (n < 0 || len(res) < n) && i < len(x.sa) && bytes.HasPrefix(x.at(i), s) { |
| 86 | res.Push(x.sa[i]) |
| 87 | i++ |
| 88 | } |
| 89 | } |
| 90 | |
| 91 | return res |
| 92 | } |
| 93 | |
| 94 | |
Robert Griesemer | 3487495 | 2010-09-22 11:03:57 -0700 | [diff] [blame] | 95 | // index is used to hide the sort.Interface |
| 96 | type index Index |
Robert Griesemer | 22974fb | 2010-09-21 23:12:57 -0700 | [diff] [blame] | 97 | |
| 98 | func (x *index) Len() int { return len(x.sa) } |
| 99 | func (x *index) Less(i, j int) bool { return bytes.Compare(x.at(i), x.at(j)) < 0 } |
| 100 | func (x *index) Swap(i, j int) { x.sa[i], x.sa[j] = x.sa[j], x.sa[i] } |
| 101 | func (a *index) at(i int) []byte { return a.data[a.sa[i]:] } |