blob: 174460cab8607f01677186b62df21dac84d87fa4 [file] [log] [blame]
Robert Griesemer22974fb2010-09-21 23:12:57 -07001// Copyright 2010 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
Nigel Tao6a186d32011-04-20 09:57:05 +10005// Package suffixarray implements substring search in logarithmic time using
6// an in-memory suffix array.
Robert Griesemer22974fb2010-09-21 23:12:57 -07007//
8// Example use:
9//
10// // create index for some data
11// index := suffixarray.New(data)
12//
13// // lookup byte slice s
14// offsets1 := index.Lookup(s, -1) // the list of all indices where s occurs in data
15// offsets2 := index.Lookup(s, 3) // the list of at most 3 indices where s occurs in data
16//
17package suffixarray
18
19import (
20 "bytes"
Robert Griesemera7a7cc52011-09-30 11:31:28 -070021 "encoding/binary"
Robert Griesemerbd80b112011-09-15 16:21:21 -070022 "io"
23 "os"
Russ Cox6c230fb2011-09-26 18:33:13 -040024 "regexp"
Robert Griesemer22974fb2010-09-21 23:12:57 -070025 "sort"
26)
27
Robert Griesemer22974fb2010-09-21 23:12:57 -070028// Index implements a suffix array for fast substring search.
29type Index struct {
30 data []byte
Robert Griesemer71557712011-09-27 16:21:28 -070031 sa []int // suffix array for data; len(sa) == len(data)
Robert Griesemer22974fb2010-09-21 23:12:57 -070032}
33
Robert Griesemer22974fb2010-09-21 23:12:57 -070034// New creates a new Index for data.
Eric Eisnere3c95652011-01-11 21:46:50 -080035// Index creation time is O(N*log(N)) for N = len(data).
Robert Griesemer22974fb2010-09-21 23:12:57 -070036func New(data []byte) *Index {
Eric Eisnere3c95652011-01-11 21:46:50 -080037 return &Index{data, qsufsort(data)}
Robert Griesemer22974fb2010-09-21 23:12:57 -070038}
39
Robert Griesemera7a7cc52011-09-30 11:31:28 -070040// writeInt writes an int x to w using buf to buffer the write.
41func writeInt(w io.Writer, buf []byte, x int) os.Error {
42 binary.PutVarint(buf, int64(x))
43 _, err := w.Write(buf[0:binary.MaxVarintLen64])
44 return err
45}
46
47// readInt reads an int x from r using buf to buffer the read and returns x.
48func readInt(r io.Reader, buf []byte) (int, os.Error) {
49 _, err := io.ReadFull(r, buf[0:binary.MaxVarintLen64]) // ok to continue with error
50 x, _ := binary.Varint(buf)
51 return int(x), err
52}
53
54// writeSlice writes data[:n] to w and returns n.
55// It uses buf to buffer the write.
56func writeSlice(w io.Writer, buf []byte, data []int) (n int, err os.Error) {
57 // encode as many elements as fit into buf
58 p := binary.MaxVarintLen64
59 for ; n < len(data) && p+binary.MaxVarintLen64 <= len(buf); n++ {
60 p += binary.PutUvarint(buf[p:], uint64(data[n]))
61 }
62
63 // update buffer size
64 binary.PutVarint(buf, int64(p))
65
66 // write buffer
67 _, err = w.Write(buf[0:p])
68 return
69}
70
71// readSlice reads data[:n] from r and returns n.
72// It uses buf to buffer the read.
73func readSlice(r io.Reader, buf []byte, data []int) (n int, err os.Error) {
74 // read buffer size
75 var size int
76 size, err = readInt(r, buf)
77 if err != nil {
78 return
79 }
80
81 // read buffer w/o the size
82 if _, err = io.ReadFull(r, buf[binary.MaxVarintLen64:size]); err != nil {
83 return
84 }
85
86 // decode as many elements as present in buf
87 for p := binary.MaxVarintLen64; p < size; n++ {
88 x, w := binary.Uvarint(buf[p:])
89 data[n] = int(x)
90 p += w
91 }
92
93 return
94}
95
96const bufSize = 16 << 10 // reasonable for BenchmarkSaveRestore
Robert Griesemer5ee7ef92011-09-20 14:36:19 -070097
Robert Griesemerbd80b112011-09-15 16:21:21 -070098// Read reads the index from r into x; x must not be nil.
99func (x *Index) Read(r io.Reader) os.Error {
Robert Griesemera7a7cc52011-09-30 11:31:28 -0700100 // buffer for all reads
101 buf := make([]byte, bufSize)
102
103 // read length
104 n, err := readInt(r, buf)
105 if err != nil {
Robert Griesemerbd80b112011-09-15 16:21:21 -0700106 return err
107 }
Robert Griesemera7a7cc52011-09-30 11:31:28 -0700108
109 // allocate space
Robert Griesemer5ee7ef92011-09-20 14:36:19 -0700110 if 2*n < cap(x.data) || cap(x.data) < n {
Robert Griesemerbd80b112011-09-15 16:21:21 -0700111 // new data is significantly smaller or larger then
112 // existing buffers - allocate new ones
113 x.data = make([]byte, n)
Robert Griesemer71557712011-09-27 16:21:28 -0700114 x.sa = make([]int, n)
Robert Griesemerbd80b112011-09-15 16:21:21 -0700115 } else {
116 // re-use existing buffers
117 x.data = x.data[0:n]
118 x.sa = x.sa[0:n]
119 }
Robert Griesemera7a7cc52011-09-30 11:31:28 -0700120
121 // read data
122 if _, err := io.ReadFull(r, x.data); err != nil {
123 return err
124 }
125
126 // read index
127 for sa := x.sa; len(sa) > 0; {
128 n, err := readSlice(r, buf, sa)
129 if err != nil {
Robert Griesemer5ee7ef92011-09-20 14:36:19 -0700130 return err
131 }
Robert Griesemera7a7cc52011-09-30 11:31:28 -0700132 sa = sa[n:]
Robert Griesemerbd80b112011-09-15 16:21:21 -0700133 }
Robert Griesemerbd80b112011-09-15 16:21:21 -0700134 return nil
135}
136
137// Write writes the index x to w.
138func (x *Index) Write(w io.Writer) os.Error {
Robert Griesemera7a7cc52011-09-30 11:31:28 -0700139 // buffer for all writes
140 buf := make([]byte, bufSize)
141
142 // write length
143 if err := writeInt(w, buf, len(x.data)); err != nil {
Robert Griesemerbd80b112011-09-15 16:21:21 -0700144 return err
145 }
Robert Griesemera7a7cc52011-09-30 11:31:28 -0700146
147 // write data
148 if _, err := w.Write(x.data); err != nil {
149 return err
150 }
151
152 // write index
153 for sa := x.sa; len(sa) > 0; {
154 n, err := writeSlice(w, buf, sa)
155 if err != nil {
Robert Griesemer5ee7ef92011-09-20 14:36:19 -0700156 return err
157 }
Robert Griesemera7a7cc52011-09-30 11:31:28 -0700158 sa = sa[n:]
Robert Griesemerbd80b112011-09-15 16:21:21 -0700159 }
160 return nil
161}
162
Robert Griesemer3fb6d622010-12-14 10:22:00 -0800163// Bytes returns the data over which the index was created.
Robert Griesemer52c9fb62010-12-13 17:08:01 -0800164// It must not be modified.
165//
Robert Griesemer3fb6d622010-12-14 10:22:00 -0800166func (x *Index) Bytes() []byte {
Robert Griesemer52c9fb62010-12-13 17:08:01 -0800167 return x.data
168}
169
Robert Griesemer22974fb2010-09-21 23:12:57 -0700170func (x *Index) at(i int) []byte {
171 return x.data[x.sa[i]:]
172}
173
Eric Eisnerab036ab2011-01-24 13:03:32 -0800174// lookupAll returns a slice into the matching region of the index.
175// The runtime is O(log(N)*len(s)).
Robert Griesemer71557712011-09-27 16:21:28 -0700176func (x *Index) lookupAll(s []byte) []int {
Eric Eisnerab036ab2011-01-24 13:03:32 -0800177 // find matching suffix index range [i:j]
178 // find the first index where s would be the prefix
179 i := sort.Search(len(x.sa), func(i int) bool { return bytes.Compare(x.at(i), s) >= 0 })
180 // starting at i, find the first index at which s is not a prefix
181 j := i + sort.Search(len(x.sa)-i, func(j int) bool { return !bytes.HasPrefix(x.at(j+i), s) })
182 return x.sa[i:j]
Robert Griesemer22974fb2010-09-21 23:12:57 -0700183}
184
Robert Griesemer22974fb2010-09-21 23:12:57 -0700185// Lookup returns an unsorted list of at most n indices where the byte string s
186// occurs in the indexed data. If n < 0, all occurrences are returned.
187// The result is nil if s is empty, s is not found, or n == 0.
Eric Eisnerab036ab2011-01-24 13:03:32 -0800188// Lookup time is O(log(N)*len(s) + len(result)) where N is the
Robert Griesemer22974fb2010-09-21 23:12:57 -0700189// size of the indexed data.
190//
Robert Griesemer2662b992010-12-17 14:00:46 -0800191func (x *Index) Lookup(s []byte, n int) (result []int) {
Robert Griesemer22974fb2010-09-21 23:12:57 -0700192 if len(s) > 0 && n != 0 {
Eric Eisnerab036ab2011-01-24 13:03:32 -0800193 matches := x.lookupAll(s)
Robert Griesemerbd80b112011-09-15 16:21:21 -0700194 if n < 0 || len(matches) < n {
Eric Eisnerab036ab2011-01-24 13:03:32 -0800195 n = len(matches)
196 }
Robert Griesemerbd80b112011-09-15 16:21:21 -0700197 // 0 <= n <= len(matches)
Eric Eisnerab036ab2011-01-24 13:03:32 -0800198 if n > 0 {
199 result = make([]int, n)
Robert Griesemer71557712011-09-27 16:21:28 -0700200 copy(result, matches)
Robert Griesemer22974fb2010-09-21 23:12:57 -0700201 }
202 }
Robert Griesemer2662b992010-12-17 14:00:46 -0800203 return
204}
Robert Griesemer22974fb2010-09-21 23:12:57 -0700205
Robert Griesemer2662b992010-12-17 14:00:46 -0800206// FindAllIndex returns a sorted list of non-overlapping matches of the
207// regular expression r, where a match is a pair of indices specifying
208// the matched slice of x.Bytes(). If n < 0, all matches are returned
209// in successive order. Otherwise, at most n matches are returned and
210// they may not be successive. The result is nil if there are no matches,
211// or if n == 0.
212//
213func (x *Index) FindAllIndex(r *regexp.Regexp, n int) (result [][]int) {
214 // a non-empty literal prefix is used to determine possible
215 // match start indices with Lookup
216 prefix, complete := r.LiteralPrefix()
217 lit := []byte(prefix)
218
219 // worst-case scenario: no literal prefix
220 if prefix == "" {
221 return r.FindAllIndex(x.data, n)
222 }
223
224 // if regexp is a literal just use Lookup and convert its
225 // result into match pairs
226 if complete {
227 // Lookup returns indices that may belong to overlapping matches.
228 // After eliminating them, we may end up with fewer than n matches.
229 // If we don't have enough at the end, redo the search with an
230 // increased value n1, but only if Lookup returned all the requested
231 // indices in the first place (if it returned fewer than that then
232 // there cannot be more).
233 for n1 := n; ; n1 += 2 * (n - len(result)) /* overflow ok */ {
234 indices := x.Lookup(lit, n1)
235 if len(indices) == 0 {
236 return
237 }
Andrew Gerrand5bcbcab32011-07-08 10:52:50 +1000238 sort.Ints(indices)
Robert Griesemer2662b992010-12-17 14:00:46 -0800239 pairs := make([]int, 2*len(indices))
240 result = make([][]int, len(indices))
241 count := 0
242 prev := 0
243 for _, i := range indices {
244 if count == n {
245 break
246 }
247 // ignore indices leading to overlapping matches
248 if prev <= i {
249 j := 2 * count
250 pairs[j+0] = i
251 pairs[j+1] = i + len(lit)
252 result[count] = pairs[j : j+2]
253 count++
254 prev = i + len(lit)
255 }
256 }
257 result = result[0:count]
258 if len(result) >= n || len(indices) != n1 {
259 // found all matches or there's no chance to find more
260 // (n and n1 can be negative)
261 break
262 }
263 }
264 if len(result) == 0 {
265 result = nil
266 }
267 return
268 }
269
270 // regexp has a non-empty literal prefix; Lookup(lit) computes
271 // the indices of possible complete matches; use these as starting
272 // points for anchored searches
273 // (regexp "^" matches beginning of input, not beginning of line)
274 r = regexp.MustCompile("^" + r.String()) // compiles because r compiled
275
276 // same comment about Lookup applies here as in the loop above
277 for n1 := n; ; n1 += 2 * (n - len(result)) /* overflow ok */ {
278 indices := x.Lookup(lit, n1)
279 if len(indices) == 0 {
280 return
281 }
Andrew Gerrand5bcbcab32011-07-08 10:52:50 +1000282 sort.Ints(indices)
Robert Griesemer2662b992010-12-17 14:00:46 -0800283 result = result[0:0]
284 prev := 0
285 for _, i := range indices {
286 if len(result) == n {
287 break
288 }
289 m := r.FindIndex(x.data[i:]) // anchored search - will not run off
290 // ignore indices leading to overlapping matches
291 if m != nil && prev <= i {
292 m[0] = i // correct m
293 m[1] += i
294 result = append(result, m)
295 prev = m[1]
296 }
297 }
298 if len(result) >= n || len(indices) != n1 {
299 // found all matches or there's no chance to find more
300 // (n and n1 can be negative)
301 break
302 }
303 }
304 if len(result) == 0 {
305 result = nil
306 }
307 return
Robert Griesemer22974fb2010-09-21 23:12:57 -0700308}