Robert Griesemer | 22974fb | 2010-09-21 23:12:57 -0700 | [diff] [blame] | 1 | // Copyright 2010 The Go Authors. All rights reserved. |
| 2 | // Use of this source code is governed by a BSD-style |
| 3 | // license that can be found in the LICENSE file. |
| 4 | |
Nigel Tao | 6a186d3 | 2011-04-20 09:57:05 +1000 | [diff] [blame] | 5 | // Package suffixarray implements substring search in logarithmic time using |
| 6 | // an in-memory suffix array. |
Robert Griesemer | 22974fb | 2010-09-21 23:12:57 -0700 | [diff] [blame] | 7 | // |
| 8 | // Example use: |
| 9 | // |
| 10 | // // create index for some data |
| 11 | // index := suffixarray.New(data) |
| 12 | // |
| 13 | // // lookup byte slice s |
| 14 | // offsets1 := index.Lookup(s, -1) // the list of all indices where s occurs in data |
| 15 | // offsets2 := index.Lookup(s, 3) // the list of at most 3 indices where s occurs in data |
| 16 | // |
| 17 | package suffixarray |
| 18 | |
| 19 | import ( |
| 20 | "bytes" |
Robert Griesemer | a7a7cc5 | 2011-09-30 11:31:28 -0700 | [diff] [blame] | 21 | "encoding/binary" |
Robert Griesemer | bd80b11 | 2011-09-15 16:21:21 -0700 | [diff] [blame] | 22 | "io" |
| 23 | "os" |
Russ Cox | 6c230fb | 2011-09-26 18:33:13 -0400 | [diff] [blame] | 24 | "regexp" |
Robert Griesemer | 22974fb | 2010-09-21 23:12:57 -0700 | [diff] [blame] | 25 | "sort" |
| 26 | ) |
| 27 | |
Robert Griesemer | 22974fb | 2010-09-21 23:12:57 -0700 | [diff] [blame] | 28 | // Index implements a suffix array for fast substring search. |
| 29 | type Index struct { |
| 30 | data []byte |
Robert Griesemer | 7155771 | 2011-09-27 16:21:28 -0700 | [diff] [blame] | 31 | sa []int // suffix array for data; len(sa) == len(data) |
Robert Griesemer | 22974fb | 2010-09-21 23:12:57 -0700 | [diff] [blame] | 32 | } |
| 33 | |
Robert Griesemer | 22974fb | 2010-09-21 23:12:57 -0700 | [diff] [blame] | 34 | // New creates a new Index for data. |
Eric Eisner | e3c9565 | 2011-01-11 21:46:50 -0800 | [diff] [blame] | 35 | // Index creation time is O(N*log(N)) for N = len(data). |
Robert Griesemer | 22974fb | 2010-09-21 23:12:57 -0700 | [diff] [blame] | 36 | func New(data []byte) *Index { |
Eric Eisner | e3c9565 | 2011-01-11 21:46:50 -0800 | [diff] [blame] | 37 | return &Index{data, qsufsort(data)} |
Robert Griesemer | 22974fb | 2010-09-21 23:12:57 -0700 | [diff] [blame] | 38 | } |
| 39 | |
Robert Griesemer | a7a7cc5 | 2011-09-30 11:31:28 -0700 | [diff] [blame] | 40 | // writeInt writes an int x to w using buf to buffer the write. |
| 41 | func writeInt(w io.Writer, buf []byte, x int) os.Error { |
| 42 | binary.PutVarint(buf, int64(x)) |
| 43 | _, err := w.Write(buf[0:binary.MaxVarintLen64]) |
| 44 | return err |
| 45 | } |
| 46 | |
| 47 | // readInt reads an int x from r using buf to buffer the read and returns x. |
| 48 | func readInt(r io.Reader, buf []byte) (int, os.Error) { |
| 49 | _, err := io.ReadFull(r, buf[0:binary.MaxVarintLen64]) // ok to continue with error |
| 50 | x, _ := binary.Varint(buf) |
| 51 | return int(x), err |
| 52 | } |
| 53 | |
| 54 | // writeSlice writes data[:n] to w and returns n. |
| 55 | // It uses buf to buffer the write. |
| 56 | func writeSlice(w io.Writer, buf []byte, data []int) (n int, err os.Error) { |
| 57 | // encode as many elements as fit into buf |
| 58 | p := binary.MaxVarintLen64 |
| 59 | for ; n < len(data) && p+binary.MaxVarintLen64 <= len(buf); n++ { |
| 60 | p += binary.PutUvarint(buf[p:], uint64(data[n])) |
| 61 | } |
| 62 | |
| 63 | // update buffer size |
| 64 | binary.PutVarint(buf, int64(p)) |
| 65 | |
| 66 | // write buffer |
| 67 | _, err = w.Write(buf[0:p]) |
| 68 | return |
| 69 | } |
| 70 | |
| 71 | // readSlice reads data[:n] from r and returns n. |
| 72 | // It uses buf to buffer the read. |
| 73 | func readSlice(r io.Reader, buf []byte, data []int) (n int, err os.Error) { |
| 74 | // read buffer size |
| 75 | var size int |
| 76 | size, err = readInt(r, buf) |
| 77 | if err != nil { |
| 78 | return |
| 79 | } |
| 80 | |
| 81 | // read buffer w/o the size |
| 82 | if _, err = io.ReadFull(r, buf[binary.MaxVarintLen64:size]); err != nil { |
| 83 | return |
| 84 | } |
| 85 | |
| 86 | // decode as many elements as present in buf |
| 87 | for p := binary.MaxVarintLen64; p < size; n++ { |
| 88 | x, w := binary.Uvarint(buf[p:]) |
| 89 | data[n] = int(x) |
| 90 | p += w |
| 91 | } |
| 92 | |
| 93 | return |
| 94 | } |
| 95 | |
| 96 | const bufSize = 16 << 10 // reasonable for BenchmarkSaveRestore |
Robert Griesemer | 5ee7ef9 | 2011-09-20 14:36:19 -0700 | [diff] [blame] | 97 | |
Robert Griesemer | bd80b11 | 2011-09-15 16:21:21 -0700 | [diff] [blame] | 98 | // Read reads the index from r into x; x must not be nil. |
| 99 | func (x *Index) Read(r io.Reader) os.Error { |
Robert Griesemer | a7a7cc5 | 2011-09-30 11:31:28 -0700 | [diff] [blame] | 100 | // buffer for all reads |
| 101 | buf := make([]byte, bufSize) |
| 102 | |
| 103 | // read length |
| 104 | n, err := readInt(r, buf) |
| 105 | if err != nil { |
Robert Griesemer | bd80b11 | 2011-09-15 16:21:21 -0700 | [diff] [blame] | 106 | return err |
| 107 | } |
Robert Griesemer | a7a7cc5 | 2011-09-30 11:31:28 -0700 | [diff] [blame] | 108 | |
| 109 | // allocate space |
Robert Griesemer | 5ee7ef9 | 2011-09-20 14:36:19 -0700 | [diff] [blame] | 110 | if 2*n < cap(x.data) || cap(x.data) < n { |
Robert Griesemer | bd80b11 | 2011-09-15 16:21:21 -0700 | [diff] [blame] | 111 | // new data is significantly smaller or larger then |
| 112 | // existing buffers - allocate new ones |
| 113 | x.data = make([]byte, n) |
Robert Griesemer | 7155771 | 2011-09-27 16:21:28 -0700 | [diff] [blame] | 114 | x.sa = make([]int, n) |
Robert Griesemer | bd80b11 | 2011-09-15 16:21:21 -0700 | [diff] [blame] | 115 | } else { |
| 116 | // re-use existing buffers |
| 117 | x.data = x.data[0:n] |
| 118 | x.sa = x.sa[0:n] |
| 119 | } |
Robert Griesemer | a7a7cc5 | 2011-09-30 11:31:28 -0700 | [diff] [blame] | 120 | |
| 121 | // read data |
| 122 | if _, err := io.ReadFull(r, x.data); err != nil { |
| 123 | return err |
| 124 | } |
| 125 | |
| 126 | // read index |
| 127 | for sa := x.sa; len(sa) > 0; { |
| 128 | n, err := readSlice(r, buf, sa) |
| 129 | if err != nil { |
Robert Griesemer | 5ee7ef9 | 2011-09-20 14:36:19 -0700 | [diff] [blame] | 130 | return err |
| 131 | } |
Robert Griesemer | a7a7cc5 | 2011-09-30 11:31:28 -0700 | [diff] [blame] | 132 | sa = sa[n:] |
Robert Griesemer | bd80b11 | 2011-09-15 16:21:21 -0700 | [diff] [blame] | 133 | } |
Robert Griesemer | bd80b11 | 2011-09-15 16:21:21 -0700 | [diff] [blame] | 134 | return nil |
| 135 | } |
| 136 | |
| 137 | // Write writes the index x to w. |
| 138 | func (x *Index) Write(w io.Writer) os.Error { |
Robert Griesemer | a7a7cc5 | 2011-09-30 11:31:28 -0700 | [diff] [blame] | 139 | // buffer for all writes |
| 140 | buf := make([]byte, bufSize) |
| 141 | |
| 142 | // write length |
| 143 | if err := writeInt(w, buf, len(x.data)); err != nil { |
Robert Griesemer | bd80b11 | 2011-09-15 16:21:21 -0700 | [diff] [blame] | 144 | return err |
| 145 | } |
Robert Griesemer | a7a7cc5 | 2011-09-30 11:31:28 -0700 | [diff] [blame] | 146 | |
| 147 | // write data |
| 148 | if _, err := w.Write(x.data); err != nil { |
| 149 | return err |
| 150 | } |
| 151 | |
| 152 | // write index |
| 153 | for sa := x.sa; len(sa) > 0; { |
| 154 | n, err := writeSlice(w, buf, sa) |
| 155 | if err != nil { |
Robert Griesemer | 5ee7ef9 | 2011-09-20 14:36:19 -0700 | [diff] [blame] | 156 | return err |
| 157 | } |
Robert Griesemer | a7a7cc5 | 2011-09-30 11:31:28 -0700 | [diff] [blame] | 158 | sa = sa[n:] |
Robert Griesemer | bd80b11 | 2011-09-15 16:21:21 -0700 | [diff] [blame] | 159 | } |
| 160 | return nil |
| 161 | } |
| 162 | |
Robert Griesemer | 3fb6d62 | 2010-12-14 10:22:00 -0800 | [diff] [blame] | 163 | // Bytes returns the data over which the index was created. |
Robert Griesemer | 52c9fb6 | 2010-12-13 17:08:01 -0800 | [diff] [blame] | 164 | // It must not be modified. |
| 165 | // |
Robert Griesemer | 3fb6d62 | 2010-12-14 10:22:00 -0800 | [diff] [blame] | 166 | func (x *Index) Bytes() []byte { |
Robert Griesemer | 52c9fb6 | 2010-12-13 17:08:01 -0800 | [diff] [blame] | 167 | return x.data |
| 168 | } |
| 169 | |
Robert Griesemer | 22974fb | 2010-09-21 23:12:57 -0700 | [diff] [blame] | 170 | func (x *Index) at(i int) []byte { |
| 171 | return x.data[x.sa[i]:] |
| 172 | } |
| 173 | |
Eric Eisner | ab036ab | 2011-01-24 13:03:32 -0800 | [diff] [blame] | 174 | // lookupAll returns a slice into the matching region of the index. |
| 175 | // The runtime is O(log(N)*len(s)). |
Robert Griesemer | 7155771 | 2011-09-27 16:21:28 -0700 | [diff] [blame] | 176 | func (x *Index) lookupAll(s []byte) []int { |
Eric Eisner | ab036ab | 2011-01-24 13:03:32 -0800 | [diff] [blame] | 177 | // find matching suffix index range [i:j] |
| 178 | // find the first index where s would be the prefix |
| 179 | i := sort.Search(len(x.sa), func(i int) bool { return bytes.Compare(x.at(i), s) >= 0 }) |
| 180 | // starting at i, find the first index at which s is not a prefix |
| 181 | j := i + sort.Search(len(x.sa)-i, func(j int) bool { return !bytes.HasPrefix(x.at(j+i), s) }) |
| 182 | return x.sa[i:j] |
Robert Griesemer | 22974fb | 2010-09-21 23:12:57 -0700 | [diff] [blame] | 183 | } |
| 184 | |
Robert Griesemer | 22974fb | 2010-09-21 23:12:57 -0700 | [diff] [blame] | 185 | // Lookup returns an unsorted list of at most n indices where the byte string s |
| 186 | // occurs in the indexed data. If n < 0, all occurrences are returned. |
| 187 | // The result is nil if s is empty, s is not found, or n == 0. |
Eric Eisner | ab036ab | 2011-01-24 13:03:32 -0800 | [diff] [blame] | 188 | // Lookup time is O(log(N)*len(s) + len(result)) where N is the |
Robert Griesemer | 22974fb | 2010-09-21 23:12:57 -0700 | [diff] [blame] | 189 | // size of the indexed data. |
| 190 | // |
Robert Griesemer | 2662b99 | 2010-12-17 14:00:46 -0800 | [diff] [blame] | 191 | func (x *Index) Lookup(s []byte, n int) (result []int) { |
Robert Griesemer | 22974fb | 2010-09-21 23:12:57 -0700 | [diff] [blame] | 192 | if len(s) > 0 && n != 0 { |
Eric Eisner | ab036ab | 2011-01-24 13:03:32 -0800 | [diff] [blame] | 193 | matches := x.lookupAll(s) |
Robert Griesemer | bd80b11 | 2011-09-15 16:21:21 -0700 | [diff] [blame] | 194 | if n < 0 || len(matches) < n { |
Eric Eisner | ab036ab | 2011-01-24 13:03:32 -0800 | [diff] [blame] | 195 | n = len(matches) |
| 196 | } |
Robert Griesemer | bd80b11 | 2011-09-15 16:21:21 -0700 | [diff] [blame] | 197 | // 0 <= n <= len(matches) |
Eric Eisner | ab036ab | 2011-01-24 13:03:32 -0800 | [diff] [blame] | 198 | if n > 0 { |
| 199 | result = make([]int, n) |
Robert Griesemer | 7155771 | 2011-09-27 16:21:28 -0700 | [diff] [blame] | 200 | copy(result, matches) |
Robert Griesemer | 22974fb | 2010-09-21 23:12:57 -0700 | [diff] [blame] | 201 | } |
| 202 | } |
Robert Griesemer | 2662b99 | 2010-12-17 14:00:46 -0800 | [diff] [blame] | 203 | return |
| 204 | } |
Robert Griesemer | 22974fb | 2010-09-21 23:12:57 -0700 | [diff] [blame] | 205 | |
Robert Griesemer | 2662b99 | 2010-12-17 14:00:46 -0800 | [diff] [blame] | 206 | // FindAllIndex returns a sorted list of non-overlapping matches of the |
| 207 | // regular expression r, where a match is a pair of indices specifying |
| 208 | // the matched slice of x.Bytes(). If n < 0, all matches are returned |
| 209 | // in successive order. Otherwise, at most n matches are returned and |
| 210 | // they may not be successive. The result is nil if there are no matches, |
| 211 | // or if n == 0. |
| 212 | // |
| 213 | func (x *Index) FindAllIndex(r *regexp.Regexp, n int) (result [][]int) { |
| 214 | // a non-empty literal prefix is used to determine possible |
| 215 | // match start indices with Lookup |
| 216 | prefix, complete := r.LiteralPrefix() |
| 217 | lit := []byte(prefix) |
| 218 | |
| 219 | // worst-case scenario: no literal prefix |
| 220 | if prefix == "" { |
| 221 | return r.FindAllIndex(x.data, n) |
| 222 | } |
| 223 | |
| 224 | // if regexp is a literal just use Lookup and convert its |
| 225 | // result into match pairs |
| 226 | if complete { |
| 227 | // Lookup returns indices that may belong to overlapping matches. |
| 228 | // After eliminating them, we may end up with fewer than n matches. |
| 229 | // If we don't have enough at the end, redo the search with an |
| 230 | // increased value n1, but only if Lookup returned all the requested |
| 231 | // indices in the first place (if it returned fewer than that then |
| 232 | // there cannot be more). |
| 233 | for n1 := n; ; n1 += 2 * (n - len(result)) /* overflow ok */ { |
| 234 | indices := x.Lookup(lit, n1) |
| 235 | if len(indices) == 0 { |
| 236 | return |
| 237 | } |
Andrew Gerrand | 5bcbcab3 | 2011-07-08 10:52:50 +1000 | [diff] [blame] | 238 | sort.Ints(indices) |
Robert Griesemer | 2662b99 | 2010-12-17 14:00:46 -0800 | [diff] [blame] | 239 | pairs := make([]int, 2*len(indices)) |
| 240 | result = make([][]int, len(indices)) |
| 241 | count := 0 |
| 242 | prev := 0 |
| 243 | for _, i := range indices { |
| 244 | if count == n { |
| 245 | break |
| 246 | } |
| 247 | // ignore indices leading to overlapping matches |
| 248 | if prev <= i { |
| 249 | j := 2 * count |
| 250 | pairs[j+0] = i |
| 251 | pairs[j+1] = i + len(lit) |
| 252 | result[count] = pairs[j : j+2] |
| 253 | count++ |
| 254 | prev = i + len(lit) |
| 255 | } |
| 256 | } |
| 257 | result = result[0:count] |
| 258 | if len(result) >= n || len(indices) != n1 { |
| 259 | // found all matches or there's no chance to find more |
| 260 | // (n and n1 can be negative) |
| 261 | break |
| 262 | } |
| 263 | } |
| 264 | if len(result) == 0 { |
| 265 | result = nil |
| 266 | } |
| 267 | return |
| 268 | } |
| 269 | |
| 270 | // regexp has a non-empty literal prefix; Lookup(lit) computes |
| 271 | // the indices of possible complete matches; use these as starting |
| 272 | // points for anchored searches |
| 273 | // (regexp "^" matches beginning of input, not beginning of line) |
| 274 | r = regexp.MustCompile("^" + r.String()) // compiles because r compiled |
| 275 | |
| 276 | // same comment about Lookup applies here as in the loop above |
| 277 | for n1 := n; ; n1 += 2 * (n - len(result)) /* overflow ok */ { |
| 278 | indices := x.Lookup(lit, n1) |
| 279 | if len(indices) == 0 { |
| 280 | return |
| 281 | } |
Andrew Gerrand | 5bcbcab3 | 2011-07-08 10:52:50 +1000 | [diff] [blame] | 282 | sort.Ints(indices) |
Robert Griesemer | 2662b99 | 2010-12-17 14:00:46 -0800 | [diff] [blame] | 283 | result = result[0:0] |
| 284 | prev := 0 |
| 285 | for _, i := range indices { |
| 286 | if len(result) == n { |
| 287 | break |
| 288 | } |
| 289 | m := r.FindIndex(x.data[i:]) // anchored search - will not run off |
| 290 | // ignore indices leading to overlapping matches |
| 291 | if m != nil && prev <= i { |
| 292 | m[0] = i // correct m |
| 293 | m[1] += i |
| 294 | result = append(result, m) |
| 295 | prev = m[1] |
| 296 | } |
| 297 | } |
| 298 | if len(result) >= n || len(indices) != n1 { |
| 299 | // found all matches or there's no chance to find more |
| 300 | // (n and n1 can be negative) |
| 301 | break |
| 302 | } |
| 303 | } |
| 304 | if len(result) == 0 { |
| 305 | result = nil |
| 306 | } |
| 307 | return |
Robert Griesemer | 22974fb | 2010-09-21 23:12:57 -0700 | [diff] [blame] | 308 | } |