blob: 0a8d9e2cb8b770826eea756c32d211b35ded016b [file] [log] [blame]
Robert Griesemer22974fb2010-09-21 23:12:57 -07001// Copyright 2010 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// The suffixarray package implements substring search in logarithmic time
6// using an in-memory suffix array.
7//
8// Example use:
9//
10// // create index for some data
11// index := suffixarray.New(data)
12//
13// // lookup byte slice s
14// offsets1 := index.Lookup(s, -1) // the list of all indices where s occurs in data
15// offsets2 := index.Lookup(s, 3) // the list of at most 3 indices where s occurs in data
16//
17package suffixarray
18
19import (
20 "bytes"
21 "container/vector"
22 "sort"
23)
24
25// BUG(gri): For larger data (10MB) which contains very long (say 100000)
26// contiguous sequences of identical bytes, index creation time will be extremely slow.
27
28// TODO(gri): Use a more sophisticated algorithm to create the suffix array.
29
30
31// Index implements a suffix array for fast substring search.
32type Index struct {
33 data []byte
34 sa []int // suffix array for data
35}
36
37
38// New creates a new Index for data.
39// Index creation time is approximately O(N*log(N)) for N = len(data).
40//
41func New(data []byte) *Index {
42 sa := make([]int, len(data))
Ryan Hitchman062406b2010-12-08 21:36:56 -080043 for i := range sa {
Robert Griesemer22974fb2010-09-21 23:12:57 -070044 sa[i] = i
45 }
Robert Griesemer34874952010-09-22 11:03:57 -070046 x := &Index{data, sa}
47 sort.Sort((*index)(x))
48 return x
Robert Griesemer22974fb2010-09-21 23:12:57 -070049}
50
51
Robert Griesemer52c9fb62010-12-13 17:08:01 -080052// Data returns the data over which the index was created.
53// It must not be modified.
54//
55func (x *Index) Data() []byte {
56 return x.data
57}
58
59
Robert Griesemer22974fb2010-09-21 23:12:57 -070060func (x *Index) at(i int) []byte {
61 return x.data[x.sa[i]:]
62}
63
64
Robert Griesemer22974fb2010-09-21 23:12:57 -070065func (x *Index) search(s []byte) int {
Russ Coxb51f0c52010-11-19 16:04:25 -050066 return sort.Search(len(x.sa), func(i int) bool { return bytes.Compare(x.at(i), s) >= 0 })
Robert Griesemer22974fb2010-09-21 23:12:57 -070067}
68
69
70// Lookup returns an unsorted list of at most n indices where the byte string s
71// occurs in the indexed data. If n < 0, all occurrences are returned.
72// The result is nil if s is empty, s is not found, or n == 0.
Robert Griesemer34874952010-09-22 11:03:57 -070073// Lookup time is O((log(N) + len(result))*len(s)) where N is the
Robert Griesemer22974fb2010-09-21 23:12:57 -070074// size of the indexed data.
75//
76func (x *Index) Lookup(s []byte, n int) []int {
77 var res vector.IntVector
78
79 if len(s) > 0 && n != 0 {
80 // find matching suffix index i
81 i := x.search(s)
Russ Coxb51f0c52010-11-19 16:04:25 -050082 // x.at(i-1) < s <= x.at(i)
Robert Griesemer22974fb2010-09-21 23:12:57 -070083
84 // collect the following suffixes with matching prefixes
85 for (n < 0 || len(res) < n) && i < len(x.sa) && bytes.HasPrefix(x.at(i), s) {
86 res.Push(x.sa[i])
87 i++
88 }
89 }
90
91 return res
92}
93
94
Robert Griesemer34874952010-09-22 11:03:57 -070095// index is used to hide the sort.Interface
96type index Index
Robert Griesemer22974fb2010-09-21 23:12:57 -070097
98func (x *index) Len() int { return len(x.sa) }
99func (x *index) Less(i, j int) bool { return bytes.Compare(x.at(i), x.at(j)) < 0 }
100func (x *index) Swap(i, j int) { x.sa[i], x.sa[j] = x.sa[j], x.sa[i] }
101func (a *index) at(i int) []byte { return a.data[a.sa[i]:] }