suffixarray: faster creation algorithm
This implements the algorithm qsufsort using the sort package
as a sorting primitive. Its worst-case performance is O(N*log(N)), and it
uses only an additional slice of N ints of memory during creation.
Benchmarks (seconds):
old new
10k nulls 149 0.044
1M English corpus 32.0 3.6
R=gri, gri1
CC=golang-dev
https://golang.org/cl/3752044
diff --git a/src/pkg/index/suffixarray/suffixarray.go b/src/pkg/index/suffixarray/suffixarray.go
index 88cf925..628e000 100644
--- a/src/pkg/index/suffixarray/suffixarray.go
+++ b/src/pkg/index/suffixarray/suffixarray.go
@@ -22,11 +22,6 @@
"sort"
)
-// BUG(gri): For larger data (10MB) which contains very long (say 100000)
-// contiguous sequences of identical bytes, index creation time will be extremely slow.
-
-// TODO(gri): Use a more sophisticated algorithm to create the suffix array.
-
// Index implements a suffix array for fast substring search.
type Index struct {
@@ -36,16 +31,9 @@
// New creates a new Index for data.
-// Index creation time is approximately O(N*log(N)) for N = len(data).
-//
+// Index creation time is O(N*log(N)) for N = len(data).
func New(data []byte) *Index {
- sa := make([]int, len(data))
- for i := range sa {
- sa[i] = i
- }
- x := &Index{data, sa}
- sort.Sort((*index)(x))
- return x
+ return &Index{data, qsufsort(data)}
}
@@ -192,12 +180,3 @@
}
return
}
-
-
-// index is used to hide the sort.Interface
-type index Index
-
-func (x *index) Len() int { return len(x.sa) }
-func (x *index) Less(i, j int) bool { return bytes.Compare(x.at(i), x.at(j)) < 0 }
-func (x *index) Swap(i, j int) { x.sa[i], x.sa[j] = x.sa[j], x.sa[i] }
-func (a *index) at(i int) []byte { return a.data[a.sa[i]:] }