blob: 644f00c75771c6eb3d21ff9aa5c9b3009a28f845 [file] [log] [blame]
Robert Griesemer22974fb2010-09-21 23:12:57 -07001// Copyright 2010 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package suffixarray
6
7import (
Eric Eisnere3c95652011-01-11 21:46:50 -08008 "bytes"
Rob Pike30aa7012011-11-08 15:40:58 -08009 "math/rand"
Russ Cox6c230fb2011-09-26 18:33:13 -040010 "regexp"
Robert Griesemer22974fb2010-09-21 23:12:57 -070011 "sort"
12 "strings"
13 "testing"
14)
15
Robert Griesemer22974fb2010-09-21 23:12:57 -070016type testCase struct {
Robert Griesemer2662b992010-12-17 14:00:46 -080017 name string // name of test case
18 source string // source to index
19 patterns []string // patterns to lookup
Robert Griesemer22974fb2010-09-21 23:12:57 -070020}
21
Robert Griesemer22974fb2010-09-21 23:12:57 -070022var testCases = []testCase{
Robert Griesemer34788912010-10-22 10:06:33 -070023 {
Robert Griesemer22974fb2010-09-21 23:12:57 -070024 "empty string",
25 "",
26 []string{
27 "",
28 "foo",
Robert Griesemer2662b992010-12-17 14:00:46 -080029 "(foo)",
30 ".*",
31 "a*",
Robert Griesemer22974fb2010-09-21 23:12:57 -070032 },
33 },
34
Robert Griesemer34788912010-10-22 10:06:33 -070035 {
Robert Griesemer22974fb2010-09-21 23:12:57 -070036 "all a's",
37 "aaaaaaaaaa", // 10 a's
38 []string{
39 "",
40 "a",
41 "aa",
42 "aaa",
43 "aaaa",
44 "aaaaa",
45 "aaaaaa",
46 "aaaaaaa",
47 "aaaaaaaa",
48 "aaaaaaaaa",
49 "aaaaaaaaaa",
50 "aaaaaaaaaaa", // 11 a's
Robert Griesemer2662b992010-12-17 14:00:46 -080051 ".",
52 ".*",
53 "a+",
54 "aa+",
55 "aaaa[b]?",
56 "aaa*",
Robert Griesemer22974fb2010-09-21 23:12:57 -070057 },
58 },
59
Robert Griesemer34788912010-10-22 10:06:33 -070060 {
Robert Griesemer22974fb2010-09-21 23:12:57 -070061 "abc",
62 "abc",
63 []string{
64 "a",
65 "b",
66 "c",
67 "ab",
68 "bc",
69 "abc",
Robert Griesemer2662b992010-12-17 14:00:46 -080070 "a.c",
71 "a(b|c)",
72 "abc?",
Robert Griesemer22974fb2010-09-21 23:12:57 -070073 },
74 },
75
Robert Griesemer34788912010-10-22 10:06:33 -070076 {
Robert Griesemer22974fb2010-09-21 23:12:57 -070077 "barbara*3",
78 "barbarabarbarabarbara",
79 []string{
80 "a",
81 "bar",
82 "rab",
83 "arab",
84 "barbar",
Robert Griesemer2662b992010-12-17 14:00:46 -080085 "bara?bar",
Robert Griesemer22974fb2010-09-21 23:12:57 -070086 },
87 },
88
Robert Griesemer34788912010-10-22 10:06:33 -070089 {
Robert Griesemer22974fb2010-09-21 23:12:57 -070090 "typing drill",
91 "Now is the time for all good men to come to the aid of their country.",
92 []string{
93 "Now",
94 "the time",
95 "to come the aid",
96 "is the time for all good men to come to the aid of their",
Robert Griesemer2662b992010-12-17 14:00:46 -080097 "to (come|the)?",
Robert Griesemer22974fb2010-09-21 23:12:57 -070098 },
99 },
Eric Eisnerd93b2f32011-01-31 13:13:02 -0800100
101 {
102 "godoc simulation",
103 "package main\n\nimport(\n \"rand\"\n ",
104 []string{},
105 },
Robert Griesemer22974fb2010-09-21 23:12:57 -0700106}
107
Robert Hencke3fbd4782011-05-30 18:02:59 +1000108// find all occurrences of s in source; report at most n occurrences
Robert Griesemer22974fb2010-09-21 23:12:57 -0700109func find(src, s string, n int) []int {
Kyle Consalus476150f2011-08-08 14:32:37 -0700110 var res []int
Robert Griesemer22974fb2010-09-21 23:12:57 -0700111 if s != "" && n != 0 {
Robert Griesemer81847782011-01-04 13:16:50 -0800112 // find at most n occurrences of s in src
Robert Griesemer22974fb2010-09-21 23:12:57 -0700113 for i := -1; n < 0 || len(res) < n; {
114 j := strings.Index(src[i+1:], s)
115 if j < 0 {
116 break
117 }
118 i += j + 1
Kyle Consalus476150f2011-08-08 14:32:37 -0700119 res = append(res, i)
Robert Griesemer22974fb2010-09-21 23:12:57 -0700120 }
121 }
122 return res
123}
124
Robert Griesemer2662b992010-12-17 14:00:46 -0800125func testLookup(t *testing.T, tc *testCase, x *Index, s string, n int) {
126 res := x.Lookup([]byte(s), n)
127 exp := find(tc.source, s, n)
Robert Griesemer22974fb2010-09-21 23:12:57 -0700128
Robert Griesemer2662b992010-12-17 14:00:46 -0800129 // check that the lengths match
130 if len(res) != len(exp) {
131 t.Errorf("test %q, lookup %q (n = %d): expected %d results; got %d", tc.name, s, n, len(exp), len(res))
132 }
133
134 // if n >= 0 the number of results is limited --- unless n >= all results,
135 // we may obtain different positions from the Index and from find (because
136 // Index may not find the results in the same order as find) => in general
137 // we cannot simply check that the res and exp lists are equal
138
139 // check that each result is in fact a correct match and there are no duplicates
Andrew Gerrand5bcbcab32011-07-08 10:52:50 +1000140 sort.Ints(res)
Robert Griesemer2662b992010-12-17 14:00:46 -0800141 for i, r := range res {
142 if r < 0 || len(tc.source) <= r {
143 t.Errorf("test %q, lookup %q, result %d (n = %d): index %d out of range [0, %d[", tc.name, s, i, n, r, len(tc.source))
144 } else if !strings.HasPrefix(tc.source[r:], s) {
145 t.Errorf("test %q, lookup %q, result %d (n = %d): index %d not a match", tc.name, s, i, n, r)
Robert Griesemer22974fb2010-09-21 23:12:57 -0700146 }
Robert Griesemer2662b992010-12-17 14:00:46 -0800147 if i > 0 && res[i-1] == r {
148 t.Errorf("test %q, lookup %q, result %d (n = %d): found duplicate index %d", tc.name, s, i, n, r)
149 }
150 }
Robert Griesemer22974fb2010-09-21 23:12:57 -0700151
Robert Griesemer2662b992010-12-17 14:00:46 -0800152 if n < 0 {
153 // all results computed - sorted res and exp must be equal
Robert Griesemer22974fb2010-09-21 23:12:57 -0700154 for i, r := range res {
Robert Griesemer2662b992010-12-17 14:00:46 -0800155 e := exp[i]
156 if r != e {
157 t.Errorf("test %q, lookup %q, result %d: expected index %d; got %d", tc.name, s, i, e, r)
Robert Griesemer22974fb2010-09-21 23:12:57 -0700158 }
159 }
Robert Griesemer2662b992010-12-17 14:00:46 -0800160 }
161}
Robert Griesemer22974fb2010-09-21 23:12:57 -0700162
Robert Griesemer2662b992010-12-17 14:00:46 -0800163func testFindAllIndex(t *testing.T, tc *testCase, x *Index, rx *regexp.Regexp, n int) {
164 res := x.FindAllIndex(rx, n)
165 exp := rx.FindAllStringIndex(tc.source, n)
166
167 // check that the lengths match
168 if len(res) != len(exp) {
169 t.Errorf("test %q, FindAllIndex %q (n = %d): expected %d results; got %d", tc.name, rx, n, len(exp), len(res))
170 }
171
172 // if n >= 0 the number of results is limited --- unless n >= all results,
173 // we may obtain different positions from the Index and from regexp (because
174 // Index may not find the results in the same order as regexp) => in general
175 // we cannot simply check that the res and exp lists are equal
176
177 // check that each result is in fact a correct match and the result is sorted
178 for i, r := range res {
179 if r[0] < 0 || r[0] > r[1] || len(tc.source) < r[1] {
180 t.Errorf("test %q, FindAllIndex %q, result %d (n == %d): illegal match [%d, %d]", tc.name, rx, i, n, r[0], r[1])
181 } else if !rx.MatchString(tc.source[r[0]:r[1]]) {
182 t.Errorf("test %q, FindAllIndex %q, result %d (n = %d): [%d, %d] not a match", tc.name, rx, i, n, r[0], r[1])
183 }
184 }
185
186 if n < 0 {
187 // all results computed - sorted res and exp must be equal
Robert Griesemer22974fb2010-09-21 23:12:57 -0700188 for i, r := range res {
Robert Griesemer2662b992010-12-17 14:00:46 -0800189 e := exp[i]
190 if r[0] != e[0] || r[1] != e[1] {
191 t.Errorf("test %q, FindAllIndex %q, result %d: expected match [%d, %d]; got [%d, %d]",
192 tc.name, rx, i, e[0], e[1], r[0], r[1])
Robert Griesemer22974fb2010-09-21 23:12:57 -0700193 }
194 }
Robert Griesemer2662b992010-12-17 14:00:46 -0800195 }
196}
Robert Griesemer22974fb2010-09-21 23:12:57 -0700197
Robert Griesemer2662b992010-12-17 14:00:46 -0800198func testLookups(t *testing.T, tc *testCase, x *Index, n int) {
199 for _, pat := range tc.patterns {
200 testLookup(t, tc, x, pat, n)
201 if rx, err := regexp.Compile(pat); err == nil {
202 testFindAllIndex(t, tc, x, rx, n)
Robert Griesemer22974fb2010-09-21 23:12:57 -0700203 }
204 }
205}
206
Eric Eisnere3c95652011-01-11 21:46:50 -0800207// index is used to hide the sort.Interface
208type index Index
209
210func (x *index) Len() int { return len(x.sa) }
211func (x *index) Less(i, j int) bool { return bytes.Compare(x.at(i), x.at(j)) < 0 }
212func (x *index) Swap(i, j int) { x.sa[i], x.sa[j] = x.sa[j], x.sa[i] }
213func (a *index) at(i int) []byte { return a.data[a.sa[i]:] }
214
Eric Eisnere3c95652011-01-11 21:46:50 -0800215func testConstruction(t *testing.T, tc *testCase, x *Index) {
216 if !sort.IsSorted((*index)(x)) {
Robert Griesemerbd80b112011-09-15 16:21:21 -0700217 t.Errorf("failed testConstruction %s", tc.name)
218 }
219}
220
221func equal(x, y *Index) bool {
222 if !bytes.Equal(x.data, y.data) {
223 return false
224 }
225 for i, j := range x.sa {
226 if j != y.sa[i] {
227 return false
228 }
229 }
230 return true
231}
232
Robert Griesemera7a7cc52011-09-30 11:31:28 -0700233// returns the serialized index size
234func testSaveRestore(t *testing.T, tc *testCase, x *Index) int {
Robert Griesemerbd80b112011-09-15 16:21:21 -0700235 var buf bytes.Buffer
236 if err := x.Write(&buf); err != nil {
237 t.Errorf("failed writing index %s (%s)", tc.name, err)
238 }
Robert Griesemera7a7cc52011-09-30 11:31:28 -0700239 size := buf.Len()
Robert Griesemerbd80b112011-09-15 16:21:21 -0700240 var y Index
241 if err := y.Read(&buf); err != nil {
242 t.Errorf("failed reading index %s (%s)", tc.name, err)
243 }
244 if !equal(x, &y) {
245 t.Errorf("restored index doesn't match saved index %s", tc.name)
Eric Eisnere3c95652011-01-11 21:46:50 -0800246 }
Robert Griesemera7a7cc52011-09-30 11:31:28 -0700247 return size
Eric Eisnere3c95652011-01-11 21:46:50 -0800248}
249
Robert Griesemer22974fb2010-09-21 23:12:57 -0700250func TestIndex(t *testing.T) {
251 for _, tc := range testCases {
252 x := New([]byte(tc.source))
Eric Eisnere3c95652011-01-11 21:46:50 -0800253 testConstruction(t, &tc, x)
Robert Griesemerbd80b112011-09-15 16:21:21 -0700254 testSaveRestore(t, &tc, x)
Robert Griesemer2662b992010-12-17 14:00:46 -0800255 testLookups(t, &tc, x, 0)
256 testLookups(t, &tc, x, 1)
257 testLookups(t, &tc, x, 10)
258 testLookups(t, &tc, x, 2e9)
259 testLookups(t, &tc, x, -1)
Robert Griesemer22974fb2010-09-21 23:12:57 -0700260 }
261}
Robert Griesemer5ee7ef92011-09-20 14:36:19 -0700262
Eric Eisner481e6192011-09-23 09:18:10 -0700263// Of all possible inputs, the random bytes have the least amount of substring
264// repetition, and the repeated bytes have the most. For most algorithms,
265// the running time of every input will be between these two.
266func benchmarkNew(b *testing.B, random bool) {
267 b.StopTimer()
268 data := make([]byte, 1e6)
269 if random {
270 for i := range data {
271 data[i] = byte(rand.Intn(256))
272 }
273 }
274 b.StartTimer()
275 for i := 0; i < b.N; i++ {
276 New(data)
277 }
278}
279
280func BenchmarkNewIndexRandom(b *testing.B) {
281 benchmarkNew(b, true)
282}
283func BenchmarkNewIndexRepeat(b *testing.B) {
284 benchmarkNew(b, false)
285}
286
Robert Griesemer5ee7ef92011-09-20 14:36:19 -0700287func BenchmarkSaveRestore(b *testing.B) {
288 b.StopTimer()
289 r := rand.New(rand.NewSource(0x5a77a1)) // guarantee always same sequence
Dmitriy Vyukovb0c586a2014-06-24 20:37:28 -0700290 data := make([]byte, 1<<20) // 1MB of data to index
Robert Griesemer5ee7ef92011-09-20 14:36:19 -0700291 for i := range data {
292 data[i] = byte(r.Intn(256))
293 }
294 x := New(data)
Robert Griesemera7a7cc52011-09-30 11:31:28 -0700295 size := testSaveRestore(nil, nil, x) // verify correctness
296 buf := bytes.NewBuffer(make([]byte, size)) // avoid growing
297 b.SetBytes(int64(size))
Robert Griesemer5ee7ef92011-09-20 14:36:19 -0700298 b.StartTimer()
299 for i := 0; i < b.N; i++ {
300 x.Write(buf)
301 var y Index
302 y.Read(buf)
303 }
304}