Robert Griesemer | 22974fb | 2010-09-21 23:12:57 -0700 | [diff] [blame] | 1 | // Copyright 2010 The Go Authors. All rights reserved. |
| 2 | // Use of this source code is governed by a BSD-style |
| 3 | // license that can be found in the LICENSE file. |
| 4 | |
| 5 | package suffixarray |
| 6 | |
| 7 | import ( |
Eric Eisner | e3c9565 | 2011-01-11 21:46:50 -0800 | [diff] [blame] | 8 | "bytes" |
Rob Pike | 30aa701 | 2011-11-08 15:40:58 -0800 | [diff] [blame] | 9 | "math/rand" |
Russ Cox | 6c230fb | 2011-09-26 18:33:13 -0400 | [diff] [blame] | 10 | "regexp" |
Robert Griesemer | 22974fb | 2010-09-21 23:12:57 -0700 | [diff] [blame] | 11 | "sort" |
| 12 | "strings" |
| 13 | "testing" |
| 14 | ) |
| 15 | |
Robert Griesemer | 22974fb | 2010-09-21 23:12:57 -0700 | [diff] [blame] | 16 | type testCase struct { |
Robert Griesemer | 2662b99 | 2010-12-17 14:00:46 -0800 | [diff] [blame] | 17 | name string // name of test case |
| 18 | source string // source to index |
| 19 | patterns []string // patterns to lookup |
Robert Griesemer | 22974fb | 2010-09-21 23:12:57 -0700 | [diff] [blame] | 20 | } |
| 21 | |
Robert Griesemer | 22974fb | 2010-09-21 23:12:57 -0700 | [diff] [blame] | 22 | var testCases = []testCase{ |
Robert Griesemer | 3478891 | 2010-10-22 10:06:33 -0700 | [diff] [blame] | 23 | { |
Robert Griesemer | 22974fb | 2010-09-21 23:12:57 -0700 | [diff] [blame] | 24 | "empty string", |
| 25 | "", |
| 26 | []string{ |
| 27 | "", |
| 28 | "foo", |
Robert Griesemer | 2662b99 | 2010-12-17 14:00:46 -0800 | [diff] [blame] | 29 | "(foo)", |
| 30 | ".*", |
| 31 | "a*", |
Robert Griesemer | 22974fb | 2010-09-21 23:12:57 -0700 | [diff] [blame] | 32 | }, |
| 33 | }, |
| 34 | |
Robert Griesemer | 3478891 | 2010-10-22 10:06:33 -0700 | [diff] [blame] | 35 | { |
Robert Griesemer | 22974fb | 2010-09-21 23:12:57 -0700 | [diff] [blame] | 36 | "all a's", |
| 37 | "aaaaaaaaaa", // 10 a's |
| 38 | []string{ |
| 39 | "", |
| 40 | "a", |
| 41 | "aa", |
| 42 | "aaa", |
| 43 | "aaaa", |
| 44 | "aaaaa", |
| 45 | "aaaaaa", |
| 46 | "aaaaaaa", |
| 47 | "aaaaaaaa", |
| 48 | "aaaaaaaaa", |
| 49 | "aaaaaaaaaa", |
| 50 | "aaaaaaaaaaa", // 11 a's |
Robert Griesemer | 2662b99 | 2010-12-17 14:00:46 -0800 | [diff] [blame] | 51 | ".", |
| 52 | ".*", |
| 53 | "a+", |
| 54 | "aa+", |
| 55 | "aaaa[b]?", |
| 56 | "aaa*", |
Robert Griesemer | 22974fb | 2010-09-21 23:12:57 -0700 | [diff] [blame] | 57 | }, |
| 58 | }, |
| 59 | |
Robert Griesemer | 3478891 | 2010-10-22 10:06:33 -0700 | [diff] [blame] | 60 | { |
Robert Griesemer | 22974fb | 2010-09-21 23:12:57 -0700 | [diff] [blame] | 61 | "abc", |
| 62 | "abc", |
| 63 | []string{ |
| 64 | "a", |
| 65 | "b", |
| 66 | "c", |
| 67 | "ab", |
| 68 | "bc", |
| 69 | "abc", |
Robert Griesemer | 2662b99 | 2010-12-17 14:00:46 -0800 | [diff] [blame] | 70 | "a.c", |
| 71 | "a(b|c)", |
| 72 | "abc?", |
Robert Griesemer | 22974fb | 2010-09-21 23:12:57 -0700 | [diff] [blame] | 73 | }, |
| 74 | }, |
| 75 | |
Robert Griesemer | 3478891 | 2010-10-22 10:06:33 -0700 | [diff] [blame] | 76 | { |
Robert Griesemer | 22974fb | 2010-09-21 23:12:57 -0700 | [diff] [blame] | 77 | "barbara*3", |
| 78 | "barbarabarbarabarbara", |
| 79 | []string{ |
| 80 | "a", |
| 81 | "bar", |
| 82 | "rab", |
| 83 | "arab", |
| 84 | "barbar", |
Robert Griesemer | 2662b99 | 2010-12-17 14:00:46 -0800 | [diff] [blame] | 85 | "bara?bar", |
Robert Griesemer | 22974fb | 2010-09-21 23:12:57 -0700 | [diff] [blame] | 86 | }, |
| 87 | }, |
| 88 | |
Robert Griesemer | 3478891 | 2010-10-22 10:06:33 -0700 | [diff] [blame] | 89 | { |
Robert Griesemer | 22974fb | 2010-09-21 23:12:57 -0700 | [diff] [blame] | 90 | "typing drill", |
| 91 | "Now is the time for all good men to come to the aid of their country.", |
| 92 | []string{ |
| 93 | "Now", |
| 94 | "the time", |
| 95 | "to come the aid", |
| 96 | "is the time for all good men to come to the aid of their", |
Robert Griesemer | 2662b99 | 2010-12-17 14:00:46 -0800 | [diff] [blame] | 97 | "to (come|the)?", |
Robert Griesemer | 22974fb | 2010-09-21 23:12:57 -0700 | [diff] [blame] | 98 | }, |
| 99 | }, |
Eric Eisner | d93b2f3 | 2011-01-31 13:13:02 -0800 | [diff] [blame] | 100 | |
| 101 | { |
| 102 | "godoc simulation", |
| 103 | "package main\n\nimport(\n \"rand\"\n ", |
| 104 | []string{}, |
| 105 | }, |
Robert Griesemer | 22974fb | 2010-09-21 23:12:57 -0700 | [diff] [blame] | 106 | } |
| 107 | |
Robert Hencke | 3fbd478 | 2011-05-30 18:02:59 +1000 | [diff] [blame] | 108 | // find all occurrences of s in source; report at most n occurrences |
Robert Griesemer | 22974fb | 2010-09-21 23:12:57 -0700 | [diff] [blame] | 109 | func find(src, s string, n int) []int { |
Kyle Consalus | 476150f | 2011-08-08 14:32:37 -0700 | [diff] [blame] | 110 | var res []int |
Robert Griesemer | 22974fb | 2010-09-21 23:12:57 -0700 | [diff] [blame] | 111 | if s != "" && n != 0 { |
Robert Griesemer | 8184778 | 2011-01-04 13:16:50 -0800 | [diff] [blame] | 112 | // find at most n occurrences of s in src |
Robert Griesemer | 22974fb | 2010-09-21 23:12:57 -0700 | [diff] [blame] | 113 | for i := -1; n < 0 || len(res) < n; { |
| 114 | j := strings.Index(src[i+1:], s) |
| 115 | if j < 0 { |
| 116 | break |
| 117 | } |
| 118 | i += j + 1 |
Kyle Consalus | 476150f | 2011-08-08 14:32:37 -0700 | [diff] [blame] | 119 | res = append(res, i) |
Robert Griesemer | 22974fb | 2010-09-21 23:12:57 -0700 | [diff] [blame] | 120 | } |
| 121 | } |
| 122 | return res |
| 123 | } |
| 124 | |
Robert Griesemer | 2662b99 | 2010-12-17 14:00:46 -0800 | [diff] [blame] | 125 | func testLookup(t *testing.T, tc *testCase, x *Index, s string, n int) { |
| 126 | res := x.Lookup([]byte(s), n) |
| 127 | exp := find(tc.source, s, n) |
Robert Griesemer | 22974fb | 2010-09-21 23:12:57 -0700 | [diff] [blame] | 128 | |
Robert Griesemer | 2662b99 | 2010-12-17 14:00:46 -0800 | [diff] [blame] | 129 | // check that the lengths match |
| 130 | if len(res) != len(exp) { |
| 131 | t.Errorf("test %q, lookup %q (n = %d): expected %d results; got %d", tc.name, s, n, len(exp), len(res)) |
| 132 | } |
| 133 | |
| 134 | // if n >= 0 the number of results is limited --- unless n >= all results, |
| 135 | // we may obtain different positions from the Index and from find (because |
| 136 | // Index may not find the results in the same order as find) => in general |
| 137 | // we cannot simply check that the res and exp lists are equal |
| 138 | |
| 139 | // check that each result is in fact a correct match and there are no duplicates |
Andrew Gerrand | 5bcbcab3 | 2011-07-08 10:52:50 +1000 | [diff] [blame] | 140 | sort.Ints(res) |
Robert Griesemer | 2662b99 | 2010-12-17 14:00:46 -0800 | [diff] [blame] | 141 | for i, r := range res { |
| 142 | if r < 0 || len(tc.source) <= r { |
| 143 | t.Errorf("test %q, lookup %q, result %d (n = %d): index %d out of range [0, %d[", tc.name, s, i, n, r, len(tc.source)) |
| 144 | } else if !strings.HasPrefix(tc.source[r:], s) { |
| 145 | t.Errorf("test %q, lookup %q, result %d (n = %d): index %d not a match", tc.name, s, i, n, r) |
Robert Griesemer | 22974fb | 2010-09-21 23:12:57 -0700 | [diff] [blame] | 146 | } |
Robert Griesemer | 2662b99 | 2010-12-17 14:00:46 -0800 | [diff] [blame] | 147 | if i > 0 && res[i-1] == r { |
| 148 | t.Errorf("test %q, lookup %q, result %d (n = %d): found duplicate index %d", tc.name, s, i, n, r) |
| 149 | } |
| 150 | } |
Robert Griesemer | 22974fb | 2010-09-21 23:12:57 -0700 | [diff] [blame] | 151 | |
Robert Griesemer | 2662b99 | 2010-12-17 14:00:46 -0800 | [diff] [blame] | 152 | if n < 0 { |
| 153 | // all results computed - sorted res and exp must be equal |
Robert Griesemer | 22974fb | 2010-09-21 23:12:57 -0700 | [diff] [blame] | 154 | for i, r := range res { |
Robert Griesemer | 2662b99 | 2010-12-17 14:00:46 -0800 | [diff] [blame] | 155 | e := exp[i] |
| 156 | if r != e { |
| 157 | t.Errorf("test %q, lookup %q, result %d: expected index %d; got %d", tc.name, s, i, e, r) |
Robert Griesemer | 22974fb | 2010-09-21 23:12:57 -0700 | [diff] [blame] | 158 | } |
| 159 | } |
Robert Griesemer | 2662b99 | 2010-12-17 14:00:46 -0800 | [diff] [blame] | 160 | } |
| 161 | } |
Robert Griesemer | 22974fb | 2010-09-21 23:12:57 -0700 | [diff] [blame] | 162 | |
Robert Griesemer | 2662b99 | 2010-12-17 14:00:46 -0800 | [diff] [blame] | 163 | func testFindAllIndex(t *testing.T, tc *testCase, x *Index, rx *regexp.Regexp, n int) { |
| 164 | res := x.FindAllIndex(rx, n) |
| 165 | exp := rx.FindAllStringIndex(tc.source, n) |
| 166 | |
| 167 | // check that the lengths match |
| 168 | if len(res) != len(exp) { |
| 169 | t.Errorf("test %q, FindAllIndex %q (n = %d): expected %d results; got %d", tc.name, rx, n, len(exp), len(res)) |
| 170 | } |
| 171 | |
| 172 | // if n >= 0 the number of results is limited --- unless n >= all results, |
| 173 | // we may obtain different positions from the Index and from regexp (because |
| 174 | // Index may not find the results in the same order as regexp) => in general |
| 175 | // we cannot simply check that the res and exp lists are equal |
| 176 | |
| 177 | // check that each result is in fact a correct match and the result is sorted |
| 178 | for i, r := range res { |
| 179 | if r[0] < 0 || r[0] > r[1] || len(tc.source) < r[1] { |
| 180 | t.Errorf("test %q, FindAllIndex %q, result %d (n == %d): illegal match [%d, %d]", tc.name, rx, i, n, r[0], r[1]) |
| 181 | } else if !rx.MatchString(tc.source[r[0]:r[1]]) { |
| 182 | t.Errorf("test %q, FindAllIndex %q, result %d (n = %d): [%d, %d] not a match", tc.name, rx, i, n, r[0], r[1]) |
| 183 | } |
| 184 | } |
| 185 | |
| 186 | if n < 0 { |
| 187 | // all results computed - sorted res and exp must be equal |
Robert Griesemer | 22974fb | 2010-09-21 23:12:57 -0700 | [diff] [blame] | 188 | for i, r := range res { |
Robert Griesemer | 2662b99 | 2010-12-17 14:00:46 -0800 | [diff] [blame] | 189 | e := exp[i] |
| 190 | if r[0] != e[0] || r[1] != e[1] { |
| 191 | t.Errorf("test %q, FindAllIndex %q, result %d: expected match [%d, %d]; got [%d, %d]", |
| 192 | tc.name, rx, i, e[0], e[1], r[0], r[1]) |
Robert Griesemer | 22974fb | 2010-09-21 23:12:57 -0700 | [diff] [blame] | 193 | } |
| 194 | } |
Robert Griesemer | 2662b99 | 2010-12-17 14:00:46 -0800 | [diff] [blame] | 195 | } |
| 196 | } |
Robert Griesemer | 22974fb | 2010-09-21 23:12:57 -0700 | [diff] [blame] | 197 | |
Robert Griesemer | 2662b99 | 2010-12-17 14:00:46 -0800 | [diff] [blame] | 198 | func testLookups(t *testing.T, tc *testCase, x *Index, n int) { |
| 199 | for _, pat := range tc.patterns { |
| 200 | testLookup(t, tc, x, pat, n) |
| 201 | if rx, err := regexp.Compile(pat); err == nil { |
| 202 | testFindAllIndex(t, tc, x, rx, n) |
Robert Griesemer | 22974fb | 2010-09-21 23:12:57 -0700 | [diff] [blame] | 203 | } |
| 204 | } |
| 205 | } |
| 206 | |
Eric Eisner | e3c9565 | 2011-01-11 21:46:50 -0800 | [diff] [blame] | 207 | // index is used to hide the sort.Interface |
| 208 | type index Index |
| 209 | |
| 210 | func (x *index) Len() int { return len(x.sa) } |
| 211 | func (x *index) Less(i, j int) bool { return bytes.Compare(x.at(i), x.at(j)) < 0 } |
| 212 | func (x *index) Swap(i, j int) { x.sa[i], x.sa[j] = x.sa[j], x.sa[i] } |
| 213 | func (a *index) at(i int) []byte { return a.data[a.sa[i]:] } |
| 214 | |
Eric Eisner | e3c9565 | 2011-01-11 21:46:50 -0800 | [diff] [blame] | 215 | func testConstruction(t *testing.T, tc *testCase, x *Index) { |
| 216 | if !sort.IsSorted((*index)(x)) { |
Robert Griesemer | bd80b11 | 2011-09-15 16:21:21 -0700 | [diff] [blame] | 217 | t.Errorf("failed testConstruction %s", tc.name) |
| 218 | } |
| 219 | } |
| 220 | |
| 221 | func equal(x, y *Index) bool { |
| 222 | if !bytes.Equal(x.data, y.data) { |
| 223 | return false |
| 224 | } |
| 225 | for i, j := range x.sa { |
| 226 | if j != y.sa[i] { |
| 227 | return false |
| 228 | } |
| 229 | } |
| 230 | return true |
| 231 | } |
| 232 | |
Robert Griesemer | a7a7cc5 | 2011-09-30 11:31:28 -0700 | [diff] [blame] | 233 | // returns the serialized index size |
| 234 | func testSaveRestore(t *testing.T, tc *testCase, x *Index) int { |
Robert Griesemer | bd80b11 | 2011-09-15 16:21:21 -0700 | [diff] [blame] | 235 | var buf bytes.Buffer |
| 236 | if err := x.Write(&buf); err != nil { |
| 237 | t.Errorf("failed writing index %s (%s)", tc.name, err) |
| 238 | } |
Robert Griesemer | a7a7cc5 | 2011-09-30 11:31:28 -0700 | [diff] [blame] | 239 | size := buf.Len() |
Robert Griesemer | bd80b11 | 2011-09-15 16:21:21 -0700 | [diff] [blame] | 240 | var y Index |
| 241 | if err := y.Read(&buf); err != nil { |
| 242 | t.Errorf("failed reading index %s (%s)", tc.name, err) |
| 243 | } |
| 244 | if !equal(x, &y) { |
| 245 | t.Errorf("restored index doesn't match saved index %s", tc.name) |
Eric Eisner | e3c9565 | 2011-01-11 21:46:50 -0800 | [diff] [blame] | 246 | } |
Robert Griesemer | a7a7cc5 | 2011-09-30 11:31:28 -0700 | [diff] [blame] | 247 | return size |
Eric Eisner | e3c9565 | 2011-01-11 21:46:50 -0800 | [diff] [blame] | 248 | } |
| 249 | |
Robert Griesemer | 22974fb | 2010-09-21 23:12:57 -0700 | [diff] [blame] | 250 | func TestIndex(t *testing.T) { |
| 251 | for _, tc := range testCases { |
| 252 | x := New([]byte(tc.source)) |
Eric Eisner | e3c9565 | 2011-01-11 21:46:50 -0800 | [diff] [blame] | 253 | testConstruction(t, &tc, x) |
Robert Griesemer | bd80b11 | 2011-09-15 16:21:21 -0700 | [diff] [blame] | 254 | testSaveRestore(t, &tc, x) |
Robert Griesemer | 2662b99 | 2010-12-17 14:00:46 -0800 | [diff] [blame] | 255 | testLookups(t, &tc, x, 0) |
| 256 | testLookups(t, &tc, x, 1) |
| 257 | testLookups(t, &tc, x, 10) |
| 258 | testLookups(t, &tc, x, 2e9) |
| 259 | testLookups(t, &tc, x, -1) |
Robert Griesemer | 22974fb | 2010-09-21 23:12:57 -0700 | [diff] [blame] | 260 | } |
| 261 | } |
Robert Griesemer | 5ee7ef9 | 2011-09-20 14:36:19 -0700 | [diff] [blame] | 262 | |
Eric Eisner | 481e619 | 2011-09-23 09:18:10 -0700 | [diff] [blame] | 263 | // Of all possible inputs, the random bytes have the least amount of substring |
| 264 | // repetition, and the repeated bytes have the most. For most algorithms, |
| 265 | // the running time of every input will be between these two. |
| 266 | func benchmarkNew(b *testing.B, random bool) { |
| 267 | b.StopTimer() |
| 268 | data := make([]byte, 1e6) |
| 269 | if random { |
| 270 | for i := range data { |
| 271 | data[i] = byte(rand.Intn(256)) |
| 272 | } |
| 273 | } |
| 274 | b.StartTimer() |
| 275 | for i := 0; i < b.N; i++ { |
| 276 | New(data) |
| 277 | } |
| 278 | } |
| 279 | |
| 280 | func BenchmarkNewIndexRandom(b *testing.B) { |
| 281 | benchmarkNew(b, true) |
| 282 | } |
| 283 | func BenchmarkNewIndexRepeat(b *testing.B) { |
| 284 | benchmarkNew(b, false) |
| 285 | } |
| 286 | |
Robert Griesemer | 5ee7ef9 | 2011-09-20 14:36:19 -0700 | [diff] [blame] | 287 | func BenchmarkSaveRestore(b *testing.B) { |
| 288 | b.StopTimer() |
| 289 | r := rand.New(rand.NewSource(0x5a77a1)) // guarantee always same sequence |
Dmitriy Vyukov | b0c586a | 2014-06-24 20:37:28 -0700 | [diff] [blame] | 290 | data := make([]byte, 1<<20) // 1MB of data to index |
Robert Griesemer | 5ee7ef9 | 2011-09-20 14:36:19 -0700 | [diff] [blame] | 291 | for i := range data { |
| 292 | data[i] = byte(r.Intn(256)) |
| 293 | } |
| 294 | x := New(data) |
Robert Griesemer | a7a7cc5 | 2011-09-30 11:31:28 -0700 | [diff] [blame] | 295 | size := testSaveRestore(nil, nil, x) // verify correctness |
| 296 | buf := bytes.NewBuffer(make([]byte, size)) // avoid growing |
| 297 | b.SetBytes(int64(size)) |
Robert Griesemer | 5ee7ef9 | 2011-09-20 14:36:19 -0700 | [diff] [blame] | 298 | b.StartTimer() |
| 299 | for i := 0; i < b.N; i++ { |
| 300 | x.Write(buf) |
| 301 | var y Index |
| 302 | y.Read(buf) |
| 303 | } |
| 304 | } |