blob: fd95604fe44a944b86d055f89830fe98e5be319e [file] [log] [blame]
Michael Matloob93238622014-12-28 00:17:01 -08001// Copyright 2015 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// backtrack is a regular expression search with submatch
6// tracking for small regular expressions and texts. It allocates
7// a bit vector with (length of input) * (length of prog) bits,
8// to make sure it never explores the same (character position, instruction)
9// state multiple times. This limits the search to run in time linear in
10// the length of the test.
11//
12// backtrack is a fast replacement for the NFA code on small
13// regexps when onepass cannot be used.
14
15package regexp
16
17import "regexp/syntax"
18
19// A job is an entry on the backtracker's job stack. It holds
20// the instruction pc and the position in the input.
21type job struct {
22 pc uint32
23 arg int
24 pos int
25}
26
27const (
28 visitedBits = 32
29 maxBacktrackProg = 500 // len(prog.Inst) <= max
30 maxBacktrackVector = 256 * 1024 // bit vector size <= max (bits)
31)
32
33// bitState holds state for the backtracker.
34type bitState struct {
35 prog *syntax.Prog
36
37 end int
38 cap []int
Michael Matloob93238622014-12-28 00:17:01 -080039 input input
40 jobs []job
41 visited []uint32
42}
43
44var notBacktrack *bitState = nil
45
46// maxBitStateLen returns the maximum length of a string to search with
47// the backtracker using prog.
48func maxBitStateLen(prog *syntax.Prog) int {
Matthew Brennana5130882015-04-03 20:09:53 -040049 if !shouldBacktrack(prog) {
50 return 0
51 }
Michael Matloob93238622014-12-28 00:17:01 -080052 return maxBacktrackVector / len(prog.Inst)
53}
54
55// newBitState returns a new bitState for the given prog,
56// or notBacktrack if the size of the prog exceeds the maximum size that
57// the backtracker will be run for.
58func newBitState(prog *syntax.Prog) *bitState {
Matthew Brennana5130882015-04-03 20:09:53 -040059 if !shouldBacktrack(prog) {
Michael Matloob93238622014-12-28 00:17:01 -080060 return notBacktrack
61 }
62 return &bitState{
63 prog: prog,
64 }
65}
66
Matthew Brennana5130882015-04-03 20:09:53 -040067// shouldBacktrack reports whether the program is too
68// long for the backtracker to run.
69func shouldBacktrack(prog *syntax.Prog) bool {
70 return len(prog.Inst) <= maxBacktrackProg
71}
72
Michael Matloob93238622014-12-28 00:17:01 -080073// reset resets the state of the backtracker.
Michael Matloob485f3482015-04-06 13:33:47 -070074// end is the end position in the input.
75// ncap is the number of captures.
76func (b *bitState) reset(end int, ncap int) {
Michael Matloob93238622014-12-28 00:17:01 -080077 b.end = end
Michael Matloob93238622014-12-28 00:17:01 -080078
79 if cap(b.jobs) == 0 {
80 b.jobs = make([]job, 0, 256)
81 } else {
82 b.jobs = b.jobs[:0]
83 }
84
85 visitedSize := (len(b.prog.Inst)*(end+1) + visitedBits - 1) / visitedBits
86 if cap(b.visited) < visitedSize {
87 b.visited = make([]uint32, visitedSize, maxBacktrackVector/visitedBits)
88 } else {
89 b.visited = b.visited[:visitedSize]
90 for i := range b.visited {
91 b.visited[i] = 0
92 }
93 }
94
Michael Matloob485f3482015-04-06 13:33:47 -070095 if cap(b.cap) < ncap {
Michael Matloob93238622014-12-28 00:17:01 -080096 b.cap = make([]int, ncap)
Michael Matloob485f3482015-04-06 13:33:47 -070097 } else {
98 b.cap = b.cap[:ncap]
Michael Matloob93238622014-12-28 00:17:01 -080099 }
100 for i := range b.cap {
101 b.cap[i] = -1
102 }
103}
104
105// shouldVisit reports whether the combination of (pc, pos) has not
106// been visited yet.
107func (b *bitState) shouldVisit(pc uint32, pos int) bool {
108 n := uint(int(pc)*(b.end+1) + pos)
109 if b.visited[n/visitedBits]&(1<<(n&(visitedBits-1))) != 0 {
110 return false
111 }
112 b.visited[n/visitedBits] |= 1 << (n & (visitedBits - 1))
113 return true
114}
115
116// push pushes (pc, pos, arg) onto the job stack if it should be
117// visited.
118func (b *bitState) push(pc uint32, pos int, arg int) {
119 if b.prog.Inst[pc].Op == syntax.InstFail {
120 return
121 }
122
123 // Only check shouldVisit when arg == 0.
124 // When arg > 0, we are continuing a previous visit.
125 if arg == 0 && !b.shouldVisit(pc, pos) {
126 return
127 }
128
129 b.jobs = append(b.jobs, job{pc: pc, arg: arg, pos: pos})
130}
131
132// tryBacktrack runs a backtracking search starting at pos.
133func (m *machine) tryBacktrack(b *bitState, i input, pc uint32, pos int) bool {
134 longest := m.re.longest
135 m.matched = false
136
137 b.push(pc, pos, 0)
138 for len(b.jobs) > 0 {
139 l := len(b.jobs) - 1
140 // Pop job off the stack.
141 pc := b.jobs[l].pc
142 pos := b.jobs[l].pos
143 arg := b.jobs[l].arg
144 b.jobs = b.jobs[:l]
145
146 // Optimization: rather than push and pop,
147 // code that is going to Push and continue
148 // the loop simply updates ip, p, and arg
149 // and jumps to CheckAndLoop. We have to
150 // do the ShouldVisit check that Push
151 // would have, but we avoid the stack
152 // manipulation.
153 goto Skip
154 CheckAndLoop:
155 if !b.shouldVisit(pc, pos) {
156 continue
157 }
158 Skip:
159
160 inst := b.prog.Inst[pc]
161
162 switch inst.Op {
163 default:
164 panic("bad inst")
165 case syntax.InstFail:
166 panic("unexpected InstFail")
167 case syntax.InstAlt:
168 // Cannot just
169 // b.push(inst.Out, pos, 0)
170 // b.push(inst.Arg, pos, 0)
171 // If during the processing of inst.Out, we encounter
172 // inst.Arg via another path, we want to process it then.
173 // Pushing it here will inhibit that. Instead, re-push
174 // inst with arg==1 as a reminder to push inst.Arg out
175 // later.
176 switch arg {
177 case 0:
178 b.push(pc, pos, 1)
179 pc = inst.Out
180 goto CheckAndLoop
181 case 1:
182 // Finished inst.Out; try inst.Arg.
183 arg = 0
184 pc = inst.Arg
185 goto CheckAndLoop
186 }
187 panic("bad arg in InstAlt")
188
189 case syntax.InstAltMatch:
190 // One opcode consumes runes; the other leads to match.
191 switch b.prog.Inst[inst.Out].Op {
192 case syntax.InstRune, syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
193 // inst.Arg is the match.
194 b.push(inst.Arg, pos, 0)
195 pc = inst.Arg
196 pos = b.end
197 goto CheckAndLoop
198 }
199 // inst.Out is the match - non-greedy
200 b.push(inst.Out, b.end, 0)
201 pc = inst.Out
202 goto CheckAndLoop
203
204 case syntax.InstRune:
205 r, width := i.step(pos)
206 if !inst.MatchRune(r) {
207 continue
208 }
209 pos += width
210 pc = inst.Out
211 goto CheckAndLoop
212
213 case syntax.InstRune1:
214 r, width := i.step(pos)
215 if r != inst.Rune[0] {
216 continue
217 }
218 pos += width
219 pc = inst.Out
220 goto CheckAndLoop
221
222 case syntax.InstRuneAnyNotNL:
223 r, width := i.step(pos)
224 if r == '\n' || r == endOfText {
225 continue
226 }
227 pos += width
228 pc = inst.Out
229 goto CheckAndLoop
230
231 case syntax.InstRuneAny:
232 r, width := i.step(pos)
233 if r == endOfText {
234 continue
235 }
236 pos += width
237 pc = inst.Out
238 goto CheckAndLoop
239
240 case syntax.InstCapture:
241 switch arg {
242 case 0:
243 if 0 <= inst.Arg && inst.Arg < uint32(len(b.cap)) {
244 // Capture pos to register, but save old value.
245 b.push(pc, b.cap[inst.Arg], 1) // come back when we're done.
246 b.cap[inst.Arg] = pos
247 }
248 pc = inst.Out
249 goto CheckAndLoop
250 case 1:
251 // Finished inst.Out; restore the old value.
252 b.cap[inst.Arg] = pos
253 continue
254
255 }
256 panic("bad arg in InstCapture")
257 continue
258
259 case syntax.InstEmptyWidth:
260 if syntax.EmptyOp(inst.Arg)&^i.context(pos) != 0 {
261 continue
262 }
263 pc = inst.Out
264 goto CheckAndLoop
265
266 case syntax.InstNop:
267 pc = inst.Out
268 goto CheckAndLoop
269
270 case syntax.InstMatch:
271 // We found a match. If the caller doesn't care
272 // where the match is, no point going further.
Michael Matloob485f3482015-04-06 13:33:47 -0700273 if len(b.cap) == 0 {
Michael Matloob93238622014-12-28 00:17:01 -0800274 m.matched = true
275 return m.matched
276 }
277
278 // Record best match so far.
279 // Only need to check end point, because this entire
280 // call is only considering one start position.
Michael Matloob485f3482015-04-06 13:33:47 -0700281 if len(b.cap) > 1 {
282 b.cap[1] = pos
283 }
Michael Matloob93238622014-12-28 00:17:01 -0800284 if !m.matched || (longest && pos > 0 && pos > m.matchcap[1]) {
285 copy(m.matchcap, b.cap)
286 }
287 m.matched = true
288
289 // If going for first match, we're done.
290 if !longest {
291 return m.matched
292 }
293
294 // If we used the entire text, no longer match is possible.
295 if pos == b.end {
296 return m.matched
297 }
298
299 // Otherwise, continue on in hope of a longer match.
300 continue
301 }
302 panic("unreachable")
303 }
304
305 return m.matched
306}
307
308// backtrack runs a backtracking search of prog on the input starting at pos.
Michael Matloob485f3482015-04-06 13:33:47 -0700309func (m *machine) backtrack(i input, pos int, end int, ncap int) bool {
Michael Matloob93238622014-12-28 00:17:01 -0800310 if !i.canCheckPrefix() {
311 panic("backtrack called for a RuneReader")
312 }
313
314 startCond := m.re.cond
315 if startCond == ^syntax.EmptyOp(0) { // impossible
316 return false
317 }
318 if startCond&syntax.EmptyBeginText != 0 && pos != 0 {
319 // Anchored match, past beginning of text.
320 return false
321 }
322
323 b := m.b
Michael Matloob485f3482015-04-06 13:33:47 -0700324 b.reset(end, ncap)
Michael Matloob93238622014-12-28 00:17:01 -0800325
Michael Matloob485f3482015-04-06 13:33:47 -0700326 m.matchcap = m.matchcap[:ncap]
Michael Matloob93238622014-12-28 00:17:01 -0800327 for i := range m.matchcap {
328 m.matchcap[i] = -1
329 }
330
331 // Anchored search must start at the beginning of the input
332 if startCond&syntax.EmptyBeginText != 0 {
Michael Matloob485f3482015-04-06 13:33:47 -0700333 if len(b.cap) > 0 {
334 b.cap[0] = pos
335 }
Michael Matloob93238622014-12-28 00:17:01 -0800336 return m.tryBacktrack(b, i, uint32(m.p.Start), pos)
337 }
338
339 // Unanchored search, starting from each possible text position.
340 // Notice that we have to try the empty string at the end of
341 // the text, so the loop condition is pos <= end, not pos < end.
342 // This looks like it's quadratic in the size of the text,
343 // but we are not clearing visited between calls to TrySearch,
344 // so no work is duplicated and it ends up still being linear.
345 width := -1
346 for ; pos <= end && width != 0; pos += width {
347 if len(m.re.prefix) > 0 {
348 // Match requires literal prefix; fast search for it.
349 advance := i.index(m.re, pos)
350 if advance < 0 {
351 return false
352 }
353 pos += advance
354 }
355
Michael Matloob485f3482015-04-06 13:33:47 -0700356 if len(b.cap) > 0 {
357 b.cap[0] = pos
358 }
Michael Matloob93238622014-12-28 00:17:01 -0800359 if m.tryBacktrack(b, i, uint32(m.p.Start), pos) {
360 // Match must be leftmost; done.
361 return true
362 }
363 _, width = i.step(pos)
364 }
365 return false
366}