| package regexp |
| |
| import "exp/regexp/syntax" |
| |
| // A queue is a 'sparse array' holding pending threads of execution. |
| // See http://research.swtch.com/2008/03/using-uninitialized-memory-for-fun-and.html |
| type queue struct { |
| sparse []uint32 |
| dense []entry |
| } |
| |
| // A entry is an entry on a queue. |
| // It holds both the instruction pc and the actual thread. |
| // Some queue entries are just place holders so that the machine |
| // knows it has considered that pc. Such entries have t == nil. |
| type entry struct { |
| pc uint32 |
| t *thread |
| } |
| |
| // A thread is the state of a single path through the machine: |
| // an instruction and a corresponding capture array. |
| // See http://swtch.com/~rsc/regexp/regexp2.html |
| type thread struct { |
| inst *syntax.Inst |
| cap []int |
| } |
| |
| // A machine holds all the state during an NFA simulation for p. |
| type machine struct { |
| re *Regexp // corresponding Regexp |
| p *syntax.Prog // compiled program |
| q0, q1 queue // two queues for runq, nextq |
| pool []*thread // pool of available threads |
| matched bool // whether a match was found |
| matchcap []int // capture information for the match |
| } |
| |
| // progMachine returns a new machine running the prog p. |
| func progMachine(p *syntax.Prog) *machine { |
| m := &machine{p: p} |
| n := len(m.p.Inst) |
| m.q0 = queue{make([]uint32, n), make([]entry, 0, n)} |
| m.q1 = queue{make([]uint32, n), make([]entry, 0, n)} |
| ncap := p.NumCap |
| if ncap < 2 { |
| ncap = 2 |
| } |
| m.matchcap = make([]int, ncap) |
| return m |
| } |
| |
| // alloc allocates a new thread with the given instruction. |
| // It uses the free pool if possible. |
| func (m *machine) alloc(i *syntax.Inst) *thread { |
| var t *thread |
| if n := len(m.pool); n > 0 { |
| t = m.pool[n-1] |
| m.pool = m.pool[:n-1] |
| } else { |
| t = new(thread) |
| t.cap = make([]int, cap(m.matchcap)) |
| } |
| t.cap = t.cap[:len(m.matchcap)] |
| t.inst = i |
| return t |
| } |
| |
| // free returns t to the free pool. |
| func (m *machine) free(t *thread) { |
| m.pool = append(m.pool, t) |
| } |
| |
| // match runs the machine over the input starting at pos. |
| // It reports whether a match was found. |
| // If so, m.matchcap holds the submatch information. |
| func (m *machine) match(i input, pos int) bool { |
| startCond := m.re.cond |
| if startCond == ^syntax.EmptyOp(0) { // impossible |
| return false |
| } |
| m.matched = false |
| for i := range m.matchcap { |
| m.matchcap[i] = -1 |
| } |
| runq, nextq := &m.q0, &m.q1 |
| rune, rune1 := endOfText, endOfText |
| width, width1 := 0, 0 |
| rune, width = i.step(pos) |
| if rune != endOfText { |
| rune1, width1 = i.step(pos + width) |
| } |
| // TODO: Let caller specify the initial flag setting. |
| // For now assume pos == 0 is beginning of text and |
| // pos != 0 is not even beginning of line. |
| // TODO: Word boundary. |
| var flag syntax.EmptyOp |
| if pos == 0 { |
| flag = syntax.EmptyBeginText | syntax.EmptyBeginLine |
| } |
| |
| // Update flag using lookahead rune. |
| if rune1 == '\n' { |
| flag |= syntax.EmptyEndLine |
| } |
| if rune1 == endOfText { |
| flag |= syntax.EmptyEndText |
| } |
| |
| for { |
| if len(runq.dense) == 0 { |
| if startCond&syntax.EmptyBeginText != 0 && pos != 0 { |
| // Anchored match, past beginning of text. |
| break |
| } |
| if m.matched { |
| // Have match; finished exploring alternatives. |
| break |
| } |
| if len(m.re.prefix) > 0 && rune1 != m.re.prefixRune && i.canCheckPrefix() { |
| // Match requires literal prefix; fast search for it. |
| advance := i.index(m.re, pos) |
| if advance < 0 { |
| break |
| } |
| pos += advance |
| rune, width = i.step(pos) |
| rune1, width1 = i.step(pos + width) |
| } |
| } |
| if !m.matched { |
| if len(m.matchcap) > 0 { |
| m.matchcap[0] = pos |
| } |
| m.add(runq, uint32(m.p.Start), pos, m.matchcap, flag) |
| } |
| // TODO: word boundary |
| flag = 0 |
| if rune == '\n' { |
| flag |= syntax.EmptyBeginLine |
| } |
| if rune1 == '\n' { |
| flag |= syntax.EmptyEndLine |
| } |
| if rune1 == endOfText { |
| flag |= syntax.EmptyEndText |
| } |
| m.step(runq, nextq, pos, pos+width, rune, flag) |
| if width == 0 { |
| break |
| } |
| pos += width |
| rune, width = rune1, width1 |
| if rune != endOfText { |
| rune1, width1 = i.step(pos + width) |
| } |
| runq, nextq = nextq, runq |
| } |
| m.clear(nextq) |
| return m.matched |
| } |
| |
| // clear frees all threads on the thread queue. |
| func (m *machine) clear(q *queue) { |
| for _, d := range q.dense { |
| if d.t != nil { |
| m.free(d.t) |
| } |
| } |
| q.dense = q.dense[:0] |
| } |
| |
| // step executes one step of the machine, running each of the threads |
| // on runq and appending new threads to nextq. |
| // The step processes the rune c (which may be endOfText), |
| // which starts at position pos and ends at nextPos. |
| // nextCond gives the setting for the empty-width flags after c. |
| func (m *machine) step(runq, nextq *queue, pos, nextPos, c int, nextCond syntax.EmptyOp) { |
| for j := 0; j < len(runq.dense); j++ { |
| d := &runq.dense[j] |
| t := d.t |
| if t == nil { |
| continue |
| } |
| /* |
| * If we support leftmost-longest matching: |
| if longest && matched && match[0] < t.cap[0] { |
| m.free(t) |
| continue |
| } |
| */ |
| |
| i := t.inst |
| switch i.Op { |
| default: |
| panic("bad inst") |
| |
| case syntax.InstMatch: |
| if len(t.cap) > 0 { |
| t.cap[1] = pos |
| copy(m.matchcap, t.cap) |
| } |
| m.matched = true |
| for _, d := range runq.dense[j+1:] { |
| if d.t != nil { |
| m.free(d.t) |
| } |
| } |
| runq.dense = runq.dense[:0] |
| |
| case syntax.InstRune: |
| if i.MatchRune(c) { |
| m.add(nextq, i.Out, nextPos, t.cap, nextCond) |
| } |
| } |
| m.free(t) |
| } |
| runq.dense = runq.dense[:0] |
| } |
| |
| // add adds an entry to q for pc, unless the q already has such an entry. |
| // It also recursively adds an entry for all instructions reachable from pc by following |
| // empty-width conditions satisfied by cond. pos gives the current position |
| // in the input. |
| func (m *machine) add(q *queue, pc uint32, pos int, cap []int, cond syntax.EmptyOp) { |
| if pc == 0 { |
| return |
| } |
| if j := q.sparse[pc]; j < uint32(len(q.dense)) && q.dense[j].pc == pc { |
| return |
| } |
| |
| j := len(q.dense) |
| q.dense = q.dense[:j+1] |
| d := &q.dense[j] |
| d.t = nil |
| d.pc = pc |
| q.sparse[pc] = uint32(j) |
| |
| i := &m.p.Inst[pc] |
| switch i.Op { |
| default: |
| panic("unhandled") |
| case syntax.InstFail: |
| // nothing |
| case syntax.InstAlt, syntax.InstAltMatch: |
| m.add(q, i.Out, pos, cap, cond) |
| m.add(q, i.Arg, pos, cap, cond) |
| case syntax.InstEmptyWidth: |
| if syntax.EmptyOp(i.Arg)&^cond == 0 { |
| m.add(q, i.Out, pos, cap, cond) |
| } |
| case syntax.InstNop: |
| m.add(q, i.Out, pos, cap, cond) |
| case syntax.InstCapture: |
| if int(i.Arg) < len(cap) { |
| opos := cap[i.Arg] |
| cap[i.Arg] = pos |
| m.add(q, i.Out, pos, cap, cond) |
| cap[i.Arg] = opos |
| } else { |
| m.add(q, i.Out, pos, cap, cond) |
| } |
| case syntax.InstMatch, syntax.InstRune: |
| t := m.alloc(i) |
| if len(t.cap) > 0 { |
| copy(t.cap, cap) |
| } |
| d.t = t |
| } |
| } |
| |
| // empty is a non-nil 0-element slice, |
| // so doExecute can avoid an allocation |
| // when 0 captures are requested from a successful match. |
| var empty = make([]int, 0) |
| |
| // doExecute finds the leftmost match in the input and returns |
| // the position of its subexpressions. |
| func (re *Regexp) doExecute(i input, pos int, ncap int) []int { |
| m := re.get() |
| m.matchcap = m.matchcap[:ncap] |
| if !m.match(i, pos) { |
| re.put(m) |
| return nil |
| } |
| if ncap == 0 { |
| re.put(m) |
| return empty // empty but not nil |
| } |
| cap := make([]int, ncap) |
| copy(cap, m.matchcap) |
| re.put(m) |
| return cap |
| } |