src/regexp/syntax/prog.go - go - Git at Google

 // Copyright 2011 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

 package syntax

 import (
 	"bytes"
 	"strconv"
 	"unicode"
 )

 // Compiled program.
 // May not belong in this package, but convenient for now.

 // A Prog is a compiled regular expression program.
 type Prog struct {
 	Inst   []Inst
 	Start  int // index of start instruction
 	NumCap int // number of InstCapture insts in re
 }

 // An InstOp is an instruction opcode.
 type InstOp uint8

 const (
 	InstAlt InstOp = iota
 	InstAltMatch
 	InstCapture
 	InstEmptyWidth
 	InstMatch
 	InstFail
 	InstNop
 	InstRune
 	InstRune1
 	InstRuneAny
 	InstRuneAnyNotNL
 )

 var instOpNames = []string{
 	"InstAlt",
 	"InstAltMatch",
 	"InstCapture",
 	"InstEmptyWidth",
 	"InstMatch",
 	"InstFail",
 	"InstNop",
 	"InstRune",
 	"InstRune1",
 	"InstRuneAny",
 	"InstRuneAnyNotNL",
 }

 func (i InstOp) String() string {
 	if uint(i) >= uint(len(instOpNames)) {
 		return ""
 	}
 	return instOpNames[i]
 }

 // An EmptyOp specifies a kind or mixture of zero-width assertions.
 type EmptyOp uint8

 const (
 	EmptyBeginLine EmptyOp = 1 << iota
 	EmptyEndLine
 	EmptyBeginText
 	EmptyEndText
 	EmptyWordBoundary
 	EmptyNoWordBoundary
 )

 // EmptyOpContext returns the zero-width assertions
 // satisfied at the position between the runes r1 and r2.
 // Passing r1 == -1 indicates that the position is
 // at the beginning of the text.
 // Passing r2 == -1 indicates that the position is
 // at the end of the text.
 func EmptyOpContext(r1, r2 rune) EmptyOp {
 	var op EmptyOp = EmptyNoWordBoundary
 	var boundary byte
 	switch {
 	case IsWordChar(r1):
 		boundary = 1
 	case r1 == '\n':
 		op |= EmptyBeginLine
 	case r1 < 0:
 		op |= EmptyBeginText | EmptyBeginLine
 	}
 	switch {
 	case IsWordChar(r2):
 		boundary ^= 1
 	case r2 == '\n':
 		op |= EmptyEndLine
 	case r2 < 0:
 		op |= EmptyEndText | EmptyEndLine
 	}
 	if boundary != 0 { // IsWordChar(r1) != IsWordChar(r2)
 		op ^= (EmptyWordBoundary | EmptyNoWordBoundary)
 	}
 	return op
 }

 // IsWordChar reports whether r is consider a ``word character''
 // during the evaluation of the \b and \B zero-width assertions.
 // These assertions are ASCII-only: the word characters are [A-Za-z0-9_].
 func IsWordChar(r rune) bool {
 	return 'A' <= r && r <= 'Z' || 'a' <= r && r <= 'z' || '0' <= r && r <= '9' || r == '_'
 }

 // An Inst is a single instruction in a regular expression program.
 type Inst struct {
 	Op   InstOp
 	Out  uint32 // all but InstMatch, InstFail
 	Arg  uint32 // InstAlt, InstAltMatch, InstCapture, InstEmptyWidth
 	Rune []rune
 }

 func (p *Prog) String() string {
 	var b bytes.Buffer
 	dumpProg(&b, p)
 	return b.String()
 }

 // skipNop follows any no-op or capturing instructions
 // and returns the resulting pc.
 func (p *Prog) skipNop(pc uint32) (*Inst, uint32) {
 	i := &p.Inst[pc]
 	for i.Op == InstNop || i.Op == InstCapture {
 		pc = i.Out
 		i = &p.Inst[pc]
 	}
 	return i, pc
 }

 // op returns i.Op but merges all the Rune special cases into InstRune
 func (i *Inst) op() InstOp {
 	op := i.Op
 	switch op {
 	case InstRune1, InstRuneAny, InstRuneAnyNotNL:
 		op = InstRune
 	}
 	return op
 }

 // Prefix returns a literal string that all matches for the
 // regexp must start with. Complete is true if the prefix
 // is the entire match.
 func (p *Prog) Prefix() (prefix string, complete bool) {
 	i, _ := p.skipNop(uint32(p.Start))

 	// Avoid allocation of buffer if prefix is empty.
 	if i.op() != InstRune || len(i.Rune) != 1 {
 		return "", i.Op == InstMatch
 	}

 	// Have prefix; gather characters.
 	var buf bytes.Buffer
 	for i.op() == InstRune && len(i.Rune) == 1 && Flags(i.Arg)&FoldCase == 0 {
 		buf.WriteRune(i.Rune[0])
 		i, _ = p.skipNop(i.Out)
 	}
 	return buf.String(), i.Op == InstMatch
 }

 // StartCond returns the leading empty-width conditions that must
 // be true in any match. It returns ^EmptyOp(0) if no matches are possible.
 func (p *Prog) StartCond() EmptyOp {
 	var flag EmptyOp
 	pc := uint32(p.Start)
 	i := &p.Inst[pc]
 Loop:
 	for {
 		switch i.Op {
 		case InstEmptyWidth:
 			flag |= EmptyOp(i.Arg)
 		case InstFail:
 			return ^EmptyOp(0)
 		case InstCapture, InstNop:
 			// skip
 		default:
 			break Loop
 		}
 		pc = i.Out
 		i = &p.Inst[pc]
 	}
 	return flag
 }

 const noMatch = -1

 // MatchRune reports whether the instruction matches (and consumes) r.
 // It should only be called when i.Op == InstRune.
 func (i *Inst) MatchRune(r rune) bool {
 	return i.MatchRunePos(r) != noMatch
 }

 // MatchRunePos checks whether the instruction matches (and consumes) r.
 // If so, MatchRunePos returns the index of the matching rune pair
 // (or, when len(i.Rune) == 1, rune singleton).
 // If not, MatchRunePos returns -1.
 // MatchRunePos should only be called when i.Op == InstRune.
 func (i *Inst) MatchRunePos(r rune) int {
 	rune := i.Rune

 	// Special case: single-rune slice is from literal string, not char class.
 	if len(rune) == 1 {
 		r0 := rune[0]
 		if r == r0 {
 			return 0
 		}
 		if Flags(i.Arg)&FoldCase != 0 {
 			for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
 				if r == r1 {
 					return 0
 				}
 			}
 		}
 		return noMatch
 	}

 	// Peek at the first few pairs.
 	// Should handle ASCII well.
 	for j := 0; j < len(rune) && j <= 8; j += 2 {
 		if r < rune[j] {
 			return noMatch
 		}
 		if r <= rune[j+1] {
 			return j / 2
 		}
 	}

 	// Otherwise binary search.
 	lo := 0
 	hi := len(rune) / 2
 	for lo < hi {
 		m := lo + (hi-lo)/2
 		if c := rune[2*m]; c <= r {
 			if r <= rune[2*m+1] {
 				return m
 			}
 			lo = m + 1
 		} else {
 			hi = m
 		}
 	}
 	return noMatch
 }

 // As per re2's Prog::IsWordChar. Determines whether rune is an ASCII word char.
 // Since we act on runes, it would be easy to support Unicode here.
 func wordRune(r rune) bool {
 	return r == '_' ||
 		('A' <= r && r <= 'Z') ||
 		('a' <= r && r <= 'z') ||
 		('0' <= r && r <= '9')
 }

 // MatchEmptyWidth reports whether the instruction matches
 // an empty string between the runes before and after.
 // It should only be called when i.Op == InstEmptyWidth.
 func (i *Inst) MatchEmptyWidth(before rune, after rune) bool {
 	switch EmptyOp(i.Arg) {
 	case EmptyBeginLine:
 		return before == '\n' || before == -1
 	case EmptyEndLine:
 		return after == '\n' || after == -1
 	case EmptyBeginText:
 		return before == -1
 	case EmptyEndText:
 		return after == -1
 	case EmptyWordBoundary:
 		return wordRune(before) != wordRune(after)
 	case EmptyNoWordBoundary:
 		return wordRune(before) == wordRune(after)
 	}
 	panic("unknown empty width arg")
 }

 func (i *Inst) String() string {
 	var b bytes.Buffer
 	dumpInst(&b, i)
 	return b.String()
 }

 func bw(b *bytes.Buffer, args ...string) {
 	for _, s := range args {
 		b.WriteString(s)
 	}
 }

 func dumpProg(b *bytes.Buffer, p *Prog) {
 	for j := range p.Inst {
 		i := &p.Inst[j]
 		pc := strconv.Itoa(j)
 		if len(pc) < 3 {
 			b.WriteString("   "[len(pc):])
 		}
 		if j == p.Start {
 			pc += "*"
 		}
 		bw(b, pc, "\t")
 		dumpInst(b, i)
 		bw(b, "\n")
 	}
 }

 func u32(i uint32) string {
 	return strconv.FormatUint(uint64(i), 10)
 }

 func dumpInst(b *bytes.Buffer, i *Inst) {
 	switch i.Op {
 	case InstAlt:
 		bw(b, "alt -> ", u32(i.Out), ", ", u32(i.Arg))
 	case InstAltMatch:
 		bw(b, "altmatch -> ", u32(i.Out), ", ", u32(i.Arg))
 	case InstCapture:
 		bw(b, "cap ", u32(i.Arg), " -> ", u32(i.Out))
 	case InstEmptyWidth:
 		bw(b, "empty ", u32(i.Arg), " -> ", u32(i.Out))
 	case InstMatch:
 		bw(b, "match")
 	case InstFail:
 		bw(b, "fail")
 	case InstNop:
 		bw(b, "nop -> ", u32(i.Out))
 	case InstRune:
 		if i.Rune == nil {
 			// shouldn't happen
 			bw(b, "rune <nil>")
 		}
 		bw(b, "rune ", strconv.QuoteToASCII(string(i.Rune)))
 		if Flags(i.Arg)&FoldCase != 0 {
 			bw(b, "/i")
 		}
 		bw(b, " -> ", u32(i.Out))
 	case InstRune1:
 		bw(b, "rune1 ", strconv.QuoteToASCII(string(i.Rune)), " -> ", u32(i.Out))
 	case InstRuneAny:
 		bw(b, "any -> ", u32(i.Out))
 	case InstRuneAnyNotNL:
 		bw(b, "anynotnl -> ", u32(i.Out))
 	}
 }
	// Copyright 2011 The Go Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style
	// license that can be found in the LICENSE file.

	package syntax

	import (
	"bytes"
	"strconv"
	"unicode"
	)

	// Compiled program.
	// May not belong in this package, but convenient for now.

	// A Prog is a compiled regular expression program.
	type Prog struct {
	Inst []Inst
	Start int // index of start instruction
	NumCap int // number of InstCapture insts in re
	}

	// An InstOp is an instruction opcode.
	type InstOp uint8

	const (
	InstAlt InstOp = iota
	InstAltMatch
	InstCapture
	InstEmptyWidth
	InstMatch
	InstFail
	InstNop
	InstRune
	InstRune1
	InstRuneAny
	InstRuneAnyNotNL
	)

	var instOpNames = []string{
	"InstAlt",
	"InstAltMatch",
	"InstCapture",
	"InstEmptyWidth",
	"InstMatch",
	"InstFail",
	"InstNop",
	"InstRune",
	"InstRune1",
	"InstRuneAny",
	"InstRuneAnyNotNL",
	}

	func (i InstOp) String() string {
	if uint(i) >= uint(len(instOpNames)) {
	return ""
	}
	return instOpNames[i]
	}

	// An EmptyOp specifies a kind or mixture of zero-width assertions.
	type EmptyOp uint8

	const (
	EmptyBeginLine EmptyOp = 1 << iota
	EmptyEndLine
	EmptyBeginText
	EmptyEndText
	EmptyWordBoundary
	EmptyNoWordBoundary
	)

	// EmptyOpContext returns the zero-width assertions
	// satisfied at the position between the runes r1 and r2.
	// Passing r1 == -1 indicates that the position is
	// at the beginning of the text.
	// Passing r2 == -1 indicates that the position is
	// at the end of the text.
	func EmptyOpContext(r1, r2 rune) EmptyOp {
	var op EmptyOp = EmptyNoWordBoundary
	var boundary byte
	switch {
	case IsWordChar(r1):
	boundary = 1
	case r1 == '\n':
	op \|= EmptyBeginLine
	case r1 < 0:
	op \|= EmptyBeginText \| EmptyBeginLine
	}
	switch {
	case IsWordChar(r2):
	boundary ^= 1
	case r2 == '\n':
	op \|= EmptyEndLine
	case r2 < 0:
	op \|= EmptyEndText \| EmptyEndLine
	}
	if boundary != 0 { // IsWordChar(r1) != IsWordChar(r2)
	op ^= (EmptyWordBoundary \| EmptyNoWordBoundary)
	}
	return op
	}

	// IsWordChar reports whether r is consider a ``word character''
	// during the evaluation of the \b and \B zero-width assertions.
	// These assertions are ASCII-only: the word characters are [A-Za-z0-9_].
	func IsWordChar(r rune) bool {
	return 'A' <= r && r <= 'Z' \|\| 'a' <= r && r <= 'z' \|\| '0' <= r && r <= '9' \|\| r == '_'
	}

	// An Inst is a single instruction in a regular expression program.
	type Inst struct {
	Op InstOp
	Out uint32 // all but InstMatch, InstFail
	Arg uint32 // InstAlt, InstAltMatch, InstCapture, InstEmptyWidth
	Rune []rune
	}

	func (p *Prog) String() string {
	var b bytes.Buffer
	dumpProg(&b, p)
	return b.String()
	}

	// skipNop follows any no-op or capturing instructions
	// and returns the resulting pc.
	func (p Prog) skipNop(pc uint32) (Inst, uint32) {
	i := &p.Inst[pc]
	for i.Op == InstNop \|\| i.Op == InstCapture {
	pc = i.Out
	i = &p.Inst[pc]
	}
	return i, pc
	}

	// op returns i.Op but merges all the Rune special cases into InstRune
	func (i *Inst) op() InstOp {
	op := i.Op
	switch op {
	case InstRune1, InstRuneAny, InstRuneAnyNotNL:
	op = InstRune
	}
	return op
	}

	// Prefix returns a literal string that all matches for the
	// regexp must start with. Complete is true if the prefix
	// is the entire match.
	func (p *Prog) Prefix() (prefix string, complete bool) {
	i, _ := p.skipNop(uint32(p.Start))

	// Avoid allocation of buffer if prefix is empty.
	if i.op() != InstRune \|\| len(i.Rune) != 1 {
	return "", i.Op == InstMatch
	}

	// Have prefix; gather characters.
	var buf bytes.Buffer
	for i.op() == InstRune && len(i.Rune) == 1 && Flags(i.Arg)&FoldCase == 0 {
	buf.WriteRune(i.Rune[0])
	i, _ = p.skipNop(i.Out)
	}
	return buf.String(), i.Op == InstMatch
	}

	// StartCond returns the leading empty-width conditions that must
	// be true in any match. It returns ^EmptyOp(0) if no matches are possible.
	func (p *Prog) StartCond() EmptyOp {
	var flag EmptyOp
	pc := uint32(p.Start)
	i := &p.Inst[pc]
	Loop:
	for {
	switch i.Op {
	case InstEmptyWidth:
	flag \|= EmptyOp(i.Arg)
	case InstFail:
	return ^EmptyOp(0)
	case InstCapture, InstNop:
	// skip
	default:
	break Loop
	}
	pc = i.Out
	i = &p.Inst[pc]
	}
	return flag
	}

	const noMatch = -1

	// MatchRune reports whether the instruction matches (and consumes) r.
	// It should only be called when i.Op == InstRune.
	func (i *Inst) MatchRune(r rune) bool {
	return i.MatchRunePos(r) != noMatch
	}

	// MatchRunePos checks whether the instruction matches (and consumes) r.
	// If so, MatchRunePos returns the index of the matching rune pair
	// (or, when len(i.Rune) == 1, rune singleton).
	// If not, MatchRunePos returns -1.
	// MatchRunePos should only be called when i.Op == InstRune.
	func (i *Inst) MatchRunePos(r rune) int {
	rune := i.Rune

	// Special case: single-rune slice is from literal string, not char class.
	if len(rune) == 1 {
	r0 := rune[0]
	if r == r0 {
	return 0
	}
	if Flags(i.Arg)&FoldCase != 0 {
	for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
	if r == r1 {
	return 0
	}
	}
	}
	return noMatch
	}

	// Peek at the first few pairs.
	// Should handle ASCII well.
	for j := 0; j < len(rune) && j <= 8; j += 2 {
	if r < rune[j] {
	return noMatch
	}
	if r <= rune[j+1] {
	return j / 2
	}
	}

	// Otherwise binary search.
	lo := 0
	hi := len(rune) / 2
	for lo < hi {
	m := lo + (hi-lo)/2
	if c := rune[2*m]; c <= r {
	if r <= rune[2*m+1] {
	return m
	}
	lo = m + 1
	} else {
	hi = m
	}
	}
	return noMatch
	}

	// As per re2's Prog::IsWordChar. Determines whether rune is an ASCII word char.
	// Since we act on runes, it would be easy to support Unicode here.
	func wordRune(r rune) bool {
	return r == '_' \|\|
	('A' <= r && r <= 'Z') \|\|
	('a' <= r && r <= 'z') \|\|
	('0' <= r && r <= '9')
	}

	// MatchEmptyWidth reports whether the instruction matches
	// an empty string between the runes before and after.
	// It should only be called when i.Op == InstEmptyWidth.
	func (i *Inst) MatchEmptyWidth(before rune, after rune) bool {
	switch EmptyOp(i.Arg) {
	case EmptyBeginLine:
	return before == '\n' \|\| before == -1
	case EmptyEndLine:
	return after == '\n' \|\| after == -1
	case EmptyBeginText:
	return before == -1
	case EmptyEndText:
	return after == -1
	case EmptyWordBoundary:
	return wordRune(before) != wordRune(after)
	case EmptyNoWordBoundary:
	return wordRune(before) == wordRune(after)
	}
	panic("unknown empty width arg")
	}

	func (i *Inst) String() string {
	var b bytes.Buffer
	dumpInst(&b, i)
	return b.String()
	}

	func bw(b *bytes.Buffer, args ...string) {
	for _, s := range args {
	b.WriteString(s)
	}
	}

	func dumpProg(b bytes.Buffer, p Prog) {
	for j := range p.Inst {
	i := &p.Inst[j]
	pc := strconv.Itoa(j)
	if len(pc) < 3 {
	b.WriteString(" "[len(pc):])
	}
	if j == p.Start {
	pc += "*"
	}
	bw(b, pc, "\t")
	dumpInst(b, i)
	bw(b, "\n")
	}
	}

	func u32(i uint32) string {
	return strconv.FormatUint(uint64(i), 10)
	}

	func dumpInst(b bytes.Buffer, i Inst) {
	switch i.Op {
	case InstAlt:
	bw(b, "alt -> ", u32(i.Out), ", ", u32(i.Arg))
	case InstAltMatch:
	bw(b, "altmatch -> ", u32(i.Out), ", ", u32(i.Arg))
	case InstCapture:
	bw(b, "cap ", u32(i.Arg), " -> ", u32(i.Out))
	case InstEmptyWidth:
	bw(b, "empty ", u32(i.Arg), " -> ", u32(i.Out))
	case InstMatch:
	bw(b, "match")
	case InstFail:
	bw(b, "fail")
	case InstNop:
	bw(b, "nop -> ", u32(i.Out))
	case InstRune:
	if i.Rune == nil {
	// shouldn't happen
	bw(b, "rune <nil>")
	}
	bw(b, "rune ", strconv.QuoteToASCII(string(i.Rune)))
	if Flags(i.Arg)&FoldCase != 0 {
	bw(b, "/i")
	}
	bw(b, " -> ", u32(i.Out))
	case InstRune1:
	bw(b, "rune1 ", strconv.QuoteToASCII(string(i.Rune)), " -> ", u32(i.Out))
	case InstRuneAny:
	bw(b, "any -> ", u32(i.Out))
	case InstRuneAnyNotNL:
	bw(b, "anynotnl -> ", u32(i.Out))
	}
	}