| // Copyright 2009 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
// Package ebnf is a library for EBNF grammars. The input is text ([]byte)
// satisfying the following grammar (itself represented in EBNF):
| // |
| // Production = name "=" [ Expression ] "." . |
| // Expression = Alternative { "|" Alternative } . |
| // Alternative = Term { Term } . |
| // Term = name | token [ "…" token ] | Group | Option | Repetition . |
| // Group = "(" Expression ")" . |
| // Option = "[" Expression "]" . |
| // Repetition = "{" Expression "}" . |
| // |
| // A name is a Go identifier, a token is a Go string, and comments |
| // and white space follow the same rules as for the Go language. |
| // Production names starting with an uppercase Unicode letter denote |
// non-terminal productions (i.e., productions which allow white space
// and comments between tokens); all other production names denote
| // lexical productions. |
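//
// For example, assuming a start production Program, the following is a
// small, complete grammar in this notation (the production names are
// illustrative only):
//
//	Program = digit { digit } .
//	digit   = "0" … "9" .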
| // |
| package ebnf |
| |
| import ( |
| "go/scanner" |
| "go/token" |
| "os" |
| "unicode" |
| "utf8" |
| ) |
| |
| |
| // ---------------------------------------------------------------------------- |
| // Internal representation |
| |
| type ( |
| // An Expression node represents a production expression. |
| Expression interface { |
		// Pos returns the position of the first character of the syntactic construct.
| Pos() token.Pos |
| } |
| |
| // An Alternative node represents a non-empty list of alternative expressions. |
| Alternative []Expression // x | y | z |
| |
| // A Sequence node represents a non-empty list of sequential expressions. |
| Sequence []Expression // x y z |
| |
| // A Name node represents a production name. |
| Name struct { |
| StringPos token.Pos |
| String string |
| } |
| |
| // A Token node represents a literal. |
| Token struct { |
| StringPos token.Pos |
| String string |
| } |
| |
	// A Range node represents a range of characters.
| Range struct { |
| Begin, End *Token // begin ... end |
| } |
| |
| // A Group node represents a grouped expression. |
| Group struct { |
| Lparen token.Pos |
| Body Expression // (body) |
| } |
| |
| // An Option node represents an optional expression. |
| Option struct { |
| Lbrack token.Pos |
| Body Expression // [body] |
| } |
| |
| // A Repetition node represents a repeated expression. |
| Repetition struct { |
| Lbrace token.Pos |
| Body Expression // {body} |
| } |
| |
| // A Bad node stands for pieces of source code that lead to a parse error. |
| Bad struct { |
| TokPos token.Pos |
| Error string // parser error message |
| } |
| |
| // A Production node represents an EBNF production. |
| Production struct { |
| Name *Name |
| Expr Expression |
| } |
| |
| // A Grammar is a set of EBNF productions. The map |
| // is indexed by production name. |
| // |
| Grammar map[string]*Production |
| ) |
| |
| |
func (x Alternative) Pos() token.Pos { return x[0].Pos() } // the parser always generates non-empty Alternatives
| func (x Sequence) Pos() token.Pos { return x[0].Pos() } // the parser always generates non-empty Sequences |
| func (x *Name) Pos() token.Pos { return x.StringPos } |
| func (x *Token) Pos() token.Pos { return x.StringPos } |
| func (x *Range) Pos() token.Pos { return x.Begin.Pos() } |
| func (x *Group) Pos() token.Pos { return x.Lparen } |
| func (x *Option) Pos() token.Pos { return x.Lbrack } |
| func (x *Repetition) Pos() token.Pos { return x.Lbrace } |
| func (x *Bad) Pos() token.Pos { return x.TokPos } |
| func (x *Production) Pos() token.Pos { return x.Name.Pos() } |
| |
| |
| // ---------------------------------------------------------------------------- |
| // Grammar verification |
| |
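// isLexical reports whether name denotes a lexical production,
// i.e., whether its first character is not an uppercase letter.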
| func isLexical(name string) bool { |
| ch, _ := utf8.DecodeRuneInString(name) |
| return !unicode.IsUpper(ch) |
| } |
| |
| |
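// A verifier walks a grammar starting at its root production,
// recording the productions it reaches and collecting errors for
// invalid or unreachable productions.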
| type verifier struct { |
| fset *token.FileSet |
| scanner.ErrorVector |
| worklist []*Production |
| reached Grammar // set of productions reached from (and including) the root production |
| grammar Grammar |
| } |
| |
| |
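// error records msg as an error at position pos.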
| func (v *verifier) error(pos token.Pos, msg string) { |
| v.Error(v.fset.Position(pos), msg) |
| } |
| |
| |
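// push marks prod as reached and adds it to the worklist
// if it has not been reached before.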
| func (v *verifier) push(prod *Production) { |
| name := prod.Name.String |
| if _, found := v.reached[name]; !found { |
| v.worklist = append(v.worklist, prod) |
| v.reached[name] = prod |
| } |
| } |
| |
| |
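// verifyChar checks that the token x denotes a single character
// and returns it; it returns 0 after reporting an error.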
| func (v *verifier) verifyChar(x *Token) int { |
| s := x.String |
| if utf8.RuneCountInString(s) != 1 { |
| v.error(x.Pos(), "single char expected, found "+s) |
| return 0 |
| } |
| ch, _ := utf8.DecodeRuneInString(s) |
| return ch |
| } |
| |
| |
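// verifyExpr verifies expr recursively. If lexical is set, expr belongs
// to a lexical production and may only refer to other lexical productions.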
| func (v *verifier) verifyExpr(expr Expression, lexical bool) { |
| switch x := expr.(type) { |
| case nil: |
| // empty expression |
| case Alternative: |
| for _, e := range x { |
| v.verifyExpr(e, lexical) |
| } |
| case Sequence: |
| for _, e := range x { |
| v.verifyExpr(e, lexical) |
| } |
| case *Name: |
| // a production with this name must exist; |
| // add it to the worklist if not yet processed |
| if prod, found := v.grammar[x.String]; found { |
| v.push(prod) |
| } else { |
| v.error(x.Pos(), "missing production "+x.String) |
| } |
| // within a lexical production references |
| // to non-lexical productions are invalid |
| if lexical && !isLexical(x.String) { |
| v.error(x.Pos(), "reference to non-lexical production "+x.String) |
| } |
| case *Token: |
| // nothing to do for now |
| case *Range: |
| i := v.verifyChar(x.Begin) |
| j := v.verifyChar(x.End) |
| if i >= j { |
| v.error(x.Pos(), "decreasing character range") |
| } |
| case *Group: |
| v.verifyExpr(x.Body, lexical) |
| case *Option: |
| v.verifyExpr(x.Body, lexical) |
| case *Repetition: |
| v.verifyExpr(x.Body, lexical) |
| default: |
| panic("unreachable") |
| } |
| } |
| |
| |
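// verify checks the grammar reachable from the start production: it reports
// missing and unreachable productions, invalid character ranges, and
// references from lexical to non-lexical productions.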
| func (v *verifier) verify(fset *token.FileSet, grammar Grammar, start string) { |
| // find root production |
| root, found := grammar[start] |
| if !found { |
		// token.NoPos doesn't require a file set for conversion
		// to a Position, so it is ok that v.fset is not set yet
| v.error(token.NoPos, "no start production "+start) |
| return |
| } |
| |
| // initialize verifier |
| v.fset = fset |
| v.ErrorVector.Reset() |
| v.worklist = v.worklist[0:0] |
| v.reached = make(Grammar) |
| v.grammar = grammar |
| |
| // work through the worklist |
| v.push(root) |
| for { |
| n := len(v.worklist) - 1 |
| if n < 0 { |
| break |
| } |
| prod := v.worklist[n] |
| v.worklist = v.worklist[0:n] |
| v.verifyExpr(prod.Expr, isLexical(prod.Name.String)) |
| } |
| |
| // check if all productions were reached |
| if len(v.reached) < len(v.grammar) { |
| for name, prod := range v.grammar { |
| if _, found := v.reached[name]; !found { |
| v.error(prod.Pos(), name+" is unreachable") |
| } |
| } |
| } |
| } |
| |
| |
| // Verify checks that: |
| // - all productions used are defined |
| // - all productions defined are used when beginning at start |
| // - lexical productions refer only to other lexical productions |
| // |
| // Position information is interpreted relative to the file set fset. |
| // |
| func Verify(fset *token.FileSet, grammar Grammar, start string) os.Error { |
| var v verifier |
| v.verify(fset, grammar, start) |
| return v.GetError(scanner.Sorted) |
| } |