blob: 6917e1bc5b2c73eb9201fff58eef2d1e1b2b7272 [file] [log] [blame]
// Copyright 2024 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package filter
import (
"fmt"
"regexp/syntax"
)
// parseTrace can be set by tests to trace the parser.
var parseTrace bool
// ParseFilter parses a filter expression into an [Expr].
// Empty input returns nil, nil.
// filter = [ expression ] ;
func ParseFilter(filter string) (Expr, error) {
lex := lexer{
input: filter,
nextPos: position{
line: 1,
col: 0,
},
}
tok := lex.nextToken()
if tok.kind == tokenWSC {
tok = lex.nextToken()
}
if tok.kind == tokenEOF {
return nil, nil
}
lex.pushToken(tok)
expr, err := parseExpression(&lex)
if err != nil {
return nil, err
}
tok = lex.nextToken()
if tok.kind == tokenWSC {
tok = lex.nextToken()
}
if tok.kind != tokenEOF {
return nil, fmt.Errorf("%s: unexpected tokens after filter expression", tok.pos)
}
return expr, nil
}
// parseExpr parses an expression.
// expression =  sequence, { "AND", sequence } ;
func parseExpression(lex *lexer) (e Expr, err error) {
fn := trace("Expression")
defer func() { fn(e, err) }()
return parseJunction(lex, tokenAnd, parseSequence)
}
// parseSequence parses an implicit conjunction sequence.
// sequence = factor, { factor } ;
func parseSequence(lex *lexer) (e Expr, err error) {
fn := trace("Sequence")
defer func() { fn(e, err) }()
factor, err := parseFactor(lex)
if err != nil {
return nil, err
}
for {
tok := lex.nextToken()
if tok.kind == tokenWSC {
tok = lex.nextToken()
}
lex.pushToken(tok)
if tok.kind == tokenEOF || tok.kind == tokenRparen {
return factor, nil
}
pos := tok.pos
if tok.kind == tokenAnd {
return factor, nil
}
rfactor, err := parseFactor(lex)
if err != nil {
return nil, err
}
factor = &binaryExpr{
op: tokenAnd,
left: factor,
right: rfactor,
pos: pos,
}
}
}
// parseFactor parses a disjunction.
// factor = term, { "OR", term } ;
func parseFactor(lex *lexer) (e Expr, err error) {
fn := trace("Factor")
defer func() { fn(e, err) }()
return parseJunction(lex, tokenOr, parseTerm)
}
// parseJunction parses an AND or OR sequence.
func parseJunction(lex *lexer, kind tokenKind, parseElement func(lex *lexer) (Expr, error)) (e Expr, err error) {
fn := trace("Junction")
defer func() { fn(e, err) }()
sub, err := parseElement(lex)
if err != nil {
return nil, err
}
for {
tok := lex.nextToken()
if tok.kind == tokenWSC {
tok = lex.nextToken()
}
if tok.kind != kind {
lex.pushToken(tok)
return sub, nil
}
pos := tok.pos
tok = lex.nextToken()
if tok.kind != tokenWSC {
lex.pushToken(tok)
name := "AND"
if kind == tokenOr {
name = "OR"
}
return nil, fmt.Errorf("%s: missing whitespace after %s", pos, name)
}
rsub, err := parseElement(lex)
if err != nil {
return nil, err
}
sub = &binaryExpr{
op: kind,
left: sub,
right: rsub,
pos: pos,
}
}
}
// parseTerm parses a term.
// term = [ "-" | "NOT" ], primitive ;
func parseTerm(lex *lexer) (e Expr, err error) {
fn := trace("Term")
defer func() { fn(e, err) }()
tok := lex.nextToken()
pos := tok.pos
if tok.kind != tokenMinus && tok.kind != tokenNot {
lex.pushToken(tok)
}
if tok.kind == tokenNot {
stok := lex.nextToken()
if stok.kind != tokenWSC {
lex.pushToken(stok)
return nil, fmt.Errorf("%s: missing whitespace after NOT", tok.pos)
}
}
var prim Expr
ntok := lex.nextToken()
lex.pushToken(ntok)
if ntok.kind == tokenMinus || ntok.kind == tokenNot {
prim, err = parseTerm(lex)
} else {
prim, err = parsePrimitive(lex)
}
if err != nil {
return nil, err
}
if tok.kind == tokenMinus || tok.kind == tokenNot {
prim = &unaryExpr{
op: tok.kind,
expr: prim,
pos: pos,
}
}
return prim, nil
}
// parsePrimitive parses a primitive.
//
// primitive
// = parenthesized expression
// | global restriction
// | regular restriction
// ;
// global restriction = function | value ;
// regular restriction = comparable, comparison, argument ;
// value = unquoted text | quoted string ;
func parsePrimitive(lex *lexer) (e Expr, err error) {
fn := trace("Primitive")
defer func() { fn(e, err) }()
tok := lex.nextToken()
if tok.kind == tokenWSC {
tok = lex.nextToken()
}
if tok.kind == tokenLparen {
return parseParenthesizedExpression(lex)
}
left, err := parseComparable(lex, tok)
if err != nil {
return nil, err
}
tok = lex.nextToken()
if tok.kind == tokenWSC {
tok = lex.nextToken()
}
switch tok.kind {
case tokenLessThanEquals, tokenLessThan,
tokenGreaterThanEquals, tokenGreaterThan,
tokenNotEquals, tokenEquals,
tokenHas,
tokenMatchesRegexp, tokenNotMatchesRegexp:
ntok := lex.nextToken()
if ntok.kind != tokenWSC {
lex.pushToken(ntok)
}
right, err := parseArg(lex)
if err != nil {
return nil, err
}
switch tok.kind {
case tokenMatchesRegexp, tokenNotMatchesRegexp:
// Verify that the right hand side
// is a valid regular expression
// (or a combination of regular expressions,
// as in "x*" OR "y*").
var walk func(Expr) error
walk = func(e Expr) error {
switch e := e.(type) {
case *binaryExpr:
if err := walk(e.left); err != nil {
return err
}
return walk(e.right)
case *unaryExpr:
return walk(e.expr)
case *nameExpr:
if !e.isString {
return fmt.Errorf("%s: regular expression is not a quoted string", e.pos)
}
if _, err := syntax.Parse(e.name, syntax.Perl); err != nil {
return fmt.Errorf("%s: invalid regular expression: %v", e.pos, err)
}
return nil
case *comparisonExpr:
return fmt.Errorf("%s: regular expression is not a quoted string", e.pos)
case *memberExpr:
return fmt.Errorf("%s: regular expression is not a quoted string", e.pos)
case *functionExpr:
return fmt.Errorf("%s: regular expression is not a quoted string", ntok.pos)
default:
panic("can't happen")
}
}
if err := walk(right); err != nil {
return nil, err
}
}
ret := &comparisonExpr{
op: tok.kind,
left: left,
right: right,
pos: tok.pos,
}
return ret, nil
default:
lex.pushToken(tok)
return left, nil
}
}
// parseParenthesizedExpressions parses parentheses.
// We've already seen the "(".
// parenthesized expression = "(", expression, ")" ;
func parseParenthesizedExpression(lex *lexer) (e Expr, err error) {
fn := trace("ParenthesizedExpression")
defer func() { fn(e, err) }()
tok := lex.nextToken()
if tok.kind != tokenWSC {
lex.pushToken(tok)
}
ret, err := parseExpression(lex)
if err != nil {
return nil, err
}
// For some reason strings in composite expressions are handled
// differently from plain strings.
var markComposite func(Expr)
markComposite = func(e Expr) {
switch e := e.(type) {
case *binaryExpr:
markComposite(e.left)
markComposite(e.right)
case *unaryExpr:
markComposite(e.expr)
case *comparisonExpr:
markComposite(e.left)
markComposite(e.right)
case *nameExpr:
e.isComposite = true
case *memberExpr:
markComposite(e.holder)
case *functionExpr:
default:
panic("can't happen")
}
}
markComposite(ret)
tok = lex.nextToken()
if tok.kind != tokenRparen {
lex.pushToken(tok)
return nil, fmt.Errorf("%s: expected right parenthesis after expression", tok.pos)
}
return ret, nil
}
// parseComparable parses a value that may be compared.
// comparable = member | function ;
func parseComparable(lex *lexer, tok token) (e Expr, err error) {
fn := trace("Comparable")
defer func() { fn(e, err) }()
// The implemented parser doesn't quite match the documented parser.
// The implemented parser permits using AND/OR/NOT as a comparable
// if they are followed and/or preceded by DOT.
sawWhite := false
ntok := lex.nextToken()
if ntok.kind == tokenWSC {
sawWhite = true
ntok = lex.nextToken()
}
lex.pushToken(ntok)
switch tok.kind {
case tokenString, tokenText:
case tokenAnd, tokenOr, tokenNot:
if ntok.kind != tokenDot {
return nil, fmt.Errorf("%s: misplaced AND/OR/NOT", tok.pos)
}
tok = keywordToText(tok)
default:
return nil, fmt.Errorf("%s: expected identifier or value", tok.pos)
}
switch ntok.kind {
case tokenDot:
lex.nextToken()
e, err := parseMember(lex, tok, ntok.pos)
if err != nil {
return nil, err
}
// Using a member as a function call doesn't seem to be
// in the grammar, but the test cases accept it.
ntok := lex.nextToken()
if ntok.kind != tokenLparen || sawWhite {
lex.pushToken(ntok)
return e, nil
}
return parseFunction(lex, e)
case tokenLparen:
fn := &nameExpr{
name: tok.val,
pos: tok.pos,
isString: false,
}
if sawWhite || len(tok.val) == 0 || isDigit(rune(tok.val[0])) {
return fn, nil
}
lex.nextToken()
return parseFunction(lex, fn)
default:
ret := &nameExpr{
name: tok.val,
pos: tok.pos,
isString: tok.kind == tokenString,
}
return ret, nil
}
}
// parseMember parses a member reference.
// We have already seen the first identifier, in tok, and a ".".
// member = identifier, { ".", identifier } ;
// identifier = unquoted text ;
func parseMember(lex *lexer, tok token, pos position) (e Expr, err error) {
fn := trace("Member")
defer func() { fn(e, err) }()
var ret Expr
ret = &nameExpr{
name: tok.val,
pos: tok.pos,
isString: false,
}
for {
ntok := lex.nextToken()
switch ntok.kind {
case tokenString, tokenText:
case tokenAnd, tokenOr, tokenNot:
ntok = keywordToText(ntok)
default:
return nil, fmt.Errorf("%s: expected identifier", ntok.pos)
}
ret = &memberExpr{
holder: ret,
member: ntok.val,
pos: pos,
}
ntok = lex.nextToken()
if ntok.kind != tokenDot {
lex.pushToken(ntok)
return ret, nil
}
}
}
// parseFunction parses a function call.
// We have already seen the function name, in tok, and a "(".
// function = identifier, "(", [ argument, { ",", argument } ], ")" ;
func parseFunction(lex *lexer, fn Expr) (e Expr, err error) {
tfn := trace("Function")
defer func() { tfn(e, err) }()
tok := lex.nextToken()
if tok.kind == tokenRparen {
ret := &functionExpr{
fn: fn,
args: nil,
}
return ret, nil
}
lex.pushToken(tok)
var args []Expr
for {
arg, err := parseArg(lex)
if err != nil {
return nil, err
}
args = append(args, arg)
tok := lex.nextToken()
if tok.kind == tokenRparen {
ret := &functionExpr{
fn: fn,
args: args,
}
return ret, nil
}
if tok.kind == tokenWSC {
tok = lex.nextToken()
}
if tok.kind != tokenComma {
return nil, fmt.Errorf("%s: expected comma after function argument", tok.pos)
}
tok = lex.nextToken()
if tok.kind != tokenWSC {
lex.pushToken(tok)
}
}
}
// parseArg parses an argument.
//
// argument
// = comparable
// | value
// | parenthesized expression
// ;
func parseArg(lex *lexer) (e Expr, err error) {
fn := trace("Arg")
defer func() { fn(e, err) }()
tok := lex.nextToken()
if tok.kind == tokenLparen {
return parseParenthesizedExpression(lex)
}
return parseComparable(lex, tok)
}
// keywordToText turns a keyword token into a text token,
// for cases where a keyword is permitted.
func keywordToText(tok token) token {
switch tok.kind {
case tokenAnd:
return token{kind: tokenText, val: "AND"}
case tokenOr:
return token{kind: tokenText, val: "OR"}
case tokenNot:
return token{kind: tokenText, val: "NOT"}
default:
panic("can't happen")
}
}
// traceIndent is how far to indent the trace.
var traceIndent int
// trace emits a parse trace. It returns a function to defer.
func trace(fn string) func(Expr, error) {
if parseTrace {
fmt.Printf("%*s%s\n", traceIndent, "", fn)
traceIndent++
return func(e Expr, err error) {
traceIndent--
fmt.Printf("%*s%s returning ", traceIndent, "", fn)
if e == nil && err == nil {
fmt.Printf("nil, nil\n")
} else if e == nil {
fmt.Printf("error %v\n", err)
} else if err == nil {
fmt.Printf("%s", e)
} else {
fmt.Printf("error %v %s", err, e)
}
}
}
return func(Expr, error) {}
}