blob: 499f2e716b669b1689742b228c02163320df02ec [file] [log] [blame]
// Copyright 2022 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package script implements a simple classification scripting language.
// A script is a sequence of rules of the form “action <- pattern”,
// meaning send results matching pattern to the named action.
package script
import (
"fmt"
"regexp"
"strconv"
"strings"
"unicode/utf8"
)
// A Script is a sequence of Action <- Pattern rules.
// Rules are consulted in order; the first matching rule wins (see Action).
type Script struct {
	File  string  // file the script was parsed from, used for error reporting
	Rules []*Rule // rules, in the order they appear in the script
}
// A Rule is a single Action <- Pattern rule.
type Rule struct {
	Action  string // action name, such as "skip" or "post"
	Pattern Expr   // pattern expression; the rule fires for records matching it
}
// Action returns the action specified by the script for the given record:
// the Action of the first rule whose pattern matches the record,
// or the empty string if no rule matches.
func (s *Script) Action(record Record) string {
	for i := range s.Rules {
		if rule := s.Rules[i]; rule.Pattern.Match(record) {
			return rule.Action
		}
	}
	return ""
}
// A Record is a set of key:value pairs.
// Pattern expressions compare and match against the values by key (field) name;
// looking up a missing field yields the empty string.
type Record map[string]string
// An Expr is a pattern expression that can evaluate itself on a Record.
// The underlying concrete type is *CmpExpr, *AndExpr, *OrExpr, *NotExpr, or *RegExpr.
// All implementations in this file are read-only: Match never modifies the record.
type Expr interface {
	// String returns the syntax for the pattern.
	String() string
	// Match reports whether the pattern matches the record.
	Match(record Record) bool
}
// A CmpExpr is an Expr for a string comparison: Field Op Literal.
type CmpExpr struct {
	Field   string // record field whose value is compared; "" for a bare literal pattern
	Op      string // comparison operator: "==", "!=", "<", "<=", ">", ">="
	Literal string // literal string to compare the field value against
}
// Match reports whether the record satisfies the comparison.
// It looks up x.Field in the record (missing fields read as "")
// and compares the value against x.Literal with x.Op using Go's
// ordinary string ordering. An unrecognized operator matches nothing.
func (x *CmpExpr) Match(record Record) bool {
	val, lit := record[x.Field], x.Literal
	switch x.Op {
	case "==":
		return val == lit
	case "!=":
		return val != lit
	case "<":
		return val < lit
	case "<=":
		return val <= lit
	case ">":
		return val > lit
	case ">=":
		return val >= lit
	default:
		return false
	}
}
// String returns the syntax for the comparison, with the literal quoted.
// An empty Field denotes a bare literal pattern and prints as the quoted
// literal alone (the operator is omitted).
func (x *CmpExpr) String() string {
	lit := strconv.Quote(x.Literal)
	if x.Field == "" {
		return lit
	}
	return fmt.Sprintf("%s %s %s", x.Field, x.Op, lit)
}
func cmp(field, op, literal string) Expr { return &CmpExpr{field, op, literal} }
// A RegExpr is an Expr for a regular expression test: Field ~ `re` or Field !~ `re`.
type RegExpr struct {
	Field  string         // record field to match; "" for a bare regexp pattern
	Not    bool           // negate the result (the !~ form)
	Regexp *regexp.Regexp // compiled regexp (compiled with (?m) by regcomp)
}
// Match reports whether the record's field matches the regexp,
// inverted when x.Not is set. The boolean inequality below is
// exactly "match XOR negate".
func (x *RegExpr) Match(record Record) bool {
	return x.Regexp.MatchString(record[x.Field]) != x.Not
}
// String returns the syntax for the regexp test. The pattern is wrapped
// in backquotes with any embedded backquote rewritten as \x60 (backquoted
// tokens cannot contain a literal backquote). An empty Field denotes a
// bare regexp pattern and prints as the backquoted pattern alone.
func (x *RegExpr) String() string {
	quoted := "`" + strings.ReplaceAll(x.Regexp.String(), "`", `\x60`) + "`"
	if x.Field == "" {
		return quoted
	}
	if x.Not {
		return x.Field + " !~ " + quoted
	}
	return x.Field + " ~ " + quoted
}
func regx(field string, not bool, re *regexp.Regexp) Expr { return &RegExpr{field, not, re} }
// regcomp compiles the pattern s in multi-line mode, so that ^ and $
// match at each line boundary within a matched value rather than only
// at the start and end of the whole string.
func regcomp(s string) (*regexp.Regexp, error) {
	const flags = "(?m)"
	return regexp.Compile(flags + s)
}
// A NotExpr represents the expression !X (the negation of X).
type NotExpr struct {
	X Expr // the expression being negated
}
// Match reports whether the negated expression does NOT match the record.
func (x *NotExpr) Match(record Record) bool {
	if x.X.Match(record) {
		return false
	}
	return true
}
// String returns the syntax for the negation, always parenthesizing
// the operand so the printed form re-parses unambiguously.
func (x *NotExpr) String() string {
	return fmt.Sprintf("!(%s)", x.X)
}
func not(x Expr) Expr { return &NotExpr{x} }
// An AndExpr represents the expression X && Y.
type AndExpr struct {
	X, Y Expr // left and right operands
}
// Match reports whether both operands match the record.
// Like Go's &&, Y is not evaluated when X fails to match.
func (x *AndExpr) Match(record Record) bool {
	if !x.X.Match(record) {
		return false
	}
	return x.Y.Match(record)
}
// String returns the syntax for the conjunction, parenthesizing
// operands as needed (see andArg).
func (x *AndExpr) String() string {
	return fmt.Sprintf("%s && %s", andArg(x.X), andArg(x.Y))
}
// andArg returns the syntax for x as an operand of &&.
// An || expression binds less tightly than &&, so it is wrapped
// in parentheses; every other operand prints as-is.
func andArg(x Expr) string {
	if _, isOr := x.(*OrExpr); isOr {
		return "(" + x.String() + ")"
	}
	return x.String()
}
// and returns the expression x && y.
func and(x, y Expr) Expr {
	return &AndExpr{X: x, Y: y}
}
// An OrExpr represents the expression X || Y.
type OrExpr struct {
	X, Y Expr // left and right operands
}
// Match reports whether either operand matches the record.
// Like Go's ||, Y is not evaluated when X already matches.
func (x *OrExpr) Match(record Record) bool {
	if x.X.Match(record) {
		return true
	}
	return x.Y.Match(record)
}
// String returns the syntax for the disjunction, parenthesizing
// operands as needed (see orArg).
func (x *OrExpr) String() string {
	return fmt.Sprintf("%s || %s", orArg(x.X), orArg(x.Y))
}
// orArg returns the syntax for x as an operand of ||.
// An && operand is wrapped in parentheses when printed inside ||
// (not required by precedence, but it keeps the output unambiguous
// to human readers); every other operand prints as-is.
func orArg(x Expr) string {
	if _, isAnd := x.(*AndExpr); isAnd {
		return "(" + x.String() + ")"
	}
	return x.String()
}
// or returns the expression x || y.
func or(x, y Expr) Expr {
	return &OrExpr{X: x, Y: y}
}
// A SyntaxError reports a syntax error in a parsed match expression.
type SyntaxError struct {
	File   string // input file
	Line   int    // line number where error was detected (1-indexed)
	Offset int    // byte offset in line where error was detected (1-indexed)
	Err    string // description of error
}

// Error formats the error as "file:line: msg", or "file:line.offset: msg"
// when a column offset is recorded.
func (e *SyntaxError) Error() string {
	pos := fmt.Sprintf("%s:%d", e.File, e.Line)
	if e.Offset != 0 {
		pos += fmt.Sprintf(".%d", e.Offset)
	}
	return pos + ": " + e.Err
}
// A parser holds state for parsing a classification script.
// (The previous comment said "build expression" — presumably a leftover
// from the code this parser was adapted from.)
type parser struct {
	file   string          // input file name, used in error messages
	s      string          // input string
	i      int             // next read location in s
	fields map[string]bool // known input fields for comparisons; others are parse errors
	tok    string          // last token read; "`", "\"", "a" for backquoted regexp, literal string, identifier
	lit    string          // text of backquoted regexp, literal string, or identifier
	pos    int             // position (start) of last token in s
}
// Parse parses text as a script,
// returning the parsed form and any parse errors found.
// (The parser attempts to recover after parse errors by starting over
// at the next newline, so multiple parse errors are possible.)
// The file argument is used for reporting the file name in errors
// and in the Script's File field;
// Parse does not read from the file itself.
func Parse(file, text string, fields []string) (*Script, []*SyntaxError) {
	known := make(map[string]bool, len(fields))
	for _, f := range fields {
		known[f] = true
	}
	p := &parser{file: file, s: text, fields: known}

	script := &Script{File: file}
	var errs []*SyntaxError
	for {
		r, err := p.parseRule()
		if err != nil {
			// parseRule guarantees *SyntaxError (see its doc comment).
			// Record it and resynchronize at the next newline.
			errs = append(errs, err.(*SyntaxError))
			nl := strings.IndexByte(p.s[p.i:], '\n')
			if nl < 0 {
				break
			}
			p.i += nl + 1
			continue
		}
		if r == nil {
			break // end of script
		}
		script.Rules = append(script.Rules, r)
	}
	return script, errs
}
// parseRule parses a single rule from a script.
// On entry, the next input token has not been lexed.
// On exit, the next input token has been lexed and is in p.tok.
// If there is an error, it is guaranteed to be a *SyntaxError.
// parseRule returns nil, nil at end of file.
func (p *parser) parseRule() (x *Rule, err error) {
	// The lexer and parser report problems by panicking with a
	// *SyntaxError; convert that panic back into a returned error.
	defer func() {
		switch e := recover().(type) {
		case nil:
			// normal return, nothing to do
		case *SyntaxError:
			err = e
		default:
			panic(e) // unreachable unless parser has a bug
		}
	}()
	x = p.rule()
	if p.tok != "" && p.tok != "\n" {
		p.unexpected()
	}
	return x, nil
}
// unexpected panics with a parse error describing the current token.
// Literal-carrying tokens are described with their text; line and
// script ends get readable names; operators print as themselves.
func (p *parser) unexpected() {
	var what string
	switch p.tok {
	case "a":
		what = "identifier " + p.lit
	case "\"":
		what = "quoted string " + p.lit
	case "`":
		what = "backquoted string " + p.lit
	case "\n":
		what = "end of line"
	case "":
		what = "end of script"
	default:
		what = p.tok
	}
	p.parseError("unexpected " + what)
}
// rule parses a single rule.
// On entry, the next input token has not yet been lexed.
// On exit, the next input token has been lexed and is in p.tok.
// If there is no next rule (the script has been read in its entirety), rule returns nil.
func (p *parser) rule() *Rule {
	// Skip blank lines preceding the rule.
	for p.lex(); p.tok == "\n"; p.lex() {
	}
	if p.tok == "" {
		return nil // end of script
	}
	if p.tok != "a" {
		p.unexpected() // rule must start with an action identifier
	}
	r := &Rule{Action: p.lit}
	p.lex()
	if p.tok != "<-" {
		p.unexpected()
	}
	r.Pattern = p.or()
	return r
}
// or parses a sequence of || expressions.
// On entry, the next input token has not yet been lexed.
// On exit, the next input token has been lexed and is in p.tok.
func (p *parser) or() Expr {
	// || is left-associative: fold each new operand into the left side.
	left := p.and()
	for p.tok == "||" {
		left = &OrExpr{X: left, Y: p.and()}
	}
	return left
}
// and parses a sequence of && expressions.
// On entry, the next input token has not yet been lexed.
// On exit, the next input token has been lexed and is in p.tok.
func (p *parser) and() Expr {
	// && is left-associative: fold each new operand into the left side.
	left := p.cmp()
	for p.tok == "&&" {
		left = &AndExpr{X: left, Y: p.cmp()}
	}
	return left
}
// cmp parses a comparison expression or atom.
// On entry, the next input token has not been lexed.
// On exit, the next input token has been lexed and is in p.tok.
func (p *parser) cmp() Expr {
	p.lex()
	switch p.tok {
	default:
		p.unexpected()
	case "!":
		// Negation applies to a following atom, not to a full comparison.
		p.lex()
		return not(p.atom())
	case "(", "\"", "`":
		// Parenthesized expression or bare literal/regexp pattern;
		// atom takes over with the token already in p.tok.
		return p.atom()
	case "a":
		// comparison: field op literal
		field := p.lit
		if !p.fields[field] {
			p.parseError("unknown field " + field)
		}
		p.lex()
		switch p.tok {
		default:
			p.unexpected()
		case "==", "!=", "<", "<=", ">", ">=":
			// String comparison: the right-hand side must be a quoted string.
			op := p.tok
			p.lex()
			if p.tok != "\"" {
				p.parseError(op + " requires quoted string")
			}
			s := p.lit
			p.lex() // advance past the string per the exit contract
			return cmp(field, op, s)
		case "~", "!~":
			// Regexp match: the right-hand side must be a backquoted regexp.
			op := p.tok
			p.lex()
			if p.tok != "`" {
				p.parseError(op + " requires backquoted regexp")
			}
			re, err := regcomp(p.lit)
			if err != nil {
				p.parseError("invalid regexp: " + err.Error())
			}
			p.lex() // advance past the regexp per the exit contract
			return regx(field, op == "!~", re)
		}
	}
	panic("unreachable") // p.unexpected and p.parseError panic
}
// atom parses a regexp or string comparison or a parenthesized expression.
// On entry, the next input token HAS been lexed.
// On exit, the next input token has been lexed and is in p.tok.
func (p *parser) atom() Expr {
	// first token already in p.tok
	switch p.tok {
	default:
		p.unexpected()

	case "(":
		// Parenthesized subexpression. Rewrite "unexpected end of
		// expression" panics raised inside the parens into the more
		// helpful "missing close paren", then re-panic.
		defer func() {
			if e := recover(); e != nil {
				if e, ok := e.(*SyntaxError); ok && e.Err == "unexpected end of expression" {
					e.Err = "missing close paren"
				}
				panic(e)
			}
		}()
		x := p.or()
		if p.tok != ")" {
			p.parseError("missing close paren")
		}
		p.lex()
		return x

	case "\"":
		// Bare quoted string: an equality comparison against the
		// record's default field "". cmp forwards this token here,
		// and CmpExpr.String prints a field-less comparison as just
		// the quoted literal, but this case was missing, so every
		// bare string pattern failed with "unexpected quoted string".
		s := p.lit
		p.lex()
		return cmp("", "==", s)

	case "`":
		// Bare backquoted regexp: matched against the record's
		// default field "".
		re, err := regcomp(p.lit)
		if err != nil {
			p.parseError("invalid regexp: " + err.Error())
		}
		p.lex()
		return regx("", false, re)
	}
	panic("unreachable") // every case returns or panics
}
// lex finds and consumes the next token in the input stream.
// On return, p.tok is set to the token text
// and p.pos records the byte offset of the start of the token in the input stream.
// If lex reaches the end of the input, p.tok is set to the empty string.
// For any other syntax error, lex panics with a SyntaxError.
func (p *parser) lex() {
Top:
	// Skip spaces and tabs; newlines are significant and handled below.
	for p.i < len(p.s) && (p.s[p.i] == ' ' || p.s[p.i] == '\t') {
		p.i++
	}
	if p.i >= len(p.s) {
		p.tok = ""
		p.pos = p.i
		return
	}
	switch p.s[p.i] {
	case '#':
		// line comment: discard through end of line and rescan.
		for p.i < len(p.s) && p.s[p.i] != '\n' {
			p.i++
		}
		goto Top
	case '\n':
		// Like in Go, not a line ending if it follows a continuation token:
		// any operator that still needs a right-hand operand. The list now
		// includes <, <=, >, >=, which were previously missing, making those
		// comparisons inconsistent with == and != at line ends.
		switch p.tok {
		case "(", "&&", "||", "==", "!=", "<", "<=", ">", ">=", "~", "!~", "!", "<-":
			p.i++
			goto Top
		}
		p.pos = p.i
		p.i++
		p.tok = p.s[p.pos:p.i]
		return
	case '<': // < <- <=
		p.pos = p.i
		p.i++
		if p.i < len(p.s) && (p.s[p.i] == '-' || p.s[p.i] == '=') {
			p.i++
		}
		p.tok = p.s[p.pos:p.i]
		return
	case '!', '>': // ! != > >=
		p.pos = p.i
		p.i++
		if p.i < len(p.s) && p.s[p.i] == '=' {
			p.i++
		}
		p.tok = p.s[p.pos:p.i]
		return
	case '(', ')', '~': // ( ) ~
		p.pos = p.i
		p.i++
		p.tok = p.s[p.pos:p.i]
		return
	case '&', '|', '=': // && || ==
		// These characters are valid only doubled.
		if p.i+1 >= len(p.s) || p.s[p.i+1] != p.s[p.i] {
			p.lexError("invalid syntax at " + string(rune(p.s[p.i])))
		}
		p.pos = p.i
		p.i += 2
		p.tok = p.s[p.pos:p.i]
		return
	case '`':
		// Backquoted regexp: no escape sequences, must stay on one line.
		j := p.i + 1
		for j < len(p.s) && p.s[j] != '`' {
			if p.s[j] == '\n' {
				p.lexError("newline in backquoted regexp")
			}
			j++
		}
		if j >= len(p.s) {
			p.lexError("unterminated backquoted regexp")
		}
		p.pos = p.i
		p.i = j + 1
		p.tok = "`"
		p.lit = p.s[p.pos+1 : j] // literal text excludes the backquotes
		return
	case '"':
		// Quoted string with Go escape sequences, decoded by strconv.Unquote.
		j := p.i + 1
		for j < len(p.s) && p.s[j] != '"' {
			if p.s[j] == '\n' {
				p.lexError("newline in quoted string")
			}
			if p.s[j] == '\\' {
				j++ // skip escaped character (possibly an escaped quote)
			}
			j++
		}
		if j >= len(p.s) {
			p.lexError("unterminated quoted string")
		}
		s, err := strconv.Unquote(p.s[p.i : j+1])
		if err != nil {
			p.lexError("invalid quoted string: " + err.Error())
		}
		p.pos = p.i
		p.i = j + 1
		p.tok = "\""
		p.lit = s
		return
	case '\'':
		p.lexError("single-quoted strings not allowed")
	}
	// ASCII identifier (action or field name).
	if isalpha(p.s[p.i]) {
		j := p.i
		for j < len(p.s) && isalnum(p.s[j]) {
			j++
		}
		p.pos = p.i
		p.i = j
		p.tok = "a"
		p.lit = p.s[p.pos:p.i]
		return
	}
	c, _ := utf8.DecodeRuneInString(p.s[p.i:])
	p.lexError(fmt.Sprintf("invalid syntax at %q (U+%04x)", c, c))
}
// lexError reports a lex error with the given error text.
// The error is attributed to the lexer's current read position p.i
// (the character being examined), unlike parseError, which points
// at the start of the most recently lexed token.
func (p *parser) lexError(err string) {
	p.errorAt(p.i, err)
}
// parseError reports a parse error with the given error text.
// (A parse error differs from a lex error in which parser position
// the error is attributed to: parseError uses p.pos, the start of
// the offending token, rather than the lexer's read position p.i.)
func (p *parser) parseError(err string) {
	p.errorAt(p.pos, err)
}
// errorAt panics with a *SyntaxError describing err at byte offset pos
// of the input, converting pos into a 1-indexed line number and a
// 1-indexed byte offset within that line.
func (p *parser) errorAt(pos int, err string) {
	prefix := p.s[:pos]
	line := 1 + strings.Count(prefix, "\n")
	// LastIndex returns -1 when pos is on the first line, so the
	// subtraction yields a 1-indexed column in every case.
	col := pos - strings.LastIndex(prefix, "\n")
	panic(&SyntaxError{File: p.file, Line: line, Offset: col, Err: err})
}
// isalpha reports whether c is an ASCII alphabetic or _.
func isalpha(c byte) bool {
	switch {
	case c == '_':
		return true
	case 'A' <= c && c <= 'Z', 'a' <= c && c <= 'z':
		return true
	}
	return false
}
// isalnum reports whether c is an ASCII alphanumeric or _.
func isalnum(c byte) bool {
	switch {
	case '0' <= c && c <= '9', 'A' <= c && c <= 'Z', 'a' <= c && c <= 'z', c == '_':
		return true
	}
	return false
}