| // Copyright 2022 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| // Package script implements a simple classification scripting language. |
| // A script is a sequence of rules of the form “action <- pattern”, |
| // meaning send results matching pattern to the named action. |
| package script |
| |
| import ( |
| "fmt" |
| "regexp" |
| "strconv" |
| "strings" |
| "unicode/utf8" |
| ) |
| |
| // A Script is a sequence of Action <- Pattern rules. |
| type Script struct { |
| File string |
| Rules []*Rule |
| } |
| |
| // A Rule is a single Action <- Pattern rule. |
| type Rule struct { |
| Action string // "skip", "post", and so on |
| Pattern Expr // pattern expression |
| } |
| |
| // Action returns the action specified by the script for the given record. |
| func (s *Script) Action(record Record) string { |
| for _, r := range s.Rules { |
| if r.Pattern.Match(record) { |
| return r.Action |
| } |
| } |
| return "" |
| } |
| |
type (
	// A Record is a set of key:value pairs that patterns are matched against.
	Record map[string]string

	// An Expr is a pattern expression that can evaluate itself on a Record.
	// The underlying concrete type is *CmpExpr, *AndExpr, *OrExpr, *NotExpr, or *RegExpr.
	Expr interface {
		// Match reports whether the pattern matches the record.
		Match(record Record) bool

		// String returns the syntax for the pattern.
		String() string
	}
)
| |
| // A CmpExpr is an Expr for a string comparison. |
| type CmpExpr struct { |
| Field string |
| Op string |
| Literal string |
| } |
| |
| func (x *CmpExpr) Match(record Record) bool { |
| f := record[x.Field] |
| l := x.Literal |
| switch x.Op { |
| case "==": |
| return f == l |
| case "!=": |
| return f != l |
| case "<": |
| return f < l |
| case "<=": |
| return f <= l |
| case ">": |
| return f > l |
| case ">=": |
| return f >= l |
| } |
| return false |
| } |
| |
| func (x *CmpExpr) String() string { |
| s := strconv.Quote(x.Literal) |
| if x.Field == "" { |
| return s |
| } |
| return x.Field + " " + x.Op + " " + s |
| } |
| |
| func cmp(field, op, literal string) Expr { return &CmpExpr{field, op, literal} } |
| |
| // A RegExpr is an Expr for a regular expression test. |
| type RegExpr struct { |
| Field string |
| Not bool |
| Regexp *regexp.Regexp |
| } |
| |
| func (x *RegExpr) Match(record Record) bool { |
| ok := x.Regexp.MatchString(record[x.Field]) |
| if x.Not { |
| return !ok |
| } |
| return ok |
| } |
| |
| func (x *RegExpr) String() string { |
| s := x.Regexp.String() |
| s = "`" + strings.ReplaceAll(s, "`", `\x60`) + "`" |
| if x.Field == "" { |
| return s |
| } |
| op := " ~ " |
| if x.Not { |
| op = " !~ " |
| } |
| return x.Field + op + s |
| } |
| |
| func regx(field string, not bool, re *regexp.Regexp) Expr { return &RegExpr{field, not, re} } |
| func regcomp(s string) (*regexp.Regexp, error) { |
| return regexp.Compile("(?m)" + s) |
| } |
| |
| // A NotExpr represents the expression !X (the negation of X). |
| type NotExpr struct { |
| X Expr |
| } |
| |
| func (x *NotExpr) Match(record Record) bool { |
| return !x.X.Match(record) |
| } |
| |
| func (x *NotExpr) String() string { |
| return "!(" + x.X.String() + ")" |
| } |
| |
| func not(x Expr) Expr { return &NotExpr{x} } |
| |
| // An AndExpr represents the expression X && Y. |
| type AndExpr struct { |
| X, Y Expr |
| } |
| |
| func (x *AndExpr) Match(record Record) bool { |
| return x.X.Match(record) && x.Y.Match(record) |
| } |
| |
| func (x *AndExpr) String() string { |
| return andArg(x.X) + " && " + andArg(x.Y) |
| } |
| |
| func andArg(x Expr) string { |
| s := x.String() |
| if _, ok := x.(*OrExpr); ok { |
| s = "(" + s + ")" |
| } |
| return s |
| } |
| |
| func and(x, y Expr) Expr { |
| return &AndExpr{x, y} |
| } |
| |
| // An OrExpr represents the expression X || Y. |
| type OrExpr struct { |
| X, Y Expr |
| } |
| |
| func (x *OrExpr) Match(record Record) bool { |
| return x.X.Match(record) || x.Y.Match(record) |
| } |
| |
| func (x *OrExpr) String() string { |
| return orArg(x.X) + " || " + orArg(x.Y) |
| } |
| |
| func orArg(x Expr) string { |
| s := x.String() |
| if _, ok := x.(*AndExpr); ok { |
| s = "(" + s + ")" |
| } |
| return s |
| } |
| |
| func or(x, y Expr) Expr { |
| return &OrExpr{x, y} |
| } |
| |
// A SyntaxError reports a syntax error found while parsing a script.
type SyntaxError struct {
	File   string // input file
	Line   int    // line number where error was detected (1-indexed)
	Offset int    // byte offset in line where error was detected (1-indexed)
	Err    string // description of error
}

// Error formats e as "file:line.offset: description".
// The ".offset" part is left out when Offset is zero.
func (e *SyntaxError) Error() string {
	where := fmt.Sprintf("%s:%d", e.File, e.Line)
	if e.Offset != 0 {
		where += fmt.Sprintf(".%d", e.Offset)
	}
	return where + ": " + e.Err
}
| |
// A parser holds the state for parsing one script.
type parser struct {
	file   string          // input file name, used only in error reports
	s      string          // complete input text
	i      int             // byte offset in s of the next character to read
	fields map[string]bool // field names that may be used in comparisons

	// State of the most recently lexed token.
	tok string // token text; "`", "\"", "a" stand for backquoted regexp, literal string, identifier
	lit string // text of backquoted regexp, literal string, or identifier
	pos int    // byte offset in s where the token starts
}
| |
| // Parse parses text as a script, |
| // returning the parsed form and any parse errors found. |
| // (The parser attempts to recover after parse errors by starting over |
| // at the next newline, so multiple parse errors are possible.) |
| // The file argument is used for reporting the file name in errors |
| // and in the Script's File field; |
| // Parse does not read from the file itself. |
| func Parse(file, text string, fields []string) (*Script, []*SyntaxError) { |
| p := &parser{ |
| file: file, |
| s: text, |
| } |
| p.fields = make(map[string]bool) |
| for _, f := range fields { |
| p.fields[f] = true |
| } |
| var s Script |
| s.File = file |
| var errs []*SyntaxError |
| for { |
| r, err := p.parseRule() |
| if err != nil { |
| errs = append(errs, err.(*SyntaxError)) |
| i := strings.Index(p.s[p.i:], "\n") |
| if i < 0 { |
| break |
| } |
| p.i += i + 1 |
| continue |
| } |
| if r == nil { |
| break |
| } |
| s.Rules = append(s.Rules, r) |
| } |
| return &s, errs |
| } |
| |
| // parseRule parses a single rule from a script. |
| // On entry, the next input token has not been lexed. |
| // On exit, the next input token has been lexed and is in p.tok. |
| // If there is an error, it is guaranteed to be a *SyntaxError. |
| // parseRule returns nil, nil at end of file. |
| func (p *parser) parseRule() (x *Rule, err error) { |
| defer func() { |
| if e := recover(); e != nil { |
| if e, ok := e.(*SyntaxError); ok { |
| err = e |
| return |
| } |
| panic(e) // unreachable unless parser has a bug |
| } |
| }() |
| |
| x = p.rule() |
| if p.tok != "" && p.tok != "\n" { |
| p.unexpected() |
| } |
| return x, nil |
| } |
| |
| // unexpected reports a parse error due to an unexpected token |
| func (p *parser) unexpected() { |
| what := p.tok |
| switch what { |
| case "a": |
| what = "identifier " + p.lit |
| case "\"": |
| what = "quoted string " + p.lit |
| case "`": |
| what = "backquoted string " + p.lit |
| case "\n": |
| what = "end of line" |
| case "": |
| what = "end of script" |
| } |
| p.parseError("unexpected " + what) |
| } |
| |
| // rule parses a single rule. |
| // On entry, the next input token has not yet been lexed. |
| // On exit, the next input token has been lexed and is in p.tok. |
| // If there is no next rule (the script has been read in its entirety), rule returns nil. |
| func (p *parser) rule() *Rule { |
| p.lex() |
| for p.tok == "\n" { |
| p.lex() |
| } |
| if p.tok == "" { |
| return nil |
| } |
| if p.tok != "a" { |
| p.unexpected() |
| } |
| action := p.lit |
| p.lex() |
| if p.tok != "<-" { |
| p.unexpected() |
| } |
| return &Rule{Action: action, Pattern: p.or()} |
| } |
| |
| // or parses a sequence of || expressions. |
| // On entry, the next input token has not yet been lexed. |
| // On exit, the next input token has been lexed and is in p.tok. |
| func (p *parser) or() Expr { |
| x := p.and() |
| for p.tok == "||" { |
| x = or(x, p.and()) |
| } |
| return x |
| } |
| |
| // and parses a sequence of && expressions. |
| // On entry, the next input token has not yet been lexed. |
| // On exit, the next input token has been lexed and is in p.tok. |
| func (p *parser) and() Expr { |
| x := p.cmp() |
| for p.tok == "&&" { |
| x = and(x, p.cmp()) |
| } |
| return x |
| } |
| |
| // cmp parses a comparison expression or atom. |
| // On entry, the next input token has not been lexed. |
| // On exit, the next input token has been lexed and is in p.tok. |
| func (p *parser) cmp() Expr { |
| p.lex() |
| switch p.tok { |
| default: |
| p.unexpected() |
| case "!": |
| p.lex() |
| return not(p.atom()) |
| case "(", "\"", "`": |
| return p.atom() |
| case "a": |
| // comparison |
| field := p.lit |
| if !p.fields[field] { |
| p.parseError("unknown field " + field) |
| } |
| p.lex() |
| switch p.tok { |
| default: |
| p.unexpected() |
| case "==", "!=", "<", "<=", ">", ">=": |
| op := p.tok |
| p.lex() |
| if p.tok != "\"" { |
| p.parseError(op + " requires quoted string") |
| } |
| s := p.lit |
| p.lex() |
| return cmp(field, op, s) |
| case "~", "!~": |
| op := p.tok |
| p.lex() |
| if p.tok != "`" { |
| p.parseError(op + " requires backquoted regexp") |
| } |
| re, err := regcomp(p.lit) |
| if err != nil { |
| p.parseError("invalid regexp: " + err.Error()) |
| } |
| p.lex() |
| return regx(field, op == "!~", re) |
| } |
| } |
| panic("unreachable") |
| } |
| |
| // atom parses a regexp or string comparison or a parenthesized expression. |
| // On entry, the next input token HAS been lexed. |
| // On exit, the next input token has been lexed and is in p.tok. |
| func (p *parser) atom() Expr { |
| // first token already in p.tok |
| switch p.tok { |
| default: |
| p.unexpected() |
| |
| case "(": |
| defer func() { |
| if e := recover(); e != nil { |
| if e, ok := e.(*SyntaxError); ok && e.Err == "unexpected end of expression" { |
| e.Err = "missing close paren" |
| } |
| panic(e) |
| } |
| }() |
| x := p.or() |
| if p.tok != ")" { |
| p.parseError("missing close paren") |
| } |
| p.lex() |
| return x |
| |
| case "`": |
| re, err := regcomp(p.lit) |
| if err != nil { |
| p.parseError("invalid regexp: " + err.Error()) |
| } |
| p.lex() |
| return regx("", false, re) |
| } |
| panic("unreachable") |
| } |
| |
| // lex finds and consumes the next token in the input stream. |
| // On return, p.tok is set to the token text |
| // and p.pos records the byte offset of the start of the token in the input stream. |
| // If lex reaches the end of the input, p.tok is set to the empty string. |
| // For any other syntax error, lex panics with a SyntaxError. |
| func (p *parser) lex() { |
| Top: |
| for p.i < len(p.s) && (p.s[p.i] == ' ' || p.s[p.i] == '\t') { |
| p.i++ |
| } |
| if p.i >= len(p.s) { |
| p.tok = "" |
| p.pos = p.i |
| return |
| } |
| switch p.s[p.i] { |
| case '#': |
| // line comment |
| for p.i < len(p.s) && p.s[p.i] != '\n' { |
| p.i++ |
| } |
| goto Top |
| case '\n': |
| // like in Go, not a line ending if it follows a continuation token. |
| switch p.tok { |
| case "(", "&&", "||", "==", "!=", "~", "!~", "!", "<-": |
| p.i++ |
| goto Top |
| } |
| p.pos = p.i |
| p.i++ |
| p.tok = p.s[p.pos:p.i] |
| return |
| case '<': // <-, <= |
| p.pos = p.i |
| p.i++ |
| if p.i < len(p.s) && (p.s[p.i] == '-' || p.s[p.i] == '=') { |
| p.i++ |
| } |
| p.tok = p.s[p.pos:p.i] |
| return |
| case '!', '>': // ! != > >= |
| p.pos = p.i |
| p.i++ |
| if p.i < len(p.s) && p.s[p.i] == '=' { |
| p.i++ |
| } |
| p.tok = p.s[p.pos:p.i] |
| return |
| case '(', ')', '~': // ( ) ~ |
| p.pos = p.i |
| p.i++ |
| p.tok = p.s[p.pos:p.i] |
| return |
| case '&', '|', '=': // && || == |
| if p.i+1 >= len(p.s) || p.s[p.i+1] != p.s[p.i] { |
| p.lexError("invalid syntax at " + string(rune(p.s[p.i]))) |
| } |
| p.pos = p.i |
| p.i += 2 |
| p.tok = p.s[p.pos:p.i] |
| return |
| case '`': |
| j := p.i + 1 |
| for j < len(p.s) && p.s[j] != '`' { |
| if p.s[j] == '\n' { |
| p.lexError("newline in backquoted regexp") |
| } |
| j++ |
| } |
| if j >= len(p.s) { |
| p.lexError("unterminated backquoted regexp") |
| } |
| p.pos = p.i |
| p.i = j + 1 |
| p.tok = "`" |
| p.lit = p.s[p.pos+1 : j] |
| return |
| case '"': |
| j := p.i + 1 |
| for j < len(p.s) && p.s[j] != '"' { |
| if p.s[j] == '\n' { |
| p.lexError("newline in quoted string") |
| } |
| if p.s[j] == '\\' { |
| j++ |
| } |
| j++ |
| } |
| if j >= len(p.s) { |
| p.lexError("unterminated quoted string") |
| } |
| s, err := strconv.Unquote(p.s[p.i : j+1]) |
| if err != nil { |
| p.lexError("invalid quoted string: " + err.Error()) |
| } |
| p.pos = p.i |
| p.i = j + 1 |
| p.tok = "\"" |
| p.lit = s |
| return |
| case '\'': |
| p.lexError("single-quoted strings not allowed") |
| } |
| |
| // ascii name |
| if isalpha(p.s[p.i]) { |
| j := p.i |
| for j < len(p.s) && isalnum(p.s[j]) { |
| j++ |
| } |
| p.pos = p.i |
| p.i = j |
| p.tok = "a" |
| p.lit = p.s[p.pos:p.i] |
| return |
| } |
| |
| c, _ := utf8.DecodeRuneInString(p.s[p.i:]) |
| p.lexError(fmt.Sprintf("invalid syntax at %q (U+%04x)", c, c)) |
| } |
| |
| // lexError reports a lex error with the given error text. |
| func (p *parser) lexError(err string) { |
| p.errorAt(p.i, err) |
| } |
| |
| // parseError reports a parse error with the given error text. |
| // (A parse error differs from a lex error in which parser position |
| // the error is attributed to.) |
| func (p *parser) parseError(err string) { |
| p.errorAt(p.pos, err) |
| } |
| |
| // errorAt reports a syntax error at the given position. |
| func (p *parser) errorAt(pos int, err string) { |
| line := 1 + strings.Count(p.s[:pos], "\n") |
| i := pos - strings.LastIndex(p.s[:pos], "\n") |
| panic(&SyntaxError{File: p.file, Line: line, Offset: i, Err: err}) |
| } |
| |
// isalpha reports whether c is an ASCII letter or underscore.
func isalpha(c byte) bool {
	switch {
	case c >= 'a' && c <= 'z', c >= 'A' && c <= 'Z', c == '_':
		return true
	}
	return false
}

// isalnum reports whether c is an ASCII letter, ASCII digit, or underscore.
func isalnum(c byte) bool {
	return isalpha(c) || (c >= '0' && c <= '9')
}