cmd/watchflakes/internal/script: add script parser
Needs tests.
Change-Id: Ia970c247ab36239e6e164c708c55ba3119054d9a
Reviewed-on: https://go-review.googlesource.com/c/build/+/432402
Reviewed-by: Cherry Mui <cherryyz@google.com>
Auto-Submit: Russ Cox <rsc@golang.org>
TryBot-Result: Gopher Robot <gobot@golang.org>
Run-TryBot: Russ Cox <rsc@golang.org>
diff --git a/cmd/watchflakes/internal/script/script.go b/cmd/watchflakes/internal/script/script.go
new file mode 100644
index 0000000..499f2e7
--- /dev/null
+++ b/cmd/watchflakes/internal/script/script.go
@@ -0,0 +1,573 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package script implements a simple classification scripting language.
+// A script is a sequence of rules of the form “action <- pattern”,
+// meaning send results matching pattern to the named action.
+package script
+
+import (
+ "fmt"
+ "regexp"
+ "strconv"
+ "strings"
+ "unicode/utf8"
+)
+
+// A Script is a sequence of Action <- Pattern rules.
+type Script struct {
+ File string // file name the script was parsed from (for error reporting)
+ Rules []*Rule // rules, in the order they appear in the file
+}
+
+// A Rule is a single Action <- Pattern rule.
+type Rule struct {
+ Action string // "skip", "post", and so on
+ Pattern Expr // pattern expression
+}
+
+// Action returns the action specified by the script for the given record.
+// Rules are tried in order and the first matching pattern wins;
+// if no pattern matches, Action returns the empty string.
+func (s *Script) Action(record Record) string {
+ for _, r := range s.Rules {
+ if r.Pattern.Match(record) {
+ return r.Action
+ }
+ }
+ return ""
+}
+
+// A Record is a set of key:value pairs.
+// Looking up a key that is not present yields the empty string
+// (the map zero value).
+type Record map[string]string
+
+// An Expr is a pattern expression that can evaluate itself on a Record.
+// The underlying concrete type is *CmpExpr, *AndExpr, *OrExpr, *NotExpr, or *RegExpr.
+// Exprs are read-only after construction: Match never modifies
+// the expression or the record.
+type Expr interface {
+ // String returns the syntax for the pattern.
+ String() string
+
+ // Match reports whether the pattern matches the record.
+ Match(record Record) bool
+}
+
+// A CmpExpr is an Expr for a string comparison.
+// An empty Field means the comparison applies to record[""].
+type CmpExpr struct {
+ Field string // record field to compare
+ Op string // one of == != < <= > >=
+ Literal string // literal value to compare against
+}
+
+// Match reports whether the comparison holds for record.
+// The comparison uses Go's byte-wise string ordering;
+// a field missing from the record reads as the empty string.
+// An unrecognized Op matches nothing.
+func (x *CmpExpr) Match(record Record) bool {
+ f := record[x.Field]
+ l := x.Literal
+ switch x.Op {
+ case "==":
+ return f == l
+ case "!=":
+ return f != l
+ case "<":
+ return f < l
+ case "<=":
+ return f <= l
+ case ">":
+ return f > l
+ case ">=":
+ return f >= l
+ }
+ return false
+}
+
+// String returns the rule syntax for the comparison,
+// with the literal in Go quoted-string form.
+// When Field is empty, only the quoted literal is printed.
+func (x *CmpExpr) String() string {
+ s := strconv.Quote(x.Literal)
+ if x.Field == "" {
+ return s
+ }
+ return x.Field + " " + x.Op + " " + s
+}
+
+// cmp constructs a CmpExpr.
+func cmp(field, op, literal string) Expr { return &CmpExpr{field, op, literal} }
+
+// A RegExpr is an Expr for a regular expression test.
+// An empty Field means the regexp is matched against record[""].
+type RegExpr struct {
+ Field string // record field to match
+ Not bool // invert the match (the !~ form)
+ Regexp *regexp.Regexp // compiled pattern (see regcomp)
+}
+
+// Match reports whether the regexp matches (or, when Not is set,
+// does not match) the record field.
+func (x *RegExpr) Match(record Record) bool {
+ ok := x.Regexp.MatchString(record[x.Field])
+ if x.Not {
+ return !ok
+ }
+ return ok
+}
+
+// String returns the rule syntax for the test, with the pattern in
+// backquotes. Backquotes inside the pattern are rewritten to the
+// regexp escape \x60 so the printed form can be lexed again.
+func (x *RegExpr) String() string {
+ s := x.Regexp.String()
+ s = "`" + strings.ReplaceAll(s, "`", `\x60`) + "`"
+ if x.Field == "" {
+ return s
+ }
+ op := " ~ "
+ if x.Not {
+ op = " !~ "
+ }
+ return x.Field + op + s
+}
+
+// regx constructs a RegExpr.
+func regx(field string, not bool, re *regexp.Regexp) Expr { return &RegExpr{field, not, re} }
+
+// regcomp compiles the regexp s in multi-line mode ((?m)),
+// so ^ and $ match at every line boundary, not just the whole text.
+func regcomp(s string) (*regexp.Regexp, error) {
+ return regexp.Compile("(?m)" + s)
+}
+
+// A NotExpr represents the expression !X (the negation of X).
+type NotExpr struct {
+ X Expr
+}
+
+// Match reports whether X does not match the record.
+func (x *NotExpr) Match(record Record) bool {
+ return !x.X.Match(record)
+}
+
+// String always parenthesizes the operand,
+// so operand precedence never needs to be considered.
+func (x *NotExpr) String() string {
+ return "!(" + x.X.String() + ")"
+}
+
+// not constructs a NotExpr.
+func not(x Expr) Expr { return &NotExpr{x} }
+
+// An AndExpr represents the expression X && Y.
+type AndExpr struct {
+ X, Y Expr
+}
+
+// Match reports whether both X and Y match the record,
+// short-circuiting: Y is not evaluated if X fails.
+func (x *AndExpr) Match(record Record) bool {
+ return x.X.Match(record) && x.Y.Match(record)
+}
+
+func (x *AndExpr) String() string {
+ return andArg(x.X) + " && " + andArg(x.Y)
+}
+
+// andArg prints one operand of &&, parenthesizing || expressions,
+// which bind more loosely than && in the grammar.
+func andArg(x Expr) string {
+ s := x.String()
+ if _, ok := x.(*OrExpr); ok {
+ s = "(" + s + ")"
+ }
+ return s
+}
+
+// and constructs an AndExpr.
+func and(x, y Expr) Expr {
+ return &AndExpr{x, y}
+}
+
+// An OrExpr represents the expression X || Y.
+type OrExpr struct {
+ X, Y Expr
+}
+
+// Match reports whether X or Y matches the record,
+// short-circuiting: Y is not evaluated if X succeeds.
+func (x *OrExpr) Match(record Record) bool {
+ return x.X.Match(record) || x.Y.Match(record)
+}
+
+func (x *OrExpr) String() string {
+ return orArg(x.X) + " || " + orArg(x.Y)
+}
+
+// orArg prints one operand of ||, parenthesizing && expressions
+// to make the grouping explicit in the printed form.
+func orArg(x Expr) string {
+ s := x.String()
+ if _, ok := x.(*AndExpr); ok {
+ s = "(" + s + ")"
+ }
+ return s
+}
+
+// or constructs an OrExpr.
+func or(x, y Expr) Expr {
+ return &OrExpr{x, y}
+}
+
+// A SyntaxError reports a syntax error in a parsed match expression.
+type SyntaxError struct {
+ File string // input file
+ Line int // line number where error was detected (1-indexed)
+ Offset int // byte offset in line where error was detected (1-indexed); 0 if no column was recorded
+ Err string // description of error
+}
+
+// Error implements the error interface,
+// formatting the location as file:line or file:line.offset.
+func (e *SyntaxError) Error() string {
+ if e.Offset == 0 {
+ // No column information; report file:line only.
+ return fmt.Sprintf("%s:%d: %s", e.File, e.Line, e.Err)
+ }
+ return fmt.Sprintf("%s:%d.%d: %s", e.File, e.Line, e.Offset, e.Err)
+}
+
+// A parser holds state for parsing a build expression.
+type parser struct {
+ file string // input file, for errors
+ s string // input string
+ i int // next read location in s
+ fields map[string]bool // known input fields for comparisons
+
+ tok string // last token read; "`", "\"", "a" for backquoted regexp, literal string, identifier; operators and punctuation appear verbatim ("&&", "<-", ...)
+ lit string // text of backquoted regexp, literal string, or identifier (valid only when tok is "`", "\"", or "a")
+ pos int // position (start) of last token
+}
+
+// Parse parses text as a script,
+// returning the parsed form and any parse errors found.
+// (The parser attempts to recover after parse errors by starting over
+// at the next newline, so multiple parse errors are possible.)
+// The file argument is used for reporting the file name in errors
+// and in the Script's File field;
+// Parse does not read from the file itself.
+// The fields list names the record fields that comparisons may mention;
+// referring to any other field is a parse error.
+func Parse(file, text string, fields []string) (*Script, []*SyntaxError) {
+ p := &parser{
+ file: file,
+ s: text,
+ }
+ p.fields = make(map[string]bool)
+ for _, f := range fields {
+ p.fields[f] = true
+ }
+ var s Script
+ s.File = file
+ var errs []*SyntaxError
+ for {
+ r, err := p.parseRule()
+ if err != nil {
+ // parseRule guarantees the error is a *SyntaxError.
+ errs = append(errs, err.(*SyntaxError))
+ // Error recovery: skip to just past the next newline and retry.
+ i := strings.Index(p.s[p.i:], "\n")
+ if i < 0 {
+ break
+ }
+ p.i += i + 1
+ continue
+ }
+ if r == nil {
+ // End of script.
+ break
+ }
+ s.Rules = append(s.Rules, r)
+ }
+ return &s, errs
+}
+
+// parseRule parses a single rule from a script.
+// On entry, the next input token has not been lexed.
+// On exit, the next input token has been lexed and is in p.tok.
+// If there is an error, it is guaranteed to be a *SyntaxError.
+// parseRule returns nil, nil at end of file.
+func (p *parser) parseRule() (x *Rule, err error) {
+ // The lexer and parser report errors by panicking with a *SyntaxError;
+ // convert that panic back into an ordinary error return.
+ defer func() {
+ if e := recover(); e != nil {
+ if e, ok := e.(*SyntaxError); ok {
+ err = e
+ return
+ }
+ panic(e) // unreachable unless parser has a bug
+ }
+ }()
+
+ x = p.rule()
+ if p.tok != "" && p.tok != "\n" {
+ // Each rule must end at a line break (or at end of script).
+ p.unexpected()
+ }
+ return x, nil
+}
+
+// unexpected reports a parse error due to an unexpected token.
+// It panics via parseError and does not return.
+func (p *parser) unexpected() {
+ what := p.tok
+ switch what {
+ case "a":
+ what = "identifier " + p.lit
+ case "\"":
+ what = "quoted string " + p.lit
+ case "`":
+ what = "backquoted string " + p.lit
+ case "\n":
+ what = "end of line"
+ case "":
+ what = "end of script"
+ }
+ p.parseError("unexpected " + what)
+}
+
+// rule parses a single rule.
+// On entry, the next input token has not yet been lexed.
+// On exit, the next input token has been lexed and is in p.tok.
+// If there is no next rule (the script has been read in its entirety), rule returns nil.
+func (p *parser) rule() *Rule {
+ p.lex()
+ for p.tok == "\n" {
+ // Skip blank lines (and comment-only lines, which lex as "\n").
+ p.lex()
+ }
+ if p.tok == "" {
+ return nil
+ }
+ if p.tok != "a" {
+ p.unexpected()
+ }
+ action := p.lit
+ p.lex()
+ if p.tok != "<-" {
+ p.unexpected()
+ }
+ return &Rule{Action: action, Pattern: p.or()}
+}
+
+// or parses a sequence of || expressions.
+// On entry, the next input token has not yet been lexed.
+// On exit, the next input token has been lexed and is in p.tok.
+func (p *parser) or() Expr {
+ x := p.and()
+ for p.tok == "||" {
+ // Left-associative: a || b || c parses as (a || b) || c.
+ x = or(x, p.and())
+ }
+ return x
+}
+
+// and parses a sequence of && expressions.
+// Because and is the operand parser for or, && binds tighter than ||.
+// On entry, the next input token has not yet been lexed.
+// On exit, the next input token has been lexed and is in p.tok.
+func (p *parser) and() Expr {
+ x := p.cmp()
+ for p.tok == "&&" {
+ x = and(x, p.cmp())
+ }
+ return x
+}
+
+// cmp parses a comparison expression or atom.
+// On entry, the next input token has not been lexed.
+// On exit, the next input token has been lexed and is in p.tok.
+// Every case either returns or panics (via parseError/unexpected),
+// so the trailing panic is unreachable; it only satisfies the compiler.
+func (p *parser) cmp() Expr {
+ p.lex()
+ switch p.tok {
+ default:
+ p.unexpected()
+ case "!":
+ // Negation applies to the following atom (see atom),
+ // not to a field comparison.
+ p.lex()
+ return not(p.atom())
+ case "(", "\"", "`":
+ return p.atom()
+ case "a":
+ // comparison: field OP literal
+ field := p.lit
+ if !p.fields[field] {
+ p.parseError("unknown field " + field)
+ }
+ p.lex()
+ switch p.tok {
+ default:
+ p.unexpected()
+ case "==", "!=", "<", "<=", ">", ">=":
+ // String comparisons require a double-quoted literal.
+ op := p.tok
+ p.lex()
+ if p.tok != "\"" {
+ p.parseError(op + " requires quoted string")
+ }
+ s := p.lit
+ p.lex()
+ return cmp(field, op, s)
+ case "~", "!~":
+ // Regexp tests require a backquoted pattern.
+ op := p.tok
+ p.lex()
+ if p.tok != "`" {
+ p.parseError(op + " requires backquoted regexp")
+ }
+ re, err := regcomp(p.lit)
+ if err != nil {
+ p.parseError("invalid regexp: " + err.Error())
+ }
+ p.lex()
+ return regx(field, op == "!~", re)
+ }
+ }
+ panic("unreachable")
+}
+
+// atom parses a regexp or string comparison or a parenthesized expression.
+// On entry, the next input token HAS been lexed.
+// On exit, the next input token has been lexed and is in p.tok.
+func (p *parser) atom() Expr {
+ // first token already in p.tok
+ switch p.tok {
+ default:
+ p.unexpected()
+
+ case "(":
+ // Rewrite "unexpected end of expression" raised inside the
+ // parentheses as the more useful "missing close paren".
+ // NOTE(review): unexpected() currently reports "end of line" /
+ // "end of script", never "end of expression" — confirm this
+ // rewrite can actually fire, or adjust the message it checks.
+ defer func() {
+ if e := recover(); e != nil {
+ if e, ok := e.(*SyntaxError); ok && e.Err == "unexpected end of expression" {
+ e.Err = "missing close paren"
+ }
+ panic(e)
+ }
+ }()
+ x := p.or()
+ if p.tok != ")" {
+ p.parseError("missing close paren")
+ }
+ p.lex()
+ return x
+
+ case "`":
+ // Bare regexp: matched against the record's "" entry.
+ re, err := regcomp(p.lit)
+ if err != nil {
+ p.parseError("invalid regexp: " + err.Error())
+ }
+ p.lex()
+ return regx("", false, re)
+
+ case "\"":
+ // Bare quoted string: equality against the record's "" entry.
+ // cmp() routes quoted-string tokens here, and CmpExpr.String
+ // supports the empty-Field form; without this case every bare
+ // string pattern failed with "unexpected quoted string".
+ s := p.lit
+ p.lex()
+ return cmp("", "==", s)
+ }
+ panic("unreachable")
+}
+
+// lex finds and consumes the next token in the input stream.
+// On return, p.tok is set to the token text
+// and p.pos records the byte offset of the start of the token in the input stream.
+// For backquoted regexps, quoted strings, and identifiers,
+// p.lit holds the token's content (unquoted, for strings).
+// If lex reaches the end of the input, p.tok is set to the empty string.
+// For any other syntax error, lex panics with a SyntaxError.
+func (p *parser) lex() {
+Top:
+ for p.i < len(p.s) && (p.s[p.i] == ' ' || p.s[p.i] == '\t') {
+ p.i++
+ }
+ if p.i >= len(p.s) {
+ p.tok = ""
+ p.pos = p.i
+ return
+ }
+ switch p.s[p.i] {
+ case '#':
+ // line comment, discarded through end of line
+ for p.i < len(p.s) && p.s[p.i] != '\n' {
+ p.i++
+ }
+ goto Top
+ case '\n':
+ // like in Go, not a line ending if it follows a continuation token.
+ switch p.tok {
+ case "(", "&&", "||", "==", "!=", "~", "!~", "!", "<-":
+ p.i++
+ goto Top
+ }
+ p.pos = p.i
+ p.i++
+ p.tok = p.s[p.pos:p.i]
+ return
+ case '<': // < <- <=
+ p.pos = p.i
+ p.i++
+ if p.i < len(p.s) && (p.s[p.i] == '-' || p.s[p.i] == '=') {
+ p.i++
+ }
+ p.tok = p.s[p.pos:p.i]
+ return
+ case '!', '>': // ! != !~ > >=
+ p.pos = p.i
+ p.i++
+ // Consume '=' after either character, and '~' after '!' only,
+ // so that !~ lexes as a single token as cmp() expects.
+ // (Previously only '=' was consumed, so "field !~ `re`" could
+ // never parse: it lexed as "!" followed by "~".)
+ if p.i < len(p.s) && (p.s[p.i] == '=' || p.s[p.pos] == '!' && p.s[p.i] == '~') {
+ p.i++
+ }
+ p.tok = p.s[p.pos:p.i]
+ return
+ case '(', ')', '~': // ( ) ~
+ p.pos = p.i
+ p.i++
+ p.tok = p.s[p.pos:p.i]
+ return
+ case '&', '|', '=': // && || ==
+ // These characters are valid only when doubled.
+ if p.i+1 >= len(p.s) || p.s[p.i+1] != p.s[p.i] {
+ p.lexError("invalid syntax at " + string(rune(p.s[p.i])))
+ }
+ p.pos = p.i
+ p.i += 2
+ p.tok = p.s[p.pos:p.i]
+ return
+ case '`':
+ // backquoted regexp: no escape sequences, must fit on one line
+ j := p.i + 1
+ for j < len(p.s) && p.s[j] != '`' {
+ if p.s[j] == '\n' {
+ p.lexError("newline in backquoted regexp")
+ }
+ j++
+ }
+ if j >= len(p.s) {
+ p.lexError("unterminated backquoted regexp")
+ }
+ p.pos = p.i
+ p.i = j + 1
+ p.tok = "`"
+ p.lit = p.s[p.pos+1 : j]
+ return
+ case '"':
+ // quoted string with Go escapes, decoded by strconv.Unquote
+ j := p.i + 1
+ for j < len(p.s) && p.s[j] != '"' {
+ if p.s[j] == '\n' {
+ p.lexError("newline in quoted string")
+ }
+ if p.s[j] == '\\' {
+ j++ // skip the escaped character (notably \")
+ }
+ j++
+ }
+ if j >= len(p.s) {
+ p.lexError("unterminated quoted string")
+ }
+ s, err := strconv.Unquote(p.s[p.i : j+1])
+ if err != nil {
+ p.lexError("invalid quoted string: " + err.Error())
+ }
+ p.pos = p.i
+ p.i = j + 1
+ p.tok = "\""
+ p.lit = s
+ return
+ case '\'':
+ p.lexError("single-quoted strings not allowed")
+ }
+
+ // ascii name
+ if isalpha(p.s[p.i]) {
+ j := p.i
+ for j < len(p.s) && isalnum(p.s[j]) {
+ j++
+ }
+ p.pos = p.i
+ p.i = j
+ p.tok = "a"
+ p.lit = p.s[p.pos:p.i]
+ return
+ }
+
+ c, _ := utf8.DecodeRuneInString(p.s[p.i:])
+ p.lexError(fmt.Sprintf("invalid syntax at %q (U+%04x)", c, c))
+}
+
+// lexError reports a lex error with the given error text.
+// It panics (via errorAt) and does not return.
+func (p *parser) lexError(err string) {
+ p.errorAt(p.i, err)
+}
+
+// parseError reports a parse error with the given error text.
+// It panics (via errorAt) and does not return.
+// (A parse error differs from a lex error in which parser position
+// the error is attributed to: parse errors point at the start of the
+// last token, lex errors at the current read position.)
+func (p *parser) parseError(err string) {
+ p.errorAt(p.pos, err)
+}
+
+// errorAt reports a syntax error at the given byte position
+// by panicking with a *SyntaxError (recovered in parseRule).
+func (p *parser) errorAt(pos int, err string) {
+ line := 1 + strings.Count(p.s[:pos], "\n")
+ // LastIndex returns -1 when no newline precedes pos,
+ // so i is the 1-indexed column on the current line either way.
+ i := pos - strings.LastIndex(p.s[:pos], "\n")
+ panic(&SyntaxError{File: p.file, Line: line, Offset: i, Err: err})
+}
+
+// isalpha reports whether c is an ASCII alphabetic or _.
+// (Byte-oriented: bytes of multi-byte UTF-8 runes never match.)
+func isalpha(c byte) bool {
+ return 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z' || c == '_'
+}
+
+// isalnum reports whether c is an ASCII alphanumeric or _.
+func isalnum(c byte) bool {
+ return 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z' || '0' <= c && c <= '9' || c == '_'
+}