cmd/watchflakes/internal/script: add script parser

Needs tests.

Change-Id: Ia970c247ab36239e6e164c708c55ba3119054d9a
Reviewed-on: https://go-review.googlesource.com/c/build/+/432402
Reviewed-by: Cherry Mui <cherryyz@google.com>
Auto-Submit: Russ Cox <rsc@golang.org>
TryBot-Result: Gopher Robot <gobot@golang.org>
Run-TryBot: Russ Cox <rsc@golang.org>
diff --git a/cmd/watchflakes/internal/script/script.go b/cmd/watchflakes/internal/script/script.go
new file mode 100644
index 0000000..499f2e7
--- /dev/null
+++ b/cmd/watchflakes/internal/script/script.go
@@ -0,0 +1,573 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package script implements a simple classification scripting language.
+// A script is a sequence of rules of the form “action <- pattern”,
+// meaning send results matching pattern to the named action.
+package script
+
+import (
+	"fmt"
+	"regexp"
+	"strconv"
+	"strings"
+	"unicode/utf8"
+)
+
// A Script is a sequence of Action <- Pattern rules.
// Rules are applied in order; the first rule whose pattern
// matches a record decides that record's action (see Action).
type Script struct {
	File  string // name of the file the script was parsed from, for error messages
	Rules []*Rule
}
+
// A Rule is a single Action <- Pattern rule:
// records matching Pattern are sent to the named Action.
type Rule struct {
	Action  string // action name, such as "skip" or "post"
	Pattern Expr   // pattern expression a record must match for the rule to apply
}
+
+// Action returns the action specified by the script for the given record.
+func (s *Script) Action(record Record) string {
+	for _, r := range s.Rules {
+		if r.Pattern.Match(record) {
+			return r.Action
+		}
+	}
+	return ""
+}
+
// A Record is a set of key:value pairs.
// Pattern expressions look fields up by key;
// a missing key reads as the empty string (ordinary map semantics).
type Record map[string]string
+
// An Expr is a pattern expression that can evaluate itself on a Record.
// The underlying concrete type is *CmpExpr, *AndExpr, *OrExpr, *NotExpr, or *RegExpr.
type Expr interface {
	// String returns the script syntax for the pattern.
	String() string

	// Match reports whether the pattern matches the record.
	Match(record Record) bool
}
+
+// A CmpExpr is an Expr for a string comparison.
+type CmpExpr struct {
+	Field   string
+	Op      string
+	Literal string
+}
+
+func (x *CmpExpr) Match(record Record) bool {
+	f := record[x.Field]
+	l := x.Literal
+	switch x.Op {
+	case "==":
+		return f == l
+	case "!=":
+		return f != l
+	case "<":
+		return f < l
+	case "<=":
+		return f <= l
+	case ">":
+		return f > l
+	case ">=":
+		return f >= l
+	}
+	return false
+}
+
+func (x *CmpExpr) String() string {
+	s := strconv.Quote(x.Literal)
+	if x.Field == "" {
+		return s
+	}
+	return x.Field + " " + x.Op + " " + s
+}
+
+func cmp(field, op, literal string) Expr { return &CmpExpr{field, op, literal} }
+
+// A RegExpr is an Expr for a regular expression test.
+type RegExpr struct {
+	Field  string
+	Not    bool
+	Regexp *regexp.Regexp
+}
+
+func (x *RegExpr) Match(record Record) bool {
+	ok := x.Regexp.MatchString(record[x.Field])
+	if x.Not {
+		return !ok
+	}
+	return ok
+}
+
+func (x *RegExpr) String() string {
+	s := x.Regexp.String()
+	s = "`" + strings.ReplaceAll(s, "`", `\x60`) + "`"
+	if x.Field == "" {
+		return s
+	}
+	op := " ~ "
+	if x.Not {
+		op = " !~ "
+	}
+	return x.Field + op + s
+}
+
+func regx(field string, not bool, re *regexp.Regexp) Expr { return &RegExpr{field, not, re} }
// regcomp compiles s as a regular expression in multi-line mode,
// so ^ and $ match at line boundaries within multi-line input
// (such as a build log) rather than only at the ends of the text.
func regcomp(s string) (*regexp.Regexp, error) {
	re, err := regexp.Compile("(?m)" + s)
	if err != nil {
		return nil, err
	}
	return re, nil
}
+
+// A NotExpr represents the expression !X (the negation of X).
+type NotExpr struct {
+	X Expr
+}
+
+func (x *NotExpr) Match(record Record) bool {
+	return !x.X.Match(record)
+}
+
+func (x *NotExpr) String() string {
+	return "!(" + x.X.String() + ")"
+}
+
+func not(x Expr) Expr { return &NotExpr{x} }
+
+// An AndExpr represents the expression X && Y.
+type AndExpr struct {
+	X, Y Expr
+}
+
+func (x *AndExpr) Match(record Record) bool {
+	return x.X.Match(record) && x.Y.Match(record)
+}
+
+func (x *AndExpr) String() string {
+	return andArg(x.X) + " && " + andArg(x.Y)
+}
+
+func andArg(x Expr) string {
+	s := x.String()
+	if _, ok := x.(*OrExpr); ok {
+		s = "(" + s + ")"
+	}
+	return s
+}
+
+func and(x, y Expr) Expr {
+	return &AndExpr{x, y}
+}
+
+// An OrExpr represents the expression X || Y.
+type OrExpr struct {
+	X, Y Expr
+}
+
+func (x *OrExpr) Match(record Record) bool {
+	return x.X.Match(record) || x.Y.Match(record)
+}
+
+func (x *OrExpr) String() string {
+	return orArg(x.X) + " || " + orArg(x.Y)
+}
+
+func orArg(x Expr) string {
+	s := x.String()
+	if _, ok := x.(*AndExpr); ok {
+		s = "(" + s + ")"
+	}
+	return s
+}
+
+func or(x, y Expr) Expr {
+	return &OrExpr{x, y}
+}
+
// A SyntaxError reports a syntax error in a parsed match expression.
type SyntaxError struct {
	File   string // input file
	Line   int    // line number where error was detected (1-indexed)
	Offset int    // byte offset in line where error was detected (1-indexed); 0 omits the offset from the message
	Err    string // description of error
}

// Error formats the error as "file:line: msg", or "file:line.offset: msg"
// when a nonzero column offset is recorded.
func (e *SyntaxError) Error() string {
	if e.Offset != 0 {
		return fmt.Sprintf("%s:%d.%d: %s", e.File, e.Line, e.Offset, e.Err)
	}
	return fmt.Sprintf("%s:%d: %s", e.File, e.Line, e.Err)
}
+
// A parser holds state for parsing a script.
// (Lexer and parser errors are reported by panicking with a *SyntaxError;
// parseRule converts the panic back into an error return.)
type parser struct {
	file   string          // input file, for errors
	s      string          // input string
	i      int             // next read location in s
	fields map[string]bool // known input fields for comparisons

	tok string // last token read; "`", "\"", "a" for backquoted regexp, literal string, identifier
	lit string // text of backquoted regexp, literal string, or identifier
	pos int    // position (start) of last token, as a byte offset in s
}
+
// Parse parses text as a script,
// returning the parsed form and any parse errors found.
// (The parser attempts to recover after parse errors by starting over
// at the next newline, so multiple parse errors are possible.)
// The file argument is used for reporting the file name in errors
// and in the Script's File field;
// Parse does not read from the file itself.
// The fields argument lists the record field names that may appear
// on the left side of comparisons; any other field name is a parse error.
func Parse(file, text string, fields []string) (*Script, []*SyntaxError) {
	p := &parser{
		file: file,
		s:    text,
	}
	p.fields = make(map[string]bool)
	for _, f := range fields {
		p.fields[f] = true
	}
	var s Script
	s.File = file
	var errs []*SyntaxError
	for {
		r, err := p.parseRule()
		if err != nil {
			// parseRule guarantees any non-nil error is a *SyntaxError,
			// so this type assertion cannot panic.
			errs = append(errs, err.(*SyntaxError))
			// Error recovery: resynchronize at the start of the next
			// line and try to parse another rule from there.
			i := strings.Index(p.s[p.i:], "\n")
			if i < 0 {
				break
			}
			p.i += i + 1
			continue
		}
		if r == nil {
			break // end of script
		}
		s.Rules = append(s.Rules, r)
	}
	return &s, errs
}
+
// parseRule parses a single rule from a script.
// On entry, the next input token has not been lexed.
// On exit, the next input token has been lexed and is in p.tok.
// If there is an error, it is guaranteed to be a *SyntaxError.
// parseRule returns nil, nil at end of file.
func (p *parser) parseRule() (x *Rule, err error) {
	// The lexer and parser report errors by panicking with a *SyntaxError;
	// convert that panic back into an ordinary error return here.
	defer func() {
		if e := recover(); e != nil {
			if e, ok := e.(*SyntaxError); ok {
				err = e
				return
			}
			panic(e) // unreachable unless parser has a bug
		}
	}()

	x = p.rule()
	// A rule must be terminated by a newline or the end of the script.
	if p.tok != "" && p.tok != "\n" {
		p.unexpected()
	}
	return x, nil
}
+
+// unexpected reports a parse error due to an unexpected token
+func (p *parser) unexpected() {
+	what := p.tok
+	switch what {
+	case "a":
+		what = "identifier " + p.lit
+	case "\"":
+		what = "quoted string " + p.lit
+	case "`":
+		what = "backquoted string " + p.lit
+	case "\n":
+		what = "end of line"
+	case "":
+		what = "end of script"
+	}
+	p.parseError("unexpected " + what)
+}
+
+// rule parses a single rule.
+// On entry, the next input token has not yet been lexed.
+// On exit, the next input token has been lexed and is in p.tok.
+// If there is no next rule (the script has been read in its entirety), rule returns nil.
+func (p *parser) rule() *Rule {
+	p.lex()
+	for p.tok == "\n" {
+		p.lex()
+	}
+	if p.tok == "" {
+		return nil
+	}
+	if p.tok != "a" {
+		p.unexpected()
+	}
+	action := p.lit
+	p.lex()
+	if p.tok != "<-" {
+		p.unexpected()
+	}
+	return &Rule{Action: action, Pattern: p.or()}
+}
+
+// or parses a sequence of || expressions.
+// On entry, the next input token has not yet been lexed.
+// On exit, the next input token has been lexed and is in p.tok.
+func (p *parser) or() Expr {
+	x := p.and()
+	for p.tok == "||" {
+		x = or(x, p.and())
+	}
+	return x
+}
+
+// and parses a sequence of && expressions.
+// On entry, the next input token has not yet been lexed.
+// On exit, the next input token has been lexed and is in p.tok.
+func (p *parser) and() Expr {
+	x := p.cmp()
+	for p.tok == "&&" {
+		x = and(x, p.cmp())
+	}
+	return x
+}
+
// cmp parses a comparison expression or atom.
// On entry, the next input token has not been lexed.
// On exit, the next input token has been lexed and is in p.tok.
func (p *parser) cmp() Expr {
	p.lex()
	switch p.tok {
	default:
		p.unexpected()
	case "!":
		p.lex()
		return not(p.atom())
	case "(", "\"", "`":
		// NOTE(review): atom only accepts "(" and "`"; a bare quoted
		// string ("\"") reaches atom's default case and is reported as
		// unexpected. Confirm whether bare quoted strings were meant
		// to be accepted here.
		return p.atom()
	case "a":
		// comparison: field op literal
		field := p.lit
		if !p.fields[field] {
			p.parseError("unknown field " + field)
		}
		p.lex()
		switch p.tok {
		default:
			p.unexpected()
		case "==", "!=", "<", "<=", ">", ">=":
			// String comparison: right-hand side must be a quoted string.
			op := p.tok
			p.lex()
			if p.tok != "\"" {
				p.parseError(op + " requires quoted string")
			}
			s := p.lit
			p.lex()
			return cmp(field, op, s)
		case "~", "!~":
			// Regexp test: right-hand side must be a backquoted regexp.
			op := p.tok
			p.lex()
			if p.tok != "`" {
				p.parseError(op + " requires backquoted regexp")
			}
			re, err := regcomp(p.lit)
			if err != nil {
				p.parseError("invalid regexp: " + err.Error())
			}
			p.lex()
			return regx(field, op == "!~", re)
		}
	}
	panic("unreachable") // every case above returns or panics
}
+
+// atom parses a regexp or string comparison or a parenthesized expression.
+// On entry, the next input token HAS been lexed.
+// On exit, the next input token has been lexed and is in p.tok.
+func (p *parser) atom() Expr {
+	// first token already in p.tok
+	switch p.tok {
+	default:
+		p.unexpected()
+
+	case "(":
+		defer func() {
+			if e := recover(); e != nil {
+				if e, ok := e.(*SyntaxError); ok && e.Err == "unexpected end of expression" {
+					e.Err = "missing close paren"
+				}
+				panic(e)
+			}
+		}()
+		x := p.or()
+		if p.tok != ")" {
+			p.parseError("missing close paren")
+		}
+		p.lex()
+		return x
+
+	case "`":
+		re, err := regcomp(p.lit)
+		if err != nil {
+			p.parseError("invalid regexp: " + err.Error())
+		}
+		p.lex()
+		return regx("", false, re)
+	}
+	panic("unreachable")
+}
+
// lex finds and consumes the next token in the input stream.
// On return, p.tok is set to the token text
// and p.pos records the byte offset of the start of the token in the input stream.
// If lex reaches the end of the input, p.tok is set to the empty string.
// For any other syntax error, lex panics with a SyntaxError.
func (p *parser) lex() {
Top:
	// Skip horizontal whitespace. Newlines are not skipped here:
	// they are significant and lexed as their own token below.
	for p.i < len(p.s) && (p.s[p.i] == ' ' || p.s[p.i] == '\t') {
		p.i++
	}
	if p.i >= len(p.s) {
		p.tok = ""
		p.pos = p.i
		return
	}
	switch p.s[p.i] {
	case '#':
		// line comment: skip to (but not past) the newline,
		// which is then lexed as a token by the '\n' case.
		for p.i < len(p.s) && p.s[p.i] != '\n' {
			p.i++
		}
		goto Top
	case '\n':
		// like in Go, not a line ending if it follows a continuation token.
		switch p.tok {
		case "(", "&&", "||", "==", "!=", "~", "!~", "!", "<-":
			p.i++
			goto Top
		}
		p.pos = p.i
		p.i++
		p.tok = p.s[p.pos:p.i]
		return
	case '<': // <-, <=, or bare <
		p.pos = p.i
		p.i++
		if p.i < len(p.s) && (p.s[p.i] == '-' || p.s[p.i] == '=') {
			p.i++
		}
		p.tok = p.s[p.pos:p.i]
		return
	case '!', '>': // ! != > >=
		p.pos = p.i
		p.i++
		if p.i < len(p.s) && p.s[p.i] == '=' {
			p.i++
		}
		p.tok = p.s[p.pos:p.i]
		return
	case '(', ')', '~': // single-character tokens ( ) ~
		p.pos = p.i
		p.i++
		p.tok = p.s[p.pos:p.i]
		return
	case '&', '|', '=': // must be doubled: && || ==
		if p.i+1 >= len(p.s) || p.s[p.i+1] != p.s[p.i] {
			p.lexError("invalid syntax at " + string(rune(p.s[p.i])))
		}
		p.pos = p.i
		p.i += 2
		p.tok = p.s[p.pos:p.i]
		return
	case '`':
		// backquoted regexp: taken verbatim, no escape processing,
		// and may not span lines.
		j := p.i + 1
		for j < len(p.s) && p.s[j] != '`' {
			if p.s[j] == '\n' {
				p.lexError("newline in backquoted regexp")
			}
			j++
		}
		if j >= len(p.s) {
			p.lexError("unterminated backquoted regexp")
		}
		p.pos = p.i
		p.i = j + 1
		p.tok = "`"
		p.lit = p.s[p.pos+1 : j] // contents without the backquotes
		return
	case '"':
		// quoted string with Go escape sequences; may not span lines.
		j := p.i + 1
		for j < len(p.s) && p.s[j] != '"' {
			if p.s[j] == '\n' {
				p.lexError("newline in quoted string")
			}
			if p.s[j] == '\\' {
				// Skip the escaped character so an escaped quote (\")
				// does not terminate the string.
				j++
			}
			j++
		}
		if j >= len(p.s) {
			p.lexError("unterminated quoted string")
		}
		// Let strconv.Unquote interpret the escape sequences.
		s, err := strconv.Unquote(p.s[p.i : j+1])
		if err != nil {
			p.lexError("invalid quoted string: " + err.Error())
		}
		p.pos = p.i
		p.i = j + 1
		p.tok = "\""
		p.lit = s // unquoted value
		return
	case '\'':
		p.lexError("single-quoted strings not allowed")
	}

	// ascii name (identifier): letters, digits, underscore
	if isalpha(p.s[p.i]) {
		j := p.i
		for j < len(p.s) && isalnum(p.s[j]) {
			j++
		}
		p.pos = p.i
		p.i = j
		p.tok = "a"
		p.lit = p.s[p.pos:p.i]
		return
	}

	// Anything else is a syntax error; decode the rune for the message.
	c, _ := utf8.DecodeRuneInString(p.s[p.i:])
	p.lexError(fmt.Sprintf("invalid syntax at %q (U+%04x)", c, c))
}
+
// lexError reports a lex error with the given error text,
// attributed to the current read position p.i.
func (p *parser) lexError(err string) {
	p.errorAt(p.i, err)
}

// parseError reports a parse error with the given error text.
// (A parse error differs from a lex error in which parser position
// the error is attributed to: a parse error points at the start of
// the most recently lexed token, p.pos, rather than at p.i.)
func (p *parser) parseError(err string) {
	p.errorAt(p.pos, err)
}
+
+// errorAt reports a syntax error at the given position.
+func (p *parser) errorAt(pos int, err string) {
+	line := 1 + strings.Count(p.s[:pos], "\n")
+	i := pos - strings.LastIndex(p.s[:pos], "\n")
+	panic(&SyntaxError{File: p.file, Line: line, Offset: i, Err: err})
+}
+
// isalpha reports whether c is an ASCII alphabetic or _.
func isalpha(c byte) bool {
	return c == '_' || ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z')
}

// isalnum reports whether c is an ASCII alphanumeric or _.
func isalnum(c byte) bool {
	return isalpha(c) || ('0' <= c && c <= '9')
}