cmd/watchflakes/main.go - build - Git at Google

 // Copyright 2024 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

 // Watchflakes is a program that triages apparent test flakes
 // on the build.golang.org dashboards. See https://go.dev/wiki/Watchflakes.
 package main

 import (
 	"context"
 	"flag"
 	"fmt"
 	"log"
 	"os"
 	"runtime"
 	"strconv"
 	"strings"
 	"time"

 	bbpb "go.chromium.org/luci/buildbucket/proto"
 	rdbpb "go.chromium.org/luci/resultdb/proto/v1"
 	"golang.org/x/build/buildenv"
 	"golang.org/x/build/cmd/watchflakes/internal/script"
 	"golang.org/x/build/internal/secret"
 	"rsc.io/github"
 )

 // TODO:
 // - subrepos by go commit
 // - handle INFRA_FAILURE and CANCELED

 var _ = fmt.Print

 // Query failures within most recent timeLimit.
 const timeLimit = 45 * 24 * time.Hour

 const maxFailPerBuild = 3

 var (
 	build   = flag.String("build", "", "a particular build ID or URL to analyze (mainly for debugging)")
 	md      = flag.Bool("md", false, "print Markdown output suitable for GitHub issues")
 	post    = flag.Bool("post", false, "post updates to GitHub issues")
 	repeat  = flag.Duration("repeat", 0, "keep running with specified `period`; zero means to run once and exit")
 	verbose = flag.Bool("v", false, "print verbose posting decisions")

 	useSecretManager = flag.Bool("use-secret-manager", false, "fetch GitHub token from Secret Manager instead of $HOME/.netrc")
 )

 func usage() {
 	fmt.Fprintf(os.Stderr, "Usage: watchflakes [options] [script]\n")
 	flag.PrintDefaults()
 	os.Exit(2)
 }

 func main() {
 	log.SetPrefix("watchflakes: ")
 	flag.Usage = usage
 	buildenv.RegisterStagingFlag()
 	flag.Parse()
 	if flag.NArg() > 1 {
 		usage()
 	}

 	var query *Issue
 	if flag.NArg() == 1 {
 		s, err := script.Parse("script", flag.Arg(0), fields)
 		if err != nil {
 			log.Fatalf("parsing query:\n%s", err)
 		}
 		query = &Issue{Issue: new(github.Issue), Script: s, ScriptText: flag.Arg(0)}
 	}

 	// Create an authenticated GitHub client.
 	if *useSecretManager {
 		// Fetch credentials from Secret Manager.
 		secretCl, err := secret.NewClientInProject(buildenv.FromFlags().ProjectName)
 		if err != nil {
 			log.Fatalln("failed to create a Secret Manager client:", err)
 		}
 		ghToken, err := secretCl.Retrieve(context.Background(), secret.NameWatchflakesGitHubToken)
 		if err != nil {
 			log.Fatalln("failed to retrieve GitHub token from Secret Manager:", err)
 		}
 		gh = github.NewClient(ghToken)
 	} else {
 		// Use credentials in $HOME/.netrc.
 		var err error
 		gh, err = github.Dial("")
 		if err != nil {
 			log.Fatalln("github.Dial:", err)
 		}
 	}

 	// Load LUCI dashboards
 	c := NewLUCIClient(runtime.GOMAXPROCS(0) * 4)
 	c.TraceSteps = true

 	var ticker *time.Ticker
 	timeout := 30 * time.Minute // default timeout for one-off run
 	if *repeat != 0 {
 		ticker = time.NewTicker(*repeat)
 		timeout = *repeat * 2 // expected to finish in one repeat cycle, give some extra room
 	}
 Repeat:
 	startTime := time.Now()
 	ctx, cancel := context.WithTimeout(context.Background(), timeout)
 	var boards []*Dashboard
 	if *build == "" {
 		// fetch the dashboard
 		var err error
 		boards, err = c.ListBoards(ctx)
 		if err != nil {
 			log.Fatalln("ListBoards:", err)
 		}
 		err = c.ReadBoards(ctx, boards, startTime.Add(-timeLimit))
 		if err != nil {
 			log.Fatalln("ReadBoards:", err)
 		}
 		skipBrokenCommits(boards)
 		skipBrokenBuilders(boards)
 	} else {
 		id, err := strconv.ParseInt(strings.TrimPrefix(*build, "https://ci.chromium.org/b/"), 10, 64)
 		if err != nil {
 			log.Fatalf("invalid build ID for -build flag: %q\n\texpect a build ID or https://ci.chromium.org/b/ URL", *build)
 		}
 		r, err := c.GetBuild(ctx, id)
 		if err != nil {
 			log.Fatalf("GetBuild %d: %v", id, err)
 		}
 		// make a one-entry board
 		board := &Dashboard{
 			Builders: []Builder{{r.Builder, r.BuilderConfigProperties}},
 			Commits:  []Commit{{Hash: r.Commit}},
 			Results:  [][]*BuildResult{{r}},
 		}
 		boards = []*Dashboard{board}
 	}

 	failRes := c.FindFailures(ctx, boards)
 	c.FetchLogs(failRes)

 	if *verbose {
 		for _, r := range failRes {
 			fmt.Printf("failure %s %s %s\n", r.Builder, shortHash(r.Commit), buildURL(r.ID))
 		}
 	}

 	// Load GitHub issues
 	var issues []*Issue
 	issues, err := readIssues(issues)
 	if err != nil {
 		log.Fatal(err)
 	}
 	findScripts(issues)
 	if query == nil {
 		postIssueErrors(issues)
 	}
 	if query != nil {
 		issues = []*Issue{query}
 	}

 	for _, r := range failRes {
 		newIssue := 0
 		fs := r.Failures
 		fs = coalesceFailures(fs)
 		if len(fs) == 0 {
 			// No test failure, Probably a build failure.
 			// E.g. https://ci.chromium.org/ui/b/8759448820419452721
 			// Make a dummy failure.
 			f := &Failure{
 				Status:  rdbpb.TestStatus_FAIL,
 				LogText: r.StepLogText,
 			}
 			fs = []*Failure{f}
 		}
 		for _, f := range fs {
 			fp := NewFailurePost(r, f)
 			record := fp.Record()
 			action, targets := run(issues, record)
 			if *verbose {
 				printRecord(record, false)
 				fmt.Printf("\t%s %v\n", action, targets)
 			}
 			switch action {
 			case "skip":
 				// do nothing
 				if *verbose {
 					fmt.Printf("%s: skipped by #%d\n", fp.URL, targets[0].Number)
 				}

 			case "":
 				if newIssue > 0 {
 					// If we already opened a new issue for a build, don't open another one.
 					// It could be that the build is just broken.
 					// One can look at the issue and split if necessary.
 					break
 				}

 				// create a new issue
 				if query == nil {
 					if *verbose {
 						fmt.Printf("%s: new issue\n", fp.URL)
 					}
 					issue, err := reportNew(fp)
 					if err != nil {
 						log.Fatal(err)
 					}
 					issues = append(issues, issue)
 					newIssue++
 				}

 			case "default", "post", "take":
 				for _, issue := range targets {
 					if !issue.Mentions[fp.URL] && issue.Stale {
 						readComments(issue)
 					}
 					if *verbose {
 						mentioned := "un"
 						if issue.Mentions[fp.URL] {
 							mentioned = "already "
 						}
 						fmt.Printf("%s: %s #%d, %smentioned\n", fp.URL, action, issue.Number, mentioned)
 					}
 					if !issue.Mentions[fp.URL] {
 						issue.Post = append(issue.Post, fp)
 					}
 				}
 			}
 		}
 	}

 	if query != nil {
 		format := (*FailurePost).Text
 		if *md {
 			format = (*FailurePost).Markdown
 		}
 		for i, fp := range query.Post {
 			if i > 0 {
 				fmt.Printf("\n")
 			}
 			os.Stdout.WriteString(format(fp))
 		}
 		if *md {
 			os.Stdout.WriteString(signature)
 		}
 		return
 	}

 	posts := 0
 	for _, issue := range issues {
 		if len(issue.Post) > 0 {
 			fmt.Printf(" - new for #%d %s\n", issue.Number, issue.Title)
 			for _, fp := range issue.Post {
 				fmt.Printf("    - %s\n      %s\n", fp, fp.URL)
 			}
 			msg := updateText(issue)
 			if *verbose {
 				fmt.Printf("\n%s\n", indent(spaces[:3], msg))
 			}
 			if *post {
 				if err := postComment(issue, msg); err != nil {
 					log.Print(err)
 					continue
 				}
 				if issue.Mentions == nil {
 					issue.Mentions = make(map[string]bool)
 				}
 				for _, fp := range issue.Post {
 					issue.Mentions[fp.URL] = true
 				}
 			}
 			posts++
 		}
 	}

 	cancel()

 	log.Printf("Done. %d boards, %d failures, %d issues, %d posts, in %v\n", len(boards), len(failRes), len(issues), posts, time.Since(startTime))

 	if *repeat != 0 {
 		<-ticker.C
 		goto Repeat
 	}
 }

 const SKIP = bbpb.Status_STATUS_UNSPECIFIED // for smashing the status to skip a non-flake failure

 // skipBrokenCommits identifies broken commits,
 // which are the ones that failed on at least 1/4 of builders,
 // and then changes all results for those commits to SKIP.
 func skipBrokenCommits(boards []*Dashboard) {
 	for _, dash := range boards {
 		builderThreshold := len(dash.Builders) / 4
 		for i := 0; i < len(dash.Commits); i++ {
 			bad := 0
 			good := 0
 			for _, rs := range dash.Results {
 				if rs[i] == nil {
 					continue
 				}
 				switch rs[i].Status {
 				case bbpb.Status_SUCCESS:
 					good++
 				case bbpb.Status_FAILURE:
 					bad++
 					// ignore other status
 				}
 			}
 			if bad > builderThreshold || good < builderThreshold {
 				fmt.Printf("skip: commit %s (%s %s) is broken (good=%d bad=%d)\n", shortHash(dash.Commits[i].Hash), dash.Repo, dash.GoBranch, good, bad)
 				for _, rs := range dash.Results {
 					if rs[i] != nil {
 						rs[i].Status = SKIP
 					}
 				}
 			}
 		}
 	}
 }

 // skipBrokenBuilders identifies builders that were consistently broken
 // (at least tooManyToBeFlakes failures in a row) and then turned ok.
 // It changes those consistent failures to SKIP.
 func skipBrokenBuilders(boards []*Dashboard) {
 	const tooManyToBeFlakes = 4

 	for _, dash := range boards {
 		for _, rs := range dash.Results {
 			bad := 100 // squash failures at the top of the dashboard, which may turn out to be consistent
 			badStart := 0
 			skip := func(i int) { // skip the i-th result
 				if rs[i] != nil {
 					fmt.Printf("skip: builder %s was broken at %s (%s %s)\n", rs[i].Builder, shortHash(rs[i].Commit), dash.Repo, dash.GoBranch)
 					rs[i].Status = SKIP
 				}
 			}
 			for i, r := range rs {
 				if rs[i] == nil {
 					continue
 				}
 				switch r.Status {
 				case bbpb.Status_SUCCESS:
 					bad = 0
 					continue
 				case bbpb.Status_FAILURE:
 					bad++
 				default: // ignore other status
 					continue
 				}
 				switch bad {
 				case 1:
 					badStart = i
 				case tooManyToBeFlakes:
 					// Skip the run so far.
 					for j := badStart; j < i; j++ {
 						skip(j)
 					}
 				}
 				if bad >= tooManyToBeFlakes {
 					skip(i)
 				}
 			}

 			// Bad entries ending just before the cutoff are not flakes
 			// even if there are just a few of them. Otherwise we get
 			// spurious flakes when there's one bad entry before the
 			// cutoff and lots after the cutoff.
 			if bad > 0 && badStart > 0 {
 				for j := badStart; j < len(rs); j++ {
 					skip(j)
 				}
 			}
 		}
 	}
 }

 // run runs the scripts in issues on record.
 // It returns the desired action (skip, post, default)
 // as well as the list of target issues (for post or default).
 func run(issues []*Issue, record script.Record) (action string, targets []*Issue) {
 	var def, post []*Issue

 	for _, issue := range issues {
 		if issue.Script != nil {
 			switch issue.Script.Action(record) {
 			case "skip":
 				return "skip", []*Issue{issue}
 			case "take":
 				println("TAKE", issue.Number)
 			case "default":
 				def = append(def, issue)
 			case "post":
 				post = append(post, issue)
 			}
 		}
 	}

 	if len(post) > 0 {
 		return "post", post
 	}
 	if len(def) > 0 {
 		return "default", def[:1]
 	}
 	return "", nil
 }

 // FailurePost is a failure to be posted on an issue.
 type FailurePost struct {
 	*BuildResult
 	*Failure
 	URL     string // LUCI build page
 	Pkg     string
 	Test    string
 	Snippet string
 }

 func NewFailurePost(r *BuildResult, f *Failure) *FailurePost {
 	pkg, test := splitTestID(f.TestID)
 	snip := snippet(f.LogText)
 	if snip == "" {
 		snip = snippet(r.LogText)
 	}
 	fp := &FailurePost{
 		BuildResult: r,
 		Failure:     f,
 		URL:         buildURL(r.ID),
 		Pkg:         pkg,
 		Test:        test,
 		Snippet:     snip,
 	}
 	return fp
 }

 // fields is the list of known fields for use by script patterns.
 // It must be in sync with the Record method below.
 var fields = []string{
 	"",
 	"section", // not used, keep for compatibility with old watchflakes
 	"pkg",
 	"test",
 	"mode",
 	"output",
 	"snippet",
 	"date",
 	"builder",
 	"repo",
 	"goos",
 	"goarch",
 	"log",
 	"status",
 }

 func (fp *FailurePost) Record() script.Record {
 	// Note: update fields above if any new fields are added to this record.
 	m := script.Record{
 		"pkg":     fp.Pkg,
 		"test":    fp.Test,
 		"output":  fp.Failure.LogText,
 		"snippet": fp.Snippet,
 		"date":    fp.Time.Format(time.RFC3339),
 		"builder": fp.Builder,
 		"repo":    fp.Repo,
 		"goos":    fp.Target.GOOS,
 		"goarch":  fp.Target.GOARCH,
 		"log":     fp.BuildResult.LogText,
 		"status":  fp.Failure.Status.String(),
 	}
 	m[""] = m["output"] // default field for `regexp` search (as opposed to field ~ `regexp`)
 	if fp.IsBuildFailure() {
 		m["mode"] = "build"
 	}
 	return m
 }

 func printRecord(r script.Record, verbose bool) {
 	fmt.Printf("%s %s %s %s %s %s\n", r["date"], r["builder"], r["goos"], r["goarch"],
 		r["pkg"], r["test"])
 	if verbose {
 		fmt.Printf("%s\n", indent(spaces[:4], r["snippet"]))
 	}
 }

 func (fp *FailurePost) IsBuildFailure() bool {
 	// no test ID. dummy for build failure.
 	return fp.Failure.TestID == ""
 }

 // String returns a header to identify the log and failure.
 func (fp *FailurePost) String() string {
 	repo := fp.Repo
 	sep := ""
 	if fp.GoCommit != "" {
 		sep = " go@"
 	}
 	if fp.GoBranch != "" && fp.GoBranch != "master" {
 		b := strings.TrimPrefix(fp.GoBranch, " release-branch.")
 		if repo == "go" {
 			repo = b
 		}
 		if sep == " go@" {
 			sep = " " + b + "@"
 		}
 	}
 	s := fmt.Sprintf("%s %s %s@%s%s%s",
 		fp.Time.Format("2006-01-02 15:04"),
 		fp.Builder, repo, shortHash(fp.Commit),
 		sep, shortHash(fp.GoCommit))

 	if fp.Pkg != "" || fp.Test != "" {
 		s += " " + shortPkg(fp.Pkg)
 		if fp.Pkg != "" && fp.Test != "" {
 			s += "."
 		}
 		s += fp.Test
 	}
 	if fp.IsBuildFailure() {
 		s += " [build]"
 	}
 	if fp.Failure.Status != rdbpb.TestStatus_FAIL {
 		s += fmt.Sprintf(" [%s]", fp.Failure.Status)
 	}
 	return s
 }

 // Markdown returns Markdown suitable for posting to GitHub.
 func (fp *FailurePost) Markdown() string {
 	return fmt.Sprintf("<details><summary>%s (<a href=\"%s\">log</a>)</summary>\n\n%s</details>\n",
 		fp.String(), fp.URL, indent(spaces[:4], fp.Snippet))
 }

 // Text returns text suitable for reading in interactive use or debug logging.
 func (fp *FailurePost) Text() string {
 	return fmt.Sprintf("%s\n%s\n%s\n", fp, fp.URL, strings.TrimRight(fp.Snippet, "\n"))
 }

 var spaces = strings.Repeat(" ", 100)

 // indent returns a copy of text in which every line has been indented by prefix.
 // It also ensures that, except when text is empty, text ends in a \n character.
 func indent(prefix, text string) string {
 	if text == "" {
 		return ""
 	}
 	text = strings.TrimRight(text, "\n")
 	return prefix + strings.ReplaceAll(text, "\n", "\n"+prefix) + "\n"
 }

 // shortPkg shortens pkg by removing any leading
 // golang.org/ (for packages like golang.org/x/sys/windows).
 func shortPkg(pkg string) string {
 	pkg = strings.TrimPrefix(pkg, "golang.org/")
 	return pkg
 }

 // shorten the output lines to form a snippet
 func snippet(log string) string {
 	lines := strings.SplitAfter(log, "\n")

 	// Remove beginning and trailing blank lines.
 	for len(lines) > 0 && strings.TrimSpace(lines[0]) == "" {
 		lines = lines[1:]
 	}
 	for len(lines) > 0 && strings.TrimSpace(lines[len(lines)-1]) == "" {
 		lines = lines[:len(lines)-1]
 	}

 	// If we have more than 30 lines, make the snippet by taking the first 10,
 	// the last 10, and possibly a middle 10. The middle 10 is included when
 	// the interior lines (between the first and last 10) contain an important-looking
 	// message like "panic:" or "--- FAIL:". The middle 10 start at the important-looking line.
 	// such as
 	if len(lines) > 30 {
 		var keep []string
 		keep = append(keep, lines[:10]...)
 		dots := true
 		for i := 10; i < len(lines)-10; i++ {
 			s := strings.TrimSpace(lines[i])
 			if strings.HasPrefix(s, "panic:") || strings.HasPrefix(s, "fatal error:") || strings.HasPrefix(s, "--- FAIL:") || strings.Contains(s, ": internal compiler error:") {
 				if i > 10 {
 					keep = append(keep, "...\n")
 				}
 				end := i + 10
 				if end >= len(lines)-10 {
 					dots = false
 					end = len(lines) - 10
 				}
 				keep = append(keep, lines[i:end]...)
 				break
 			}
 		}
 		if dots {
 			keep = append(keep, "...\n")
 		}
 		keep = append(keep, lines[len(lines)-10:]...)
 		lines = keep
 	}

 	return strings.Join(lines, "")
 }

 // If a build that has too many failures, the build is probably broken
 // (e.g. timeout, crash). Coalesce the failures and report maxFailPerBuild
 // of them.
 func coalesceFailures(fs []*Failure) []*Failure {
 	var res []*Failure
 	// A subtest fail may cause the parent test to fail, combine them.
 	// So is a test failure causing package-level failure.
 	var cur *Failure
 	for _, f := range fs {
 		if cur != nil {
 			cpkg, ctst := splitTestID(cur.TestID)
 			if (ctst == "" && strings.HasPrefix(f.TestID, cpkg+".")) ||
 				strings.HasPrefix(f.TestID, cur.TestID+"/") {
 				// 1. cur is a package-level failure, and f is a test in that package
 				// 2. f is a subtest of cur.
 				// In either case, consume cur, replace with f.
 				res[len(res)-1] = f
 				cur = f
 				continue
 			}
 		}
 		cur = f
 		res = append(res, f)
 	}
 	if len(res) <= maxFailPerBuild {
 		return res
 	}

 	// If multiple subtests fail under the same parent, pick one that is
 	// more likely to be helpful. Prefer the one containing "FAIL", then
 	// the longer log message.
 	moreLikelyUseful := func(f, last *Failure) bool {
 		return strings.Contains(f.LogText, "--- FAIL") &&
 			(!strings.Contains(last.LogText, "--- FAIL") || len(f.LogText) > len(last.LogText))
 	}
 	siblingSubtests := func(f, last *Failure) bool {
 		pkg, tst := splitTestID(f.TestID)
 		pkg2, tst2 := splitTestID(last.TestID)
 		if pkg != pkg2 {
 			return false
 		}
 		par, _, ok := strings.Cut(tst, "/")
 		par2, _, ok2 := strings.Cut(tst2, "/")
 		return ok && ok2 && par == par2
 	}
 	cur = nil
 	fs = res
 	res = fs[:0]
 	for _, f := range fs {
 		if cur != nil && siblingSubtests(f, cur) {
 			if moreLikelyUseful(f, res[len(res)-1]) {
 				res[len(res)-1] = f
 			}
 			continue
 		}
 		cur = f
 		res = append(res, f)
 	}
 	if len(res) <= maxFailPerBuild {
 		return res
 	}

 	// If there are still too many failures, coalesce by package (pick one with longest log).
 	fs = res
 	res = fs[:0]
 	curpkg := ""
 	for _, f := range fs {
 		pkg, _ := splitTestID(f.TestID)
 		if curpkg != "" && curpkg == pkg {
 			if moreLikelyUseful(f, res[len(res)-1]) {
 				res[len(res)-1] = f
 			}
 			continue
 		}
 		curpkg = pkg
 		res = append(res, f)
 	}
 	if len(res) > maxFailPerBuild {
 		res = res[:maxFailPerBuild]
 	}
 	return res
 }
	// Copyright 2024 The Go Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style
	// license that can be found in the LICENSE file.

	// Watchflakes is a program that triages apparent test flakes
	// on the build.golang.org dashboards. See https://go.dev/wiki/Watchflakes.
	package main

	import (
	"context"
	"flag"
	"fmt"
	"log"
	"os"
	"runtime"
	"strconv"
	"strings"
	"time"

	bbpb "go.chromium.org/luci/buildbucket/proto"
	rdbpb "go.chromium.org/luci/resultdb/proto/v1"
	"golang.org/x/build/buildenv"
	"golang.org/x/build/cmd/watchflakes/internal/script"
	"golang.org/x/build/internal/secret"
	"rsc.io/github"
	)

	// TODO:
	// - subrepos by go commit
	// - handle INFRA_FAILURE and CANCELED

	var _ = fmt.Print

	// Query failures within most recent timeLimit.
	const timeLimit = 45 * 24 * time.Hour

	const maxFailPerBuild = 3

	var (
	build = flag.String("build", "", "a particular build ID or URL to analyze (mainly for debugging)")
	md = flag.Bool("md", false, "print Markdown output suitable for GitHub issues")
	post = flag.Bool("post", false, "post updates to GitHub issues")
	repeat = flag.Duration("repeat", 0, "keep running with specified `period`; zero means to run once and exit")
	verbose = flag.Bool("v", false, "print verbose posting decisions")

	useSecretManager = flag.Bool("use-secret-manager", false, "fetch GitHub token from Secret Manager instead of $HOME/.netrc")
	)

	func usage() {
	fmt.Fprintf(os.Stderr, "Usage: watchflakes [options] [script]\n")
	flag.PrintDefaults()
	os.Exit(2)
	}

	func main() {
	log.SetPrefix("watchflakes: ")
	flag.Usage = usage
	buildenv.RegisterStagingFlag()
	flag.Parse()
	if flag.NArg() > 1 {
	usage()
	}

	var query *Issue
	if flag.NArg() == 1 {
	s, err := script.Parse("script", flag.Arg(0), fields)
	if err != nil {
	log.Fatalf("parsing query:\n%s", err)
	}
	query = &Issue{Issue: new(github.Issue), Script: s, ScriptText: flag.Arg(0)}
	}

	// Create an authenticated GitHub client.
	if *useSecretManager {
	// Fetch credentials from Secret Manager.
	secretCl, err := secret.NewClientInProject(buildenv.FromFlags().ProjectName)
	if err != nil {
	log.Fatalln("failed to create a Secret Manager client:", err)
	}
	ghToken, err := secretCl.Retrieve(context.Background(), secret.NameWatchflakesGitHubToken)
	if err != nil {
	log.Fatalln("failed to retrieve GitHub token from Secret Manager:", err)
	}
	gh = github.NewClient(ghToken)
	} else {
	// Use credentials in $HOME/.netrc.
	var err error
	gh, err = github.Dial("")
	if err != nil {
	log.Fatalln("github.Dial:", err)
	}
	}

	// Load LUCI dashboards
	c := NewLUCIClient(runtime.GOMAXPROCS(0) * 4)
	c.TraceSteps = true

	var ticker *time.Ticker
	timeout := 30 * time.Minute // default timeout for one-off run
	if *repeat != 0 {
	ticker = time.NewTicker(*repeat)
	timeout = repeat 2 // expected to finish in one repeat cycle, give some extra room
	}
	Repeat:
	startTime := time.Now()
	ctx, cancel := context.WithTimeout(context.Background(), timeout)
	var boards []*Dashboard
	if *build == "" {
	// fetch the dashboard
	var err error
	boards, err = c.ListBoards(ctx)
	if err != nil {
	log.Fatalln("ListBoards:", err)
	}
	err = c.ReadBoards(ctx, boards, startTime.Add(-timeLimit))
	if err != nil {
	log.Fatalln("ReadBoards:", err)
	}
	skipBrokenCommits(boards)
	skipBrokenBuilders(boards)
	} else {
	id, err := strconv.ParseInt(strings.TrimPrefix(*build, "https://ci.chromium.org/b/"), 10, 64)
	if err != nil {
	log.Fatalf("invalid build ID for -build flag: %q\n\texpect a build ID or https://ci.chromium.org/b/ URL", *build)
	}
	r, err := c.GetBuild(ctx, id)
	if err != nil {
	log.Fatalf("GetBuild %d: %v", id, err)
	}
	// make a one-entry board
	board := &Dashboard{
	Builders: []Builder{{r.Builder, r.BuilderConfigProperties}},
	Commits: []Commit{{Hash: r.Commit}},
	Results: [][]*BuildResult{{r}},
	}
	boards = []*Dashboard{board}
	}

	failRes := c.FindFailures(ctx, boards)
	c.FetchLogs(failRes)

	if *verbose {
	for _, r := range failRes {
	fmt.Printf("failure %s %s %s\n", r.Builder, shortHash(r.Commit), buildURL(r.ID))
	}
	}

	// Load GitHub issues
	var issues []*Issue
	issues, err := readIssues(issues)
	if err != nil {
	log.Fatal(err)
	}
	findScripts(issues)
	if query == nil {
	postIssueErrors(issues)
	}
	if query != nil {
	issues = []*Issue{query}
	}

	for _, r := range failRes {
	newIssue := 0
	fs := r.Failures
	fs = coalesceFailures(fs)
	if len(fs) == 0 {
	// No test failure, Probably a build failure.
	// E.g. https://ci.chromium.org/ui/b/8759448820419452721
	// Make a dummy failure.
	f := &Failure{
	Status: rdbpb.TestStatus_FAIL,
	LogText: r.StepLogText,
	}
	fs = []*Failure{f}
	}
	for _, f := range fs {
	fp := NewFailurePost(r, f)
	record := fp.Record()
	action, targets := run(issues, record)
	if *verbose {
	printRecord(record, false)
	fmt.Printf("\t%s %v\n", action, targets)
	}
	switch action {
	case "skip":
	// do nothing
	if *verbose {
	fmt.Printf("%s: skipped by #%d\n", fp.URL, targets[0].Number)
	}

	case "":
	if newIssue > 0 {
	// If we already opened a new issue for a build, don't open another one.
	// It could be that the build is just broken.
	// One can look at the issue and split if necessary.
	break
	}

	// create a new issue
	if query == nil {
	if *verbose {
	fmt.Printf("%s: new issue\n", fp.URL)
	}
	issue, err := reportNew(fp)
	if err != nil {
	log.Fatal(err)
	}
	issues = append(issues, issue)
	newIssue++
	}

	case "default", "post", "take":
	for _, issue := range targets {
	if !issue.Mentions[fp.URL] && issue.Stale {
	readComments(issue)
	}
	if *verbose {
	mentioned := "un"
	if issue.Mentions[fp.URL] {
	mentioned = "already "
	}
	fmt.Printf("%s: %s #%d, %smentioned\n", fp.URL, action, issue.Number, mentioned)
	}
	if !issue.Mentions[fp.URL] {
	issue.Post = append(issue.Post, fp)
	}
	}
	}
	}
	}

	if query != nil {
	format := (*FailurePost).Text
	if *md {
	format = (*FailurePost).Markdown
	}
	for i, fp := range query.Post {
	if i > 0 {
	fmt.Printf("\n")
	}
	os.Stdout.WriteString(format(fp))
	}
	if *md {
	os.Stdout.WriteString(signature)
	}
	return
	}

	posts := 0
	for _, issue := range issues {
	if len(issue.Post) > 0 {
	fmt.Printf(" - new for #%d %s\n", issue.Number, issue.Title)
	for _, fp := range issue.Post {
	fmt.Printf(" - %s\n %s\n", fp, fp.URL)
	}
	msg := updateText(issue)
	if *verbose {
	fmt.Printf("\n%s\n", indent(spaces[:3], msg))
	}
	if *post {
	if err := postComment(issue, msg); err != nil {
	log.Print(err)
	continue
	}
	if issue.Mentions == nil {
	issue.Mentions = make(map[string]bool)
	}
	for _, fp := range issue.Post {
	issue.Mentions[fp.URL] = true
	}
	}
	posts++
	}
	}

	cancel()

	log.Printf("Done. %d boards, %d failures, %d issues, %d posts, in %v\n", len(boards), len(failRes), len(issues), posts, time.Since(startTime))

	if *repeat != 0 {
	<-ticker.C
	goto Repeat
	}
	}

	const SKIP = bbpb.Status_STATUS_UNSPECIFIED // for smashing the status to skip a non-flake failure

	// skipBrokenCommits identifies broken commits,
	// which are the ones that failed on at least 1/4 of builders,
	// and then changes all results for those commits to SKIP.
	func skipBrokenCommits(boards []*Dashboard) {
	for _, dash := range boards {
	builderThreshold := len(dash.Builders) / 4
	for i := 0; i < len(dash.Commits); i++ {
	bad := 0
	good := 0
	for _, rs := range dash.Results {
	if rs[i] == nil {
	continue
	}
	switch rs[i].Status {
	case bbpb.Status_SUCCESS:
	good++
	case bbpb.Status_FAILURE:
	bad++
	// ignore other status
	}
	}
	if bad > builderThreshold \|\| good < builderThreshold {
	fmt.Printf("skip: commit %s (%s %s) is broken (good=%d bad=%d)\n", shortHash(dash.Commits[i].Hash), dash.Repo, dash.GoBranch, good, bad)
	for _, rs := range dash.Results {
	if rs[i] != nil {
	rs[i].Status = SKIP
	}
	}
	}
	}
	}
	}

	// skipBrokenBuilders identifies builders that were consistently broken
	// (at least tooManyToBeFlakes failures in a row) and then turned ok.
	// It changes those consistent failures to SKIP.
	func skipBrokenBuilders(boards []*Dashboard) {
	const tooManyToBeFlakes = 4

	for _, dash := range boards {
	for _, rs := range dash.Results {
	bad := 100 // squash failures at the top of the dashboard, which may turn out to be consistent
	badStart := 0
	skip := func(i int) { // skip the i-th result
	if rs[i] != nil {
	fmt.Printf("skip: builder %s was broken at %s (%s %s)\n", rs[i].Builder, shortHash(rs[i].Commit), dash.Repo, dash.GoBranch)
	rs[i].Status = SKIP
	}
	}
	for i, r := range rs {
	if rs[i] == nil {
	continue
	}
	switch r.Status {
	case bbpb.Status_SUCCESS:
	bad = 0
	continue
	case bbpb.Status_FAILURE:
	bad++
	default: // ignore other status
	continue
	}
	switch bad {
	case 1:
	badStart = i
	case tooManyToBeFlakes:
	// Skip the run so far.
	for j := badStart; j < i; j++ {
	skip(j)
	}
	}
	if bad >= tooManyToBeFlakes {
	skip(i)
	}
	}

	// Bad entries ending just before the cutoff are not flakes
	// even if there are just a few of them. Otherwise we get
	// spurious flakes when there's one bad entry before the
	// cutoff and lots after the cutoff.
	if bad > 0 && badStart > 0 {
	for j := badStart; j < len(rs); j++ {
	skip(j)
	}
	}
	}
	}
	}

	// run runs the scripts in issues on record.
	// It returns the desired action (skip, post, default)
	// as well as the list of target issues (for post or default).
	func run(issues []Issue, record script.Record) (action string, targets []Issue) {
	var def, post []*Issue

	for _, issue := range issues {
	if issue.Script != nil {
	switch issue.Script.Action(record) {
	case "skip":
	return "skip", []*Issue{issue}
	case "take":
	println("TAKE", issue.Number)
	case "default":
	def = append(def, issue)
	case "post":
	post = append(post, issue)
	}
	}
	}

	if len(post) > 0 {
	return "post", post
	}
	if len(def) > 0 {
	return "default", def[:1]
	}
	return "", nil
	}

	// FailurePost is a failure to be posted on an issue.
	type FailurePost struct {
	*BuildResult
	*Failure
	URL string // LUCI build page
	Pkg string
	Test string
	Snippet string
	}

	func NewFailurePost(r BuildResult, f Failure) *FailurePost {
	pkg, test := splitTestID(f.TestID)
	snip := snippet(f.LogText)
	if snip == "" {
	snip = snippet(r.LogText)
	}
	fp := &FailurePost{
	BuildResult: r,
	Failure: f,
	URL: buildURL(r.ID),
	Pkg: pkg,
	Test: test,
	Snippet: snip,
	}
	return fp
	}

	// fields is the list of known fields for use by script patterns.
	// It must be in sync with the Record method below.
	var fields = []string{
	"",
	"section", // not used, keep for compatibility with old watchflakes
	"pkg",
	"test",
	"mode",
	"output",
	"snippet",
	"date",
	"builder",
	"repo",
	"goos",
	"goarch",
	"log",
	"status",
	}

	func (fp *FailurePost) Record() script.Record {
	// Note: update fields above if any new fields are added to this record.
	m := script.Record{
	"pkg": fp.Pkg,
	"test": fp.Test,
	"output": fp.Failure.LogText,
	"snippet": fp.Snippet,
	"date": fp.Time.Format(time.RFC3339),
	"builder": fp.Builder,
	"repo": fp.Repo,
	"goos": fp.Target.GOOS,
	"goarch": fp.Target.GOARCH,
	"log": fp.BuildResult.LogText,
	"status": fp.Failure.Status.String(),
	}
	m[""] = m["output"] // default field for `regexp` search (as opposed to field ~ `regexp`)
	if fp.IsBuildFailure() {
	m["mode"] = "build"
	}
	return m
	}

	func printRecord(r script.Record, verbose bool) {
	fmt.Printf("%s %s %s %s %s %s\n", r["date"], r["builder"], r["goos"], r["goarch"],
	r["pkg"], r["test"])
	if verbose {
	fmt.Printf("%s\n", indent(spaces[:4], r["snippet"]))
	}
	}

	func (fp *FailurePost) IsBuildFailure() bool {
	// no test ID. dummy for build failure.
	return fp.Failure.TestID == ""
	}

	// String returns a header to identify the log and failure.
	func (fp *FailurePost) String() string {
	repo := fp.Repo
	sep := ""
	if fp.GoCommit != "" {
	sep = " go@"
	}
	if fp.GoBranch != "" && fp.GoBranch != "master" {
	b := strings.TrimPrefix(fp.GoBranch, " release-branch.")
	if repo == "go" {
	repo = b
	}
	if sep == " go@" {
	sep = " " + b + "@"
	}
	}
	s := fmt.Sprintf("%s %s %s@%s%s%s",
	fp.Time.Format("2006-01-02 15:04"),
	fp.Builder, repo, shortHash(fp.Commit),
	sep, shortHash(fp.GoCommit))

	if fp.Pkg != "" \|\| fp.Test != "" {
	s += " " + shortPkg(fp.Pkg)
	if fp.Pkg != "" && fp.Test != "" {
	s += "."
	}
	s += fp.Test
	}
	if fp.IsBuildFailure() {
	s += " [build]"
	}
	if fp.Failure.Status != rdbpb.TestStatus_FAIL {
	s += fmt.Sprintf(" [%s]", fp.Failure.Status)
	}
	return s
	}

	// Markdown returns Markdown suitable for posting to GitHub.
	func (fp *FailurePost) Markdown() string {
	return fmt.Sprintf("<details><summary>%s (<a href=\"%s\">log</a>)</summary>\n\n%s</details>\n",
	fp.String(), fp.URL, indent(spaces[:4], fp.Snippet))
	}

	// Text returns text suitable for reading in interactive use or debug logging.
	func (fp *FailurePost) Text() string {
	return fmt.Sprintf("%s\n%s\n%s\n", fp, fp.URL, strings.TrimRight(fp.Snippet, "\n"))
	}

	var spaces = strings.Repeat(" ", 100)

	// indent returns a copy of text in which every line has been indented by prefix.
	// It also ensures that, except when text is empty, text ends in a \n character.
	func indent(prefix, text string) string {
	if text == "" {
	return ""
	}
	text = strings.TrimRight(text, "\n")
	return prefix + strings.ReplaceAll(text, "\n", "\n"+prefix) + "\n"
	}

	// shortPkg shortens pkg by removing any leading
	// golang.org/ (for packages like golang.org/x/sys/windows).
	func shortPkg(pkg string) string {
	pkg = strings.TrimPrefix(pkg, "golang.org/")
	return pkg
	}

	// shorten the output lines to form a snippet
	func snippet(log string) string {
	lines := strings.SplitAfter(log, "\n")

	// Remove beginning and trailing blank lines.
	for len(lines) > 0 && strings.TrimSpace(lines[0]) == "" {
	lines = lines[1:]
	}
	for len(lines) > 0 && strings.TrimSpace(lines[len(lines)-1]) == "" {
	lines = lines[:len(lines)-1]
	}

	// If we have more than 30 lines, make the snippet by taking the first 10,
	// the last 10, and possibly a middle 10. The middle 10 is included when
	// the interior lines (between the first and last 10) contain an important-looking
	// message like "panic:" or "--- FAIL:". The middle 10 start at the important-looking line.
	// such as
	if len(lines) > 30 {
	var keep []string
	keep = append(keep, lines[:10]...)
	dots := true
	for i := 10; i < len(lines)-10; i++ {
	s := strings.TrimSpace(lines[i])
	if strings.HasPrefix(s, "panic:") \|\| strings.HasPrefix(s, "fatal error:") \|\| strings.HasPrefix(s, "--- FAIL:") \|\| strings.Contains(s, ": internal compiler error:") {
	if i > 10 {
	keep = append(keep, "...\n")
	}
	end := i + 10
	if end >= len(lines)-10 {
	dots = false
	end = len(lines) - 10
	}
	keep = append(keep, lines[i:end]...)
	break
	}
	}
	if dots {
	keep = append(keep, "...\n")
	}
	keep = append(keep, lines[len(lines)-10:]...)
	lines = keep
	}

	return strings.Join(lines, "")
	}

	// If a build that has too many failures, the build is probably broken
	// (e.g. timeout, crash). Coalesce the failures and report maxFailPerBuild
	// of them.
	func coalesceFailures(fs []Failure) []Failure {
	var res []*Failure
	// A subtest fail may cause the parent test to fail, combine them.
	// So is a test failure causing package-level failure.
	var cur *Failure
	for _, f := range fs {
	if cur != nil {
	cpkg, ctst := splitTestID(cur.TestID)
	if (ctst == "" && strings.HasPrefix(f.TestID, cpkg+".")) \|\|
	strings.HasPrefix(f.TestID, cur.TestID+"/") {
	// 1. cur is a package-level failure, and f is a test in that package
	// 2. f is a subtest of cur.
	// In either case, consume cur, replace with f.
	res[len(res)-1] = f
	cur = f
	continue
	}
	}
	cur = f
	res = append(res, f)
	}
	if len(res) <= maxFailPerBuild {
	return res
	}

	// If multiple subtests fail under the same parent, pick one that is
	// more likely to be helpful. Prefer the one containing "FAIL", then
	// the longer log message.
	moreLikelyUseful := func(f, last *Failure) bool {
	return strings.Contains(f.LogText, "--- FAIL") &&
	(!strings.Contains(last.LogText, "--- FAIL") \|\| len(f.LogText) > len(last.LogText))
	}
	siblingSubtests := func(f, last *Failure) bool {
	pkg, tst := splitTestID(f.TestID)
	pkg2, tst2 := splitTestID(last.TestID)
	if pkg != pkg2 {
	return false
	}
	par, _, ok := strings.Cut(tst, "/")
	par2, _, ok2 := strings.Cut(tst2, "/")
	return ok && ok2 && par == par2
	}
	cur = nil
	fs = res
	res = fs[:0]
	for _, f := range fs {
	if cur != nil && siblingSubtests(f, cur) {
	if moreLikelyUseful(f, res[len(res)-1]) {
	res[len(res)-1] = f
	}
	continue
	}
	cur = f
	res = append(res, f)
	}
	if len(res) <= maxFailPerBuild {
	return res
	}

	// If there are still too many failures, coalesce by package (pick one with longest log).
	fs = res
	res = fs[:0]
	curpkg := ""
	for _, f := range fs {
	pkg, _ := splitTestID(f.TestID)
	if curpkg != "" && curpkg == pkg {
	if moreLikelyUseful(f, res[len(res)-1]) {
	res[len(res)-1] = f
	}
	continue
	}
	curpkg = pkg
	res = append(res, f)
	}
	if len(res) > maxFailPerBuild {
	res = res[:maxFailPerBuild]
	}
	return res
	}