| // Copyright 2024 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| // Watchflakes is a program that triages apparent test flakes |
| // on the build.golang.org dashboards. See https://go.dev/wiki/Watchflakes. |
| package main |
| |
| import ( |
| "context" |
| "flag" |
| "fmt" |
| "log" |
| "os" |
| "runtime" |
| "strconv" |
| "strings" |
| "time" |
| |
| bbpb "go.chromium.org/luci/buildbucket/proto" |
| rdbpb "go.chromium.org/luci/resultdb/proto/v1" |
| "golang.org/x/build/buildenv" |
| "golang.org/x/build/cmd/watchflakes/internal/script" |
| "golang.org/x/build/internal/secret" |
| "rsc.io/github" |
| ) |
| |
| // TODO: |
| // - subrepos by go commit |
| // - handle INFRA_FAILURE and CANCELED |
| |
| var _ = fmt.Print |
| |
| // Query failures within most recent timeLimit. |
| const timeLimit = 45 * 24 * time.Hour |
| |
| const maxFailPerBuild = 3 |
| |
| const tooManyToBeFlakes = 4 |
| |
| var ( |
| build = flag.String("build", "", "a particular build ID or URL to analyze (mainly for debugging)") |
| md = flag.Bool("md", false, "print Markdown output suitable for GitHub issues") |
| post = flag.Bool("post", false, "post updates to GitHub issues") |
| repeat = flag.Duration("repeat", 0, "keep running with specified `period`; zero means to run once and exit") |
| verbose = flag.Bool("v", false, "print verbose posting decisions") |
| |
| useSecretManager = flag.Bool("use-secret-manager", false, "fetch GitHub token from Secret Manager instead of $HOME/.netrc") |
| ) |
| |
| func usage() { |
| fmt.Fprintf(os.Stderr, "Usage: watchflakes [options] [script]\n") |
| flag.PrintDefaults() |
| os.Exit(2) |
| } |
| |
| func main() { |
| log.SetPrefix("watchflakes: ") |
| flag.Usage = usage |
| buildenv.RegisterStagingFlag() |
| flag.Parse() |
| if flag.NArg() > 1 { |
| usage() |
| } |
| |
| var query *Issue |
| if flag.NArg() == 1 { |
| s, err := script.Parse("script", flag.Arg(0), fields) |
| if err != nil { |
| log.Fatalf("parsing query:\n%s", err) |
| } |
| query = &Issue{Issue: new(github.Issue), Script: s, ScriptText: flag.Arg(0)} |
| } |
| |
| // Create an authenticated GitHub client. |
| if *useSecretManager { |
| // Fetch credentials from Secret Manager. |
| secretCl, err := secret.NewClientInProject(buildenv.FromFlags().ProjectName) |
| if err != nil { |
| log.Fatalln("failed to create a Secret Manager client:", err) |
| } |
| ghToken, err := secretCl.Retrieve(context.Background(), secret.NameWatchflakesGitHubToken) |
| if err != nil { |
| log.Fatalln("failed to retrieve GitHub token from Secret Manager:", err) |
| } |
| gh = github.NewClient(ghToken) |
| } else { |
| // Use credentials in $HOME/.netrc. |
| var err error |
| gh, err = github.Dial("") |
| if err != nil { |
| log.Fatalln("github.Dial:", err) |
| } |
| } |
| |
| // Load LUCI dashboards |
| c := NewLUCIClient(runtime.GOMAXPROCS(0) * 4) |
| c.TraceSteps = true |
| |
| var ticker *time.Ticker |
| timeout := 30 * time.Minute // default timeout for one-off run |
| if *repeat != 0 { |
| ticker = time.NewTicker(*repeat) |
| timeout = *repeat * 2 // expected to finish in one repeat cycle, give some extra room |
| } |
| Repeat: |
| startTime := time.Now() |
| ctx, cancel := context.WithTimeout(context.Background(), timeout) |
| var boards []*Dashboard |
| if *build == "" { |
| // fetch the dashboard |
| var err error |
| boards, err = c.ListBoards(ctx) |
| if err != nil { |
| log.Fatalln("ListBoards:", err) |
| } |
| err = c.ReadBoards(ctx, boards, startTime.Add(-timeLimit)) |
| if err != nil { |
| log.Fatalln("ReadBoards:", err) |
| } |
| skipBrokenCommits(boards) |
| skipBrokenBuilders(boards) |
| } else { |
| id, err := strconv.ParseInt(strings.TrimPrefix(*build, "https://ci.chromium.org/b/"), 10, 64) |
| if err != nil { |
| log.Fatalf("invalid build ID for -build flag: %q\n\texpect a build ID or https://ci.chromium.org/b/ URL", *build) |
| } |
| r, err := c.GetBuild(ctx, id) |
| if err != nil { |
| log.Fatalf("GetBuild %d: %v", id, err) |
| } |
| // make a one-entry board |
| board := &Dashboard{ |
| Builders: []Builder{{r.Builder, r.BuilderConfigProperties}}, |
| Commits: []Commit{{Hash: r.Commit}}, |
| Results: [][]*BuildResult{{r}}, |
| } |
| boards = []*Dashboard{board} |
| } |
| |
| failRes := c.FindFailures(ctx, boards) |
| c.FetchLogs(failRes) |
| |
| if *verbose { |
| for _, r := range failRes { |
| fmt.Printf("failure %s %s %s\n", r.Builder, shortHash(r.Commit), buildURL(r.ID)) |
| } |
| } |
| |
| // Load GitHub issues |
| var issues []*Issue |
| issues, err := readIssues(issues) |
| if err != nil { |
| log.Fatal(err) |
| } |
| findScripts(issues) |
| if query == nil { |
| postIssueErrors(issues) |
| } |
| if query != nil { |
| issues = []*Issue{query} |
| } |
| |
| for _, r := range failRes { |
| newIssue := 0 |
| fs := r.Failures |
| fs = coalesceFailures(fs) |
| if len(fs) == 0 { |
| // No test failure, Probably a build failure. |
| // E.g. https://ci.chromium.org/ui/b/8759448820419452721 |
| // Make a dummy failure. |
| f := &Failure{ |
| Status: rdbpb.TestStatus_FAIL, |
| LogText: r.StepLogText, |
| } |
| fs = []*Failure{f} |
| } |
| for _, f := range fs { |
| fp := NewFailurePost(r, f) |
| record := fp.Record() |
| action, targets := run(issues, record) |
| if *verbose { |
| printRecord(record, false) |
| fmt.Printf("\t%s %v\n", action, targets) |
| } |
| switch action { |
| case "skip": |
| // do nothing |
| if *verbose { |
| fmt.Printf("%s: skipped by #%d\n", fp.URL, targets[0].Number) |
| } |
| |
| case "": |
| if newIssue > 0 { |
| // If we already opened a new issue for a build, don't open another one. |
| // It could be that the build is just broken. |
| // One can look at the issue and split if necessary. |
| break |
| } |
| |
| // create a new issue |
| if query == nil { |
| if *verbose { |
| fmt.Printf("%s: new issue\n", fp.URL) |
| } |
| issue, err := prepareNew(fp) |
| if err != nil { |
| log.Fatal(err) |
| } |
| issues = append(issues, issue) |
| newIssue++ |
| } |
| |
| case "default", "post", "take": |
| for _, issue := range targets { |
| if !issue.Mentions[fp.URL] && issue.Stale { |
| readComments(issue) |
| } |
| if *verbose { |
| mentioned := "un" |
| if issue.Mentions[fp.URL] { |
| mentioned = "already " |
| } |
| fmt.Printf("%s: %s #%d, %smentioned\n", fp.URL, action, issue.Number, mentioned) |
| } |
| if !issue.Mentions[fp.URL] { |
| issue.Post = append(issue.Post, fp) |
| } |
| } |
| } |
| } |
| } |
| for _, issue := range issues { |
| if issue.Number == 0 && len(issue.Post) >= tooManyToBeFlakes && issue.Post[0].Top { |
| // New issue. Check if it is failing consistently at top. |
| top := 0 |
| for _, fp := range issue.Post { |
| if fp.Top { |
| top++ |
| } |
| } |
| if top >= tooManyToBeFlakes { |
| issue.Title += " [consistent failure]" |
| } |
| } |
| } |
| |
| if query != nil { |
| format := (*FailurePost).Text |
| if *md { |
| format = (*FailurePost).Markdown |
| } |
| for i, fp := range query.Post { |
| if i > 0 { |
| fmt.Printf("\n") |
| } |
| os.Stdout.WriteString(format(fp)) |
| } |
| if *md { |
| os.Stdout.WriteString(signature) |
| } |
| return |
| } |
| |
| posts := 0 |
| for _, issue := range issues { |
| if len(issue.Post) > 0 { |
| if *post && issue.Number == 0 { |
| issue.Issue = postNew(issue.Title, issue.Body) |
| } |
| fmt.Printf(" - new for #%d %s\n", issue.Number, issue.Title) |
| for _, fp := range issue.Post { |
| fmt.Printf(" - %s\n %s\n", fp, fp.URL) |
| } |
| msg := updateText(issue) |
| if *verbose { |
| fmt.Printf("\n%s\n", indent(spaces[:3], msg)) |
| } |
| if *post { |
| if err := postComment(issue, msg); err != nil { |
| log.Print(err) |
| continue |
| } |
| if issue.Mentions == nil { |
| issue.Mentions = make(map[string]bool) |
| } |
| for _, fp := range issue.Post { |
| issue.Mentions[fp.URL] = true |
| } |
| } |
| posts++ |
| } |
| } |
| |
| cancel() |
| |
| log.Printf("Done. %d boards, %d failures, %d issues, %d posts, in %v\n", len(boards), len(failRes), len(issues), posts, time.Since(startTime)) |
| |
| if *repeat != 0 { |
| <-ticker.C |
| goto Repeat |
| } |
| } |
| |
| const SKIP = bbpb.Status_STATUS_UNSPECIFIED // for smashing the status to skip a non-flake failure |
| |
| // skipBrokenCommits identifies broken commits, |
| // which are the ones that failed on at least 1/4 of builders, |
| // and then changes all results for those commits to SKIP. |
| func skipBrokenCommits(boards []*Dashboard) { |
| for _, dash := range boards { |
| builderThreshold := len(dash.Builders) / 4 |
| for i := 0; i < len(dash.Commits); i++ { |
| bad := 0 |
| good := 0 |
| for _, rs := range dash.Results { |
| if rs[i] == nil { |
| continue |
| } |
| switch rs[i].Status { |
| case bbpb.Status_SUCCESS: |
| good++ |
| case bbpb.Status_FAILURE: |
| bad++ |
| // ignore other status |
| } |
| } |
| if bad > builderThreshold || good < builderThreshold { |
| fmt.Printf("skip: commit %s (%s %s) is broken (good=%d bad=%d)\n", shortHash(dash.Commits[i].Hash), dash.Repo, dash.GoBranch, good, bad) |
| for _, rs := range dash.Results { |
| if rs[i] != nil { |
| rs[i].Status = SKIP |
| } |
| } |
| } |
| } |
| } |
| } |
| |
| // skipBrokenBuilders identifies builders that were consistently broken |
| // (at least tooManyToBeFlakes failures in a row) and then turned ok. |
| // It changes those consistent failures to SKIP. |
| // |
| // It does not skip consistent failures at the top (latest few commits). |
| // Instead, it sets Top to true on them. |
| func skipBrokenBuilders(boards []*Dashboard) { |
| for _, dash := range boards { |
| for _, rs := range dash.Results { |
| bad := 0 |
| badStart := 0 |
| top := true |
| skip := func(i int) { // skip the i-th result |
| if rs[i] != nil { |
| fmt.Printf("skip: builder %s was broken at %s (%s %s)\n", rs[i].Builder, shortHash(rs[i].Commit), dash.Repo, dash.GoBranch) |
| rs[i].Status = SKIP |
| } |
| } |
| for i, r := range rs { |
| if rs[i] == nil { |
| continue |
| } |
| switch r.Status { |
| case bbpb.Status_SUCCESS: |
| if top && bad < tooManyToBeFlakes { |
| // Skip the run at the top. |
| // Too few to tell if it is flaky or consistent. |
| // It may also get fixed soon. |
| for j := 0; j < i; j++ { |
| skip(j) |
| } |
| } |
| top = false |
| bad = 0 |
| continue |
| case bbpb.Status_FAILURE: |
| bad++ |
| if top { |
| // Set Top to true, but don't skip. |
| r.Top = true |
| continue |
| } |
| default: // ignore other status |
| continue |
| } |
| switch bad { |
| case 1: |
| badStart = i |
| case tooManyToBeFlakes: |
| // Skip the run so far. |
| for j := badStart; j < i; j++ { |
| skip(j) |
| } |
| } |
| if bad >= tooManyToBeFlakes { |
| skip(i) |
| } |
| } |
| |
| // Bad entries ending just before the cutoff are not flakes |
| // even if there are just a few of them. Otherwise we get |
| // spurious flakes when there's one bad entry before the |
| // cutoff and lots after the cutoff. |
| if bad > 0 { |
| for j := badStart; j < len(rs); j++ { |
| skip(j) |
| } |
| } |
| } |
| } |
| } |
| |
| // run runs the scripts in issues on record. |
| // It returns the desired action (skip, post, default) |
| // as well as the list of target issues (for post or default). |
| func run(issues []*Issue, record script.Record) (action string, targets []*Issue) { |
| var def, post []*Issue |
| |
| for _, issue := range issues { |
| if issue.Script != nil { |
| switch issue.Script.Action(record) { |
| case "skip": |
| return "skip", []*Issue{issue} |
| case "take": |
| println("TAKE", issue.Number) |
| case "default": |
| def = append(def, issue) |
| case "post": |
| post = append(post, issue) |
| } |
| } |
| } |
| |
| if len(post) > 0 { |
| return "post", post |
| } |
| if len(def) > 0 { |
| return "default", def[:1] |
| } |
| return "", nil |
| } |
| |
| // FailurePost is a failure to be posted on an issue. |
| type FailurePost struct { |
| *BuildResult |
| *Failure |
| URL string // LUCI build page |
| Pkg string |
| Test string |
| Snippet string |
| } |
| |
| func NewFailurePost(r *BuildResult, f *Failure) *FailurePost { |
| pkg, test := splitTestID(f.TestID) |
| snip := snippet(f.LogText) |
| if snip == "" { |
| snip = snippet(r.LogText) |
| } |
| fp := &FailurePost{ |
| BuildResult: r, |
| Failure: f, |
| URL: buildURL(r.ID), |
| Pkg: pkg, |
| Test: test, |
| Snippet: snip, |
| } |
| return fp |
| } |
| |
| // fields is the list of known fields for use by script patterns. |
| // It must be in sync with the Record method below. |
| var fields = []string{ |
| "", |
| "section", // not used, keep for compatibility with old watchflakes |
| "pkg", |
| "test", |
| "mode", |
| "output", |
| "snippet", |
| "date", |
| "builder", |
| "repo", |
| "goos", |
| "goarch", |
| "log", |
| "status", |
| } |
| |
| func (fp *FailurePost) Record() script.Record { |
| // Note: update fields above if any new fields are added to this record. |
| m := script.Record{ |
| "pkg": fp.Pkg, |
| "test": fp.Test, |
| "output": fp.Failure.LogText, |
| "snippet": fp.Snippet, |
| "date": fp.Time.Format(time.RFC3339), |
| "builder": fp.Builder, |
| "repo": fp.Repo, |
| "goos": fp.Target.GOOS, |
| "goarch": fp.Target.GOARCH, |
| "log": fp.BuildResult.LogText, |
| "status": fp.Failure.Status.String(), |
| } |
| m[""] = m["output"] // default field for `regexp` search (as opposed to field ~ `regexp`) |
| if fp.IsBuildFailure() { |
| m["mode"] = "build" |
| } |
| return m |
| } |
| |
| func printRecord(r script.Record, verbose bool) { |
| fmt.Printf("%s %s %s %s %s %s\n", r["date"], r["builder"], r["goos"], r["goarch"], |
| r["pkg"], r["test"]) |
| if verbose { |
| fmt.Printf("%s\n", indent(spaces[:4], r["snippet"])) |
| } |
| } |
| |
| func (fp *FailurePost) IsBuildFailure() bool { |
| // no test ID. dummy for build failure. |
| return fp.Failure.TestID == "" |
| } |
| |
| // String returns a header to identify the log and failure. |
| func (fp *FailurePost) String() string { |
| repo := fp.Repo |
| sep := "" |
| if fp.GoCommit != "" { |
| sep = " go@" |
| } |
| if fp.GoBranch != "" && fp.GoBranch != "master" { |
| b := strings.TrimPrefix(fp.GoBranch, " release-branch.") |
| if repo == "go" { |
| repo = b |
| } |
| if sep == " go@" { |
| sep = " " + b + "@" |
| } |
| } |
| s := fmt.Sprintf("%s %s %s@%s%s%s", |
| fp.Time.Format("2006-01-02 15:04"), |
| fp.Builder, repo, shortHash(fp.Commit), |
| sep, shortHash(fp.GoCommit)) |
| |
| if fp.Pkg != "" || fp.Test != "" { |
| s += " " + shortPkg(fp.Pkg) |
| if fp.Pkg != "" && fp.Test != "" { |
| s += "." |
| } |
| s += fp.Test |
| } |
| if fp.IsBuildFailure() { |
| s += " [build]" |
| } |
| if fp.Failure.Status != rdbpb.TestStatus_FAIL { |
| s += fmt.Sprintf(" [%s]", fp.Failure.Status) |
| } |
| return s |
| } |
| |
| // Markdown returns Markdown suitable for posting to GitHub. |
| func (fp *FailurePost) Markdown() string { |
| return fmt.Sprintf("<details><summary>%s (<a href=\"%s\">log</a>)</summary>\n\n%s</details>\n", |
| fp.String(), fp.URL, indent(spaces[:4], fp.Snippet)) |
| } |
| |
| // Text returns text suitable for reading in interactive use or debug logging. |
| func (fp *FailurePost) Text() string { |
| return fmt.Sprintf("%s\n%s\n%s\n", fp, fp.URL, strings.TrimRight(fp.Snippet, "\n")) |
| } |
| |
| var spaces = strings.Repeat(" ", 100) |
| |
| // indent returns a copy of text in which every line has been indented by prefix. |
| // It also ensures that, except when text is empty, text ends in a \n character. |
| func indent(prefix, text string) string { |
| if text == "" { |
| return "" |
| } |
| text = strings.TrimRight(text, "\n") |
| return prefix + strings.ReplaceAll(text, "\n", "\n"+prefix) + "\n" |
| } |
| |
| // shortPkg shortens pkg by removing any leading |
| // golang.org/ (for packages like golang.org/x/sys/windows). |
| func shortPkg(pkg string) string { |
| pkg = strings.TrimPrefix(pkg, "golang.org/") |
| return pkg |
| } |
| |
| // shorten the output lines to form a snippet |
| func snippet(log string) string { |
| lines := strings.SplitAfter(log, "\n") |
| |
| // Remove beginning and trailing blank lines. |
| for len(lines) > 0 && strings.TrimSpace(lines[0]) == "" { |
| lines = lines[1:] |
| } |
| for len(lines) > 0 && strings.TrimSpace(lines[len(lines)-1]) == "" { |
| lines = lines[:len(lines)-1] |
| } |
| |
| // If we have more than 30 lines, make the snippet by taking the first 10, |
| // the last 10, and possibly a middle 10. The middle 10 is included when |
| // the interior lines (between the first and last 10) contain an important-looking |
| // message like "panic:" or "--- FAIL:". The middle 10 start at the important-looking line. |
| // such as |
| if len(lines) > 30 { |
| var keep []string |
| keep = append(keep, lines[:10]...) |
| dots := true |
| for i := 10; i < len(lines)-10; i++ { |
| s := strings.TrimSpace(lines[i]) |
| if strings.HasPrefix(s, "panic:") || strings.HasPrefix(s, "fatal error:") || strings.HasPrefix(s, "--- FAIL:") || strings.Contains(s, ": internal compiler error:") { |
| if i > 10 { |
| keep = append(keep, "...\n") |
| } |
| end := i + 10 |
| if end >= len(lines)-10 { |
| dots = false |
| end = len(lines) - 10 |
| } |
| keep = append(keep, lines[i:end]...) |
| break |
| } |
| } |
| if dots { |
| keep = append(keep, "...\n") |
| } |
| keep = append(keep, lines[len(lines)-10:]...) |
| lines = keep |
| } |
| |
| return strings.Join(lines, "") |
| } |
| |
| // If a build that has too many failures, the build is probably broken |
| // (e.g. timeout, crash). Coalesce the failures and report maxFailPerBuild |
| // of them. |
| func coalesceFailures(fs []*Failure) []*Failure { |
| var res []*Failure |
| // A subtest fail may cause the parent test to fail, combine them. |
| // So is a test failure causing package-level failure. |
| var cur *Failure |
| for _, f := range fs { |
| if cur != nil { |
| cpkg, ctst := splitTestID(cur.TestID) |
| if (ctst == "" && strings.HasPrefix(f.TestID, cpkg+".")) || |
| strings.HasPrefix(f.TestID, cur.TestID+"/") { |
| // 1. cur is a package-level failure, and f is a test in that package |
| // 2. f is a subtest of cur. |
| // In either case, consume cur, replace with f. |
| res[len(res)-1] = f |
| cur = f |
| continue |
| } |
| } |
| cur = f |
| res = append(res, f) |
| } |
| if len(res) <= maxFailPerBuild { |
| return res |
| } |
| |
| // If multiple subtests fail under the same parent, pick one that is |
| // more likely to be helpful. Prefer the one containing "FAIL", then |
| // the longer log message. |
| moreLikelyUseful := func(f, last *Failure) bool { |
| return strings.Contains(f.LogText, "--- FAIL") && |
| (!strings.Contains(last.LogText, "--- FAIL") || len(f.LogText) > len(last.LogText)) |
| } |
| siblingSubtests := func(f, last *Failure) bool { |
| pkg, tst := splitTestID(f.TestID) |
| pkg2, tst2 := splitTestID(last.TestID) |
| if pkg != pkg2 { |
| return false |
| } |
| par, _, ok := strings.Cut(tst, "/") |
| par2, _, ok2 := strings.Cut(tst2, "/") |
| return ok && ok2 && par == par2 |
| } |
| cur = nil |
| fs = res |
| res = fs[:0] |
| for _, f := range fs { |
| if cur != nil && siblingSubtests(f, cur) { |
| if moreLikelyUseful(f, res[len(res)-1]) { |
| res[len(res)-1] = f |
| } |
| continue |
| } |
| cur = f |
| res = append(res, f) |
| } |
| if len(res) <= maxFailPerBuild { |
| return res |
| } |
| |
| // If there are still too many failures, coalesce by package (pick one with longest log). |
| fs = res |
| res = fs[:0] |
| curpkg := "" |
| for _, f := range fs { |
| pkg, _ := splitTestID(f.TestID) |
| if curpkg != "" && curpkg == pkg { |
| if moreLikelyUseful(f, res[len(res)-1]) { |
| res[len(res)-1] = f |
| } |
| continue |
| } |
| curpkg = pkg |
| res = append(res, f) |
| } |
| if len(res) > maxFailPerBuild { |
| res = res[:maxFailPerBuild] |
| } |
| return res |
| } |