| // Copyright 2024 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| package main |
| |
| import ( |
| "context" |
| "flag" |
| "fmt" |
| "log" |
| "os" |
| "strings" |
| "time" |
| |
| bbpb "go.chromium.org/luci/buildbucket/proto" |
| rdbpb "go.chromium.org/luci/resultdb/proto/v1" |
| "golang.org/x/build/cmd/watchflakes/internal/script" |
| "rsc.io/github" |
| ) |
| |
| // TODO: |
| // - subrepos by go commit |
| // - handle INFRA_FAILURE and CANCELED |
| |
| var _ = fmt.Print |
| |
| // Query failures within most recent timeLimit. |
| const timeLimit = 45 * 24 * time.Hour |
| |
| const maxFailPerBuild = 3 |
| |
| var ( |
| md = flag.Bool("md", false, "print Markdown output suitable for GitHub issues") |
| post = flag.Bool("post", false, "post updates to GitHub issues") |
| repeat = flag.Duration("repeat", 0, "run again after waiting `delay`") |
| verbose = flag.Bool("v", false, "print verbose posting decisions") |
| ) |
| |
| func usage() { |
| fmt.Fprintf(os.Stderr, "Usage: watchflakes [options] [script]\n") |
| flag.PrintDefaults() |
| os.Exit(2) |
| } |
| |
| func main() { |
| log.SetPrefix("watchflakes: ") |
| flag.Usage = usage |
| flag.Parse() |
| if flag.NArg() > 1 { |
| usage() |
| } |
| |
| var query *Issue |
| if flag.NArg() == 1 { |
| s, err := script.Parse("script", flag.Arg(0), fields) |
| if err != nil { |
| log.Fatalf("parsing query:\n%s", err) |
| } |
| query = &Issue{Issue: new(github.Issue), Script: s, ScriptText: flag.Arg(0)} |
| } |
| |
| // Load LUCI dashboards |
| ctx := context.Background() |
| c := NewLUCIClient() |
| |
| Repeat: |
| startTime := time.Now() |
| boards := c.ListBoards(ctx) |
| c.ReadBoards(ctx, boards, startTime.Add(-timeLimit)) |
| skipBrokenCommits(boards) |
| skipBrokenBuilders(boards) |
| |
| failRes := c.FindFailures(ctx, boards) |
| fetchLogs(failRes) |
| |
| if *verbose { |
| for _, r := range failRes { |
| fmt.Printf("failure %s %s %s\n", r.Builder, shortHash(r.Commit), buildURL(r.ID)) |
| } |
| } |
| |
| // Load GitHub issues |
| var issues []*Issue |
| issues, err := readIssues(issues) |
| if err != nil { |
| log.Fatal(err) |
| } |
| findScripts(issues) |
| if query == nil { |
| postIssueErrors(issues) |
| } |
| if query != nil { |
| issues = []*Issue{query} |
| } |
| |
| for _, r := range failRes { |
| newIssue := 0 |
| fs := r.Failures |
| fs = coalesceFailures(fs) |
| if len(fs) == 0 { |
| // No test failure, Probably a build failure. |
| // E.g. https://ci.chromium.org/ui/b/8759448820419452721 |
| // Make a dummy failure. |
| f := &Failure{ |
| Status: rdbpb.TestStatus_FAIL, |
| LogText: r.StepLogText, |
| } |
| fs = []*Failure{f} |
| } |
| for _, f := range fs { |
| fp := NewFailurePost(r, f) |
| record := fp.Record() |
| action, targets := run(issues, record) |
| if *verbose { |
| printRecord(record, false) |
| fmt.Printf("\t%s %v\n", action, targets) |
| } |
| switch action { |
| case "skip": |
| // do nothing |
| if *verbose { |
| fmt.Printf("%s: skipped by #%d\n", fp.URL, targets[0].Number) |
| } |
| |
| case "": |
| if newIssue > 0 { |
| // If we already opened a new issue for a build, don't open another one. |
| // It could be that the build is just broken. |
| // One can look at the issue and split if necessary. |
| break |
| } |
| |
| // create a new issue |
| if query == nil { |
| if *verbose { |
| fmt.Printf("%s: new issue\n", fp.URL) |
| } |
| issue, err := reportNew(fp) |
| if err != nil { |
| log.Fatal(err) |
| } |
| issues = append(issues, issue) |
| newIssue++ |
| } |
| |
| case "default", "post", "take": |
| for _, issue := range targets { |
| if !issue.Mentions[fp.URL] && issue.Stale { |
| readComments(issue) |
| } |
| if *verbose { |
| mentioned := "un" |
| if issue.Mentions[fp.URL] { |
| mentioned = "already " |
| } |
| fmt.Printf("%s: %s #%d, %smentioned\n", fp.URL, action, issue.Number, mentioned) |
| } |
| if !issue.Mentions[fp.URL] { |
| issue.Post = append(issue.Post, fp) |
| } |
| } |
| } |
| } |
| } |
| |
| if query != nil { |
| format := (*FailurePost).Text |
| if *md { |
| format = (*FailurePost).Markdown |
| } |
| for i, fp := range query.Post { |
| if i > 0 { |
| fmt.Printf("\n") |
| } |
| os.Stdout.WriteString(format(fp)) |
| } |
| if *md { |
| os.Stdout.WriteString(signature) |
| } |
| return |
| } |
| |
| posts := 0 |
| for _, issue := range issues { |
| if len(issue.Post) > 0 { |
| fmt.Printf(" - new for #%d %s\n", issue.Number, issue.Title) |
| for _, fp := range issue.Post { |
| fmt.Printf(" - %s\n %s\n", fp, fp.URL) |
| } |
| msg := updateText(issue) |
| if *verbose { |
| fmt.Printf("\n%s\n", indent(spaces[:3], msg)) |
| } |
| if *post { |
| if err := postComment(issue, msg); err != nil { |
| log.Print(err) |
| continue |
| } |
| if issue.Mentions == nil { |
| issue.Mentions = make(map[string]bool) |
| } |
| for _, fp := range issue.Post { |
| issue.Mentions[fp.URL] = true |
| } |
| } |
| posts++ |
| } |
| } |
| |
| log.Printf("Done. %d boards, %d failures, %d issues, %d posts, in %v\n", len(boards), len(failRes), len(issues), posts, time.Since(startTime)) |
| |
| if *repeat != 0 { |
| time.Sleep(*repeat) |
| goto Repeat |
| } |
| } |
| |
| const SKIP = bbpb.Status_STATUS_UNSPECIFIED // for smashing the status to skip a non-flake failure |
| |
| // skipBrokenCommits identifies broken commits, |
| // which are the ones that failed on at least 1/4 of builders, |
| // and then changes all results for those commits to SKIP. |
| func skipBrokenCommits(boards []*Dashboard) { |
| for _, dash := range boards { |
| builderThreshold := len(dash.Builders) / 4 |
| for i := 0; i < len(dash.Commits); i++ { |
| bad := 0 |
| good := 0 |
| for _, rs := range dash.Results { |
| if rs[i] == nil { |
| continue |
| } |
| switch rs[i].Status { |
| case bbpb.Status_SUCCESS: |
| good++ |
| case bbpb.Status_FAILURE: |
| bad++ |
| // ignore other status |
| } |
| } |
| if bad > builderThreshold || good < builderThreshold { |
| fmt.Printf("skip: commit %s (%s %s) is broken (good=%d bad=%d)\n", shortHash(dash.Commits[i].Hash), dash.Repo, dash.GoBranch, good, bad) |
| for _, rs := range dash.Results { |
| if rs[i] != nil { |
| rs[i].Status = SKIP |
| } |
| } |
| } |
| } |
| } |
| } |
| |
| // skipBrokenBuilders identifies builders that were consistently broken |
| // (at least tooManyToBeFlakes failures in a row) and then turned ok. |
| // It changes those consistent failures to SKIP. |
| func skipBrokenBuilders(boards []*Dashboard) { |
| const tooManyToBeFlakes = 4 |
| |
| for _, dash := range boards { |
| for _, rs := range dash.Results { |
| bad := 100 // squash failures at the top of the dashboard, which may turn out to be consistent |
| badStart := 0 |
| skip := func(i int) { // skip the i-th result |
| if rs[i] != nil { |
| fmt.Printf("skip: builder %s was broken at %s (%s %s)\n", rs[i].Builder, shortHash(rs[i].Commit), dash.Repo, dash.GoBranch) |
| rs[i].Status = SKIP |
| } |
| } |
| for i, r := range rs { |
| if rs[i] == nil { |
| continue |
| } |
| switch r.Status { |
| case bbpb.Status_SUCCESS: |
| bad = 0 |
| continue |
| case bbpb.Status_FAILURE: |
| bad++ |
| default: // ignore other status |
| continue |
| } |
| switch bad { |
| case 1: |
| badStart = i |
| case tooManyToBeFlakes: |
| // Skip the run so far. |
| for j := badStart; j < i; j++ { |
| skip(j) |
| } |
| } |
| if bad >= tooManyToBeFlakes { |
| skip(i) |
| } |
| } |
| |
| // Bad entries ending just before the cutoff are not flakes |
| // even if there are just a few of them. Otherwise we get |
| // spurious flakes when there's one bad entry before the |
| // cutoff and lots after the cutoff. |
| if bad > 0 && badStart > 0 { |
| for j := badStart; j < len(rs); j++ { |
| skip(j) |
| } |
| } |
| } |
| } |
| } |
| |
| // run runs the scripts in issues on record. |
| // It returns the desired action (skip, post, default) |
| // as well as the list of target issues (for post or default). |
| func run(issues []*Issue, record script.Record) (action string, targets []*Issue) { |
| var def, post []*Issue |
| |
| for _, issue := range issues { |
| if issue.Script != nil { |
| switch issue.Script.Action(record) { |
| case "skip": |
| return "skip", []*Issue{issue} |
| case "take": |
| println("TAKE", issue.Number) |
| case "default": |
| def = append(def, issue) |
| case "post": |
| post = append(post, issue) |
| } |
| } |
| } |
| |
| if len(post) > 0 { |
| return "post", post |
| } |
| if len(def) > 0 { |
| return "default", def[:1] |
| } |
| return "", nil |
| } |
| |
| // FailurePost is a failure to be posted on an issue. |
| type FailurePost struct { |
| *BuildResult |
| *Failure |
| URL string // LUCI build page |
| Pkg string |
| Test string |
| Snippet string |
| } |
| |
| func NewFailurePost(r *BuildResult, f *Failure) *FailurePost { |
| pkg, test := splitTestID(f.TestID) |
| snip := snippet(f.LogText) |
| if snip == "" { |
| snip = snippet(r.LogText) |
| } |
| fp := &FailurePost{ |
| BuildResult: r, |
| Failure: f, |
| URL: buildURL(r.ID), |
| Pkg: pkg, |
| Test: test, |
| Snippet: snip, |
| } |
| return fp |
| } |
| |
| // fields is the list of known fields for use by script patterns. |
| // It must be in sync with the Record method below. |
| var fields = []string{ |
| "", |
| "section", // not used, keep for compatibility with old watchflakes |
| "pkg", |
| "test", |
| "mode", |
| "output", |
| "snippet", |
| "date", |
| "builder", |
| "repo", |
| "goos", |
| "goarch", |
| "log", |
| "status", |
| } |
| |
| func (fp *FailurePost) Record() script.Record { |
| // Note: update fields above if any new fields are added to this record. |
| m := script.Record{ |
| "pkg": fp.Pkg, |
| "test": fp.Test, |
| "output": fp.Failure.LogText, |
| "snippet": fp.Snippet, |
| "date": fp.Time.Format(time.RFC3339), |
| "builder": fp.Builder, |
| "repo": fp.Repo, |
| "goos": fp.Target.GOOS, |
| "goarch": fp.Target.GOARCH, |
| "log": fp.BuildResult.LogText, |
| "status": fp.Failure.Status.String(), |
| } |
| m[""] = m["output"] // default field for `regexp` search (as opposed to field ~ `regexp`) |
| if fp.IsBuildFailure() { |
| m["mode"] = "build" |
| } |
| return m |
| } |
| |
| func printRecord(r script.Record, verbose bool) { |
| fmt.Printf("%s %s %s %s %s %s\n", r["date"], r["builder"], r["goos"], r["goarch"], |
| r["pkg"], r["test"]) |
| if verbose { |
| fmt.Printf("%s\n", indent(spaces[:4], r["snippet"])) |
| } |
| } |
| |
| func (fp *FailurePost) IsBuildFailure() bool { |
| // no test ID. dummy for build failure. |
| return fp.Failure.TestID == "" |
| } |
| |
| // String returns a header to identify the log and failure. |
| func (fp *FailurePost) String() string { |
| repo := fp.Repo |
| sep := "" |
| if fp.GoCommit != "" { |
| sep = " go@" |
| } |
| if fp.GoBranch != "" && fp.GoBranch != "master" { |
| b := strings.TrimPrefix(fp.GoBranch, " release-branch.") |
| if repo == "go" { |
| repo = b |
| } |
| if sep == " go@" { |
| sep = " " + b + "@" |
| } |
| } |
| s := fmt.Sprintf("%s %s %s@%s%s%s", |
| fp.Time.Format("2006-01-02 15:04"), |
| fp.Builder, repo, shortHash(fp.Commit), |
| sep, shortHash(fp.GoCommit)) |
| |
| if fp.Pkg != "" || fp.Test != "" { |
| s += " " + shortPkg(fp.Pkg) |
| if fp.Pkg != "" && fp.Test != "" { |
| s += "." |
| } |
| s += fp.Test |
| } |
| if fp.IsBuildFailure() { |
| s += " [build]" |
| } |
| if fp.Failure.Status != rdbpb.TestStatus_FAIL { |
| s += fmt.Sprintf(" [%s]", fp.Failure.Status) |
| } |
| return s |
| } |
| |
| // Markdown returns Markdown suitable for posting to GitHub. |
| func (fp *FailurePost) Markdown() string { |
| return fmt.Sprintf("<details><summary>%s (<a href=\"%s\">log</a>)</summary>\n\n%s</details>\n", |
| fp.String(), fp.URL, indent(spaces[:4], fp.Snippet)) |
| } |
| |
| // Text returns text suitable for reading in interactive use or debug logging. |
| func (fp *FailurePost) Text() string { |
| return fmt.Sprintf("%s\n%s\n%s\n", fp, fp.URL, strings.TrimRight(fp.Snippet, "\n")) |
| } |
| |
| var spaces = strings.Repeat(" ", 100) |
| |
| // indent returns a copy of text in which every line has been indented by prefix. |
| // It also ensures that, except when text is empty, text ends in a \n character. |
| func indent(prefix, text string) string { |
| if text == "" { |
| return "" |
| } |
| text = strings.TrimRight(text, "\n") |
| return prefix + strings.ReplaceAll(text, "\n", "\n"+prefix) + "\n" |
| } |
| |
| // shortPkg shortens pkg by removing any leading |
| // golang.org/ (for packages like golang.org/x/sys/windows). |
| func shortPkg(pkg string) string { |
| pkg = strings.TrimPrefix(pkg, "golang.org/") |
| return pkg |
| } |
| |
| // shorten the output lines to form a snippet |
| func snippet(log string) string { |
| lines := strings.SplitAfter(log, "\n") |
| |
| // Remove beginning and trailing blank lines. |
| for len(lines) > 0 && strings.TrimSpace(lines[0]) == "" { |
| lines = lines[1:] |
| } |
| for len(lines) > 0 && strings.TrimSpace(lines[len(lines)-1]) == "" { |
| lines = lines[:len(lines)-1] |
| } |
| |
| // If we have more than 30 lines, make the snippet by taking the first 10, |
| // the last 10, and possibly a middle 10. The middle 10 is included when |
| // the interior lines (between the first and last 10) contain an important-looking |
| // message like "panic:" or "--- FAIL:". The middle 10 start at the important-looking line. |
| // such as |
| if len(lines) > 30 { |
| var keep []string |
| keep = append(keep, lines[:10]...) |
| dots := true |
| for i := 10; i < len(lines)-10; i++ { |
| s := strings.TrimSpace(lines[i]) |
| if strings.HasPrefix(s, "panic:") || strings.HasPrefix(s, "fatal error:") || strings.HasPrefix(s, "--- FAIL:") || strings.Contains(s, ": internal compiler error:") { |
| if i > 10 { |
| keep = append(keep, "...\n") |
| } |
| end := i + 10 |
| if end >= len(lines)-10 { |
| dots = false |
| end = len(lines) - 10 |
| } |
| keep = append(keep, lines[i:end]...) |
| break |
| } |
| } |
| if dots { |
| keep = append(keep, "...\n") |
| } |
| keep = append(keep, lines[len(lines)-10:]...) |
| lines = keep |
| } |
| |
| return strings.Join(lines, "") |
| } |
| |
| // If a build that has too many failures, the build is probably broken |
| // (e.g. timeout, crash). Coalesce the failures and report maxFailPerBuild |
| // of them. |
| func coalesceFailures(fs []*Failure) []*Failure { |
| var res []*Failure |
| // A subtest fail may cause the parent test to fail, combine them. |
| var cur *Failure |
| for _, f := range fs { |
| if cur != nil && strings.HasPrefix(f.TestID, cur.TestID+"/") { |
| // f is a subtest of cur. Consume cur, replace with f. |
| res[len(res)-1] = f |
| cur = f |
| continue |
| } |
| cur = f |
| res = append(res, f) |
| } |
| if len(res) <= maxFailPerBuild { |
| return res |
| } |
| |
| // If multiple subtests fail under the same parent, pick one that is |
| // more likely to be helpful. Prefer the one containing "FAIL", then |
| // the longer log message. |
| moreLikelyUseful := func(f, last *Failure) bool { |
| return strings.Contains(f.LogText, "--- FAIL") && |
| (!strings.Contains(last.LogText, "--- FAIL") || len(f.LogText) > len(last.LogText)) |
| } |
| cur = nil |
| res = fs[:0] |
| for _, f := range fs { |
| if cur != nil && strings.HasPrefix(f.TestID, cur.TestID+"/") { |
| if moreLikelyUseful(f, res[len(res)-1]) { |
| res[len(res)-1] = f |
| } |
| continue |
| } |
| cur = f |
| res = append(res, f) |
| } |
| if len(res) <= maxFailPerBuild { |
| return res |
| } |
| |
| // If there are still too many failures, coalesce by package (pick one with longest log). |
| fs = res |
| res = fs[:0] |
| curpkg := "" |
| for _, f := range fs { |
| pkg, _ := splitTestID(f.TestID) |
| if curpkg != "" && curpkg == pkg { |
| if moreLikelyUseful(f, res[len(res)-1]) { |
| res[len(res)-1] = f |
| } |
| continue |
| } |
| curpkg = pkg |
| res = append(res, f) |
| } |
| if len(res) > maxFailPerBuild { |
| res = res[:maxFailPerBuild] |
| } |
| return res |
| } |