// Copyright 2024 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Watchflakes is a program that triages apparent test flakes
// on the build.golang.org dashboards. See https://go.dev/wiki/Watchflakes.
package main

import (
	"context"
	"flag"
	"fmt"
	"log"
	"os"
	"runtime"
	"strings"
	"time"

	bbpb "go.chromium.org/luci/buildbucket/proto"
	rdbpb "go.chromium.org/luci/resultdb/proto/v1"
	"golang.org/x/build/buildenv"
	"golang.org/x/build/cmd/watchflakes/internal/script"
	"golang.org/x/build/internal/secret"
	"rsc.io/github"
)

// TODO:
// - subrepos by go commit
// - handle INFRA_FAILURE and CANCELED

// Reference fmt.Print through the blank identifier so the fmt import is
// always in use (presumably so temporary debug prints can be added and
// removed without touching the import block — TODO confirm).
var _ = fmt.Print

// timeLimit is how far back from the start of a pass to query for
// failures: only the most recent timeLimit (45 days) is read.
const timeLimit = 45 * 24 * time.Hour

// maxFailPerBuild is the upper bound on how many failures are kept for a
// single build once coalesceFailures has combined related failures.
const maxFailPerBuild = 3

// Command-line flags. The help strings below are the authoritative
// description of each flag.
var (
	md      = flag.Bool("md", false, "print Markdown output suitable for GitHub issues")
	post    = flag.Bool("post", false, "post updates to GitHub issues")
	repeat  = flag.Duration("repeat", 0, "keep running with specified `period`; zero means to run once and exit")
	verbose = flag.Bool("v", false, "print verbose posting decisions")

	// useSecretManager selects where main obtains the GitHub credentials.
	useSecretManager = flag.Bool("use-secret-manager", false, "fetch GitHub token from Secret Manager instead of $HOME/.netrc")
)

// usage writes the command synopsis to standard error, prints the flag
// defaults, and terminates the process with exit status 2.
func usage() {
	fmt.Fprintln(os.Stderr, "Usage: watchflakes [options] [script]")
	flag.PrintDefaults()
	os.Exit(2)
}

// main parses the flags and the optional script argument, creates the
// GitHub client, and then performs triage passes: load the LUCI dashboards,
// find failures, run the issue scripts against each failure, and print
// (and with -post, post) the resulting issue updates. Without -repeat it
// performs a single pass; with -repeat it starts a new pass on every tick.
func main() {
	log.SetPrefix("watchflakes: ")
	flag.Usage = usage
	buildenv.RegisterStagingFlag()
	flag.Parse()
	if flag.NArg() > 1 {
		usage()
	}

	// An optional script argument selects query mode: the script is wrapped
	// in a placeholder issue that later replaces the issues read from GitHub.
	var query *Issue
	if flag.NArg() == 1 {
		s, err := script.Parse("script", flag.Arg(0), fields)
		if err != nil {
			log.Fatalf("parsing query:\n%s", err)
		}
		query = &Issue{Issue: new(github.Issue), Script: s, ScriptText: flag.Arg(0)}
	}

	// Create an authenticated GitHub client.
	if *useSecretManager {
		// Fetch credentials from Secret Manager.
		secretCl, err := secret.NewClientInProject(buildenv.FromFlags().ProjectName)
		if err != nil {
			log.Fatalln("failed to create a Secret Manager client:", err)
		}
		ghToken, err := secretCl.Retrieve(context.Background(), secret.NameWatchflakesGitHubToken)
		if err != nil {
			log.Fatalln("failed to retrieve GitHub token from Secret Manager:", err)
		}
		gh = github.NewClient(ghToken)
	} else {
		// Use credentials in $HOME/.netrc.
		var err error
		gh, err = github.Dial("")
		if err != nil {
			log.Fatalln("github.Dial:", err)
		}
	}

	// Load LUCI dashboards
	ctx := context.Background()
	c := NewLUCIClient(runtime.GOMAXPROCS(0) * 4)
	c.TraceSteps = true

	// With -repeat, a ticker paces the successive passes.
	var ticker *time.Ticker
	if *repeat != 0 {
		ticker = time.NewTicker(*repeat)
	}
	// Repeat marks the start of one pass; the goto at the end of main jumps
	// back here after each tick.
Repeat:
	startTime := time.Now()
	boards, err := c.ListBoards(ctx)
	if err != nil {
		log.Fatalln("ListBoards:", err)
	}
	// Only look back timeLimit from the start of this pass.
	c.ReadBoards(ctx, boards, startTime.Add(-timeLimit))
	skipBrokenCommits(boards)
	skipBrokenBuilders(boards)

	failRes := c.FindFailures(ctx, boards)
	c.FetchLogs(failRes)

	if *verbose {
		for _, r := range failRes {
			fmt.Printf("failure %s %s %s\n", r.Builder, shortHash(r.Commit), buildURL(r.ID))
		}
	}

	// Load GitHub issues
	var issues []*Issue
	issues, err = readIssues(issues)
	if err != nil {
		log.Fatal(err)
	}
	findScripts(issues)
	if query == nil {
		postIssueErrors(issues)
	}
	// In query mode only the command-line script is run.
	if query != nil {
		issues = []*Issue{query}
	}

	// Run the scripts on every failure and queue the failure on the issues
	// they select.
	for _, r := range failRes {
		newIssue := 0 // number of issues newly opened for this build
		fs := r.Failures
		fs = coalesceFailures(fs)
		if len(fs) == 0 {
			// No test failure, Probably a build failure.
			// E.g. https://ci.chromium.org/ui/b/8759448820419452721
			// Make a dummy failure.
			f := &Failure{
				Status:  rdbpb.TestStatus_FAIL,
				LogText: r.StepLogText,
			}
			fs = []*Failure{f}
		}
		for _, f := range fs {
			fp := NewFailurePost(r, f)
			record := fp.Record()
			action, targets := run(issues, record)
			if *verbose {
				printRecord(record, false)
				fmt.Printf("\t%s %v\n", action, targets)
			}
			switch action {
			case "skip":
				// do nothing
				if *verbose {
					fmt.Printf("%s: skipped by #%d\n", fp.URL, targets[0].Number)
				}

			case "":
				// No script selected this failure.
				if newIssue > 0 {
					// If we already opened a new issue for a build, don't open another one.
					// It could be that the build is just broken.
					// One can look at the issue and split if necessary.
					break
				}

				// create a new issue
				if query == nil {
					if *verbose {
						fmt.Printf("%s: new issue\n", fp.URL)
					}
					issue, err := reportNew(fp)
					if err != nil {
						log.Fatal(err)
					}
					// Later failures in this pass are matched against the new issue too.
					issues = append(issues, issue)
					newIssue++
				}

			case "default", "post", "take":
				// Queue the failure on every target issue that does not
				// already mention this build URL.
				for _, issue := range targets {
					// NOTE(review): readComments presumably refreshes
					// issue.Mentions for a stale issue — confirm.
					if !issue.Mentions[fp.URL] && issue.Stale {
						readComments(issue)
					}
					if *verbose {
						mentioned := "un"
						if issue.Mentions[fp.URL] {
							mentioned = "already "
						}
						fmt.Printf("%s: %s #%d, %smentioned\n", fp.URL, action, issue.Number, mentioned)
					}
					if !issue.Mentions[fp.URL] {
						issue.Post = append(issue.Post, fp)
					}
				}
			}
		}
	}

	// Query mode: print the failures matched by the command-line script
	// (as Markdown with -md, plain text otherwise) and exit without posting.
	if query != nil {
		format := (*FailurePost).Text
		if *md {
			format = (*FailurePost).Markdown
		}
		for i, fp := range query.Post {
			if i > 0 {
				fmt.Printf("\n")
			}
			os.Stdout.WriteString(format(fp))
		}
		if *md {
			os.Stdout.WriteString(signature)
		}
		return
	}

	// Print the queued updates and, with -post, post them as comments.
	// posts counts issues that had queued updates, whether or not -post is set.
	posts := 0
	for _, issue := range issues {
		if len(issue.Post) > 0 {
			fmt.Printf(" - new for #%d %s\n", issue.Number, issue.Title)
			for _, fp := range issue.Post {
				fmt.Printf("    - %s\n      %s\n", fp, fp.URL)
			}
			msg := updateText(issue)
			if *verbose {
				fmt.Printf("\n%s\n", indent(spaces[:3], msg))
			}
			if *post {
				// A failed post is logged and the issue is not counted.
				if err := postComment(issue, msg); err != nil {
					log.Print(err)
					continue
				}
				// Record the posted URLs as mentioned on the issue.
				if issue.Mentions == nil {
					issue.Mentions = make(map[string]bool)
				}
				for _, fp := range issue.Post {
					issue.Mentions[fp.URL] = true
				}
			}
			posts++
		}
	}

	log.Printf("Done. %d boards, %d failures, %d issues, %d posts, in %v\n", len(boards), len(failRes), len(issues), posts, time.Since(startTime))

	// With -repeat, wait for the next tick and start another pass.
	if *repeat != 0 {
		<-ticker.C
		goto Repeat
	}
}

// SKIP is the status that skipBrokenCommits and skipBrokenBuilders assign
// to results they decide are not flakes.
const SKIP = bbpb.Status_STATUS_UNSPECIFIED // for smashing the status to skip a non-flake failure

// skipBrokenCommits changes every result of a broken commit to SKIP.
//
// For each commit column of each dashboard it counts the results with
// status SUCCESS and with status FAILURE (missing results and all other
// statuses are not counted). The commit is treated as broken when the
// failure count exceeds a quarter of the dashboard's builders (integer
// division), or when the success count falls below that quarter.
// A line is printed for each commit that is skipped.
func skipBrokenCommits(boards []*Dashboard) {
	for _, dash := range boards {
		threshold := len(dash.Builders) / 4
		for col := range dash.Commits {
			var good, bad int
			for _, row := range dash.Results {
				res := row[col]
				if res == nil {
					continue
				}
				if res.Status == bbpb.Status_SUCCESS {
					good++
				} else if res.Status == bbpb.Status_FAILURE {
					bad++
				}
			}

			// Healthy commit: few enough failures and enough successes.
			if bad <= threshold && good >= threshold {
				continue
			}

			fmt.Printf("skip: commit %s (%s %s) is broken (good=%d bad=%d)\n", shortHash(dash.Commits[col].Hash), dash.Repo, dash.GoBranch, good, bad)
			for _, row := range dash.Results {
				if res := row[col]; res != nil {
					res.Status = SKIP
				}
			}
		}
	}
}

// skipBrokenBuilders identifies builders that were consistently broken
// (at least tooManyToBeFlakes failures in a row) and then turned ok.
// It changes those consistent failures to SKIP.
//
// Each builder's row of results is scanned in index order. Missing results
// and results whose status is neither SUCCESS nor FAILURE are ignored: they
// neither extend nor reset a run of failures. A line is printed for every
// result that is skipped.
func skipBrokenBuilders(boards []*Dashboard) {
	const tooManyToBeFlakes = 4

	for _, dash := range boards {
		for _, rs := range dash.Results {
			// bad is the length of the current run of failures.
			bad := 100 // squash failures at the top of the dashboard, which may turn out to be consistent
			// badStart is the index at which the current run of failures began.
			badStart := 0
			skip := func(i int) { // skip the i-th result
				if rs[i] != nil {
					fmt.Printf("skip: builder %s was broken at %s (%s %s)\n", rs[i].Builder, shortHash(rs[i].Commit), dash.Repo, dash.GoBranch)
					rs[i].Status = SKIP
				}
			}
			for i, r := range rs {
				if rs[i] == nil {
					continue
				}
				switch r.Status {
				case bbpb.Status_SUCCESS:
					// A success ends the current run of failures.
					bad = 0
					continue
				case bbpb.Status_FAILURE:
					bad++
				default: // ignore other status
					continue
				}
				switch bad {
				case 1:
					// First failure after a success: remember where the run starts.
					badStart = i
				case tooManyToBeFlakes:
					// Skip the run so far.
					for j := badStart; j < i; j++ {
						skip(j)
					}
				}
				// Once the run is long enough, every further failure in it is skipped too.
				if bad >= tooManyToBeFlakes {
					skip(i)
				}
			}

			// Bad entries ending just before the cutoff are not flakes
			// even if there are just a few of them. Otherwise we get
			// spurious flakes when there's one bad entry before the
			// cutoff and lots after the cutoff.
			if bad > 0 && badStart > 0 {
				for j := badStart; j < len(rs); j++ {
					skip(j)
				}
			}
		}
	}
}

// run runs the scripts in issues on record.
// It returns the desired action and the list of target issues:
//
//   - "skip" or "take": returned as soon as any issue's script produces it,
//     with that single issue as the target;
//   - "post": every issue whose script said "post";
//   - "default": the first issue whose script said "default", used only
//     when no script said "post";
//   - "": no script selected the record; targets is nil.
//
// Issues without a script are ignored.
func run(issues []*Issue, record script.Record) (action string, targets []*Issue) {
	var def, post []*Issue

	for _, issue := range issues {
		if issue.Script == nil {
			continue
		}
		switch issue.Script.Action(record) {
		case "skip":
			return "skip", []*Issue{issue}
		case "take":
			// The caller handles "take" alongside "post" and "default", so the
			// action must be returned; merely printing it would let the record
			// fall through to another issue or to a newly opened one.
			println("TAKE", issue.Number)
			return "take", []*Issue{issue}
		case "default":
			def = append(def, issue)
		case "post":
			post = append(post, issue)
		}
	}

	if len(post) > 0 {
		return "post", post
	}
	if len(def) > 0 {
		return "default", def[:1]
	}
	return "", nil
}

// FailurePost is a failure to be posted on an issue.
// It pairs a build result with one failure from that build and carries
// the values derived from them by NewFailurePost.
type FailurePost struct {
	*BuildResult
	*Failure
	URL     string // LUCI build page
	Pkg     string // package part of the failure's test ID, as split by splitTestID
	Test    string // test part of the failure's test ID, as split by splitTestID
	Snippet string // shortened log text produced by snippet
}

// NewFailurePost returns the FailurePost for failure f of build r.
// The package and test names come from splitting f.TestID. The snippet is
// made from the failure's own log text, falling back to the build's log
// text when the failure's log yields an empty snippet.
func NewFailurePost(r *BuildResult, f *Failure) *FailurePost {
	post := &FailurePost{BuildResult: r, Failure: f}
	post.Pkg, post.Test = splitTestID(f.TestID)

	post.Snippet = snippet(f.LogText)
	if post.Snippet == "" {
		post.Snippet = snippet(r.LogText)
	}

	post.URL = buildURL(r.ID)
	return post
}

// fields is the list of known fields for use by script patterns.
// It must be in sync with the Record method below.
// main passes it to script.Parse when parsing a command-line script.
var fields = []string{
	// The empty name is the default field; Record fills it with the same
	// text as "output".
	"",
	"section", // not used, keep for compatibility with old watchflakes
	"pkg",
	"test",
	// Record sets "mode" only for build failures, to "build".
	"mode",
	"output",
	"snippet",
	"date",
	"builder",
	"repo",
	"goos",
	"goarch",
	"log",
	"status",
}

// Record returns fp as a script.Record, the form in which issue scripts
// inspect a failure. The unnamed field "" holds the same text as "output",
// and "mode" is present (with value "build") only for build failures.
func (fp *FailurePost) Record() script.Record {
	// Note: update fields above if any new fields are added to this record.
	output := fp.Failure.LogText
	rec := script.Record{
		"":        output, // default field for `regexp` search (as opposed to field ~ `regexp`)
		"output":  output,
		"log":     fp.BuildResult.LogText,
		"snippet": fp.Snippet,
		"pkg":     fp.Pkg,
		"test":    fp.Test,
		"status":  fp.Failure.Status.String(),
		"date":    fp.Time.Format(time.RFC3339),
		"builder": fp.Builder,
		"repo":    fp.Repo,
		"goos":    fp.Target.GOOS,
		"goarch":  fp.Target.GOARCH,
	}
	if fp.IsBuildFailure() {
		rec["mode"] = "build"
	}
	return rec
}

// printRecord prints a one-line summary of r to standard output: its date,
// builder, goos, goarch, pkg and test fields separated by spaces.
// When verbose is set, the snippet field follows, indented by four spaces.
func printRecord(r script.Record, verbose bool) {
	fmt.Println(r["date"], r["builder"], r["goos"], r["goarch"], r["pkg"], r["test"])
	if !verbose {
		return
	}
	fmt.Print(indent(spaces[:4], r["snippet"]) + "\n")
}

// IsBuildFailure reports whether fp stands for a build failure rather than
// a test failure. Build failures are the placeholder failures that carry
// no test ID.
func (fp *FailurePost) IsBuildFailure() bool {
	id := fp.Failure.TestID
	return len(id) == 0
}

// String returns a one-line header to identify the log and failure.
//
// The header has the form
//
//	<time> <builder> <repo>@<commit>[ <label>@<go commit>][ <pkg>.<test>][ [build]][ [<status>]]
//
// where the Go commit part appears only when a Go commit is set, "[build]"
// marks a build failure, and the status is shown only when it is not FAIL.
// For a Go branch other than master, the branch name with its
// "release-branch." prefix removed replaces "go" both as the repo name
// (when the repo is go) and as the label in front of the Go commit.
func (fp *FailurePost) String() string {
	repo := fp.Repo
	sep := ""
	if fp.GoCommit != "" {
		sep = " go@"
	}
	if fp.GoBranch != "" && fp.GoBranch != "master" {
		// The prefix must not contain a leading space: a branch name never
		// starts with one, so such a prefix would never be trimmed.
		b := strings.TrimPrefix(fp.GoBranch, "release-branch.")
		if repo == "go" {
			repo = b
		}
		if sep == " go@" {
			sep = " " + b + "@"
		}
	}
	s := fmt.Sprintf("%s %s %s@%s%s%s",
		fp.Time.Format("2006-01-02 15:04"),
		fp.Builder, repo, shortHash(fp.Commit),
		sep, shortHash(fp.GoCommit))

	if fp.Pkg != "" || fp.Test != "" {
		s += " " + shortPkg(fp.Pkg)
		// Separate package and test with a dot only when both are present.
		if fp.Pkg != "" && fp.Test != "" {
			s += "."
		}
		s += fp.Test
	}
	if fp.IsBuildFailure() {
		s += " [build]"
	}
	if fp.Failure.Status != rdbpb.TestStatus_FAIL {
		s += fmt.Sprintf(" [%s]", fp.Failure.Status)
	}
	return s
}

// Markdown renders fp as a collapsible <details> element for posting to
// GitHub: the summary line is the header from String followed by a "log"
// link to the build page, and the body is the snippet indented by four
// spaces.
func (fp *FailurePost) Markdown() string {
	summary := "<summary>" + fp.String() + " (<a href=\"" + fp.URL + "\">log</a>)</summary>"
	body := indent(spaces[:4], fp.Snippet)
	return "<details>" + summary + "\n\n" + body + "</details>\n"
}

// Text renders fp as plain text for interactive use or debug logging:
// the header line, the build URL, and the snippet with its trailing
// newlines collapsed to exactly one.
func (fp *FailurePost) Text() string {
	body := strings.TrimRight(fp.Snippet, "\n")
	return strings.Join([]string{fmt.Sprint(fp), fp.URL, body, ""}, "\n")
}

// spaces is a string of 100 space characters. Callers slice it
// (for example spaces[:4]) to obtain an indentation prefix.
var spaces = strings.Repeat(" ", 100)

// indent returns text with prefix placed in front of every line.
// Trailing newlines in text are collapsed so that the result ends in
// exactly one "\n"; an empty text yields an empty result.
func indent(prefix, text string) string {
	if len(text) == 0 {
		return ""
	}
	var b strings.Builder
	for _, line := range strings.Split(strings.TrimRight(text, "\n"), "\n") {
		b.WriteString(prefix)
		b.WriteString(line)
		b.WriteByte('\n')
	}
	return b.String()
}

// shortPkg returns pkg without a leading "golang.org/", so that a package
// such as golang.org/x/sys/windows is shown as x/sys/windows.
// Other package paths are returned unchanged.
func shortPkg(pkg string) string {
	const prefix = "golang.org/"
	if strings.HasPrefix(pkg, prefix) {
		return pkg[len(prefix):]
	}
	return pkg
}

// snippet shortens the output lines of log to form a snippet.
//
// Blank lines at the beginning and end are removed. If more than 30 lines
// remain, the snippet consists of the first 10 lines, the last 10 lines,
// and possibly up to 10 lines from the interior. The interior lines are
// included when one of them looks important: after trimming surrounding
// space it starts with "panic:", "fatal error:" or "--- FAIL:", or it
// contains ": internal compiler error:". They start at the first such
// line. Each gap between the kept parts is marked by a "..." line.
func snippet(log string) string {
	const edge = 10 // lines kept from each end, and at most from the interior

	lines := strings.SplitAfter(log, "\n")

	// Narrow [first, last) until it no longer starts or ends on a blank line.
	first, last := 0, len(lines)
	for first < last && strings.TrimSpace(lines[first]) == "" {
		first++
	}
	for last > first && strings.TrimSpace(lines[last-1]) == "" {
		last--
	}
	lines = lines[first:last]

	if len(lines) <= 3*edge {
		return strings.Join(lines, "")
	}

	important := func(line string) bool {
		s := strings.TrimSpace(line)
		for _, prefix := range []string{"panic:", "fatal error:", "--- FAIL:"} {
			if strings.HasPrefix(s, prefix) {
				return true
			}
		}
		return strings.Contains(s, ": internal compiler error:")
	}

	head := lines[:edge]
	interior := lines[edge : len(lines)-edge]
	tail := lines[len(lines)-edge:]

	var b strings.Builder
	for _, line := range head {
		b.WriteString(line)
	}

	// gapBeforeTail records whether lines are omitted right before the tail.
	gapBeforeTail := true
	for i, line := range interior {
		if !important(line) {
			continue
		}
		if i > 0 {
			// Lines between the head and the important line are omitted.
			b.WriteString("...\n")
		}
		end := i + edge
		if end >= len(interior) {
			// The middle part runs right up to the tail.
			gapBeforeTail = false
			end = len(interior)
		}
		for _, kept := range interior[i:end] {
			b.WriteString(kept)
		}
		break
	}
	if gapBeforeTail {
		b.WriteString("...\n")
	}
	for _, line := range tail {
		b.WriteString(line)
	}
	return b.String()
}

// coalesceFailures reduces the failures of one build to a small set worth
// reporting. If a build has too many failures, the build is probably broken
// (e.g. timeout, crash), so the failures are coalesced and at most
// maxFailPerBuild of them are reported.
//
// The reduction happens in up to three passes, stopping as soon as no more
// than maxFailPerBuild failures remain:
//
//  1. a failure immediately followed by one of its subtests is replaced by
//     the subtest;
//  2. all subtests under the same parent are reduced to one failure;
//  3. all consecutive failures in the same package are reduced to one
//     failure, and the result is truncated to maxFailPerBuild.
//
// Failures are related only by adjacency: a test is recognized as a subtest
// of the failure that precedes it when its TestID starts with that
// failure's TestID followed by "/".
//
// Passes 2 and 3 build their result in the backing array of fs, so the
// contents of the caller's slice may be overwritten.
func coalesceFailures(fs []*Failure) []*Failure {
	var res []*Failure
	// A subtest fail may cause the parent test to fail, combine them.
	var cur *Failure
	for _, f := range fs {
		if cur != nil && strings.HasPrefix(f.TestID, cur.TestID+"/") {
			// f is a subtest of cur. Consume cur, replace with f.
			res[len(res)-1] = f
			cur = f
			continue
		}
		cur = f
		res = append(res, f)
	}
	if len(res) <= maxFailPerBuild {
		return res
	}

	// If multiple subtests fail under the same parent, pick one that is
	// more likely to be helpful. Prefer the one containing "FAIL", then
	// the longer log message.
	moreLikelyUseful := func(f, last *Failure) bool {
		return strings.Contains(f.LogText, "--- FAIL") &&
			(!strings.Contains(last.LogText, "--- FAIL") || len(f.LogText) > len(last.LogText))
	}
	// Start over from the original list. Here cur stays on the parent, so
	// every subtest that follows it competes for the parent's single slot.
	cur = nil
	res = fs[:0]
	for _, f := range fs {
		if cur != nil && strings.HasPrefix(f.TestID, cur.TestID+"/") {
			if moreLikelyUseful(f, res[len(res)-1]) {
				res[len(res)-1] = f
			}
			continue
		}
		cur = f
		res = append(res, f)
	}
	if len(res) <= maxFailPerBuild {
		return res
	}

	// If there are still too many failures, coalesce by package, keeping for
	// each package the failure that moreLikelyUseful prefers.
	fs = res
	res = fs[:0]
	curpkg := ""
	for _, f := range fs {
		pkg, _ := splitTestID(f.TestID)
		// Failures with an empty package are never merged with each other.
		if curpkg != "" && curpkg == pkg {
			if moreLikelyUseful(f, res[len(res)-1]) {
				res[len(res)-1] = f
			}
			continue
		}
		curpkg = pkg
		res = append(res, f)
	}
	if len(res) > maxFailPerBuild {
		res = res[:maxFailPerBuild]
	}
	return res
}
