internal/relui: retry advisory trybots and approve failures per run

This change adds retries to each advisory trybot run. A trybot run is
attempted up to 3 times (workflow.MaxRetries) before it is considered
failed. If it still fails, the operator is given the option to approve
the failed run from within that same trybot task. A follow-up task
then waits for every advisory trybot run to either pass or be approved
before the workflow continues, and logs the names of any runs whose
failures were approved.

Fixes golang/go#57725

Change-Id: I2250289f5d597c7cb493d0267e451691548589c7
Reviewed-on: https://go-review.googlesource.com/c/build/+/463535
Run-TryBot: Carlos Amedee <carlos@golang.org>
Reviewed-by: Carlos Amedee <carlos@golang.org>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Dmitri Shuralyov <dmitshur@golang.org>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
diff --git a/internal/relui/buildrelease_test.go b/internal/relui/buildrelease_test.go
index d9d9a4a..9c62bea 100644
--- a/internal/relui/buildrelease_test.go
+++ b/internal/relui/buildrelease_test.go
@@ -433,7 +433,7 @@
 	defaultApprove := deps.buildTasks.ApproveAction
 	approvedTrybots := false
 	deps.buildTasks.ApproveAction = func(ctx *workflow.TaskContext) error {
-		if strings.Contains(ctx.TaskName, "TryBot failures") {
+		if strings.Contains(ctx.TaskName, "Run advisory TryBot") {
 			approvedTrybots = true
 			return nil
 		}
diff --git a/internal/relui/workflows.go b/internal/relui/workflows.go
index f03db11..47a3226 100644
--- a/internal/relui/workflows.go
+++ b/internal/relui/workflows.go
@@ -16,6 +16,7 @@
 	"math/rand"
 	"net/http"
 	"path"
+	"sort"
 	"strings"
 	"sync"
 	"time"
@@ -448,7 +449,7 @@
 		result := wf.Task3(wd, "Run advisory TryBot "+bc.Name, tasks.runAdvisoryTryBot, wf.Const(bc), skipTests, source)
 		advisoryResults = append(advisoryResults, result)
 	}
-	tryBotsApproved := wf.Action1(wd, "Approve any TryBot failures", tasks.checkAdvisoryTrybots, wf.Slice(advisoryResults...))
+	tryBotsApproved := wf.Action1(wd, "Wait for advisory TryBots", tasks.checkAdvisoryTrybots, wf.Slice(advisoryResults...))
 
 	signedAndTested := wf.Task2(wd, "Wait for signing and tests", func(ctx *wf.TaskContext, artifacts []artifact, version string) ([]artifact, error) {
 		// Note: Note this needs to happen somewhere, doesn't matter where. Maybe move it to a nicer place later.
@@ -761,14 +762,23 @@
 			return tryBotResult{bc.Name, true}, nil
 		}
 	}
-
 	passed := false
-	_, err := b.runBuildStep(ctx, nil, bc, source, "", func(bs *task.BuildletStep, r io.Reader, w io.Writer) error {
-		var err error
-		passed, err = bs.RunTryBot(ctx, r)
-		return err
-	})
-	return tryBotResult{bc.Name, passed}, err
+	for attempt := 1; attempt <= workflow.MaxRetries && !passed; attempt++ {
+		ctx.Printf("======== Trybot Attempt %d of %d ========\n", attempt, workflow.MaxRetries)
+		_, err := b.runBuildStep(ctx, nil, bc, source, "", func(bs *task.BuildletStep, r io.Reader, w io.Writer) error {
+			var err error
+			passed, err = bs.RunTryBot(ctx, r)
+			return err
+		})
+		if err != nil {
+			ctx.Printf("Trybot Attempt failed: %v\n", err)
+		}
+	}
+	if !passed {
+		ctx.Printf("Advisory TryBot failed. Check the logs and approve this task if it's okay:\n")
+		return tryBotResult{bc.Name, passed}, b.ApproveAction(ctx)
+	}
+	return tryBotResult{bc.Name, passed}, nil
 }
 
 func (b *BuildReleaseTasks) checkAdvisoryTrybots(ctx *wf.TaskContext, results []tryBotResult) error {
@@ -778,11 +788,12 @@
 			fails = append(fails, r.Name)
 		}
 	}
-	if len(fails) == 0 {
+	if len(fails) != 0 {
+		sort.Strings(fails)
+		ctx.Printf("Some advisory TryBots failed and their failures have been approved:\n%v", strings.Join(fails, "\n"))
 		return nil
 	}
-	ctx.Printf("Some advisory TryBots failed. Check their logs and approve this task if it's okay:\n%v", strings.Join(fails, "\n"))
-	return b.ApproveAction(ctx)
+	return nil
 }
 
 // runBuildStep is a convenience function that manages resources a build step might need.