sweet: don't start diagnostics until all cockroach instances are ready

There's currently a flake with TestSweetEndToEnd, specifically with
testing pgo, wherein we'll start grabbing diagnostics from cockroach
instances too soon.

Change-Id: I7eda15ee8d3401c591ad568b8c8bb125ace72e80
Reviewed-on: https://go-review.googlesource.com/c/benchmarks/+/614535
Auto-Submit: Michael Knyszek <mknyszek@google.com>
Reviewed-by: Michael Pratt <mpratt@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
diff --git a/sweet/benchmarks/cockroachdb/main.go b/sweet/benchmarks/cockroachdb/main.go index 6ee31ab..0d0979c 100644 --- a/sweet/benchmarks/cockroachdb/main.go +++ b/sweet/benchmarks/cockroachdb/main.go
@@ -20,6 +20,7 @@ "runtime" "strconv" "strings" + "sync" "syscall" "time" @@ -163,15 +164,19 @@ } // waitForCluster pings nodes in the cluster until one responds, or -// we time out. We only care to wait for one node to respond as the -// workload will work as long as it can connect to one node initially. -// The --ramp flag will take care of startup noise. +// we time out. We wait for all nodes to respond because even if the +// workload can start before all nodes are ready, that's not true for +// collecting diagnostic data. func waitForCluster(instances []*cockroachdbInstance, cfg *config) error { ctx, cancel := context.WithCancel(context.Background()) defer cancel() + + var wg sync.WaitGroup for _, inst := range instances { inst := inst - go func(ctx context.Context) { + wg.Add(1) + go func(context.Context) { + defer wg.Done() for { select { case <-ctx.Done(): @@ -181,16 +186,20 @@ // 5 seconds first and between pings. 5 seconds was chosen through // trial and error as a time that nodes are *usually* ready by. if err := inst.ping(cfg); err == nil { - cancel() return } } } }(ctx) } + done := make(chan struct{}) + go func() { + wg.Wait() + done <- struct{}{} + }() select { - case <-ctx.Done(): + case <-done: case <-time.After(time.Minute): return errors.New("benchmark timed out waiting for cluster to be ready") }