sweet: force-kill CockroachDB if SIGTERM isn't working

This change uses SIGKILL when benchmarks are finished running but, for
whatever reason, the CockroachDB cluster isn't responding to SIGTERM.
At this point it's fine to forcibly kill the server, but let's make
sure we kill all the other instances too (since there's no clean
shutdown).

Currently, timeouts in the cockroachdb benchmark are causing loss of
data (a separate issue we should fix), but even if that weren't the
case, we'd still be losing the CockroachDB results. This CL fixes the
problem with the benchmark: locally I couldn't get it to succeed with
20 runs, but with this patch it has no problem finishing.

We should investigate why CockroachDB isn't responding to SIGTERM and
whether there's a cleaner way to ensure a shutdown. This is OK for
now, and we may want to keep this behavior long-term anyway (it's
useful when benchmarking unvetted patches that cause a hang, for
example).

This CL also adds more logging to the benchmark runner.

Change-Id: I57cf27f35b71b6c69a8ca2ec38107e1c912a5167
Cq-Include-Trybots: luci.golang.try:x_benchmarks-gotip-linux-amd64-longtest,x_benchmarks-go1.22-linux-amd64-longtest
Reviewed-on: https://go-review.googlesource.com/c/benchmarks/+/594775
Auto-Submit: Michael Knyszek <mknyszek@google.com>
Reviewed-by: Michael Pratt <mpratt@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
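For context, the shutdown escalation described above boils down to: send
SIGTERM, wait up to a grace period for the process to exit, then fall back
to SIGKILL and reap the process. The sketch below shows that pattern in
isolation for a single child process; the helper name terminateOrKill, the
5-second grace period, and the sleep command are illustrative stand-ins
only, not code from this CL (which applies the pattern per
cockroachdbInstance with a one-minute timeout and then kills the remaining
nodes).

// Standalone sketch (not taken from this CL verbatim): SIGTERM first,
// then SIGKILL after a grace period. Names and values here are assumed
// for illustration.
package main

import (
	"log"
	"os/exec"
	"syscall"
	"time"
)

// terminateOrKill asks a started command to exit via SIGTERM and, if it
// hasn't exited after the grace period, falls back to SIGKILL. It reports
// whether SIGKILL was needed.
func terminateOrKill(cmd *exec.Cmd, grace time.Duration) (killed bool, err error) {
	if cmd == nil || cmd.Process == nil {
		// The command never started; nothing to shut down.
		return false, nil
	}
	if err := cmd.Process.Signal(syscall.SIGTERM); err != nil {
		return false, err
	}
	done := make(chan error, 1)
	go func() { done <- cmd.Wait() }()
	select {
	case <-done:
		// Exited within the grace period (possibly because of SIGTERM
		// itself); treat any exit as a successful shutdown here.
		return false, nil
	case <-time.After(grace):
		// Graceful shutdown timed out; force the process down.
		if err := cmd.Process.Kill(); err != nil {
			return false, err
		}
		<-done // reap the process so it doesn't linger
		return true, nil
	}
}

func main() {
	cmd := exec.Command("sleep", "300") // stands in for a hung server process
	if err := cmd.Start(); err != nil {
		log.Fatal(err)
	}
	killed, err := terminateOrKill(cmd, 5*time.Second)
	log.Printf("killed=%v err=%v", killed, err)
}
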
diff --git a/sweet/benchmarks/cockroachdb/main.go b/sweet/benchmarks/cockroachdb/main.go
index b3794d6..08e2271 100644
--- a/sweet/benchmarks/cockroachdb/main.go
+++ b/sweet/benchmarks/cockroachdb/main.go
@@ -12,9 +12,7 @@
"errors"
"flag"
"fmt"
- "golang.org/x/benchmarks/sweet/benchmarks/internal/driver"
- "golang.org/x/benchmarks/sweet/benchmarks/internal/server"
- "golang.org/x/benchmarks/sweet/common/diagnostics"
+ "log"
"os"
"os/exec"
"regexp"
@@ -25,6 +23,10 @@
"sync/atomic"
"syscall"
"time"
+
+ "golang.org/x/benchmarks/sweet/benchmarks/internal/driver"
+ "golang.org/x/benchmarks/sweet/benchmarks/internal/server"
+ "golang.org/x/benchmarks/sweet/common/diagnostics"
)

const (
@@ -256,20 +258,53 @@
return fmt.Sprintf("%s:%d", cliCfg.host, i.httpPort)
}
-func (i *cockroachdbInstance) shutdown() error {
+func (i *cockroachdbInstance) shutdown() (killed bool, err error) {
// Only attempt to shut down the process if we got to a point
// that a command was constructed and started.
- if i.cmd != nil && i.cmd.Process != nil {
- // Send SIGTERM and wait instead of just killing as sending SIGKILL
- // bypasses node shutdown logic and will leave the node in a bad state.
- // Normally not an issue unless you want to restart the cluster i.e.
- // to poke around and see what's wrong.
- if err := i.cmd.Process.Signal(syscall.SIGTERM); err != nil {
- return err
- }
+ if i.cmd == nil || i.cmd.Process == nil {
+ return false, nil
+ }
+ // Send SIGTERM and wait instead of just killing as sending SIGKILL
+ // bypasses node shutdown logic and will leave the node in a bad state.
+ // Normally not an issue unless you want to restart the cluster i.e.
+ // to poke around and see what's wrong.
+ if err := i.cmd.Process.Signal(syscall.SIGTERM); err != nil {
+ return false, err
+ }
+ done := make(chan struct{})
+ go func() {
if _, err := i.cmd.Process.Wait(); err != nil {
- return err
+ fmt.Fprintf(os.Stderr, "failed to wait for instance %s: %v\n", i.name, err)
}
+ done <- struct{}{}
+ }()
+ select {
+ case <-done:
+ case <-time.After(1 * time.Minute):
+ // If it takes a full minute to shut down, just kill the instance
+ // and report that we did it. We *probably* won't need it again.
+ if err := i.cmd.Process.Signal(syscall.SIGKILL); err != nil {
+ return false, fmt.Errorf("failed to send SIGKILL to instance %s: %v", i.name, err)
+ }
+ killed = true
+
+ // Wait again -- this time it should happen.
+ log.Println("sent kill signal to", i.name, "and waiting for exit")
+ <-done
+ }
+ return killed, nil
+}
+
+func (i *cockroachdbInstance) kill() error {
+ if i.cmd == nil || i.cmd.Process == nil {
+ // Nothing to kill.
+ return nil
+ }
+ if err := i.cmd.Process.Signal(syscall.SIGKILL); err != nil {
+ return err
+ }
+ if _, err := i.cmd.Process.Wait(); err != nil {
+ return fmt.Errorf("failed to wait for instance %s: %v", i.name, err)
}
return nil
}
@@ -345,6 +380,7 @@
pgurls = append(pgurls, fmt.Sprintf(`postgres://root@%s?sslmode=disable`, host))
}
// Load in the schema needed for the workload via `workload init`
+ log.Println("loading the schema")
initArgs := []string{"workload", "init", cfg.bench.workload}
initArgs = append(initArgs, pgurls...)
initCmd := exec.Command(cfg.cockroachdbBin, initArgs...)
@@ -355,6 +391,8 @@
return err
}
+ log.Println("sleeping")
+
// If we try and start the workload right after loading in the schema
// it will spam us with database does not exist errors. We could repeatedly
// retry until the database exists by parsing the output, or we can just
@@ -369,6 +407,7 @@
}
args = append(args, pgurls...)
+ log.Println("running benchmark timeout")
cmd := exec.Command(cfg.cockroachdbBin, args...)
fmt.Fprintln(os.Stderr, cmd.String())
@@ -499,6 +538,7 @@
}

func run(cfg *config) (err error) {
+ log.Println("launching cluster")
var instances []*cockroachdbInstance
// Launch the server.
instances, err = launchCockroachCluster(cfg)
@@ -509,26 +549,41 @@
// Clean up the cluster after we're done.
defer func() {
- // We only need send SIGTERM to one instance, attempting to
+ log.Println("shutting down cluster")
+
+ // We only need to send a shutdown signal to one instance, attempting to
// send it again will cause it to hang.
inst := instances[0]
- if r := inst.shutdown(); r != nil {
+ killed, r := inst.shutdown()
+ if r != nil {
if err == nil {
err = r
} else {
fmt.Fprintf(os.Stderr, "failed to shutdown %s: %v", inst.name, r)
}
}
+ if killed {
+ log.Println("killed instance", inst.name, "and killing others")
+
+ // If we ended up killing an instance, try to kill the other instances too.
+ for _, inst := range instances[1:] {
+ if err := inst.kill(); err != nil {
+ fmt.Fprintf(os.Stderr, "failed to kill %s: %v", inst.name, err)
+ }
+ }
+ }
if err != nil && inst.output.Len() != 0 {
fmt.Fprintf(os.Stderr, "=== Instance %q stdout+stderr ===\n", inst.name)
fmt.Fprintln(os.Stderr, inst.output.String())
}
}()
+ log.Println("waiting for cluster")
if err = waitForCluster(instances, cfg); err != nil {
return err
}
+ log.Println("setting cluster settings")
if err = instances[0].setClusterSettings(cfg); err != nil {
return err
}
@@ -593,6 +648,7 @@
if len(finishers) != 0 {
// Finish all the diagnostic collections in concurrently. Otherwise we could be waiting a while.
defer func() {
+ log.Println("running finishers")
var wg sync.WaitGroup
for _, finish := range finishers {
finish := finish
@@ -606,6 +662,7 @@
}()
}
// Actually run the benchmark.
+ log.Println("running benchmark")
return runBenchmark(d, cfg, instances)
}, opts...)
}