sweet: force-kill CockroachDB if SIGTERM isn't working

This change uses SIGKILL when benchmarks are finished running but, for
whatever reason, the CockroachDB cluster isn't responding to SIGTERM.
At this point it's fine to forcibly kill the server, but let's make
sure we kill all the other instances too (since there's no clean
shutdown).

Currently, timeouts in the cockroachdb benchmark are causing loss of
data (a separate issue we should fix), but even if that weren't the
case, we'd still be losing the CockroachDB results. This CL fixes the
problem with the benchmark: locally I couldn't get it to succeed with
20 runs, but with this patch it has no problem finishing.

We should investigate why CockroachDB isn't responding to SIGTERM and
whether there's a cleaner way to ensure a shutdown. This is OK for
now, and we may want to keep this behavior long-term anyway (it's
useful when benchmarking unvetted patches that cause a hang, for
example).

This CL also adds more logging to the benchmark runner.

Change-Id: I57cf27f35b71b6c69a8ca2ec38107e1c912a5167
Cq-Include-Trybots: luci.golang.try:x_benchmarks-gotip-linux-amd64-longtest,x_benchmarks-go1.22-linux-amd64-longtest
Reviewed-on: https://go-review.googlesource.com/c/benchmarks/+/594775
Auto-Submit: Michael Knyszek <mknyszek@google.com>
Reviewed-by: Michael Pratt <mpratt@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
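For context, the shutdown escalation described above boils down to: send
SIGTERM, wait up to a grace period for the process to exit, then fall back
to SIGKILL and reap the process. The sketch below shows that pattern in
isolation for a single child process; the helper name terminateOrKill, the
5-second grace period, and the sleep command are illustrative stand-ins
only, not code from this CL (which applies the pattern per
cockroachdbInstance with a one-minute timeout and then kills the remaining
nodes).

// Standalone sketch (not taken from this CL verbatim): SIGTERM first,
// then SIGKILL after a grace period. Names and values here are assumed
// for illustration.
package main

import (
	"log"
	"os/exec"
	"syscall"
	"time"
)

// terminateOrKill asks a started command to exit via SIGTERM and, if it
// hasn't exited after the grace period, falls back to SIGKILL. It reports
// whether SIGKILL was needed.
func terminateOrKill(cmd *exec.Cmd, grace time.Duration) (killed bool, err error) {
	if cmd == nil || cmd.Process == nil {
		// The command never started; nothing to shut down.
		return false, nil
	}
	if err := cmd.Process.Signal(syscall.SIGTERM); err != nil {
		return false, err
	}
	done := make(chan error, 1)
	go func() { done <- cmd.Wait() }()
	select {
	case <-done:
		// Exited within the grace period (possibly because of SIGTERM
		// itself); treat any exit as a successful shutdown here.
		return false, nil
	case <-time.After(grace):
		// Graceful shutdown timed out; force the process down.
		if err := cmd.Process.Kill(); err != nil {
			return false, err
		}
		<-done // reap the process so it doesn't linger
		return true, nil
	}
}

func main() {
	cmd := exec.Command("sleep", "300") // stands in for a hung server process
	if err := cmd.Start(); err != nil {
		log.Fatal(err)
	}
	killed, err := terminateOrKill(cmd, 5*time.Second)
	log.Printf("killed=%v err=%v", killed, err)
}
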
diff --git a/sweet/benchmarks/cockroachdb/main.go b/sweet/benchmarks/cockroachdb/main.go
index b3794d6..08e2271 100644
--- a/sweet/benchmarks/cockroachdb/main.go
+++ b/sweet/benchmarks/cockroachdb/main.go
@@ -12,9 +12,7 @@
"errors"
"flag"
"fmt"
- "golang.org/x/benchmarks/sweet/benchmarks/internal/driver"
- "golang.org/x/benchmarks/sweet/benchmarks/internal/server"
- "golang.org/x/benchmarks/sweet/common/diagnostics"
+ "log"
"os"
"os/exec"
"regexp"
@@ -25,6 +23,10 @@
"sync/atomic"
"syscall"
"time"
+
+ "golang.org/x/benchmarks/sweet/benchmarks/internal/driver"
+ "golang.org/x/benchmarks/sweet/benchmarks/internal/server"
+ "golang.org/x/benchmarks/sweet/common/diagnostics"
)

const (
@@ -256,20 +258,53 @@
return fmt.Sprintf("%s:%d", cliCfg.host, i.httpPort)
}
-func (i *cockroachdbInstance) shutdown() error {
+func (i *cockroachdbInstance) shutdown() (killed bool, err error) {
// Only attempt to shut down the process if we got to a point
// that a command was constructed and started.
- if i.cmd != nil && i.cmd.Process != nil {
- // Send SIGTERM and wait instead of just killing as sending SIGKILL
- // bypasses node shutdown logic and will leave the node in a bad state.
- // Normally not an issue unless you want to restart the cluster i.e.
- // to poke around and see what's wrong.
- if err := i.cmd.Process.Signal(syscall.SIGTERM); err != nil {
- return err
- }
+ if i.cmd == nil || i.cmd.Process == nil {
+ return false, nil
+ }
+ // Send SIGTERM and wait instead of just killing as sending SIGKILL
+ // bypasses node shutdown logic and will leave the node in a bad state.
+ // Normally not an issue unless you want to restart the cluster i.e.
+ // to poke around and see what's wrong.
+ if err := i.cmd.Process.Signal(syscall.SIGTERM); err != nil {
+ return false, err
+ }
+ done := make(chan struct{})
+ go func() {
if _, err := i.cmd.Process.Wait(); err != nil {
- return err
+ fmt.Fprintf(os.Stderr, "failed to wait for instance %s: %v\n", i.name, err)
}
+ done <- struct{}{}
+ }()
+ select {
+ case <-done:
+ case <-time.After(1 * time.Minute):
+ // If it takes a full minute to shut down, just kill the instance
+ // and report that we did it. We *probably* won't need it again.
+ if err := i.cmd.Process.Signal(syscall.SIGKILL); err != nil {
+ return false, fmt.Errorf("failed to send SIGKILL to instance %s: %v", i.name, err)
+ }
+ killed = true
+
+ // Wait again -- this time it should happen.
+ log.Println("sent kill signal to", i.name, "and waiting for exit")
+ <-done
+ }
+ return killed, nil
+}
+
+func (i *cockroachdbInstance) kill() error {
+ if i.cmd == nil || i.cmd.Process == nil {
+ // Nothing to kill.
+ return nil
+ }
+ if err := i.cmd.Process.Signal(syscall.SIGKILL); err != nil {
+ return err
+ }
+ if _, err := i.cmd.Process.Wait(); err != nil {
+ return fmt.Errorf("failed to wait for instance %s: %v", i.name, err)
}
return nil
}
@@ -345,6 +380,7 @@
pgurls = append(pgurls, fmt.Sprintf(`postgres://root@%s?sslmode=disable`, host))
}
// Load in the schema needed for the workload via `workload init`
+ log.Println("loading the schema")
initArgs := []string{"workload", "init", cfg.bench.workload}
initArgs = append(initArgs, pgurls...)
initCmd := exec.Command(cfg.cockroachdbBin, initArgs...)
@@ -355,6 +391,8 @@
return err
}
+ log.Println("sleeping")
+
// If we try and start the workload right after loading in the schema
// it will spam us with database does not exist errors. We could repeatedly
// retry until the database exists by parsing the output, or we can just
@@ -369,6 +407,7 @@
}
args = append(args, pgurls...)
+ log.Println("running benchmark timeout")
cmd := exec.Command(cfg.cockroachdbBin, args...)
fmt.Fprintln(os.Stderr, cmd.String())
@@ -499,6 +538,7 @@
}

func run(cfg *config) (err error) {
+ log.Println("launching cluster")
var instances []*cockroachdbInstance
// Launch the server.
instances, err = launchCockroachCluster(cfg)
@@ -509,26 +549,41 @@
// Clean up the cluster after we're done.
defer func() {
- // We only need send SIGTERM to one instance, attempting to
+ log.Println("shutting down cluster")
+
+ // We only need to send a shutdown signal to one instance, attempting to
// send it again will cause it to hang.
inst := instances[0]
- if r := inst.shutdown(); r != nil {
+ killed, r := inst.shutdown()
+ if r != nil {
if err == nil {
err = r
} else {
fmt.Fprintf(os.Stderr, "failed to shutdown %s: %v", inst.name, r)
}
}
+ if killed {
+ log.Println("killed instance", inst.name, "and killing others")
+
+ // If we ended up killing an instance, try to kill the other instances too.
+ for _, inst := range instances[1:] {
+ if err := inst.kill(); err != nil {
+ fmt.Fprintf(os.Stderr, "failed to kill %s: %v", inst.name, err)
+ }
+ }
+ }
if err != nil && inst.output.Len() != 0 {
fmt.Fprintf(os.Stderr, "=== Instance %q stdout+stderr ===\n", inst.name)
fmt.Fprintln(os.Stderr, inst.output.String())
}
}()
+ log.Println("waiting for cluster")
if err = waitForCluster(instances, cfg); err != nil {
return err
}
+ log.Println("setting cluster settings")
if err = instances[0].setClusterSettings(cfg); err != nil {
return err
}
@@ -593,6 +648,7 @@
if len(finishers) != 0 {
// Finish all the diagnostic collections in concurrently. Otherwise we could be waiting a while.
defer func() {
+ log.Println("running finishers")
var wg sync.WaitGroup
for _, finish := range finishers {
finish := finish
@@ -606,6 +662,7 @@
}()
}
// Actually run the benchmark.
+ log.Println("running benchmark")
return runBenchmark(d, cfg, instances)
}, opts...)
}