cmd/coordinator: consolidate and increase global VM deletion timeout
We had a lot of flexibility over timeouts, making their maintenance
harder. Consolidate it to a single timeout in the pool package, and
modify it from 45 minutes to 2 hours.
There's room for improvement in how we maintain this timeout,
but I'm leaving that for future work (with a tracking issue).
Fixes golang/go#52591.
Updates golang/go#52929.
Updates golang/go#49666.
Updates golang/go#42699.
Change-Id: I2ad92648d89a714397bd8b0e1ec490fc9f6d6790
Reviewed-on: https://go-review.googlesource.com/c/build/+/406216
Run-TryBot: Dmitri Shuralyov <dmitshur@golang.org>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
Reviewed-by: Carlos Amedee <carlos@golang.org>
Reviewed-by: Heschi Kreinick <heschi@google.com>
diff --git a/buildlet/buildlet.go b/buildlet/buildlet.go
index c9711cb..08d2914 100644
--- a/buildlet/buildlet.go
+++ b/buildlet/buildlet.go
@@ -40,7 +40,7 @@
// DeleteIn optionally specifies a duration at which
// to delete the VM.
- // If zero, a reasonable default is used.
+ // If zero, a short default is used (not enough for longtest builders).
// Negative means no deletion timeout.
DeleteIn time.Duration
diff --git a/buildlet/ec2.go b/buildlet/ec2.go
index 1b2a611..02a3efa 100644
--- a/buildlet/ec2.go
+++ b/buildlet/ec2.go
@@ -59,6 +59,7 @@
opts.Description = fmt.Sprintf("Go Builder for %s", hostType)
}
if opts.DeleteIn == 0 {
+ // Note: This implements a short default in the rare case the caller doesn't care.
opts.DeleteIn = 30 * time.Minute
}
diff --git a/cmd/coordinator/coordinator.go b/cmd/coordinator/coordinator.go
index d5ba79f..3b81101 100644
--- a/cmd/coordinator/coordinator.go
+++ b/cmd/coordinator/coordinator.go
@@ -131,13 +131,6 @@
const (
maxStatusDone = 30
-
- // vmDeleteTimeout and podDeleteTimeout is how long before we delete a VM.
- // In practice this need only be as long as the slowest
- // builder (plan9 currently), because on startup this program
- // already deletes all buildlets it doesn't know about
- // (i.e. ones from a previous instance of the coordinator).
- vmDeleteTimeout = 45 * time.Minute
)
// Fake keys signed by a fake CA.
@@ -266,7 +259,7 @@
// a shared package.
pool.SetBuilderMasterKey(masterKey())
- err := pool.InitGCE(sc, vmDeleteTimeout, testFiles, &basePinErr, isGCERemoteBuildlet, *buildEnvName, *mode)
+ err := pool.InitGCE(sc, testFiles, &basePinErr, isGCERemoteBuildlet, *buildEnvName, *mode)
if err != nil {
if *mode == "" {
*mode = "dev"
diff --git a/dashboard/builders.go b/dashboard/builders.go
index 13b57fa..ecdf6bd 100644
--- a/dashboard/builders.go
+++ b/dashboard/builders.go
@@ -237,7 +237,6 @@
goBootstrapURLTmpl: "https://storage.googleapis.com/$BUCKET/gobootstrap-openbsd-386-go1_12.tar.gz",
Notes: "OpenBSD 6.8 (with 009_exit syspatch); GCE VM is built from script in build/env/openbsd-386",
SSHUsername: "gopher",
- DeleteTimeout: 60 * time.Minute, // See golang/go#49666.
},
"host-openbsd-amd64-70": &HostConfig{
VMImage: "openbsd-amd64-70",
@@ -254,7 +253,6 @@
goBootstrapURLTmpl: "https://storage.googleapis.com/$BUCKET/gobootstrap-openbsd-386-go1_12.tar.gz",
Notes: "OpenBSD 7.0; GCE VM is built from script in build/env/openbsd-386. n2-highcpu host.",
SSHUsername: "gopher",
- DeleteTimeout: 60 * time.Minute, // See golang/go#49666.
},
"host-openbsd-386-70-n2d": &HostConfig{
// This host config is only for the runtime team to use investigating golang/go#49209.
@@ -689,13 +687,13 @@
env: []string{"GOROOT_BOOTSTRAP=/usr/lib/go"},
},
"host-linux-amd64-perf": &HostConfig{
- Notes: "Cascade Lake performance testing machines",
- machineType: "c2-standard-8", // C2 has precisely defined, consistent server architecture.
- ContainerImage: "linux-x86-bullseye:latest",
- buildletURLTmpl: "https://storage.googleapis.com/$BUCKET/buildlet.linux-amd64",
- env: []string{"GOROOT_BOOTSTRAP=/go1.4"},
- SSHUsername: "root",
- DeleteTimeout: 8 * time.Hour,
+ Notes: "Cascade Lake performance testing machines",
+ machineType: "c2-standard-8", // C2 has precisely defined, consistent server architecture.
+ ContainerImage: "linux-x86-bullseye:latest",
+ buildletURLTmpl: "https://storage.googleapis.com/$BUCKET/buildlet.linux-amd64",
+ env: []string{"GOROOT_BOOTSTRAP=/go1.4"},
+ SSHUsername: "root",
+ CustomDeleteTimeout: 8 * time.Hour,
},
}
@@ -786,8 +784,12 @@
// EC2 options
isEC2 bool // if true, the instance is configured to run on EC2
- // GCE or EC2 options
- DeleteTimeout time.Duration // Timeout after which the VM is destroyed. Zero duration uses global default.
+ // GCE or EC2 options:
+ //
+ // CustomDeleteTimeout is an optional custom timeout after which the VM is forcibly destroyed and the build is retried.
+ // Zero duration defers decision to internal/coordinator/pool package, which is enough for longtest post-submit builds.
+ // (This is generally an internal implementation detail, currently left behind only for the -perf builder.)
+ CustomDeleteTimeout time.Duration
// Reverse options
ExpectNum int // expected number of reverse buildlets of this type
diff --git a/internal/coordinator/pool/ec2.go b/internal/coordinator/pool/ec2.go
index 07ce2cb..7780a86 100644
--- a/internal/coordinator/pool/ec2.go
+++ b/internal/coordinator/pool/ec2.go
@@ -64,13 +64,6 @@
// EC2Opt is optional configuration for the the buildlet.
type EC2Opt func(*EC2Buildlet)
-// WithVMDeleteTimeout sets the VM deletion timeout for all EC2 VMs.
-func WithVMDeleteTimeout(timeout time.Duration) EC2Opt {
- return func(eb *EC2Buildlet) {
- eb.vmDeleteTimeout = timeout
- }
-}
-
// EC2Buildlet manages a pool of AWS EC2 buildlets.
type EC2Buildlet struct {
// awsClient is the client used to interact with AWS services.
@@ -92,8 +85,6 @@
ledger *ledger
// cancelPoll will signal to the pollers to discontinue polling.
cancelPoll context.CancelFunc
- // vmDeleteTimeout contains the timeout used to determine if a VM should be deleted.
- vmDeleteTimeout time.Duration
// pollWait waits for all pollers to terminate polling.
pollWait sync.WaitGroup
}
@@ -121,7 +112,6 @@
hosts: hosts,
isRemoteBuildlet: fn,
ledger: newLedger(),
- vmDeleteTimeout: 45 * time.Minute, // default VM delete timeout
}
for _, opt := range opts {
opt(b)
@@ -195,7 +185,7 @@
Zone: "", // allow the EC2 api pick an availability zone with capacity
TLS: kp,
Meta: make(map[string]string),
- DeleteIn: determineDeleteTimeout(hconf, eb.vmDeleteTimeout),
+ DeleteIn: determineDeleteTimeout(hconf),
OnInstanceRequested: func() {
log.Printf("EC2 VM %q now booting", instName)
},
diff --git a/internal/coordinator/pool/ec2_test.go b/internal/coordinator/pool/ec2_test.go
index cb40d53..eb7e943 100644
--- a/internal/coordinator/pool/ec2_test.go
+++ b/internal/coordinator/pool/ec2_test.go
@@ -578,6 +578,7 @@
return nil, fmt.Errorf("invalid vmName: %q and hostType: %q", vmName, hostType)
}
if opts.DeleteIn == 0 {
+ // Note: This implements a short default in the rare case the caller doesn't care.
opts.DeleteIn = 30 * time.Minute
}
if !f.createVMRequestSuccess {
diff --git a/internal/coordinator/pool/gce.go b/internal/coordinator/pool/gce.go
index 1858a0e..34e4263 100644
--- a/internal/coordinator/pool/gce.go
+++ b/internal/coordinator/pool/gce.go
@@ -84,16 +84,14 @@
// values created due to separating the buildlet pools into a separate package
gceMode string
- deleteTimeout time.Duration
testFiles map[string]string
basePinErr *atomic.Value
isGCERemoteBuildlet IsRemoteBuildletFunc
)
// InitGCE initializes the GCE buildlet pool.
-func InitGCE(sc *secret.Client, vmDeleteTimeout time.Duration, tFiles map[string]string, basePin *atomic.Value, fn IsRemoteBuildletFunc, buildEnvName, mode string) error {
+func InitGCE(sc *secret.Client, tFiles map[string]string, basePin *atomic.Value, fn IsRemoteBuildletFunc, buildEnvName, mode string) error {
gceMode = mode
- deleteTimeout = vmDeleteTimeout
testFiles = tFiles
basePinErr = basePin
isGCERemoteBuildlet = fn
@@ -417,7 +415,7 @@
log.Printf("Creating GCE VM %q for %s at %s", instName, hostType, zone)
bc, err = buildlet.StartNewVM(gcpCreds, buildEnv, instName, hostType, buildlet.VMOpts{
- DeleteIn: determineDeleteTimeout(hconf, deleteTimeout),
+ DeleteIn: determineDeleteTimeout(hconf),
OnInstanceRequested: func() {
log.Printf("GCE VM %q now booting", instName)
},
diff --git a/internal/coordinator/pool/kube.go b/internal/coordinator/pool/kube.go
index 3e762b2..b67a015 100644
--- a/internal/coordinator/pool/kube.go
+++ b/internal/coordinator/pool/kube.go
@@ -33,15 +33,6 @@
This file implements the Kubernetes-based buildlet pool.
*/
-const (
- // podDeleteTimeout is how long before we delete a VM.
- // In practice this need only be as long as the slowest
- // builder (plan9 currently), because on startup this program
- // already deletes all buildlets it doesn't know about
- // (i.e. ones from a previous instance of the coordinator).
- podDeleteTimeout = 45 * time.Minute
-)
-
// Initialized by initKube:
var (
buildletsKubeClient *kubernetes.Client // for "buildlets" cluster
@@ -270,7 +261,7 @@
ProjectID: NewGCEConfiguration().BuildEnv().ProjectName,
ImageRegistry: registryPrefix,
Description: fmt.Sprintf("Go Builder for %s", hostType),
- DeleteIn: determineDeleteTimeout(hconf, podDeleteTimeout),
+ DeleteIn: determineDeleteTimeout(hconf),
OnPodCreating: func() {
lg.LogEventTime("pod_creating")
p.setPodUsed(podName, true)
@@ -406,7 +397,7 @@
return fmt.Sprintf("Kubernetes pool capacity: %d/%d", inUse, total)
}
-// CleanUpOldPods loops forever and periodically enumerates pods
+// CleanUpOldPodsLoop loops forever and periodically enumerates pods
// and deletes those which have expired.
//
// A Pod is considered expired if it has a "delete-at" metadata
diff --git a/internal/coordinator/pool/pool.go b/internal/coordinator/pool/pool.go
index f4ac79f..b1e67a1 100644
--- a/internal/coordinator/pool/pool.go
+++ b/internal/coordinator/pool/pool.go
@@ -68,17 +68,40 @@
return fmt.Sprintf("buildlet-%s-rn%s", strings.TrimPrefix(hostType, "host-"), randHex(length))
}
-// determineDeleteTimeout determines the buildlet timeout duration with the
-// following priority:
+// determineDeleteTimeout reports the buildlet delete timeout duration
+// with the following priority:
//
-// 1. Host type default from host config.
-// 2. Global default from 'timeout'.
-func determineDeleteTimeout(host *dashboard.HostConfig, timeout time.Duration) time.Duration {
- if host.DeleteTimeout != 0 {
- return host.DeleteTimeout
+// 1. Host type override from host config.
+// 2. Global default.
+func determineDeleteTimeout(host *dashboard.HostConfig) time.Duration {
+ if host.CustomDeleteTimeout != 0 {
+ return host.CustomDeleteTimeout
}
- return timeout
+ // The value we return below is effectively a global default.
+ //
+ // The comment of CleanUpOldVMs (and CleanUpOldPodsLoop) includes:
+ //
+ // This is the safety mechanism to delete VMs which stray from the
+ // normal deleting process. VMs are created to run a single build and
+ // should be shut down by a controlling process. Due to various types
+ // of failures, they might get stranded. To prevent them from getting
+ // stranded and wasting resources forever, we instead set the
+ // "delete-at" metadata attribute on them when created to some time
+ // that's well beyond their expected lifetime.
+ //
+ // Issue go.dev/issue/52929 tracks what to do about this global
+ // timeout in the long term. Unless something changes,
+ // it needs to be maintained manually so that it's always
+ // "well beyond their expected lifetime" of each builder that doesn't
+ // otherwise override this timeout—otherwise it'll cause even more
+ // resources to be used due the automatic (and unlimited) retrying
+ // as described in go.dev/issue/42699.
+ //
+ // A global timeout of 45 minutes was chosen in 2015.
+ // Longtest builders were added in 2018 started to reach 45 mins by 2021-2022.
+ // Try 2 hours next, which might last some years (depending on test volume and test speed).
+ return 2 * time.Hour
}
// isBuildlet checks the name string in order to determine if the name is for a buildlet.
diff --git a/internal/coordinator/pool/pool_test.go b/internal/coordinator/pool/pool_test.go
index 266d1cd..7bceb61 100644
--- a/internal/coordinator/pool/pool_test.go
+++ b/internal/coordinator/pool/pool_test.go
@@ -18,29 +18,25 @@
testCases := []struct {
desc string
hostValue time.Duration
- timeout time.Duration
wantTimeout time.Duration
}{
{
- desc: "from-host",
- hostValue: time.Minute,
- timeout: time.Second,
- wantTimeout: time.Minute,
+ desc: "default",
+ wantTimeout: 2 * time.Hour,
},
{
- desc: "from-argument",
- hostValue: 0,
- timeout: time.Second,
- wantTimeout: time.Second,
+ desc: "from-host",
+ hostValue: 8 * time.Hour,
+ wantTimeout: 8 * time.Hour,
},
}
for _, tc := range testCases {
t.Run(tc.desc, func(t *testing.T) {
h := &dashboard.HostConfig{
- DeleteTimeout: tc.hostValue,
+ CustomDeleteTimeout: tc.hostValue,
}
- if got := determineDeleteTimeout(h, tc.timeout); got != tc.wantTimeout {
- t.Errorf("determineDeleteTimeout(%+v, %s) = %s; want %s", h, tc.timeout, got, tc.wantTimeout)
+ if got := determineDeleteTimeout(h); got != tc.wantTimeout {
+ t.Errorf("determineDeleteTimeout(%+v) = %s; want %s", h, got, tc.wantTimeout)
}
})
}