cmd/coordinator: consolidate and increase global VM deletion timeout

We had a lot of flexibility in how VM deletion timeouts were
configured, which made them harder to maintain. Consolidate them into
a single timeout in the pool package, and increase it from 45 minutes
to 2 hours.

There's room for improvement in how we maintain this timeout,
but I'm leaving that for future work (with a tracking issue).

Fixes golang/go#52591.
Updates golang/go#52929.
Updates golang/go#49666.
Updates golang/go#42699.

Change-Id: I2ad92648d89a714397bd8b0e1ec490fc9f6d6790
Reviewed-on: https://go-review.googlesource.com/c/build/+/406216
Run-TryBot: Dmitri Shuralyov <dmitshur@golang.org>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
Reviewed-by: Carlos Amedee <carlos@golang.org>
Reviewed-by: Heschi Kreinick <heschi@google.com>
diff --git a/buildlet/buildlet.go b/buildlet/buildlet.go
index c9711cb..08d2914 100644
--- a/buildlet/buildlet.go
+++ b/buildlet/buildlet.go
@@ -40,7 +40,7 @@
 
 	// DeleteIn optionally specifies a duration at which
 	// to delete the VM.
-	// If zero, a reasonable default is used.
+	// If zero, a short default is used (not enough for longtest builders).
 	// Negative means no deletion timeout.
 	DeleteIn time.Duration
 
diff --git a/buildlet/ec2.go b/buildlet/ec2.go
index 1b2a611..02a3efa 100644
--- a/buildlet/ec2.go
+++ b/buildlet/ec2.go
@@ -59,6 +59,7 @@
 		opts.Description = fmt.Sprintf("Go Builder for %s", hostType)
 	}
 	if opts.DeleteIn == 0 {
+		// Note: This implements a short default in the rare case the caller doesn't care.
 		opts.DeleteIn = 30 * time.Minute
 	}
 
diff --git a/cmd/coordinator/coordinator.go b/cmd/coordinator/coordinator.go
index d5ba79f..3b81101 100644
--- a/cmd/coordinator/coordinator.go
+++ b/cmd/coordinator/coordinator.go
@@ -131,13 +131,6 @@
 
 const (
 	maxStatusDone = 30
-
-	// vmDeleteTimeout and podDeleteTimeout is how long before we delete a VM.
-	// In practice this need only be as long as the slowest
-	// builder (plan9 currently), because on startup this program
-	// already deletes all buildlets it doesn't know about
-	// (i.e. ones from a previous instance of the coordinator).
-	vmDeleteTimeout = 45 * time.Minute
 )
 
 // Fake keys signed by a fake CA.
@@ -266,7 +259,7 @@
 	// a shared package.
 	pool.SetBuilderMasterKey(masterKey())
 
-	err := pool.InitGCE(sc, vmDeleteTimeout, testFiles, &basePinErr, isGCERemoteBuildlet, *buildEnvName, *mode)
+	err := pool.InitGCE(sc, testFiles, &basePinErr, isGCERemoteBuildlet, *buildEnvName, *mode)
 	if err != nil {
 		if *mode == "" {
 			*mode = "dev"
diff --git a/dashboard/builders.go b/dashboard/builders.go
index 13b57fa..ecdf6bd 100644
--- a/dashboard/builders.go
+++ b/dashboard/builders.go
@@ -237,7 +237,6 @@
 		goBootstrapURLTmpl: "https://storage.googleapis.com/$BUCKET/gobootstrap-openbsd-386-go1_12.tar.gz",
 		Notes:              "OpenBSD 6.8 (with 009_exit syspatch); GCE VM is built from script in build/env/openbsd-386",
 		SSHUsername:        "gopher",
-		DeleteTimeout:      60 * time.Minute, // See golang/go#49666.
 	},
 	"host-openbsd-amd64-70": &HostConfig{
 		VMImage:            "openbsd-amd64-70",
@@ -254,7 +253,6 @@
 		goBootstrapURLTmpl: "https://storage.googleapis.com/$BUCKET/gobootstrap-openbsd-386-go1_12.tar.gz",
 		Notes:              "OpenBSD 7.0; GCE VM is built from script in build/env/openbsd-386. n2-highcpu host.",
 		SSHUsername:        "gopher",
-		DeleteTimeout:      60 * time.Minute, // See golang/go#49666.
 	},
 	"host-openbsd-386-70-n2d": &HostConfig{
 		// This host config is only for the runtime team to use investigating golang/go#49209.
@@ -689,13 +687,13 @@
 		env:       []string{"GOROOT_BOOTSTRAP=/usr/lib/go"},
 	},
 	"host-linux-amd64-perf": &HostConfig{
-		Notes:           "Cascade Lake performance testing machines",
-		machineType:     "c2-standard-8", // C2 has precisely defined, consistent server architecture.
-		ContainerImage:  "linux-x86-bullseye:latest",
-		buildletURLTmpl: "https://storage.googleapis.com/$BUCKET/buildlet.linux-amd64",
-		env:             []string{"GOROOT_BOOTSTRAP=/go1.4"},
-		SSHUsername:     "root",
-		DeleteTimeout:   8 * time.Hour,
+		Notes:               "Cascade Lake performance testing machines",
+		machineType:         "c2-standard-8", // C2 has precisely defined, consistent server architecture.
+		ContainerImage:      "linux-x86-bullseye:latest",
+		buildletURLTmpl:     "https://storage.googleapis.com/$BUCKET/buildlet.linux-amd64",
+		env:                 []string{"GOROOT_BOOTSTRAP=/go1.4"},
+		SSHUsername:         "root",
+		CustomDeleteTimeout: 8 * time.Hour,
 	},
 }
 
@@ -786,8 +784,12 @@
 	// EC2 options
 	isEC2 bool // if true, the instance is configured to run on EC2
 
-	// GCE or EC2 options
-	DeleteTimeout time.Duration // Timeout after which the VM is destroyed. Zero duration uses global default.
+	// GCE or EC2 options:
+	//
+	// CustomDeleteTimeout is an optional custom timeout after which the VM is forcibly destroyed and the build is retried.
+	// Zero duration defers decision to internal/coordinator/pool package, which is enough for longtest post-submit builds.
+	// (This is generally an internal implementation detail, currently left behind only for the -perf builder.)
+	CustomDeleteTimeout time.Duration
 
 	// Reverse options
 	ExpectNum       int  // expected number of reverse buildlets of this type
diff --git a/internal/coordinator/pool/ec2.go b/internal/coordinator/pool/ec2.go
index 07ce2cb..7780a86 100644
--- a/internal/coordinator/pool/ec2.go
+++ b/internal/coordinator/pool/ec2.go
@@ -64,13 +64,6 @@
 // EC2Opt is optional configuration for the the buildlet.
 type EC2Opt func(*EC2Buildlet)
 
-// WithVMDeleteTimeout sets the VM deletion timeout for all EC2 VMs.
-func WithVMDeleteTimeout(timeout time.Duration) EC2Opt {
-	return func(eb *EC2Buildlet) {
-		eb.vmDeleteTimeout = timeout
-	}
-}
-
 // EC2Buildlet manages a pool of AWS EC2 buildlets.
 type EC2Buildlet struct {
 	// awsClient is the client used to interact with AWS services.
@@ -92,8 +85,6 @@
 	ledger *ledger
 	// cancelPoll will signal to the pollers to discontinue polling.
 	cancelPoll context.CancelFunc
-	// vmDeleteTimeout contains the timeout used to determine if a VM should be deleted.
-	vmDeleteTimeout time.Duration
 	// pollWait waits for all pollers to terminate polling.
 	pollWait sync.WaitGroup
 }
@@ -121,7 +112,6 @@
 		hosts:            hosts,
 		isRemoteBuildlet: fn,
 		ledger:           newLedger(),
-		vmDeleteTimeout:  45 * time.Minute, // default VM delete timeout
 	}
 	for _, opt := range opts {
 		opt(b)
@@ -195,7 +185,7 @@
 		Zone:     "", // allow the EC2 api pick an availability zone with capacity
 		TLS:      kp,
 		Meta:     make(map[string]string),
-		DeleteIn: determineDeleteTimeout(hconf, eb.vmDeleteTimeout),
+		DeleteIn: determineDeleteTimeout(hconf),
 		OnInstanceRequested: func() {
 			log.Printf("EC2 VM %q now booting", instName)
 		},
diff --git a/internal/coordinator/pool/ec2_test.go b/internal/coordinator/pool/ec2_test.go
index cb40d53..eb7e943 100644
--- a/internal/coordinator/pool/ec2_test.go
+++ b/internal/coordinator/pool/ec2_test.go
@@ -578,6 +578,7 @@
 		return nil, fmt.Errorf("invalid vmName: %q and hostType: %q", vmName, hostType)
 	}
 	if opts.DeleteIn == 0 {
+		// Note: This implements a short default in the rare case the caller doesn't care.
 		opts.DeleteIn = 30 * time.Minute
 	}
 	if !f.createVMRequestSuccess {
diff --git a/internal/coordinator/pool/gce.go b/internal/coordinator/pool/gce.go
index 1858a0e..34e4263 100644
--- a/internal/coordinator/pool/gce.go
+++ b/internal/coordinator/pool/gce.go
@@ -84,16 +84,14 @@
 
 	// values created due to separating the buildlet pools into a separate package
 	gceMode             string
-	deleteTimeout       time.Duration
 	testFiles           map[string]string
 	basePinErr          *atomic.Value
 	isGCERemoteBuildlet IsRemoteBuildletFunc
 )
 
 // InitGCE initializes the GCE buildlet pool.
-func InitGCE(sc *secret.Client, vmDeleteTimeout time.Duration, tFiles map[string]string, basePin *atomic.Value, fn IsRemoteBuildletFunc, buildEnvName, mode string) error {
+func InitGCE(sc *secret.Client, tFiles map[string]string, basePin *atomic.Value, fn IsRemoteBuildletFunc, buildEnvName, mode string) error {
 	gceMode = mode
-	deleteTimeout = vmDeleteTimeout
 	testFiles = tFiles
 	basePinErr = basePin
 	isGCERemoteBuildlet = fn
@@ -417,7 +415,7 @@
 
 	log.Printf("Creating GCE VM %q for %s at %s", instName, hostType, zone)
 	bc, err = buildlet.StartNewVM(gcpCreds, buildEnv, instName, hostType, buildlet.VMOpts{
-		DeleteIn: determineDeleteTimeout(hconf, deleteTimeout),
+		DeleteIn: determineDeleteTimeout(hconf),
 		OnInstanceRequested: func() {
 			log.Printf("GCE VM %q now booting", instName)
 		},
diff --git a/internal/coordinator/pool/kube.go b/internal/coordinator/pool/kube.go
index 3e762b2..b67a015 100644
--- a/internal/coordinator/pool/kube.go
+++ b/internal/coordinator/pool/kube.go
@@ -33,15 +33,6 @@
 This file implements the Kubernetes-based buildlet pool.
 */
 
-const (
-	// podDeleteTimeout is how long before we delete a VM.
-	// In practice this need only be as long as the slowest
-	// builder (plan9 currently), because on startup this program
-	// already deletes all buildlets it doesn't know about
-	// (i.e. ones from a previous instance of the coordinator).
-	podDeleteTimeout = 45 * time.Minute
-)
-
 // Initialized by initKube:
 var (
 	buildletsKubeClient *kubernetes.Client // for "buildlets" cluster
@@ -270,7 +261,7 @@
 		ProjectID:     NewGCEConfiguration().BuildEnv().ProjectName,
 		ImageRegistry: registryPrefix,
 		Description:   fmt.Sprintf("Go Builder for %s", hostType),
-		DeleteIn:      determineDeleteTimeout(hconf, podDeleteTimeout),
+		DeleteIn:      determineDeleteTimeout(hconf),
 		OnPodCreating: func() {
 			lg.LogEventTime("pod_creating")
 			p.setPodUsed(podName, true)
@@ -406,7 +397,7 @@
 	return fmt.Sprintf("Kubernetes pool capacity: %d/%d", inUse, total)
 }
 
-// CleanUpOldPods loops forever and periodically enumerates pods
+// CleanUpOldPodsLoop loops forever and periodically enumerates pods
 // and deletes those which have expired.
 //
 // A Pod is considered expired if it has a "delete-at" metadata
diff --git a/internal/coordinator/pool/pool.go b/internal/coordinator/pool/pool.go
index f4ac79f..b1e67a1 100644
--- a/internal/coordinator/pool/pool.go
+++ b/internal/coordinator/pool/pool.go
@@ -68,17 +68,40 @@
 	return fmt.Sprintf("buildlet-%s-rn%s", strings.TrimPrefix(hostType, "host-"), randHex(length))
 }
 
-// determineDeleteTimeout determines the buildlet timeout duration with the
-// following priority:
+// determineDeleteTimeout reports the buildlet delete timeout duration
+// with the following priority:
 //
-// 1. Host type default from host config.
-// 2. Global default from 'timeout'.
-func determineDeleteTimeout(host *dashboard.HostConfig, timeout time.Duration) time.Duration {
-	if host.DeleteTimeout != 0 {
-		return host.DeleteTimeout
+// 1. Host type override from host config.
+// 2. Global default.
+func determineDeleteTimeout(host *dashboard.HostConfig) time.Duration {
+	if host.CustomDeleteTimeout != 0 {
+		return host.CustomDeleteTimeout
 	}
 
-	return timeout
+	// The value we return below is effectively a global default.
+	//
+	// The comment of CleanUpOldVMs (and CleanUpOldPodsLoop) includes:
+	//
+	//	This is the safety mechanism to delete VMs which stray from the
+	//	normal deleting process. VMs are created to run a single build and
+	//	should be shut down by a controlling process. Due to various types
+	//	of failures, they might get stranded. To prevent them from getting
+	//	stranded and wasting resources forever, we instead set the
+	//	"delete-at" metadata attribute on them when created to some time
+	//	that's well beyond their expected lifetime.
+	//
+	// Issue go.dev/issue/52929 tracks what to do about this global
+	// timeout in the long term. Unless something changes,
+	// it needs to be maintained manually so that it's always
+	// "well beyond their expected lifetime" of each builder that doesn't
+	// otherwise override this timeout—otherwise it'll cause even more
+	// resources to be used due to the automatic (and unlimited) retrying
+	// as described in go.dev/issue/42699.
+	//
+	// A global timeout of 45 minutes was chosen in 2015.
+	// Longtest builders were added in 2018 and started to reach 45 mins by 2021-2022.
+	// Try 2 hours next, which might last some years (depending on test volume and test speed).
+	return 2 * time.Hour
 }
 
 // isBuildlet checks the name string in order to determine if the name is for a buildlet.
diff --git a/internal/coordinator/pool/pool_test.go b/internal/coordinator/pool/pool_test.go
index 266d1cd..7bceb61 100644
--- a/internal/coordinator/pool/pool_test.go
+++ b/internal/coordinator/pool/pool_test.go
@@ -18,29 +18,25 @@
 	testCases := []struct {
 		desc        string
 		hostValue   time.Duration
-		timeout     time.Duration
 		wantTimeout time.Duration
 	}{
 		{
-			desc:        "from-host",
-			hostValue:   time.Minute,
-			timeout:     time.Second,
-			wantTimeout: time.Minute,
+			desc:        "default",
+			wantTimeout: 2 * time.Hour,
 		},
 		{
-			desc:        "from-argument",
-			hostValue:   0,
-			timeout:     time.Second,
-			wantTimeout: time.Second,
+			desc:        "from-host",
+			hostValue:   8 * time.Hour,
+			wantTimeout: 8 * time.Hour,
 		},
 	}
 	for _, tc := range testCases {
 		t.Run(tc.desc, func(t *testing.T) {
 			h := &dashboard.HostConfig{
-				DeleteTimeout: tc.hostValue,
+				CustomDeleteTimeout: tc.hostValue,
 			}
-			if got := determineDeleteTimeout(h, tc.timeout); got != tc.wantTimeout {
-				t.Errorf("determineDeleteTimeout(%+v, %s) = %s; want %s", h, tc.timeout, got, tc.wantTimeout)
+			if got := determineDeleteTimeout(h); got != tc.wantTimeout {
+				t.Errorf("determineDeleteTimeout(%+v) = %s; want %s", h, got, tc.wantTimeout)
 			}
 		})
 	}