cmd/coordinator,buildlet: keep active GCE SSH sessions alive

This CL skips deleting active remote buildlets.

The coordinator has multiple ways of tracking stale buildlets. For our
GCE buildlets, we periodically delete old VMs after their expiration
time, typically 45 minutes after their creation. The expiration tracking
in coordinator/gce.go does not account for remote buildlets, which are
buildlets created by users or cmd/release. Remote buildlets have their
own staleness checks and cleanup process, so we should skip the GCE
specific cleanup logic for them.

This adds an additional field to the buildlet Client in order to
correlate a GCE VM with a buildlet.

Updates golang/go#37001

Change-Id: Ib0acdf79c4dfbee6e0061c513f98b749d4b9cc64
Reviewed-on: https://go-review.googlesource.com/c/build/+/217722
Run-TryBot: Alexander Rakoczy <alex@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Dmitri Shuralyov <dmitshur@golang.org>
diff --git a/buildlet/buildletclient.go b/buildlet/buildletclient.go
index a6f0d49..d23984c 100644
--- a/buildlet/buildletclient.go
+++ b/buildlet/buildletclient.go
@@ -110,6 +110,19 @@
 	c.desc = v
 }
 
+// SetGCEInstanceName sets an instance name for GCE buildlets.
+// This value differs from the buildlet name used in the CLI and web interface.
+func (c *Client) SetGCEInstanceName(v string) {
+	c.gceInstanceName = v
+}
+
+// GCEInstanceName gets an instance name for GCE buildlets.
+// This value differs from the buildlet name used in the CLI and web interface.
+// For non-GCE buildlets, this will return an empty string.
+func (c *Client) GCEInstanceName() string {
+	return c.gceInstanceName
+}
+
 // SetHTTPClient replaces the underlying HTTP client.
 // It should only be called before the Client is used.
 func (c *Client) SetHTTPClient(httpClient *http.Client) {
@@ -141,15 +154,16 @@
 
 // A Client interacts with a single buildlet.
 type Client struct {
-	ipPort         string // required, unless remoteBuildlet+baseURL is set
-	tls            KeyPair
-	httpClient     *http.Client
-	dialer         func(context.Context) (net.Conn, error) // nil means to use net.Dialer.DialContext
-	baseURL        string                                  // optional baseURL (used by remote buildlets)
-	authUser       string                                  // defaults to "gomote", if password is non-empty
-	password       string                                  // basic auth password or empty for none
-	remoteBuildlet string                                  // non-empty if for remote buildlets (used by client)
-	name           string                                  // optional name for debugging, returned by Name
+	ipPort          string // required, unless remoteBuildlet+baseURL is set
+	tls             KeyPair
+	httpClient      *http.Client
+	dialer          func(context.Context) (net.Conn, error) // nil means to use net.Dialer.DialContext
+	baseURL         string                                  // optional baseURL (used by remote buildlets)
+	authUser        string                                  // defaults to "gomote", if password is non-empty
+	password        string                                  // basic auth password or empty for none
+	remoteBuildlet  string                                  // non-empty if for remote buildlets (used by client)
+	name            string                                  // optional name for debugging, returned by Name
+	gceInstanceName string                                  // instance name for GCE VMs
 
 	closeFuncs  []func() // optional extra code to run on close
 	releaseMode bool
diff --git a/cmd/coordinator/gce.go b/cmd/coordinator/gce.go
index 927b185..228dee5 100644
--- a/cmd/coordinator/gce.go
+++ b/cmd/coordinator/gce.go
@@ -353,6 +353,7 @@
 	}
 	waitBuildlet.Done(nil)
 	bc.SetDescription("GCE VM: " + instName)
+	bc.SetGCEInstanceName(instName)
 	bc.SetOnHeartbeatFailure(func() {
 		p.putBuildlet(bc, hostType, zone, instName)
 	})
@@ -543,11 +544,11 @@
 // cleanZoneVMs is part of cleanUpOldVMs, operating on a single zone.
 func (p *gceBuildletPool) cleanZoneVMs(zone string) error {
 	// Fetch the first 500 (default) running instances and clean
-	// thoes. We expect that we'll be running many fewer than
+	// those. We expect that we'll be running many fewer than
 	// that. Even if we have more, eventually the first 500 will
 	// either end or be cleaned, and then the next call will get a
 	// partially-different 500.
-	// TODO(bradfitz): revist this code if we ever start running
+	// TODO(bradfitz): revisit this code if we ever start running
 	// thousands of VMs.
 	gceAPIGate()
 	list, err := computeService.Instances.List(buildEnv.ProjectName, zone).Do()
@@ -559,6 +560,11 @@
 			// Defensive. Not seen in practice.
 			continue
 		}
+		if isGCERemoteBuildlet(inst.Name) {
+			// Remote buildlets have their own expiration mechanism that respects active SSH sessions.
+			log.Printf("cleanZoneVMs: skipping remote buildlet %q", inst.Name)
+			continue
+		}
 		var sawDeleteAt bool
 		var deleteReason string
 		for _, it := range inst.Metadata.Items {
diff --git a/cmd/coordinator/remote.go b/cmd/coordinator/remote.go
index 11a9bf8..6294dca 100644
--- a/cmd/coordinator/remote.go
+++ b/cmd/coordinator/remote.go
@@ -105,6 +105,17 @@
 	}
 }
 
+func isGCERemoteBuildlet(instName string) bool {
+	remoteBuildlets.Lock()
+	defer remoteBuildlets.Unlock()
+	for _, rb := range remoteBuildlets.m {
+		if rb.buildlet.GCEInstanceName() == instName {
+			return true
+		}
+	}
+	return false
+}
+
 func expireBuildlets() {
 	defer cleanTimer.Reset(remoteBuildletCleanInterval)
 	remoteBuildlets.Lock()