cmd/coordinator,buildlet: keep active GCE SSH sessions alive
This CL skips deleting active remote buildlets.
The coordinator has multiple ways of tracking stale buildlets. For our
GCE buildlets, we periodically delete old VMs after their expiration
time, typically 45 minutes after their creation. The expiration tracking
in coordinator/gce.go does not account for remote buildlets, which are
buildlets created by users or cmd/release. Remote buildlets have their
own staleness checks and cleanup process, so we should skip the GCE
specific cleanup logic for them.
This adds an additional field to the buildlet Client in order to
correlate a GCE VM with a buildlet.
Updates golang/go#37001
Change-Id: Ib0acdf79c4dfbee6e0061c513f98b749d4b9cc64
Reviewed-on: https://go-review.googlesource.com/c/build/+/217722
Run-TryBot: Alexander Rakoczy <alex@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Dmitri Shuralyov <dmitshur@golang.org>
diff --git a/buildlet/buildletclient.go b/buildlet/buildletclient.go
index a6f0d49..d23984c 100644
--- a/buildlet/buildletclient.go
+++ b/buildlet/buildletclient.go
@@ -110,6 +110,19 @@
c.desc = v
}
+// SetGCEInstanceName sets an instance name for GCE buildlets.
+// This value differs from the buildlet name used in the CLI and web interface.
+func (c *Client) SetGCEInstanceName(v string) {
+ c.gceInstanceName = v
+}
+
+// GCEInstanceName gets an instance name for GCE buildlets.
+// This value differs from the buildlet name used in the CLI and web interface.
+// For non-GCE buildlets, this will return an empty string.
+func (c *Client) GCEInstanceName() string {
+ return c.gceInstanceName
+}
+
// SetHTTPClient replaces the underlying HTTP client.
// It should only be called before the Client is used.
func (c *Client) SetHTTPClient(httpClient *http.Client) {
@@ -141,15 +154,16 @@
// A Client interacts with a single buildlet.
type Client struct {
- ipPort string // required, unless remoteBuildlet+baseURL is set
- tls KeyPair
- httpClient *http.Client
- dialer func(context.Context) (net.Conn, error) // nil means to use net.Dialer.DialContext
- baseURL string // optional baseURL (used by remote buildlets)
- authUser string // defaults to "gomote", if password is non-empty
- password string // basic auth password or empty for none
- remoteBuildlet string // non-empty if for remote buildlets (used by client)
- name string // optional name for debugging, returned by Name
+ ipPort string // required, unless remoteBuildlet+baseURL is set
+ tls KeyPair
+ httpClient *http.Client
+ dialer func(context.Context) (net.Conn, error) // nil means to use net.Dialer.DialContext
+ baseURL string // optional baseURL (used by remote buildlets)
+ authUser string // defaults to "gomote", if password is non-empty
+ password string // basic auth password or empty for none
+ remoteBuildlet string // non-empty if for remote buildlets (used by client)
+ name string // optional name for debugging, returned by Name
+ gceInstanceName string // instance name for GCE VMs
closeFuncs []func() // optional extra code to run on close
releaseMode bool
diff --git a/cmd/coordinator/gce.go b/cmd/coordinator/gce.go
index 927b185..228dee5 100644
--- a/cmd/coordinator/gce.go
+++ b/cmd/coordinator/gce.go
@@ -353,6 +353,7 @@
}
waitBuildlet.Done(nil)
bc.SetDescription("GCE VM: " + instName)
+ bc.SetGCEInstanceName(instName)
bc.SetOnHeartbeatFailure(func() {
p.putBuildlet(bc, hostType, zone, instName)
})
@@ -543,11 +544,11 @@
// cleanZoneVMs is part of cleanUpOldVMs, operating on a single zone.
func (p *gceBuildletPool) cleanZoneVMs(zone string) error {
// Fetch the first 500 (default) running instances and clean
- // thoes. We expect that we'll be running many fewer than
+ // those. We expect that we'll be running many fewer than
// that. Even if we have more, eventually the first 500 will
// either end or be cleaned, and then the next call will get a
// partially-different 500.
- // TODO(bradfitz): revist this code if we ever start running
+ // TODO(bradfitz): revisit this code if we ever start running
// thousands of VMs.
gceAPIGate()
list, err := computeService.Instances.List(buildEnv.ProjectName, zone).Do()
@@ -559,6 +560,11 @@
// Defensive. Not seen in practice.
continue
}
+ if isGCERemoteBuildlet(inst.Name) {
+ // Remote buildlets have their own expiration mechanism that respects active SSH sessions.
+ log.Printf("cleanZoneVMs: skipping remote buildlet %q", inst.Name)
+ continue
+ }
var sawDeleteAt bool
var deleteReason string
for _, it := range inst.Metadata.Items {
diff --git a/cmd/coordinator/remote.go b/cmd/coordinator/remote.go
index 11a9bf8..6294dca 100644
--- a/cmd/coordinator/remote.go
+++ b/cmd/coordinator/remote.go
@@ -105,6 +105,17 @@
}
}
+func isGCERemoteBuildlet(instName string) bool {
+ remoteBuildlets.Lock()
+ defer remoteBuildlets.Unlock()
+ for _, rb := range remoteBuildlets.m {
+ if rb.buildlet.GCEInstanceName() == instName {
+ return true
+ }
+ }
+ return false
+}
+
func expireBuildlets() {
defer cleanTimer.Reset(remoteBuildletCleanInterval)
remoteBuildlets.Lock()