buildletclient: simplify Close

Now Close always means destroy.

The old code & API was a half-baked implementation of a (lack of)
design where there was a difference between being done with a buildlet
(with it possibly being reused by somebody else?) and you wanting to
nuke it completely. Unfortunately this just grew messier and more
broken over time.

This attempts to clean it all up.

We can add the sharing ideas back later when there's actually a design
and implementation. (We're not losing anything with this CL, because
nothing ever shared buildlets)

In fact, this CL fixes a problem where reverse buildlets weren't
getting their underlying net.Conns closed when their healthchecks
failed, leading to 4+ hour (and counting) build hangs due to buildlets
getting killed at inopportune moments (e.g. me testing running a
reverse buildlet on my home mac to help out with the dashboard
backlog)

Change-Id: I07be09f4d5f0f09d35e51e41c48b1296b71bb9b5
Reviewed-on: https://go-review.googlesource.com/14585
Reviewed-by: David Crawshaw <crawshaw@golang.org>
Reviewed-by: Andrew Gerrand <adg@golang.org>
diff --git a/buildlet/buildletclient.go b/buildlet/buildletclient.go
index 536dbff..a3a9832 100644
--- a/buildlet/buildletclient.go
+++ b/buildlet/buildletclient.go
@@ -48,25 +48,38 @@
 	c.peerDead = make(chan struct{})
 }
 
-// SetCloseFunc sets a function to be called when c.Close is called.
-// SetCloseFunc must not be called concurrently with Close.
-func (c *Client) SetCloseFunc(fn func() error) {
-	c.closeFunc = fn
+// SetOnHeartbeatFailure sets a function to be called when heartbeats
+// against this builder fail, or when the client is destroyed with
+// Close. The function fn is never called more than once.
+// SetOnHeartbeatFailure must be set before any use of the buildlet.
+func (c *Client) SetOnHeartbeatFailure(fn func()) {
+	c.heartbeatFailure = fn
 }
 
 var ErrClosed = errors.New("buildlet: Client closed")
 
+// Closes destroys and closes down the buildlet, destroying all state
+// immediately.
 func (c *Client) Close() error {
-	c.setPeerDead(ErrClosed) // TODO(bradfitz): split concept of closed vs. broken?
-	var err error
-	if c.closeFunc != nil {
-		err = c.closeFunc()
-		c.closeFunc = nil
-	}
-	return err
+	c.closeOnce.Do(func() {
+		// Send a best-effort notification to the server to destroy itself.
+		// Don't want too long (since it's likely in a broken state anyway).
+		// Ignore the return value, since we're about to forcefully destroy
+		// it anyway.
+		req, err := http.NewRequest("POST", c.URL()+"/halt", nil)
+		if err != nil {
+			// ignore.
+		} else {
+			_, err = c.doHeaderTimeout(req, 2*time.Second)
+		}
+		if err == nil {
+			err = ErrClosed
+		}
+		c.setPeerDead(err) // which will also cause c.heartbeatFailure to run
+	})
+	return nil
 }
 
-// To be called only via c.setPeerDeadOnce.Do(s.setPeerDead)
 func (c *Client) setPeerDead(err error) {
 	c.setPeerDeadOnce.Do(func() {
 		c.MarkBroken()
@@ -112,9 +125,10 @@
 	password       string // basic auth password or empty for none
 	remoteBuildlet string // non-empty if for remote buildlets
 
-	closeFunc func() error
-	desc      string
+	heartbeatFailure func() // optional
+	desc             string
 
+	closeOnce         sync.Once
 	initHeartbeatOnce sync.Once
 	setPeerDeadOnce   sync.Once
 	peerDead          chan struct{} // closed on peer death
@@ -214,17 +228,20 @@
 	for {
 		select {
 		case <-c.peerDead:
-			// Already dead by something else.
-			// Most likely: c.Close was called.
+			// Dead for whatever reason (heartbeat, remote
+			// side closed, caller Closed
+			// normally). Regardless, we call the
+			// heartbeatFailure func if set.
+			if c.heartbeatFailure != nil {
+				c.heartbeatFailure()
+			}
 			return
 		case <-time.After(10 * time.Second):
 			t0 := time.Now()
 			if _, err := c.Status(); err != nil {
 				failInARow++
 				if failInARow == 3 {
-					c.MarkBroken()
 					c.setPeerDead(fmt.Errorf("Buildlet %v failed heartbeat after %v; marking dead; err=%v", c, time.Since(t0), err))
-					return
 				}
 			} else {
 				failInARow = 0
@@ -491,15 +508,6 @@
 	}
 }
 
-// Destroy shuts down the buildlet, destroying all state immediately.
-func (c *Client) Destroy() error {
-	req, err := http.NewRequest("POST", c.URL()+"/halt", nil)
-	if err != nil {
-		return err
-	}
-	return c.doOK(req)
-}
-
 // RemoveAll deletes the provided paths, relative to the work directory.
 func (c *Client) RemoveAll(paths ...string) error {
 	if len(paths) == 0 {
@@ -516,13 +524,14 @@
 
 // DestroyVM shuts down the buildlet and destroys the VM instance.
 func (c *Client) DestroyVM(ts oauth2.TokenSource, proj, zone, instance string) error {
+	// TODO(bradfitz): move GCE stuff out of this package?
 	gceErrc := make(chan error, 1)
 	buildletErrc := make(chan error, 1)
 	go func() {
 		gceErrc <- DestroyVM(ts, proj, zone, instance)
 	}()
 	go func() {
-		buildletErrc <- c.Destroy()
+		buildletErrc <- c.Close()
 	}()
 	timeout := time.NewTimer(5 * time.Second)
 	defer timeout.Stop()
diff --git a/cmd/coordinator/coordinator.go b/cmd/coordinator/coordinator.go
index a344714..4a79e52 100644
--- a/cmd/coordinator/coordinator.go
+++ b/cmd/coordinator/coordinator.go
@@ -1240,7 +1240,6 @@
 		return fmt.Errorf("failed to get a buildlet: %v", err)
 	}
 	defer bc.Close()
-	defer nukeIfBroken(bc)
 	st.mu.Lock()
 	st.bc = bc
 	st.mu.Unlock()
@@ -1966,7 +1965,6 @@
 			go func(bc *buildlet.Client) {
 				defer st.logEventTime("closed_helper", bc.Name())
 				defer bc.Close()
-				defer nukeIfBroken(bc)
 				if devPause {
 					defer time.Sleep(5 * time.Minute)
 					defer st.logEventTime("DEV_HELPER_SLEEP", bc.Name())
@@ -2642,12 +2640,4 @@
 	return slurp, nil
 }
 
-func nukeIfBroken(bc *buildlet.Client) {
-	if bc.IsBroken() {
-		// It may not have come from the reverse pool, but it's harmless if
-		// it didn't.
-		reversePool.nukeBuildlet(bc)
-	}
-}
-
 var nl = []byte("\n")
diff --git a/cmd/coordinator/gce.go b/cmd/coordinator/gce.go
index fc75a02..a2b87d4 100644
--- a/cmd/coordinator/gce.go
+++ b/cmd/coordinator/gce.go
@@ -266,7 +266,9 @@
 		return nil, err
 	}
 	bc.SetDescription("GCE VM: " + instName)
-	bc.SetCloseFunc(func() error { return p.putBuildlet(bc, typ, instName) })
+	bc.SetOnHeartbeatFailure(func() {
+		p.putBuildlet(bc, typ, instName)
+	})
 	return bc, nil
 }
 
@@ -274,8 +276,12 @@
 	// TODO(bradfitz): add the buildlet to a freelist (of max N
 	// items) for up to 10 minutes since when it got started if
 	// it's never seen a command execution failure, and we can
-	// wipe all its disk content. (perhaps wipe its disk content when
-	// it's retrieved, not put back on the freelist)
+	// wipe all its disk content? (perhaps wipe its disk content
+	// when it's retrieved, like the reverse buildlet pool) But
+	// this will require re-introducing a distinction in the
+	// buildlet client library between Close, Destroy/Halt, and
+	// tracking execution errors.  That was all half-baked before
+	// and thus removed. Now Close always destroys everything.
 	deleteVM(projectZone, instName)
 	p.setInstanceUsed(instName, false)
 
diff --git a/cmd/coordinator/remote.go b/cmd/coordinator/remote.go
index bf6b1f5..41dd59d 100644
--- a/cmd/coordinator/remote.go
+++ b/cmd/coordinator/remote.go
@@ -66,11 +66,6 @@
 	}
 }
 
-func destroyCloseBuildlet(bc *buildlet.Client) {
-	bc.Destroy()
-	bc.Close()
-}
-
 func expireBuildlets() {
 	defer cleanTimer.Reset(remoteBuildletCleanInterval)
 	remoteBuildlets.Lock()
@@ -78,7 +73,7 @@
 	now := time.Now()
 	for name, rb := range remoteBuildlets.m {
 		if !rb.Expires.IsZero() && rb.Expires.Before(now) {
-			go destroyCloseBuildlet(rb.buildlet)
+			go rb.buildlet.Close()
 			delete(remoteBuildlets.m, name)
 		}
 	}
@@ -261,7 +256,7 @@
 	}
 
 	if r.Method == "POST" && r.URL.Path == "/halt" {
-		err := rb.buildlet.Destroy()
+		err := rb.buildlet.Close()
 		if err != nil {
 			http.Error(w, err.Error(), http.StatusInternalServerError)
 		}
diff --git a/cmd/coordinator/reverse.go b/cmd/coordinator/reverse.go
index 96e7953..36bd9af 100644
--- a/cmd/coordinator/reverse.go
+++ b/cmd/coordinator/reverse.go
@@ -79,10 +79,6 @@
 			// Found an unused match.
 			b.inUseAs = machineType
 			b.inUseTime = time.Now()
-			b.client.SetCloseFunc(func() error {
-				p.nukeBuildlet(b.client)
-				return nil
-			})
 			return b.client, nil
 		}
 	}
@@ -441,6 +437,7 @@
 		},
 	})
 	client.SetDescription(fmt.Sprintf("reverse peer %s/%s for modes %v", hostname, r.RemoteAddr, modes))
+	client.SetOnHeartbeatFailure(func() { conn.Close() })
 	tstatus := time.Now()
 	status, err := client.Status()
 	if err != nil {
diff --git a/cmd/gomote/destroy.go b/cmd/gomote/destroy.go
index 27a2bf2..9fbc016 100644
--- a/cmd/gomote/destroy.go
+++ b/cmd/gomote/destroy.go
@@ -27,5 +27,5 @@
 	if err != nil {
 		return err
 	}
-	return bc.Destroy()
+	return bc.Close()
 }
diff --git a/cmd/release/release.go b/cmd/release/release.go
index df04622..5b8fc7d 100644
--- a/cmd/release/release.go
+++ b/cmd/release/release.go
@@ -216,9 +216,6 @@
 	if err != nil {
 		return nil, err
 	}
-	bc.SetCloseFunc(func() error {
-		return bc.Destroy()
-	})
 	return bc, nil
 }