cmd/coordinator, buildlet: reliability fixes
Add background heartbeats to detect dead GCE VMs (the OpenBSD
buildlets seem to hang a lot).
Add timeouts to test executions. (A usage sketch of both knobs
follows the message, before the diff.)
Take helper buildlets out of service once they're marked bad.
Keep the in-order buildlet running forever when sharding tests, in
case all the helpers die. (observed once)
Keep a cache of recently deleted VMs and don't try to delete VMs again
if we've recently deleted them. (They're slow to delete; a sketch of
such a cache also follows the message.)
Make reverse buildlets more paranoid in their health checking and
closing of the connection.
Make status page link to /try set URLs.
Also, better logging (more in some places, less in others), and misc
bug fixes.
Change-Id: I57a5e8e39381234006cac4dd799b655d64be71bb
Reviewed-on: https://go-review.googlesource.com/10981
Reviewed-by: Andrew Gerrand <adg@golang.org>
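
The diff below adds EnableHeartbeats, ExecOpts.Timeout, and ErrTimeout
to the buildlet client. Purely as an illustration (the runTests helper
and the command path are hypothetical, not how cmd/coordinator actually
wires this up), a caller might use the new knobs like so:

    package sketch

    import (
        "log"
        "os"
        "time"

        "golang.org/x/build/buildlet"
    )

    // runTests is a hypothetical coordinator-side helper showing the new
    // API surface: heartbeats catch a hung VM even when no request is in
    // flight, and Timeout bounds a single exec.
    func runTests(bc *buildlet.Client) {
        bc.EnableHeartbeats() // background pings; the client is marked broken if the peer stops answering

        remoteErr, execErr := bc.Exec("go/bin/go", buildlet.ExecOpts{
            Output:  os.Stdout,        // stream the command's output
            Timeout: 10 * time.Minute, // arbitrary bound; on expiry Exec marks the buildlet broken and returns ErrTimeout
        })
        switch {
        case execErr == buildlet.ErrTimeout:
            log.Printf("buildlet timed out and was marked broken")
        case execErr != nil:
            log.Printf("communication error talking to buildlet: %v", execErr)
        case remoteErr != nil:
            log.Printf("command failed on the buildlet: %v", remoteErr)
        }
    }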
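
The "recently deleted VMs" cache lives in cmd/coordinator and is not
part of this diff. Purely as a sketch of the idea, with hypothetical
names and an arbitrary expiry:

    package sketch

    import (
        "sync"
        "time"
    )

    // deletedVMCache is a hypothetical illustration: remember which VM
    // instances we've already asked GCE to delete, so slow deletions
    // don't trigger redundant delete calls. Entries expire so a VM
    // re-created under the same name can be deleted again later.
    type deletedVMCache struct {
        mu      sync.Mutex
        deleted map[string]time.Time // instance name -> when we issued the delete
    }

    // shouldDelete reports whether a delete should be issued for name,
    // recording the attempt if so.
    func (c *deletedVMCache) shouldDelete(name string) bool {
        c.mu.Lock()
        defer c.mu.Unlock()
        if t, ok := c.deleted[name]; ok && time.Since(t) < 30*time.Minute {
            return false // a delete was issued recently; let GCE finish
        }
        if c.deleted == nil {
            c.deleted = make(map[string]time.Time)
        }
        c.deleted[name] = time.Now()
        return true
    }
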
diff --git a/buildlet/buildletclient.go b/buildlet/buildletclient.go
index 1400eb1..7ca7b22 100644
--- a/buildlet/buildletclient.go
+++ b/buildlet/buildletclient.go
@@ -33,6 +33,7 @@
ipPort: ipPort,
tls: kp,
password: kp.Password(),
+ peerDead: make(chan struct{}),
httpClient: &http.Client{
Transport: &http.Transport{
Dial: defaultDialer(),
@@ -49,6 +50,7 @@
}
func (c *Client) Close() error {
+ c.setPeerDead(errors.New("Close called"))
var err error
if c.closeFunc != nil {
err = c.closeFunc()
@@ -57,6 +59,14 @@
return err
}
+// setPeerDead marks the peer as dead, closing c.peerDead; only the first call has any effect.
+func (c *Client) setPeerDead(err error) {
+ c.setPeerDeadOnce.Do(func() {
+ c.deadErr = err
+ close(c.peerDead)
+ })
+}
+
// SetDescription sets a short description of where the buildlet
// connection came from. This is used by the build coordinator status
// page, mostly for debugging.
@@ -65,10 +75,21 @@
}
// SetHTTPClient replaces the underlying HTTP client.
+// It should only be called before the Client is used.
func (c *Client) SetHTTPClient(httpClient *http.Client) {
c.httpClient = httpClient
}
+// EnableHeartbeats enables background heartbeating
+// against the peer.
+// It should only be called before the Client is used.
+func (c *Client) EnableHeartbeats() {
+ // TODO(bradfitz): make this always enabled, once the
+ // reverse buildlet connection model supports
+ // multiple connections at once.
+ c.heartbeat = true
+}
+
// defaultDialer returns the net/http package's default Dial function.
// Notably, this sets TCP keep-alive values, so when we kill VMs
// (whose TCP stacks stop replying, forever), we don't leak file
@@ -86,10 +107,16 @@
tls KeyPair
password string // basic auth password or empty for none
httpClient *http.Client
+ heartbeat bool // whether to heartbeat in the background
closeFunc func() error
desc string
+ initHeartbeatOnce sync.Once
+ setPeerDeadOnce sync.Once
+ peerDead chan struct{} // closed on peer death
+ deadErr error // guarded by peerDead's close
+
mu sync.Mutex
broken bool // client is broken in some way
}
@@ -126,12 +153,41 @@
}
func (c *Client) do(req *http.Request) (*http.Response, error) {
+ c.initHeartbeatOnce.Do(c.initHeartbeats)
if c.password != "" {
req.SetBasicAuth("gomote", c.password)
}
return c.httpClient.Do(req)
}
+func (c *Client) initHeartbeats() {
+ if !c.heartbeat {
+ // TODO(bradfitz): make this always enabled later, once
+ // reverse buildlets are fixed.
+ return
+ }
+ go c.heartbeatLoop()
+}
+
+func (c *Client) heartbeatLoop() {
+ for {
+ select {
+ case <-c.peerDead:
+ // Already dead by something else.
+ // Most likely: c.Close was called.
+ return
+ case <-time.After(10 * time.Second):
+ t0 := time.Now()
+ if _, err := c.Status(); err != nil {
+ err := fmt.Errorf("Buildlet %v failed heartbeat after %v; marking dead; err=%v", c, time.Since(t0), err)
+ c.MarkBroken()
+ c.setPeerDead(err)
+ return
+ }
+ }
+ }
+}
+
var errHeaderTimeout = errors.New("timeout waiting for headers")
// doHeaderTimeout calls c.do(req) and returns its results, or
@@ -150,15 +206,20 @@
timer := time.NewTimer(max)
defer timer.Stop()
+ cleanup := func() {
+ if re := <-resErrc; re.res != nil {
+ re.res.Body.Close()
+ }
+ }
+
select {
case re := <-resErrc:
return re.res, re.err
+ case <-c.peerDead:
+ go cleanup()
+ return nil, c.deadErr
case <-timer.C:
- go func() {
- if re := <-resErrc; re.res != nil {
- res.Body.Close()
- }
- }()
+ go cleanup()
return nil, errHeaderTimeout
}
}
@@ -279,8 +340,13 @@
// response from the buildlet, but before the output begins
// writing to Output.
OnStartExec func()
+
+ // Timeout is an optional duration before ErrTimeout is returned.
+ Timeout time.Duration
}
+var ErrTimeout = errors.New("buildlet: timeout waiting for command to complete")
+
// Exec runs cmd on the buildlet.
//
// Two errors are returned: one is whether the command succeeded
@@ -315,9 +381,9 @@
req.Header.Set("Content-Type", "application/x-www-form-urlencoded")
// The first thing the buildlet's exec handler does is flush the headers, so
- // 5 seconds should be plenty of time, regardless of where on the planet
+ // 10 seconds should be plenty of time, regardless of where on the planet
// (Atlanta, Paris, etc) the reverse buildlet is:
- res, err := c.doHeaderTimeout(req, 5*time.Second)
+ res, err := c.doHeaderTimeout(req, 10*time.Second)
if err == errHeaderTimeout {
c.MarkBroken()
return nil, errors.New("buildlet: timeout waiting for exec header response")
@@ -332,28 +398,52 @@
}
condRun(opts.OnStartExec)
- // Stream the output:
- out := opts.Output
- if out == nil {
- out = ioutil.Discard
+ type errs struct {
+ remoteErr, execErr error
}
- if _, err := io.Copy(out, res.Body); err != nil {
- return nil, fmt.Errorf("error copying response: %v", err)
- }
+ resc := make(chan errs, 1)
+ go func() {
+ // Stream the output:
+ out := opts.Output
+ if out == nil {
+ out = ioutil.Discard
+ }
+ if _, err := io.Copy(out, res.Body); err != nil {
+ resc <- errs{execErr: fmt.Errorf("error copying response: %v", err)}
+ return
+ }
- // Don't record to the dashboard unless we heard the trailer from
- // the buildlet, otherwise it was probably some unrelated error
- // (like the VM being killed, or the buildlet crashing due to
- // e.g. https://golang.org/issue/9309, since we require a tip
- // build of the buildlet to get Trailers support)
- state := res.Trailer.Get("Process-State")
- if state == "" {
- return nil, errors.New("missing Process-State trailer from HTTP response; buildlet built with old (<= 1.4) Go?")
+ // Don't record to the dashboard unless we heard the trailer from
+ // the buildlet, otherwise it was probably some unrelated error
+ // (like the VM being killed, or the buildlet crashing due to
+ // e.g. https://golang.org/issue/9309, since we require a tip
+ // build of the buildlet to get Trailers support)
+ state := res.Trailer.Get("Process-State")
+ if state == "" {
+ resc <- errs{execErr: errors.New("missing Process-State trailer from HTTP response; buildlet built with old (<= 1.4) Go?")}
+ return
+ }
+ if state != "ok" {
+ resc <- errs{remoteErr: errors.New(state)}
+ } else {
+ resc <- errs{} // success
+ }
+ }()
+ var timer <-chan time.Time
+ if opts.Timeout > 0 {
+ t := time.NewTimer(opts.Timeout)
+ defer t.Stop()
+ timer = t.C
}
- if state != "ok" {
- return errors.New(state), nil
+ select {
+ case <-timer:
+ c.MarkBroken()
+ return nil, ErrTimeout
+ case res := <-resc:
+ return res.remoteErr, res.execErr
+ case <-c.peerDead:
+ return nil, c.deadErr
}
- return nil, nil
}
// Destroy shuts down the buildlet, destroying all state immediately.
@@ -437,7 +527,7 @@
if err != nil {
return Status{}, err
}
- resp, err := c.do(req)
+ resp, err := c.doHeaderTimeout(req, 10*time.Second) // plenty of time
if err != nil {
return Status{}, err
}
@@ -462,7 +552,7 @@
if err != nil {
return "", err
}
- resp, err := c.do(req)
+ resp, err := c.doHeaderTimeout(req, 10*time.Second) // plenty of time
if err != nil {
return "", err
}