buildlet: fix Exec to return ErrTimeout on timeout
The coordinator relies on Exec reporting that the given timeout was
exceeded in order to mark a build as failed instead of retrying it.
A refactor resulted in Exec no longer doing that, despite what its
documentation promises, so fix that.
Also add a test since evidence shows that catching a regression can
be helpful.
For golang/go#42699.
Updates golang/go#35707.
Change-Id: Iacef90b83e7b81fad88a33baa6489d5157e3528f
Reviewed-on: https://go-review.googlesource.com/c/build/+/407555
Reviewed-by: Carlos Amedee <carlos@golang.org>
Reviewed-by: Bryan Mills <bcmills@google.com>
Run-TryBot: Dmitri Shuralyov <dmitshur@golang.org>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
Auto-Submit: Dmitri Shuralyov <dmitshur@golang.org>
TryBot-Result: Gopher Robot <gobot@golang.org>
diff --git a/buildlet/buildletclient.go b/buildlet/buildletclient.go
index bd352db..573dc87 100644
--- a/buildlet/buildletclient.go
+++ b/buildlet/buildletclient.go
@@ -509,6 +509,8 @@
OnStartExec func()
}
+// ErrTimeout is a sentinel error that represents that waiting
+// for a command to complete has exceeded the given timeout.
var ErrTimeout = errors.New("buildlet: timeout waiting for command to complete")
// Exec runs cmd on the buildlet.
@@ -519,8 +521,8 @@
// seen to completition. If execErr is non-nil, the remoteErr is
// meaningless.
//
-// If the context's deadline is exceeded, the returned execErr is
-// ErrTimeout.
+// If the context's deadline is exceeded while waiting for the command
+// to complete, the returned execErr is ErrTimeout.
func (c *client) Exec(ctx context.Context, cmd string, opts ExecOpts) (remoteErr, execErr error) {
var mode string
if opts.SystemLevel {
@@ -553,10 +555,11 @@
// (Atlanta, Paris, Sydney, etc.) the reverse buildlet is:
res, err := c.doHeaderTimeout(req, 20*time.Second)
if err == errHeaderTimeout {
+ // If we don't see headers after all that time,
+ // consider the buildlet to be unhealthy.
c.MarkBroken()
return nil, errors.New("buildlet: timeout waiting for exec header response")
- }
- if err != nil {
+ } else if err != nil {
return nil, err
}
defer res.Body.Close()
@@ -577,7 +580,7 @@
out = ioutil.Discard
}
if _, err := io.Copy(out, res.Body); err != nil {
- resc <- errs{execErr: fmt.Errorf("error copying response: %v", err)}
+ resc <- errs{execErr: fmt.Errorf("error copying response: %w", err)}
return
}
@@ -600,10 +603,15 @@
select {
case res := <-resc:
if res.execErr != nil {
+ // Note: We've historically marked the buildlet as unhealthy after
+ // reaching any kind of execution error, even when it's a remote command
+ // execution timeout (see use of ErrTimeout below).
+ // This is certainly on the safer side of avoiding false positive signal,
+ // but maybe someday we'll want to start to rely on the buildlet to report
+ // such a condition and not mark it as unhealthy.
+
c.MarkBroken()
- if res.execErr == context.DeadlineExceeded {
- // Historical pre-context value.
- // TODO: update docs & callers to just use the context value.
+ if errors.Is(res.execErr, context.DeadlineExceeded) {
res.execErr = ErrTimeout
}
}
diff --git a/buildlet/buildletclient_test.go b/buildlet/buildletclient_test.go
index 289daca..9315bb6 100644
--- a/buildlet/buildletclient_test.go
+++ b/buildlet/buildletclient_test.go
@@ -7,10 +7,12 @@
import (
"context"
"crypto/tls"
+ "encoding/json"
"errors"
"net"
"net/http"
"net/http/httptest"
+ "net/url"
"strings"
"testing"
)
@@ -171,3 +173,55 @@
}
return kp
}
+
+// Test that Exec returns ErrTimeout upon reaching the context timeout
+// during command execution, as its documentation promises.
+func TestExecTimeoutError(t *testing.T) {
+ mux := http.NewServeMux()
+ mux.HandleFunc("/status", func(w http.ResponseWriter, req *http.Request) {
+ json.NewEncoder(w).Encode(Status{})
+ })
+ mux.HandleFunc("/exec", func(w http.ResponseWriter, req *http.Request) {
+ w.Write([]byte("."))
+ w.(http.Flusher).Flush() // /exec needs to flush headers right away.
+ <-req.Context().Done() // Simulate that execution hangs, so no more output.
+ })
+ ts := httptest.NewServer(mux)
+ defer ts.Close()
+ u, err := url.Parse(ts.URL)
+ if err != nil {
+ t.Fatalf("unable to parse http server url %s", err)
+ }
+ cl := NewClient(u.Host, NoKeyPair)
+ defer cl.Close()
+
+ // Use a custom context that reports context.DeadlineExceeded
+ // after Exec starts command execution. (context.WithTimeout
+ // requires us to select an arbitrary duration, which might
+ // not be long enough or will make the test take too long.)
+ ctx := deadlineOnDemandContext{
+ Context: context.Background(),
+ done: make(chan struct{}),
+ }
+ _, execErr := cl.Exec(ctx, "./bin/test", ExecOpts{
+ OnStartExec: func() { close(ctx.done) },
+ })
+ if execErr != ErrTimeout {
+ t.Errorf("cl.Exec error = %v; want %v", execErr, ErrTimeout)
+ }
+}
+
+type deadlineOnDemandContext struct {
+ context.Context
+ done chan struct{}
+}
+
+func (c deadlineOnDemandContext) Done() <-chan struct{} { return c.done }
+func (c deadlineOnDemandContext) Err() error {
+ select {
+ default:
+ return nil
+ case <-c.done:
+ return context.DeadlineExceeded
+ }
+}