all: improved monitoring of buildlet pods after creation
* Replaced cancel with context.Context
* StartPod can be canceled
* Wait for buildlet to come online, but fail fast if pod fails first
* Support timeout waiting for pod to leave pending phase
* Use Kubernetes watch API (long poll)
Updates golang/go#12546
Change-Id: I792a3b8fed615362a0290feee7de0c2cefe43c0e
Reviewed-on: https://go-review.googlesource.com/15285
Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
diff --git a/cmd/coordinator/coordinator.go b/cmd/coordinator/coordinator.go
index 1a08a18..aa1d2ba 100644
--- a/cmd/coordinator/coordinator.go
+++ b/cmd/coordinator/coordinator.go
@@ -40,6 +40,7 @@
"golang.org/x/build/internal/singleflight"
"golang.org/x/build/livelog"
"golang.org/x/build/types"
+ "golang.org/x/net/context"
"google.golang.org/cloud/storage"
)
@@ -876,7 +877,7 @@
for {
timeout := time.NewTimer(10 * time.Minute)
select {
- case <-bs.donec:
+ case <-bs.ctx.Done():
timeout.Stop()
break WaitCh
case <-timeout.C:
@@ -1023,22 +1024,6 @@
logEventTime(event string, optText ...string)
}
-var ErrCanceled = errors.New("canceled")
-
-// Cancel is a channel that's closed by the caller when the request is no longer
-// desired. The function receiving a cancel should return ErrCanceled whenever
-// Cancel becomes readable.
-type Cancel <-chan struct{}
-
-func (c Cancel) IsCanceled() bool {
- select {
- case <-c:
- return true
- default:
- return false
- }
-}
-
type BuildletPool interface {
// GetBuildlet returns a new buildlet client.
//
@@ -1048,23 +1033,23 @@
// anything except for log messages or VM naming.
//
// Clients must Close when done with the client.
- GetBuildlet(cancel Cancel, machineType, rev string, el eventTimeLogger) (*buildlet.Client, error)
+ GetBuildlet(ctx context.Context, machineType, rev string, el eventTimeLogger) (*buildlet.Client, error)
String() string // TODO(bradfitz): more status stuff
}
// GetBuildlets creates up to n buildlets and sends them on the returned channel
// before closing the channel.
-func GetBuildlets(cancel Cancel, pool BuildletPool, n int, machineType, rev string, el eventTimeLogger) <-chan *buildlet.Client {
+func GetBuildlets(ctx context.Context, pool BuildletPool, n int, machineType, rev string, el eventTimeLogger) <-chan *buildlet.Client {
ch := make(chan *buildlet.Client) // NOT buffered
var wg sync.WaitGroup
wg.Add(n)
for i := 0; i < n; i++ {
go func() {
defer wg.Done()
- bc, err := pool.GetBuildlet(cancel, machineType, rev, el)
+ bc, err := pool.GetBuildlet(ctx, machineType, rev, el)
if err != nil {
- if err != ErrCanceled {
+ if err != context.Canceled {
log.Printf("failed to get a %s buildlet for rev %s: %v", machineType, rev, err)
}
return
@@ -1072,7 +1057,7 @@
el.logEventTime("empty_helper_ready", bc.Name())
select {
case ch <- bc:
- case <-cancel:
+ case <-ctx.Done():
el.logEventTime("helper_killed_before_use", bc.Name())
bc.Close()
return
@@ -1104,18 +1089,20 @@
if !ok {
return nil, fmt.Errorf("unknown builder type %q", rev.name)
}
+ ctx, cancel := context.WithCancel(context.Background())
return &buildStatus{
builderRev: rev,
conf: conf,
- donec: make(chan struct{}),
startTime: time.Now(),
+ ctx: ctx,
+ cancel: cancel,
}, nil
}
// start sets the st.startTime and starts the build in a new goroutine.
// If it returns an error, st is not modified and a new goroutine has not
// been started.
-// The build status's donec channel is closed on when the build is complete
+// The build status's context is closed on when the build is complete
// in either direction.
func (st *buildStatus) start() {
setStatus(st.builderRev, st)
@@ -1221,7 +1208,7 @@
func (st *buildStatus) onceInitHelpersFunc() {
pool, _ := st.buildletPool() // won't return an error since we called it already
- st.helpers = GetBuildlets(st.donec, pool, st.conf.NumTestHelpers, st.buildletType(), st.rev, st)
+ st.helpers = GetBuildlets(st.ctx, pool, st.conf.NumTestHelpers, st.buildletType(), st.rev, st)
}
// We should try to build from a snapshot if this is a subrepo build, we can
@@ -1243,7 +1230,7 @@
return err
}
st.logEventTime("get_buildlet")
- bc, err := pool.GetBuildlet(nil, st.buildletType(), st.rev, st)
+ bc, err := pool.GetBuildlet(st.ctx, st.buildletType(), st.rev, st)
if err != nil {
return fmt.Errorf("failed to get a buildlet: %v", err)
}
@@ -2011,7 +1998,7 @@
// lumpy. The rest of the buildlets run the largest tests
// first (critical path scheduling).
// The buildletActivity WaitGroup is used to track when all
- // the buildlets are dead or don.
+ // the buildlets are dead or done.
var buildletActivity sync.WaitGroup
buildletActivity.Add(2) // one per goroutine below (main + helper launcher goroutine)
go func() {
@@ -2020,7 +2007,7 @@
tis, ok := set.testsToRunInOrder()
if !ok {
select {
- case <-st.donec:
+ case <-st.ctx.Done():
return
case <-time.After(5 * time.Second):
}
@@ -2394,12 +2381,13 @@
// Immutable:
builderRev
conf dashboard.BuildConfig
- startTime time.Time // actually time of newBuild (~same thing)
- trySet *trySet // or nil
- donec chan struct{} // closed when done
+ startTime time.Time // actually time of newBuild (~same thing)
+ trySet *trySet // or nil
onceInitHelpers sync.Once // guards call of onceInitHelpersFunc, to init::
helpers <-chan *buildlet.Client
+ ctx context.Context // used to start the build
+ cancel context.CancelFunc // used to cancel context; for use by setDone only
mu sync.Mutex // guards following
failURL string // if non-empty, permanent URL of failure
@@ -2418,7 +2406,7 @@
st.succeeded = succeeded
st.done = time.Now()
st.output.Close()
- close(st.donec)
+ st.cancel()
}
func (st *buildStatus) isRunning() bool {
diff --git a/cmd/coordinator/gce.go b/cmd/coordinator/gce.go
index ab0a702..337f85c 100644
--- a/cmd/coordinator/gce.go
+++ b/cmd/coordinator/gce.go
@@ -206,13 +206,13 @@
p.disabled = !enabled
}
-func (p *gceBuildletPool) GetBuildlet(cancel Cancel, typ, rev string, el eventTimeLogger) (*buildlet.Client, error) {
+func (p *gceBuildletPool) GetBuildlet(ctx context.Context, typ, rev string, el eventTimeLogger) (*buildlet.Client, error) {
el.logEventTime("awaiting_gce_quota")
conf, ok := dashboard.Builders[typ]
if !ok {
return nil, fmt.Errorf("gcepool: unknown buildlet type %q", typ)
}
- if err := p.awaitVMCountQuota(cancel, conf.GCENumCPU()); err != nil {
+ if err := p.awaitVMCountQuota(ctx, conf.GCENumCPU()); err != nil {
return nil, err
}
@@ -331,8 +331,8 @@
}
// awaitVMCountQuota waits for numCPU CPUs of quota to become available,
-// or returns ErrCanceled.
-func (p *gceBuildletPool) awaitVMCountQuota(cancel Cancel, numCPU int) error {
+// or returns ctx.Err.
+func (p *gceBuildletPool) awaitVMCountQuota(ctx context.Context, numCPU int) error {
// Poll every 2 seconds, which could be better, but works and
// is simple.
for {
@@ -341,8 +341,8 @@
}
select {
case <-time.After(2 * time.Second):
- case <-cancel:
- return ErrCanceled
+ case <-ctx.Done():
+ return ctx.Err()
}
}
}
diff --git a/cmd/coordinator/kube.go b/cmd/coordinator/kube.go
index 5742129..0dba4bf 100644
--- a/cmd/coordinator/kube.go
+++ b/cmd/coordinator/kube.go
@@ -19,6 +19,7 @@
"golang.org/x/build/buildlet"
"golang.org/x/build/dashboard"
"golang.org/x/build/kubernetes"
+ "golang.org/x/net/context"
"golang.org/x/oauth2"
container "google.golang.org/api/container/v1"
)
@@ -114,7 +115,7 @@
mu sync.Mutex
}
-func (p *kubeBuildletPool) GetBuildlet(cancel Cancel, typ, rev string, el eventTimeLogger) (*buildlet.Client, error) {
+func (p *kubeBuildletPool) GetBuildlet(ctx context.Context, typ, rev string, el eventTimeLogger) (*buildlet.Client, error) {
conf, ok := dashboard.Builders[typ]
if !ok || conf.KubeImage == "" {
return nil, fmt.Errorf("kubepool: invalid builder type %q", typ)
@@ -146,9 +147,9 @@
var needDelete bool
el.logEventTime("creating_kube_pod", podName)
- log.Printf("Creating Kubernetes pod %q for %s at %s", podName, typ, rev)
+ log.Printf("Creating Kubernetes pod %q for %s at %s", podName, typ, rev)
- bc, err := buildlet.StartPod(kubeClient, podName, typ, buildlet.PodOpts{
+ bc, err := buildlet.StartPod(ctx, kubeClient, podName, typ, buildlet.PodOpts{
ImageRegistry: registryPrefix,
Description: fmt.Sprintf("Go Builder for %s at %s", typ, rev),
DeleteIn: deleteIn,
diff --git a/cmd/coordinator/remote.go b/cmd/coordinator/remote.go
index 41dd59d..a79aa16 100644
--- a/cmd/coordinator/remote.go
+++ b/cmd/coordinator/remote.go
@@ -21,6 +21,7 @@
"golang.org/x/build/buildlet"
"golang.org/x/build/dashboard"
+ "golang.org/x/net/context"
)
var (
@@ -106,12 +107,13 @@
if cn, ok := w.(http.CloseNotifier); ok {
closeNotify = cn.CloseNotify()
}
- cancel := make(chan struct{})
+
+ ctx, cancel := context.WithCancel(context.Background())
resc := make(chan *buildlet.Client)
errc := make(chan error)
go func() {
- bc, err := pool.GetBuildlet(cancel, typ, rev, eventTimeLoggerFunc(func(event string, optText ...string) {
+ bc, err := pool.GetBuildlet(ctx, typ, rev, eventTimeLoggerFunc(func(event string, optText ...string) {
var extra string
if len(optText) > 0 {
extra = " " + optText[0]
@@ -152,7 +154,7 @@
return
case <-closeNotify:
log.Printf("client went away during buildlet create request")
- close(cancel)
+ cancel()
closeNotify = nil // unnecessary, but habit.
}
}
diff --git a/cmd/coordinator/reverse.go b/cmd/coordinator/reverse.go
index 944f356..b6f3236 100644
--- a/cmd/coordinator/reverse.go
+++ b/cmd/coordinator/reverse.go
@@ -43,6 +43,7 @@
"golang.org/x/build/buildlet"
"golang.org/x/build/revdial"
+ "golang.org/x/net/context"
)
const minBuildletVersion = 1
@@ -185,7 +186,7 @@
return c
}
-func (p *reverseBuildletPool) GetBuildlet(cancel Cancel, machineType, rev string, el eventTimeLogger) (*buildlet.Client, error) {
+func (p *reverseBuildletPool) GetBuildlet(ctx context.Context, machineType, rev string, el eventTimeLogger) (*buildlet.Client, error) {
seenErrInUse := false
for {
b, err := p.tryToGrab(machineType)
@@ -200,8 +201,8 @@
log.Printf("Rev %q is waiting high-priority", rev)
}
select {
- case <-cancel:
- return nil, ErrCanceled
+ case <-ctx.Done():
+ return nil, ctx.Err()
case bc := <-highPri:
log.Printf("Rev %q stole a high-priority one.", rev)
return p.cleanedBuildlet(bc, el)