all: improved monitoring of buildlet pods after creation

* Replaced cancel with context.Context
* StartPod can be canceled
* Wait for buildlet to come online, but fail fast if pod fails first
* Support timeout waiting for pod to leave pending phase
* Use Kubernetes watch API (long poll)

Updates golang/go#12546

Change-Id: I792a3b8fed615362a0290feee7de0c2cefe43c0e
Reviewed-on: https://go-review.googlesource.com/15285
Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
diff --git a/cmd/coordinator/coordinator.go b/cmd/coordinator/coordinator.go
index 1a08a18..aa1d2ba 100644
--- a/cmd/coordinator/coordinator.go
+++ b/cmd/coordinator/coordinator.go
@@ -40,6 +40,7 @@
 	"golang.org/x/build/internal/singleflight"
 	"golang.org/x/build/livelog"
 	"golang.org/x/build/types"
+	"golang.org/x/net/context"
 	"google.golang.org/cloud/storage"
 )
 
@@ -876,7 +877,7 @@
 		for {
 			timeout := time.NewTimer(10 * time.Minute)
 			select {
-			case <-bs.donec:
+			case <-bs.ctx.Done():
 				timeout.Stop()
 				break WaitCh
 			case <-timeout.C:
@@ -1023,22 +1024,6 @@
 	logEventTime(event string, optText ...string)
 }
 
-var ErrCanceled = errors.New("canceled")
-
-// Cancel is a channel that's closed by the caller when the request is no longer
-// desired. The function receiving a cancel should return ErrCanceled whenever
-// Cancel becomes readable.
-type Cancel <-chan struct{}
-
-func (c Cancel) IsCanceled() bool {
-	select {
-	case <-c:
-		return true
-	default:
-		return false
-	}
-}
-
 type BuildletPool interface {
 	// GetBuildlet returns a new buildlet client.
 	//
@@ -1048,23 +1033,23 @@
 	// anything except for log messages or VM naming.
 	//
 	// Clients must Close when done with the client.
-	GetBuildlet(cancel Cancel, machineType, rev string, el eventTimeLogger) (*buildlet.Client, error)
+	GetBuildlet(ctx context.Context, machineType, rev string, el eventTimeLogger) (*buildlet.Client, error)
 
 	String() string // TODO(bradfitz): more status stuff
 }
 
 // GetBuildlets creates up to n buildlets and sends them on the returned channel
 // before closing the channel.
-func GetBuildlets(cancel Cancel, pool BuildletPool, n int, machineType, rev string, el eventTimeLogger) <-chan *buildlet.Client {
+func GetBuildlets(ctx context.Context, pool BuildletPool, n int, machineType, rev string, el eventTimeLogger) <-chan *buildlet.Client {
 	ch := make(chan *buildlet.Client) // NOT buffered
 	var wg sync.WaitGroup
 	wg.Add(n)
 	for i := 0; i < n; i++ {
 		go func() {
 			defer wg.Done()
-			bc, err := pool.GetBuildlet(cancel, machineType, rev, el)
+			bc, err := pool.GetBuildlet(ctx, machineType, rev, el)
 			if err != nil {
-				if err != ErrCanceled {
+				if err != context.Canceled {
 					log.Printf("failed to get a %s buildlet for rev %s: %v", machineType, rev, err)
 				}
 				return
@@ -1072,7 +1057,7 @@
 			el.logEventTime("empty_helper_ready", bc.Name())
 			select {
 			case ch <- bc:
-			case <-cancel:
+			case <-ctx.Done():
 				el.logEventTime("helper_killed_before_use", bc.Name())
 				bc.Close()
 				return
@@ -1104,18 +1089,20 @@
 	if !ok {
 		return nil, fmt.Errorf("unknown builder type %q", rev.name)
 	}
+	ctx, cancel := context.WithCancel(context.Background())
 	return &buildStatus{
 		builderRev: rev,
 		conf:       conf,
-		donec:      make(chan struct{}),
 		startTime:  time.Now(),
+		ctx:        ctx,
+		cancel:     cancel,
 	}, nil
 }
 
 // start sets the st.startTime and starts the build in a new goroutine.
 // If it returns an error, st is not modified and a new goroutine has not
 // been started.
-// The build status's donec channel is closed on when the build is complete
+// The build status's context is closed on when the build is complete
 // in either direction.
 func (st *buildStatus) start() {
 	setStatus(st.builderRev, st)
@@ -1221,7 +1208,7 @@
 
 func (st *buildStatus) onceInitHelpersFunc() {
 	pool, _ := st.buildletPool() // won't return an error since we called it already
-	st.helpers = GetBuildlets(st.donec, pool, st.conf.NumTestHelpers, st.buildletType(), st.rev, st)
+	st.helpers = GetBuildlets(st.ctx, pool, st.conf.NumTestHelpers, st.buildletType(), st.rev, st)
 }
 
 // We should try to build from a snapshot if this is a subrepo build, we can
@@ -1243,7 +1230,7 @@
 		return err
 	}
 	st.logEventTime("get_buildlet")
-	bc, err := pool.GetBuildlet(nil, st.buildletType(), st.rev, st)
+	bc, err := pool.GetBuildlet(st.ctx, st.buildletType(), st.rev, st)
 	if err != nil {
 		return fmt.Errorf("failed to get a buildlet: %v", err)
 	}
@@ -2011,7 +1998,7 @@
 	// lumpy.  The rest of the buildlets run the largest tests
 	// first (critical path scheduling).
 	// The buildletActivity WaitGroup is used to track when all
-	// the buildlets are dead or don.
+	// the buildlets are dead or done.
 	var buildletActivity sync.WaitGroup
 	buildletActivity.Add(2) // one per goroutine below (main + helper launcher goroutine)
 	go func() {
@@ -2020,7 +2007,7 @@
 			tis, ok := set.testsToRunInOrder()
 			if !ok {
 				select {
-				case <-st.donec:
+				case <-st.ctx.Done():
 					return
 				case <-time.After(5 * time.Second):
 				}
@@ -2394,12 +2381,13 @@
 	// Immutable:
 	builderRev
 	conf      dashboard.BuildConfig
-	startTime time.Time     // actually time of newBuild (~same thing)
-	trySet    *trySet       // or nil
-	donec     chan struct{} // closed when done
+	startTime time.Time // actually time of newBuild (~same thing)
+	trySet    *trySet   // or nil
 
 	onceInitHelpers sync.Once // guards call of onceInitHelpersFunc, to init::
 	helpers         <-chan *buildlet.Client
+	ctx             context.Context    // used to start the build
+	cancel          context.CancelFunc // used to cancel context; for use by setDone only
 
 	mu              sync.Mutex       // guards following
 	failURL         string           // if non-empty, permanent URL of failure
@@ -2418,7 +2406,7 @@
 	st.succeeded = succeeded
 	st.done = time.Now()
 	st.output.Close()
-	close(st.donec)
+	st.cancel()
 }
 
 func (st *buildStatus) isRunning() bool {
diff --git a/cmd/coordinator/gce.go b/cmd/coordinator/gce.go
index ab0a702..337f85c 100644
--- a/cmd/coordinator/gce.go
+++ b/cmd/coordinator/gce.go
@@ -206,13 +206,13 @@
 	p.disabled = !enabled
 }
 
-func (p *gceBuildletPool) GetBuildlet(cancel Cancel, typ, rev string, el eventTimeLogger) (*buildlet.Client, error) {
+func (p *gceBuildletPool) GetBuildlet(ctx context.Context, typ, rev string, el eventTimeLogger) (*buildlet.Client, error) {
 	el.logEventTime("awaiting_gce_quota")
 	conf, ok := dashboard.Builders[typ]
 	if !ok {
 		return nil, fmt.Errorf("gcepool: unknown buildlet type %q", typ)
 	}
-	if err := p.awaitVMCountQuota(cancel, conf.GCENumCPU()); err != nil {
+	if err := p.awaitVMCountQuota(ctx, conf.GCENumCPU()); err != nil {
 		return nil, err
 	}
 
@@ -331,8 +331,8 @@
 }
 
 // awaitVMCountQuota waits for numCPU CPUs of quota to become available,
-// or returns ErrCanceled.
-func (p *gceBuildletPool) awaitVMCountQuota(cancel Cancel, numCPU int) error {
+// or returns ctx.Err.
+func (p *gceBuildletPool) awaitVMCountQuota(ctx context.Context, numCPU int) error {
 	// Poll every 2 seconds, which could be better, but works and
 	// is simple.
 	for {
@@ -341,8 +341,8 @@
 		}
 		select {
 		case <-time.After(2 * time.Second):
-		case <-cancel:
-			return ErrCanceled
+		case <-ctx.Done():
+			return ctx.Err()
 		}
 	}
 }
diff --git a/cmd/coordinator/kube.go b/cmd/coordinator/kube.go
index 5742129..0dba4bf 100644
--- a/cmd/coordinator/kube.go
+++ b/cmd/coordinator/kube.go
@@ -19,6 +19,7 @@
 	"golang.org/x/build/buildlet"
 	"golang.org/x/build/dashboard"
 	"golang.org/x/build/kubernetes"
+	"golang.org/x/net/context"
 	"golang.org/x/oauth2"
 	container "google.golang.org/api/container/v1"
 )
@@ -114,7 +115,7 @@
 	mu sync.Mutex
 }
 
-func (p *kubeBuildletPool) GetBuildlet(cancel Cancel, typ, rev string, el eventTimeLogger) (*buildlet.Client, error) {
+func (p *kubeBuildletPool) GetBuildlet(ctx context.Context, typ, rev string, el eventTimeLogger) (*buildlet.Client, error) {
 	conf, ok := dashboard.Builders[typ]
 	if !ok || conf.KubeImage == "" {
 		return nil, fmt.Errorf("kubepool: invalid builder type %q", typ)
@@ -146,9 +147,9 @@
 	var needDelete bool
 
 	el.logEventTime("creating_kube_pod", podName)
-	log.Printf("Creating Kubernetes  pod %q for %s at %s", podName, typ, rev)
+	log.Printf("Creating Kubernetes pod %q for %s at %s", podName, typ, rev)
 
-	bc, err := buildlet.StartPod(kubeClient, podName, typ, buildlet.PodOpts{
+	bc, err := buildlet.StartPod(ctx, kubeClient, podName, typ, buildlet.PodOpts{
 		ImageRegistry: registryPrefix,
 		Description:   fmt.Sprintf("Go Builder for %s at %s", typ, rev),
 		DeleteIn:      deleteIn,
diff --git a/cmd/coordinator/remote.go b/cmd/coordinator/remote.go
index 41dd59d..a79aa16 100644
--- a/cmd/coordinator/remote.go
+++ b/cmd/coordinator/remote.go
@@ -21,6 +21,7 @@
 
 	"golang.org/x/build/buildlet"
 	"golang.org/x/build/dashboard"
+	"golang.org/x/net/context"
 )
 
 var (
@@ -106,12 +107,13 @@
 	if cn, ok := w.(http.CloseNotifier); ok {
 		closeNotify = cn.CloseNotify()
 	}
-	cancel := make(chan struct{})
+
+	ctx, cancel := context.WithCancel(context.Background())
 
 	resc := make(chan *buildlet.Client)
 	errc := make(chan error)
 	go func() {
-		bc, err := pool.GetBuildlet(cancel, typ, rev, eventTimeLoggerFunc(func(event string, optText ...string) {
+		bc, err := pool.GetBuildlet(ctx, typ, rev, eventTimeLoggerFunc(func(event string, optText ...string) {
 			var extra string
 			if len(optText) > 0 {
 				extra = " " + optText[0]
@@ -152,7 +154,7 @@
 			return
 		case <-closeNotify:
 			log.Printf("client went away during buildlet create request")
-			close(cancel)
+			cancel()
 			closeNotify = nil // unnecessary, but habit.
 		}
 	}
diff --git a/cmd/coordinator/reverse.go b/cmd/coordinator/reverse.go
index 944f356..b6f3236 100644
--- a/cmd/coordinator/reverse.go
+++ b/cmd/coordinator/reverse.go
@@ -43,6 +43,7 @@
 
 	"golang.org/x/build/buildlet"
 	"golang.org/x/build/revdial"
+	"golang.org/x/net/context"
 )
 
 const minBuildletVersion = 1
@@ -185,7 +186,7 @@
 	return c
 }
 
-func (p *reverseBuildletPool) GetBuildlet(cancel Cancel, machineType, rev string, el eventTimeLogger) (*buildlet.Client, error) {
+func (p *reverseBuildletPool) GetBuildlet(ctx context.Context, machineType, rev string, el eventTimeLogger) (*buildlet.Client, error) {
 	seenErrInUse := false
 	for {
 		b, err := p.tryToGrab(machineType)
@@ -200,8 +201,8 @@
 				log.Printf("Rev %q is waiting high-priority", rev)
 			}
 			select {
-			case <-cancel:
-				return nil, ErrCanceled
+			case <-ctx.Done():
+				return nil, ctx.Err()
 			case bc := <-highPri:
 				log.Printf("Rev %q stole a high-priority one.", rev)
 				return p.cleanedBuildlet(bc, el)