all: kubernetes builder autoscaling

Improvements to support rapid scheduling of many build jobs:
- Retry logic in Kubernetes client to handle sporadic connection closes from their API server under heavy load
- Cluster autoscaler scales on default CPU utilization metric
- Debug mode allows scheduling multiple builds to test scaling
- Account for scheduled vs. provisioned resources in a cluster and use that information to estimate when a build's pod will be scheduled and in running state
- Use estimated scheduled time to set context timeout
- Track pod lifecycle (requested time, estimated available time, actual available time, terminate time, etc)

Change-Id: I14d6c5e01af0970dbb3390a29d1ee5c43049fff8
Reviewed-on: https://go-review.googlesource.com/19524
Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>

commit: 34ff1d9bc8f5640c8e6d214e18dfab8462292407 [log] [tgz]
author: Evan Brown <evanbrown@google.com> Tue Feb 16 22:39:27 2016 -0800
committer: Brad Fitzpatrick <bradfitz@golang.org> Thu Apr 07 17:22:57 2016 +0000
tree: 2997d7c0103a57ec7717a64251c19cf9833afb72
parent: 86757ddb63cf0fb2ff4f3b002b492c4b0544dba8 [diff] [blame]
diff --git a/kubernetes/client.go b/kubernetes/client.go
index 5767465..16c2c1f 100644
--- a/kubernetes/client.go
+++ b/kubernetes/client.go

@@ -53,8 +53,7 @@
 // Run creates a new pod resource in the default pod namespace with
 // the given pod API specification.
 // It returns the pod status once it has entered the Running phase.
-// An error is returned if the pod can not be created, if it does
-// does not enter the running phase within 2 minutes, or if ctx.Done
+// An error is returned if the pod can not be created, or if ctx.Done
 // is closed.
 func (c *Client) RunPod(ctx context.Context, pod *api.Pod) (*api.Pod, error) {
 	var podJSON bytes.Buffer
@@ -82,17 +81,27 @@
 	if err := json.Unmarshal(body, &podResult); err != nil {
 		return nil, fmt.Errorf("failed to decode pod resources: %v", err)
 	}
-	ctx, cancel := context.WithTimeout(ctx, 2*time.Minute)
-	defer cancel()
 
-	createdPod, err := c.AwaitPodNotPending(ctx, podResult.Name, podResult.ObjectMeta.ResourceVersion)
-	if err != nil {
-		// The pod did not leave the pending state. We should try to manually delete it before
-		// returning an error.
-		c.DeletePod(context.Background(), podResult.Name)
-		return nil, fmt.Errorf("timed out waiting for pod %q to leave pending state: %v", pod.Name, err)
+	retryWait := 1
+	retryMax := retryWait << 3 // retry 3 times
+	for {
+		createdPod, err := c.AwaitPodNotPending(ctx, podResult.Name, podResult.ObjectMeta.ResourceVersion)
+		if err != nil {
+			if err == context.Canceled {
+				return nil, err
+			}
+			if retryWait < retryMax { // retry
+				time.Sleep(time.Duration(retryWait) * time.Second)
+				retryWait = retryWait << 1
+				continue
+			}
+			// The pod did not leave the pending state. We should try to manually delete it before
+			// returning an error.
+			c.DeletePod(context.Background(), podResult.Name)
+			return nil, fmt.Errorf("waiting for pod %q to leave pending state: %v", pod.Name, err)
+		}
+		return createdPod, nil
 	}
-	return createdPod, nil
 }
 
 // GetPods returns all pods in the cluster, regardless of status.
@@ -219,11 +228,14 @@
 			return
 		}
 		res, err := ctxhttp.Do(ctx, c.httpClient, req)
-		defer res.Body.Close()
 		if err != nil {
-			statusChan <- PodStatusResult{Err: fmt.Errorf("failed to make request: GET %q: %v", getURL, err)}
+			if err != context.Canceled {
+				statusChan <- PodStatusResult{Err: fmt.Errorf("failed to make request: GET %q: %v", getURL, err)}
+			}
+			statusChan <- PodStatusResult{Err: err} // context.Canceled
 			return
 		}
+		defer res.Body.Close()
 
 		var wps watchPodStatus
 		reader := bufio.NewReader(res.Body)
commit	34ff1d9bc8f5640c8e6d214e18dfab8462292407	[log] [tgz]
author	Evan Brown <evanbrown@google.com>	Tue Feb 16 22:39:27 2016 -0800
committer	Brad Fitzpatrick <bradfitz@golang.org>	Thu Apr 07 17:22:57 2016 +0000
tree	2997d7c0103a57ec7717a64251c19cf9833afb72
parent	86757ddb63cf0fb2ff4f3b002b492c4b0544dba8 [diff] [blame]