coordinator: rate-limit GCE calls, cap the number of VMs at once

Also, make Windows use regular disks for now, since its image is so
large (100 GB) and we only have 2 TB of SSD quota.

This is all very conservative and paranoid for now until I figure out
what part of the coordinator was misbehaving.

Change-Id: Icead5c07cf706c2cfc4d1dd66a108649429018ac
Reviewed-on: https://go-review.googlesource.com/7910
Reviewed-by: David Crawshaw <crawshaw@golang.org>
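
Both changes are simple throttles. The VM cap added to cmd/coordinator below is
a buffered channel used as a counting semaphore: a send acquires a slot, a
receive releases one. A minimal standalone sketch of that pattern follows; the
capacity, goroutines and sleep are illustrative only (the coordinator sizes the
channel at 60):

package main

import (
	"fmt"
	"sync"
	"time"
)

// vmCap is a counting semaphore: a send takes a slot, a receive frees one.
var vmCap = make(chan bool, 3) // small capacity so the example finishes quickly

func awaitVMCountQuota() { vmCap <- true } // blocks once all slots are taken
func putVMCountQuota()   { <-vmCap }

func main() {
	var wg sync.WaitGroup
	for i := 0; i < 10; i++ {
		wg.Add(1)
		go func(i int) {
			defer wg.Done()
			awaitVMCountQuota()
			defer putVMCountQuota()
			fmt.Println("building in VM", i) // at most 3 of these run at once
			time.Sleep(100 * time.Millisecond)
		}(i)
	}
	wg.Wait()
}
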
diff --git a/buildlet/gce.go b/buildlet/gce.go
index 889ab0f..2668887 100644
--- a/buildlet/gce.go
+++ b/buildlet/gce.go
@@ -17,6 +17,16 @@
 	"google.golang.org/api/compute/v1"
 )
 
+// GCEGate optionally specifies a function to run before any GCE API call.
+// It's intended to be used to bound the QPS rate of calls made to GCE.
+var GCEGate func()
+
+func apiGate() {
+	if GCEGate != nil {
+		GCEGate()
+	}
+}
+
 // VMOpts control how new VMs are started.
 type VMOpts struct {
 	// Zone is the GCE zone to create the VM in. Required.
@@ -77,6 +87,10 @@
 
 	prefix := "https://www.googleapis.com/compute/v1/projects/" + projectID
 	machType := prefix + "/zones/" + zone + "/machineTypes/" + conf.MachineType()
+	diskType := "https://www.googleapis.com/compute/v1/projects/" + projectID + "/zones/" + zone + "/diskTypes/pd-ssd"
+	if conf.RegularDisk {
+		diskType = "" // a spinning disk
+	}
 
 	instance := &compute.Instance{
 		Name:        instName,
@@ -90,7 +104,7 @@
 				InitializeParams: &compute.AttachedDiskInitializeParams{
 					DiskName:    instName,
 					SourceImage: "https://www.googleapis.com/compute/v1/projects/" + projectID + "/global/images/" + conf.VMImage,
-					DiskType:    "https://www.googleapis.com/compute/v1/projects/" + projectID + "/zones/" + zone + "/diskTypes/pd-ssd",
+					DiskType:    diskType,
 				},
 			},
 		},
@@ -146,6 +160,7 @@
 		addMeta(k, v)
 	}
 
+	apiGate()
 	op, err := computeService.Instances.Insert(projectID, zone, instance).Do()
 	if err != nil {
 		return nil, fmt.Errorf("Failed to create instance: %v", err)
@@ -157,6 +172,7 @@
 OpLoop:
 	for {
 		time.Sleep(2 * time.Second)
+		apiGate()
 		op, err := computeService.ZoneOperations.Get(projectID, zone, createOp).Do()
 		if err != nil {
 			return nil, fmt.Errorf("Failed to get op %s: %v", createOp, err)
@@ -178,6 +194,7 @@
 	}
 	condRun(opts.OnInstanceCreated)
 
+	apiGate()
 	inst, err := computeService.Instances.Get(projectID, zone, instName).Do()
 	if err != nil {
 		return nil, fmt.Errorf("Error getting instance %s details after creation: %v", instName, err)
@@ -244,6 +261,7 @@
 // returns once it's been requested, not when it's done.
 func DestroyVM(ts oauth2.TokenSource, proj, zone, instance string) error {
 	computeService, _ := compute.New(oauth2.NewClient(oauth2.NoContext, ts))
+	apiGate()
 	_, err := computeService.Instances.Delete(proj, zone, instance).Do()
 	return err
 }
@@ -264,6 +282,7 @@
 	computeService, _ := compute.New(oauth2.NewClient(oauth2.NoContext, ts))
 
 	// TODO(bradfitz): paging over results if more than 500
+	apiGate()
 	list, err := computeService.Instances.List(proj, zone).Do()
 	if err != nil {
 		return nil, err
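
The new GCEGate hook lets any user of the buildlet package pace its own GCE
traffic. A minimal sketch of wiring it up from a hypothetical client program,
using a ticker the same way cmd/coordinator does in the next file; the token
source, project, zone and instance name are placeholders, and the
golang.org/x/build/buildlet import path is assumed:

package main

import (
	"log"
	"time"

	"golang.org/x/build/buildlet"
	"golang.org/x/oauth2"
)

func main() {
	// Run every GCE API call the buildlet package makes through a 5 QPS gate.
	ticker := time.NewTicker(time.Second / 5)
	buildlet.GCEGate = func() { <-ticker.C }

	var ts oauth2.TokenSource // obtained elsewhere, e.g. from a service account

	// DestroyVM (like StartNewVM and ListVMs) now calls apiGate() internally,
	// so this request waits for the next tick before being issued.
	if err := buildlet.DestroyVM(ts, "my-project", "us-central1-a", "stale-buildlet-1"); err != nil {
		log.Printf("destroy: %v", err)
	}
}
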
diff --git a/cmd/coordinator/coordinator.go b/cmd/coordinator/coordinator.go
index c77e005..8949c2e 100644
--- a/cmd/coordinator/coordinator.go
+++ b/cmd/coordinator/coordinator.go
@@ -42,6 +42,7 @@
 	"golang.org/x/oauth2"
 	"golang.org/x/oauth2/google"
 	"google.golang.org/api/compute/v1"
+	"google.golang.org/api/googleapi"
 	"google.golang.org/cloud"
 	"google.golang.org/cloud/compute/metadata"
 	"google.golang.org/cloud/storage"
@@ -220,7 +221,7 @@
 		"builder": []string{st.name},
 		"key":     []string{builderKey(st.name)},
 		"hash":    []string{st.rev},
-		"url":     []string{fmt.Sprintf("http://%v/logs?name=%s&rev=%s&st=%p", externalIP, st.name, st.rev, st)},
+		"url":     []string{fmt.Sprintf("http://farmer.golang.org/logs?name=%s&rev=%s&st=%p", st.name, st.rev, st)},
 	}.Encode()
 	for {
 		st.mu.Lock()
@@ -411,7 +412,7 @@
 
 	io.WriteString(w, "<html><body><h1>Go build coordinator</h1>")
 
-	fmt.Fprintf(w, "<h2>running</h2><p>%d total builds active", numTotal)
+	fmt.Fprintf(w, "<h2>running</h2><p>%d total builds active; VM capacity: %d/%d", numTotal, len(vmCap), cap(vmCap))
 
 	io.WriteString(w, "<pre>")
 	for _, st := range active {
@@ -1009,11 +1010,6 @@
 	go func() {
 		err := buildInVM(conf, st)
 		if err != nil {
-			if st.hasEvent("instance_created") {
-				go deleteVM(projectZone, st.instName)
-			}
-		}
-		if err != nil {
 			fmt.Fprintf(st, "\n\nError: %v\n", err)
 		}
 		st.setDone(err == nil)
@@ -1022,24 +1018,53 @@
 	return st
 }
 
+// We artificially limit ourselves to 60 VMs right now, assuming that
+// each takes 2 CPUs, and we have a current quota of 200 CPUs. That
+// gives us headroom, but also doesn't account for SSD or memory
+// quota.
+// TODO(bradfitz): better quota system.
+const maxVMs = 60
+
+var vmCap = make(chan bool, maxVMs)
+
+func awaitVMCountQuota() { vmCap <- true }
+func putVMCountQuota()   { <-vmCap }
+
 func buildInVM(conf dashboard.BuildConfig, st *buildStatus) (retErr error) {
+	st.logEventTime("awaiting_gce_quota")
+	awaitVMCountQuota()
+	defer putVMCountQuota()
+
+	needDelete := false
+	defer func() {
+		if !needDelete {
+			return
+		}
+		deleteVM(projectZone, st.instName)
+	}()
+
+	st.logEventTime("creating_instance")
+	log.Printf("Creating VM for %s, %s", conf.Name, st.rev)
 	bc, err := buildlet.StartNewVM(tokenSource, st.instName, conf.Name, buildlet.VMOpts{
 		ProjectID:   projectID,
 		Zone:        projectZone,
 		Description: fmt.Sprintf("Go Builder building %s %s", conf.Name, st.rev),
 		DeleteIn:    vmDeleteTimeout,
 		OnInstanceRequested: func() {
+			needDelete = true
 			st.logEventTime("instance_create_requested")
 			log.Printf("%v now booting VM %v for build", st.builderRev, st.instName)
 		},
 		OnInstanceCreated: func() {
 			st.logEventTime("instance_created")
+			needDelete = true // redundant with the OnInstanceRequested one, but fine.
 		},
 		OnGotInstanceInfo: func() {
 			st.logEventTime("waiting_for_buildlet")
 		},
 	})
 	if err != nil {
+		log.Printf("Failed to create VM for %s, %s: %v", conf.Name, st.rev, err)
 		return err
 	}
 	st.logEventTime("buildlet_up")
@@ -1459,6 +1484,7 @@
 	// partially-different 500.
 	// TODO(bradfitz): revisit this code if we ever start running
 	// thousands of VMs.
+	gceAPIGate()
 	list, err := computeService.Instances.List(projectID, zone).Do()
 	if err != nil {
 		return fmt.Errorf("listing instances: %v", err)
@@ -1495,13 +1521,25 @@
 	return nil
 }
 
-func deleteVM(zone, instName string) {
+// deleteVM starts a delete of an instance in a given zone.
+//
+// It either returns an operation name (if the delete is pending) or the
+// empty string if the instance didn't exist.
+func deleteVM(zone, instName string) (operation string, err error) {
+	gceAPIGate()
 	op, err := computeService.Instances.Delete(projectID, zone, instName).Do()
+	apiErr, ok := err.(*googleapi.Error)
+	if ok {
+		if apiErr.Code == 404 {
+			return "", nil
+		}
+	}
 	if err != nil {
 		log.Printf("Failed to delete instance %q in zone %q: %v", instName, zone, err)
-		return
+		return "", err
 	}
-	log.Printf("Sent request to delete instance %q in zone %q. Operation ID == %v", instName, zone, op.Id)
+	log.Printf("Sent request to delete instance %q in zone %q. Operation ID, Name: %v, %v", instName, zone, op.Id, op.Name)
+	return op.Name, nil
 }
 
 func hasScope(want string) bool {
@@ -1627,3 +1665,16 @@
 		panic("previously assumed to never fail: " + err.Error())
 	}
 }
+
+// apiCallTicker ticks regularly, preventing us from accidentally making
+// GCE API calls too quickly. Our quota is 20 QPS, but we temporarily
+// limit ourselves to 5 QPS, well under that.
+var apiCallTicker = time.NewTicker(time.Second / 5)
+
+func init() {
+	buildlet.GCEGate = gceAPIGate
+}
+
+func gceAPIGate() {
+	<-apiCallTicker.C
+}
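
deleteVM now returns the operation name so a caller could wait for the delete
to actually finish rather than merely be requested. The coordinator ignores it
today, but a hypothetical helper in this file (not part of this change) could
poll it the same way buildlet.StartNewVM polls its create operation, reusing
the package-level computeService, projectID and gceAPIGate:

// waitForVMDeletion blocks until the delete operation named opName completes
// in the given zone. An empty opName means deleteVM found no instance, so
// there is nothing to wait for.
func waitForVMDeletion(zone, opName string) error {
	if opName == "" {
		return nil
	}
	for {
		time.Sleep(2 * time.Second)
		gceAPIGate()
		op, err := computeService.ZoneOperations.Get(projectID, zone, opName).Do()
		if err != nil {
			return fmt.Errorf("getting operation %s: %v", opName, err)
		}
		if op.Status == "DONE" {
			if op.Error != nil {
				return fmt.Errorf("delete operation %s failed: %+v", opName, op.Error.Errors)
			}
			return nil
		}
	}
}
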
diff --git a/dashboard/builders.go b/dashboard/builders.go
index 9e7806f..0ecda55 100644
--- a/dashboard/builders.go
+++ b/dashboard/builders.go
@@ -24,6 +24,8 @@
 	Go14URL     string // URL to built Go 1.4 tar.gz
 	buildletURL string // optional override buildlet URL
 
+	RegularDisk bool // if true, use spinning disk instead of SSD
+
 	env []string // extra environment ("key=value") pairs
 }
 
@@ -268,6 +270,7 @@
 		VMImage:     "windows-buildlet",
 		machineType: "n1-highcpu-2",
 		Go14URL:     "https://storage.googleapis.com/go-builder-data/go1.4-windows-amd64.tar.gz",
+		RegularDisk: true,
 		env:         []string{"GOARCH=amd64", "GOHOSTARCH=amd64"},
 	})
 	addBuilder(BuildConfig{
@@ -275,6 +278,7 @@
 		VMImage:     "windows-buildlet",
 		machineType: "n1-highcpu-4",
 		Go14URL:     "https://storage.googleapis.com/go-builder-data/go1.4-windows-amd64.tar.gz",
+		RegularDisk: true,
 		env:         []string{"GOARCH=amd64", "GOHOSTARCH=amd64"},
 	})
 	addBuilder(BuildConfig{
@@ -283,6 +287,7 @@
 		machineType: "n1-highcpu-2",
 		buildletURL: "http://storage.googleapis.com/go-builder-data/buildlet.windows-amd64",
 		Go14URL:     "https://storage.googleapis.com/go-builder-data/go1.4-windows-386.tar.gz",
+		RegularDisk: true,
 		env:         []string{"GOARCH=386", "GOHOSTARCH=386"},
 	})
 }