dashboard/coordinator: clean up VMs more, fix watchVM bug, tweak plan 9 settings

Change-Id: I30609505cd3790f9e9505a4f020215de7b0ab74b
Reviewed-on: https://go-review.googlesource.com/2474
Reviewed-by: Andrew Gerrand <adg@golang.org>
diff --git a/coordinator/main.go b/coordinator/main.go
index de8c38d..e48a8f7 100644
--- a/coordinator/main.go
+++ b/coordinator/main.go
@@ -13,6 +13,7 @@
 	"compress/gzip"
 	"crypto/hmac"
 	"crypto/md5"
+	"crypto/rand"
 	"encoding/json"
 	"errors"
 	"flag"
@@ -63,7 +64,16 @@
 	statusDone []*buildStatus // finished recently, capped to maxStatusDone
 )
 
-const maxStatusDone = 30
+const (
+	maxStatusDone = 30
+
+	// vmDeleteTimeout is how long before we delete a VM.
+	// In practice this need only be as long as the slowest
+	// builder (plan9 currently), because on startup this program
+	// already deletes all buildlets it doesn't know about
+	// (i.e. ones from a previous instance of the coordinator).
+	vmDeleteTimeout = 45 * time.Minute
+)
 
 // Initialized by initGCE:
 var (
@@ -194,8 +204,38 @@
 	addBuilder(buildConfig{name: "linux-amd64-clang", image: "gobuilders/linux-x86-clang"})
 
 	// VMs:
-	addBuilder(buildConfig{name: "openbsd-amd64-gce56", vmImage: "openbsd-amd64-56"})
-	// addBuilder(buildConfig{name: "plan9-386-gce", vmImage: "plan9-386"})
+	addBuilder(buildConfig{
+		name:        "openbsd-amd64-gce56",
+		vmImage:     "openbsd-amd64-56",
+		machineType: "n1-highcpu-2",
+	})
+	addBuilder(buildConfig{
+		name:    "plan9-386-gce",
+		vmImage: "plan9-386",
+		// We *were* using n1-standard-1 because Plan 9 can only
+		// reliably use a single CPU. Using 2 or 4, we see
+		// test failures. See:
+		//    https://golang.org/issue/8393
+		//    https://golang.org/issue/9491
+		// n1-standard-1 has 3.6 GB of memory which is
+		// overkill (userspace probably only sees 2GB anyway),
+		// but it's the cheapest option. And plenty to keep
+		// our ~250 MB of inputs+outputs in its ramfs.
+		//
+		// But the docs say "For the n1 series of machine
+		// types, a virtual CPU is implemented as a single
+		// hyperthread on a 2.6GHz Intel Sandy Bridge Xeon or
+		// Intel Ivy Bridge Xeon (or newer) processor. This
+		// means that the n1-standard-2 machine type will see
+		// a whole physical core."
+		//
+		// ... so we use n1-highcpu-2 (1.80 GB RAM, still
+		// plenty), just so we can get 1 whole core for the
+		// single-core Plan 9. It will see 2 virtual cores and
+		// only use 1, but we hope that 1 will be more powerful
+		// and we'll stop timing out on tests.
+		machineType: "n1-highcpu-2",
+	})
 
 	addWatcher(watchConfig{repo: "https://go.googlesource.com/go", dash: "https://build.golang.org/"})
 	// TODO(adg,cmang): fix gccgo watcher
@@ -318,6 +358,21 @@
 	statusDone = append(statusDone, st)
 }
 
+func vmIsBuilding(instName string) bool {
+	if instName == "" {
+		log.Printf("bogus empty instance name passed to vmIsBuilding")
+		return false
+	}
+	statusMu.Lock()
+	defer statusMu.Unlock()
+	for _, st := range status {
+		if st.instName == instName {
+			return true
+		}
+	}
+	return false
+}
+
 // statusPtrStr disambiguates which status to return if there are
 // multiple in the history (e.g. recent failures where the build
 // didn't finish for reasons outside of all.bash failing)
@@ -710,6 +765,15 @@
 
 var osArchRx = regexp.MustCompile(`^(\w+-\w+)`)
 
+func randHex(n int) string {
+	buf := make([]byte, n/2)
+	_, err := rand.Read(buf)
+	if err != nil {
+		panic("Failed to get randomness: " + err.Error())
+	}
+	return fmt.Sprintf("%x", buf)
+}
+
 // startBuildingInVM starts a VM on GCE running the buildlet binary to build rev.
 func startBuildingInVM(conf buildConfig, rev string) (*buildStatus, error) {
 	brev := builderRev{
@@ -723,7 +787,7 @@
 
 	// name is the project-wide unique name of the GCE instance. It can't be longer
 	// than 61 bytes, so we only use the first 8 bytes of the rev.
-	name := "buildlet-" + conf.name + "-" + rev[:8]
+	name := "buildlet-" + conf.name + "-" + rev[:8] + "-rn" + randHex(6)
 
 	// buildletURL is the URL of the buildlet binary which the VMs
 	// are configured to download at boot and run. This lets us
@@ -776,7 +840,7 @@
 				// that killing.
 				{
 					Key:   "delete-at",
-					Value: fmt.Sprint(time.Now().Add(30 * time.Minute).Unix()),
+					Value: fmt.Sprint(time.Now().Add(vmDeleteTimeout).Unix()),
 				},
 			},
 		},
@@ -816,14 +880,16 @@
 }
 
 // watchVM monitors a VM doing a build.
-func watchVM(st *buildStatus) (err error) {
+func watchVM(st *buildStatus) (retErr error) {
 	goodRes := func(res *http.Response, err error, what string) bool {
 		if err != nil {
-			err = fmt.Errorf("%s: %v", what, err)
+			retErr = fmt.Errorf("%s: %v", what, err)
 			return false
 		}
 		if res.StatusCode/100 != 2 {
-			err = fmt.Errorf("%s: %v", what, res.Status)
+			slurp, _ := ioutil.ReadAll(io.LimitReader(res.Body, 4<<10))
+			retErr = fmt.Errorf("%s: %v; body: %s", what, res.Status, slurp)
+			res.Body.Close()
 			return false
 
 		}
@@ -1241,12 +1307,18 @@
 		return fmt.Errorf("listing instances: %v", err)
 	}
 	for _, inst := range list.Items {
+		if !strings.HasPrefix(inst.Name, "buildlet-") {
+			// We only delete ones we created.
+			continue
+		}
 		if inst.Metadata == nil {
 			// Defensive. Not seen in practice.
 			continue
 		}
+		sawDeleteAt := false
 		for _, it := range inst.Metadata.Items {
 			if it.Key == "delete-at" {
+				sawDeleteAt = true
 				unixDeadline, err := strconv.ParseInt(it.Value, 10, 64)
 				if err != nil {
 					log.Printf("invalid delete-at value %q seen; ignoring", it.Value)
@@ -1257,6 +1329,10 @@
 				}
 			}
 		}
+		if sawDeleteAt && !vmIsBuilding(inst.Name) {
+			log.Printf("Deleting VM %q in zone %q from an earlier coordinator generation ...", inst.Name, zone)
+			deleteVM(zone, inst.Name)
+		}
 	}
 	return nil
 }