dashboard/coordinator: clean up VMs more, fix watchVM bug, tweak plan 9 settings
Change-Id: I30609505cd3790f9e9505a4f020215de7b0ab74b
Reviewed-on: https://go-review.googlesource.com/2474
Reviewed-by: Andrew Gerrand <adg@golang.org>
diff --git a/coordinator/main.go b/coordinator/main.go
index de8c38d..e48a8f7 100644
--- a/coordinator/main.go
+++ b/coordinator/main.go
@@ -13,6 +13,7 @@
"compress/gzip"
"crypto/hmac"
"crypto/md5"
+ "crypto/rand"
"encoding/json"
"errors"
"flag"
@@ -63,7 +64,16 @@
statusDone []*buildStatus // finished recently, capped to maxStatusDone
)
-const maxStatusDone = 30
+const (
+ maxStatusDone = 30
+
+ // vmDeleteTimeout is how long a VM may run before we delete it.
+ // In practice this need only be as long as the slowest
+ // builder (plan9 currently), because on startup this program
+ // already deletes all buildlets it doesn't know about
+ // (i.e. ones from a previous instance of the coordinator).
+ vmDeleteTimeout = 45 * time.Minute
+)
// Initialized by initGCE:
var (
@@ -194,8 +204,38 @@
addBuilder(buildConfig{name: "linux-amd64-clang", image: "gobuilders/linux-x86-clang"})
// VMs:
- addBuilder(buildConfig{name: "openbsd-amd64-gce56", vmImage: "openbsd-amd64-56"})
- // addBuilder(buildConfig{name: "plan9-386-gce", vmImage: "plan9-386"})
+ addBuilder(buildConfig{
+ name: "openbsd-amd64-gce56",
+ vmImage: "openbsd-amd64-56",
+ machineType: "n1-highcpu-2",
+ })
+ addBuilder(buildConfig{
+ name: "plan9-386-gce",
+ vmImage: "plan9-386",
+ // We *were* using n1-standard-1 because Plan 9 can only
+ // reliably use a single CPU. With 2 or 4 CPUs we see
+ // test failures. See:
+ // https://golang.org/issue/8393
+ // https://golang.org/issue/9491
+ // n1-standard-1 has 3.6 GB of memory, which is
+ // overkill (userspace probably only sees 2 GB anyway)
+ // but was the cheapest option, and plenty to keep
+ // our ~250 MB of inputs+outputs in its ramfs.
+ //
+ // But the docs say: "For the n1 series of machine
+ // types, a virtual CPU is implemented as a single
+ // hyperthread on a 2.6GHz Intel Sandy Bridge Xeon or
+ // Intel Ivy Bridge Xeon (or newer) processor. This
+ // means that the n1-standard-2 machine type will see
+ // a whole physical core."
+ //
+ // ... so we use n1-highcpu-2 (1.80 GB of RAM, still
+ // plenty), just so we can get 1 whole physical core for
+ // the single-core Plan 9. It will see 2 virtual cores
+ // and only use 1, but we hope that 1 will be a full core
+ // rather than a shared hyperthread, and we'll stop
+ // timing out on tests.
+ machineType: "n1-highcpu-2",
+ })
addWatcher(watchConfig{repo: "https://go.googlesource.com/go", dash: "https://build.golang.org/"})
// TODO(adg,cmang): fix gccgo watcher
@@ -318,6 +358,21 @@
statusDone = append(statusDone, st)
}
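+// vmIsBuilding reports whether a build is currently running
+// on the GCE instance with the given name.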
+func vmIsBuilding(instName string) bool {
+ if instName == "" {
+ log.Printf("bogus empty instance name passed to vmIsBuilding")
+ return false
+ }
+ statusMu.Lock()
+ defer statusMu.Unlock()
+ for _, st := range status {
+ if st.instName == instName {
+ return true
+ }
+ }
+ return false
+}
+
// statusPtrStr disambiguates which status to return if there are
// multiple in the history (e.g. recent failures where the build
// didn't finish for reasons outside of all.bash failing)
@@ -710,6 +765,15 @@
var osArchRx = regexp.MustCompile(`^(\w+-\w+)`)
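+// randHex returns a string of n random lowercase hex characters.
+// n should be even; an odd n yields n-1 characters due to
+// integer division.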
+func randHex(n int) string {
+ buf := make([]byte, n/2)
+ _, err := rand.Read(buf)
+ if err != nil {
+ panic("Failed to get randomness: " + err.Error())
+ }
+ return fmt.Sprintf("%x", buf)
+}
+
// startBuildingInVM starts a VM on GCE running the buildlet binary to build rev.
func startBuildingInVM(conf buildConfig, rev string) (*buildStatus, error) {
brev := builderRev{
@@ -723,7 +787,7 @@
// name is the project-wide unique name of the GCE instance. It can't be longer
// than 61 bytes, so we only use the first 8 bytes of the rev.
- name := "buildlet-" + conf.name + "-" + rev[:8]
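+ // Include a random suffix so the instance name is unique
+ // even if the same rev is built more than once.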
+ name := "buildlet-" + conf.name + "-" + rev[:8] + "-rn" + randHex(6)
// buildletURL is the URL of the buildlet binary which the VMs
// are configured to download at boot and run. This lets us
@@ -776,7 +840,7 @@
// that killing.
{
Key: "delete-at",
- Value: fmt.Sprint(time.Now().Add(30 * time.Minute).Unix()),
+ Value: fmt.Sprint(time.Now().Add(vmDeleteTimeout).Unix()),
},
},
},
@@ -816,14 +880,16 @@
}
// watchVM monitors a VM doing a build.
-func watchVM(st *buildStatus) (err error) {
+func watchVM(st *buildStatus) (retErr error) {
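+ // goodRes reports whether res is a 2xx response. If not,
+ // it records the failure in retErr, the function's named
+ // return value, so the error reaches watchVM's caller.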
goodRes := func(res *http.Response, err error, what string) bool {
if err != nil {
- err = fmt.Errorf("%s: %v", what, err)
+ retErr = fmt.Errorf("%s: %v", what, err)
return false
}
if res.StatusCode/100 != 2 {
- err = fmt.Errorf("%s: %v", what, res.Status)
+ slurp, _ := ioutil.ReadAll(io.LimitReader(res.Body, 4<<10))
+ retErr = fmt.Errorf("%s: %v; body: %s", what, res.Status, slurp)
+ res.Body.Close()
return false
}
@@ -1241,12 +1307,18 @@
return fmt.Errorf("listing instances: %v", err)
}
for _, inst := range list.Items {
+ if !strings.HasPrefix(inst.Name, "buildlet-") {
+ // We only delete ones we created.
+ continue
+ }
if inst.Metadata == nil {
// Defensive. Not seen in practice.
continue
}
+ sawDeleteAt := false
for _, it := range inst.Metadata.Items {
if it.Key == "delete-at" {
+ sawDeleteAt = true
unixDeadline, err := strconv.ParseInt(it.Value, 10, 64)
if err != nil {
log.Printf("invalid delete-at value %q seen; ignoring", it.Value)
@@ -1257,6 +1329,10 @@
}
}
}
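+ // If the VM has a delete-at value but no active build
+ // in this process, it was started by a previous
+ // coordinator generation; delete it now rather than
+ // waiting for its deadline.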
+ if sawDeleteAt && !vmIsBuilding(inst.Name) {
+ log.Printf("Deleting VM %q in zone %q from an earlier coordinator generation ...", inst.Name, zone)
+ deleteVM(zone, inst.Name)
+ }
}
return nil
}