cmd/coordinator: add some health status handlers & output on front page

Updates golang/go#15760

Change-Id: I046cc02118daa79e62b408bda591670a8ff70b17
Reviewed-on: https://go-review.googlesource.com/c/build/+/178798
Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
diff --git a/cmd/coordinator/coordinator.go b/cmd/coordinator/coordinator.go
index 33dfca6..f94fdff 100644
--- a/cmd/coordinator/coordinator.go
+++ b/cmd/coordinator/coordinator.go
@@ -3212,6 +3212,9 @@
 func (st *buildStatus) HTMLStatusLine_done() template.HTML { return st.htmlStatusLine(false) }
 
 func (st *buildStatus) htmlStatusLine(full bool) template.HTML {
+	if st == nil {
+		return "[nil]"
+	}
 	st.mu.Lock()
 	defer st.mu.Unlock()
 
diff --git a/cmd/coordinator/reverse.go b/cmd/coordinator/reverse.go
index beba59a..49f0084 100644
--- a/cmd/coordinator/reverse.go
+++ b/cmd/coordinator/reverse.go
@@ -54,7 +54,8 @@
 const minBuildletVersion = 1
 
 var reversePool = &reverseBuildletPool{
-	oldInUse: make(map[*buildlet.Client]bool),
+	oldInUse:     make(map[*buildlet.Client]bool),
+	hostLastGood: make(map[string]time.Time),
 }
 
 const maxOldRevdialUsers = 10
@@ -62,7 +63,7 @@
 type token struct{}
 
 type reverseBuildletPool struct {
-	// mu guards all 4 fields below and also fields of
+	// mu guards all 5 fields below and also fields of
 	// *reverseBuildlet in buildlets
 	mu sync.Mutex
 
@@ -78,6 +79,8 @@
 	// These are a liability due to runaway memory issues (Issue 31639) so
 	// we bound how many can be running at once. Fortunately there aren't many left.
 	oldInUse map[*buildlet.Client]bool
+
+	hostLastGood map[string]time.Time
 }
 
 func (p *reverseBuildletPool) ServeReverseStatusJSON(w http.ResponseWriter, r *http.Request) {
@@ -213,6 +216,7 @@
 		panic("previous health check still running")
 	}
 	if b.inUse {
+		p.hostLastGood[b.hostname] = time.Now()
 		p.mu.Unlock()
 		return true // skip busy buildlets
 	}
@@ -252,7 +256,9 @@
 	}
 	b.inUse = false
 	b.inHealthCheck = false
-	b.inUseTime = time.Now()
+	now := time.Now()
+	b.inUseTime = now
+	p.hostLastGood[b.hostname] = now
 	go p.noteBuildletAvailable(b.hostType)
 	return true
 }
@@ -472,6 +478,7 @@
 	defer p.noteBuildletAvailable(b.hostType)
 	defer p.mu.Unlock()
 	p.buildlets = append(p.buildlets, b)
+	p.hostLastGood[b.hostname] = time.Now()
 	go p.healthCheckBuildletLoop(b)
 }
 
diff --git a/cmd/coordinator/status.go b/cmd/coordinator/status.go
index 5acf7b2..43284d7 100644
--- a/cmd/coordinator/status.go
+++ b/cmd/coordinator/status.go
@@ -9,6 +9,7 @@
 import (
 	"bytes"
 	"fmt"
+	"html"
 	"html/template"
 	"io"
 	"net/http"
@@ -17,29 +18,359 @@
 	"runtime"
 	"sort"
 	"strings"
+	"sync"
 	"sync/atomic"
 	"time"
+
+	"golang.org/x/build/dashboard"
 )
 
+// statusLevel is the severity of a single health check message.
+type statusLevel int
+
+const (
+	// levelInfo is an informational text that's not an error,
+	// such as "coordinator just started recently, waiting to
+	// start health check"
+	levelInfo statusLevel = iota
+	// levelWarn is a non-critical error, such as "missing 1 of 50
+	// ARM machines"
+	levelWarn
+	// levelError is something that should be fixed sooner, such
+	// as "all Macs are gone".
+	levelError
+)
+
+func (l statusLevel) String() string {
+	switch l {
+	case levelInfo:
+		return "Info"
+	case levelWarn:
+		return "Warn"
+	case levelError:
+		return "Error"
+	}
+	return ""
+}
+
+type levelText struct {
+	Level statusLevel
+	Text  string
+}
+
+func (lt levelText) AsHTML() template.HTML {
+	switch lt.Level {
+	case levelInfo:
+		return template.HTML(html.EscapeString(lt.Text))
+	case levelWarn:
+		return template.HTML(fmt.Sprintf("<span style='color: orange'>%s</span>", html.EscapeString(lt.Text)))
+	case levelError:
+		return template.HTML(fmt.Sprintf("<span style='color: red'><b>%s</b></span>", html.EscapeString(lt.Text)))
+	}
+	return ""
+}
+
+type checkWriter struct {
+	Out []levelText
+}
+
+func (w *checkWriter) error(s string)                       { w.Out = append(w.Out, levelText{levelError, s}) }
+func (w *checkWriter) errorf(a string, args ...interface{}) { w.error(fmt.Sprintf(a, args...)) }
+func (w *checkWriter) info(s string)                        { w.Out = append(w.Out, levelText{levelInfo, s}) }
+func (w *checkWriter) infof(a string, args ...interface{})  { w.info(fmt.Sprintf(a, args...)) }
+func (w *checkWriter) warn(s string)                        { w.Out = append(w.Out, levelText{levelWarn, s}) }
+func (w *checkWriter) warnf(a string, args ...interface{})  { w.warn(fmt.Sprintf(a, args...)) }
+func (w *checkWriter) hasErrors() bool {
+	for _, v := range w.Out {
+		if v.Level == levelError {
+			return true
+		}
+	}
+	return false
+}
+
+type healthChecker struct {
+	ID     string
+	Title  string
+	EnvURL string
+	Check  func(*checkWriter)
+}
+
+func (hc *healthChecker) DoCheck() *checkWriter {
+	cw := new(checkWriter)
+	hc.Check(cw)
+	return cw
+}
+
+var (
+	healthCheckers    []*healthChecker
+	healthCheckerByID = map[string]*healthChecker{}
+)
+
+func addHealthChecker(hc *healthChecker) {
+	if _, dup := healthCheckerByID[hc.ID]; dup {
+		panic("duplicate health checker ID " + hc.ID)
+	}
+	healthCheckers = append(healthCheckers, hc)
+	healthCheckerByID[hc.ID] = hc
+	http.Handle("/status/"+hc.ID, healthCheckerHandler(hc))
+}
+
+func init() {
+	addHealthChecker(newMacHealthChecker())
+	addHealthChecker(newScalewayHealthChecker())
+	addHealthChecker(newPacketHealthChecker())
+	addHealthChecker(newOSUPPC64Checker())
+	addHealthChecker(newOSUPPC64leChecker())
+	addHealthChecker(newJoyentChecker())
+}
+
+func newMacHealthChecker() *healthChecker {
+	var hosts []string
+	for i := 1; i <= 10; i++ {
+		for _, suf := range []string{"a", "b"} {
+			name := fmt.Sprintf("macstadium_host%02d%s", i, suf)
+			hosts = append(hosts, name)
+		}
+	}
+	checkHosts := reverseHostChecker(hosts)
+
+	// And check that the makemac daemon is listening.
+	var makeMac struct {
+		sync.Mutex
+		lastErr   error
+		lastCheck time.Time // currently unused
+	}
+	setMakeMacErr := func(err error) {
+		makeMac.Lock()
+		defer makeMac.Unlock()
+		makeMac.lastErr = err
+		makeMac.lastCheck = time.Now()
+	}
+	go func() {
+		c := &http.Client{Timeout: 15 * time.Second}
+		for {
+			res, err := c.Get("http://macstadiumd.golang.org:8713")
+			if err != nil {
+				setMakeMacErr(err)
+			} else {
+				res.Body.Close()
+				if res.StatusCode != 200 {
+					setMakeMacErr(fmt.Errorf("HTTP status %v", res.Status))
+				} else if res.Header.Get("Content-Type") != "application/json" {
+					setMakeMacErr(fmt.Errorf("unexpected content-type %q", res.Header.Get("Content-Type")))
+				} else {
+					setMakeMacErr(nil)
+				}
+			}
+			time.Sleep(15 * time.Second)
+		}
+	}()
+	return &healthChecker{
+		ID:     "macs",
+		Title:  "MacStadium Mac VMs",
+		EnvURL: "https://github.com/golang/build/tree/master/env/darwin/macstadium",
+		Check: func(w *checkWriter) {
+			// Check hosts.
+			checkHosts(w)
+			// Check makemac daemon.
+			makeMac.Lock()
+			defer makeMac.Unlock()
+			if makeMac.lastErr != nil {
+				w.errorf("makemac daemon: %v", makeMac.lastErr)
+			}
+			return
+		},
+	}
+}
+
+func newJoyentChecker() *healthChecker {
+	return &healthChecker{
+		ID:     "joyent",
+		Title:  "Joyent solaris/amd64 machines",
+		EnvURL: "https://github.com/golang/build/tree/master/env/solaris-amd64/joyent",
+		Check: func(cw *checkWriter) {
+			p := reversePool
+			p.mu.Lock()
+			defer p.mu.Unlock()
+			n := 0
+			for _, b := range p.buildlets {
+				if b.hostType == "host-solaris-amd64" {
+					n++
+				}
+			}
+			want := dashboard.Hosts["host-solaris-amd64"].ExpectNum
+			if n < want {
+				cw.errorf("%d connected; want %d", n, want)
+			}
+		},
+	}
+}
+
+func newScalewayHealthChecker() *healthChecker {
+	var hosts []string
+	for i := 1; i <= 50; i++ {
+		name := fmt.Sprintf("scaleway-prod-%02d", i)
+		hosts = append(hosts, name)
+	}
+	return &healthChecker{
+		ID:     "scaleway",
+		Title:  "Scaleway linux/arm machines",
+		EnvURL: "https://github.com/golang/build/tree/master/env/linux-arm/scaleway",
+		Check:  reverseHostChecker(hosts),
+	}
+}
+
+func newPacketHealthChecker() *healthChecker {
+	var hosts []string
+	for i := 1; i <= 20; i++ {
+		name := fmt.Sprintf("packet%02d", i)
+		hosts = append(hosts, name)
+	}
+	return &healthChecker{
+		ID:     "packet",
+		Title:  "Packet linux/arm64 machines",
+		EnvURL: "https://github.com/golang/build/tree/master/env/linux-arm64/packet",
+		Check:  reverseHostChecker(hosts),
+	}
+}
+
+func newOSUPPC64Checker() *healthChecker {
+	var hosts []string
+	for i := 1; i <= 5; i++ {
+		name := fmt.Sprintf("go-be-%v", i)
+		hosts = append(hosts, name)
+	}
+	return &healthChecker{
+		ID:     "osuppc64",
+		Title:  "OSU linux/ppc64 machines",
+		EnvURL: "https://github.com/golang/build/tree/master/env/linux-ppc64/osuosl",
+		Check:  reverseHostChecker(hosts),
+	}
+}
+
+func newOSUPPC64leChecker() *healthChecker {
+	var hosts []string
+	for i := 1; i <= 5; i++ {
+		name := fmt.Sprintf("go-le-%v", i)
+		hosts = append(hosts, name)
+	}
+	return &healthChecker{
+		ID:     "osuppc64le",
+		Title:  "OSU linux/ppc64le machines",
+		EnvURL: "https://github.com/golang/build/tree/master/env/linux-ppc64le/osuosl",
+		Check:  reverseHostChecker(hosts),
+	}
+}
+
+func reverseHostChecker(hosts []string) func(cw *checkWriter) {
+	const recentThreshold = 2 * time.Minute // let VMs be away 2 minutes; assume ~1 minute bootup + slop
+	checkStart := time.Now().Add(recentThreshold)
+
+	hostSet := map[string]bool{}
+	for _, v := range hosts {
+		hostSet[v] = true
+	}
+
+	return func(cw *checkWriter) {
+		p := reversePool
+		p.mu.Lock()
+		defer p.mu.Unlock()
+
+		now := time.Now()
+		wantGoodSince := now.Add(-recentThreshold)
+		numMissing := 0
+		numGood := 0
+		// Check last good times
+		for _, host := range hosts {
+			lastGood, ok := p.hostLastGood[host]
+			if ok && lastGood.After(wantGoodSince) {
+				numGood++
+				continue
+			}
+			if now.Before(checkStart) {
+				cw.infof("%s not yet connected", host)
+				continue
+			}
+			if ok {
+				cw.warnf("%s missing, not seen for %v", host, time.Now().Sub(lastGood).Round(time.Second))
+			} else {
+				cw.warnf("%s missing, never seen (at least %v)", host, uptime())
+			}
+			numMissing++
+		}
+		if numMissing > 0 {
+			sum := numMissing + numGood
+			percentMissing := float64(numMissing) / float64(sum)
+			msg := fmt.Sprintf("%d machines missing, %.0f%% of capacity", numMissing, percentMissing*100)
+			if percentMissing >= 0.15 {
+				cw.error(msg)
+			} else {
+				cw.warn(msg)
+			}
+		}
+
+		// And check that no expected hostname has more than
+		// one buildlet connected at the same time.
+		count := map[string]int{}
+		for _, b := range p.buildlets {
+			if hostSet[b.hostname] {
+				count[b.hostname]++
+			}
+		}
+		for name, n := range count {
+			if n > 1 {
+				cw.errorf("%q is connected from %v machines", name, n)
+			}
+		}
+
+		return
+	}
+}
+
+func healthCheckerHandler(hc *healthChecker) http.Handler {
+	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		cw := new(checkWriter)
+		hc.Check(cw)
+		w.Header().Set("Content-Type", "text/plain; charset=utf-8")
+		if cw.hasErrors() {
+			w.WriteHeader(500)
+		} else {
+			w.WriteHeader(200)
+		}
+		if len(cw.Out) == 0 {
+			io.WriteString(w, "ok\n")
+			return
+		}
+		fmt.Fprintf(w, "# %q status: %s\n", hc.ID, hc.Title)
+		if hc.EnvURL != "" {
+			fmt.Fprintf(w, "# Notes: %v\n", hc.EnvURL)
+		}
+		for _, v := range cw.Out {
+			fmt.Fprintf(w, "%s: %s\n", v.Level, v.Text)
+		}
+	})
+}
+
+func uptime() time.Duration { return time.Now().Sub(processStartTime).Round(time.Second) }
+
 func handleStatus(w http.ResponseWriter, r *http.Request) {
 	if r.URL.Path != "/" {
 		http.NotFound(w, r)
 		return
 	}
-	round := func(t time.Duration) time.Duration {
-		return t / time.Second * time.Second
-	}
 	df := diskFree()
 
 	statusMu.Lock()
 	data := statusData{
-		Total:        len(status),
-		Uptime:       round(time.Now().Sub(processStartTime)),
-		Recent:       append([]*buildStatus{}, statusDone...),
-		DiskFree:     df,
-		Version:      Version,
-		NumFD:        fdCount(),
-		NumGoroutine: runtime.NumGoroutine(),
+		Total:          len(status),
+		Uptime:         uptime(),
+		Recent:         append([]*buildStatus{}, statusDone...),
+		DiskFree:       df,
+		Version:        Version,
+		NumFD:          fdCount(),
+		NumGoroutine:   runtime.NumGoroutine(),
+		HealthCheckers: healthCheckers,
 	}
 	for _, st := range status {
 		if atomic.LoadInt32(&st.hasBuildlet) != 0 {
@@ -158,6 +489,7 @@
 	RemoteBuildlets   template.HTML
 	DiskFree          string
 	Version           string
+	HealthCheckers    []*healthChecker
 }
 
 var statusTmpl = template.Must(template.New("status").Parse(`
@@ -177,6 +509,18 @@
 <h2>Running</h2>
 <p>{{printf "%d" .Total}} total builds; {{printf "%d" .ActiveBuilds}} active ({{.ActiveReverse}} reverse). Uptime {{printf "%s" .Uptime}}. Version {{.Version}}.
 
+<h2 id=health>Health <a href='#health'>¶</a></h2>
+<ul>{{range .HealthCheckers}}
+  <li><a href="/status/{{.ID}}">{{.Title}}</a>{{if .EnvURL}} [<a href="{{.EnvURL}}">docs</a>]{{end -}}: {{with .DoCheck.Out}}
+      <ul>
+        {{- range .}}
+          <li>{{ .AsHTML}}</li>
+        {{- end}}
+      </ul>
+    {{else}}ok{{end}}
+  </li>
+{{end}}</ul>
+
 <h2 id=trybots>Active Trybot Runs <a href='#trybots'>¶</a></h2>
 {{- if .TrybotsErr}}
 <b>trybots disabled:</b>: {{.TrybotsErr}}
diff --git a/cmd/coordinator/status_test.go b/cmd/coordinator/status_test.go
index 8e06c34..deaa247 100644
--- a/cmd/coordinator/status_test.go
+++ b/cmd/coordinator/status_test.go
@@ -7,6 +7,8 @@
 package main
 
 import (
+	"net/http/httptest"
+	"strings"
 	"testing"
 	"time"
 )
@@ -35,3 +37,67 @@
 		}
 	}
 }
+
+func init() {
+	addHealthChecker(&healthChecker{
+		ID:    "allgood",
+		Title: "All Good Test",
+		Check: func(*checkWriter) {},
+	})
+
+	addHealthChecker(&healthChecker{
+		ID:    "errortest",
+		Title: "Error Test",
+		Check: func(cw *checkWriter) {
+			cw.info("test-info")
+			cw.warn("test-warn")
+			cw.error("test-error")
+		},
+	})
+}
+
+func TestHandleStatus_HealthFormatting(t *testing.T) {
+	statusMu.Lock()
+	for k := range status {
+		delete(status, k)
+	}
+	for k := range tries {
+		delete(tries, k)
+	}
+	tryList = nil
+	statusMu.Unlock()
+
+	rec := httptest.NewRecorder()
+	req := httptest.NewRequest("GET", "/", nil)
+	handleStatus(rec, req)
+	const pre = "<h2 id=health>Health"
+	const suf = "<h2 id=trybots>Active Trybot Runs"
+	got := rec.Body.String()
+	if i := strings.Index(got, pre); i != -1 {
+		got = got[i+len(pre):]
+	} else {
+		t.Fatalf("output didn't contain %q: %s", pre, got)
+	}
+	if i := strings.Index(got, suf); i != -1 {
+		got = got[:i]
+	} else {
+		t.Fatalf("output didn't contain %q: %s", suf, got)
+	}
+	for _, sub := range []string{
+		`<a href="/status/macs">MacStadium Mac VMs</a> [`,
+		`<a href="/status/scaleway">Scaleway linux/arm machines</a> [`,
+		`<li>scaleway-prod-02 not yet connected</li>`,
+		`<li>macstadium_host06a not yet connected</li>`,
+		`<a href="/status/allgood">All Good Test</a>: ok`,
+		`<li>test-info</li>`,
+		`<li><span style='color: orange'>test-warn</span></li>`,
+		`<li><span style='color: red'><b>test-error</b></span></li>`,
+	} {
+		if !strings.Contains(got, sub) {
+			t.Errorf("didn't find substring %q in output", sub)
+		}
+	}
+	if t.Failed() {
+		t.Logf("Got: %s", got)
+	}
+}