cmd/coordinator: add some health status handlers & output on front page
Updates golang/go#15760
Change-Id: I046cc02118daa79e62b408bda591670a8ff70b17
Reviewed-on: https://go-review.googlesource.com/c/build/+/178798
Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
diff --git a/cmd/coordinator/coordinator.go b/cmd/coordinator/coordinator.go
index 33dfca6..f94fdff 100644
--- a/cmd/coordinator/coordinator.go
+++ b/cmd/coordinator/coordinator.go
@@ -3212,6 +3212,9 @@
func (st *buildStatus) HTMLStatusLine_done() template.HTML { return st.htmlStatusLine(false) }
func (st *buildStatus) htmlStatusLine(full bool) template.HTML {
+ if st == nil {
+ return "[nil]"
+ }
st.mu.Lock()
defer st.mu.Unlock()
diff --git a/cmd/coordinator/reverse.go b/cmd/coordinator/reverse.go
index beba59a..49f0084 100644
--- a/cmd/coordinator/reverse.go
+++ b/cmd/coordinator/reverse.go
@@ -54,7 +54,8 @@
const minBuildletVersion = 1
var reversePool = &reverseBuildletPool{
- oldInUse: make(map[*buildlet.Client]bool),
+ oldInUse: make(map[*buildlet.Client]bool),
+ hostLastGood: make(map[string]time.Time),
}
const maxOldRevdialUsers = 10
@@ -62,7 +63,7 @@
type token struct{}
type reverseBuildletPool struct {
- // mu guards all 4 fields below and also fields of
+ // mu guards all 5 fields below and also fields of
// *reverseBuildlet in buildlets
mu sync.Mutex
@@ -78,6 +79,8 @@
// These are a liability due to runaway memory issues (Issue 31639) so
// we bound how many can be running at once. Fortunately there aren't many left.
oldInUse map[*buildlet.Client]bool
+
+ hostLastGood map[string]time.Time
}
func (p *reverseBuildletPool) ServeReverseStatusJSON(w http.ResponseWriter, r *http.Request) {
@@ -213,6 +216,7 @@
panic("previous health check still running")
}
if b.inUse {
+ p.hostLastGood[b.hostname] = time.Now()
p.mu.Unlock()
return true // skip busy buildlets
}
@@ -252,7 +256,9 @@
}
b.inUse = false
b.inHealthCheck = false
- b.inUseTime = time.Now()
+ now := time.Now()
+ b.inUseTime = now
+ p.hostLastGood[b.hostname] = now
go p.noteBuildletAvailable(b.hostType)
return true
}
@@ -472,6 +478,7 @@
defer p.noteBuildletAvailable(b.hostType)
defer p.mu.Unlock()
p.buildlets = append(p.buildlets, b)
+ p.hostLastGood[b.hostname] = time.Now()
go p.healthCheckBuildletLoop(b)
}
diff --git a/cmd/coordinator/status.go b/cmd/coordinator/status.go
index 5acf7b2..43284d7 100644
--- a/cmd/coordinator/status.go
+++ b/cmd/coordinator/status.go
@@ -9,6 +9,7 @@
import (
"bytes"
"fmt"
+ "html"
"html/template"
"io"
"net/http"
@@ -17,29 +18,359 @@
"runtime"
"sort"
"strings"
+ "sync"
"sync/atomic"
"time"
+
+ "golang.org/x/build/dashboard"
)
+// statusLevel is the severity attached to one line of health check
+// output. The constants below are declared in increasing order of
+// severity.
+type statusLevel int
+
+const (
+ // levelInfo is an informational text that's not an error,
+ // such as "coordinator just started recently, waiting to
+ // start health check"
+ levelInfo statusLevel = iota
+ // levelWarn is a non-critical error, such as "missing 1 of 50
+ // ARM machines"
+ levelWarn
+ // levelError is something that should be fixed sooner, such
+ // as "all Macs are gone".
+ levelError
+)
+
+// String returns the capitalized name of the level ("Info", "Warn"
+// or "Error"). Any value that is not one of the declared levels
+// yields the empty string.
+func (l statusLevel) String() string {
+	names := [...]string{
+		levelInfo:  "Info",
+		levelWarn:  "Warn",
+		levelError: "Error",
+	}
+	if l < 0 || int(l) >= len(names) {
+		return ""
+	}
+	return names[l]
+}
+
+// levelText is a single line of health check output: a message
+// together with its severity.
+type levelText struct {
+ Level statusLevel
+ Text string
+}
+
+// AsHTML renders the line for the status page. The text is always
+// HTML-escaped; warnings are wrapped in an orange span, errors in a
+// bold red span, and informational lines are left undecorated. An
+// unrecognized level renders as the empty string.
+func (lt levelText) AsHTML() template.HTML {
+	escaped := html.EscapeString(lt.Text)
+	var out string
+	switch lt.Level {
+	case levelInfo:
+		out = escaped
+	case levelWarn:
+		out = "<span style='color: orange'>" + escaped + "</span>"
+	case levelError:
+		out = "<span style='color: red'><b>" + escaped + "</b></span>"
+	}
+	return template.HTML(out)
+}
+
+// checkWriter collects the output of one run of a health check.
+// Lines are appended to Out in the order they are written; an empty
+// Out is reported as "ok" by the status page and the /status/ handlers.
+type checkWriter struct {
+ Out []levelText
+}
+
+// error records s as an error-level line.
+func (w *checkWriter) error(s string) {
+	w.Out = append(w.Out, levelText{Level: levelError, Text: s})
+}
+
+// errorf is like error but formats its arguments with fmt.Sprintf.
+func (w *checkWriter) errorf(a string, args ...interface{}) {
+	w.error(fmt.Sprintf(a, args...))
+}
+
+// info records s as an informational line.
+func (w *checkWriter) info(s string) {
+	w.Out = append(w.Out, levelText{Level: levelInfo, Text: s})
+}
+
+// infof is like info but formats its arguments with fmt.Sprintf.
+func (w *checkWriter) infof(a string, args ...interface{}) {
+	w.info(fmt.Sprintf(a, args...))
+}
+
+// warn records s as a warning-level line.
+func (w *checkWriter) warn(s string) {
+	w.Out = append(w.Out, levelText{Level: levelWarn, Text: s})
+}
+
+// warnf is like warn but formats its arguments with fmt.Sprintf.
+func (w *checkWriter) warnf(a string, args ...interface{}) {
+	w.warn(fmt.Sprintf(a, args...))
+}
+// hasErrors reports whether at least one error-level line has been
+// written. Warnings and informational lines do not count.
+func (w *checkWriter) hasErrors() bool {
+	for i := range w.Out {
+		if w.Out[i].Level != levelError {
+			continue
+		}
+		return true
+	}
+	return false
+}
+
+// healthChecker describes one named health check.
+type healthChecker struct {
+ // ID identifies the checker. It must be unique among registered
+ // checkers and is used as the last path element of the checker's
+ // "/status/<ID>" URL.
+ ID string
+ // Title is the human-readable name shown on the status page and
+ // in the checker's plain-text output.
+ Title string
+ // EnvURL optionally links to documentation for the environment
+ // being checked. It is omitted from output when empty.
+ EnvURL string
+ // Check runs the health check, writing any findings to the
+ // provided checkWriter. Writing nothing means "ok".
+ Check func(*checkWriter)
+}
+
+// DoCheck runs hc.Check against a fresh checkWriter and returns that
+// writer with whatever the check recorded.
+func (hc *healthChecker) DoCheck() *checkWriter {
+	var cw checkWriter
+	hc.Check(&cw)
+	return &cw
+}
+
+// Registry of health checkers, populated by addHealthChecker.
+// NOTE(review): no lock guards these; the registrations visible in
+// this change all happen from init functions — confirm nothing
+// registers after startup.
+var (
+ // healthCheckers lists the registered checkers in registration order.
+ healthCheckers []*healthChecker
+ // healthCheckerByID indexes the same checkers by their ID and is
+ // used to reject duplicate registrations.
+ healthCheckerByID = map[string]*healthChecker{}
+)
+
+// addHealthChecker registers hc: it is added to healthCheckers and
+// healthCheckerByID, and an HTTP handler for it is installed on the
+// default mux at "/status/<ID>". It panics if a checker with the same
+// ID has already been registered.
+func addHealthChecker(hc *healthChecker) {
+	id := hc.ID
+	if _, exists := healthCheckerByID[id]; exists {
+		panic("duplicate health checker ID " + id)
+	}
+	healthCheckerByID[id] = hc
+	healthCheckers = append(healthCheckers, hc)
+	http.Handle("/status/"+id, healthCheckerHandler(hc))
+}
+
+// init registers the built-in health checkers. Each checker is
+// constructed and registered in turn, in the order listed; that order
+// is also the order in which they are stored in healthCheckers.
+func init() {
+	constructors := []func() *healthChecker{
+		newMacHealthChecker,
+		newScalewayHealthChecker,
+		newPacketHealthChecker,
+		newOSUPPC64Checker,
+		newOSUPPC64leChecker,
+		newJoyentChecker,
+	}
+	for _, newChecker := range constructors {
+		addHealthChecker(newChecker())
+	}
+}
+
+// newMacHealthChecker returns the health checker for the MacStadium
+// Mac VMs. It checks two things: that the reverse buildlets
+// macstadium_host01a through macstadium_host10b have been seen
+// recently, and that the makemac daemon answers HTTP requests.
+//
+// Calling it starts a background goroutine that polls the makemac
+// daemon every 15 seconds for the life of the process; Check only
+// reports the result of the most recent poll.
+func newMacHealthChecker() *healthChecker {
+	hosts := make([]string, 0, 20)
+	for i := 1; i <= 10; i++ {
+		for _, suf := range [...]string{"a", "b"} {
+			hosts = append(hosts, fmt.Sprintf("macstadium_host%02d%s", i, suf))
+		}
+	}
+	checkHosts := reverseHostChecker(hosts)
+
+	// makeMac holds the outcome of the latest makemac daemon poll.
+	// It is shared between the polling goroutine and Check.
+	var makeMac struct {
+		sync.Mutex
+		lastErr   error
+		lastCheck time.Time // currently unused
+	}
+	setMakeMacErr := func(err error) {
+		makeMac.Lock()
+		defer makeMac.Unlock()
+		makeMac.lastErr = err
+		makeMac.lastCheck = time.Now()
+	}
+	// probe performs one request against the makemac daemon and
+	// returns nil only for a 200 response declaring a JSON body.
+	probe := func(c *http.Client) error {
+		res, err := c.Get("http://macstadiumd.golang.org:8713")
+		if err != nil {
+			return err
+		}
+		res.Body.Close()
+		if res.StatusCode != 200 {
+			return fmt.Errorf("HTTP status %v", res.Status)
+		}
+		if ct := res.Header.Get("Content-Type"); ct != "application/json" {
+			return fmt.Errorf("unexpected content-type %q", ct)
+		}
+		return nil
+	}
+	go func() {
+		c := &http.Client{Timeout: 15 * time.Second}
+		for {
+			setMakeMacErr(probe(c))
+			time.Sleep(15 * time.Second)
+		}
+	}()
+
+	return &healthChecker{
+		ID:     "macs",
+		Title:  "MacStadium Mac VMs",
+		EnvURL: "https://github.com/golang/build/tree/master/env/darwin/macstadium",
+		Check: func(w *checkWriter) {
+			// Host checks first, then the daemon status.
+			checkHosts(w)
+
+			makeMac.Lock()
+			defer makeMac.Unlock()
+			if err := makeMac.lastErr; err != nil {
+				w.errorf("makemac daemon: %v", err)
+			}
+		},
+	}
+}
+
+// newJoyentChecker returns the health checker for the Joyent
+// solaris/amd64 machines. It reports an error when fewer
+// "host-solaris-amd64" reverse buildlets are currently connected than
+// the ExpectNum configured for that host type in dashboard.Hosts.
+func newJoyentChecker() *healthChecker {
+	const hostType = "host-solaris-amd64"
+	check := func(cw *checkWriter) {
+		pool := reversePool
+		pool.mu.Lock()
+		defer pool.mu.Unlock()
+
+		connected := 0
+		for _, b := range pool.buildlets {
+			if b.hostType != hostType {
+				continue
+			}
+			connected++
+		}
+		if want := dashboard.Hosts[hostType].ExpectNum; connected < want {
+			cw.errorf("%d connected; want %d", connected, want)
+		}
+	}
+	return &healthChecker{
+		ID:     "joyent",
+		Title:  "Joyent solaris/amd64 machines",
+		EnvURL: "https://github.com/golang/build/tree/master/env/solaris-amd64/joyent",
+		Check:  check,
+	}
+}
+
+// newScalewayHealthChecker returns the health checker for the Scaleway
+// linux/arm machines, which expects the reverse buildlet hosts
+// scaleway-prod-01 through scaleway-prod-50.
+func newScalewayHealthChecker() *healthChecker {
+	hosts := make([]string, 50)
+	for i := range hosts {
+		hosts[i] = fmt.Sprintf("scaleway-prod-%02d", i+1)
+	}
+	return &healthChecker{
+		ID:     "scaleway",
+		Title:  "Scaleway linux/arm machines",
+		EnvURL: "https://github.com/golang/build/tree/master/env/linux-arm/scaleway",
+		Check:  reverseHostChecker(hosts),
+	}
+}
+
+// newPacketHealthChecker returns the health checker for the Packet
+// linux/arm64 machines, which expects the reverse buildlet hosts
+// packet01 through packet20.
+func newPacketHealthChecker() *healthChecker {
+	hosts := make([]string, 20)
+	for i := range hosts {
+		hosts[i] = fmt.Sprintf("packet%02d", i+1)
+	}
+	return &healthChecker{
+		ID:     "packet",
+		Title:  "Packet linux/arm64 machines",
+		EnvURL: "https://github.com/golang/build/tree/master/env/linux-arm64/packet",
+		Check:  reverseHostChecker(hosts),
+	}
+}
+
+// newOSUPPC64Checker returns the health checker for the OSU
+// linux/ppc64 machines, which expects the reverse buildlet hosts
+// go-be-1 through go-be-5.
+func newOSUPPC64Checker() *healthChecker {
+	hosts := make([]string, 5)
+	for i := range hosts {
+		hosts[i] = fmt.Sprintf("go-be-%v", i+1)
+	}
+	return &healthChecker{
+		ID:     "osuppc64",
+		Title:  "OSU linux/ppc64 machines",
+		EnvURL: "https://github.com/golang/build/tree/master/env/linux-ppc64/osuosl",
+		Check:  reverseHostChecker(hosts),
+	}
+}
+
+// newOSUPPC64leChecker returns the health checker for the OSU
+// linux/ppc64le machines, which expects the reverse buildlet hosts
+// go-le-1 through go-le-5.
+func newOSUPPC64leChecker() *healthChecker {
+	hosts := make([]string, 5)
+	for i := range hosts {
+		hosts[i] = fmt.Sprintf("go-le-%v", i+1)
+	}
+	return &healthChecker{
+		ID:     "osuppc64le",
+		Title:  "OSU linux/ppc64le machines",
+		EnvURL: "https://github.com/golang/build/tree/master/env/linux-ppc64le/osuosl",
+		Check:  reverseHostChecker(hosts),
+	}
+}
+
+// reverseHostChecker returns a Check function that verifies that each
+// of the named reverse buildlet hosts has been seen recently and that
+// no hostname is connected more than once.
+//
+// A host counts as good if reversePool.hostLastGood recorded it within
+// the last two minutes. For the first two minutes after
+// reverseHostChecker itself is called, absent hosts are only reported
+// as informational "not yet connected" lines. After that each absent
+// host produces a warning, followed by one summary line that is an
+// error when at least 15% of the good-or-missing hosts are missing and
+// a warning otherwise.
+//
+// The returned function holds reversePool.mu while it runs.
+func reverseHostChecker(hosts []string) func(cw *checkWriter) {
+	const recentThreshold = 2 * time.Minute // let VMs be away 2 minutes; assume ~1 minute bootup + slop
+	// checkStart is when the startup grace period ends.
+	checkStart := time.Now().Add(recentThreshold)
+
+	hostSet := make(map[string]bool, len(hosts))
+	for _, v := range hosts {
+		hostSet[v] = true
+	}
+
+	return func(cw *checkWriter) {
+		p := reversePool
+		p.mu.Lock()
+		defer p.mu.Unlock()
+
+		now := time.Now()
+		wantGoodSince := now.Add(-recentThreshold)
+		numMissing := 0
+		numGood := 0
+		// Check last good times.
+		for _, host := range hosts {
+			lastGood, ok := p.hostLastGood[host]
+			if ok && lastGood.After(wantGoodSince) {
+				numGood++
+				continue
+			}
+			if now.Before(checkStart) {
+				cw.infof("%s not yet connected", host)
+				continue
+			}
+			if ok {
+				// Use the same "now" as the threshold test above so the
+				// reported age is consistent with the decision just made.
+				cw.warnf("%s missing, not seen for %v", host, now.Sub(lastGood).Round(time.Second))
+			} else {
+				cw.warnf("%s missing, never seen (at least %v)", host, uptime())
+			}
+			numMissing++
+		}
+		if numMissing > 0 {
+			fracMissing := float64(numMissing) / float64(numMissing+numGood)
+			msg := fmt.Sprintf("%d machines missing, %.0f%% of capacity", numMissing, fracMissing*100)
+			if fracMissing >= 0.15 {
+				cw.error(msg)
+			} else {
+				cw.warn(msg)
+			}
+		}
+
+		// And check that no expected hostname is connected from
+		// more than one machine at the same time.
+		count := map[string]int{}
+		for _, b := range p.buildlets {
+			if hostSet[b.hostname] {
+				count[b.hostname]++
+			}
+		}
+		var dups []string
+		for name, n := range count {
+			if n > 1 {
+				dups = append(dups, name)
+			}
+		}
+		// Map iteration order is random; sort so that repeated
+		// requests list the duplicates in a stable order.
+		sort.Strings(dups)
+		for _, name := range dups {
+			cw.errorf("%q is connected from %v machines", name, count[name])
+		}
+	}
+}
+
+// healthCheckerHandler returns an HTTP handler that runs hc on every
+// request and writes the result as plain text.
+//
+// The response status is 500 if the check wrote any error-level line
+// and 200 otherwise. When the check wrote nothing at all the body is
+// just "ok". Otherwise the body is a short "#"-prefixed header (ID,
+// title and, if set, the EnvURL) followed by one "Level: text" line
+// per entry.
+func healthCheckerHandler(hc *healthChecker) http.Handler {
+	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		// Run the check the same way the status page template does.
+		cw := hc.DoCheck()
+
+		w.Header().Set("Content-Type", "text/plain; charset=utf-8")
+		code := http.StatusOK
+		if cw.hasErrors() {
+			code = http.StatusInternalServerError
+		}
+		w.WriteHeader(code)
+
+		if len(cw.Out) == 0 {
+			io.WriteString(w, "ok\n")
+			return
+		}
+		fmt.Fprintf(w, "# %q status: %s\n", hc.ID, hc.Title)
+		if hc.EnvURL != "" {
+			fmt.Fprintf(w, "# Notes: %v\n", hc.EnvURL)
+		}
+		for _, v := range cw.Out {
+			fmt.Fprintf(w, "%s: %s\n", v.Level, v.Text)
+		}
+	})
+}
+
+// uptime returns how long ago processStartTime was, rounded to the
+// nearest second.
+func uptime() time.Duration {
+	return time.Since(processStartTime).Round(time.Second)
+}
+
func handleStatus(w http.ResponseWriter, r *http.Request) {
if r.URL.Path != "/" {
http.NotFound(w, r)
return
}
- round := func(t time.Duration) time.Duration {
- return t / time.Second * time.Second
- }
df := diskFree()
statusMu.Lock()
data := statusData{
- Total: len(status),
- Uptime: round(time.Now().Sub(processStartTime)),
- Recent: append([]*buildStatus{}, statusDone...),
- DiskFree: df,
- Version: Version,
- NumFD: fdCount(),
- NumGoroutine: runtime.NumGoroutine(),
+ Total: len(status),
+ Uptime: uptime(),
+ Recent: append([]*buildStatus{}, statusDone...),
+ DiskFree: df,
+ Version: Version,
+ NumFD: fdCount(),
+ NumGoroutine: runtime.NumGoroutine(),
+ HealthCheckers: healthCheckers,
}
for _, st := range status {
if atomic.LoadInt32(&st.hasBuildlet) != 0 {
@@ -158,6 +489,7 @@
RemoteBuildlets template.HTML
DiskFree string
Version string
+ HealthCheckers []*healthChecker
}
var statusTmpl = template.Must(template.New("status").Parse(`
@@ -177,6 +509,18 @@
<h2>Running</h2>
<p>{{printf "%d" .Total}} total builds; {{printf "%d" .ActiveBuilds}} active ({{.ActiveReverse}} reverse). Uptime {{printf "%s" .Uptime}}. Version {{.Version}}.
+<h2 id=health>Health <a href='#health'>¶</a></h2>
+<ul>{{range .HealthCheckers}}
+ <li><a href="/status/{{.ID}}">{{.Title}}</a>{{if .EnvURL}} [<a href="{{.EnvURL}}">docs</a>]{{end -}}: {{with .DoCheck.Out}}
+ <ul>
+ {{- range .}}
+ <li>{{ .AsHTML}}</li>
+ {{- end}}
+ </ul>
+ {{else}}ok{{end}}
+ </li>
+{{end}}</ul>
+
<h2 id=trybots>Active Trybot Runs <a href='#trybots'>¶</a></h2>
{{- if .TrybotsErr}}
<b>trybots disabled:</b>: {{.TrybotsErr}}
diff --git a/cmd/coordinator/status_test.go b/cmd/coordinator/status_test.go
index 8e06c34..deaa247 100644
--- a/cmd/coordinator/status_test.go
+++ b/cmd/coordinator/status_test.go
@@ -7,6 +7,8 @@
package main
import (
+ "net/http/httptest"
+ "strings"
"testing"
"time"
)
@@ -35,3 +37,67 @@
}
}
}
+
+// init registers two synthetic health checkers used by
+// TestHandleStatus_HealthFormatting: "allgood", which writes nothing,
+// and "errortest", which writes one line at each level.
+func init() {
+	testCheckers := []*healthChecker{
+		{
+			ID:    "allgood",
+			Title: "All Good Test",
+			Check: func(*checkWriter) {},
+		},
+		{
+			ID:    "errortest",
+			Title: "Error Test",
+			Check: func(cw *checkWriter) {
+				cw.info("test-info")
+				cw.warn("test-warn")
+				cw.error("test-error")
+			},
+		},
+	}
+	for _, hc := range testCheckers {
+		addHealthChecker(hc)
+	}
+}
+
+// TestHandleStatus_HealthFormatting renders the front page and checks
+// the HTML between the "Health" and "Active Trybot Runs" headings for
+// the expected checker links and per-level formatting.
+func TestHandleStatus_HealthFormatting(t *testing.T) {
+	// Empty the global build and trybot state before rendering.
+	statusMu.Lock()
+	for k := range status {
+		delete(status, k)
+	}
+	for k := range tries {
+		delete(tries, k)
+	}
+	tryList = nil
+	statusMu.Unlock()
+
+	rec := httptest.NewRecorder()
+	handleStatus(rec, httptest.NewRequest("GET", "/", nil))
+
+	const pre = "<h2 id=health>Health"
+	const suf = "<h2 id=trybots>Active Trybot Runs"
+	got := rec.Body.String()
+
+	// Keep only what lies between the two headings.
+	start := strings.Index(got, pre)
+	if start == -1 {
+		t.Fatalf("output didn't contain %q: %s", pre, got)
+	}
+	got = got[start+len(pre):]
+	end := strings.Index(got, suf)
+	if end == -1 {
+		t.Fatalf("output didn't contain %q: %s", suf, got)
+	}
+	got = got[:end]
+
+	wantSubs := []string{
+		`<a href="/status/macs">MacStadium Mac VMs</a> [`,
+		`<a href="/status/scaleway">Scaleway linux/arm machines</a> [`,
+		`<li>scaleway-prod-02 not yet connected</li>`,
+		`<li>macstadium_host06a not yet connected</li>`,
+		`<a href="/status/allgood">All Good Test</a>: ok`,
+		`<li>test-info</li>`,
+		`<li><span style='color: orange'>test-warn</span></li>`,
+		`<li><span style='color: red'><b>test-error</b></span></li>`,
+	}
+	for _, sub := range wantSubs {
+		if strings.Contains(got, sub) {
+			continue
+		}
+		t.Errorf("didn't find substring %q in output", sub)
+	}
+	if t.Failed() {
+		t.Logf("Got: %s", got)
+	}
+}