cmd/coordinator: report errors on all gitmirror instances
There have been 2 gitmirror instances running since 2017 (CL 36801).
The gitmirror health check has been relying on fetching the HTML
status page from the gitmirror service via its load balancer.
If one of the two gitmirror instances has a problem, the health
check can miss it and report all okay. This caused the health check
to appear flaky and untrustworthy, even though the underlying failure
in gitmirror is reported reliably.
Start querying all gitmirror instances and report combined errors.
Also report a problem if there are no gitmirror instances running.
Remove the /debug/watcher/ debug endpoint. Its current implementation
is unhelpful because the load balancer causes it to display the status
of a single instance. Making it useful would require significant rework.
In the meantime, the status pages of individual gitmirror instances can
be accessed by using the kubectl port-forward command.
Fixes golang/go#37828.
Change-Id: I3f3e322a85e07f23f18a56a7fd913abed75ee77e
Reviewed-on: https://go-review.googlesource.com/c/build/+/226678
Reviewed-by: Alexander Rakoczy <alex@golang.org>
Reviewed-by: Carlos Amedee <carlos@golang.org>
diff --git a/cmd/coordinator/coordinator.go b/cmd/coordinator/coordinator.go
index 07237e3..6a719a5 100644
--- a/cmd/coordinator/coordinator.go
+++ b/cmd/coordinator/coordinator.go
@@ -31,7 +31,6 @@
"log"
"net"
"net/http"
- "net/http/httputil"
_ "net/http/pprof"
"net/url"
"os"
@@ -325,7 +324,6 @@
protos.RegisterCoordinatorServer(grpcServer, gs)
http.HandleFunc("/", handleStatus)
http.HandleFunc("/debug/goroutines", handleDebugGoroutines)
- http.HandleFunc("/debug/watcher/", handleDebugWatcher)
http.HandleFunc("/builders", handleBuilders)
http.HandleFunc("/temporarylogs", handleLogs)
http.HandleFunc("/reverse", handleReverse)
@@ -439,33 +437,6 @@
http.Redirect(w, req, u.String(), http.StatusMovedPermanently)
}
-// watcherProxy is the proxy which forwards from
-// https://farmer.golang.org/ to the gitmirror kubernetes service (git
-// cache+sync).
-// This is used for /debug/watcher/<reponame> status pages, which are
-// served at the same URL paths for both the farmer.golang.org host
-// and the internal backend. (The name "watcher" is old; it's now called
-// "gitmirror" but the URL path remains for now.)
-var watcherProxy *httputil.ReverseProxy
-
-func init() {
- u, err := url.Parse("http://gitmirror/") // unused hostname
- if err != nil {
- log.Fatal(err)
- }
- watcherProxy = httputil.NewSingleHostReverseProxy(u)
- watcherProxy.Transport = &http.Transport{
- IdleConnTimeout: 30 * time.Second,
- DialContext: func(ctx context.Context, network, addr string) (net.Conn, error) {
- return goKubeClient.DialServicePort(ctx, "gitmirror", "")
- },
- }
-}
-
-func handleDebugWatcher(w http.ResponseWriter, r *http.Request) {
- watcherProxy.ServeHTTP(w, r)
-}
-
func stagingClusterBuilders() map[string]*dashboard.BuildConfig {
m := map[string]*dashboard.BuildConfig{}
for _, name := range []string{
diff --git a/cmd/coordinator/status.go b/cmd/coordinator/status.go
index 3e57f74..66a92e4 100644
--- a/cmd/coordinator/status.go
+++ b/cmd/coordinator/status.go
@@ -18,6 +18,7 @@
"html/template"
"io"
"io/ioutil"
+ "log"
"net/http"
"os"
"os/exec"
@@ -32,6 +33,7 @@
"golang.org/x/build/cmd/coordinator/internal"
"golang.org/x/build/dashboard"
"golang.org/x/build/internal/foreach"
+ "golang.org/x/build/kubernetes/api"
)
// status
@@ -176,15 +178,41 @@
}
}
-// $1 is repo; $2 is error message
-var gitMirrorLineRx = regexp.MustCompile(`/debug/watcher/([\w-]+).?>.+</a> - (.*)`)
-
+// gitMirrorErrors queries the status pages of all
+// running gitmirror instances and reports errors.
func gitMirrorErrors() (errs []string) {
- ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
+ ctx, cancel := context.WithTimeout(context.Background(), 20*time.Second)
defer cancel()
- req, _ := http.NewRequest("GET", "http://gitmirror/", nil)
- req = req.WithContext(ctx)
- res, err := watcherProxy.Transport.RoundTrip(req)
+ pods, err := goKubeClient.GetPods(ctx)
+ if err != nil {
+ log.Println("gitMirrorErrors: goKubeClient.GetPods:", err)
+ return []string{"failed to get pods; can't query gitmirror status"}
+ }
+ var runningGitMirror []api.Pod
+ for _, p := range pods {
+ if p.Labels["app"] != "gitmirror" || p.Status.Phase != "Running" {
+ continue
+ }
+ runningGitMirror = append(runningGitMirror, p)
+ }
+ if len(runningGitMirror) == 0 {
+ return []string{"no running gitmirror instances"}
+ }
+ for _, pod := range runningGitMirror {
+ // The gitmirror -http=:8585 status page URL is hardcoded here.
+ // If the ReplicationController configuration changes (rare), this
+ // health check will begin to fail until it's updated accordingly.
+ instanceErrors := gitMirrorInstanceErrors(ctx, fmt.Sprintf("http://%s:8585/", pod.Status.PodIP))
+ for _, err := range instanceErrors {
+ errs = append(errs, fmt.Sprintf("instance %s: %s", pod.Name, err))
+ }
+ }
+ return errs
+}
+
+func gitMirrorInstanceErrors(ctx context.Context, url string) (errs []string) {
+ req, _ := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
+ res, err := http.DefaultClient.Do(req)
if err != nil {
return []string{err.Error()}
}
@@ -220,13 +248,20 @@
return errs
}
+// $1 is repo; $2 is error message
+var gitMirrorLineRx = regexp.MustCompile(`/debug/watcher/([\w-]+).?>.+</a> - (.*)`)
+
func newGitMirrorChecker() *healthChecker {
return &healthChecker{
ID: "gitmirror",
Title: "Git mirroring",
DocURL: "https://github.com/golang/build/tree/master/cmd/gitmirror",
Check: func(w *checkWriter) {
- ee, _ := lastGitMirrorErrors.Load().([]string)
+ ee, ok := lastGitMirrorErrors.Load().([]string)
+ if !ok {
+ w.warn("still checking")
+ return
+ }
for _, v := range ee {
w.error(v)
}