cmd/runqemubuildlet: restart unresponsive qemu processes

Expose the healthz port from the buildlet running under QEMU, and
periodically check it for a successful response. If it has been failing
for longer than ten minutes, try to restart the VM. This should
successfully restart VMs that failed to boot, failed to shut down, or
are otherwise unresponsive.

For golang/go#47018

Change-Id: I9218f94ee24de6e0a56ad60a18e075ce48893938
Reviewed-on: https://go-review.googlesource.com/c/build/+/336109
Trust: Alexander Rakoczy <alex@golang.org>
Run-TryBot: Alexander Rakoczy <alex@golang.org>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Dmitri Shuralyov <dmitshur@golang.org>
Reviewed-by: Carlos Amedee <carlos@golang.org>
diff --git a/cmd/runqemubuildlet/heartbeat.go b/cmd/runqemubuildlet/heartbeat.go
new file mode 100644
index 0000000..8aed2d8
--- /dev/null
+++ b/cmd/runqemubuildlet/heartbeat.go
@@ -0,0 +1,70 @@
+// Copyright 2021 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build go1.16
+// +build go1.16
+
+package main
+
+import (
+	"context"
+	"fmt"
+	"io"
+	"net/http"
+	"time"
+
+	"golang.org/x/build/internal"
+)
+
+// buildletHealthTimeout bounds how long a single checkBuildletHealth
+// request may take before it is abandoned.
+const buildletHealthTimeout = 10 * time.Second
+
+// checkBuildletHealth issues a GET request to url. It returns an error unless
+// the body is read and http.StatusOK is seen within buildletHealthTimeout.
+func checkBuildletHealth(ctx context.Context, url string) error {
+	reqCtx, done := context.WithTimeout(ctx, buildletHealthTimeout)
+	defer done()
+	req, err := http.NewRequestWithContext(reqCtx, http.MethodGet, url, nil)
+	if err != nil {
+		return err
+	}
+	resp, err := http.DefaultClient.Do(req)
+	if err != nil {
+		return err
+	}
+	defer resp.Body.Close()
+	_, drainErr := io.Copy(io.Discard, resp.Body)
+	switch {
+	case drainErr != nil:
+		return drainErr
+	case resp.StatusCode != http.StatusOK:
+		return fmt.Errorf("resp.StatusCode = %d, wanted %d", resp.StatusCode, http.StatusOK)
+	}
+	return nil
+}
+
+// heartbeatContext calls f once per period. If every call to f returns
+// an error for longer than timeout, the context returned by
+// heartbeatContext is cancelled and heartbeatContext stops calling f.
+//
+// A call to f that returns nil restarts the timeout window, unless
+// heartbeatContext has already timed out. The returned function
+// cancels the returned context.
+func heartbeatContext(ctx context.Context, period time.Duration, timeout time.Duration, f func(context.Context) error) (context.Context, func()) {
+	hbCtx, stop := context.WithCancel(ctx)
+
+	lastOK := time.Now()
+	tick := func(ctx context.Context, now time.Time) {
+		// A success restarts the window; a failure outside it gives up.
+		if err := f(ctx); err == nil {
+			lastOK = now
+		} else if now.Sub(lastOK) > timeout {
+			stop()
+		}
+	}
+	go internal.PeriodicallyDo(hbCtx, period, tick)
+
+	return hbCtx, stop
+}
diff --git a/cmd/runqemubuildlet/heartbeat_test.go b/cmd/runqemubuildlet/heartbeat_test.go
new file mode 100644
index 0000000..c69bc04
--- /dev/null
+++ b/cmd/runqemubuildlet/heartbeat_test.go
@@ -0,0 +1,94 @@
+// Copyright 2021 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build go1.16
+// +build go1.16
+
+package main
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"net/http"
+	"net/http/httptest"
+	"net/url"
+	"testing"
+	"time"
+)
+
+// TestCheckBuildletHealth verifies that checkBuildletHealth succeeds on a 200 response and fails on a 502.
+func TestCheckBuildletHealth(t *testing.T) {
+	cases := []struct {
+		desc     string
+		respCode int
+		wantErr  bool
+	}{
+		{
+			desc:     "success",
+			respCode: http.StatusOK,
+		},
+		{
+			desc:     "failure",
+			respCode: http.StatusBadGateway,
+			wantErr:  true,
+		},
+	}
+	for _, c := range cases {
+		t.Run(c.desc, func(t *testing.T) {
+			m := http.NewServeMux()
+			m.HandleFunc("/healthz", func(w http.ResponseWriter, req *http.Request) {
+				w.WriteHeader(c.respCode)
+				fmt.Fprintln(w, "ok")
+			})
+			s := httptest.NewServer(m)
+			defer s.Close()
+			u, err := url.Parse(s.URL)
+			if err != nil {
+				t.Fatalf("url.Parse(%q) = %v, wanted no error", s.URL, err)
+			}
+			u.Path = "/healthz"
+			if err := checkBuildletHealth(context.Background(), u.String()); (err != nil) != c.wantErr {
+				t.Errorf("checkBuildletHealth(_, %q) = %v, wantErr: %t", u.String(), err, c.wantErr)
+			}
+		})
+	}
+}
+
+// TestHeartbeatContext checks that heartbeatContext calls f repeatedly and times out once f keeps failing.
+func TestHeartbeatContext(t *testing.T) {
+	const timeout = 100 * time.Millisecond
+	didWork := make(chan interface{}, 2)
+	done := make(chan interface{})
+	ctx, cancel := heartbeatContext(context.Background(), time.Millisecond, timeout, func(context.Context) error {
+		select {
+		case <-done:
+			return errors.New("heartbeat stopped")
+		case didWork <- nil:
+		default:
+		}
+		return nil
+	})
+	defer cancel()
+
+	select {
+	case <-time.After(5 * time.Second):
+		t.Fatalf("heartbeatContext() never called f, wanted at least one call")
+	case <-didWork:
+	}
+
+	select {
+	case <-time.After(5 * time.Second):
+		t.Fatalf("heartbeatContext() called f only once, wanted repeated calls")
+	case <-didWork:
+		close(done) // every later call to f now fails
+	}
+
+	select {
+	case <-time.After(5 * time.Second):
+		t.Errorf("heartbeatContext() did not timeout, wanted timeout after failing over %v", timeout)
+	case <-ctx.Done():
+		// heartbeatContext() successfully timed out after failing
+	}
+}
diff --git a/cmd/runqemubuildlet/main.go b/cmd/runqemubuildlet/main.go
index a8c296a..e0cac9e 100644
--- a/cmd/runqemubuildlet/main.go
+++ b/cmd/runqemubuildlet/main.go
@@ -24,6 +24,7 @@
 
 var (
 	windows10Path = flag.String("windows-10-path", defaultWindowsDir(), "Path to Windows image and QEMU dependencies.")
+	healthzURL    = flag.String("buildlet-healthz-url", "http://localhost:8080/healthz", "URL to buildlet /healthz endpoint.")
 )
 
 func main() {
@@ -33,22 +34,32 @@
 	defer stop()
 
 	for ctx.Err() == nil {
-		cmd := windows10Cmd(*windows10Path)
-		log.Printf("Starting VM: %s", cmd.String())
-		cmd.Stdout = os.Stdout
-		cmd.Stderr = os.Stderr
-		if err := cmd.Start(); err != nil {
-			log.Printf("cmd.Start() = %v. Retrying in 10 seconds.", err)
+		if err := runWindows10(ctx); err != nil {
+			log.Printf("runWindows10() = %v. Retrying in 10 seconds.", err)
 			time.Sleep(10 * time.Second)
 			continue
 		}
-		if err := internal.WaitOrStop(ctx, cmd, os.Interrupt, time.Minute); err != nil {
-			log.Printf("waitOrStop(_, %v, %v, %v) = %v. Retrying in 10 seconds.", cmd, os.Interrupt, time.Minute, err)
-			time.Sleep(10 * time.Second)
-		}
 	}
 }
 
+// runWindows10 starts the Windows 10 VM and blocks in internal.WaitOrStop. Its
+// context is also cancelled once the healthz check fails for over ten minutes.
+func runWindows10(ctx context.Context) error {
+	cmd := windows10Cmd(*windows10Path)
+	log.Printf("Starting VM: %s", cmd.String())
+	cmd.Stdout, cmd.Stderr = os.Stdout, os.Stderr
+	if err := cmd.Start(); err != nil {
+		return fmt.Errorf("cmd.Start() = %w", err)
+	}
+	check := func(ctx context.Context) error { return checkBuildletHealth(ctx, *healthzURL) }
+	ctx, cancel := heartbeatContext(ctx, 30*time.Second, 10*time.Minute, check)
+	defer cancel()
+	if err := internal.WaitOrStop(ctx, cmd, os.Interrupt, time.Minute); err != nil {
+		return fmt.Errorf("WaitOrStop(_, %v, %v, %v) = %w", cmd, os.Interrupt, time.Minute, err)
+	}
+	return nil
+}
+
 // defaultWindowsDir returns a default path for a Windows VM.
 //
 // The directory should contain the Windows VM image, and UTM
@@ -81,7 +92,7 @@
 		"-device", "usb-mouse,bus=usb-bus.0",
 		"-device", "usb-kbd,bus=usb-bus.0",
 		"-device", "virtio-net-pci,netdev=net0",
-		"-netdev", "user,id=net0",
+		"-netdev", "user,id=net0,hostfwd=tcp::8080-:8080",
 		"-bios", filepath.Join(base, "Images/QEMU_EFI.fd"),
 		"-device", "nvme,drive=drive0,serial=drive0,bootindex=0",
 		"-drive", fmt.Sprintf("if=none,media=disk,id=drive0,file=%s,cache=writethrough", filepath.Join(base, "Images/win10.qcow2")),