internal/worker: reset server after certain number of requests
This CL updates gvisor to the newest available version and adds a
workaround for an issue where runsc keeps creating processes that stay
alive until the process limit is reached and the server is killed. The
workaround resets the server after a fixed number of scan requests, so
that Cloud Run restarts the instance before the leak hits the limit; a
standalone sketch of the pattern follows the trailers.
Updates golang/go#65215
Change-Id: I9ac46bdadde3adf78285cd5ed8b42837394ec886
Reviewed-on: https://go-review.googlesource.com/c/pkgsite-metrics/+/557555
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Jonathan Amsterdam <jba@google.com>
Run-TryBot: Zvonimir Pavlinovic <zpavlinovic@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
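
The pattern in this CL, reduced to a standalone sketch (the names
withReset and the handler body below are illustrative, not from the
CL): a middleware counts requests with an atomic counter and exits the
process once a threshold is crossed, relying on the hosting platform
(Cloud Run here) to restart the container and retry the dropped
request.

package main

import (
	"fmt"
	"log"
	"net/http"
	"os"
	"sync/atomic"
)

var reqs atomic.Uint64

// withReset wraps h so that the process exits cleanly once limit
// requests have been served. The request that observes the exceeded
// limit is never handled; the platform is expected to retry it against
// a fresh instance.
func withReset(limit uint64, h http.HandlerFunc) http.HandlerFunc {
	return func(w http.ResponseWriter, r *http.Request) {
		if reqs.Load() > limit {
			os.Exit(0)
		}
		reqs.Add(1)
		h(w, r)
	}
}

func main() {
	http.HandleFunc("/scan/", withReset(500, func(w http.ResponseWriter, r *http.Request) {
		fmt.Fprintln(w, "scanned:", r.URL.Path)
	}))
	log.Fatal(http.ListenAndServe(":8080", nil))
}

The actual change below applies the same idea to the govulncheck and
analysis scan handlers.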
diff --git a/cmd/worker/Dockerfile b/cmd/worker/Dockerfile
index 4cf6497..1c84398 100644
--- a/cmd/worker/Dockerfile
+++ b/cmd/worker/Dockerfile
@@ -42,7 +42,7 @@
#### Sandbox setup
# Install runsc.
-ADD https://storage.googleapis.com/gvisor/releases/release/20230320.0/x86_64/runsc /usr/local/bin/
+ADD https://storage.googleapis.com/gvisor/releases/release/20240115.0/x86_64/runsc /usr/local/bin/
RUN chmod a+rx /usr/local/bin/runsc
# Set up for runsc.
diff --git a/internal/worker/govulncheck_scan.go b/internal/worker/govulncheck_scan.go
index 9824151..e7ef7d7 100644
--- a/internal/worker/govulncheck_scan.go
+++ b/internal/worker/govulncheck_scan.go
@@ -83,7 +83,6 @@
// Collect basic metrics.
gReqCounter.Record(r.Context(), 1)
- h.Server.reqs++
skip := false // request skipped
defer func() {
gSuccCounter.Record(r.Context(), 1, event.Bool("success", err == nil))
diff --git a/internal/worker/server.go b/internal/worker/server.go
index ef695c2..8d203af 100644
--- a/internal/worker/server.go
+++ b/internal/worker/server.go
@@ -13,6 +13,7 @@
"os"
"strings"
"sync"
+ "sync/atomic"
"time"
"cloud.google.com/go/errorreporting"
@@ -39,7 +40,9 @@
// Firestore namespace for storing work versions.
fsNamespace *fstore.Namespace
- reqs int // for debugging
+ // reqs is the number of incoming scan requests, both analysis and
+ // govulncheck. Used for monitoring, debugging, and server restart.
+ reqs atomic.Uint64
devMode bool
mu sync.Mutex
@@ -47,7 +50,7 @@
// Info summarizes Server execution as text.
func (s *Server) Info() string {
- return fmt.Sprintf("total requests: %d", s.reqs)
+ return fmt.Sprintf("total requests: %d", s.reqs.Load())
}
func NewServer(ctx context.Context, cfg *config.Config) (_ *Server, err error) {
@@ -202,7 +205,7 @@
h := newGovulncheckServer(s)
s.handle("/govulncheck/enqueueall", h.handleEnqueueAll)
s.handle("/govulncheck/enqueue", h.handleEnqueue)
- s.handle("/govulncheck/scan/", h.handleScan)
+ s.handle("/govulncheck/scan/", reqMonitorHandler(s, h.handleScan))
}
func (s *Server) registerAnalysisHandlers(ctx context.Context) error {
@@ -210,11 +213,29 @@
if err != nil {
return err
}
- s.handle("/analysis/scan/", h.handleScan)
+ s.handle("/analysis/scan/", reqMonitorHandler(s, h.handleScan))
s.handle("/analysis/enqueue", h.handleEnqueue)
return nil
}
+// reqMonitorHandler wraps h in a handler that 1) updates server request
+// statistics and 2) resets the server after a certain number of incoming
+// scan requests. The incoming request that triggers the reset is
+// disregarded; Cloud Run will retry it.
+func reqMonitorHandler(s *Server, h func(http.ResponseWriter, *http.Request) error) func(http.ResponseWriter, *http.Request) error {
+ const reqLimit = 500 // experimentally shown to be a good threshold
+ return func(w http.ResponseWriter, r *http.Request) error {
+ // Reset the server after a certain number of requests due to a process leak.
+ // TODO(#65215): why does this happen? It seems to be due to gvisor.
+ if s.reqs.Load() > reqLimit {
+ log.Infof(r.Context(), "resetting server after %d requests, just before: %v", reqLimit, r.URL.Path)
+ os.Exit(0)
+ }
+ s.reqs.Add(1)
+ return h(w, r)
+ }
+}
+
type serverError struct {
status int // HTTP status code
err error // wrapped error
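
One caveat about reqMonitorHandler above: the Load/Add pair is not a
single atomic step, so concurrent requests can push the counter
slightly past reqLimit before a reset fires. Since the limit is an
experimentally chosen heuristic, the overshoot is harmless. A variant
that folds the check into the increment would look like this (a sketch
against the same Server type, not part of the CL; the triggering
request is counted but still dropped):

// reqMonitorHandlerAtomic is a hypothetical variant: Add both counts
// the request and reports whether it crossed the limit, so no two
// goroutines can interleave between the read and the write.
func reqMonitorHandlerAtomic(s *Server, h func(http.ResponseWriter, *http.Request) error) func(http.ResponseWriter, *http.Request) error {
	const reqLimit = 500
	return func(w http.ResponseWriter, r *http.Request) error {
		if s.reqs.Add(1) > reqLimit {
			log.Infof(r.Context(), "resetting server after %d requests, just before: %v", reqLimit, r.URL.Path)
			os.Exit(0)
		}
		return h(w, r)
	}
}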