internal/worker: add function to call pkgsite for triage

Rework the pkg.go.dev request in cveModulePath.

Add a separate function that rate-limits and caches the calls to
pkg.go.dev.

Change-Id: If36916983826f50b9f2ac8027c4555460a56655a
Reviewed-on: https://go-review.googlesource.com/c/vuln/+/368434
Trust: Jonathan Amsterdam <jba@google.com>
Run-TryBot: Jonathan Amsterdam <jba@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Julie Qiu <julie@golang.org>
diff --git a/go.mod b/go.mod
index 7ec2e2c..70f6e5d 100644
--- a/go.mod
+++ b/go.mod
@@ -38,6 +38,7 @@
 	golang.org/x/oauth2 v0.0.0-20211005180243-6b3c2da341f1 // indirect
 	golang.org/x/sys v0.0.0-20211025201205-69cdffdb9359 // indirect
 	golang.org/x/text v0.3.7 // indirect
+	golang.org/x/time v0.0.0-20191024005414-555d28b269f0
 	golang.org/x/tools v0.1.8-0.20211029000441-d6a9af8af023
 	golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 // indirect
 	google.golang.org/api v0.60.0
diff --git a/go.sum b/go.sum
index e93dcf8..26a3970 100644
--- a/go.sum
+++ b/go.sum
@@ -679,6 +679,7 @@
 golang.org/x/time v0.0.0-20180412165947-fbb02b2291d2/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
 golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
 golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
+golang.org/x/time v0.0.0-20191024005414-555d28b269f0 h1:/5xXl8Y5W96D+TtHSlonuFqGHIWVuyCkGJLwGh9JJFs=
 golang.org/x/time v0.0.0-20191024005414-555d28b269f0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
 golang.org/x/tools v0.0.0-20180221164845-07fd8470d635/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
 golang.org/x/tools v0.0.0-20180828015842-6cd1fcedba52/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
diff --git a/internal/worker/triage.go b/internal/worker/triage.go
new file mode 100644
index 0000000..07ac2ca
--- /dev/null
+++ b/internal/worker/triage.go
@@ -0,0 +1,132 @@
+// Copyright 2021 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package worker
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"net/http"
+	"net/url"
+	"strings"
+	"time"
+
+	"golang.org/x/exp/event"
+	"golang.org/x/time/rate"
+	"golang.org/x/vuln/internal/cveschema"
+	"golang.org/x/vuln/internal/derrors"
+	"golang.org/x/vuln/internal/worker/log"
+)
+
+var errCVEVersionUnsupported = errors.New("unsupported CVE version") // wrapped by TriageCVE's error for any DataVersion other than "4.0"
+
+var vcsHostsWithThreeElementRepoName = map[string]bool{ // hosts for which cveModulePath takes the first three "/"-separated elements of a reference URL as the candidate module path
+	"bitbucket.org": true,
+	"gitea.com":     true,
+	"gitee.com":     true,
+	"github.com":    true,
+	"gitlab.com":    true,
+	"golang.org":    true, // NOTE(review): also a stdlibKeywords key, and cveModulePath checks those first — this entry looks unreachable; confirm
+}
+
+var stdlibKeywords = map[string]bool{ // substrings of a reference URL that make cveModulePath report "Go Standard Library"
+	"github.com/golang": true,
+	"golang-announce":   true,
+	"golang-nuts":       true,
+	"golang.org":        true,
+}
+
+// TriageCVE reports whether the CVE refers to a Go module, that is, whether
+// cveModulePath finds a non-empty module path for it. Only DataVersion "4.0"
+// is handled; any other version yields an error that wraps
+// errCVEVersionUnsupported.
+func TriageCVE(c *cveschema.CVE) (_ bool, err error) {
+	defer derrors.Wrap(&err, "triageCVE(%q)", c.ID)
+	if c.DataVersion != "4.0" {
+		// TODO(https://golang.org/issue/49289): Add support for v5.0.
+		return false, fmt.Errorf("CVE %q has DataVersion %q: %w", c.ID, c.DataVersion, errCVEVersionUnsupported)
+	}
+	// No caller context is available here, hence context.TODO.
+	modulePath, lookupErr := cveModulePath(context.TODO(), c)
+	if lookupErr != nil {
+		return false, lookupErr
+	}
+	// An empty path means no Go module could be identified from the
+	// CVE's references.
+	return modulePath != "", nil
+}
+
+// cveModulePath scans c's reference URLs and returns "Go Standard Library", a
+// three-element module path known to pkg.go.dev, or "" if nothing matches.
+// TODO(golang/go#49733) Use the CandidateModulePaths function from pkgsite to catch
+// longer module paths, e.g. github.com/pulumi/pulumi/sdk/v2.
+func cveModulePath(ctx context.Context, c *cveschema.CVE) (_ string, err error) {
+	defer derrors.Wrap(&err, "cveModulePath(%q)", c.ID)
+	for _, r := range c.References.Data {
+		if r.URL == "" { // references without a URL carry nothing to match on
+			continue
+		}
+		for k := range stdlibKeywords {
+			if strings.Contains(r.URL, k) { // substring match anywhere in the URL, not only in the host
+				return "Go Standard Library", nil
+			}
+		}
+		for host := range vcsHostsWithThreeElementRepoName { // map iteration order is unspecified
+			if !strings.Contains(r.URL, host) { // again a substring match on the whole URL
+				continue
+			}
+			refURL, err := url.Parse(r.URL) // this err shadows the named result; the returns below pass it explicitly
+			if err != nil {
+				return "", fmt.Errorf("url.Parse(%q): %v", r.URL, err)
+			}
+			u := refURL.Host + refURL.Path // scheme, query and fragment are dropped
+			parts := strings.Split(u, "/")
+			if len(parts) < 3 {
+				continue
+			}
+			mod := strings.Join(parts[0:3], "/") // the parsed host plus the first two path elements
+			known, err := knownToPkgsite(ctx, "https://pkg.go.dev", mod)
+			if err != nil {
+				return "", err // any lookup error aborts the scan of the remaining references
+			}
+			if known {
+				return mod, nil
+			}
+		}
+	}
+	return "", nil // no reference matched: not identified as Go
+}
+
+// Limit pkgsite calls to 2 qps (once every 500ms), allowing bursts of up to 3.
+var pkgsiteRateLimiter = rate.NewLimiter(rate.Every(500*time.Millisecond), 3)
+
+var seenModulePath = map[string]bool{} // cache of knownToPkgsite answers keyed by module path; NOTE(review): read and written without a lock — confirm callers are not concurrent
+
+// knownToPkgsite reports whether the pkgsite instance at url knows modulePath
+// as a module (GET url/mod/modulePath returns 200); answers are cached.
+func knownToPkgsite(ctx context.Context, url, modulePath string) (bool, error) {
+	// If we've seen it before, no need to call.
+	if b, ok := seenModulePath[modulePath]; ok {
+		return b, nil
+	}
+	// Pause to maintain a max QPS.
+	if err := pkgsiteRateLimiter.Wait(ctx); err != nil {
+		return false, err
+	}
+	msg := fmt.Sprintf("call to %s", url)
+	log.Info(ctx, msg+" starting", event.String("path", modulePath))
+	start := time.Now()
+	res, err := http.Get(url + "/mod/" + modulePath)
+	log.Info(ctx, msg+" ended", event.String("path", modulePath),
+		event.Value("latency", time.Since(start)), event.Value("error", err))
+	if err != nil {
+		return false, err
+	}
+	// Only the status code is used; close the body so the connection is released.
+	defer res.Body.Close()
+	known := res.StatusCode == http.StatusOK
+	seenModulePath[modulePath] = known
+	return known, nil
+}
diff --git a/internal/worker/triage_test.go b/internal/worker/triage_test.go
new file mode 100644
index 0000000..6c70c20
--- /dev/null
+++ b/internal/worker/triage_test.go
@@ -0,0 +1,46 @@
+// Copyright 2021 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package worker
+
+import (
+	"context"
+	"net/http"
+	"net/http/httptest"
+	"strings"
+	"testing"
+
+	"golang.org/x/vuln/internal/worker/log"
+)
+
+// TestKnownToPkgsite runs knownToPkgsite against a fake pkgsite that returns 404 for all but one module path.
+func TestKnownToPkgsite(t *testing.T) {
+	const validModule = "golang.org/x/mod"
+	fake := func(w http.ResponseWriter, r *http.Request) {
+		if strings.TrimPrefix(r.URL.Path, "/mod/") != validModule {
+			http.Error(w, "unknown", http.StatusNotFound)
+		}
+	}
+	srv := httptest.NewServer(http.HandlerFunc(fake))
+	defer srv.Close()
+	ctx := log.WithLineLogger(context.Background())
+	cases := []struct {
+		in   string
+		want bool
+	}{
+		{validModule, true},
+		{"github.com/something/something", false},
+	}
+	for _, tc := range cases {
+		t.Run(tc.in, func(t *testing.T) {
+			got, err := knownToPkgsite(ctx, srv.URL, tc.in)
+			if err != nil {
+				t.Fatal(err)
+			}
+			if got != tc.want {
+				t.Errorf("got %t, want %t", got, tc.want)
+			}
+		})
+	}
+}