internal/worker: don't call pkgsite for known non-modules

Requests to pkg.go.dev are throttled to a QPS limit, so they are expensive.
Many of the calls are unnecessary because the paths are known not
to be Go modules.

Keep a list of patterns that match such paths and check them before
calling pkgsite.

Change-Id: I5e80d0c494510824939964b72466605ecc5cacbe
Reviewed-on: https://go-review.googlesource.com/c/vuln/+/370394
Trust: Jonathan Amsterdam <jba@google.com>
Run-TryBot: Jonathan Amsterdam <jba@google.com>
Reviewed-by: Julie Qiu <julie@golang.org>
diff --git a/internal/worker/paths.go b/internal/worker/paths.go
index 0672a21..dc5f4cd 100644
--- a/internal/worker/paths.go
+++ b/internal/worker/paths.go
@@ -6,6 +6,7 @@
 
 import (
 	"path"
+	"regexp"
 	"strings"
 
 	"golang.org/x/mod/module"
@@ -31,10 +32,138 @@
 	}
 }
 
+// negativePrefixPatterns is a list of glob patterns that describe prefixes of
+// potential module paths that are known not to be modules. These are turned
+// into regexps below and checked against each module path before calling
+// pkgsite. This can speed up triage because pkgsite requests are throttled.
+var negativePrefixPatterns = []string{
+	"*.blogspot.com",
+	"*.blogspot.dk",
+	"*.readthedocs.org",
+	"archives.neohapsis.com",
+	"archives.neohapsis.com/archives/bugtraq",
+	"blog.*",
+	"blog.python.org",
+	"blogs.oracle.com",
+	"blogs.technet.com",
+	"bugs.*",
+	"bugzilla.*",
+	"cr.yp.to/talks",
+	"crbug.com",
+	"developer.mozilla.org/docs",
+	"developer.mozilla.org/en-US/docs",
+	"docs.google.com",
+	"docs.microsoft.com",
+	"drupal.org/node",
+	"erpscan.com/advisories",
+	"exchange.xforce.ibmcloud.com",
+	"github.com/*/*/blob",
+	"github.com/*/*/commit",
+	"github.com/*/*/issues",
+	"github.com/torvalds/linux/commit",
+	"groups.google.com",
+	"helpx.adobe.com/security",
+	"hg.openjdk.java.net",
+	"ics-cert.us-cert.gov",
+	"issues.apache.org",
+	"jira.*",
+	"jvn.jp",
+	"jvndb.jvn.jp",
+	"krebsonsecurity.com",
+	"labs.mwrinfosecurity.com/advisories",
+	"lists.*/archive",
+	"lists.*/archives",
+	"lists.*/pipermail",
+	"lists.opensuse.org",
+	"lists.ubuntu.com",
+	"mail-archives.*",
+	"mail.*.org/archives",
+	"mail.*/pipermail",
+	"mailman.*.org/archives",
+	"mailman.*.org/pipermail",
+	"nodesecurity.io/advisories",
+	"openwall.com/lists",
+	"oss.oracle.com/pipermail",
+	"osvdb.org",
+	"owncloud.org/about/security",
+	"packetstormsecurity.com/files",
+	"plus.google.com",
+	"puppetlabs.com/security",
+	"raw.github.com",
+	"rhn.redhat.com/errata",
+	"seclists.org",
+	"secunia.com/advisories",
+	"secunia.com/secunia_research",
+	"security.gentoo.org/glsa",
+	"service.sap.com",
+	"subversion.apache.org/security",
+	"support.*",
+	"technet.microsoft.com/en-us/security",
+	"technet.microsoft.com/security",
+	"tools.cisco.com/security/center",
+	"twitter.com",
+	"ubuntu.com/usn",
+	"weblog.*",
+	"www.adobe.com/support/security",
+	"www.bugzilla.org/security",
+	"www.coresecurity.com/advisories",
+	"www.debian.org/security",
+	"www.drupal.org/node",
+	"www.exploit-db.com",
+	"www.htbridge.com/advisory",
+	"www.ibm.com/developerworks/java",
+	"www.kb.cert.org",
+	"www.kernel.org/pub/linux/kernel/v3*/ChangeLog*",
+	"www.mozilla.org/security",
+	"www.openwall.com/lists",
+	"www.oracle.com/technetwork",
+	"www.osvdb.org",
+	"www.phpmyadmin.net/home_page/security",
+	"www.portcullis-security.com/security-research-and-downloads",
+	"www.postgresql.org/docs",
+	"www.redhat.com/archives",
+	"www.samba.org/samba/security",
+	"www.security-assessment.com/files",
+	"www.securityfocus.com",
+	"www.securitytracker.com",
+	"www.sophos.com/en-us/support",
+	"www.suse.com/support",
+	"www.ubuntu.com/usn",
+	"www.us-cert.gov/cas",
+	"www.us-cert.gov/ncas",
+	"www.vmware.com/security/advisories",
+	"www.wireshark.org/security",
+	"www.zerodayinitiative.com/advisories",
+	"zerodayinitiative.com/advisories",
+}
+
+var negativeRegexps []*regexp.Regexp
+
+func init() {
+	rep := strings.NewReplacer(".", `\.`, "*", `[^/]*`)
+	for _, pat := range negativePrefixPatterns {
+		r := "^" + rep.Replace(pat) + "($|/)"
+		negativeRegexps = append(negativeRegexps, regexp.MustCompile(r))
+	}
+}
+
+// matchesNegativeRegexp reports whether s matches any element of negativeRegexps.
+func matchesNegativeRegexp(s string) bool {
+	for _, nr := range negativeRegexps {
+		if nr.MatchString(s) {
+			return true
+		}
+	}
+	return false
+}
+
 // candidateModulePaths returns the potential module paths that could contain
 // the fullPath, from longest to shortest. It returns nil if no valid module
 // paths can be constructed.
 func candidateModulePaths(fullPath string) []string {
+	if matchesNegativeRegexp(fullPath) {
+		return nil
+	}
 	if stdlibContains(fullPath) {
 		if err := module.CheckImportPath(fullPath); err != nil {
 			return nil
diff --git a/internal/worker/paths_test.go b/internal/worker/paths_test.go
index 5a4045f..ce027e8 100644
--- a/internal/worker/paths_test.go
+++ b/internal/worker/paths_test.go
@@ -54,3 +54,26 @@
 		}
 	}
 }
+
+func TestMatchesNegativeRegexp(t *testing.T) {
+	for _, test := range []struct {
+		in   string
+		want bool
+	}{
+		{"groups.google.com", true},
+		{"groupsgooglecom", false},
+		{"groups.google.com/foo", true},
+		{"groups.google.comics.org", false},
+		{"some/groups.google.com", false},
+		{"lists.ubuntu.com", true},
+		{"lists.ubuntu.com/pipermail", true},
+		{"bugzilla.anything.org", true},
+		{"github.com/evacchi/flatpress/issues/14", true},
+		{"github.com/evacchi/issues/14", false},
+	} {
+		got := matchesNegativeRegexp(test.in)
+		if got != test.want {
+			t.Errorf("%s: got %t, want %t", test.in, got, test.want)
+		}
+	}
+}