internal/cvelist: find Go CVEs using references data

Initial logic is added for finding Go CVEs from the CVE list repository.

For each CVE created in 2020 or after, we check if the references
contain a URL that might be a link to a common VCS host (such as
github.com). If so, we parse the module path from that URL, and check if
that page exists on pkg.go.dev.

If so, we can assume that this is likely a Go-related CVE, and a GitHub
issue should be created.

Change-Id: Iaedad20f7b055ebcf814b4b3caf310d5356839ad
Reviewed-on: https://go-review.googlesource.com/c/vulndb/+/356173
Trust: Julie Qiu <julie@golang.org>
Run-TryBot: Julie Qiu <julie@golang.org>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Roland Shoemaker <roland@golang.org>
diff --git a/internal/cvelist/cvelist.go b/internal/cvelist/cvelist.go
index 158d756..f90d8a1 100644
--- a/internal/cvelist/cvelist.go
+++ b/internal/cvelist/cvelist.go
@@ -10,6 +10,8 @@
 	"encoding/json"
 	"fmt"
 	"log"
+	"net/http"
+	"net/url"
 	"path"
 	"strings"
 
@@ -19,17 +21,21 @@
 	"github.com/go-git/go-git/v5/plumbing/object"
 	"github.com/go-git/go-git/v5/storage/memory"
 	"golang.org/x/vulndb/internal/cveschema"
+	"golang.org/x/vulndb/internal/report"
 )
 
 // Run clones the CVEProject/cvelist repository and compares the files to the
 // existing triaged-cve-list.
 func Run(triaged map[string]bool) error {
-	log.Printf("cloning %q", cvelistRepoURL)
+	log.Printf("cloning %q...", cvelistRepoURL)
 	repo, root, err := cloneRepo(cvelistRepoURL)
 	if err != nil {
 		return err
 	}
-	return walkRepo(repo, root, "", triaged)
+	if err := createIssuesToTriage(repo, root, triaged); err != nil {
+		return err
+	}
+	return nil
 }
 
 const cvelistRepoURL = "https://github.com/CVEProject/cvelist"
@@ -63,9 +69,24 @@
 	return repo, root, nil
 }
 
+// createIssuesToTriage gathers, via walkRepo, the issues for CVEs under t
+// that are not yet marked in triaged, so that GitHub issues can be filed for
+// the Go security team.
+// TODO: Create GitHub issues; for now only the number of candidates is logged.
+func createIssuesToTriage(r *git.Repository, t *object.Tree, triaged map[string]bool) (err error) {
+	log.Printf("creating issues to triage...")
+	var candidates []*GoVulnIssue
+	if candidates, err = walkRepo(r, t, "", triaged); err != nil {
+		return err
+	}
+	// TODO: create GitHub issues.
+	log.Printf("found %d new issues", len(candidates))
+	return nil
+}
+
 // walkRepo looks at the files in t, recursively, and check if it is a CVE that
 // needs to be manually triaged.
-func walkRepo(r *git.Repository, t *object.Tree, dirpath string, triaged map[string]bool) (err error) {
+func walkRepo(r *git.Repository, t *object.Tree, dirpath string, triaged map[string]bool) (issues []*GoVulnIssue, err error) {
 	for _, e := range t.Entries {
 		fp := path.Join(dirpath, e.Name)
 		if !strings.HasPrefix(fp, "202") {
@@ -75,11 +96,13 @@
 		case filemode.Dir:
 			t2, err := r.TreeObject(e.Hash)
 			if err != nil {
-				return err
+				return nil, err
 			}
-			if err := walkRepo(r, t2, fp, triaged); err != nil {
-				return err
+			currIssues, err := walkRepo(r, t2, fp, triaged)
+			if err != nil {
+				return nil, err
 			}
+			issues = append(issues, currIssues...)
 		default:
 			if !strings.HasPrefix(e.Name, "CVE-") {
 				continue
@@ -88,14 +111,20 @@
 			if triaged[cveID] {
 				continue
 			}
-			_, err = parseCVE(r, e)
+			c, err := parseCVE(r, e)
 			if err != nil {
-				return err
+				return nil, err
 			}
-			// TODO: implement triage CVE logic
+			issue, err := cveToIssue(c)
+			if err != nil {
+				return nil, err
+			}
+			if issue != nil {
+				issues = append(issues, issue)
+			}
 		}
 	}
-	return nil
+	return issues, nil
 }
 
 // parseCVEJSON parses a CVE file following the CVE JSON format:
@@ -125,3 +154,129 @@
 	}
 	return &c, nil
 }
+
+// cveToIssue builds the GoVulnIssue for c. It returns (nil, nil) when c is
+// still pending or when no module path can be derived from its references.
+func cveToIssue(c *cveschema.CVE) (*GoVulnIssue, error) {
+	if isPendingCVE(c) {
+		return nil, nil
+	}
+	modulePath, err := modulePathFromCVE(c)
+	if err != nil {
+		return nil, fmt.Errorf("modulePathFromCVE: %v", err)
+	}
+	if modulePath == "" {
+		return nil, nil
+	}
+	// TODO: implement additional checks on description and vendor information.
+
+	var links report.Links
+	for _, ref := range c.References.ReferenceData {
+		switch {
+		case links.Commit == "" && strings.Contains(ref.URL, "/commit/"):
+			links.Commit = ref.URL
+		case links.PR == "" && strings.Contains(ref.URL, "/pull/"):
+			links.PR = ref.URL
+		default:
+			links.Context = append(links.Context, ref.URL)
+		}
+	}
+	// The last problem type description that mentions a CWE wins.
+	var cwe string
+	for _, problem := range c.Problemtype.ProblemtypeData {
+		for _, desc := range problem.Description {
+			if strings.Contains(desc.Value, "CWE") {
+				cwe = desc.Value
+			}
+		}
+	}
+	return &GoVulnIssue{
+		Report: report.Report{
+			Module:      modulePath,
+			Links:       links,
+			CVE:         c.CVEDataMeta.ID,
+			Description: description(c),
+		},
+		AdditionalInfo: AdditionalInfo{Products: products(c), CWE: cwe},
+	}, nil
+}
+
+// isPendingCVE reports whether c is still reserved and so not ready for triage.
+func isPendingCVE(c *cveschema.CVE) bool {
+	state := c.CVEDataMeta.STATE
+	return state == cveschema.StateReserved
+}
+
+var vcsHostsWithThreeElementRepoName = map[string]bool{ // hosts whose URLs yield a candidate module path from their first three elements
+	"bitbucket.org": true,
+	"gitea.com":     true,
+	"gitee.com":     true,
+	"github.com":    true,
+	"gitlab.com":    true,
+	"golang.org":    true,
+}
+
+// modulePathFromCVE returns a Go module path for a CVE, if we can determine
+// what it is. A reference is considered only when its host is a known VCS
+// host; the first host/elem/elem path that exists on pkg.go.dev is returned.
+func modulePathFromCVE(c *cveschema.CVE) (string, error) {
+	for _, ref := range c.References.ReferenceData {
+		if ref.URL == "" {
+			continue
+		}
+		refURL, err := url.Parse(ref.URL)
+		if err != nil {
+			return "", fmt.Errorf("url.Parse(%q): %v", ref.URL, err)
+		}
+		// Match the parsed host exactly: a substring test on the raw URL also hits hosts named in a path or query.
+		if !vcsHostsWithThreeElementRepoName[refURL.Host] {
+			continue
+		}
+		parts := strings.Split(refURL.Host+refURL.Path, "/")
+		if len(parts) < 3 || parts[1] == "" || parts[2] == "" {
+			continue
+		}
+		mod := strings.Join(parts[0:3], "/")
+		resp, err := http.DefaultClient.Get(fmt.Sprintf("https://pkg.go.dev/%s", mod))
+		if err != nil {
+			return "", err
+		}
+		resp.Body.Close() // only the status code is needed; closing releases the connection
+		if resp.StatusCode == http.StatusOK {
+			return mod, nil
+		}
+	}
+	return "", nil
+}
+
+// GoVulnIssue represents a GitHub issue to be created about a Go
+// vulnerability.
+type GoVulnIssue struct {
+	AdditionalInfo AdditionalInfo // CVE details that report.Report does not capture
+	Report         report.Report  // report fields filled in from the CVE by cveToIssue
+}
+
+// AdditionalInfo contains additional information about the CVE not captured by
+// report.Report.
+type AdditionalInfo struct {
+	CWE      string                       // problem type description mentioning "CWE", if any
+	Products []*cveschema.ProductDataItem // products listed under the CVE's vendors
+}
+
+func description(c *cveschema.CVE) string {
+	values := make([]string, 0, len(c.Description.DescriptionData))
+	for _, entry := range c.Description.DescriptionData {
+		values = append(values, entry.Value)
+	}
+	return strings.Join(values, "| \n ") // every description entry, in source order
+}
+
+// products returns a pointer to each product listed under each vendor of c.
+func products(c *cveschema.CVE) (pds []*cveschema.ProductDataItem) {
+	for _, v := range c.Affects.Vendor.VendorData {
+		for i := range v.Product.ProductData {
+			pds = append(pds, &v.Product.ProductData[i]) // element address; &loopVar would alias one variable before Go 1.22
+		}
+	}
+	return pds
+}