// blob: 0883e68ff30307f1e61ac425e699a5d27ede3408 [file] [log] [blame]
// Copyright 2022 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package govulncheck
import (
"context"
"encoding/json"
"errors"
"fmt"
"net/http"
"sort"
"strings"
"time"
bq "cloud.google.com/go/bigquery"
"cloud.google.com/go/civil"
"golang.org/x/exp/maps"
"golang.org/x/pkgsite-metrics/internal/bigquery"
"golang.org/x/pkgsite-metrics/internal/derrors"
"golang.org/x/pkgsite-metrics/internal/log"
"golang.org/x/pkgsite-metrics/internal/scan"
"golang.org/x/vuln/exp/govulncheck"
"google.golang.org/api/iterator"
)
// EnqueueQueryParams holds the query parameters accepted by the
// govulncheck/enqueue endpoint.
type EnqueueQueryParams struct {
	// Suffix is appended to task queue IDs to generate unique tasks.
	Suffix string
	// Mode is the type of analysis to run.
	Mode string
	// Min is the minimum import-by count for a module to be included.
	Min int
	// File is the path to a file containing modules; if missing, the DB is used.
	File string
}
// Request bundles everything passed to a scan endpoint: the module and
// version taken from the URL path, and the scan's query parameters.
type Request struct {
	// Module and version, as parsed from the URL path.
	scan.ModuleURLPath
	// Options parsed from the request's query parameters.
	QueryParams
}
// QueryParams holds the query parameters of a govulncheck scan request.
type QueryParams struct {
	// ImportedBy is the imported-by count.
	ImportedBy int
	// Mode is the govulncheck mode.
	Mode string
	// Insecure, if true, runs the scan outside the sandbox.
	Insecure bool
	// Serve, if true, serves results back to the client instead of
	// writing them to BigQuery.
	Serve bool
}
// These methods implement queue.Task.

// Name returns the task name, formatted as "<module>@<version>".
func (r *Request) Name() string {
	mod, vers := r.Module, r.Version
	return mod + "@" + vers
}
// Path returns the path reported by the embedded scan.ModuleURLPath.
func (r *Request) Path() string {
	return r.ModuleURLPath.Path()
}
// Params returns the request's query parameters formatted by scan.FormatParams.
func (r *Request) Params() string {
	qp := r.QueryParams
	return scan.FormatParams(qp)
}
// ParseRequest builds a Request from the HTTP request r. The endpoint
// prefix is trimmed from the URL path, and the remainder is parsed as a
// module and version.
//
// The module and version should have one of the following three forms:
//   - <module>/@v/<version>
//   - <module>@<version>
//   - <module>/@latest
//
// (These are the same forms that the module proxy accepts.)
//
// An error is returned if the path or the query parameters cannot be
// parsed, or if the imported-by count is missing or negative.
func ParseRequest(r *http.Request, prefix string) (*Request, error) {
	trimmed := strings.TrimPrefix(r.URL.Path, prefix)
	modPath, err := scan.ParseModuleURLPath(trimmed)
	if err != nil {
		return nil, err
	}

	// Start from a negative sentinel so that a request that never sets
	// the imported-by count can be told apart from one that sets it to 0.
	params := QueryParams{ImportedBy: -1}
	if err = scan.ParseParams(r, &params); err != nil {
		return nil, err
	}
	if params.ImportedBy < 0 {
		return nil, errors.New(`missing or negative "importedby" query param`)
	}

	req := &Request{
		ModuleURLPath: modPath,
		QueryParams:   params,
	}
	return req, nil
}
// ConvertGovulncheckOutput flattens a single govulncheck finding into
// BigQuery Vuln rows, one per vulnerable symbol of every affected package.
//
// For a package at index i within its module, ImportSink and RequireSink
// are both set to i+1. A symbol that appears in the package's call stacks
// gets CallSink set to the call stack's index plus one. The remaining
// vulnerable symbols listed in the OSV entry for that module and package
// are emitted with CallSink 0.
func ConvertGovulncheckOutput(v *govulncheck.Vuln) (vulns []*Vuln) {
	for _, mod := range v.Modules {
		for pkgIdx, pkg := range mod.Packages {
			called := make(map[string]bool)
			// Values shared by every row generated for this package.
			template := Vuln{
				ID:          v.OSV.ID,
				ModulePath:  mod.Path,
				PackagePath: pkg.Path,
				CallSink:    bigquery.NullInt(0),
				ImportSink:  bigquery.NullInt(pkgIdx + 1),
				RequireSink: bigquery.NullInt(pkgIdx + 1),
			}

			// Called symbols: one row per call stack.
			for stackIdx, stack := range pkg.CallStacks {
				called[stack.Symbol] = true
				row := template
				row.Symbol = stack.Symbol
				row.CallSink = bigquery.NullInt(stackIdx + 1)
				vulns = append(vulns, &row)
			}

			// Imported-but-not-called symbols, taken from the OSV entry.
			for _, affected := range v.OSV.Affected {
				if affected.Package.Name != mod.Path {
					continue
				}
				for _, imp := range affected.EcosystemSpecific.Imports {
					if imp.Path != pkg.Path {
						continue
					}
					for _, sym := range imp.Symbols {
						if called[sym] {
							continue
						}
						row := template
						row.Symbol = sym
						vulns = append(vulns, &row)
					}
				}
			}
		}
	}
	return vulns
}
// TableName is the name of the BigQuery table of govulncheck results.
// The schema inferred from Result is registered under this name in init.
const TableName = "vulncheck"
// Note: before modifying Result or Vuln, make sure the change
// is a valid schema modification.
// The only supported changes are:
// - adding a nullable or repeated column
// - dropping a column
// - changing a column from required to nullable.
// See https://cloud.google.com/bigquery/docs/managing-table-schemas for details.
// Result is one row of the BigQuery govulncheck table. The table schema
// is inferred from this struct (see init).
type Result struct {
	CreatedAt     time.Time `bigquery:"created_at"`
	ModulePath    string    `bigquery:"module_path"`
	Version       string    `bigquery:"version"`
	Suffix        string    `bigquery:"suffix"`
	SortVersion   string    `bigquery:"sort_version"`
	ImportedBy    int       `bigquery:"imported_by"`
	Error         string    `bigquery:"error"`
	ErrorCategory string    `bigquery:"error_category"`
	CommitTime    time.Time `bigquery:"commit_time"`
	ScanSeconds   float64   `bigquery:"scan_seconds"`
	ScanMemory    int64     `bigquery:"scan_memory"`
	PkgsMemory    int64     `bigquery:"pkgs_memory"`
	ScanMode      string    `bigquery:"scan_mode"`

	// Workers is the concurrency limit under which a module is analyzed.
	// It helps interpret memory measurements when several modules are
	// analyzed in the same process. It is 0 if no limit is specified and
	// -1 for potential errors.
	Workers int `bigquery:"workers"`

	// WorkVersion is embedded; InferSchema flattens embedded fields.
	WorkVersion

	Vulns []*Vuln `bigquery:"vulns"`
}
// WorkVersion contains information that can be used to avoid duplicate work.
// Given two WorkVersion values v1 and v2 for the same module path and version,
// if v1.Equal(v2) then it is not necessary to scan the module.
type WorkVersion struct {
	// The version of the currently running code. This tracks changes in the
	// logic of module scanning and processing.
	WorkerVersion string `bigquery:"worker_version"`
	// The version of the bigquery schema.
	SchemaVersion string `bigquery:"schema_version"`
	// The version of the golang.org/x/vuln module used by the current module.
	VulnVersion string `bigquery:"x_vuln_version"`
	// When the vuln DB was last modified.
	VulnDBLastModified time.Time `bigquery:"vulndb_last_modified"`
}

// Equal reports whether v1 and v2 describe the same work version.
//
// A nil WorkVersion equals only another nil WorkVersion. Otherwise all
// three version strings must match and the vuln DB modification times must
// denote the same instant. Times are compared with time.Time.Equal, so
// values that differ only in location are still considered equal.
func (v1 *WorkVersion) Equal(v2 *WorkVersion) bool {
	if v1 == nil || v2 == nil {
		return v1 == v2
	}
	return v1.WorkerVersion == v2.WorkerVersion &&
		v1.SchemaVersion == v2.SchemaVersion &&
		v1.VulnVersion == v2.VulnVersion &&
		v1.VulnDBLastModified.Equal(v2.VulnDBLastModified)
}
// SetUploadTime records t as the row's CreatedAt time.
func (vr *Result) SetUploadTime(t time.Time) {
	vr.CreatedAt = t
}
// AddError stores err's message in Error and its category, as computed by
// derrors.CategorizeError, in ErrorCategory. A nil err leaves the result
// unchanged.
func (vr *Result) AddError(err error) {
	if err != nil {
		vr.Error = err.Error()
		vr.ErrorCategory = derrors.CategorizeError(err)
	}
}
// Vuln is a single vulnerability record nested in a Result (the "vulns"
// column). ConvertGovulncheckOutput produces one Vuln per vulnerable symbol.
type Vuln struct {
	ID          string `bigquery:"id"`
	Symbol      string `bigquery:"symbol"`
	PackagePath string `bigquery:"package_path"`
	ModulePath  string `bigquery:"module_path"`

	// The sink columns are nullable integers.
	CallSink    bq.NullInt64 `bigquery:"call_sink"`
	ImportSink  bq.NullInt64 `bigquery:"import_sink"`
	RequireSink bq.NullInt64 `bigquery:"require_sink"`
}
// SchemaVersion changes whenever the govulncheck schema changes.
// It is assigned in init from the schema inferred for Result.
var SchemaVersion string
// init infers the BigQuery schema of Result, derives SchemaVersion from
// it, and registers the schema under TableName. It panics if the schema
// cannot be inferred.
func init() {
	schema, err := bigquery.InferSchema(Result{})
	if err != nil {
		panic(err)
	}

	SchemaVersion = bigquery.SchemaVersion(schema)
	bigquery.AddTable(TableName, schema)
}
// ReadWorkVersions reads the most recent WorkVersions in the govulncheck
// table. The returned map is keyed by the pair [module path, version].
func ReadWorkVersions(ctx context.Context, c *bigquery.Client) (_ map[[2]string]*WorkVersion, err error) {
	defer derrors.Wrap(&err, "ReadWorkVersions")

	pq := bigquery.PartitionQuery{
		Table:       c.FullTableName(TableName),
		Columns:     "module_path, version, worker_version, schema_version, x_vuln_version, vulndb_last_modified",
		PartitionOn: "module_path, sort_version",
		OrderBy:     "created_at DESC",
	}
	iter, err := c.Query(ctx, pq.String())
	if err != nil {
		return nil, err
	}

	versions := make(map[[2]string]*WorkVersion)
	err = bigquery.ForEachRow(iter, func(row *Result) bool {
		key := [2]string{row.ModulePath, row.Version}
		versions[key] = &row.WorkVersion
		return true // keep iterating over every row
	})
	if err != nil {
		return nil, err
	}
	return versions, nil
}
// orderByClauses is the ORDER BY column list that fetchResults passes to
// bigquery.PartitionQuery.
//
// The module path along with the four sort columns should uniquely specify a
// row, because we do not generate a new row for a (module, version) if the
// other three versions are identical. (There is actually a fourth component of
// the work version, the schema version. But since it is represented by a struct
// in the worker code and the worker version captures every change to that code,
// it cannot change independently of worker_version.)
const orderByClauses = `
vulndb_last_modified DESC, -- latest version of database
x_vuln_version DESC, -- latest version of x/vuln
worker_version DESC, -- latest version of x/pkgsite-metrics
sort_version DESC, -- latest version of module
created_at DESC -- latest insertion time
`
// FetchResults runs the "latest" partition query against the govulncheck
// table (TableName) and returns the resulting rows. See fetchResults for
// the details.
func FetchResults(ctx context.Context, c *bigquery.Client) (rows []*Result, err error) {
	rows, err = fetchResults(ctx, c, TableName)
	return rows, err
}
// fetchResults queries the table named tableName with a PartitionQuery
// partitioned on (module_path, scan_mode) and ordered by orderByClauses,
// and returns all resulting rows.
//
// It returns an error if more than one row shares the same module path,
// version and scan mode. The error names the first such key in sorted
// order.
func fetchResults(ctx context.Context, c *bigquery.Client, tableName string) (rows []*Result, err error) {
	fullName := c.FullTableName(tableName)
	pq := bigquery.PartitionQuery{
		Table:       fullName,
		PartitionOn: "module_path, scan_mode",
		OrderBy:     orderByClauses,
	}
	query := pq.String()

	log.Infof(ctx, "running latest query on %s", fullName)
	iter, err := c.Query(ctx, query)
	if err != nil {
		return nil, err
	}
	if rows, err = bigquery.All[Result](iter); err != nil {
		return nil, err
	}
	log.Infof(ctx, "got %d rows", len(rows))

	// Check for duplicate rows.
	counts := make(map[string]int)
	for _, row := range rows {
		key := row.ModulePath + "@" + row.Version + " " + row.ScanMode
		counts[key]++
	}
	// Sort the keys so that the reported duplicate is deterministic.
	sortedKeys := maps.Keys(counts)
	sort.Strings(sortedKeys)
	for _, key := range sortedKeys {
		n := counts[key]
		if n <= 1 {
			continue
		}
		return nil, fmt.Errorf("%s has %d rows", key, n)
	}
	return rows, nil
}
// ReportResult is a row of the report table: a Result together with the
// date it is reported for and the time it was inserted.
type ReportResult struct {
	*Result
	// ReportDate is used for reporting (e.g. dashboard).
	ReportDate civil.Date `bigquery:"report_date"`
	// InsertedAt disambiguates rows if there is more than one insertion
	// for the same date.
	InsertedAt time.Time `bigquery:"inserted_at"`
}
// init infers the BigQuery schema of ReportResult and registers it under
// the report table name, TableName+"-report". It panics if the schema
// cannot be inferred.
func init() {
	reportSchema, err := bigquery.InferSchema(ReportResult{})
	if err != nil {
		panic(err)
	}

	bigquery.AddTable(TableName+"-report", reportSchema)
}
// InsertResults inserts results into the report table (TableName+"-report")
// for the given report date. See insertResults for how allowDuplicates is
// handled.
func InsertResults(ctx context.Context, c *bigquery.Client, results []*Result, date civil.Date, allowDuplicates bool) (err error) {
	reportTable := TableName + "-report"
	err = insertResults(ctx, c, reportTable, results, date, allowDuplicates)
	return err
}
// insertResults uploads results to the table named reportTableName, creating
// the table first if it doesn't exist. Every uploaded row carries the given
// report date and a single shared insertion time.
//
// Unless allowDuplicates is true, the upload is refused with an error when
// the table already holds rows for date.
//
// Any returned error is wrapped with the function name and date.
func insertResults(ctx context.Context, c *bigquery.Client, reportTableName string, results []*Result, date civil.Date, allowDuplicates bool) (err error) {
	// Wrap must be deferred so that it sees the error actually being returned.
	defer derrors.Wrap(&err, "InsertResults(%s)", date)

	// Create the report table if it doesn't exist.
	if err := c.CreateTable(ctx, reportTableName); err != nil {
		return err
	}

	if !allowDuplicates {
		query := fmt.Sprintf("SELECT COUNT(*) AS n FROM `%s` WHERE report_date = '%s'",
			c.FullTableName(reportTableName), date)
		iter, err := c.Query(ctx, query)
		if err != nil {
			return err
		}
		// BigQuery loads a column only into an exported struct field whose
		// name matches the column name, so the count column is aliased to
		// "n" and read into the exported field N.
		var count struct {
			N int64 `bigquery:"n"`
		}
		err = iter.Next(&count)
		if err != nil && err != iterator.Done {
			return err
		}
		if count.N > 0 {
			return fmt.Errorf("already have %d rows for %s", count.N, date)
		}
	}

	now := time.Now()
	rows := make([]ReportResult, 0, len(results))
	for _, r := range results {
		rows = append(rows, ReportResult{Result: r, ReportDate: date, InsertedAt: now})
	}

	ctx, cancel := context.WithTimeout(ctx, 5*time.Minute) // to avoid retrying forever on permanent errors
	defer cancel()
	const chunkSize = 1024 // Chunk rows to avoid exceeding the maximum allowable request size.
	return bigquery.UploadMany(ctx, c, reportTableName, rows, chunkSize)
}
// ScanStats holds monitoring information about a single run of
// govulncheck.
type ScanStats struct {
	ScanSeconds float64 // time the scan took to run, in seconds
	ScanMemory  uint64  // peak (heap) memory used by govulncheck, in kb
}
// SandboxResponse captures the outcome of a govulncheck run in a sandbox:
// the raw govulncheck result plus statistics about memory usage and run
// time.
type SandboxResponse struct {
	// Res is the raw govulncheck result.
	Res govulncheck.Result
	// Stats holds the run time and memory statistics of the scan.
	Stats ScanStats
}
// UnmarshalSandboxResponse decodes the JSON in output into a
// SandboxResponse.
//
// The output is first checked for a top-level Error field. If that field
// is non-empty, its text is returned as an error and no response is
// produced. Malformed JSON yields the decoding error.
func UnmarshalSandboxResponse(output []byte) (*SandboxResponse, error) {
	// First pass: look only for an error reported in the output.
	var failure struct{ Error string }
	if err := json.Unmarshal(output, &failure); err != nil {
		return nil, err
	}
	if msg := failure.Error; msg != "" {
		return nil, errors.New(msg)
	}

	// Second pass: decode the actual response.
	resp := new(SandboxResponse)
	if err := json.Unmarshal(output, resp); err != nil {
		return nil, err
	}
	return resp, nil
}
// UnmarshalGovulncheckResult decodes the JSON in output into a
// govulncheck.Result.
//
// The output is first checked for a top-level Error field. If that field
// is non-empty, its text is returned as an error and no result is
// produced. Malformed JSON yields the decoding error.
func UnmarshalGovulncheckResult(output []byte) (*govulncheck.Result, error) {
	// First pass: look only for an error reported in the output.
	var failure struct{ Error string }
	if err := json.Unmarshal(output, &failure); err != nil {
		return nil, err
	}
	if msg := failure.Error; msg != "" {
		return nil, errors.New(msg)
	}

	// Second pass: decode the actual result.
	result := new(govulncheck.Result)
	if err := json.Unmarshal(output, result); err != nil {
		return nil, err
	}
	return result, nil
}