internal/bigquery,vulncheck: only read work version columns

Reading work versions was extremely slow. The reason was that we were
reading the entirety of every row. This CL reads just the columns we
need, giving a huge speedup.

To do this, we needed a way to specify columns in a partition
query. The arguments to the PartitionQuery function were all strings,
so it was already a bit confusing what each string meant, and this was
going to be one more. PartitionQuery is now a struct, so the field
names can be written in the code.

Change-Id: I96205796b0a44220bdd9373332311e210d3176a5
Reviewed-on: https://go-review.googlesource.com/c/pkgsite-metrics/+/474896
TryBot-Result: Gopher Robot <gobot@golang.org>
Run-TryBot: Jonathan Amsterdam <jba@google.com>
Reviewed-by: Zvonimir Pavlinovic <zpavlinovic@google.com>
diff --git a/internal/analysis/analysis.go b/internal/analysis/analysis.go
index fd312e2..3c2130e 100644
--- a/internal/analysis/analysis.go
+++ b/internal/analysis/analysis.go
@@ -200,7 +200,12 @@
 func ReadWorkVersions(ctx context.Context, c *bigquery.Client) (_ map[[2]string]*WorkVersion, err error) {
 	defer derrors.Wrap(&err, "ReadWorkVersions")
 	m := map[[2]string]*WorkVersion{}
-	query := bigquery.PartitionQuery(c.FullTableName(TableName), "module_path, sort_version", "created_at DESC")
+	query := bigquery.PartitionQuery{
+		Table:       c.FullTableName(TableName),
+		Columns:     "module_path, version, binary_version, binary_args, worker_version, schema_version",
+		PartitionOn: "module_path, sort_version",
+		OrderBy:     "created_at DESC",
+	}.String()
 	iter, err := c.Query(ctx, query)
 	if err != nil {
 		return nil, err
diff --git a/internal/bigquery/bigquery.go b/internal/bigquery/bigquery.go
index c51248b..b738242 100644
--- a/internal/bigquery/bigquery.go
+++ b/internal/bigquery/bigquery.go
@@ -304,22 +304,34 @@
 	return tableIDs
 }
 
-// PartitionQuery returns a query that returns one row for each distinct value
-// of partitionColumn in tableName.
-// The selected row will be the first one according to the orderings, which
-// should be comma-separated ORDER BY clauses.
+// PartitionQuery describes a query that returns one row for each distinct value
+// of the partition column in the given table.
+//
+// The selected row will be the first one according to the OrderBy clauses.
 //
 // For example, say the students table holds student names and classes.
 // Then
 //
-//	partitionQuery("students", "class", "name ASC")
+//	  PartitionQuery{
+//		   Table: "students",
+//		   PartitionOn: "class",
+//		   OrderBy: "name ASC",
+//		 }.String()
 //
 // will construct a query returning the student in each class whose name is
 // alphabetically first.
 //
 // (BigQuery SQL has no DISTINCT ON feature and doesn't allow columns of type RECORD
 // in queries with DISTINCT, so we have to take this approach.)
-func PartitionQuery(tableName, partitionColumn, orderings string) string {
+type PartitionQuery struct {
+	Table       string // full table name
+	Columns     string // comma-separated columns to select, or "*" ("" => "*")
+	PartitionOn string // comma-separated columns defining the partition
+	OrderBy     string // text after ORDER BY: comma-separated columns, each
+	// optionally followed by DESC or ASC
+}
+
+func (q PartitionQuery) String() string {
 	// This query first organizes the table rows into windows that have the same partitionColumn.
 	// The rows in each window are sorted by the given orderings.
 	// They are then assigned numbers, where 1 is the first row in the window.
@@ -330,15 +342,18 @@
 	const qf = `
 		SELECT * EXCEPT (rownum)
 		FROM (
-			SELECT *, ROW_NUMBER() OVER (
+			SELECT %s, ROW_NUMBER() OVER (
 				PARTITION BY %s
 				ORDER BY %s
 			) AS rownum
 			FROM %s
 		) WHERE rownum = 1
 	`
-
-	return fmt.Sprintf(qf, partitionColumn, orderings, "`"+tableName+"`")
+	cols := q.Columns
+	if cols == "" {
+		cols = "*"
+	}
+	return fmt.Sprintf(qf, cols, q.PartitionOn, q.OrderBy, "`"+q.Table+"`")
 }
 
 // Copy InferSchema so users don't have to import cloud.google.com/go/bigquery
diff --git a/internal/bigquery/vulndb_requests.go b/internal/bigquery/vulndb_requests.go
index 6b1d583..297306a 100644
--- a/internal/bigquery/vulndb_requests.go
+++ b/internal/bigquery/vulndb_requests.go
@@ -64,8 +64,11 @@
 
 func readVulnDBRequestCounts(ctx context.Context, c *Client) (_ []*VulnDBRequestCount, err error) {
 	// Select the most recently inserted row for each date.
-	q := fmt.Sprintf("(%s) ORDER BY date DESC",
-		PartitionQuery(c.FullTableName(VulnDBRequestTableName), "date", "inserted_at DESC"))
+	q := fmt.Sprintf("(%s) ORDER BY date DESC", PartitionQuery{
+		Table:       c.FullTableName(VulnDBRequestTableName),
+		PartitionOn: "date",
+		OrderBy:     "inserted_at DESC",
+	})
 	iter, err := c.Query(ctx, q)
 	if err != nil {
 		return nil, err
diff --git a/internal/vulncheck/vulncheck.go b/internal/vulncheck/vulncheck.go
index 1ba9019..25205d1 100644
--- a/internal/vulncheck/vulncheck.go
+++ b/internal/vulncheck/vulncheck.go
@@ -239,7 +239,12 @@
 func ReadWorkVersions(ctx context.Context, c *bigquery.Client) (_ map[[2]string]*WorkVersion, err error) {
 	defer derrors.Wrap(&err, "ReadWorkVersions")
 	m := map[[2]string]*WorkVersion{}
-	query := bigquery.PartitionQuery(c.FullTableName(TableName), "module_path, sort_version", "created_at DESC")
+	query := bigquery.PartitionQuery{
+		Table:       c.FullTableName(TableName),
+		Columns:     "module_path, version, worker_version, schema_version, x_vuln_version, vulndb_last_modified",
+		PartitionOn: "module_path, sort_version",
+		OrderBy:     "created_at DESC",
+	}.String()
 	iter, err := c.Query(ctx, query)
 	if err != nil {
 		return nil, err
@@ -274,7 +279,11 @@
 
 func fetchResults(ctx context.Context, c *bigquery.Client, tableName string) (rows []*Result, err error) {
 	name := c.FullTableName(tableName)
-	query := bigquery.PartitionQuery(name, "module_path, scan_mode", orderByClauses)
+	query := bigquery.PartitionQuery{
+		Table:       name,
+		PartitionOn: "module_path, scan_mode",
+		OrderBy:     orderByClauses,
+	}.String()
 	log.Infof(ctx, "running latest query on %s", name)
 	iter, err := c.Query(ctx, query)
 	if err != nil {