internal/report, etc.: speed up setting published date

Rather than checking the commit history for each report's file
individually (which internally involves walking the complete repo
history), walk the history once and build a table of modification
times for each file of interest.

This changes the time to set all dates on my laptop from about
two minutes to about two seconds.

Updates golang/vulndb#50434

Change-Id: Ica696183fd7f07a039e66a33707663be04009dd8
Reviewed-on: https://go-review.googlesource.com/c/vulndb/+/391714
Reviewed-by: Jonathan Amsterdam <jba@google.com>
Trust: Damien Neil <dneil@google.com>
Run-TryBot: Damien Neil <dneil@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Julie Qiu <julie@golang.org>
diff --git a/cmd/vulnreport/main.go b/cmd/vulnreport/main.go
index e9afb3f..afd004f 100644
--- a/cmd/vulnreport/main.go
+++ b/cmd/vulnreport/main.go
@@ -20,7 +20,6 @@
 	"strings"
 	"time"
 
-	"github.com/go-git/go-git/v5"
 	"golang.org/x/tools/go/packages"
 	"golang.org/x/vulndb/internal/cvelistrepo"
 	"golang.org/x/vulndb/internal/derrors"
@@ -108,7 +107,11 @@
 		if err != nil {
 			log.Fatal(err)
 		}
-		f := func(name string) error { return setDates(repo, name) }
+		commitDates, err := gitrepo.AllCommitDates(repo, gitrepo.MainReference, "reports/")
+		if err != nil {
+			log.Fatal(err)
+		}
+		f := func(name string) error { return setDates(name, commitDates) }
 		if err := multi(f, names); err != nil {
 			log.Fatal(err)
 		}
@@ -330,7 +333,7 @@
 // transformed into a DB entry. The advantages of using this command are that it
 // speeds up gendb, and the dates become permanent (if you create and submit a
 // CL after running it).
-func setDates(repo *git.Repository, filename string) (err error) {
+func setDates(filename string, dates map[string]gitrepo.Dates) (err error) {
 	defer derrors.Wrap(&err, "setDates(%q)", filename)
 
 	r, err := report.Read(filename)
@@ -340,11 +343,11 @@
 	if !r.Published.IsZero() {
 		return nil
 	}
-	oldest, _, err := gitrepo.CommitDates(repo, filename)
-	if err != nil {
-		return err
+	d, ok := dates[filename]
+	if !ok {
+		return fmt.Errorf("can't find git repo commit dates for %q", filename)
 	}
-	r.Published = oldest
+	r.Published = d.Oldest
 	return r.Write(filename)
 }
 
diff --git a/internal/database/generate.go b/internal/database/generate.go
index 2a8b622..b844f1a 100644
--- a/internal/database/generate.go
+++ b/internal/database/generate.go
@@ -47,6 +47,11 @@
 		return err
 	}
 
+	commitDates, err := gitrepo.AllCommitDates(repo, gitrepo.MainReference, "reports/")
+	if err != nil {
+		return err
+	}
+
 	jsonVulns := map[string][]osv.Entry{}
 	var entries []osv.Entry
 	for _, f := range yamlFiles {
@@ -59,11 +64,11 @@
 		}
 		if r.Published.IsZero() {
 			yamlPath := filepath.Join(yamlDir, f.Name())
-			oldest, _, err := gitrepo.CommitDates(repo, yamlPath)
-			if err != nil {
-				return fmt.Errorf("can't find git repo commit dates for %q: %v", yamlPath, err)
+			dates, ok := commitDates[yamlPath]
+			if !ok {
+				return fmt.Errorf("can't find git repo commit dates for %q", yamlPath)
 			}
-			r.Published = oldest
+			r.Published = dates.Oldest
 		}
 		if lints := r.Lint(); len(lints) > 0 {
 			return fmt.Errorf("vuln.Lint: %v", lints)
diff --git a/internal/gitrepo/gitrepo.go b/internal/gitrepo/gitrepo.go
index 93aee19..2d35658 100644
--- a/internal/gitrepo/gitrepo.go
+++ b/internal/gitrepo/gitrepo.go
@@ -8,6 +8,7 @@
 import (
 	"context"
 	"fmt"
+	"io"
 	"strings"
 	"time"
 
@@ -148,40 +149,89 @@
 	}
 }
 
-// FileHistory calls f for every commit in filepath's history, starting from refName.
-func FileHistory(repo *git.Repository, refName plumbing.ReferenceName, filepath string, f func(*object.Commit) error) error {
-	ref, err := repo.Reference(refName, true)
+// ReferenceName is a git reference.
+type ReferenceName struct{ plumbing.ReferenceName }
+
+var (
+	HeadReference = ReferenceName{plumbing.HEAD}                                       // HEAD
+	MainReference = ReferenceName{plumbing.NewRemoteReferenceName("origin", "master")} // origin/master
+)
+
+// Dates is the oldest and newest commit timestamps for a file.
+type Dates struct {
+	Oldest, Newest time.Time
+}
+
+// AllCommitDates returns the oldest and newest commit timestamps for every
+// file in the repo at the given reference, where the filename begins with
+// prefix. The supplied prefix should include the trailing /.
+func AllCommitDates(repo *git.Repository, refName ReferenceName, prefix string) (dates map[string]Dates, err error) {
+	defer derrors.Wrap(&err, "AllCommitDates(%q)", prefix)
+
+	ref, err := repo.Reference(refName.ReferenceName, true)
 	if err != nil {
-		return err
+		return nil, err
 	}
 	commit, err := repo.CommitObject(ref.Hash())
 	if err != nil {
-		return err
+		return nil, err
 	}
-	return object.NewCommitFileIterFromIter(
-		filepath,
-		object.NewCommitPreorderIter(commit, nil, nil),
-		false,
-	).ForEach(f)
-}
-
-// CommitDates returns the oldest and newest commit date for filepath in origin/master.
-func CommitDates(repo *git.Repository, filepath string) (oldest, newest time.Time, err error) {
-	defer derrors.Wrap(&err, "CommitDates(%q)", filepath)
-
-	refName := plumbing.NewRemoteReferenceName("origin", "master")
-	err = FileHistory(repo, refName, filepath, func(commit *object.Commit) error {
-		when := commit.Committer.When.UTC()
-		if oldest.IsZero() || when.Before(oldest) {
-			oldest = when
-		}
-		if when.After(newest) {
-			newest = when
-		}
-		return nil
-	})
+	dates = make(map[string]Dates)
+	iter := object.NewCommitPreorderIter(commit, nil, nil)
+	commit, err = iter.Next()
 	if err != nil {
-		return time.Time{}, time.Time{}, err
+		return nil, err
 	}
-	return oldest, newest, nil
+	for commit != nil {
+		parentCommit, err := iter.Next()
+		if err != nil {
+			if err != io.EOF {
+				return nil, err
+			}
+			parentCommit = nil
+		}
+
+		currentTree, err := commit.Tree()
+		if err != nil {
+			return nil, err
+		}
+
+		var parentTree *object.Tree
+		if parentCommit != nil {
+			parentTree, err = parentCommit.Tree()
+			if err != nil {
+				return nil, err
+			}
+		}
+
+		changes, err := object.DiffTree(currentTree, parentTree)
+		if err != nil {
+			return nil, err
+		}
+
+		for _, change := range changes {
+			name := change.To.Name
+			if change.From.Name != "" {
+				name = change.From.Name
+			}
+			when := commit.Committer.When.UTC()
+			if !strings.HasPrefix(name, prefix) {
+				continue
+			}
+			d := dates[name]
+			if d.Oldest.IsZero() || when.Before(d.Oldest) {
+				if d.Oldest.After(d.Newest) {
+					d.Newest = d.Oldest
+				}
+				d.Oldest = when
+			}
+			if when.After(d.Newest) {
+				d.Newest = when
+			}
+			dates[name] = d
+		}
+
+		commit = parentCommit
+	}
+	return dates, nil
 }
diff --git a/internal/gitrepo/gitrepo_test.go b/internal/gitrepo/gitrepo_test.go
index 6218f00..ce8cd67 100644
--- a/internal/gitrepo/gitrepo_test.go
+++ b/internal/gitrepo/gitrepo_test.go
@@ -5,41 +5,53 @@
 package gitrepo_test
 
 import (
-	"reflect"
-	"strings"
+	"fmt"
 	"testing"
+	"time"
 
 	"github.com/go-git/go-billy/v5"
 	"github.com/go-git/go-billy/v5/memfs"
 	"github.com/go-git/go-git/v5"
-	"github.com/go-git/go-git/v5/plumbing"
 	"github.com/go-git/go-git/v5/plumbing/object"
 	"github.com/go-git/go-git/v5/storage/memory"
+	"github.com/google/go-cmp/cmp"
 	"golang.org/x/vulndb/internal/gitrepo"
 )
 
-func TestFileHistory(t *testing.T) {
+func TestAllCommitDates(t *testing.T) {
 	test := newTest(t)
-	message := []string{"one", "two", "three"}
-	for _, message := range message {
-		test.Commit(message, map[string]string{
-			"file": message,
-		})
-
-		// These commits touch other files, and should not be iterated over.
-		test.Commit("other commit", map[string]string{
-			"some_other_file": message,
-		})
+	want := map[string]gitrepo.Dates{
+		"files/1": gitrepo.Dates{
+			Oldest: time.Date(2020, 1, 1, 1, 0, 0, 0, time.UTC),
+			Newest: time.Date(2020, 1, 1, 1, 2, 0, 0, time.UTC),
+		},
+		"files/2": gitrepo.Dates{
+			Oldest: time.Date(2020, 1, 1, 1, 1, 0, 0, time.UTC),
+			Newest: time.Date(2020, 1, 1, 1, 3, 0, 0, time.UTC),
+		},
 	}
-	var got []string
-	gitrepo.FileHistory(test.Repo, plumbing.HEAD, "file", func(commit *object.Commit) error {
-		got = append([]string{strings.TrimSpace(commit.Message)}, got...)
-		return nil
-	})
-	if !reflect.DeepEqual(got, message) {
-		t.Errorf("got %v\nwant %v", got, message)
+	for name, dates := range want {
+		now := dates.Oldest
+		for {
+			if now.After(dates.Newest) {
+				now = dates.Newest
+			}
+			test.Commit("message", now, map[string]string{
+				name: fmt.Sprintf("commit at %v", now),
+			})
+			if now == dates.Newest {
+				break
+			}
+			now = now.Add(1 * time.Hour)
+		}
 	}
-
+	got, err := gitrepo.AllCommitDates(test.Repo, gitrepo.HeadReference, "files/")
+	if err != nil {
+		t.Fatal(err)
+	}
+	if diff := cmp.Diff(want, got); diff != "" {
+		t.Errorf("AllCommitDates returned unexpected result (-want,+got):\n%v", diff)
+	}
 }
 
 type gitTest struct {
@@ -62,7 +74,7 @@
 	}
 }
 
-func (test *gitTest) Commit(message string, files map[string]string) {
+func (test *gitTest) Commit(message string, when time.Time, files map[string]string) {
 	test.t.Helper()
 	wt, err := test.Repo.Worktree()
 	if err != nil {
@@ -86,6 +98,7 @@
 	if _, err := wt.Commit(message, &git.CommitOptions{All: true, Author: &object.Signature{
 		Name:  "Author",
 		Email: "author@example.com",
+		When:  when,
 	}}); err != nil {
 		test.t.Fatal(err)
 	}