cmd/gopherbot: mention gerrit CLs on Github (cl2issue port)

This adds indexing of Github mentions to maintner, then adds gopherbot
functionality to say "CL https://golang.org/cl/NNNN mentions this issue."
on Github when a Gerrit CL references it.

Also the start of the cherry-pick milestone bot which needs the same
Github issue reference tracking. But that part's only barely started
and still disabled.

Fixes golang/go#18818
Updates golang/go#19776

Change-Id: Ie5f7f6845317a7c39cc36d7397c7539bf99c3f92
Reviewed-on: https://go-review.googlesource.com/39352
Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
diff --git a/cmd/gopherbot/gopherbot.go b/cmd/gopherbot/gopherbot.go
index 7a83818..b976519 100644
--- a/cmd/gopherbot/gopherbot.go
+++ b/cmd/gopherbot/gopherbot.go
@@ -8,6 +8,7 @@
 
 import (
 	"context"
+	"errors"
 	"flag"
 	"fmt"
 	"io/ioutil"
@@ -15,6 +16,7 @@
 	"net/http"
 	"os"
 	"path/filepath"
+	"sort"
 	"strings"
 	"time"
 
@@ -117,6 +119,17 @@
 		fail = true
 	}
 
+	// "CL nnnn mentions this issue"
+	if err := bot.cl2issue(ctx); err != nil {
+		log.Printf("cl2issue: %v", err)
+		fail = true
+	}
+
+	if err := bot.checkCherryPicks(ctx); err != nil {
+		log.Printf("checking cherry picks: %v", err)
+		fail = true
+	}
+
 	if fail {
 		os.Exit(1)
 	}
@@ -133,6 +146,54 @@
 	return err
 }
 
+func (b *gopherbot) addGitHubComment(ctx context.Context, org, repo string, issueNum int32, msg string) error {
+	gr := b.corpus.GitHub().Repo(org, repo)
+	if gr == nil {
+		return fmt.Errorf("unknown github repo %s/%s", org, repo)
+	}
+	var since time.Time
+	if gi := gr.Issue(issueNum); gi != nil {
+		dup := false
+		gi.ForeachComment(func(c *maintner.GitHubComment) error {
+			since = c.Updated
+			// TODO: check for gopherbot as author? check for exact match?
+			// This seems fine for now.
+			if strings.Contains(c.Body, msg) {
+				dup = true
+				return errStopIteration
+			}
+			return nil
+		})
+		if dup {
+			// Comment's already been posted. Nothing to do.
+			return nil
+		}
+	}
+	// See if there is a dup comment from when gopherbot last got
+	// its data from maintner.
+	ics, _, err := b.ghc.Issues.ListComments(ctx, org, repo, int(issueNum), &github.IssueListCommentsOptions{
+		Since:       since,
+		ListOptions: github.ListOptions{PerPage: 1000},
+	})
+	if err != nil {
+		return err
+	}
+	for _, ic := range ics {
+		if strings.Contains(ic.GetBody(), msg) {
+			// Dup.
+			return nil
+		}
+	}
+	if *dryRun {
+		log.Printf("[dry run] would add comment to github.com/%s/%s/issues/%d: %v", org, repo, issueNum, msg)
+		return nil
+	}
+	_, _, err = b.ghc.Issues.CreateComment(ctx, org, repo, int(issueNum), &github.IssueComment{
+		Body: github.String(msg),
+	})
+	return err
+}
+
 // freezeOldIssues locks any issue that's old and closed.
 // (Otherwise people find ancient bugs via searches and start asking questions
 // into a void and it's sad for everybody.)
@@ -337,18 +398,90 @@
 		}
 
 		// TODO: write a task that reopens issues if the OP speaks up.
-		_, _, err := b.ghc.Issues.CreateComment(ctx, "golang", "go", int(gi.Number), &github.IssueComment{
-			Body: github.String("Timed out in state WaitingForInfo. Closing.\n\n(I am just a bot, though. Please speak up if this is a mistake or you have the requested information.)"),
-		})
-		if err != nil {
+		if err := b.addGitHubComment(ctx, "golang", "go", gi.Number,
+			"Timed out in state WaitingForInfo. Closing.\n\n(I am just a bot, though. Please speak up if this is a mistake or you have the requested information.)"); err != nil {
 			return err
 		}
-		_, _, err = b.ghc.Issues.Edit(ctx, "golang", "go", int(gi.Number), &github.IssueRequest{State: github.String("closed")})
+		_, _, err := b.ghc.Issues.Edit(ctx, "golang", "go", int(gi.Number), &github.IssueRequest{State: github.String("closed")})
 		return err
 	})
 
 }
 
+// Issue 19776: assist with cherry picks based on Milestones
+func (b *gopherbot) checkCherryPicks(ctx context.Context) error {
+	// TODO(bradfitz): write this. Debugging stuff below only.
+	return nil
+
+	sum := map[string]int{}
+	b.gorepo.ForeachIssue(func(gi *maintner.GitHubIssue) error {
+		if gi.Milestone.IsNone() || gi.Milestone.IsUnknown() || gi.Milestone.Closed {
+			return nil
+		}
+		title := gi.Milestone.Title
+		if !strings.HasPrefix(title, "Go") || strings.Count(title, ".") != 2 {
+			return nil
+		}
+		sum[title]++
+		return nil
+	})
+	var titles []string
+	for k := range sum {
+		titles = append(titles, k)
+	}
+	sort.Slice(titles, func(i, j int) bool { return sum[titles[i]] < sum[titles[j]] })
+	for _, title := range titles {
+		fmt.Printf(" %10d %s\n", sum[title], title)
+	}
+	return nil
+}
+
+// Write "CL https://golang.org/issue/NNNN mentions this issue" on
+// Github when a new Gerrit CL references a Github issue.
+func (b *gopherbot) cl2issue(ctx context.Context) error {
+	monthAgo := time.Now().Add(-30 * 24 * time.Hour)
+	b.corpus.Gerrit().ForeachProjectUnsorted(func(gp *maintner.GerritProject) error {
+		if gp.Server() != "go.googlesource.com" {
+			return nil
+		}
+		return gp.ForeachCLUnsorted(func(cl *maintner.GerritCL) error {
+			if cl.Meta == nil || cl.Meta.AuthorTime.Before(monthAgo) {
+				// If the CL was last updated over a
+				// month ago, assume (as an
+				// optimization) that gopherbot
+				// already processed this issue.
+				return nil
+			}
+			for _, ref := range cl.GitHubIssueRefs {
+				gi := ref.Repo.Issue(ref.Number)
+				if gi == nil || gi.Closed {
+					continue
+				}
+				hasComment := false
+				substr := fmt.Sprintf("%d mentions this issue.", cl.Number)
+				gi.ForeachComment(func(c *maintner.GitHubComment) error {
+					if strings.Contains(c.Body, substr) {
+						hasComment = true
+						return errStopIteration
+					}
+					return nil
+				})
+				if !hasComment {
+					if err := b.addGitHubComment(ctx, "golang", "go", gi.Number, fmt.Sprintf("CL https://golang.org/cl/%d mentions this issue.", cl.Number)); err != nil {
+						return err
+					}
+				}
+			}
+			return nil
+		})
+	})
+	return nil
+}
+
+// errStopIteration is used to stop iteration over issues or comments.
+// It has no special meaning.
+var errStopIteration = errors.New("stop iteration")
+
 func isDocumentationTitle(t string) bool {
 	if !strings.Contains(t, "doc") && !strings.Contains(t, "Doc") {
 		return false
diff --git a/maintner/gerrit.go b/maintner/gerrit.go
index 809b8a2..599ac1e 100644
--- a/maintner/gerrit.go
+++ b/maintner/gerrit.go
@@ -30,10 +30,11 @@
 
 // Gerrit holds information about a number of Gerrit projects.
 type Gerrit struct {
-	c       *Corpus
-	dataDir string // the root Corpus data directory
-	// keys are like "https://go.googlesource.com/build"
-	projects map[string]*GerritProject
+	c        *Corpus
+	dataDir  string                    // the root Corpus data directory
+	projects map[string]*GerritProject // keyed by "go.googlesource.com/build"
+
+	clsReferencingGithubIssue map[GitHubIssueRef][]*GerritCL
 }
 
 // c.mu must be held
@@ -53,6 +54,17 @@
 	return proj
 }
 
+// ForeachProjectUnsorted calls fn for each known Gerrit project.
+// Iteration ends if fn returns a non-nil value.
+func (g *Gerrit) ForeachProjectUnsorted(fn func(*GerritProject) error) error {
+	for _, p := range g.projects {
+		if err := fn(p); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
 // GerritProject represents a single Gerrit project.
 type GerritProject struct {
 	gerrit *Gerrit
@@ -66,6 +78,24 @@
 	need   map[GitHash]bool
 }
 
+func (gp *GerritProject) ServerSlashProject() string { return gp.proj }
+
+// Server returns the Gerrit server, such as "go.googlesource.com".
+func (gp *GerritProject) Server() string {
+	if i := strings.IndexByte(gp.proj, '/'); i != -1 {
+		return gp.proj[:i]
+	}
+	return ""
+}
+
+// Project returns the Gerrit project on the server, such as "go" or "crypto".
+func (gp *GerritProject) Project() string {
+	if i := strings.IndexByte(gp.proj, '/'); i != -1 {
+		return gp.proj[i+1:]
+	}
+	return ""
+}
+
 // ForeachOpenCL calls fn for each open CL in the repo.
 //
 // If fn returns an error, iteration ends and ForeachIssue returns
@@ -90,10 +120,22 @@
 	return nil
 }
 
+func (gp *GerritProject) ForeachCLUnsorted(fn func(*GerritCL) error) error {
+	for _, cl := range gp.cls {
+		if err := fn(cl); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
 func (gp *GerritProject) logf(format string, args ...interface{}) {
 	log.Printf("gerrit "+gp.proj+": "+format, args...)
 }
 
+// gerritCLVersion is a value type used as a map key to store a CL
+// number and a patchset version. Its Version field is overloaded
+// to reference the "meta" metadata commit if the Version is 0.
 type gerritCLVersion struct {
 	CLNumber int32
 	Version  int32 // version 0 is used for the "meta" ref.
@@ -101,6 +143,9 @@
 
 // A GerritCL represents a single change in Gerrit.
 type GerritCL struct {
+	// Project is the project this CL is part of.
+	Project *GerritProject
+
 	// Number is the CL number on the Gerrit
 	// server. (e.g. 1, 2, 3)
 	Number int32
@@ -120,6 +165,47 @@
 
 	// Status will be "merged", "abandoned", "new", or "draft".
 	Status string
+
+	// GitHubIssueRefs are parsed references to GitHub issues.
+	GitHubIssueRefs []GitHubIssueRef
+}
+
+// References reports whether cl includes a commit message reference
+// to the provided Github issue ref.
+func (cl *GerritCL) References(ref GitHubIssueRef) bool {
+	for _, eref := range cl.GitHubIssueRefs {
+		if eref == ref {
+			return true
+		}
+	}
+	return false
+}
+
+func (cl *GerritCL) updateGithubIssueRefs() {
+	gp := cl.Project
+	gerrit := gp.gerrit
+	gc := cl.Commit
+
+	oldRefs := cl.GitHubIssueRefs
+	newRefs := gerrit.c.parseGithubRefs(gp.proj, gc.Msg)
+	cl.GitHubIssueRefs = newRefs
+	for _, ref := range newRefs {
+		if !clSliceContains(gerrit.clsReferencingGithubIssue[ref], cl) {
+			// TODO: make this as small as
+			// possible? Most will have length
+			// 1. Care about default capacity of
+			// 2?
+			gerrit.clsReferencingGithubIssue[ref] = append(gerrit.clsReferencingGithubIssue[ref], cl)
+		}
+	}
+	for _, ref := range oldRefs {
+		if !cl.References(ref) {
+			// TODO: remove ref from gerrit.clsReferencingGithubIssue
+			// It could be a map of maps I suppose, but not as compact.
+			// So uses a slice as the second layer, since there will normally
+			// be one item.
+		}
+	}
 }
 
 // c.mu must be held
@@ -128,9 +214,10 @@
 		return
 	}
 	c.gerrit = &Gerrit{
-		c:        c,
-		dataDir:  c.dataDir,
-		projects: map[string]*GerritProject{},
+		c:                         c,
+		dataDir:                   c.dataDir,
+		projects:                  map[string]*GerritProject{},
+		clsReferencingGithubIssue: map[GitHubIssueRef][]*GerritCL{},
 	}
 }
 
@@ -256,6 +343,7 @@
 		} else {
 			cl.Commit = gc
 			cl.Version = clv.Version
+			cl.updateGithubIssueRefs()
 		}
 		if c.didInit {
 			gp.logf("Ref %+v => %v", clv, hash)
@@ -276,6 +364,16 @@
 	}
 }
 
+// clSliceContains reports whether cls contains cl.
+func clSliceContains(cls []*GerritCL, cl *GerritCL) bool {
+	for _, v := range cls {
+		if v == cl {
+			return true
+		}
+	}
+	return false
+}
+
 // c.mu must be held
 func (gp *GerritProject) markNeededCommit(hash GitHash) {
 	c := gp.gerrit.c
@@ -296,7 +394,8 @@
 		return cl
 	}
 	cl = &GerritCL{
-		Number: num,
+		Project: gp,
+		Number:  num,
 	}
 	gp.cls[num] = cl
 	return cl
diff --git a/maintner/github.go b/maintner/github.go
index 61fd8a9..ca2b1e0 100644
--- a/maintner/github.go
+++ b/maintner/github.go
@@ -35,14 +35,14 @@
 // package for responses fulfilled from cache due to a 304 from the server.
 const xFromCache = "X-From-Cache"
 
-// githubRepoID is a github org & repo, lowercase.
-type githubRepoID struct {
+// GithubRepoID is a github org & repo, lowercase.
+type GithubRepoID struct {
 	Owner, Repo string
 }
 
-func (id githubRepoID) String() string { return id.Owner + "/" + id.Repo }
+func (id GithubRepoID) String() string { return id.Owner + "/" + id.Repo }
 
-func (id githubRepoID) valid() bool {
+func (id GithubRepoID) valid() bool {
 	if id.Owner == "" || id.Repo == "" {
 		// TODO: more validation. whatever github requires.
 		return false
@@ -53,15 +53,16 @@
 type GitHub struct {
 	c     *Corpus
 	users map[int64]*GitHubUser
-	repos map[githubRepoID]*GitHubRepo
+	repos map[GithubRepoID]*GitHubRepo
 }
 
+// Repo returns the repo if it's known. Otherwise it returns nil.
 func (g *GitHub) Repo(owner, repo string) *GitHubRepo {
-	return g.repos[githubRepoID{owner, repo}]
+	return g.repos[GithubRepoID{owner, repo}]
 }
 
 func (g *GitHub) getOrCreateRepo(owner, repo string) *GitHubRepo {
-	id := githubRepoID{owner, repo}
+	id := GithubRepoID{owner, repo}
 	if !id.valid() {
 		return nil
 	}
@@ -80,12 +81,17 @@
 
 type GitHubRepo struct {
 	github     *GitHub
-	id         githubRepoID
+	id         GithubRepoID
 	issues     map[int32]*GitHubIssue // num -> issue
 	milestones map[int64]*GitHubMilestone
 	labels     map[int64]*GitHubLabel
 }
 
+func (gr *GitHubRepo) ID() GithubRepoID { return gr.id }
+
+// Issue returns the the provided issue number, or nil if it's not known.
+func (gr *GitHubRepo) Issue(n int32) *GitHubIssue { return gr.issues[n] }
+
 // ForeachIssue calls fn for each issue in the repo.
 //
 // If fn returns an error, iteration ends and ForeachIssue returns
@@ -146,6 +152,14 @@
 	Login string
 }
 
+// GitHubIssueRef is a reference to an issue (or pull request) number
+// in a repo. These are parsed from text making references such as
+// "golang/go#1234" or just "#1234" (with an implicit Repo).
+type GitHubIssueRef struct {
+	Repo   *GitHubRepo // must be non-nil
+	Number int32       // GitHubIssue.Number
+}
+
 // GitHubIssue represents a github issue.
 // This is maintner's in-memory representation. It differs slightly
 // from the API's *github.Issue type, notably in the lack of pointers
@@ -314,6 +328,10 @@
 // IsNone reports whether ms represents the sentinel "no milestone" milestone.
 func (ms *GitHubMilestone) IsNone() bool { return ms == noMilestone }
 
+// IsUnknown reports whether ms is nil, which represents the unknown
+// state. Milestones should never be in this state, though.
+func (ms *GitHubMilestone) IsUnknown() bool { return ms == nil }
+
 // emptyMilestone is a non-nil *githubMilestone with zero values for
 // all fields.
 var emptyMilestone = new(GitHubMilestone)
@@ -509,7 +527,7 @@
 	}
 	c.github = &GitHub{
 		c:     c,
-		repos: map[githubRepoID]*GitHubRepo{},
+		repos: map[GithubRepoID]*GitHubRepo{},
 	}
 }
 
@@ -1891,3 +1909,58 @@
 	}
 	return gr
 }
+
+var rxReferences = regexp.MustCompile(`\b(?:([\w\-]+)/([\w\-]+))?\#(\d+)\b`)
+
+func (c *Corpus) parseGithubRefs(gerritProj string, commitMsg string) []GitHubIssueRef {
+	// Use of rxReferences by itself caused this function to take 20% of the CPU time.
+	// TODO(bradfitz): stop using regexps here.
+	// But in the meantime, help the regexp engine with this one weird trick:
+	// Reduce the length of the string given to FindAllStringSubmatch.
+	// Discard all lines before the first line containing a '#'.
+	// The "Fixes #nnnn" is usually at the end, so this discards most of the input.
+	// Now CPU is only 2% instead of 20%.
+	hash := strings.IndexByte(commitMsg, '#')
+	if hash == -1 {
+		return nil
+	}
+	nl := strings.LastIndexByte(commitMsg[:hash], '\n')
+	commitMsg = commitMsg[nl+1:]
+
+	// TODO: use FindAllStringSubmatchIndex instead, so we can
+	// back up and see what's behind it and ignore "#1", "#2",
+	// "#3" 'references' which are actually bullets or ARM
+	// disassembly, and only respect them as real if they have the
+	// word "Fixes " or "Issue " or similar before them.
+	ms := rxReferences.FindAllStringSubmatch(commitMsg, -1)
+	if len(ms) == 0 {
+		return nil
+	}
+	/* e.g.
+	2017/03/30 21:42:07 matches: [["golang/go#9327" "golang" "go" "9327"]]
+	2017/03/30 21:42:07 matches: [["golang/go#16512" "golang" "go" "16512"] ["golang/go#18404" "golang" "go" "18404"]]
+	2017/03/30 21:42:07 matches: [["#1" "" "" "1"]]
+	2017/03/30 21:42:07 matches: [["#10234" "" "" "10234"]]
+	2017/03/30 21:42:31 matches: [["GoogleCloudPlatform/gcloud-golang#262" "GoogleCloudPlatform" "gcloud-golang" "262"]]
+	2017/03/30 21:42:31 matches: [["GoogleCloudPlatform/google-cloud-go#481" "GoogleCloudPlatform" "google-cloud-go" "481"]]
+	*/
+	github := c.GitHub()
+	refs := make([]GitHubIssueRef, 0, len(ms))
+	for _, m := range ms {
+		owner, repo, numStr := strings.ToLower(m[1]), strings.ToLower(m[2]), m[3]
+		num, err := strconv.ParseInt(numStr, 10, 32)
+		if err != nil {
+			continue
+		}
+		if owner == "" {
+			if gerritProj == "go.googlesource.com/go" {
+				owner, repo = "golang", "go"
+			} else {
+				continue
+			}
+		}
+		gr := github.getOrCreateRepo(owner, repo)
+		refs = append(refs, GitHubIssueRef{gr, int32(num)})
+	}
+	return refs
+}
diff --git a/maintner/logger.go b/maintner/logger.go
index 8cc4a3f..d22c9fa 100644
--- a/maintner/logger.go
+++ b/maintner/logger.go
@@ -7,7 +7,6 @@
 import (
 	"context"
 	"fmt"
-	"log"
 	"os"
 	"path/filepath"
 	"strings"
@@ -87,7 +86,6 @@
 	go func() {
 		defer close(ch)
 		err := d.ForeachFile(func(fullPath string, fi os.FileInfo) error {
-			log.Printf("File %s has size %v", fullPath, fi.Size())
 			return reclog.ForeachFileRecord(fullPath, func(off int64, hdr, rec []byte) error {
 				m := new(maintpb.Mutation)
 				if err := proto.Unmarshal(rec, m); err != nil {
diff --git a/maintner/maintner.go b/maintner/maintner.go
index 94fcb16..0d3cd87 100644
--- a/maintner/maintner.go
+++ b/maintner/maintner.go
@@ -77,6 +77,14 @@
 	return new(GitHub)
 }
 
+// Gerrit returns the corpus's Gerrit data.
+func (c *Corpus) Gerrit() *Gerrit {
+	if c.gerrit != nil {
+		return c.gerrit
+	}
+	return new(Gerrit)
+}
+
 // mustProtoFromTime turns a time.Time into a *timestamp.Timestamp or panics if
 // in is invalid.
 func mustProtoFromTime(in time.Time) *timestamp.Timestamp {
diff --git a/maintner/maintner_test.go b/maintner/maintner_test.go
index d3eb200..be05cef 100644
--- a/maintner/maintner_test.go
+++ b/maintner/maintner_test.go
@@ -80,10 +80,10 @@
 	github.users = map[int64]*GitHubUser{
 		u1.ID: u1,
 	}
-	github.repos = map[githubRepoID]*GitHubRepo{
-		githubRepoID{"golang", "go"}: &GitHubRepo{
+	github.repos = map[GithubRepoID]*GitHubRepo{
+		GithubRepoID{"golang", "go"}: &GitHubRepo{
 			github: github,
-			id:     githubRepoID{"golang", "go"},
+			id:     GithubRepoID{"golang", "go"},
 			issues: map[int32]*GitHubIssue{
 				3: &GitHubIssue{
 					Number:    3,
@@ -121,7 +121,7 @@
 		State:     github.String("closed"),
 	}
 	gr := &GitHubRepo{
-		id: githubRepoID{"golang", "go"},
+		id: GithubRepoID{"golang", "go"},
 	}
 	is := gr.newMutationFromIssue(nil, gh)
 	want := &maintpb.Mutation{GithubIssue: &maintpb.GithubIssueMutation{
@@ -167,7 +167,7 @@
 		Assignees: assignees,
 	}
 	gr := &GitHubRepo{
-		id: githubRepoID{"golang", "go"},
+		id: GithubRepoID{"golang", "go"},
 		issues: map[int32]*GitHubIssue{
 			3: issue,
 		},
@@ -176,8 +176,8 @@
 		users: map[int64]*GitHubUser{
 			u1.ID: u1,
 		},
-		repos: map[githubRepoID]*GitHubRepo{
-			githubRepoID{"golang", "go"}: gr,
+		repos: map[GithubRepoID]*GitHubRepo{
+			GithubRepoID{"golang", "go"}: gr,
 		},
 	}