blob: 8666425b80028d15492feeff376b0d61e5e5a44e [file] [log] [blame]
// Copyright 2024 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package related implements posting about related issues to GitHub.
package related
import (
"context"
"encoding/json"
"errors"
"fmt"
"log/slog"
"strings"
"time"
"golang.org/x/oscar/internal/actions"
"golang.org/x/oscar/internal/docs"
"golang.org/x/oscar/internal/github"
"golang.org/x/oscar/internal/search"
"golang.org/x/oscar/internal/storage"
"golang.org/x/oscar/internal/storage/timed"
"rsc.io/ordered"
)
// A Poster posts to GitHub about related issues (and eventually other documents).
// Create one with [New], configure it with its Set*, Skip*, and Enable*
// methods, and then call [Poster.Run] or [Poster.Post].
type Poster struct {
// slog receives all of the Poster's log output.
slog *slog.Logger
// db holds the Poster's persistent state: the action log entries it
// writes and the older "posted" markers it still consults.
db storage.DB
// vdb is the vector database searched for related documents.
vdb storage.VectorDB
// github is used to watch issue events, look up issues, and post comments.
github *github.Client
// docs is the corpus handed to the vector search alongside vdb.
docs *docs.Corpus
// projects is the set of GitHub projects enabled by EnableProject.
projects map[string]bool
// watcher iterates over GitHub events not yet marked old by this Poster.
watcher *timed.Watcher[*github.Event]
// name identifies this Poster's watcher state; see New.
name string
// timeLimit is the cutoff before which created issues are skipped.
timeLimit time.Time
// ignores are predicates added by the Skip* methods; an issue is
// skipped if any of them reports true.
ignores []func(*github.Issue) bool
// maxResults is the maximum number of related documents per comment.
maxResults int
// scoreCutoff is the minimum vector search score for a related document.
scoreCutoff float64
// post reports whether EnablePosts has been called.
post bool
// For the action log.
// actionKind is the kind under which actions are registered and looked up.
actionKind string
// logAction is the function returned by actions.Register; calling it
// adds an entry to the action log.
logAction actions.BeforeFunc
}
// New returns a Poster identified by name.
//
// The Poster logs to lg, keeps its state in db, watches gh for new GitHub
// issues, finds related documents with vdb, and reads document content
// from docs. The name selects the Poster's stored watcher state: a later
// call to New with the same name continues from the same state.
//
// The defaults are a 48-hour time limit measured from this call, at most
// 10 results per comment, and a minimum score of 0.82; no project is
// enabled and posting is off. Use the [Poster] methods to adjust these
// (especially [Poster.EnableProject] and [Poster.EnablePosts])
// before calling [Poster.Run].
func New(lg *slog.Logger, db storage.DB, gh *github.Client, vdb storage.VectorDB, docs *docs.Corpus, name string) *Poster {
	p := new(Poster)

	// Dependencies supplied by the caller.
	p.slog = lg
	p.db = db
	p.vdb = vdb
	p.github = gh
	p.docs = docs

	// State owned by the Poster.
	p.projects = make(map[string]bool)
	p.watcher = gh.EventWatcher("related.Poster:" + name)
	p.name = name

	// Default posting parameters.
	p.timeLimit = time.Now().Add(-defaultTooOld)
	p.maxResults = defaultMaxResults
	p.scoreCutoff = defaultScoreCutoff

	// TODO: Perhaps the action kind should include name, but perhaps not.
	// This makes sure we only ever post to each issue once.
	p.actionKind = "related.Poster"
	p.logAction = actions.Register(p.actionKind, &actioner{p: p})

	return p
}
// SetTimeLimit controls how old an issue can be for the Poster to post to it.
// Issues created before time t will be skipped; the comparison uses the
// issue's creation time, not the time of its latest update.
// The default is not to post to issues that are more than 48 hours old
// at the time of the call to [New].
func (p *Poster) SetTimeLimit(t time.Time) {
p.timeLimit = t
}
// defaultTooOld is how far before the call to [New] the default time limit lies.
const defaultTooOld = 48 * time.Hour
// SetMaxResults sets the maximum number of related documents to
// post to the issue.
// The limit is applied to the search results that remain after the
// issue itself has been removed from them.
// The default is 10.
func (p *Poster) SetMaxResults(max int) {
p.maxResults = max
}
// defaultMaxResults is the result limit used until SetMaxResults is called.
const defaultMaxResults = 10
// SetMinScore sets the minimum vector search score that a
// [storage.VectorResult] must have to be considered a related document.
// The value is passed to the vector search as its threshold.
// The default is 0.82, which was determined empirically.
func (p *Poster) SetMinScore(min float64) {
p.scoreCutoff = min
}
// defaultScoreCutoff is the minimum score used until SetMinScore is called.
const defaultScoreCutoff = 0.82
// SkipBodyContains adds a skip rule to the Poster: it will not post on
// any issue whose body contains text as a substring.
// Rules accumulate; an issue is skipped if any rule matches.
func (p *Poster) SkipBodyContains(text string) {
	bodyHasText := func(issue *github.Issue) bool {
		return strings.Contains(issue.Body, text)
	}
	p.ignores = append(p.ignores, bodyHasText)
}
// SkipTitlePrefix adds a skip rule to the Poster: it will not post on
// any issue whose title begins with prefix.
// Rules accumulate; an issue is skipped if any rule matches.
func (p *Poster) SkipTitlePrefix(prefix string) {
	titleStartsWith := func(issue *github.Issue) bool {
		return strings.HasPrefix(issue.Title, prefix)
	}
	p.ignores = append(p.ignores, titleStartsWith)
}
// SkipTitleSuffix adds a skip rule to the Poster: it will not post on
// any issue whose title ends with suffix.
// Rules accumulate; an issue is skipped if any rule matches.
func (p *Poster) SkipTitleSuffix(suffix string) {
	titleEndsWith := func(issue *github.Issue) bool {
		return strings.HasSuffix(issue.Title, suffix)
	}
	p.ignores = append(p.ignores, titleEndsWith)
}
// EnableProject enables the Poster to post on issues in the given GitHub project (for example "golang/go").
// Events from projects that have not been enabled are skipped.
// See also [Poster.EnablePosts], which must also be called to post anything to GitHub.
func (p *Poster) EnableProject(project string) {
p.projects[project] = true
}
// EnablePosts enables the Poster to post to GitHub.
// If EnablePosts has not been called, [Poster.Run] logs what it would post but does not post the messages,
// and it does not advance its watcher, so later runs see the same events again.
// See also [Poster.EnableProject], which must also be called to set the projects being considered.
func (p *Poster) EnablePosts() {
p.post = true
}
// An action has all the information needed to post a comment to a GitHub issue.
// It is stored in the action log in JSON form and decoded again when the
// action is run or displayed.
type action struct {
// Issue is the issue to comment on.
Issue *github.Issue
// Changes carries the body of the comment to post.
Changes *github.IssueCommentChanges
}
// result is the result of applying an action.
// It is what runFromActionLog encodes and returns after a successful post.
type result struct {
URL string // URL of new comment
}
// Run runs a single round of posting to GitHub.
// It scans all open issues that have been created since the last call to [Poster.Run]
// using a Poster with the same name (see [New]).
// Run skips closed issues, and it also skips pull requests.
//
// For each issue that matches the configured posting constraints
// (see [Poster.EnableProject], [Poster.SetTimeLimit], [Poster.SkipBodyContains],
// [Poster.SkipTitlePrefix], and [Poster.SkipTitleSuffix]),
// Run computes an embedding of the issue body text (ignoring comments)
// and looks in the vector database for other documents (currently only issues)
// that are aligned closely enough with that body text
// (see [Poster.SetMinScore]) and posts a limited number of matches
// (see [Poster.SetMaxResults]).
//
// Run logs each post to the [slog.Logger] passed to [New].
// If [Poster.EnablePosts] has been called, then Run also adds to the
// action log an action that will post the comment to GitHub (see [actions.Run]),
// and advances its GitHub issue watcher's incremental cursor to speed future calls to Run.
//
// When [Poster.EnablePosts] has not been called, Run only logs the comments it would post.
// Future calls to Run will reprocess the same issues and re-log the same comments.
//
// A failure on one event is logged and does not stop the round;
// Run itself always returns nil.
func (p *Poster) Run(ctx context.Context) error {
	p.slog.Info("related.Poster start", "name", p.name, "post", p.post, "latest", p.watcher.Latest())
	defer func() {
		p.slog.Info("related.Poster end", "name", p.name, "latest", p.watcher.Latest())
	}()
	// Deferred after the log call above, so the final flush happens
	// before the "end" message reports the latest position.
	defer p.watcher.Flush()

	for e := range p.watcher.Recent() {
		advance, err := p.logPostIssue(ctx, e)
		switch {
		case err != nil:
			p.slog.Error("related.Poster", "issue", e.Issue, "event", e, "error", err)
		case advance:
			p.watcher.MarkOld(e.DBTime)
			// Flush immediately to make sure we don't re-post if interrupted later in the loop.
			p.watcher.Flush()
			p.slog.Info("related.Poster advanced watcher", "latest", p.watcher.Latest(), "event", e)
		default:
			p.slog.Info("related.Poster watcher not advanced", "latest", p.watcher.Latest(), "event", e)
		}
	}
	return nil
}
// Post posts an issue comment for the given GitHub issue.
//
// It applies the same per-event logic as [Poster.Run], but it neither
// reads nor updates the Poster's GitHub issue watcher's incremental
// cursor. Post can therefore be called on any issue without affecting
// where future calls to [Poster.Run] start.
//
// The issue must already have an entry in the database and in the
// vector database. If no "/issues" event for it is found in the
// database, Post returns an error wrapping errEventNotFound.
func (p *Poster) Post(ctx context.Context, project string, issue int64) error {
	event := lookupIssueEvent(project, issue, p.github)
	if event == nil {
		return fmt.Errorf("related.Poster.Post(project=%s, issue=%d): %w", project, issue, errEventNotFound)
	}
	// Whether the watcher could advance is irrelevant here.
	if _, err := p.logPostIssue(ctx, event); err != nil {
		return err
	}
	return nil
}
// Sentinel errors wrapped by the errors this package returns,
// so that callers can test for them with errors.Is.
var (
// errEventNotFound: no "/issues" event exists in the database for the requested issue.
errEventNotFound = errors.New("event not found in database")
// errVectorSearchFailed: the issue has no entry in the vector database.
errVectorSearchFailed = errors.New("vector search failed")
// errPostIssueCommentFailed: GitHub returned an error when posting the comment.
errPostIssueCommentFailed = errors.New("post issue comment failed")
)
// lookupIssueEvent scans the database events for the given issue in
// project and returns the first one whose API is "/issues".
// It returns nil if there is no such event.
func lookupIssueEvent(project string, issue int64, gh *github.Client) *github.Event {
	// The issue number serves as both ends of the range, so only
	// events for that one issue are visited.
	for ev := range gh.Events(project, issue, issue) {
		if ev.API != "/issues" {
			continue
		}
		return ev
	}
	return nil
}
// logPostIssue decides whether a related-documents comment should be
// posted for the issue in event e and, if so, adds the corresponding
// action to the action log.
//
// The advance result reports whether the event counts as handled, by
// this call or an earlier one, so that the Poster's watcher may move
// past it. An event is handled only when posting is enabled AND either
//   - an action for the issue is in the action log (added now or found
//     already present), or
//   - no comment is needed because no related documents were found.
//
// Skipped issues are not considered handled.
//
// A non-nil error means the issue has no entry in the vector database.
func (p *Poster) logPostIssue(ctx context.Context, e *github.Event) (advance bool, _ error) {
	if skip, why := p.skip(e); skip {
		p.slog.Info("related.Poster skip", "name", p.name, "project",
			e.Project, "issue", e.Issue, "reason", why, "event", e)
		return false, nil
	}

	// An existing log entry means there is nothing more to do. The check
	// is only an optimization that avoids an expensive vector search, so
	// no lock is needed; [actions.before] locks to prevent duplicate
	// entries.
	if _, logged := actions.Get(p.db, p.actionKind, logKey(e)); logged {
		p.slog.Info("related.Poster already logged", "name", p.name, "project", e.Project, "issue", e.Issue, "event", e)
		// With posting enabled the watcher may advance: a comment has
		// already been logged for this issue.
		return p.post, nil
	}

	target := issueURL(e.Project, e.Issue)
	p.slog.Debug("related.Poster consider", "url", target)
	related, found := p.search(target)
	if !found {
		return false, fmt.Errorf("%w url=%s", errVectorSearchFailed, target)
	}
	if len(related) == 0 {
		p.slog.Info("related.Poster found no related documents", "name", p.name, "project", e.Project, "issue", e.Issue, "event", e)
		// With posting enabled, an issue without related documents is
		// handled and need not be looked at again.
		return p.post, nil
	}

	body := p.comment(related)
	p.slog.Info("related.Poster post", "name", p.name, "project", e.Project, "issue", e.Issue, "comment", body)
	if !p.post {
		// Dry run: the comment was only logged, so the issue is not handled.
		return false, nil
	}

	p.logAction(p.db, logKey(e), storage.JSON(&action{
		Issue:   e.Typed.(*github.Issue),
		Changes: &github.IssueCommentChanges{Body: body},
	}), !actions.RequiresApproval)
	return true, nil
}
// actioner is the value a Poster registers with actions.Register.
// Its methods run and display logged actions on behalf of p.
type actioner struct {
p *Poster
}
// Run executes the JSON-encoded action in data by delegating to
// [Poster.runFromActionLog], and returns that method's encoded result.
func (ar *actioner) Run(ctx context.Context, data []byte) ([]byte, error) {
return ar.p.runFromActionLog(ctx, data)
}
// ForDisplay returns a human-readable rendering of the JSON-encoded
// action in data: the issue's HTML URL on the first line, followed by
// the body of the comment to be posted.
//
// If data cannot be decoded, or decodes to an action with no issue or
// no comment changes, ForDisplay returns a string beginning with
// "ERROR: " rather than panicking.
func (ar *actioner) ForDisplay(data []byte) string {
	var a action
	if err := json.Unmarshal(data, &a); err != nil {
		return fmt.Sprintf("ERROR: %v", err)
	}
	// Well-formed JSON such as "{}" or "null" decodes without error but
	// leaves the pointer fields nil; dereferencing them would panic.
	if a.Issue == nil || a.Changes == nil {
		return "ERROR: action is missing issue or changes"
	}
	return a.Issue.HTMLURL + "\n" + a.Changes.Body
}
// runFromActionLog is the entry point used by actions.Run to execute a
// logged action. It decodes the JSON in data into an action, runs it
// with [Poster.runAction], and returns the JSON encoding of the result.
// A decoding or posting failure is returned as the error, with no data.
func (p *Poster) runFromActionLog(ctx context.Context, data []byte) ([]byte, error) {
	act := new(action)
	if err := json.Unmarshal(data, act); err != nil {
		return nil, err
	}

	posted, err := p.runAction(ctx, act)
	if err != nil {
		return nil, err
	}
	return storage.JSON(posted), nil
}
// runAction runs the given action: it posts a.Changes as a comment on
// a.Issue and returns the URL of the new comment.
//
// If GitHub returns an error, runAction returns an error that wraps both
// errPostIssueCommentFailed and the GitHub error, so callers can examine
// either with errors.Is or errors.As. The error is then recorded in the
// action log for this action.
//
// Gaby's original behavior was to log the error, not advance the watcher,
// and continue iterating over watcher.Recent. So subsequent successful
// posts would advance the watcher over the failed one, leaving only the
// slog entry as evidence of the failure.
//
// The current behavior always advances the watcher and preserves the error
// in the action log.
//
// It is unclear what the right behavior is, but at least at present all
// failed actions are available to the program and could be re-run.
func (p *Poster) runAction(ctx context.Context, a *action) (*result, error) {
	_, url, err := p.github.PostIssueComment(ctx, a.Issue, a.Changes)
	if err != nil {
		// Both operands use %w so the underlying GitHub error stays in
		// the error chain; the message text is the same as with %v.
		return nil, fmt.Errorf("%w issue=%d: %w", errPostIssueCommentFailed, a.Issue.Number, err)
	}
	return &result{URL: url}, nil
}
// issueURL builds the github.com URL for issue number issue in project,
// where project has the form "owner/repo".
func issueURL(project string, issue int64) string {
	const host = "https://github.com/"
	return host + project + "/issues/" + fmt.Sprint(issue)
}
// search performs a vector search to find documents related to the
// issue with URL u.
//
// It expects u to already have an entry in the vector database and
// returns ok=false if there is none. Otherwise it returns the search
// results that score at least p.scoreCutoff, with the issue itself
// removed, trimmed to at most p.maxResults entries. A negative
// p.maxResults is treated as zero.
func (p *Poster) search(u string) (_ []search.Result, ok bool) {
	vec, ok := p.vdb.Get(u)
	if !ok {
		return nil, false
	}

	// Clamp so that the trim below can never slice with a negative bound.
	limit := max(p.maxResults, 0)

	results := search.Vector(p.vdb, p.docs, &search.VectorRequest{
		Options: search.Options{
			Threshold: p.scoreCutoff,
			Limit:     limit + 5, // add a buffer for filters
			DenyKind:  []string{search.KindUnknown},
		},
		Vector: vec,
	})

	// Remove the query itself wherever it appears. It is usually the top
	// hit, but a document with an equal score can be ordered ahead of it,
	// so checking only the first result is not enough.
	related := results[:0]
	for _, r := range results {
		if r.ID != u {
			related = append(related, r)
		}
	}

	// Trim length.
	if len(related) > limit {
		related = related[:limit]
	}
	return related, true
}
// relatedContentGroup identifies one of the groupings of the related
// post content. Examples are the group containing related issues and
// the group containing related code changes.
//
// It is an integer enumeration: its values are consecutive iota
// constants and it serves as a map key, neither of which suits a
// floating-point type.
type relatedContentGroup int
// The related content groups. Each search result is assigned to exactly
// one of them according to its kind.
const (
// issues holds GitHub issues.
issues relatedContentGroup = iota
// changes holds Gerrit changes.
changes
// discussions holds GitHub discussions and Google Groups conversations.
discussions
// documentation holds every other kind of result.
documentation
)
// relatedGroupTitles are the titles for each
// related content group, to be displayed
// in the related post comment as section headings.
var relatedGroupTitles = map[relatedContentGroup]string{
issues: "Related Issues",
changes: "Related Code Changes",
discussions: "Related Discussions",
documentation: "Related Documentation",
}
// comment renders the Markdown comment to post to GitHub for the given
// related documents.
//
// The results are split into sections (issues, code changes,
// documentation, discussions, in that order); empty sections are
// omitted. Each entry is a link to the document, labeled with its title
// and, when the document is a known GitHub issue, its number and a
// "(closed)" marker. A fixed feedback footer ends the comment.
func (p *Poster) comment(results []search.Result) string {
	// groupOf picks the section a result belongs in.
	groupOf := func(r search.Result) relatedContentGroup {
		switch r.Kind {
		case search.KindGitHubIssue:
			return issues
		case search.KindGoGerritChange:
			return changes
		case search.KindGitHubDiscussion, search.KindGoogleGroupConversation:
			return discussions
		}
		// KindGoDocumentation, KindGoDevPage, KindGoWiki,
		// KindGoBlog, KindGoReference
		return documentation
	}

	byGroup := make(map[relatedContentGroup][]search.Result)
	for _, r := range results {
		g := groupOf(r)
		byGroup[g] = append(byGroup[g], r)
	}

	// render produces the Markdown for one section: a bold heading
	// followed by one list item per result.
	render := func(heading string, members []search.Result) string {
		var md strings.Builder
		fmt.Fprintf(&md, "**%s**\n\n", heading)
		for _, r := range members {
			// Prefer the document's own title; fall back to its ID.
			label := r.Title
			if label == "" {
				label = cleanTitle(r.ID)
			}
			// Issues known to the GitHub client get their number and state.
			var suffix string
			if issue, err := p.github.LookupIssueURL(r.ID); err == nil {
				suffix = fmt.Sprint(" #", issue.Number)
				if issue.ClosedAt != "" {
					suffix += " (closed)"
				}
			}
			fmt.Fprintf(&md, " - [%s%s](%s) <!-- score=%.5f -->\n", markdownEscape(label), suffix, r.ID, r.Score)
		}
		return md.String()
	}

	// The order of this list is the order of the sections in the comment.
	order := []relatedContentGroup{issues, changes, documentation, discussions}
	var parts []string
	for _, g := range order {
		if members := byGroup[g]; len(members) > 0 {
			parts = append(parts, render(relatedGroupTitles[g], members))
		}
	}

	const footer = "\n<sub>(Emoji vote if this was helpful or unhelpful; more detailed feedback welcome in [this discussion](https://github.com/golang/go/discussions/67901).)</sub>\n"
	return strings.Join(parts, "\n") + footer
}
// cleanTitle tidies document title t so that it reads better to the
// user. At present it strips one trailing "#related-content" fragment,
// which document embedders add synthetically; any other title is
// returned unchanged.
func cleanTitle(t string) string {
	// gerrit related change URLs
	const syntheticFragment = "#related-content"
	if trimmed, ok := strings.CutSuffix(t, syntheticFragment); ok {
		return trimmed
	}
	return t
}
// skip reports whether the Poster should ignore event e and, if so, a
// human-readable reason.
//
// An event is skipped when its project is not enabled, it is not an
// "/issues" event, the issue is closed or is a pull request, its
// creation time cannot be parsed or is before the time limit, one of
// the configured Skip* rules matches, or the issue is already marked
// as posted in the database. The checks run in that order and the
// first one that applies supplies the reason.
func (p *Poster) skip(e *github.Event) (_ bool, reason string) {
	const wantAPI = "/issues"
	switch {
	case !p.projects[e.Project]:
		return true, fmt.Sprintf("project %s not enabled for this Poster", e.Project)
	case e.API != wantAPI:
		return true, fmt.Sprintf("wrong API %s (expected %s)", e.API, wantAPI)
	}

	issue := e.Typed.(*github.Issue)
	switch {
	case issue.State == "closed":
		return true, "issue is closed"
	case issue.PullRequest != nil:
		return true, "pull request"
	}

	created, err := time.Parse(time.RFC3339, issue.CreatedAt)
	if err != nil {
		p.slog.Error("related.Poster parse createdat", "CreatedAt", issue.CreatedAt, "err", err)
		return true, "could not parse createdat"
	}
	if created.Before(p.timeLimit) {
		return true, fmt.Sprintf("created=%s before time limit=%s", created, p.timeLimit)
	}

	for i, ignore := range p.ignores {
		if ignore(issue) {
			return true, fmt.Sprintf("ignored by function ignores[%d]", i)
		}
	}

	if p.posted(e) {
		return true, "already posted"
	}
	return false, ""
}
// posted reports whether the event has already been posted,
// that is, whether the database has an entry under postedKey(e).
// This should only be necessary for a short time, since the action log
// is now handling this check.
func (p *Poster) posted(e *github.Event) bool {
_, ok := p.db.Get(postedKey(e))
return ok
}
// postedKey returns the database key to use when marking an event as posted.
// The key is the ordered encoding of the "triage.Posted" prefix followed by
// the event's project and issue number.
func postedKey(e *github.Event) []byte {
return ordered.Encode("triage.Posted", e.Project, e.Issue)
}
// logKey returns the key for the event in the action log.
// This is only a portion of the database key; it is prefixed by the Poster's action
// kind.
// The key depends only on the event's project and issue number, so every
// event for the same issue maps to the same action log entry.
func logKey(e *github.Event) []byte {
return ordered.Encode(e.Project, e.Issue)
}
// Latest returns the latest known DBTime marked old by the Poster's Watcher.
// It simply forwards to the watcher's own Latest method.
func (p *Poster) Latest() timed.DBTime {
return p.watcher.Latest()
}
// markdownEscaper backslash-escapes the characters that would otherwise
// be interpreted as Markdown or HTML markup.
//
// The backslash itself is included: if it were left alone, a backslash
// already present in the input would combine with the backslash added
// for the following character (for example `\]` would become `\\]`,
// an escaped backslash followed by a live bracket).
var markdownEscaper = strings.NewReplacer(
	`\`, `\\`,
	"_", `\_`,
	"*", `\*`,
	"`", "\\`",
	"[", `\[`,
	"]", `\]`,
	"<", `\<`,
	">", `\>`,
	"&", `\&`,
)

// markdownEscape returns s with Markdown-significant characters
// backslash-escaped, so that s appears literally when embedded in
// Markdown text such as the label of a link.
func markdownEscape(s string) string {
	return markdownEscaper.Replace(s)
}