blob: c7ce2ef89e0c512496a8fc6e179e880033f1464e [file] [log] [blame]
// Copyright 2024 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package gerrit
import (
"fmt"
"iter"
"slices"
"strings"
"golang.org/x/oscar/internal/docs"
"golang.org/x/oscar/internal/storage/timed"
)
// LastWritten returns the DBTime field of the change event.
//
// It implements [docs.Entry.LastWritten].
func (ce *ChangeEvent) LastWritten() timed.DBTime {
	written := ce.DBTime
	return written
}
// ToDocs converts a ChangeEvent to an embeddable document (wrapped
// as an iterator).
//
// This document consists of a change commit message and its comments.
// The ID for such documents is of the form
//
//	https://<gerrit-instance>/c/<repo>/+/<n>#related-content.
//
// The "#related-content" fragment is used to allow other types of
// gerrit documents to reuse the main portion of the change URL.
// The URL points to the top of the CL page since the fragment
// does not exist.
//
// ToDocs returns (nil, false) if any of the necessary data cannot be found
// in the client's db. Each such failure is logged as an error. A document
// whose text exceeds geminiCharLimit is still returned, but a warning is
// logged.
//
// Implements [docs.Source.ToDocs].
func (c *Client) ToDocs(ce *ChangeEvent) (iter.Seq[*docs.Doc], bool) {
	ch := c.change(ce)
	if ch == nil {
		c.slog.Error("gerrit.ChangeEvent.ToDocs cannot find change", "change", ce.ChangeNum)
		return nil, false
	}
	title := c.ChangeSubject(ch.ch)
	body, err := c.relatedDocBody(ch)
	if err != nil {
		// Include the underlying error so the log record explains why the
		// body could not be assembled.
		c.slog.Error("gerrit.ChangeEvent.ToDocs cannot find comments", "change", ce.ChangeNum, "err", err)
		return nil, false
	}
	// Measure the cleaned text, since that is what ends up in the document.
	// len counts bytes, so this is an approximation for non-ASCII text.
	text := cleanBody(body)
	if len(text) > geminiCharLimit {
		c.slog.Warn("gerrit.ChangeEvent.ToDocs potential truncation by gemini", "change", ce.ChangeNum, "docSize", len(text))
	}
	id := relatedDocURL(ch)
	return slices.Values([]*docs.Doc{{
		ID:    id,
		Title: title,
		Text:  text,
	}}), true
}
// geminiCharLimit is an approximate limit on the number of
// document characters a gemini text embedding can accept.
// Gemini text embedding models have an input token limit
// of 2048, where each token is about four characters long
// (2048 * 4 = 8192; the value below is a slightly larger round number).
// Gemini truncates documents after this limit.
//
// ToDocs compares this limit against the length of the document body
// in bytes and only logs a warning when it is exceeded.
//
// For more info, see
// https://ai.google.dev/gemini-api/docs/models/gemini#text-embedding-and-embedding
const geminiCharLimit = 8200
// changeInfo accumulates information from [Change]
// and [ChangeEvent] needed to grab change subject,
// messages, and comments.
type changeInfo struct {
// instance is copied from the Instance field of the originating
// ChangeEvent; it forms the host part of the document URL.
instance string
// project is the project under which the change was found;
// it forms the repo part of the document URL.
project string
// number is copied from the ChangeNum field of the originating
// ChangeEvent.
number int
// ch is the change looked up for project and number.
ch *Change
}
// change looks up the gerrit change that ce refers to and bundles it
// with the instance, project, and change number needed by the other
// helpers. Only the projects yielded by c.projects() are searched;
// the first project containing the change number wins. If no project
// has the change, change returns nil.
func (c *Client) change(ce *ChangeEvent) *changeInfo {
	instance, num := ce.Instance, ce.ChangeNum
	for project := range c.projects() {
		found := c.Change(project, ce.ChangeNum)
		if found == nil {
			continue
		}
		// At most one project can match ce.ChangeNum, so stop here.
		return &changeInfo{
			instance: instance,
			project:  project,
			number:   num,
			ch:       found,
		}
	}
	return nil
}
// relatedDocBody returns the document body for the gerrit change ci,
// intended for surfacing related content. The body consists of
// the change description followed by change messages and
// comments appearing in their chronological order. Every component
// is trimmed of surrounding white space and components are separated
// by a blank line.
//
// If the comments for the change cannot be obtained, relatedDocBody
// returns an empty string together with the underlying error.
func (c *Client) relatedDocBody(ci *changeInfo) (string, error) {
	comments, err := c.comments(ci)
	if err != nil {
		// Propagate the failure; returning a nil error here would make
		// the caller index an empty document as if it were valid.
		return "", err
	}
	messages := c.ChangeMessages(ci.ch)

	// Sort comments and messages based on their creation/update time.
	type datedMessage struct {
		date    TimeStamp
		message string
	}
	var dmsgs []datedMessage
	for _, cmt := range comments {
		dmsgs = append(dmsgs, datedMessage{date: cmt.Updated, message: cmt.Message})
	}
	for _, msg := range messages {
		dmsgs = append(dmsgs, datedMessage{date: msg.Date, message: msg.Message})
	}
	// A stable sort keeps entries with equal timestamps in insertion
	// order (comments before messages).
	slices.SortStableFunc(dmsgs, func(mi, mj datedMessage) int {
		return mi.date.Time().Compare(mj.date.Time())
	})

	components := make([]string, 0, len(dmsgs)+1)
	components = append(components, strings.TrimSpace(c.ChangeDescription(ci.ch)))
	for _, m := range dmsgs {
		components = append(components, strings.TrimSpace(m.message))
	}
	return strings.Join(components, "\n\n"), nil
}
// relatedDocURL builds the document ID for the gerrit change described
// by ci. The ID is the change URL on ci's gerrit instance with a
// "#related-content" fragment appended, which distinguishes documents
// used for surfacing related content.
func relatedDocURL(ci *changeInfo) string {
	const urlFormat = "https://%s/c/%s/+/%d#related-content"
	host, repo, num := ci.instance, ci.project, ci.number
	return fmt.Sprintf(urlFormat, host, repo, num)
}
// comments returns the file comments for the gerrit change ci,
// flattened into a single slice.
//
// Comments are grouped per file by c.Comments. The file locations are
// not kept in the result, but the files are visited in sorted
// order so that the returned slice is the same on every call. Ranging
// over the map directly would yield a random order, which would
// leak into documents whenever comments share a timestamp.
//
// The returned error is currently always nil.
func (c *Client) comments(ci *changeInfo) ([]*CommentInfo, error) {
	byFile := c.Comments(ci.project, ci.number)

	files := make([]string, 0, len(byFile))
	for f := range byFile {
		files = append(files, f)
	}
	slices.Sort(files)

	var cmts []*CommentInfo
	for _, f := range files {
		cmts = append(cmts, byFile[f]...)
	}
	return cmts, nil
}
// cleanBody prepares body for indexing.
// At present it returns body unchanged, on the assumption that the LLM
// copes well enough with the raw text.
// Possible future work includes inlining other mentioned changes,
// playground URLs, and GH issues.
// TODO(#35): remove irrelevant comments to fit the Gemini token limit.
func cleanBody(body string) string {
	cleaned := body
	return cleaned
}