internal: unify docs Sync
Refactor the internal packages to re-use code for syncing data
to a [docs.Corpus].
The *docs packages were all essentially doing the same thing, requiring
us to create a new package and copy code every time we wanted to add
a new data source.
With this change, a data source that implements the [docs.Source]
interface (and whose events implement the [docs.Embeddable] interface)
can call [docs.Sync] to sync their new events to a corpus.
This change allows us to delete the githubdocs, discussiondocs,
gerritdocs and crawldocs packages. All tests are retained and moved to
the client packages.
Note also that the Sync functions had some unused parameters and
unused return values (error), which are cleaned up here.
Change-Id: I7fb5b360d7f7409fc7cec91c70fce4519a122067
Reviewed-on: https://go-review.googlesource.com/c/oscar/+/619697
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Zvonimir Pavlinovic <zpavlinovic@google.com>
Reviewed-by: Jonathan Amsterdam <jba@google.com>
diff --git a/internal/crawl/crawl.go b/internal/crawl/crawl.go
index 01bf2ce..ba648a4 100644
--- a/internal/crawl/crawl.go
+++ b/internal/crawl/crawl.go
@@ -21,6 +21,8 @@
"time"
"golang.org/x/net/html"
+ "golang.org/x/oscar/internal/docs"
+ "golang.org/x/oscar/internal/htmlutil"
"golang.org/x/oscar/internal/storage"
"golang.org/x/oscar/internal/storage/timed"
"rsc.io/ordered"
@@ -54,6 +56,37 @@
rules []rule
}
+var _ docs.Source[*Page] = (*Crawler)(nil)
+
+const DocWatcherID = "crawldocs"
+
+// DocWatcher returns the page watcher with name "crawldocs".
+// Implements [docs.Source.DocWatcher].
+func (cr *Crawler) DocWatcher() *timed.Watcher[*Page] {
+ return cr.PageWatcher(DocWatcherID)
+}
+
+// ToDocs converts a crawled page to a list of embeddable documents,
+// split into sections using [htmlutil.Split].
+//
+// Implements [docs.Source.ToDocs].
+func (*Crawler) ToDocs(p *Page) (iter.Seq[*docs.Doc], bool) {
+ return func(yield func(*docs.Doc) bool) {
+ // TODO(rsc): We should probably delete the existing docs
+ // starting with p.URL# before embedding them.
+ for s := range htmlutil.Split(p.HTML) {
+ d := &docs.Doc{
+ ID: p.URL + "#" + s.ID,
+ Title: s.Title,
+ Text: s.Text,
+ }
+ if !yield(d) {
+ return
+ }
+ }
+ }, true
+}
+
// A rule is a rule about which URLs can be crawled.
// See [Crawler.Allow] for more details.
type rule struct {
@@ -74,6 +107,13 @@
Error string // error fetching page, if any
}
+var _ docs.Entry = (*Page)(nil)
+
+// LastWritten implements [docs.Entry.LastWritten].
+func (p *Page) LastWritten() timed.DBTime {
+ return p.DBTime
+}
+
// A crawlPage is the JSON form of Page.
// The fields and field order of crawlPage and Page must match exactly; only the struct tags differ.
// We omit the DBTime, URL, and HTML fields from JSON, because they are encoded separately.
diff --git a/internal/crawldocs/sync_test.go b/internal/crawl/syncdocs_test.go
similarity index 89%
rename from internal/crawldocs/sync_test.go
rename to internal/crawl/syncdocs_test.go
index d3ff968..b5b10df 100644
--- a/internal/crawldocs/sync_test.go
+++ b/internal/crawl/syncdocs_test.go
@@ -2,21 +2,18 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-package crawldocs
+package crawl
import (
- "context"
"os"
"testing"
- "golang.org/x/oscar/internal/crawl"
"golang.org/x/oscar/internal/docs"
"golang.org/x/oscar/internal/storage"
"golang.org/x/oscar/internal/testutil"
)
-func TestSync(t *testing.T) {
- ctx := context.Background()
+func TestCrawlDocsSync(t *testing.T) {
check := testutil.Checker(t)
lg := testutil.Slogger(t)
db := storage.MemDB()
@@ -24,17 +21,17 @@
data, err := os.ReadFile("testdata/toolchain.html")
check(err)
dc := docs.New(lg, db)
- cr := crawl.New(lg, db, nil)
- cr.Set(&crawl.Page{
+ cr := New(lg, db, nil)
+ cr.Set(&Page{
URL: "https://go.dev/doc/toolchain",
HTML: data,
})
- cr.Set(&crawl.Page{
+ cr.Set(&Page{
URL: "https://go.dev/doc/empty",
HTML: nil,
})
- check(Sync(ctx, lg, dc, cr))
+ docs.Sync(dc, cr)
var want = []string{
"https://go.dev/doc/toolchain#GOTOOLCHAIN",
@@ -70,14 +67,14 @@
}
dc.Add(download, "OLD TITLE", "OLD TEXT")
- check(Sync(ctx, lg, dc, cr))
+ docs.Sync(dc, cr)
d, _ := dc.Get(download)
if d.Title != "OLD TITLE" || d.Text != "OLD TEXT" {
t.Errorf("Sync rewrote #download: Title=%q Text=%q, want OLD TITLE, OLD TEXT", d.Title, d.Text)
}
- Restart(ctx, lg, cr)
- check(Sync(ctx, lg, dc, cr))
+ docs.Restart(cr)
+ docs.Sync(dc, cr)
d, _ = dc.Get(download)
if d.Title == "OLD TITLE" || d.Text == "OLD TEXT" {
t.Errorf("Restart+Sync did not rewrite #download: Title=%q Text=%q", d.Title, d.Text)
diff --git a/internal/crawldocs/testdata/toolchain.html b/internal/crawl/testdata/toolchain.html
similarity index 100%
rename from internal/crawldocs/testdata/toolchain.html
rename to internal/crawl/testdata/toolchain.html
diff --git a/internal/crawldocs/sync.go b/internal/crawldocs/sync.go
deleted file mode 100644
index 04059b4..0000000
--- a/internal/crawldocs/sync.go
+++ /dev/null
@@ -1,52 +0,0 @@
-// Copyright 2024 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package crawldocs
-
-import (
- "context"
- "log/slog"
-
- "golang.org/x/oscar/internal/crawl"
- "golang.org/x/oscar/internal/docs"
- "golang.org/x/oscar/internal/storage/timed"
-)
-
-// Sync reads new HTML pages from cr, splits them into sections using [Split],
-// and adds each page section's text to the corpus dc.
-//
-// Sync uses [crawl.Crawler.PageWatcher] with the name "crawldocs"
-// to save its position across multiple calls.
-//
-// Sync logs status and unexpected problems to lg.
-//
-// Sync makes no use of its context.
-func Sync(ctx context.Context, lg *slog.Logger, dc *docs.Corpus, cr *crawl.Crawler) error {
- w := cr.PageWatcher("crawldocs")
- for p := range w.Recent() {
- lg.Debug("crawldocs sync", "page", p.URL, "dbtime", p.DBTime)
- // TODO(rsc): We should probably delete the existing docs
- // starting with p.URL#.
- for s := range Split(p.HTML) {
- dc.Add(p.URL+"#"+s.ID, s.Title, s.Text)
- }
- w.MarkOld(p.DBTime)
- }
- return nil
-}
-
-// Restart restarts the "crawldocs" page watcher,
-// so that a future call to [Sync] will reprocess all documents.
-// Calling [Restart] may be necessary after changing [Split],
-// to reprocess those pages.
-//
-// Restart makes no use of its context.
-func Restart(ctx context.Context, lg *slog.Logger, cr *crawl.Crawler) {
- cr.PageWatcher("crawldocs").Restart()
-}
-
-// Latest returns the latest known DBTime marked old by the crawler's Watcher.
-func Latest(cr *crawl.Crawler) timed.DBTime {
- return cr.PageWatcher("crawldocs").Latest()
-}
diff --git a/internal/discussion/events.go b/internal/discussion/events.go
index 7ddfac2..6ad11a0 100644
--- a/internal/discussion/events.go
+++ b/internal/discussion/events.go
@@ -10,6 +10,7 @@
"math"
"time"
+ "golang.org/x/oscar/internal/docs"
"golang.org/x/oscar/internal/storage"
"golang.org/x/oscar/internal/storage/timed"
"rsc.io/ordered"
@@ -39,6 +40,13 @@
Updated time.Time // when the event was last updated (according to GitHub)
}
+var _ docs.Entry = (*Event)(nil)
+
+// LastWritten implements [docs.Entry.LastWritten].
+func (e *Event) LastWritten() timed.DBTime {
+ return e.DBTime
+}
+
// The recognized event kinds.
// The events are fetched from the GraphQL API, which
// uses queries instead of API endpoints, so these "endpoints"
diff --git a/internal/discussion/sync.go b/internal/discussion/sync.go
index 85c3933..1fd90ff 100644
--- a/internal/discussion/sync.go
+++ b/internal/discussion/sync.go
@@ -42,11 +42,14 @@
"fmt"
"iter"
"log/slog"
+ "slices"
"strconv"
"strings"
"sync"
"time"
+ "golang.org/x/oscar/internal/docs"
+ "golang.org/x/oscar/internal/github"
"golang.org/x/oscar/internal/secret"
"golang.org/x/oscar/internal/storage"
"golang.org/x/oscar/internal/storage/timed"
@@ -81,6 +84,32 @@
}
}
+var _ docs.Source[*Event] = (*Client)(nil)
+
+const DocWatcherID = "discussiondocs"
+
+// DocWatcher returns the event watcher with name "discussiondocs".
+// Implements [docs.Source.DocWatcher].
+func (c *Client) DocWatcher() *timed.Watcher[*Event] {
+ return c.EventWatcher(DocWatcherID)
+}
+
+// ToDocs converts an event containing a discussion to
+// an embeddable document (wrapped as an iterator).
+// It returns (nil, false) if the event is not a discussion.
+// Implements [docs.Source.ToDocs].
+func (*Client) ToDocs(e *Event) (iter.Seq[*docs.Doc], bool) {
+ d, ok := e.Typed.(*Discussion)
+ if !ok {
+ return nil, false
+ }
+ return slices.Values([]*docs.Doc{{
+ ID: d.URL,
+ Title: github.CleanTitle(d.Title),
+ Text: github.CleanBody(d.Body),
+ }}), true
+}
+
// Sync syncs all projects.
func (c *Client) Sync(ctx context.Context) error {
var errs []error
diff --git a/internal/discussiondocs/sync_test.go b/internal/discussion/syncdocs_test.go
similarity index 81%
rename from internal/discussiondocs/sync_test.go
rename to internal/discussion/syncdocs_test.go
index 29cb72e..dfaac86 100644
--- a/internal/discussiondocs/sync_test.go
+++ b/internal/discussion/syncdocs_test.go
@@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-package discussiondocs
+package discussion
import (
"context"
@@ -12,33 +12,32 @@
"github.com/google/go-cmp/cmp"
"github.com/google/go-cmp/cmp/cmpopts"
- "golang.org/x/oscar/internal/discussion"
"golang.org/x/oscar/internal/docs"
"golang.org/x/oscar/internal/secret"
"golang.org/x/oscar/internal/storage"
"golang.org/x/oscar/internal/testutil"
)
-func TestSync(t *testing.T) {
+func TestDiscussionDocSync(t *testing.T) {
check := testutil.Checker(t)
lg := testutil.Slogger(t)
sdb := secret.Empty()
db := storage.MemDB()
ctx := context.Background()
- c := discussion.New(ctx, lg, sdb, db)
+ c := New(ctx, lg, sdb, db)
project := "test/project"
check(c.Add(project))
- d1 := &discussion.Discussion{
+ d1 := &Discussion{
Title: "A discussion",
Body: "A body",
}
- d2 := &discussion.Discussion{
+ d2 := &Discussion{
Title: "Another discussion",
Body: "Another body",
}
- c1 := &discussion.Comment{
+ c1 := &Comment{
Body: "comment",
}
@@ -47,7 +46,7 @@
id2 := c.Testing().AddDiscussion(project, d2)
dc := docs.New(lg, db)
- check(Sync(ctx, lg, dc, c))
+ docs.Sync(dc, c)
dURL := func(d int64) string { return fmt.Sprintf("https://github.com/test/project/discussions/%d", d) }
got := slices.Collect(dc.Docs(""))
@@ -61,23 +60,23 @@
u := dURL(id)
dc.Add(u, "OLD TITLE", "OLD TEXT")
- check(Sync(ctx, lg, dc, c))
+ docs.Sync(dc, c)
d, _ := dc.Get(u)
if d.Title != "OLD TITLE" || d.Text != "OLD TEXT" {
t.Errorf("Sync rewrote: Title=%q Text=%q, want OLD TITLE, OLD TEXT", d.Title, d.Text)
}
- latestBefore := Latest(c)
+ latestBefore := docs.Latest(c)
- Restart(lg, c)
- if lr := Latest(c); lr != 0 {
+ docs.Restart(c)
+ if lr := docs.Latest(c); lr != 0 {
t.Errorf("latest is not 0 after restart: %d", lr)
}
- check(Sync(ctx, lg, dc, c))
+ docs.Sync(dc, c)
d, _ = dc.Get(u)
if d.Title == "OLD TITLE" || d.Text == "OLD TEXT" {
t.Errorf("Restart+Sync did not rewrite: Title=%q Text=%q", d.Title, d.Text)
}
- latestAfter := Latest(c)
+ latestAfter := docs.Latest(c)
if latestBefore != latestAfter {
t.Errorf("latest mismatch before=%d, after=%d", latestBefore, latestAfter)
diff --git a/internal/discussiondocs/sync.go b/internal/discussiondocs/sync.go
deleted file mode 100644
index 44d6ed1..0000000
--- a/internal/discussiondocs/sync.go
+++ /dev/null
@@ -1,74 +0,0 @@
-// Copyright 2024 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// Package discussiondocs implements converting GitHub discussions into text docs
-// for [golang.org/x/oscar/internal/docs].
-package discussiondocs
-
-import (
- "context"
- "log/slog"
-
- "golang.org/x/oscar/internal/discussion"
- "golang.org/x/oscar/internal/docs"
- "golang.org/x/oscar/internal/storage/timed"
-)
-
-// Sync writes to dc docs corresponding to each discussion in gh that is
-// new since the last call to Sync.
-//
-// If a discussion is edited on GitHub, it will appear new in gh and
-// the new text will be written to dc, replacing the old issue text.
-// Only the discussion body is saved as a document.
-//
-// The document ID for each discussion is its GitHub URL: "https://github.com/<org>/<repo>/discussions/<n>".
-func Sync(ctx context.Context, lg *slog.Logger, dc *docs.Corpus, gh *discussion.Client) error {
- w := gh.EventWatcher(watcherID)
- for e := range w.Recent() {
- if e.API != discussion.DiscussionAPI {
- continue
- }
- lg.Debug("discussiondocs sync", "discussion", e.Discussion, "dbtime", e.DBTime)
- d := e.Typed.(*discussion.Discussion)
- title := cleanTitle(d.Title)
- text := cleanBody(d.Body)
- dc.Add(d.URL, title, text)
- w.MarkOld(e.DBTime)
- }
- return nil
-}
-
-const watcherID = "discussiondocs"
-
-// Restart causes the next call to [Sync] to behave as if
-// it has never sync'ed any issues before.
-// The result is that all issues will be reconverted to doc form
-// and re-added.
-// Docs that have not changed since the last addition to the corpus
-// will appear unmodified; others will be marked new in the corpus.
-func Restart(lg *slog.Logger, gh *discussion.Client) {
- gh.EventWatcher(watcherID).Restart()
-}
-
-// Latest returns the latest known DBTime marked old by the client's Watcher.
-func Latest(gh *discussion.Client) timed.DBTime {
- return gh.EventWatcher(watcherID).Latest()
-}
-
-// cleanTitle should clean the title for indexing.
-// For now we assume the LLM is good enough at Markdown not to bother.
-func cleanTitle(title string) string {
- // TODO
- return title
-}
-
-// cleanBody should clean the body for indexing.
-// For now we assume the LLM is good enough at Markdown not to bother.
-// In the future we may want to make various changes like inlining
-// the programs associated with playground URLs,
-// and we may also want to remove any HTML tags from the Markdown.
-func cleanBody(body string) string {
- // TODO
- return body
-}
diff --git a/internal/docs/sync.go b/internal/docs/sync.go
new file mode 100644
index 0000000..6106b3d
--- /dev/null
+++ b/internal/docs/sync.go
@@ -0,0 +1,73 @@
+// Copyright 2024 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package docs
+
+import (
+ "iter"
+
+ "golang.org/x/oscar/internal/storage/timed"
+)
+
+// Source is a data source to pull into a [Corpus].
+type Source[T Entry] interface {
+ // DocWatcher returns the watcher to use to keep track
+ // of last [Sync] for this data source.
+ DocWatcher() *timed.Watcher[T]
+ // ToDocs converts the data to an iterator of [*Doc] values
+ // that can be stored in a [Corpus].
+ // It returns (nil, false) if the data should not be stored
+ // in the [Corpus].
+ ToDocs(T) (iter.Seq[*Doc], bool)
+}
+
+// Entry is a timed entry in a [Source].
+type Entry interface {
+ // LastWritten returns the DBTime this piece of data was last written
+ // to its data source.
+ LastWritten() timed.DBTime
+}
+
+// Sync reads new embeddable values from src and adds the
+// documents to the corpus dc.
+//
+// Sync uses [Source.DocWatcher] to save its position across multiple calls.
+//
+// Sync logs status and unexpected problems to dc's logger.
+func Sync[T Entry, S Source[T]](dc *Corpus, src S) {
+ w := src.DocWatcher()
+ for e := range w.Recent() {
+ ds, ok := src.ToDocs(e)
+ if !ok {
+ // Not embeddable, skip.
+ continue
+ }
+ dc.slog.Debug("docs.Sync", "event", e, "dbtime", e.LastWritten())
+ for d := range ds {
+ dc.Add(d.ID, d.Title, d.Text)
+ }
+ w.MarkOld(e.LastWritten())
+ }
+}
+
+// Restart causes the next call to [Sync] to behave as if
+// it has never sync'ed any data before for the src.
+// The result is that all data will be reconverted to doc form
+// and re-added.
+// Docs that have not changed since the last addition to the corpus
+// will appear unmodified; others will be marked new in the corpus.
+func Restart[T Entry](src Source[T]) {
+ src.DocWatcher().Restart()
+}
+
+// Latest returns the latest known DBTime marked old by the source's DocWatcher.
+func Latest[T Entry](src Source[T]) timed.DBTime {
+ return src.DocWatcher().Latest()
+}
+
+// LatestFunc returns a function that returns the latest known DBTime marked
+// old by the source's DocWatcher.
+func LatestFunc[T Entry](src Source[T]) func() timed.DBTime {
+ return func() timed.DBTime { return Latest[T](src) }
+}
diff --git a/internal/gaby/github_event.go b/internal/gaby/github_event.go
index 51344a6..56163cc 100644
--- a/internal/gaby/github_event.go
+++ b/internal/gaby/github_event.go
@@ -11,8 +11,8 @@
"net/http"
"golang.org/x/oscar/internal/actions"
+ "golang.org/x/oscar/internal/docs"
"golang.org/x/oscar/internal/github"
- "golang.org/x/oscar/internal/githubdocs"
)
// handleGitHubEvent handles incoming webhook requests from GitHub
@@ -163,5 +163,6 @@
if err := g.github.SyncProject(ctx, project); err != nil {
return err
}
- return githubdocs.Sync(ctx, g.slog, g.docs, g.github)
+ docs.Sync(g.docs, g.github)
+ return nil
}
diff --git a/internal/gaby/main.go b/internal/gaby/main.go
index a641832..5422bf3 100644
--- a/internal/gaby/main.go
+++ b/internal/gaby/main.go
@@ -24,9 +24,7 @@
"golang.org/x/oscar/internal/actions"
"golang.org/x/oscar/internal/commentfix"
"golang.org/x/oscar/internal/crawl"
- "golang.org/x/oscar/internal/crawldocs"
"golang.org/x/oscar/internal/discussion"
- "golang.org/x/oscar/internal/discussiondocs"
"golang.org/x/oscar/internal/docs"
"golang.org/x/oscar/internal/embeddocs"
"golang.org/x/oscar/internal/gcp/firestore"
@@ -35,9 +33,7 @@
"golang.org/x/oscar/internal/gcp/gcpsecret"
"golang.org/x/oscar/internal/gcp/gemini"
"golang.org/x/oscar/internal/gerrit"
- "golang.org/x/oscar/internal/gerritdocs"
"golang.org/x/oscar/internal/github"
- "golang.org/x/oscar/internal/githubdocs"
"golang.org/x/oscar/internal/llm"
"golang.org/x/oscar/internal/pebble"
"golang.org/x/oscar/internal/related"
@@ -117,9 +113,6 @@
shutdown := g.initGCP()
defer shutdown()
- // Named functions to retrieve latest Watcher times.
- watcherLatests := map[string]func() timed.DBTime{}
-
g.github = github.New(g.slog, g.db, g.secret, g.http)
g.disc = discussion.New(g.ctx, g.slog, g.secret, g.db)
_ = g.disc.Add(g.githubProject) // only needed once per g.db lifetime
@@ -130,16 +123,12 @@
}
g.docs = docs.New(g.slog, g.db)
- watcherLatests["githubdocs"] = func() timed.DBTime { return githubdocs.Latest(g.github) }
- watcherLatests["gerritrelateddocs"] = func() timed.DBTime { return gerritdocs.RelatedLatest(g.gerrit) }
- watcherLatests["discussiondocs"] = func() timed.DBTime { return discussiondocs.Latest(g.disc) }
ai, err := gemini.NewClient(g.ctx, g.slog, g.secret, g.http, "text-embedding-004")
if err != nil {
log.Fatal(err)
}
g.embed = ai
- watcherLatests["embeddocs"] = func() timed.DBTime { return embeddocs.Latest(g.docs) }
cr := crawl.New(g.slog, g.db, g.http)
cr.Add("https://go.dev/")
@@ -147,7 +136,6 @@
cr.Deny(godevDeny...)
cr.Clean(godevClean)
g.crawler = cr
- watcherLatests["crawldocs"] = func() timed.DBTime { return crawldocs.Latest(cr) }
if flags.search {
g.searchLoop()
@@ -160,7 +148,6 @@
cf.ReplaceURL(`\Qhttps://go-review.git.corp.google.com/\E`, "https://go-review.googlesource.com/")
cf.EnableEdits()
g.commentFixer = cf
- watcherLatests["gerritlinks fix"] = cf.Latest
rp := related.New(g.slog, g.db, g.github, g.vector, g.docs, "related")
rp.EnableProject(g.githubProject)
@@ -169,7 +156,19 @@
rp.SkipTitleSuffix(" backport]")
rp.EnablePosts()
g.relatedPoster = rp
- watcherLatests["related"] = rp.Latest
+
+ // Named functions to retrieve latest Watcher times.
+ watcherLatests := map[string]func() timed.DBTime{
+ github.DocWatcherID: docs.LatestFunc(g.github),
+ gerrit.DocWatcherID: docs.LatestFunc(g.gerrit),
+ discussion.DocWatcherID: docs.LatestFunc(g.disc),
+ crawl.DocWatcherID: docs.LatestFunc(cr),
+
+ "embeddocs": func() timed.DBTime { return embeddocs.Latest(g.docs) },
+
+ "gerritlinks fix": cf.Latest,
+ "related": rp.Latest,
+ }
// Install a metric that observes the latest values of the watchers each time metrics are sampled.
g.registerWatcherMetric(watcherLatests)
@@ -495,7 +494,8 @@
if err := g.crawler.Run(ctx); err != nil {
return err
}
- return crawldocs.Sync(ctx, g.slog, g.docs, g.crawler)
+ docs.Sync(g.docs, g.crawler)
+ return nil
}
// syncAndRunAll runs all fast syncs (if enablesync is true) and Gaby actions
@@ -554,7 +554,8 @@
}
// Store newly downloaded GitHub issue events in the document
// database.
- return githubdocs.Sync(ctx, g.slog, g.docs, g.github)
+ docs.Sync(g.docs, g.github)
+ return nil
}
func (g *Gaby) syncGitHubDiscussions(ctx context.Context) error {
@@ -568,7 +569,8 @@
}
// Store newly downloaded GitHub discussions in the document database.
- return discussiondocs.Sync(ctx, g.slog, g.docs, g.disc)
+ docs.Sync(g.docs, g.disc)
+ return nil
}
func (g *Gaby) syncGerrit(ctx context.Context) error {
@@ -580,7 +582,8 @@
return err
}
// Store newly downloaded gerrit events in the document database.
- return gerritdocs.Sync(ctx, g.slog, g.docs, g.gerrit, g.gerritProjects)
+ docs.Sync(g.docs, g.gerrit)
+ return nil
}
// embedAll store embeddings for all new documents in the vector database.
diff --git a/internal/gerrit/data.go b/internal/gerrit/data.go
index c6055be..5cab58f 100644
--- a/internal/gerrit/data.go
+++ b/internal/gerrit/data.go
@@ -140,20 +140,20 @@
// ChangeWatcher returns a new [timed.Watcher] with the given name.
// It picks up where any previous Watcher of the same name left off.
-func (c *Client) ChangeWatcher(name string) *timed.Watcher[ChangeEvent] {
+func (c *Client) ChangeWatcher(name string) *timed.Watcher[*ChangeEvent] {
return timed.NewWatcher(c.slog, c.db, name, changeUpdateKind, c.decodeChangeEvent)
}
// decodeChangeUpdateEntry decodes a changeUpdateKind [timed.Entry] into
// a change number.
-func (c *Client) decodeChangeEvent(t *timed.Entry) ChangeEvent {
+func (c *Client) decodeChangeEvent(t *timed.Entry) *ChangeEvent {
ce := ChangeEvent{
DBTime: t.ModTime,
}
if err := ordered.Decode(t.Key, &ce.Instance, &ce.ChangeNum, nil); err != nil {
c.db.Panic("gerrit change event decode", "key", storage.Fmt(t.Key), "err", err)
}
- return ce
+ return &ce
}
// timeStampLayout is the timestamp format used by Gerrit.
diff --git a/internal/gerrit/embed.go b/internal/gerrit/embed.go
new file mode 100644
index 0000000..c7ce2ef
--- /dev/null
+++ b/internal/gerrit/embed.go
@@ -0,0 +1,161 @@
+// Copyright 2024 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gerrit
+
+import (
+ "fmt"
+ "iter"
+ "slices"
+ "strings"
+
+ "golang.org/x/oscar/internal/docs"
+ "golang.org/x/oscar/internal/storage/timed"
+)
+
+// LastWritten implements [docs.Entry.LastWritten].
+func (ce *ChangeEvent) LastWritten() timed.DBTime {
+ return ce.DBTime
+}
+
+// ToDocs converts a ChangeEvent to an embeddable document (wrapped
+// as an iterator).
+//
+// This document consists of a change commit message and its comments.
+// The ID for such documents is of the form
+//
+// https://<gerrit-instance>/c/<repo>/+/<n>#related-content.
+//
+// The "#related-content" fragment is used to allow other types of
+// gerrit documents to reuse the main portion of the change URL.
+// The URL points to the top of the CL page since the fragment
+// does not exist.
+//
+// ToDocs returns (nil, false) if any of the necessary data cannot be found
+// in the client's db.
+//
+// Implements [docs.Source.ToDocs].
+func (c *Client) ToDocs(ce *ChangeEvent) (iter.Seq[*docs.Doc], bool) {
+ ch := c.change(ce)
+ if ch == nil {
+ c.slog.Error("gerrit.ChangeEvent.ToDocs cannot find change", "change", ce.ChangeNum)
+ return nil, false
+ }
+ title := c.ChangeSubject(ch.ch)
+ body, err := c.relatedDocBody(ch)
+ if err != nil {
+ c.slog.Error("gerrit.ChangeEvent.ToDocs cannot find comments", "change", ce.ChangeNum)
+ return nil, false
+ }
+ if len(body) > geminiCharLimit {
+ c.slog.Warn("gerrit.ChangeEvent.ToDocs potential truncation by gemini", "change", ce.ChangeNum, "docSize", len(body))
+ }
+ text := cleanBody(body)
+ id := relatedDocURL(ch)
+ return slices.Values([]*docs.Doc{{
+ ID: id,
+ Title: title,
+ Text: text,
+ }}), true
+}
+
+// geminiCharLimit is an approximate limit on the number of
+// document characters a gemini text embedding can accept.
+// Gemini text embedding models have an input token limit
+// of 2048, where each token is about four characters long.
+// Gemini truncates documents after this limit.
+// For more info, see
+// https://ai.google.dev/gemini-api/docs/models/gemini#text-embedding-and-embedding
+const geminiCharLimit = 8200
+
+// changeInfo accumulates information from [Change]
+// and [ChangeEvent] needed to grab change subject,
+// messages, and comments.
+type changeInfo struct {
+ instance string
+ project string
+ number int
+ ch *Change
+}
+
+// change returns gerrit change information corresponding to ce.
+// The change's project must be one of the client's projects.
+func (c *Client) change(ce *ChangeEvent) *changeInfo {
+ ci := &changeInfo{
+ instance: ce.Instance,
+ number: ce.ChangeNum,
+ }
+ for p := range c.projects() {
+ if ch := c.Change(p, ce.ChangeNum); ch != nil {
+ ci.project = p // at most one project can match ce.ChangeNum
+ ci.ch = ch
+ return ci
+ }
+ }
+ return nil
+}
+
+// relatedDocBody returns the document body for the gerrit change ci,
+// intended for surfacing related content. The body consists of
+// the most recent commit message followed by change messages and
+// comments appearing in their chronological order. There is a new
+// line added between each message and comment.
+func (c *Client) relatedDocBody(ci *changeInfo) (string, error) {
+ comments, err := c.comments(ci)
+ if err != nil {
+ return "", err
+ }
+ messages := c.ChangeMessages(ci.ch)
+
+ // Sort comments and messages based on their creation/update time.
+ type datedMessage struct {
+ date TimeStamp
+ message string
+ }
+ var dmsgs []datedMessage
+ for _, cmt := range comments {
+ dmsgs = append(dmsgs, datedMessage{date: cmt.Updated, message: cmt.Message})
+ }
+ for _, msg := range messages {
+ dmsgs = append(dmsgs, datedMessage{date: msg.Date, message: msg.Message})
+ }
+ slices.SortStableFunc(dmsgs, func(mi, mj datedMessage) int {
+ ti := mi.date.Time()
+ tj := mj.date.Time()
+ return ti.Compare(tj)
+ })
+
+ trim := strings.TrimSpace
+ components := []string{trim(c.ChangeDescription(ci.ch))}
+ for _, m := range dmsgs {
+ components = append(components, trim(m.message))
+ }
+ return strings.Join(components, "\n\n"), nil
+}
+
+// relatedDocURL returns a unique URL for the document corresponding
+// to the gerrit change info ci, intended for indexing documents used
+// to surface related content.
+func relatedDocURL(ci *changeInfo) string {
+ return fmt.Sprintf("https://%s/c/%s/+/%d#related-content", ci.instance, ci.project, ci.number)
+}
+
+// comments returns file comments for the gerrit change.
+func (c *Client) comments(ci *changeInfo) ([]*CommentInfo, error) {
+ var cmts []*CommentInfo
+ cmtsMap := c.Comments(ci.project, ci.number)
+ for _, cs := range cmtsMap { // we don't care about comment file locations
+ cmts = append(cmts, cs...)
+ }
+ return cmts, nil
+}
+
+// cleanBody should clean the body for indexing.
+// For now we assume the LLM is good enough.
+// In the future we may want to make various changes like inlining
+// other mentioned changes, playground URLs, and GH issues.
+// TODO(#35): remove irrelevant comments to fit the Gemini token limit.
+func cleanBody(body string) string {
+ return body
+}
diff --git a/internal/gerrit/sync.go b/internal/gerrit/sync.go
index d7831ca..9debacd 100644
--- a/internal/gerrit/sync.go
+++ b/internal/gerrit/sync.go
@@ -23,6 +23,7 @@
"testing"
"time"
+ "golang.org/x/oscar/internal/docs"
"golang.org/x/oscar/internal/secret"
"golang.org/x/oscar/internal/storage"
"golang.org/x/oscar/internal/storage/timed"
@@ -122,6 +123,16 @@
}
}
+var _ docs.Source[*ChangeEvent] = (*Client)(nil)
+
+const DocWatcherID = "gerritrelateddocs"
+
+// DocWatcher returns the change event watcher with name "gerritrelateddocs".
+// Implements [docs.Source.DocWatcher].
+func (c *Client) DocWatcher() *timed.Watcher[*ChangeEvent] {
+ return c.ChangeWatcher(DocWatcherID)
+}
+
// RequestFlush asks a Gerrit sync to flush the database to disk
// when convenient. This may be called concurrently with Sync.
func (c *Client) RequestFlush() {
@@ -169,11 +180,7 @@
// Sync syncs the data for all projects in this client's instance.
func (c *Client) Sync(ctx context.Context) error {
var errs []error
- for key := range c.db.Scan(o(syncProjectKind, c.instance), o(syncProjectKind, c.instance, ordered.Inf)) {
- var project string
- if err := ordered.Decode(key, nil, nil, &project); err != nil {
- c.db.Panic("gerrit client sync decode", "key", storage.Fmt(key), "err", err)
- }
+ for project := range c.projects() {
if err := c.SyncProject(ctx, project); err != nil {
errs = append(errs, err)
}
@@ -181,6 +188,22 @@
return errors.Join(errs...)
}
+// projects returns an iterator over all Gerrit projects in the client's
+// database.
+func (c *Client) projects() iter.Seq[string] {
+ return func(yield func(string) bool) {
+ for key := range c.db.Scan(o(syncProjectKind, c.instance), o(syncProjectKind, c.instance, ordered.Inf)) {
+ var project string
+ if err := ordered.Decode(key, nil, nil, &project); err != nil {
+ c.db.Panic("gerrit client projects decode", "key", storage.Fmt(key), "err", err)
+ }
+ if !yield(project) {
+ return
+ }
+ }
+ }
+}
+
// SyncProject syncs a single project.
func (c *Client) SyncProject(ctx context.Context, project string) (err error) {
c.slog.Debug("gerrit.SyncProject", "project", project)
diff --git a/internal/gerritdocs/sync_test.go b/internal/gerrit/syncdocs_test.go
similarity index 85%
rename from internal/gerritdocs/sync_test.go
rename to internal/gerrit/syncdocs_test.go
index 7121064..3186474 100644
--- a/internal/gerritdocs/sync_test.go
+++ b/internal/gerrit/syncdocs_test.go
@@ -2,25 +2,24 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-package gerritdocs
+package gerrit
import (
"context"
"testing"
"golang.org/x/oscar/internal/docs"
- "golang.org/x/oscar/internal/gerrit"
"golang.org/x/oscar/internal/storage"
"golang.org/x/oscar/internal/testutil"
)
-func TestSync(t *testing.T) {
+func TestSyncGerritDocs(t *testing.T) {
check := testutil.Checker(t)
lg := testutil.Slogger(t)
db := storage.MemDB()
ctx := context.Background()
- gr := gerrit.New("go-review.googlesource.com", lg, db, nil, nil)
+ gr := New("go-review.googlesource.com", lg, db, nil, nil)
check(gr.Testing().LoadTxtar("testdata/changes.txt"))
check(gr.Add("test"))
@@ -29,7 +28,7 @@
check(gr.Sync(ctx))
dc := docs.New(lg, db)
- check(Sync(ctx, lg, dc, gr, []string{"test"}))
+ docs.Sync(dc, gr)
var want = []string{
"https://go-review.googlesource.com/c/test/+/1#related-content",
@@ -58,14 +57,14 @@
}
dc.Add("https://go-review.googlesource.com/c/test/+/1#related-content", "OLD TITLE", "OLD TEXT")
- check(Sync(ctx, lg, dc, gr, []string{"test"}))
+ docs.Sync(dc, gr)
d, _ := dc.Get(ch1)
if d.Title != "OLD TITLE" || d.Text != "OLD TEXT" {
t.Errorf("Sync rewrote #1: Title=%q Text=%q, want OLD TITLE, OLD TEXT", d.Title, d.Text)
}
- Restart(lg, gr)
- check(Sync(ctx, lg, dc, gr, []string{"test"}))
+ docs.Restart(gr)
+ docs.Sync(dc, gr)
d, _ = dc.Get(ch1)
if d.Title == "OLD TITLE" || d.Text == "OLD TEXT" {
t.Errorf("Restart+Sync did not rewrite #1: Title=%q Text=%q", d.Title, d.Text)
diff --git a/internal/gerritdocs/testdata/changes.txt b/internal/gerrit/testdata/changes.txt
similarity index 100%
rename from internal/gerritdocs/testdata/changes.txt
rename to internal/gerrit/testdata/changes.txt
diff --git a/internal/gerritdocs/sync.go b/internal/gerritdocs/sync.go
deleted file mode 100644
index 2fec090..0000000
--- a/internal/gerritdocs/sync.go
+++ /dev/null
@@ -1,179 +0,0 @@
-// Copyright 2024 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// Package gerritdocs implements converting gerrit changes into text docs
-// for [golang.org/x/oscar/internal/docs].
-package gerritdocs
-
-import (
- "context"
- "fmt"
- "log/slog"
- "slices"
- "strings"
-
- "golang.org/x/oscar/internal/docs"
- "golang.org/x/oscar/internal/gerrit"
- "golang.org/x/oscar/internal/storage/timed"
-)
-
-// Sync writes to dc docs corresponding to each gerrit change that has
-// been created or updated since the last call to Sync.
-//
-// A modification to a gerrit change will generate a new change info
-// in gerrit. The state of the change will be written to dc, replacing
-// the old change contents.
-//
-// Sync currently creates only one type of documents intended to be
-// surfaced as a related content on new issues or other changes.
-// This document consist of a change commit message and its comments.
-// The ID for such documents is of the form
-//
-// https://<gerrit-instance>/c/<repo>/+/<n>#related-content.
-//
-// The "#related-content" fragment is used to allow other types of
-// gerrit documents to reuse the main portion of the change URL.
-// The URL points to the top of the CL page since the fragment
-// does not exist.
-func Sync(ctx context.Context, lg *slog.Logger, dc *docs.Corpus, gr *gerrit.Client, projects []string) error {
- w := gr.ChangeWatcher("gerritrelateddocs")
- for ce := range w.Recent() {
- lg.Debug("gerritrelateddocs sync", "change", ce.ChangeNum, "dbtime", ce.DBTime)
- c := change(ce, gr, projects)
- if c == nil {
- lg.Error("gerritrelateddocs cannot find change", "change", ce.ChangeNum)
- continue
- }
- title := gr.ChangeSubject(c.ch)
- body, err := relatedDocBody(gr, c)
- if err != nil {
- lg.Error("gerritrelateddocs cannot find comments", "change", ce.ChangeNum)
- continue
- }
- if len(body) > geminiCharLimit {
- lg.Warn("gerritrelateddocs potential truncation by gemini", "change", ce.ChangeNum, "docSize", len(body))
- }
- text := cleanBody(body)
- id := relatedDocURL(gr, c)
- dc.Add(id, title, text)
- w.MarkOld(ce.DBTime)
- }
- return nil
-}
-
-// geminiCharLimit is an approximate limit on the number of
-// document characters a gemini text embedding can accept.
-// Gemini text embedding models have an input token limit
-// of 2048, where each token is about four characters long.
-// Gemini truncates documents after this limit.
-// For more info, see
-// https://ai.google.dev/gemini-api/docs/models/gemini#text-embedding-and-embedding
-const geminiCharLimit = 8200
-
-// changeInfo accumulates information from [gerrit.Change]
-// and [gerrit.ChangeEvent] needed to grab change subject,
-// messages, and comments.
-type changeInfo struct {
- instance string
- project string
- number int
- ch *gerrit.Change
-}
-
-// change returns a gerrit change information corresponding to ce.
-// The project of the change must be one of projects.
-func change(ce gerrit.ChangeEvent, gr *gerrit.Client, projects []string) *changeInfo {
- c := &changeInfo{
- instance: ce.Instance,
- number: ce.ChangeNum,
- }
- for _, p := range projects {
- if ch := gr.Change(p, ce.ChangeNum); ch != nil {
- c.project = p // at most one project can match ce.ChangeNum
- c.ch = ch
- return c
- }
- }
- return nil
-}
-
-// relatedDocBody returns the document body for the gerrit change c,
-// intended for surfacing related content. The body consists of
-// the most recent commit message followed by change messages and
-// comments appearing in their chronological order. There is a new
-// line added between each message and comment.
-func relatedDocBody(gr *gerrit.Client, c *changeInfo) (string, error) {
- comments, err := comments(gr, c)
- if err != nil {
- return "", nil
- }
- messages := gr.ChangeMessages(c.ch)
-
- // Sort comments and messages based on their creation/update time.
- type datedMessage struct {
- date gerrit.TimeStamp
- message string
- }
- var dmsgs []datedMessage
- for _, cmt := range comments {
- dmsgs = append(dmsgs, datedMessage{date: cmt.Updated, message: cmt.Message})
- }
- for _, msg := range messages {
- dmsgs = append(dmsgs, datedMessage{date: msg.Date, message: msg.Message})
- }
- slices.SortStableFunc(dmsgs, func(mi, mj datedMessage) int {
- ti := mi.date.Time()
- tj := mj.date.Time()
- return ti.Compare(tj)
- })
-
- trim := strings.TrimSpace
- components := []string{trim(gr.ChangeDescription(c.ch))}
- for _, m := range dmsgs {
- components = append(components, trim(m.message))
- }
- return strings.Join(components, "\n\n"), nil
-}
-
-// relatedDocURL returns a unique URL for the document corresponding
-// to the gerrit change info c, intended for indexing documents used
-// to surface related content.
-func relatedDocURL(gr *gerrit.Client, c *changeInfo) string {
- return fmt.Sprintf("https://%s/c/%s/+/%d#related-content", c.instance, c.project, c.number)
-}
-
-// comments returns file comments for the gerrit change.
-func comments(gr *gerrit.Client, c *changeInfo) ([]*gerrit.CommentInfo, error) {
- var cmts []*gerrit.CommentInfo
- cmtsMap := gr.Comments(c.project, c.number)
- for _, cs := range cmtsMap { // we don't care about comment file locations
- cmts = append(cmts, cs...)
- }
- return cmts, nil
-}
-
-// Restart causes the next call to Sync to behave as if
-// it has never sync'ed any issues before.
-// The result is that all issues will be reconverted to doc form
-// and re-added.
-// Docs that have not changed since the last addition to the corpus
-// will appear unmodified; others will be marked new in the corpus.
-func Restart(lg *slog.Logger, gr *gerrit.Client) {
- gr.ChangeWatcher("gerritrelateddocs").Restart()
-}
-
-// RelatedLatest returns the latest known DBTime marked old by
-// the client's "gerritrelateddocs" Watcher.
-func RelatedLatest(gr *gerrit.Client) timed.DBTime {
- return gr.ChangeWatcher("gerritrelateddocs").Latest()
-}
-
-// cleanBody should clean the body for indexing.
-// For now we assume the LLM is good enough.
-// In the future we may want to make various changes like inlining
-// other mentioned changes, playground URLs, and GH issues.
-// TODO(#35): remove irrelevant comments to fit the Gemini token limit.
-func cleanBody(body string) string {
- return body
-}
diff --git a/internal/github/data.go b/internal/github/data.go
index acf7140..d904e87 100644
--- a/internal/github/data.go
+++ b/internal/github/data.go
@@ -12,6 +12,7 @@
"strconv"
"strings"
+ "golang.org/x/oscar/internal/docs"
"golang.org/x/oscar/internal/storage"
"golang.org/x/oscar/internal/storage/timed"
"rsc.io/ordered"
@@ -61,6 +62,30 @@
Typed any // Typed unmarshaling of the event data, of type *Issue, *IssueComment, or *IssueEvent
}
+var _ docs.Entry = (*Event)(nil)
+
+// LastWritten implements [docs.Entry.LastWritten].
+func (e *Event) LastWritten() timed.DBTime {
+ return e.DBTime
+}
+
+// CleanTitle should clean the title for indexing.
+// For now we assume the LLM is good enough at Markdown not to bother.
+func CleanTitle(title string) string {
+ // TODO
+ return title
+}
+
+// CleanBody should clean the body for indexing.
+// For now we assume the LLM is good enough at Markdown not to bother.
+// In the future we may want to make various changes like inlining
+// the programs associated with playground URLs,
+// and we may also want to remove any HTML tags from the Markdown.
+func CleanBody(body string) string {
+ // TODO
+ return body
+}
+
// Events returns an iterator over issue events for the given project,
// limited to issues in the range issueMin ≤ issue ≤ issueMax.
// If issueMax < 0, there is no upper limit.
diff --git a/internal/github/sync.go b/internal/github/sync.go
index 9d45c78..1a4fefb 100644
--- a/internal/github/sync.go
+++ b/internal/github/sync.go
@@ -18,12 +18,14 @@
"log/slog"
"net/http"
"net/url"
+ "slices"
"strconv"
"strings"
"sync"
"testing"
"time"
+ "golang.org/x/oscar/internal/docs"
"golang.org/x/oscar/internal/secret"
"golang.org/x/oscar/internal/storage"
"golang.org/x/oscar/internal/storage/timed"
@@ -102,6 +104,34 @@
}
}
+var _ docs.Source[*Event] = (*Client)(nil)
+
+const DocWatcherID = "githubdocs"
+
+// DocWatcher returns the event watcher with name "githubdocs".
+// Implements [docs.Source.DocWatcher].
+func (c *Client) DocWatcher() *timed.Watcher[*Event] {
+ return c.EventWatcher(DocWatcherID)
+}
+
+// ToDocs converts an event containing an issue to an
+// embeddable document.
+// It returns (nil, false) if the event is not an issue.
+// Implements [docs.Source.ToDocs].
+func (*Client) ToDocs(e *Event) (iter.Seq[*docs.Doc], bool) {
+ issue, ok := e.Typed.(*Issue)
+ if !ok {
+ return nil, false
+ }
+ return slices.Values([]*docs.Doc{
+ {
+ ID: fmt.Sprintf("https://github.com/%s/issues/%d", e.Project, e.Issue),
+ Title: CleanTitle(issue.Title),
+ Text: CleanBody(issue.Body),
+ },
+ }), true
+}
+
// A projectSync is per-GitHub project ("owner/repo") sync state stored in the database.
type projectSync struct {
Name string // owner/repo
diff --git a/internal/githubdocs/sync_test.go b/internal/github/syncdocs_test.go
similarity index 89%
rename from internal/githubdocs/sync_test.go
rename to internal/github/syncdocs_test.go
index 08888e2..d1c31fe 100644
--- a/internal/githubdocs/sync_test.go
+++ b/internal/github/syncdocs_test.go
@@ -2,29 +2,25 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-package githubdocs
+package github
import (
- "context"
"testing"
"golang.org/x/oscar/internal/docs"
- "golang.org/x/oscar/internal/github"
"golang.org/x/oscar/internal/storage"
"golang.org/x/oscar/internal/testutil"
)
-var ctx = context.Background()
-
-func TestMarkdown(t *testing.T) {
+func TestIssueSync(t *testing.T) {
check := testutil.Checker(t)
lg := testutil.Slogger(t)
db := storage.MemDB()
- gh := github.New(lg, db, nil, nil)
+ gh := New(lg, db, nil, nil)
check(gh.Testing().LoadTxtar("../testdata/markdown.txt"))
dc := docs.New(lg, db)
- check(Sync(ctx, lg, dc, gh))
+ docs.Sync(dc, gh)
var want = []string{
"https://github.com/rsc/markdown/issues/1",
@@ -69,14 +65,14 @@
}
dc.Add("https://github.com/rsc/markdown/issues/1", "OLD TITLE", "OLD TEXT")
- check(Sync(ctx, lg, dc, gh))
+ docs.Sync(dc, gh)
d, _ := dc.Get(md1)
if d.Title != "OLD TITLE" || d.Text != "OLD TEXT" {
t.Errorf("Sync rewrote #1: Title=%q Text=%q, want OLD TITLE, OLD TEXT", d.Title, d.Text)
}
- Restart(lg, gh)
- check(Sync(ctx, lg, dc, gh))
+ docs.Restart(gh)
+ docs.Sync(dc, gh)
d, _ = dc.Get(md1)
if d.Title == "OLD TITLE" || d.Text == "OLD TEXT" {
t.Errorf("Restart+Sync did not rewrite #1: Title=%q Text=%q", d.Title, d.Text)
diff --git a/internal/githubdocs/sync.go b/internal/githubdocs/sync.go
deleted file mode 100644
index 03be2b9..0000000
--- a/internal/githubdocs/sync.go
+++ /dev/null
@@ -1,73 +0,0 @@
-// Copyright 2024 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// Package githubdocs implements converting GitHub issues into text docs
-// for [golang.org/x/oscar/internal/docs].
-package githubdocs
-
-import (
- "context"
- "fmt"
- "log/slog"
-
- "golang.org/x/oscar/internal/docs"
- "golang.org/x/oscar/internal/github"
- "golang.org/x/oscar/internal/storage/timed"
-)
-
-// Sync writes to dc docs corresponding to each issue in gh that is
-// new since the last call to Sync.
-//
-// If an issue is edited on GitHub, it will appear new in gh and
-// the new text will be written to dc, replacing the old issue text.
-// Only the issue body (what looks like the top comment in the UI)
-// is saved as a document.
-// The document ID for each issue is its GitHub URL: "https://github.com/<org>/<repo>/issues/<n>".
-func Sync(ctx context.Context, lg *slog.Logger, dc *docs.Corpus, gh *github.Client) error {
- w := gh.EventWatcher("githubdocs")
- for e := range w.Recent() {
- if e.API != "/issues" {
- continue
- }
- lg.Debug("githubdocs sync", "issue", e.Issue, "dbtime", e.DBTime)
- issue := e.Typed.(*github.Issue)
- title := cleanTitle(issue.Title)
- text := cleanBody(issue.Body)
- dc.Add(fmt.Sprintf("https://github.com/%s/issues/%d", e.Project, e.Issue), title, text)
- w.MarkOld(e.DBTime)
- }
- return nil
-}
-
-// Restart causes the next call to Sync to behave as if
-// it has never sync'ed any issues before.
-// The result is that all issues will be reconverted to doc form
-// and re-added.
-// Docs that have not changed since the last addition to the corpus
-// will appear unmodified; others will be marked new in the corpus.
-func Restart(lg *slog.Logger, gh *github.Client) {
- gh.EventWatcher("githubdocs").Restart()
-}
-
-// Latest returns the latest known DBTime marked old by the client's Watcher.
-func Latest(gh *github.Client) timed.DBTime {
- return gh.EventWatcher("githubdocs").Latest()
-}
-
-// cleanTitle should clean the title for indexing.
-// For now we assume the LLM is good enough at Markdown not to bother.
-func cleanTitle(title string) string {
- // TODO
- return title
-}
-
-// cleanBody should clean the body for indexing.
-// For now we assume the LLM is good enough at Markdown not to bother.
-// In the future we may want to make various changes like inlining
-// the programs associated with playground URLs,
-// and we may also want to remove any HTML tags from the Markdown.
-func cleanBody(body string) string {
- // TODO
- return body
-}
diff --git a/internal/crawldocs/split.go b/internal/htmlutil/split.go
similarity index 91%
rename from internal/crawldocs/split.go
rename to internal/htmlutil/split.go
index c27a6a1..a2d646f 100644
--- a/internal/crawldocs/split.go
+++ b/internal/htmlutil/split.go
@@ -2,13 +2,8 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// Package crawldocs splits crawled HTML pages into sections
-// and inserts them into a document corpus.
-//
-// [Split] provides access to the HTML splitter;
-// [Sync] and [Restart] implement the incremental
-// splitting of crawled HTML into a document corpus.
-package crawldocs
+// Package htmlutil splits crawled HTML pages into sections via [Split].
+package htmlutil
import (
"bytes"
@@ -38,7 +33,7 @@
// but we haven't configured any,
// so we can assume there won't be an error.
// (There is no such thing as "bad" HTML 5.)
- panic("crawldocs: internal error: HTML 5 parse failed: " + err.Error())
+ panic("htmlutil: internal error: HTML 5 parse failed: " + err.Error())
}
walkDoc(doc, yield)
}
diff --git a/internal/crawldocs/split_test.go b/internal/htmlutil/split_test.go
similarity index 99%
rename from internal/crawldocs/split_test.go
rename to internal/htmlutil/split_test.go
index 4fb29c6..93f0106 100644
--- a/internal/crawldocs/split_test.go
+++ b/internal/htmlutil/split_test.go
@@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-package crawldocs
+package htmlutil
import (
"os"
diff --git a/internal/crawldocs/testdata/basic.html b/internal/htmlutil/testdata/basic.html
similarity index 100%
rename from internal/crawldocs/testdata/basic.html
rename to internal/htmlutil/testdata/basic.html
diff --git a/internal/crawldocs/testdata/cmdgo.html b/internal/htmlutil/testdata/cmdgo.html
similarity index 100%
rename from internal/crawldocs/testdata/cmdgo.html
rename to internal/htmlutil/testdata/cmdgo.html
diff --git a/internal/crawldocs/testdata/release.html b/internal/htmlutil/testdata/release.html
similarity index 100%
rename from internal/crawldocs/testdata/release.html
rename to internal/htmlutil/testdata/release.html
diff --git a/internal/crawldocs/testdata/show.go b/internal/htmlutil/testdata/show.go
similarity index 76%
rename from internal/crawldocs/testdata/show.go
rename to internal/htmlutil/testdata/show.go
index ce34bda..72642bf 100644
--- a/internal/crawldocs/testdata/show.go
+++ b/internal/htmlutil/testdata/show.go
@@ -4,7 +4,7 @@
//go:build ignore
-// Show shows the result of running crawldocs.Split on a single input file.
+// Show shows the result of running htmlutil.Split on a single input file.
//
// Usage:
//
@@ -18,7 +18,7 @@
"log"
"os"
- "golang.org/x/oscar/internal/crawldocs"
+ "golang.org/x/oscar/internal/htmlutil"
)
func main() {
@@ -27,7 +27,7 @@
log.Fatal(err)
}
- for s := range crawldocs.Split(data) {
+ for s := range htmlutil.Split(data) {
fmt.Printf("{%q, %q, %q},\n", s.Title, s.ID, s.Text[:min(len(s.Text), 40)])
}
}
diff --git a/internal/crawldocs/testdata/toolchain.html b/internal/htmlutil/testdata/toolchain.html
similarity index 100%
copy from internal/crawldocs/testdata/toolchain.html
copy to internal/htmlutil/testdata/toolchain.html
diff --git a/internal/crawldocs/testdata/trace.html b/internal/htmlutil/testdata/trace.html
similarity index 100%
rename from internal/crawldocs/testdata/trace.html
rename to internal/htmlutil/testdata/trace.html
diff --git a/internal/related/related_test.go b/internal/related/related_test.go
index e45f42b..946fce1 100644
--- a/internal/related/related_test.go
+++ b/internal/related/related_test.go
@@ -21,7 +21,6 @@
"golang.org/x/oscar/internal/docs"
"golang.org/x/oscar/internal/embeddocs"
"golang.org/x/oscar/internal/github"
- "golang.org/x/oscar/internal/githubdocs"
"golang.org/x/oscar/internal/llm"
"golang.org/x/oscar/internal/storage"
"golang.org/x/oscar/internal/testutil"
@@ -44,7 +43,7 @@
}
dc := docs.New(lg, db)
- githubdocs.Sync(ctx, lg, dc, gh)
+ docs.Sync(dc, gh)
vdb := storage.MemVectorDB(db, lg, "vecs")
embeddocs.Sync(ctx, lg, vdb, llm.QuoteEmbedder(), dc)
@@ -218,7 +217,7 @@
gh.Testing().LoadTxtar("../testdata/rsctmp.txt")
dc := docs.New(lg, db)
- githubdocs.Sync(ctx, lg, dc, gh)
+ docs.Sync(dc, gh)
vdb := storage.MemVectorDB(db, lg, "vecs")
embeddocs.Sync(ctx, lg, vdb, llm.QuoteEmbedder(), dc)