internal/gaby: add /embed endpoint to sync only embeddings
Change-Id: If50ee6ef956a894090f134cdd9f3519f784edd12
Reviewed-on: https://go-review.googlesource.com/c/oscar/+/708878
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Hyang-Ah Hana Kim <hyangah@gmail.com>
diff --git a/internal/embeddocs/sync.go b/internal/embeddocs/sync.go
index 1c193f3..3430c8e 100644
--- a/internal/embeddocs/sync.go
+++ b/internal/embeddocs/sync.go
@@ -59,15 +59,22 @@
return nil
}
+ var start, end string
for d := range w.Recent() {
- lg.Debug("embeddocs sync start", "model", model, "doc", d.ID)
+ if start == "" {
+ start = d.ID
+ }
+ end = d.ID
batch = append(batch, llm.EmbedDoc{Title: d.Title, Text: d.Text})
ids = append(ids, d.ID)
batchLast = d.DBTime
if len(batch) >= batchSize {
+ lg.Debug("embeddocs sync flush", "model", model, "start", start, "end", end)
if err := flush(); err != nil {
return err
}
+ start = ""
+ end = ""
}
}
if len(batch) > 0 {
@@ -75,6 +82,7 @@
// which has to be called during an iteration over w.Recent.
// Start a new iteration just to call flush and then break out.
for _ = range w.Recent() {
+ lg.Debug("embeddocs sync flush", "model", model, "start", start, "end", end)
if err := flush(); err != nil {
return err
}
diff --git a/internal/gaby/main.go b/internal/gaby/main.go
index d2ef278..a7cccb1 100644
--- a/internal/gaby/main.go
+++ b/internal/gaby/main.go
@@ -540,10 +540,12 @@
setLevelEndpoint = "setlevel"
githubEventEndpoint = "github-event"
crawlEndpoint = "crawl"
+ embedEndpoint = "embed"
bisectEndpoint = "bisect"
)
cronEndpointCounter := g.newEndpointCounter(cronEndpoint)
crawlEndpointCounter := g.newEndpointCounter(crawlEndpoint)
+ embedEndpointCounter := g.newEndpointCounter(embedEndpoint)
githubEventEndpointCounter := g.newEndpointCounter(githubEventEndpoint)
mux := http.NewServeMux()
@@ -586,6 +588,23 @@
cronEndpointCounter.Add(r.Context(), 1)
})
+ // embedEndpoint is meant to be called by hand
+ // when running a binary with a new embedding database configured,
+ // to backfill embeddings.
+ mux.HandleFunc("GET /"+embedEndpoint, func(w http.ResponseWriter, r *http.Request) {
+ g.slog.Info(embedEndpoint + " start")
+ defer g.slog.Info(embedEndpoint + " end")
+
+ // No lock here - g.embedAll already locks.
+
+ if err := g.embedAll(g.ctx); err != nil {
+ report(err, r)
+ http.Error(w, err.Error(), http.StatusInternalServerError)
+ }
+
+ embedEndpointCounter.Add(r.Context(), 1)
+ })
+
// crawlEndpoint triggers the web crawl configured in [Gaby.crawler].
// It is intended to be triggered by a Cloud Scheduler job (or similar)
// to run periodically.