internal/gaby: add /embed endpoint to sync only embeddings Change-Id: If50ee6ef956a894090f134cdd9f3519f784edd12 Reviewed-on: https://go-review.googlesource.com/c/oscar/+/708878 LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: Hyang-Ah Hana Kim <hyangah@gmail.com>
diff --git a/internal/embeddocs/sync.go b/internal/embeddocs/sync.go index 1c193f3..3430c8e 100644 --- a/internal/embeddocs/sync.go +++ b/internal/embeddocs/sync.go
@@ -59,15 +59,22 @@ return nil } + var start, end string for d := range w.Recent() { - lg.Debug("embeddocs sync start", "model", model, "doc", d.ID) + if start == "" { + start = d.ID + } + end = d.ID batch = append(batch, llm.EmbedDoc{Title: d.Title, Text: d.Text}) ids = append(ids, d.ID) batchLast = d.DBTime if len(batch) >= batchSize { + lg.Debug("embeddocs sync flush", "model", model, "start", start, "end", end) if err := flush(); err != nil { return err } + start = "" + end = "" } } if len(batch) > 0 { @@ -75,6 +82,7 @@ // which has to be called during an iteration over w.Recent. // Start a new iteration just to call flush and then break out. for _ = range w.Recent() { + lg.Debug("embeddocs sync flush", "model", model, "start", start, "end", end) if err := flush(); err != nil { return err }
diff --git a/internal/gaby/main.go b/internal/gaby/main.go index d2ef278..a7cccb1 100644 --- a/internal/gaby/main.go +++ b/internal/gaby/main.go
@@ -540,10 +540,12 @@ setLevelEndpoint = "setlevel" githubEventEndpoint = "github-event" crawlEndpoint = "crawl" + embedEndpoint = "embed" bisectEndpoint = "bisect" ) cronEndpointCounter := g.newEndpointCounter(cronEndpoint) crawlEndpointCounter := g.newEndpointCounter(crawlEndpoint) + embedEndpointCounter := g.newEndpointCounter(embedEndpoint) githubEventEndpointCounter := g.newEndpointCounter(githubEventEndpoint) mux := http.NewServeMux() @@ -586,6 +588,23 @@ cronEndpointCounter.Add(r.Context(), 1) }) + // embedEndpoint is meant to be called by hand + // when running a binary with a new embedding database configured, + // to backfill embeddings. + mux.HandleFunc("GET /"+embedEndpoint, func(w http.ResponseWriter, r *http.Request) { + g.slog.Info(embedEndpoint + " start") + defer g.slog.Info(embedEndpoint + " end") + + // No lock here - g.embedAll already locks. + + if err := g.embedAll(g.ctx); err != nil { + report(err, r) + http.Error(w, err.Error(), http.StatusInternalServerError) + } + + embedEndpointCounter.Add(r.Context(), 1) + }) + // crawlEndpoint triggers the web crawl configured in [Gaby.crawler]. // It is intended to be triggered by a Cloud Scheduler job (or similar) // to run periodically.