internal/gerritdocs: log potential gemini doc truncation
This will give us an idea of how often truncation actually
happens and how much of a document is affected when it does.
Updates golang/oscar#35
Change-Id: I90669124c9447645081aed43ed2c4f638c2c80c7
Reviewed-on: https://go-review.googlesource.com/c/oscar/+/617757
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Jonathan Amsterdam <jba@google.com>
Reviewed-by: Tatiana Bradley <tatianabradley@google.com>
diff --git a/internal/gerritdocs/sync.go b/internal/gerritdocs/sync.go
index b2b0f6c..de000cf 100644
--- a/internal/gerritdocs/sync.go
+++ b/internal/gerritdocs/sync.go
@@ -42,15 +42,18 @@
lg.Debug("gerritrelateddocs sync", "change", ce.ChangeNum, "dbtime", ce.DBTime)
c := change(ce, gr, projects)
if c == nil {
- lg.Error("gerritrelatedocs cannot find change", "number", ce.ChangeNum)
+ lg.Error("gerritrelateddocs cannot find change", "change", ce.ChangeNum)
continue
}
title := gr.ChangeSubject(c.ch)
body, err := relatedDocBody(gr, c)
if err != nil {
- lg.Error("gerritrelatedocs cannot find comments for change", "number", ce.ChangeNum)
+ lg.Error("gerritrelateddocs cannot find comments", "change", ce.ChangeNum)
continue
}
+ if len(body) > geminiCharLimit {
+ lg.Warn("gerritrelateddocs potential truncation by gemini", "change", ce.ChangeNum, "docSize", len(body))
+ }
text := cleanBody(body)
id := relatedDocURL(gr, c)
dc.Add(id, title, text)
@@ -59,6 +62,15 @@
return nil
}
+// geminiCharLimit is an approximate limit on the number of
+// document characters a Gemini text embedding model can accept.
+// Gemini text embedding models have an input token limit
+// of 2048, where each token is about four characters long.
+// Gemini truncates documents that exceed this limit.
+// For more info, see
+// https://ai.google.dev/gemini-api/docs/models/gemini#text-embedding-and-embedding
+const geminiCharLimit = 8200
+
// changeInfo accumulates information from [gerrit.Change]
// and [gerrit.ChangeEvent] needed to grab change subject,
// messages, and comments.