blob: 2cd43f4e3e2977284d45ae96fc12552e96f03486 [file] [log] [blame]
// Copyright 2024 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Rmdoc deletes the documents found for the given go.dev URLs from the corpus
// (including the vector db), asking for confirmation before each deletion.
//
// Usage: go run . -project oscar-go-1 -firestoredb devel https://go.dev/x/y/z
package main
import (
	"bufio"
	"context"
	"flag"
	"fmt"
	"log"
	"log/slog"
	"os"
	"strings"

	"cloud.google.com/go/compute/metadata"
	"golang.org/x/oscar/internal/dbspec"
	"golang.org/x/oscar/internal/docs"
	"golang.org/x/oscar/internal/gcp/firestore"
	"golang.org/x/oscar/internal/storage"
)
// flags holds the values of the command-line options.
// The fields are bound to the flag package in init.
var flags struct {
	project     string // value of -project
	firestoredb string // value of -firestoredb
	overlay     string // no flag is registered for this field here
}

// logger is the logger shared by this command.
var logger = slog.Default()

// init registers the command-line flags; both default to the empty string.
func init() {
	flag.StringVar(&flags.firestoredb, "firestoredb", "", "name of the firestore db")
	flag.StringVar(&flags.project, "project", "", "name of the Google Cloud Project")
}
// main deletes, after interactive confirmation, the documents found for each
// URL given on the command line from both the vector db and the corpus.
//
// URLs that do not start with "https://go.dev/" are reported and ignored.
// For every document, its ID is printed (preceded by "*" when the vector db
// has an entry for it) and the user is asked whether to delete it. Only the
// answers "y" and "yes" (case-insensitive) delete; anything else, including
// an empty line or end of input, skips the document.
func main() {
	flag.Parse()
	args := flag.Args()
	if len(args) == 0 {
		log.Fatal("no args")
	}

	gabyDB, gabyVectorDB := initGCP()
	corpus := docs.New(logger, gabyDB)

	// Read answers one whole line at a time. Reading a single word per
	// prompt would leave the rest of the line unread, and those leftover
	// words would then be taken as the answer to a later prompt.
	stdin := bufio.NewScanner(os.Stdin)

	for _, url := range args {
		if !strings.HasPrefix(url, "https://go.dev/") {
			log.Println("ignoring unrecognized url:", url)
			continue
		}
		// TODO: do we need to delete crawl.Page entries too?
		for doc := range corpus.Docs(url) {
			hasVector := " "
			if _, ok := gabyVectorDB.Get(doc.ID); ok {
				hasVector = "*"
			}
			fmt.Printf("%v %v", hasVector, doc.ID)
			fmt.Printf(" delete (y/N)? ")

			// At end of input Scan reports false with a nil error; the
			// answer stays empty and is treated as "no". A real read
			// error aborts rather than being silently taken as "no".
			var answer string
			if stdin.Scan() {
				answer = strings.ToLower(strings.TrimSpace(stdin.Text()))
			} else if err := stdin.Err(); err != nil {
				fmt.Println()
				log.Fatalf("reading answer: %v", err)
			}

			if answer == "y" || answer == "yes" {
				gabyVectorDB.Delete(doc.ID)
				corpus.Delete(doc.ID)
				// Confirm the vector db entry is really gone before reporting success.
				if _, ok := gabyVectorDB.Get(doc.ID); ok {
					log.Fatalf("error - %v not removed from vector db", doc.ID)
				}
				fmt.Print(" ↪ deleted")
			} else {
				fmt.Print(" ↪ skipped")
			}
			fmt.Println()
		}
	}
}
// initGCP opens the Firestore-backed database and vector database selected
// by the -project and -firestoredb flags and returns them.
//
// If -project is empty, the project ID is looked up through the
// compute/metadata package and stored in flags.project. Any failure
// terminates the program via log.Fatal.
func initGCP() (storage.DB, storage.VectorDB) {
	ctx := context.TODO()

	if flags.project == "" {
		projectID, err := metadata.ProjectIDWithContext(ctx)
		if err != nil {
			log.Fatalf("metadata project ID: %v", err)
		}
		if projectID == "" {
			log.Fatal("project ID from metadata is empty")
		}
		flags.project = projectID
	}

	spec := &dbspec.Spec{
		Kind:     "firestore",
		Location: flags.project,
		Name:     flags.firestoredb,
	}
	// Use the same context and logger for both databases.
	db, err := spec.Open(ctx, logger)
	if err != nil {
		log.Fatalf("opening firestore db: %v", err)
	}

	const vectorDBNamespace = "gaby"
	vdb, err := firestore.NewVectorDB(ctx, logger, flags.project, flags.firestoredb, vectorDBNamespace)
	if err != nil {
		log.Fatalf("opening firestore vector db: %v", err)
	}
	return db, vdb
}