|  | // Copyright 2024 The Go Authors. All rights reserved. | 
|  | // Use of this source code is governed by a BSD-style | 
|  | // license that can be found in the LICENSE file. | 
|  |  | 
|  | package storage | 
|  |  | 
|  | import ( | 
|  | "cmp" | 
|  | "iter" | 
|  |  | 
|  | "golang.org/x/oscar/internal/llm" | 
|  | ) | 
|  |  | 
|  | // A VectorDB is a vector database that implements | 
|  | // nearest-neighbor search over embedding vectors | 
|  | // corresponding to documents. | 
|  | type VectorDB interface { | 
|  | // Set sets the vector associated with the given document ID to vec. | 
|  | // The id argument must not be empty. | 
|  | Set(id string, vec llm.Vector) | 
|  |  | 
|  | // Delete deletes any vector associated with document ID key. | 
|  | // Delete of an unset key is a no-op. | 
|  | Delete(id string) | 
|  |  | 
|  | // Get gets the vector associated with the given document ID. | 
|  | // If no such document exists, Get returns nil, false. | 
|  | // If a document exists, Get returns vec, true. | 
|  | Get(id string) (llm.Vector, bool) | 
|  |  | 
|  | // All returns an iterator over all ID-vector pairs in the vector db. | 
|  | // The second value in each iteration pair is a function returning a | 
|  | // vector, not the vector itself: | 
|  | // | 
|  | //	for key, getVec := range vecdb.All() { | 
|  | //		vec := getVec() | 
|  | //		fmt.Printf("%q: %q\n", key, vec) | 
|  | //	} | 
|  | // | 
|  | // The pairs are ordered in lexicographic order of IDs. | 
|  | // In iterations that only need the keys or only need the vectors for a subset of keys, | 
|  | // some VectorDB implementations may avoid work when the value function is not called. | 
|  | All() iter.Seq2[string, func() llm.Vector] | 
|  |  | 
|  | // Batch returns a new [VectorBatch] that accumulates | 
|  | // vector database mutations to apply in an atomic operation. | 
|  | // It is more efficient than repeated calls to Set. | 
|  | Batch() VectorBatch | 
|  |  | 
|  | // Search searches the database for the n vectors | 
|  | // most similar to vec, returning the document IDs | 
|  | // and similarity scores. | 
|  | // | 
|  | // Normally a VectorDB is used entirely with vectors of a single length. | 
|  | // Search ignores stored vectors with a different length than vec. | 
|  | Search(vec llm.Vector, n int) []VectorResult | 
|  |  | 
|  | // Flush flushes storage to disk. | 
|  | Flush() | 
|  | } | 
|  |  | 
|  | // A VectorBatch accumulates vector database mutations | 
|  | // that are applied to a [VectorDB] in a single atomic operation. | 
|  | // Applying bulk operations in a batch is also more efficient than | 
|  | // making individual [VectorDB] method calls. | 
|  | // The batched operations apply in the order they are made. | 
|  | type VectorBatch interface { | 
|  | // Set sets the vector associated with the given document ID to vec. | 
|  | Set(id string, vec llm.Vector) | 
|  |  | 
|  | // Delete deletes any vector associated with document ID key. | 
|  | // Delete of an unset key is a no-op. | 
|  | Delete(id string) | 
|  |  | 
|  | // MaybeApply calls Apply if the VectorBatch is getting close to full. | 
|  | // Every VectorBatch has a limit to how many operations can be batched, | 
|  | // so in a bulk operation where atomicity of the entire batch is not a concern, | 
|  | // calling MaybeApply gives the VectorBatch implementation | 
|  | // permission to flush the batch at specific “safe points”. | 
|  | // A typical limit for a batch is about 100MB worth of logged operations. | 
|  | // | 
|  | // MaybeApply reports whether it called Apply. | 
|  | MaybeApply() bool | 
|  |  | 
|  | // Apply applies all the batched operations to the underlying VectorDB | 
|  | // as a single atomic unit. | 
|  | // When Apply returns, the VectorBatch is an empty batch ready for | 
|  | // more operations. | 
|  | Apply() | 
|  | } | 
|  |  | 
// A VectorResult is a single document returned by a VectorDB search.
type VectorResult struct {
	ID    string  // document ID
	Score float64 // similarity score in range [0, 1]; 1 is exact match
}

// cmp compares x and y, ordering first by Score and then by ID.
// It returns -1 if x sorts before y, +1 if x sorts after y,
// and 0 if the two results have equal scores and equal IDs.
//
// Scores are ordered by [cmp.Compare], which treats a NaN as less than
// any non-NaN and equal to another NaN, so results whose scores are
// both NaN are still ordered by ID.
func (x VectorResult) cmp(y VectorResult) int {
	// Branch on the comparison result, not on x.Score != y.Score:
	// NaN != NaN is true while cmp.Compare(NaN, NaN) is 0, so the
	// inequality test would return 0 without ever consulting the IDs.
	if c := cmp.Compare(x.Score, y.Score); c != 0 {
		return c
	}
	return cmp.Compare(x.ID, y.ID)
}