internal/llm/embed.go - oscar - Git at Google

 // Copyright 2024 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

 package llm

 import (
 	"context"
 	"math"
 )

 const quoteLen = 123

 // QuoteEmbedder returns an implementation
 // of Embedder that can be useful for testing but
 // is completely pointless for real use.
 // It encodes up to the first 122 bytes of each document
 // directly into the first 122 elements of a 123-element unit vector.
 func QuoteEmbedder() Embedder {
 	return quoter{}
 }

 // quote quotes text into a vector.
 // The text ends at the first negative entry in the vector.
 // The final entry of the vector is hard-coded to -1
 // before normalization, so that the final entry of a
 // normalized vector lets us know scaling to reverse
 // to obtain the original bytes.
 func quote(text string) Vector {
 	v := make(Vector, quoteLen)
 	var d float64
 	for i := range len(text) {
 		if i >= len(v)-1 {
 			break
 		}
 		v[i] = float32(byte(text[i])) / 256
 		d += float64(v[i]) * float64(v[i])
 	}
 	if len(text)+1 < len(v) {
 		v[len(text)] = -1
 		d += 1
 	}
 	v[len(v)-1] = -1
 	d += 1

 	d = 1 / math.Sqrt(d)
 	for i := range v {
 		v[i] *= float32(d)
 	}
 	return v
 }

 // quoter is a quoting Embedder, returned by QuoteEmbedder
 type quoter struct{}

 // EmbedDocs implements Embedder by quoting.
 func (quoter) EmbedDocs(ctx context.Context, docs []EmbedDoc) ([]Vector, error) {
 	var vecs []Vector
 	for _, d := range docs {
 		vecs = append(vecs, quote(d.Text))
 	}
 	return vecs, nil
 }

 // UnquoteVector recovers the original text prefix
 // passed to a [QuoteEmbedder]'s EmbedDocs method.
 // Like QuoteEmbedder, UnquoteVector is only useful in tests.
 func UnquoteVector(v Vector) string {
 	if len(v) != quoteLen {
 		panic("UnquoteVector of non-quotation vector")
 	}
 	d := -1 / v[len(v)-1]
 	var b []byte
 	for _, f := range v {
 		if f < 0 {
 			break
 		}
 		b = append(b, byte(256*f*d+0.5))
 	}
 	return string(b)
 }
	// Copyright 2024 The Go Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style
	// license that can be found in the LICENSE file.

	package llm

	import (
	"context"
	"math"
	)

	const quoteLen = 123

	// QuoteEmbedder returns an implementation
	// of Embedder that can be useful for testing but
	// is completely pointless for real use.
	// It encodes up to the first 122 bytes of each document
	// directly into the first 122 elements of a 123-element unit vector.
	func QuoteEmbedder() Embedder {
	return quoter{}
	}

	// quote quotes text into a vector.
	// The text ends at the first negative entry in the vector.
	// The final entry of the vector is hard-coded to -1
	// before normalization, so that the final entry of a
	// normalized vector lets us know scaling to reverse
	// to obtain the original bytes.
	func quote(text string) Vector {
	v := make(Vector, quoteLen)
	var d float64
	for i := range len(text) {
	if i >= len(v)-1 {
	break
	}
	v[i] = float32(byte(text[i])) / 256
	d += float64(v[i]) * float64(v[i])
	}
	if len(text)+1 < len(v) {
	v[len(text)] = -1
	d += 1
	}
	v[len(v)-1] = -1
	d += 1

	d = 1 / math.Sqrt(d)
	for i := range v {
	v[i] *= float32(d)
	}
	return v
	}

	// quoter is a quoting Embedder, returned by QuoteEmbedder
	type quoter struct{}

	// EmbedDocs implements Embedder by quoting.
	func (quoter) EmbedDocs(ctx context.Context, docs []EmbedDoc) ([]Vector, error) {
	var vecs []Vector
	for _, d := range docs {
	vecs = append(vecs, quote(d.Text))
	}
	return vecs, nil
	}

	// UnquoteVector recovers the original text prefix
	// passed to a [QuoteEmbedder]'s EmbedDocs method.
	// Like QuoteEmbedder, UnquoteVector is only useful in tests.
	func UnquoteVector(v Vector) string {
	if len(v) != quoteLen {
	panic("UnquoteVector of non-quotation vector")
	}
	d := -1 / v[len(v)-1]
	var b []byte
	for _, f := range v {
	if f < 0 {
	break
	}
	b = append(b, byte(256fd+0.5))
	}
	return string(b)
	}