// Copyright 2024 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package llm

import (
	"context"
	"fmt"
)

// A PolicyChecker checks inputs and outputs to LLMs against
// a fixed set of safety policies.
type PolicyChecker interface {
	// Name returns the name of the policy checker.
	// This function should always return the same result for a given
	// instance of a [PolicyChecker].
	Name() string

	// Policies returns the list of policies configured on the checker.
	// This function should always return the same result for a given
	// instance of a [PolicyChecker].
	Policies() []*PolicyConfig

	// CheckText evaluates the policies configured on this [PolicyChecker]
	// against the given text and returns a result for each [PolicyConfig].
	// If the text represents a model output, the prompt parts used to generate it
	// may optionally be provided as context. If the text represents a model input,
	// prompt should be empty.
	CheckText(ctx context.Context, text string, prompt ...Part) ([]*PolicyResult, error)
}
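
// The helper below is an illustrative sketch, not part of the original API:
// it shows how a caller might screen a model output with a [PolicyChecker],
// treating any violative result as an error. The function name and error
// handling are assumptions made for the example; for a model input, the
// prompt arguments would simply be omitted.
func checkOutput(ctx context.Context, checker PolicyChecker, output string, prompt ...Part) error {
	// Forward the prompt parts so the checker can evaluate the output in context.
	results, err := checker.CheckText(ctx, output, prompt...)
	if err != nil {
		return fmt.Errorf("%s: checking text: %w", checker.Name(), err)
	}
	for _, r := range results {
		if r.IsViolative() {
			return fmt.Errorf("%s: policy violated: %s", checker.Name(), r)
		}
	}
	return nil
}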

// A PolicyConfig is a policy to apply to an input or output to an LLM.
//
// Copied from "google.golang.org/api/checks/v1alpha" to avoid a direct dependency.
type PolicyConfig struct {
	// PolicyType: Required. Type of the policy.
	PolicyType PolicyType

	// Threshold: Optional. Score threshold to use when deciding if the content is
	// violative or non-violative. If not specified, the default 0.5 threshold for
	// the policy will be used.
	Threshold float64
}
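
// The variables below are an illustrative sketch, not part of the original
// file: they contrast a config that relies on the default 0.5 threshold with
// one that sets a stricter explicit threshold. The names and the 0.25 value
// are assumptions made for the example.
var (
	defaultHarassment = &PolicyConfig{PolicyType: PolicyTypeHarassment}
	strictHarassment  = &PolicyConfig{PolicyType: PolicyTypeHarassment, Threshold: 0.25}
)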

// A PolicyResult is the result of evaluating a policy against
// an input or output to an LLM.
//
// Copied from "google.golang.org/api/checks/v1alpha" to avoid a direct dependency.
type PolicyResult struct {
	// PolicyType: Type of the policy.
	PolicyType PolicyType

	// Score: Final score for the results of this policy.
	Score float64

	// ViolationResult: Result of the classification for the policy.
	ViolationResult ViolationResult
}

// A PolicyType is the type of a safety policy.
type PolicyType string

// Possible values for [PolicyType].
const (
	// Default.
	PolicyTypeUnspecified = PolicyType("POLICY_TYPE_UNSPECIFIED")

	// The model facilitates, promotes, or enables access to
	// harmful goods, services, and activities.
	PolicyTypeDangerousContent = PolicyType("DANGEROUS_CONTENT")

	// The model reveals an individual’s personal
	// information and data.
	PolicyTypePIISolicitingReciting = PolicyType("PII_SOLICITING_RECITING")

	// The model generates content that is malicious,
	// intimidating, bullying, or abusive towards another individual.
	PolicyTypeHarassment = PolicyType("HARASSMENT")

	// The model generates content that is sexually
	// explicit in nature.
	PolicyTypeSexuallyExplicit = PolicyType("SEXUALLY_EXPLICIT")

	// The model promotes violence, hatred, or discrimination on the
	// basis of race, religion, etc.
	PolicyTypeHateSpeech = PolicyType("HATE_SPEECH")

	// The model provides or offers to facilitate access to
	// medical advice or guidance.
	PolicyTypeMedicalInfo = PolicyType("MEDICAL_INFO")

	// The model generates content that contains
	// gratuitous, realistic descriptions of violence or gore.
	PolicyTypeViolenceAndGore = PolicyType("VIOLENCE_AND_GORE")

	// The model generates profanity and obscenities.
	PolicyTypeObscenityAndProfanity = PolicyType("OBSCENITY_AND_PROFANITY")
)

// AllPolicyTypes returns a [PolicyConfig] for every available policy type,
// each using the default threshold.
func AllPolicyTypes() []*PolicyConfig {
	return []*PolicyConfig{
		{PolicyType: PolicyTypeDangerousContent},
		{PolicyType: PolicyTypePIISolicitingReciting},
		{PolicyType: PolicyTypeHarassment},
		{PolicyType: PolicyTypeSexuallyExplicit},
		{PolicyType: PolicyTypeHateSpeech},
		{PolicyType: PolicyTypeMedicalInfo},
		{PolicyType: PolicyTypeViolenceAndGore},
		{PolicyType: PolicyTypeObscenityAndProfanity},
	}
}
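
// The helper below is an illustrative sketch, not part of the original file:
// it shows one way to start from AllPolicyTypes and override the threshold
// for a selected policy, for example
// withThreshold(AllPolicyTypes(), PolicyTypeHateSpeech, 0.3).
// The function name is an assumption made for the example.
func withThreshold(configs []*PolicyConfig, policy PolicyType, threshold float64) []*PolicyConfig {
	for _, c := range configs {
		if c.PolicyType == policy {
			// Configs are pointers, so the override is applied in place.
			c.Threshold = threshold
		}
	}
	return configs
}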

// A ViolationResult is the result of classifying text against a policy.
type ViolationResult string

// Possible values for [ViolationResult].
const (
	// Unspecified result.
	ViolationResultUnspecified = ViolationResult("VIOLATION_RESULT_UNSPECIFIED")

	// The final score is greater than or equal to the input score
	// threshold.
	ViolationResultViolative = ViolationResult("VIOLATIVE")

	// The final score is less than the input score
	// threshold.
	ViolationResultNonViolative = ViolationResult("NON_VIOLATIVE")

	// There was an error and the violation result could
	// not be determined.
	ViolationResultClassificationError = ViolationResult("CLASSIFICATION_ERROR")
)

// IsViolative reports whether the policy result represents
// a violated policy.
func (pr *PolicyResult) IsViolative() bool {
	return pr.ViolationResult == ViolationResultViolative
}

// String returns a string representation of the policy result.
func (pr *PolicyResult) String() string {
	return fmt.Sprintf("%s: %s (%f)", pr.PolicyType, pr.ViolationResult, pr.Score)
}
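
// The helper below is an illustrative sketch, not part of the original file:
// it splits a slice of results into violative findings and classification
// errors using IsViolative, String, and the [ViolationResult] constants.
// The function name is an assumption made for the example.
func summarize(results []*PolicyResult) (violations, failures []string) {
	for _, r := range results {
		switch {
		case r.IsViolative():
			violations = append(violations, r.String())
		case r.ViolationResult == ViolationResultClassificationError:
			failures = append(failures, r.String())
		}
	}
	return violations, failures
}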