blob: f3316ddecba4a58dbdc4116f96a34f05d341880c [file] [log] [blame] [edit]
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package stats provides basic descriptive statistics.
//
// This is intended not as a comprehensive statistics package, but
// to provide common, everyday statistical functions.
//
// As a rule of thumb, a statistical function belongs in this package
// if it would be explained in a typical high school.
//
// These functions aim to balance performance and accuracy, but some
// amount of error is inevitable in floating-point computations.
// The underlying implementations may change, resulting in small
// changes in their results from version to version. If the caller
// needs particular guarantees on accuracy and overflow behavior or
// version stability, they should use a more specialized
// implementation.
package stats
// References:
//
// Hyndman, Rob J.; Fan, Yanan (November 1996).
// "Sample Quantiles in Statistical Packages".
// American Statistician. 50 (4).
// American Statistical Association: 361–365.
// doi:10.2307/2684934. JSTOR 2684934.
import (
"math"
"slices"
)
// Mean returns the arithmetic mean of the values in values.
//
// Mean does not modify the array.
//
// Mean panics if values is an empty slice.
//
// If values contains NaN or both Inf and -Inf, it returns NaN.
// If values contains Inf, it returns Inf. If values contains -Inf, it returns -Inf.
func Mean[F ~float64](values []F) F {
mean, infs := meanInf(values)
switch infs {
case negInf:
return F(math.Inf(-1))
case posInf:
return F(math.Inf(1))
case negInf | posInf:
return F(math.NaN())
default: // passthrough mean or NaN
}
return mean
}
// MeanAndStdDev returns the arithmetic mean and
// sample standard deviation of values; the standard
// deviation is only defined for len(values) > 1.
//
// MeanAndStdDev does not modify the array.
//
// MeanAndStdDev panics if values is an empty slice.
//
// If values contains NaN, it returns NaN, NaN.
// If values contains both Inf and -Inf, it returns NaN, Inf.
// If values contains Inf, it returns Inf, Inf.
// If values contains -Inf, it returns -Inf, Inf.
func MeanAndStdDev[F ~float64](values []F) (F, F) {
mean, infs := meanInf(values)
switch infs {
case 0:
if math.IsNaN(float64(mean)) {
return mean, F(math.NaN())
}
case negInf, posInf, negInf | posInf:
return mean, F(math.Inf(1))
}
if len(values) == 1 {
return mean, 0
}
squaredDiffs := F(0.0)
for _, v := range values {
diff := v - mean
squaredDiffs += diff * diff
}
return mean, F(math.Sqrt(float64(squaredDiffs) / float64(len(values)-1)))
}
// meanInf calculates a naive mean value
// and reports the infinities status.
func meanInf[F ~float64](values []F) (F, infinities) {
if len(values) == 0 {
panic("mean: empty slice")
}
sum, infs := F(0.0), infinities(0)
for _, v := range values {
switch {
case math.IsInf(float64(v), 1):
infs |= posInf
case math.IsInf(float64(v), -1):
infs |= negInf
}
sum += v
}
return F(sum / F(len(values))), infs
}
// infinities is a bitset that records the presence of ±Inf in the input
type infinities uint8
const (
negInf infinities = 1 << iota
posInf
)
// Median returns the median of the values in values.
//
// Median does not modify the array.
//
// Median may perform asymptotically faster and allocate
// asymptotically less if the slice is already sorted.
//
// If values is an empty slice, it panics.
// If values contains NaN, it returns NaN.
// -Inf is treated as smaller than all other values,
// Inf is treated as larger than all other values, and
// -0.0 is treated as smaller than 0.0.
func Median[F ~float64](values []F) F { return Quantiles(values, 0.5)[0] }
// Quantiles returns a sequence of quantiles of values.
//
// The returned slice has the same length as the quantiles slice,
// and the elements are one-to-one with the input quantiles.
// A quantile of 0 corresponds to the minimum value in values and
// a quantile of 1 corresponds to the maximum value in values.
// A quantile of 0.5 is the same as the value returned by [Median].
//
// Quantiles does not modify the array.
//
// Quantiles may perform asymptotically faster and allocate
// asymptotically less if the slice is already sorted.
//
// Quantiles panics if values is an empty slice or any
// quantile is not contained in the interval [0, 1].
//
// If values contains NaN, it returns [NaN, ..., NaN].
// -Inf is treated as smaller than all other values,
// Inf is treated as larger than all other values, and
// -0.0 is treated as smaller than 0.0.
func Quantiles[F, Q ~float64](values []F, quantiles ...Q) []F {
if len(values) == 0 {
panic("quantiles: empty slice")
}
if !slices.IsSorted(values) {
values = slices.Clone(values)
slices.Sort(values)
}
res := make([]F, len(quantiles))
if math.IsNaN(float64(values[0])) {
for i := range res {
res[i] = F(math.NaN())
}
return res
}
for i, q := range quantiles {
if !(0 <= q && q <= 1) {
panic("quantile must be contained in the interval [0, 1]")
}
// There are many methods for computing quantiles. Quantiles uses the
// "inclusive" method, also known as Q7 in Hyndman and Fan, or the
// "linear" or "R-7" method. This assumes that the data is either a
// population or a sample that includes the most extreme values of the
// underlying population.
res[i] = hyndmanFanR7(values, F(q))
}
return res
}
// hyndmanFanR7 implements the Hyndman and Fan "R-7"
// method of computing interpolated quantile values
// over a sorted slice of vals.
//
// hyndmanFanR7 does not modify the array.
func hyndmanFanR7[F ~float64](values []F, q F) F {
h := F(len(values)-1)*q + 1
// the h-th smallest of len(vals) values is at fn(h)-1.
return values[floor(h-1)] + (h-F(math.Floor(float64(h))))*(values[ceil(h-1)]-values[floor(h-1)])
}
// ceil returns the integer value of [math.Ceil].
func ceil[F ~float64](n F) int { return int(math.Ceil(float64(n))) }
// floor returns the integer value of [math.Floor].
func floor[F ~float64](n F) int { return int(math.Floor(float64(n))) }