blob: 6bcbd4925c3f705279b20e715d0661d2843ad17d [file] [log] [blame]
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package stats
import (
"fmt"
"math"
)
// A KDE is a distribution that estimates the underlying distribution
// of a Sample using kernel density estimation.
//
// Kernel density estimation is a method for constructing an estimate
// ƒ̂(x) of a unknown distribution ƒ(x) given a sample from that
// distribution. Unlike many techniques, kernel density estimation is
// non-parametric: in general, it doesn't assume any particular true
// distribution (note, however, that the resulting distribution
// depends deeply on the selected bandwidth, and many bandwidth
// estimation techniques assume normal reference rules).
//
// A kernel density estimate is similar to a histogram, except that it
// is a smooth probability estimate and does not require choosing a
// bin size and discretizing the data.
//
// Sample is the only required field. All others have reasonable
// defaults.
type KDE struct {
// Sample is the data sample underlying this KDE.
Sample Sample
// Kernel is the kernel to use for the KDE.
Kernel KDEKernel
// Bandwidth is the bandwidth to use for the KDE.
//
// If this is zero, the bandwidth is computed from the
// provided data using a default bandwidth estimator
// (currently BandwidthScott).
Bandwidth float64
// BoundaryMethod is the boundary correction method to use for
// the KDE. The default value is BoundaryReflect; however, the
// default bounds are effectively +/-inf, which is equivalent
// to performing no boundary correction.
BoundaryMethod KDEBoundaryMethod
// [BoundaryMin, BoundaryMax) specify a bounded support for
// the KDE. If both are 0 (their default values), they are
// treated as +/-inf.
//
// To specify a half-bounded support, set Min to math.Inf(-1)
// or Max to math.Inf(1).
BoundaryMin float64
BoundaryMax float64
}
// BandwidthSilverman is a bandwidth estimator implementing
// Silverman's Rule of Thumb. It's fast, but not very robust to
// outliers as it assumes data is approximately normal.
//
// Silverman, B. W. (1986) Density Estimation.
func BandwidthSilverman(data interface {
StdDev() float64
Weight() float64
}) float64 {
return 1.06 * data.StdDev() * math.Pow(data.Weight(), -1.0/5)
}
// BandwidthScott is a bandwidth estimator implementing Scott's Rule.
// This is generally robust to outliers: it chooses the minimum
// between the sample's standard deviation and an robust estimator of
// a Gaussian distribution's standard deviation.
//
// Scott, D. W. (1992) Multivariate Density Estimation: Theory,
// Practice, and Visualization.
func BandwidthScott(data interface {
StdDev() float64
Weight() float64
Quantile(float64) float64
}) float64 {
iqr := data.Quantile(0.75) - data.Quantile(0.25)
hScale := 1.06 * math.Pow(data.Weight(), -1.0/5)
stdDev := data.StdDev()
if stdDev < iqr/1.349 {
// Use Silverman's Rule of Thumb
return hScale * stdDev
} else {
// Use IQR/1.349 as a robust estimator of the standard
// deviation of a Gaussian distribution.
return hScale * (iqr / 1.349)
}
}
// TODO(austin) Implement bandwidth estimator from Botev, Grotowski,
// Kroese. (2010) Kernel Density Estimation via Diffusion.
// KDEKernel represents a kernel to use for a KDE.
type KDEKernel int
//go:generate stringer -type=KDEKernel
const (
// An EpanechnikovKernel is a smooth kernel with bounded
// support. As a result, the KDE will also have bounded
// support. It is "optimal" in the sense that it minimizes the
// asymptotic mean integrated squared error (AMISE).
EpanechnikovKernel KDEKernel = iota
// A GaussianKernel is a Gaussian (normal) kernel.
GaussianKernel
// A DeltaKernel is a Dirac delta function. The PDF of such a
// KDE is not well-defined, but the CDF will represent each
// sample as an instantaneous increase. This kernel ignores
// bandwidth and never requires boundary correction.
DeltaKernel
)
// KDEBoundaryMethod represents a boundary correction method for
// constructing a KDE with bounded support.
type KDEBoundaryMethod int
//go:generate stringer -type=KDEBoundaryMethod
const (
// BoundaryReflect reflects the density estimate at the
// boundaries. For example, for a KDE with support [0, inf),
// this is equivalent to ƒ̂ᵣ(x)=ƒ̂(x)+ƒ̂(-x) for x>=0. This is a
// simple and fast technique, but enforces that ƒ̂ᵣ'(0)=0, so
// it may not be applicable to all distributions.
BoundaryReflect KDEBoundaryMethod = iota
)
type kdeKernel interface {
pdfEach(xs []float64) []float64
cdfEach(xs []float64) []float64
}
func (k *KDE) prepare() (kdeKernel, bool) {
// Compute bandwidth.
if k.Bandwidth == 0 {
k.Bandwidth = BandwidthScott(k.Sample)
}
// Construct kernel.
kernel := kdeKernel(nil)
switch k.Kernel {
default:
panic(fmt.Sprint("unknown kernel", k))
case EpanechnikovKernel:
kernel = epanechnikovKernel{k.Bandwidth}
case GaussianKernel:
kernel = NormalDist{0, k.Bandwidth}
case DeltaKernel:
kernel = DeltaDist{0}
}
// Use boundary correction?
bc := k.BoundaryMin != 0 || k.BoundaryMax != 0
return kernel, bc
}
// TODO: For KDEs of histograms, make histograms able to create a
// weighted Sample and simply require the caller to provide a
// good bandwidth from a StreamStats.
// normalizedXs returns x - kde.Sample.Xs. Evaluating kernels shifted
// by kde.Sample.Xs all at x is equivalent to evaluating one unshifted
// kernel at x - kde.Sample.Xs.
func (kde *KDE) normalizedXs(x float64) []float64 {
txs := make([]float64, len(kde.Sample.Xs))
for i, xi := range kde.Sample.Xs {
txs[i] = x - xi
}
return txs
}
func (kde *KDE) PDF(x float64) float64 {
kernel, bc := kde.prepare()
// Apply boundary
if bc && (x < kde.BoundaryMin || x >= kde.BoundaryMax) {
return 0
}
y := func(x float64) float64 {
// Shift kernel to each of kde.xs and evaluate at x
ys := kernel.pdfEach(kde.normalizedXs(x))
// Kernel samples are weighted according to the weights of xs
wys := Sample{Xs: ys, Weights: kde.Sample.Weights}
return wys.Sum() / wys.Weight()
}
if !bc {
return y(x)
}
switch kde.BoundaryMethod {
default:
panic("unknown boundary correction method")
case BoundaryReflect:
if math.IsInf(kde.BoundaryMax, 1) {
return y(x) + y(2*kde.BoundaryMin-x)
} else if math.IsInf(kde.BoundaryMin, -1) {
return y(x) + y(2*kde.BoundaryMax-x)
} else {
d := 2 * (kde.BoundaryMax - kde.BoundaryMin)
w := 2 * (x - kde.BoundaryMin)
return series(func(n float64) float64 {
// Points >= x
return y(x+n*d) + y(x+n*d-w)
}) + series(func(n float64) float64 {
// Points < x
return y(x-(n+1)*d+w) + y(x-(n+1)*d)
})
}
}
}
func (kde *KDE) CDF(x float64) float64 {
kernel, bc := kde.prepare()
// Apply boundary
if bc {
if x < kde.BoundaryMin {
return 0
} else if x >= kde.BoundaryMax {
return 1
}
}
y := func(x float64) float64 {
// Shift kernel integral to each of cdf.xs and evaluate at x
ys := kernel.cdfEach(kde.normalizedXs(x))
// Kernel samples are weighted according to the weights of xs
wys := Sample{Xs: ys, Weights: kde.Sample.Weights}
return wys.Sum() / wys.Weight()
}
if !bc {
return y(x)
}
switch kde.BoundaryMethod {
default:
panic("unknown boundary correction method")
case BoundaryReflect:
if math.IsInf(kde.BoundaryMax, 1) {
return y(x) - y(2*kde.BoundaryMin-x)
} else if math.IsInf(kde.BoundaryMin, -1) {
return y(x) + (1 - y(2*kde.BoundaryMax-x))
} else {
d := 2 * (kde.BoundaryMax - kde.BoundaryMin)
w := 2 * (x - kde.BoundaryMin)
return series(func(n float64) float64 {
// Windows >= x-w
return y(x+n*d) - y(x+n*d-w)
}) + series(func(n float64) float64 {
// Windows < x-w
return y(x-(n+1)*d) - y(x-(n+1)*d-w)
})
}
}
}
func (kde *KDE) Bounds() (low float64, high float64) {
_, bc := kde.prepare()
// TODO(austin) If this KDE came from a histogram, we'd better
// not sample at a significantly higher rate than the
// histogram. Maybe we want to just return the bounds of the
// histogram?
// TODO(austin) It would be nice if this could be instructed
// to include all original data points, even if they are in
// the tail. Probably that should just be up to the caller to
// pass an axis derived from the bounds of the original data.
// Use the lowest and highest samples as starting points
lowX, highX := kde.Sample.Bounds()
if lowX == highX {
lowX -= 1
highX += 1
}
// Find the end points that contain 99% of the CDF's weight.
// Since bisect requires that the root be bracketed, start by
// expanding our range if necessary. TODO(austin) This can
// definitely be done faster.
const (
lowY = 0.005
highY = 0.995
tolerance = 0.001
)
for kde.CDF(lowX) > lowY {
lowX -= highX - lowX
}
for kde.CDF(highX) < highY {
highX += highX - lowX
}
// Explicitly accept discontinuities, since we may be using a
// discontiguous kernel.
low, _ = bisect(func(x float64) float64 { return kde.CDF(x) - lowY }, lowX, highX, tolerance)
high, _ = bisect(func(x float64) float64 { return kde.CDF(x) - highY }, lowX, highX, tolerance)
// Expand width by 20% to give some margins
width := high - low
low, high = low-0.1*width, high+0.1*width
// Limit to bounds
if bc {
low = math.Max(low, kde.BoundaryMin)
high = math.Min(high, kde.BoundaryMax)
}
return
}
type epanechnikovKernel struct {
h float64
}
func (d epanechnikovKernel) pdfEach(xs []float64) []float64 {
ys := make([]float64, len(xs))
a := 0.75 / d.h
invhh := 1 / (d.h * d.h)
for i, x := range xs {
if -d.h < x && x < d.h {
ys[i] = a * (1 - x*x*invhh)
}
}
return ys
}
func (d epanechnikovKernel) cdfEach(xs []float64) []float64 {
ys := make([]float64, len(xs))
invh := 1 / d.h
for i, x := range xs {
if x > d.h {
ys[i] = 1
} else if x > -d.h {
u := x * invh
ys[i] = 0.25 * (2 + 3*u - u*u*u)
}
}
return ys
}