internal/stats: add edm/edmx detectors
Add single-instance edm/edmx breakout detectors. These algorithms are
described in the paper:
https://courses.cit.cornell.edu/nj89/docs/edm.pdf
This implementation was compared against an open-source R
implementation, and yielded the same breakouts for a wide range of data
from cmd/bent.
Change-Id: I088e37e6444e3c9d3275a15d04b5cecf6e1d2fe9
Reviewed-on: https://go-review.googlesource.com/c/benchmarks/+/292549
TryBot-Result: Go Bot <gobot@golang.org>
Trust: Jeremy Faller <jeremy@golang.org>
Run-TryBot: Jeremy Faller <jeremy@golang.org>
Reviewed-by: David Chase <drchase@google.com>
diff --git a/internal/stats/edm.go b/internal/stats/edm.go
new file mode 100644
index 0000000..70e0c80
--- /dev/null
+++ b/internal/stats/edm.go
@@ -0,0 +1,218 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package stats
+
+import (
+ "math"
+)
+
+// edm carries data to calculate e-divisive with medians.
+type edm struct {
+ z []float64
+ delta int
+ bestStat float64
+ bestIdx int
+ tau int
+ ta, tb, tab *IntervalTree
+}
+
+// normalize normalizes an slice of floats in place.
+func normalize(input []float64) []float64 {
+ // gracefully handle empty inputs.
+ if len(input) == 0 {
+ return input
+ }
+ min, max := input[0], input[0]
+ for _, v := range input {
+ if v > max {
+ max = v
+ }
+ if v < min {
+ min = v
+ }
+ }
+ for i, v := range input {
+ input[i] = (v - min) / (max - min)
+ }
+ return input
+}
+
+// medianResolution finds a good compromise for the approximate median, as
+// described in the paper.
+func medianResolution(l int) int {
+ d := 10 // min resolution 1/(1<<d).
+ if l := int(math.Ceil(math.Log2(float64(l)))); l > d {
+ d = l
+ }
+ return d
+}
+
+// toFloat converts an slice of integers to float64.
+func toFloat(input []int) []float64 {
+ output := make([]float64, len(input))
+ for i, v := range input {
+ output[i] = float64(v)
+ }
+ return output
+}
+
+// stat calculates the edx stat for a given location, saving it if it's better
+// than the currently stored statistic.
+func (e *edm) stat(tau2 int) float64 {
+ a, b, c := e.ta.Median(), e.tb.Median(), e.tab.Median()
+ a, b, c = a*a, b*b, c*c
+ stat := 2*c - a - b
+ stat *= float64(e.tau*(tau2-e.tau)) / float64(tau2)
+ if stat > e.bestStat {
+ e.bestStat = stat
+ e.bestIdx = e.tau
+ }
+ return stat
+}
+
+// calc handles the brunt of the E-divisive with median algorithm described in:
+// https://courses.cit.cornell.edu/nj89/docs/edm.pdf.
+func (e *edm) calc() int {
+ normalize(e.z)
+
+ e.bestStat = math.Inf(-1)
+ e.tau = e.delta
+ tau2 := 2 * e.delta
+
+ d := medianResolution(len(e.z))
+ e.ta = NewIntervalTree(d)
+ e.tb = NewIntervalTree(d)
+ e.tab = NewIntervalTree(d)
+
+ for i := 0; i < e.tau; i++ {
+ for j := i + 1; j < e.tau; j++ {
+ e.ta.Insert(e.z[i] - e.z[j])
+ }
+ }
+
+ for i := e.tau; i < tau2; i++ {
+ for j := i + 1; j < tau2; j++ {
+ e.tb.Insert(e.z[i] - e.z[j])
+ }
+ }
+
+ for i := 0; i < e.tau; i++ {
+ for j := e.tau; j < tau2; j++ {
+ e.tab.Insert(e.z[i] - e.z[j])
+ }
+ }
+
+ tau2 += 1
+ for ; tau2 < len(e.z)+1; tau2++ {
+ e.tb.Insert(e.z[tau2-1] - e.z[tau2-2])
+ e.stat(tau2)
+ }
+
+ forward := false
+ for e.tau < len(e.z)-e.delta {
+ if forward {
+ e.forwardUpdate()
+ } else {
+ e.backwardUpdate()
+ }
+ forward = !forward
+ }
+
+ return e.bestIdx
+}
+
+func (e *edm) forwardUpdate() {
+ tau2 := e.tau + e.delta
+ e.tau += 1
+
+ for i := e.tau - e.delta; i < e.tau-1; i++ {
+ e.ta.Insert(e.z[i] - e.z[e.tau-1])
+ }
+ for i := e.tau - e.delta; i < e.tau; i++ {
+ e.ta.Remove(e.z[i] - e.z[e.tau-e.delta-1])
+ }
+ e.ta.Insert(e.z[e.tau-e.delta-1] - e.z[e.tau-e.delta])
+
+ e.tab.Remove(e.z[e.tau-1] - e.z[e.tau-e.delta-1])
+ for i := e.tau; i < tau2; i++ {
+ e.tab.Remove(e.z[i] - e.z[e.tau-e.delta-1])
+ e.tab.Insert(e.z[i] - e.z[e.tau-1])
+ }
+ for i := e.tau - e.delta; i < e.tau-1; i++ {
+ e.tab.Remove(e.z[i] - e.z[e.tau-1])
+ e.tab.Insert(e.z[i] - e.z[tau2])
+ }
+ e.tab.Insert(e.z[e.tau-1] - e.z[tau2])
+
+ for i := e.tau; i < tau2; i++ {
+ e.tb.Remove(e.z[i] - e.z[e.tau-1])
+ e.tb.Insert(e.z[i] - e.z[tau2])
+ }
+
+ tau2 += 1
+ for ; tau2 < len(e.z)+1; tau2++ {
+ e.tb.Insert(e.z[tau2-1] - e.z[tau2-2])
+ e.stat(tau2)
+ }
+}
+
+func (e *edm) backwardUpdate() {
+ tau2 := e.tau + e.delta
+ e.tau += 1
+
+ for i := e.tau - e.delta; i < e.tau-1; i++ {
+ e.ta.Insert(e.z[i] - e.z[e.tau-1])
+ }
+ for i := e.tau - e.delta; i < e.tau; i++ {
+ e.ta.Remove(e.z[i] - e.z[e.tau-e.delta-1])
+ }
+ e.ta.Insert(e.z[e.tau-e.delta-1] - e.z[e.tau-e.delta])
+
+ e.tab.Remove(e.z[e.tau-1] - e.z[e.tau-e.delta-1])
+ for i := e.tau; i < tau2; i++ {
+ e.tab.Remove(e.z[i] - e.z[e.tau-e.delta-1])
+ e.tab.Insert(e.z[i] - e.z[e.tau-1])
+ }
+ for i := e.tau - e.delta; i < e.tau-1; i++ {
+ e.tab.Remove(e.z[i] - e.z[e.tau-1])
+ e.tab.Insert(e.z[i] - e.z[tau2])
+ }
+ e.tab.Insert(e.z[e.tau-1] - e.z[tau2])
+
+ for i := e.tau; i < e.tau+e.delta-1; i++ {
+ e.tb.Insert(e.z[e.tau+e.delta-1] - e.z[i])
+ e.tb.Remove(e.z[i] - e.z[e.tau-1])
+ }
+
+ for tau2 = len(e.z); tau2 >= e.tau+e.delta; tau2-- {
+ e.tb.Remove(e.z[tau2-1] - e.z[tau2-2])
+ e.stat(tau2)
+ }
+}
+
+// EDM performs the approximate E-divisive with means calculation as described
+// in https://courses.cit.cornell.edu/nj89/docs/edm.pdf.
+//
+// EDM is an algorithm for finding breakout points in a time-series data set.
+// The function accepts the input vector, and a window width that describes the
+// search window during which the median breakout must occur. The window, ∂,
+// is described in the paper.
+//
+// Note this algorithm uses the interval tree median approach as described in
+// the paper. The medians used will have a resolution of 1/(2^d), where d is
+// max(log2(len(input)), 10). That is the value recommended in the paper.
+func EDM(input []float64, delta int) int {
+ // edm modifies the input, we don't want to do that to our callers.
+ c := make([]float64, len(input))
+ copy(c, input)
+ e := &edm{z: c, delta: delta}
+ return e.calc()
+}
+
+// EDMInt is the integer version of EDM.
+func EDMInt(input []int, delta int) int {
+ e := &edm{z: toFloat(input), delta: delta}
+ return e.calc()
+}
diff --git a/internal/stats/edm_test.go b/internal/stats/edm_test.go
new file mode 100644
index 0000000..cfdfb8c
--- /dev/null
+++ b/internal/stats/edm_test.go
@@ -0,0 +1,83 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package stats
+
+import (
+ "reflect"
+ "testing"
+)
+
+func TestNormalize(t *testing.T) {
+ t.Parallel()
+ tests := []struct {
+ input, expected []float64
+ }{
+ {[]float64{}, []float64{}},
+ {[]float64{0, 1}, []float64{0, 1}},
+ {[]float64{1, 2, 3}, []float64{0, 0.5, 1}},
+ {[]float64{-1, 0, 1}, []float64{0, 0.5, 1}},
+ {[]float64{-3, -2, -1}, []float64{0, 0.5, 1}},
+ {[]float64{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, []float64{0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1}},
+ }
+ for i, test := range tests {
+ if out := normalize(test.input); !reflect.DeepEqual(out, test.expected) {
+ t.Errorf("[%d] normalizeInput(%v) = %v, expected %v", i, test.input, out, test.expected)
+ }
+ }
+}
+
+func TestMedianResolution(t *testing.T) {
+ t.Parallel()
+ tests := []struct {
+ l, expected int
+ }{
+ {1, 10},
+ {1000, 10},
+ {2048, 11},
+ }
+ for i, test := range tests {
+ if v := medianResolution(test.l); v != test.expected {
+ t.Errorf("[%d] medianResolution(%v) = %v, expected %v", i, test.l, v, test.expected)
+ }
+ }
+}
+
+func TestEDMXGolden(t *testing.T) {
+ // This test case is used in the paper.
+ input := []float64{
+ 105.08333, 90.90000, 763.90000, 83.36667, 78.36667, 80.58333,
+ 76.36667, 210.98333, 78.00000, 77.51667, 83.01667, 89.23333,
+ 84.86667, 653.16667, 70.91667, 72.83333, 75.91667, 73.53333,
+ 548.86667, 66.23333, 73.45000, 66.96667, 71.11667, 68.31667,
+ 285.38333, 317.20000, 63.28333, 64.08333, 60.50000, 550.88333,
+ 399.68333, 75.90000, 115.35000, 78.93333, 88.68333, 475.53333,
+ 30.11667, 31.51667, 34.08333, 39.55000, 47.51667, 423.63333,
+ 52.55000, 50.21667, 61.41667, 56.61667, 64.41667, 742.30000,
+ 165.85000, 122.88333, 122.21667, 114.66667, 565.96667, 134.70000,
+ 141.16667, 160.78333, 168.48333, 458.65000, 513.28333, 154.36667,
+ 130.66667, 125.93333, 127.25000, 615.58333, 122.90000, 97.45000,
+ 122.76667, 115.10000, 111.95000, 442.78333, 113.83333, 116.11667,
+ 128.70000, 135.03333, 138.75000, 153.38333, 143.58333, 161.50000,
+ 168.11667, 152.25000, 147.11667, 163.91667, 161.10000, 146.95000,
+ 132.65000, 127.28333, 116.10000, 92.28333, 54.88333, 111.35000,
+ 114.98333, 110.98333, 1015.35000, 774.58333, 232.65000, 134.61667,
+ 130.25000, 98.66667, 102.40000, 184.86667, 258.76667, 70.33333,
+ 81.38333, 81.10000, 89.21667, 536.96667, 85.83333, 95.63333,
+ 76.10000, 94.38333, 73.25000, 346.70000, 65.38333, 84.73333,
+ 140.56667, 120.60000, 121.38333, 359.23333, 55.28333, 54.55000,
+ 52.18333, 56.20000, 112.11667, 208.53333, 49.40000, 49.06667,
+ 56.06667, 54.01667, 63.51667, 344.41667, 42.06667, 55.36667,
+ 55.96667, 55.85000, 56.30000, 46.56667, 49.25000, 43.90000,
+ 357.61667, 44.10000, 44.68333, 43.13333, 40.55000, 452.20000,
+ 47.06667, 40.00000, 42.35000, 48.36667, 44.86667, 48.51667,
+ 244.01667, 50.16667, 48.73333, 47.91667, 51.96667, 343.33333,
+ 35.25000, 45.33333, 46.86667, 48.78333}
+ if v := EDMX(input, 24); v != 95 {
+ t.Errorf("EDMX() = %v, expected 95", v)
+ }
+ if v := EDM(input, 24); v != 47 {
+ t.Errorf("EDM() = %v, expected 47", v)
+ }
+}
diff --git a/internal/stats/edmx.go b/internal/stats/edmx.go
new file mode 100644
index 0000000..59d91af
--- /dev/null
+++ b/internal/stats/edmx.go
@@ -0,0 +1,119 @@
+package stats
+
+import (
+ "container/heap"
+ "math"
+)
+
+type maxHeap []float64
+
+func (h maxHeap) Len() int { return len(h) }
+func (h maxHeap) Less(i, j int) bool { return h[i] > h[j] }
+func (h maxHeap) Swap(i, j int) { h[i], h[j] = h[j], h[i] }
+func (h *maxHeap) Push(x interface{}) {
+ *h = append(*h, x.(float64))
+}
+func (h *maxHeap) Pop() interface{} {
+ old := *h
+ n := len(old)
+ x := old[n-1]
+ *h = old[0 : n-1]
+ return x
+}
+
+type minHeap []float64
+
+func (h minHeap) Len() int { return len(h) }
+func (h minHeap) Less(i, j int) bool { return h[i] < h[j] }
+func (h minHeap) Swap(i, j int) { h[i], h[j] = h[j], h[i] }
+func (h *minHeap) Push(x interface{}) {
+ *h = append(*h, x.(float64))
+}
+func (h *minHeap) Pop() interface{} {
+ old := *h
+ n := len(old)
+ x := old[n-1]
+ *h = old[0 : n-1]
+ return x
+}
+
+// addToHeaps adds a value to the appropriate heap,and keeps their sizes close.
+// This is a direct translation of the addToHeaps function described in the
+// paper.
+func addToHeaps(min *minHeap, max *maxHeap, x float64) {
+ // NB: There's an ambiguity in the paper here related to if-then/else
+ // evaluation. This structure seems to yield the desired results.
+ if min.Len() == 0 || x < (*min)[0] {
+ heap.Push(max, x)
+ } else {
+ heap.Push(min, x)
+ }
+ if min.Len() > max.Len()+1 {
+ heap.Push(max, heap.Pop(min))
+ } else if max.Len() > min.Len()+1 {
+ heap.Push(min, heap.Pop(max))
+ }
+}
+
+// getMedian finds the median of the min and max heaps.
+// This is a direct translation of the getMedian function described in the
+// paper.
+func getMedian(min minHeap, max maxHeap) float64 {
+ if min.Len() > max.Len() {
+ return min[0]
+ }
+ if max.Len() > min.Len() {
+ return max[0]
+ }
+ return (max[0] + min[0]) / 2
+}
+
+// edmx implements the EDM-X algorithm.
+// This algorithm runs in place, modifying the data.
+func edmx(input []float64, delta int) int {
+ input = normalize(input)
+ var lmax, rmax maxHeap
+ var lmin, rmin minHeap
+ heap.Init(&lmax)
+ heap.Init(&rmax)
+ heap.Init(&lmin)
+ heap.Init(&rmin)
+ bestStat := math.Inf(-1)
+ bestLoc := -1
+
+ for i := 0; i < delta-1; i++ {
+ addToHeaps(&lmin, &lmax, input[i])
+ }
+ for i := delta; i < len(input)-delta+1; i++ {
+ addToHeaps(&lmin, &lmax, input[i-1])
+ ml := getMedian(lmin, lmax)
+ rmax, rmin = rmax[:0], rmin[:0]
+ for j := i; j < i+delta-1; j++ {
+ addToHeaps(&rmin, &rmax, input[j])
+ }
+ for j := i + delta; j < len(input)+1; j++ {
+ addToHeaps(&rmin, &rmax, input[j-1])
+ mr := getMedian(rmin, rmax)
+ stat := float64(i*(j-i)) / float64(j) * (ml - mr) * (ml - mr)
+ if stat > bestStat {
+ bestStat = stat
+ bestLoc = i
+ }
+ }
+ }
+
+ return bestLoc
+}
+
+// EDMX runs the EDM-X algorithm on a slice of floats.
+func EDMX(input []float64, delta int) int {
+ // edmx modfies the input, don't do that.
+ c := make([]float64, len(input))
+ copy(c, input)
+ return edmx(c, delta)
+}
+
+// EMDXInt runs the EDM-X algorithm on a slice of integer datapoints.
+func EDMXInt(input []int, delta int) int {
+ return edmx(toFloat(input), delta)
+}
diff --git a/internal/stats/itree.go b/internal/stats/itree.go
new file mode 100644
index 0000000..7a79052
--- /dev/null
+++ b/internal/stats/itree.go
@@ -0,0 +1,91 @@
+package stats
+
+import (
+ "math"
+)
+
+// IntervalTree is a structure used to make caluclating running medians quick.
+// The structure is described in the Section 8/Appendix of the paper.
+type IntervalTree struct {
+ d int
+ vals []int
+}
+
+// NewIntervalTree creates a new IntervalTree of depth d.
+func NewIntervalTree(d int) *IntervalTree {
+ if d < 0 {
+ panic("invalid depth")
+ }
+ return &IntervalTree{
+ d: d,
+ vals: make([]int, (1<<(d+1))-1),
+ }
+}
+
+// walk is a generic function on interval trees to add or remove elements.
+func (it *IntervalTree) walk(v float64, update int) {
+ v = math.Abs(v)
+ mid, inc := 0.5, 0.25
+ idx := 0
+ // Update the levels in the tree.
+ for i := 0; i <= it.d; i++ {
+ it.vals[idx] += update
+ idx = idx*2 + 1
+ if v > mid {
+ mid += inc
+ idx += 1
+ } else {
+ mid -= inc
+ }
+ inc /= 2.
+ }
+}
+
+// Insert puts an element in an interval tree.
+func (it *IntervalTree) Insert(v float64) {
+ it.walk(v, 1)
+}
+
+// Remove removes an element from an interview tree.
+func (it *IntervalTree) Remove(v float64) {
+ it.walk(v, -1)
+}
+
+// Median returns the current median.
+func (it *IntervalTree) Median() float64 {
+ // If empty, special case and return 0.
+ numElements := it.NumElements()
+ if numElements == 0 {
+ return 0
+ }
+
+ l, u := 0., 1.
+ k := int(math.Ceil(float64(numElements) / 2.))
+ for i := 0; i < len(it.vals); {
+ j := 2*i + 1
+ if j >= len(it.vals) {
+ break
+ }
+ if it.vals[i] == k {
+ kf := float64(k)
+ a, b := float64(it.vals[j])/kf, float64(it.vals[j+1])/kf
+ x := (l + (l+u)/2.) / 2.
+ y := (u + (l+u)/2.) / 2.
+ return (a*x + b*y) / (a + b)
+ }
+ if v := it.vals[j]; v >= k {
+ i = j
+ u = (l + u) / 2.
+ } else {
+ k -= v
+ i = j + 1
+ l = (l + u) / 2.
+ }
+ }
+ return (u-l)/2. + l
+}
+
+// NumElements returns the number of elements in the tree.
+func (it *IntervalTree) NumElements() int {
+ return it.vals[0]
+}
diff --git a/internal/stats/itree_test.go b/internal/stats/itree_test.go
new file mode 100644
index 0000000..cf79215
--- /dev/null
+++ b/internal/stats/itree_test.go
@@ -0,0 +1,41 @@
+package stats
+
+import (
+ "reflect"
+ "testing"
+)
+
+func TestIntervalTree(t *testing.T) {
+ t.Parallel()
+ inserts := []struct {
+ v float64
+ tree []int
+ median float64
+ }{
+ // These values were pulled from the appendix of the edm paper where the
+ // interval tree is described.
+ {0.09, []int{1, 1, 0, 1, 0, 0, 0}, 0.25},
+ {0.42, []int{2, 2, 0, 1, 1, 0, 0}, 0.125},
+ {0.99, []int{3, 2, 1, 1, 1, 0, 1}, 0.25},
+ {0.36, []int{4, 3, 1, 1, 2, 0, 1}, 0.375},
+ }
+ tree := NewIntervalTree(2)
+ if l := tree.NumElements(); l != 0 {
+ t.Errorf("tree.Length() = %d, expected 0", l)
+ }
+ if v := tree.Median(); v != 0 {
+ t.Errorf("tree.Median() = %f, expected 0", v)
+ }
+ for i, ins := range inserts {
+ tree.Insert(ins.v)
+ if !reflect.DeepEqual(tree.vals, ins.tree) {
+ t.Errorf("[%d] tree.Insert(%v) = %v, expected %v", i, ins.v, tree.vals, ins.tree)
+ }
+ if l := tree.NumElements(); l != i+1 {
+ t.Errorf("[%d] tree.Length() = %d, expected %d", i, l, i+1)
+ }
+ if v := tree.Median(); v != ins.median {
+ t.Errorf("[%d] tree.Median() = %f, expected %f", i, v, ins.median)
+ }
+ }
+}