internal/stats: add edm/edmx detectors Add single-instance edm/edmx breakout detectors. These algorithms are described in the paper: https://courses.cit.cornell.edu/nj89/docs/edm.pdf This implementation was compared against an open-source R implementation, and yielded the same breakouts for a wide range of data from cmd/bent. Change-Id: I088e37e6444e3c9d3275a15d04b5cecf6e1d2fe9 Reviewed-on: https://go-review.googlesource.com/c/benchmarks/+/292549 TryBot-Result: Go Bot <gobot@golang.org> Trust: Jeremy Faller <jeremy@golang.org> Run-TryBot: Jeremy Faller <jeremy@golang.org> Reviewed-by: David Chase <drchase@google.com>

commit: b64cdd10b5d92cde38c83cc1dcfd939e7f892fac [log] [tgz]
author: Jeremy Faller <jeremy@golang.org> Tue Feb 16 10:48:41 2021 -0500
committer: David Chase <drchase@google.com> Tue May 25 14:27:49 2021 +0000
tree: 9752ecde4636128e9311e6b5ee080f85062aea44
parent: f3eefda5d1b2cd04f6d90d8ccf885dc98b96a446 [diff]
diff --git a/internal/stats/edm.go b/internal/stats/edm.go
new file mode 100644
index 0000000..70e0c80
--- /dev/null
+++ b/internal/stats/edm.go

@@ -0,0 +1,218 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package stats
+
+import (
+	"math"
+)
+
+// edm carries data to calculate e-divisive with medians.
+type edm struct {
+	z           []float64
+	delta       int
+	bestStat    float64
+	bestIdx     int
+	tau         int
+	ta, tb, tab *IntervalTree
+}
+
+// normalize normalizes an slice of floats in place.
+func normalize(input []float64) []float64 {
+	// gracefully handle empty inputs.
+	if len(input) == 0 {
+		return input
+	}
+	min, max := input[0], input[0]
+	for _, v := range input {
+		if v > max {
+			max = v
+		}
+		if v < min {
+			min = v
+		}
+	}
+	for i, v := range input {
+		input[i] = (v - min) / (max - min)
+	}
+	return input
+}
+
+// medianResolution finds a good compromise for the approximate median, as
+// described in the paper.
+func medianResolution(l int) int {
+	d := 10 // min resolution 1/(1<<d).
+	if l := int(math.Ceil(math.Log2(float64(l)))); l > d {
+		d = l
+	}
+	return d
+}
+
+// toFloat converts an slice of integers to float64.
+func toFloat(input []int) []float64 {
+	output := make([]float64, len(input))
+	for i, v := range input {
+		output[i] = float64(v)
+	}
+	return output
+}
+
+// stat calculates the edx stat for a given location, saving it if it's better
+// than the currently stored statistic.
+func (e *edm) stat(tau2 int) float64 {
+	a, b, c := e.ta.Median(), e.tb.Median(), e.tab.Median()
+	a, b, c = a*a, b*b, c*c
+	stat := 2*c - a - b
+	stat *= float64(e.tau*(tau2-e.tau)) / float64(tau2)
+	if stat > e.bestStat {
+		e.bestStat = stat
+		e.bestIdx = e.tau
+	}
+	return stat
+}
+
+// calc handles the brunt of the E-divisive with median algorithm described in:
+// https://courses.cit.cornell.edu/nj89/docs/edm.pdf.
+func (e *edm) calc() int {
+	normalize(e.z)
+
+	e.bestStat = math.Inf(-1)
+	e.tau = e.delta
+	tau2 := 2 * e.delta
+
+	d := medianResolution(len(e.z))
+	e.ta = NewIntervalTree(d)
+	e.tb = NewIntervalTree(d)
+	e.tab = NewIntervalTree(d)
+
+	for i := 0; i < e.tau; i++ {
+		for j := i + 1; j < e.tau; j++ {
+			e.ta.Insert(e.z[i] - e.z[j])
+		}
+	}
+
+	for i := e.tau; i < tau2; i++ {
+		for j := i + 1; j < tau2; j++ {
+			e.tb.Insert(e.z[i] - e.z[j])
+		}
+	}
+
+	for i := 0; i < e.tau; i++ {
+		for j := e.tau; j < tau2; j++ {
+			e.tab.Insert(e.z[i] - e.z[j])
+		}
+	}
+
+	tau2 += 1
+	for ; tau2 < len(e.z)+1; tau2++ {
+		e.tb.Insert(e.z[tau2-1] - e.z[tau2-2])
+		e.stat(tau2)
+	}
+
+	forward := false
+	for e.tau < len(e.z)-e.delta {
+		if forward {
+			e.forwardUpdate()
+		} else {
+			e.backwardUpdate()
+		}
+		forward = !forward
+	}
+
+	return e.bestIdx
+}
+
+func (e *edm) forwardUpdate() {
+	tau2 := e.tau + e.delta
+	e.tau += 1
+
+	for i := e.tau - e.delta; i < e.tau-1; i++ {
+		e.ta.Insert(e.z[i] - e.z[e.tau-1])
+	}
+	for i := e.tau - e.delta; i < e.tau; i++ {
+		e.ta.Remove(e.z[i] - e.z[e.tau-e.delta-1])
+	}
+	e.ta.Insert(e.z[e.tau-e.delta-1] - e.z[e.tau-e.delta])
+
+	e.tab.Remove(e.z[e.tau-1] - e.z[e.tau-e.delta-1])
+	for i := e.tau; i < tau2; i++ {
+		e.tab.Remove(e.z[i] - e.z[e.tau-e.delta-1])
+		e.tab.Insert(e.z[i] - e.z[e.tau-1])
+	}
+	for i := e.tau - e.delta; i < e.tau-1; i++ {
+		e.tab.Remove(e.z[i] - e.z[e.tau-1])
+		e.tab.Insert(e.z[i] - e.z[tau2])
+	}
+	e.tab.Insert(e.z[e.tau-1] - e.z[tau2])
+
+	for i := e.tau; i < tau2; i++ {
+		e.tb.Remove(e.z[i] - e.z[e.tau-1])
+		e.tb.Insert(e.z[i] - e.z[tau2])
+	}
+
+	tau2 += 1
+	for ; tau2 < len(e.z)+1; tau2++ {
+		e.tb.Insert(e.z[tau2-1] - e.z[tau2-2])
+		e.stat(tau2)
+	}
+}
+
+func (e *edm) backwardUpdate() {
+	tau2 := e.tau + e.delta
+	e.tau += 1
+
+	for i := e.tau - e.delta; i < e.tau-1; i++ {
+		e.ta.Insert(e.z[i] - e.z[e.tau-1])
+	}
+	for i := e.tau - e.delta; i < e.tau; i++ {
+		e.ta.Remove(e.z[i] - e.z[e.tau-e.delta-1])
+	}
+	e.ta.Insert(e.z[e.tau-e.delta-1] - e.z[e.tau-e.delta])
+
+	e.tab.Remove(e.z[e.tau-1] - e.z[e.tau-e.delta-1])
+	for i := e.tau; i < tau2; i++ {
+		e.tab.Remove(e.z[i] - e.z[e.tau-e.delta-1])
+		e.tab.Insert(e.z[i] - e.z[e.tau-1])
+	}
+	for i := e.tau - e.delta; i < e.tau-1; i++ {
+		e.tab.Remove(e.z[i] - e.z[e.tau-1])
+		e.tab.Insert(e.z[i] - e.z[tau2])
+	}
+	e.tab.Insert(e.z[e.tau-1] - e.z[tau2])
+
+	for i := e.tau; i < e.tau+e.delta-1; i++ {
+		e.tb.Insert(e.z[e.tau+e.delta-1] - e.z[i])
+		e.tb.Remove(e.z[i] - e.z[e.tau-1])
+	}
+
+	for tau2 = len(e.z); tau2 >= e.tau+e.delta; tau2-- {
+		e.tb.Remove(e.z[tau2-1] - e.z[tau2-2])
+		e.stat(tau2)
+	}
+}
+
+// EDM performs the approximate E-divisive with means calculation as described
+// in https://courses.cit.cornell.edu/nj89/docs/edm.pdf.
+//
+// EDM is an algorithm for finding breakout points in a time-series data set.
+// The function accepts the input vector, and a window width that describes the
+// search window during which the median breakout must occur. The window, ∂,
+// is described in the paper.
+//
+// Note this algorithm uses the interval tree median approach as described in
+// the paper. The medians used will have a resolution of 1/(2^d), where d is
+// max(log2(len(input)), 10). That is the value recommended in the paper.
+func EDM(input []float64, delta int) int {
+	// edm modifies the input, we don't want to do that to our callers.
+	c := make([]float64, len(input))
+	copy(c, input)
+	e := &edm{z: c, delta: delta}
+	return e.calc()
+}
+
+// EDMInt is the integer version of EDM.
+func EDMInt(input []int, delta int) int {
+	e := &edm{z: toFloat(input), delta: delta}
+	return e.calc()
+}

diff --git a/internal/stats/edm_test.go b/internal/stats/edm_test.go
new file mode 100644
index 0000000..cfdfb8c
--- /dev/null
+++ b/internal/stats/edm_test.go

@@ -0,0 +1,83 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package stats
+
+import (
+	"reflect"
+	"testing"
+)
+
+func TestNormalize(t *testing.T) {
+	t.Parallel()
+	tests := []struct {
+		input, expected []float64
+	}{
+		{[]float64{}, []float64{}},
+		{[]float64{0, 1}, []float64{0, 1}},
+		{[]float64{1, 2, 3}, []float64{0, 0.5, 1}},
+		{[]float64{-1, 0, 1}, []float64{0, 0.5, 1}},
+		{[]float64{-3, -2, -1}, []float64{0, 0.5, 1}},
+		{[]float64{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, []float64{0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1}},
+	}
+	for i, test := range tests {
+		if out := normalize(test.input); !reflect.DeepEqual(out, test.expected) {
+			t.Errorf("[%d] normalizeInput(%v) = %v, expected %v", i, test.input, out, test.expected)
+		}
+	}
+}
+
+func TestMedianResolution(t *testing.T) {
+	t.Parallel()
+	tests := []struct {
+		l, expected int
+	}{
+		{1, 10},
+		{1000, 10},
+		{2048, 11},
+	}
+	for i, test := range tests {
+		if v := medianResolution(test.l); v != test.expected {
+			t.Errorf("[%d] medianResolution(%v) = %v, expected %v", i, test.l, v, test.expected)
+		}
+	}
+}
+
+func TestEDMXGolden(t *testing.T) {
+	// This test case is used in the paper.
+	input := []float64{
+		105.08333, 90.90000, 763.90000, 83.36667, 78.36667, 80.58333,
+		76.36667, 210.98333, 78.00000, 77.51667, 83.01667, 89.23333,
+		84.86667, 653.16667, 70.91667, 72.83333, 75.91667, 73.53333,
+		548.86667, 66.23333, 73.45000, 66.96667, 71.11667, 68.31667,
+		285.38333, 317.20000, 63.28333, 64.08333, 60.50000, 550.88333,
+		399.68333, 75.90000, 115.35000, 78.93333, 88.68333, 475.53333,
+		30.11667, 31.51667, 34.08333, 39.55000, 47.51667, 423.63333,
+		52.55000, 50.21667, 61.41667, 56.61667, 64.41667, 742.30000,
+		165.85000, 122.88333, 122.21667, 114.66667, 565.96667, 134.70000,
+		141.16667, 160.78333, 168.48333, 458.65000, 513.28333, 154.36667,
+		130.66667, 125.93333, 127.25000, 615.58333, 122.90000, 97.45000,
+		122.76667, 115.10000, 111.95000, 442.78333, 113.83333, 116.11667,
+		128.70000, 135.03333, 138.75000, 153.38333, 143.58333, 161.50000,
+		168.11667, 152.25000, 147.11667, 163.91667, 161.10000, 146.95000,
+		132.65000, 127.28333, 116.10000, 92.28333, 54.88333, 111.35000,
+		114.98333, 110.98333, 1015.35000, 774.58333, 232.65000, 134.61667,
+		130.25000, 98.66667, 102.40000, 184.86667, 258.76667, 70.33333,
+		81.38333, 81.10000, 89.21667, 536.96667, 85.83333, 95.63333,
+		76.10000, 94.38333, 73.25000, 346.70000, 65.38333, 84.73333,
+		140.56667, 120.60000, 121.38333, 359.23333, 55.28333, 54.55000,
+		52.18333, 56.20000, 112.11667, 208.53333, 49.40000, 49.06667,
+		56.06667, 54.01667, 63.51667, 344.41667, 42.06667, 55.36667,
+		55.96667, 55.85000, 56.30000, 46.56667, 49.25000, 43.90000,
+		357.61667, 44.10000, 44.68333, 43.13333, 40.55000, 452.20000,
+		47.06667, 40.00000, 42.35000, 48.36667, 44.86667, 48.51667,
+		244.01667, 50.16667, 48.73333, 47.91667, 51.96667, 343.33333,
+		35.25000, 45.33333, 46.86667, 48.78333}
+	if v := EDMX(input, 24); v != 95 {
+		t.Errorf("EDMX() = %v, expected 95", v)
+	}
+	if v := EDM(input, 24); v != 47 {
+		t.Errorf("EDM() = %v, expected 47", v)
+	}
+}

diff --git a/internal/stats/edmx.go b/internal/stats/edmx.go
new file mode 100644
index 0000000..59d91af
--- /dev/null
+++ b/internal/stats/edmx.go

@@ -0,0 +1,119 @@
+package stats
+
+import (
+	"container/heap"
+	"math"
+)
+
+type maxHeap []float64
+
+func (h maxHeap) Len() int           { return len(h) }
+func (h maxHeap) Less(i, j int) bool { return h[i] > h[j] }
+func (h maxHeap) Swap(i, j int)      { h[i], h[j] = h[j], h[i] }
+func (h *maxHeap) Push(x interface{}) {
+	*h = append(*h, x.(float64))
+}
+func (h *maxHeap) Pop() interface{} {
+	old := *h
+	n := len(old)
+	x := old[n-1]
+	*h = old[0 : n-1]
+	return x
+}
+
+type minHeap []float64
+
+func (h minHeap) Len() int           { return len(h) }
+func (h minHeap) Less(i, j int) bool { return h[i] < h[j] }
+func (h minHeap) Swap(i, j int)      { h[i], h[j] = h[j], h[i] }
+func (h *minHeap) Push(x interface{}) {
+	*h = append(*h, x.(float64))
+}
+func (h *minHeap) Pop() interface{} {
+	old := *h
+	n := len(old)
+	x := old[n-1]
+	*h = old[0 : n-1]
+	return x
+}
+
+// addToHeaps adds a value to the appropriate heap,and keeps their sizes close.
+// This is a direct translation of the addToHeaps function described in the
+// paper.
+func addToHeaps(min *minHeap, max *maxHeap, x float64) {
+	// NB: There's an ambiguity in the paper here related to if-then/else
+	// evaluation. This structure seems to yield the desired results.
+	if min.Len() == 0 || x < (*min)[0] {
+		heap.Push(max, x)
+	} else {
+		heap.Push(min, x)
+	}
+	if min.Len() > max.Len()+1 {
+		heap.Push(max, heap.Pop(min))
+	} else if max.Len() > min.Len()+1 {
+		heap.Push(min, heap.Pop(max))
+	}
+}
+
+// getMedian finds the median of the min and max heaps.
+// This is a direct translation of the getMedian function described in the
+// paper.
+func getMedian(min minHeap, max maxHeap) float64 {
+	if min.Len() > max.Len() {
+		return min[0]
+	}
+	if max.Len() > min.Len() {
+		return max[0]
+	}
+	return (max[0] + min[0]) / 2
+}
+
+// edmx implements the EDM-X algorithm.
+// This algorithm runs in place, modifying the data.
+func edmx(input []float64, delta int) int {
+	input = normalize(input)
+	var lmax, rmax maxHeap
+	var lmin, rmin minHeap
+	heap.Init(&lmax)
+	heap.Init(&rmax)
+	heap.Init(&lmin)
+	heap.Init(&rmin)
+	bestStat := math.Inf(-1)
+	bestLoc := -1
+
+	for i := 0; i < delta-1; i++ {
+		addToHeaps(&lmin, &lmax, input[i])
+	}
+	for i := delta; i < len(input)-delta+1; i++ {
+		addToHeaps(&lmin, &lmax, input[i-1])
+		ml := getMedian(lmin, lmax)
+		rmax, rmin = rmax[:0], rmin[:0]
+		for j := i; j < i+delta-1; j++ {
+			addToHeaps(&rmin, &rmax, input[j])
+		}
+		for j := i + delta; j < len(input)+1; j++ {
+			addToHeaps(&rmin, &rmax, input[j-1])
+			mr := getMedian(rmin, rmax)
+			stat := float64(i*(j-i)) / float64(j) * (ml - mr) * (ml - mr)
+			if stat > bestStat {
+				bestStat = stat
+				bestLoc = i
+			}
+		}
+	}
+
+	return bestLoc
+}
+
+// EDMX runs the EDM-X algorithm on a slice of floats.
+func EDMX(input []float64, delta int) int {
+	// edmx modfies the input, don't do that.
+	c := make([]float64, len(input))
+	copy(c, input)
+	return edmx(c, delta)
+}
+
+// EMDXInt runs the EDM-X algorithm on a slice of integer datapoints.
+func EDMXInt(input []int, delta int) int {
+	return edmx(toFloat(input), delta)
+}

diff --git a/internal/stats/itree.go b/internal/stats/itree.go
new file mode 100644
index 0000000..7a79052
--- /dev/null
+++ b/internal/stats/itree.go

@@ -0,0 +1,91 @@
+package stats
+
+import (
+	"math"
+)
+
+// IntervalTree is a structure used to make caluclating running medians quick.
+// The structure is described in the Section 8/Appendix of the paper.
+type IntervalTree struct {
+	d    int
+	vals []int
+}
+
+// NewIntervalTree creates a new IntervalTree of depth d.
+func NewIntervalTree(d int) *IntervalTree {
+	if d < 0 {
+		panic("invalid depth")
+	}
+	return &IntervalTree{
+		d:    d,
+		vals: make([]int, (1<<(d+1))-1),
+	}
+}
+
+// walk is a generic function on interval trees to add or remove elements.
+func (it *IntervalTree) walk(v float64, update int) {
+	v = math.Abs(v)
+	mid, inc := 0.5, 0.25
+	idx := 0
+	// Update the levels in the tree.
+	for i := 0; i <= it.d; i++ {
+		it.vals[idx] += update
+		idx = idx*2 + 1
+		if v > mid {
+			mid += inc
+			idx += 1
+		} else {
+			mid -= inc
+		}
+		inc /= 2.
+	}
+}
+
+// Insert puts an element in an interval tree.
+func (it *IntervalTree) Insert(v float64) {
+	it.walk(v, 1)
+}
+
+// Remove removes an element from an interview tree.
+func (it *IntervalTree) Remove(v float64) {
+	it.walk(v, -1)
+}
+
+// Median returns the current median.
+func (it *IntervalTree) Median() float64 {
+	// If empty, special case and return 0.
+	numElements := it.NumElements()
+	if numElements == 0 {
+		return 0
+	}
+
+	l, u := 0., 1.
+	k := int(math.Ceil(float64(numElements) / 2.))
+	for i := 0; i < len(it.vals); {
+		j := 2*i + 1
+		if j >= len(it.vals) {
+			break
+		}
+		if it.vals[i] == k {
+			kf := float64(k)
+			a, b := float64(it.vals[j])/kf, float64(it.vals[j+1])/kf
+			x := (l + (l+u)/2.) / 2.
+			y := (u + (l+u)/2.) / 2.
+			return (a*x + b*y) / (a + b)
+		}
+		if v := it.vals[j]; v >= k {
+			i = j
+			u = (l + u) / 2.
+		} else {
+			k -= v
+			i = j + 1
+			l = (l + u) / 2.
+		}
+	}
+	return (u-l)/2. + l
+}
+
+// NumElements returns the number of elements in the tree.
+func (it *IntervalTree) NumElements() int {
+	return it.vals[0]
+}

diff --git a/internal/stats/itree_test.go b/internal/stats/itree_test.go
new file mode 100644
index 0000000..cf79215
--- /dev/null
+++ b/internal/stats/itree_test.go

@@ -0,0 +1,41 @@
+package stats
+
+import (
+	"reflect"
+	"testing"
+)
+
+func TestIntervalTree(t *testing.T) {
+	t.Parallel()
+	inserts := []struct {
+		v      float64
+		tree   []int
+		median float64
+	}{
+		// These values were pulled from the appendix of the edm paper where the
+		// interval tree is described.
+		{0.09, []int{1, 1, 0, 1, 0, 0, 0}, 0.25},
+		{0.42, []int{2, 2, 0, 1, 1, 0, 0}, 0.125},
+		{0.99, []int{3, 2, 1, 1, 1, 0, 1}, 0.25},
+		{0.36, []int{4, 3, 1, 1, 2, 0, 1}, 0.375},
+	}
+	tree := NewIntervalTree(2)
+	if l := tree.NumElements(); l != 0 {
+		t.Errorf("tree.Length() = %d, expected 0", l)
+	}
+	if v := tree.Median(); v != 0 {
+		t.Errorf("tree.Median() = %f, expected 0", v)
+	}
+	for i, ins := range inserts {
+		tree.Insert(ins.v)
+		if !reflect.DeepEqual(tree.vals, ins.tree) {
+			t.Errorf("[%d] tree.Insert(%v) = %v, expected %v", i, ins.v, tree.vals, ins.tree)
+		}
+		if l := tree.NumElements(); l != i+1 {
+			t.Errorf("[%d] tree.Length() = %d, expected %d", i, l, i+1)
+		}
+		if v := tree.Median(); v != ins.median {
+			t.Errorf("[%d] tree.Median() = %f, expected %f", i, v, ins.median)
+		}
+	}
+}
commit	b64cdd10b5d92cde38c83cc1dcfd939e7f892fac	[log] [tgz]
author	Jeremy Faller <jeremy@golang.org>	Tue Feb 16 10:48:41 2021 -0500
committer	David Chase <drchase@google.com>	Tue May 25 14:27:49 2021 +0000
tree	9752ecde4636128e9311e6b5ee080f85062aea44
parent	f3eefda5d1b2cd04f6d90d8ccf885dc98b96a446 [diff]