benchstat: move bulk of logic into library

Change-Id: I90c73aa77b97b4921e839b376616646604e85a09
Reviewed-on: https://go-review.googlesource.com/35942
Reviewed-by: Quentin Smith <quentin@golang.org>
diff --git a/benchstat/data.go b/benchstat/data.go
new file mode 100644
index 0000000..0284216
--- /dev/null
+++ b/benchstat/data.go
@@ -0,0 +1,169 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package benchstat
+
+import (
+	"fmt"
+	"strconv"
+	"strings"
+
+	"golang.org/x/perf/internal/stats"
+)
+
+// A Collection is a collection of benchmark results.
+type Collection struct {
+	// Configs, Benchmarks, and Units give the set of configs,
+	// benchmarks, and units from the keys in Stats in an order
+	// meant to match the order the benchmarks were read in.
+	Configs, Benchmarks, Units []string
+
+	// Metrics holds the accumulated metrics for each key.
+	Metrics map[Key]*Metrics
+
+	// DeltaTest is the test to use to decide if a change is significant.
+	// If nil, it defaults to UTest.
+	DeltaTest DeltaTest
+
+	// Alpha is the p-value cutoff to report a change as significant.
+	// If zero, it defaults to 0.05.
+	Alpha float64
+
+	// AddGeoMean specifies whether to add a line to the table
+	// showing the geometric mean of all the benchmark results.
+	AddGeoMean bool
+}
+
+// A Key identifies one metric (e.g., "ns/op", "B/op") from one
+// benchmark (function name sans "Benchmark" prefix) in one
+// configuration (input file name).
+type Key struct {
+	Config, Benchmark, Unit string
+}
+
+// A Metrics holds the measurements of a single metric
+// (for example, ns/op or MB/s)
+// for all runs of a particular benchmark.
+type Metrics struct {
+	Unit    string    // unit being measured
+	Values  []float64 // measured values
+	RValues []float64 // Values with outliers removed
+	Min     float64   // min of RValues
+	Mean    float64   // mean of RValues
+	Max     float64   // max of RValues
+}
+
+// FormatMean formats m.Mean using scaler.
+func (m *Metrics) FormatMean(scaler Scaler) string {
+	var s string
+	if scaler != nil {
+		s = scaler(m.Mean)
+	} else {
+		s = fmt.Sprint(m.Mean)
+	}
+	return s
+}
+
+// FormatDiff computes and formats the percent variation of max and min compared to mean.
+// If b.Mean or b.Max is zero, FormatDiff returns an empty string.
+func (m *Metrics) FormatDiff() string {
+	if m.Mean == 0 || m.Max == 0 {
+		return ""
+	}
+	diff := 1 - m.Min/m.Mean
+	if d := m.Max/m.Mean - 1; d > diff {
+		diff = d
+	}
+	return fmt.Sprintf("%.0f%%", diff*100.0)
+}
+
+// Format returns a textual formatting of "Mean ±Diff" using scaler.
+func (m *Metrics) Format(scaler Scaler) string {
+	if m.Unit == "" {
+		return ""
+	}
+	mean := m.FormatMean(scaler)
+	diff := m.FormatDiff()
+	if diff == "" {
+		return mean + "     "
+	}
+	return fmt.Sprintf("%s ±%3s", mean, diff)
+}
+
+// computeStats updates the derived statistics in m from the raw
+// samples in m.Values.
+func (m *Metrics) computeStats() {
+	// Discard outliers.
+	values := stats.Sample{Xs: m.Values}
+	q1, q3 := values.Percentile(0.25), values.Percentile(0.75)
+	lo, hi := q1-1.5*(q3-q1), q3+1.5*(q3-q1)
+	for _, value := range m.Values {
+		if lo <= value && value <= hi {
+			m.RValues = append(m.RValues, value)
+		}
+	}
+
+	// Compute statistics of remaining data.
+	m.Min, m.Max = stats.Bounds(m.RValues)
+	m.Mean = stats.Mean(m.RValues)
+}
+
+// addMetrics returns the metrics with the given key from c,
+// creating a new one if needed.
+func (c *Collection) addMetrics(key Key) *Metrics {
+	if c.Metrics == nil {
+		c.Metrics = make(map[Key]*Metrics)
+	}
+	if stat, ok := c.Metrics[key]; ok {
+		return stat
+	}
+
+	addString := func(strings *[]string, add string) {
+		for _, s := range *strings {
+			if s == add {
+				return
+			}
+		}
+		*strings = append(*strings, add)
+	}
+	addString(&c.Configs, key.Config)
+	addString(&c.Benchmarks, key.Benchmark)
+	addString(&c.Units, key.Unit)
+	m := &Metrics{Unit: key.Unit}
+	c.Metrics[key] = m
+	return m
+}
+
+// AddConfig adds a set of benchmark results from a single configuration to the collection.
+func (c *Collection) AddConfig(config string, data []byte) {
+	c.Configs = append(c.Configs, config)
+	key := Key{Config: config}
+
+	for _, line := range strings.Split(string(data), "\n") {
+		f := strings.Fields(line)
+		if len(f) < 4 {
+			continue
+		}
+		name := f[0]
+		if !strings.HasPrefix(name, "Benchmark") {
+			continue
+		}
+		name = strings.TrimPrefix(name, "Benchmark")
+		n, _ := strconv.Atoi(f[1])
+		if n == 0 {
+			continue
+		}
+
+		key.Benchmark = name
+		for i := 2; i+2 <= len(f); i += 2 {
+			val, err := strconv.ParseFloat(f[i], 64)
+			if err != nil {
+				continue
+			}
+			key.Unit = f[i+1]
+			m := c.addMetrics(key)
+			m.Values = append(m.Values, val)
+		}
+	}
+}
diff --git a/benchstat/delta.go b/benchstat/delta.go
new file mode 100644
index 0000000..2a28653
--- /dev/null
+++ b/benchstat/delta.go
@@ -0,0 +1,72 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Significance tests.
+
+package benchstat
+
+import (
+	"errors"
+
+	"golang.org/x/perf/internal/stats"
+)
+
+// A DeltaTest compares the old and new metrics and returns the
+// expected probability that they are drawn from the same distribution.
+//
+// If a probability cannot be computed, the DeltaTest returns an
+// error explaining why. Common errors include ErrSamplesEqual
+// (all samples are equal), ErrSampleSize (there aren't enough samples),
+// and ErrZeroVariance (the sample has zero variance).
+//
+// As a special case, the missing test NoDeltaTest returns -1, nil.
+type DeltaTest func(old, new *Metrics) (float64, error)
+
+// Errors returned by DeltaTest.
+var (
+	ErrSamplesEqual = errors.New("all equal")
+	ErrSampleSize   = errors.New("too few samples")
+	ErrZeroVariance = errors.New("zero variance")
+)
+
+// NoDeltaTest applies no delta test; it returns -1, nil.
+func NoDeltaTest(old, new *Metrics) (pval float64, err error) {
+	return -1, nil
+}
+
+// TTest is a DeltaTest using the two-sample Welch t-test.
+func TTest(old, new *Metrics) (pval float64, err error) {
+	t, err := stats.TwoSampleWelchTTest(stats.Sample{Xs: old.RValues}, stats.Sample{Xs: new.RValues}, stats.LocationDiffers)
+	if err != nil {
+		return -1, convertErr(err)
+	}
+	return t.P, nil
+}
+
+// UTest is a DeltaTest using the Mann-Whitney U test.
+func UTest(old, new *Metrics) (pval float64, err error) {
+	u, err := stats.MannWhitneyUTest(old.RValues, new.RValues, stats.LocationDiffers)
+	if err != nil {
+		return -1, convertErr(err)
+	}
+	return u.P, nil
+}
+
+// convertErr converts from the stats package's internal errors
+// to errors exported by this package and expected from
+// a DeltaTest.
+// Using different errors makes it possible for clients to use
+// package benchstat without access to the internal stats package,
+// and it also gives us a chance to use shorter error messages.
+func convertErr(err error) error {
+	switch err {
+	case stats.ErrZeroVariance:
+		return ErrZeroVariance
+	case stats.ErrSampleSize:
+		return ErrSampleSize
+	case stats.ErrSamplesEqual:
+		return ErrSamplesEqual
+	}
+	return err
+}
diff --git a/benchstat/html.go b/benchstat/html.go
new file mode 100644
index 0000000..5f03d60
--- /dev/null
+++ b/benchstat/html.go
@@ -0,0 +1,56 @@
+// Copyright 2017 The Go Authors.  All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package benchstat
+
+import (
+	"bytes"
+	"html/template"
+	"strings"
+)
+
+var htmlTemplate = template.Must(template.New("").Funcs(htmlFuncs).Parse(`
+{{with index . 0}}
+<table class='benchstat {{if .OldNewDelta}}oldnew{{end}}'>
+{{if .OldNewDelta -}}
+{{- else if eq (len .Configs) 1}}
+{{- else -}}
+<tr class='configs'><th>{{range .Configs}}<th>{{.}}{{end}}
+{{end}}
+{{end}}
+{{- range $i, $table := .}}
+<tbody>
+{{if .OldNewDelta -}}
+<tr><th><th>old {{.Metric}}<th>new {{.Metric}}<th>delta<th>
+{{else if eq (len .Configs) 1}}
+<tr><th><th>{{.Metric}}
+{{else -}}
+<tr><th><th colspan='{{len .Configs}}' class='metric'>{{.Metric}}
+{{end}}{{range $row := $table.Rows -}}
+{{if $table.OldNewDelta -}}
+<tr class='{{if eq .Change 1}}better{{else if eq .Change -1}}worse{{else}}unchanged{{end}}'>
+{{- else -}}
+<tr>
+{{- end -}}
+<td>{{.Benchmark}}{{range .Metrics}}<td>{{.Format $row.Scaler}}{{end}}{{if $table.OldNewDelta}}<td class='{{if eq .Delta "~"}}nodelta{{else}}delta{{end}}'>{{replace .Delta "-" "−" -1}}<td class='note'>{{.Note}}{{end}}
+{{end -}}
+<tr><td>&nbsp;
+</tbody>
+{{end}}
+</table>
+`))
+
+var htmlFuncs = template.FuncMap{
+	"replace": strings.Replace,
+}
+
+// FormatHTML appends an HTML formatting of the tables to buf.
+func FormatHTML(buf *bytes.Buffer, tables []*Table) {
+	err := htmlTemplate.Execute(buf, tables)
+	if err != nil {
+		// Only possible errors here are template not matching data structure.
+		// Don't make caller check - it's our fault.
+		panic(err)
+	}
+}
diff --git a/benchstat/scaler.go b/benchstat/scaler.go
new file mode 100644
index 0000000..c775caa
--- /dev/null
+++ b/benchstat/scaler.go
@@ -0,0 +1,109 @@
+// Copyright 2017 The Go Authors.  All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package benchstat
+
+import "fmt"
+
+// A Scaler is a function that scales and formats a measurement.
+// All measurements within a given table row are formatted
+// using the same scaler, so that the units are consistent
+// across the row.
+type Scaler func(float64) string
+
+// NewScaler returns a Scaler appropriate for formatting
+// the measurement val, which has the given unit.
+func NewScaler(val float64, unit string) Scaler {
+	if unit == "ns/op" {
+		return timeScaler(val)
+	}
+
+	var format string
+	var scale float64
+	var suffix string
+
+	prescale := 1.0
+	if unit == "MB/s" {
+		prescale = 1e6
+	}
+
+	switch x := val * prescale; {
+	case x >= 99500000000000:
+		format, scale, suffix = "%.0f", 1e12, "T"
+	case x >= 9950000000000:
+		format, scale, suffix = "%.1f", 1e12, "T"
+	case x >= 995000000000:
+		format, scale, suffix = "%.2f", 1e12, "T"
+	case x >= 99500000000:
+		format, scale, suffix = "%.0f", 1e9, "G"
+	case x >= 9950000000:
+		format, scale, suffix = "%.1f", 1e9, "G"
+	case x >= 995000000:
+		format, scale, suffix = "%.2f", 1e9, "G"
+	case x >= 99500000:
+		format, scale, suffix = "%.0f", 1e6, "M"
+	case x >= 9950000:
+		format, scale, suffix = "%.1f", 1e6, "M"
+	case x >= 995000:
+		format, scale, suffix = "%.2f", 1e6, "M"
+	case x >= 99500:
+		format, scale, suffix = "%.0f", 1e3, "k"
+	case x >= 9950:
+		format, scale, suffix = "%.1f", 1e3, "k"
+	case x >= 995:
+		format, scale, suffix = "%.2f", 1e3, "k"
+	case x >= 99.5:
+		format, scale, suffix = "%.0f", 1, ""
+	case x >= 9.95:
+		format, scale, suffix = "%.1f", 1, ""
+	default:
+		format, scale, suffix = "%.2f", 1, ""
+	}
+
+	if unit == "B/op" {
+		suffix += "B"
+	}
+	if unit == "MB/s" {
+		suffix += "B/s"
+	}
+	scale /= prescale
+
+	return func(val float64) string {
+		return fmt.Sprintf(format+suffix, val/scale)
+	}
+}
+
+func timeScaler(ns float64) Scaler {
+	var format string
+	var scale float64
+	switch x := ns / 1e9; {
+	case x >= 99.5:
+		format, scale = "%.0fs", 1
+	case x >= 9.95:
+		format, scale = "%.1fs", 1
+	case x >= 0.995:
+		format, scale = "%.2fs", 1
+	case x >= 0.0995:
+		format, scale = "%.0fms", 1000
+	case x >= 0.00995:
+		format, scale = "%.1fms", 1000
+	case x >= 0.000995:
+		format, scale = "%.2fms", 1000
+	case x >= 0.0000995:
+		format, scale = "%.0fµs", 1000*1000
+	case x >= 0.00000995:
+		format, scale = "%.1fµs", 1000*1000
+	case x >= 0.000000995:
+		format, scale = "%.2fµs", 1000*1000
+	case x >= 0.0000000995:
+		format, scale = "%.0fns", 1000*1000*1000
+	case x >= 0.00000000995:
+		format, scale = "%.1fns", 1000*1000*1000
+	default:
+		format, scale = "%.2fns", 1000*1000*1000
+	}
+	return func(ns float64) string {
+		return fmt.Sprintf(format, ns/1e9*scale)
+	}
+}
diff --git a/benchstat/table.go b/benchstat/table.go
new file mode 100644
index 0000000..e5249c1
--- /dev/null
+++ b/benchstat/table.go
@@ -0,0 +1,167 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package benchstat
+
+import (
+	"fmt"
+
+	"golang.org/x/perf/internal/stats"
+)
+
+// A Table is a table for display in the benchstat output.
+type Table struct {
+	Metric      string
+	OldNewDelta bool // is this an old-new-delta table?
+	Configs     []string
+	Rows        []*Row
+}
+
+// A Row is a table row for display in the benchstat output.
+type Row struct {
+	Benchmark string     // benchmark name
+	Scaler    Scaler     // formatter for stats means
+	Metrics   []*Metrics // columns of statistics
+	Delta     string     // formatted percent change
+	Note      string     // additional information
+	Change    int        // +1 better, -1 worse, 0 unchanged
+}
+
+// Tables returns tables comparing the benchmarks in the collection.
+func (c *Collection) Tables() []*Table {
+	deltaTest := c.DeltaTest
+	if deltaTest == nil {
+		deltaTest = UTest
+	}
+	alpha := c.Alpha
+	if alpha == 0 {
+		alpha = 0.05
+	}
+
+	// Update statistics.
+	for _, m := range c.Metrics {
+		m.computeStats()
+	}
+
+	var tables []*Table
+	key := Key{}
+	for _, key.Unit = range c.Units {
+		table := new(Table)
+		table.Configs = c.Configs
+		table.Metric = metricOf(key.Unit)
+		table.OldNewDelta = len(c.Configs) == 2
+		for _, key.Benchmark = range c.Benchmarks {
+			row := &Row{Benchmark: key.Benchmark}
+
+			for _, key.Config = range c.Configs {
+				m := c.Metrics[key]
+				if m == nil {
+					row.Metrics = append(row.Metrics, new(Metrics))
+					continue
+				}
+				row.Metrics = append(row.Metrics, m)
+				if row.Scaler == nil {
+					row.Scaler = NewScaler(m.Mean, m.Unit)
+				}
+			}
+
+			// If there are only two configs being compared, add stats.
+			if table.OldNewDelta {
+				k0 := key
+				k0.Config = c.Configs[0]
+				k1 := key
+				k1.Config = c.Configs[1]
+				old := c.Metrics[k0]
+				new := c.Metrics[k1]
+				// If one is missing, omit row entirely.
+				// TODO: Control this better.
+				if old == new || new == nil {
+					continue
+				}
+				pval, testerr := deltaTest(old, new)
+				row.Delta = "~"
+				if testerr == stats.ErrZeroVariance {
+					row.Note = "(zero variance)"
+				} else if testerr == stats.ErrSampleSize {
+					row.Note = "(too few samples)"
+				} else if testerr == stats.ErrSamplesEqual {
+					row.Note = "(all equal)"
+				} else if testerr != nil {
+					row.Note = fmt.Sprintf("(%s)", testerr)
+				} else if pval < alpha {
+					pct := ((new.Mean / old.Mean) - 1.0) * 100.0
+					row.Delta = fmt.Sprintf("%+.2f%%", pct)
+					if pct < 0 == (table.Metric != "speed") { // smaller is better, except speeds
+						row.Change = +1
+					} else {
+						row.Change = -1
+					}
+				}
+				if row.Note == "" && pval != -1 {
+					row.Note = fmt.Sprintf("(p=%0.3f n=%d+%d)", pval, len(old.RValues), len(new.RValues))
+				}
+			}
+
+			table.Rows = append(table.Rows, row)
+		}
+
+		if len(table.Rows) > 0 {
+			if c.AddGeoMean {
+				addGeomean(c, table, key.Unit, table.OldNewDelta)
+			}
+			tables = append(tables, table)
+		}
+	}
+
+	return tables
+}
+
+// metricOf returns the name of the metric with the given unit.
+func metricOf(unit string) string {
+	switch unit {
+	case "ns/op":
+		return "time/op"
+	case "B/op":
+		return "alloc/op"
+	case "MB/s":
+		return "speed"
+	default:
+		return unit
+	}
+}
+
+// addGeomean adds a "geomean" row to the table,
+// showing the geometric mean of all the benchmarks.
+func addGeomean(c *Collection, t *Table, unit string, delta bool) {
+	row := &Row{Benchmark: "[Geo mean]"}
+	key := Key{Unit: unit}
+	geomeans := []float64{}
+	for _, key.Config = range c.Configs {
+		var means []float64
+		for _, key.Benchmark = range c.Benchmarks {
+			m := c.Metrics[key]
+			if m != nil {
+				means = append(means, m.Mean)
+			}
+		}
+		if len(means) == 0 {
+			row.Metrics = append(row.Metrics, new(Metrics))
+			delta = false
+		} else {
+			geomean := stats.GeoMean(means)
+			geomeans = append(geomeans, geomean)
+			if row.Scaler == nil {
+				row.Scaler = NewScaler(geomean, unit)
+			}
+			row.Metrics = append(row.Metrics, &Metrics{
+				Unit: unit,
+				Mean: geomean,
+			})
+		}
+	}
+	if delta {
+		row.Delta = fmt.Sprintf("%+.2f%%", ((geomeans[1]/geomeans[0])-1.0)*100.0)
+	}
+	t.Rows = append(t.Rows, row)
+}
diff --git a/benchstat/text.go b/benchstat/text.go
new file mode 100644
index 0000000..36d2d50
--- /dev/null
+++ b/benchstat/text.go
@@ -0,0 +1,126 @@
+// Copyright 2017 The Go Authors.  All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package benchstat
+
+import (
+	"bytes"
+	"fmt"
+	"unicode/utf8"
+)
+
+// FormatText appends a fixed-width text formatting of the tables to buf.
+func FormatText(buf *bytes.Buffer, tables []*Table) {
+	var textTables [][]*textRow
+	for _, t := range tables {
+		textTables = append(textTables, toText(t))
+	}
+
+	var max []int
+	for _, table := range textTables {
+		for _, row := range table {
+			for len(max) < len(row.cols) {
+				max = append(max, 0)
+			}
+			for i, s := range row.cols {
+				n := utf8.RuneCountInString(s)
+				if max[i] < n {
+					max[i] = n
+				}
+			}
+		}
+	}
+
+	for i, table := range textTables {
+		if i > 0 {
+			fmt.Fprintf(buf, "\n")
+		}
+
+		// headings
+		row := table[0]
+		for i, s := range row.cols {
+			switch i {
+			case 0:
+				fmt.Fprintf(buf, "%-*s", max[i], s)
+			default:
+				fmt.Fprintf(buf, "  %-*s", max[i], s)
+			case len(row.cols) - 1:
+				fmt.Fprintf(buf, "  %s\n", s)
+			}
+		}
+
+		// data
+		for _, row := range table[1:] {
+			for i, s := range row.cols {
+				switch i {
+				case 0:
+					fmt.Fprintf(buf, "%-*s", max[i], s)
+				default:
+					if i == len(row.cols)-1 && len(s) > 0 && s[0] == '(' {
+						// Left-align p value.
+						fmt.Fprintf(buf, "  %s", s)
+						break
+					}
+					fmt.Fprintf(buf, "  %*s", max[i], s)
+				}
+			}
+			fmt.Fprintf(buf, "\n")
+		}
+	}
+}
+
+// A textRow is a row of printed text columns.
+type textRow struct {
+	cols []string
+}
+
+func newTextRow(cols ...string) *textRow {
+	return &textRow{cols: cols}
+}
+
+func (r *textRow) add(col string) {
+	r.cols = append(r.cols, col)
+}
+
+func (r *textRow) trim() {
+	for len(r.cols) > 0 && r.cols[len(r.cols)-1] == "" {
+		r.cols = r.cols[:len(r.cols)-1]
+	}
+}
+
+// toText converts the Table to a textual grid of cells,
+// which can then be printed in fixed-width output.
+func toText(t *Table) []*textRow {
+	var textRows []*textRow
+	switch len(t.Configs) {
+	case 1:
+		textRows = append(textRows, newTextRow("name", t.Metric))
+	case 2:
+		textRows = append(textRows, newTextRow("name", "old "+t.Metric, "new "+t.Metric, "delta"))
+	default:
+		row := newTextRow("name \\ " + t.Metric)
+		row.cols = append(row.cols, t.Configs...)
+		textRows = append(textRows, row)
+	}
+
+	for _, row := range t.Rows {
+		text := newTextRow(row.Benchmark)
+		for _, m := range row.Metrics {
+			text.cols = append(text.cols, m.Format(row.Scaler))
+		}
+		if len(t.Configs) == 2 {
+			delta := row.Delta
+			if delta == "~" {
+				delta = "~   "
+			}
+			text.cols = append(text.cols, delta)
+			text.cols = append(text.cols, row.Note)
+		}
+		textRows = append(textRows, text)
+	}
+	for _, r := range textRows {
+		r.trim()
+	}
+	return textRows
+}