benchstat: move bulk of logic into library
Change-Id: I90c73aa77b97b4921e839b376616646604e85a09
Reviewed-on: https://go-review.googlesource.com/35942
Reviewed-by: Quentin Smith <quentin@golang.org>
diff --git a/benchstat/data.go b/benchstat/data.go
new file mode 100644
index 0000000..0284216
--- /dev/null
+++ b/benchstat/data.go
@@ -0,0 +1,169 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package benchstat
+
+import (
+ "fmt"
+ "strconv"
+ "strings"
+
+ "golang.org/x/perf/internal/stats"
+)
+
+// A Collection is a collection of benchmark results.
+type Collection struct {
+ // Configs, Benchmarks, and Units give the set of configs,
+ // benchmarks, and units from the keys in Stats in an order
+ // meant to match the order the benchmarks were read in.
+ Configs, Benchmarks, Units []string
+
+ // Metrics holds the accumulated metrics for each key.
+ Metrics map[Key]*Metrics
+
+ // DeltaTest is the test to use to decide if a change is significant.
+ // If nil, it defaults to UTest.
+ DeltaTest DeltaTest
+
+ // Alpha is the p-value cutoff to report a change as significant.
+ // If zero, it defaults to 0.05.
+ Alpha float64
+
+ // AddGeoMean specifies whether to add a line to the table
+ // showing the geometric mean of all the benchmark results.
+ AddGeoMean bool
+}
+
+// A Key identifies one metric (e.g., "ns/op", "B/op") from one
+// benchmark (function name sans "Benchmark" prefix) in one
+// configuration (input file name).
+type Key struct {
+ Config, Benchmark, Unit string
+}
+
+// A Metrics holds the measurements of a single metric
+// (for example, ns/op or MB/s)
+// for all runs of a particular benchmark.
+type Metrics struct {
+ Unit string // unit being measured
+ Values []float64 // measured values
+ RValues []float64 // Values with outliers removed
+ Min float64 // min of RValues
+ Mean float64 // mean of RValues
+ Max float64 // max of RValues
+}
+
+// FormatMean formats m.Mean using scaler.
+func (m *Metrics) FormatMean(scaler Scaler) string {
+ var s string
+ if scaler != nil {
+ s = scaler(m.Mean)
+ } else {
+ s = fmt.Sprint(m.Mean)
+ }
+ return s
+}
+
+// FormatDiff computes and formats the percent variation of max and min compared to mean.
+// If b.Mean or b.Max is zero, FormatDiff returns an empty string.
+func (m *Metrics) FormatDiff() string {
+ if m.Mean == 0 || m.Max == 0 {
+ return ""
+ }
+ diff := 1 - m.Min/m.Mean
+ if d := m.Max/m.Mean - 1; d > diff {
+ diff = d
+ }
+ return fmt.Sprintf("%.0f%%", diff*100.0)
+}
+
+// Format returns a textual formatting of "Mean ±Diff" using scaler.
+func (m *Metrics) Format(scaler Scaler) string {
+ if m.Unit == "" {
+ return ""
+ }
+ mean := m.FormatMean(scaler)
+ diff := m.FormatDiff()
+ if diff == "" {
+ return mean + " "
+ }
+ return fmt.Sprintf("%s ±%3s", mean, diff)
+}
+
+// computeStats updates the derived statistics in m from the raw
+// samples in m.Values.
+func (m *Metrics) computeStats() {
+ // Discard outliers.
+ values := stats.Sample{Xs: m.Values}
+ q1, q3 := values.Percentile(0.25), values.Percentile(0.75)
+ lo, hi := q1-1.5*(q3-q1), q3+1.5*(q3-q1)
+ for _, value := range m.Values {
+ if lo <= value && value <= hi {
+ m.RValues = append(m.RValues, value)
+ }
+ }
+
+ // Compute statistics of remaining data.
+ m.Min, m.Max = stats.Bounds(m.RValues)
+ m.Mean = stats.Mean(m.RValues)
+}
+
+// addMetrics returns the metrics with the given key from c,
+// creating a new one if needed.
+func (c *Collection) addMetrics(key Key) *Metrics {
+ if c.Metrics == nil {
+ c.Metrics = make(map[Key]*Metrics)
+ }
+ if stat, ok := c.Metrics[key]; ok {
+ return stat
+ }
+
+ addString := func(strings *[]string, add string) {
+ for _, s := range *strings {
+ if s == add {
+ return
+ }
+ }
+ *strings = append(*strings, add)
+ }
+ addString(&c.Configs, key.Config)
+ addString(&c.Benchmarks, key.Benchmark)
+ addString(&c.Units, key.Unit)
+ m := &Metrics{Unit: key.Unit}
+ c.Metrics[key] = m
+ return m
+}
+
+// AddConfig adds a set of benchmark results from a single configuration to the collection.
+func (c *Collection) AddConfig(config string, data []byte) {
+ c.Configs = append(c.Configs, config)
+ key := Key{Config: config}
+
+ for _, line := range strings.Split(string(data), "\n") {
+ f := strings.Fields(line)
+ if len(f) < 4 {
+ continue
+ }
+ name := f[0]
+ if !strings.HasPrefix(name, "Benchmark") {
+ continue
+ }
+ name = strings.TrimPrefix(name, "Benchmark")
+ n, _ := strconv.Atoi(f[1])
+ if n == 0 {
+ continue
+ }
+
+ key.Benchmark = name
+ for i := 2; i+2 <= len(f); i += 2 {
+ val, err := strconv.ParseFloat(f[i], 64)
+ if err != nil {
+ continue
+ }
+ key.Unit = f[i+1]
+ m := c.addMetrics(key)
+ m.Values = append(m.Values, val)
+ }
+ }
+}
diff --git a/benchstat/delta.go b/benchstat/delta.go
new file mode 100644
index 0000000..2a28653
--- /dev/null
+++ b/benchstat/delta.go
@@ -0,0 +1,72 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Significance tests.
+
+package benchstat
+
+import (
+ "errors"
+
+ "golang.org/x/perf/internal/stats"
+)
+
+// A DeltaTest compares the old and new metrics and returns the
+// expected probability that they are drawn from the same distribution.
+//
+// If a probability cannot be computed, the DeltaTest returns an
+// error explaining why. Common errors include ErrSamplesEqual
+// (all samples are equal), ErrSampleSize (there aren't enough samples),
+// and ErrZeroVariance (the sample has zero variance).
+//
+// As a special case, the missing test NoDeltaTest returns -1, nil.
+type DeltaTest func(old, new *Metrics) (float64, error)
+
+// Errors returned by DeltaTest.
+var (
+ ErrSamplesEqual = errors.New("all equal")
+ ErrSampleSize = errors.New("too few samples")
+ ErrZeroVariance = errors.New("zero variance")
+)
+
+// NoDeltaTest applies no delta test; it returns -1, nil.
+func NoDeltaTest(old, new *Metrics) (pval float64, err error) {
+ return -1, nil
+}
+
+// TTest is a DeltaTest using the two-sample Welch t-test.
+func TTest(old, new *Metrics) (pval float64, err error) {
+ t, err := stats.TwoSampleWelchTTest(stats.Sample{Xs: old.RValues}, stats.Sample{Xs: new.RValues}, stats.LocationDiffers)
+ if err != nil {
+ return -1, convertErr(err)
+ }
+ return t.P, nil
+}
+
+// UTest is a DeltaTest using the Mann-Whitney U test.
+func UTest(old, new *Metrics) (pval float64, err error) {
+ u, err := stats.MannWhitneyUTest(old.RValues, new.RValues, stats.LocationDiffers)
+ if err != nil {
+ return -1, convertErr(err)
+ }
+ return u.P, nil
+}
+
+// convertErr converts from the stats package's internal errors
+// to errors exported by this package and expected from
+// a DeltaTest.
+// Using different errors makes it possible for clients to use
+// package benchstat without access to the internal stats package,
+// and it also gives us a chance to use shorter error messages.
+func convertErr(err error) error {
+ switch err {
+ case stats.ErrZeroVariance:
+ return ErrZeroVariance
+ case stats.ErrSampleSize:
+ return ErrSampleSize
+ case stats.ErrSamplesEqual:
+ return ErrSamplesEqual
+ }
+ return err
+}
diff --git a/benchstat/html.go b/benchstat/html.go
new file mode 100644
index 0000000..5f03d60
--- /dev/null
+++ b/benchstat/html.go
@@ -0,0 +1,56 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package benchstat
+
+import (
+ "bytes"
+ "html/template"
+ "strings"
+)
+
+var htmlTemplate = template.Must(template.New("").Funcs(htmlFuncs).Parse(`
+{{with index . 0}}
+<table class='benchstat {{if .OldNewDelta}}oldnew{{end}}'>
+{{if .OldNewDelta -}}
+{{- else if eq (len .Configs) 1}}
+{{- else -}}
+<tr class='configs'><th>{{range .Configs}}<th>{{.}}{{end}}
+{{end}}
+{{end}}
+{{- range $i, $table := .}}
+<tbody>
+{{if .OldNewDelta -}}
+<tr><th><th>old {{.Metric}}<th>new {{.Metric}}<th>delta<th>
+{{else if eq (len .Configs) 1}}
+<tr><th><th>{{.Metric}}
+{{else -}}
+<tr><th><th colspan='{{len .Configs}}' class='metric'>{{.Metric}}
+{{end}}{{range $row := $table.Rows -}}
+{{if $table.OldNewDelta -}}
+<tr class='{{if eq .Change 1}}better{{else if eq .Change -1}}worse{{else}}unchanged{{end}}'>
+{{- else -}}
+<tr>
+{{- end -}}
+<td>{{.Benchmark}}{{range .Metrics}}<td>{{.Format $row.Scaler}}{{end}}{{if $table.OldNewDelta}}<td class='{{if eq .Delta "~"}}nodelta{{else}}delta{{end}}'>{{replace .Delta "-" "−" -1}}<td class='note'>{{.Note}}{{end}}
+{{end -}}
+<tr><td>
+</tbody>
+{{end}}
+</table>
+`))
+
+var htmlFuncs = template.FuncMap{
+ "replace": strings.Replace,
+}
+
+// FormatHTML appends an HTML formatting of the tables to buf.
+func FormatHTML(buf *bytes.Buffer, tables []*Table) {
+ err := htmlTemplate.Execute(buf, tables)
+ if err != nil {
+ // Only possible errors here are template not matching data structure.
+ // Don't make caller check - it's our fault.
+ panic(err)
+ }
+}
diff --git a/benchstat/scaler.go b/benchstat/scaler.go
new file mode 100644
index 0000000..c775caa
--- /dev/null
+++ b/benchstat/scaler.go
@@ -0,0 +1,109 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package benchstat
+
+import "fmt"
+
+// A Scaler is a function that scales and formats a measurement.
+// All measurements within a given table row are formatted
+// using the same scaler, so that the units are consistent
+// across the row.
+type Scaler func(float64) string
+
+// NewScaler returns a Scaler appropriate for formatting
+// the measurement val, which has the given unit.
+func NewScaler(val float64, unit string) Scaler {
+ if unit == "ns/op" {
+ return timeScaler(val)
+ }
+
+ var format string
+ var scale float64
+ var suffix string
+
+ prescale := 1.0
+ if unit == "MB/s" {
+ prescale = 1e6
+ }
+
+ switch x := val * prescale; {
+ case x >= 99500000000000:
+ format, scale, suffix = "%.0f", 1e12, "T"
+ case x >= 9950000000000:
+ format, scale, suffix = "%.1f", 1e12, "T"
+ case x >= 995000000000:
+ format, scale, suffix = "%.2f", 1e12, "T"
+ case x >= 99500000000:
+ format, scale, suffix = "%.0f", 1e9, "G"
+ case x >= 9950000000:
+ format, scale, suffix = "%.1f", 1e9, "G"
+ case x >= 995000000:
+ format, scale, suffix = "%.2f", 1e9, "G"
+ case x >= 99500000:
+ format, scale, suffix = "%.0f", 1e6, "M"
+ case x >= 9950000:
+ format, scale, suffix = "%.1f", 1e6, "M"
+ case x >= 995000:
+ format, scale, suffix = "%.2f", 1e6, "M"
+ case x >= 99500:
+ format, scale, suffix = "%.0f", 1e3, "k"
+ case x >= 9950:
+ format, scale, suffix = "%.1f", 1e3, "k"
+ case x >= 995:
+ format, scale, suffix = "%.2f", 1e3, "k"
+ case x >= 99.5:
+ format, scale, suffix = "%.0f", 1, ""
+ case x >= 9.95:
+ format, scale, suffix = "%.1f", 1, ""
+ default:
+ format, scale, suffix = "%.2f", 1, ""
+ }
+
+ if unit == "B/op" {
+ suffix += "B"
+ }
+ if unit == "MB/s" {
+ suffix += "B/s"
+ }
+ scale /= prescale
+
+ return func(val float64) string {
+ return fmt.Sprintf(format+suffix, val/scale)
+ }
+}
+
+func timeScaler(ns float64) Scaler {
+ var format string
+ var scale float64
+ switch x := ns / 1e9; {
+ case x >= 99.5:
+ format, scale = "%.0fs", 1
+ case x >= 9.95:
+ format, scale = "%.1fs", 1
+ case x >= 0.995:
+ format, scale = "%.2fs", 1
+ case x >= 0.0995:
+ format, scale = "%.0fms", 1000
+ case x >= 0.00995:
+ format, scale = "%.1fms", 1000
+ case x >= 0.000995:
+ format, scale = "%.2fms", 1000
+ case x >= 0.0000995:
+ format, scale = "%.0fµs", 1000*1000
+ case x >= 0.00000995:
+ format, scale = "%.1fµs", 1000*1000
+ case x >= 0.000000995:
+ format, scale = "%.2fµs", 1000*1000
+ case x >= 0.0000000995:
+ format, scale = "%.0fns", 1000*1000*1000
+ case x >= 0.00000000995:
+ format, scale = "%.1fns", 1000*1000*1000
+ default:
+ format, scale = "%.2fns", 1000*1000*1000
+ }
+ return func(ns float64) string {
+ return fmt.Sprintf(format, ns/1e9*scale)
+ }
+}
diff --git a/benchstat/table.go b/benchstat/table.go
new file mode 100644
index 0000000..e5249c1
--- /dev/null
+++ b/benchstat/table.go
@@ -0,0 +1,167 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package benchstat
+
+import (
+ "fmt"
+
+ "golang.org/x/perf/internal/stats"
+)
+
+// A Table is a table for display in the benchstat output.
+type Table struct {
+ Metric string
+ OldNewDelta bool // is this an old-new-delta table?
+ Configs []string
+ Rows []*Row
+}
+
+// A Row is a table row for display in the benchstat output.
+type Row struct {
+ Benchmark string // benchmark name
+ Scaler Scaler // formatter for stats means
+ Metrics []*Metrics // columns of statistics
+ Delta string // formatted percent change
+ Note string // additional information
+ Change int // +1 better, -1 worse, 0 unchanged
+}
+
+// Tables returns tables comparing the benchmarks in the collection.
+func (c *Collection) Tables() []*Table {
+ deltaTest := c.DeltaTest
+ if deltaTest == nil {
+ deltaTest = UTest
+ }
+ alpha := c.Alpha
+ if alpha == 0 {
+ alpha = 0.05
+ }
+
+ // Update statistics.
+ for _, m := range c.Metrics {
+ m.computeStats()
+ }
+
+ var tables []*Table
+ key := Key{}
+ for _, key.Unit = range c.Units {
+ table := new(Table)
+ table.Configs = c.Configs
+ table.Metric = metricOf(key.Unit)
+ table.OldNewDelta = len(c.Configs) == 2
+ for _, key.Benchmark = range c.Benchmarks {
+ row := &Row{Benchmark: key.Benchmark}
+
+ for _, key.Config = range c.Configs {
+ m := c.Metrics[key]
+ if m == nil {
+ row.Metrics = append(row.Metrics, new(Metrics))
+ continue
+ }
+ row.Metrics = append(row.Metrics, m)
+ if row.Scaler == nil {
+ row.Scaler = NewScaler(m.Mean, m.Unit)
+ }
+ }
+
+ // If there are only two configs being compared, add stats.
+ if table.OldNewDelta {
+ k0 := key
+ k0.Config = c.Configs[0]
+ k1 := key
+ k1.Config = c.Configs[1]
+ old := c.Metrics[k0]
+ new := c.Metrics[k1]
+ // If one is missing, omit row entirely.
+ // TODO: Control this better.
+ if old == new || new == nil {
+ continue
+ }
+ pval, testerr := deltaTest(old, new)
+ row.Delta = "~"
+ if testerr == stats.ErrZeroVariance {
+ row.Note = "(zero variance)"
+ } else if testerr == stats.ErrSampleSize {
+ row.Note = "(too few samples)"
+ } else if testerr == stats.ErrSamplesEqual {
+ row.Note = "(all equal)"
+ } else if testerr != nil {
+ row.Note = fmt.Sprintf("(%s)", testerr)
+ } else if pval < alpha {
+ pct := ((new.Mean / old.Mean) - 1.0) * 100.0
+ row.Delta = fmt.Sprintf("%+.2f%%", pct)
+ if pct < 0 == (table.Metric != "speed") { // smaller is better, except speeds
+ row.Change = +1
+ } else {
+ row.Change = -1
+ }
+ }
+ if row.Note == "" && pval != -1 {
+ row.Note = fmt.Sprintf("(p=%0.3f n=%d+%d)", pval, len(old.RValues), len(new.RValues))
+ }
+ }
+
+ table.Rows = append(table.Rows, row)
+ }
+
+ if len(table.Rows) > 0 {
+ if c.AddGeoMean {
+ addGeomean(c, table, key.Unit, table.OldNewDelta)
+ }
+ tables = append(tables, table)
+ }
+ }
+
+ return tables
+}
+
+// metricOf returns the name of the metric with the given unit.
+func metricOf(unit string) string {
+ switch unit {
+ case "ns/op":
+ return "time/op"
+ case "B/op":
+ return "alloc/op"
+ case "MB/s":
+ return "speed"
+ default:
+ return unit
+ }
+}
+
+// addGeomean adds a "geomean" row to the table,
+// showing the geometric mean of all the benchmarks.
+func addGeomean(c *Collection, t *Table, unit string, delta bool) {
+ row := &Row{Benchmark: "[Geo mean]"}
+ key := Key{Unit: unit}
+ geomeans := []float64{}
+ for _, key.Config = range c.Configs {
+ var means []float64
+ for _, key.Benchmark = range c.Benchmarks {
+ m := c.Metrics[key]
+ if m != nil {
+ means = append(means, m.Mean)
+ }
+ }
+ if len(means) == 0 {
+ row.Metrics = append(row.Metrics, new(Metrics))
+ delta = false
+ } else {
+ geomean := stats.GeoMean(means)
+ geomeans = append(geomeans, geomean)
+ if row.Scaler == nil {
+ row.Scaler = NewScaler(geomean, unit)
+ }
+ row.Metrics = append(row.Metrics, &Metrics{
+ Unit: unit,
+ Mean: geomean,
+ })
+ }
+ }
+ if delta {
+ row.Delta = fmt.Sprintf("%+.2f%%", ((geomeans[1]/geomeans[0])-1.0)*100.0)
+ }
+ t.Rows = append(t.Rows, row)
+}
diff --git a/benchstat/text.go b/benchstat/text.go
new file mode 100644
index 0000000..36d2d50
--- /dev/null
+++ b/benchstat/text.go
@@ -0,0 +1,126 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package benchstat
+
+import (
+ "bytes"
+ "fmt"
+ "unicode/utf8"
+)
+
+// FormatText appends a fixed-width text formatting of the tables to buf.
+func FormatText(buf *bytes.Buffer, tables []*Table) {
+ var textTables [][]*textRow
+ for _, t := range tables {
+ textTables = append(textTables, toText(t))
+ }
+
+ var max []int
+ for _, table := range textTables {
+ for _, row := range table {
+ for len(max) < len(row.cols) {
+ max = append(max, 0)
+ }
+ for i, s := range row.cols {
+ n := utf8.RuneCountInString(s)
+ if max[i] < n {
+ max[i] = n
+ }
+ }
+ }
+ }
+
+ for i, table := range textTables {
+ if i > 0 {
+ fmt.Fprintf(buf, "\n")
+ }
+
+ // headings
+ row := table[0]
+ for i, s := range row.cols {
+ switch i {
+ case 0:
+ fmt.Fprintf(buf, "%-*s", max[i], s)
+ default:
+ fmt.Fprintf(buf, " %-*s", max[i], s)
+ case len(row.cols) - 1:
+ fmt.Fprintf(buf, " %s\n", s)
+ }
+ }
+
+ // data
+ for _, row := range table[1:] {
+ for i, s := range row.cols {
+ switch i {
+ case 0:
+ fmt.Fprintf(buf, "%-*s", max[i], s)
+ default:
+ if i == len(row.cols)-1 && len(s) > 0 && s[0] == '(' {
+ // Left-align p value.
+ fmt.Fprintf(buf, " %s", s)
+ break
+ }
+ fmt.Fprintf(buf, " %*s", max[i], s)
+ }
+ }
+ fmt.Fprintf(buf, "\n")
+ }
+ }
+}
+
+// A textRow is a row of printed text columns.
+type textRow struct {
+ cols []string
+}
+
+func newTextRow(cols ...string) *textRow {
+ return &textRow{cols: cols}
+}
+
+func (r *textRow) add(col string) {
+ r.cols = append(r.cols, col)
+}
+
+func (r *textRow) trim() {
+ for len(r.cols) > 0 && r.cols[len(r.cols)-1] == "" {
+ r.cols = r.cols[:len(r.cols)-1]
+ }
+}
+
+// toText converts the Table to a textual grid of cells,
+// which can then be printed in fixed-width output.
+func toText(t *Table) []*textRow {
+ var textRows []*textRow
+ switch len(t.Configs) {
+ case 1:
+ textRows = append(textRows, newTextRow("name", t.Metric))
+ case 2:
+ textRows = append(textRows, newTextRow("name", "old "+t.Metric, "new "+t.Metric, "delta"))
+ default:
+ row := newTextRow("name \\ " + t.Metric)
+ row.cols = append(row.cols, t.Configs...)
+ textRows = append(textRows, row)
+ }
+
+ for _, row := range t.Rows {
+ text := newTextRow(row.Benchmark)
+ for _, m := range row.Metrics {
+ text.cols = append(text.cols, m.Format(row.Scaler))
+ }
+ if len(t.Configs) == 2 {
+ delta := row.Delta
+ if delta == "~" {
+ delta = "~ "
+ }
+ text.cols = append(text.cols, delta)
+ text.cols = append(text.cols, row.Note)
+ }
+ textRows = append(textRows, text)
+ }
+ for _, r := range textRows {
+ r.trim()
+ }
+ return textRows
+}