blob: 5813fd349521813ad633e53e11e98894b245a6d3 [file] [log] [blame]
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Benchstat computes and compares statistics about benchmarks.
//
// Usage:
//
// benchstat [-delta-test name] [-geomean] [-html] old.txt [new.txt] [more.txt ...]
//
// Each input file should contain the concatenated output of a number
// of runs of ``go test -bench.'' For each different benchmark listed in an input file,
// benchstat computes the mean, minimum, and maximum run time,
// after removing outliers using the interquartile range rule.
//
// If invoked on a single input file, benchstat prints the per-benchmark statistics
// for that file.
//
// If invoked on a pair of input files, benchstat adds to the output a column
// showing the statistics from the second file and a column showing the
// percent change in mean from the first to the second file.
// Next to the percent change, benchstat shows the p-value and sample
// sizes from a test of the two distributions of benchmark times.
// Small p-values indicate that the two distributions are significantly different.
// If the test indicates that there was no significant change between the two
// benchmarks (defined as p > 0.05), benchstat displays a single ~ instead of
// the percent change.
//
// The -delta-test option controls which significance test is applied:
// utest (Mann-Whitney U-test), ttest (two-sample Welch t-test), or none.
// The default is the U-test, sometimes also referred to as the Wilcoxon rank
// sum test.
//
// If invoked on more than two input files, benchstat prints the per-benchmark
// statistics for all the files, showing one column of statistics for each file,
// with no column for percent change or statistical significance.
//
// The -html option causes benchstat to print the results as an HTML table.
//
// Example
//
// Suppose we collect benchmark results from running ``go test -bench=Encode''
// five times before and after a particular change.
//
// The file old.txt contains:
//
// BenchmarkGobEncode 100 13552735 ns/op 56.63 MB/s
// BenchmarkJSONEncode 50 32395067 ns/op 59.90 MB/s
// BenchmarkGobEncode 100 13553943 ns/op 56.63 MB/s
// BenchmarkJSONEncode 50 32334214 ns/op 60.01 MB/s
// BenchmarkGobEncode 100 13606356 ns/op 56.41 MB/s
// BenchmarkJSONEncode 50 31992891 ns/op 60.65 MB/s
// BenchmarkGobEncode 100 13683198 ns/op 56.09 MB/s
// BenchmarkJSONEncode 50 31735022 ns/op 61.15 MB/s
//
// The file new.txt contains:
//
// BenchmarkGobEncode 100 11773189 ns/op 65.19 MB/s
// BenchmarkJSONEncode 50 32036529 ns/op 60.57 MB/s
// BenchmarkGobEncode 100 11942588 ns/op 64.27 MB/s
// BenchmarkJSONEncode 50 32156552 ns/op 60.34 MB/s
// BenchmarkGobEncode 100 11786159 ns/op 65.12 MB/s
// BenchmarkJSONEncode 50 31288355 ns/op 62.02 MB/s
// BenchmarkGobEncode 100 11628583 ns/op 66.00 MB/s
// BenchmarkJSONEncode 50 31559706 ns/op 61.49 MB/s
// BenchmarkGobEncode 100 11815924 ns/op 64.96 MB/s
// BenchmarkJSONEncode 50 31765634 ns/op 61.09 MB/s
//
// The order of the lines in the file does not matter, except that the
// output lists benchmarks in order of appearance.
//
// If run with just one input file, benchstat summarizes that file:
//
// $ benchstat old.txt
// name time/op
// GobEncode 13.6ms ± 1%
// JSONEncode 32.1ms ± 1%
// $
//
// If run with two input files, benchstat summarizes and compares:
//
// $ benchstat old.txt new.txt
// name old time/op new time/op delta
// GobEncode 13.6ms ± 1% 11.8ms ± 1% -13.31% (p=0.016 n=4+5)
// JSONEncode 32.1ms ± 1% 31.8ms ± 1% ~ (p=0.286 n=4+5)
// $
//
// Note that the JSONEncode result is reported as
// statistically insignificant instead of a -0.93% delta.
//
package main
import (
"bytes"
"flag"
"fmt"
"html"
"io/ioutil"
"log"
"os"
"strings"
"unicode/utf8"
"golang.org/x/perf/internal/stats"
)
func usage() {
fmt.Fprintf(os.Stderr, "usage: benchstat [options] old.txt [new.txt] [more.txt ...]\n")
fmt.Fprintf(os.Stderr, "options:\n")
flag.PrintDefaults()
os.Exit(2)
}
var (
flagDeltaTest = flag.String("delta-test", "utest", "significance `test` to apply to delta: utest, ttest, or none")
flagAlpha = flag.Float64("alpha", 0.05, "consider change significant if p < `α`")
flagGeomean = flag.Bool("geomean", false, "print the geometric mean of each file")
flagHTML = flag.Bool("html", false, "print results as an HTML table")
)
var deltaTestNames = map[string]DeltaTest{
"none": NoDeltaTest,
"u": UTest,
"u-test": UTest,
"utest": UTest,
"t": TTest,
"t-test": TTest,
"ttest": TTest,
}
type row struct {
cols []string
}
func newRow(cols ...string) *row {
return &row{cols: cols}
}
func (r *row) add(col string) {
r.cols = append(r.cols, col)
}
func (r *row) trim() {
for len(r.cols) > 0 && r.cols[len(r.cols)-1] == "" {
r.cols = r.cols[:len(r.cols)-1]
}
}
func main() {
log.SetPrefix("benchstat: ")
log.SetFlags(0)
flag.Usage = usage
flag.Parse()
deltaTest := deltaTestNames[strings.ToLower(*flagDeltaTest)]
if flag.NArg() < 1 || deltaTest == nil {
flag.Usage()
}
// Read in benchmark data.
c := new(Collection)
for _, file := range flag.Args() {
data, err := ioutil.ReadFile(file)
if err != nil {
log.Fatal(err)
}
c.AddConfig(file, data)
}
for _, m := range c.Metrics {
m.computeStats()
}
var tables [][]*row
switch len(c.Configs) {
case 2:
before, after := c.Configs[0], c.Configs[1]
key := Key{}
for _, key.Unit = range c.Units {
var table []*row
metric := metricOf(key.Unit)
for _, key.Benchmark = range c.Benchmarks {
key.Config = before
old := c.Metrics[key]
key.Config = after
new := c.Metrics[key]
if old == nil || new == nil {
continue
}
if len(table) == 0 {
table = append(table, newRow("name", "old "+metric, "new "+metric, "delta"))
}
pval, testerr := deltaTest(old, new)
scaler := NewScaler(old.Mean, old.Unit)
row := newRow(key.Benchmark, old.Format(scaler), new.Format(scaler), "~ ")
if testerr != nil {
row.add(fmt.Sprintf("(%s)", testerr))
} else if pval < *flagAlpha {
row.cols[3] = fmt.Sprintf("%+.2f%%", ((new.Mean/old.Mean)-1.0)*100.0)
}
if len(row.cols) == 4 && pval != -1 {
row.add(fmt.Sprintf("(p=%0.3f n=%d+%d)", pval, len(old.RValues), len(new.RValues)))
}
table = append(table, row)
}
if len(table) > 0 {
table = addGeomean(table, c, key.Unit, true)
tables = append(tables, table)
}
}
default:
key := Key{}
for _, key.Unit = range c.Units {
var table []*row
metric := metricOf(key.Unit)
if len(c.Configs) > 1 {
hdr := newRow("name \\ " + metric)
for _, config := range c.Configs {
hdr.add(config)
}
table = append(table, hdr)
} else {
table = append(table, newRow("name", metric))
}
for _, key.Benchmark = range c.Benchmarks {
row := newRow(key.Benchmark)
var scaler Scaler
for _, key.Config = range c.Configs {
m := c.Metrics[key]
if m == nil {
row.add("")
continue
}
if scaler == nil {
scaler = NewScaler(m.Mean, m.Unit)
}
row.add(m.Format(scaler))
}
row.trim()
if len(row.cols) > 1 {
table = append(table, row)
}
}
table = addGeomean(table, c, key.Unit, false)
tables = append(tables, table)
}
}
numColumn := 0
for _, table := range tables {
for _, row := range table {
if numColumn < len(row.cols) {
numColumn = len(row.cols)
}
}
}
max := make([]int, numColumn)
for _, table := range tables {
for _, row := range table {
for i, s := range row.cols {
n := utf8.RuneCountInString(s)
if max[i] < n {
max[i] = n
}
}
}
}
var buf bytes.Buffer
for i, table := range tables {
if i > 0 {
fmt.Fprintf(&buf, "\n")
}
if *flagHTML {
fmt.Fprintf(&buf, "<style>.benchstat tbody td:nth-child(1n+2) { text-align: right; padding: 0em 1em; }</style>\n")
fmt.Fprintf(&buf, "<table class='benchstat'>\n")
printRow := func(row *row, tag string) {
fmt.Fprintf(&buf, "<tr>")
for _, cell := range row.cols {
fmt.Fprintf(&buf, "<%s>%s</%s>", tag, html.EscapeString(cell), tag)
}
fmt.Fprintf(&buf, "\n")
}
printRow(table[0], "th")
for _, row := range table[1:] {
printRow(row, "td")
}
fmt.Fprintf(&buf, "</table>\n")
continue
}
// headings
row := table[0]
for i, s := range row.cols {
switch i {
case 0:
fmt.Fprintf(&buf, "%-*s", max[i], s)
default:
fmt.Fprintf(&buf, " %-*s", max[i], s)
case len(row.cols) - 1:
fmt.Fprintf(&buf, " %s\n", s)
}
}
// data
for _, row := range table[1:] {
for i, s := range row.cols {
switch i {
case 0:
fmt.Fprintf(&buf, "%-*s", max[i], s)
default:
if i == len(row.cols)-1 && len(s) > 0 && s[0] == '(' {
// Left-align p value.
fmt.Fprintf(&buf, " %s", s)
break
}
fmt.Fprintf(&buf, " %*s", max[i], s)
}
}
fmt.Fprintf(&buf, "\n")
}
}
os.Stdout.Write(buf.Bytes())
}
func addGeomean(table []*row, c *Collection, unit string, delta bool) []*row {
if !*flagGeomean {
return table
}
row := newRow("[Geo mean]")
key := Key{Unit: unit}
geomeans := []float64{}
for _, key.Config = range c.Configs {
var means []float64
for _, key.Benchmark = range c.Benchmarks {
m := c.Metrics[key]
if m != nil {
means = append(means, m.Mean)
}
}
if len(means) == 0 {
row.add("")
delta = false
} else {
geomean := stats.GeoMean(means)
geomeans = append(geomeans, geomean)
row.add(NewScaler(geomean, unit)(geomean) + " ")
}
}
if delta {
row.add(fmt.Sprintf("%+.2f%%", ((geomeans[1]/geomeans[0])-1.0)*100.0))
}
return append(table, row)
}
func metricOf(unit string) string {
switch unit {
case "ns/op":
return "time/op"
case "B/op":
return "alloc/op"
case "MB/s":
return "speed"
default:
return unit
}
}