// Copyright 2021 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Note: Blocks that begin with "$ benchstat" below will be tested by
// doc_test.go.

// Benchstat computes statistical summaries and A/B comparisons of Go
// benchmarks.
//
// Usage:
//
// benchstat [flags] inputs...
//
// Each input file should be in the Go benchmark format
// (https://golang.org/design/14313-benchmark-format), such as the
// output of “go test -bench .”. Typically, there should be two (or
// more) input files for before and after some change (or series of
// changes) to be measured. Each benchmark should be run at least 10
// times to gather a statistically significant sample of results. For
// each benchmark, benchstat computes the median and the confidence
// interval for the median. By default, if there are two or more
// input files, it compares each benchmark in the first file to the
// same benchmark in each subsequent file and reports whether there
// was a statistically significant difference, though it can be
// configured to compare on other dimensions.
//
// # Example
//
// Suppose we collect results from running a set of benchmarks 10 times
// before a particular change:
//
// go test -run='^$' -bench=. -count=10 > old.txt
//
// And the same benchmarks 10 times after:
//
// go test -run='^$' -bench=. -count=10 > new.txt
//
// The file old.txt contains:
//
// goos: linux
// goarch: amd64
// pkg: golang.org/x/perf/cmd/benchstat/testdata
// BenchmarkEncode/format=json-48 690848 1726 ns/op
// BenchmarkEncode/format=json-48 684861 1723 ns/op
// BenchmarkEncode/format=json-48 693285 1707 ns/op
// BenchmarkEncode/format=json-48 677692 1707 ns/op
// BenchmarkEncode/format=json-48 692130 1713 ns/op
// BenchmarkEncode/format=json-48 684164 1729 ns/op
// BenchmarkEncode/format=json-48 682500 1736 ns/op
// BenchmarkEncode/format=json-48 677509 1707 ns/op
// BenchmarkEncode/format=json-48 687295 1705 ns/op
// BenchmarkEncode/format=json-48 695533 1774 ns/op
// BenchmarkEncode/format=gob-48 372699 3069 ns/op
// BenchmarkEncode/format=gob-48 394740 3075 ns/op
// BenchmarkEncode/format=gob-48 391335 3069 ns/op
// BenchmarkEncode/format=gob-48 383588 3067 ns/op
// BenchmarkEncode/format=gob-48 385885 3207 ns/op
// BenchmarkEncode/format=gob-48 389970 3064 ns/op
// BenchmarkEncode/format=gob-48 393361 3064 ns/op
// BenchmarkEncode/format=gob-48 393882 3058 ns/op
// BenchmarkEncode/format=gob-48 396171 3059 ns/op
// BenchmarkEncode/format=gob-48 397812 3062 ns/op
//
// The file new.txt contains:
//
// goos: linux
// goarch: amd64
// pkg: golang.org/x/perf/cmd/benchstat/testdata
// BenchmarkEncode/format=json-48 714387 1423 ns/op
// BenchmarkEncode/format=json-48 845445 1416 ns/op
// BenchmarkEncode/format=json-48 815714 1411 ns/op
// BenchmarkEncode/format=json-48 828824 1413 ns/op
// BenchmarkEncode/format=json-48 834070 1412 ns/op
// BenchmarkEncode/format=json-48 828123 1426 ns/op
// BenchmarkEncode/format=json-48 834493 1422 ns/op
// BenchmarkEncode/format=json-48 838406 1424 ns/op
// BenchmarkEncode/format=json-48 836227 1447 ns/op
// BenchmarkEncode/format=json-48 830835 1425 ns/op
// BenchmarkEncode/format=gob-48 394441 3075 ns/op
// BenchmarkEncode/format=gob-48 393207 3065 ns/op
// BenchmarkEncode/format=gob-48 392374 3059 ns/op
// BenchmarkEncode/format=gob-48 396037 3065 ns/op
// BenchmarkEncode/format=gob-48 393255 3060 ns/op
// BenchmarkEncode/format=gob-48 382629 3081 ns/op
// BenchmarkEncode/format=gob-48 389558 3186 ns/op
// BenchmarkEncode/format=gob-48 392668 3135 ns/op
// BenchmarkEncode/format=gob-48 392313 3087 ns/op
// BenchmarkEncode/format=gob-48 394274 3062 ns/op
//
// The order of the lines in the file does not matter, except that the
// output lists benchmarks in order of appearance.
//
// If we run “benchstat old.txt new.txt”, it will summarize the
// benchmarks and compare the before and after results:
//
// $ benchstat old.txt new.txt
// goos: linux
// goarch: amd64
// pkg: golang.org/x/perf/cmd/benchstat/testdata
// │ old.txt │ new.txt │
// │ sec/op │ sec/op vs base │
// Encode/format=json-48 1.718µ ± 1% 1.423µ ± 1% -17.20% (p=0.000 n=10)
// Encode/format=gob-48 3.066µ ± 0% 3.070µ ± 2% ~ (p=0.446 n=10)
// geomean 2.295µ 2.090µ -8.94%
//
// Before the comparison table, we see common file-level
// configuration. If there are benchmarks with different configuration
// (for example, from different packages), benchstat will print
// separate tables for each configuration.
//
// The table then compares the two input files for each benchmark. It
// shows the median and 95% confidence interval summaries for each
// benchmark before and after the change, and an A/B comparison under
// "vs base". The comparison shows that Encode/format=json got 17.20%
// faster with a p-value of 0.000 and 10 samples from each input file.
// The p-value measures how likely it is that any differences were due
// to random chance (i.e., noise). In this case, it's extremely
// unlikely the difference between the medians was due to chance. For
// Encode/format=gob, the "~" means benchstat did not detect a
// statistically significant difference between the two inputs. In
// this case, we see a p-value of 0.446, meaning it's very likely the
// differences for this benchmark are simply due to random chance.
//
// Note that "statistically significant" is not the same as "large":
// with enough low-noise data, even very small changes can be
// distinguished from noise and considered statistically significant.
// It is, of course, generally easier to distinguish large changes
// from noise.
//
// Finally, the last row of the table shows the geometric mean of each
// column, giving an overall picture of how the benchmarks changed.
// Proportional changes in the geomean reflect proportional changes in
// the benchmarks. For example, given n benchmarks, if sec/op for one
// of them increases by a factor of 2, then the sec/op geomean will
// increase by a factor of ⁿ√2.
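//
// To see why, note that the geometric mean is the exponential of the
// mean of logs, so doubling one of n inputs adds (ln 2)/n to the mean
// of logs. The following self-contained sketch (independent of
// benchstat's own code) demonstrates this numerically:
//
// package main
//
// import (
//     "fmt"
//     "math"
// )
//
// // geomean returns the geometric mean: exp of the mean of logs.
// func geomean(xs []float64) float64 {
//     sum := 0.0
//     for _, x := range xs {
//         sum += math.Log(x)
//     }
//     return math.Exp(sum / float64(len(xs)))
// }
//
// func main() {
//     base := []float64{100, 200, 400, 800}
//     bumped := []float64{200, 200, 400, 800}      // one value doubled
//     fmt.Println(geomean(bumped) / geomean(base)) // prints ~1.189 ≈ ⁴√2
// }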
//
// # Filtering
//
// benchstat has a very flexible system for configuring exactly which
// benchmarks are summarized and compared. First, all inputs are
// filtered according to an expression provided as the -filter flag.
//
// Filters are built from key-value terms:
//
// key:value - Match if key equals value.
// key:"value" - Same, but value is a double-quoted Go string that
// may contain spaces or other special characters.
// "key":value - Keys may also be double-quoted.
// key:/regexp/ - Match if key matches a regular expression.
// key:(val1 OR val2 OR ...)
// - Short-hand for key:val1 OR key:val2. Values may be
// double-quoted strings or regexps.
// * - Match everything.
//
// These terms can be combined into larger expressions as follows:
//
// x y ... - Match if x, y, etc. all match.
// x AND y - Same as x y.
// x OR y - Match if x or y match.
// -x - Match if x does not match.
// (...) - Subexpression.
//
// Each key is one of the following:
//
// .name - The base name of a benchmark
// .fullname - The full name of a benchmark (including configuration)
// .file - The name of the input file or user-provided file label
// /{name-key} - Per-benchmark sub-name configuration key
// {file-key} - File-level configuration key
// .unit - The name of a unit for a particular metric
//
// For example, the following matches benchmarks with "/format=json"
// in the sub-name keys with file-level configuration "goos" equal to
// "linux" and extracts the "ns/op" and "B/op" measurements:
//
// $ benchstat -filter "/format:json goos:linux .unit:(ns/op OR B/op)" old.txt new.txt
// goos: linux
// goarch: amd64
// pkg: golang.org/x/perf/cmd/benchstat/testdata
// │ old.txt │ new.txt │
// │ sec/op │ sec/op vs base │
// Encode/format=json-48 1.718µ ± 1% 1.423µ ± 1% -17.20% (p=0.000 n=10)
//
// # Configuring comparisons
//
// The way benchstat groups and compares results is configurable using
// a similar set of keys as used for filtering. By default, benchstat
// groups results into tables using all file-level configuration keys,
// then within each table, it groups results into rows by .fullname
// (the benchmark's full name) and compares across columns by .file
// (the name of each input file). This can be changed via the
// following flags:
//
// -table KEYS - Group results into tables by KEYS
// -row KEYS - Group results into table rows by KEYS
// -col KEYS - Compare across results with different values of KEYS
//
// Using these flags, benchstat "projects" each result into a
// particular table cell. Each KEYS argument is a comma- or
// space-separated list of keys, each of which can optionally also
// specify a sort order (described below).
//
// Each key is one of the following:
//
// .name - The base name of a benchmark
// .fullname - The full name of a benchmark (including configuration)
// .file - The name of the input file or user-provided file label
// /{name-key} - Per-benchmark sub-name configuration key
// {file-key} - File-level configuration key
// .config - All file-level configuration keys
//
// Some of these keys can overlap. For example, ".config" includes the
// file-level key "goos", and ".fullname" includes the sub-name key
// "/format". When keys overlap like this, benchstat omits the more
// specific key from the general key. For example, if -table is the
// full file-level configuration ".config", and -col is the specific
// file key "goos", benchstat will omit "goos" from ".config".
//
// Finally, the -ignore flag can list keys that benchstat should
// ignore when grouping results. Continuing the previous example, if
// -table is ".config" and -ignore is "goos", benchstat will omit
// "goos" from ".config", but also not use it for any grouping.
//
// For precise details of the filter syntax and supported keys, see
// https://pkg.go.dev/golang.org/x/perf/benchproc/syntax.
//
// # Projection examples
//
// Returning to our first example, we can now see how the default
// projection flags produce this output:
//
// $ benchstat -table .config -row .fullname -col .file old.txt new.txt
// goos: linux
// goarch: amd64
// pkg: golang.org/x/perf/cmd/benchstat/testdata
// │ old.txt │ new.txt │
// │ sec/op │ sec/op vs base │
// Encode/format=json-48 1.718µ ± 1% 1.423µ ± 1% -17.20% (p=0.000 n=10)
// Encode/format=gob-48 3.066µ ± 0% 3.070µ ± 2% ~ (p=0.446 n=10)
// geomean 2.295µ 2.090µ -8.94%
//
// In this example, all benchmarks have the same file-level
// configuration, consisting of "goos", "goarch", and "pkg", so
// ".config" groups them into just one table. Within this table,
// results are grouped into rows by their full name, including
// configuration, and grouped into columns by the name of each input
// file.
//
// Suppose we instead want to compare json encoding to gob encoding
// from new.txt.
//
// $ benchstat -col /format new.txt
// goos: linux
// goarch: amd64
// pkg: golang.org/x/perf/cmd/benchstat/testdata
// │ json │ gob │
// │ sec/op │ sec/op vs base │
// Encode-48 1.423µ ± 1% 3.070µ ± 2% +115.82% (p=0.000 n=10)
//
// The columns are now labeled by the "/format" configuration from the
// benchmark name. benchstat still compares columns even though we've
// only provided a single input file. We also see that /format has
// been removed from the benchmark name to make a single row.
//
// We can simplify the output by grouping rows by just the benchmark name,
// rather than the full name:
//
// $ benchstat -col /format -row .name new.txt
// goos: linux
// goarch: amd64
// pkg: golang.org/x/perf/cmd/benchstat/testdata
// │ json │ gob │
// │ sec/op │ sec/op vs base │
// Encode 1.423µ ± 1% 3.070µ ± 2% +115.82% (p=0.000 n=10)
//
// benchstat will attempt to detect and warn if projections strip away
// too much information. For example, here we group together json and
// gob results into a single row:
//
// $ benchstat -row .name new.txt
// goos: linux
// goarch: amd64
// pkg: golang.org/x/perf/cmd/benchstat/testdata
// │ new.txt │
// │ sec/op │
// Encode 2.253µ ± 37% ¹
// ¹ benchmarks vary in .fullname
//
// Since this is probably not a meaningful comparison, benchstat warns
// that the benchmarks it grouped together vary in a hidden dimension.
// If this really were our intent, we could -ignore .fullname.
//
// # Sorting
//
// By default, benchstat sorts each dimension according to the order
// in which it first observes each value of that dimension. This can
// be overridden in each projection using the following syntax:
//
// {key}@{order} - specifies one of the built-in named sort orders.
// This can be "alpha" or "num" for alphabetic or numeric sorting.
// "num" understands basic use of metric and IEC prefixes like "2k"
// and "1Mi".
//
// {key}@({value} {value} ...) - specifies a fixed value order for
// key. It also specifies a filter: if key has a value that isn't any
// of the specified values, the result is filtered out.
//
// For example, we can use a fixed order to compare the improvement of
// json over gob rather than the other way around:
//
// $ benchstat -col "/format@(gob json)" -row .name -ignore .file new.txt
// goos: linux
// goarch: amd64
// pkg: golang.org/x/perf/cmd/benchstat/testdata
// │ gob │ json │
// │ sec/op │ sec/op vs base │
// Encode 3.070µ ± 2% 1.423µ ± 1% -53.66% (p=0.000 n=10)
//
// # Overriding .file
//
// Often, you want to compare results from different files, but want
// to provide more meaningful (or perhaps shorter) column labels than
// raw file names. File name labels can be overridden by specifying an
// input argument of the form "label=path" instead of just "path".
// This provides a custom value for the .file key.
//
// For example, the following will perform the default comparison, but
// label the columns O and N instead of old.txt and new.txt:
//
// $ benchstat O=old.txt N=new.txt
// goos: linux
// goarch: amd64
// pkg: golang.org/x/perf/cmd/benchstat/testdata
// │ O │ N │
// │ sec/op │ sec/op vs base │
// Encode/format=json-48 1.718µ ± 1% 1.423µ ± 1% -17.20% (p=0.000 n=10)
// Encode/format=gob-48 3.066µ ± 0% 3.070µ ± 2% ~ (p=0.446 n=10)
// geomean 2.295µ 2.090µ -8.94%
//
// # Units
//
// benchstat normalizes the units "ns" to "sec" and "MB" to "B" to
// avoid creating nonsense units like "µns/op". These appear in the
// testing package's default metrics and are also common in custom
// metrics.
//
// benchstat supports custom unit metadata (see
// https://golang.org/design/14313-benchmark-format). In particular,
// "assume" metadata is useful for controlling the statistics used by
// benchstat. By default, units use "assume=nothing", so benchstat
// uses non-parametric statistics: median for summaries, and the
// Mann-Whitney U-test for A/B comparisons.
//
// Some benchmarks measure things that have no noise, such as the size
// of a binary produced by a compiler. These do not benefit from
// repeated measurements or non-parametric statistics. For these
// units, it's useful to set "assume=exact". This will cause benchstat
// to warn if there's any variation in the measured values, and to
// show A/B comparisons even if there's only one before and after
// measurement.
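//
// For example, a benchmark that measures a deterministic quantity
// might report it with testing.B.ReportMetric, as in the following
// sketch (placed in a _test.go file importing "os" and "testing"; the
// metric name "binary-bytes" and the file path are only illustrative):
//
// func BenchmarkBinarySize(b *testing.B) {
//     fi, err := os.Stat("out/myprogram") // a binary built before the benchmark
//     if err != nil {
//         b.Skip(err)
//     }
//     // The same value every run: repeated runs add no information.
//     b.ReportMetric(float64(fi.Size()), "binary-bytes")
// }
//
// The input given to benchstat could then carry a unit metadata line
// such as “Unit binary-bytes assume=exact”; see the benchmark format
// document above for the precise syntax.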
//
// # Tips
//
// Reducing noise and/or increasing the number of benchmark runs will
// enable benchstat to discern smaller changes as "statistically
// significant". To reduce noise, make sure you run benchmarks on an
// otherwise idle machine, ideally one that isn't running on battery
// and isn't likely to be affected by thermal throttling.
// https://llvm.org/docs/Benchmarking.html has many good tips on
// reducing noise in benchmarks.
//
// It's also important that noise is evenly distributed across
// benchmark runs. The best way to do this is to interleave before and
// after runs, rather than running, say, 10 iterations of the before
// benchmark, and then 10 iterations of the after benchmark. For Go
// benchmarks, you can often speed up this process by using "go test
// -c" to pre-compile the benchmark binary.
//
// Pick a number of benchmark runs (at least 10, ideally 20) and stick
// to it. If benchstat reports no statistically significant change,
// avoid simply rerunning your benchmarks until it reports a
// significant change. This is known as "multiple testing" and is a
// common statistical error. By default, benchstat uses an α threshold
// of 0.05, which means it is *expected* to show a difference 5% of
// the time even if there is no difference. Hence, if you rerun
// benchmarks looking for a change, benchstat will probably eventually
// say there is a change, even if there isn't, which creates a
// statistical bias.
//
// As an extension of this, if you compare a large number of
// benchmarks, you should expect that about 5% of them will report a
// statistically significant change even if there is no difference
// between the before and after.
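// Assuming the comparisons are independent, the chance that at least
// one of 20 such benchmarks reports a spurious change at α = 0.05 is
// about 1 - 0.95^20, or roughly 64%.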
package main

import (
	"flag"
	"fmt"
	"io"
	"os"

	"golang.org/x/perf/benchfmt"
	"golang.org/x/perf/benchmath"
	"golang.org/x/perf/benchproc"
	"golang.org/x/perf/cmd/benchstat/internal/benchtab"
)

// TODO: Add a flag to perform Holm–Bonferroni correction for
// family-wise error rates. This can be done after-the-fact on a
// collection of benchstat.Comparison values.

// TODO: -unit flag.

// TODO: Support sorting by commit order.

// TODO: Add some quick usage examples to the -h output?

// TODO: If the projection results in a very sparse table, that's
// usually the result of correlated keys. Can we detect that and
// suggest fixes?
func main() {
	if err := benchstat(os.Stdout, os.Stderr, os.Args[1:]); err != nil {
		fmt.Fprintf(os.Stderr, "benchstat: %s\n", err)
		// Report failure through the exit status as well as stderr.
		os.Exit(1)
	}
}

func benchstat(w, wErr io.Writer, args []string) error {
	flags := flag.NewFlagSet("", flag.ExitOnError)
	flags.Usage = func() {
		fmt.Fprintf(flags.Output(), `Usage: benchstat [flags] inputs...

benchstat computes statistical summaries and A/B comparisons of Go
benchmarks. It shows benchmark medians in a table with a row for each
benchmark and a column for each input file. If there is more than one
input file, it also shows A/B comparisons between the files. If a
difference is likely to be noise, it shows "~".

For details, see https://pkg.go.dev/golang.org/x/perf/cmd/benchstat.
`)
		flags.PrintDefaults()
	}
	thresholds := benchmath.DefaultThresholds
	flagTable := flags.String("table", ".config", "split results into tables by distinct values of `projection`")
	flagRow := flags.String("row", ".fullname", "split results into rows by distinct values of `projection`")
	flagCol := flags.String("col", ".file", "split results into columns by distinct values of `projection`")
	flagIgnore := flags.String("ignore", "", "ignore variations in `keys`")
	flagFilter := flags.String("filter", "*", "use only benchmarks matching benchfilter `query`")
	flags.Float64Var(&thresholds.CompareAlpha, "alpha", thresholds.CompareAlpha, "consider change significant if p < `α`")
	// TODO: Support -confidence none to disable CI column? This
	// would be equivalent to benchstat v1's -norange for CSV.
	flagConfidence := flags.Float64("confidence", 0.95, "confidence `level` for ranges")
	flagFormat := flags.String("format", "text", "print results in `format`:\n text - plain text\n csv - comma-separated values (warnings will be written to stderr)\n")
	flags.Parse(args)

	if flags.NArg() == 0 {
		flags.Usage()
		os.Exit(2)
	}

	filter, err := benchproc.NewFilter(*flagFilter)
	if err != nil {
		return fmt.Errorf("parsing -filter: %s", err)
	}

	var parser benchproc.ProjectionParser
	var parseErr error
	mustParse := func(name, val string, unit bool) *benchproc.Projection {
		var proj *benchproc.Projection
		var err error
		if unit {
			proj, _, err = parser.ParseWithUnit(val, filter)
		} else {
			proj, err = parser.Parse(val, filter)
		}
		if err != nil && parseErr == nil {
			parseErr = fmt.Errorf("parsing %s: %s", name, err)
		}
		return proj
	}
	tableBy := mustParse("-table", *flagTable, true)
	rowBy := mustParse("-row", *flagRow, false)
	colBy := mustParse("-col", *flagCol, false)
	mustParse("-ignore", *flagIgnore, false)
	residue := parser.Residue()
	if parseErr != nil {
		return parseErr
	}

	if thresholds.CompareAlpha < 0 || thresholds.CompareAlpha > 1 {
		return fmt.Errorf("-alpha must be in range [0, 1]")
	}
	if *flagConfidence < 0 || *flagConfidence > 1 {
		return fmt.Errorf("-confidence must be in range [0, 1]")
	}

	var format func(t *benchtab.Tables) error
	switch *flagFormat {
	default:
		return fmt.Errorf("-format must be text or csv")
	case "text":
		format = func(t *benchtab.Tables) error { return t.ToText(w, false) }
	case "csv":
		format = func(t *benchtab.Tables) error { return t.ToCSV(w, wErr) }
	}

	stat := benchtab.NewBuilder(tableBy, rowBy, colBy, residue)
	files := benchfmt.Files{Paths: flags.Args(), AllowStdin: true, AllowLabels: true}
	for files.Scan() {
		switch rec := files.Result(); rec := rec.(type) {
		case *benchfmt.SyntaxError:
			// Non-fatal result parse error. Warn
			// but keep going.
			fmt.Fprintln(wErr, rec)
		case *benchfmt.Result:
			if ok, err := filter.Apply(rec); !ok {
				if err != nil {
					// Print the reason we rejected this result.
					fmt.Fprintln(wErr, err)
				}
				continue
			}
			stat.Add(rec)
		}
	}
	if err := files.Err(); err != nil {
		return err
	}

	tables := stat.ToTables(benchtab.TableOpts{
		Confidence: *flagConfidence,
		Thresholds: &thresholds,
		Units:      files.Units(),
	})
	return format(tables)
}