benchstat, cmd/benchstat: group benchmark results

Go 1.9 and up write "pkg", "goos", and "goarch" keys in benchmark
output. benchstat now understands benchmark labels, and uses them to
separate incomparable benchmark results. cmd/benchstat gains a
command-line flag called "-split" to control this, defaulting to
"pkg,goos,goarch".

Change-Id: I00413ab348bbff31743b59e81d88c4faab1a8dca
Reviewed-on: https://go-review.googlesource.com/38584
Run-TryBot: Quentin Smith <quentin@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Russ Cox <rsc@golang.org>
diff --git a/analysis/app/compare.go b/analysis/app/compare.go
index c60cefd..03c3416 100644
--- a/analysis/app/compare.go
+++ b/analysis/app/compare.go
@@ -288,6 +288,7 @@
 	// Compute benchstat
 	c := &benchstat.Collection{
 		AddGeoMean: true,
+		SplitBy:    []string{"pkg", "goos", "goarch"},
 	}
 	for _, g := range groups {
 		c.AddResults(g.Q, g.results)
diff --git a/benchstat/data.go b/benchstat/data.go
index 756e2b0..1123532 100644
--- a/benchstat/data.go
+++ b/benchstat/data.go
@@ -5,6 +5,7 @@
 package benchstat
 
 import (
+	"bytes"
 	"fmt"
 	"strconv"
 	"strings"
@@ -15,10 +16,15 @@
 
 // A Collection is a collection of benchmark results.
 type Collection struct {
-	// Configs, Benchmarks, and Units give the set of configs,
-	// benchmarks, and units from the keys in Stats in an order
+	// Configs, Groups, and Units give the set of configs,
+	// groups, and units from the keys in Metrics in an order
 	// meant to match the order the benchmarks were read in.
-	Configs, Benchmarks, Units []string
+	Configs, Groups, Units []string
+
+	// Benchmarks gives the set of benchmarks from the keys in
+	// Metrics by group in an order meant to match the order
+	// benchmarks were read in.
+	Benchmarks map[string][]string
 
 	// Metrics holds the accumulated metrics for each key.
 	Metrics map[Key]*Metrics
@@ -34,13 +40,17 @@
 	// AddGeoMean specifies whether to add a line to the table
 	// showing the geometric mean of all the benchmark results.
 	AddGeoMean bool
+
+	// SplitBy specifies the labels to split results by.
+	// By default, results will only be split by full name.
+	SplitBy []string
 }
 
 // A Key identifies one metric (e.g., "ns/op", "B/op") from one
-// benchmark (function name sans "Benchmark" prefix) in one
-// configuration (input file name).
+// benchmark (function name sans "Benchmark" prefix) and optional
+// group in one configuration (input file name).
 type Key struct {
-	Config, Benchmark, Unit string
+	Config, Group, Benchmark, Unit string
 }
 
 // A Metrics holds the measurements of a single metric
@@ -129,7 +139,13 @@
 		*strings = append(*strings, add)
 	}
 	addString(&c.Configs, key.Config)
-	addString(&c.Benchmarks, key.Benchmark)
+	addString(&c.Groups, key.Group)
+	if c.Benchmarks == nil {
+		c.Benchmarks = make(map[string][]string)
+	}
+	benchmarks := c.Benchmarks[key.Group]
+	addString(&benchmarks, key.Benchmark)
+	c.Benchmarks[key.Group] = benchmarks
 	addString(&c.Units, key.Unit)
 	m := &Metrics{Unit: key.Unit}
 	c.Metrics[key] = m
@@ -141,7 +157,14 @@
 func (c *Collection) AddConfig(config string, data []byte) {
 	c.Configs = append(c.Configs, config)
 	key := Key{Config: config}
-	c.addText(key, string(data))
+	br := benchfmt.NewReader(bytes.NewReader(data))
+	for br.Next() {
+		c.addResult(key, br.Result())
+	}
+	if err := br.Err(); err != nil {
+		// bytes.Reader never returns errors
+		panic(err)
+	}
 }
 
 // AddResults adds the benchmark results to the named configuration.
@@ -149,35 +172,50 @@
 	c.Configs = append(c.Configs, config)
 	key := Key{Config: config}
 	for _, r := range results {
-		c.addText(key, r.Content)
+		c.addResult(key, r)
 	}
 }
 
-func (c *Collection) addText(key Key, data string) {
-	for _, line := range strings.Split(string(data), "\n") {
-		f := strings.Fields(line)
-		if len(f) < 4 {
+func (c *Collection) addResult(key Key, r *benchfmt.Result) {
+	f := strings.Fields(r.Content)
+	if len(f) < 4 {
+		return
+	}
+	name := f[0]
+	if !strings.HasPrefix(name, "Benchmark") {
+		return
+	}
+	name = strings.TrimPrefix(name, "Benchmark")
+	n, _ := strconv.Atoi(f[1])
+	if n == 0 {
+		return
+	}
+	key.Group = c.makeGroup(r)
+	key.Benchmark = name
+	for i := 2; i+2 <= len(f); i += 2 {
+		val, err := strconv.ParseFloat(f[i], 64)
+		if err != nil {
 			continue
 		}
-		name := f[0]
-		if !strings.HasPrefix(name, "Benchmark") {
-			continue
-		}
-		name = strings.TrimPrefix(name, "Benchmark")
-		n, _ := strconv.Atoi(f[1])
-		if n == 0 {
-			continue
-		}
+		key.Unit = f[i+1]
+		m := c.addMetrics(key)
+		m.Values = append(m.Values, val)
+	}
+}
 
-		key.Benchmark = name
-		for i := 2; i+2 <= len(f); i += 2 {
-			val, err := strconv.ParseFloat(f[i], 64)
-			if err != nil {
-				continue
+func (c *Collection) makeGroup(r *benchfmt.Result) string {
+	var out string
+	for _, s := range c.SplitBy {
+		v := r.NameLabels[s]
+		if v == "" {
+			v = r.Labels[s]
+		}
+		if v != "" {
+			if out != "" {
+				out = out + " "
 			}
-			key.Unit = f[i+1]
-			m := c.addMetrics(key)
-			m.Values = append(m.Values, val)
+			out += fmt.Sprintf("%s:%s", s, v)
 		}
 	}
+	return out
 }
diff --git a/benchstat/html.go b/benchstat/html.go
index 3817e77..eea80df 100644
--- a/benchstat/html.go
+++ b/benchstat/html.go
@@ -25,7 +25,9 @@
 <tr><th><th>{{.Metric}}
 {{else -}}
 <tr><th><th colspan='{{len .Configs}}' class='metric'>{{.Metric}}{{if .OldNewDelta}}<th>delta{{end}}
-{{end}}{{range $row := $table.Rows -}}
+{{end}}{{range $group := group $table.Rows -}}
+{{if and (gt (len $table.Groups) 1) (len (index . 0).Group)}}<tr class='group'><th colspan='{{colspan (len $table.Configs) $table.OldNewDelta}}'>{{(index . 0).Group}}{{end}}
+{{- range $row := . -}}
 {{if $table.OldNewDelta -}}
 <tr class='{{if eq .Change 1}}better{{else if eq .Change -1}}worse{{else}}unchanged{{end}}'>
 {{- else -}}
@@ -33,6 +35,7 @@
 {{- end -}}
 <td>{{.Benchmark}}{{range .Metrics}}<td>{{.Format $row.Scaler}}{{end}}{{if $table.OldNewDelta}}<td class='{{if eq .Delta "~"}}nodelta{{else}}delta{{end}}'>{{replace .Delta "-" "−" -1}}<td class='note'>{{.Note}}{{end}}
 {{end -}}
+{{- end -}}
 <tr><td>&nbsp;
 </tbody>
 {{end}}
@@ -42,6 +45,34 @@
 
 var htmlFuncs = template.FuncMap{
 	"replace": strings.Replace,
+	"group":   htmlGroup,
+	"colspan": htmlColspan,
+}
+
+func htmlColspan(configs int, delta bool) int {
+	if delta {
+		configs++
+	}
+	return configs + 1
+}
+
+func htmlGroup(rows []*Row) (out [][]*Row) {
+	var group string
+	var cur []*Row
+	for _, r := range rows {
+		if r.Group != group {
+			group = r.Group
+			if len(cur) > 0 {
+				out = append(out, cur)
+				cur = nil
+			}
+		}
+		cur = append(cur, r)
+	}
+	if len(cur) > 0 {
+		out = append(out, cur)
+	}
+	return
 }
 
 // FormatHTML appends an HTML formatting of the tables to buf.
diff --git a/benchstat/table.go b/benchstat/table.go
index 190f8ec..9693389 100644
--- a/benchstat/table.go
+++ b/benchstat/table.go
@@ -15,12 +15,14 @@
 	Metric      string
 	OldNewDelta bool // is this an old-new-delta table?
 	Configs     []string
+	Groups      []string
 	Rows        []*Row
 }
 
 // A Row is a table row for display in the benchstat output.
 type Row struct {
 	Benchmark string     // benchmark name
+	Group     string     // group name
 	Scaler    Scaler     // formatter for stats means
 	Metrics   []*Metrics // columns of statistics
 	Delta     string     // formatted percent change
@@ -49,61 +51,68 @@
 	for _, key.Unit = range c.Units {
 		table := new(Table)
 		table.Configs = c.Configs
+		table.Groups = c.Groups
 		table.Metric = metricOf(key.Unit)
 		table.OldNewDelta = len(c.Configs) == 2
-		for _, key.Benchmark = range c.Benchmarks {
-			row := &Row{Benchmark: key.Benchmark}
+		for _, key.Group = range c.Groups {
+			for _, key.Benchmark = range c.Benchmarks[key.Group] {
+				row := &Row{Benchmark: key.Benchmark}
+				if len(c.Groups) > 1 {
+					// Show group headers if there is more than one group.
+					row.Group = key.Group
+				}
 
-			for _, key.Config = range c.Configs {
-				m := c.Metrics[key]
-				if m == nil {
-					row.Metrics = append(row.Metrics, new(Metrics))
-					continue
-				}
-				row.Metrics = append(row.Metrics, m)
-				if row.Scaler == nil {
-					row.Scaler = NewScaler(m.Mean, m.Unit)
-				}
-			}
-
-			// If there are only two configs being compared, add stats.
-			if table.OldNewDelta {
-				k0 := key
-				k0.Config = c.Configs[0]
-				k1 := key
-				k1.Config = c.Configs[1]
-				old := c.Metrics[k0]
-				new := c.Metrics[k1]
-				// If one is missing, omit row entirely.
-				// TODO: Control this better.
-				if old == nil || new == nil {
-					continue
-				}
-				pval, testerr := deltaTest(old, new)
-				row.Delta = "~"
-				if testerr == stats.ErrZeroVariance {
-					row.Note = "(zero variance)"
-				} else if testerr == stats.ErrSampleSize {
-					row.Note = "(too few samples)"
-				} else if testerr == stats.ErrSamplesEqual {
-					row.Note = "(all equal)"
-				} else if testerr != nil {
-					row.Note = fmt.Sprintf("(%s)", testerr)
-				} else if pval < alpha {
-					pct := ((new.Mean / old.Mean) - 1.0) * 100.0
-					row.Delta = fmt.Sprintf("%+.2f%%", pct)
-					if pct < 0 == (table.Metric != "speed") { // smaller is better, except speeds
-						row.Change = +1
-					} else {
-						row.Change = -1
+				for _, key.Config = range c.Configs {
+					m := c.Metrics[key]
+					if m == nil {
+						row.Metrics = append(row.Metrics, new(Metrics))
+						continue
+					}
+					row.Metrics = append(row.Metrics, m)
+					if row.Scaler == nil {
+						row.Scaler = NewScaler(m.Mean, m.Unit)
 					}
 				}
-				if row.Note == "" && pval != -1 {
-					row.Note = fmt.Sprintf("(p=%0.3f n=%d+%d)", pval, len(old.RValues), len(new.RValues))
-				}
-			}
 
-			table.Rows = append(table.Rows, row)
+				// If there are only two configs being compared, add stats.
+				if table.OldNewDelta {
+					k0 := key
+					k0.Config = c.Configs[0]
+					k1 := key
+					k1.Config = c.Configs[1]
+					old := c.Metrics[k0]
+					new := c.Metrics[k1]
+					// If one is missing, omit row entirely.
+					// TODO: Control this better.
+					if old == nil || new == nil {
+						continue
+					}
+					pval, testerr := deltaTest(old, new)
+					row.Delta = "~"
+					if testerr == stats.ErrZeroVariance {
+						row.Note = "(zero variance)"
+					} else if testerr == stats.ErrSampleSize {
+						row.Note = "(too few samples)"
+					} else if testerr == stats.ErrSamplesEqual {
+						row.Note = "(all equal)"
+					} else if testerr != nil {
+						row.Note = fmt.Sprintf("(%s)", testerr)
+					} else if pval < alpha {
+						pct := ((new.Mean / old.Mean) - 1.0) * 100.0
+						row.Delta = fmt.Sprintf("%+.2f%%", pct)
+						if pct < 0 == (table.Metric != "speed") { // smaller is better, except speeds
+							row.Change = +1
+						} else {
+							row.Change = -1
+						}
+					}
+					if row.Note == "" && pval != -1 {
+						row.Note = fmt.Sprintf("(p=%0.3f n=%d+%d)", pval, len(old.RValues), len(new.RValues))
+					}
+				}
+
+				table.Rows = append(table.Rows, row)
+			}
 		}
 
 		if len(table.Rows) > 0 {
@@ -140,16 +149,18 @@
 	maxCount := 0
 	for _, key.Config = range c.Configs {
 		var means []float64
-		for _, key.Benchmark = range c.Benchmarks {
-			m := c.Metrics[key]
-			// Omit 0 values from the geomean calculation,
-			// as these either make the geomean undefined
-			// or zero (depending on who you ask). This
-			// typically comes up with things like
-			// allocation counts, where it's fine to just
-			// ignore the benchmark.
-			if m != nil && m.Mean != 0 {
-				means = append(means, m.Mean)
+		for _, key.Group = range c.Groups {
+			for _, key.Benchmark = range c.Benchmarks[key.Group] {
+				m := c.Metrics[key]
+				// Omit 0 values from the geomean calculation,
+				// as these either make the geomean undefined
+				// or zero (depending on who you ask). This
+				// typically comes up with things like
+				// allocation counts, where it's fine to just
+				// ignore the benchmark.
+				if m != nil && m.Mean != 0 {
+					means = append(means, m.Mean)
+				}
 			}
 		}
 		if len(means) > maxCount {
diff --git a/benchstat/text.go b/benchstat/text.go
index b8189f8..99ec736 100644
--- a/benchstat/text.go
+++ b/benchstat/text.go
@@ -20,6 +20,10 @@
 	var max []int
 	for _, table := range textTables {
 		for _, row := range table {
+			if len(row.cols) == 1 {
+				// Header row
+				continue
+			}
 			for len(max) < len(row.cols) {
 				max = append(max, 0)
 			}
@@ -53,8 +57,11 @@
 		// data
 		for _, row := range table[1:] {
 			for i, s := range row.cols {
-				switch i {
-				case 0:
+				switch {
+				case len(row.cols) == 1:
+					// Header row
+					fmt.Fprint(w, s)
+				case i == 0:
 					fmt.Fprintf(w, "%-*s", max[i], s)
 				default:
 					if i == len(row.cols)-1 && len(s) > 0 && s[0] == '(' {
@@ -104,7 +111,13 @@
 		textRows = append(textRows, row)
 	}
 
+	var group string
+
 	for _, row := range t.Rows {
+		if row.Group != group {
+			group = row.Group
+			textRows = append(textRows, newTextRow(group))
+		}
 		text := newTextRow(row.Benchmark)
 		for _, m := range row.Metrics {
 			text.cols = append(text.cols, m.Format(row.Scaler))
diff --git a/cmd/benchstat/main.go b/cmd/benchstat/main.go
index 570b186..b7fb691 100644
--- a/cmd/benchstat/main.go
+++ b/cmd/benchstat/main.go
@@ -114,6 +114,7 @@
 	flagAlpha     = flag.Float64("alpha", 0.05, "consider change significant if p < `α`")
 	flagGeomean   = flag.Bool("geomean", false, "print the geometric mean of each file")
 	flagHTML      = flag.Bool("html", false, "print results as an HTML table")
+	flagSplit     = flag.String("split", "pkg,goos,goarch", "split benchmarks by `labels`")
 )
 
 var deltaTestNames = map[string]benchstat.DeltaTest{
@@ -141,6 +142,9 @@
 		AddGeoMean: *flagGeomean,
 		DeltaTest:  deltaTest,
 	}
+	if *flagSplit != "" {
+		c.SplitBy = strings.Split(*flagSplit, ",")
+	}
 	for _, file := range flag.Args() {
 		data, err := ioutil.ReadFile(file)
 		if err != nil {
diff --git a/cmd/benchstat/main_test.go b/cmd/benchstat/main_test.go
index 8bdd58b..26f5433 100644
--- a/cmd/benchstat/main_test.go
+++ b/cmd/benchstat/main_test.go
@@ -6,6 +6,7 @@
 
 import (
 	"bytes"
+	"flag"
 	"io/ioutil"
 	"os"
 	"os/exec"
@@ -28,12 +29,15 @@
 		t.Fatal("skipping other tests")
 	}
 	check(t, "all", "new.txt", "old.txt", "slashslash4.txt", "x386.txt")
+	check(t, "allnosplit", "-split", "", "new.txt", "old.txt", "slashslash4.txt", "x386.txt")
 	check(t, "oldnew", "old.txt", "new.txt")
 	check(t, "oldnewgeo", "-geomean", "old.txt", "new.txt")
 	check(t, "new4", "new.txt", "slashslash4.txt")
 	check(t, "oldnewhtml", "-html", "old.txt", "new.txt")
 	check(t, "oldnew4html", "-html", "old.txt", "new.txt", "slashslash4.txt")
 	check(t, "oldnewttest", "-delta-test=ttest", "old.txt", "new.txt")
+	check(t, "packagesold", "packagesold.txt")
+	check(t, "packages", "packagesold.txt", "packagesnew.txt")
 }
 
 func check(t *testing.T, name string, files ...string) {
@@ -59,6 +63,7 @@
 		*flagGeomean = false
 		*flagHTML = false
 		*flagDeltaTest = "utest"
+		*flagSplit = flag.Lookup("split").DefValue
 
 		main()
 
diff --git a/cmd/benchstat/testdata/all.golden b/cmd/benchstat/testdata/all.golden
index eff91db..2ce36ba 100644
--- a/cmd/benchstat/testdata/all.golden
+++ b/cmd/benchstat/testdata/all.golden
@@ -1,75 +1,151 @@
 name \ time/op                             new.txt        old.txt        slashslash4.txt  x386.txt
-CRC32/poly=IEEE/size=15/align=0-8            44.5ns ± 3%    46.9ns ± 8%                     62.4ns ± 9%
-CRC32/poly=IEEE/size=15/align=1-8            44.5ns ± 4%    44.7ns ± 5%                     63.5ns ± 8%
-CRC32/poly=IEEE/size=40/align=0-8            42.5ns ± 6%    41.0ns ± 1%      42.1ns ± 3%    57.4ns ± 3%
-CRC32/poly=IEEE/size=40/align=1-8            42.0ns ± 3%    41.1ns ± 1%      41.7ns ± 5%    57.3ns ± 3%
-CRC32/poly=IEEE/size=512/align=0-8           57.1ns ± 3%   238.0ns ± 5%                    332.3ns ± 2%
-CRC32/poly=IEEE/size=512/align=1-8           57.2ns ± 3%   235.5ns ± 3%                    335.3ns ± 3%
-CRC32/poly=IEEE/size=1kB/align=0-8           94.1ns ± 2%   452.5ns ± 4%                    626.3ns ± 2%
-CRC32/poly=IEEE/size=1kB/align=1-8           93.3ns ± 2%   443.6ns ± 2%                    635.3ns ± 6%
-CRC32/poly=IEEE/size=4kB/align=0-8            298ns ± 1%    1740ns ± 8%      1682ns ± 2%    2457ns ± 7%
-CRC32/poly=IEEE/size=4kB/align=1-8            299ns ± 3%    1764ns ± 6%      1690ns ± 4%    2434ns ± 5%
-CRC32/poly=IEEE/size=32kB/align=0-8          2.16µs ± 3%   14.95µs ± 7%                    19.15µs ± 4%
-CRC32/poly=IEEE/size=32kB/align=1-8          2.18µs ± 3%   14.19µs ± 7%                    19.42µs ± 5%
-CRC32/poly=Castagnoli/size=15/align=0-8      16.3ns ± 2%    16.4ns ± 3%                     59.4ns ± 1%
-CRC32/poly=Castagnoli/size=15/align=1-8      17.3ns ± 2%    17.2ns ± 2%                     59.4ns ± 2%
-CRC32/poly=Castagnoli/size=40/align=0-8      17.5ns ± 4%    17.4ns ± 2%      18.6ns ±11%    59.7ns ± 6%
-CRC32/poly=Castagnoli/size=40/align=1-8      19.4ns ± 2%    19.7ns ± 3%      19.6ns ± 2%    58.1ns ± 7%
-CRC32/poly=Castagnoli/size=512/align=0-8     40.1ns ± 4%    40.2ns ± 2%                    350.8ns ± 5%
-CRC32/poly=Castagnoli/size=512/align=1-8     41.9ns ± 2%    42.1ns ± 3%                    349.8ns ± 7%
-CRC32/poly=Castagnoli/size=1kB/align=0-8     66.2ns ± 1%    65.5ns ± 1%                    656.7ns ± 4%
-CRC32/poly=Castagnoli/size=1kB/align=1-8     68.5ns ± 2%    70.1ns ± 6%                    656.8ns ± 6%
-CRC32/poly=Castagnoli/size=4kB/align=0-8      159ns ± 3%     163ns ± 5%       161ns ± 8%    2452ns ± 4%
-CRC32/poly=Castagnoli/size=4kB/align=1-8      162ns ± 3%     169ns ± 6%       170ns ± 8%    2448ns ± 5%
-CRC32/poly=Castagnoli/size=32kB/align=0-8    1.21µs ± 3%    1.22µs ± 4%                    20.53µs ± 5%
-CRC32/poly=Castagnoli/size=32kB/align=1-8    1.22µs ± 4%    1.26µs ± 3%                    20.18µs ± 9%
-CRC32/poly=Koopman/size=15/align=0-8         35.6ns ± 3%    36.5ns ±11%                     58.2ns ± 4%
-CRC32/poly=Koopman/size=15/align=1-8         35.5ns ± 1%    35.1ns ± 5%                     56.4ns ± 7%
-CRC32/poly=Koopman/size=40/align=0-8         87.6ns ± 2%    91.6ns ± 9%      93.8ns ±13%   142.3ns ± 8%
-CRC32/poly=Koopman/size=40/align=1-8         88.0ns ± 3%    91.1ns ± 6%      86.9ns ± 3%   136.2ns ± 5%
-CRC32/poly=Koopman/size=512/align=0-8        1.08µs ± 3%    1.13µs ± 5%                     1.68µs ± 5%
-CRC32/poly=Koopman/size=512/align=1-8        1.17µs ± 8%    1.13µs ± 6%                     1.64µs ± 4%
-CRC32/poly=Koopman/size=1kB/align=0-8        2.34µs ± 4%    2.24µs ± 6%                     3.31µs ± 4%
-CRC32/poly=Koopman/size=1kB/align=1-8        2.36µs ± 5%    2.15µs ± 2%                     3.28µs ± 3%
-CRC32/poly=Koopman/size=4kB/align=0-8        9.00µs ± 6%    9.03µs ± 6%      9.08µs ± 8%   13.32µs ± 3%
-CRC32/poly=Koopman/size=4kB/align=1-8        9.05µs ±12%    8.94µs ±10%      9.46µs ± 8%   13.16µs ± 3%
-CRC32/poly=Koopman/size=32kB/align=0-8       72.9µs ± 4%    72.4µs ± 9%                    106.9µs ± 6%
-CRC32/poly=Koopman/size=32kB/align=1-8       74.3µs ± 3%    69.6µs ± 3%                    106.1µs ± 4%
+pkg:hash/crc32 goos:darwin goarch:amd64
+CRC32/poly=IEEE/size=15/align=0-8            44.5ns ± 3%    46.9ns ± 8%
+CRC32/poly=IEEE/size=15/align=1-8            44.5ns ± 4%    44.7ns ± 5%
+CRC32/poly=IEEE/size=40/align=0-8            42.5ns ± 6%    41.0ns ± 1%      42.1ns ± 3%
+CRC32/poly=IEEE/size=40/align=1-8            42.0ns ± 3%    41.1ns ± 1%      41.7ns ± 5%
+CRC32/poly=IEEE/size=512/align=0-8           57.1ns ± 3%   238.0ns ± 5%
+CRC32/poly=IEEE/size=512/align=1-8           57.2ns ± 3%   235.5ns ± 3%
+CRC32/poly=IEEE/size=1kB/align=0-8           94.1ns ± 2%   452.5ns ± 4%
+CRC32/poly=IEEE/size=1kB/align=1-8           93.3ns ± 2%   443.6ns ± 2%
+CRC32/poly=IEEE/size=4kB/align=0-8            298ns ± 1%    1740ns ± 8%      1682ns ± 2%
+CRC32/poly=IEEE/size=4kB/align=1-8            299ns ± 3%    1764ns ± 6%      1690ns ± 4%
+CRC32/poly=IEEE/size=32kB/align=0-8          2.16µs ± 3%   14.95µs ± 7%
+CRC32/poly=IEEE/size=32kB/align=1-8          2.18µs ± 3%   14.19µs ± 7%
+CRC32/poly=Castagnoli/size=15/align=0-8      16.3ns ± 2%    16.4ns ± 3%
+CRC32/poly=Castagnoli/size=15/align=1-8      17.3ns ± 2%    17.2ns ± 2%
+CRC32/poly=Castagnoli/size=40/align=0-8      17.5ns ± 4%    17.4ns ± 2%      18.6ns ±11%
+CRC32/poly=Castagnoli/size=40/align=1-8      19.4ns ± 2%    19.7ns ± 3%      19.6ns ± 2%
+CRC32/poly=Castagnoli/size=512/align=0-8     40.1ns ± 4%    40.2ns ± 2%
+CRC32/poly=Castagnoli/size=512/align=1-8     41.9ns ± 2%    42.1ns ± 3%
+CRC32/poly=Castagnoli/size=1kB/align=0-8     66.2ns ± 1%    65.5ns ± 1%
+CRC32/poly=Castagnoli/size=1kB/align=1-8     68.5ns ± 2%    70.1ns ± 6%
+CRC32/poly=Castagnoli/size=4kB/align=0-8      159ns ± 3%     163ns ± 5%       161ns ± 8%
+CRC32/poly=Castagnoli/size=4kB/align=1-8      162ns ± 3%     169ns ± 6%       170ns ± 8%
+CRC32/poly=Castagnoli/size=32kB/align=0-8    1.21µs ± 3%    1.22µs ± 4%
+CRC32/poly=Castagnoli/size=32kB/align=1-8    1.22µs ± 4%    1.26µs ± 3%
+CRC32/poly=Koopman/size=15/align=0-8         35.6ns ± 3%    36.5ns ±11%
+CRC32/poly=Koopman/size=15/align=1-8         35.5ns ± 1%    35.1ns ± 5%
+CRC32/poly=Koopman/size=40/align=0-8         87.6ns ± 2%    91.6ns ± 9%      93.8ns ±13%
+CRC32/poly=Koopman/size=40/align=1-8         88.0ns ± 3%    91.1ns ± 6%      86.9ns ± 3%
+CRC32/poly=Koopman/size=512/align=0-8        1.08µs ± 3%    1.13µs ± 5%
+CRC32/poly=Koopman/size=512/align=1-8        1.17µs ± 8%    1.13µs ± 6%
+CRC32/poly=Koopman/size=1kB/align=0-8        2.34µs ± 4%    2.24µs ± 6%
+CRC32/poly=Koopman/size=1kB/align=1-8        2.36µs ± 5%    2.15µs ± 2%
+CRC32/poly=Koopman/size=4kB/align=0-8        9.00µs ± 6%    9.03µs ± 6%      9.08µs ± 8%
+CRC32/poly=Koopman/size=4kB/align=1-8        9.05µs ±12%    8.94µs ±10%      9.46µs ± 8%
+CRC32/poly=Koopman/size=32kB/align=0-8       72.9µs ± 4%    72.4µs ± 9%
+CRC32/poly=Koopman/size=32kB/align=1-8       74.3µs ± 3%    69.6µs ± 3%
+pkg:hash/crc32 goos:darwin goarch:386
+CRC32/poly=IEEE/size=15/align=0-8                                                           62.4ns ± 9%
+CRC32/poly=IEEE/size=15/align=1-8                                                           63.5ns ± 8%
+CRC32/poly=IEEE/size=40/align=0-8                                                           57.4ns ± 3%
+CRC32/poly=IEEE/size=40/align=1-8                                                           57.3ns ± 3%
+CRC32/poly=IEEE/size=512/align=0-8                                                           332ns ± 2%
+CRC32/poly=IEEE/size=512/align=1-8                                                           335ns ± 3%
+CRC32/poly=IEEE/size=1kB/align=0-8                                                           626ns ± 2%
+CRC32/poly=IEEE/size=1kB/align=1-8                                                           635ns ± 6%
+CRC32/poly=IEEE/size=4kB/align=0-8                                                          2.46µs ± 7%
+CRC32/poly=IEEE/size=4kB/align=1-8                                                          2.43µs ± 5%
+CRC32/poly=IEEE/size=32kB/align=0-8                                                         19.1µs ± 4%
+CRC32/poly=IEEE/size=32kB/align=1-8                                                         19.4µs ± 5%
+CRC32/poly=Castagnoli/size=15/align=0-8                                                     59.4ns ± 1%
+CRC32/poly=Castagnoli/size=15/align=1-8                                                     59.4ns ± 2%
+CRC32/poly=Castagnoli/size=40/align=0-8                                                     59.7ns ± 6%
+CRC32/poly=Castagnoli/size=40/align=1-8                                                     58.1ns ± 7%
+CRC32/poly=Castagnoli/size=512/align=0-8                                                     351ns ± 5%
+CRC32/poly=Castagnoli/size=512/align=1-8                                                     350ns ± 7%
+CRC32/poly=Castagnoli/size=1kB/align=0-8                                                     657ns ± 4%
+CRC32/poly=Castagnoli/size=1kB/align=1-8                                                     657ns ± 6%
+CRC32/poly=Castagnoli/size=4kB/align=0-8                                                    2.45µs ± 4%
+CRC32/poly=Castagnoli/size=4kB/align=1-8                                                    2.45µs ± 5%
+CRC32/poly=Castagnoli/size=32kB/align=0-8                                                   20.5µs ± 5%
+CRC32/poly=Castagnoli/size=32kB/align=1-8                                                   20.2µs ± 9%
+CRC32/poly=Koopman/size=15/align=0-8                                                        58.2ns ± 4%
+CRC32/poly=Koopman/size=15/align=1-8                                                        56.4ns ± 7%
+CRC32/poly=Koopman/size=40/align=0-8                                                         142ns ± 8%
+CRC32/poly=Koopman/size=40/align=1-8                                                         136ns ± 5%
+CRC32/poly=Koopman/size=512/align=0-8                                                       1.68µs ± 5%
+CRC32/poly=Koopman/size=512/align=1-8                                                       1.64µs ± 4%
+CRC32/poly=Koopman/size=1kB/align=0-8                                                       3.31µs ± 4%
+CRC32/poly=Koopman/size=1kB/align=1-8                                                       3.28µs ± 3%
+CRC32/poly=Koopman/size=4kB/align=0-8                                                       13.3µs ± 3%
+CRC32/poly=Koopman/size=4kB/align=1-8                                                       13.2µs ± 3%
+CRC32/poly=Koopman/size=32kB/align=0-8                                                       107µs ± 6%
+CRC32/poly=Koopman/size=32kB/align=1-8                                                       106µs ± 4%
 
 name \ speed                               new.txt        old.txt        slashslash4.txt  x386.txt
-CRC32/poly=IEEE/size=15/align=0-8           337MB/s ± 3%   321MB/s ± 8%                    241MB/s ± 8%
-CRC32/poly=IEEE/size=15/align=1-8           337MB/s ± 4%   336MB/s ± 4%                    237MB/s ± 8%
-CRC32/poly=IEEE/size=40/align=0-8           942MB/s ± 5%   975MB/s ± 1%     951MB/s ± 3%   696MB/s ± 3%
-CRC32/poly=IEEE/size=40/align=1-8           952MB/s ± 3%   974MB/s ± 1%     960MB/s ± 4%   698MB/s ± 3%
-CRC32/poly=IEEE/size=512/align=0-8         8.97GB/s ± 3%  2.15GB/s ± 4%                   1.54GB/s ± 2%
-CRC32/poly=IEEE/size=512/align=1-8         8.96GB/s ± 3%  2.17GB/s ± 3%                   1.52GB/s ± 3%
-CRC32/poly=IEEE/size=1kB/align=0-8         10.9GB/s ± 2%   2.3GB/s ± 4%                    1.6GB/s ± 2%
-CRC32/poly=IEEE/size=1kB/align=1-8         11.0GB/s ± 2%   2.3GB/s ± 2%                    1.6GB/s ± 6%
-CRC32/poly=IEEE/size=4kB/align=0-8         13.7GB/s ± 1%   2.4GB/s ± 7%     2.4GB/s ± 2%   1.7GB/s ± 7%
-CRC32/poly=IEEE/size=4kB/align=1-8         13.7GB/s ± 3%   2.3GB/s ± 6%     2.4GB/s ± 4%   1.7GB/s ± 5%
-CRC32/poly=IEEE/size=32kB/align=0-8        15.2GB/s ± 3%   2.2GB/s ± 7%                    1.7GB/s ± 4%
-CRC32/poly=IEEE/size=32kB/align=1-8        15.0GB/s ± 3%   2.3GB/s ± 8%                    1.7GB/s ± 5%
-CRC32/poly=Castagnoli/size=15/align=0-8     920MB/s ± 2%   916MB/s ± 2%                    253MB/s ± 1%
-CRC32/poly=Castagnoli/size=15/align=1-8     867MB/s ± 2%   870MB/s ± 2%                    253MB/s ± 2%
-CRC32/poly=Castagnoli/size=40/align=0-8    2.28GB/s ± 4%  2.30GB/s ± 2%    2.16GB/s ±11%  0.67GB/s ± 6%
-CRC32/poly=Castagnoli/size=40/align=1-8    2.06GB/s ± 2%  2.03GB/s ± 3%    2.04GB/s ± 2%  0.69GB/s ± 6%
-CRC32/poly=Castagnoli/size=512/align=0-8   12.8GB/s ± 4%  12.7GB/s ± 2%                    1.5GB/s ± 5%
-CRC32/poly=Castagnoli/size=512/align=1-8   12.2GB/s ± 1%  12.1GB/s ± 3%                    1.5GB/s ± 7%
-CRC32/poly=Castagnoli/size=1kB/align=0-8   15.5GB/s ± 1%  15.6GB/s ± 1%                    1.6GB/s ± 4%
-CRC32/poly=Castagnoli/size=1kB/align=1-8   15.0GB/s ± 2%  14.6GB/s ± 6%                    1.6GB/s ± 5%
-CRC32/poly=Castagnoli/size=4kB/align=0-8   25.7GB/s ± 3%  25.1GB/s ± 5%    25.4GB/s ± 7%   1.7GB/s ± 4%
-CRC32/poly=Castagnoli/size=4kB/align=1-8   25.3GB/s ± 3%  24.1GB/s ± 6%    24.1GB/s ± 8%   1.7GB/s ± 5%
-CRC32/poly=Castagnoli/size=32kB/align=0-8  26.8GB/s ± 5%  26.9GB/s ± 4%                    1.6GB/s ± 5%
-CRC32/poly=Castagnoli/size=32kB/align=1-8  26.8GB/s ± 4%  25.9GB/s ± 3%                    1.6GB/s ± 8%
-CRC32/poly=Koopman/size=15/align=0-8        421MB/s ± 3%   412MB/s ±10%                    258MB/s ± 5%
-CRC32/poly=Koopman/size=15/align=1-8        422MB/s ± 1%   427MB/s ± 5%                    266MB/s ± 6%
-CRC32/poly=Koopman/size=40/align=0-8        456MB/s ± 2%   437MB/s ± 9%     428MB/s ±12%   281MB/s ± 7%
-CRC32/poly=Koopman/size=40/align=1-8        455MB/s ± 3%   440MB/s ± 6%     461MB/s ± 3%   290MB/s ± 8%
-CRC32/poly=Koopman/size=512/align=0-8       476MB/s ± 3%   453MB/s ± 5%                    304MB/s ± 5%
-CRC32/poly=Koopman/size=512/align=1-8       440MB/s ± 8%   455MB/s ± 6%                    312MB/s ± 3%
-CRC32/poly=Koopman/size=1kB/align=0-8       438MB/s ± 4%   452MB/s ± 9%                    310MB/s ± 4%
-CRC32/poly=Koopman/size=1kB/align=1-8       434MB/s ± 5%   477MB/s ± 2%                    312MB/s ± 3%
-CRC32/poly=Koopman/size=4kB/align=0-8       455MB/s ± 6%   454MB/s ± 5%     452MB/s ± 8%   308MB/s ± 3%
-CRC32/poly=Koopman/size=4kB/align=1-8       455MB/s ±11%   459MB/s ± 9%     434MB/s ± 9%   311MB/s ± 3%
-CRC32/poly=Koopman/size=32kB/align=0-8      450MB/s ± 4%   453MB/s ± 8%                    307MB/s ± 6%
-CRC32/poly=Koopman/size=32kB/align=1-8      441MB/s ± 3%   471MB/s ± 3%                    309MB/s ± 4%
+pkg:hash/crc32 goos:darwin goarch:amd64
+CRC32/poly=IEEE/size=15/align=0-8           337MB/s ± 3%   321MB/s ± 8%
+CRC32/poly=IEEE/size=15/align=1-8           337MB/s ± 4%   336MB/s ± 4%
+CRC32/poly=IEEE/size=40/align=0-8           942MB/s ± 5%   975MB/s ± 1%     951MB/s ± 3%
+CRC32/poly=IEEE/size=40/align=1-8           952MB/s ± 3%   974MB/s ± 1%     960MB/s ± 4%
+CRC32/poly=IEEE/size=512/align=0-8         8.97GB/s ± 3%  2.15GB/s ± 4%
+CRC32/poly=IEEE/size=512/align=1-8         8.96GB/s ± 3%  2.17GB/s ± 3%
+CRC32/poly=IEEE/size=1kB/align=0-8         10.9GB/s ± 2%   2.3GB/s ± 4%
+CRC32/poly=IEEE/size=1kB/align=1-8         11.0GB/s ± 2%   2.3GB/s ± 2%
+CRC32/poly=IEEE/size=4kB/align=0-8         13.7GB/s ± 1%   2.4GB/s ± 7%     2.4GB/s ± 2%
+CRC32/poly=IEEE/size=4kB/align=1-8         13.7GB/s ± 3%   2.3GB/s ± 6%     2.4GB/s ± 4%
+CRC32/poly=IEEE/size=32kB/align=0-8        15.2GB/s ± 3%   2.2GB/s ± 7%
+CRC32/poly=IEEE/size=32kB/align=1-8        15.0GB/s ± 3%   2.3GB/s ± 8%
+CRC32/poly=Castagnoli/size=15/align=0-8     920MB/s ± 2%   916MB/s ± 2%
+CRC32/poly=Castagnoli/size=15/align=1-8     867MB/s ± 2%   870MB/s ± 2%
+CRC32/poly=Castagnoli/size=40/align=0-8    2.28GB/s ± 4%  2.30GB/s ± 2%    2.16GB/s ±11%
+CRC32/poly=Castagnoli/size=40/align=1-8    2.06GB/s ± 2%  2.03GB/s ± 3%    2.04GB/s ± 2%
+CRC32/poly=Castagnoli/size=512/align=0-8   12.8GB/s ± 4%  12.7GB/s ± 2%
+CRC32/poly=Castagnoli/size=512/align=1-8   12.2GB/s ± 1%  12.1GB/s ± 3%
+CRC32/poly=Castagnoli/size=1kB/align=0-8   15.5GB/s ± 1%  15.6GB/s ± 1%
+CRC32/poly=Castagnoli/size=1kB/align=1-8   15.0GB/s ± 2%  14.6GB/s ± 6%
+CRC32/poly=Castagnoli/size=4kB/align=0-8   25.7GB/s ± 3%  25.1GB/s ± 5%    25.4GB/s ± 7%
+CRC32/poly=Castagnoli/size=4kB/align=1-8   25.3GB/s ± 3%  24.1GB/s ± 6%    24.1GB/s ± 8%
+CRC32/poly=Castagnoli/size=32kB/align=0-8  26.8GB/s ± 5%  26.9GB/s ± 4%
+CRC32/poly=Castagnoli/size=32kB/align=1-8  26.8GB/s ± 4%  25.9GB/s ± 3%
+CRC32/poly=Koopman/size=15/align=0-8        421MB/s ± 3%   412MB/s ±10%
+CRC32/poly=Koopman/size=15/align=1-8        422MB/s ± 1%   427MB/s ± 5%
+CRC32/poly=Koopman/size=40/align=0-8        456MB/s ± 2%   437MB/s ± 9%     428MB/s ±12%
+CRC32/poly=Koopman/size=40/align=1-8        455MB/s ± 3%   440MB/s ± 6%     461MB/s ± 3%
+CRC32/poly=Koopman/size=512/align=0-8       476MB/s ± 3%   453MB/s ± 5%
+CRC32/poly=Koopman/size=512/align=1-8       440MB/s ± 8%   455MB/s ± 6%
+CRC32/poly=Koopman/size=1kB/align=0-8       438MB/s ± 4%   452MB/s ± 9%
+CRC32/poly=Koopman/size=1kB/align=1-8       434MB/s ± 5%   477MB/s ± 2%
+CRC32/poly=Koopman/size=4kB/align=0-8       455MB/s ± 6%   454MB/s ± 5%     452MB/s ± 8%
+CRC32/poly=Koopman/size=4kB/align=1-8       455MB/s ±11%   459MB/s ± 9%     434MB/s ± 9%
+CRC32/poly=Koopman/size=32kB/align=0-8      450MB/s ± 4%   453MB/s ± 8%
+CRC32/poly=Koopman/size=32kB/align=1-8      441MB/s ± 3%   471MB/s ± 3%
+pkg:hash/crc32 goos:darwin goarch:386
+CRC32/poly=IEEE/size=15/align=0-8                                                          241MB/s ± 8%
+CRC32/poly=IEEE/size=15/align=1-8                                                          237MB/s ± 8%
+CRC32/poly=IEEE/size=40/align=0-8                                                          696MB/s ± 3%
+CRC32/poly=IEEE/size=40/align=1-8                                                          698MB/s ± 3%
+CRC32/poly=IEEE/size=512/align=0-8                                                        1.54GB/s ± 2%
+CRC32/poly=IEEE/size=512/align=1-8                                                        1.52GB/s ± 3%
+CRC32/poly=IEEE/size=1kB/align=0-8                                                        1.63GB/s ± 2%
+CRC32/poly=IEEE/size=1kB/align=1-8                                                        1.61GB/s ± 6%
+CRC32/poly=IEEE/size=4kB/align=0-8                                                        1.67GB/s ± 7%
+CRC32/poly=IEEE/size=4kB/align=1-8                                                        1.68GB/s ± 5%
+CRC32/poly=IEEE/size=32kB/align=0-8                                                       1.71GB/s ± 4%
+CRC32/poly=IEEE/size=32kB/align=1-8                                                       1.69GB/s ± 5%
+CRC32/poly=Castagnoli/size=15/align=0-8                                                    253MB/s ± 1%
+CRC32/poly=Castagnoli/size=15/align=1-8                                                    253MB/s ± 2%
+CRC32/poly=Castagnoli/size=40/align=0-8                                                    671MB/s ± 6%
+CRC32/poly=Castagnoli/size=40/align=1-8                                                    689MB/s ± 6%
+CRC32/poly=Castagnoli/size=512/align=0-8                                                  1.46GB/s ± 5%
+CRC32/poly=Castagnoli/size=512/align=1-8                                                  1.46GB/s ± 7%
+CRC32/poly=Castagnoli/size=1kB/align=0-8                                                  1.56GB/s ± 4%
+CRC32/poly=Castagnoli/size=1kB/align=1-8                                                  1.56GB/s ± 5%
+CRC32/poly=Castagnoli/size=4kB/align=0-8                                                  1.67GB/s ± 4%
+CRC32/poly=Castagnoli/size=4kB/align=1-8                                                  1.67GB/s ± 5%
+CRC32/poly=Castagnoli/size=32kB/align=0-8                                                 1.60GB/s ± 5%
+CRC32/poly=Castagnoli/size=32kB/align=1-8                                                 1.63GB/s ± 8%
+CRC32/poly=Koopman/size=15/align=0-8                                                       258MB/s ± 5%
+CRC32/poly=Koopman/size=15/align=1-8                                                       266MB/s ± 6%
+CRC32/poly=Koopman/size=40/align=0-8                                                       281MB/s ± 7%
+CRC32/poly=Koopman/size=40/align=1-8                                                       290MB/s ± 8%
+CRC32/poly=Koopman/size=512/align=0-8                                                      304MB/s ± 5%
+CRC32/poly=Koopman/size=512/align=1-8                                                      312MB/s ± 3%
+CRC32/poly=Koopman/size=1kB/align=0-8                                                      310MB/s ± 4%
+CRC32/poly=Koopman/size=1kB/align=1-8                                                      312MB/s ± 3%
+CRC32/poly=Koopman/size=4kB/align=0-8                                                      308MB/s ± 3%
+CRC32/poly=Koopman/size=4kB/align=1-8                                                      311MB/s ± 3%
+CRC32/poly=Koopman/size=32kB/align=0-8                                                     307MB/s ± 6%
+CRC32/poly=Koopman/size=32kB/align=1-8                                                     309MB/s ± 4%
diff --git a/cmd/benchstat/testdata/allnosplit.golden b/cmd/benchstat/testdata/allnosplit.golden
new file mode 100644
index 0000000..eff91db
--- /dev/null
+++ b/cmd/benchstat/testdata/allnosplit.golden
@@ -0,0 +1,75 @@
+name \ time/op                             new.txt        old.txt        slashslash4.txt  x386.txt
+CRC32/poly=IEEE/size=15/align=0-8            44.5ns ± 3%    46.9ns ± 8%                     62.4ns ± 9%
+CRC32/poly=IEEE/size=15/align=1-8            44.5ns ± 4%    44.7ns ± 5%                     63.5ns ± 8%
+CRC32/poly=IEEE/size=40/align=0-8            42.5ns ± 6%    41.0ns ± 1%      42.1ns ± 3%    57.4ns ± 3%
+CRC32/poly=IEEE/size=40/align=1-8            42.0ns ± 3%    41.1ns ± 1%      41.7ns ± 5%    57.3ns ± 3%
+CRC32/poly=IEEE/size=512/align=0-8           57.1ns ± 3%   238.0ns ± 5%                    332.3ns ± 2%
+CRC32/poly=IEEE/size=512/align=1-8           57.2ns ± 3%   235.5ns ± 3%                    335.3ns ± 3%
+CRC32/poly=IEEE/size=1kB/align=0-8           94.1ns ± 2%   452.5ns ± 4%                    626.3ns ± 2%
+CRC32/poly=IEEE/size=1kB/align=1-8           93.3ns ± 2%   443.6ns ± 2%                    635.3ns ± 6%
+CRC32/poly=IEEE/size=4kB/align=0-8            298ns ± 1%    1740ns ± 8%      1682ns ± 2%    2457ns ± 7%
+CRC32/poly=IEEE/size=4kB/align=1-8            299ns ± 3%    1764ns ± 6%      1690ns ± 4%    2434ns ± 5%
+CRC32/poly=IEEE/size=32kB/align=0-8          2.16µs ± 3%   14.95µs ± 7%                    19.15µs ± 4%
+CRC32/poly=IEEE/size=32kB/align=1-8          2.18µs ± 3%   14.19µs ± 7%                    19.42µs ± 5%
+CRC32/poly=Castagnoli/size=15/align=0-8      16.3ns ± 2%    16.4ns ± 3%                     59.4ns ± 1%
+CRC32/poly=Castagnoli/size=15/align=1-8      17.3ns ± 2%    17.2ns ± 2%                     59.4ns ± 2%
+CRC32/poly=Castagnoli/size=40/align=0-8      17.5ns ± 4%    17.4ns ± 2%      18.6ns ±11%    59.7ns ± 6%
+CRC32/poly=Castagnoli/size=40/align=1-8      19.4ns ± 2%    19.7ns ± 3%      19.6ns ± 2%    58.1ns ± 7%
+CRC32/poly=Castagnoli/size=512/align=0-8     40.1ns ± 4%    40.2ns ± 2%                    350.8ns ± 5%
+CRC32/poly=Castagnoli/size=512/align=1-8     41.9ns ± 2%    42.1ns ± 3%                    349.8ns ± 7%
+CRC32/poly=Castagnoli/size=1kB/align=0-8     66.2ns ± 1%    65.5ns ± 1%                    656.7ns ± 4%
+CRC32/poly=Castagnoli/size=1kB/align=1-8     68.5ns ± 2%    70.1ns ± 6%                    656.8ns ± 6%
+CRC32/poly=Castagnoli/size=4kB/align=0-8      159ns ± 3%     163ns ± 5%       161ns ± 8%    2452ns ± 4%
+CRC32/poly=Castagnoli/size=4kB/align=1-8      162ns ± 3%     169ns ± 6%       170ns ± 8%    2448ns ± 5%
+CRC32/poly=Castagnoli/size=32kB/align=0-8    1.21µs ± 3%    1.22µs ± 4%                    20.53µs ± 5%
+CRC32/poly=Castagnoli/size=32kB/align=1-8    1.22µs ± 4%    1.26µs ± 3%                    20.18µs ± 9%
+CRC32/poly=Koopman/size=15/align=0-8         35.6ns ± 3%    36.5ns ±11%                     58.2ns ± 4%
+CRC32/poly=Koopman/size=15/align=1-8         35.5ns ± 1%    35.1ns ± 5%                     56.4ns ± 7%
+CRC32/poly=Koopman/size=40/align=0-8         87.6ns ± 2%    91.6ns ± 9%      93.8ns ±13%   142.3ns ± 8%
+CRC32/poly=Koopman/size=40/align=1-8         88.0ns ± 3%    91.1ns ± 6%      86.9ns ± 3%   136.2ns ± 5%
+CRC32/poly=Koopman/size=512/align=0-8        1.08µs ± 3%    1.13µs ± 5%                     1.68µs ± 5%
+CRC32/poly=Koopman/size=512/align=1-8        1.17µs ± 8%    1.13µs ± 6%                     1.64µs ± 4%
+CRC32/poly=Koopman/size=1kB/align=0-8        2.34µs ± 4%    2.24µs ± 6%                     3.31µs ± 4%
+CRC32/poly=Koopman/size=1kB/align=1-8        2.36µs ± 5%    2.15µs ± 2%                     3.28µs ± 3%
+CRC32/poly=Koopman/size=4kB/align=0-8        9.00µs ± 6%    9.03µs ± 6%      9.08µs ± 8%   13.32µs ± 3%
+CRC32/poly=Koopman/size=4kB/align=1-8        9.05µs ±12%    8.94µs ±10%      9.46µs ± 8%   13.16µs ± 3%
+CRC32/poly=Koopman/size=32kB/align=0-8       72.9µs ± 4%    72.4µs ± 9%                    106.9µs ± 6%
+CRC32/poly=Koopman/size=32kB/align=1-8       74.3µs ± 3%    69.6µs ± 3%                    106.1µs ± 4%
+
+name \ speed                               new.txt        old.txt        slashslash4.txt  x386.txt
+CRC32/poly=IEEE/size=15/align=0-8           337MB/s ± 3%   321MB/s ± 8%                    241MB/s ± 8%
+CRC32/poly=IEEE/size=15/align=1-8           337MB/s ± 4%   336MB/s ± 4%                    237MB/s ± 8%
+CRC32/poly=IEEE/size=40/align=0-8           942MB/s ± 5%   975MB/s ± 1%     951MB/s ± 3%   696MB/s ± 3%
+CRC32/poly=IEEE/size=40/align=1-8           952MB/s ± 3%   974MB/s ± 1%     960MB/s ± 4%   698MB/s ± 3%
+CRC32/poly=IEEE/size=512/align=0-8         8.97GB/s ± 3%  2.15GB/s ± 4%                   1.54GB/s ± 2%
+CRC32/poly=IEEE/size=512/align=1-8         8.96GB/s ± 3%  2.17GB/s ± 3%                   1.52GB/s ± 3%
+CRC32/poly=IEEE/size=1kB/align=0-8         10.9GB/s ± 2%   2.3GB/s ± 4%                    1.6GB/s ± 2%
+CRC32/poly=IEEE/size=1kB/align=1-8         11.0GB/s ± 2%   2.3GB/s ± 2%                    1.6GB/s ± 6%
+CRC32/poly=IEEE/size=4kB/align=0-8         13.7GB/s ± 1%   2.4GB/s ± 7%     2.4GB/s ± 2%   1.7GB/s ± 7%
+CRC32/poly=IEEE/size=4kB/align=1-8         13.7GB/s ± 3%   2.3GB/s ± 6%     2.4GB/s ± 4%   1.7GB/s ± 5%
+CRC32/poly=IEEE/size=32kB/align=0-8        15.2GB/s ± 3%   2.2GB/s ± 7%                    1.7GB/s ± 4%
+CRC32/poly=IEEE/size=32kB/align=1-8        15.0GB/s ± 3%   2.3GB/s ± 8%                    1.7GB/s ± 5%
+CRC32/poly=Castagnoli/size=15/align=0-8     920MB/s ± 2%   916MB/s ± 2%                    253MB/s ± 1%
+CRC32/poly=Castagnoli/size=15/align=1-8     867MB/s ± 2%   870MB/s ± 2%                    253MB/s ± 2%
+CRC32/poly=Castagnoli/size=40/align=0-8    2.28GB/s ± 4%  2.30GB/s ± 2%    2.16GB/s ±11%  0.67GB/s ± 6%
+CRC32/poly=Castagnoli/size=40/align=1-8    2.06GB/s ± 2%  2.03GB/s ± 3%    2.04GB/s ± 2%  0.69GB/s ± 6%
+CRC32/poly=Castagnoli/size=512/align=0-8   12.8GB/s ± 4%  12.7GB/s ± 2%                    1.5GB/s ± 5%
+CRC32/poly=Castagnoli/size=512/align=1-8   12.2GB/s ± 1%  12.1GB/s ± 3%                    1.5GB/s ± 7%
+CRC32/poly=Castagnoli/size=1kB/align=0-8   15.5GB/s ± 1%  15.6GB/s ± 1%                    1.6GB/s ± 4%
+CRC32/poly=Castagnoli/size=1kB/align=1-8   15.0GB/s ± 2%  14.6GB/s ± 6%                    1.6GB/s ± 5%
+CRC32/poly=Castagnoli/size=4kB/align=0-8   25.7GB/s ± 3%  25.1GB/s ± 5%    25.4GB/s ± 7%   1.7GB/s ± 4%
+CRC32/poly=Castagnoli/size=4kB/align=1-8   25.3GB/s ± 3%  24.1GB/s ± 6%    24.1GB/s ± 8%   1.7GB/s ± 5%
+CRC32/poly=Castagnoli/size=32kB/align=0-8  26.8GB/s ± 5%  26.9GB/s ± 4%                    1.6GB/s ± 5%
+CRC32/poly=Castagnoli/size=32kB/align=1-8  26.8GB/s ± 4%  25.9GB/s ± 3%                    1.6GB/s ± 8%
+CRC32/poly=Koopman/size=15/align=0-8        421MB/s ± 3%   412MB/s ±10%                    258MB/s ± 5%
+CRC32/poly=Koopman/size=15/align=1-8        422MB/s ± 1%   427MB/s ± 5%                    266MB/s ± 6%
+CRC32/poly=Koopman/size=40/align=0-8        456MB/s ± 2%   437MB/s ± 9%     428MB/s ±12%   281MB/s ± 7%
+CRC32/poly=Koopman/size=40/align=1-8        455MB/s ± 3%   440MB/s ± 6%     461MB/s ± 3%   290MB/s ± 8%
+CRC32/poly=Koopman/size=512/align=0-8       476MB/s ± 3%   453MB/s ± 5%                    304MB/s ± 5%
+CRC32/poly=Koopman/size=512/align=1-8       440MB/s ± 8%   455MB/s ± 6%                    312MB/s ± 3%
+CRC32/poly=Koopman/size=1kB/align=0-8       438MB/s ± 4%   452MB/s ± 9%                    310MB/s ± 4%
+CRC32/poly=Koopman/size=1kB/align=1-8       434MB/s ± 5%   477MB/s ± 2%                    312MB/s ± 3%
+CRC32/poly=Koopman/size=4kB/align=0-8       455MB/s ± 6%   454MB/s ± 5%     452MB/s ± 8%   308MB/s ± 3%
+CRC32/poly=Koopman/size=4kB/align=1-8       455MB/s ±11%   459MB/s ± 9%     434MB/s ± 9%   311MB/s ± 3%
+CRC32/poly=Koopman/size=32kB/align=0-8      450MB/s ± 4%   453MB/s ± 8%                    307MB/s ± 6%
+CRC32/poly=Koopman/size=32kB/align=1-8      441MB/s ± 3%   471MB/s ± 3%                    309MB/s ± 4%
diff --git a/cmd/benchstat/testdata/packages.golden b/cmd/benchstat/testdata/packages.golden
new file mode 100644
index 0000000..05cf0c7
--- /dev/null
+++ b/cmd/benchstat/testdata/packages.golden
@@ -0,0 +1,11 @@
+name        old time/op    new time/op    delta
+pkg:encoding/gob
+GobEncode     13.6ms ± 1%    11.8ms ± 1%  -13.31%  (p=0.016 n=4+5)
+pkg:encoding/json
+JSONEncode    32.1ms ± 1%    31.8ms ± 1%     ~     (p=0.286 n=4+5)
+
+name        old speed      new speed      delta
+pkg:encoding/gob
+GobEncode   56.4MB/s ± 1%  65.1MB/s ± 1%  +15.36%  (p=0.016 n=4+5)
+pkg:encoding/json
+JSONEncode  60.4MB/s ± 1%  61.1MB/s ± 2%     ~     (p=0.286 n=4+5)
diff --git a/cmd/benchstat/testdata/packagesnew.txt b/cmd/benchstat/testdata/packagesnew.txt
new file mode 100644
index 0000000..7732820
--- /dev/null
+++ b/cmd/benchstat/testdata/packagesnew.txt
@@ -0,0 +1,12 @@
+pkg: encoding/gob
+BenchmarkGobEncode   	 100	  11773189 ns/op	  65.19 MB/s
+BenchmarkGobEncode   	 100	  11942588 ns/op	  64.27 MB/s
+BenchmarkGobEncode   	 100	  11786159 ns/op	  65.12 MB/s
+BenchmarkGobEncode   	 100	  11628583 ns/op	  66.00 MB/s
+BenchmarkGobEncode   	 100	  11815924 ns/op	  64.96 MB/s
+pkg: encoding/json
+BenchmarkJSONEncode  	  50	  32036529 ns/op	  60.57 MB/s
+BenchmarkJSONEncode  	  50	  32156552 ns/op	  60.34 MB/s
+BenchmarkJSONEncode  	  50	  31288355 ns/op	  62.02 MB/s
+BenchmarkJSONEncode  	  50	  31559706 ns/op	  61.49 MB/s
+BenchmarkJSONEncode  	  50	  31765634 ns/op	  61.09 MB/s
diff --git a/cmd/benchstat/testdata/packagesold.golden b/cmd/benchstat/testdata/packagesold.golden
new file mode 100644
index 0000000..4c20063
--- /dev/null
+++ b/cmd/benchstat/testdata/packagesold.golden
@@ -0,0 +1,11 @@
+name        time/op
+pkg:encoding/gob
+GobEncode     13.6ms ± 1%
+pkg:encoding/json
+JSONEncode    32.1ms ± 1%
+
+name        speed
+pkg:encoding/gob
+GobEncode   56.4MB/s ± 1%
+pkg:encoding/json
+JSONEncode  60.4MB/s ± 1%
diff --git a/cmd/benchstat/testdata/packagesold.txt b/cmd/benchstat/testdata/packagesold.txt
new file mode 100644
index 0000000..add4b7a
--- /dev/null
+++ b/cmd/benchstat/testdata/packagesold.txt
@@ -0,0 +1,10 @@
+pkg: encoding/gob
+BenchmarkGobEncode   	100	  13552735 ns/op	  56.63 MB/s
+BenchmarkGobEncode   	100	  13553943 ns/op	  56.63 MB/s
+BenchmarkGobEncode   	100	  13606356 ns/op	  56.41 MB/s
+BenchmarkGobEncode   	100	  13683198 ns/op	  56.09 MB/s
+pkg: encoding/json
+BenchmarkJSONEncode  	 50	  32395067 ns/op	  59.90 MB/s
+BenchmarkJSONEncode  	 50	  32334214 ns/op	  60.01 MB/s
+BenchmarkJSONEncode  	 50	  31992891 ns/op	  60.65 MB/s
+BenchmarkJSONEncode  	 50	  31735022 ns/op	  61.15 MB/s