benchstat: add support for sorting benchstat output.

Change-Id: If63789fdbe901ca6894a2d00860bedab3160d757
Reviewed-on: https://go-review.googlesource.com/88915
Reviewed-by: Russ Cox <rsc@golang.org>
diff --git a/benchstat/data.go b/benchstat/data.go
index 1123532..50777e8 100644
--- a/benchstat/data.go
+++ b/benchstat/data.go
@@ -44,6 +44,12 @@
 	// SplitBy specifies the labels to split results by.
 	// By default, results will only be split by full name.
 	SplitBy []string
+
+	// SortBy specifies the function used to order the rows within each
+	// table of this collection.
+	// By default, rows are not specifically sorted and appear in the
+	// order in which they were read in.
+	SortBy SortFunc
 }
 
 // A Key identifies one metric (e.g., "ns/op", "B/op") from one
diff --git a/benchstat/sort.go b/benchstat/sort.go
new file mode 100644
index 0000000..ef68821
--- /dev/null
+++ b/benchstat/sort.go
@@ -0,0 +1,43 @@
+// Copyright 2018 The Go Authors.  All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package benchstat
+
+import (
+	"math"
+	"sort"
+)
+
+// A SortFunc reports whether row i of the given Table should sort before row j.
+type SortFunc func(*Table, int, int) bool
+
+// ByName is a SortFunc that orders rows by their Benchmark name, compared as strings.
+func ByName(t *Table, i, j int) bool {
+	return t.Rows[i].Benchmark < t.Rows[j].Benchmark
+}
+
+// ByDelta is a SortFunc that orders rows by a signed delta: the magnitude of
+// the numeric PctDelta (not the formatted Delta string) multiplied by Change.
+// Change indicates whether a given delta is "good" or "bad", so it supplies
+// the sign of the sort key.
+func ByDelta(t *Table, i, j int) bool {
+	a, b := t.Rows[i], t.Rows[j]
+	return math.Abs(a.PctDelta)*float64(a.Change) < math.Abs(b.PctDelta)*float64(b.Change)
+}
+
+// ByChange is a SortFunc that orders rows by the unprinted Change field, which
+// Row documents as +1 better, -1 worse, 0 unchanged; worse rows sort first.
+func ByChange(t *Table, i, j int) bool {
+	return t.Rows[i].Change < t.Rows[j].Change
+}
+
+// SortReverse returns a SortFunc that orders rows in the opposite direction of sortFunc.
+func SortReverse(sortFunc SortFunc) SortFunc {
+	return func(t *Table, i, j int) bool { return sortFunc(t, j, i) } // swap i and j: negating would call equal rows "less" both ways
+}
+
+// SortTable sorts the rows of t in place with sort.Slice, using sortFunc as the less function.
+func SortTable(t *Table, sortFunc SortFunc) {
+	sort.Slice(t.Rows, func(i, j int) bool { return sortFunc(t, i, j) })
+}
diff --git a/benchstat/sort_test.go b/benchstat/sort_test.go
new file mode 100644
index 0000000..618490c
--- /dev/null
+++ b/benchstat/sort_test.go
@@ -0,0 +1,132 @@
+// Copyright 2018 The Go Authors.  All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package benchstat
+
+import (
+	"io/ioutil"
+	"log"
+	"sort"
+	"testing"
+)
+
+var file1 = "../cmd/benchstat/testdata/old.txt" // first input; the only one read by TestSingleCollection
+var file2 = "../cmd/benchstat/testdata/new.txt" // second input; added after file1 in TestCompareCollection
+
+// extractRowBenchmark returns the Benchmark name of row; benchmarkSortTest
+// collects these names to check the row order.
+func extractRowBenchmark(row *Row) string { return row.Benchmark }
+// extractRowDelta returns the unformatted PctDelta of row; deltaSortTest
+// collects these values to check the row order.
+func extractRowDelta(row *Row) float64 { return row.PctDelta }
+// extractRowChange returns the Change field of row; changeSortTest
+// collects these values to check the row order.
+func extractRowChange(row *Row) int { return row.Change }
+
+// benchmarkSortTest checks that ByName, and SortReverse(ByName), leave sampleTable ordered by Benchmark name.
+func benchmarkSortTest(t *testing.T, sampleTable *Table) {
+	names := make([]string, len(sampleTable.Rows))
+	SortTable(sampleTable, ByName)
+	for i := range sampleTable.Rows {
+		names[i] = extractRowBenchmark(sampleTable.Rows[i])
+	}
+	t.Run("BenchSorted", func(t *testing.T) {
+		if !sort.StringsAreSorted(names) {
+			t.Error("Table not sorted by benchmarks")
+		}
+	})
+	SortTable(sampleTable, SortReverse(ByName))
+	for i := range sampleTable.Rows {
+		names[len(names)-1-i] = extractRowBenchmark(sampleTable.Rows[i])
+	}
+	t.Run("BenchSortReversed", func(t *testing.T) {
+		if !sort.StringsAreSorted(names) {
+			t.Error("Table not reverse sorted by benchmarks")
+		}
+	})
+}
+
+// deltaSortTest checks ByDelta and its reverse via raw PctDelta order. NOTE(review): ByDelta keys on |PctDelta|*Change; confirm both orders agree for the table passed in.
+func deltaSortTest(t *testing.T, sampleTable *Table) {
+	pcts := make([]float64, len(sampleTable.Rows))
+	SortTable(sampleTable, ByDelta)
+	for i := range sampleTable.Rows {
+		pcts[i] = extractRowDelta(sampleTable.Rows[i])
+	}
+	t.Run("DeltaSorted", func(t *testing.T) {
+		if !sort.Float64sAreSorted(pcts) {
+			t.Error("Table not sorted by deltas")
+		}
+	})
+	SortTable(sampleTable, SortReverse(ByDelta))
+	for i := range sampleTable.Rows {
+		pcts[len(pcts)-1-i] = extractRowDelta(sampleTable.Rows[i])
+	}
+	t.Run("DeltaSortReversed", func(t *testing.T) {
+		if !sort.Float64sAreSorted(pcts) {
+			t.Error("Table not reverse sorted by deltas")
+		}
+	})
+}
+
+// changeSortTest checks that ByChange, and SortReverse(ByChange), leave sampleTable ordered by the Change field.
+func changeSortTest(t *testing.T, sampleTable *Table) {
+	marks := make([]int, len(sampleTable.Rows))
+	SortTable(sampleTable, ByChange)
+	for i := range sampleTable.Rows {
+		marks[i] = extractRowChange(sampleTable.Rows[i])
+	}
+	t.Run("ChangeSorted", func(t *testing.T) {
+		if !sort.IntsAreSorted(marks) {
+			t.Error("Table not sorted by changes")
+		}
+	})
+	SortTable(sampleTable, SortReverse(ByChange))
+	for i := range sampleTable.Rows {
+		marks[len(marks)-1-i] = extractRowChange(sampleTable.Rows[i])
+	}
+	t.Run("ChangeSortReversed", func(t *testing.T) {
+		if !sort.IntsAreSorted(marks) {
+			t.Error("Table not reverse sorted by changes")
+		}
+	})
+}
+
+// TestCompareCollection runs the name, delta and change sort checks on the first table built from file1 and file2.
+func TestCompareCollection(t *testing.T) {
+	sampleCompareCollection := Collection{Alpha: 0.05, AddGeoMean: false, DeltaTest: UTest}
+	file1Data, err := ioutil.ReadFile(file1)
+	if err != nil {
+		t.Fatal(err) // fail this test through the framework; log.Fatal would exit the whole test binary
+	}
+	file2Data, err := ioutil.ReadFile(file2)
+	if err != nil {
+		t.Fatal(err)
+	}
+	sampleCompareCollection.AddConfig(file1, file1Data)
+	sampleCompareCollection.AddConfig(file2, file2Data)
+	sampleTable := sampleCompareCollection.Tables()[0] // data has time and speed tables; NOTE(review): index 0 is said to be the speed table, confirm
+	t.Run("BenchmarkSort", func(t *testing.T) {
+		benchmarkSortTest(t, sampleTable)
+	})
+	t.Run("DeltaSort", func(t *testing.T) {
+		deltaSortTest(t, sampleTable)
+	})
+	t.Run("ChangeSort", func(t *testing.T) {
+		changeSortTest(t, sampleTable)
+	})
+}
+
+func TestSingleCollection(t *testing.T) {
+	sampleCollection := Collection{Alpha: 0.05, AddGeoMean: false, DeltaTest: UTest}
+	file1Data, err1 := ioutil.ReadFile(file1)
+	if err1 != nil {
+		log.Fatal(err1) // NOTE(review): log.Fatal exits the test binary; t.Fatal(err1) would fail only this test, but may leave the "log" import unused
+	}
+	sampleCollection.AddConfig(file1, file1Data)
+	sampleTable := sampleCollection.Tables()[0] // only the first table is checked, and only with the name sort
+	t.Run("BenchmarkSort", func(t *testing.T) {
+		benchmarkSortTest(t, sampleTable)
+	})
+}
diff --git a/benchstat/table.go b/benchstat/table.go
index cd8a027..b90cfd7 100644
--- a/benchstat/table.go
+++ b/benchstat/table.go
@@ -26,6 +26,7 @@
 	Group     string     // group name
 	Scaler    Scaler     // formatter for stats means
 	Metrics   []*Metrics // columns of statistics
+	PctDelta  float64    // unformatted percent change
 	Delta     string     // formatted percent change
 	Note      string     // additional information
 	Change    int        // +1 better, -1 worse, 0 unchanged
@@ -89,6 +90,7 @@
 						continue
 					}
 					pval, testerr := deltaTest(old, new)
+					row.PctDelta = 0.00
 					row.Delta = "~"
 					if testerr == stats.ErrZeroVariance {
 						row.Note = "(zero variance)"
@@ -103,6 +105,7 @@
 							row.Delta = "0.00%"
 						} else {
 							pct := ((new.Mean / old.Mean) - 1.0) * 100.0
+							row.PctDelta = pct
 							row.Delta = fmt.Sprintf("%+.2f%%", pct)
 							if pct < 0 == (table.Metric != "speed") { // smaller is better, except speeds
 								row.Change = +1
@@ -121,6 +124,9 @@
 		}
 
 		if len(table.Rows) > 0 {
+			if c.SortBy != nil {
+				SortTable(table, c.SortBy)
+			}
 			if c.AddGeoMean {
 				addGeomean(c, table, key.Unit, table.OldNewDelta)
 			}
@@ -200,7 +206,9 @@
 		return
 	}
 	if delta {
-		row.Delta = fmt.Sprintf("%+.2f%%", ((geomeans[1]/geomeans[0])-1.0)*100.0)
+		pct := ((geomeans[1] / geomeans[0]) - 1.0) * 100.0
+		row.PctDelta = pct
+		row.Delta = fmt.Sprintf("%+.2f%%", pct)
 	}
 	t.Rows = append(t.Rows, row)
 }
diff --git a/cmd/benchstat/main.go b/cmd/benchstat/main.go
index 1ec3808..43830ec 100644
--- a/cmd/benchstat/main.go
+++ b/cmd/benchstat/main.go
@@ -110,11 +110,13 @@
 }
 
 var (
-	flagDeltaTest = flag.String("delta-test", "utest", "significance `test` to apply to delta: utest, ttest, or none")
-	flagAlpha     = flag.Float64("alpha", 0.05, "consider change significant if p < `α`")
-	flagGeomean   = flag.Bool("geomean", false, "print the geometric mean of each file")
-	flagHTML      = flag.Bool("html", false, "print results as an HTML table")
-	flagSplit     = flag.String("split", "pkg,goos,goarch", "split benchmarks by `labels`")
+	flagDeltaTest   = flag.String("delta-test", "utest", "significance `test` to apply to delta: utest, ttest, or none")
+	flagAlpha       = flag.Float64("alpha", 0.05, "consider change significant if p < `α`")
+	flagGeomean     = flag.Bool("geomean", false, "print the geometric mean of each file")
+	flagHTML        = flag.Bool("html", false, "print results as an HTML table")
+	flagSplit       = flag.String("split", "pkg,goos,goarch", "split benchmarks by `labels`")
+	flagSort        = flag.String("sort", "none", "sort by this `header`: benchmark, delta, change")
+	flagReverseSort = flag.Bool("reverse", false, "reverse the sort order")
 )
 
 var deltaTestNames = map[string]benchstat.DeltaTest{
@@ -127,13 +129,21 @@
 	"ttest":  benchstat.TTest,
 }
 
+var sortNames = map[string]benchstat.SortFunc{
+	"none":      nil,
+	"benchmark": benchstat.ByName,
+	"delta":     benchstat.ByDelta,
+	"change":    benchstat.ByChange,
+}
+
 func main() {
 	log.SetPrefix("benchstat: ")
 	log.SetFlags(0)
 	flag.Usage = usage
 	flag.Parse()
 	deltaTest := deltaTestNames[strings.ToLower(*flagDeltaTest)]
-	if flag.NArg() < 1 || deltaTest == nil {
+	sortType, ok := sortNames[strings.ToLower(*flagSort)]
+	if flag.NArg() < 1 || deltaTest == nil || !ok {
 		flag.Usage()
 	}
 
@@ -145,6 +155,12 @@
 	if *flagSplit != "" {
 		c.SplitBy = strings.Split(*flagSplit, ",")
 	}
+	if sortType != nil {
+		if *flagReverseSort {
+			sortType = benchstat.SortReverse(sortType)
+		}
+		c.SortBy = sortType
+	}
 	for _, file := range flag.Args() {
 		data, err := ioutil.ReadFile(file)
 		if err != nil {
@@ -154,7 +170,6 @@
 	}
 
 	tables := c.Tables()
-
 	var buf bytes.Buffer
 	if *flagHTML {
 		buf.WriteString(htmlHeader)
diff --git a/cmd/benchstat/main_test.go b/cmd/benchstat/main_test.go
index 4a50020..4cc56eb 100644
--- a/cmd/benchstat/main_test.go
+++ b/cmd/benchstat/main_test.go
@@ -40,6 +40,9 @@
 	check(t, "packages", "packagesold.txt", "packagesnew.txt")
 	check(t, "units", "units-old.txt", "units-new.txt")
 	check(t, "zero", "-delta-test=none", "zero-old.txt", "zero-new.txt")
+	check(t, "benchsort", "-sort=benchmark", "old.txt", "new.txt")
+	check(t, "deltasort", "-sort=delta", "old.txt", "new.txt")
+	check(t, "changesort", "-sort=change", "old.txt", "new.txt")
 }
 
 func check(t *testing.T, name string, files ...string) {
diff --git a/cmd/benchstat/testdata/benchsort.golden b/cmd/benchstat/testdata/benchsort.golden
new file mode 100644
index 0000000..2dea70b
--- /dev/null
+++ b/cmd/benchstat/testdata/benchsort.golden
@@ -0,0 +1,75 @@
+name                                       old time/op    new time/op     delta
+CRC32/poly=Castagnoli/size=15/align=0-8      16.4ns ± 3%     16.3ns ± 2%      ~     (p=0.615 n=9+9)
+CRC32/poly=Castagnoli/size=15/align=1-8      17.2ns ± 2%     17.3ns ± 2%      ~     (p=0.650 n=9+10)
+CRC32/poly=Castagnoli/size=1kB/align=0-8     65.5ns ± 1%     66.2ns ± 1%    +1.01%  (p=0.003 n=9+8)
+CRC32/poly=Castagnoli/size=1kB/align=1-8     70.1ns ± 6%     68.5ns ± 2%      ~     (p=0.190 n=10+9)
+CRC32/poly=Castagnoli/size=32kB/align=0-8    1.22µs ± 4%     1.21µs ± 3%      ~     (p=0.882 n=9+9)
+CRC32/poly=Castagnoli/size=32kB/align=1-8    1.26µs ± 3%     1.22µs ± 4%    -3.48%  (p=0.002 n=9+10)
+CRC32/poly=Castagnoli/size=40/align=0-8      17.4ns ± 2%     17.5ns ± 4%      ~     (p=0.694 n=10+10)
+CRC32/poly=Castagnoli/size=40/align=1-8      19.7ns ± 3%     19.4ns ± 2%    -1.62%  (p=0.036 n=10+10)
+CRC32/poly=Castagnoli/size=4kB/align=0-8      163ns ± 5%      159ns ± 3%    -2.46%  (p=0.032 n=10+10)
+CRC32/poly=Castagnoli/size=4kB/align=1-8      169ns ± 6%      162ns ± 3%    -4.60%  (p=0.005 n=10+10)
+CRC32/poly=Castagnoli/size=512/align=0-8     40.2ns ± 2%     40.1ns ± 4%      ~     (p=0.614 n=10+10)
+CRC32/poly=Castagnoli/size=512/align=1-8     42.1ns ± 3%     41.9ns ± 2%      ~     (p=0.952 n=10+9)
+CRC32/poly=IEEE/size=15/align=0-8            46.9ns ± 8%     44.5ns ± 3%    -5.01%  (p=0.008 n=10+10)
+CRC32/poly=IEEE/size=15/align=1-8            44.7ns ± 5%     44.5ns ± 4%      ~     (p=0.539 n=10+10)
+CRC32/poly=IEEE/size=1kB/align=0-8            452ns ± 4%       94ns ± 2%   -79.20%  (p=0.000 n=10+8)
+CRC32/poly=IEEE/size=1kB/align=1-8            444ns ± 2%       93ns ± 2%   -78.97%  (p=0.000 n=10+8)
+CRC32/poly=IEEE/size=32kB/align=0-8          15.0µs ± 7%      2.2µs ± 3%   -85.57%  (p=0.000 n=10+10)
+CRC32/poly=IEEE/size=32kB/align=1-8          14.2µs ± 7%      2.2µs ± 3%   -84.65%  (p=0.000 n=10+10)
+CRC32/poly=IEEE/size=40/align=0-8            41.0ns ± 1%     42.5ns ± 6%    +3.56%  (p=0.000 n=8+10)
+CRC32/poly=IEEE/size=40/align=1-8            41.1ns ± 1%     42.0ns ± 3%    +2.34%  (p=0.000 n=9+10)
+CRC32/poly=IEEE/size=4kB/align=0-8           1.74µs ± 8%     0.30µs ± 1%   -82.87%  (p=0.000 n=10+9)
+CRC32/poly=IEEE/size=4kB/align=1-8           1.76µs ± 6%     0.30µs ± 3%   -83.05%  (p=0.000 n=10+10)
+CRC32/poly=IEEE/size=512/align=0-8            238ns ± 5%       57ns ± 3%   -76.00%  (p=0.000 n=10+10)
+CRC32/poly=IEEE/size=512/align=1-8            236ns ± 3%       57ns ± 3%   -75.72%  (p=0.000 n=10+10)
+CRC32/poly=Koopman/size=15/align=0-8         36.5ns ±11%     35.6ns ± 3%      ~     (p=0.216 n=10+10)
+CRC32/poly=Koopman/size=15/align=1-8         35.1ns ± 5%     35.5ns ± 1%      ~     (p=0.508 n=10+9)
+CRC32/poly=Koopman/size=1kB/align=0-8        2.24µs ± 6%     2.34µs ± 4%    +4.34%  (p=0.010 n=9+10)
+CRC32/poly=Koopman/size=1kB/align=1-8        2.15µs ± 2%     2.36µs ± 5%    +9.84%  (p=0.000 n=9+10)
+CRC32/poly=Koopman/size=32kB/align=0-8       72.4µs ± 9%     72.9µs ± 4%      ~     (p=0.684 n=10+10)
+CRC32/poly=Koopman/size=32kB/align=1-8       69.6µs ± 3%     74.3µs ± 3%    +6.70%  (p=0.000 n=8+10)
+CRC32/poly=Koopman/size=40/align=0-8         91.6ns ± 9%     87.6ns ± 2%    -4.35%  (p=0.002 n=10+10)
+CRC32/poly=Koopman/size=40/align=1-8         91.1ns ± 6%     88.0ns ± 3%      ~     (p=0.055 n=10+10)
+CRC32/poly=Koopman/size=4kB/align=0-8        9.03µs ± 6%     9.00µs ± 6%      ~     (p=0.971 n=10+10)
+CRC32/poly=Koopman/size=4kB/align=1-8        8.94µs ±10%     9.05µs ±12%      ~     (p=0.754 n=10+10)
+CRC32/poly=Koopman/size=512/align=0-8        1.13µs ± 5%     1.08µs ± 3%    -4.93%  (p=0.000 n=10+10)
+CRC32/poly=Koopman/size=512/align=1-8        1.13µs ± 6%     1.17µs ± 8%      ~     (p=0.143 n=10+10)
+
+name                                       old speed      new speed       delta
+CRC32/poly=Castagnoli/size=15/align=0-8     916MB/s ± 2%    920MB/s ± 2%      ~     (p=0.489 n=9+9)
+CRC32/poly=Castagnoli/size=15/align=1-8     870MB/s ± 2%    867MB/s ± 2%      ~     (p=0.661 n=9+10)
+CRC32/poly=Castagnoli/size=1kB/align=0-8   15.6GB/s ± 1%   15.5GB/s ± 1%    -1.02%  (p=0.002 n=9+8)
+CRC32/poly=Castagnoli/size=1kB/align=1-8   14.6GB/s ± 6%   15.0GB/s ± 2%      ~     (p=0.211 n=10+9)
+CRC32/poly=Castagnoli/size=32kB/align=0-8  26.9GB/s ± 4%   26.8GB/s ± 5%      ~     (p=0.842 n=9+10)
+CRC32/poly=Castagnoli/size=32kB/align=1-8  25.9GB/s ± 3%   26.8GB/s ± 4%    +3.62%  (p=0.002 n=9+10)
+CRC32/poly=Castagnoli/size=40/align=0-8    2.30GB/s ± 2%   2.28GB/s ± 4%      ~     (p=0.684 n=10+10)
+CRC32/poly=Castagnoli/size=40/align=1-8    2.03GB/s ± 3%   2.06GB/s ± 2%      ~     (p=0.063 n=10+10)
+CRC32/poly=Castagnoli/size=4kB/align=0-8   25.1GB/s ± 5%   25.7GB/s ± 3%      ~     (p=0.052 n=10+10)
+CRC32/poly=Castagnoli/size=4kB/align=1-8   24.1GB/s ± 6%   25.3GB/s ± 3%    +4.71%  (p=0.005 n=10+10)
+CRC32/poly=Castagnoli/size=512/align=0-8   12.7GB/s ± 2%   12.8GB/s ± 4%      ~     (p=0.529 n=10+10)
+CRC32/poly=Castagnoli/size=512/align=1-8   12.1GB/s ± 3%   12.2GB/s ± 1%      ~     (p=0.780 n=10+9)
+CRC32/poly=IEEE/size=15/align=0-8           321MB/s ± 8%    337MB/s ± 3%    +5.06%  (p=0.009 n=10+10)
+CRC32/poly=IEEE/size=15/align=1-8           336MB/s ± 4%    337MB/s ± 4%      ~     (p=0.579 n=10+10)
+CRC32/poly=IEEE/size=1kB/align=0-8         2.26GB/s ± 4%  10.88GB/s ± 2%  +381.12%  (p=0.000 n=10+8)
+CRC32/poly=IEEE/size=1kB/align=1-8         2.31GB/s ± 2%  10.98GB/s ± 2%  +375.97%  (p=0.000 n=10+8)
+CRC32/poly=IEEE/size=32kB/align=0-8        2.19GB/s ± 7%  15.19GB/s ± 3%  +591.99%  (p=0.000 n=10+10)
+CRC32/poly=IEEE/size=32kB/align=1-8        2.31GB/s ± 8%  15.04GB/s ± 3%  +550.07%  (p=0.000 n=10+10)
+CRC32/poly=IEEE/size=40/align=0-8           975MB/s ± 1%    942MB/s ± 5%    -3.37%  (p=0.001 n=8+10)
+CRC32/poly=IEEE/size=40/align=1-8           974MB/s ± 1%    952MB/s ± 3%    -2.25%  (p=0.000 n=9+10)
+CRC32/poly=IEEE/size=4kB/align=0-8         2.36GB/s ± 7%  13.73GB/s ± 1%  +482.26%  (p=0.000 n=10+9)
+CRC32/poly=IEEE/size=4kB/align=1-8         2.33GB/s ± 6%  13.68GB/s ± 3%  +488.23%  (p=0.000 n=10+10)
+CRC32/poly=IEEE/size=512/align=0-8         2.15GB/s ± 4%   8.97GB/s ± 3%  +317.65%  (p=0.000 n=10+10)
+CRC32/poly=IEEE/size=512/align=1-8         2.17GB/s ± 3%   8.96GB/s ± 3%  +312.89%  (p=0.000 n=10+10)
+CRC32/poly=Koopman/size=15/align=0-8        412MB/s ±10%    421MB/s ± 3%      ~     (p=0.218 n=10+10)
+CRC32/poly=Koopman/size=15/align=1-8        427MB/s ± 5%    422MB/s ± 1%      ~     (p=0.497 n=10+9)
+CRC32/poly=Koopman/size=1kB/align=0-8       452MB/s ± 9%    438MB/s ± 4%      ~     (p=0.052 n=10+10)
+CRC32/poly=Koopman/size=1kB/align=1-8       477MB/s ± 2%    434MB/s ± 5%    -8.92%  (p=0.000 n=9+10)
+CRC32/poly=Koopman/size=32kB/align=0-8      453MB/s ± 8%    450MB/s ± 4%      ~     (p=0.684 n=10+10)
+CRC32/poly=Koopman/size=32kB/align=1-8      471MB/s ± 3%    441MB/s ± 3%    -6.25%  (p=0.000 n=8+10)
+CRC32/poly=Koopman/size=40/align=0-8        437MB/s ± 9%    456MB/s ± 2%    +4.50%  (p=0.002 n=10+10)
+CRC32/poly=Koopman/size=40/align=1-8        440MB/s ± 6%    455MB/s ± 3%      ~     (p=0.052 n=10+10)
+CRC32/poly=Koopman/size=4kB/align=0-8       454MB/s ± 5%    455MB/s ± 6%      ~     (p=0.971 n=10+10)
+CRC32/poly=Koopman/size=4kB/align=1-8       459MB/s ± 9%    455MB/s ±11%      ~     (p=0.739 n=10+10)
+CRC32/poly=Koopman/size=512/align=0-8       453MB/s ± 5%    476MB/s ± 3%    +5.09%  (p=0.000 n=10+10)
+CRC32/poly=Koopman/size=512/align=1-8       455MB/s ± 6%    440MB/s ± 8%      ~     (p=0.143 n=10+10)
diff --git a/cmd/benchstat/testdata/changesort.golden b/cmd/benchstat/testdata/changesort.golden
new file mode 100644
index 0000000..83bfd24
--- /dev/null
+++ b/cmd/benchstat/testdata/changesort.golden
@@ -0,0 +1,75 @@
+name                                       old time/op    new time/op     delta
+CRC32/poly=Castagnoli/size=1kB/align=0-8     65.5ns ± 1%     66.2ns ± 1%    +1.01%  (p=0.003 n=9+8)
+CRC32/poly=Koopman/size=1kB/align=1-8        2.15µs ± 2%     2.36µs ± 5%    +9.84%  (p=0.000 n=9+10)
+CRC32/poly=IEEE/size=40/align=0-8            41.0ns ± 1%     42.5ns ± 6%    +3.56%  (p=0.000 n=8+10)
+CRC32/poly=IEEE/size=40/align=1-8            41.1ns ± 1%     42.0ns ± 3%    +2.34%  (p=0.000 n=9+10)
+CRC32/poly=Koopman/size=1kB/align=0-8        2.24µs ± 6%     2.34µs ± 4%    +4.34%  (p=0.010 n=9+10)
+CRC32/poly=Koopman/size=32kB/align=1-8       69.6µs ± 3%     74.3µs ± 3%    +6.70%  (p=0.000 n=8+10)
+CRC32/poly=Castagnoli/size=32kB/align=0-8    1.22µs ± 4%     1.21µs ± 3%      ~     (p=0.882 n=9+9)
+CRC32/poly=Koopman/size=32kB/align=0-8       72.4µs ± 9%     72.9µs ± 4%      ~     (p=0.684 n=10+10)
+CRC32/poly=Koopman/size=4kB/align=1-8        8.94µs ±10%     9.05µs ±12%      ~     (p=0.754 n=10+10)
+CRC32/poly=Koopman/size=4kB/align=0-8        9.03µs ± 6%     9.00µs ± 6%      ~     (p=0.971 n=10+10)
+CRC32/poly=IEEE/size=15/align=1-8            44.7ns ± 5%     44.5ns ± 4%      ~     (p=0.539 n=10+10)
+CRC32/poly=Koopman/size=512/align=1-8        1.13µs ± 6%     1.17µs ± 8%      ~     (p=0.143 n=10+10)
+CRC32/poly=Castagnoli/size=15/align=0-8      16.4ns ± 3%     16.3ns ± 2%      ~     (p=0.615 n=9+9)
+CRC32/poly=Castagnoli/size=15/align=1-8      17.2ns ± 2%     17.3ns ± 2%      ~     (p=0.650 n=9+10)
+CRC32/poly=Castagnoli/size=40/align=0-8      17.4ns ± 2%     17.5ns ± 4%      ~     (p=0.694 n=10+10)
+CRC32/poly=Koopman/size=40/align=1-8         91.1ns ± 6%     88.0ns ± 3%      ~     (p=0.055 n=10+10)
+CRC32/poly=Castagnoli/size=512/align=0-8     40.2ns ± 2%     40.1ns ± 4%      ~     (p=0.614 n=10+10)
+CRC32/poly=Castagnoli/size=512/align=1-8     42.1ns ± 3%     41.9ns ± 2%      ~     (p=0.952 n=10+9)
+CRC32/poly=Koopman/size=15/align=1-8         35.1ns ± 5%     35.5ns ± 1%      ~     (p=0.508 n=10+9)
+CRC32/poly=Castagnoli/size=1kB/align=1-8     70.1ns ± 6%     68.5ns ± 2%      ~     (p=0.190 n=10+9)
+CRC32/poly=Koopman/size=15/align=0-8         36.5ns ±11%     35.6ns ± 3%      ~     (p=0.216 n=10+10)
+CRC32/poly=IEEE/size=1kB/align=0-8            452ns ± 4%       94ns ± 2%   -79.20%  (p=0.000 n=10+8)
+CRC32/poly=Castagnoli/size=4kB/align=1-8      169ns ± 6%      162ns ± 3%    -4.60%  (p=0.005 n=10+10)
+CRC32/poly=Castagnoli/size=32kB/align=1-8    1.26µs ± 3%     1.22µs ± 4%    -3.48%  (p=0.002 n=9+10)
+CRC32/poly=Castagnoli/size=4kB/align=0-8      163ns ± 5%      159ns ± 3%    -2.46%  (p=0.032 n=10+10)
+CRC32/poly=IEEE/size=512/align=1-8            236ns ± 3%       57ns ± 3%   -75.72%  (p=0.000 n=10+10)
+CRC32/poly=Koopman/size=40/align=0-8         91.6ns ± 9%     87.6ns ± 2%    -4.35%  (p=0.002 n=10+10)
+CRC32/poly=Castagnoli/size=40/align=1-8      19.7ns ± 3%     19.4ns ± 2%    -1.62%  (p=0.036 n=10+10)
+CRC32/poly=Koopman/size=512/align=0-8        1.13µs ± 5%     1.08µs ± 3%    -4.93%  (p=0.000 n=10+10)
+CRC32/poly=IEEE/size=32kB/align=1-8          14.2µs ± 7%      2.2µs ± 3%   -84.65%  (p=0.000 n=10+10)
+CRC32/poly=IEEE/size=512/align=0-8            238ns ± 5%       57ns ± 3%   -76.00%  (p=0.000 n=10+10)
+CRC32/poly=IEEE/size=32kB/align=0-8          15.0µs ± 7%      2.2µs ± 3%   -85.57%  (p=0.000 n=10+10)
+CRC32/poly=IEEE/size=4kB/align=1-8           1.76µs ± 6%     0.30µs ± 3%   -83.05%  (p=0.000 n=10+10)
+CRC32/poly=IEEE/size=4kB/align=0-8           1.74µs ± 8%     0.30µs ± 1%   -82.87%  (p=0.000 n=10+9)
+CRC32/poly=IEEE/size=1kB/align=1-8            444ns ± 2%       93ns ± 2%   -78.97%  (p=0.000 n=10+8)
+CRC32/poly=IEEE/size=15/align=0-8            46.9ns ± 8%     44.5ns ± 3%    -5.01%  (p=0.008 n=10+10)
+
+name                                       old speed      new speed       delta
+CRC32/poly=Castagnoli/size=1kB/align=0-8   15.6GB/s ± 1%   15.5GB/s ± 1%    -1.02%  (p=0.002 n=9+8)
+CRC32/poly=Koopman/size=1kB/align=1-8       477MB/s ± 2%    434MB/s ± 5%    -8.92%  (p=0.000 n=9+10)
+CRC32/poly=IEEE/size=40/align=0-8           975MB/s ± 1%    942MB/s ± 5%    -3.37%  (p=0.001 n=8+10)
+CRC32/poly=IEEE/size=40/align=1-8           974MB/s ± 1%    952MB/s ± 3%    -2.25%  (p=0.000 n=9+10)
+CRC32/poly=Koopman/size=32kB/align=1-8      471MB/s ± 3%    441MB/s ± 3%    -6.25%  (p=0.000 n=8+10)
+CRC32/poly=Koopman/size=15/align=0-8        412MB/s ±10%    421MB/s ± 3%      ~     (p=0.218 n=10+10)
+CRC32/poly=Koopman/size=32kB/align=0-8      453MB/s ± 8%    450MB/s ± 4%      ~     (p=0.684 n=10+10)
+CRC32/poly=Koopman/size=4kB/align=1-8       459MB/s ± 9%    455MB/s ±11%      ~     (p=0.739 n=10+10)
+CRC32/poly=Koopman/size=4kB/align=0-8       454MB/s ± 5%    455MB/s ± 6%      ~     (p=0.971 n=10+10)
+CRC32/poly=IEEE/size=15/align=1-8           336MB/s ± 4%    337MB/s ± 4%      ~     (p=0.579 n=10+10)
+CRC32/poly=Koopman/size=1kB/align=0-8       452MB/s ± 9%    438MB/s ± 4%      ~     (p=0.052 n=10+10)
+CRC32/poly=Koopman/size=512/align=1-8       455MB/s ± 6%    440MB/s ± 8%      ~     (p=0.143 n=10+10)
+CRC32/poly=Castagnoli/size=15/align=0-8     916MB/s ± 2%    920MB/s ± 2%      ~     (p=0.489 n=9+9)
+CRC32/poly=Castagnoli/size=15/align=1-8     870MB/s ± 2%    867MB/s ± 2%      ~     (p=0.661 n=9+10)
+CRC32/poly=Castagnoli/size=40/align=0-8    2.30GB/s ± 2%   2.28GB/s ± 4%      ~     (p=0.684 n=10+10)
+CRC32/poly=Castagnoli/size=40/align=1-8    2.03GB/s ± 3%   2.06GB/s ± 2%      ~     (p=0.063 n=10+10)
+CRC32/poly=Castagnoli/size=512/align=0-8   12.7GB/s ± 2%   12.8GB/s ± 4%      ~     (p=0.529 n=10+10)
+CRC32/poly=Castagnoli/size=512/align=1-8   12.1GB/s ± 3%   12.2GB/s ± 1%      ~     (p=0.780 n=10+9)
+CRC32/poly=Koopman/size=40/align=1-8        440MB/s ± 6%    455MB/s ± 3%      ~     (p=0.052 n=10+10)
+CRC32/poly=Castagnoli/size=1kB/align=1-8   14.6GB/s ± 6%   15.0GB/s ± 2%      ~     (p=0.211 n=10+9)
+CRC32/poly=Castagnoli/size=4kB/align=0-8   25.1GB/s ± 5%   25.7GB/s ± 3%      ~     (p=0.052 n=10+10)
+CRC32/poly=Koopman/size=15/align=1-8        427MB/s ± 5%    422MB/s ± 1%      ~     (p=0.497 n=10+9)
+CRC32/poly=Castagnoli/size=32kB/align=0-8  26.9GB/s ± 4%   26.8GB/s ± 5%      ~     (p=0.842 n=9+10)
+CRC32/poly=IEEE/size=512/align=1-8         2.17GB/s ± 3%   8.96GB/s ± 3%  +312.89%  (p=0.000 n=10+10)
+CRC32/poly=Castagnoli/size=32kB/align=1-8  25.9GB/s ± 3%   26.8GB/s ± 4%    +3.62%  (p=0.002 n=9+10)
+CRC32/poly=Castagnoli/size=4kB/align=1-8   24.1GB/s ± 6%   25.3GB/s ± 3%    +4.71%  (p=0.005 n=10+10)
+CRC32/poly=Koopman/size=40/align=0-8        437MB/s ± 9%    456MB/s ± 2%    +4.50%  (p=0.002 n=10+10)
+CRC32/poly=IEEE/size=512/align=0-8         2.15GB/s ± 4%   8.97GB/s ± 3%  +317.65%  (p=0.000 n=10+10)
+CRC32/poly=Koopman/size=512/align=0-8       453MB/s ± 5%    476MB/s ± 3%    +5.09%  (p=0.000 n=10+10)
+CRC32/poly=IEEE/size=32kB/align=1-8        2.31GB/s ± 8%  15.04GB/s ± 3%  +550.07%  (p=0.000 n=10+10)
+CRC32/poly=IEEE/size=32kB/align=0-8        2.19GB/s ± 7%  15.19GB/s ± 3%  +591.99%  (p=0.000 n=10+10)
+CRC32/poly=IEEE/size=4kB/align=1-8         2.33GB/s ± 6%  13.68GB/s ± 3%  +488.23%  (p=0.000 n=10+10)
+CRC32/poly=IEEE/size=4kB/align=0-8         2.36GB/s ± 7%  13.73GB/s ± 1%  +482.26%  (p=0.000 n=10+9)
+CRC32/poly=IEEE/size=1kB/align=1-8         2.31GB/s ± 2%  10.98GB/s ± 2%  +375.97%  (p=0.000 n=10+8)
+CRC32/poly=IEEE/size=1kB/align=0-8         2.26GB/s ± 4%  10.88GB/s ± 2%  +381.12%  (p=0.000 n=10+8)
+CRC32/poly=IEEE/size=15/align=0-8           321MB/s ± 8%    337MB/s ± 3%    +5.06%  (p=0.009 n=10+10)
diff --git a/cmd/benchstat/testdata/deltasort.golden b/cmd/benchstat/testdata/deltasort.golden
new file mode 100644
index 0000000..367d10c
--- /dev/null
+++ b/cmd/benchstat/testdata/deltasort.golden
@@ -0,0 +1,75 @@
+name                                       old time/op    new time/op     delta
+CRC32/poly=Koopman/size=1kB/align=1-8        2.15µs ± 2%     2.36µs ± 5%    +9.84%  (p=0.000 n=9+10)
+CRC32/poly=Koopman/size=32kB/align=1-8       69.6µs ± 3%     74.3µs ± 3%    +6.70%  (p=0.000 n=8+10)
+CRC32/poly=Koopman/size=1kB/align=0-8        2.24µs ± 6%     2.34µs ± 4%    +4.34%  (p=0.010 n=9+10)
+CRC32/poly=IEEE/size=40/align=0-8            41.0ns ± 1%     42.5ns ± 6%    +3.56%  (p=0.000 n=8+10)
+CRC32/poly=IEEE/size=40/align=1-8            41.1ns ± 1%     42.0ns ± 3%    +2.34%  (p=0.000 n=9+10)
+CRC32/poly=Castagnoli/size=1kB/align=0-8     65.5ns ± 1%     66.2ns ± 1%    +1.01%  (p=0.003 n=9+8)
+CRC32/poly=Castagnoli/size=32kB/align=0-8    1.22µs ± 4%     1.21µs ± 3%      ~     (p=0.882 n=9+9)
+CRC32/poly=Koopman/size=32kB/align=0-8       72.4µs ± 9%     72.9µs ± 4%      ~     (p=0.684 n=10+10)
+CRC32/poly=Koopman/size=4kB/align=1-8        8.94µs ±10%     9.05µs ±12%      ~     (p=0.754 n=10+10)
+CRC32/poly=Koopman/size=4kB/align=0-8        9.03µs ± 6%     9.00µs ± 6%      ~     (p=0.971 n=10+10)
+CRC32/poly=IEEE/size=15/align=1-8            44.7ns ± 5%     44.5ns ± 4%      ~     (p=0.539 n=10+10)
+CRC32/poly=Koopman/size=512/align=1-8        1.13µs ± 6%     1.17µs ± 8%      ~     (p=0.143 n=10+10)
+CRC32/poly=Castagnoli/size=15/align=0-8      16.4ns ± 3%     16.3ns ± 2%      ~     (p=0.615 n=9+9)
+CRC32/poly=Castagnoli/size=15/align=1-8      17.2ns ± 2%     17.3ns ± 2%      ~     (p=0.650 n=9+10)
+CRC32/poly=Castagnoli/size=40/align=0-8      17.4ns ± 2%     17.5ns ± 4%      ~     (p=0.694 n=10+10)
+CRC32/poly=Koopman/size=15/align=1-8         35.1ns ± 5%     35.5ns ± 1%      ~     (p=0.508 n=10+9)
+CRC32/poly=Castagnoli/size=512/align=0-8     40.2ns ± 2%     40.1ns ± 4%      ~     (p=0.614 n=10+10)
+CRC32/poly=Castagnoli/size=512/align=1-8     42.1ns ± 3%     41.9ns ± 2%      ~     (p=0.952 n=10+9)
+CRC32/poly=Koopman/size=15/align=0-8         36.5ns ±11%     35.6ns ± 3%      ~     (p=0.216 n=10+10)
+CRC32/poly=Castagnoli/size=1kB/align=1-8     70.1ns ± 6%     68.5ns ± 2%      ~     (p=0.190 n=10+9)
+CRC32/poly=Koopman/size=40/align=1-8         91.1ns ± 6%     88.0ns ± 3%      ~     (p=0.055 n=10+10)
+CRC32/poly=Castagnoli/size=40/align=1-8      19.7ns ± 3%     19.4ns ± 2%    -1.62%  (p=0.036 n=10+10)
+CRC32/poly=Castagnoli/size=4kB/align=0-8      163ns ± 5%      159ns ± 3%    -2.46%  (p=0.032 n=10+10)
+CRC32/poly=Castagnoli/size=32kB/align=1-8    1.26µs ± 3%     1.22µs ± 4%    -3.48%  (p=0.002 n=9+10)
+CRC32/poly=Koopman/size=40/align=0-8         91.6ns ± 9%     87.6ns ± 2%    -4.35%  (p=0.002 n=10+10)
+CRC32/poly=Castagnoli/size=4kB/align=1-8      169ns ± 6%      162ns ± 3%    -4.60%  (p=0.005 n=10+10)
+CRC32/poly=Koopman/size=512/align=0-8        1.13µs ± 5%     1.08µs ± 3%    -4.93%  (p=0.000 n=10+10)
+CRC32/poly=IEEE/size=15/align=0-8            46.9ns ± 8%     44.5ns ± 3%    -5.01%  (p=0.008 n=10+10)
+CRC32/poly=IEEE/size=512/align=1-8            236ns ± 3%       57ns ± 3%   -75.72%  (p=0.000 n=10+10)
+CRC32/poly=IEEE/size=512/align=0-8            238ns ± 5%       57ns ± 3%   -76.00%  (p=0.000 n=10+10)
+CRC32/poly=IEEE/size=1kB/align=1-8            444ns ± 2%       93ns ± 2%   -78.97%  (p=0.000 n=10+8)
+CRC32/poly=IEEE/size=1kB/align=0-8            452ns ± 4%       94ns ± 2%   -79.20%  (p=0.000 n=10+8)
+CRC32/poly=IEEE/size=4kB/align=0-8           1.74µs ± 8%     0.30µs ± 1%   -82.87%  (p=0.000 n=10+9)
+CRC32/poly=IEEE/size=4kB/align=1-8           1.76µs ± 6%     0.30µs ± 3%   -83.05%  (p=0.000 n=10+10)
+CRC32/poly=IEEE/size=32kB/align=1-8          14.2µs ± 7%      2.2µs ± 3%   -84.65%  (p=0.000 n=10+10)
+CRC32/poly=IEEE/size=32kB/align=0-8          15.0µs ± 7%      2.2µs ± 3%   -85.57%  (p=0.000 n=10+10)
+
+name                                       old speed      new speed       delta
+CRC32/poly=Koopman/size=1kB/align=1-8       477MB/s ± 2%    434MB/s ± 5%    -8.92%  (p=0.000 n=9+10)
+CRC32/poly=Koopman/size=32kB/align=1-8      471MB/s ± 3%    441MB/s ± 3%    -6.25%  (p=0.000 n=8+10)
+CRC32/poly=IEEE/size=40/align=0-8           975MB/s ± 1%    942MB/s ± 5%    -3.37%  (p=0.001 n=8+10)
+CRC32/poly=IEEE/size=40/align=1-8           974MB/s ± 1%    952MB/s ± 3%    -2.25%  (p=0.000 n=9+10)
+CRC32/poly=Castagnoli/size=1kB/align=0-8   15.6GB/s ± 1%   15.5GB/s ± 1%    -1.02%  (p=0.002 n=9+8)
+CRC32/poly=Koopman/size=15/align=1-8        427MB/s ± 5%    422MB/s ± 1%      ~     (p=0.497 n=10+9)
+CRC32/poly=Koopman/size=32kB/align=0-8      453MB/s ± 8%    450MB/s ± 4%      ~     (p=0.684 n=10+10)
+CRC32/poly=Koopman/size=4kB/align=1-8       459MB/s ± 9%    455MB/s ±11%      ~     (p=0.739 n=10+10)
+CRC32/poly=Koopman/size=4kB/align=0-8       454MB/s ± 5%    455MB/s ± 6%      ~     (p=0.971 n=10+10)
+CRC32/poly=IEEE/size=15/align=1-8           336MB/s ± 4%    337MB/s ± 4%      ~     (p=0.579 n=10+10)
+CRC32/poly=Koopman/size=1kB/align=0-8       452MB/s ± 9%    438MB/s ± 4%      ~     (p=0.052 n=10+10)
+CRC32/poly=Koopman/size=512/align=1-8       455MB/s ± 6%    440MB/s ± 8%      ~     (p=0.143 n=10+10)
+CRC32/poly=Castagnoli/size=15/align=0-8     916MB/s ± 2%    920MB/s ± 2%      ~     (p=0.489 n=9+9)
+CRC32/poly=Castagnoli/size=15/align=1-8     870MB/s ± 2%    867MB/s ± 2%      ~     (p=0.661 n=9+10)
+CRC32/poly=Castagnoli/size=40/align=0-8    2.30GB/s ± 2%   2.28GB/s ± 4%      ~     (p=0.684 n=10+10)
+CRC32/poly=Castagnoli/size=40/align=1-8    2.03GB/s ± 3%   2.06GB/s ± 2%      ~     (p=0.063 n=10+10)
+CRC32/poly=Castagnoli/size=512/align=0-8   12.7GB/s ± 2%   12.8GB/s ± 4%      ~     (p=0.529 n=10+10)
+CRC32/poly=Castagnoli/size=512/align=1-8   12.1GB/s ± 3%   12.2GB/s ± 1%      ~     (p=0.780 n=10+9)
+CRC32/poly=Koopman/size=40/align=1-8        440MB/s ± 6%    455MB/s ± 3%      ~     (p=0.052 n=10+10)
+CRC32/poly=Castagnoli/size=1kB/align=1-8   14.6GB/s ± 6%   15.0GB/s ± 2%      ~     (p=0.211 n=10+9)
+CRC32/poly=Castagnoli/size=4kB/align=0-8   25.1GB/s ± 5%   25.7GB/s ± 3%      ~     (p=0.052 n=10+10)
+CRC32/poly=Koopman/size=15/align=0-8        412MB/s ±10%    421MB/s ± 3%      ~     (p=0.218 n=10+10)
+CRC32/poly=Castagnoli/size=32kB/align=0-8  26.9GB/s ± 4%   26.8GB/s ± 5%      ~     (p=0.842 n=9+10)
+CRC32/poly=Castagnoli/size=32kB/align=1-8  25.9GB/s ± 3%   26.8GB/s ± 4%    +3.62%  (p=0.002 n=9+10)
+CRC32/poly=Koopman/size=40/align=0-8        437MB/s ± 9%    456MB/s ± 2%    +4.50%  (p=0.002 n=10+10)
+CRC32/poly=Castagnoli/size=4kB/align=1-8   24.1GB/s ± 6%   25.3GB/s ± 3%    +4.71%  (p=0.005 n=10+10)
+CRC32/poly=IEEE/size=15/align=0-8           321MB/s ± 8%    337MB/s ± 3%    +5.06%  (p=0.009 n=10+10)
+CRC32/poly=Koopman/size=512/align=0-8       453MB/s ± 5%    476MB/s ± 3%    +5.09%  (p=0.000 n=10+10)
+CRC32/poly=IEEE/size=512/align=1-8         2.17GB/s ± 3%   8.96GB/s ± 3%  +312.89%  (p=0.000 n=10+10)
+CRC32/poly=IEEE/size=512/align=0-8         2.15GB/s ± 4%   8.97GB/s ± 3%  +317.65%  (p=0.000 n=10+10)
+CRC32/poly=IEEE/size=1kB/align=1-8         2.31GB/s ± 2%  10.98GB/s ± 2%  +375.97%  (p=0.000 n=10+8)
+CRC32/poly=IEEE/size=1kB/align=0-8         2.26GB/s ± 4%  10.88GB/s ± 2%  +381.12%  (p=0.000 n=10+8)
+CRC32/poly=IEEE/size=4kB/align=0-8         2.36GB/s ± 7%  13.73GB/s ± 1%  +482.26%  (p=0.000 n=10+9)
+CRC32/poly=IEEE/size=4kB/align=1-8         2.33GB/s ± 6%  13.68GB/s ± 3%  +488.23%  (p=0.000 n=10+10)
+CRC32/poly=IEEE/size=32kB/align=1-8        2.31GB/s ± 8%  15.04GB/s ± 3%  +550.07%  (p=0.000 n=10+10)
+CRC32/poly=IEEE/size=32kB/align=0-8        2.19GB/s ± 7%  15.19GB/s ± 3%  +591.99%  (p=0.000 n=10+10)