benchstat: adjust new sort functionality

- rename benchstat.SortTable to benchstat.Sort
- rename benchstat.SortFunc to benchstat.Order
- rename benchstat.SortReverse to benchstat.Reverse
  (it doesn't sort; it wraps an Order, now by swapping the
  compared rows instead of negating the result)
- fix benchstat.Sort to be a stable sort

- document new -sort flag
- rename -sort=benchmark to -sort=name
- rename -reverse -sort=delta to -sort=-delta
- drop benchstat.ByChange, -sort=change option
  (it's not meaningfully different from -sort=delta)
- add test of reverse delta sort

Change-Id: Ie6348b02ca71662743131bb2533ccee5fdc05323
Reviewed-on: https://go-review.googlesource.com/105175
Run-TryBot: Russ Cox <rsc@golang.org>
Reviewed-by: Austin Clements <austin@google.com>
diff --git a/benchstat/data.go b/benchstat/data.go
index 50777e8..15f7acb 100644
--- a/benchstat/data.go
+++ b/benchstat/data.go
@@ -45,11 +45,10 @@
 	// By default, results will only be split by full name.
 	SplitBy []string
 
-	// SortBy specifies the function by which tables in this collection
-	// should be sorted.
-	// By default, tables will not be specifically sorted, and will appear
-	// in the order they were read in
-	SortBy SortFunc
+	// Order specifies the row display order for this table.
+	// If Order is nil, the table rows are printed in order of
+	// first appearance in the input.
+	Order Order
 }
 
 // A Key identifies one metric (e.g., "ns/op", "B/op") from one
diff --git a/benchstat/sort.go b/benchstat/sort.go
index ef68821..4a81199 100644
--- a/benchstat/sort.go
+++ b/benchstat/sort.go
@@ -9,35 +9,28 @@
 	"sort"
 )
 
-// A SortFunc abstracts the sorting interface to compare two rows of a Table
-type SortFunc func(*Table, int, int) bool
+// An Order defines a sort order for a table.
+// It reports whether t.Rows[i] should appear before t.Rows[j].
+type Order func(t *Table, i, j int) bool
 
 // ByName sorts tables by the Benchmark name column
 func ByName(t *Table, i, j int) bool {
 	return t.Rows[i].Benchmark < t.Rows[j].Benchmark
 }
 
-// ByDelta sorts tables by the Delta column (comparing the numerical value
-// rather than the lexical value)
-// The sort takes into account the Change value as well, which indicates
-// whether a given delta is "good" or "bad"
+// ByDelta sorts tables by the Delta column,
+// reversing the order when larger is better (for "speed" results).
 func ByDelta(t *Table, i, j int) bool {
 	return math.Abs(t.Rows[i].PctDelta)*float64(t.Rows[i].Change) <
 		math.Abs(t.Rows[j].PctDelta)*float64(t.Rows[j].Change)
 }
 
-// ByChange sorts tables by the unprinted Change column which indicates
-// whether a delta is negative, zero, or positive
-func ByChange(t *Table, i, j int) bool {
-	return t.Rows[i].Change < t.Rows[j].Change
+// Reverse returns the reverse of the given order.
+func Reverse(order Order) Order {
+	return func(t *Table, i, j int) bool { return order(t, j, i) }
 }
 
-// SortReverse returns a SortFunc that is the reverse of the input SortFunc
-func SortReverse(sortFunc SortFunc) SortFunc {
-	return func(t *Table, i, j int) bool { return !sortFunc(t, i, j) }
-}
-
-// SortTable sorts a Table t (in place) by the given SortFunc
-func SortTable(t *Table, sortFunc SortFunc) {
-	sort.Slice(t.Rows, func(i, j int) bool { return sortFunc(t, i, j) })
+// Sort sorts a Table t (in place) by the given order.
+func Sort(t *Table, order Order) {
+	sort.SliceStable(t.Rows, func(i, j int) bool { return order(t, i, j) })
 }
diff --git a/benchstat/sort_test.go b/benchstat/sort_test.go
index 618490c..b975bea 100644
--- a/benchstat/sort_test.go
+++ b/benchstat/sort_test.go
@@ -27,20 +27,20 @@
 func benchmarkSortTest(t *testing.T, sampleTable *Table) {
 	numRows := len(sampleTable.Rows)
 	benchmarks := make([]string, numRows)
-	SortTable(sampleTable, ByName)
+	Sort(sampleTable, ByName)
 	for idx, row := range sampleTable.Rows {
 		benchmarks[idx] = extractRowBenchmark(row)
 	}
 	t.Run("BenchSorted", func(t *testing.T) {
 		if !sort.StringsAreSorted(benchmarks) {
-			t.Error("Table not sorted by benchmarks")
+			t.Error("Table not sorted by names")
 		}
 	})
-	SortTable(sampleTable, SortReverse(ByName))
+	Sort(sampleTable, Reverse(ByName))
 	for idx, row := range sampleTable.Rows {
 		benchmarks[numRows-idx-1] = extractRowBenchmark(row)
 	}
-	t.Run("BenchSortReversed", func(t *testing.T) {
+	t.Run("BenchReversed", func(t *testing.T) {
 		if !sort.StringsAreSorted(benchmarks) {
 			t.Error("Table not reverse sorted by benchmarks")
 		}
@@ -50,49 +50,26 @@
 func deltaSortTest(t *testing.T, sampleTable *Table) {
 	numRows := len(sampleTable.Rows)
 	deltas := make([]float64, numRows)
-	SortTable(sampleTable, ByDelta)
+	Sort(sampleTable, ByDelta)
 	for idx, row := range sampleTable.Rows {
-		deltas[idx] = extractRowDelta(row)
+		deltas[idx] = -extractRowDelta(row)
 	}
 	t.Run("DeltaSorted", func(t *testing.T) {
 		if !sort.Float64sAreSorted(deltas) {
-			t.Error("Table not sorted by deltas")
+			t.Errorf("Table not sorted by deltas: %v", deltas)
 		}
 	})
-	SortTable(sampleTable, SortReverse(ByDelta))
+	Sort(sampleTable, Reverse(ByDelta))
 	for idx, row := range sampleTable.Rows {
-		deltas[numRows-idx-1] = extractRowDelta(row)
+		deltas[idx] = extractRowDelta(row)
 	}
-	t.Run("DeltaSortReversed", func(t *testing.T) {
+	t.Run("DeltaReversed", func(t *testing.T) {
 		if !sort.Float64sAreSorted(deltas) {
 			t.Error("Table not reverse sorted by deltas")
 		}
 	})
 }
 
-func changeSortTest(t *testing.T, sampleTable *Table) {
-	numRows := len(sampleTable.Rows)
-	changes := make([]int, numRows)
-	SortTable(sampleTable, ByChange)
-	for idx, row := range sampleTable.Rows {
-		changes[idx] = extractRowChange(row)
-	}
-	t.Run("ChangeSorted", func(t *testing.T) {
-		if !sort.IntsAreSorted(changes) {
-			t.Error("Table not sorted by changes")
-		}
-	})
-	SortTable(sampleTable, SortReverse(ByChange))
-	for idx, row := range sampleTable.Rows {
-		changes[numRows-idx-1] = extractRowChange(row)
-	}
-	t.Run("ChangeSortReversed", func(t *testing.T) {
-		if !sort.IntsAreSorted(changes) {
-			t.Error("Table not reverse sorted by changes")
-		}
-	})
-}
-
 func TestCompareCollection(t *testing.T) {
 	sampleCompareCollection := Collection{Alpha: 0.05, AddGeoMean: false, DeltaTest: UTest}
 	file1Data, err := ioutil.ReadFile(file1)
@@ -113,9 +90,6 @@
 	t.Run("DeltaSort", func(t *testing.T) {
 		deltaSortTest(t, sampleTable)
 	})
-	t.Run("ChangeSort", func(t *testing.T) {
-		changeSortTest(t, sampleTable)
-	})
 }
 
 func TestSingleCollection(t *testing.T) {
diff --git a/benchstat/table.go b/benchstat/table.go
index b90cfd7..4f52344 100644
--- a/benchstat/table.go
+++ b/benchstat/table.go
@@ -124,8 +124,8 @@
 		}
 
 		if len(table.Rows) > 0 {
-			if c.SortBy != nil {
-				SortTable(table, c.SortBy)
+			if c.Order != nil {
+				Sort(table, c.Order)
 			}
 			if c.AddGeoMean {
 				addGeomean(c, table, key.Unit, table.OldNewDelta)
diff --git a/cmd/benchstat/main.go b/cmd/benchstat/main.go
index 43830ec..7d87625 100644
--- a/cmd/benchstat/main.go
+++ b/cmd/benchstat/main.go
@@ -6,7 +6,7 @@
 //
 // Usage:
 //
-//	benchstat [-delta-test name] [-geomean] [-html] old.txt [new.txt] [more.txt ...]
+//	benchstat [-delta-test name] [-geomean] [-html] [-sort order] old.txt [new.txt] [more.txt ...]
 //
 // Each input file should contain the concatenated output of a number
 // of runs of ``go test -bench.'' For each different benchmark listed in an input file,
@@ -37,6 +37,10 @@
 //
 // The -html option causes benchstat to print the results as an HTML table.
 //
+// The -sort option specifies an order in which to list the results:
+// none (input order), delta (percent improvement), or name (benchmark name).
+// A leading “-” prefix, as in “-delta”, reverses the order.
+//
 // Example
 //
 // Suppose we collect benchmark results from running ``go test -bench=Encode''
@@ -102,21 +106,22 @@
 	"golang.org/x/perf/benchstat"
 )
 
+var exit = os.Exit // replaced during testing
+
 func usage() {
 	fmt.Fprintf(os.Stderr, "usage: benchstat [options] old.txt [new.txt] [more.txt ...]\n")
 	fmt.Fprintf(os.Stderr, "options:\n")
 	flag.PrintDefaults()
-	os.Exit(2)
+	exit(2)
 }
 
 var (
-	flagDeltaTest   = flag.String("delta-test", "utest", "significance `test` to apply to delta: utest, ttest, or none")
-	flagAlpha       = flag.Float64("alpha", 0.05, "consider change significant if p < `α`")
-	flagGeomean     = flag.Bool("geomean", false, "print the geometric mean of each file")
-	flagHTML        = flag.Bool("html", false, "print results as an HTML table")
-	flagSplit       = flag.String("split", "pkg,goos,goarch", "split benchmarks by `labels`")
-	flagSort        = flag.String("sort", "none", "sort by this `header`: benchmark, delta, change")
-	flagReverseSort = flag.Bool("reverse", false, "reverse the sort order")
+	flagDeltaTest = flag.String("delta-test", "utest", "significance `test` to apply to delta: utest, ttest, or none")
+	flagAlpha     = flag.Float64("alpha", 0.05, "consider change significant if p < `α`")
+	flagGeomean   = flag.Bool("geomean", false, "print the geometric mean of each file")
+	flagHTML      = flag.Bool("html", false, "print results as an HTML table")
+	flagSplit     = flag.String("split", "pkg,goos,goarch", "split benchmarks by `labels`")
+	flagSort      = flag.String("sort", "none", "sort by `order`: [-]delta, [-]name, none")
 )
 
 var deltaTestNames = map[string]benchstat.DeltaTest{
@@ -129,11 +134,10 @@
 	"ttest":  benchstat.TTest,
 }
 
-var sortNames = map[string]benchstat.SortFunc{
-	"none":      nil,
-	"benchmark": benchstat.ByName,
-	"delta":     benchstat.ByDelta,
-	"change":    benchstat.ByChange,
+var sortNames = map[string]benchstat.Order{
+	"none":  nil,
+	"name":  benchstat.ByName,
+	"delta": benchstat.ByDelta,
 }
 
 func main() {
@@ -142,7 +146,13 @@
 	flag.Usage = usage
 	flag.Parse()
 	deltaTest := deltaTestNames[strings.ToLower(*flagDeltaTest)]
-	sortType, ok := sortNames[strings.ToLower(*flagSort)]
+	sortName := *flagSort
+	reverse := false
+	if strings.HasPrefix(sortName, "-") {
+		reverse = true
+		sortName = sortName[1:]
+	}
+	order, ok := sortNames[sortName]
 	if flag.NArg() < 1 || deltaTest == nil || !ok {
 		flag.Usage()
 	}
@@ -155,11 +165,11 @@
 	if *flagSplit != "" {
 		c.SplitBy = strings.Split(*flagSplit, ",")
 	}
-	if sortType != nil {
-		if *flagReverseSort {
-			sortType = benchstat.SortReverse(sortType)
+	if order != nil {
+		if reverse {
+			order = benchstat.Reverse(order)
 		}
-		c.SortBy = sortType
+		c.Order = order
 	}
 	for _, file := range flag.Args() {
 		data, err := ioutil.ReadFile(file)
diff --git a/cmd/benchstat/main_test.go b/cmd/benchstat/main_test.go
index 4cc56eb..29419ee 100644
--- a/cmd/benchstat/main_test.go
+++ b/cmd/benchstat/main_test.go
@@ -40,9 +40,9 @@
 	check(t, "packages", "packagesold.txt", "packagesnew.txt")
 	check(t, "units", "units-old.txt", "units-new.txt")
 	check(t, "zero", "-delta-test=none", "zero-old.txt", "zero-new.txt")
-	check(t, "benchsort", "-sort=benchmark", "old.txt", "new.txt")
+	check(t, "namesort", "-sort=name", "old.txt", "new.txt")
 	check(t, "deltasort", "-sort=delta", "old.txt", "new.txt")
-	check(t, "changesort", "-sort=change", "old.txt", "new.txt")
+	check(t, "rdeltasort", "-sort=-delta", "old.txt", "new.txt")
 }
 
 func check(t *testing.T, name string, files ...string) {
@@ -65,6 +65,7 @@
 		stderr := os.Stderr
 		os.Stdout = w
 		os.Stderr = w
+		exit = func(code int) { t.Fatalf("exit %d during main", code) }
 		*flagGeomean = false
 		*flagHTML = false
 		*flagDeltaTest = "utest"
@@ -75,6 +76,7 @@
 		w.Close()
 		os.Stdout = stdout
 		os.Stderr = stderr
+		exit = os.Exit
 
 		data := <-c
 		golden, err := ioutil.ReadFile(name + ".golden")
diff --git a/cmd/benchstat/testdata/deltasort.golden b/cmd/benchstat/testdata/deltasort.golden
index 367d10c..020c044 100644
--- a/cmd/benchstat/testdata/deltasort.golden
+++ b/cmd/benchstat/testdata/deltasort.golden
@@ -5,21 +5,21 @@
 CRC32/poly=IEEE/size=40/align=0-8            41.0ns ± 1%     42.5ns ± 6%    +3.56%  (p=0.000 n=8+10)
 CRC32/poly=IEEE/size=40/align=1-8            41.1ns ± 1%     42.0ns ± 3%    +2.34%  (p=0.000 n=9+10)
 CRC32/poly=Castagnoli/size=1kB/align=0-8     65.5ns ± 1%     66.2ns ± 1%    +1.01%  (p=0.003 n=9+8)
-CRC32/poly=Castagnoli/size=32kB/align=0-8    1.22µs ± 4%     1.21µs ± 3%      ~     (p=0.882 n=9+9)
-CRC32/poly=Koopman/size=32kB/align=0-8       72.4µs ± 9%     72.9µs ± 4%      ~     (p=0.684 n=10+10)
-CRC32/poly=Koopman/size=4kB/align=1-8        8.94µs ±10%     9.05µs ±12%      ~     (p=0.754 n=10+10)
-CRC32/poly=Koopman/size=4kB/align=0-8        9.03µs ± 6%     9.00µs ± 6%      ~     (p=0.971 n=10+10)
 CRC32/poly=IEEE/size=15/align=1-8            44.7ns ± 5%     44.5ns ± 4%      ~     (p=0.539 n=10+10)
-CRC32/poly=Koopman/size=512/align=1-8        1.13µs ± 6%     1.17µs ± 8%      ~     (p=0.143 n=10+10)
 CRC32/poly=Castagnoli/size=15/align=0-8      16.4ns ± 3%     16.3ns ± 2%      ~     (p=0.615 n=9+9)
 CRC32/poly=Castagnoli/size=15/align=1-8      17.2ns ± 2%     17.3ns ± 2%      ~     (p=0.650 n=9+10)
 CRC32/poly=Castagnoli/size=40/align=0-8      17.4ns ± 2%     17.5ns ± 4%      ~     (p=0.694 n=10+10)
-CRC32/poly=Koopman/size=15/align=1-8         35.1ns ± 5%     35.5ns ± 1%      ~     (p=0.508 n=10+9)
 CRC32/poly=Castagnoli/size=512/align=0-8     40.2ns ± 2%     40.1ns ± 4%      ~     (p=0.614 n=10+10)
 CRC32/poly=Castagnoli/size=512/align=1-8     42.1ns ± 3%     41.9ns ± 2%      ~     (p=0.952 n=10+9)
-CRC32/poly=Koopman/size=15/align=0-8         36.5ns ±11%     35.6ns ± 3%      ~     (p=0.216 n=10+10)
 CRC32/poly=Castagnoli/size=1kB/align=1-8     70.1ns ± 6%     68.5ns ± 2%      ~     (p=0.190 n=10+9)
+CRC32/poly=Castagnoli/size=32kB/align=0-8    1.22µs ± 4%     1.21µs ± 3%      ~     (p=0.882 n=9+9)
+CRC32/poly=Koopman/size=15/align=0-8         36.5ns ±11%     35.6ns ± 3%      ~     (p=0.216 n=10+10)
+CRC32/poly=Koopman/size=15/align=1-8         35.1ns ± 5%     35.5ns ± 1%      ~     (p=0.508 n=10+9)
 CRC32/poly=Koopman/size=40/align=1-8         91.1ns ± 6%     88.0ns ± 3%      ~     (p=0.055 n=10+10)
+CRC32/poly=Koopman/size=512/align=1-8        1.13µs ± 6%     1.17µs ± 8%      ~     (p=0.143 n=10+10)
+CRC32/poly=Koopman/size=4kB/align=0-8        9.03µs ± 6%     9.00µs ± 6%      ~     (p=0.971 n=10+10)
+CRC32/poly=Koopman/size=4kB/align=1-8        8.94µs ±10%     9.05µs ±12%      ~     (p=0.754 n=10+10)
+CRC32/poly=Koopman/size=32kB/align=0-8       72.4µs ± 9%     72.9µs ± 4%      ~     (p=0.684 n=10+10)
 CRC32/poly=Castagnoli/size=40/align=1-8      19.7ns ± 3%     19.4ns ± 2%    -1.62%  (p=0.036 n=10+10)
 CRC32/poly=Castagnoli/size=4kB/align=0-8      163ns ± 5%      159ns ± 3%    -2.46%  (p=0.032 n=10+10)
 CRC32/poly=Castagnoli/size=32kB/align=1-8    1.26µs ± 3%     1.22µs ± 4%    -3.48%  (p=0.002 n=9+10)
@@ -42,24 +42,24 @@
 CRC32/poly=IEEE/size=40/align=0-8           975MB/s ± 1%    942MB/s ± 5%    -3.37%  (p=0.001 n=8+10)
 CRC32/poly=IEEE/size=40/align=1-8           974MB/s ± 1%    952MB/s ± 3%    -2.25%  (p=0.000 n=9+10)
 CRC32/poly=Castagnoli/size=1kB/align=0-8   15.6GB/s ± 1%   15.5GB/s ± 1%    -1.02%  (p=0.002 n=9+8)
-CRC32/poly=Koopman/size=15/align=1-8        427MB/s ± 5%    422MB/s ± 1%      ~     (p=0.497 n=10+9)
-CRC32/poly=Koopman/size=32kB/align=0-8      453MB/s ± 8%    450MB/s ± 4%      ~     (p=0.684 n=10+10)
-CRC32/poly=Koopman/size=4kB/align=1-8       459MB/s ± 9%    455MB/s ±11%      ~     (p=0.739 n=10+10)
-CRC32/poly=Koopman/size=4kB/align=0-8       454MB/s ± 5%    455MB/s ± 6%      ~     (p=0.971 n=10+10)
 CRC32/poly=IEEE/size=15/align=1-8           336MB/s ± 4%    337MB/s ± 4%      ~     (p=0.579 n=10+10)
-CRC32/poly=Koopman/size=1kB/align=0-8       452MB/s ± 9%    438MB/s ± 4%      ~     (p=0.052 n=10+10)
-CRC32/poly=Koopman/size=512/align=1-8       455MB/s ± 6%    440MB/s ± 8%      ~     (p=0.143 n=10+10)
 CRC32/poly=Castagnoli/size=15/align=0-8     916MB/s ± 2%    920MB/s ± 2%      ~     (p=0.489 n=9+9)
 CRC32/poly=Castagnoli/size=15/align=1-8     870MB/s ± 2%    867MB/s ± 2%      ~     (p=0.661 n=9+10)
 CRC32/poly=Castagnoli/size=40/align=0-8    2.30GB/s ± 2%   2.28GB/s ± 4%      ~     (p=0.684 n=10+10)
 CRC32/poly=Castagnoli/size=40/align=1-8    2.03GB/s ± 3%   2.06GB/s ± 2%      ~     (p=0.063 n=10+10)
 CRC32/poly=Castagnoli/size=512/align=0-8   12.7GB/s ± 2%   12.8GB/s ± 4%      ~     (p=0.529 n=10+10)
 CRC32/poly=Castagnoli/size=512/align=1-8   12.1GB/s ± 3%   12.2GB/s ± 1%      ~     (p=0.780 n=10+9)
-CRC32/poly=Koopman/size=40/align=1-8        440MB/s ± 6%    455MB/s ± 3%      ~     (p=0.052 n=10+10)
 CRC32/poly=Castagnoli/size=1kB/align=1-8   14.6GB/s ± 6%   15.0GB/s ± 2%      ~     (p=0.211 n=10+9)
 CRC32/poly=Castagnoli/size=4kB/align=0-8   25.1GB/s ± 5%   25.7GB/s ± 3%      ~     (p=0.052 n=10+10)
-CRC32/poly=Koopman/size=15/align=0-8        412MB/s ±10%    421MB/s ± 3%      ~     (p=0.218 n=10+10)
 CRC32/poly=Castagnoli/size=32kB/align=0-8  26.9GB/s ± 4%   26.8GB/s ± 5%      ~     (p=0.842 n=9+10)
+CRC32/poly=Koopman/size=15/align=0-8        412MB/s ±10%    421MB/s ± 3%      ~     (p=0.218 n=10+10)
+CRC32/poly=Koopman/size=15/align=1-8        427MB/s ± 5%    422MB/s ± 1%      ~     (p=0.497 n=10+9)
+CRC32/poly=Koopman/size=40/align=1-8        440MB/s ± 6%    455MB/s ± 3%      ~     (p=0.052 n=10+10)
+CRC32/poly=Koopman/size=512/align=1-8       455MB/s ± 6%    440MB/s ± 8%      ~     (p=0.143 n=10+10)
+CRC32/poly=Koopman/size=1kB/align=0-8       452MB/s ± 9%    438MB/s ± 4%      ~     (p=0.052 n=10+10)
+CRC32/poly=Koopman/size=4kB/align=0-8       454MB/s ± 5%    455MB/s ± 6%      ~     (p=0.971 n=10+10)
+CRC32/poly=Koopman/size=4kB/align=1-8       459MB/s ± 9%    455MB/s ±11%      ~     (p=0.739 n=10+10)
+CRC32/poly=Koopman/size=32kB/align=0-8      453MB/s ± 8%    450MB/s ± 4%      ~     (p=0.684 n=10+10)
 CRC32/poly=Castagnoli/size=32kB/align=1-8  25.9GB/s ± 3%   26.8GB/s ± 4%    +3.62%  (p=0.002 n=9+10)
 CRC32/poly=Koopman/size=40/align=0-8        437MB/s ± 9%    456MB/s ± 2%    +4.50%  (p=0.002 n=10+10)
 CRC32/poly=Castagnoli/size=4kB/align=1-8   24.1GB/s ± 6%   25.3GB/s ± 3%    +4.71%  (p=0.005 n=10+10)
diff --git a/cmd/benchstat/testdata/benchsort.golden b/cmd/benchstat/testdata/namesort.golden
similarity index 100%
rename from cmd/benchstat/testdata/benchsort.golden
rename to cmd/benchstat/testdata/namesort.golden
diff --git a/cmd/benchstat/testdata/changesort.golden b/cmd/benchstat/testdata/rdeltasort.golden
similarity index 99%
rename from cmd/benchstat/testdata/changesort.golden
rename to cmd/benchstat/testdata/rdeltasort.golden
index 83bfd24..d74cce4 100644
--- a/cmd/benchstat/testdata/changesort.golden
+++ b/cmd/benchstat/testdata/rdeltasort.golden
@@ -1,75 +1,75 @@
 name                                       old time/op    new time/op     delta
-CRC32/poly=Castagnoli/size=1kB/align=0-8     65.5ns ± 1%     66.2ns ± 1%    +1.01%  (p=0.003 n=9+8)
-CRC32/poly=Koopman/size=1kB/align=1-8        2.15µs ± 2%     2.36µs ± 5%    +9.84%  (p=0.000 n=9+10)
-CRC32/poly=IEEE/size=40/align=0-8            41.0ns ± 1%     42.5ns ± 6%    +3.56%  (p=0.000 n=8+10)
-CRC32/poly=IEEE/size=40/align=1-8            41.1ns ± 1%     42.0ns ± 3%    +2.34%  (p=0.000 n=9+10)
-CRC32/poly=Koopman/size=1kB/align=0-8        2.24µs ± 6%     2.34µs ± 4%    +4.34%  (p=0.010 n=9+10)
-CRC32/poly=Koopman/size=32kB/align=1-8       69.6µs ± 3%     74.3µs ± 3%    +6.70%  (p=0.000 n=8+10)
-CRC32/poly=Castagnoli/size=32kB/align=0-8    1.22µs ± 4%     1.21µs ± 3%      ~     (p=0.882 n=9+9)
-CRC32/poly=Koopman/size=32kB/align=0-8       72.4µs ± 9%     72.9µs ± 4%      ~     (p=0.684 n=10+10)
-CRC32/poly=Koopman/size=4kB/align=1-8        8.94µs ±10%     9.05µs ±12%      ~     (p=0.754 n=10+10)
-CRC32/poly=Koopman/size=4kB/align=0-8        9.03µs ± 6%     9.00µs ± 6%      ~     (p=0.971 n=10+10)
+CRC32/poly=IEEE/size=32kB/align=0-8          15.0µs ± 7%      2.2µs ± 3%   -85.57%  (p=0.000 n=10+10)
+CRC32/poly=IEEE/size=32kB/align=1-8          14.2µs ± 7%      2.2µs ± 3%   -84.65%  (p=0.000 n=10+10)
+CRC32/poly=IEEE/size=4kB/align=1-8           1.76µs ± 6%     0.30µs ± 3%   -83.05%  (p=0.000 n=10+10)
+CRC32/poly=IEEE/size=4kB/align=0-8           1.74µs ± 8%     0.30µs ± 1%   -82.87%  (p=0.000 n=10+9)
+CRC32/poly=IEEE/size=1kB/align=0-8            452ns ± 4%       94ns ± 2%   -79.20%  (p=0.000 n=10+8)
+CRC32/poly=IEEE/size=1kB/align=1-8            444ns ± 2%       93ns ± 2%   -78.97%  (p=0.000 n=10+8)
+CRC32/poly=IEEE/size=512/align=0-8            238ns ± 5%       57ns ± 3%   -76.00%  (p=0.000 n=10+10)
+CRC32/poly=IEEE/size=512/align=1-8            236ns ± 3%       57ns ± 3%   -75.72%  (p=0.000 n=10+10)
+CRC32/poly=IEEE/size=15/align=0-8            46.9ns ± 8%     44.5ns ± 3%    -5.01%  (p=0.008 n=10+10)
+CRC32/poly=Koopman/size=512/align=0-8        1.13µs ± 5%     1.08µs ± 3%    -4.93%  (p=0.000 n=10+10)
+CRC32/poly=Castagnoli/size=4kB/align=1-8      169ns ± 6%      162ns ± 3%    -4.60%  (p=0.005 n=10+10)
+CRC32/poly=Koopman/size=40/align=0-8         91.6ns ± 9%     87.6ns ± 2%    -4.35%  (p=0.002 n=10+10)
+CRC32/poly=Castagnoli/size=32kB/align=1-8    1.26µs ± 3%     1.22µs ± 4%    -3.48%  (p=0.002 n=9+10)
+CRC32/poly=Castagnoli/size=4kB/align=0-8      163ns ± 5%      159ns ± 3%    -2.46%  (p=0.032 n=10+10)
+CRC32/poly=Castagnoli/size=40/align=1-8      19.7ns ± 3%     19.4ns ± 2%    -1.62%  (p=0.036 n=10+10)
 CRC32/poly=IEEE/size=15/align=1-8            44.7ns ± 5%     44.5ns ± 4%      ~     (p=0.539 n=10+10)
-CRC32/poly=Koopman/size=512/align=1-8        1.13µs ± 6%     1.17µs ± 8%      ~     (p=0.143 n=10+10)
 CRC32/poly=Castagnoli/size=15/align=0-8      16.4ns ± 3%     16.3ns ± 2%      ~     (p=0.615 n=9+9)
 CRC32/poly=Castagnoli/size=15/align=1-8      17.2ns ± 2%     17.3ns ± 2%      ~     (p=0.650 n=9+10)
 CRC32/poly=Castagnoli/size=40/align=0-8      17.4ns ± 2%     17.5ns ± 4%      ~     (p=0.694 n=10+10)
-CRC32/poly=Koopman/size=40/align=1-8         91.1ns ± 6%     88.0ns ± 3%      ~     (p=0.055 n=10+10)
 CRC32/poly=Castagnoli/size=512/align=0-8     40.2ns ± 2%     40.1ns ± 4%      ~     (p=0.614 n=10+10)
 CRC32/poly=Castagnoli/size=512/align=1-8     42.1ns ± 3%     41.9ns ± 2%      ~     (p=0.952 n=10+9)
-CRC32/poly=Koopman/size=15/align=1-8         35.1ns ± 5%     35.5ns ± 1%      ~     (p=0.508 n=10+9)
 CRC32/poly=Castagnoli/size=1kB/align=1-8     70.1ns ± 6%     68.5ns ± 2%      ~     (p=0.190 n=10+9)
+CRC32/poly=Castagnoli/size=32kB/align=0-8    1.22µs ± 4%     1.21µs ± 3%      ~     (p=0.882 n=9+9)
 CRC32/poly=Koopman/size=15/align=0-8         36.5ns ±11%     35.6ns ± 3%      ~     (p=0.216 n=10+10)
-CRC32/poly=IEEE/size=1kB/align=0-8            452ns ± 4%       94ns ± 2%   -79.20%  (p=0.000 n=10+8)
-CRC32/poly=Castagnoli/size=4kB/align=1-8      169ns ± 6%      162ns ± 3%    -4.60%  (p=0.005 n=10+10)
-CRC32/poly=Castagnoli/size=32kB/align=1-8    1.26µs ± 3%     1.22µs ± 4%    -3.48%  (p=0.002 n=9+10)
-CRC32/poly=Castagnoli/size=4kB/align=0-8      163ns ± 5%      159ns ± 3%    -2.46%  (p=0.032 n=10+10)
-CRC32/poly=IEEE/size=512/align=1-8            236ns ± 3%       57ns ± 3%   -75.72%  (p=0.000 n=10+10)
-CRC32/poly=Koopman/size=40/align=0-8         91.6ns ± 9%     87.6ns ± 2%    -4.35%  (p=0.002 n=10+10)
-CRC32/poly=Castagnoli/size=40/align=1-8      19.7ns ± 3%     19.4ns ± 2%    -1.62%  (p=0.036 n=10+10)
-CRC32/poly=Koopman/size=512/align=0-8        1.13µs ± 5%     1.08µs ± 3%    -4.93%  (p=0.000 n=10+10)
-CRC32/poly=IEEE/size=32kB/align=1-8          14.2µs ± 7%      2.2µs ± 3%   -84.65%  (p=0.000 n=10+10)
-CRC32/poly=IEEE/size=512/align=0-8            238ns ± 5%       57ns ± 3%   -76.00%  (p=0.000 n=10+10)
-CRC32/poly=IEEE/size=32kB/align=0-8          15.0µs ± 7%      2.2µs ± 3%   -85.57%  (p=0.000 n=10+10)
-CRC32/poly=IEEE/size=4kB/align=1-8           1.76µs ± 6%     0.30µs ± 3%   -83.05%  (p=0.000 n=10+10)
-CRC32/poly=IEEE/size=4kB/align=0-8           1.74µs ± 8%     0.30µs ± 1%   -82.87%  (p=0.000 n=10+9)
-CRC32/poly=IEEE/size=1kB/align=1-8            444ns ± 2%       93ns ± 2%   -78.97%  (p=0.000 n=10+8)
-CRC32/poly=IEEE/size=15/align=0-8            46.9ns ± 8%     44.5ns ± 3%    -5.01%  (p=0.008 n=10+10)
+CRC32/poly=Koopman/size=15/align=1-8         35.1ns ± 5%     35.5ns ± 1%      ~     (p=0.508 n=10+9)
+CRC32/poly=Koopman/size=40/align=1-8         91.1ns ± 6%     88.0ns ± 3%      ~     (p=0.055 n=10+10)
+CRC32/poly=Koopman/size=512/align=1-8        1.13µs ± 6%     1.17µs ± 8%      ~     (p=0.143 n=10+10)
+CRC32/poly=Koopman/size=4kB/align=0-8        9.03µs ± 6%     9.00µs ± 6%      ~     (p=0.971 n=10+10)
+CRC32/poly=Koopman/size=4kB/align=1-8        8.94µs ±10%     9.05µs ±12%      ~     (p=0.754 n=10+10)
+CRC32/poly=Koopman/size=32kB/align=0-8       72.4µs ± 9%     72.9µs ± 4%      ~     (p=0.684 n=10+10)
+CRC32/poly=Castagnoli/size=1kB/align=0-8     65.5ns ± 1%     66.2ns ± 1%    +1.01%  (p=0.003 n=9+8)
+CRC32/poly=IEEE/size=40/align=1-8            41.1ns ± 1%     42.0ns ± 3%    +2.34%  (p=0.000 n=9+10)
+CRC32/poly=IEEE/size=40/align=0-8            41.0ns ± 1%     42.5ns ± 6%    +3.56%  (p=0.000 n=8+10)
+CRC32/poly=Koopman/size=1kB/align=0-8        2.24µs ± 6%     2.34µs ± 4%    +4.34%  (p=0.010 n=9+10)
+CRC32/poly=Koopman/size=32kB/align=1-8       69.6µs ± 3%     74.3µs ± 3%    +6.70%  (p=0.000 n=8+10)
+CRC32/poly=Koopman/size=1kB/align=1-8        2.15µs ± 2%     2.36µs ± 5%    +9.84%  (p=0.000 n=9+10)
 
 name                                       old speed      new speed       delta
-CRC32/poly=Castagnoli/size=1kB/align=0-8   15.6GB/s ± 1%   15.5GB/s ± 1%    -1.02%  (p=0.002 n=9+8)
-CRC32/poly=Koopman/size=1kB/align=1-8       477MB/s ± 2%    434MB/s ± 5%    -8.92%  (p=0.000 n=9+10)
-CRC32/poly=IEEE/size=40/align=0-8           975MB/s ± 1%    942MB/s ± 5%    -3.37%  (p=0.001 n=8+10)
-CRC32/poly=IEEE/size=40/align=1-8           974MB/s ± 1%    952MB/s ± 3%    -2.25%  (p=0.000 n=9+10)
-CRC32/poly=Koopman/size=32kB/align=1-8      471MB/s ± 3%    441MB/s ± 3%    -6.25%  (p=0.000 n=8+10)
-CRC32/poly=Koopman/size=15/align=0-8        412MB/s ±10%    421MB/s ± 3%      ~     (p=0.218 n=10+10)
-CRC32/poly=Koopman/size=32kB/align=0-8      453MB/s ± 8%    450MB/s ± 4%      ~     (p=0.684 n=10+10)
-CRC32/poly=Koopman/size=4kB/align=1-8       459MB/s ± 9%    455MB/s ±11%      ~     (p=0.739 n=10+10)
-CRC32/poly=Koopman/size=4kB/align=0-8       454MB/s ± 5%    455MB/s ± 6%      ~     (p=0.971 n=10+10)
+CRC32/poly=IEEE/size=32kB/align=0-8        2.19GB/s ± 7%  15.19GB/s ± 3%  +591.99%  (p=0.000 n=10+10)
+CRC32/poly=IEEE/size=32kB/align=1-8        2.31GB/s ± 8%  15.04GB/s ± 3%  +550.07%  (p=0.000 n=10+10)
+CRC32/poly=IEEE/size=4kB/align=1-8         2.33GB/s ± 6%  13.68GB/s ± 3%  +488.23%  (p=0.000 n=10+10)
+CRC32/poly=IEEE/size=4kB/align=0-8         2.36GB/s ± 7%  13.73GB/s ± 1%  +482.26%  (p=0.000 n=10+9)
+CRC32/poly=IEEE/size=1kB/align=0-8         2.26GB/s ± 4%  10.88GB/s ± 2%  +381.12%  (p=0.000 n=10+8)
+CRC32/poly=IEEE/size=1kB/align=1-8         2.31GB/s ± 2%  10.98GB/s ± 2%  +375.97%  (p=0.000 n=10+8)
+CRC32/poly=IEEE/size=512/align=0-8         2.15GB/s ± 4%   8.97GB/s ± 3%  +317.65%  (p=0.000 n=10+10)
+CRC32/poly=IEEE/size=512/align=1-8         2.17GB/s ± 3%   8.96GB/s ± 3%  +312.89%  (p=0.000 n=10+10)
+CRC32/poly=Koopman/size=512/align=0-8       453MB/s ± 5%    476MB/s ± 3%    +5.09%  (p=0.000 n=10+10)
+CRC32/poly=IEEE/size=15/align=0-8           321MB/s ± 8%    337MB/s ± 3%    +5.06%  (p=0.009 n=10+10)
+CRC32/poly=Castagnoli/size=4kB/align=1-8   24.1GB/s ± 6%   25.3GB/s ± 3%    +4.71%  (p=0.005 n=10+10)
+CRC32/poly=Koopman/size=40/align=0-8        437MB/s ± 9%    456MB/s ± 2%    +4.50%  (p=0.002 n=10+10)
+CRC32/poly=Castagnoli/size=32kB/align=1-8  25.9GB/s ± 3%   26.8GB/s ± 4%    +3.62%  (p=0.002 n=9+10)
 CRC32/poly=IEEE/size=15/align=1-8           336MB/s ± 4%    337MB/s ± 4%      ~     (p=0.579 n=10+10)
-CRC32/poly=Koopman/size=1kB/align=0-8       452MB/s ± 9%    438MB/s ± 4%      ~     (p=0.052 n=10+10)
-CRC32/poly=Koopman/size=512/align=1-8       455MB/s ± 6%    440MB/s ± 8%      ~     (p=0.143 n=10+10)
 CRC32/poly=Castagnoli/size=15/align=0-8     916MB/s ± 2%    920MB/s ± 2%      ~     (p=0.489 n=9+9)
 CRC32/poly=Castagnoli/size=15/align=1-8     870MB/s ± 2%    867MB/s ± 2%      ~     (p=0.661 n=9+10)
 CRC32/poly=Castagnoli/size=40/align=0-8    2.30GB/s ± 2%   2.28GB/s ± 4%      ~     (p=0.684 n=10+10)
 CRC32/poly=Castagnoli/size=40/align=1-8    2.03GB/s ± 3%   2.06GB/s ± 2%      ~     (p=0.063 n=10+10)
 CRC32/poly=Castagnoli/size=512/align=0-8   12.7GB/s ± 2%   12.8GB/s ± 4%      ~     (p=0.529 n=10+10)
 CRC32/poly=Castagnoli/size=512/align=1-8   12.1GB/s ± 3%   12.2GB/s ± 1%      ~     (p=0.780 n=10+9)
-CRC32/poly=Koopman/size=40/align=1-8        440MB/s ± 6%    455MB/s ± 3%      ~     (p=0.052 n=10+10)
 CRC32/poly=Castagnoli/size=1kB/align=1-8   14.6GB/s ± 6%   15.0GB/s ± 2%      ~     (p=0.211 n=10+9)
 CRC32/poly=Castagnoli/size=4kB/align=0-8   25.1GB/s ± 5%   25.7GB/s ± 3%      ~     (p=0.052 n=10+10)
-CRC32/poly=Koopman/size=15/align=1-8        427MB/s ± 5%    422MB/s ± 1%      ~     (p=0.497 n=10+9)
 CRC32/poly=Castagnoli/size=32kB/align=0-8  26.9GB/s ± 4%   26.8GB/s ± 5%      ~     (p=0.842 n=9+10)
-CRC32/poly=IEEE/size=512/align=1-8         2.17GB/s ± 3%   8.96GB/s ± 3%  +312.89%  (p=0.000 n=10+10)
-CRC32/poly=Castagnoli/size=32kB/align=1-8  25.9GB/s ± 3%   26.8GB/s ± 4%    +3.62%  (p=0.002 n=9+10)
-CRC32/poly=Castagnoli/size=4kB/align=1-8   24.1GB/s ± 6%   25.3GB/s ± 3%    +4.71%  (p=0.005 n=10+10)
-CRC32/poly=Koopman/size=40/align=0-8        437MB/s ± 9%    456MB/s ± 2%    +4.50%  (p=0.002 n=10+10)
-CRC32/poly=IEEE/size=512/align=0-8         2.15GB/s ± 4%   8.97GB/s ± 3%  +317.65%  (p=0.000 n=10+10)
-CRC32/poly=Koopman/size=512/align=0-8       453MB/s ± 5%    476MB/s ± 3%    +5.09%  (p=0.000 n=10+10)
-CRC32/poly=IEEE/size=32kB/align=1-8        2.31GB/s ± 8%  15.04GB/s ± 3%  +550.07%  (p=0.000 n=10+10)
-CRC32/poly=IEEE/size=32kB/align=0-8        2.19GB/s ± 7%  15.19GB/s ± 3%  +591.99%  (p=0.000 n=10+10)
-CRC32/poly=IEEE/size=4kB/align=1-8         2.33GB/s ± 6%  13.68GB/s ± 3%  +488.23%  (p=0.000 n=10+10)
-CRC32/poly=IEEE/size=4kB/align=0-8         2.36GB/s ± 7%  13.73GB/s ± 1%  +482.26%  (p=0.000 n=10+9)
-CRC32/poly=IEEE/size=1kB/align=1-8         2.31GB/s ± 2%  10.98GB/s ± 2%  +375.97%  (p=0.000 n=10+8)
-CRC32/poly=IEEE/size=1kB/align=0-8         2.26GB/s ± 4%  10.88GB/s ± 2%  +381.12%  (p=0.000 n=10+8)
-CRC32/poly=IEEE/size=15/align=0-8           321MB/s ± 8%    337MB/s ± 3%    +5.06%  (p=0.009 n=10+10)
+CRC32/poly=Koopman/size=15/align=0-8        412MB/s ±10%    421MB/s ± 3%      ~     (p=0.218 n=10+10)
+CRC32/poly=Koopman/size=15/align=1-8        427MB/s ± 5%    422MB/s ± 1%      ~     (p=0.497 n=10+9)
+CRC32/poly=Koopman/size=40/align=1-8        440MB/s ± 6%    455MB/s ± 3%      ~     (p=0.052 n=10+10)
+CRC32/poly=Koopman/size=512/align=1-8       455MB/s ± 6%    440MB/s ± 8%      ~     (p=0.143 n=10+10)
+CRC32/poly=Koopman/size=1kB/align=0-8       452MB/s ± 9%    438MB/s ± 4%      ~     (p=0.052 n=10+10)
+CRC32/poly=Koopman/size=4kB/align=0-8       454MB/s ± 5%    455MB/s ± 6%      ~     (p=0.971 n=10+10)
+CRC32/poly=Koopman/size=4kB/align=1-8       459MB/s ± 9%    455MB/s ±11%      ~     (p=0.739 n=10+10)
+CRC32/poly=Koopman/size=32kB/align=0-8      453MB/s ± 8%    450MB/s ± 4%      ~     (p=0.684 n=10+10)
+CRC32/poly=Castagnoli/size=1kB/align=0-8   15.6GB/s ± 1%   15.5GB/s ± 1%    -1.02%  (p=0.002 n=9+8)
+CRC32/poly=IEEE/size=40/align=1-8           974MB/s ± 1%    952MB/s ± 3%    -2.25%  (p=0.000 n=9+10)
+CRC32/poly=IEEE/size=40/align=0-8           975MB/s ± 1%    942MB/s ± 5%    -3.37%  (p=0.001 n=8+10)
+CRC32/poly=Koopman/size=32kB/align=1-8      471MB/s ± 3%    441MB/s ± 3%    -6.25%  (p=0.000 n=8+10)
+CRC32/poly=Koopman/size=1kB/align=1-8       477MB/s ± 2%    434MB/s ± 5%    -8.92%  (p=0.000 n=9+10)