benchmath: new package of opinionated benchmark statistics

Updates golang/go#20728.

Change-Id: I4c33e64d5959cadfbb97ca6a2274e0c060e87d29
Reviewed-on: https://go-review.googlesource.com/c/perf/+/283616
Trust: Austin Clements <austin@google.com>
Run-TryBot: Austin Clements <austin@google.com>
Reviewed-by: David Chase <drchase@google.com>
Reviewed-by: Russ Cox <rsc@golang.org>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
diff --git a/benchmath/aexact.go b/benchmath/aexact.go
new file mode 100644
index 0000000..3519e76
--- /dev/null
+++ b/benchmath/aexact.go
@@ -0,0 +1,51 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package benchmath
+
+import "fmt"
+
+// AssumeExact is an assumption that a value can be measured exactly
+// and thus has no distribution and does not require repeated sampling.
+// It reports a warning if not all values in a sample are equal.
+var AssumeExact = assumeExact{}
+
+type assumeExact struct{}
+
+var _ Assumption = assumeExact{}
+
+func (assumeExact) SummaryLabel() string {
+	// Really the summary is the mode, but the point of this
+	// assumption is that the summary is the exact value.
+	return "exact"
+}
+
+func (assumeExact) Summary(s *Sample, confidence float64) Summary {
+	// Find the sample's mode. This checks if all samples are the
+	// same, and lets us return a reasonable summary even if they
+	// aren't all the same.
+	val, count := s.Values[0], 1
+	modeVal, modeCount := val, count
+	for _, v := range s.Values[1:] {
+		if v == val {
+			count++
+			if count > modeCount {
+				modeVal, modeCount = val, count
+			}
+		} else {
+			val, count = v, 1
+		}
+	}
+	summary := Summary{Center: modeVal, Lo: s.Values[0], Hi: s.Values[len(s.Values)-1], Confidence: 1}
+
+	if modeCount != len(s.Values) {
+		// They're not all the same. Report a warning.
+		summary.Warnings = []error{fmt.Errorf("exact distribution expected, but values range from %v to %v", s.Values[0], s.Values[len(s.Values)-1])}
+	}
+	return summary
+}
+
+func (assumeExact) Compare(s1, s2 *Sample) Comparison {
+	return Comparison{P: 0, N1: len(s1.Values), N2: len(s2.Values)}
+}
diff --git a/benchmath/anone.go b/benchmath/anone.go
new file mode 100644
index 0000000..0ce86b6
--- /dev/null
+++ b/benchmath/anone.go
@@ -0,0 +1,134 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package benchmath
+
+import (
+	"fmt"
+	"math"
+	"sync"
+
+	"github.com/aclements/go-moremath/stats"
+)
+
+// AssumeNothing is a non-parametric Assumption (that is, it makes no
+// distributional assumptions). The summary statistic is the sample
+// median and comparisons are done using the Mann-Whitney U test.
+//
+// This is a good default assumption for benchmarks.
+// There's substantial evidence that benchmark results are non-normal.
+// The disadvantage (of any non-parametric methods) is that this is
+// less statistically powerful than parametric methods.
+var AssumeNothing = assumeNothing{}
+
+type assumeNothing struct{}
+
+var _ Assumption = assumeNothing{}
+
+// medianCache maps from ciKey to stats.QuantileCIResult for median
+// confidence intervals.
+var medianCache sync.Map
+
+func medianCI(n int, confidence float64) stats.QuantileCIResult {
+	type ciKey struct {
+		n          int
+		confidence float64
+	}
+	key := ciKey{n, confidence}
+	if ciX, ok := medianCache.Load(key); ok {
+		return ciX.(stats.QuantileCIResult)
+	}
+	ci := stats.QuantileCI(n, 0.5, confidence)
+	medianCache.Store(key, ci)
+	return ci
+}
+
+// medianSamples returns the minimum number of samples required to get
+// a finite confidence interval at the given confidence level.
+func medianSamples(confidence float64) (op string, n int) {
+	const limit = 50
+	// We need at least two samples to have an interval.
+	for n = 2; n <= limit; n++ {
+		ci := medianCI(n, confidence)
+		if 0 < ci.LoOrder && ci.HiOrder <= n {
+			return ">=", n
+		}
+	}
+	return ">", limit
+}
+
+func (assumeNothing) SummaryLabel() string {
+	return "median"
+}
+
+func (assumeNothing) Summary(s *Sample, confidence float64) Summary {
+	ci := medianCI(len(s.Values), confidence)
+	median, lo, hi := ci.SampleCI(s.sample())
+
+	var warnings []error
+	if math.IsInf(lo, 0) || math.IsInf(hi, 0) {
+		// Explain to the user why there's a ±∞
+		op, need := medianSamples(confidence)
+		msg := fmt.Errorf("need %s %d samples for confidence interval at level %v", op, need, confidence)
+		warnings = append(warnings, msg)
+	}
+
+	return Summary{median, lo, hi, ci.Confidence, warnings}
+}
+
+// uTestMinP[n] is the minimum possible P value for the U-test with
+// two samples of size n.
+//
+// Generated by go run mktables.go.
+var uTestMinP = []float64{
+	1: 1,
+	2: 0.3333333333333333,
+	3: 0.1,
+	4: 0.02857142857142857,
+	5: 0.007936507936507936,
+	6: 0.0021645021645021645,
+	7: 0.0005827505827505828,
+	8: 0.0001554001554001554,
+	9: 4.113533525298231e-05,
+}
+
+// uTestSamples returns the minimum number of samples required for the
+// U-test to achieve statistical significance at the given alpha
+// level.
+func uTestSamples(alpha float64) (op string, n int) {
+	for n, minP := range uTestMinP {
+		if n == 0 {
+			continue
+		}
+		if minP <= alpha {
+			return ">=", n
+		}
+	}
+	return ">", len(uTestMinP)
+}
+
+func (assumeNothing) Compare(s1, s2 *Sample) Comparison {
+	res, err := stats.MannWhitneyUTest(s1.Values, s2.Values, stats.LocationDiffers)
+	if err != nil {
+		// The U-test failed. Report as if there's no
+		// significant difference, along with the error.
+		return Comparison{P: 1, N1: len(s1.Values), N2: len(s2.Values), Alpha: s1.Thresholds.CompareAlpha, Warnings: []error{err}}
+	}
+	cmp := Comparison{P: res.P, N1: res.N1, N2: res.N2, Alpha: s1.Thresholds.CompareAlpha}
+	// Warn if there aren't enough samples to report a difference
+	// even if they were maximally diverged.
+	if cmp.P > cmp.Alpha {
+		op, n := uTestSamples(cmp.Alpha)
+		if cmp.N1 < n && cmp.N2 < n {
+			// We could deal with asymmetric sample sizes
+			// by first ramping up the smaller sample
+			// until the minimum P value is sufficient or
+			// the sample sizes are equal. But it doesn't
+			// seem worth the complexity.
+			msg := fmt.Errorf("need %s %d samples to detect a difference at alpha level %v", op, n, cmp.Alpha)
+			cmp.Warnings = append(cmp.Warnings, msg)
+		}
+	}
+	return cmp
+}
diff --git a/benchmath/anormal.go b/benchmath/anormal.go
new file mode 100644
index 0000000..b9fd4bf
--- /dev/null
+++ b/benchmath/anormal.go
@@ -0,0 +1,44 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package benchmath
+
+import "github.com/aclements/go-moremath/stats"
+
+// AssumeNormal is an assumption that a sample is normally distributed.
+// The summary statistic is the sample mean and comparisons are done
+// using the two-sample t-test.
+var AssumeNormal = assumeNormal{}
+
+type assumeNormal struct{}
+
+var _ Assumption = assumeNormal{}
+
+func (assumeNormal) SummaryLabel() string {
+	return "mean"
+}
+
+func (assumeNormal) Summary(s *Sample, confidence float64) Summary {
+	// TODO: Perform a normality test.
+
+	sample := s.sample()
+	mean, lo, hi := sample.MeanCI(confidence)
+
+	return Summary{
+		Center:     mean,
+		Lo:         lo,
+		Hi:         hi,
+		Confidence: confidence,
+	}
+}
+
+func (assumeNormal) Compare(s1, s2 *Sample) Comparison {
+	t, err := stats.TwoSampleWelchTTest(s1.sample(), s2.sample(), stats.LocationDiffers)
+	if err != nil {
+		// The t-test failed. Report as if there's no
+		// significant difference, along with the error.
+		return Comparison{P: 1, N1: len(s1.Values), N2: len(s2.Values), Warnings: []error{err}}
+	}
+	return Comparison{P: t.P, N1: len(s1.Values), N2: len(s2.Values)}
+}
diff --git a/benchmath/assumption_test.go b/benchmath/assumption_test.go
new file mode 100644
index 0000000..4a9d671
--- /dev/null
+++ b/benchmath/assumption_test.go
@@ -0,0 +1,178 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package benchmath
+
+import (
+	"fmt"
+	"math"
+	"testing"
+
+	"github.com/aclements/go-moremath/stats"
+)
+
+func TestMedianSamples(t *testing.T) {
+	if false {
+		for n := 2; n <= 50; n++ {
+			d := stats.BinomialDist{N: n, P: 0.5}
+			t.Log(n, 1-(d.PMF(0)+d.PMF(float64(d.N))), d.PMF(0))
+		}
+	}
+
+	check := func(confidence float64, wantOp string, wantN int) {
+		t.Helper()
+		gotOp, gotN := medianSamples(confidence)
+		if gotOp != wantOp || gotN != wantN {
+			t.Errorf("for confidence %v, want %s %d, got %s %d", confidence, wantOp, wantN, gotOp, gotN)
+		}
+	}
+
+	// At n=6, the tails are 0.015625 * 2 => 0.03125
+	check(0.95, ">=", 6)
+	// At n=8, the tails are 0.00390625 * 2 => 0.0078125
+	check(0.99, ">=", 8)
+	// The hard-coded threshold is 50.
+	check(1, ">", 50)
+	// Check the other extreme. We always need at least two
+	// samples to have an interval.
+	check(0, ">=", 2)
+}
+
+func TestUTestSamples(t *testing.T) {
+	check := func(alpha float64, wantOp string, wantN int) {
+		t.Helper()
+		gotOp, gotN := uTestSamples(alpha)
+		if gotOp != wantOp || gotN != wantN {
+			t.Errorf("for alpha %v, want %s %d, got %s %d", alpha, wantOp, wantN, gotOp, gotN)
+		}
+	}
+	check(1, ">=", 1)
+	check(0.05, ">=", 4)
+	check(0.01, ">=", 5)
+	check(1e-50, ">", 10)
+	check(0, ">", 10)
+}
+
+func TestSummaryNone(t *testing.T) {
+	// The following tests correspond to the tests in
+	// TestMedianSamples.
+	a := AssumeNothing
+	var sample *Sample
+	inf := math.Inf(1)
+	sample = NewSample([]float64{-10, 2, 3, 4, 5, 6}, &DefaultThresholds)
+	checkSummary(t, a.Summary(sample, 0.95),
+		Summary{Center: 3.5, Lo: -10, Hi: 6, Confidence: 1 - 0.03125})
+	checkSummary(t, a.Summary(sample, 0.99),
+		Summary{Center: 3.5, Lo: -inf, Hi: inf, Confidence: 1},
+		"need >= 8 samples for confidence interval at level 0.99")
+	checkSummary(t, a.Summary(sample, 1),
+		Summary{Center: 3.5, Lo: -inf, Hi: inf, Confidence: 1},
+		"need > 50 samples for confidence interval at level 1")
+	sample = NewSample([]float64{1, 2}, &DefaultThresholds)
+	checkSummary(t, a.Summary(sample, 0),
+		Summary{Center: 1.5, Lo: 1, Hi: 2, Confidence: 0.5})
+
+	// And test very small samples.
+	sample = NewSample([]float64{1}, &DefaultThresholds)
+	checkSummary(t, a.Summary(sample, 0.95),
+		Summary{Center: 1, Lo: -inf, Hi: inf, Confidence: 1},
+		"need >= 6 samples for confidence interval at level 0.95")
+}
+
+func TestCompareNone(t *testing.T) {
+	// Most of the complexity is in the sample size warning.
+	a := AssumeNothing
+	thr := DefaultThresholds
+	thr.CompareAlpha = 0.05
+	// Too-small samples.
+	s1 := NewSample([]float64{-1, -1, -1}, &thr)
+	s2 := NewSample([]float64{1, 1, 1}, &thr)
+	checkComparison(t, a.Compare(s1, s2),
+		Comparison{P: 0.1, N1: 3, N2: 3, Alpha: 0.05},
+		"need >= 4 samples to detect a difference at alpha level 0.05")
+	// Big enough samples with a difference.
+	s1 = NewSample([]float64{-1, -1, -1, -1}, &thr)
+	s2 = NewSample([]float64{1, 1, 1, 1}, &thr)
+	checkComparison(t, a.Compare(s1, s2),
+		Comparison{P: 0.02857142857142857, N1: 4, N2: 4, Alpha: 0.05})
+	// Big enough samples, but not enough difference.
+	s1 = NewSample([]float64{1, -1, -1, -1}, &thr)
+	s2 = NewSample([]float64{-1, 1, 1, 1}, &thr)
+	checkComparison(t, a.Compare(s1, s2),
+		Comparison{P: 0.4857142857142857, N1: 4, N2: 4, Alpha: 0.05})
+
+	// All samples equal, so the U-test is meaningless.
+	s1 = NewSample([]float64{1, 1, 1, 1}, &thr)
+	s2 = NewSample([]float64{1, 1, 1, 1}, &thr)
+	checkComparison(t, a.Compare(s1, s2),
+		Comparison{P: 1, N1: 4, N2: 4, Alpha: 0.05},
+		"all samples are equal")
+
+}
+
+func TestSummaryNormal(t *testing.T) {
+	// This is a thin wrapper around sample.MeanCI, so just do a
+	// smoke test.
+	a := AssumeNormal
+	sample := NewSample([]float64{-8, 2, 3, 4, 5, 6}, &DefaultThresholds)
+	checkSummary(t, a.Summary(sample, 0.95),
+		Summary{Center: 2, Lo: -3.351092806089359, Hi: 7.351092806089359, Confidence: 0.95})
+}
+
+func TestSummaryExact(t *testing.T) {
+	a := AssumeExact
+	sample := NewSample([]float64{1, 1, 1, 1}, &DefaultThresholds)
+	checkSummary(t, a.Summary(sample, 0.95),
+		Summary{Center: 1, Lo: 1, Hi: 1, Confidence: 1})
+
+	sample = NewSample([]float64{1}, &DefaultThresholds)
+	checkSummary(t, a.Summary(sample, 0.95),
+		Summary{Center: 1, Lo: 1, Hi: 1, Confidence: 1})
+
+	sample = NewSample([]float64{1, 2, 2, 3}, &DefaultThresholds)
+	checkSummary(t, a.Summary(sample, 0.95),
+		Summary{Center: 2, Lo: 1, Hi: 3, Confidence: 1},
+		"exact distribution expected, but values range from 1 to 3")
+}
+
+func aeq(x, y float64) bool {
+	if x < 0 && y < 0 {
+		x, y = -x, -y
+	}
+	// Check that x and y are equal to 8 digits.
+	const factor = 1 - 1e-7
+	return x*factor <= y && y*factor <= x
+}
+
+func checkSummary(t *testing.T, got, want Summary, warnings ...string) {
+	t.Helper()
+	for _, w := range warnings {
+		want.Warnings = append(want.Warnings, fmt.Errorf("%s", w))
+	}
+	if !aeq(got.Center, want.Center) || !aeq(got.Lo, want.Lo) || !aeq(got.Hi, got.Hi) || got.Confidence != want.Confidence || !errorsEq(got.Warnings, want.Warnings) {
+		t.Errorf("got %v, want %v", got, want)
+	}
+}
+
+func checkComparison(t *testing.T, got, want Comparison, warnings ...string) {
+	t.Helper()
+	for _, w := range warnings {
+		want.Warnings = append(want.Warnings, fmt.Errorf("%s", w))
+	}
+	if !aeq(got.P, want.P) || got.N1 != want.N1 || got.N2 != want.N2 || got.Alpha != want.Alpha || !errorsEq(got.Warnings, want.Warnings) {
+		t.Errorf("got %#v, want %#v", got, want)
+	}
+}
+
+func errorsEq(a, b []error) bool {
+	if len(a) != len(b) {
+		return false
+	}
+	for i := range a {
+		if a[i].Error() != b[i].Error() {
+			return false
+		}
+	}
+	return true
+}
diff --git a/benchmath/mktables.go b/benchmath/mktables.go
new file mode 100644
index 0000000..d377603
--- /dev/null
+++ b/benchmath/mktables.go
@@ -0,0 +1,35 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build ignore
+// +build ignore
+
+// Mktables pre-computes statistical tables.
+package main
+
+import (
+	"fmt"
+
+	"github.com/aclements/go-moremath/stats"
+)
+
+func main() {
+	var s1, s2 []float64
+
+	// Compute minimal P-value for the U-test given different
+	// sample sizes.
+	fmt.Printf("var uTestMinP = []float64{\n")
+	for n := 1; n < 10; n++ {
+		// The P-value is minimized when the order statistic
+		// is maximally separated.
+		s1 = append(s1, -1)
+		s2 = append(s2, 1)
+		res, err := stats.MannWhitneyUTest(s1, s2, stats.LocationDiffers)
+		if err != nil {
+			panic(err)
+		}
+		fmt.Printf("\t%d: %v,\n", n, res.P)
+	}
+	fmt.Printf("}\n")
+}
diff --git a/benchmath/sample.go b/benchmath/sample.go
new file mode 100644
index 0000000..b9d8513
--- /dev/null
+++ b/benchmath/sample.go
@@ -0,0 +1,196 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package benchmath provides tools for computing statistics over
+// distributions of benchmark measurements.
+//
+// This package is opinionated. For example, it doesn't provide
+// specific statistical tests. Instead, callers state distributional
+// assumptions and this package chooses appropriate tests.
+//
+// All analysis results contain a list of warnings, captured as an
+// []error value. These aren't errors that prevent analysis, but
+// should be presented to the user along with analysis results.
+package benchmath
+
+import (
+	"fmt"
+	"math"
+	"sort"
+
+	"github.com/aclements/go-moremath/mathx"
+	"github.com/aclements/go-moremath/stats"
+)
+
+// A Sample is a set of repeated measurements of a given benchmark.
+type Sample struct {
+	// Values are the measured values, in ascending order.
+	Values []float64
+
+	// Thresholds stores the statistical thresholds used by tests
+	// on this sample.
+	Thresholds *Thresholds
+
+	// Warnings is a list of warnings about this sample that
+	// should be reported to the user.
+	Warnings []error
+}
+
+// NewSample constructs a Sample from a set of measurements.
+func NewSample(values []float64, t *Thresholds) *Sample {
+	// TODO: Analyze stationarity and put results in Warnings.
+	// Consider Augmented Dickey–Fuller (based on Maricq et al.)
+
+	// Sort values for fast order statistics.
+	sort.Float64s(values)
+	return &Sample{values, t, nil}
+}
+
+func (s *Sample) sample() stats.Sample {
+	return stats.Sample{Xs: s.Values, Sorted: true}
+}
+
+// A Thresholds configures various thresholds used by statistical tests.
+//
+// This should be initialized to DefaultThresholds because it may be
+// extended with other fields in the future.
+type Thresholds struct {
+	// CompareAlpha is the alpha level below which
+	// Assumption.Compare rejects the null hypothesis that two
+	// samples come from the same distribution.
+	//
+	// This is typically 0.05.
+	CompareAlpha float64
+}
+
+// Note: Thresholds exists so we can extend it in the future with
+// things like the stationarity and normality test thresholds without
+// having to add function arguments in the future.
+
+// DefaultThresholds contains a reasonable set of defaults for Thresholds.
+var DefaultThresholds = Thresholds{
+	CompareAlpha: 0.05,
+}
+
+// An Assumption indicates a distributional assumption about a sample.
+type Assumption interface {
+	// SummaryLabel returns the string name for the summary
+	// statistic under this assumption. For example, "median" or
+	// "mean".
+	SummaryLabel() string
+
+	// Summary returns a summary statistic and its confidence
+	// interval at the given confidence level for Sample s.
+	//
+	// Confidence is given in the range [0,1], e.g., 0.95 for 95%
+	// confidence.
+	Summary(s *Sample, confidence float64) Summary
+
+	// Compare tests whether s1 and s2 come from the same
+	// distribution.
+	Compare(s1, s2 *Sample) Comparison
+}
+
+// A Summary summarizes a Sample.
+type Summary struct {
+	// Center is some measure of the central tendency of a sample.
+	Center float64
+
+	// Lo and Hi give the bounds of the confidence interval around
+	// Center.
+	Lo, Hi float64
+
+	// Confidence is the actual confidence level of the confidence
+	// interval given by Lo, Hi. It will be >= the requested
+	// confidence level.
+	Confidence float64
+
+	// Warnings is a list of warnings about this summary or its
+	// confidence interval.
+	Warnings []error
+}
+
+// PctRangeString returns a string representation of the range of this
+// Summary's confidence interval as a percentage.
+func (s Summary) PctRangeString() string {
+	if math.IsInf(s.Lo, 0) || math.IsInf(s.Hi, 0) {
+		return "∞"
+	}
+
+	// If the signs of the bounds differ from the center, we can't
+	// render it as a percent.
+	var csign = mathx.Sign(s.Center)
+	if csign != mathx.Sign(s.Lo) || csign != mathx.Sign(s.Hi) {
+		return "?"
+	}
+
+	// If center is 0, avoid dividing by zero. But we can only get
+	// here if lo and hi are also 0, in which case is seems
+	// reasonable to call this 0%.
+	if s.Center == 0 {
+		return "0%"
+	}
+
+	// Phew. Compute the range percent.
+	v := math.Max(s.Hi/s.Center-1, 1-s.Lo/s.Center)
+	return fmt.Sprintf("%.0f%%", 100*v)
+}
+
+// A Comparison is the result of comparing two samples to test if they
+// come from the same distribution.
+type Comparison struct {
+	// P is the p-value of the null hypothesis that two samples
+	// come from the same distribution. If P is less than a
+	// threshold alpha (typically 0.05), then we reject the null
+	// hypothesis.
+	//
+	// P can be 0, which indicates this is an exact result.
+	P float64
+
+	// N1 and N2 are the sizes of the two samples.
+	N1, N2 int
+
+	// Alpha is the alpha threshold for this test. If P < Alpha,
+	// we reject the null hypothesis that the two samples come
+	// from the same distribution.
+	Alpha float64
+
+	// Warnings is a list of warnings about this comparison
+	// result.
+	Warnings []error
+}
+
+// String summarizes the comparison. The general form of this string
+// is "p=0.PPP n=N1+N2" but can be shortened.
+func (c Comparison) String() string {
+	var s string
+	if c.P != 0 {
+		s = fmt.Sprintf("p=%0.3f ", c.P)
+	}
+	if c.N1 == c.N2 {
+		// Slightly shorter form for a common case.
+		return s + fmt.Sprintf("n=%d", c.N1)
+	}
+	return s + fmt.Sprintf("n=%d+%d", c.N1, c.N2)
+}
+
+// FormatDelta formats the difference in the centers of two distributions.
+// The old and new values must be the center summaries of the two
+// compared samples. If the Comparison accepts the null hypothesis
+// that the samples come from the same distribution, FormatDelta
+// returns "~" to indicate there's no meaningful difference.
+// Otherwise, it returns the percent difference between the centers.
+func (c Comparison) FormatDelta(old, new float64) string {
+	if c.P > c.Alpha {
+		return "~"
+	}
+	if old == new {
+		return "0.00%"
+	}
+	if old == 0 {
+		return "?"
+	}
+	pct := ((new / old) - 1.0) * 100.0
+	return fmt.Sprintf("%+.2f%%", pct)
+}
diff --git a/benchmath/sample_test.go b/benchmath/sample_test.go
new file mode 100644
index 0000000..13916ff
--- /dev/null
+++ b/benchmath/sample_test.go
@@ -0,0 +1,68 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package benchmath
+
+import (
+	"math"
+	"testing"
+)
+
+func TestSummaryFormat(t *testing.T) {
+	check := func(center, lo, hi float64, want string) {
+		t.Helper()
+		s := Summary{Center: center, Lo: lo, Hi: hi}
+		got := s.PctRangeString()
+		if got != want {
+			t.Errorf("for %v CI [%v, %v], got %s, want %s", center, lo, hi, got, want)
+		}
+	}
+	inf := math.Inf(1)
+
+	check(1, 0.5, 1.1, "50%")
+	check(1, 0.9, 1.5, "50%")
+	check(1, 1, 1, "0%")
+
+	check(-1, -0.5, -1.1, "50%")
+	check(-1, -0.9, -1.5, "50%")
+	check(-1, -1, -1, "0%")
+
+	check(1, -inf, 1, "∞")
+	check(1, 1, inf, "∞")
+
+	check(1, -1, 1, "?")
+	check(1, -1, -1, "?")
+	check(-1, -1, 1, "?")
+	check(-1, 1, -1, "?")
+	check(0, -1, 1, "?")
+
+	check(0, 0, 0, "0%")
+}
+
+func TestComparisonFormat(t *testing.T) {
+	check := func(p float64, n1, n2 int, want string) {
+		t.Helper()
+		got := Comparison{P: p, N1: n1, N2: n2}.String()
+		if got != want {
+			t.Errorf("for %v,%v,%v, got %s, want %s", p, n1, n2, got, want)
+		}
+	}
+	check(0.5, 1, 2, "p=0.500 n=1+2")
+	check(0.5, 2, 2, "p=0.500 n=2")
+	check(0, 1, 2, "n=1+2")
+	check(0, 2, 2, "n=2")
+
+	checkD := func(p, old, new, alpha float64, want string) {
+		got := Comparison{P: p, Alpha: alpha}.FormatDelta(old, new)
+		if got != want {
+			t.Errorf("for p=%v %v=>%v @%v, got %s, want %s", p, old, new, alpha, got, want)
+		}
+	}
+	checkD(0.5, 0, 0, 0.05, "~")
+	checkD(0.01, 0, 0, 0.05, "0.00%")
+	checkD(0.01, 1, 1, 0.05, "0.00%")
+	checkD(0.01, 0, 1, 0.05, "?")
+	checkD(0.01, 1, 1.5, 0.05, "+50.00%")
+	checkD(0.01, 1, 0.5, 0.05, "-50.00%")
+}
diff --git a/go.mod b/go.mod
index 8bac3d7..842359f 100644
--- a/go.mod
+++ b/go.mod
@@ -6,7 +6,7 @@
 	cloud.google.com/go v0.0.0-20170206221025-ce650573d812
 	github.com/GoogleCloudPlatform/cloudsql-proxy v0.0.0-20190129172621-c8b1d7a94ddf
 	github.com/aclements/go-gg v0.0.0-20170118225347-6dbb4e4fefb0
-	github.com/aclements/go-moremath v0.0.0-20161014184102-0ff62e0875ff // indirect
+	github.com/aclements/go-moremath v0.0.0-20210112150236-f10218a38794
 	github.com/go-sql-driver/mysql v1.4.1
 	github.com/gonum/blas v0.0.0-20181208220705-f22b278b28ac // indirect
 	github.com/gonum/floats v0.0.0-20181209220543-c233463c7e82 // indirect
diff --git a/go.sum b/go.sum
index a039423..dcf682b 100644
--- a/go.sum
+++ b/go.sum
@@ -6,6 +6,8 @@
 github.com/aclements/go-gg v0.0.0-20170118225347-6dbb4e4fefb0/go.mod h1:55qNq4vcpkIuHowELi5C8e+1yUHtoLoOUR9QU5j7Tes=
 github.com/aclements/go-moremath v0.0.0-20161014184102-0ff62e0875ff h1:txKOXqsFQUyi7Ht0Prto4QMU4O/0Gby6v5RFqMS0/PM=
 github.com/aclements/go-moremath v0.0.0-20161014184102-0ff62e0875ff/go.mod h1:idZL3yvz4kzx1dsBOAC+oYv6L92P1oFEhUXUB1A/lwQ=
+github.com/aclements/go-moremath v0.0.0-20210112150236-f10218a38794 h1:xlwdaKcTNVW4PtpQb8aKA4Pjy0CdJHEqvFbAnvR5m2g=
+github.com/aclements/go-moremath v0.0.0-20210112150236-f10218a38794/go.mod h1:7e+I0LQFUI9AXWxOfsQROs9xPhoJtbsyWcjJqDd4KPY=
 github.com/go-sql-driver/mysql v1.4.1 h1:g24URVg0OFbNUTx9qqY1IRZ9D9z3iPyi5zKhQZpNwpA=
 github.com/go-sql-driver/mysql v1.4.1/go.mod h1:zAC/RDZ24gD3HViQzih4MyKcchzm+sOG5ZlKdlhCg5w=
 github.com/golang/protobuf v1.2.0 h1:P3YflyNX/ehuJFLhxviNdFxQPkGK5cDcApsge1SqnvM=