analysis: show benchstat results
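
Instead of the temporary plain-text dump of result groups, the compare
page now renders an HTML template: results for each query are grouped,
fed through an internal copy of benchstat
(analysis/internal/benchstat) that accepts parsed benchfmt results and
can emit an HTML table, and shown by the new template/compare.html
together with each group's label values.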

Change-Id: Ic9615556358e0262384a1a2f3acb9e4e6ab98532
Reviewed-on: https://go-review.googlesource.com/35502
Reviewed-by: Russ Cox <rsc@golang.org>
diff --git a/analysis/app/compare.go b/analysis/app/compare.go
index cd57505..7773ed8 100644
--- a/analysis/app/compare.go
+++ b/analysis/app/compare.go
@@ -5,10 +5,13 @@
 package app
 
 import (
-	"fmt"
+	"bytes"
+	"html/template"
+	"io/ioutil"
 	"net/http"
 	"sort"
 
+	"golang.org/x/perf/analysis/internal/benchstat"
 	"golang.org/x/perf/storage/benchfmt"
 )
 
@@ -64,6 +67,39 @@
 
 	q := r.Form.Get("q")
 
+	tmpl, err := ioutil.ReadFile("template/compare.html")
+	if err != nil {
+		http.Error(w, err.Error(), 500)
+		return
+	}
+
+	t, err := template.New("main").Parse(string(tmpl))
+	if err != nil {
+		http.Error(w, err.Error(), 500)
+		return
+	}
+
+	data, err := a.compareQuery(q)
+	if err != nil {
+		http.Error(w, err.Error(), 500)
+		return
+	}
+
+	w.Header().Set("Content-Type", "text/html; charset=utf-8")
+	if err := t.Execute(w, data); err != nil {
+		http.Error(w, err.Error(), 500)
+		return
+	}
+}
+
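+// compareData is the data passed to the compare.html template.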
+type compareData struct {
+	Q         string
+	Benchstat template.HTML
+	Groups    []*resultGroup
+	Labels    map[string]bool
+}
+
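+// compareQuery fetches the results for each sub-query in q and returns the
+// data needed to render the comparison page: the benchstat table, the result
+// groups, and the set of labels present in any group.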
+func (a *App) compareQuery(q string) (*compareData, error) {
 	// Parse query
 	queries := parseQueryString(q)
 
@@ -79,8 +115,7 @@
 		}
 		if err := res.Err(); err != nil {
 			// TODO: If the query is invalid, surface that to the user.
-			http.Error(w, err.Error(), 500)
-			return
+			return nil, err
 		}
 		groups = append(groups, group)
 	}
@@ -94,15 +129,28 @@
 		}
 	}
 
-	// TODO: Compute benchstat
-
-	// TODO: Render template. This is just temporary output to confirm the above works.
-	w.Header().Set("Content-Type", "text/plain; charset=utf-8")
-	for i, g := range groups {
-		fmt.Fprintf(w, "Group #%d: %d results\n", i, len(g.results))
-		for k, vs := range g.labelValues {
-			fmt.Fprintf(w, "\t%s: %#v\n", k, vs)
-		}
-		fmt.Fprintf(w, "\n")
+	// Compute benchstat
+	var buf bytes.Buffer
+	var results [][]*benchfmt.Result
+	for _, g := range groups {
+		results = append(results, g.results)
 	}
+	benchstat.Run(&buf, results, &benchstat.Options{
+		HTML: true,
+	})
+
+	// Gather the labels present in any group and assemble the template data.
+	labels := make(map[string]bool)
+	for _, g := range groups {
+		for k := range g.LabelValues {
+			labels[k] = true
+		}
+	}
+	data := &compareData{
+		Q:         q,
+		Benchstat: template.HTML(buf.String()),
+		Groups:    groups,
+		Labels:    labels,
+	}
+	return data, nil
 }
diff --git a/analysis/app/compare_test.go b/analysis/app/compare_test.go
index 55ca938..301fd0e 100644
--- a/analysis/app/compare_test.go
+++ b/analysis/app/compare_test.go
@@ -5,10 +5,14 @@
 package app
 
 import (
+	"fmt"
+	"net/http"
+	"net/http/httptest"
 	"reflect"
 	"strings"
 	"testing"
 
+	"golang.org/x/perf/storage"
 	"golang.org/x/perf/storage/benchfmt"
 )
 
@@ -46,3 +50,61 @@
 		}
 	}
 }
+
+// compareQueries holds canned storage responses for TestCompareQuery.
+var compareQueries = map[string]string{
+	"one": `upload: 1
+upload-part: 1
+label: value
+BenchmarkOne 1 5 ns/op
+BenchmarkTwo 1 10 ns/op`,
+	"two": `upload: 1
+upload-part: 2
+BenchmarkOne 1 10 ns/op
+BenchmarkTwo 1 5 ns/op`,
+	"onetwo": `upload: 1
+upload-part: 1
+label: value
+BenchmarkOne 1 5 ns/op
+BenchmarkTwo 1 10 ns/op
+label:
+upload-part: 2
+BenchmarkOne 1 10 ns/op
+BenchmarkTwo 1 5 ns/op`,
+}
+
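+// TestCompareQuery checks that compareQuery splits queries, computes a
+// benchstat table, and collects labels, using a fake storage server that
+// serves the canned responses above.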
+func TestCompareQuery(t *testing.T) {
+	// TODO(quentin): This test seems too heavyweight; it also exercises the storage client -> storage server interaction, which it shouldn't need to.
+	ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		if err := r.ParseForm(); err != nil {
+			t.Errorf("ParseForm = %v", err)
+		}
+		q := r.Form.Get("q")
+		w.Header().Set("Content-Type", "text/plain; charset=utf-8")
+		fmt.Fprint(w, compareQueries[q])
+	}))
+	defer ts.Close()
+
+	a := &App{StorageClient: &storage.Client{BaseURL: ts.URL}}
+
+	for _, q := range []string{"one vs two", "onetwo"} {
+		t.Run(q, func(t *testing.T) {
+			data, err := a.compareQuery(q)
+			if err != nil {
+				t.Fatalf("compareQuery failed: %v", err)
+			}
+			if have := data.Q; have != q {
+				t.Errorf("Q = %q, want %q", have, q)
+			}
+			if len(data.Groups) != 2 {
+				t.Errorf("len(Groups) = %d, want 2", len(data.Groups))
+			}
+			if len(data.Benchstat) == 0 {
+				t.Error("len(Benchstat) = 0, want >0")
+			}
+			if want := map[string]bool{"upload": true, "upload-part": true, "label": true}; !reflect.DeepEqual(data.Labels, want) {
+				t.Errorf("Labels = %#v, want %#v", data.Labels, want)
+			}
+		})
+	}
+}
diff --git a/analysis/appengine/template/compare.html b/analysis/appengine/template/compare.html
new file mode 100644
index 0000000..01a9755
--- /dev/null
+++ b/analysis/appengine/template/compare.html
@@ -0,0 +1,43 @@
+<!DOCTYPE html>
+<html>
+  <head>
+    <title>Performance Result Comparison</title>
+  </head>
+  <body>
+    <div>
+      <form action="/search">
+        <input type="text" name="q" value="{{.Q}}" size="120">
+        <input type="submit" value="Search">
+      </form>
+    </div>
+    <div>
+      {{.Benchstat}}
+    </div>
+    <table>
+      <tr>
+        <th>label</th>
+        {{range $index, $group := .Groups}}
+        <th>
+          #{{$index}}
+        </th>
+        {{end}}
+      </tr>
+      {{range $label, $exists := .Labels}}
+      <tr>
+        <th>{{$label}}</th>
+        {{range $.Groups}}
+        <td>
+          {{with index .LabelValues $label}}
+            [
+            {{range $value, $exists := .}}
+              {{printf "%q" $value}}
+            {{end}}
+            ]
+          {{end}}
+        </td>
+        {{end}}
+      </tr>
+      {{end}}
+    </table>
+  </body>
+</html>
diff --git a/analysis/internal/benchstat/benchstat.go b/analysis/internal/benchstat/benchstat.go
new file mode 100644
index 0000000..1bcb128
--- /dev/null
+++ b/analysis/internal/benchstat/benchstat.go
@@ -0,0 +1,546 @@
+// Copyright 2015 The Go Authors.  All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Benchstat computes and compares statistics about benchmarks.
+//
+// Each input file should contain the concatenated output of a number
+// of runs of ``go test -bench.'' For each different benchmark listed in an input file,
+// benchstat computes the mean, minimum, and maximum run time,
+// after removing outliers using the interquartile range rule.
+//
+// If invoked on a single input file, benchstat prints the per-benchmark statistics
+// for that file.
+//
+// If invoked on a pair of input files, benchstat adds to the output a column
+// showing the statistics from the second file and a column showing the
+// percent change in mean from the first to the second file.
+// Next to the percent change, benchstat shows the p-value and sample
+// sizes from a test of the two distributions of benchmark times.
+// Small p-values indicate that the two distributions are significantly different.
+// If the test indicates that there was no significant change between the two
+// benchmarks (defined as p > 0.05), benchstat displays a single ~ instead of
+// the percent change.
+//
+// The -delta-test option controls which significance test is applied:
+// utest (Mann-Whitney U-test), ttest (two-sample Welch t-test), or none.
+// The default is the U-test, sometimes also referred to as the Wilcoxon rank
+// sum test.
+//
+// If invoked on more than two input files, benchstat prints the per-benchmark
+// statistics for all the files, showing one column of statistics for each file,
+// with no column for percent change or statistical significance.
+//
+// The -html option causes benchstat to print the results as an HTML table.
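+//
+// In this copy, benchstat is used as a library rather than a command: the
+// input is a set of parsed benchfmt results instead of files, and the options
+// above correspond to fields of the Options struct passed to Run. An
+// illustrative call, where oldRes and newRes are hypothetical
+// []*benchfmt.Result slices:
+//
+//	var buf bytes.Buffer
+//	benchstat.Run(&buf, [][]*benchfmt.Result{oldRes, newRes}, &benchstat.Options{HTML: true})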
+package benchstat
+
+import (
+	"flag"
+	"fmt"
+	"html"
+	"io"
+	"strconv"
+	"strings"
+	"unicode/utf8"
+
+	"github.com/aclements/go-moremath/stats"
+	"golang.org/x/perf/storage/benchfmt"
+)
+
+var (
+	flagHTML = flag.Bool("html", false, "print results as an HTML table")
+)
+
+type row struct {
+	cols []string
+}
+
+func newRow(cols ...string) *row {
+	return &row{cols: cols}
+}
+
+func (r *row) add(col string) {
+	r.cols = append(r.cols, col)
+}
+
+func (r *row) trim() {
+	for len(r.cols) > 0 && r.cols[len(r.cols)-1] == "" {
+		r.cols = r.cols[:len(r.cols)-1]
+	}
+}
+
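+// Options configures the statistics and output format produced by Run.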
+type Options struct {
+	// DeltaTest is the significance test applied to the delta between
+	// configurations. This package provides Utest, Ttest, and Notest.
+	// If nil, Utest is used.
+	DeltaTest func(old, new *Benchstat) (pval float64, err error)
+	// Alpha is the significance threshold: a change is reported as
+	// significant only if p < Alpha. If zero, 0.05 is used.
+	Alpha float64
+	// Geomean adds a row with the geometric mean for each configuration.
+	Geomean bool
+	// HTML prints the results as an HTML table rather than plain text.
+	HTML bool
+}
+
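+// Run computes statistics for each group of results in results and writes a
+// table comparing the groups to w, formatted according to options.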
+func Run(w io.Writer, results [][]*benchfmt.Result, options *Options) {
+	if options == nil {
+		options = &Options{}
+	}
+	if options.DeltaTest == nil {
+		options.DeltaTest = Utest
+	}
+	if options.Alpha == 0 {
+		options.Alpha = .05
+	}
+	// Read in benchmark data.
+	c := readResults(results)
+	for _, stat := range c.Stats {
+		stat.ComputeStats()
+	}
+
+	var tables [][]*row
+	switch len(c.Configs) {
+	case 2:
+		before, after := c.Configs[0], c.Configs[1]
+		key := BenchKey{}
+		for _, key.Unit = range c.Units {
+			var table []*row
+			metric := metricOf(key.Unit)
+			for _, key.Benchmark = range c.Benchmarks {
+				key.Config = before
+				old := c.Stats[key]
+				key.Config = after
+				new := c.Stats[key]
+				if old == nil || new == nil {
+					continue
+				}
+				if len(table) == 0 {
+					table = append(table, newRow("name", "old "+metric, "new "+metric, "delta"))
+				}
+
+				pval, testerr := options.DeltaTest(old, new)
+
+				scaler := newScaler(old.Mean, old.Unit)
+				row := newRow(key.Benchmark, old.Format(scaler), new.Format(scaler), "~   ")
+				if testerr == stats.ErrZeroVariance {
+					row.add("(zero variance)")
+				} else if testerr == stats.ErrSampleSize {
+					row.add("(too few samples)")
+				} else if testerr == stats.ErrSamplesEqual {
+					row.add("(all equal)")
+				} else if testerr != nil {
+					row.add(fmt.Sprintf("(%s)", testerr))
+				} else if pval < options.Alpha {
+					row.cols[3] = fmt.Sprintf("%+.2f%%", ((new.Mean/old.Mean)-1.0)*100.0)
+				}
+				if len(row.cols) == 4 && pval != -1 {
+					row.add(fmt.Sprintf("(p=%0.3f n=%d+%d)", pval, len(old.RValues), len(new.RValues)))
+				}
+				table = append(table, row)
+			}
+			if len(table) > 0 {
+				if options.Geomean {
+					table = addGeomean(table, c, key.Unit, true)
+				}
+				tables = append(tables, table)
+			}
+		}
+
+	default:
+		key := BenchKey{}
+		for _, key.Unit = range c.Units {
+			var table []*row
+			metric := metricOf(key.Unit)
+
+			if len(c.Configs) > 1 {
+				hdr := newRow("name \\ " + metric)
+				for _, config := range c.Configs {
+					hdr.add(config)
+				}
+				table = append(table, hdr)
+			} else {
+				table = append(table, newRow("name", metric))
+			}
+
+			for _, key.Benchmark = range c.Benchmarks {
+				row := newRow(key.Benchmark)
+				var scaler func(float64) string
+				for _, key.Config = range c.Configs {
+					stat := c.Stats[key]
+					if stat == nil {
+						row.add("")
+						continue
+					}
+					if scaler == nil {
+						scaler = newScaler(stat.Mean, stat.Unit)
+					}
+					row.add(stat.Format(scaler))
+				}
+				row.trim()
+				if len(row.cols) > 1 {
+					table = append(table, row)
+				}
+			}
+			if options.Geomean {
+				table = addGeomean(table, c, key.Unit, false)
+			}
+			tables = append(tables, table)
+		}
+	}
+
+	numColumn := 0
+	for _, table := range tables {
+		for _, row := range table {
+			if numColumn < len(row.cols) {
+				numColumn = len(row.cols)
+			}
+		}
+	}
+
+	max := make([]int, numColumn)
+	for _, table := range tables {
+		for _, row := range table {
+			for i, s := range row.cols {
+				n := utf8.RuneCountInString(s)
+				if max[i] < n {
+					max[i] = n
+				}
+			}
+		}
+	}
+
+	for i, table := range tables {
+		if i > 0 {
+			fmt.Fprintf(w, "\n")
+		}
+
+		if options.HTML {
+			fmt.Fprintf(w, "<style>.benchstat tbody td:nth-child(1n+2) { text-align: right; padding: 0em 1em; }</style>\n")
+			fmt.Fprintf(w, "<table class='benchstat'>\n")
+			printRow := func(row *row, tag string) {
+				fmt.Fprintf(w, "<tr>")
+				for _, cell := range row.cols {
+					fmt.Fprintf(w, "<%s>%s</%s>", tag, html.EscapeString(cell), tag)
+				}
+				fmt.Fprintf(w, "\n")
+			}
+			printRow(table[0], "th")
+			for _, row := range table[1:] {
+				printRow(row, "td")
+			}
+			fmt.Fprintf(w, "</table>\n")
+			continue
+		}
+
+		// headings
+		row := table[0]
+		for i, s := range row.cols {
+			switch i {
+			case 0:
+				fmt.Fprintf(w, "%-*s", max[i], s)
+			default:
+				fmt.Fprintf(w, "  %-*s", max[i], s)
+			case len(row.cols) - 1:
+				fmt.Fprintf(w, "  %s\n", s)
+			}
+		}
+
+		// data
+		for _, row := range table[1:] {
+			for i, s := range row.cols {
+				switch i {
+				case 0:
+					fmt.Fprintf(w, "%-*s", max[i], s)
+				default:
+					if i == len(row.cols)-1 && len(s) > 0 && s[0] == '(' {
+						// Left-align p value.
+						fmt.Fprintf(w, "  %s", s)
+						break
+					}
+					fmt.Fprintf(w, "  %*s", max[i], s)
+				}
+			}
+			fmt.Fprintf(w, "\n")
+		}
+	}
+}
+
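+// addGeomean appends a "[Geo mean]" row to table holding the geometric mean
+// of the per-benchmark means for each config; when delta is true and both
+// configs have data, it also appends the percent change between them.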
+func addGeomean(table []*row, c *Collection, unit string, delta bool) []*row {
+	row := newRow("[Geo mean]")
+	key := BenchKey{Unit: unit}
+	geomeans := []float64{}
+	for _, key.Config = range c.Configs {
+		var means []float64
+		for _, key.Benchmark = range c.Benchmarks {
+			stat := c.Stats[key]
+			if stat != nil {
+				means = append(means, stat.Mean)
+			}
+		}
+		if len(means) == 0 {
+			row.add("")
+			delta = false
+		} else {
+			geomean := stats.GeoMean(means)
+			geomeans = append(geomeans, geomean)
+			row.add(newScaler(geomean, unit)(geomean) + "     ")
+		}
+	}
+	if delta {
+		row.add(fmt.Sprintf("%+.2f%%", ((geomeans[1]/geomeans[0])-1.0)*100.0))
+	}
+	return append(table, row)
+}
+
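+// timeScaler returns a function that formats a duration given in nanoseconds,
+// using a unit (s, ms, µs, ns) chosen to suit the magnitude of ns.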
+func timeScaler(ns float64) func(float64) string {
+	var format string
+	var scale float64
+	switch x := ns / 1e9; {
+	case x >= 99.5:
+		format, scale = "%.0fs", 1
+	case x >= 9.95:
+		format, scale = "%.1fs", 1
+	case x >= 0.995:
+		format, scale = "%.2fs", 1
+	case x >= 0.0995:
+		format, scale = "%.0fms", 1000
+	case x >= 0.00995:
+		format, scale = "%.1fms", 1000
+	case x >= 0.000995:
+		format, scale = "%.2fms", 1000
+	case x >= 0.0000995:
+		format, scale = "%.0fµs", 1000*1000
+	case x >= 0.00000995:
+		format, scale = "%.1fµs", 1000*1000
+	case x >= 0.000000995:
+		format, scale = "%.2fµs", 1000*1000
+	case x >= 0.0000000995:
+		format, scale = "%.0fns", 1000*1000*1000
+	case x >= 0.00000000995:
+		format, scale = "%.1fns", 1000*1000*1000
+	default:
+		format, scale = "%.2fns", 1000*1000*1000
+	}
+	return func(ns float64) string {
+		return fmt.Sprintf(format, ns/1e9*scale)
+	}
+}
+
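+// newScaler returns a function that formats values of the given benchmark
+// unit, choosing an SI prefix (and time units for ns/op) based on the
+// magnitude of val.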
+func newScaler(val float64, unit string) func(float64) string {
+	if unit == "ns/op" {
+		return timeScaler(val)
+	}
+
+	var format string
+	var scale float64
+	var suffix string
+
+	prescale := 1.0
+	if unit == "MB/s" {
+		prescale = 1e6
+	}
+
+	switch x := val * prescale; {
+	case x >= 99500000000000:
+		format, scale, suffix = "%.0f", 1e12, "T"
+	case x >= 9950000000000:
+		format, scale, suffix = "%.1f", 1e12, "T"
+	case x >= 995000000000:
+		format, scale, suffix = "%.2f", 1e12, "T"
+	case x >= 99500000000:
+		format, scale, suffix = "%.0f", 1e9, "G"
+	case x >= 9950000000:
+		format, scale, suffix = "%.1f", 1e9, "G"
+	case x >= 995000000:
+		format, scale, suffix = "%.2f", 1e9, "G"
+	case x >= 99500000:
+		format, scale, suffix = "%.0f", 1e6, "M"
+	case x >= 9950000:
+		format, scale, suffix = "%.1f", 1e6, "M"
+	case x >= 995000:
+		format, scale, suffix = "%.2f", 1e6, "M"
+	case x >= 99500:
+		format, scale, suffix = "%.0f", 1e3, "k"
+	case x >= 9950:
+		format, scale, suffix = "%.1f", 1e3, "k"
+	case x >= 995:
+		format, scale, suffix = "%.2f", 1e3, "k"
+	case x >= 99.5:
+		format, scale, suffix = "%.0f", 1, ""
+	case x >= 9.95:
+		format, scale, suffix = "%.1f", 1, ""
+	default:
+		format, scale, suffix = "%.2f", 1, ""
+	}
+
+	if unit == "B/op" {
+		suffix += "B"
+	}
+	if unit == "MB/s" {
+		suffix += "B/s"
+	}
+	scale /= prescale
+
+	return func(val float64) string {
+		return fmt.Sprintf(format+suffix, val/scale)
+	}
+}
+
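+// Format returns the mean formatted by scaler, annotated with the maximum
+// deviation of the non-outlier samples from the mean as a percentage.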
+func (b *Benchstat) Format(scaler func(float64) string) string {
+	diff := 1 - b.Min/b.Mean
+	if d := b.Max/b.Mean - 1; d > diff {
+		diff = d
+	}
+	s := scaler(b.Mean)
+	if b.Mean == 0 {
+		s += "     "
+	} else {
+		s = fmt.Sprintf("%s ±%3s", s, fmt.Sprintf("%.0f%%", diff*100.0))
+	}
+	return s
+}
+
+// ComputeStats updates the derived statistics in stat from the raw
+// samples in stat.Values, discarding outliers using the interquartile
+// range rule.
+func (stat *Benchstat) ComputeStats() {
+	// Discard outliers.
+	values := stats.Sample{Xs: stat.Values}
+	q1, q3 := values.Percentile(0.25), values.Percentile(0.75)
+	lo, hi := q1-1.5*(q3-q1), q3+1.5*(q3-q1)
+	for _, value := range stat.Values {
+		if lo <= value && value <= hi {
+			stat.RValues = append(stat.RValues, value)
+		}
+	}
+
+	// Compute statistics of remaining data.
+	stat.Min, stat.Max = stats.Bounds(stat.RValues)
+	stat.Mean = stats.Mean(stat.RValues)
+}
+
+// A Benchstat is the metrics along one axis (e.g., ns/op or MB/s)
+// for all runs of a specific benchmark.
+type Benchstat struct {
+	Unit    string
+	Values  []float64 // metrics
+	RValues []float64 // metrics with outliers removed
+	Min     float64   // min of RValues
+	Mean    float64   // mean of RValues
+	Max     float64   // max of RValues
+}
+
+// A BenchKey identifies one metric (e.g., "ns/op", "B/op") from one
+// benchmark (function name sans "Benchmark" prefix) in one
+// configuration (here, one group of results).
+type BenchKey struct {
+	Config, Benchmark, Unit string
+}
+
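+// A Collection holds the per-benchmark statistics for a set of
+// configurations, keyed by config, benchmark, and unit.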
+type Collection struct {
+	Stats map[BenchKey]*Benchstat
+
+	// Configs, Benchmarks, and Units give the set of configs,
+	// benchmarks, and units from the keys in Stats in an order
+	// meant to match the order the benchmarks were read in.
+	Configs, Benchmarks, Units []string
+}
+
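+// AddStat returns the Benchstat for key, creating it and recording the key's
+// config, benchmark, and unit if they have not been seen before.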
+func (c *Collection) AddStat(key BenchKey) *Benchstat {
+	if stat, ok := c.Stats[key]; ok {
+		return stat
+	}
+
+	addString := func(strings *[]string, add string) {
+		for _, s := range *strings {
+			if s == add {
+				return
+			}
+		}
+		*strings = append(*strings, add)
+	}
+	addString(&c.Configs, key.Config)
+	addString(&c.Benchmarks, key.Benchmark)
+	addString(&c.Units, key.Unit)
+	stat := &Benchstat{Unit: key.Unit}
+	c.Stats[key] = stat
+	return stat
+}
+
+// readResults reads all groups of results into a new Collection.
+func readResults(results [][]*benchfmt.Result) *Collection {
+	c := Collection{Stats: make(map[BenchKey]*Benchstat)}
+	for _, group := range results {
+		readGroup(group, &c)
+	}
+	return &c
+}
+
+// readGroup parses a single group of benchmark results into c.
+func readGroup(group []*benchfmt.Result, c *Collection) {
+	// TODO(quentin): Pick a better name for each group
+	name := fmt.Sprintf("#%d", len(c.Configs))
+	c.Configs = append(c.Configs, name)
+	key := BenchKey{Config: name}
+
+	for _, result := range group {
+		line := result.Content
+		f := strings.Fields(line)
+		if len(f) < 4 {
+			continue
+		}
+		name := f[0]
+		if !strings.HasPrefix(name, "Benchmark") {
+			continue
+		}
+		name = strings.TrimPrefix(name, "Benchmark")
+		n, _ := strconv.Atoi(f[1])
+		if n == 0 {
+			continue
+		}
+
+		key.Benchmark = name
+		for i := 2; i+2 <= len(f); i += 2 {
+			val, err := strconv.ParseFloat(f[i], 64)
+			if err != nil {
+				continue
+			}
+			key.Unit = f[i+1]
+			stat := c.AddStat(key)
+			stat.Values = append(stat.Values, val)
+		}
+	}
+}
+
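+// metricOf maps a benchmark unit (e.g. "ns/op") to the column heading used
+// for it (e.g. "time/op").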
+func metricOf(unit string) string {
+	switch unit {
+	case "ns/op":
+		return "time/op"
+	case "B/op":
+		return "alloc/op"
+	case "MB/s":
+		return "speed"
+	default:
+		return unit
+	}
+}
+
+// Significance tests.
+
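+// Notest applies no significance test; it reports pval = -1, which suppresses
+// the p-value column in the output.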
+func Notest(old, new *Benchstat) (pval float64, err error) {
+	return -1, nil
+}
+
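+// Ttest computes the p-value of a two-sample Welch t-test between the
+// outlier-trimmed samples of old and new.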
+func Ttest(old, new *Benchstat) (pval float64, err error) {
+	t, err := stats.TwoSampleWelchTTest(stats.Sample{Xs: old.RValues}, stats.Sample{Xs: new.RValues}, stats.LocationDiffers)
+	if err != nil {
+		return -1, err
+	}
+	return t.P, nil
+}
+
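+// Utest computes the p-value of a Mann-Whitney U-test between the
+// outlier-trimmed samples of old and new.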
+func Utest(old, new *Benchstat) (pval float64, err error) {
+	u, err := stats.MannWhitneyUTest(old.RValues, new.RValues, stats.LocationDiffers)
+	if err != nil {
+		return -1, err
+	}
+	return u.P, nil
+}