proposal/6282: Add benchmark code

Change-Id: I0327aaed1160d329d4faaafc2af210e8feb31455
Reviewed-on: https://go-review.googlesource.com/33362
Reviewed-by: Robert Griesemer <gri@golang.org>
diff --git a/design/6282-table-data.md b/design/6282-table-data.md
index 6c425ea..e4082dc 100644
--- a/design/6282-table-data.md
+++ b/design/6282-table-data.md
@@ -311,7 +311,7 @@
 This is a simpler version of the "General Matrix Multiply" at the core of many
 numerical routines.
 
-First consider a single-slice implementation (BenchmarkMTNaiveSlices), which
+First consider a single-slice implementation (BenchmarkNaiveSlices), which
 will be similar to the optimal performance.
 
 	// Compute C += A*B^T, where C is an m×n matrix, A is an m×k matrix, and B
@@ -328,7 +328,7 @@
 		}
 	}
 
-We can add an "AddSet" method (BenchmarkMTAddSet), and translate the above code
+We can add an "AddSet" method (BenchmarkAddSet), and translate the above code
 into the struct representation.
 
 	// Compute C += A*B^T, where C is an m×n matrix, A is an m×k matrix, and B
@@ -350,7 +350,7 @@
 The reason for this significant penalty is that the Go compiler does not
 currently inline methods that can panic, and the accessors contain panic calls
 as part of the manual index bounds checks.
-The next benchmark simulates a compiler with this restriction removed (BenchmarkMTAddSetNP)
+The next benchmark simulates a compiler with this restriction removed (BenchmarkAddSetNP)
 by replacing the `panic` calls in the accessor methods with setting the first
 data element to NaN (this is not good code, but it means the current Go compiler
 can inline the method calls and the bounds checks still affect program execution
@@ -360,12 +360,12 @@
 The final cause of the performance gap is bounds checking.
 The benchmark is modified so the bounds checks are removed, simulating a compiler
 with better proving capability than the current compiler.
-Further, the benchmark is run with `-gcflags=-B` (BenchmarkMTAddSetNB).
+Further, the benchmark is run with `-gcflags=-B` (BenchmarkAddSetNB).
 This closes the performance gap entirely (and also improves the single slice
 implementation by 15%).
 
 However, the initial single slice implementation can be significantly improved
-as follows (BenchmarkMTSliceOpt).
+as follows (BenchmarkSliceOpt).
 
 	for i := 0; i < m; i++ {
 		as := a[i*lda : i*lda+k]
@@ -383,7 +383,7 @@
 This reduces the cost by another 40% on top of the bounds check removal.
 
 Similar performance using a struct representation can be achieved with a
-"RowView" method (BenchmarkMTDenseOpt)
+"RowView" method (BenchmarkDenseOpt)
 
 	func (d *Dense) RowView(i int) []float64 {
 		if uint(i) >= uint(d.rows) {
diff --git a/design/6282/slicebench_test.go b/design/6282/slicebench_test.go
new file mode 100644
index 0000000..0b0e237
--- /dev/null
+++ b/design/6282/slicebench_test.go
@@ -0,0 +1,252 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package slicebench implements tests to support the analysis in proposal 6282.
+// It compares the performance of multi-dimensional slices implemented using a
+// single slice and using a struct type.
+package slicebench
+
+import (
+	"math"
+	"math/rand"
+	"testing"
+)
+
+var (
+	m = 300
+	n = 400
+	k = 200
+
+	lda = k
+	ldb = k
+	ldc = n
+)
+
+var a, b, c []float64
+
+var A, B, C Dense
+
+var aStore, bStore, cStore []float64 // data storage for the variables above
+
+func init() {
+	// Initialize the matrices to random data.
+	aStore = make([]float64, m*lda)
+	for i := range aStore {
+		aStore[i] = rand.Float64()
+	}
+	bStore = make([]float64, n*ldb)
+	for i := range bStore {
+		bStore[i] = rand.Float64()
+	}
+	cStore = make([]float64, m*ldc)
+	for i := range cStore {
+		cStore[i] = rand.Float64()
+	}
+
+	a = make([]float64, len(aStore))
+	copy(a, aStore)
+	b = make([]float64, len(bStore))
+	copy(b, bStore)
+	c = make([]float64, len(cStore))
+	copy(c, cStore)
+
+	// The struct types use the single slices as underlying data.
+	A = Dense{lda, m, k, a}
+	B = Dense{ldb, n, k, b}
+	C = Dense{ldc, m, n, c}
+}
+
+// resetSlices resets the data to their original (randomly generated) values.
+// The Dense values share the same undelying data as the single slices, so this
+// works for both single slice and struct representations.
+func resetSlices(be *testing.B) {
+	copy(a, aStore)
+	copy(b, bStore)
+	copy(c, cStore)
+	be.ResetTimer()
+}
+
+// BenchmarkNaiveSlices measures a naive implementation of C += A * B^T using
+// the single slice representation.
+func BenchmarkNaiveSlices(be *testing.B) {
+	resetSlices(be)
+	for t := 0; t < be.N; t++ {
+		for i := 0; i < m; i++ {
+			for j := 0; j < n; j++ {
+				var t float64
+				for l := 0; l < k; l++ {
+					t += a[i*lda+l] * b[j*lda+l]
+				}
+				c[i*ldc+j] += t
+			}
+		}
+	}
+}
+
+// Dense represents a two-dimensional slice with the specified sizes.
+type Dense struct {
+	stride int
+	rows   int
+	cols   int
+	data   []float64
+}
+
+// At returns the element at row i and column j.
+func (d *Dense) At(i, j int) float64 {
+	if uint(i) >= uint(d.rows) {
+		panic("rows out of bounds")
+	}
+	if uint(j) >= uint(d.cols) {
+		panic("cols out of bounds")
+	}
+	return d.data[i*d.stride+j]
+}
+
+// AddSet adds v to the current value at row i and column j.
+func (d *Dense) AddSet(i, j int, v float64) {
+	if uint(i) >= uint(d.rows) {
+		panic("rows out of bounds")
+	}
+	if uint(j) >= uint(d.cols) {
+		panic("cols out of bounds")
+	}
+	d.data[i*d.stride+j] += v
+}
+
+// BenchmarkAddSet measures a naive implementation of C += A * B^T using
+// the Dense representation.
+func BenchmarkAddSet(be *testing.B) {
+	resetSlices(be)
+	for t := 0; t < be.N; t++ {
+		for i := 0; i < m; i++ {
+			for j := 0; j < n; j++ {
+				var t float64
+				for l := 0; l < k; l++ {
+					t += A.At(i, l) * B.At(j, l)
+				}
+				C.AddSet(i, j, t)
+			}
+		}
+	}
+}
+
+// AtNP gets the value at row i and column j without panicking if a bounds check
+// fails.
+func (d *Dense) AtNP(i, j int) float64 {
+	if uint(i) >= uint(d.rows) {
+		// Corrupt a value in data so the bounds check still has an effect if it
+		// fails. This way, the method can be in-lined but the bounds checks are
+		// not trivially removable.
+		d.data[0] = math.NaN()
+	}
+	if uint(j) >= uint(d.cols) {
+		d.data[0] = math.NaN()
+	}
+	return d.data[i*d.stride+j]
+}
+
+// AddSetNP adds v to the current value at row i and column j without panicking if
+// a bounds check fails.
+func (d *Dense) AddSetNP(i, j int, v float64) {
+	if uint(i) >= uint(d.rows) {
+		// See comment in AtNP.
+		d.data[0] = math.NaN()
+	}
+	if uint(j) >= uint(d.cols) {
+		d.data[0] = math.NaN()
+	}
+	d.data[i*d.stride+j] += v
+}
+
+// BenchmarkAddSetNP measures C += A * B^T using the Dense representation with
+// calls to methods that do not panic. This simulates a compiler that can inline
+// the normal At and Set methods.
+func BenchmarkAddSetNP(be *testing.B) {
+	resetSlices(be)
+	for t := 0; t < be.N; t++ {
+		for i := 0; i < m; i++ {
+			for j := 0; j < n; j++ {
+				var t float64
+				for l := 0; l < k; l++ {
+					t += A.AtNP(i, l) * B.AtNP(j, l)
+				}
+				C.AddSetNP(i, j, t)
+			}
+		}
+	}
+}
+
+// AtNB gets the value at row i and column j without performing any bounds checking.
+func (d *Dense) AtNB(i, j int) float64 {
+	return d.data[i*d.stride+j]
+}
+
+// AddSetNB adds v to the current value at row i and column j without performing
+// any bounds checking.
+func (d *Dense) AddSetNB(i, j int, v float64) {
+	d.data[i*d.stride+j] += v
+}
+
+// BenchmarkAddSetNB measures C += A * B^T using the Dense representation with
+// calls to methods that do not check bounds. This simulates a compiler that can
+// prove the bounds checks redundant and eliminate them.
+func BenchmarkAddSetNB(be *testing.B) {
+	resetSlices(be)
+	for t := 0; t < be.N; t++ {
+		for i := 0; i < m; i++ {
+			for j := 0; j < n; j++ {
+				var t float64
+				for l := 0; l < k; l++ {
+					t += A.AtNB(i, l) * B.AtNB(j, l)
+				}
+				C.AddSetNB(i, j, t)
+			}
+		}
+	}
+}
+
+// BenchmarkSliceOpt measures an optimized implementation of C += A * B^T using
+// the single slice representation.
+func BenchmarkSliceOpt(be *testing.B) {
+	resetSlices(be)
+	for t := 0; t < be.N; t++ {
+		for i := 0; i < m; i++ {
+			as := a[i*lda : i*lda+k]
+			cs := c[i*ldc : i*ldc+n]
+			for j := 0; j < n; j++ {
+				bs := b[j*lda : j*lda+k]
+				var t float64
+				for l, v := range as {
+					t += v * bs[l]
+				}
+				cs[j] += t
+			}
+		}
+	}
+}
+
+// RowViewNB gets the specified row of the Dense without checking bounds.
+func (d *Dense) RowViewNB(i int) []float64 {
+	return d.data[i*d.stride : i*d.stride+d.cols]
+}
+
+// BenchmarkDenseOpt measures an optimized implementation of C += A * B^T using
+// the Dense representation.
+func BenchmarkDenseOpt(be *testing.B) {
+	resetSlices(be)
+	for t := 0; t < be.N; t++ {
+		for i := 0; i < m; i++ {
+			as := A.RowViewNB(i)
+			cs := C.RowViewNB(i)
+			for j := 0; j < n; j++ {
+				bs := b[j*lda:]
+				var t float64
+				for l, v := range as {
+					t += v * bs[l]
+				}
+				cs[j] += t
+			}
+		}
+	}
+}