benchmarks/gc_latency: add new microbenchmark for tricky mutator latency issues

Gc_latency is a modified version of a program that tickled multiple
latency glitches in the Go GC/runtime.  This version reports the time
of the worst observed glitches so that they can be easily located in
a trace file and debugged.  This program can also be run as a benchmark
to allow easier automated performance monitoring; by default the benchmark doesn't report worst case times because those are too noisy.

Updates golang/go#27732.

Change-Id: I19b9060f24cda1547b8d75f762316dd5271e32c6
Reviewed-on: https://go-review.googlesource.com/c/benchmarks/+/372256
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Austin Clements <austin@google.com>
Run-TryBot: David Chase <drchase@google.com>
diff --git a/gc_latency/latency.go b/gc_latency/latency.go
new file mode 100644
index 0000000..de1afca
--- /dev/null
+++ b/gc_latency/latency.go
@@ -0,0 +1,231 @@
+// Copyright 2023 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+import (
+	"flag"
+	"fmt"
+	"os"
+	"runtime"
+	"runtime/trace"
+	"sort"
+	"testing"
+	"time"
+	"unsafe"
+)
+
+const (
+	bufferLen   = 200_000
+	warmupCount = 1_000_000
+	runCount    = 5_000_000
+)
+
+type kbyte []byte
+type circularBuffer [bufferLen]kbyte
+
+type LB struct {
+	// Performance measurement stuff
+	delays []time.Duration // delays observed (for distribution)
+	worst  time.Duration   // worst delay observed
+
+	// For making sense of the bad outcome.
+	total        time.Duration // total time spent in allocations
+	allStart     time.Time     // time (very nearly) at which the trace begins
+	worstIndex   int           // index of worst case allocation delay
+	worstElapsed time.Duration // duration of worst case allocation delay
+
+	sink *circularBuffer // assign a pointer here to ensure heap allocation
+
+	// How to allocate
+
+	// "Fluff" refers to allocating a small fraction of extra quickly-dead objects
+	// to break up long runs on not-free objects that were once a cause of allocation latency.
+	doFluff bool
+	// "Fluff" allocations are all assigned to fluff, so that they are on-heap, but only the last one is live.
+	fluff kbyte
+
+	// The circular buffer can be on the heap, in a global, or on stack.
+	// This choice affects allocation latency.
+	howAllocated string
+}
+
+// globalBuffer is the globally-allocated circular buffer,
+// for measuring the cost of scanning large global objects.
+var globalBuffer circularBuffer
+
+// These three methods pass a differently-allocated circularBuffer
+// to the benchmarked "work" to see how that affects allocation tail latency.
+
+//go:noinline
+func (lb *LB) global(count int) {
+	lb.work(&globalBuffer, count)
+	for i := range globalBuffer {
+		globalBuffer[i] = nil
+	}
+}
+
+//go:noinline
+func (lb *LB) heap(count int) {
+	c := &circularBuffer{}
+	lb.sink = c // force to heap
+	lb.work(c, count)
+	lb.sink = nil
+}
+
+//go:noinline
+func (lb *LB) stack(count int) {
+	var c circularBuffer
+	lb.work(&c, count)
+}
+
+// newSlice allocates a 1k slice of bytes and initializes them all to byte(n)
+func (lb *LB) newSlice(n int) kbyte {
+	m := make(kbyte, 1024)
+	if lb.doFluff && n&63 == 0 {
+		lb.fluff = make(kbyte, 1024)
+	}
+	for i := range m {
+		m[i] = byte(n)
+	}
+	return m
+}
+
+// storeSlice stores a newly allocated 1k slice of bytes at c[count%len(c)]
+// It also checks the time needed to do this and records the worst case.
+func (lb *LB) storeSlice(c *circularBuffer, count int) {
+	start := time.Now()
+	c[count%len(c)] = lb.newSlice(count)
+	elapsed := time.Since(start)
+
+	candElapsed := time.Since(lb.allStart) // Record location of worst in trace
+	if elapsed > lb.worst {
+		lb.worst = elapsed
+		lb.worstIndex = count
+		lb.worstElapsed = candElapsed
+	}
+	lb.total = time.Duration(lb.total.Nanoseconds() + elapsed.Nanoseconds())
+	lb.delays = append(lb.delays, elapsed)
+}
+
+//go:noinline
+func (lb *LB) work(c *circularBuffer, count int) {
+	for i := 0; i < count; i++ {
+		lb.storeSlice(c, i)
+	}
+}
+
+func (lb *LB) doAllocations(count int) {
+	switch lb.howAllocated {
+	case "stack":
+		lb.stack(count)
+	case "heap":
+		lb.heap(count)
+	case "global":
+		lb.global(count)
+	}
+}
+
+var traceFile string
+
+func flags() (string, bool) {
+	var howAllocated = "stack"
+	var doFluff bool
+	flag.StringVar(&traceFile, "trace", traceFile, "name of trace file to create")
+	flag.StringVar(&howAllocated, "how", howAllocated, "how the buffer is allocated = {stack,heap,global}")
+	flag.BoolVar(&doFluff, "fluff", doFluff, "insert 'fluff' into allocation runs to break up sweeps")
+
+	flag.Parse()
+
+	switch howAllocated {
+	case "stack", "heap", "global":
+		break
+	default:
+		fmt.Fprintf(os.Stderr, "-how needs to be one of 'heap', 'stack' or 'global, saw '%s' instead\n", howAllocated)
+		os.Exit(1)
+	}
+	return howAllocated, doFluff
+}
+
+var reportWorstFlag bool
+
+func (lb0 *LB) bench(b *testing.B) {
+	if b != nil {
+		b.StopTimer()
+	}
+
+	var c *circularBuffer = &circularBuffer{}
+	lb0.sink = c // force heap allocation
+	lb0.delays = make([]time.Duration, 0, runCount)
+	// Warm up heap, virtual memory, address space, etc.
+	lb0.work(c, warmupCount)
+	c, lb0.sink = nil, nil
+	runtime.GC() // Start fresh, GC with all the timers turned off.
+
+	lb := &LB{doFluff: lb0.doFluff, howAllocated: lb0.howAllocated, delays: lb0.delays[:0]}
+	count := runCount
+
+	// Confine tracing and timing defers to a small block.
+	run := func() {
+		if traceFile != "" {
+			f, err := os.Create(traceFile)
+			if err != nil {
+				if b != nil {
+					b.Fatalf("Could not create trace file '%s'\n", traceFile)
+				} else {
+					fmt.Fprintf(os.Stderr, "Could not create trace file '%s'\n", traceFile)
+					os.Exit(1)
+				}
+			}
+			defer f.Close()
+			trace.Start(f)
+			defer trace.Stop()
+		}
+		lb.allStart = time.Now() // this is for trace file navigation, not benchmark timing.
+
+		if b != nil {
+			count = b.N * count
+			if b.N > 1 {
+				lb.delays = make([]time.Duration, 0, count)
+			}
+			b.StartTimer()
+			defer b.StopTimer()
+		}
+		lb.doAllocations(count)
+	}
+	run()
+
+	sort.Slice(lb.delays, func(i, j int) bool { return lb.delays[i] < lb.delays[j] })
+	delays := lb.delays
+	delayLen := float64(len(delays))
+	average, median := time.Duration(lb.total.Nanoseconds()/int64(count)), delays[len(delays)/2]
+	p29, p39, p49, p59, p69 := lb.delays[int(0.99*delayLen)], delays[int(0.999*delayLen)], delays[int(0.9999*delayLen)], delays[int(0.99999*delayLen)], delays[int(0.999999*delayLen)]
+	if b != nil {
+		b.ReportMetric(float64(average.Nanoseconds()), "ns/op")
+		b.ReportMetric(float64(median), "p50-ns")
+		b.ReportMetric(float64(p29), "p99-ns")
+		b.ReportMetric(float64(p39), "p99.9-ns")
+		b.ReportMetric(float64(p49), "p99.99-ns")
+		b.ReportMetric(float64(p59), "p99.999-ns")
+		b.ReportMetric(float64(p69), "p99.9999-ns")
+		if reportWorstFlag {
+			b.ReportMetric(float64(lb.worst), "worst")
+		}
+		// Don't report worst case, it is ultra-noisy.
+	} else {
+		fmt.Printf("GC latency benchmark, how=%s, fluff=%v\n", lb.howAllocated, lb.doFluff)
+		fmt.Println("Worst allocation latency:", lb.worst)
+		fmt.Println("Worst allocation index:", lb.worstIndex)
+		fmt.Println("Worst allocation occurs at run elapsed time:", lb.worstElapsed)
+		fmt.Println("Average allocation latency:", average)
+		fmt.Println("Median allocation latency:", median)
+		fmt.Println("99% allocation latency:", p29)
+		fmt.Println("99.9% allocation latency:", p39)
+		fmt.Println("99.99% allocation latency:", p49)
+		fmt.Println("99.999% allocation latency:", p59)
+		fmt.Println("99.9999% allocation latency:", p69)
+		fmt.Println("Sizeof(circularBuffer) =", unsafe.Sizeof(*c))
+		fmt.Println("Approximate live memory =", unsafe.Sizeof(*c)+bufferLen*1024)
+	}
+}
diff --git a/gc_latency/latency_test.go b/gc_latency/latency_test.go
new file mode 100644
index 0000000..a86d84f
--- /dev/null
+++ b/gc_latency/latency_test.go
@@ -0,0 +1,46 @@
+// Copyright 2023 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+import (
+	"flag"
+	"fmt"
+	"os"
+	"testing"
+)
+
+// Run as a test, reports allocation time statistics for stack, heap, and globally
+// allocated buffers, out to the 99.9999th percentile.  Optionally reports worst
+// allocation time if -worst is specified, but this is normally too noisy for any
+// sort of trend tracking or alerting.  The default test usually runs long enough that
+// it requires only one iteration.
+
+func TestMain(m *testing.M) {
+	flag.BoolVar(&reportWorstFlag, "worst", false, "report otherwise too-noisy 'worst' metric in benchmark")
+	flag.Parse()
+	os.Exit(m.Run())
+}
+
+type testCase struct {
+	howAlloc  string
+	withFluff bool
+}
+
+func BenchmarkGCLatency(b *testing.B) {
+	tcs := []testCase{
+		{"stack", false},
+		{"stack", true},
+		{"heap", false},
+		{"heap", true},
+		{"global", false},
+		{"global", true},
+	}
+
+	for _, tc := range tcs {
+		lb := &LB{doFluff: tc.withFluff, howAllocated: tc.howAlloc}
+		b.Run(fmt.Sprintf("how=%s/fluff=%v", tc.howAlloc, tc.withFluff),
+			func(b *testing.B) { lb.bench(b) })
+	}
+}
diff --git a/gc_latency/main.go b/gc_latency/main.go
new file mode 100644
index 0000000..f709934
--- /dev/null
+++ b/gc_latency/main.go
@@ -0,0 +1,29 @@
+// Copyright 2023 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+// Gc_latency is a modified version of a program that tickled multiple
+// latency glitches in the Go GC/runtime.  This version reports the time
+// of the worst observed glitches so that they can be easily located in
+// a trace file and debugged.  This program can also be run as a benchmark
+// to allow easier automated performance monitoring; the benchmark doesn't
+// report worst case times because those are too noisy.
+//
+// Usage:
+//
+//	 gc_latency [flags]
+//
+//	 Flags (as main):
+//		 -fluff
+//		insert 'fluff' into allocation runs to break up sweeps
+//		 -how string
+//		how the buffer is allocated = {stack,heap,global} (default "stack")
+//		 -trace string
+//		name of trace file to create
+func main() {
+	howAllocated, doFluffFlag := flags()
+	lb := &LB{howAllocated: howAllocated, doFluff: doFluffFlag}
+	lb.bench(nil)
+}