benchmarks/gc_latency: add new microbenchmark for tricky mutator latency issues
Gc_latency is a modified version of a program that tickled multiple
latency glitches in the Go GC/runtime. This version reports the time
of the worst observed glitches so that they can be easily located in
a trace file and debugged. This program can also be run as a benchmark
to allow easier automated performance monitoring; by default the
benchmark doesn't report worst-case times because those are too noisy.
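
Example invocations (assuming the directory layout in this change; the
-worst flag is registered by latency_test.go and applies only under
"go test"):

    go run . -how=heap -fluff -trace=trace.out
    go test -bench=BenchmarkGCLatency -worst
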
Updates golang/go#27732.
Change-Id: I19b9060f24cda1547b8d75f762316dd5271e32c6
Reviewed-on: https://go-review.googlesource.com/c/benchmarks/+/372256
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Austin Clements <austin@google.com>
Run-TryBot: David Chase <drchase@google.com>
diff --git a/gc_latency/latency.go b/gc_latency/latency.go
new file mode 100644
index 0000000..de1afca
--- /dev/null
+++ b/gc_latency/latency.go
@@ -0,0 +1,231 @@
+// Copyright 2023 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+import (
+ "flag"
+ "fmt"
+ "os"
+ "runtime"
+ "runtime/trace"
+ "sort"
+ "testing"
+ "time"
+ "unsafe"
+)
+
+const (
+ bufferLen = 200_000
+ warmupCount = 1_000_000
+ runCount = 5_000_000
+)
+
+type kbyte []byte
+type circularBuffer [bufferLen]kbyte
+
+type LB struct {
+ // Performance measurement stuff
+ delays []time.Duration // delays observed (for distribution)
+ worst time.Duration // worst delay observed
+
+ // For making sense of the bad outcome.
+ total time.Duration // total time spent in allocations
+ allStart time.Time // time (very nearly) at which the trace begins
+ worstIndex int // index of worst case allocation delay
+ worstElapsed time.Duration // duration of worst case allocation delay
+
+ sink *circularBuffer // assign a pointer here to ensure heap allocation
+
+ // How to allocate
+
+ // "Fluff" refers to allocating a small fraction of extra quickly-dead objects
+ // to break up long runs of not-free objects that were once a cause of allocation latency.
+ doFluff bool
+ // "Fluff" allocations are all assigned to fluff, so that they are on-heap, but only the last one is live.
+ fluff kbyte
+
+ // The circular buffer can be on the heap, in a global, or on stack.
+ // This choice affects allocation latency.
+ howAllocated string
+}
+
+// globalBuffer is the globally-allocated circular buffer,
+// for measuring the cost of scanning large global objects.
+var globalBuffer circularBuffer
+
+// These three methods pass a differently-allocated circularBuffer
+// to the benchmarked "work" to see how that affects allocation tail latency.
+
+//go:noinline
+func (lb *LB) global(count int) {
+ lb.work(&globalBuffer, count)
+ for i := range globalBuffer {
+ globalBuffer[i] = nil
+ }
+}
+
+//go:noinline
+func (lb *LB) heap(count int) {
+ c := &circularBuffer{}
+ lb.sink = c // force to heap
+ lb.work(c, count)
+ lb.sink = nil
+}
+
+//go:noinline
+func (lb *LB) stack(count int) {
+ var c circularBuffer
+ lb.work(&c, count)
+}
+
+// newSlice allocates a 1k slice of bytes and initializes them all to byte(n).
+func (lb *LB) newSlice(n int) kbyte {
+ m := make(kbyte, 1024)
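+ // With fluff enabled, every 64th allocation also makes an extra, quickly-dead "fluff" allocation.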
+ if lb.doFluff && n&63 == 0 {
+ lb.fluff = make(kbyte, 1024)
+ }
+ for i := range m {
+ m[i] = byte(n)
+ }
+ return m
+}
+
+// storeSlice stores a newly allocated 1k slice of bytes at c[count%len(c)].
+// It also measures the time needed to do this and records the worst case.
+func (lb *LB) storeSlice(c *circularBuffer, count int) {
+ start := time.Now()
+ c[count%len(c)] = lb.newSlice(count)
+ elapsed := time.Since(start)
+
+ candElapsed := time.Since(lb.allStart) // Record location of worst in trace
+ if elapsed > lb.worst {
+ lb.worst = elapsed
+ lb.worstIndex = count
+ lb.worstElapsed = candElapsed
+ }
+ lb.total += elapsed
+ lb.delays = append(lb.delays, elapsed)
+}
+
+//go:noinline
+func (lb *LB) work(c *circularBuffer, count int) {
+ for i := 0; i < count; i++ {
+ lb.storeSlice(c, i)
+ }
+}
+
+func (lb *LB) doAllocations(count int) {
+ switch lb.howAllocated {
+ case "stack":
+ lb.stack(count)
+ case "heap":
+ lb.heap(count)
+ case "global":
+ lb.global(count)
+ }
+}
+
+var traceFile string
+
+func flags() (string, bool) {
+ var howAllocated = "stack"
+ var doFluff bool
+ flag.StringVar(&traceFile, "trace", traceFile, "name of trace file to create")
+ flag.StringVar(&howAllocated, "how", howAllocated, "how the buffer is allocated = {stack,heap,global}")
+ flag.BoolVar(&doFluff, "fluff", doFluff, "insert 'fluff' into allocation runs to break up sweeps")
+
+ flag.Parse()
+
+ switch howAllocated {
+ case "stack", "heap", "global":
+ break
+ default:
+ fmt.Fprintf(os.Stderr, "-how needs to be one of 'heap', 'stack' or 'global, saw '%s' instead\n", howAllocated)
+ os.Exit(1)
+ }
+ return howAllocated, doFluff
+}
+
+var reportWorstFlag bool
+
+func (lb0 *LB) bench(b *testing.B) {
+ if b != nil {
+ b.StopTimer()
+ }
+
+ c := &circularBuffer{}
+ lb0.sink = c // force heap allocation
+ lb0.delays = make([]time.Duration, 0, runCount)
+ // Warm up heap, virtual memory, address space, etc.
+ lb0.work(c, warmupCount)
+ c, lb0.sink = nil, nil
+ runtime.GC() // Start fresh, GC with all the timers turned off.
+
+ lb := &LB{doFluff: lb0.doFluff, howAllocated: lb0.howAllocated, delays: lb0.delays[:0]}
+ count := runCount
+
+ // Confine tracing and timing defers to a small block.
+ run := func() {
+ if traceFile != "" {
+ f, err := os.Create(traceFile)
+ if err != nil {
+ if b != nil {
+ b.Fatalf("Could not create trace file '%s'\n", traceFile)
+ } else {
+ fmt.Fprintf(os.Stderr, "Could not create trace file '%s'\n", traceFile)
+ os.Exit(1)
+ }
+ }
+ defer f.Close()
+ trace.Start(f)
+ defer trace.Stop()
+ }
+ lb.allStart = time.Now() // this is for trace file navigation, not benchmark timing.
+
+ if b != nil {
+ count = b.N * count
+ if b.N > 1 {
+ lb.delays = make([]time.Duration, 0, count)
+ }
+ b.StartTimer()
+ defer b.StopTimer()
+ }
+ lb.doAllocations(count)
+ }
+ run()
+
+ sort.Slice(lb.delays, func(i, j int) bool { return lb.delays[i] < lb.delays[j] })
+ delays := lb.delays
+ delayLen := float64(len(delays))
+ average, median := time.Duration(lb.total.Nanoseconds()/int64(count)), delays[len(delays)/2]
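+ // pN9 reads "N nines": p29 is the 99th percentile, p39 the 99.9th, and so on up to p69, the 99.9999th.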
+ p29, p39, p49, p59, p69 := delays[int(0.99*delayLen)], delays[int(0.999*delayLen)], delays[int(0.9999*delayLen)], delays[int(0.99999*delayLen)], delays[int(0.999999*delayLen)]
+ if b != nil {
+ b.ReportMetric(float64(average.Nanoseconds()), "ns/op")
+ b.ReportMetric(float64(median), "p50-ns")
+ b.ReportMetric(float64(p29), "p99-ns")
+ b.ReportMetric(float64(p39), "p99.9-ns")
+ b.ReportMetric(float64(p49), "p99.99-ns")
+ b.ReportMetric(float64(p59), "p99.999-ns")
+ b.ReportMetric(float64(p69), "p99.9999-ns")
+ if reportWorstFlag {
+ b.ReportMetric(float64(lb.worst), "worst-ns")
+ }
+ // Worst case is not reported by default; it is ultra-noisy.
+ } else {
+ fmt.Printf("GC latency benchmark, how=%s, fluff=%v\n", lb.howAllocated, lb.doFluff)
+ fmt.Println("Worst allocation latency:", lb.worst)
+ fmt.Println("Worst allocation index:", lb.worstIndex)
+ fmt.Println("Worst allocation occurs at run elapsed time:", lb.worstElapsed)
+ fmt.Println("Average allocation latency:", average)
+ fmt.Println("Median allocation latency:", median)
+ fmt.Println("99% allocation latency:", p29)
+ fmt.Println("99.9% allocation latency:", p39)
+ fmt.Println("99.99% allocation latency:", p49)
+ fmt.Println("99.999% allocation latency:", p59)
+ fmt.Println("99.9999% allocation latency:", p69)
+ fmt.Println("Sizeof(circularBuffer) =", unsafe.Sizeof(*c))
+ fmt.Println("Approximate live memory =", unsafe.Sizeof(*c)+bufferLen*1024)
+ }
+}
diff --git a/gc_latency/latency_test.go b/gc_latency/latency_test.go
new file mode 100644
index 0000000..a86d84f
--- /dev/null
+++ b/gc_latency/latency_test.go
@@ -0,0 +1,46 @@
+// Copyright 2023 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+import (
+ "flag"
+ "fmt"
+ "os"
+ "testing"
+)
+
+// Run as a benchmark, this reports allocation-time statistics for stack-, heap-, and
+// globally-allocated buffers, out to the 99.9999th percentile. It optionally reports
+// the worst allocation time if -worst is specified, but that metric is normally too
+// noisy for any sort of trend tracking or alerting. The default benchmark usually
+// runs long enough that it requires only one iteration.
+
+func TestMain(m *testing.M) {
+ flag.BoolVar(&reportWorstFlag, "worst", false, "report otherwise too-noisy 'worst' metric in benchmark")
+ flag.Parse()
+ os.Exit(m.Run())
+}
+
+type testCase struct {
+ howAlloc string
+ withFluff bool
+}
+
+func BenchmarkGCLatency(b *testing.B) {
+ tcs := []testCase{
+ {"stack", false},
+ {"stack", true},
+ {"heap", false},
+ {"heap", true},
+ {"global", false},
+ {"global", true},
+ }
+
+ for _, tc := range tcs {
+ lb := &LB{doFluff: tc.withFluff, howAllocated: tc.howAlloc}
+ b.Run(fmt.Sprintf("how=%s/fluff=%v", tc.howAlloc, tc.withFluff),
+ func(b *testing.B) { lb.bench(b) })
+ }
+}
diff --git a/gc_latency/main.go b/gc_latency/main.go
new file mode 100644
index 0000000..f709934
--- /dev/null
+++ b/gc_latency/main.go
@@ -0,0 +1,29 @@
+// Copyright 2023 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+// Gc_latency is a modified version of a program that tickled multiple
+// latency glitches in the Go GC/runtime. This version reports the time
+// of the worst observed glitches so that they can be easily located in
+// a trace file and debugged. This program can also be run as a benchmark
+// to allow easier automated performance monitoring; by default the benchmark
+// doesn't report worst-case times because those are too noisy.
+//
+// Usage:
+//
+// gc_latency [flags]
+//
+// Flags (when run as a standalone program):
+// -fluff
+// insert 'fluff' into allocation runs to break up sweeps
+// -how string
+// how the buffer is allocated = {stack,heap,global} (default "stack")
+// -trace string
+// name of trace file to create
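+//
+// For example, to trace heap-allocated buffers with fluff enabled:
+//
+//   gc_latency -how=heap -fluff -trace=trace.out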
+func main() {
+ howAllocated, doFluffFlag := flags()
+ lb := &LB{howAllocated: howAllocated, doFluff: doFluffFlag}
+ lb.bench(nil)
+}