// Copyright 2023 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package main

import (
	"flag"
	"fmt"
	"os"
	"runtime"
	"runtime/trace"
	"sort"
	"testing"
	"time"
	"unsafe"
)

const (
	bufferLen   = 200_000
	warmupCount = 1_000_000
	runCount    = 5_000_000
)

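// kbyte is the unit of allocation (a 1 KiB slice of bytes); a circularBuffer
// holds bufferLen of them, so a full buffer keeps roughly 200 MB live.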
type kbyte []byte
type circularBuffer [bufferLen]kbyte

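// LB holds the configuration for one latency benchmark run and the
// allocation delays measured during it.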
type LB struct {
	// Performance measurement stuff
	delays []time.Duration // delays observed (for distribution)
	worst  time.Duration   // worst delay observed

	// For making sense of the bad outcome.
	total        time.Duration // total time spent in allocations
	allStart     time.Time     // time (very nearly) at which the trace begins
	worstIndex   int           // index of the worst case allocation delay
	worstElapsed time.Duration // run-elapsed time at which the worst case delay occurred

	sink *circularBuffer // assign a pointer here to ensure heap allocation

	// How to allocate

| // "Fluff" refers to allocating a small fraction of extra quickly-dead objects |
| // to break up long runs on not-free objects that were once a cause of allocation latency. |
| doFluff bool |
| // "Fluff" allocations are all assigned to fluff, so that they are on-heap, but only the last one is live. |
| fluff kbyte |
| |
	// The circular buffer can be on the heap, in a global, or on the stack.
	// This choice affects allocation latency.
	howAllocated string
}

// globalBuffer is the globally-allocated circular buffer,
// for measuring the cost of scanning large global objects.
var globalBuffer circularBuffer

// These three methods pass a differently-allocated circularBuffer
// to the benchmarked "work" to see how that affects allocation tail latency.

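// global stores into the package-level globalBuffer, then clears the entries
// afterwards so they do not remain live after the run.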
//go:noinline
func (lb *LB) global(count int) {
	lb.work(&globalBuffer, count)
	for i := range globalBuffer {
		globalBuffer[i] = nil
	}
}

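// heap allocates the buffer on the heap, using lb.sink to force it to escape.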
//go:noinline
func (lb *LB) heap(count int) {
	c := &circularBuffer{}
	lb.sink = c // force to heap
	lb.work(c, count)
	lb.sink = nil
}

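// stack declares the buffer as a local variable so that it may be allocated on the stack.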
//go:noinline
func (lb *LB) stack(count int) {
	var c circularBuffer
	lb.work(&c, count)
}

// newSlice allocates a 1k slice of bytes and initializes them all to byte(n).
// When fluff is enabled, every 64th call also allocates a short-lived "fluff" slice.
func (lb *LB) newSlice(n int) kbyte {
	m := make(kbyte, 1024)
	if lb.doFluff && n&63 == 0 {
		lb.fluff = make(kbyte, 1024)
	}
	for i := range m {
		m[i] = byte(n)
	}
	return m
}

// storeSlice stores a newly allocated 1k slice of bytes at c[count%len(c)].
// It also measures the time needed to do this and records the worst case.
func (lb *LB) storeSlice(c *circularBuffer, count int) {
	start := time.Now()
	c[count%len(c)] = lb.newSlice(count)
	elapsed := time.Since(start)

	candElapsed := time.Since(lb.allStart) // Record location of worst in trace
	if elapsed > lb.worst {
		lb.worst = elapsed
		lb.worstIndex = count
		lb.worstElapsed = candElapsed
	}
	lb.total += elapsed
	lb.delays = append(lb.delays, elapsed)
}

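// work allocates and stores count slices into c, one at a time, timing each store.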
//go:noinline
func (lb *LB) work(c *circularBuffer, count int) {
	for i := 0; i < count; i++ {
		lb.storeSlice(c, i)
	}
}

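// doAllocations runs the allocation loop with the circular buffer placed
// according to lb.howAllocated.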
func (lb *LB) doAllocations(count int) {
	switch lb.howAllocated {
	case "stack":
		lb.stack(count)
	case "heap":
		lb.heap(count)
	case "global":
		lb.global(count)
	}
}

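// traceFile is the name of the execution trace file to create; empty means no trace.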
var traceFile string

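// flags registers and parses the command-line flags and returns the validated
// -how and -fluff settings. A typical invocation (binary name assumed here)
// might look like:
//
//	gc_latency -how=heap -fluff -trace=trace.out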
func flags() (string, bool) {
	var howAllocated = "stack"
	var doFluff bool
	flag.StringVar(&traceFile, "trace", traceFile, "name of trace file to create")
	flag.StringVar(&howAllocated, "how", howAllocated, "how the buffer is allocated = {stack,heap,global}")
	flag.BoolVar(&doFluff, "fluff", doFluff, "insert 'fluff' into allocation runs to break up sweeps")

	flag.Parse()

	switch howAllocated {
	case "stack", "heap", "global":
		// okay
	default:
		fmt.Fprintf(os.Stderr, "-how needs to be one of 'heap', 'stack' or 'global', saw '%s' instead\n", howAllocated)
		os.Exit(1)
	}
	return howAllocated, doFluff
}

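// reportWorstFlag controls whether the worst-case latency is also reported as a
// benchmark metric; it is off by default because the worst case is ultra-noisy.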
var reportWorstFlag bool

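// bench warms up the heap, runs the allocation benchmark, and reports latency
// statistics. With a non-nil *testing.B the statistics become benchmark metrics;
// with a nil b (the standalone command case) a human-readable summary is printed.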
func (lb0 *LB) bench(b *testing.B) {
	if b != nil {
		b.StopTimer()
	}

	c := &circularBuffer{}
	lb0.sink = c // force heap allocation
	lb0.delays = make([]time.Duration, 0, runCount)
	// Warm up heap, virtual memory, address space, etc.
	lb0.work(c, warmupCount)
	c, lb0.sink = nil, nil
	runtime.GC() // Start fresh, GC with all the timers turned off.

	lb := &LB{doFluff: lb0.doFluff, howAllocated: lb0.howAllocated, delays: lb0.delays[:0]}
	count := runCount

	// Confine tracing and timing defers to a small block.
	run := func() {
		if traceFile != "" {
			f, err := os.Create(traceFile)
			if err != nil {
				if b != nil {
					b.Fatalf("Could not create trace file '%s': %v", traceFile, err)
				} else {
					fmt.Fprintf(os.Stderr, "Could not create trace file '%s': %v\n", traceFile, err)
					os.Exit(1)
				}
			}
			defer f.Close()
			trace.Start(f)
			defer trace.Stop()
		}
		lb.allStart = time.Now() // this is for trace file navigation, not benchmark timing.

		if b != nil {
			count = b.N * count
			if b.N > 1 {
				lb.delays = make([]time.Duration, 0, count)
			}
			b.StartTimer()
			defer b.StopTimer()
		}
		lb.doAllocations(count)
	}
	run()

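	// Sort the observed delays and compute summary statistics. pN9 below is the
	// latency at the "N nines" percentile: p29 = 99%, p39 = 99.9%, and so on.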
	sort.Slice(lb.delays, func(i, j int) bool { return lb.delays[i] < lb.delays[j] })
	delays := lb.delays
	delayLen := float64(len(delays))
	average, median := time.Duration(lb.total.Nanoseconds()/int64(count)), delays[len(delays)/2]
	p29, p39, p49, p59, p69 := delays[int(0.99*delayLen)], delays[int(0.999*delayLen)], delays[int(0.9999*delayLen)], delays[int(0.99999*delayLen)], delays[int(0.999999*delayLen)]
	if b != nil {
		b.ReportMetric(float64(average.Nanoseconds()), "ns/op")
		b.ReportMetric(float64(median), "p50-ns")
		b.ReportMetric(float64(p29), "p99-ns")
		b.ReportMetric(float64(p39), "p99.9-ns")
		b.ReportMetric(float64(p49), "p99.99-ns")
		b.ReportMetric(float64(p59), "p99.999-ns")
		b.ReportMetric(float64(p69), "p99.9999-ns")
		// The worst case is ultra-noisy, so report it only when explicitly requested.
		if reportWorstFlag {
			b.ReportMetric(float64(lb.worst), "worst")
		}
	} else {
		fmt.Printf("GC latency benchmark, how=%s, fluff=%v\n", lb.howAllocated, lb.doFluff)
		fmt.Println("Worst allocation latency:", lb.worst)
		fmt.Println("Worst allocation index:", lb.worstIndex)
		fmt.Println("Worst allocation occurs at run elapsed time:", lb.worstElapsed)
		fmt.Println("Average allocation latency:", average)
		fmt.Println("Median allocation latency:", median)
		fmt.Println("99% allocation latency:", p29)
		fmt.Println("99.9% allocation latency:", p39)
		fmt.Println("99.99% allocation latency:", p49)
		fmt.Println("99.999% allocation latency:", p59)
		fmt.Println("99.9999% allocation latency:", p69)
		// unsafe.Sizeof does not evaluate its argument, so the nil c here is safe.
		fmt.Println("Sizeof(circularBuffer) =", unsafe.Sizeof(*c))
		fmt.Println("Approximate live memory =", unsafe.Sizeof(*c)+bufferLen*1024)
	}
}