blob: 0e621cb6ca5f20a1d7e02f9d2234c0a360430260 [file] [log] [blame]
// Copyright 2022 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build goexperiment.pagetrace
// Page tracer.
//
// This file contains an implementation of page trace instrumentation for tracking
// the way the Go runtime manages pages of memory. The trace may be enabled at program
// startup with the GODEBUG option pagetrace.
//
// Each page trace event is either 8 or 16 bytes wide. The first
// 8 bytes follow this format for non-sync events:
//
// [16 timestamp delta][35 base address][10 npages][1 isLarge][2 pageTraceEventType]
//
// If the "large" bit is set then the event is 16 bytes wide with the second 8 byte word
// containing the full npages value (the npages bitfield is 0).
//
// The base address's bottom pageShift bits are always zero hence why we can pack other
// data in there. We ignore the top 16 bits, assuming a 48 bit address space for the
// heap.
//
// The timestamp delta is computed from the difference between the current nanotime
// timestamp and the last sync event's timestamp. The bottom pageTraceTimeLostBits of
// this delta is removed and only the next pageTraceTimeDeltaBits are kept.
//
// A sync event is emitted at the beginning of each trace buffer and whenever the
// timestamp delta would not fit in an event.
//
// Sync events have the following structure:
//
// [61 timestamp or P ID][1 isPID][2 pageTraceSyncEvent]
//
// In essence, the "large" bit repurposed to indicate whether it's a timestamp or a P ID
// (these are typically uint32). Note that we only have 61 bits for the 64-bit timestamp,
// but like for the delta we drop the bottom pageTraceTimeLostBits here as well.
package runtime
import (
"runtime/internal/sys"
"unsafe"
)
// pageTraceAlloc records a page trace allocation event.
// pp may be nil. Call only if debug.pagetracefd != 0.
//
// Must run on the system stack as a crude way to prevent preemption.
//
//go:systemstack
func pageTraceAlloc(pp *p, now int64, base, npages uintptr) {
if pageTrace.enabled {
if now == 0 {
now = nanotime()
}
pageTraceEmit(pp, now, base, npages, pageTraceAllocEvent)
}
}
// pageTraceFree records a page trace free event.
// pp may be nil. Call only if debug.pagetracefd != 0.
//
// Must run on the system stack as a crude way to prevent preemption.
//
//go:systemstack
func pageTraceFree(pp *p, now int64, base, npages uintptr) {
if pageTrace.enabled {
if now == 0 {
now = nanotime()
}
pageTraceEmit(pp, now, base, npages, pageTraceFreeEvent)
}
}
// pageTraceScav records a page trace scavenge event.
// pp may be nil. Call only if debug.pagetracefd != 0.
//
// Must run on the system stack as a crude way to prevent preemption.
//
//go:systemstack
func pageTraceScav(pp *p, now int64, base, npages uintptr) {
if pageTrace.enabled {
if now == 0 {
now = nanotime()
}
pageTraceEmit(pp, now, base, npages, pageTraceScavEvent)
}
}
// pageTraceEventType is a page trace event type.
type pageTraceEventType uint8
const (
pageTraceSyncEvent pageTraceEventType = iota // Timestamp emission.
pageTraceAllocEvent // Allocation of pages.
pageTraceFreeEvent // Freeing pages.
pageTraceScavEvent // Scavenging pages.
)
// pageTraceEmit emits a page trace event.
//
// Must run on the system stack as a crude way to prevent preemption.
//
//go:systemstack
func pageTraceEmit(pp *p, now int64, base, npages uintptr, typ pageTraceEventType) {
// Get a buffer.
var tbp *pageTraceBuf
pid := int32(-1)
if pp == nil {
// We have no P, so take the global buffer.
lock(&pageTrace.lock)
tbp = &pageTrace.buf
} else {
tbp = &pp.pageTraceBuf
pid = pp.id
}
// Initialize the buffer if necessary.
tb := *tbp
if tb.buf == nil {
tb.buf = (*pageTraceEvents)(sysAlloc(pageTraceBufSize, &memstats.other_sys))
tb = tb.writePid(pid)
}
// Handle timestamp and emit a sync event if necessary.
if now < tb.timeBase {
now = tb.timeBase
}
if now-tb.timeBase >= pageTraceTimeMaxDelta {
tb.timeBase = now
tb = tb.writeSync(pid)
}
// Emit the event.
tb = tb.writeEvent(pid, now, base, npages, typ)
// Write back the buffer.
*tbp = tb
if pp == nil {
unlock(&pageTrace.lock)
}
}
const (
pageTraceBufSize = 32 << 10
// These constants describe the per-event timestamp delta encoding.
pageTraceTimeLostBits = 7 // How many bits of precision we lose in the delta.
pageTraceTimeDeltaBits = 16 // Size of the delta in bits.
pageTraceTimeMaxDelta = 1 << (pageTraceTimeLostBits + pageTraceTimeDeltaBits)
)
// pageTraceEvents is the low-level buffer containing the trace data.
type pageTraceEvents struct {
_ sys.NotInHeap
events [pageTraceBufSize / 8]uint64
}
// pageTraceBuf is a wrapper around pageTraceEvents that knows how to write events
// to the buffer. It tracks state necessary to do so.
type pageTraceBuf struct {
buf *pageTraceEvents
len int // How many events have been written so far.
timeBase int64 // The current timestamp base from which deltas are produced.
finished bool // Whether this trace buf should no longer flush anything out.
}
// writePid writes a P ID event indicating which P we're running on.
//
// Assumes there's always space in the buffer since this is only called at the
// beginning of a new buffer.
//
// Must run on the system stack as a crude way to prevent preemption.
//
//go:systemstack
func (tb pageTraceBuf) writePid(pid int32) pageTraceBuf {
e := uint64(int64(pid))<<3 | 0b100 | uint64(pageTraceSyncEvent)
tb.buf.events[tb.len] = e
tb.len++
return tb
}
// writeSync writes a sync event, which is just a timestamp. Handles flushing.
//
// Must run on the system stack as a crude way to prevent preemption.
//
//go:systemstack
func (tb pageTraceBuf) writeSync(pid int32) pageTraceBuf {
if tb.len+1 > len(tb.buf.events) {
// N.B. flush will writeSync again.
return tb.flush(pid, tb.timeBase)
}
e := ((uint64(tb.timeBase) >> pageTraceTimeLostBits) << 3) | uint64(pageTraceSyncEvent)
tb.buf.events[tb.len] = e
tb.len++
return tb
}
// writeEvent handles writing all non-sync and non-pid events. Handles flushing if necessary.
//
// pid indicates the P we're currently running on. Necessary in case we need to flush.
// now is the current nanotime timestamp.
// base is the base address of whatever group of pages this event is happening to.
// npages is the length of the group of pages this event is happening to.
// typ is the event that's happening to these pages.
//
// Must run on the system stack as a crude way to prevent preemption.
//
//go:systemstack
func (tb pageTraceBuf) writeEvent(pid int32, now int64, base, npages uintptr, typ pageTraceEventType) pageTraceBuf {
large := 0
np := npages
if npages >= 1024 {
large = 1
np = 0
}
if tb.len+1+large > len(tb.buf.events) {
tb = tb.flush(pid, now)
}
if base%pageSize != 0 {
throw("base address not page aligned")
}
e := uint64(base)
// The pageShift low-order bits are zero.
e |= uint64(typ) // 2 bits
e |= uint64(large) << 2 // 1 bit
e |= uint64(np) << 3 // 10 bits
// Write the timestamp delta in the upper pageTraceTimeDeltaBits.
e |= uint64((now-tb.timeBase)>>pageTraceTimeLostBits) << (64 - pageTraceTimeDeltaBits)
tb.buf.events[tb.len] = e
if large != 0 {
// npages doesn't fit in 10 bits, so write an additional word with that data.
tb.buf.events[tb.len+1] = uint64(npages)
}
tb.len += 1 + large
return tb
}
// flush writes out the contents of the buffer to pageTrace.fd and resets the buffer.
// It then writes out a P ID event and the first sync event for the new buffer.
//
// Must run on the system stack as a crude way to prevent preemption.
//
//go:systemstack
func (tb pageTraceBuf) flush(pid int32, now int64) pageTraceBuf {
if !tb.finished {
lock(&pageTrace.fdLock)
writeFull(uintptr(pageTrace.fd), (*byte)(unsafe.Pointer(&tb.buf.events[0])), tb.len*8)
unlock(&pageTrace.fdLock)
}
tb.len = 0
tb.timeBase = now
return tb.writePid(pid).writeSync(pid)
}
var pageTrace struct {
// enabled indicates whether tracing is enabled. If true, fd >= 0.
//
// Safe to read without synchronization because it's only set once
// at program initialization.
enabled bool
// buf is the page trace buffer used if there is no P.
//
// lock protects buf.
lock mutex
buf pageTraceBuf
// fdLock protects writing to fd.
//
// fd is the file to write the page trace to.
fdLock mutex
fd int32
}
// initPageTrace initializes the page tracing infrastructure from GODEBUG.
//
// env must be the value of the GODEBUG environment variable.
func initPageTrace(env string) {
var value string
for env != "" {
elt, rest := env, ""
for i := 0; i < len(env); i++ {
if env[i] == ',' {
elt, rest = env[:i], env[i+1:]
break
}
}
env = rest
if hasPrefix(elt, "pagetrace=") {
value = elt[len("pagetrace="):]
break
}
}
pageTrace.fd = -1
if canCreateFile && value != "" {
var tmp [4096]byte
if len(value) != 0 && len(value) < 4096 {
copy(tmp[:], value)
pageTrace.fd = create(&tmp[0], 0o664)
}
}
pageTrace.enabled = pageTrace.fd >= 0
}
// finishPageTrace flushes all P's trace buffers and disables page tracing.
func finishPageTrace() {
if !pageTrace.enabled {
return
}
// Grab worldsema as we're about to execute a ragged barrier.
semacquire(&worldsema)
systemstack(func() {
// Disable tracing. This isn't strictly necessary and it's best-effort.
pageTrace.enabled = false
// Execute a ragged barrier, flushing each trace buffer.
forEachP(func(pp *p) {
if pp.pageTraceBuf.buf != nil {
pp.pageTraceBuf = pp.pageTraceBuf.flush(pp.id, nanotime())
}
pp.pageTraceBuf.finished = true
})
// Write the global have-no-P buffer.
lock(&pageTrace.lock)
if pageTrace.buf.buf != nil {
pageTrace.buf = pageTrace.buf.flush(-1, nanotime())
}
pageTrace.buf.finished = true
unlock(&pageTrace.lock)
// Safely close the file as nothing else should be allowed to write to the fd.
lock(&pageTrace.fdLock)
closefd(pageTrace.fd)
pageTrace.fd = -1
unlock(&pageTrace.fdLock)
})
semrelease(&worldsema)
}
// writeFull ensures that a complete write of bn bytes from b is made to fd.
func writeFull(fd uintptr, b *byte, bn int) {
for bn > 0 {
n := write(fd, unsafe.Pointer(b), int32(bn))
if n == -_EINTR || n == -_EAGAIN {
continue
}
if n < 0 {
print("errno=", -n, "\n")
throw("writeBytes: bad write")
}
bn -= int(n)
b = addb(b, uintptr(n))
}
}