| // Copyright 2023 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| // Go execution tracer. |
| // The tracer captures a wide range of execution events like goroutine |
| // creation/blocking/unblocking, syscall enter/exit/block, GC-related events, |
| // changes of heap size, processor start/stop, etc., and writes them to a buffer
| // in a compact form. A precise nanosecond-precision timestamp and a stack
| // trace are captured for most events.
| // |
| // Tracer invariants (to keep the synchronization sound):
| // - An m that has a trace buffer must be on either the allm or sched.freem lists. |
| // - Any trace buffer mutation must either be happening in traceAdvance or between |
| // a traceAcquire and a subsequent traceRelease. |
| // - traceAdvance cannot return until the previous generation's buffers are all flushed. |
| // |
| // See https://go.dev/issue/60773 for a link to the full design. |
| |
| package runtime |
| |
| import ( |
| "internal/runtime/atomic" |
| "unsafe" |
| ) |
| |
| // Trace state. |
| |
| // trace is global tracing context. |
| var trace struct { |
| // trace.lock must only be acquired on the system stack where |
| // stack splits cannot happen while it is held. |
| lock mutex |
| |
| // Trace buffer management. |
| // |
| // First we check the empty list for any free buffers. If none are available,
| // buffers are allocated directly from the OS. Once they're filled up and/or
| // flushed, they end up on the full queue for trace.gen%2. |
| // |
| // The trace reader takes buffers off the full list one-by-one and |
| // places them into reading until they're finished being read from. |
| // Then they're placed onto the empty list. |
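| //
| // Schematically (a rough sketch of the flow described above):
| //
| //	empty -> (attached to an M for writing) -> full[gen%2] -> reading -> empty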
| // |
| // Protected by trace.lock. |
| reading *traceBuf // buffer currently handed off to user |
| empty *traceBuf // stack of empty buffers |
| full [2]traceBufQueue |
| workAvailable atomic.Bool |
| |
| // State for the trace reader goroutine. |
| // |
| // Protected by trace.lock. |
| readerGen atomic.Uintptr // the generation the reader is currently reading for |
| flushedGen atomic.Uintptr // the last completed generation |
| headerWritten bool // whether ReadTrace has emitted trace header |
| |
| // doneSema is used to synchronize the reader and traceAdvance. Specifically, |
| // it notifies traceAdvance that the reader is done with a generation. |
| // Both semaphores are 0 by default (so, acquires block). traceAdvance |
| // attempts to acquire for gen%2 after flushing the last buffers for gen. |
| // Meanwhile the reader releases the sema for gen%2 when it has finished |
| // processing gen. |
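| //
| // Schematically (a sketch of the handshake; see traceAdvance and readTrace0):
| //
| //	traceAdvance(gen):                    trace reader:
| //	  ... flush last buffers of gen ...     ... finish reading gen ...
| //	  semacquire(&trace.doneSema[gen%2])    semrelease(&trace.doneSema[gen%2])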
| doneSema [2]uint32 |
| |
| // Trace data tables for deduplicating data going into the trace. |
| // There are 2 of each: one for gen%2, one for 1-gen%2. |
| stackTab [2]traceStackTable // maps stack traces to unique ids |
| stringTab [2]traceStringTable // maps strings to unique ids |
| typeTab [2]traceTypeTable // maps type pointers to unique ids |
| |
| // cpuLogRead accepts CPU profile samples from the signal handler where |
| // they're generated. There are two profBufs here: one for gen%2, one for |
| // 1-gen%2. These profBufs use a three-word header to hold the IDs of the P, G, |
| // and M (respectively) that were active at the time of the sample. Because |
| // profBuf uses a record with all zeros in its header to indicate overflow, |
| // we ensure the P field is always non-zero: the ID of a real P is shifted up
| // so that it starts at bit 1, with bit 0 set. Samples that arrive while no P
| // is running (such as near syscalls) set the first header field to 0b10.
| // This careful handling of the first header field allows us to store the ID of
| // the active G directly in the second field, even though that will be 0
| // when sampling g0. |
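| //
| // For example (a sketch of the encoding described above; the concrete
| // values are illustrative), a sample taken on P 5 while running goroutine 42
| // on M 7 would carry the header words:
| //
| //	hdr[0] = 5<<1 | 1 // real P: ID shifted up by one, bit 0 set
| //	hdr[1] = 42       // ID of the active G (0 when sampling g0)
| //	hdr[2] = 7        // ID of the M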
| // |
| // Initialization and teardown of these fields is protected by traceAdvanceSema. |
| cpuLogRead [2]*profBuf |
| signalLock atomic.Uint32 // protects use of the following member, only usable in signal handlers |
| cpuLogWrite [2]atomic.Pointer[profBuf] // copy of cpuLogRead for use in signal handlers, set without signalLock |
| cpuSleep *wakeableSleep |
| cpuLogDone <-chan struct{} |
| cpuBuf [2]*traceBuf |
| |
| reader atomic.Pointer[g] // goroutine that called ReadTrace, or nil |
| |
| // Fast mappings from enumerations to string IDs that are prepopulated |
| // in the trace. |
| markWorkerLabels [2][len(gcMarkWorkerModeStrings)]traceArg |
| goStopReasons [2][len(traceGoStopReasonStrings)]traceArg |
| goBlockReasons [2][len(traceBlockReasonStrings)]traceArg |
| |
| // enabled indicates whether tracing is enabled, but it is only an optimization, |
| // NOT the source of truth on whether tracing is enabled. Tracing is only truly |
| // enabled if gen != 0. This is used as an optimistic fast path check. |
| // |
| // Transitioning this value from true -> false is easy (once gen is 0) |
| // because it's OK for enabled to have a stale "true" value. traceAcquire will |
| // always double-check gen. |
| // |
| // Transitioning this value from false -> true is harder. We need to make sure |
| // this is observable as true strictly before gen != 0. To maintain this invariant |
| // we only make this transition with the world stopped and use the store to gen |
| // as a publication barrier. |
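| //
| // A minimal sketch of the resulting fast-path check (the real logic lives
| // in traceAcquire and is more involved):
| //
| //	if !trace.enabled {
| //		return traceLocker{} // tracing is off, or about to turn off
| //	}
| //	// ... then double-check trace.gen != 0 under the M's trace seqlock ...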
| enabled bool |
| |
| // enabledWithAllocFree is set if debug.traceallocfree is non-zero when tracing begins.
| // It follows the same synchronization protocol as enabled. |
| enabledWithAllocFree bool |
| |
| // Trace generation counter. |
| gen atomic.Uintptr |
| lastNonZeroGen uintptr // last non-zero value of gen |
| |
| // shutdown is set when we are waiting for the trace reader to finish after setting gen to 0.
| // |
| // Writes protected by trace.lock. |
| shutdown atomic.Bool |
| |
| // Number of goroutines in syscall exiting slow path. |
| exitingSyscall atomic.Int32 |
| |
| // seqGC is the sequence counter for GC begin/end. |
| // |
| // Mutated only during stop-the-world. |
| seqGC uint64 |
| |
| // minPageHeapAddr is the minimum address of the page heap when tracing started. |
| minPageHeapAddr uint64 |
| |
| // debugMalloc is the value of debug.malloc before tracing began. |
| debugMalloc bool |
| } |
| |
| // Trace public API. |
| |
| var ( |
| traceAdvanceSema uint32 = 1 |
| traceShutdownSema uint32 = 1 |
| ) |
| |
| // StartTrace enables tracing for the current process. |
| // While tracing, the data will be buffered and available via [ReadTrace]. |
| // StartTrace returns an error if tracing is already enabled. |
| // Most clients should use the [runtime/trace] package or the [testing] package's |
| // -test.trace flag instead of calling StartTrace directly. |
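| //
| // A direct-use sketch (again, most programs should prefer the runtime/trace
| // package):
| //
| //	if err := runtime.StartTrace(); err != nil {
| //		// Tracing was already enabled or is still shutting down.
| //	}
| //	// ... drain runtime.ReadTrace from a dedicated goroutine ...
| //	runtime.StopTrace()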
| func StartTrace() error { |
| if traceEnabled() || traceShuttingDown() { |
| return errorString("tracing is already enabled") |
| } |
| // Block until cleanup of the last trace is done. |
| semacquire(&traceShutdownSema) |
| semrelease(&traceShutdownSema) |
| |
| // Hold traceAdvanceSema across trace start, since we'll want it on |
| // the other side of tracing being enabled globally. |
| semacquire(&traceAdvanceSema) |
| |
| // Initialize CPU profile -> trace ingestion. |
| traceInitReadCPU() |
| |
| // Compute the first generation for this StartTrace. |
| // |
| // Note: we start from the last non-zero generation rather than 1 so we |
| // can avoid resetting all the arrays indexed by gen%2 or gen%3. There's |
| // more than one of each per m, p, and goroutine. |
| firstGen := traceNextGen(trace.lastNonZeroGen) |
| |
| // Reset GC sequencer. |
| trace.seqGC = 1 |
| |
| // Reset trace reader state. |
| trace.headerWritten = false |
| trace.readerGen.Store(firstGen) |
| trace.flushedGen.Store(0) |
| |
| // Register some basic strings in the string tables. |
| traceRegisterLabelsAndReasons(firstGen) |
| |
| // Stop the world. |
| // |
| // The purpose of stopping the world is to make sure that no goroutine is in a |
| // context where it could emit an event by bringing all goroutines to a safe point |
| // with no opportunity to transition. |
| // |
| // The one exception to this rule is goroutines that are concurrently exiting a syscall.
| // Those will all be forced into the syscalling slow path, and we'll just make sure |
| // that we don't observe any goroutines in that critical section before starting |
| // the world again. |
| // |
| // A good follow-up question to this is why stopping the world is necessary at all |
| // given that we have traceAcquire and traceRelease. Unfortunately, those only help |
| // us when tracing is already active (for performance, so when tracing is off the |
| // tracing seqlock is left untouched). The main issue here is subtle: we're going to |
| // want to obtain a correct starting status for each goroutine, but there are windows |
| // of time in which we could read and emit an incorrect status. Specifically: |
| // |
| // trace := traceAcquire() |
| // // <----> problem window |
| // casgstatus(gp, _Gwaiting, _Grunnable) |
| // if trace.ok() { |
| // trace.GoUnpark(gp, 2) |
| // traceRelease(trace) |
| // } |
| // |
| // More precisely, if we readgstatus for a gp while another goroutine is in the problem |
| // window and that goroutine didn't observe that tracing had begun, then we might write |
| // a GoStatus(GoWaiting) event for that goroutine, but it won't trace an event marking |
| // the transition from GoWaiting to GoRunnable. The trace will then be broken, because |
| // future events will be emitted assuming the tracer sees GoRunnable. |
| // |
| // In short, what we really need here is to make sure that the next time *any goroutine* |
| // hits a traceAcquire, it sees that the trace is enabled. |
| // |
| // Note also that stopping the world is necessary to make sure sweep-related events are |
| // coherent. Since the world is stopped and sweeps are non-preemptible, we can never start |
| // the world and see an unpaired sweep 'end' event. Other parts of the tracer rely on this. |
| stw := stopTheWorld(stwStartTrace) |
| |
| // Prevent sysmon from running any code that could generate events. |
| lock(&sched.sysmonlock) |
| |
| // Grab the minimum page heap address. All Ps are stopped, so it's safe to read this since |
| // nothing can allocate heap memory. |
| trace.minPageHeapAddr = uint64(mheap_.pages.inUse.ranges[0].base.addr()) |
| |
| // Reset mSyscallID on all Ps while we have them stationary and the trace is disabled. |
| for _, pp := range allp { |
| pp.trace.mSyscallID = -1 |
| } |
| |
| // Start tracing. |
| // |
| // Set trace.enabled. This is *very* subtle. We need to maintain the invariant that if |
| // trace.gen != 0, then trace.enabled is always observed as true. Simultaneously, for |
| // performance, we need trace.enabled to be read without any synchronization. |
| // |
| // We ensure this is safe by stopping the world, which acts as a global barrier on almost
| // every M, and by explicitly synchronizing with any other Ms that could be running concurrently
| // with us. Today, there are only two such cases:
| // - sysmon, which we synchronize with by acquiring sysmonlock.
| // - goroutines exiting syscalls, which we synchronize with via trace.exitingSyscall.
| // |
| // After trace.gen is updated, other Ms may start creating trace buffers and emitting |
| // data into them. |
| trace.enabled = true |
| if debug.traceallocfree.Load() != 0 { |
| // Enable memory events since the GODEBUG is set. |
| trace.debugMalloc = debug.malloc |
| trace.enabledWithAllocFree = true |
| debug.malloc = true |
| } |
| trace.gen.Store(firstGen) |
| |
| // Wait for exitingSyscall to drain. |
| // |
| // It may not monotonically decrease to zero, but in the limit it will always become |
| // zero because the world is stopped and there are no available Ps for syscall-exited |
| // goroutines to run on. |
| // |
| // Because we set gen before checking this, and because exitingSyscall is always incremented
| // *before* traceAcquire (which checks gen), we can be certain that once exitingSyscall is zero,
| // any goroutine that goes to exit a syscall from then on *must* observe the new gen as
| // well as trace.enabled being set to true.
| // |
| // The critical section on each goroutine here is going to be quite short, so the likelihood |
| // that we observe a zero value is high. |
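| //
| // For reference, the syscall-exit side does, in order (a rough sketch; the
| // actual code lives in exitsyscall and traceAcquire):
| //
| //	trace.exitingSyscall.Add(1)
| //	tl := traceAcquire() // observes trace.gen
| //	// ... emit syscall-exit events if tl.ok() ...
| //	traceRelease(tl)
| //	trace.exitingSyscall.Add(-1)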
| for trace.exitingSyscall.Load() != 0 { |
| osyield() |
| } |
| |
| // Record some initial pieces of information. |
| // |
| // N.B. This will also emit a status event for this goroutine. |
| tl := traceAcquire() |
| tl.Gomaxprocs(gomaxprocs) // Get this as early in the trace as possible. See comment in traceAdvance. |
| tl.STWStart(stwStartTrace) // We didn't trace this above, so trace it now. |
| |
| // Record the fact that a GC is active, if applicable. |
| if gcphase == _GCmark || gcphase == _GCmarktermination { |
| tl.GCActive() |
| } |
| |
| // Dump a snapshot of memory, if enabled. |
| if trace.enabledWithAllocFree { |
| traceSnapshotMemory() |
| } |
| |
| // Record the heap goal so we have it at the very beginning of the trace. |
| tl.HeapGoal() |
| |
| // Make sure a ProcStatus is emitted for every P, while we're here. |
| for _, pp := range allp { |
| tl.writer().writeProcStatusForP(pp, pp == tl.mp.p.ptr()).end() |
| } |
| traceRelease(tl) |
| |
| unlock(&sched.sysmonlock) |
| startTheWorld(stw) |
| |
| traceStartReadCPU() |
| traceAdvancer.start() |
| |
| semrelease(&traceAdvanceSema) |
| return nil |
| } |
| |
| // StopTrace stops tracing, if it was previously enabled. |
| // StopTrace only returns after all the reads for the trace have completed. |
| func StopTrace() { |
| traceAdvance(true) |
| } |
| |
| // traceAdvance moves tracing to the next generation, and cleans up the current generation, |
| // ensuring that it's flushed out before returning. If stopTrace is true, it disables tracing |
| // altogether instead of advancing to the next generation. |
| // |
| // traceAdvanceSema must not be held. |
| func traceAdvance(stopTrace bool) { |
| semacquire(&traceAdvanceSema) |
| |
| // Get the gen that we're advancing from. In this function we don't really care much |
| // about the generation we're advancing _into_ since we'll do all the cleanup in this |
| // generation for the next advancement. |
| gen := trace.gen.Load() |
| if gen == 0 { |
| // We may end up here if traceAdvance is called concurrently with StopTrace.
| semrelease(&traceAdvanceSema) |
| return |
| } |
| |
| // Write an EvFrequency event for this generation. |
| // |
| // N.B. This may block for quite a while to get a good frequency estimate, so make sure we do |
| // this here and not e.g. on the trace reader. |
| traceFrequency(gen) |
| |
| // Collect all the untraced Gs. |
| type untracedG struct { |
| gp *g |
| goid uint64 |
| mid int64 |
| stackID uint64 |
| status uint32 |
| waitreason waitReason |
| inMarkAssist bool |
| } |
| var untracedGs []untracedG |
| forEachGRace(func(gp *g) { |
| // Make absolutely sure all Gs are ready for the next |
| // generation. We need to do this even for dead Gs because |
| // they may come alive with a new identity, and their status-traced
| // bookkeeping might end up being stale.
| // We may miss totally new goroutines, but they'll always |
| // have clean bookkeeping. |
| gp.trace.readyNextGen(gen) |
| // If the status was traced, nothing else to do. |
| if gp.trace.statusWasTraced(gen) { |
| return |
| } |
| // Scribble down information about this goroutine. |
| ug := untracedG{gp: gp, mid: -1} |
| systemstack(func() { |
| me := getg().m.curg |
| // We don't have to handle this G status transition because we |
| // already eliminated ourselves from consideration above. |
| casGToWaitingForGC(me, _Grunning, waitReasonTraceGoroutineStatus) |
| // We need to suspend and take ownership of the G to safely read its |
| // goid. Note that we can't actually emit the event at this point |
| // because we might stop the G in a window where it's unsafe to write |
| // events based on the G's status. We need the global trace buffer flush |
| // coming up to make sure we're not racing with the G. |
| // |
| // It should be very unlikely that we try to preempt a running G here. |
| // The only situation in which we might is if we're racing with a G
| // that's running for the first time in this generation. Therefore, |
| // this should be relatively fast. |
| s := suspendG(gp) |
| if !s.dead { |
| ug.goid = s.g.goid |
| if s.g.m != nil { |
| ug.mid = int64(s.g.m.procid) |
| } |
| ug.status = readgstatus(s.g) &^ _Gscan |
| ug.waitreason = s.g.waitreason |
| ug.inMarkAssist = s.g.inMarkAssist |
| ug.stackID = traceStack(0, gp, gen) |
| } |
| resumeG(s) |
| casgstatus(me, _Gwaiting, _Grunning) |
| }) |
| if ug.goid != 0 { |
| untracedGs = append(untracedGs, ug) |
| } |
| }) |
| |
| if !stopTrace { |
| // Re-register runtime goroutine labels and stop/block reasons. |
| traceRegisterLabelsAndReasons(traceNextGen(gen)) |
| } |
| |
| // Now that we've done some of the heavy stuff, prevent the world from stopping. |
| // This is necessary to ensure the consistency of the STW events. If we're feeling |
| // adventurous we could lift this restriction and add a STWActive event, but the |
| // cost of maintaining this consistency is low. We're not going to hold this semaphore |
| // for very long and most STW periods are very short. |
| // Once we hold worldsema, prevent preemption as well so we're not interrupted partway |
| // through this. We want to get this done as soon as possible. |
| semacquire(&worldsema) |
| mp := acquirem() |
| |
| // Advance the generation or stop the trace. |
| trace.lastNonZeroGen = gen |
| if stopTrace { |
| systemstack(func() { |
| // Ordering is important here. Set shutdown first, then disable tracing, |
| // so that conditions like (traceEnabled() || traceShuttingDown()) have |
| // no opportunity to be false. Hold the trace lock so this update appears |
| // atomic to the trace reader. |
| lock(&trace.lock) |
| trace.shutdown.Store(true) |
| trace.gen.Store(0) |
| unlock(&trace.lock) |
| |
| // Clear trace.enabled. It is totally OK for this value to be stale, |
| // because traceAcquire will always double-check gen. |
| trace.enabled = false |
| }) |
| } else { |
| trace.gen.Store(traceNextGen(gen)) |
| } |
| |
| // Emit a ProcsChange event so we have one on record for each generation. |
| // Let's emit it as soon as possible so that downstream tools can rely on the value |
| // being there fairly soon in a generation. |
| // |
| // It's important that we do this before allowing stop-the-worlds again, |
| // because the procs count could change. |
| if !stopTrace { |
| tl := traceAcquire() |
| tl.Gomaxprocs(gomaxprocs) |
| traceRelease(tl) |
| } |
| |
| // Emit a GCActive event in the new generation if necessary. |
| // |
| // It's important that we do this before allowing stop-the-worlds again, |
| // because that could emit global GC-related events. |
| if !stopTrace && (gcphase == _GCmark || gcphase == _GCmarktermination) { |
| tl := traceAcquire() |
| tl.GCActive() |
| traceRelease(tl) |
| } |
| |
| // Preemption is OK again after this. It's fine if the world stops now.
| // We're just cleaning up the last generation after this point. |
| // |
| // We also don't care if the GC starts again after this for the same reasons. |
| releasem(mp) |
| semrelease(&worldsema) |
| |
| // Snapshot allm and freem. |
| // |
| // Snapshotting after the generation counter update is sufficient. |
| // Because an m must be on either allm or sched.freem if it has an active trace |
| // buffer, new threads added to allm after this point must necessarily observe |
| // the new generation number (sched.lock acts as a barrier). |
| // |
| // Threads that exit before this point and are on neither list explicitly |
| // flush their own buffers in traceThreadDestroy. |
| // |
| // Snapshotting freem is necessary because Ms can continue to emit events |
| // while they're still on that list. Removal from sched.freem is serialized with |
| // this snapshot, so either we'll capture an m on sched.freem and race with |
| // the removal to flush its buffers (resolved by traceThreadDestroy acquiring |
| // the thread's seqlock, which one of us must win, so at least its old gen buffer |
| // will be flushed in time for the new generation) or it will have flushed its |
| // buffers before we snapshotted it to begin with. |
| lock(&sched.lock) |
| mToFlush := allm |
| for mp := mToFlush; mp != nil; mp = mp.alllink { |
| mp.trace.link = mp.alllink |
| } |
| for mp := sched.freem; mp != nil; mp = mp.freelink { |
| mp.trace.link = mToFlush |
| mToFlush = mp |
| } |
| unlock(&sched.lock) |
| |
| // Iterate over our snapshot, flushing every buffer until we're done. |
| // |
| // Because trace writers read the generation while the seqlock is |
| // held, we can be certain that when there are no writers there are |
| // also no stale generation values left. Therefore, it's safe to flush |
| // any buffers that remain in that generation's slot. |
| const debugDeadlock = false |
| systemstack(func() { |
| // Track iterations for some rudimentary deadlock detection. |
| i := 0 |
| detectedDeadlock := false |
| |
| for mToFlush != nil { |
| prev := &mToFlush |
| for mp := *prev; mp != nil; { |
| if mp.trace.seqlock.Load()%2 != 0 { |
| // The M is writing. Come back to it later. |
| prev = &mp.trace.link |
| mp = mp.trace.link |
| continue |
| } |
| // Flush the trace buffer. |
| // |
| // trace.lock needed for traceBufFlush, but also to synchronize |
| // with traceThreadDestroy, which flushes both buffers unconditionally. |
| lock(&trace.lock) |
| bufp := &mp.trace.buf[gen%2] |
| if *bufp != nil { |
| traceBufFlush(*bufp, gen) |
| *bufp = nil |
| } |
| unlock(&trace.lock) |
| |
| // Remove the m from the flush list. |
| *prev = mp.trace.link |
| mp.trace.link = nil |
| mp = *prev |
| } |
| // Yield only if we're going to be going around the loop again. |
| if mToFlush != nil { |
| osyield() |
| } |
| |
| if debugDeadlock { |
| // Try to detect a deadlock. We probably shouldn't loop here |
| // this many times. |
| if i > 100000 && !detectedDeadlock { |
| detectedDeadlock = true |
| println("runtime: failing to flush") |
| for mp := mToFlush; mp != nil; mp = mp.trace.link { |
| print("runtime: m=", mp.id, "\n") |
| } |
| } |
| i++ |
| } |
| } |
| }) |
| |
| // At this point, the old generation is fully flushed minus stack and string |
| // tables, CPU samples, and goroutines that haven't run at all during the last |
| // generation. |
| |
| // Check to see if any Gs still haven't had events written out for them. |
| statusWriter := unsafeTraceWriter(gen, nil) |
| for _, ug := range untracedGs { |
| if ug.gp.trace.statusWasTraced(gen) { |
| // It was traced, we don't need to do anything. |
| continue |
| } |
| // It still wasn't traced. Because we ensured all Ms stopped writing trace |
| // events to the last generation, that must mean the G never had its status |
| // traced in gen between when we recorded it and now. If that's true, the goid |
| // and status we recorded then are exactly what we want right now.
| status := goStatusToTraceGoStatus(ug.status, ug.waitreason) |
| statusWriter = statusWriter.writeGoStatus(ug.goid, ug.mid, status, ug.inMarkAssist, ug.stackID) |
| } |
| statusWriter.flush().end() |
| |
| // Read everything out of the last gen's CPU profile buffer. |
| traceReadCPU(gen) |
| |
| // Flush CPU samples, stacks, and strings for the last generation. This is safe, |
| // because we're now certain no M is writing to the last generation. |
| // |
| // Ordering is important here. traceCPUFlush may generate new stacks and dumping |
| // stacks may generate new strings. |
| traceCPUFlush(gen) |
| trace.stackTab[gen%2].dump(gen) |
| trace.typeTab[gen%2].dump(gen) |
| trace.stringTab[gen%2].reset(gen) |
| |
| // That's it. This generation is done producing buffers. |
| systemstack(func() { |
| lock(&trace.lock) |
| trace.flushedGen.Store(gen) |
| unlock(&trace.lock) |
| }) |
| |
| // Perform status reset on dead Ps because they just appear as idle. |
| // |
| // Preventing preemption is sufficient to access allp safely. allp is only |
| // mutated by GOMAXPROCS calls, which require a STW. |
| // |
| // TODO(mknyszek): Consider explicitly emitting ProcCreate and ProcDestroy |
| // events to indicate whether a P exists, rather than just making its |
| // existence implicit. |
| mp = acquirem() |
| for _, pp := range allp[len(allp):cap(allp)] { |
| pp.trace.readyNextGen(traceNextGen(gen)) |
| } |
| releasem(mp) |
| |
| if stopTrace { |
| // Acquire the shutdown sema to begin the shutdown process. |
| semacquire(&traceShutdownSema) |
| |
| // Finish off CPU profile reading. |
| traceStopReadCPU() |
| |
| // Reset debug.malloc if necessary. Note that this is set in a racy |
| // way; that's OK. Some mallocs may still enter into the debug.malloc |
| // block, but they won't generate events because tracing is disabled. |
| // That is, it's OK if mallocs read a stale debug.malloc or |
| // trace.enabledWithAllocFree value. |
| if trace.enabledWithAllocFree { |
| trace.enabledWithAllocFree = false |
| debug.malloc = trace.debugMalloc |
| } |
| } else { |
| // Go over each P and emit a status event for it if necessary. |
| // |
| // We do this at the beginning of the new generation instead of the |
| // end like we do for goroutines because forEachP doesn't give us a |
| // hook to skip Ps that have already been traced. Since we have to |
| // preempt all Ps anyway, might as well stay consistent with StartTrace |
| // which does this during the STW. |
| semacquire(&worldsema) |
| forEachP(waitReasonTraceProcStatus, func(pp *p) { |
| tl := traceAcquire() |
| if !pp.trace.statusWasTraced(tl.gen) { |
| tl.writer().writeProcStatusForP(pp, false).end() |
| } |
| traceRelease(tl) |
| }) |
| semrelease(&worldsema) |
| } |
| |
| // Block until the trace reader has finished processing the last generation. |
| semacquire(&trace.doneSema[gen%2]) |
| if raceenabled { |
| raceacquire(unsafe.Pointer(&trace.doneSema[gen%2])) |
| } |
| |
| // Double-check that things look as we expect after advancing and perform some |
| // final cleanup if the trace has fully stopped. |
| systemstack(func() { |
| lock(&trace.lock) |
| if !trace.full[gen%2].empty() { |
| throw("trace: non-empty full trace buffer for done generation") |
| } |
| if stopTrace { |
| if !trace.full[1-(gen%2)].empty() { |
| throw("trace: non-empty full trace buffer for next generation") |
| } |
| if trace.reading != nil || trace.reader.Load() != nil { |
| throw("trace: reading after shutdown") |
| } |
| // Free all the empty buffers. |
| for trace.empty != nil { |
| buf := trace.empty |
| trace.empty = buf.link |
| sysFree(unsafe.Pointer(buf), unsafe.Sizeof(*buf), &memstats.other_sys) |
| } |
| // Clear trace.shutdown and other flags. |
| trace.headerWritten = false |
| trace.shutdown.Store(false) |
| } |
| unlock(&trace.lock) |
| }) |
| |
| if stopTrace { |
| // Clear the sweep state on every P for the next time tracing is enabled. |
| // |
| // It may be stale in the next trace because we may have ended tracing in |
| // the middle of a sweep on a P. |
| // |
| // It's fine not to call forEachP here because tracing is disabled and we |
| // know at this point that nothing is calling into the tracer, but we do |
| // need to look at dead Ps too just because GOMAXPROCS could have been called |
| // at any point since we stopped tracing, and we have to ensure there's no |
| // bad state on dead Ps too. Prevent a STW and a concurrent GOMAXPROCS that |
| // might mutate allp by making ourselves briefly non-preemptible. |
| mp := acquirem() |
| for _, pp := range allp[:cap(allp)] { |
| pp.trace.inSweep = false |
| pp.trace.maySweep = false |
| pp.trace.swept = 0 |
| pp.trace.reclaimed = 0 |
| } |
| releasem(mp) |
| } |
| |
| // Release the advance semaphore. If stopTrace is true we're still holding onto |
| // traceShutdownSema. |
| // |
| // Do a direct handoff. Don't let one caller of traceAdvance starve |
| // other calls to traceAdvance. |
| semrelease1(&traceAdvanceSema, true, 0) |
| |
| if stopTrace { |
| // Stop the traceAdvancer. We can't be holding traceAdvanceSema here because |
| // we'll deadlock (we're blocked on the advancer goroutine exiting, but it |
| // may be currently trying to acquire traceAdvanceSema). |
| traceAdvancer.stop() |
| semrelease(&traceShutdownSema) |
| } |
| } |
| |
| func traceNextGen(gen uintptr) uintptr { |
| if gen == ^uintptr(0) { |
| // gen is used both %2 and %3, and we want both patterns to continue when we loop around.
| // ^uint32(0) and ^uint64(0) are both odd and multiples of 3. Therefore the next generation |
| // we want is even and one more than a multiple of 3. The smallest such number is 4. |
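| // (Why this works: 2^2 ≡ 1 (mod 3), so 2^32 ≡ 1 and 2^64 ≡ 1 (mod 3);
| // hence ^uint32(0) and ^uint64(0) are multiples of 3, and both are odd.
| // The successor must then be even and one more than a multiple of 3, and
| // 4 is the smallest positive value satisfying both.)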
| return 4 |
| } |
| return gen + 1 |
| } |
| |
| // traceRegisterLabelsAndReasons re-registers mark worker labels and |
| // goroutine stop/block reasons in the string table for the provided |
| // generation. Note: the provided generation must not have started yet. |
| func traceRegisterLabelsAndReasons(gen uintptr) { |
| for i, label := range gcMarkWorkerModeStrings[:] { |
| trace.markWorkerLabels[gen%2][i] = traceArg(trace.stringTab[gen%2].put(gen, label)) |
| } |
| for i, str := range traceBlockReasonStrings[:] { |
| trace.goBlockReasons[gen%2][i] = traceArg(trace.stringTab[gen%2].put(gen, str)) |
| } |
| for i, str := range traceGoStopReasonStrings[:] { |
| trace.goStopReasons[gen%2][i] = traceArg(trace.stringTab[gen%2].put(gen, str)) |
| } |
| } |
| |
| // ReadTrace returns the next chunk of binary tracing data, blocking until data |
| // is available. If tracing is turned off and all the data accumulated while it |
| // was on has been returned, ReadTrace returns nil. The caller must copy the |
| // returned data before calling ReadTrace again. |
| // ReadTrace must be called from one goroutine at a time. |
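| //
| // A typical consumer loop looks roughly like this (a sketch; most programs
| // should use the runtime/trace package instead):
| //
| //	for {
| //		data := runtime.ReadTrace()
| //		if data == nil {
| //			break // tracing is off and all buffered data has been returned
| //		}
| //		w.Write(data) // w is some io.Writer chosen by the caller
| //	}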
| func ReadTrace() []byte { |
| top: |
| var buf []byte |
| var park bool |
| systemstack(func() { |
| buf, park = readTrace0() |
| }) |
| if park { |
| gopark(func(gp *g, _ unsafe.Pointer) bool { |
| if !trace.reader.CompareAndSwapNoWB(nil, gp) { |
| // We're racing with another reader. |
| // Wake up and handle this case. |
| return false |
| } |
| |
| if g2 := traceReader(); gp == g2 { |
| // New data arrived between unlocking |
| // and the CAS and we won the wake-up |
| // race, so wake up directly. |
| return false |
| } else if g2 != nil { |
| printlock() |
| println("runtime: got trace reader", g2, g2.goid) |
| throw("unexpected trace reader") |
| } |
| |
| return true |
| }, nil, waitReasonTraceReaderBlocked, traceBlockSystemGoroutine, 2) |
| goto top |
| } |
| |
| return buf |
| } |
| |
| // readTrace0 is ReadTrace's continuation on g0. This must run on the |
| // system stack because it acquires trace.lock. |
| // |
| //go:systemstack |
| func readTrace0() (buf []byte, park bool) { |
| if raceenabled { |
| // g0 doesn't have a race context. Borrow the user G's. |
| if getg().racectx != 0 { |
| throw("expected racectx == 0") |
| } |
| getg().racectx = getg().m.curg.racectx |
| // (This defer should get open-coded, which is safe on |
| // the system stack.) |
| defer func() { getg().racectx = 0 }() |
| } |
| |
| // This function must not allocate while holding trace.lock: |
| // allocation can call into the heap allocator, which will try to emit a
| // trace event while holding the heap lock.
| lock(&trace.lock) |
| |
| if trace.reader.Load() != nil { |
| // More than one goroutine is reading the trace. This is bad.
| // But we'd rather not crash the program because of tracing,
| // since tracing can be enabled at runtime on production servers.
| unlock(&trace.lock) |
| println("runtime: ReadTrace called from multiple goroutines simultaneously") |
| return nil, false |
| } |
| // Recycle the old buffer. |
| if buf := trace.reading; buf != nil { |
| buf.link = trace.empty |
| trace.empty = buf |
| trace.reading = nil |
| } |
| // Write trace header. |
| if !trace.headerWritten { |
| trace.headerWritten = true |
| unlock(&trace.lock) |
| return []byte("go 1.23 trace\x00\x00\x00"), false |
| } |
| |
| // Read the next buffer. |
| |
| if trace.readerGen.Load() == 0 { |
| trace.readerGen.Store(1) |
| } |
| var gen uintptr |
| for { |
| assertLockHeld(&trace.lock) |
| gen = trace.readerGen.Load() |
| |
| // Check to see if we need to block for more data in this generation |
| // or if we need to move our generation forward. |
| if !trace.full[gen%2].empty() { |
| break |
| } |
| // Most of the time readerGen is one generation ahead of flushedGen, as the |
| // current generation is being read from. Then, once the last buffer is flushed |
| // into readerGen, flushedGen will rise to meet it. At this point, the tracer |
| // is waiting on the reader to finish flushing the last generation so that it |
| // can continue to advance. |
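| //
| // Roughly: while generation N is being written and read, readerGen == N and
| // flushedGen == N-1. traceAdvance flushes N's remaining buffers and sets
| // flushedGen = N; once full[N%2] drains here, we advance readerGen to
| // trace.gen and release doneSema[N%2] below.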
| if trace.flushedGen.Load() == gen { |
| if trace.shutdown.Load() { |
| unlock(&trace.lock) |
| |
| // Wake up anyone waiting for us to be done with this generation. |
| // |
| // Do this after reading trace.shutdown, because the thread we're |
| // waking up is going to clear trace.shutdown. |
| if raceenabled { |
| // Model synchronization on trace.doneSema, which the race
| // detector does not see. This is required to avoid false
| // race reports on the writer passed to trace.Start.
| racerelease(unsafe.Pointer(&trace.doneSema[gen%2])) |
| } |
| semrelease(&trace.doneSema[gen%2]) |
| |
| // We're shutting down, and the last generation is fully |
| // read. We're done. |
| return nil, false |
| } |
| // The previous gen has had all of its buffers flushed, and |
| // there's nothing else for us to read. Advance the generation |
| // we're reading from and try again. |
| trace.readerGen.Store(trace.gen.Load()) |
| unlock(&trace.lock) |
| |
| // Wake up anyone waiting for us to be done with this generation. |
| // |
| // Do this after reading gen to make sure we can't have the trace |
| // advance until we've read it. |
| if raceenabled { |
| // See comment above in the shutdown case. |
| racerelease(unsafe.Pointer(&trace.doneSema[gen%2])) |
| } |
| semrelease(&trace.doneSema[gen%2]) |
| |
| // Reacquire the lock and go back to the top of the loop. |
| lock(&trace.lock) |
| continue |
| } |
| // Wait for new data. |
| // |
| // We don't simply use a note because the scheduler |
| // executes this goroutine directly when it wakes up |
| // (also a note would consume an M). |
| // |
| // Before we drop the lock, clear the workAvailable flag. Work can |
| // only be queued with trace.lock held, so this is at least true until |
| // we drop the lock. |
| trace.workAvailable.Store(false) |
| unlock(&trace.lock) |
| return nil, true |
| } |
| // Pull a buffer. |
| tbuf := trace.full[gen%2].pop() |
| trace.reading = tbuf |
| unlock(&trace.lock) |
| return tbuf.arr[:tbuf.pos], false |
| } |
| |
| // traceReader returns the trace reader that should be woken up, if any. |
| // Callers should first check (traceEnabled() || traceShuttingDown()). |
| // |
| // This must run on the system stack because it acquires trace.lock. |
| // |
| //go:systemstack |
| func traceReader() *g { |
| gp := traceReaderAvailable() |
| if gp == nil || !trace.reader.CompareAndSwapNoWB(gp, nil) { |
| return nil |
| } |
| return gp |
| } |
| |
| // traceReaderAvailable returns the trace reader if it is not currently |
| // scheduled and should be. Callers should first check that |
| // (traceEnabled() || traceShuttingDown()) is true. |
| func traceReaderAvailable() *g { |
| // There are three conditions under which we definitely want to schedule |
| // the reader: |
| // - The reader is lagging behind in finishing off the last generation. |
| // In this case, trace buffers could even be empty, but the trace |
| // advancer will be waiting on the reader, so we have to make sure |
| // to schedule the reader ASAP. |
| // - The reader has pending work to process for its reader generation
| //   (assuming readerGen is not lagging behind).
| // - The trace is shutting down. The trace stopper blocks on the reader |
| // to finish, much like trace advancement. |
| // |
| // We also want to be careful not to schedule the reader if there's no |
| // reason to. |
| if trace.flushedGen.Load() == trace.readerGen.Load() || trace.workAvailable.Load() || trace.shutdown.Load() { |
| return trace.reader.Load() |
| } |
| return nil |
| } |
| |
| // Trace advancer goroutine. |
| var traceAdvancer traceAdvancerState |
| |
| type traceAdvancerState struct { |
| timer *wakeableSleep |
| done chan struct{} |
| } |
| |
| // start starts a new traceAdvancer. |
| func (s *traceAdvancerState) start() { |
| // Start a goroutine to periodically advance the trace generation. |
| s.done = make(chan struct{}) |
| s.timer = newWakeableSleep() |
| go func() { |
| for traceEnabled() { |
| // Set a timer to wake us up |
| s.timer.sleep(int64(debug.traceadvanceperiod)) |
| |
| // Try to advance the trace. |
| traceAdvance(false) |
| } |
| s.done <- struct{}{} |
| }() |
| } |
| |
| // stop stops a traceAdvancer and blocks until it exits. |
| func (s *traceAdvancerState) stop() { |
| s.timer.wake() |
| <-s.done |
| close(s.done) |
| s.timer.close() |
| } |
| |
| // defaultTraceAdvancePeriod is the approximate default period between
| // new generations.
| const defaultTraceAdvancePeriod = 1e9 // 1 second. |
| |
| // wakeableSleep manages a wakeable goroutine sleep. |
| // |
| // Users of this type must create it with newWakeableSleep and
| // call close to free up resources. Once close is called, the
| // wakeableSleep must not be reused; create a new one instead.
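| //
| // A rough usage sketch:
| //
| //	s := newWakeableSleep()
| //	// In the sleeping goroutine:
| //	s.sleep(int64(debug.traceadvanceperiod)) // returns after ns elapse or on wake
| //	// From another goroutine:
| //	s.wake()  // wake the sleeper early
| //	s.close() // once nothing is sleeping or will call wake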
| type wakeableSleep struct { |
| timer *timer |
| |
| // lock protects access to wakeup, but not send/recv on it. |
| lock mutex |
| wakeup chan struct{} |
| } |
| |
| // newWakeableSleep initializes a new wakeableSleep and returns it. |
| func newWakeableSleep() *wakeableSleep { |
| s := new(wakeableSleep) |
| lockInit(&s.lock, lockRankWakeableSleep) |
| s.wakeup = make(chan struct{}, 1) |
| s.timer = new(timer) |
| f := func(s any, _ uintptr, _ int64) { |
| s.(*wakeableSleep).wake() |
| } |
| s.timer.init(f, s) |
| return s |
| } |
| |
| // sleep sleeps for the provided duration in nanoseconds or until |
| // another goroutine calls wake. |
| // |
| // Must not be called by more than one goroutine at a time and |
| // must not be called concurrently with close. |
| func (s *wakeableSleep) sleep(ns int64) { |
| s.timer.reset(nanotime()+ns, 0) |
| lock(&s.lock) |
| if raceenabled { |
| raceacquire(unsafe.Pointer(&s.lock)) |
| } |
| wakeup := s.wakeup |
| if raceenabled { |
| racerelease(unsafe.Pointer(&s.lock)) |
| } |
| unlock(&s.lock) |
| <-wakeup |
| s.timer.stop() |
| } |
| |
| // wake awakens any goroutine sleeping on the timer. |
| // |
| // Safe for concurrent use with all other methods. |
| func (s *wakeableSleep) wake() { |
| // Grab the wakeup channel, which may be nil if we're |
| // racing with close. |
| lock(&s.lock) |
| if raceenabled { |
| raceacquire(unsafe.Pointer(&s.lock)) |
| } |
| if s.wakeup != nil { |
| // Non-blocking send. |
| // |
| // Others may also write to this channel and we don't |
| // want to block on the receiver waking up. This also |
| // effectively batches together wakeup notifications. |
| select { |
| case s.wakeup <- struct{}{}: |
| default: |
| } |
| } |
| if raceenabled { |
| racerelease(unsafe.Pointer(&s.lock)) |
| } |
| unlock(&s.lock) |
| } |
| |
| // close wakes any goroutine sleeping on the timer and prevents |
| // further sleeping on it. |
| // |
| // Once close is called, the wakeableSleep must no longer be used. |
| // |
| // It must only be called once no goroutine is sleeping on the |
| // timer *and* nothing else will call wake concurrently. |
| func (s *wakeableSleep) close() { |
| // Set wakeup to nil so that a late timer ends up being a no-op. |
| lock(&s.lock) |
| if raceenabled { |
| raceacquire(unsafe.Pointer(&s.lock)) |
| } |
| wakeup := s.wakeup |
| s.wakeup = nil |
| |
| // Close the channel. |
| close(wakeup) |
| |
| if raceenabled { |
| racerelease(unsafe.Pointer(&s.lock)) |
| } |
| unlock(&s.lock) |
| } |