// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Green Tea mark algorithm
//
// The core idea behind Green Tea is simple: achieve better locality during
// mark/scan by delaying scanning so that we can accumulate objects to scan
// within the same span, then scan the objects that have accumulated on the
// span all together.
//
// By batching objects this way, we increase the chance that adjacent objects
// will be accessed, amortize the cost of accessing object metadata, and create
// better opportunities for prefetching. We can take this even further and
// optimize the scan loop by size class (not yet completed) all the way to the
// point of applying SIMD techniques to really tear through the heap.
//
// Naturally, this depends on being able to create opportunities to batch objects
// together. The basic idea here is to have two sets of mark bits. One set is the
// regular set of mark bits ("marks"), while the other essentially says that the
// objects have been scanned already ("scans"). When we see a pointer for the first
// time we set its mark and enqueue its span. We track these spans in work queues
// with a FIFO policy, unlike workbufs which have a LIFO policy. Empirically, a
// FIFO policy appears to work best for accumulating objects to scan on a span.
// Later, when we dequeue the span, we compute both the union and the difference
// of the mark and scan bitsets. The union is then written back into the scan bits,
// while the difference tells us which objects need scanning, such that the GC
// is still precise.
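// For example, if a span's marks are 0b1011 and its scans are 0b0010 when the
// span is dequeued, the union 0b1011 becomes the new scan bits and the
// difference 0b1001 picks out the two objects that still need scanning.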
//
// Below is the bulk of the implementation, focusing on the worst case
// for locality, small objects. Specifically, those that are smaller than
// a few cache lines in size and whose metadata is stored the same way (at the
// end of the span).

//go:build goexperiment.greenteagc

package runtime

import (
"internal/goarch"
"internal/runtime/atomic"
"internal/runtime/gc"
"internal/runtime/gc/scan"
"internal/runtime/sys"
"unsafe"
)
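// doubleCheckGreenTea enables additional, expensive consistency checks
// (throws) in the Green Tea mark path.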
const doubleCheckGreenTea = false
// spanInlineMarkBits are mark bits that are inlined into the span
// itself. gcUsesSpanInlineMarkBits may be used to check if objects
// of a particular size use inline mark bits.
//
// Inline mark bits are a little bit more than just mark bits. They
// consist of two parts: scans and marks. Marks are like pre-mark
// bits. They're set once a pointer to an object is discovered for
// the first time. The marks allow us to scan many objects in bulk
// if we queue the whole span for scanning. Before we scan such objects
// in bulk, we copy the marks to the scans, computing a diff along the
// way. The resulting bitmap tells us which objects we should scan.
//
// The inlineMarkBits also hold state sufficient for scanning any
// object in the span, as well as state for acquiring ownership of
// the span for queuing. This avoids the need to look at the mspan when
// scanning.
type spanInlineMarkBits struct {
scans [63]uint8 // scanned bits.
owned spanScanOwnership // see the comment on spanScanOwnership.
marks [63]uint8 // mark bits.
class spanClass
}
// spanScanOwnership indicates whether some thread has acquired
// the span for scanning, and whether there has been one or more
// attempts to acquire the span. The latter information helps to
// fast-track span scans that only apply to a single mark, skipping
// the relatively costly merge-and-diff process for scans and marks
// by allowing one to just set the mark directly.
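//
// For example, the first mark of an unowned span moves owned from
// spanScanUnowned to spanScanOneMark and obliges the marker to queue the span;
// subsequent marks move it to spanScanManyMark without re-queuing; release
// swaps owned back to spanScanUnowned and reports which upper bound was reached.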
type spanScanOwnership uint8
const (
spanScanUnowned spanScanOwnership = 0 // Indicates the span is not acquired for scanning.
spanScanOneMark = 1 << iota // Indicates that only one mark bit is set relative to the scan bits.
spanScanManyMark // Indicates one or more mark bits may be set relative to the scan bits.
// "ManyMark" need not be exactly the value it has. In practice we just
// want to distinguish "none" from "one" from "many," so a comparison is
// sufficient (as opposed to a bit test) to check between these cases.
)
// load atomically loads from a pointer to a spanScanOwnership.
func (o *spanScanOwnership) load() spanScanOwnership {
return spanScanOwnership(atomic.Load8((*uint8)(unsafe.Pointer(o))))
}
func (o *spanScanOwnership) or(v spanScanOwnership) spanScanOwnership {
// N.B. We round down the address and use Or32 because Or8 doesn't
// return a result, and the returned old value is strictly necessary for this protocol.
//
// Making Or8 return a result, while making the code look nicer, would
// not be strictly better on any supported platform, as an Or8 that
// returns a result is not a common instruction. On many platforms it
// would be implemented exactly as it is here, and since Or8 is
// exclusively used in the runtime and a hot function, we want to keep
// using its no-result version elsewhere for performance.
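//
// For example, on a little-endian system the owned field sits at offset 63 of
// the 128-byte-aligned spanInlineMarkBits, so its address has both low bits
// set: o32 points 3 bytes back and off is 24, placing v in the top byte of
// the 32-bit word.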
o32 := (*uint32)(unsafe.Pointer(uintptr(unsafe.Pointer(o)) &^ 0b11))
off := (uintptr(unsafe.Pointer(o)) & 0b11) * 8
if goarch.BigEndian {
off = 32 - off - 8
}
return spanScanOwnership(atomic.Or32(o32, uint32(v)<<off) >> off)
}
func (imb *spanInlineMarkBits) init(class spanClass, needzero bool) {
if imb == nil {
// This nil check and throw is almost pointless. Normally we would
// expect imb to never be nil. However, this is called on potentially
// freshly-allocated virtual memory. As of 2025, the compiler-inserted
// nil check is not a branch but a memory read that we expect to fault
// if the pointer really is nil.
//
// However, this causes a read of the page, and operating systems may
// take it as a hint to back the accessed memory with a read-only zero
// page. Since we immediately write to this memory, the operating system
// then has to update the page table and flush the TLB, causing a lot of
// churn for programs that are short-lived and monotonically grow in size.
//
// This nil check is thus an explicit branch instead of what the compiler
// would insert circa 2025, which is a memory read instruction.
//
// See go.dev/issue/74375 for details.
throw("runtime: span inline mark bits nil?")
}
if needzero {
// Use memclrNoHeapPointers to avoid having the compiler make a worse
// decision. We know that imb is both aligned and a nice power-of-two
// size that works well for wider SIMD instructions. The compiler likely
// has no idea that imb is aligned to 128 bytes.
memclrNoHeapPointers(unsafe.Pointer(imb), unsafe.Sizeof(spanInlineMarkBits{}))
}
imb.class = class
}
// tryAcquire attempts to acquire the span for scanning. On success, the caller
// must queue the span for scanning or scan the span immediately.
func (imb *spanInlineMarkBits) tryAcquire() bool {
switch imb.owned.load() {
case spanScanUnowned:
// Try to mark the span as having only one object marked.
if imb.owned.or(spanScanOneMark) == spanScanUnowned {
return true
}
// If we didn't see an old value of spanScanUnowned, then we must
// have raced with someone else and seen spanScanOneMark or greater.
// Fall through and try to set spanScanManyMark.
fallthrough
case spanScanOneMark:
// We may be the first to set *any* bit on owned. In such a case,
// we still need to make sure the span is queued.
return imb.owned.or(spanScanManyMark) == spanScanUnowned
}
return false
}
// release releases the span for scanning, allowing another thread to queue the span.
//
// Returns an upper bound on the number of mark bits set since the span was queued. The
// upper bound is described as "one" (spanScanOneMark) or "many" (spanScanManyMark, with or
// without spanScanOneMark). If the return value indicates only one mark bit was set, the
// caller can be certain that it was the same mark bit that caused the span to get queued.
// Note that this is *only* an upper bound. In particular, it may still
// turn out that only one mark bit was set, even if the return value indicates "many".
func (imb *spanInlineMarkBits) release() spanScanOwnership {
return spanScanOwnership(atomic.Xchg8((*uint8)(unsafe.Pointer(&imb.owned)), uint8(spanScanUnowned)))
}
// spanInlineMarkBitsFromBase returns the spanInlineMarkBits for a span whose start address is base.
//
// The span must be gcUsesSpanInlineMarkBits(span.elemsize).
func spanInlineMarkBitsFromBase(base uintptr) *spanInlineMarkBits {
return (*spanInlineMarkBits)(unsafe.Pointer(base + gc.PageSize - unsafe.Sizeof(spanInlineMarkBits{})))
}
// initInlineMarkBits initializes the inlineMarkBits stored at the end of the span.
func (s *mspan) initInlineMarkBits() {
if doubleCheckGreenTea && !gcUsesSpanInlineMarkBits(s.elemsize) {
throw("expected span with inline mark bits")
}
// Zeroing is only necessary if this span wasn't just freshly allocated from the OS.
s.inlineMarkBits().init(s.spanclass, s.needzero != 0)
}
// moveInlineMarks merges the span's inline mark bits into dst and clears them.
//
// gcUsesSpanInlineMarkBits(s.elemsize) must be true.
func (s *mspan) moveInlineMarks(dst *gcBits) {
if doubleCheckGreenTea && !gcUsesSpanInlineMarkBits(s.elemsize) {
throw("expected span with inline mark bits")
}
bytes := divRoundUp(uintptr(s.nelems), 8)
imb := s.inlineMarkBits()
imbMarks := (*gc.ObjMask)(unsafe.Pointer(&imb.marks))
for i := uintptr(0); i < bytes; i += goarch.PtrSize {
marks := bswapIfBigEndian(imbMarks[i/goarch.PtrSize])
if i/goarch.PtrSize == uintptr(len(imb.marks)+1)/goarch.PtrSize-1 {
marks &^= 0xff << ((goarch.PtrSize - 1) * 8) // mask out class
}
*(*uintptr)(unsafe.Pointer(dst.bytep(i))) |= bswapIfBigEndian(marks)
}
if doubleCheckGreenTea && !s.spanclass.noscan() && imb.marks != imb.scans {
throw("marks don't match scans for span with pointer")
}
// Reset the inline mark bits.
imb.init(s.spanclass, true /* We know these bits are always dirty now. */)
}
// inlineMarkBits returns the inline mark bits for the span.
//
// gcUsesSpanInlineMarkBits(s.elemsize) must be true.
func (s *mspan) inlineMarkBits() *spanInlineMarkBits {
if doubleCheckGreenTea && !gcUsesSpanInlineMarkBits(s.elemsize) {
throw("expected span with inline mark bits")
}
return spanInlineMarkBitsFromBase(s.base())
}
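// markBitsForIndex returns the markBits for the object at objIndex in s,
// using the span's inline mark bits if it has them, and s.gcmarkBits otherwise.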
func (s *mspan) markBitsForIndex(objIndex uintptr) (bits markBits) {
if gcUsesSpanInlineMarkBits(s.elemsize) {
bits.bytep = &s.inlineMarkBits().marks[objIndex/8]
} else {
bits.bytep = s.gcmarkBits.bytep(objIndex / 8)
}
bits.mask = uint8(1) << (objIndex % 8)
bits.index = objIndex
return
}
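// markBitsForBase returns the markBits for the object at the base of the
// span (object index 0).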
func (s *mspan) markBitsForBase() markBits {
if gcUsesSpanInlineMarkBits(s.elemsize) {
return markBits{&s.inlineMarkBits().marks[0], uint8(1), 0}
}
return markBits{&s.gcmarkBits.x, uint8(1), 0}
}
// scannedBitsForIndex returns a markBits representing the scanned bit
// for objIndex in the inline mark bits.
func (s *mspan) scannedBitsForIndex(objIndex uintptr) markBits {
return markBits{&s.inlineMarkBits().scans[objIndex/8], uint8(1) << (objIndex % 8), objIndex}
}
// gcUsesSpanInlineMarkBits returns true if a span holding objects of a certain size
// has inline mark bits. size must be the span's elemsize.
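// On 64-bit platforms this corresponds, roughly, to object sizes from 16 bytes
// up to the heapBitsInSpan limit of 512 bytes.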
//
// nosplit because this is called from gcmarknewobject, which is nosplit.
//
//go:nosplit
func gcUsesSpanInlineMarkBits(size uintptr) bool {
return heapBitsInSpan(size) && size >= 16
}
// tryDeferToSpanScan tries to queue p on the span it points to, if it
// points to a small object span (gcUsesSpanInlineMarkBits size).
func tryDeferToSpanScan(p uintptr, gcw *gcWork) bool {
if useCheckmark {
return false
}
// Quickly check whether this points into a span that has inline mark bits.
ha := heapArenaOf(p)
if ha == nil {
return false
}
pageIdx := ((p / pageSize) / 8) % uintptr(len(ha.pageInUse))
pageMask := byte(1 << ((p / pageSize) % 8))
if ha.pageUseSpanInlineMarkBits[pageIdx]&pageMask == 0 {
return false
}
// Find the object's index from the span class info stored in the inline mark bits.
base := alignDown(p, gc.PageSize)
q := spanInlineMarkBitsFromBase(base)
objIndex := uint16((uint64(p-base) * uint64(gc.SizeClassToDivMagic[q.class.sizeclass()])) >> 32)
// Set mark bit.
idx, mask := objIndex/8, uint8(1)<<(objIndex%8)
if atomic.Load8(&q.marks[idx])&mask != 0 {
return true
}
atomic.Or8(&q.marks[idx], mask)
// Fast-track noscan objects.
if q.class.noscan() {
gcw.bytesMarked += uint64(gc.SizeClassToSize[q.class.sizeclass()])
return true
}
// Queue up the pointer (as a representative for its span).
if q.tryAcquire() {
if gcw.spanq.put(makeObjPtr(base, objIndex)) {
if gcphase == _GCmark {
// This is intentionally racy; the bit set here might get
// stomped on by a stealing P. See the comment in tryStealSpan
// for an explanation as to why this is OK.
if !work.spanqMask.read(uint32(gcw.id)) {
work.spanqMask.set(gcw.id)
}
gcw.mayNeedWorker = true
}
gcw.flushedWork = true
}
}
return true
}
// tryGetSpanFast attempts to get an entire span to scan from the local ring only.
func (w *gcWork) tryGetSpanFast() objptr {
return w.spanq.tryGetFast()
}
// tryGetSpan attempts to get an entire span to scan.
func (w *gcWork) tryGetSpan() objptr {
if s := w.spanq.tryGetFast(); s != 0 {
return s
}
// "Steal" from ourselves.
if s := w.spanq.steal(&w.spanq); s != 0 {
return s
}
// We failed to get any local work, so we're fresh out.
// Nobody else is going to add work for us. Clear our bit.
if work.spanqMask.read(uint32(w.id)) {
work.spanqMask.clear(w.id)
}
return 0
}
// spanQueue is a P-local stealable span queue.
type spanQueue struct {
// head, tail, and ring represent a local non-thread-safe ring buffer.
head, tail uint32
ring [256]objptr
// putsSinceDrain counts the number of put calls since the last drain.
putsSinceDrain int
// chain contains state visible to other Ps.
//
// In particular, that means a linked chain of single-producer multi-consumer
// ring buffers where the single producer is this P only.
//
// This linked chain structure is based off the sync.Pool dequeue.
chain struct {
// head is the spanSPMC to put to. This is only accessed
// by the producer, so doesn't need to be synchronized.
head *spanSPMC
// tail is the spanSPMC to steal from. This is accessed
// by consumers, so reads and writes must be atomic.
tail atomic.UnsafePointer // *spanSPMC
}
}
// putFast tries to put s onto the queue, but may fail if it's full.
func (q *spanQueue) putFast(s objptr) (ok bool) {
if q.tail-q.head == uint32(len(q.ring)) {
return false
}
q.ring[q.tail%uint32(len(q.ring))] = s
q.tail++
return true
}
// put puts s onto the queue.
//
// Returns whether the caller should spin up a new worker.
func (q *spanQueue) put(s objptr) bool {
// The constants below define the period of and volume of
// spans we spill to the spmc chain when the local queue is
// not full.
//
// spillPeriod must be > spillMax, otherwise the spill period sets the
// effective maximum size of our local span queue. Even if
// we have a span ring of size N, if we flush K spans every
// K puts, then K becomes our effective maximum length. When
// spillPeriod > spillMax, we're always spilling spans
// at a slower rate than we're accumulating them.
const (
// spillPeriod defines how often to check if we should
// spill some spans, counted in the number of calls to put.
spillPeriod = 64
// spillMax defines, at most, how many spans to drain with
// each spill.
spillMax = 16
)
if q.putFast(s) {
// Occasionally try to spill some work to generate parallelism.
q.putsSinceDrain++
if q.putsSinceDrain >= spillPeriod {
// Reset even if we don't drain, so we don't check every time.
q.putsSinceDrain = 0
// Try to drain some spans. Don't bother if there are very
// few of them or there are already spans in the spmc chain.
n := min((q.tail-q.head)/2, spillMax)
if n > 4 && q.chainEmpty() {
q.drain(n)
return true
}
}
return false
}
// We're out of space. Drain out our local spans.
q.drain(uint32(len(q.ring)) / 2)
if !q.putFast(s) {
throw("failed putFast after drain")
}
return true
}
// flush publishes all spans in the local queue to the spmc chain.
func (q *spanQueue) flush() {
n := q.tail - q.head
if n == 0 {
return
}
q.drain(n)
}
// empty returns true if there's no more work on the queue.
//
// Not thread-safe. Must only be called by the owner of q.
func (q *spanQueue) empty() bool {
// Check the local queue for work.
if q.tail-q.head > 0 {
return false
}
return q.chainEmpty()
}
// chainEmpty returns true if the spmc chain is empty.
//
// Thread-safe.
func (q *spanQueue) chainEmpty() bool {
// Check the rest of the rings for work.
r := (*spanSPMC)(q.chain.tail.Load())
for r != nil {
if !r.empty() {
return false
}
r = (*spanSPMC)(r.prev.Load())
}
return true
}
// drain publishes n spans from the local queue to the spmc chain.
func (q *spanQueue) drain(n uint32) {
q.putsSinceDrain = 0
if q.chain.head == nil {
// N.B. We target 1024, but this may be bigger if the physical
// page size is bigger, or if we can fit more uintptrs into a
// physical page. See newSpanSPMC docs.
r := newSpanSPMC(1024)
q.chain.head = r
q.chain.tail.StoreNoWB(unsafe.Pointer(r))
}
// Try to drain some of the queue to the head spmc.
if q.tryDrain(q.chain.head, n) {
return
}
// No space. Create a bigger spmc and add it to the chain.
// Double the size of the next one, up to a maximum.
//
// We double each time so we can avoid taking this slow path
// in the future, which involves a global lock. Ideally we want
// to hit a steady-state where the deepest any queue goes during
// a mark phase can fit in the ring.
//
// However, we still set a maximum on this. We set the maximum
// to something large to amortize the cost of lock acquisition, but
// still at a reasonable size for big heaps and/or a lot of Ps (which
// tend to be correlated).
//
// It's not too bad to burn relatively large-but-fixed amounts of per-P
// memory if we need to deal with really, really deep queues, since the
// constants of proportionality are small. Simultaneously, we want to
// avoid a situation where a single worker ends up queuing O(heap)
// work and then forever retains a queue of that size.
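// On 64-bit platforms, maxCap works out to 128Ki entries, i.e. the ring's
// backing memory is capped at 1 MiB.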
const maxCap = 1 << 20 / goarch.PtrSize
newCap := q.chain.head.cap * 2
if newCap > maxCap {
newCap = maxCap
}
newHead := newSpanSPMC(newCap)
if !q.tryDrain(newHead, n) {
throw("failed to put span on newly-allocated spanSPMC")
}
q.chain.head.prev.StoreNoWB(unsafe.Pointer(newHead))
q.chain.head = newHead
}
// tryDrain attempts to drain n spans from q's local queue into r.
//
// Returns whether it succeeded.
func (q *spanQueue) tryDrain(r *spanSPMC, n uint32) bool {
if q.head+n > q.tail {
throw("attempt to drain too many elements")
}
h := r.head.Load() // synchronize with consumers
t := r.tail.Load()
rn := t - h
if rn+n <= r.cap {
for i := uint32(0); i < n; i++ {
*r.slot(t + i) = q.ring[(q.head+i)%uint32(len(q.ring))]
}
r.tail.Store(t + n) // Makes the items avail for consumption.
q.head += n
return true
}
return false
}
// tryGetFast attempts to get a span from the local queue, but may fail if it's empty,
// returning a zero objptr.
func (q *spanQueue) tryGetFast() objptr {
if q.tail-q.head == 0 {
return 0
}
s := q.ring[q.head%uint32(len(q.ring))]
q.head++
return s
}
// steal takes some spans from the ring chain of another span queue.
//
// q == q2 is OK.
func (q *spanQueue) steal(q2 *spanQueue) objptr {
r := (*spanSPMC)(q2.chain.tail.Load())
if r == nil {
return 0
}
for {
// It's important that we load the next pointer
// *before* popping the tail. In general, r may be
// transiently empty, but if next is non-nil before
// the pop and the pop fails, then r is permanently
// empty, which is the only condition under which it's
// safe to drop r from the chain.
r2 := (*spanSPMC)(r.prev.Load())
// Try to refill from one of the rings
if s := q.refill(r); s != 0 {
return s
}
if r2 == nil {
// This is the only ring. It's empty right
// now, but could be pushed to in the future.
return 0
}
// The tail of the chain has been drained, so move on
// to the next ring. Try to drop it from the chain
// so the next consumer doesn't have to look at the empty
// ring again.
if q2.chain.tail.CompareAndSwapNoWB(unsafe.Pointer(r), unsafe.Pointer(r2)) {
r.dead.Store(true)
}
r = r2
}
}
// refill takes some spans from r and puts them into q's local queue.
//
// One span is removed from the stolen spans and returned on success.
// Failure to steal returns a zero objptr.
//
// refill is thread-safe with respect to r.
func (q *spanQueue) refill(r *spanSPMC) objptr {
if q.tail-q.head != 0 {
throw("steal with local work available")
}
// Steal some spans.
var n uint32
for {
h := r.head.Load() // load-acquire, synchronize with other consumers
t := r.tail.Load() // load-acquire, synchronize with the producer
n = t - h
n = n - n/2
if n == 0 {
return 0
}
if n > r.cap { // read inconsistent h and t
continue
}
n = min(n, uint32(len(q.ring)/2))
for i := uint32(0); i < n; i++ {
q.ring[i] = *r.slot(h + i)
}
if r.head.CompareAndSwap(h, h+n) {
break
}
}
// Update local queue head and tail to reflect new buffered values.
q.head = 0
q.tail = n
// Pop off the head of the queue and return it.
return q.tryGetFast()
}
// spanSPMC is a ring buffer of objptrs that represent spans.
// Accessed without a lock.
//
// Single-producer, multi-consumer. The only producer is the P that owns this
// queue, but any other P may consume from it.
//
// ## Invariants for memory management
//
// 1. All spanSPMCs are allocated from mheap_.spanSPMCAlloc.
// 2. All allocated spanSPMCs must be on the work.spanSPMCs list.
// 3. spanSPMCs may only be allocated if gcphase != _GCoff.
// 4. spanSPMCs may only be deallocated if gcphase == _GCoff.
//
// Invariants (3) and (4) ensure that we do not need to concern ourselves with
// tricky reuse issues that stem from not knowing when a thread is truly done
// with a spanSPMC. For example, two threads could load the same spanSPMC from
// the tail of the chain. One thread is then paused while the other steals the
// last few elements off of it. It's not safe to free at that point since the
// other thread will still inspect that spanSPMC, and we have no way of knowing
// without more complex and/or heavyweight synchronization.
//
// Instead, we rely on the global synchronization inherent to GC phases, and
// the fact that spanSPMCs are only ever used during the mark phase, to ensure
// memory safety. This means we temporarily waste some memory, but it's only
// until the end of the mark phase.
type spanSPMC struct {
_ sys.NotInHeap
// allnext is the link to the next spanSPMC on the work.spanSPMCs list.
// This is used to find and free dead spanSPMCs. Protected by
// work.spanSPMCs.lock.
allnext *spanSPMC
// dead indicates whether the spanSPMC is no longer in use.
// Protected by the CAS in spanQueue.steal that swings a chain's tail
// pointer past this spanSPMC. That is, whoever wins that CAS takes ownership
// of marking this spanSPMC as dead. See spanQueue.steal for details.
dead atomic.Bool
// prev is the next link up a spanQueue's SPMC chain, from tail to head,
// hence the name "prev." Set by a spanQueue's producer and read by
// consumers as they walk the chain in spanQueue.steal.
prev atomic.UnsafePointer // *spanSPMC
// head, tail, cap, and ring together represent a fixed-size SPMC lock-free
// ring buffer of size cap. The ring buffer contains objptr values.
head atomic.Uint32
tail atomic.Uint32
cap uint32 // cap(ring)
ring *objptr
}
// newSpanSPMC allocates and initializes a new spmc with the provided capacity.
//
// newSpanSPMC may override the capacity with a larger one if the provided one would
// waste memory.
func newSpanSPMC(cap uint32) *spanSPMC {
lock(&work.spanSPMCs.lock)
r := (*spanSPMC)(mheap_.spanSPMCAlloc.alloc())
r.allnext = work.spanSPMCs.all
work.spanSPMCs.all = r
unlock(&work.spanSPMCs.lock)
// If cap < the capacity of a single physical page, round up.
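// For example, with 4 KiB physical pages and 8-byte pointers, pageCap is 512
// slots, so smaller requested capacities are rounded up to 512.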
pageCap := uint32(physPageSize / goarch.PtrSize) // capacity of a single page
if cap < pageCap {
cap = pageCap
}
if cap&(cap-1) != 0 {
throw("spmc capacity must be a power of 2")
}
r.cap = cap
ring := sysAlloc(uintptr(cap)*unsafe.Sizeof(objptr(0)), &memstats.gcMiscSys, "GC span queue")
atomic.StorepNoWB(unsafe.Pointer(&r.ring), ring)
return r
}
// empty returns true if the spmc is empty.
//
// empty is thread-safe.
func (r *spanSPMC) empty() bool {
h := r.head.Load()
t := r.tail.Load()
return t == h
}
// deinit frees any resources the spanSPMC is holding onto and zeroes it.
func (r *spanSPMC) deinit() {
sysFree(unsafe.Pointer(r.ring), uintptr(r.cap)*unsafe.Sizeof(objptr(0)), &memstats.gcMiscSys)
r.ring = nil
r.dead.Store(false)
r.prev.StoreNoWB(nil)
r.head.Store(0)
r.tail.Store(0)
r.cap = 0
}
// slot returns a pointer to slot i%r.cap.
func (r *spanSPMC) slot(i uint32) *objptr {
idx := uintptr(i & (r.cap - 1))
return (*objptr)(unsafe.Add(unsafe.Pointer(r.ring), idx*unsafe.Sizeof(objptr(0))))
}
// freeSomeSpanSPMCs frees some spanSPMCs back to the OS and returns
// true if it should be called again to free more.
func freeSomeSpanSPMCs(preemptible bool) bool {
// TODO(mknyszek): This is arbitrary, but some kind of limit is necessary
// to help bound delays to cooperatively preempt ourselves.
const batchSize = 64
// According to the SPMC memory management invariants, we can only free
// spanSPMCs outside of the mark phase. We ensure we do this in two ways.
//
// 1. We take the work.spanSPMCs lock, which we need anyway. This ensures
// that we are non-preemptible. If this path becomes lock-free, we will
// need to become non-preemptible in some other way.
// 2. Once we are non-preemptible, we check the gcphase, and back out if
// it's not safe.
//
// This way, we ensure that we don't start freeing if we're in the wrong
// phase, and the phase can't change on us while we're freeing.
lock(&work.spanSPMCs.lock)
if gcphase != _GCoff || work.spanSPMCs.all == nil {
unlock(&work.spanSPMCs.lock)
return false
}
rp := &work.spanSPMCs.all
gp := getg()
more := true
for i := 0; i < batchSize && !(preemptible && gp.preempt); i++ {
r := *rp
if r == nil {
more = false
break
}
if r.dead.Load() {
// It's dead. Deinitialize and free it.
*rp = r.allnext
r.deinit()
mheap_.spanSPMCAlloc.free(unsafe.Pointer(r))
} else {
// Still alive, likely in some P's chain.
// Skip it.
rp = &r.allnext
}
}
unlock(&work.spanSPMCs.lock)
return more
}
// tryStealSpan attempts to steal a span from another P's local queue.
//
// Returns a non-zero objptr on success.
func (w *gcWork) tryStealSpan() objptr {
pp := getg().m.p.ptr()
for enum := stealOrder.start(cheaprand()); !enum.done(); enum.next() {
if !work.spanqMask.read(enum.position()) {
continue
}
p2 := allp[enum.position()]
if pp == p2 {
continue
}
if s := w.spanq.steal(&p2.gcw.spanq); s != 0 {
return s
}
// N.B. This is intentionally racy. We may stomp on a mask set by
// a P that just put a bunch of work into its local queue.
//
// This is OK because the ragged barrier in gcMarkDone will set
// the bit on each P if there's local work we missed. This race
// should generally be rare, since the window between noticing
// an empty local queue and this bit being set is quite small.
work.spanqMask.clear(int32(enum.position()))
}
return 0
}
// objptr consists of a span base and the index of the object in the span.
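//
// The span base is always aligned to the page size, so the low gc.PageShift
// bits are free to hold the object index. For example, with 8 KiB GC pages the
// index occupies the low 13 bits and the base occupies the rest.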
type objptr uintptr
// makeObjPtr creates an objptr from a span base address and an object index.
func makeObjPtr(spanBase uintptr, objIndex uint16) objptr {
if doubleCheckGreenTea && spanBase&((1<<gc.PageShift)-1) != 0 {
throw("created objptr with address that is incorrectly aligned")
}
return objptr(spanBase | uintptr(objIndex))
}
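// spanBase returns the base address of the span encoded in p.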
func (p objptr) spanBase() uintptr {
return uintptr(p) &^ ((1 << gc.PageShift) - 1)
}
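// objIndex returns the index, within its span, of the object encoded in p.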
func (p objptr) objIndex() uint16 {
return uint16(p) & ((1 << gc.PageShift) - 1)
}
// scanSpan greys the objects indicated by marks&^scans, then scans those objects,
// queuing the resulting pointers into gcw.
func scanSpan(p objptr, gcw *gcWork) {
spanBase := p.spanBase()
imb := spanInlineMarkBitsFromBase(spanBase)
spanclass := imb.class
if spanclass.noscan() {
throw("noscan object in scanSpan")
}
elemsize := uintptr(gc.SizeClassToSize[spanclass.sizeclass()])
// Release span.
if imb.release() == spanScanOneMark {
// Nobody else set any mark bits on this span while it was acquired.
// That means p is the sole object we need to handle. Fast-track it.
objIndex := p.objIndex()
bytep := &imb.scans[objIndex/8]
mask := uint8(1) << (objIndex % 8)
if atomic.Load8(bytep)&mask != 0 {
return
}
atomic.Or8(bytep, mask)
gcw.bytesMarked += uint64(elemsize)
if debug.gctrace > 1 {
gcw.stats[spanclass.sizeclass()].sparseObjsScanned++
}
b := spanBase + uintptr(objIndex)*elemsize
scanObjectSmall(spanBase, b, elemsize, gcw)
return
}
// Compute nelems.
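//
// As a rough example, assuming 8 KiB pages and 16-byte objects on 64-bit:
// the usable span size is 8192 bytes minus 128 bytes of inline mark bits and
// 128 bytes of pointer bitmap, i.e. 7936 bytes, so nelems comes out to 496.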
divMagic := uint64(gc.SizeClassToDivMagic[spanclass.sizeclass()])
usableSpanSize := uint64(gc.PageSize - unsafe.Sizeof(spanInlineMarkBits{}))
if !spanclass.noscan() {
usableSpanSize -= gc.PageSize / goarch.PtrSize / 8
}
nelems := uint16((usableSpanSize * divMagic) >> 32)
// Grey objects and return if there's nothing else to do.
var toScan gc.ObjMask
objsMarked := spanSetScans(spanBase, nelems, imb, &toScan)
if objsMarked == 0 {
return
}
gcw.bytesMarked += uint64(objsMarked) * uint64(elemsize)
// Check if we have enough marked objects to make a dense scan of the
// whole span worthwhile. If not, just do what scanobject does, but
// localized to the span, using toScan to pick out the marked objects.
if !scan.HasFastScanSpanPacked() || objsMarked < int(nelems/8) {
if debug.gctrace > 1 {
gcw.stats[spanclass.sizeclass()].spansSparseScanned++
gcw.stats[spanclass.sizeclass()].spanObjsSparseScanned += uint64(objsMarked)
}
scanObjectsSmall(spanBase, elemsize, nelems, gcw, &toScan)
return
}
// Scan the span.
//
// N.B. Use gcw.ptrBuf as the output buffer. This is a bit different
// from scanObjectsSmall, which fills gcw.ptrBuf with addresses to dereference.
// ScanSpanPacked, on the other hand, fills gcw.ptrBuf with already-dereferenced
// pointers.
nptrs := scan.ScanSpanPacked(
unsafe.Pointer(spanBase),
&gcw.ptrBuf[0],
&toScan,
uintptr(spanclass.sizeclass()),
spanPtrMaskUnsafe(spanBase),
)
gcw.heapScanWork += int64(objsMarked) * int64(elemsize)
if debug.gctrace > 1 {
// Write down some statistics.
gcw.stats[spanclass.sizeclass()].spansDenseScanned++
gcw.stats[spanclass.sizeclass()].spanObjsDenseScanned += uint64(objsMarked)
}
// Process all the pointers we just got.
for _, p := range gcw.ptrBuf[:nptrs] {
if !tryDeferToSpanScan(p, gcw) {
if obj, span, objIndex := findObject(p, 0, 0); obj != 0 {
greyobject(obj, 0, 0, span, gcw, objIndex)
}
}
}
}
// spanSetScans sets any unset scan bits that have their corresponding mark bits
// set in the inline mark bits.
//
// toScan is populated with bits indicating which objects were newly marked for
// scanning, i.e. had their mark bit set but not their scan bit.
//
// Returns the number of objects newly marked, which could be zero.
func spanSetScans(spanBase uintptr, nelems uint16, imb *spanInlineMarkBits, toScan *gc.ObjMask) int {
arena, pageIdx, pageMask := pageIndexOf(spanBase)
if arena.pageMarks[pageIdx]&pageMask == 0 {
atomic.Or8(&arena.pageMarks[pageIdx], pageMask)
}
bytes := divRoundUp(uintptr(nelems), 8)
objsMarked := 0
// Careful: these two structures alias since ObjMask is much bigger
// than marks or scans. We do these unsafe shenanigans so that we can
// access the marks and scans by uintptrs rather than by byte.
imbMarks := (*gc.ObjMask)(unsafe.Pointer(&imb.marks))
imbScans := (*gc.ObjMask)(unsafe.Pointer(&imb.scans))
// Iterate over the bits one uintptr-sized chunk at a time, computing both
// the union and difference of marks and scans. Store the union
// into scans, and the difference into toScan.
for i := uintptr(0); i < bytes; i += goarch.PtrSize {
scans := atomic.Loaduintptr(&imbScans[i/goarch.PtrSize])
marks := imbMarks[i/goarch.PtrSize]
scans = bswapIfBigEndian(scans)
marks = bswapIfBigEndian(marks)
if i/goarch.PtrSize == uintptr(len(imb.marks)+1)/goarch.PtrSize-1 {
scans &^= 0xff << ((goarch.PtrSize - 1) * 8) // mask out owned
marks &^= 0xff << ((goarch.PtrSize - 1) * 8) // mask out class
}
toGrey := marks &^ scans
toScan[i/goarch.PtrSize] = toGrey
// If there's anything left to grey, do it.
if toGrey != 0 {
toGrey = bswapIfBigEndian(toGrey)
if goarch.PtrSize == 4 {
atomic.Or32((*uint32)(unsafe.Pointer(&imbScans[i/goarch.PtrSize])), uint32(toGrey))
} else {
atomic.Or64((*uint64)(unsafe.Pointer(&imbScans[i/goarch.PtrSize])), uint64(toGrey))
}
}
objsMarked += sys.OnesCount64(uint64(toGrey))
}
return objsMarked
}
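// scanObjectSmall scans the single object of size objSize at address b in the
// span whose base address is spanBase, queuing any heap pointers it finds
// into gcw.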
func scanObjectSmall(spanBase, b, objSize uintptr, gcw *gcWork) {
ptrBits := heapBitsSmallForAddrInline(spanBase, b, objSize)
gcw.heapScanWork += int64(sys.Len64(uint64(ptrBits)) * goarch.PtrSize)
nptrs := 0
n := sys.OnesCount64(uint64(ptrBits))
for range n {
k := sys.TrailingZeros64(uint64(ptrBits))
ptrBits &^= 1 << k
addr := b + uintptr(k)*goarch.PtrSize
// Prefetch addr since we're about to use it. This point for prefetching
// was chosen empirically.
sys.Prefetch(addr)
// N.B. ptrBuf is always large enough to hold pointers for an entire 1-page span.
gcw.ptrBuf[nptrs] = addr
nptrs++
}
// Process all the pointers we just got.
for _, p := range gcw.ptrBuf[:nptrs] {
p = *(*uintptr)(unsafe.Pointer(p))
if p == 0 {
continue
}
if !tryDeferToSpanScan(p, gcw) {
if obj, span, objIndex := findObject(p, 0, 0); obj != 0 {
greyobject(obj, 0, 0, span, gcw, objIndex)
}
}
}
}
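// scanObjectsSmall scans every object of size objSize whose bit is set in
// scans, in the span whose base address is base, queuing any heap pointers
// found into gcw.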
func scanObjectsSmall(base, objSize uintptr, elems uint16, gcw *gcWork, scans *gc.ObjMask) {
nptrs := 0
for i, bits := range scans {
if i*(goarch.PtrSize*8) > int(elems) {
break
}
n := sys.OnesCount64(uint64(bits))
for range n {
j := sys.TrailingZeros64(uint64(bits))
bits &^= 1 << j
b := base + uintptr(i*(goarch.PtrSize*8)+j)*objSize
ptrBits := heapBitsSmallForAddrInline(base, b, objSize)
gcw.heapScanWork += int64(sys.Len64(uint64(ptrBits)) * goarch.PtrSize)
n := sys.OnesCount64(uint64(ptrBits))
for range n {
k := sys.TrailingZeros64(uint64(ptrBits))
ptrBits &^= 1 << k
addr := b + uintptr(k)*goarch.PtrSize
// Prefetch addr since we're about to use it. This point for prefetching
// was chosen empirically.
sys.Prefetch(addr)
// N.B. ptrBuf is always large enough to hold pointers for an entire 1-page span.
gcw.ptrBuf[nptrs] = addr
nptrs++
}
}
}
// Process all the pointers we just got.
for _, p := range gcw.ptrBuf[:nptrs] {
p = *(*uintptr)(unsafe.Pointer(p))
if p == 0 {
continue
}
if !tryDeferToSpanScan(p, gcw) {
if obj, span, objIndex := findObject(p, 0, 0); obj != 0 {
greyobject(obj, 0, 0, span, gcw, objIndex)
}
}
}
}
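// heapBitsSmallForAddrInline returns the pointer/scalar bits for the object
// of size elemsize at addr, read from the heap bitmap of the 1-page span
// starting at spanBase.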
func heapBitsSmallForAddrInline(spanBase, addr, elemsize uintptr) uintptr {
hbitsBase, _ := spanHeapBitsRange(spanBase, gc.PageSize, elemsize)
hbits := (*byte)(unsafe.Pointer(hbitsBase))
// These objects are always small enough that their bitmaps
// fit in a single word, so just load the word or two we need.
//
// Mirrors mspan.writeHeapBitsSmall.
//
// We should be using heapBits(), but unfortunately it introduces
// bounds-check panics and throws, which cause us to exceed
// the nosplit limit in quite a few cases.
i := (addr - spanBase) / goarch.PtrSize / ptrBits
j := (addr - spanBase) / goarch.PtrSize % ptrBits
bits := elemsize / goarch.PtrSize
word0 := (*uintptr)(unsafe.Pointer(addb(hbits, goarch.PtrSize*(i+0))))
word1 := (*uintptr)(unsafe.Pointer(addb(hbits, goarch.PtrSize*(i+1))))
var read uintptr
if j+bits > ptrBits {
// Two reads.
bits0 := ptrBits - j
bits1 := bits - bits0
read = *word0 >> j
read |= (*word1 & ((1 << bits1) - 1)) << bits0
} else {
// One read.
read = (*word0 >> j) & ((1 << bits) - 1)
}
return read
}
// spanPtrMaskUnsafe returns the pointer mask for a span with inline mark bits.
//
// The caller must ensure spanBase is the base of a span that:
//   - is 1 page in size,
//   - uses inline mark bits, and
//   - contains pointers.
func spanPtrMaskUnsafe(spanBase uintptr) *gc.PtrMask {
base := spanBase + gc.PageSize - unsafe.Sizeof(gc.PtrMask{}) - unsafe.Sizeof(spanInlineMarkBits{})
return (*gc.PtrMask)(unsafe.Pointer(base))
}
type sizeClassScanStats struct {
spansDenseScanned uint64 // Spans scanned with ScanSpanPacked.
spanObjsDenseScanned uint64 // Objects scanned with ScanSpanPacked.
spansSparseScanned uint64 // Spans scanned with scanObjectsSmall.
spanObjsSparseScanned uint64 // Objects scanned with scanObjectsSmall.
sparseObjsScanned uint64 // Objects scanned with scanobject or scanObjectSmall.
// Note: sparseObjsScanned is sufficient for both cases because
// a particular size class either uses scanobject or scanObjectSmall,
// not both. In the latter case, we also know that there was one
// object scanned per span, so no need for a span counter.
}
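// dumpScanStats prints the scan statistics accumulated in
// memstats.lastScanStats, first in aggregate and then per size class.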
func dumpScanStats() {
var (
spansDenseScanned uint64
spanObjsDenseScanned uint64
spansSparseScanned uint64
spanObjsSparseScanned uint64
sparseObjsScanned uint64
)
for _, stats := range memstats.lastScanStats {
spansDenseScanned += stats.spansDenseScanned
spanObjsDenseScanned += stats.spanObjsDenseScanned
spansSparseScanned += stats.spansSparseScanned
spanObjsSparseScanned += stats.spanObjsSparseScanned
sparseObjsScanned += stats.sparseObjsScanned
}
totalObjs := sparseObjsScanned + spanObjsSparseScanned + spanObjsDenseScanned
totalSpans := spansSparseScanned + spansDenseScanned
print("scan: total ", sparseObjsScanned, "+", spanObjsSparseScanned, "+", spanObjsDenseScanned, "=", totalObjs, " objs")
print(", ", spansSparseScanned, "+", spansDenseScanned, "=", totalSpans, " spans\n")
for i, stats := range memstats.lastScanStats {
if stats == (sizeClassScanStats{}) {
continue
}
totalObjs := stats.sparseObjsScanned + stats.spanObjsSparseScanned + stats.spanObjsDenseScanned
totalSpans := stats.spansSparseScanned + stats.spansDenseScanned
if i == 0 {
print("scan: class L ")
} else {
print("scan: class ", gc.SizeClassToSize[i], "B ")
}
print(stats.sparseObjsScanned, "+", stats.spanObjsSparseScanned, "+", stats.spanObjsDenseScanned, "=", totalObjs, " objs")
print(", ", stats.spansSparseScanned, "+", stats.spansDenseScanned, "=", totalSpans, " spans\n")
}
}
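// flushScanStats adds w's per-size-class scan statistics into dst and then
// clears them.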
func (w *gcWork) flushScanStats(dst *[gc.NumSizeClasses]sizeClassScanStats) {
for i := range w.stats {
dst[i].spansDenseScanned += w.stats[i].spansDenseScanned
dst[i].spanObjsDenseScanned += w.stats[i].spanObjsDenseScanned
dst[i].spansSparseScanned += w.stats[i].spansSparseScanned
dst[i].spanObjsSparseScanned += w.stats[i].spanObjsSparseScanned
dst[i].sparseObjsScanned += w.stats[i].sparseObjsScanned
}
clear(w.stats[:])
}
// gcMarkWorkAvailable reports whether there's any non-local work available to do.
//
// This is a heavyweight check and must only be used for correctness, not
// as a hint.
func gcMarkWorkAvailable() bool {
if !work.full.empty() {
return true // global work available
}
if work.markrootNext < work.markrootJobs {
return true // root scan work available
}
if work.spanqMask.any() {
return true // stealable local work available
}
return false
}
// scanObject scans the object starting at b, adding pointers to gcw.
// b must point to the beginning of a heap object or an oblet.
// scanObject consults the GC bitmap for the pointer mask and the
// spans for the size of the object.
//
// Used only for !gcUsesSpanInlineMarkBits spans, but supports all
// object sizes and is safe to be called on all heap objects.
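//
// Large objects are broken into oblets of at most maxObletBytes each, which
// are scanned separately for better parallelism and lower latency.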
//
//go:nowritebarrier
func scanObject(b uintptr, gcw *gcWork) {
// Prefetch object before we scan it.
//
// This will overlap fetching the beginning of the object with initial
// setup before we start scanning the object.
sys.Prefetch(b)
// Find the bits for b and the size of the object at b.
//
// b is either the beginning of an object, in which case this
// is the size of the object to scan, or it points to an
// oblet, in which case we compute the size to scan below.
s := spanOfUnchecked(b)
n := s.elemsize
if n == 0 {
throw("scanObject n == 0")
}
if s.spanclass.noscan() {
// Correctness-wise this is ok, but it's inefficient
// if noscan objects reach here.
throw("scanObject of a noscan object")
}
var tp typePointers
if n > maxObletBytes {
// Large object. Break into oblets for better
// parallelism and lower latency.
if b == s.base() {
// Enqueue the other oblets to scan later.
// Some oblets may be in b's scalar tail, but
// these will be marked as "no more pointers",
// so we'll drop out immediately when we go to
// scan those.
for oblet := b + maxObletBytes; oblet < s.base()+s.elemsize; oblet += maxObletBytes {
if !gcw.putObjFast(oblet) {
gcw.putObj(oblet)
}
}
}
// Compute the size of the oblet. Since this object
// must be a large object, s.base() is the beginning
// of the object.
n = s.base() + s.elemsize - b
n = min(n, maxObletBytes)
tp = s.typePointersOfUnchecked(s.base())
tp = tp.fastForward(b-tp.addr, b+n)
} else {
tp = s.typePointersOfUnchecked(b)
}
var scanSize uintptr
for {
var addr uintptr
if tp, addr = tp.nextFast(); addr == 0 {
if tp, addr = tp.next(b + n); addr == 0 {
break
}
}
// Keep track of farthest pointer we found, so we can
// update heapScanWork. TODO: is there a better metric,
// now that we can skip scalar portions pretty efficiently?
scanSize = addr - b + goarch.PtrSize
// Work here is duplicated in scanblock and above.
// If you make changes here, make changes there too.
obj := *(*uintptr)(unsafe.Pointer(addr))
// At this point we have extracted the next potential pointer.
// Quickly filter out nil and pointers back to the current object.
if obj != 0 && obj-b >= n {
// Test if obj points into the Go heap and, if so,
// mark the object.
//
// Note that it's possible for findObject to
// fail if obj points to a just-allocated heap
// object because of a race with growing the
// heap. In this case, we know the object was
// just allocated and hence will be marked by
// allocation itself.
if !tryDeferToSpanScan(obj, gcw) {
if obj, span, objIndex := findObject(obj, b, addr-b); obj != 0 {
greyobject(obj, b, addr-b, span, gcw, objIndex)
}
}
}
}
gcw.bytesMarked += uint64(n)
gcw.heapScanWork += int64(scanSize)
if debug.gctrace > 1 {
gcw.stats[s.spanclass.sizeclass()].sparseObjsScanned++
}
}