| // Copyright 2025 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| // This file contains stub functions that are not meant to be called directly, |
| // but that will be assembled together using the inlining logic in runtime/_mkmalloc |
// to produce a full mallocgc function that's specialized for a span class or, in the
// case of the tiny allocator, for a specific size.
| // |
| // To assemble a mallocgc function, the mallocStub function is cloned, and the call to |
| // inlinedMalloc is replaced with the inlined body of smallScanNoHeaderStub, |
| // smallNoScanStub or tinyStub, depending on the parameters being specialized. |
| // |
// The size_ (for the tiny case) and elemsize_, sizeclass_, and noscanint_ (for all three cases)
// identifiers are replaced with the concrete values for the specialization.
| // The nextFreeFastStub, nextFreeFastTiny, heapSetTypeNoHeaderStub, and writeHeapBitsSmallStub |
| // functions are also inlined by _mkmalloc. |
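//
// As a rough illustration (the naming and exact output are determined by _mkmalloc;
// this is only a conceptual sketch), a generated specialization is mallocStub with
// its inlinedMalloc call replaced:
//
//	func mallocgcForOneSizeClass(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
//		// ... mallocStub's prologue (debug hooks, GC assist credit) ...
//		// Inlined body of smallScanNoHeaderStub, smallNoScanStub, or tinyStub,
//		// with the identifiers declared below replaced by that specialization's constants.
//		// ... mallocStub's epilogue (assist adjustment, debug hooks) ...
//	}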
| |
| package runtime |
| |
| import ( |
| "internal/goarch" |
| "internal/runtime/sys" |
| "unsafe" |
| ) |
| |
// These identifiers will all be replaced by the inliner, so their values don't
// really matter: they just need to be set so that the stub functions, which
// will never be used on their own, can compile. elemsize_ can't be set to
// zero because we divide by it in nextFreeFastTiny, and the compiler would
// complain about a division by zero. The value substituted for it is always
// greater than zero.
| const elemsize_ = 8 |
| const sizeclass_ = 0 |
| const noscanint_ = 0 |
| const size_ = 0 |
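
// For example (illustrative values only), a specialization for a 16-byte noscan
// size class would substitute elemsize_ = 16, sizeclass_ = 2, and noscanint_ = 1;
// size_ is used only by the tiny specializations, which are generated per exact
// request size.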
| |
| func malloc0(size uintptr, typ *_type, needzero bool) unsafe.Pointer { |
| if doubleCheckMalloc { |
| if gcphase == _GCmarktermination { |
| throw("mallocgc called with gcphase == _GCmarktermination") |
| } |
| } |
| |
| // Short-circuit zero-sized allocation requests. |
| return unsafe.Pointer(&zerobase) |
| } |
| |
| func mallocPanic(size uintptr, typ *_type, needzero bool) unsafe.Pointer { |
| panic("not defined for sizeclass") |
| } |
| |
| func mallocStub(size uintptr, typ *_type, needzero bool) unsafe.Pointer { |
| if doubleCheckMalloc { |
| if gcphase == _GCmarktermination { |
| throw("mallocgc called with gcphase == _GCmarktermination") |
| } |
| } |
| |
| // It's possible for any malloc to trigger sweeping, which may in |
| // turn queue finalizers. Record this dynamic lock edge. |
| // N.B. Compiled away if lockrank experiment is not enabled. |
| lockRankMayQueueFinalizer() |
| |
| // Pre-malloc debug hooks. |
| if debug.malloc { |
| if x := preMallocgcDebug(size, typ); x != nil { |
| return x |
| } |
| } |
| |
| // Assist the GC if needed. |
| if gcBlackenEnabled != 0 { |
| deductAssistCredit(size) |
| } |
| |
| // Actually do the allocation. |
| x, elemsize := inlinedMalloc(size, typ, needzero) |
| |
| // Adjust our GC assist debt to account for internal fragmentation. |
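	// deductAssistCredit above charged only for the requested size; subtracting
	// elemsize-size here charges for the remainder of the allocation slot.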
| if gcBlackenEnabled != 0 && elemsize != 0 { |
| if assistG := getg().m.curg; assistG != nil { |
| assistG.gcAssistBytes -= int64(elemsize - size) |
| } |
| } |
| |
| // Post-malloc debug hooks. |
| if debug.malloc { |
| postMallocgcDebug(x, elemsize, typ) |
| } |
| return x |
| } |
| |
| // inlinedMalloc will never be called. It is defined just so that the compiler can compile |
// the mallocStub function, which will also never be called but is instead used as a template
| // to generate a size-specialized malloc function. The call to inlinedMalloc in mallocStub |
| // will be replaced with the inlined body of smallScanNoHeaderStub, smallNoScanStub, or tinyStub |
| // when generating the size-specialized malloc function. See the comment at the top of this |
| // file for more information. |
| func inlinedMalloc(size uintptr, typ *_type, needzero bool) (unsafe.Pointer, uintptr) { |
| return unsafe.Pointer(uintptr(0)), 0 |
| } |
| |
| func doubleCheckSmallScanNoHeader(size uintptr, typ *_type, mp *m) { |
| if mp.mallocing != 0 { |
| throw("malloc deadlock") |
| } |
| if mp.gsignal == getg() { |
| throw("malloc during signal") |
| } |
| if typ == nil || !typ.Pointers() { |
| throw("noscan allocated in scan-only path") |
| } |
| if !heapBitsInSpan(size) { |
| throw("heap bits in not in span for non-header-only path") |
| } |
| } |
| |
| func smallScanNoHeaderStub(size uintptr, typ *_type, needzero bool) (unsafe.Pointer, uintptr) { |
| const sizeclass = sizeclass_ |
| const elemsize = elemsize_ |
| |
| // Set mp.mallocing to keep from being preempted by GC. |
| mp := acquirem() |
| if doubleCheckMalloc { |
| doubleCheckSmallScanNoHeader(size, typ, mp) |
| } |
| mp.mallocing = 1 |
| |
| checkGCTrigger := false |
| c := getMCache(mp) |
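	// A spanClass packs the size class shifted left by one with the noscan flag in
	// the low bit, so noscanint_ is 1 for noscan specializations and 0 otherwise.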
| const spc = spanClass(sizeclass<<1) | spanClass(noscanint_) |
| span := c.alloc[spc] |
| v := nextFreeFastStub(span) |
| if v == 0 { |
| v, span, checkGCTrigger = c.nextFree(spc) |
| } |
| x := unsafe.Pointer(v) |
| if span.needzero != 0 { |
| memclrNoHeapPointers(x, elemsize) |
| } |
| if goarch.PtrSize == 8 && sizeclass == 1 { |
| // initHeapBits already set the pointer bits for the 8-byte sizeclass |
| // on 64-bit platforms. |
| c.scanAlloc += 8 |
| } else { |
| dataSize := size // make the inliner happy |
| x := uintptr(x) |
| scanSize := heapSetTypeNoHeaderStub(x, dataSize, typ, span) |
| c.scanAlloc += scanSize |
| } |
| |
| // Ensure that the stores above that initialize x to |
| // type-safe memory and set the heap bits occur before |
| // the caller can make x observable to the garbage |
| // collector. Otherwise, on weakly ordered machines, |
| // the garbage collector could follow a pointer to x, |
| // but see uninitialized memory or stale heap bits. |
| publicationBarrier() |
| |
| if writeBarrier.enabled { |
| // Allocate black during GC. |
| // All slots hold nil so no scanning is needed. |
| // This may be racing with GC so do it atomically if there can be |
| // a race marking the bit. |
| gcmarknewobject(span, uintptr(x)) |
| } else { |
| // Track the last free index before the mark phase. This field |
| // is only used by the garbage collector. During the mark phase |
| // this is used by the conservative scanner to filter out objects |
| // that are both free and recently-allocated. It's safe to do that |
| // because we allocate-black if the GC is enabled. The conservative |
| // scanner produces pointers out of thin air, so without additional |
| // synchronization it might otherwise observe a partially-initialized |
| // object, which could crash the program. |
| span.freeIndexForScan = span.freeindex |
| } |
| |
| // Note cache c only valid while m acquired; see #47302 |
| // |
| // N.B. Use the full size because that matches how the GC |
| // will update the mem profile on the "free" side. |
| // |
| // TODO(mknyszek): We should really count the header as part |
| // of gc_sys or something. The code below just pretends it is |
| // internal fragmentation and matches the GC's accounting by |
| // using the whole allocation slot. |
| c.nextSample -= int64(elemsize) |
| if c.nextSample < 0 || MemProfileRate != c.memProfRate { |
| profilealloc(mp, x, elemsize) |
| } |
| mp.mallocing = 0 |
| releasem(mp) |
| |
| if checkGCTrigger { |
| if t := (gcTrigger{kind: gcTriggerHeap}); t.test() { |
| gcStart(t) |
| } |
| } |
| |
| return x, elemsize |
| } |
| |
| func doubleCheckSmallNoScan(typ *_type, mp *m) { |
| if mp.mallocing != 0 { |
| throw("malloc deadlock") |
| } |
| if mp.gsignal == getg() { |
| throw("malloc during signal") |
| } |
| if typ != nil && typ.Pointers() { |
| throw("expected noscan type for noscan alloc") |
| } |
| } |
| |
| func smallNoScanStub(size uintptr, typ *_type, needzero bool) (unsafe.Pointer, uintptr) { |
	// TODO(matloob): Add functionality to mkmalloc to allow us to inline non-constant
	// sizeclass_ and elemsize_ values (instead, setting them to the expressions that look
	// up the size class and elemsize). We'd also need to teach mkmalloc that values derived
	// from these (specifically spc below) should become vars. This would allow us to generate
	// mallocgcSmallNoScan itself, so that its code cannot diverge from the generated functions.
| const sizeclass = sizeclass_ |
| const elemsize = elemsize_ |
| |
| // Set mp.mallocing to keep from being preempted by GC. |
| mp := acquirem() |
| if doubleCheckMalloc { |
| doubleCheckSmallNoScan(typ, mp) |
| } |
| mp.mallocing = 1 |
| |
| checkGCTrigger := false |
| c := getMCache(mp) |
| const spc = spanClass(sizeclass<<1) | spanClass(noscanint_) |
| span := c.alloc[spc] |
| v := nextFreeFastStub(span) |
| if v == 0 { |
| v, span, checkGCTrigger = c.nextFree(spc) |
| } |
| x := unsafe.Pointer(v) |
| if needzero && span.needzero != 0 { |
| memclrNoHeapPointers(x, elemsize) |
| } |
| |
| // Ensure that the stores above that initialize x to |
| // type-safe memory and set the heap bits occur before |
| // the caller can make x observable to the garbage |
| // collector. Otherwise, on weakly ordered machines, |
| // the garbage collector could follow a pointer to x, |
| // but see uninitialized memory or stale heap bits. |
| publicationBarrier() |
| |
| if writeBarrier.enabled { |
| // Allocate black during GC. |
| // All slots hold nil so no scanning is needed. |
| // This may be racing with GC so do it atomically if there can be |
| // a race marking the bit. |
| gcmarknewobject(span, uintptr(x)) |
| } else { |
| // Track the last free index before the mark phase. This field |
| // is only used by the garbage collector. During the mark phase |
| // this is used by the conservative scanner to filter out objects |
| // that are both free and recently-allocated. It's safe to do that |
| // because we allocate-black if the GC is enabled. The conservative |
| // scanner produces pointers out of thin air, so without additional |
| // synchronization it might otherwise observe a partially-initialized |
| // object, which could crash the program. |
| span.freeIndexForScan = span.freeindex |
| } |
| |
| // Note cache c only valid while m acquired; see #47302 |
| // |
| // N.B. Use the full size because that matches how the GC |
| // will update the mem profile on the "free" side. |
| // |
| // TODO(mknyszek): We should really count the header as part |
| // of gc_sys or something. The code below just pretends it is |
| // internal fragmentation and matches the GC's accounting by |
| // using the whole allocation slot. |
| c.nextSample -= int64(elemsize) |
| if c.nextSample < 0 || MemProfileRate != c.memProfRate { |
| profilealloc(mp, x, elemsize) |
| } |
| mp.mallocing = 0 |
| releasem(mp) |
| |
| if checkGCTrigger { |
| if t := (gcTrigger{kind: gcTriggerHeap}); t.test() { |
| gcStart(t) |
| } |
| } |
| return x, elemsize |
| } |
| |
| func doubleCheckTiny(size uintptr, typ *_type, mp *m) { |
| if mp.mallocing != 0 { |
| throw("malloc deadlock") |
| } |
| if mp.gsignal == getg() { |
| throw("malloc during signal") |
| } |
| if typ != nil && typ.Pointers() { |
| throw("expected noscan for tiny alloc") |
| } |
| } |
| |
| func tinyStub(size uintptr, typ *_type, needzero bool) (unsafe.Pointer, uintptr) { |
| const constsize = size_ |
| const elemsize = elemsize_ |
| |
| // Set mp.mallocing to keep from being preempted by GC. |
| mp := acquirem() |
| if doubleCheckMalloc { |
| doubleCheckTiny(constsize, typ, mp) |
| } |
| mp.mallocing = 1 |
| |
| // Tiny allocator. |
| // |
	// The tiny allocator combines several tiny allocation requests
	// into a single memory block. The resulting memory block
	// is freed when all subobjects are unreachable. The subobjects
	// must be noscan (have no pointers); this ensures that
	// the amount of potentially wasted memory is bounded.
| // |
	// The size of the memory block used for combining (maxTinySize) is tunable.
	// The current setting is 16 bytes, which gives at worst 2x memory
	// wastage (when all but one subobject is unreachable).
	// 8 bytes would result in no wastage at all, but would provide fewer
	// opportunities for combining.
	// 32 bytes provides more opportunities for combining,
	// but can lead to 4x worst-case wastage.
	// The best-case win is 8x regardless of block size.
| // |
	// Objects obtained from the tiny allocator must not be freed explicitly.
	// So when an object will be freed explicitly, we ensure that
	// its size >= maxTinySize.
	//
	// SetFinalizer has a special case for objects potentially coming
	// from the tiny allocator; in that case it allows setting finalizers
	// for an inner byte of a memory block.
	//
	// The main targets of the tiny allocator are small strings and
	// standalone escaping variables. On a json benchmark,
	// the allocator reduces the number of allocations by ~12% and
	// reduces heap size by ~20%.
| c := getMCache(mp) |
| off := c.tinyoffset |
| // Align tiny pointer for required (conservative) alignment. |
| if constsize&7 == 0 { |
| off = alignUp(off, 8) |
| } else if goarch.PtrSize == 4 && constsize == 12 { |
		// Conservatively align 12-byte objects to 8 bytes on 32-bit
		// systems so that objects whose first field is a 64-bit
		// value are aligned to 8 bytes and do not cause a fault on
		// atomic access. See issue 37262.
| // TODO(mknyszek): Remove this workaround if/when issue 36606 |
| // is resolved. |
| off = alignUp(off, 8) |
| } else if constsize&3 == 0 { |
| off = alignUp(off, 4) |
| } else if constsize&1 == 0 { |
| off = alignUp(off, 2) |
| } |
| if off+constsize <= maxTinySize && c.tiny != 0 { |
| // The object fits into existing tiny block. |
| x := unsafe.Pointer(c.tiny + off) |
| c.tinyoffset = off + constsize |
| c.tinyAllocs++ |
| mp.mallocing = 0 |
| releasem(mp) |
| return x, 0 |
| } |
| // Allocate a new maxTinySize block. |
| checkGCTrigger := false |
| span := c.alloc[tinySpanClass] |
| v := nextFreeFastTiny(span) |
| if v == 0 { |
| v, span, checkGCTrigger = c.nextFree(tinySpanClass) |
| } |
| x := unsafe.Pointer(v) |
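	// Zero the whole 16-byte block unconditionally (regardless of needzero): later
	// tiny requests carved from this block return pointers into it without clearing.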
| (*[2]uint64)(x)[0] = 0 // Always zero |
| (*[2]uint64)(x)[1] = 0 |
| // See if we need to replace the existing tiny block with the new one |
	// based on the amount of remaining free space.
| if !raceenabled && (constsize < c.tinyoffset || c.tiny == 0) { |
| // Note: disabled when race detector is on, see comment near end of this function. |
| c.tiny = uintptr(x) |
| c.tinyoffset = constsize |
| } |
| |
| // Ensure that the stores above that initialize x to |
| // type-safe memory and set the heap bits occur before |
| // the caller can make x observable to the garbage |
| // collector. Otherwise, on weakly ordered machines, |
| // the garbage collector could follow a pointer to x, |
| // but see uninitialized memory or stale heap bits. |
| publicationBarrier() |
| |
| if writeBarrier.enabled { |
| // Allocate black during GC. |
| // All slots hold nil so no scanning is needed. |
| // This may be racing with GC so do it atomically if there can be |
| // a race marking the bit. |
| gcmarknewobject(span, uintptr(x)) |
| } else { |
| // Track the last free index before the mark phase. This field |
| // is only used by the garbage collector. During the mark phase |
| // this is used by the conservative scanner to filter out objects |
| // that are both free and recently-allocated. It's safe to do that |
| // because we allocate-black if the GC is enabled. The conservative |
| // scanner produces pointers out of thin air, so without additional |
| // synchronization it might otherwise observe a partially-initialized |
| // object, which could crash the program. |
| span.freeIndexForScan = span.freeindex |
| } |
| |
| // Note cache c only valid while m acquired; see #47302 |
| // |
| // N.B. Use the full size because that matches how the GC |
| // will update the mem profile on the "free" side. |
| // |
| // TODO(mknyszek): We should really count the header as part |
| // of gc_sys or something. The code below just pretends it is |
| // internal fragmentation and matches the GC's accounting by |
| // using the whole allocation slot. |
| c.nextSample -= int64(elemsize) |
| if c.nextSample < 0 || MemProfileRate != c.memProfRate { |
| profilealloc(mp, x, elemsize) |
| } |
| mp.mallocing = 0 |
| releasem(mp) |
| |
| if checkGCTrigger { |
| if t := (gcTrigger{kind: gcTriggerHeap}); t.test() { |
| gcStart(t) |
| } |
| } |
| |
| if raceenabled { |
| // Pad tinysize allocations so they are aligned with the end |
| // of the tinyalloc region. This ensures that any arithmetic |
| // that goes off the top end of the object will be detectable |
| // by checkptr (issue 38872). |
| // Note that we disable tinyalloc when raceenabled for this to work. |
| // TODO: This padding is only performed when the race detector |
| // is enabled. It would be nice to enable it if any package |
| // was compiled with checkptr, but there's no easy way to |
| // detect that (especially at compile time). |
| // TODO: enable this padding for all allocations, not just |
| // tinyalloc ones. It's tricky because of pointer maps. |
| // Maybe just all noscan objects? |
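		// For example (illustrative): with elemsize 16 and constsize 12, x is advanced
		// by 4 bytes so the object ends exactly at the end of its allocation slot.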
| x = add(x, elemsize-constsize) |
| } |
| return x, elemsize |
| } |
| |
// TODO(matloob): Should we let the Go compiler inline this instead of using mkmalloc?
// We won't be able to use elemsize_, but that's probably ok.
| func nextFreeFastTiny(span *mspan) gclinkptr { |
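	// nbytes matches the size of the span backing the tiny span class (a single
	// 8192-byte page). Because elemsize_ is replaced with a constant by _mkmalloc,
	// nelems folds to a constant as well, so this fast path avoids loading span.nelems.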
| const nbytes = 8192 |
| const nelems = uint16((nbytes - unsafe.Sizeof(spanInlineMarkBits{})) / elemsize_) |
| var nextFreeFastResult gclinkptr |
| if span.allocCache != 0 { |
| theBit := sys.TrailingZeros64(span.allocCache) // Is there a free object in the allocCache? |
| result := span.freeindex + uint16(theBit) |
| if result < nelems { |
| freeidx := result + 1 |
| if !(freeidx%64 == 0 && freeidx != nelems) { |
| span.allocCache >>= uint(theBit + 1) |
| span.freeindex = freeidx |
| span.allocCount++ |
| nextFreeFastResult = gclinkptr(uintptr(result)*elemsize_ + span.base()) |
| } |
| } |
| } |
| return nextFreeFastResult |
| } |
| |
| func nextFreeFastStub(span *mspan) gclinkptr { |
| var nextFreeFastResult gclinkptr |
| if span.allocCache != 0 { |
| theBit := sys.TrailingZeros64(span.allocCache) // Is there a free object in the allocCache? |
| result := span.freeindex + uint16(theBit) |
| if result < span.nelems { |
| freeidx := result + 1 |
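			// If freeidx lands on a 64-object boundary before the span is full, the
			// allocCache is exhausted and must be refilled, so leave the result zero
			// and let the caller fall back to the slow path (c.nextFree).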
| if !(freeidx%64 == 0 && freeidx != span.nelems) { |
| span.allocCache >>= uint(theBit + 1) |
| span.freeindex = freeidx |
| span.allocCount++ |
| nextFreeFastResult = gclinkptr(uintptr(result)*elemsize_ + span.base()) |
| } |
| } |
| } |
| return nextFreeFastResult |
| } |
| |
| func heapSetTypeNoHeaderStub(x, dataSize uintptr, typ *_type, span *mspan) uintptr { |
| if doubleCheckHeapSetType && (!heapBitsInSpan(dataSize) || !heapBitsInSpan(elemsize_)) { |
| throw("tried to write heap bits, but no heap bits in span") |
| } |
| scanSize := writeHeapBitsSmallStub(span, x, dataSize, typ) |
| if doubleCheckHeapSetType { |
| doubleCheckHeapType(x, dataSize, typ, nil, span) |
| } |
| return scanSize |
| } |
| |
| // writeHeapBitsSmallStub writes the heap bits for small objects whose ptr/scalar data is |
| // stored as a bitmap at the end of the span. |
| // |
| // Assumes dataSize is <= ptrBits*goarch.PtrSize. x must be a pointer into the span. |
| // heapBitsInSpan(dataSize) must be true. dataSize must be >= typ.Size_. |
| // |
| //go:nosplit |
| func writeHeapBitsSmallStub(span *mspan, x, dataSize uintptr, typ *_type) uintptr { |
| // The objects here are always really small, so a single load is sufficient. |
| src0 := readUintptr(getGCMask(typ)) |
| |
| const elemsize = elemsize_ |
| |
| // Create repetitions of the bitmap if we have a small slice backing store. |
| scanSize := typ.PtrBytes |
| src := src0 |
| if typ.Size_ == goarch.PtrSize { |
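		// The type is exactly one pointer word and, on this scan path, holds a pointer,
		// so its gcmask is a single 1 bit; the bitmap for dataSize bytes is therefore
		// dataSize/goarch.PtrSize consecutive 1 bits.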
| src = (1 << (dataSize / goarch.PtrSize)) - 1 |
| } else { |
| // N.B. We rely on dataSize being an exact multiple of the type size. |
| // The alternative is to be defensive and mask out src to the length |
| // of dataSize. The purpose is to save on one additional masking operation. |
| if doubleCheckHeapSetType && !asanenabled && dataSize%typ.Size_ != 0 { |
| throw("runtime: (*mspan).writeHeapBitsSmall: dataSize is not a multiple of typ.Size_") |
| } |
| for i := typ.Size_; i < dataSize; i += typ.Size_ { |
| src |= src0 << (i / goarch.PtrSize) |
| scanSize += typ.Size_ |
| } |
| } |
| |
| // Since we're never writing more than one uintptr's worth of bits, we're either going |
| // to do one or two writes. |
| dstBase, _ := spanHeapBitsRange(span.base(), pageSize, elemsize) |
| dst := unsafe.Pointer(dstBase) |
| o := (x - span.base()) / goarch.PtrSize |
| i := o / ptrBits |
| j := o % ptrBits |
| const bits uintptr = elemsize / goarch.PtrSize |
| // In the if statement below, we have to do two uintptr writes if the bits |
| // we need to write straddle across two different memory locations. But if |
| // the number of bits we're writing divides evenly into the number of bits |
| // in the uintptr we're writing, this can never happen. Since bitsIsPowerOfTwo |
| // is a compile-time constant in the generated code, in the case where the size is |
| // a power of two less than or equal to ptrBits, the compiler can remove the |
| // 'two writes' branch of the if statement and always do only one write without |
| // the check. |
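	// For example (64-bit, illustrative numbers): with elemsize 48, bits = 6, which is
	// not a power of two; an object at word offset o = 60 within the span has j = 60,
	// so j+bits = 66 > ptrBits and its heap bits straddle two words. With elemsize 32,
	// bits = 4 divides ptrBits evenly, so a single write always suffices and the
	// compiler drops the two-write branch entirely.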
| const bitsIsPowerOfTwo = bits&(bits-1) == 0 |
| if bits > ptrBits || (!bitsIsPowerOfTwo && j+bits > ptrBits) { |
| // Two writes. |
| bits0 := ptrBits - j |
| bits1 := bits - bits0 |
| dst0 := (*uintptr)(add(dst, (i+0)*goarch.PtrSize)) |
| dst1 := (*uintptr)(add(dst, (i+1)*goarch.PtrSize)) |
| *dst0 = (*dst0)&(^uintptr(0)>>bits0) | (src << j) |
| *dst1 = (*dst1)&^((1<<bits1)-1) | (src >> bits0) |
| } else { |
| // One write. |
| dst := (*uintptr)(add(dst, i*goarch.PtrSize)) |
| *dst = (*dst)&^(((1<<(min(bits, ptrBits)))-1)<<j) | (src << j) // We're taking the min so this compiles on 32 bit platforms. But if bits > ptrbits we always take the other branch |
| } |
| |
| const doubleCheck = false |
| if doubleCheck { |
| writeHeapBitsDoubleCheck(span, x, dataSize, src, src0, i, j, bits, typ) |
| } |
| return scanSize |
| } |
| |
| func writeHeapBitsDoubleCheck(span *mspan, x, dataSize, src, src0, i, j, bits uintptr, typ *_type) { |
| srcRead := span.heapBitsSmallForAddr(x) |
| if srcRead != src { |
| print("runtime: x=", hex(x), " i=", i, " j=", j, " bits=", bits, "\n") |
| print("runtime: dataSize=", dataSize, " typ.Size_=", typ.Size_, " typ.PtrBytes=", typ.PtrBytes, "\n") |
| print("runtime: src0=", hex(src0), " src=", hex(src), " srcRead=", hex(srcRead), "\n") |
| throw("bad pointer bits written for small object") |
| } |
| } |