| // Copyright 2025 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| // This file contains stub functions that are not meant to be called directly, |
| // but that will be assembled together using the inlining logic in runtime/_mkmalloc |
// to produce a full mallocgc function that's specialized for a span class or, in the
// case of the tiny allocator, for a specific size.
| // |
| // To assemble a mallocgc function, the mallocStub function is cloned, and the call to |
| // inlinedMalloc is replaced with the inlined body of smallScanNoHeaderStub, |
| // smallNoScanStub or tinyStub, depending on the parameters being specialized. |
| // |
// The size_ (for the tiny case) and elemsize_, sizeclass_, and noscanint_ (for all three cases)
// identifiers are replaced with the concrete values for the specialization.
| // The nextFreeFastStub, nextFreeFastTiny, heapSetTypeNoHeaderStub, and writeHeapBitsSmallStub |
| // functions are also inlined by _mkmalloc. |
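//
// As a rough illustration (the naming and exact output are determined by _mkmalloc;
// this is only a conceptual sketch), a generated specialization is mallocStub with
// its inlinedMalloc call replaced:
//
//	func mallocgcForOneSizeClass(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
//		// ... mallocStub's prologue (debug hooks, GC assist credit) ...
//		// Inlined body of smallScanNoHeaderStub, smallNoScanStub, or tinyStub,
//		// with the identifiers declared below replaced by that specialization's constants.
//		// ... mallocStub's epilogue (assist adjustment, debug hooks) ...
//	}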
| |
| package runtime |
| |
| import ( |
| "internal/goarch" |
| "internal/runtime/sys" |
| "unsafe" |
| ) |
| |
// These identifiers will all be replaced by the inliner, so their values don't
// really matter: they just need to be set so that the stub functions, which
// will never be used on their own, can compile. elemsize_ can't be set to
// zero because we divide by it in nextFreeFastTiny, and the compiler would
// complain about a division by zero. The value substituted for it is always
// greater than zero.
| const elemsize_ = 8 |
| const sizeclass_ = 0 |
| const noscanint_ = 0 |
| const size_ = 0 |
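
// For example (illustrative values only), a specialization for a 16-byte noscan
// size class would substitute elemsize_ = 16, sizeclass_ = 2, and noscanint_ = 1;
// size_ is used only by the tiny specializations, which are generated per exact
// request size.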
| |
| func malloc0(size uintptr, typ *_type, needzero bool) unsafe.Pointer { |
| if doubleCheckMalloc { |
| if gcphase == _GCmarktermination { |
| throw("mallocgc called with gcphase == _GCmarktermination") |
| } |
| } |
| |
| // Short-circuit zero-sized allocation requests. |
| return unsafe.Pointer(&zerobase) |
| } |
| |
| func mallocPanic(size uintptr, typ *_type, needzero bool) unsafe.Pointer { |
| panic("not defined for sizeclass") |
| } |
| |
| func mallocStub(size uintptr, typ *_type, needzero bool) unsafe.Pointer { |
| if doubleCheckMalloc { |
| if gcphase == _GCmarktermination { |
| throw("mallocgc called with gcphase == _GCmarktermination") |
| } |
| } |
| |
| // It's possible for any malloc to trigger sweeping, which may in |
| // turn queue finalizers. Record this dynamic lock edge. |
| // N.B. Compiled away if lockrank experiment is not enabled. |
| lockRankMayQueueFinalizer() |
| |
| // Pre-malloc debug hooks. |
| if debug.malloc { |
| if x := preMallocgcDebug(size, typ); x != nil { |
| return x |
| } |
| } |
| |
| // Assist the GC if needed. |
| if gcBlackenEnabled != 0 { |
| deductAssistCredit(size) |
| } |
| |
| // Actually do the allocation. |
| x, elemsize := inlinedMalloc(size, typ, needzero) |
| |
| // Adjust our GC assist debt to account for internal fragmentation. |
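	// deductAssistCredit above charged only for the requested size; subtracting
	// elemsize-size here charges for the remainder of the allocation slot.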
| if gcBlackenEnabled != 0 && elemsize != 0 { |
| if assistG := getg().m.curg; assistG != nil { |
| assistG.gcAssistBytes -= int64(elemsize - size) |
| } |
| } |
| |
| // Post-malloc debug hooks. |
| if debug.malloc { |
| postMallocgcDebug(x, elemsize, typ) |
| } |
| return x |
| } |
| |
| // inlinedMalloc will never be called. It is defined just so that the compiler can compile |
// the mallocStub function, which will also never be called but is instead used as a template
| // to generate a size-specialized malloc function. The call to inlinedMalloc in mallocStub |
| // will be replaced with the inlined body of smallScanNoHeaderStub, smallNoScanStub, or tinyStub |
| // when generating the size-specialized malloc function. See the comment at the top of this |
| // file for more information. |
| func inlinedMalloc(size uintptr, typ *_type, needzero bool) (unsafe.Pointer, uintptr) { |
| return unsafe.Pointer(uintptr(0)), 0 |
| } |
| |
| func doubleCheckSmallScanNoHeader(size uintptr, typ *_type, mp *m) { |
| if mp.mallocing != 0 { |
| throw("malloc deadlock") |
| } |
| if mp.gsignal == getg() { |
| throw("malloc during signal") |
| } |
| if typ == nil || !typ.Pointers() { |
| throw("noscan allocated in scan-only path") |
| } |
| if !heapBitsInSpan(size) { |
| throw("heap bits in not in span for non-header-only path") |
| } |
| } |
| |
| func smallScanNoHeaderStub(size uintptr, typ *_type, needzero bool) (unsafe.Pointer, uintptr) { |
| const sizeclass = sizeclass_ |
| const elemsize = elemsize_ |
| |
| // Set mp.mallocing to keep from being preempted by GC. |
| mp := acquirem() |
| if doubleCheckMalloc { |
| doubleCheckSmallScanNoHeader(size, typ, mp) |
| } |
| mp.mallocing = 1 |
| |
| checkGCTrigger := false |
| c := getMCache(mp) |
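	// A spanClass packs the size class shifted left by one with the noscan flag in
	// the low bit, so noscanint_ is 1 for noscan specializations and 0 otherwise.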
| const spc = spanClass(sizeclass<<1) | spanClass(noscanint_) |
| span := c.alloc[spc] |
| v := nextFreeFastStub(span) |
| if v == 0 { |
| v, span, checkGCTrigger = c.nextFree(spc) |
| } |
| x := unsafe.Pointer(v) |
| if span.needzero != 0 { |
| memclrNoHeapPointers(x, elemsize) |
| } |
| if goarch.PtrSize == 8 && sizeclass == 1 { |
| // initHeapBits already set the pointer bits for the 8-byte sizeclass |
| // on 64-bit platforms. |
| c.scanAlloc += 8 |
| } else { |
| dataSize := size // make the inliner happy |
| x := uintptr(x) |
| scanSize := heapSetTypeNoHeaderStub(x, dataSize, typ, span) |
| c.scanAlloc += scanSize |
| } |
| |
| // Ensure that the stores above that initialize x to |
| // type-safe memory and set the heap bits occur before |
| // the caller can make x observable to the garbage |
| // collector. Otherwise, on weakly ordered machines, |
| // the garbage collector could follow a pointer to x, |
| // but see uninitialized memory or stale heap bits. |
| publicationBarrier() |
| |
| if writeBarrier.enabled { |
| // Allocate black during GC. |
| // All slots hold nil so no scanning is needed. |
| // This may be racing with GC so do it atomically if there can be |
| // a race marking the bit. |
| gcmarknewobject(span, uintptr(x)) |
| } else { |
| // Track the last free index before the mark phase. This field |
| // is only used by the garbage collector. During the mark phase |
| // this is used by the conservative scanner to filter out objects |
| // that are both free and recently-allocated. It's safe to do that |
| // because we allocate-black if the GC is enabled. The conservative |
| // scanner produces pointers out of thin air, so without additional |
| // synchronization it might otherwise observe a partially-initialized |
| // object, which could crash the program. |
| span.freeIndexForScan = span.freeindex |
| } |
| |
| // Note cache c only valid while m acquired; see #47302 |
| // |
| // N.B. Use the full size because that matches how the GC |
| // will update the mem profile on the "free" side. |
| // |
| // TODO(mknyszek): We should really count the header as part |
| // of gc_sys or something. The code below just pretends it is |
| // internal fragmentation and matches the GC's accounting by |
| // using the whole allocation slot. |
| c.nextSample -= int64(elemsize) |
| if c.nextSample < 0 || MemProfileRate != c.memProfRate { |
| profilealloc(mp, x, elemsize) |
| } |
| mp.mallocing = 0 |
| releasem(mp) |
| |
| if checkGCTrigger { |
| if t := (gcTrigger{kind: gcTriggerHeap}); t.test() { |
| gcStart(t) |
| } |
| } |
| |
| return x, elemsize |
| } |
| |
| func doubleCheckSmallNoScan(typ *_type, mp *m) { |
| if mp.mallocing != 0 { |
| throw("malloc deadlock") |
| } |
| if mp.gsignal == getg() { |
| throw("malloc during signal") |
| } |
| if typ != nil && typ.Pointers() { |
| throw("expected noscan type for noscan alloc") |
| } |
| } |
| |
| func smallNoScanStub(size uintptr, typ *_type, needzero bool) (unsafe.Pointer, uintptr) { |
	// TODO(matloob): Add functionality to mkmalloc to allow us to inline non-constant
	// sizeclass_ and elemsize_ values (instead, setting them to the expressions that look
	// up the size class and elemsize). We'd also need to teach mkmalloc that values derived
	// from these (specifically spc below) should become vars. This would allow us to generate
	// mallocgcSmallNoScan itself, so that its code cannot diverge from the generated functions.
| const sizeclass = sizeclass_ |
| const elemsize = elemsize_ |
| |
| // Set mp.mallocing to keep from being preempted by GC. |
| mp := acquirem() |
| if doubleCheckMalloc { |
| doubleCheckSmallNoScan(typ, mp) |
| } |
| mp.mallocing = 1 |
| |
| checkGCTrigger := false |
| c := getMCache(mp) |
| const spc = spanClass(sizeclass<<1) | spanClass(noscanint_) |
| span := c.alloc[spc] |
| v := nextFreeFastStub(span) |
| if v == 0 { |
| v, span, checkGCTrigger = c.nextFree(spc) |
| } |
| x := unsafe.Pointer(v) |
| if needzero && span.needzero != 0 { |
| memclrNoHeapPointers(x, elemsize) |
| } |
| |
| // Ensure that the stores above that initialize x to |
| // type-safe memory and set the heap bits occur before |
| // the caller can make x observable to the garbage |
| // collector. Otherwise, on weakly ordered machines, |
| // the garbage collector could follow a pointer to x, |
| // but see uninitialized memory or stale heap bits. |
| publicationBarrier() |
| |
| if writeBarrier.enabled { |
| // Allocate black during GC. |
| // All slots hold nil so no scanning is needed. |
| // This may be racing with GC so do it atomically if there can be |
| // a race marking the bit. |
| gcmarknewobject(span, uintptr(x)) |
| } else { |
| // Track the last free index before the mark phase. This field |
| // is only used by the garbage collector. During the mark phase |
| // this is used by the conservative scanner to filter out objects |
| // that are both free and recently-allocated. It's safe to do that |
| // because we allocate-black if the GC is enabled. The conservative |
| // scanner produces pointers out of thin air, so without additional |
| // synchronization it might otherwise observe a partially-initialized |
| // object, which could crash the program. |
| span.freeIndexForScan = span.freeindex |
| } |
| |
| // Note cache c only valid while m acquired; see #47302 |
| // |
| // N.B. Use the full size because that matches how the GC |
| // will update the mem profile on the "free" side. |
| // |
| // TODO(mknyszek): We should really count the header as part |
| // of gc_sys or something. The code below just pretends it is |
| // internal fragmentation and matches the GC's accounting by |
| // using the whole allocation slot. |
| c.nextSample -= int64(elemsize) |
| if c.nextSample < 0 || MemProfileRate != c.memProfRate { |
| profilealloc(mp, x, elemsize) |
| } |
| mp.mallocing = 0 |
| releasem(mp) |
| |
| if checkGCTrigger { |
| if t := (gcTrigger{kind: gcTriggerHeap}); t.test() { |
| gcStart(t) |
| } |
| } |
| return x, elemsize |
| } |
| |
| func doubleCheckTiny(size uintptr, typ *_type, mp *m) { |
| if mp.mallocing != 0 { |
| throw("malloc deadlock") |
| } |
| if mp.gsignal == getg() { |
| throw("malloc during signal") |
| } |
| if typ != nil && typ.Pointers() { |
| throw("expected noscan for tiny alloc") |
| } |
| } |
| |
| func tinyStub(size uintptr, typ *_type, needzero bool) (unsafe.Pointer, uintptr) { |
| const constsize = size_ |
| const elemsize = elemsize_ |
| |
| // Set mp.mallocing to keep from being preempted by GC. |
| mp := acquirem() |
| if doubleCheckMalloc { |
| doubleCheckTiny(constsize, typ, mp) |
| } |
| mp.mallocing = 1 |
| |
| // Tiny allocator. |
| // |
	// The tiny allocator combines several tiny allocation requests
	// into a single memory block. The resulting memory block
	// is freed when all subobjects are unreachable. The subobjects
	// must be noscan (have no pointers); this ensures that
	// the amount of potentially wasted memory is bounded.
| // |
	// The size of the memory block used for combining (maxTinySize) is tunable.
	// The current setting is 16 bytes, which gives at worst 2x memory
	// wastage (when all but one subobject is unreachable).
	// 8 bytes would result in no wastage at all, but would provide fewer
	// opportunities for combining.
	// 32 bytes provides more opportunities for combining,
	// but can lead to 4x worst-case wastage.
	// The best-case win is 8x regardless of block size.
| // |
	// Objects obtained from the tiny allocator must not be freed explicitly.
	// So when an object will be freed explicitly, we ensure that
	// its size >= maxTinySize.
	//
	// SetFinalizer has a special case for objects potentially coming
	// from the tiny allocator; in that case it allows setting finalizers
	// for an inner byte of a memory block.
	//
	// The main targets of the tiny allocator are small strings and
	// standalone escaping variables. On a json benchmark,
	// the allocator reduces the number of allocations by ~12% and
	// reduces heap size by ~20%.
| c := getMCache(mp) |
| off := c.tinyoffset |
| // Align tiny pointer for required (conservative) alignment. |
| if constsize&7 == 0 { |
| off = alignUp(off, 8) |
| } else if goarch.PtrSize == 4 && constsize == 12 { |
		// Conservatively align 12-byte objects to 8 bytes on 32-bit
		// systems so that objects whose first field is a 64-bit
		// value are aligned to 8 bytes and do not cause a fault on
		// atomic access. See issue 37262.
| // TODO(mknyszek): Remove this workaround if/when issue 36606 |
| // is resolved. |
| off = alignUp(off, 8) |
| } else if constsize&3 == 0 { |
| off = alignUp(off, 4) |
| } else if constsize&1 == 0 { |
| off = alignUp(off, 2) |
| } |
| if off+constsize <= maxTinySize && c.tiny != 0 { |
| // The object fits into existing tiny block. |
| x := unsafe.Pointer(c.tiny + off) |
| c.tinyoffset = off + constsize |
| c.tinyAllocs++ |
| mp.mallocing = 0 |
| releasem(mp) |
| return x, 0 |
| } |
| // Allocate a new maxTinySize block. |
| checkGCTrigger := false |
| span := c.alloc[tinySpanClass] |
| v := nextFreeFastTiny(span) |
| if v == 0 { |
| v, span, checkGCTrigger = c.nextFree(tinySpanClass) |
| } |
| x := unsafe.Pointer(v) |
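	// Zero the whole 16-byte block unconditionally (regardless of needzero): later
	// tiny requests carved from this block return pointers into it without clearing.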
| (*[2]uint64)(x)[0] = 0 // Always zero |
| (*[2]uint64)(x)[1] = 0 |
| // See if we need to replace the existing tiny block with the new one |
	// based on the amount of remaining free space.
| if !raceenabled && (constsize < c.tinyoffset || c.tiny == 0) { |
| // Note: disabled when race detector is on, see comment near end of this function. |
| c.tiny = uintptr(x) |
| c.tinyoffset = constsize |
| } |
| |
| // Ensure that the stores above that initialize x to |
| // type-safe memory and set the heap bits occur before |
| // the caller can make x observable to the garbage |
| // collector. Otherwise, on weakly ordered machines, |
| // the garbage collector could follow a pointer to x, |
| // but see uninitialized memory or stale heap bits. |
| publicationBarrier() |
| |
| if writeBarrier.enabled { |
| // Allocate black during GC. |
| // All slots hold nil so no scanning is needed. |
| // This may be racing with GC so do it atomically if there can be |
| // a race marking the bit. |
| gcmarknewobject(span, uintptr(x)) |
| } else { |
| // Track the last free index before the mark phase. This field |
| // is only used by the garbage collector. During the mark phase |
| // this is used by the conservative scanner to filter out objects |
| // that are both free and recently-allocated. It's safe to do that |
| // because we allocate-black if the GC is enabled. The conservative |
| // scanner produces pointers out of thin air, so without additional |
| // synchronization it might otherwise observe a partially-initialized |
| // object, which could crash the program. |
| span.freeIndexForScan = span.freeindex |
| } |
| |
| // Note cache c only valid while m acquired; see #47302 |
| // |
| // N.B. Use the full size because that matches how the GC |
| // will update the mem profile on the "free" side. |
| // |
| // TODO(mknyszek): We should really count the header as part |
| // of gc_sys or something. The code below just pretends it is |
| // internal fragmentation and matches the GC's accounting by |
| // using the whole allocation slot. |
| c.nextSample -= int64(elemsize) |
| if c.nextSample < 0 || MemProfileRate != c.memProfRate { |
| profilealloc(mp, x, elemsize) |
| } |
| mp.mallocing = 0 |
| releasem(mp) |
| |
| if checkGCTrigger { |
| if t := (gcTrigger{kind: gcTriggerHeap}); t.test() { |
| gcStart(t) |
| } |
| } |
| |
| if raceenabled { |
| // Pad tinysize allocations so they are aligned with the end |
| // of the tinyalloc region. This ensures that any arithmetic |
| // that goes off the top end of the object will be detectable |
| // by checkptr (issue 38872). |
| // Note that we disable tinyalloc when raceenabled for this to work. |
| // TODO: This padding is only performed when the race detector |
| // is enabled. It would be nice to enable it if any package |
| // was compiled with checkptr, but there's no easy way to |
| // detect that (especially at compile time). |
| // TODO: enable this padding for all allocations, not just |
| // tinyalloc ones. It's tricky because of pointer maps. |
| // Maybe just all noscan objects? |
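		// For example (illustrative): with elemsize 16 and constsize 12, x is advanced
		// by 4 bytes so the object ends exactly at the end of its allocation slot.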
| x = add(x, elemsize-constsize) |
| } |
| return x, elemsize |
| } |
| |
// TODO(matloob): Should we let the Go compiler inline this instead of using mkmalloc?
// We won't be able to use elemsize_, but that's probably ok.
| func nextFreeFastTiny(span *mspan) gclinkptr { |
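	// nbytes matches the size of the span backing the tiny span class (a single
	// 8192-byte page). Because elemsize_ is replaced with a constant by _mkmalloc,
	// nelems folds to a constant as well, so this fast path avoids loading span.nelems.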
| const nbytes = 8192 |
| const nelems = uint16((nbytes - unsafe.Sizeof(spanInlineMarkBits{})) / elemsize_) |
| var nextFreeFastResult gclinkptr |
| if span.allocCache != 0 { |
| theBit := sys.TrailingZeros64(span.allocCache) // Is there a free object in the allocCache? |
| result := span.freeindex + uint16(theBit) |
| if result < nelems { |
| freeidx := result + 1 |
| if !(freeidx%64 == 0 && freeidx != nelems) { |
| span.allocCache >>= uint(theBit + 1) |
| span.freeindex = freeidx |
| span.allocCount++ |
| nextFreeFastResult = gclinkptr(uintptr(result)*elemsize_ + span.base()) |
| } |
| } |
| } |
| return nextFreeFastResult |
| } |
| |
| func nextFreeFastStub(span *mspan) gclinkptr { |
| var nextFreeFastResult gclinkptr |
| if span.allocCache != 0 { |
| theBit := sys.TrailingZeros64(span.allocCache) // Is there a free object in the allocCache? |
| result := span.freeindex + uint16(theBit) |
| if result < span.nelems { |
| freeidx := result + 1 |
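			// If freeidx lands on a 64-object boundary before the span is full, the
			// allocCache is exhausted and must be refilled, so leave the result zero
			// and let the caller fall back to the slow path (c.nextFree).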
| if !(freeidx%64 == 0 && freeidx != span.nelems) { |
| span.allocCache >>= uint(theBit + 1) |
| span.freeindex = freeidx |
| span.allocCount++ |
| nextFreeFastResult = gclinkptr(uintptr(result)*elemsize_ + span.base()) |
| } |
| } |
| } |
| return nextFreeFastResult |
| } |
| |
| func heapSetTypeNoHeaderStub(x, dataSize uintptr, typ *_type, span *mspan) uintptr { |
| if doubleCheckHeapSetType && (!heapBitsInSpan(dataSize) || !heapBitsInSpan(elemsize_)) { |
| throw("tried to write heap bits, but no heap bits in span") |
| } |
| scanSize := writeHeapBitsSmallStub(span, x, dataSize, typ) |
| if doubleCheckHeapSetType { |
| doubleCheckHeapType(x, dataSize, typ, nil, span) |
| } |
| return scanSize |
| } |
| |
| // writeHeapBitsSmallStub writes the heap bits for small objects whose ptr/scalar data is |
| // stored as a bitmap at the end of the span. |
| // |
| // Assumes dataSize is <= ptrBits*goarch.PtrSize. x must be a pointer into the span. |
| // heapBitsInSpan(dataSize) must be true. dataSize must be >= typ.Size_. |
| // |
| //go:nosplit |
| func writeHeapBitsSmallStub(span *mspan, x, dataSize uintptr, typ *_type) uintptr { |
| // The objects here are always really small, so a single load is sufficient. |
| src0 := readUintptr(getGCMask(typ)) |
| |
| const elemsize = elemsize_ |
| |
| // Create repetitions of the bitmap if we have a small slice backing store. |
| scanSize := typ.PtrBytes |
| src := src0 |
| if typ.Size_ == goarch.PtrSize { |
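		// The type is exactly one pointer word and, on this scan path, holds a pointer,
		// so its gcmask is a single 1 bit; the bitmap for dataSize bytes is therefore
		// dataSize/goarch.PtrSize consecutive 1 bits.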
| src = (1 << (dataSize / goarch.PtrSize)) - 1 |
| } else { |
| // N.B. We rely on dataSize being an exact multiple of the type size. |
| // The alternative is to be defensive and mask out src to the length |
| // of dataSize. The purpose is to save on one additional masking operation. |
| if doubleCheckHeapSetType && !asanenabled && dataSize%typ.Size_ != 0 { |
| throw("runtime: (*mspan).writeHeapBitsSmall: dataSize is not a multiple of typ.Size_") |
| } |
| for i := typ.Size_; i < dataSize; i += typ.Size_ { |
| src |= src0 << (i / goarch.PtrSize) |
| scanSize += typ.Size_ |
| } |
| } |
| |
| // Since we're never writing more than one uintptr's worth of bits, we're either going |
| // to do one or two writes. |
| dstBase, _ := spanHeapBitsRange(span.base(), pageSize, elemsize) |
| dst := unsafe.Pointer(dstBase) |
| o := (x - span.base()) / goarch.PtrSize |
| i := o / ptrBits |
| j := o % ptrBits |
| const bits uintptr = elemsize / goarch.PtrSize |
| // In the if statement below, we have to do two uintptr writes if the bits |
| // we need to write straddle across two different memory locations. But if |
| // the number of bits we're writing divides evenly into the number of bits |
| // in the uintptr we're writing, this can never happen. Since bitsIsPowerOfTwo |
| // is a compile-time constant in the generated code, in the case where the size is |
| // a power of two less than or equal to ptrBits, the compiler can remove the |
| // 'two writes' branch of the if statement and always do only one write without |
| // the check. |
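	// For example (64-bit, illustrative numbers): with elemsize 48, bits = 6, which is
	// not a power of two; an object at word offset o = 60 within the span has j = 60,
	// so j+bits = 66 > ptrBits and its heap bits straddle two words. With elemsize 32,
	// bits = 4 divides ptrBits evenly, so a single write always suffices and the
	// compiler drops the two-write branch entirely.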
| const bitsIsPowerOfTwo = bits&(bits-1) == 0 |
| if bits > ptrBits || (!bitsIsPowerOfTwo && j+bits > ptrBits) { |
| // Two writes. |
| bits0 := ptrBits - j |
| bits1 := bits - bits0 |
| dst0 := (*uintptr)(add(dst, (i+0)*goarch.PtrSize)) |
| dst1 := (*uintptr)(add(dst, (i+1)*goarch.PtrSize)) |
| *dst0 = (*dst0)&(^uintptr(0)>>bits0) | (src << j) |
| *dst1 = (*dst1)&^((1<<bits1)-1) | (src >> bits0) |
| } else { |
| // One write. |
| dst := (*uintptr)(add(dst, i*goarch.PtrSize)) |
| *dst = (*dst)&^(((1<<(min(bits, ptrBits)))-1)<<j) | (src << j) // We're taking the min so this compiles on 32 bit platforms. But if bits > ptrbits we always take the other branch |
| } |
| |
| const doubleCheck = false |
| if doubleCheck { |
| writeHeapBitsDoubleCheck(span, x, dataSize, src, src0, i, j, bits, typ) |
| } |
| return scanSize |
| } |
| |
| func writeHeapBitsDoubleCheck(span *mspan, x, dataSize, src, src0, i, j, bits uintptr, typ *_type) { |
| srcRead := span.heapBitsSmallForAddr(x) |
| if srcRead != src { |
| print("runtime: x=", hex(x), " i=", i, " j=", j, " bits=", bits, "\n") |
| print("runtime: dataSize=", dataSize, " typ.Size_=", typ.Size_, " typ.PtrBytes=", typ.PtrBytes, "\n") |
| print("runtime: src0=", hex(src0), " src=", hex(src), " srcRead=", hex(srcRead), "\n") |
| throw("bad pointer bits written for small object") |
| } |
| } |