| // Copyright 2014 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| package runtime |
| |
| import ( |
| "unsafe" |
| ) |
| |
| const ( |
| debugMalloc = false |
| |
| flagNoScan = 1 << 0 // GC doesn't have to scan object |
| flagNoZero = 1 << 1 // don't zero memory |
| |
| kindArray = 17 |
| kindFunc = 19 |
| kindInterface = 20 |
| kindPtr = 22 |
| kindStruct = 25 |
| kindMask = 1<<6 - 1 |
| kindGCProg = 1 << 6 |
| kindNoPointers = 1 << 7 |
| |
| maxTinySize = 16 |
| tinySizeClass = 2 |
| maxSmallSize = 32 << 10 |
| |
| pageShift = 13 |
| pageSize = 1 << pageShift |
| pageMask = pageSize - 1 |
| |
| wordsPerBitmapWord = ptrSize * 8 / gcBits |
| gcBits = 4 |
| bitsPerPointer = 2 |
| bitsMask = 1<<bitsPerPointer - 1 |
| pointersPerByte = 8 / bitsPerPointer |
| bitPtrMask = bitsMask << 2 |
| maxGCMask = 0 // disabled because it wastes several bytes of memory |
| bitsDead = 0 |
| bitsPointer = 2 |
| |
| bitMiddle = 0 |
| bitBoundary = 1 |
| bitAllocated = 2 |
| bitMarked = 3 |
| bitMask = bitMiddle | bitBoundary | bitAllocated | bitMarked |
| ) |
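| |
| // The heap bitmap lives just before mheap_.arena_start and grows toward |
| // lower addresses. Each heap word is described by gcBits (4) bits: the |
| // low 2 bits hold the allocation state (bitMiddle, bitBoundary, |
| // bitAllocated, bitMarked) and the high 2 bits hold pointer information |
| // (bitsDead, bitsPointer). For example, a 2-word object whose words are |
| // both pointers is described by the byte |
| // (bitsPointer<<2) | (bitsPointer<<6) | bitAllocated, |
| // as in the size == 2*ptrSize case of gomallocgc below. |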
| |
| // All zero-sized allocations return a pointer to this byte. |
| var zeroObject byte |
| |
| // Maximum possible heap size. |
| var maxMem uintptr |
| |
| // Allocate an object of size bytes. |
| // Small objects are allocated from the per-P cache's free lists. |
| // Large objects (> 32 kB) are allocated straight from the heap. |
| func gomallocgc(size uintptr, typ *_type, flags int) unsafe.Pointer { |
| if size == 0 { |
| return unsafe.Pointer(&zeroObject) |
| } |
| mp := acquirem() |
| if mp.mallocing != 0 { |
| gothrow("malloc/free - deadlock") |
| } |
| mp.mallocing = 1 |
| size0 := size |
| |
| c := mp.mcache |
| var s *mspan |
| var x unsafe.Pointer |
| if size <= maxSmallSize { |
| if flags&flagNoScan != 0 && size < maxTinySize { |
| // Tiny allocator. |
| // |
| // The tiny allocator combines several tiny allocation requests |
| // into a single memory block. The resulting memory block |
| // is freed when all subobjects are unreachable. The subobjects |
| // must be flagNoScan (have no pointers); this ensures that |
| // the amount of potentially wasted memory is bounded. |
| // |
| // The size of the memory block used for combining (maxTinySize) is tunable. |
| // The current setting is 16 bytes, which bounds worst-case memory |
| // wastage at 2x (when all but one of the subobjects are unreachable). |
| // 8 bytes would result in no wastage at all, but would provide fewer |
| // opportunities for combining. |
| // 32 bytes provides more opportunities for combining, |
| // but can lead to 4x worst-case wastage. |
| // The best-case win is 8x regardless of block size. |
| // |
| // Objects obtained from the tiny allocator must not be freed explicitly, |
| // so when an object is going to be freed explicitly, we ensure that |
| // its size >= maxTinySize. |
| // |
| // SetFinalizer has a special case for objects potentially coming |
| // from the tiny allocator; in that case it allows setting finalizers |
| // for an inner byte of a memory block. |
| // |
| // The main targets of the tiny allocator are small strings and |
| // standalone escaping variables. On a json benchmark |
| // the allocator reduces the number of allocations by ~12% and |
| // reduces the heap size by ~20%. |
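| // |
| // For illustration, a sketch (assuming the cache's tiny block starts |
| // empty and ignoring refills): |
| // |
| //	p1 := gomallocgc(4, nil, flagNoScan) // grabs a fresh 16-byte block |
| //	p2 := gomallocgc(4, nil, flagNoScan) // placed at p1+4 in the same block |
| //	p3 := gomallocgc(8, nil, flagNoScan) // placed at p1+8, filling the block |
| // |
| // The block is freed only once p1, p2, and p3 are all unreachable. |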
| |
| tinysize := uintptr(c.tinysize) |
| if size <= tinysize { |
| tiny := unsafe.Pointer(c.tiny) |
| // Align tiny pointer for required (conservative) alignment. |
| if size&7 == 0 { |
| tiny = roundup(tiny, 8) |
| } else if size&3 == 0 { |
| tiny = roundup(tiny, 4) |
| } else if size&1 == 0 { |
| tiny = roundup(tiny, 2) |
| } |
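| // E.g., if c.tiny points 10 bytes into its block and size is 4, |
| // tiny is rounded up by 2 bytes so the object is 4-byte aligned. |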
| size1 := size + (uintptr(tiny) - uintptr(unsafe.Pointer(c.tiny))) |
| if size1 <= tinysize { |
| // The object fits into existing tiny block. |
| x = tiny |
| c.tiny = (*byte)(add(x, size)) |
| c.tinysize -= uint(size1) |
| mp.mallocing = 0 |
| releasem(mp) |
| return x |
| } |
| } |
| // Allocate a new maxTinySize block. |
| s = c.alloc[tinySizeClass] |
| v := s.freelist |
| if v == nil { |
| mp.scalararg[0] = tinySizeClass |
| onM(&mcacheRefill_m) |
| s = c.alloc[tinySizeClass] |
| v = s.freelist |
| } |
| s.freelist = v.next |
| s.ref++ |
| //TODO: prefetch v.next |
| x = unsafe.Pointer(v) |
| (*[2]uint64)(x)[0] = 0 |
| (*[2]uint64)(x)[1] = 0 |
| // See if we need to replace the existing tiny block with the new one |
| // based on amount of remaining free space. |
| if maxTinySize-size > tinysize { |
| c.tiny = (*byte)(add(x, size)) |
| c.tinysize = uint(maxTinySize - size) |
| } |
| size = maxTinySize |
| } else { |
| var sizeclass int8 |
| if size <= 1024-8 { |
| sizeclass = size_to_class8[(size+7)>>3] |
| } else { |
| sizeclass = size_to_class128[(size-1024+127)>>7] |
| } |
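| // For example, with the standard class tables a 100-byte request |
| // looks up size_to_class8[(100+7)>>3] = size_to_class8[13] and is |
| // rounded up to the 112-byte class below (illustrative; the exact |
| // classes come from the generated tables). |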
| size = uintptr(class_to_size[sizeclass]) |
| s = c.alloc[sizeclass] |
| v := s.freelist |
| if v == nil { |
| mp.scalararg[0] = uint(sizeclass) |
| onM(&mcacheRefill_m) |
| s = c.alloc[sizeclass] |
| v = s.freelist |
| } |
| s.freelist = v.next |
| s.ref++ |
| //TODO: prefetch |
| x = unsafe.Pointer(v) |
| if flags&flagNoZero == 0 { |
| v.next = nil |
| if size > 2*ptrSize && ((*[2]uintptr)(x))[1] != 0 { |
| memclr(unsafe.Pointer(v), size) |
| } |
| } |
| } |
| c.local_cachealloc += int(size) |
| } else { |
| mp.scalararg[0] = uint(size) |
| mp.scalararg[1] = uint(flags) |
| onM(&largeAlloc_m) |
| s = (*mspan)(mp.ptrarg[0]) |
| mp.ptrarg[0] = nil |
| x = unsafe.Pointer(uintptr(s.start << pageShift)) |
| size = uintptr(s.elemsize) |
| } |
| |
| // From here until the marked label, we mark the object as allocated |
| // and store type info in the GC bitmap. |
| arena_start := uintptr(unsafe.Pointer(mheap_.arena_start)) |
| off := (uintptr(x) - arena_start) / ptrSize |
| xbits := (*uintptr)(unsafe.Pointer(arena_start - off/wordsPerBitmapWord*ptrSize - ptrSize)) |
| shift := (off % wordsPerBitmapWord) * gcBits |
| if debugMalloc && (((*xbits)>>shift)&bitMask) != bitBoundary { |
| gothrow("bad bits in markallocated") |
| } |
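| // E.g., on a 64-bit system (ptrSize = 8, wordsPerBitmapWord = 16), |
| // the word at off = 20 is described by the nibble at |
| // shift = (20%16)*gcBits = 16 within the bitmap word at |
| // arena_start - (20/16)*ptrSize - ptrSize = arena_start - 16. |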
| |
| var ti, te uintptr |
| var ptrmask *uint8 |
| if flags&flagNoScan != 0 { |
| // bitsDead in the first quadruple means don't scan. |
| if size == ptrSize { |
| *xbits = (*xbits & ^((bitBoundary | bitPtrMask) << shift)) | ((bitAllocated + (bitsDead << 2)) << shift) |
| } else { |
| xbitsb := (*uint8)(add(unsafe.Pointer(xbits), shift/8)) |
| *xbitsb = bitAllocated + (bitsDead << 2) |
| } |
| goto marked |
| } |
| if size == ptrSize { |
| // The object is one word long and contains pointers, |
| // so it must itself be a pointer. |
| *xbits = (*xbits & ^((bitBoundary | bitPtrMask) << shift)) | ((bitAllocated | (bitsPointer << 2)) << shift) |
| goto marked |
| } |
| if typ != nil && (uintptr(typ.gc[0])|uintptr(typ.gc[1])) != 0 && uintptr(typ.size) > ptrSize { |
| if typ.kind&kindGCProg != 0 { |
| nptr := (uintptr(typ.size) + ptrSize - 1) / ptrSize |
| masksize := nptr |
| if masksize%2 != 0 { |
| masksize *= 2 // mask for a type with an odd number of words is repeated |
| } |
| masksize = masksize * pointersPerByte / 8 // 4 bits per word |
| masksize++ // unroll flag in the beginning |
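| // Note: with maxGCMask = 0 (disabled above), this branch is taken |
| // whenever typ.gc[1] != 0. |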
| if masksize > maxGCMask && typ.gc[1] != 0 { |
| // If the mask is too large, unroll the program directly |
| // into the GC bitmap. It's 7 times slower than copying |
| // from the pre-unrolled mask, but saves 1/16 of type size |
| // memory for the mask. |
| mp.ptrarg[0] = x |
| mp.ptrarg[1] = unsafe.Pointer(typ) |
| mp.scalararg[0] = uint(size) |
| mp.scalararg[1] = uint(size0) |
| onM(&unrollgcproginplace_m) |
| goto marked |
| } |
| ptrmask = (*uint8)(unsafe.Pointer(uintptr(typ.gc[0]))) |
| // Check whether the program is already unrolled. |
| if uintptr(goatomicloadp(unsafe.Pointer(ptrmask)))&0xff == 0 { |
| mp.ptrarg[0] = unsafe.Pointer(typ) |
| onM(&unrollgcprog_m) |
| } |
| ptrmask = (*uint8)(add(unsafe.Pointer(ptrmask), 1)) // skip the unroll flag byte |
| } else { |
| ptrmask = (*uint8)(unsafe.Pointer(&typ.gc[0])) // embed mask |
| } |
| if size == 2*ptrSize { |
| xbitsb := (*uint8)(add(unsafe.Pointer(xbits), shift/8)) |
| *xbitsb = *ptrmask | bitAllocated |
| goto marked |
| } |
| te = uintptr(typ.size) / ptrSize |
| // If the type occupies an odd number of words, its mask is repeated. |
| if te%2 == 0 { |
| te /= 2 |
| } |
| } |
| if size == 2*ptrSize { |
| xbitsb := (*uint8)(add(unsafe.Pointer(xbits), shift/8)) |
| *xbitsb = (bitsPointer << 2) | (bitsPointer << 6) | bitAllocated |
| goto marked |
| } |
| // Copy pointer bitmask into the bitmap. |
| for i := uintptr(0); i < size0; i += 2 * ptrSize { |
| v := uint8((bitsPointer << 2) | (bitsPointer << 6)) |
| if ptrmask != nil { |
| v = *(*uint8)(add(unsafe.Pointer(ptrmask), ti)) |
| ti++ |
| if ti == te { |
| ti = 0 |
| } |
| } |
| if i == 0 { |
| v |= bitAllocated |
| } |
| if i+ptrSize == size0 { |
| v &= ^uint8(bitPtrMask << 4) |
| } |
| |
| off := (uintptr(x) + i - arena_start) / ptrSize |
| xbits := (*uintptr)(unsafe.Pointer(arena_start - off/wordsPerBitmapWord*ptrSize - ptrSize)) |
| shift := (off % wordsPerBitmapWord) * gcBits |
| xbitsb := (*uint8)(add(unsafe.Pointer(xbits), shift/8)) |
| *xbitsb = v |
| } |
| if size0%(2*ptrSize) == 0 && size0 < size { |
| // Mark the word after the object's last word as bitsDead. |
| off := (uintptr(x) + size0 - arena_start) / ptrSize |
| xbits := (*uintptr)(unsafe.Pointer(arena_start - off/wordsPerBitmapWord*ptrSize - ptrSize)) |
| shift := (off % wordsPerBitmapWord) * gcBits |
| xbitsb := (*uint8)(add(unsafe.Pointer(xbits), shift/8)) |
| *xbitsb = bitsDead << 2 |
| } |
| marked: |
| mp.mallocing = 0 |
| |
| if raceenabled { |
| racemalloc(x, size) |
| } |
| if debug.allocfreetrace != 0 { |
| tracealloc(x, size, typ) |
| } |
| |
| if rate := MemProfileRate; rate > 0 { |
| if size < uintptr(rate) && int32(size) < c.next_sample { |
| c.next_sample -= int32(size) |
| } else { |
| profilealloc(mp, x, size) |
| } |
| } |
| |
| releasem(mp) |
| |
| if memstats.heap_alloc >= memstats.next_gc { |
| gogc(0) |
| } |
| |
| return x |
| } |
| |
| // cmallocgc is a trampoline used to call the Go malloc from C. |
| func cmallocgc(size uintptr, typ *_type, flags int, ret *unsafe.Pointer) { |
| *ret = gomallocgc(size, typ, flags) |
| } |
| |
| // implementation of new builtin |
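| // (new(T) is compiled into a call of this function, |
| // passing T's type descriptor) |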
| func newobject(typ *_type) unsafe.Pointer { |
| flags := 0 |
| if typ.kind&kindNoPointers != 0 { |
| flags |= flagNoScan |
| } |
| return gomallocgc(uintptr(typ.size), typ, flags) |
| } |
| |
| // implementation of make builtin for slices |
| func newarray(typ *_type, n uintptr) unsafe.Pointer { |
| flags := 0 |
| if typ.kind&kindNoPointers != 0 { |
| flags |= flagNoScan |
| } |
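| // Check that n*typ.size does not overflow or exceed the maximum heap size. |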
| if int(n) < 0 || (typ.size > 0 && n > maxMem/uintptr(typ.size)) { |
| panic("runtime: allocation size out of range") |
| } |
| return gomallocgc(uintptr(typ.size)*n, typ, flags) |
| } |
| |
| // rawmem returns a chunk of pointerless memory. It is |
| // not zeroed. |
| func rawmem(size uintptr) unsafe.Pointer { |
| return gomallocgc(size, nil, flagNoScan|flagNoZero) |
| } |
| |
| // round size up to next size class |
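| // (e.g., with the standard class tables goroundupsize(100) returns 112; |
| // larger sizes round up to a multiple of pageSize) |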
| func goroundupsize(size uintptr) uintptr { |
| if size < maxSmallSize { |
| if size <= 1024-8 { |
| return uintptr(class_to_size[size_to_class8[(size+7)>>3]]) |
| } |
| return uintptr(class_to_size[size_to_class128[(size-1024+127)>>7]]) |
| } |
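| // If rounding up to a page would overflow, return size unrounded. |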
| if size+pageSize < size { |
| return size |
| } |
| return (size + pageSize - 1) &^ pageMask |
| } |
| |
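| // profilealloc records the allocation (x, size) in the memory profile. |
| // For allocations smaller than the sampling rate it also picks the |
| // byte count at which the next sample will be taken, drawn uniformly |
| // from [0, 2*rate) so that on average one sample occurs per rate bytes. |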
| func profilealloc(mp *m, x unsafe.Pointer, size uintptr) { |
| c := mp.mcache |
| rate := MemProfileRate |
| if size < uintptr(rate) { |
| // pick next profile time |
| // If you change this, also change allocmcache. |
| if rate > 0x3fffffff { // make 2*rate not overflow |
| rate = 0x3fffffff |
| } |
| next := int32(fastrand2()) % (2 * int32(rate)) |
| // Subtract the "remainder" of the current allocation. |
| // Otherwise objects that are close in size to sampling rate |
| // will be under-sampled, because we consistently discard this remainder. |
| next -= (int32(size) - c.next_sample) |
| if next < 0 { |
| next = 0 |
| } |
| c.next_sample = next |
| } |
| mp.scalararg[0] = uint(size) |
| mp.ptrarg[0] = x |
| onM(&mprofMalloc_m) |
| } |
| |
| // force = 0 - collect only if the heap has grown past next_gc |
| // force = 1 - do GC regardless of current heap usage |
| // force = 2 - do GC and eager sweep |
| func gogc(force int32) { |
| if memstats.enablegc == 0 { |
| return |
| } |
| |
| // TODO: should never happen? Only C calls malloc while holding a lock? |
| mp := acquirem() |
| if mp.locks > 1 { |
| releasem(mp) |
| return |
| } |
| releasem(mp) |
| |
| if panicking != 0 { |
| return |
| } |
| if gcpercent == gcpercentUnknown { |
| golock(&mheap_.lock) |
| if gcpercent == gcpercentUnknown { |
| gcpercent = goreadgogc() |
| } |
| gounlock(&mheap_.lock) |
| } |
| if gcpercent < 0 { |
| return |
| } |
| |
| semacquire(&worldsema, false) |
| |
| if force == 0 && memstats.heap_alloc < memstats.next_gc { |
| // Typically threads which lost the race to grab |
| // worldsema exit here when gc is done. |
| semrelease(&worldsema) |
| return |
| } |
| |
| // Ok, we're doing it! Stop everybody else |
| startTime := gonanotime() |
| mp = acquirem() |
| mp.gcing = 1 |
| stoptheworld() |
| |
| clearpools() |
| |
| // Run gc on the g0 stack. We do this so that the g stack |
| // we're currently running on will no longer change. Cuts |
| // the root set down a bit (g0 stacks are not scanned, and |
| // we don't need to scan gc's internal state). We also |
| // need to switch to g0 so we can shrink the stack. |
| n := 1 |
| if debug.gctrace > 1 { |
| n = 2 |
| } |
| for i := 0; i < n; i++ { |
| if i > 0 { |
| startTime = gonanotime() |
| } |
| // switch to g0, call gc, then switch back |
| mp.scalararg[0] = uint(startTime) |
| if force >= 2 { |
| mp.scalararg[1] = 1 // eagersweep |
| } else { |
| mp.scalararg[1] = 0 |
| } |
| onM(&gc_m) |
| } |
| |
| // all done |
| mp.gcing = 0 |
| semrelease(&worldsema) |
| starttheworld() |
| releasem(mp) |
| |
| // now that gc is done, kick off finalizer thread if needed |
| if !concurrentSweep { |
| // give the queued finalizers, if any, a chance to run |
| gosched() |
| } |
| } |
| |
| // GC runs a garbage collection. |
| func GC() { |
| gogc(2) |
| } |
| |
| // SetFinalizer sets the finalizer associated with x to f. |
| // When the garbage collector finds an unreachable block |
| // with an associated finalizer, it clears the association and runs |
| // f(x) in a separate goroutine. This makes x reachable again, but |
| // now without an associated finalizer. Assuming that SetFinalizer |
| // is not called again, the next time the garbage collector sees |
| // that x is unreachable, it will free x. |
| // |
| // SetFinalizer(x, nil) clears any finalizer associated with x. |
| // |
| // The argument x must be a pointer to an object allocated by |
| // calling new or by taking the address of a composite literal. |
| // The argument f must be a function that takes a single argument |
| // to which x's type can be assigned, and can have arbitrary ignored return |
| // values. If either of these is not true, SetFinalizer aborts the |
| // program. |
| // |
| // Finalizers are run in dependency order: if A points at B, both have |
| // finalizers, and they are otherwise unreachable, only the finalizer |
| // for A runs; once A is freed, the finalizer for B can run. |
| // If a cyclic structure includes a block with a finalizer, that |
| // cycle is not guaranteed to be garbage collected and the finalizer |
| // is not guaranteed to run, because there is no ordering that |
| // respects the dependencies. |
| // |
| // The finalizer for x is scheduled to run at some arbitrary time after |
| // x becomes unreachable. |
| // There is no guarantee that finalizers will run before a program exits, |
| // so typically they are useful only for releasing non-memory resources |
| // associated with an object during a long-running program. |
| // For example, an os.File object could use a finalizer to close the |
| // associated operating system file descriptor when a program discards |
| // an os.File without calling Close, but it would be a mistake |
| // to depend on a finalizer to flush an in-memory I/O buffer such as a |
| // bufio.Writer, because the buffer would not be flushed at program exit. |
| // |
| // It is not guaranteed that a finalizer will run if the size of *x is |
| // zero bytes. |
| // |
| // A single goroutine runs all finalizers for a program, sequentially. |
| // If a finalizer must run for a long time, it should do so by starting |
| // a new goroutine. |
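| // |
| // A sketch of typical use (illustrative; error handling elided): |
| // |
| //	f, _ := os.Open(name) |
| //	runtime.SetFinalizer(f, func(f *os.File) { f.Close() }) |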
| func SetFinalizer(obj interface{}, finalizer interface{}) { |
| // We do just enough work here to make the mcall type safe. |
| // The rest is done on the M stack. |
| e := (*eface)(unsafe.Pointer(&obj)) |
| typ := e._type |
| if typ == nil { |
| gothrow("runtime.SetFinalizer: first argument is nil") |
| } |
| if typ.kind&kindMask != kindPtr { |
| gothrow("runtime.SetFinalizer: first argument is " + *typ._string + ", not pointer") |
| } |
| |
| f := (*eface)(unsafe.Pointer(&finalizer)) |
| ftyp := f._type |
| if ftyp != nil && ftyp.kind&kindMask != kindFunc { |
| gothrow("runtime.SetFinalizer: second argument is " + *ftyp._string + ", not a function") |
| } |
| mp := acquirem() |
| mp.ptrarg[0] = unsafe.Pointer(typ) |
| mp.ptrarg[1] = e.data |
| mp.ptrarg[2] = unsafe.Pointer(ftyp) |
| mp.ptrarg[3] = f.data |
| onM(&setFinalizer_m) |
| releasem(mp) |
| } |