runtime: rewrite malloc in Go.

This change introduces gomallocgc, a Go clone of mallocgc.
Only a few uses have been moved over, so there are still
lots of uses from C. Many of these C uses will be moved
over to Go (e.g. in slice.goc), but probably not all.
What should remain of C's mallocgc is an open question.

LGTM=rsc, dvyukov
R=rsc, khr, dave, bradfitz, dvyukov
CC=golang-codereviews
https://golang.org/cl/108840046
diff --git a/src/pkg/runtime/malloc.go b/src/pkg/runtime/malloc.go
new file mode 100644
index 0000000..cac8f96
--- /dev/null
+++ b/src/pkg/runtime/malloc.go
@@ -0,0 +1,426 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime
+
+import (
+	"unsafe"
+)
+
+const (
+	flagNoScan      = 1 << 0 // GC doesn't have to scan object
+	flagNoProfiling = 1 << 1 // must not profile
+	flagNoZero      = 1 << 3 // don't zero memory
+	flagNoInvokeGC  = 1 << 4 // don't invoke GC
+
+	kindArray      = 17
+	kindFunc       = 19
+	kindInterface  = 20
+	kindPtr        = 22
+	kindStruct     = 25
+	kindMask       = 1<<6 - 1
+	kindGCProg     = 1 << 6
+	kindNoPointers = 1 << 7
+
+	maxTinySize   = 16
+	tinySizeClass = 2
+	maxSmallSize  = 32 << 10
+
+	pageShift = 13
+	pageSize  = 1 << pageShift
+	pageMask  = pageSize - 1
+)
+
+// All zero-sized allocations return a pointer to this byte.
+var zeroObject byte
+
+// Maximum possible heap size.
+var maxMem uintptr
+
+// Allocate an object of at least size bytes.
+// Small objects are allocated from the per-thread cache's free lists.
+// Large objects (> 32 kB) are allocated straight from the heap.
+// If the block will be freed with runtime·free(), typ must be nil.
+func gomallocgc(size uintptr, typ *_type, flags int) unsafe.Pointer {
+	if size == 0 {
+		return unsafe.Pointer(&zeroObject)
+	}
+	mp := acquirem()
+	if mp.mallocing != 0 {
+		gothrow("malloc/free - deadlock")
+	}
+	mp.mallocing = 1
+	size0 := size
+
+	c := mp.mcache
+	var s *mspan
+	var x unsafe.Pointer
+	if size <= maxSmallSize {
+		if flags&flagNoScan != 0 && size < maxTinySize {
+			// Tiny allocator.
+			//
+			// Tiny allocator combines several tiny allocation requests
+			// into a single memory block. The resulting memory block
+			// is freed when all subobjects are unreachable. The subobjects
+			// must be FlagNoScan (have no pointers); this ensures that
+			// the amount of potentially wasted memory is bounded.
+			//
+			// The size of the memory block used for combining (maxTinySize) is tunable.
+			// The current setting is 16 bytes, which corresponds to 2x worst-case memory
+			// wastage (when all but one of the subobjects are unreachable).
+			// 8 bytes would result in no wastage at all but would provide fewer
+			// opportunities for combining.
+			// 32 bytes provides more opportunities for combining,
+			// but can lead to 4x worst-case wastage.
+			// The best-case saving is 8x regardless of block size.
+			//
+			// Objects obtained from the tiny allocator must not be freed explicitly,
+			// so when an object will be freed explicitly, we ensure that
+			// its size >= maxTinySize.
+			//
+			// SetFinalizer has a special case for objects potentially coming
+			// from the tiny allocator; in that case it allows setting a finalizer
+			// for an inner byte of a memory block.
+			//
+			// The main targets of the tiny allocator are small strings and
+			// standalone escaping variables. On a json benchmark
+			// the allocator reduces the number of allocations by ~12% and
+			// the heap size by ~20%.
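+			//
+			// As a rough illustration of these numbers: sixteen 1-byte noscan
+			// allocations can share a single 16-byte block instead of occupying
+			// sixteen slots of the smallest (8-byte) size class, the 8x best case;
+			// conversely, if only one 8-byte subobject of a block stays reachable,
+			// 16 bytes are retained where an 8-byte slot would have sufficed,
+			// the 2x worst case.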
+
+			tinysize := uintptr(c.tinysize)
+			if size <= tinysize {
+				tiny := unsafe.Pointer(c.tiny)
+				// Align tiny pointer for required (conservative) alignment.
+				if size&7 == 0 {
+					tiny = roundup(tiny, 8)
+				} else if size&3 == 0 {
+					tiny = roundup(tiny, 4)
+				} else if size&1 == 0 {
+					tiny = roundup(tiny, 2)
+				}
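+				// For example, a 12-byte request is 4-byte but not 8-byte
+				// aligned (12&7 != 0, 12&3 == 0), so tiny is rounded up to
+				// a multiple of 4.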
+				size1 := size + (uintptr(tiny) - uintptr(unsafe.Pointer(c.tiny)))
+				if size1 <= tinysize {
+					// The object fits into existing tiny block.
+					x = tiny
+					c.tiny = (*byte)(add(x, size))
+					c.tinysize -= uint(size1)
+					mp.mallocing = 0
+					releasem(mp)
+					return x
+				}
+			}
+			// Allocate a new maxTinySize block.
+			s = c.alloc[tinySizeClass]
+			v := s.freelist
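+			// If the freelist is empty, refill the cache on the M (g0) stack.
+			// Arguments to mcacheRefill (and to the other functions invoked
+			// through onM below) are passed via the mp.scalararg/mp.ptrarg
+			// slots rather than as ordinary Go arguments.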
+			if v == nil {
+				mp.scalararg[0] = tinySizeClass
+				onM(&mcacheRefill)
+				s = c.alloc[tinySizeClass]
+				v = s.freelist
+			}
+			s.freelist = v.next
+			s.ref++
+			//TODO: prefetch v.next
+			x = unsafe.Pointer(v)
+			(*[2]uint64)(x)[0] = 0
+			(*[2]uint64)(x)[1] = 0
+			// See if we need to replace the existing tiny block with the new one
+			// based on the amount of remaining free space.
+			if maxTinySize-size > tinysize {
+				c.tiny = (*byte)(add(x, size))
+				c.tinysize = uint(maxTinySize - size)
+			}
+			size = maxTinySize
+		} else {
+			var sizeclass int8
+			if size <= 1024-8 {
+				sizeclass = size_to_class8[(size+7)>>3]
+			} else {
+				sizeclass = size_to_class128[(size-1024+127)>>7]
+			}
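+			// For example, a 24-byte request gives size_to_class8[(24+7)>>3] =
+			// size_to_class8[3], and class_to_size then rounds the allocation
+			// up to the 32-byte class.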
+			size = uintptr(class_to_size[sizeclass])
+			s = c.alloc[sizeclass]
+			v := s.freelist
+			if v == nil {
+				mp.scalararg[0] = uint(sizeclass)
+				onM(&mcacheRefill)
+				s = c.alloc[sizeclass]
+				v = s.freelist
+			}
+			s.freelist = v.next
+			s.ref++
+			//TODO: prefetch
+			x = unsafe.Pointer(v)
+			if flags&flagNoZero == 0 {
+				v.next = nil
+				if size > 2*ptrSize && ((*[2]uintptr)(x))[1] != 0 {
+					memclr(unsafe.Pointer(v), size)
+				}
+			}
+		}
+		c.local_cachealloc += int(size)
+	} else {
+		mp.scalararg[0] = uint(size)
+		mp.scalararg[1] = uint(flags)
+		onM(&largeAlloc)
+		s = (*mspan)(mp.ptrarg[0])
+		mp.ptrarg[0] = nil
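+		// s.start is the span's first page number; shifting by pageShift
+		// converts it to the span's base address.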
+		x = unsafe.Pointer(uintptr(s.start << pageShift))
+		size = uintptr(s.elemsize)
+	}
+
+	// TODO: write markallocated in Go
+	mp.ptrarg[0] = x
+	mp.scalararg[0] = uint(size)
+	mp.scalararg[1] = uint(size0)
+	mp.ptrarg[1] = unsafe.Pointer(typ)
+	mp.scalararg[2] = uint(flags & flagNoScan)
+	onM(&markallocated_m)
+
+	mp.mallocing = 0
+
+	if raceenabled {
+		racemalloc(x, size)
+	}
+	if debug.allocfreetrace != 0 {
+		tracealloc(x, size, typ)
+	}
+	if flags&flagNoProfiling == 0 {
+		rate := MemProfileRate
+		if rate > 0 {
+			if size < uintptr(rate) && int32(size) < c.next_sample {
+				c.next_sample -= int32(size)
+			} else {
+				profilealloc(mp, x, size)
+			}
+		}
+	}
+
+	releasem(mp)
+
+	if flags&flagNoInvokeGC == 0 && memstats.heap_alloc >= memstats.next_gc {
+		gogc(0)
+	}
+
+	return x
+}
+
+// cmallocgc is a trampoline used to call the Go malloc from C.
+func cmallocgc(size uintptr, typ *_type, flags int, ret *unsafe.Pointer) {
+	*ret = gomallocgc(size, typ, flags)
+}
+
+// implementation of new builtin
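+// newobject returns a pointer to a new, zeroed value of the given type;
+// types without pointers are allocated with flagNoScan so the collector
+// can skip scanning them.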
+func newobject(typ *_type) unsafe.Pointer {
+	flags := 0
+	if typ.kind&kindNoPointers != 0 {
+		flags |= flagNoScan
+	}
+	return gomallocgc(uintptr(typ.size), typ, flags)
+}
+
+// implementation of make builtin for slices
+func newarray(typ *_type, n uintptr) unsafe.Pointer {
+	flags := 0
+	if typ.kind&kindNoPointers != 0 {
+		flags |= flagNoScan
+	}
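+	// Reject negative counts and counts for which typ.size*n (computed
+	// below) would exceed the maximum heap size.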
+	if int(n) < 0 || (typ.size > 0 && n > maxMem/uintptr(typ.size)) {
+		panic("runtime: allocation size out of range")
+	}
+	return gomallocgc(uintptr(typ.size)*n, typ, flags)
+}
+
+// round size up to next size class
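+// For example, with the standard size-class tables a 100-byte request rounds
+// up to the 112-byte class, and a 33 KB request (larger than maxSmallSize)
+// rounds up to a whole number of pages: (33<<10 + pageSize - 1) &^ pageMask =
+// 40960 bytes (5 pages).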
+func goroundupsize(size uintptr) uintptr {
+	if size < maxSmallSize {
+		if size <= 1024-8 {
+			return uintptr(class_to_size[size_to_class8[(size+7)>>3]])
+		}
+		return uintptr(class_to_size[size_to_class128[(size-1024+127)>>7]])
+	}
+	if size+pageSize < size {
+		return size
+	}
+	return (size + pageSize - 1) &^ pageMask
+}
+
+func profilealloc(mp *m, x unsafe.Pointer, size uintptr) {
+	c := mp.mcache
+	rate := MemProfileRate
+	if size < uintptr(rate) {
+		// pick next profile time
+		// If you change this, also change allocmcache.
+		if rate > 0x3fffffff { // make 2*rate not overflow
+			rate = 0x3fffffff
+		}
+		next := int32(fastrand2()) % (2 * int32(rate))
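+		// next is roughly uniform in [0, 2*rate), so allocations are
+		// sampled on average once every rate bytes.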
+		// Subtract the "remainder" of the current allocation.
+		// Otherwise objects that are close in size to sampling rate
+		// will be under-sampled, because we consistently discard this remainder.
+		next -= (int32(size) - c.next_sample)
+		if next < 0 {
+			next = 0
+		}
+		c.next_sample = next
+	}
+	mp.scalararg[0] = uint(size)
+	mp.ptrarg[0] = x
+	onM(&mprofMalloc)
+}
+
+// force = 1 - do GC regardless of current heap usage
+// force = 2 - do GC and eager sweep
+func gogc(force int32) {
+	if memstats.enablegc == 0 {
+		return
+	}
+
+	// TODO: should never happen?  Only C calls malloc while holding a lock?
+	mp := acquirem()
+	if mp.locks > 1 {
+		releasem(mp)
+		return
+	}
+	releasem(mp)
+
+	if panicking != 0 {
+		return
+	}
+	if gcpercent == gcpercentUnknown {
+		golock(&mheap_.lock)
+		if gcpercent == gcpercentUnknown {
+			gcpercent = goreadgogc()
+		}
+		gounlock(&mheap_.lock)
+	}
+	if gcpercent < 0 {
+		return
+	}
+
+	semacquire(&worldsema, false)
+
+	if force == 0 && memstats.heap_alloc < memstats.next_gc {
+		// Typically, threads that lost the race to grab
+		// worldsema exit here when gc is done.
+		semrelease(&worldsema)
+		return
+	}
+
+	// Ok, we're doing it!  Stop everybody else
+	startTime := gonanotime()
+	mp = acquirem()
+	mp.gcing = 1
+	stoptheworld()
+
+	clearpools()
+
+	// Run gc on the g0 stack.  We do this so that the g stack
+	// we're currently running on will no longer change.  Cuts
+	// the root set down a bit (g0 stacks are not scanned, and
+	// we don't need to scan gc's internal state).  We also
+	// need to switch to g0 so we can shrink the stack.
+	n := 1
+	if debug.gctrace > 1 {
+		n = 2
+	}
+	for i := 0; i < n; i++ {
+		if i > 0 {
+			startTime = gonanotime()
+		}
+		// switch to g0, call gc, then switch back
+		mp.scalararg[0] = uint(startTime)
+		if force >= 2 {
+			mp.scalararg[1] = 1 // eagersweep
+		} else {
+			mp.scalararg[1] = 0
+		}
+		onM(&mgc2)
+	}
+
+	// all done
+	mp.gcing = 0
+	semrelease(&worldsema)
+	starttheworld()
+	releasem(mp)
+
+	// now that gc is done, kick off finalizer thread if needed
+	if !concurrentSweep {
+		// give the queued finalizers, if any, a chance to run
+		gosched()
+	}
+}
+
+// GC runs a garbage collection.
+func GC() {
+	gogc(2)
+}
+
+// SetFinalizer sets the finalizer associated with x to f.
+// When the garbage collector finds an unreachable block
+// with an associated finalizer, it clears the association and runs
+// f(x) in a separate goroutine.  This makes x reachable again, but
+// now without an associated finalizer.  Assuming that SetFinalizer
+// is not called again, the next time the garbage collector sees
+// that x is unreachable, it will free x.
+//
+// SetFinalizer(x, nil) clears any finalizer associated with x.
+//
+// The argument x must be a pointer to an object allocated by
+// calling new or by taking the address of a composite literal.
+// The argument f must be a function that takes a single argument
+// to which x's type can be assigned, and can have arbitrary ignored return
+// values. If either of these is not true, SetFinalizer aborts the
+// program.
+//
+// Finalizers are run in dependency order: if A points at B, both have
+// finalizers, and they are otherwise unreachable, only the finalizer
+// for A runs; once A is freed, the finalizer for B can run.
+// If a cyclic structure includes a block with a finalizer, that
+// cycle is not guaranteed to be garbage collected and the finalizer
+// is not guaranteed to run, because there is no ordering that
+// respects the dependencies.
+//
+// The finalizer for x is scheduled to run at some arbitrary time after
+// x becomes unreachable.
+// There is no guarantee that finalizers will run before a program exits,
+// so typically they are useful only for releasing non-memory resources
+// associated with an object during a long-running program.
+// For example, an os.File object could use a finalizer to close the
+// associated operating system file descriptor when a program discards
+// an os.File without calling Close, but it would be a mistake
+// to depend on a finalizer to flush an in-memory I/O buffer such as a
+// bufio.Writer, because the buffer would not be flushed at program exit.
+//
+// It is not guaranteed that a finalizer will run if the size of *x is
+// zero bytes.
+//
+// A single goroutine runs all finalizers for a program, sequentially.
+// If a finalizer must run for a long time, it should do so by starting
+// a new goroutine.
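+//
+// As an illustrative sketch (the file type and fd variable here are
+// hypothetical, not part of the runtime), a finalizer can close a file
+// descriptor when its wrapper object becomes unreachable:
+//
+//	type file struct{ fd int }
+//	f := &file{fd: fd}
+//	runtime.SetFinalizer(f, func(f *file) { syscall.Close(f.fd) })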
+func SetFinalizer(obj interface{}, finalizer interface{}) {
+	// We do just enough work here to make the mcall type safe.
+	// The rest is done on the M stack.
+	e := (*eface)(unsafe.Pointer(&obj))
+	typ := e._type
+	if typ == nil {
+		gothrow("runtime.SetFinalizer: first argument is nil")
+	}
+	if typ.kind&kindMask != kindPtr {
+		gothrow("runtime.SetFinalizer: first argument is " + *typ._string + ", not pointer")
+	}
+
+	f := (*eface)(unsafe.Pointer(&finalizer))
+	ftyp := f._type
+	if ftyp != nil && ftyp.kind&kindMask != kindFunc {
+		gothrow("runtime.SetFinalizer: second argument is " + *ftyp._string + ", not a function")
+	}
+	mp := acquirem()
+	mp.ptrarg[0] = unsafe.Pointer(typ)
+	mp.ptrarg[1] = e.data
+	mp.ptrarg[2] = unsafe.Pointer(ftyp)
+	mp.ptrarg[3] = f.data
+	onM(&setFinalizer)
+	releasem(mp)
+}