[dev.garbage] all: merge dev.cc into dev.garbage

The garbage collector is now written in Go.
There is plenty to clean up (just like on dev.cc).

all.bash passes on darwin/amd64, darwin/386, linux/amd64, linux/386.

TBR=rlh
R=austin, rlh, bradfitz
CC=golang-codereviews
https://golang.org/cl/173250043
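
The bulk of this diff replaces the old onM calling convention, which
smuggled arguments through the per-M mp.scalararg/mp.ptrarg scratch
slots, with direct closure calls via systemstack. A minimal sketch of
the before/after pattern (doSomething_m, doSomething, and size are
hypothetical stand-ins, not runtime functions):

	// Before: onM could only run a niladic function on the g0 stack,
	// so arguments were passed through untyped per-M scratch slots.
	mp := acquirem()
	mp.scalararg[0] = uintptr(size)
	onM(doSomething_m) // reads mp.scalararg[0] itself
	releasem(mp)

	// After: systemstack runs a closure on the g0 stack, so arguments
	// are captured directly and type-checked by the compiler.
	systemstack(func() {
		doSomething(size)
	})
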
diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go
index fab8cf2..f90a8f8 100644
--- a/src/runtime/malloc.go
+++ b/src/runtime/malloc.go
@@ -26,10 +26,11 @@
 	maxGCMask       = _MaxGCMask
 	bitsDead        = _BitsDead
 	bitsPointer     = _BitsPointer
+	bitsScalar      = _BitsScalar
 
 	mSpanInUse = _MSpanInUse
 
-	concurrentSweep = _ConcurrentSweep != 0
+	concurrentSweep = _ConcurrentSweep
 )
 
 // Page number (address>>pageShift)
@@ -54,7 +55,7 @@
 	// This function must be atomic wrt GC, but for performance reasons
 	// we don't acquirem/releasem on fast path. The code below does not have
 	// split stack checks, so it can't be preempted by GC.
-	// Functions like roundup/add are inlined. And onM/racemalloc are nosplit.
+	// Functions like roundup/add are inlined. And systemstack/racemalloc are nosplit.
 	// If debugMalloc = true, these assumptions are checked below.
 	if debugMalloc {
 		mp := acquirem()
@@ -140,10 +141,9 @@
 			s = c.alloc[tinySizeClass]
 			v := s.freelist
 			if v == nil {
-				mp := acquirem()
-				mp.scalararg[0] = tinySizeClass
-				onM(mcacheRefill_m)
-				releasem(mp)
+				systemstack(func() {
+					mCache_Refill(c, tinySizeClass)
+				})
 				s = c.alloc[tinySizeClass]
 				v = s.freelist
 			}
@@ -171,10 +171,9 @@
 			s = c.alloc[sizeclass]
 			v := s.freelist
 			if v == nil {
-				mp := acquirem()
-				mp.scalararg[0] = uintptr(sizeclass)
-				onM(mcacheRefill_m)
-				releasem(mp)
+				systemstack(func() {
+					mCache_Refill(c, int32(sizeclass))
+				})
 				s = c.alloc[sizeclass]
 				v = s.freelist
 			}
@@ -191,13 +190,10 @@
 		}
 		c.local_cachealloc += intptr(size)
 	} else {
-		mp := acquirem()
-		mp.scalararg[0] = uintptr(size)
-		mp.scalararg[1] = uintptr(flags)
-		onM(largeAlloc_m)
-		s = (*mspan)(mp.ptrarg[0])
-		mp.ptrarg[0] = nil
-		releasem(mp)
+		var s *mspan
+		systemstack(func() {
+			s = largeAlloc(size, uint32(flags))
+		})
 		x = unsafe.Pointer(uintptr(s.start << pageShift))
 		size = uintptr(s.elemsize)
 	}
@@ -251,13 +247,9 @@
 				// into the GC bitmap. It's 7 times slower than copying
 				// from the pre-unrolled mask, but saves 1/16 of type size
 				// memory for the mask.
-				mp := acquirem()
-				mp.ptrarg[0] = x
-				mp.ptrarg[1] = unsafe.Pointer(typ)
-				mp.scalararg[0] = uintptr(size)
-				mp.scalararg[1] = uintptr(size0)
-				onM(unrollgcproginplace_m)
-				releasem(mp)
+				systemstack(func() {
+					unrollgcproginplace_m(x, typ, size, size0)
+				})
 				goto marked
 			}
 			ptrmask = (*uint8)(unsafe.Pointer(uintptr(typ.gc[0])))
@@ -265,10 +257,9 @@
 			// by checking if the unroll flag byte is set
 			maskword := uintptr(atomicloadp(unsafe.Pointer(ptrmask)))
 			if *(*uint8)(unsafe.Pointer(&maskword)) == 0 {
-				mp := acquirem()
-				mp.ptrarg[0] = unsafe.Pointer(typ)
-				onM(unrollgcprog_m)
-				releasem(mp)
+				systemstack(func() {
+					unrollgcprog_m(typ)
+				})
 			}
 			ptrmask = (*uint8)(add(unsafe.Pointer(ptrmask), 1)) // skip the unroll flag byte
 		} else {
@@ -312,10 +303,9 @@
 	// This may be racing with GC so do it atomically if there can be
 	// a race marking the bit.
 	if gcphase == _GCmarktermination {
-		mp := acquirem()
-		mp.ptrarg[0] = x
-		onM(gcmarknewobject_m)
-		releasem(mp)
+		systemstack(func() {
+			gcmarknewobject_m(uintptr(x))
+		})
 	}
 
 	if raceenabled {
@@ -377,10 +367,9 @@
 		// by checking if the unroll flag byte is set
 		maskword := uintptr(atomicloadp(unsafe.Pointer(ptrmask)))
 		if *(*uint8)(unsafe.Pointer(&maskword)) == 0 {
-			mp := acquirem()
-			mp.ptrarg[0] = unsafe.Pointer(typ)
-			onM(unrollgcprog_m)
-			releasem(mp)
+			systemstack(func() {
+				unrollgcprog_m(typ)
+			})
 		}
 		ptrmask = (*uint8)(add(unsafe.Pointer(ptrmask), 1)) // skip the unroll flag byte
 	} else {
@@ -404,7 +393,7 @@
 	if typ.kind&kindNoPointers != 0 {
 		flags |= flagNoScan
 	}
-	if int(n) < 0 || (typ.size > 0 && n > maxmem/uintptr(typ.size)) {
+	if int(n) < 0 || (typ.size > 0 && n > _MaxMem/uintptr(typ.size)) {
 		panic("runtime: allocation size out of range")
 	}
 	return mallocgc(uintptr(typ.size)*n, typ, flags)
@@ -484,19 +473,20 @@
 	mp.gcing = 1
 	releasem(mp)
 
-	onM(stoptheworld)
-	onM(finishsweep_m) // finish sweep before we start concurrent scan.
-	if false {         // To turn on concurrent scan and mark set to true...
-		onM(starttheworld)
+	systemstack(stoptheworld)
+	systemstack(finishsweep_m) // finish sweep before we start concurrent scan.
+	if false {                 // To turn on concurrent scan and mark set to true...
+		systemstack(starttheworld)
 		// Do a concurrent heap scan before we stop the world.
-		onM(gcscan_m)
-		onM(stoptheworld)
-		onM(gcinstallmarkwb_m)
-		onM(starttheworld)
-		onM(gcmark_m)
-		onM(stoptheworld)
-		onM(gcinstalloffwb_m)
+		systemstack(gcscan_m)
+		systemstack(stoptheworld)
+		systemstack(gcinstallmarkwb_m)
+		systemstack(starttheworld)
+		systemstack(gcmark_m)
+		systemstack(stoptheworld)
+		systemstack(gcinstalloffwb_m)
 	}
+
 	if mp != acquirem() {
 		gothrow("gogc: rescheduled")
 	}
@@ -512,27 +502,25 @@
 	if debug.gctrace > 1 {
 		n = 2
 	}
+	eagersweep := force >= 2
 	for i := 0; i < n; i++ {
 		if i > 0 {
 			startTime = nanotime()
 		}
 		// switch to g0, call gc, then switch back
-		mp.scalararg[0] = uintptr(uint32(startTime)) // low 32 bits
-		mp.scalararg[1] = uintptr(startTime >> 32)   // high 32 bits
-		if force >= 2 {
-			mp.scalararg[2] = 1 // eagersweep
-		} else {
-			mp.scalararg[2] = 0
-		}
-		onM(gc_m)
+		systemstack(func() {
+			gc_m(startTime, eagersweep)
+		})
 	}
 
-	onM(gccheckmark_m)
+	systemstack(func() {
+		gccheckmark_m(startTime, eagersweep)
+	})
 
 	// all done
 	mp.gcing = 0
 	semrelease(&worldsema)
-	onM(starttheworld)
+	systemstack(starttheworld)
 	releasem(mp)
 	mp = nil
 
@@ -544,11 +532,11 @@
 }
 
 func GCcheckmarkenable() {
-	onM(gccheckmarkenable_m)
+	systemstack(gccheckmarkenable_m)
 }
 
 func GCcheckmarkdisable() {
-	onM(gccheckmarkdisable_m)
+	systemstack(gccheckmarkdisable_m)
 }
 
 // GC runs a garbage collection.
@@ -652,11 +640,10 @@
 	f := (*eface)(unsafe.Pointer(&finalizer))
 	ftyp := f._type
 	if ftyp == nil {
-		// switch to M stack and remove finalizer
-		mp := acquirem()
-		mp.ptrarg[0] = e.data
-		onM(removeFinalizer_m)
-		releasem(mp)
+		// switch to system stack and remove finalizer
+		systemstack(func() {
+			removefinalizer(e.data)
+		})
 		return
 	}
 
@@ -701,18 +688,11 @@
 	// make sure we have a finalizer goroutine
 	createfing()
 
-	// switch to M stack to add finalizer record
-	mp := acquirem()
-	mp.ptrarg[0] = f.data
-	mp.ptrarg[1] = e.data
-	mp.scalararg[0] = nret
-	mp.ptrarg[2] = unsafe.Pointer(fint)
-	mp.ptrarg[3] = unsafe.Pointer(ot)
-	onM(setFinalizer_m)
-	if mp.scalararg[0] != 1 {
-		gothrow("runtime.SetFinalizer: finalizer already set")
-	}
-	releasem(mp)
+	systemstack(func() {
+		if !addfinalizer(e.data, (*funcval)(f.data), nret, fint, ot) {
+			gothrow("runtime.SetFinalizer: finalizer already set")
+		}
+	})
 }
 
 // round n up to a multiple of a.  a must be a power of 2.
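
The body of round falls outside this hunk; sketching from the comment's
contract (round n up to a multiple of a, with a a power of 2), the usual
branch-free implementation is:

	// Add a-1, then clear the low log2(a) bits. Correct only
	// when a is a power of 2, as the comment above requires.
	func round(n, a uintptr) uintptr {
		return (n + a - 1) &^ (a - 1)
	}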