runtime: convert markallocated from C to Go

benchmark                      old ns/op     new ns/op     delta
BenchmarkMalloc8               28.7          22.4          -21.95%
BenchmarkMalloc16              44.8          33.8          -24.55%
BenchmarkMallocTypeInfo8       49.0          32.9          -32.86%
BenchmarkMallocTypeInfo16      46.7          35.8          -23.34%
BenchmarkMallocLargeStruct     907           901           -0.66%
BenchmarkGobDecode             13235542      12036851      -9.06%
BenchmarkGobEncode             10639699      9539155       -10.34%
BenchmarkJSONEncode            25193036      21898922      -13.08%
BenchmarkJSONDecode            96104044      89464904      -6.91%

Fixes #8452.

LGTM=khr
R=golang-codereviews, bradfitz, rsc, dave, khr
CC=golang-codereviews
https://golang.org/cl/122090043
diff --git a/src/pkg/runtime/malloc.go b/src/pkg/runtime/malloc.go
index e7f2388..73dc9f2 100644
--- a/src/pkg/runtime/malloc.go
+++ b/src/pkg/runtime/malloc.go
@@ -9,6 +9,8 @@
 )
 
 const (
+	debugMalloc = false
+
 	flagNoScan      = 1 << 0 // GC doesn't have to scan object
 	flagNoProfiling = 1 << 1 // must not profile
 	flagNoZero      = 1 << 2 // don't zero memory
@@ -29,6 +31,22 @@
 	pageShift = 13
 	pageSize  = 1 << pageShift
 	pageMask  = pageSize - 1
+
+	wordsPerBitmapWord = ptrSize * 8 / 4
+	gcBits             = 4
+	bitsPerPointer     = 2
+	bitsMask           = 1<<bitsPerPointer - 1
+	pointersPerByte    = 8 / bitsPerPointer
+	bitPtrMask         = bitsMask << 2
+	maxGCMask          = 0 // disabled because it wastes several bytes of memory
+	bitsDead           = 0
+	bitsPointer        = 2
+
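+	// Each heap word is described by gcBits bits in the bitmap: the
+	// low two bits give the word's role (bitMiddle, bitBoundary,
+	// bitAllocated, bitMarked) and the next two its pointer type
+	// (e.g. bitsDead, bitsPointer).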
+	bitMiddle    = 0
+	bitBoundary  = 1
+	bitAllocated = 2
+	bitMarked    = 3
+	bitMask      = bitMiddle | bitBoundary | bitAllocated | bitMarked
 )
 
 // All zero-sized allocations return a pointer to this byte.
@@ -168,14 +186,112 @@
 		size = uintptr(s.elemsize)
 	}
 
-	// TODO: write markallocated in Go
-	mp.ptrarg[0] = x
-	mp.scalararg[0] = uint(size)
-	mp.scalararg[1] = uint(size0)
-	mp.ptrarg[1] = unsafe.Pointer(typ)
-	mp.scalararg[2] = uint(flags & flagNoScan)
-	onM(&markallocated_m)
+	// From here until the marked label, the code marks the object as
+	// allocated and stores the type info in the GC bitmap.
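+	// The heap bitmap lives just below arena_start and grows down:
+	// each heap word is described by gcBits (4) bits, so one bitmap
+	// word covers wordsPerBitmapWord heap words. off is the word
+	// index of x, xbits the bitmap word holding its bits, and shift
+	// their bit offset within that word.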
+	arena_start := uintptr(unsafe.Pointer(mheap_.arena_start))
+	off := (uintptr(x) - arena_start) / ptrSize
+	xbits := (*uintptr)(unsafe.Pointer(arena_start - off/wordsPerBitmapWord*ptrSize - ptrSize))
+	shift := (off % wordsPerBitmapWord) * gcBits
+	if debugMalloc && (((*xbits)>>shift)&bitMask) != bitBoundary {
+		gothrow("bad bits in markallocated")
+	}
 
+	var ti, te uintptr
+	var ptrmask *uint8
+	if flags&flagNoScan != 0 {
+		// bitsDead in the first quadruple means don't scan.
+		if size == ptrSize {
+			*xbits = (*xbits & ^((bitBoundary | bitPtrMask) << shift)) | ((bitAllocated + (bitsDead << 2)) << shift)
+		} else {
+			xbitsb := (*uint8)(add(unsafe.Pointer(xbits), shift/8))
+			*xbitsb = bitAllocated + (bitsDead << 2)
+		}
+		goto marked
+	}
+	if size == ptrSize {
+		// It's one word and it has pointers, so it must be a pointer.
+		*xbits = (*xbits & ^((bitBoundary | bitPtrMask) << shift)) | ((bitAllocated | (bitsPointer << 2)) << shift)
+		goto marked
+	}
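+	// A type's GC info is either a pointer mask embedded directly in
+	// typ.gc (small types) or, for kindGCProg types, a program at
+	// typ.gc[1] that unrolls into the mask at typ.gc[0].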
+	if typ != nil && (uintptr(typ.gc[0])|uintptr(typ.gc[1])) != 0 && uintptr(typ.size) > ptrSize {
+		if typ.kind&kindGCProg != 0 {
+			nptr := (uintptr(typ.size) + ptrSize - 1) / ptrSize
+			masksize := nptr
+			if masksize%2 != 0 {
+				masksize *= 2 // repeated twice
+			}
+			masksize = masksize * pointersPerByte / 8 // 4 bits per word
+			masksize++                                // unroll flag at the beginning
+			if masksize > maxGCMask && typ.gc[1] != 0 {
+				// If the mask is too large, unroll the program directly
+				// into the GC bitmap. It's 7 times slower than copying
+				// from the pre-unrolled mask, but saves memory equal to
+				// 1/16 of the type's size for the mask.
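+				// The unroller is C code that must run on the M stack:
+				// pass the arguments through the M and switch with onM.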
+				mp.ptrarg[0] = x
+				mp.ptrarg[1] = unsafe.Pointer(typ)
+				mp.scalararg[0] = uint(size)
+				mp.scalararg[1] = uint(size0)
+				onM(&unrollgcproginplace_m)
+				goto marked
+			}
+			ptrmask = (*uint8)(unsafe.Pointer(uintptr(typ.gc[0])))
+			// Check whether the program is already unrolled.
+			if uintptr(goatomicloadp(unsafe.Pointer(ptrmask)))&0xff == 0 {
+				mp.ptrarg[0] = unsafe.Pointer(typ)
+				onM(&unrollgcprog_m)
+			}
+			ptrmask = (*uint8)(add(unsafe.Pointer(ptrmask), 1)) // skip the unroll flag byte
+		} else {
+			ptrmask = (*uint8)(unsafe.Pointer(&typ.gc[0])) // embed mask
+		}
+		if size == 2*ptrSize {
+			xbitsb := (*uint8)(add(unsafe.Pointer(xbits), shift/8))
+			*xbitsb = *ptrmask | bitAllocated
+			goto marked
+		}
+		te = uintptr(typ.size) / ptrSize
+		// If the type occupies an odd number of words, its mask is repeated twice.
+		if te%2 == 0 {
+			te /= 2
+		}
+	}
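+	// Reaching here with a two-word object means there was no pointer
+	// mask (that case jumped to marked above), so conservatively mark
+	// both words as pointers.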
+	if size == 2*ptrSize {
+		xbitsb := (*uint8)(add(unsafe.Pointer(xbits), shift/8))
+		*xbitsb = (bitsPointer << 2) | (bitsPointer << 6) | bitAllocated
+		goto marked
+	}
+	// Copy pointer bitmask into the bitmap.
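+	// Each iteration fills one bitmap byte, covering two heap words
+	// (gcBits bits each). Without a ptrmask every word is treated as
+	// a pointer; with one, the mask bytes are cycled (ti wraps to 0
+	// at te, the mask length in bytes).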
+	for i := uintptr(0); i < size0; i += 2 * ptrSize {
+		v := uint8((bitsPointer << 2) | (bitsPointer << 6))
+		if ptrmask != nil {
+			v = *(*uint8)(add(unsafe.Pointer(ptrmask), ti))
+			ti++
+			if ti == te {
+				ti = 0
+			}
+		}
+		if i == 0 {
+			v |= bitAllocated
+		}
+		if i+ptrSize == size0 {
+			v &= ^uint8(bitPtrMask << 4)
+		}
+
+		off := (uintptr(x) + i - arena_start) / ptrSize
+		xbits := (*uintptr)(unsafe.Pointer(arena_start - off/wordsPerBitmapWord*ptrSize - ptrSize))
+		shift := (off % wordsPerBitmapWord) * gcBits
+		xbitsb := (*uint8)(add(unsafe.Pointer(xbits), shift/8))
+		*xbitsb = v
+	}
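+	// If size0 is an odd number of words, the loop above already set
+	// the type bits of the word following the object to bitsDead, so
+	// this is only needed when size0 fills whole bitmap bytes.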
+	if size0%(2*ptrSize) == 0 && size0 < size {
+		// Mark the word after the object's last word as bitsDead.
+		off := (uintptr(x) + size0 - arena_start) / ptrSize
+		xbits := (*uintptr)(unsafe.Pointer(arena_start - off/wordsPerBitmapWord*ptrSize - ptrSize))
+		shift := (off % wordsPerBitmapWord) * gcBits
+		xbitsb := (*uint8)(add(unsafe.Pointer(xbits), shift/8))
+		*xbitsb = bitsDead << 2
+	}
+marked:
 	mp.mallocing = 0
 
 	if raceenabled {
diff --git a/src/pkg/runtime/malloc.h b/src/pkg/runtime/malloc.h
index 43feef7..4b16c55 100644
--- a/src/pkg/runtime/malloc.h
+++ b/src/pkg/runtime/malloc.h
@@ -517,7 +517,6 @@
 int32	runtime·mlookup(void *v, byte **base, uintptr *size, MSpan **s);
 void	runtime·gc(int32 force);
 uintptr	runtime·sweepone(void);
-void	runtime·markallocated(void *v, uintptr size, uintptr size0, Type* typ, bool scan);
 void	runtime·markspan(void *v, uintptr size, uintptr n, bool leftover);
 void	runtime·unmarkspan(void *v, uintptr size);
 void	runtime·purgecachedstats(MCache*);
diff --git a/src/pkg/runtime/mgc0.c b/src/pkg/runtime/mgc0.c
index 4637d68..8998a87 100644
--- a/src/pkg/runtime/mgc0.c
+++ b/src/pkg/runtime/mgc0.c
@@ -1741,7 +1741,7 @@
 	return res;
 }
 
-// Recursively GC program in prog.
+// Recursively unrolls the GC program in prog.
 // mask is where to store the result.
 // ppos is a pointer to position in mask, in bits.
 // sparse says to generate 4-bits per word mask for heap (2-bits for data/bss otherwise).
@@ -1833,11 +1833,20 @@
 	return mask;
 }
 
-static void
-unrollgcproginplace(void *v, uintptr size, uintptr size0, Type *typ)
+void
+runtime·unrollgcproginplace_m(void)
 {
-	uintptr *b, off, shift, pos;
+	uintptr size, size0, *b, off, shift, pos;
 	byte *arena_start, *prog;
+	Type *typ;
+	void *v;
+
+	v = g->m->ptrarg[0];
+	typ = g->m->ptrarg[1];
+	size = g->m->scalararg[0];
+	size0 = g->m->scalararg[1];
+	g->m->ptrarg[0] = nil;
+	g->m->ptrarg[1] = nil;
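+	// Zero the pointer args so stale values do not keep objects alive.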
 
 	pos = 0;
 	prog = (byte*)typ->gc[1];
@@ -1859,14 +1868,18 @@
 }
 
 // Unrolls GC program in typ->gc[1] into typ->gc[0]
-static void
-unrollgcprog(Type *typ)
+void
+runtime·unrollgcprog_m(void)
 {
 	static Lock lock;
+	Type *typ;
 	byte *mask, *prog;
 	uintptr pos;
 	uint32 x;
 
+	typ = g->m->ptrarg[0];
+	g->m->ptrarg[0] = nil;
+
 	runtime·lock(&lock);
 	mask = (byte*)typ->gc[0];
 	if(mask[0] == 0) {
@@ -1887,110 +1900,6 @@
 	runtime·unlock(&lock);
 }
 
-void
-runtime·markallocated(void *v, uintptr size, uintptr size0, Type *typ, bool scan)
-{
-	uintptr *b, off, shift, i, ti, te, nptr, masksize;
-	byte *arena_start, x;
-	bool *ptrmask;
-
-	arena_start = runtime·mheap.arena_start;
-	off = (uintptr*)v - (uintptr*)arena_start;
-	b = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
-	shift = (off % wordsPerBitmapWord) * gcBits;
-	if(Debug && (((*b)>>shift)&bitMask) != bitBoundary) {
-		runtime·printf("runtime: bad bits in markallocated (%p) b=%p[%p]\n", v, b, *b);
-		runtime·throw("bad bits in markallocated");
-	}
-
-	if(!scan) {
-		// BitsDead in the first quadruple means don't scan.
-		if(size == PtrSize)
-			*b = (*b & ~((bitBoundary|bitPtrMask)<<shift)) | ((bitAllocated+(BitsDead<<2))<<shift);
-		else
-			((byte*)b)[shift/8] = bitAllocated+(BitsDead<<2);
-		return;
-	}
-	if(size == PtrSize) {
-		// It's one word and it has pointers, it must be a pointer.
-		*b = (*b & ~((bitBoundary|bitPtrMask)<<shift)) | ((bitAllocated | (BitsPointer<<2))<<shift);
-		return;
-	}
-	ti = te = 0;
-	ptrmask = nil;
-	if(typ != nil && (typ->gc[0]|typ->gc[1]) != 0 && typ->size > PtrSize) {
-		if(typ->kind&KindGCProg) {
-			nptr = ROUND(typ->size, PtrSize)/PtrSize;
-			masksize = nptr;
-			if(masksize%2)
-				masksize *= 2;	// repeated twice
-			masksize = masksize*PointersPerByte/8;	// 4 bits per word
-			masksize++;	// unroll flag in the beginning
-			if(masksize > MaxGCMask && typ->gc[1] != 0) {
-				// If the mask is too large, unroll the program directly
-				// into the GC bitmap. It's 7 times slower than copying
-				// from the pre-unrolled mask, but saves 1/16 of type size
-				// memory for the mask.
-				unrollgcproginplace(v, size, size0, typ);
-				return;
-			}
-			ptrmask = (byte*)typ->gc[0];
-			// check whether the program is already unrolled
-			if((runtime·atomicload((uint32*)ptrmask)&0xff) == 0)
-				unrollgcprog(typ);
-			ptrmask++;  // skip the unroll flag byte
-		} else
-			ptrmask = (byte*)&typ->gc[0];  // embed mask
-		if(size == 2*PtrSize) {
-			((byte*)b)[shift/8] = ptrmask[0] | bitAllocated;
-			return;
-		}
-		te = typ->size/PtrSize;
-		// if the type occupies odd number of words, its mask is repeated twice
-		if((te%2) == 0)
-			te /= 2;
-	}
-	if(size == 2*PtrSize) {
-		((byte*)b)[shift/8] = (BitsPointer<<2) | (BitsPointer<<6) | bitAllocated;
-		return;
-	}
-	// Copy pointer bitmask into the bitmap.
-	for(i=0; i<size0; i+=2*PtrSize) {
-		x = (BitsPointer<<2) | (BitsPointer<<6);
-		if(ptrmask != nil) {
-			x = ptrmask[ti++];
-			if(ti == te)
-				ti = 0;
-		}
-		off = (uintptr*)((byte*)v + i) - (uintptr*)arena_start;
-		b = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
-		shift = (off % wordsPerBitmapWord) * gcBits;
-		if(i == 0)
-			x |= bitAllocated;
-		if(i+PtrSize == size0)
-			x &= ~(bitPtrMask<<4);
-		((byte*)b)[shift/8] = x;
-	}
-	if(size0 == i && size0 < size) {
-		// mark the word after last object's word as BitsDead
-		off = (uintptr*)((byte*)v + size0) - (uintptr*)arena_start;
-		b = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
-		shift = (off % wordsPerBitmapWord) * gcBits;
-		((byte*)b)[shift/8] = 0;
-	}
-}
-
-void
-runtime·markallocated_m(void)
-{
-	M *mp;
-
-	mp = g->m;
-	runtime·markallocated(mp->ptrarg[0], mp->scalararg[0], mp->scalararg[1], mp->ptrarg[1], mp->scalararg[2] == 0);
-	mp->ptrarg[0] = nil;
-	mp->ptrarg[1] = nil;
-}
-
 // mark the span of memory at v as having n blocks of the given size.
 // if leftover is true, there is left over space at the end of the span.
 void
diff --git a/src/pkg/runtime/stubs.go b/src/pkg/runtime/stubs.go
index 8a2fc8a9..fee18f0 100644
--- a/src/pkg/runtime/stubs.go
+++ b/src/pkg/runtime/stubs.go
@@ -65,7 +65,9 @@
 	mprofMalloc_m,
 	gc_m,
 	setFinalizer_m,
-	markallocated_m mFunction
+	markallocated_m,
+	unrollgcprog_m,
+	unrollgcproginplace_m mFunction
 )
 
 // memclr clears n bytes starting at ptr.
@@ -87,7 +89,12 @@
 	concurrentSweep  = true
 )
 
-// in asm_*.s
+// Atomic operations to read/write a pointer.
+// in stubs.goc
+func goatomicloadp(p unsafe.Pointer) unsafe.Pointer     // return *p
+func goatomicstorep(p unsafe.Pointer, v unsafe.Pointer) // *p = v
+
+// in stubs.goc
 // if *p == x { *p = y; return true } else { return false }, atomically
 //go:noescape
 func gocas(p *uint32, x uint32, y uint32) bool
diff --git a/src/pkg/runtime/stubs.goc b/src/pkg/runtime/stubs.goc
index c64e73d..42a4bf1 100644
--- a/src/pkg/runtime/stubs.goc
+++ b/src/pkg/runtime/stubs.goc
@@ -49,6 +49,16 @@
 }
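+
+// goatomicloadp and goatomicstorep are thin wrappers that expose the
+// runtime's C pointer atomics to Go code (mallocgc uses goatomicloadp).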
 
 #pragma textflag NOSPLIT
+func goatomicloadp(p **byte) (v *byte) {
+	v = runtime·atomicloadp(p);
+}
+
+#pragma textflag NOSPLIT
+func goatomicstorep(p **byte, v *byte) {
+	runtime·atomicstorep(p, v);
+}
+
+#pragma textflag NOSPLIT
 func runtime·gocas(p *uint32, x uint32, y uint32) (ret bool) {
 	ret = runtime·cas(p, x, y);
 }