runtime: convert markallocated from C to Go

benchmark                      old ns/op     new ns/op     delta
BenchmarkMalloc8               28.7          22.4          -21.95%
BenchmarkMalloc16              44.8          33.8          -24.55%
BenchmarkMallocTypeInfo8       49.0          32.9          -32.86%
BenchmarkMallocTypeInfo16      46.7          35.8          -23.34%
BenchmarkMallocLargeStruct     907           901           -0.66%
BenchmarkGobDecode             13235542      12036851      -9.06%
BenchmarkGobEncode             10639699      9539155       -10.34%
BenchmarkJSONEncode            25193036      21898922      -13.08%
BenchmarkJSONDecode            96104044      89464904      -6.91%

Fixes #8452.

LGTM=khr
R=golang-codereviews, bradfitz, rsc, dave, khr
CC=golang-codereviews
https://golang.org/cl/122090043
diff --git a/src/pkg/runtime/malloc.go b/src/pkg/runtime/malloc.go
index e7f2388..73dc9f2 100644
--- a/src/pkg/runtime/malloc.go
+++ b/src/pkg/runtime/malloc.go
@@ -9,6 +9,8 @@
 )
 
 const (
+	debugMalloc = false
+
 	flagNoScan      = 1 << 0 // GC doesn't have to scan object
 	flagNoProfiling = 1 << 1 // must not profile
 	flagNoZero      = 1 << 2 // don't zero memory
@@ -29,6 +31,22 @@
 	pageShift = 13
 	pageSize  = 1 << pageShift
 	pageMask  = pageSize - 1
+
+	wordsPerBitmapWord = ptrSize * 8 / 4
+	gcBits             = 4
+	bitsPerPointer     = 2
+	bitsMask           = 1<<bitsPerPointer - 1
+	pointersPerByte    = 8 / bitsPerPointer
+	bitPtrMask         = bitsMask << 2
+	maxGCMask          = 0 // disabled because wastes several bytes of memory
+	bitsDead           = 0
+	bitsPointer        = 2
+
+	bitMiddle    = 0
+	bitBoundary  = 1
+	bitAllocated = 2
+	bitMarked    = 3
+	bitMask      = bitMiddle | bitBoundary | bitAllocated | bitMarked
 )
 
 // All zero-sized allocations return a pointer to this byte.
@@ -168,14 +186,112 @@
 		size = uintptr(s.elemsize)
 	}
 
-	// TODO: write markallocated in Go
-	mp.ptrarg[0] = x
-	mp.scalararg[0] = uint(size)
-	mp.scalararg[1] = uint(size0)
-	mp.ptrarg[1] = unsafe.Pointer(typ)
-	mp.scalararg[2] = uint(flags & flagNoScan)
-	onM(&markallocated_m)
+	// From here till marked label marking the object as allocated
+	// and storing type info in the GC bitmap.
+	arena_start := uintptr(unsafe.Pointer(mheap_.arena_start))
+	off := (uintptr(x) - arena_start) / ptrSize
+	xbits := (*uintptr)(unsafe.Pointer(arena_start - off/wordsPerBitmapWord*ptrSize - ptrSize))
+	shift := (off % wordsPerBitmapWord) * gcBits
+	if debugMalloc && (((*xbits)>>shift)&bitMask) != bitBoundary {
+		gothrow("bad bits in markallocated")
+	}
 
+	var ti, te uintptr
+	var ptrmask *uint8
+	if flags&flagNoScan != 0 {
+		// bitsDead in the first quadruple means don't scan.
+		if size == ptrSize {
+			*xbits = (*xbits & ^((bitBoundary | bitPtrMask) << shift)) | ((bitAllocated + (bitsDead << 2)) << shift)
+		} else {
+			xbitsb := (*uint8)(add(unsafe.Pointer(xbits), shift/8))
+			*xbitsb = bitAllocated + (bitsDead << 2)
+		}
+		goto marked
+	}
+	if size == ptrSize {
+		// It's one word and it has pointers, it must be a pointer.
+		*xbits = (*xbits & ^((bitBoundary | bitPtrMask) << shift)) | ((bitAllocated | (bitsPointer << 2)) << shift)
+		goto marked
+	}
+	if typ != nil && (uintptr(typ.gc[0])|uintptr(typ.gc[1])) != 0 && uintptr(typ.size) > ptrSize {
+		if typ.kind&kindGCProg != 0 {
+			nptr := (uintptr(typ.size) + ptrSize - 1) / ptrSize
+			masksize := nptr
+			if masksize%2 != 0 {
+				masksize *= 2 // repeated
+			}
+			masksize = masksize * pointersPerByte / 8 // 4 bits per word
+			masksize++                                // unroll flag in the beginning
+			if masksize > maxGCMask && typ.gc[1] != 0 {
+				// If the mask is too large, unroll the program directly
+				// into the GC bitmap. It's 7 times slower than copying
+				// from the pre-unrolled mask, but saves 1/16 of type size
+				// memory for the mask.
+				mp.ptrarg[0] = x
+				mp.ptrarg[1] = unsafe.Pointer(typ)
+				mp.scalararg[0] = uint(size)
+				mp.scalararg[1] = uint(size0)
+				onM(&unrollgcproginplace_m)
+				goto marked
+			}
+			ptrmask = (*uint8)(unsafe.Pointer(uintptr(typ.gc[0])))
+			// Check whether the program is already unrolled.
+			if uintptr(goatomicloadp(unsafe.Pointer(ptrmask)))&0xff == 0 {
+				mp.ptrarg[0] = unsafe.Pointer(typ)
+				onM(&unrollgcprog_m)
+			}
+			ptrmask = (*uint8)(add(unsafe.Pointer(ptrmask), 1)) // skip the unroll flag byte
+		} else {
+			ptrmask = (*uint8)(unsafe.Pointer(&typ.gc[0])) // embed mask
+		}
+		if size == 2*ptrSize {
+			xbitsb := (*uint8)(add(unsafe.Pointer(xbits), shift/8))
+			*xbitsb = *ptrmask | bitAllocated
+			goto marked
+		}
+		te = uintptr(typ.size) / ptrSize
+		// If the type occupies odd number of words, its mask is repeated.
+		if te%2 == 0 {
+			te /= 2
+		}
+	}
+	if size == 2*ptrSize {
+		xbitsb := (*uint8)(add(unsafe.Pointer(xbits), shift/8))
+		*xbitsb = (bitsPointer << 2) | (bitsPointer << 6) | bitAllocated
+		goto marked
+	}
+	// Copy pointer bitmask into the bitmap.
+	for i := uintptr(0); i < size0; i += 2 * ptrSize {
+		v := uint8((bitsPointer << 2) | (bitsPointer << 6))
+		if ptrmask != nil {
+			v = *(*uint8)(add(unsafe.Pointer(ptrmask), ti))
+			ti++
+			if ti == te {
+				ti = 0
+			}
+		}
+		if i == 0 {
+			v |= bitAllocated
+		}
+		if i+ptrSize == size0 {
+			v &= ^uint8(bitPtrMask << 4)
+		}
+
+		off := (uintptr(x) + i - arena_start) / ptrSize
+		xbits := (*uintptr)(unsafe.Pointer(arena_start - off/wordsPerBitmapWord*ptrSize - ptrSize))
+		shift := (off % wordsPerBitmapWord) * gcBits
+		xbitsb := (*uint8)(add(unsafe.Pointer(xbits), shift/8))
+		*xbitsb = v
+	}
+	if size0%(2*ptrSize) == 0 && size0 < size {
+		// Mark the word after last object's word as bitsDead.
+		off := (uintptr(x) + size0 - arena_start) / ptrSize
+		xbits := (*uintptr)(unsafe.Pointer(arena_start - off/wordsPerBitmapWord*ptrSize - ptrSize))
+		shift := (off % wordsPerBitmapWord) * gcBits
+		xbitsb := (*uint8)(add(unsafe.Pointer(xbits), shift/8))
+		*xbitsb = bitsDead << 2
+	}
+marked:
 	mp.mallocing = 0
 
 	if raceenabled {