runtime: add GODEBUG=sbrk=1 to bypass memory allocator (and GC)

To reduce lock contention in this mode, make the persistent allocation state
per-P, which means at most 64 kB of overhead x $GOMAXPROCS, which should be
completely tolerable.

Change-Id: I34ca95e77d7e67130e30822e5a4aff6772b1a1c5
Reviewed-on: https://go-review.googlesource.com/7740
Reviewed-by: Rick Hudson <rlh@golang.org>
diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go
index 0b7b89a..11d6f94 100644
--- a/src/runtime/malloc.go
+++ b/src/runtime/malloc.go
@@ -483,16 +483,23 @@
 	if gcphase == _GCmarktermination {
 		throw("mallocgc called with gcphase == _GCmarktermination")
 	}
-	shouldhelpgc := false
+
 	if size == 0 {
 		return unsafe.Pointer(&zerobase)
 	}
-	dataSize := size
 
 	if flags&flagNoScan == 0 && typ == nil {
 		throw("malloc missing type")
 	}
 
+	if debug.sbrk != 0 {
+		align := uintptr(16)
+		if typ != nil {
+			align = uintptr(typ.align)
+		}
+		return persistentalloc(size, align, &memstats.other_sys)
+	}
+
 	// Set mp.mallocing to keep from being preempted by GC.
 	mp := acquirem()
 	if mp.mallocing != 0 {
@@ -500,6 +507,8 @@
 	}
 	mp.mallocing = 1
 
+	shouldhelpgc := false
+	dataSize := size
 	c := gomcache()
 	var s *mspan
 	var x unsafe.Pointer
@@ -761,12 +770,16 @@
 	mProf_Malloc(x, size)
 }
 
-var persistent struct {
-	lock mutex
+type persistentAlloc struct {
 	base unsafe.Pointer
 	off  uintptr
 }
 
+var globalAlloc struct {
+	mutex
+	persistentAlloc
+}
+
 // Wrapper around sysAlloc that can allocate small chunks.
 // There is no associated free operation.
 // Intended for things like function/type/debug-related persistent data.
@@ -795,19 +808,31 @@
 		return sysAlloc(size, stat)
 	}
 
-	lock(&persistent.lock)
+	mp := acquirem()
+	var persistent *persistentAlloc
+	if mp != nil && mp.p != nil {
+		persistent = &mp.p.palloc
+	} else {
+		lock(&globalAlloc.mutex)
+		persistent = &globalAlloc.persistentAlloc
+	}
 	persistent.off = round(persistent.off, align)
 	if persistent.off+size > chunk || persistent.base == nil {
 		persistent.base = sysAlloc(chunk, &memstats.other_sys)
 		if persistent.base == nil {
-			unlock(&persistent.lock)
+			if persistent == &globalAlloc.persistentAlloc {
+				unlock(&globalAlloc.mutex)
+			}
 			throw("runtime: cannot allocate memory")
 		}
 		persistent.off = 0
 	}
 	p := add(persistent.base, persistent.off)
 	persistent.off += size
-	unlock(&persistent.lock)
+	releasem(mp)
+	if persistent == &globalAlloc.persistentAlloc {
+		unlock(&globalAlloc.mutex)
+	}
 
 	if stat != &memstats.other_sys {
 		xadd64(stat, int64(size))