runtime: change tinyalloc, persistentalloc not to point past allocated data

During all.bash I got a crash in the GOMAXPROCS=2 runtime test reporting
that the write barrier in the assignment 'c.tiny = add(x, size)' had been
given a pointer pointing into an unexpected span. The problem is that
the tiny allocation was at the end of a span and c.tiny was now pointing
to the end of the allocation and therefore to the end of the span aka
the beginning of the next span.

Rewrite tinyalloc not to do that.

More generally, it's not okay to call add(p, size) unless you know that p
points at > (not just >=) size bytes. Similarly, pretty much any call to
roundup doesn't know how much space p points at, so those are all
broken.

Rewrite persistentalloc not to use add(p, totalsize) and not to use roundup.

There is only one use of roundup left, in vprintf, which is dead code.
I will remove that code and roundup itself in a followup CL.

Change-Id: I211e307d1a656d29087b8fd40b2b71010722fb4a
Reviewed-on: https://go-review.googlesource.com/2814
Reviewed-by: Dmitry Vyukov <dvyukov@google.com>
diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go
index 8209892..eb895a9 100644
--- a/src/runtime/malloc.go
+++ b/src/runtime/malloc.go
@@ -119,40 +119,35 @@
 			// standalone escaping variables. On a json benchmark
 			// the allocator reduces number of allocations by ~12% and
 			// reduces heap size by ~20%.
-			tinysize := uintptr(c.tinysize)
-			if size <= tinysize {
-				tiny := unsafe.Pointer(c.tiny)
-				// Align tiny pointer for required (conservative) alignment.
-				if size&7 == 0 {
-					tiny = roundup(tiny, 8)
-				} else if size&3 == 0 {
-					tiny = roundup(tiny, 4)
-				} else if size&1 == 0 {
-					tiny = roundup(tiny, 2)
-				}
-				size1 := size + (uintptr(tiny) - uintptr(unsafe.Pointer(c.tiny)))
-				if size1 <= tinysize {
-					// The object fits into existing tiny block.
-					x = tiny
-					c.tiny = (*byte)(add(x, size))
-					c.tinysize -= uintptr(size1)
-					c.local_tinyallocs++
-					if debugMalloc {
-						mp := acquirem()
-						if mp.mallocing == 0 {
-							throw("bad malloc")
-						}
-						mp.mallocing = 0
-						if mp.curg != nil {
-							mp.curg.stackguard0 = mp.curg.stack.lo + _StackGuard
-						}
-						// Note: one releasem for the acquirem just above.
-						// The other for the acquirem at start of malloc.
-						releasem(mp)
-						releasem(mp)
+			off := c.tinyoffset
+			// Align tiny pointer for required (conservative) alignment.
+			if size&7 == 0 {
+				off = round(off, 8)
+			} else if size&3 == 0 {
+				off = round(off, 4)
+			} else if size&1 == 0 {
+				off = round(off, 2)
+			}
+			if off+size <= maxTinySize {
+				// The object fits into existing tiny block.
+				x = add(c.tiny, off)
+				c.tinyoffset = off + size
+				c.local_tinyallocs++
+				if debugMalloc {
+					mp := acquirem()
+					if mp.mallocing == 0 {
+						throw("bad malloc")
 					}
-					return x
+					mp.mallocing = 0
+					if mp.curg != nil {
+						mp.curg.stackguard0 = mp.curg.stack.lo + _StackGuard
+					}
+					// Note: one releasem for the acquirem just above.
+					// The other for the acquirem at start of malloc.
+					releasem(mp)
+					releasem(mp)
 				}
+				return x
 			}
 			// Allocate a new maxTinySize block.
 			s = c.alloc[tinySizeClass]
@@ -173,9 +168,9 @@
 			(*[2]uint64)(x)[1] = 0
 			// See if we need to replace the existing tiny block with the new one
 			// based on amount of remaining free space.
-			if maxTinySize-size > tinysize {
-				c.tiny = (*byte)(add(x, size))
-				c.tinysize = uintptr(maxTinySize - size)
+			if size < c.tinyoffset {
+				c.tiny = x
+				c.tinyoffset = size
 			}
 			size = maxTinySize
 		} else {
@@ -1013,8 +1008,8 @@
 
 var persistent struct {
 	lock mutex
-	pos  unsafe.Pointer
-	end  unsafe.Pointer
+	base unsafe.Pointer
+	off  uintptr
 }
 
 // Wrapper around sysAlloc that can allocate small chunks.
@@ -1027,6 +1022,9 @@
 		maxBlock = 64 << 10 // VM reservation granularity is 64K on windows
 	)
 
+	if size == 0 {
+		throw("persistentalloc: size == 0")
+	}
 	if align != 0 {
 		if align&(align-1) != 0 {
 			throw("persistentalloc: align is not a power of 2")
@@ -1043,17 +1041,17 @@
 	}
 
 	lock(&persistent.lock)
-	persistent.pos = roundup(persistent.pos, align)
-	if uintptr(persistent.pos)+size > uintptr(persistent.end) {
-		persistent.pos = sysAlloc(chunk, &memstats.other_sys)
-		if persistent.pos == nil {
+	persistent.off = round(persistent.off, align)
+	if persistent.off+size > chunk {
+		persistent.base = sysAlloc(chunk, &memstats.other_sys)
+		if persistent.base == nil {
 			unlock(&persistent.lock)
 			throw("runtime: cannot allocate memory")
 		}
-		persistent.end = add(persistent.pos, chunk)
+		persistent.off = 0
 	}
-	p := persistent.pos
-	persistent.pos = add(persistent.pos, size)
+	p := add(persistent.base, persistent.off)
+	persistent.off += size
 	unlock(&persistent.lock)
 
 	if stat != &memstats.other_sys {