runtime: remove mcentral.nmalloc and add mcache.local_nsmallalloc

This change removes mcentral.nmalloc and adds mcache.local_nsmallalloc,
which fulfills the same role but may be accessed non-atomically. It also
moves responsibility for updating heap_live and local_nsmallalloc into
mcache functions.
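
For context, the non-atomic access is safe because each mcache is owned
by a single P and the counters are only aggregated while the world is
stopped (see the mstats.go hunk below). The following is a minimal,
hedged sketch of that pattern, not runtime code; the names (perPCache,
readStatsSTW, numSizeClasses) are illustrative stand-ins:

	package main

	import "fmt"

	// numSizeClasses stands in for the runtime's _NumSizeClasses.
	const numSizeClasses = 68

	// perPCache stands in for mcache: it is owned by exactly one P,
	// so its counters can be bumped with plain, non-atomic stores.
	type perPCache struct {
		nsmallalloc [numSizeClasses]uintptr
	}

	// alloc runs only on the owning P; no atomics are needed.
	func (c *perPCache) alloc(sizeclass int) {
		c.nsmallalloc[sizeclass]++
	}

	// readStatsSTW models the aggregation in updatememstats: with
	// the world stopped, every cache is quiescent, so summing the
	// plain counters is safe and consistent.
	func readStatsSTW(caches []*perPCache) (nmalloc uint64) {
		for _, c := range caches {
			for i := range c.nsmallalloc {
				nmalloc += uint64(c.nsmallalloc[i])
			}
		}
		return nmalloc
	}

	func main() {
		caches := []*perPCache{{}, {}}
		caches[0].alloc(5)
		caches[0].alloc(5)
		caches[1].alloc(7)
		fmt.Println(readStatsSTW(caches)) // 3
	}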

As a result of this change, mcache is now the sole source of truth for
malloc stats. It is also solely responsible for updating heap_live and
for performing the follow-up work a heap_live update requires (emitting
a heap trace event and revising the GC pacer). The overall improvement
here is in code organization: previously, malloc stats were fairly
scattered; now they have a single home, and nearly all the required
manipulations live in a single file.
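
As a concrete illustration of the accounting pattern that refill and
releaseAll now own, here is a hedged, self-contained sketch (not the
runtime code): a freshly cached span has all of its free slots counted
as allocated up front, and the slots that were never used are
subtracted back out when the span is released. The real code credits
heap_live with the span's whole page range (npages*pageSize) minus the
bytes already in use, and skips the heap_live correction for stale
spans; both details are simplified away here.

	package main

	import "fmt"

	// span and cache are illustrative stand-ins for mspan and mcache.
	type span struct {
		nelems     uintptr // total object slots in the span
		allocCount uintptr // slots currently allocated
		elemsize   uintptr // size of each object in bytes
	}

	type cache struct {
		nsmallalloc uintptr // one size class's local_nsmallalloc
		heapLive    int64   // stand-in for memstats.heap_live
	}

	// refill conservatively assumes every free slot will be allocated.
	func (c *cache) refill(s *span) {
		free := s.nelems - s.allocCount
		c.nsmallalloc += free
		c.heapLive += int64(free) * int64(s.elemsize) // simplified heap_live credit
	}

	// release undoes the conservative count for slots that went unused.
	func (c *cache) release(s *span) {
		unused := s.nelems - s.allocCount
		c.nsmallalloc -= unused
		c.heapLive -= int64(unused) * int64(s.elemsize)
	}

	func main() {
		c := &cache{}
		s := &span{nelems: 64, allocCount: 0, elemsize: 16}

		c.refill(s)       // counts all 64 slots as allocated
		s.allocCount = 10 // only 10 objects end up allocated
		c.release(s)      // subtracts the 54 unused slots

		fmt.Println(c.nsmallalloc, c.heapLive) // 10 160
	}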

Change-Id: I7e93fa297c1debf17e3f2a0d68aeed28a9c6af00
Reviewed-on: https://go-review.googlesource.com/c/go/+/246966
Trust: Michael Knyszek <mknyszek@google.com>
Run-TryBot: Michael Knyszek <mknyszek@google.com>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Michael Pratt <mpratt@google.com>
diff --git a/src/runtime/mcache.go b/src/runtime/mcache.go
index 3657c0b..4d2ba6d 100644
--- a/src/runtime/mcache.go
+++ b/src/runtime/mcache.go
@@ -51,6 +51,7 @@
 	// application.
 	local_largealloc  uintptr                  // bytes allocated for large objects
 	local_nlargealloc uintptr                  // number of large object allocations
+	local_nsmallalloc [_NumSizeClasses]uintptr // number of allocs for small objects
 	local_largefree   uintptr                  // bytes freed for large objects (>maxsmallsize)
 	local_nlargefree  uintptr                  // number of frees for large objects (>maxsmallsize)
 	local_nsmallfree  [_NumSizeClasses]uintptr // number of frees for small objects (<=maxsmallsize)
@@ -138,6 +139,10 @@
 	c.local_largealloc = 0
 	d.local_nlargealloc += c.local_nlargealloc
 	c.local_nlargealloc = 0
+	for i := range c.local_nsmallalloc {
+		d.local_nsmallalloc[i] += c.local_nsmallalloc[i]
+		c.local_nsmallalloc[i] = 0
+	}
 	d.local_largefree += c.local_largefree
 	c.local_largefree = 0
 	d.local_nlargefree += c.local_nlargefree
@@ -182,6 +187,20 @@
 	// sweeping in the next sweep phase.
 	s.sweepgen = mheap_.sweepgen + 3
 
+	// Assume all objects from this span will be allocated in the
+	// mcache. If it gets uncached, we'll adjust this.
+	c.local_nsmallalloc[spc.sizeclass()] += uintptr(s.nelems) - uintptr(s.allocCount)
+	usedBytes := uintptr(s.allocCount) * s.elemsize
+	atomic.Xadd64(&memstats.heap_live, int64(s.npages*pageSize)-int64(usedBytes))
+	if trace.enabled {
+		// heap_live changed.
+		traceHeapAlloc()
+	}
+	if gcBlackenEnabled != 0 {
+		// heap_live changed.
+		gcController.revise()
+	}
+
 	c.alloc[spc] = s
 }
 
@@ -227,9 +246,24 @@
 }
 
 func (c *mcache) releaseAll() {
+	sg := mheap_.sweepgen
 	for i := range c.alloc {
 		s := c.alloc[i]
 		if s != &emptymspan {
+			// Adjust nsmallalloc in case the span wasn't fully allocated.
+			n := uintptr(s.nelems) - uintptr(s.allocCount)
+			c.local_nsmallalloc[spanClass(i).sizeclass()] -= n
+			if s.sweepgen != sg+1 {
+				// refill conservatively counted unallocated slots in heap_live.
+				// Undo this.
+				//
+				// If this span was cached before sweep, then
+				// heap_live was totally recomputed since
+				// caching this span, so we don't do this for
+				// stale spans.
+				atomic.Xadd64(&memstats.heap_live, -int64(n)*int64(s.elemsize))
+			}
+			// Release the span to the mcentral.
 			mheap_.central[i].mcentral.uncacheSpan(s)
 			c.alloc[i] = &emptymspan
 		}
diff --git a/src/runtime/mcentral.go b/src/runtime/mcentral.go
index ed49e01..97fe92c 100644
--- a/src/runtime/mcentral.go
+++ b/src/runtime/mcentral.go
@@ -44,11 +44,6 @@
 	// encounter swept spans, and these should be ignored.
 	partial [2]spanSet // list of spans with a free object
 	full    [2]spanSet // list of spans with no free objects
-
-	// nmalloc is the cumulative count of objects allocated from
-	// this mcentral, assuming all spans in mcaches are
-	// fully-allocated. Written atomically, read under STW.
-	nmalloc uint64
 }
 
 // Initialize a single central free list.
@@ -178,19 +173,6 @@
 	if n == 0 || s.freeindex == s.nelems || uintptr(s.allocCount) == s.nelems {
 		throw("span has no free objects")
 	}
-	// Assume all objects from this span will be allocated in the
-	// mcache. If it gets uncached, we'll adjust this.
-	atomic.Xadd64(&c.nmalloc, int64(n))
-	usedBytes := uintptr(s.allocCount) * s.elemsize
-	atomic.Xadd64(&memstats.heap_live, int64(spanBytes)-int64(usedBytes))
-	if trace.enabled {
-		// heap_live changed.
-		traceHeapAlloc()
-	}
-	if gcBlackenEnabled != 0 {
-		// heap_live changed.
-		gcController.revise()
-	}
 	freeByteBase := s.freeindex &^ (64 - 1)
 	whichByte := freeByteBase / 8
 	// Init alloc bits cache.
@@ -228,27 +210,6 @@
 		// Indicate that s is no longer cached.
 		atomic.Store(&s.sweepgen, sg)
 	}
-	n := int(s.nelems) - int(s.allocCount)
-
-	// Fix up statistics.
-	if n > 0 {
-		// cacheSpan updated alloc assuming all objects on s
-		// were going to be allocated. Adjust for any that
-		// weren't. We must do this before potentially
-		// sweeping the span.
-		atomic.Xadd64(&c.nmalloc, -int64(n))
-
-		if !stale {
-			// (*mcentral).cacheSpan conservatively counted
-			// unallocated slots in heap_live. Undo this.
-			//
-			// If this span was cached before sweep, then
-			// heap_live was totally recomputed since
-			// caching this span, so we don't do this for
-			// stale spans.
-			atomic.Xadd64(&memstats.heap_live, -int64(n)*int64(s.elemsize))
-		}
-	}
 
 	// Put the span in the appropriate place.
 	if stale {
@@ -256,7 +217,7 @@
 		// the right list.
 		s.sweep(false)
 	} else {
-		if n > 0 {
+		if int(s.nelems)-int(s.allocCount) > 0 {
 			// Put it back on the partial swept list.
 			c.partialSwept(sg).push(s)
 		} else {
diff --git a/src/runtime/mstats.go b/src/runtime/mstats.go
index d9acb36..44cf17c 100644
--- a/src/runtime/mstats.go
+++ b/src/runtime/mstats.go
@@ -561,17 +561,6 @@
 	// Collect allocation stats. This is safe and consistent
 	// because the world is stopped.
 	var smallFree, totalAlloc, totalFree uint64
-	// Collect per-spanclass stats.
-	for spc := range mheap_.central {
-		// The mcaches are now empty, so mcentral stats are
-		// up-to-date.
-		c := &mheap_.central[spc].mcentral
-		memstats.nmalloc += c.nmalloc
-		i := spanClass(spc).sizeclass()
-		memstats.by_size[i].nmalloc += c.nmalloc
-		totalAlloc += c.nmalloc * uint64(class_to_size[i])
-	}
-
 	for _, p := range allp {
 		c := p.mcache
 		if c == nil {
@@ -585,12 +574,17 @@
 
 		// Collect per-sizeclass stats.
 		for i := 0; i < _NumSizeClasses; i++ {
+			// Malloc stats.
+			memstats.nmalloc += uint64(c.local_nsmallalloc[i])
+			memstats.by_size[i].nmalloc += uint64(c.local_nsmallalloc[i])
+			totalAlloc += uint64(c.local_nsmallalloc[i]) * uint64(class_to_size[i])
+
+			// Free stats.
 			memstats.nfree += uint64(c.local_nsmallfree[i])
 			memstats.by_size[i].nfree += uint64(c.local_nsmallfree[i])
 			smallFree += uint64(c.local_nsmallfree[i]) * uint64(class_to_size[i])
 		}
 	}
-	// Collect remaining large allocation stats.
 
 	totalFree += smallFree