runtime: make alloc count metrics truly monotonic

Right now we export alloc count metrics via the runtime/metrics package
and mark them as monotonic, but that's not actually true. As an
optimization, the runtime assumes a span is always fully allocated
before being uncached, and updates the accounting as such. In the rare
case that this assumption is wrong, the span has enough information to
back out the slots that did not get allocated.

This change uses 16 bits of padding in the mspan to house another field
that records how many of the mspan's slots were filled at the moment
the mspan was cached. This information is enough to compute an exact
allocation count when the span is uncached, allowing us to make the
metrics truly monotonic.
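
To illustrate the accounting change, here is a standalone sketch using
simplified types and names (not the runtime's actual code): instead of
crediting all of a span's slots up front and debiting the unused ones
later, the span snapshots its alloc count when it is cached, and only
the delta is credited when it is uncached, so the exported counter
never decreases.

    package main

    import "fmt"

    // span is a simplified stand-in for the runtime's mspan; the field
    // names mirror the real ones, but everything else is illustrative.
    type span struct {
        nelems                uint16 // total slots in the span
        allocCount            uint16 // slots currently allocated
        allocCountBeforeCache uint16 // snapshot taken when the span is cached
    }

    // smallAllocCount stands in for stats.smallAllocCount[sizeclass].
    var smallAllocCount uint64

    // cacheSpan snapshots the alloc count as the span is handed to an mcache.
    func cacheSpan(s *span) {
        s.allocCountBeforeCache = s.allocCount
    }

    // uncacheSpan credits exactly the slots allocated while the span was
    // cached, so the counter never needs to be decremented.
    func uncacheSpan(s *span) {
        slotsUsed := uint64(s.allocCount) - uint64(s.allocCountBeforeCache)
        smallAllocCount += slotsUsed
        s.allocCountBeforeCache = 0
    }

    func main() {
        s := &span{nelems: 8}
        cacheSpan(s)
        s.allocCount = 3 // only 3 of 8 slots get used before uncaching
        uncacheSpan(s)
        fmt.Println(smallAllocCount) // 3; the old scheme credited 8, then -5
    }

Under the old scheme, caching the span above would have credited all 8
slots immediately and subtracted the 5 unused ones at flush time, which
is exactly the non-monotonic step the new metrics test guards against.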

Change-Id: Iaff3ca43f8745dc1bbb0232372423e014b89b920
Reviewed-on: https://go-review.googlesource.com/c/go/+/377516
Reviewed-by: Michael Pratt <mpratt@google.com>
Run-TryBot: Michael Knyszek <mknyszek@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
diff --git a/src/runtime/mcache.go b/src/runtime/mcache.go
index 86a8958..afd5afb 100644
--- a/src/runtime/mcache.go
+++ b/src/runtime/mcache.go
@@ -156,6 +156,25 @@
 			throw("bad sweepgen in refill")
 		}
 		mheap_.central[spc].mcentral.uncacheSpan(s)
+
+		// Count up how many slots were used and record it.
+		stats := memstats.heapStats.acquire()
+		slotsUsed := uintptr(s.allocCount) - uintptr(s.allocCountBeforeCache)
+		atomic.Xadduintptr(&stats.smallAllocCount[spc.sizeclass()], slotsUsed)
+
+		// Flush tinyAllocs.
+		if spc == tinySpanClass {
+			atomic.Xadduintptr(&stats.tinyAllocCount, c.tinyAllocs)
+			c.tinyAllocs = 0
+		}
+		memstats.heapStats.release()
+
+		// Update heapLive and flush scanAlloc.
+		gcController.update(int64(slotsUsed*s.elemsize), int64(c.scanAlloc))
+		c.scanAlloc = 0
+
+		// Clear the second allocCount just to be safe.
+		s.allocCountBeforeCache = 0
 	}
 
 	// Get a new cached span from the central lists.
@@ -172,24 +191,8 @@
 	// sweeping in the next sweep phase.
 	s.sweepgen = mheap_.sweepgen + 3
 
-	// Assume all objects from this span will be allocated in the
-	// mcache. If it gets uncached, we'll adjust this.
-	stats := memstats.heapStats.acquire()
-	atomic.Xadduintptr(&stats.smallAllocCount[spc.sizeclass()], uintptr(s.nelems)-uintptr(s.allocCount))
-
-	// Flush tinyAllocs.
-	if spc == tinySpanClass {
-		atomic.Xadduintptr(&stats.tinyAllocCount, c.tinyAllocs)
-		c.tinyAllocs = 0
-	}
-	memstats.heapStats.release()
-
-	// Update heapLive with the same assumption.
-	// While we're here, flush scanAlloc, since we have to call
-	// revise anyway.
-	usedBytes := uintptr(s.allocCount) * s.elemsize
-	gcController.update(int64(s.npages*pageSize)-int64(usedBytes), int64(c.scanAlloc))
-	c.scanAlloc = 0
+	// Store the current alloc count for accounting later.
+	s.allocCountBeforeCache = s.allocCount
 
 	c.alloc[spc] = s
 }
@@ -235,26 +238,16 @@
 	scanAlloc := int64(c.scanAlloc)
 	c.scanAlloc = 0
 
-	sg := mheap_.sweepgen
-	dHeapLive := int64(0)
 	for i := range c.alloc {
 		s := c.alloc[i]
 		if s != &emptymspan {
-			// Adjust nsmallalloc in case the span wasn't fully allocated.
-			n := uintptr(s.nelems) - uintptr(s.allocCount)
+			// Adjust smallAllocCount for whatever was allocated.
 			stats := memstats.heapStats.acquire()
-			atomic.Xadduintptr(&stats.smallAllocCount[spanClass(i).sizeclass()], -n)
+			slotsUsed := uintptr(s.allocCount) - uintptr(s.allocCountBeforeCache)
+			atomic.Xadduintptr(&stats.smallAllocCount[spanClass(i).sizeclass()], slotsUsed)
 			memstats.heapStats.release()
-			if s.sweepgen != sg+1 {
-				// refill conservatively counted unallocated slots in gcController.heapLive.
-				// Undo this.
-				//
-				// If this span was cached before sweep, then
-				// gcController.heapLive was totally recomputed since
-				// caching this span, so we don't do this for
-				// stale spans.
-				dHeapLive -= int64(n) * int64(s.elemsize)
-			}
+			s.allocCountBeforeCache = 0
+
 			// Release the span to the mcentral.
 			mheap_.central[i].mcentral.uncacheSpan(s)
 			c.alloc[i] = &emptymspan
@@ -270,8 +263,8 @@
 	c.tinyAllocs = 0
 	memstats.heapStats.release()
 
-	// Updated heapScan and heapLive.
-	gcController.update(dHeapLive, scanAlloc)
+	// Update heapScan.
+	gcController.update(0, scanAlloc)
 }
 
 // prepareForSweep flushes c if the system has entered a new sweep phase
diff --git a/src/runtime/metrics_test.go b/src/runtime/metrics_test.go
index 5d32ef4..4bd1408 100644
--- a/src/runtime/metrics_test.go
+++ b/src/runtime/metrics_test.go
@@ -9,6 +9,7 @@
 	"runtime/metrics"
 	"sort"
 	"strings"
+	"sync"
 	"testing"
 	"time"
 	"unsafe"
@@ -319,3 +320,88 @@
 	b.ReportMetric(float64(latencies[len(latencies)*90/100]), "p90-ns")
 	b.ReportMetric(float64(latencies[len(latencies)*99/100]), "p99-ns")
 }
+
+var readMetricsSink [1024]interface{}
+
+func TestReadMetricsCumulative(t *testing.T) {
+	// Set up the set of metrics marked cumulative.
+	descs := metrics.All()
+	var samples [2][]metrics.Sample
+	samples[0] = make([]metrics.Sample, len(descs))
+	samples[1] = make([]metrics.Sample, len(descs))
+	total := 0
+	for i := range samples[0] {
+		if !descs[i].Cumulative {
+			continue
+		}
+		samples[0][total].Name = descs[i].Name
+		total++
+	}
+	samples[0] = samples[0][:total]
+	samples[1] = samples[1][:total]
+	copy(samples[1], samples[0])
+
+	// Start some noise in the background.
+	var wg sync.WaitGroup
+	wg.Add(1)
+	done := make(chan struct{})
+	go func() {
+		defer wg.Done()
+		for {
+			// Add more things here that could influence metrics.
+			for i := 0; i < len(readMetricsSink); i++ {
+				readMetricsSink[i] = make([]byte, 1024)
+				select {
+				case <-done:
+					return
+				default:
+				}
+			}
+			runtime.GC()
+		}
+	}()
+
+	sum := func(us []uint64) uint64 {
+		total := uint64(0)
+		for _, u := range us {
+			total += u
+		}
+		return total
+	}
+
+	// Populate the first generation.
+	metrics.Read(samples[0])
+
+	// Check to make sure that these metrics only grow monotonically.
+	for gen := 1; gen < 10; gen++ {
+		metrics.Read(samples[gen%2])
+		for i := range samples[gen%2] {
+			name := samples[gen%2][i].Name
+			vNew, vOld := samples[gen%2][i].Value, samples[1-(gen%2)][i].Value
+
+			switch vNew.Kind() {
+			case metrics.KindUint64:
+				new := vNew.Uint64()
+				old := vOld.Uint64()
+				if new < old {
+					t.Errorf("%s decreased: %d < %d", name, new, old)
+				}
+			case metrics.KindFloat64:
+				new := vNew.Float64()
+				old := vOld.Float64()
+				if new < old {
+					t.Errorf("%s decreased: %f < %f", name, new, old)
+				}
+			case metrics.KindFloat64Histogram:
+				new := sum(vNew.Float64Histogram().Counts)
+				old := sum(vOld.Float64Histogram().Counts)
+				if new < old {
+					t.Errorf("%s counts decreased: %d < %d", name, new, old)
+				}
+			}
+		}
+	}
+	close(done)
+
+	wg.Wait()
+}
diff --git a/src/runtime/mheap.go b/src/runtime/mheap.go
index d99363d..1c98afc 100644
--- a/src/runtime/mheap.go
+++ b/src/runtime/mheap.go
@@ -449,16 +449,17 @@
 	// if sweepgen == h->sweepgen + 3, the span was swept and then cached and is still cached
 	// h->sweepgen is incremented by 2 after every GC
 
-	sweepgen    uint32
-	divMul      uint32        // for divide by elemsize
-	allocCount  uint16        // number of allocated objects
-	spanclass   spanClass     // size class and noscan (uint8)
-	state       mSpanStateBox // mSpanInUse etc; accessed atomically (get/set methods)
-	needzero    uint8         // needs to be zeroed before allocation
-	elemsize    uintptr       // computed from sizeclass or from npages
-	limit       uintptr       // end of data in span
-	speciallock mutex         // guards specials list
-	specials    *special      // linked list of special records sorted by offset.
+	sweepgen              uint32
+	divMul                uint32        // for divide by elemsize
+	allocCount            uint16        // number of allocated objects
+	spanclass             spanClass     // size class and noscan (uint8)
+	state                 mSpanStateBox // mSpanInUse etc; accessed atomically (get/set methods)
+	needzero              uint8         // needs to be zeroed before allocation
+	allocCountBeforeCache uint16        // a copy of allocCount that is stored just before this span is cached
+	elemsize              uintptr       // computed from sizeclass or from npages
+	limit                 uintptr       // end of data in span
+	speciallock           mutex         // guards specials list
+	specials              *special      // linked list of special records sorted by offset.
 }
 
 func (s *mspan) base() uintptr {