runtime: track scan work performed during concurrent mark

This tracks the amount of scan work in terms of scanned pointers
during the concurrent mark phase. We'll use this information to
estimate scan work for the next cycle.

Currently scan work is accumulated in gcWork's work counter, and dispose
atomically adds it to a global work counter. dispose happens
relatively infrequently, so the contention on the global counter
should be low. If this turns out to be an issue, we can reduce the
number of disposes, and if it's still a problem, we can switch to
per-P counters.

Change-Id: Iac0364c466ee35fab781dbbbe7970a5f3c4e1fc1
Reviewed-on: https://go-review.googlesource.com/8832
Reviewed-by: Rick Hudson <rlh@golang.org>
diff --git a/src/runtime/mgcmark.go b/src/runtime/mgcmark.go
index d20473c..660a7d4 100644
--- a/src/runtime/mgcmark.go
+++ b/src/runtime/mgcmark.go
@@ -163,6 +163,7 @@
 	// Root aren't part of the heap, so don't count them toward
 	// marked heap bytes.
 	gcw.bytesMarked = 0
+	gcw.scanWork = 0
 	gcw.dispose()
 }
 
@@ -191,6 +192,10 @@
 		gcw.initFromCache()
 		const n = len(workbuf{}.obj)
 		gcDrainN(&gcw, n) // drain upto one buffer's worth of objects
+		// TODO(austin): This is the vast majority of our
+		// disposes. Instead of constantly disposing, keep a
+		// per-P gcWork cache (probably combined with the
+		// write barrier wbuf cache).
 		gcw.dispose()
 	case _GCmarktermination:
 		// We should never be here since the world is stopped.
@@ -267,6 +272,7 @@
 	// Stacks aren't part of the heap, so don't count them toward
 	// marked heap bytes.
 	gcw.bytesMarked = 0
+	gcw.scanWork = 0
 	gcw.disposeToCache()
 	gp.gcscanvalid = true
 }
@@ -425,6 +431,7 @@
 func scanobject(b, n uintptr, ptrmask *uint8, gcw *gcWork) {
 	arena_start := mheap_.arena_start
 	arena_used := mheap_.arena_used
+	scanWork := int64(0)
 
 	// Find bits of the beginning of the object.
 	var hbits heapBits
@@ -465,6 +472,16 @@
 
 		obj := *(*uintptr)(unsafe.Pointer(b + i))
 
+		// Track the scan work performed as a way to estimate
+		// GC time. We use the number of pointers scanned
+		// because pointer scanning dominates the cost of
+		// scanning.
+		//
+		// TODO(austin): Consider counting only pointers into
+		// the heap, since nil and non-heap pointers are
+		// probably cheap to scan.
+		scanWork++
+
 		// At this point we have extracted the next potential pointer.
 		// Check if it points into heap.
 		if obj == 0 || obj < arena_start || obj >= arena_used {
@@ -481,6 +498,7 @@
 		}
 	}
 	gcw.bytesMarked += uint64(n)
+	gcw.scanWork += scanWork
 }
 
 // Shade the object if it isn't already.