runtime: parallelize STW mcache flushing

Currently all mcaches are flushed in a single stop-the-world (STW) root
job. Flushing one mcache takes about 5 µs, but since the flushes run
sequentially in that one job, they add about 5*GOMAXPROCS µs to the STW
pause.
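
For reference, the single job amounts to a sequential loop over all
Ps, roughly like the following sketch (releaseAll and stackcache_clear
are the existing mcache teardown helpers; the exact body is
illustrative):

	// flushallmcaches flushes the mcache of every P, one after
	// another, from within a single markroot job.
	func flushallmcaches() {
		for i := 0; ; i++ {
			p := allp[i]
			if p == nil {
				break
			}
			c := p.mcache
			if c == nil {
				continue
			}
			c.releaseAll()      // return cached spans to the mcentrals
			stackcache_clear(c) // free this P's cached stack segments
		}
	}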

Fix this by parallelizing the job: queue one root job per mcache
instead of a single job that flushes them all. Since there are exactly
GOMAXPROCS mcaches to flush, this parallelizes quite nicely and brings
the STW latency cost down to a constant 5 µs (assuming GOMAXPROCS
actually reflects the number of CPUs).
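
Concretely, gcMarkRootPrepare now queues GOMAXPROCS flush roots (see
the diff below), and each markroot job flushes a single mcache via a
per-P helper along these lines (a sketch; the gcphase check moves into
gcMarkRootPrepare, so these roots exist only at mark termination):

	// flushmcache flushes the mcache of allp[i].
	//
	// The world must be stopped.
	//
	//go:nowritebarrier
	func flushmcache(i int) {
		p := allp[i]
		if p == nil {
			return
		}
		c := p.mcache
		if c == nil {
			return
		}
		c.releaseAll()      // return cached spans to the mcentrals
		stackcache_clear(c) // free this P's cached stack segments
	}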

Updates #17503.

Change-Id: Ibefeb1c2229975d5137c6e67fac3b6c92103742d
Reviewed-on: https://go-review.googlesource.com/32033
Reviewed-by: Rick Hudson <rlh@golang.org>
diff --git a/src/runtime/mgcmark.go b/src/runtime/mgcmark.go
index 7819698..022fbf2 100644
--- a/src/runtime/mgcmark.go
+++ b/src/runtime/mgcmark.go
@@ -14,7 +14,6 @@
 
 const (
 	fixedRootFinalizers = iota
-	fixedRootFlushCaches
 	fixedRootFreeGStacks
 	fixedRootCount
 
@@ -45,6 +44,12 @@
 //
 //go:nowritebarrier
 func gcMarkRootPrepare() {
+	if gcphase == _GCmarktermination {
+		work.nFlushCacheRoots = int(gomaxprocs)
+	} else {
+		work.nFlushCacheRoots = 0
+	}
+
 	// Compute how many data and BSS root blocks there are.
 	nBlocks := func(bytes uintptr) int {
 		return int((bytes + rootBlockBytes - 1) / rootBlockBytes)
@@ -108,7 +113,7 @@
 	}
 
 	work.markrootNext = 0
-	work.markrootJobs = uint32(fixedRootCount + work.nDataRoots + work.nBSSRoots + work.nSpanRoots + work.nStackRoots + work.nRescanRoots)
+	work.markrootJobs = uint32(fixedRootCount + work.nFlushCacheRoots + work.nDataRoots + work.nBSSRoots + work.nSpanRoots + work.nStackRoots + work.nRescanRoots)
 }
 
 // gcMarkRootCheck checks that all roots have been scanned. It is
@@ -156,7 +161,8 @@
 func markroot(gcw *gcWork, i uint32) {
 	// TODO(austin): This is a bit ridiculous. Compute and store
 	// the bases in gcMarkRootPrepare instead of the counts.
-	baseData := uint32(fixedRootCount)
+	baseFlushCache := uint32(fixedRootCount)
+	baseData := baseFlushCache + uint32(work.nFlushCacheRoots)
 	baseBSS := baseData + uint32(work.nDataRoots)
 	baseSpans := baseBSS + uint32(work.nBSSRoots)
 	baseStacks := baseSpans + uint32(work.nSpanRoots)
@@ -165,6 +171,9 @@
 
 	// Note: if you add a case here, please also update heapdump.go:dumproots.
 	switch {
+	case baseFlushCache <= i && i < baseData:
+		flushmcache(int(i - baseFlushCache))
+
 	case baseData <= i && i < baseBSS:
 		for datap := &firstmoduledata; datap != nil; datap = datap.next {
 			markrootBlock(datap.data, datap.edata-datap.data, datap.gcdatamask.bytedata, gcw, int(i-baseData))
@@ -180,11 +189,6 @@
 			scanblock(uintptr(unsafe.Pointer(&fb.fin[0])), uintptr(fb.cnt)*unsafe.Sizeof(fb.fin[0]), &finptrmask[0], gcw)
 		}
 
-	case i == fixedRootFlushCaches:
-		if gcphase == _GCmarktermination { // Do not flush mcaches during concurrent phase.
-			flushallmcaches()
-		}
-
 	case i == fixedRootFreeGStacks:
 		// Only do this once per GC cycle; preferably
 		// concurrently.