runtime: fix two garbage collector bugs

First, call clearcheckmarks immediately after changing checkmark,
so that there is less time when the checkmark flag and the bitmap
are inconsistent. The tiny gap between the two lines is fine, because
the world is stopped. Before, the gap was much larger and included
such code as "go bgsweep()", which allocated.

Second, modify gcphase only when the world is stopped.
As written, gcscan_m was changing gcphase from 0 to GCscan
and back to 0 while other goroutines were running.
Another goroutine running at the same time might decide to
sleep, see GCscan, call gcphasework, and start "helping" by
scanning its stack. That's fine, except that if gcphase flips back
to 0 as the goroutine calls scanblock, it will start draining the
work buffers prematurely.

Both of these were found wbshadow=2 (and a lot of hard work).
Eventually that will run automatically, but right now it still
doesn't quite work for all.bash, due to mmap conflicts with
pthread-created threads.

Change-Id: I99aa8210cff9c6e7d0a1b62c75be32a23321897b
Reviewed-on: https://go-review.googlesource.com/2340
Reviewed-by: Rick Hudson <rlh@golang.org>
diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go
index 22c0dfe..772d330 100644
--- a/src/runtime/malloc.go
+++ b/src/runtime/malloc.go
@@ -493,6 +493,7 @@
 	systemstack(stoptheworld)
 	systemstack(finishsweep_m) // finish sweep before we start concurrent scan.
 	if force == 0 {            // Do as much work concurrently as possible
+		gcphase = _GCscan
 		systemstack(starttheworld)
 		gctimer.cycle.scan = nanotime()
 		// Do a concurrent heap scan before we stop the world.
diff --git a/src/runtime/mgc.go b/src/runtime/mgc.go
index 950ea35..35edd8a 100644
--- a/src/runtime/mgc.go
+++ b/src/runtime/mgc.go
@@ -408,8 +408,9 @@
 // obj is the start of an object with mark mbits.
 // If it isn't already marked, mark it and enqueue into workbuf.
 // Return possibly new workbuf to use.
+// base and off are for debugging only and could be removed.
 //go:nowritebarrier
-func greyobject(obj uintptr, mbits *markbits, wbuf *workbuf) *workbuf {
+func greyobject(obj uintptr, base, off uintptr, mbits *markbits, wbuf *workbuf) *workbuf {
 	// obj should be start of allocation, and so must be at least pointer-aligned.
 	if obj&(ptrSize-1) != 0 {
 		throw("greyobject: obj not pointer-aligned")
@@ -418,6 +419,7 @@
 	if checkmark {
 		if !ismarked(mbits) {
 			print("runtime:greyobject: checkmarks finds unexpected unmarked object obj=", hex(obj), ", mbits->bits=", hex(mbits.bits), " *mbits->bitp=", hex(*mbits.bitp), "\n")
+			print("runtime: found obj at *(", hex(base), "+", hex(off), ")\n")
 
 			k := obj >> _PageShift
 			x := k
@@ -568,7 +570,7 @@
 		if obj == 0 {
 			continue
 		}
-		wbuf = greyobject(obj, &mbits, wbuf)
+		wbuf = greyobject(obj, b, i, &mbits, wbuf)
 	}
 	return wbuf
 }
@@ -604,6 +606,11 @@
 
 	keepworking := b == 0
 
+	if gcphase != _GCmark && gcphase != _GCmarktermination {
+		println("gcphase", gcphase)
+		throw("scanblock phase")
+	}
+
 	// ptrmask can have 2 possible values:
 	// 1. nil - obtain pointer mask from GC bitmap.
 	// 2. pointer to a compact mask (for stacks and data).
@@ -1028,7 +1035,7 @@
 	var mbits markbits
 	obj := objectstart(b, &mbits)
 	if obj != 0 {
-		wbuf = greyobject(obj, &mbits, wbuf) // augments the wbuf
+		wbuf = greyobject(obj, 0, 0, &mbits, wbuf) // augments the wbuf
 	}
 	putpartial(wbuf)
 }
@@ -1782,9 +1789,7 @@
 
 	checkmark = true
 	clearcheckmarkbits()        // Converts BitsDead to BitsScalar.
-	gc_m(startTime, eagersweep) // turns off checkmark
-	// Work done, fixed up the GC bitmap to remove the checkmark bits.
-	clearcheckmarkbits()
+	gc_m(startTime, eagersweep) // turns off checkmark + calls clearcheckmarkbits
 }
 
 //go:nowritebarrier
@@ -1833,7 +1838,6 @@
 	// by placing it onto a scanenqueue state and then calling
 	// runtimeĀ·restartg(mastergp) to make it Grunnable.
 	// At the bottom we will want to return this p back to the scheduler.
-	oldphase := gcphase
 
 	// Prepare flag indicating that the scan has not been completed.
 	lock(&allglock)
@@ -1847,7 +1851,6 @@
 	work.nwait = 0
 	work.ndone = 0
 	work.nproc = 1 // For now do not do this in parallel.
-	gcphase = _GCscan
 	//	ackgcphase is not needed since we are not scanning running goroutines.
 	parforsetup(work.markfor, work.nproc, uint32(_RootCount+local_allglen), nil, false, markroot)
 	parfordo(work.markfor)
@@ -1862,7 +1865,6 @@
 	}
 	unlock(&allglock)
 
-	gcphase = oldphase
 	casgstatus(mastergp, _Gwaiting, _Grunning)
 	// Let the g that called us continue to run.
 }
@@ -2035,6 +2037,7 @@
 			return
 		}
 		checkmark = false // done checking marks
+		clearcheckmarkbits()
 	}
 
 	// Cache the current array for sweeping.