runtime: fix deadlock when gctrace
Calling ReadMemStats which does stoptheworld on m0 holding locks
was not a good idea.
Stoptheworld holding locks is a recipe for deadlocks (added check for this).
Stoptheworld on g0 may or may not work (added check for this as well).
As far as I understand scavenger will print incorrect numbers now,
as stack usage is not subtracted from heap. But it's better than deadlocking.

LGTM=khr
R=golang-codereviews, rsc, khr
CC=golang-codereviews, rlh
https://golang.org/cl/124670043
diff --git a/src/pkg/runtime/heapdump.c b/src/pkg/runtime/heapdump.c
index 63d80b8..a2d12ad 100644
--- a/src/pkg/runtime/heapdump.c
+++ b/src/pkg/runtime/heapdump.c
@@ -748,7 +748,6 @@
 	// Stop the world.
 	runtime·semacquire(&runtime·worldsema, false);
 	g->m->gcing = 1;
-	g->m->locks++;
 	runtime·stoptheworld();
 
 	// Update stats so we can dump them.
@@ -774,6 +773,7 @@
 
 	// Start up the world again.
 	g->m->gcing = 0;
+	g->m->locks++;
 	runtime·semrelease(&runtime·worldsema);
 	runtime·starttheworld();
 	g->m->locks--;
diff --git a/src/pkg/runtime/malloc.go b/src/pkg/runtime/malloc.go
index 8ee4607..578fbd1 100644
--- a/src/pkg/runtime/malloc.go
+++ b/src/pkg/runtime/malloc.go
@@ -413,6 +413,7 @@
 		return
 	}
 	releasem(mp)
+	mp = nil
 
 	if panicking != 0 {
 		return
@@ -441,7 +442,11 @@
 	startTime := gonanotime()
 	mp = acquirem()
 	mp.gcing = 1
+	releasem(mp)
 	stoptheworld()
+	if mp != acquirem() {
+		gothrow("gogc: rescheduled")
+	}
 
 	clearpools()
 
@@ -474,6 +479,7 @@
 	semrelease(&worldsema)
 	starttheworld()
 	releasem(mp)
+	mp = nil
 
 	// now that gc is done, kick off finalizer thread if needed
 	if !concurrentSweep {
diff --git a/src/pkg/runtime/mheap.c b/src/pkg/runtime/mheap.c
index 5998724..8e6190c 100644
--- a/src/pkg/runtime/mheap.c
+++ b/src/pkg/runtime/mheap.c
@@ -608,7 +608,6 @@
 {
 	uint32 i;
 	uintptr sumreleased;
-	MStats stats;
 	MHeap *h;
 	
 	h = &runtime·mheap;
@@ -618,12 +617,13 @@
 	sumreleased += scavengelist(&h->freelarge, now, limit);
 
 	if(runtime·debug.gctrace > 0) {
-		runtime·ReadMemStats(&stats);
 		if(sumreleased > 0)
 			runtime·printf("scvg%d: %D MB released\n", k, (uint64)sumreleased>>20);
+		// TODO(dvyukov): these stats are incorrect as we don't subtract stack usage from heap.
+		// But we can't call ReadMemStats on g0 holding locks.
 		runtime·printf("scvg%d: inuse: %D, idle: %D, sys: %D, released: %D, consumed: %D (MB)\n",
-			k, stats.heap_inuse>>20, stats.heap_idle>>20, stats.heap_sys>>20,
-			stats.heap_released>>20, (stats.heap_sys - stats.heap_released)>>20);
+			k, mstats.heap_inuse>>20, mstats.heap_idle>>20, mstats.heap_sys>>20,
+			mstats.heap_released>>20, (mstats.heap_sys - mstats.heap_released)>>20);
 	}
 }
 
diff --git a/src/pkg/runtime/proc.c b/src/pkg/runtime/proc.c
index 2510a42..8584cb6 100644
--- a/src/pkg/runtime/proc.c
+++ b/src/pkg/runtime/proc.c
@@ -498,6 +498,14 @@
 	P *p;
 	bool wait;
 
+	// If we hold a lock, then we won't be able to stop another M
+	// that is blocked trying to acquire the lock.
+	if(g->m->locks > 0)
+		runtime·throw("stoptheworld: holding locks");
+	// There is no evidence that stoptheworld on g0 does not work,
+	// we just don't do it today.
+	if(g == g->m->g0)
+		runtime·throw("stoptheworld: on g0");
 	runtime·lock(&runtime·sched.lock);
 	runtime·sched.stopwait = runtime·gomaxprocs;
 	runtime·atomicstore((uint32*)&runtime·sched.gcwaiting, 1);