runtime: release unused memory to the OS.

Periodically browse MHeap's freelists for long unused spans and release them if any.

Current hardcoded settings:
        - GC is forced if none occured over the last 2 minutes.
        - spans are handed back after 5 minutes of uselessness.

SysUnused (for Unix) is a wrapper on madvise MADV_DONTNEED on Linux and MADV_FREE on BSDs.

R=rsc, dvyukov, remyoudompheng
CC=golang-dev
https://golang.org/cl/5451057
diff --git a/src/pkg/runtime/malloc.h b/src/pkg/runtime/malloc.h
index d79c86d..5f03693 100644
--- a/src/pkg/runtime/malloc.h
+++ b/src/pkg/runtime/malloc.h
@@ -205,6 +205,7 @@
 	uint64	heap_sys;	// bytes obtained from system
 	uint64	heap_idle;	// bytes in idle spans
 	uint64	heap_inuse;	// bytes in non-idle spans
+	uint64	heap_released;	// bytes released to the OS
 	uint64	heap_objects;	// total number of allocated objects
 
 	// Statistics about allocation of low-level fixed-size structures.
@@ -220,6 +221,7 @@
 	// Statistics about garbage collector.
 	// Protected by stopping the world during GC.
 	uint64	next_gc;	// next GC (in heap_alloc time)
+	uint64  last_gc;	// last GC (in absolute time)
 	uint64	pause_total_ns;
 	uint64	pause_ns[256];
 	uint32	numgc;
@@ -304,14 +306,16 @@
 {
 	MSpan	*next;		// in a span linked list
 	MSpan	*prev;		// in a span linked list
-	MSpan	*allnext;		// in the list of all spans
+	MSpan	*allnext;	// in the list of all spans
 	PageID	start;		// starting page number
 	uintptr	npages;		// number of pages in span
 	MLink	*freelist;	// list of free objects
 	uint32	ref;		// number of allocated objects in this span
 	uint32	sizeclass;	// size class
 	uint32	state;		// MSpanInUse etc
-	byte	*limit;	// end of data in span
+	int64   unusedsince;	// First time spotted by GC in MSpanFree state
+	uintptr npreleased;	// number of pages released to the OS
+	byte	*limit;		// end of data in span
 };
 
 void	runtime·MSpan_Init(MSpan *span, PageID start, uintptr npages);
@@ -381,6 +385,7 @@
 void	runtime·MGetSizeClassInfo(int32 sizeclass, uintptr *size, int32 *npages, int32 *nobj);
 void*	runtime·MHeap_SysAlloc(MHeap *h, uintptr n);
 void	runtime·MHeap_MapBits(MHeap *h);
+void	runtime·MHeap_Scavenger(void);
 
 void*	runtime·mallocgc(uintptr size, uint32 flag, int32 dogc, int32 zeroed);
 int32	runtime·mlookup(void *v, byte **base, uintptr *size, MSpan **s);
diff --git a/src/pkg/runtime/mem.go b/src/pkg/runtime/mem.go
index 3ad906a..7668008 100644
--- a/src/pkg/runtime/mem.go
+++ b/src/pkg/runtime/mem.go
@@ -17,11 +17,12 @@
 	Frees      uint64 // number of frees
 
 	// Main allocation heap statistics.
-	HeapAlloc   uint64 // bytes allocated and still in use
-	HeapSys     uint64 // bytes obtained from system
-	HeapIdle    uint64 // bytes in idle spans
-	HeapInuse   uint64 // bytes in non-idle span
-	HeapObjects uint64 // total number of allocated objects
+	HeapAlloc    uint64 // bytes allocated and still in use
+	HeapSys      uint64 // bytes obtained from system
+	HeapIdle     uint64 // bytes in idle spans
+	HeapInuse    uint64 // bytes in non-idle span
+	HeapReleased uint64 // bytes released to the OS
+	HeapObjects  uint64 // total number of allocated objects
 
 	// Low-level fixed-size structure allocator statistics.
 	//	Inuse is bytes used now.
@@ -35,7 +36,8 @@
 	BuckHashSys uint64 // profiling bucket hash table
 
 	// Garbage collector statistics.
-	NextGC       uint64
+	NextGC       uint64 // next run in HeapAlloc time (bytes)
+	LastGC       uint64 // last run in absolute time (ns)
 	PauseTotalNs uint64
 	PauseNs      [256]uint64 // most recent GC pause times
 	NumGC        uint32
diff --git a/src/pkg/runtime/mgc0.c b/src/pkg/runtime/mgc0.c
index 1b95928..8efa7af 100644
--- a/src/pkg/runtime/mgc0.c
+++ b/src/pkg/runtime/mgc0.c
@@ -716,8 +716,10 @@
 	byte *p;
 	MCache *c;
 	byte *arena_start;
+	int64 now;
 
 	arena_start = runtime·mheap.arena_start;
+	now = runtime·nanotime();
 
 	for(;;) {
 		s = work.spans;
@@ -726,6 +728,11 @@
 		if(!runtime·casp(&work.spans, s, s->allnext))
 			continue;
 
+		// Stamp newly unused spans. The scavenger will use that
+		// info to potentially give back some pages to the OS.
+		if(s->state == MSpanFree && s->unusedsince == 0)
+			s->unusedsince = now;
+
 		if(s->state != MSpanInUse)
 			continue;
 
@@ -963,6 +970,7 @@
 	obj1 = mstats.nmalloc - mstats.nfree;
 
 	t3 = runtime·nanotime();
+	mstats.last_gc = t3;
 	mstats.pause_ns[mstats.numgc%nelem(mstats.pause_ns)] = t3 - t0;
 	mstats.pause_total_ns += t3 - t0;
 	mstats.numgc++;
diff --git a/src/pkg/runtime/mheap.c b/src/pkg/runtime/mheap.c
index d75c18d..a40a145 100644
--- a/src/pkg/runtime/mheap.c
+++ b/src/pkg/runtime/mheap.c
@@ -103,6 +103,8 @@
 	runtime·MSpanList_Remove(s);
 	s->state = MSpanInUse;
 	mstats.heap_idle -= s->npages<<PageShift;
+	mstats.heap_released -= s->npreleased<<PageShift;
+	s->npreleased = 0;
 
 	if(s->npages > npage) {
 		// Trim extra and put it back in the heap.
@@ -280,6 +282,8 @@
 	}
 	mstats.heap_idle += s->npages<<PageShift;
 	s->state = MSpanFree;
+	s->unusedsince = 0;
+	s->npreleased = 0;
 	runtime·MSpanList_Remove(s);
 	sp = (uintptr*)(s->start<<PageShift);
 
@@ -292,6 +296,7 @@
 		*tp |= *sp;	// propagate "needs zeroing" mark
 		s->start = t->start;
 		s->npages += t->npages;
+		s->npreleased = t->npreleased; // absorb released pages
 		p -= t->npages;
 		h->map[p] = s;
 		runtime·MSpanList_Remove(t);
@@ -304,6 +309,7 @@
 		tp = (uintptr*)(t->start<<PageShift);
 		*sp |= *tp;	// propagate "needs zeroing" mark
 		s->npages += t->npages;
+		s->npreleased += t->npreleased;
 		h->map[p + s->npages - 1] = s;
 		runtime·MSpanList_Remove(t);
 		t->state = MSpanDead;
@@ -317,8 +323,81 @@
 		runtime·MSpanList_Insert(&h->free[s->npages], s);
 	else
 		runtime·MSpanList_Insert(&h->large, s);
+}
 
-	// TODO(rsc): IncrementalScavenge() to return memory to OS.
+// Release (part of) unused memory to OS.
+// Goroutine created in runtime·schedinit.
+// Loop forever.
+void
+runtime·MHeap_Scavenger(void)
+{
+	MHeap *h;
+	MSpan *s, *list;
+	uint64 tick, now, forcegc, limit;
+	uint32 k, i;
+	uintptr released, sumreleased;
+	byte *env;
+	bool trace;
+	Note note;
+
+	// If we go two minutes without a garbage collection, force one to run.
+	forcegc = 2*60*1e9;
+	// If a span goes unused for 5 minutes after a garbage collection,
+	// we hand it back to the operating system.
+	limit = 5*60*1e9;
+	// Make wake-up period small enough for the sampling to be correct.
+	tick = forcegc < limit ? forcegc/2 : limit/2;
+
+	trace = false;
+	env = runtime·getenv("GOGCTRACE");
+	if(env != nil)
+		trace = runtime·atoi(env) > 0;
+
+	h = &runtime·mheap;
+	for(k=0;; k++) {
+		runtime·noteclear(&note);
+		runtime·entersyscall();
+		runtime·notetsleep(&note, tick);
+		runtime·exitsyscall();
+
+		runtime·lock(h);
+		now = runtime·nanotime();
+		if(now - mstats.last_gc > forcegc) {
+			runtime·unlock(h);
+			runtime·gc(1);
+			runtime·lock(h);
+			now = runtime·nanotime();
+			if (trace)
+				runtime·printf("scvg%d: GC forced\n", k);
+		}
+		sumreleased = 0;
+		for(i=0; i < nelem(h->free)+1; i++) {
+			if(i < nelem(h->free))
+				list = &h->free[i];
+			else
+				list = &h->large;
+			if(runtime·MSpanList_IsEmpty(list))
+				continue;
+			for(s=list->next; s != list; s=s->next) {
+				if(s->unusedsince != 0 && (now - s->unusedsince) > limit) {
+					released = (s->npages - s->npreleased) << PageShift;
+					mstats.heap_released += released;
+					sumreleased += released;
+					s->npreleased = s->npages;
+					runtime·SysUnused((void*)(s->start << PageShift), s->npages << PageShift);
+				}
+			}
+		}
+		runtime·unlock(h);
+
+		if(trace) {
+			if(sumreleased > 0)
+				runtime·printf("scvg%d: %p MB released\n", k, sumreleased>>20);
+			runtime·printf("scvg%d: inuse: %D, idle: %D, sys: %D, released: %D, consumed: %D (MB)\n",
+				k, mstats.heap_inuse>>20, mstats.heap_idle>>20, mstats.heap_sys>>20,
+				mstats.heap_released>>20, (mstats.heap_sys - mstats.heap_released)>>20);
+		}
+	}
 }
 
 // Initialize a new span with the given start and npages.
@@ -333,6 +412,8 @@
 	span->ref = 0;
 	span->sizeclass = 0;
 	span->state = 0;
+	span->unusedsince = 0;
+	span->npreleased = 0;
 }
 
 // Initialize an empty doubly-linked list.
diff --git a/src/pkg/runtime/proc.c b/src/pkg/runtime/proc.c
index 9a4d205..3dbf77a 100644
--- a/src/pkg/runtime/proc.c
+++ b/src/pkg/runtime/proc.c
@@ -164,6 +164,9 @@
 	}
 }
 
+// Keep trace of scavenger's goroutine for deadlock detection.
+static G *scvg;
+
 // The bootstrap sequence is:
 //
 //	call osinit
@@ -206,6 +209,8 @@
 
 	mstats.enablegc = 1;
 	m->nomemprof--;
+
+	scvg = runtime·newproc1((byte*)runtime·MHeap_Scavenger, nil, 0, 0, runtime·schedinit);
 }
 
 extern void main·init(void);
@@ -582,9 +587,12 @@
 		mput(m);
 	}
 
-	v = runtime·atomicload(&runtime·sched.atomic);
-	if(runtime·sched.grunning == 0)
-		runtime·throw("all goroutines are asleep - deadlock!");
+	// Look for deadlock situation: one single active g which happens to be scvg.
+	if(runtime·sched.grunning == 1 && runtime·sched.gwait == 0) {
+		if(scvg->status == Grunning || scvg->status == Gsyscall)
+			runtime·throw("all goroutines are asleep - deadlock!");
+	}
+
 	m->nextg = nil;
 	m->waitnextg = 1;
 	runtime·noteclear(&m->havenextg);
@@ -593,6 +601,7 @@
 	// Entersyscall might have decremented mcpu too, but if so
 	// it will see the waitstop and take the slow path.
 	// Exitsyscall never increments mcpu beyond mcpumax.
+	v = runtime·atomicload(&runtime·sched.atomic);
 	if(atomic_waitstop(v) && atomic_mcpu(v) <= atomic_mcpumax(v)) {
 		// set waitstop = 0 (known to be 1)
 		runtime·xadd(&runtime·sched.atomic, -1<<waitstopShift);