runtime: improve scheduler fairness
Currently the global runqueue is starved if a group of goroutines
constantly respawn each other (the local runqueue never becomes empty).
Fixes #5639.

R=golang-dev, iant
CC=golang-dev
https://golang.org/cl/10042044
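
The failure mode above is essentially what the new TestTimerFairness
exercises. As a standalone sketch (illustrative only, not part of this
CL): two goroutines ready each other on every iteration, so the P's
local runqueue never drains, and before this change schedule() consulted
the global runqueue only when the local one was empty.

	package main

	import "time"

	func main() {
		ping, pong := make(chan bool), make(chan bool)
		// Each goroutine makes the other runnable as soon as it
		// runs, so the P's local runqueue never becomes empty.
		go func() {
			for {
				ping <- true
				<-pong
			}
		}()
		go func() {
			for {
				<-ping
				pong <- true
			}
		}()
		// Before this CL, the wakeup of this timed-out goroutine
		// could be starved by the pair above (see issue 5639).
		<-time.After(20 * time.Millisecond)
	}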
diff --git a/src/pkg/runtime/proc.c b/src/pkg/runtime/proc.c
index 9d2f765..c121466 100644
--- a/src/pkg/runtime/proc.c
+++ b/src/pkg/runtime/proc.c
@@ -106,7 +106,7 @@
 static G* gfget(P*);
 static void gfpurge(P*);
 static void globrunqput(G*);
-static G* globrunqget(P*);
+static G* globrunqget(P*, int32);
 static P* pidleget(void);
 static void pidleput(P*);
 static void injectglist(G*);
@@ -1024,7 +1024,7 @@
 	// global runq
 	if(runtime·sched.runqsize) {
 		runtime·lock(&runtime·sched);
-		gp = globrunqget(m->p);
+		gp = globrunqget(m->p, 0);
 		runtime·unlock(&runtime·sched);
 		if(gp)
 			return gp;
@@ -1065,7 +1065,7 @@
 		goto top;
 	}
 	if(runtime·sched.runqsize) {
-		gp = globrunqget(m->p);
+		gp = globrunqget(m->p, 0);
 		runtime·unlock(&runtime·sched);
 		return gp;
 	}
@@ -1144,6 +1144,7 @@
 schedule(void)
 {
 	G *gp;
+	uint32 tick;
 
 	if(m->locks)
 		runtime·throw("schedule: holding locks");
@@ -1154,9 +1155,23 @@
 		goto top;
 	}
 
-	gp = runqget(m->p);
-	if(gp && m->spinning)
-		runtime·throw("schedule: spinning with local work");
+	gp = nil;
+	// Check the global runnable queue once in a while to ensure fairness.
+	// Otherwise two goroutines can completely occupy the local runqueue
+	// by constantly respawning each other.
+	tick = m->p->tick;
+	// This is a fancy way to say tick%61 == 0; it uses 2 MUL instructions
+	// instead of a single DIV and so is faster on modern processors.
+	if(tick - (((uint64)tick*0x4325c53fu)>>36)*61 == 0 && runtime·sched.runqsize > 0) {
+		runtime·lock(&runtime·sched);
+		gp = globrunqget(m->p, 1);
+		runtime·unlock(&runtime·sched);
+	}
+	if(gp == nil) {
+		gp = runqget(m->p);
+		if(gp && m->spinning)
+			runtime·throw("schedule: spinning with local work");
+	}
 	if(gp == nil)
 		gp = findrunnable();
 
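
The magic constant can be checked exhaustively. Below is a small
standalone Go program (not part of the CL) verifying that the
multiply-and-shift remainder agrees with tick%61 for every 32-bit tick:
0x4325c53f is ceil(2^36/61), and the rounding error is small enough that
the quotient is exact over the whole uint32 range.

	package main

	import "fmt"

	func main() {
		// 0x4325c53f == 1126548799 == ceil(2^36/61), so
		// (tick*0x4325c53f)>>36 computes tick/61 with no DIV.
		// Exhaustive check of all 32-bit ticks (takes several seconds).
		for i := uint64(0); i < 1<<32; i++ {
			tick := uint32(i)
			q := uint32((uint64(tick) * 0x4325c53f) >> 36)
			if tick-q*61 != tick%61 {
				fmt.Printf("mismatch at tick=%d\n", tick)
				return
			}
		}
		fmt.Println("fancy remainder == tick%61 for all uint32 ticks")
	}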
@@ -2167,7 +2182,7 @@
 // Try to get a batch of G's from the global runnable queue.
 // Sched must be locked.
 static G*
-globrunqget(P *p)
+globrunqget(P *p, int32 max)
 {
 	G *gp, *gp1;
 	int32 n;
@@ -2177,6 +2192,8 @@
 	n = runtime·sched.runqsize/runtime·gomaxprocs+1;
 	if(n > runtime·sched.runqsize)
 		n = runtime·sched.runqsize;
+	if(max > 0 && n > max)
+		n = max;
 	runtime·sched.runqsize -= n;
 	if(runtime·sched.runqsize == 0)
 		runtime·sched.runqtail = nil;
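
The new max argument caps how many G's a caller may take: the existing
call sites pass 0 (no cap, proportional share), while the fairness check
in schedule() passes 1 so it steals exactly one G. The sizing arithmetic,
mirrored in a standalone sketch (batchSize is a hypothetical helper for
illustration, not runtime code):

	package main

	import "fmt"

	// batchSize mirrors globrunqget's arithmetic: each P takes a
	// proportional share of the global queue, clamped to the queue
	// length and to the optional cap max (0 means no cap).
	func batchSize(runqsize, gomaxprocs, max int32) int32 {
		n := runqsize/gomaxprocs + 1
		if n > runqsize {
			n = runqsize
		}
		if max > 0 && n > max {
			n = max
		}
		return n
	}

	func main() {
		fmt.Println(batchSize(10, 4, 0)) // 3: share of the queue, no cap
		fmt.Println(batchSize(10, 4, 1)) // 1: fairness check takes one G
		fmt.Println(batchSize(1, 8, 0))  // 1: never more than runqsize
	}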
diff --git a/src/pkg/runtime/proc_test.go b/src/pkg/runtime/proc_test.go
index 21fb9c2..83368e0 100644
--- a/src/pkg/runtime/proc_test.go
+++ b/src/pkg/runtime/proc_test.go
@@ -8,6 +8,7 @@
 	"math"
 	"runtime"
 	"sync/atomic"
+	"syscall"
 	"testing"
 	"time"
 )
@@ -107,6 +108,55 @@
 	}
 }
 
+func TestTimerFairness(t *testing.T) {
+	done := make(chan bool)
+	c := make(chan bool)
+	for i := 0; i < 2; i++ {
+		go func() {
+			for {
+				select {
+				case c <- true:
+				case <-done:
+					return
+				}
+			}
+		}()
+	}
+
+	timer := time.After(20 * time.Millisecond)
+	for {
+		select {
+		case <-c:
+		case <-timer:
+			close(done)
+			return
+		}
+	}
+}
+
+func TestTimerFairness2(t *testing.T) {
+	done := make(chan bool)
+	c := make(chan bool)
+	for i := 0; i < 2; i++ {
+		go func() {
+			timer := time.After(20 * time.Millisecond)
+			var buf [1]byte
+			for {
+				syscall.Read(0, buf[0:0])
+				select {
+				case c <- true:
+				case <-c:
+				case <-timer:
+					done <- true
+					return
+				}
+			}
+		}()
+	}
+	<-done
+	<-done
+}
+
 func stackGrowthRecursive(i int) {
 	var pad [128]uint64
 	if i != 0 && pad[0] == 0 {