runtime: unify mutex code across OSes
The change introduces 2 generic mutex implementations
(futex- and semaphore-based). Each OS chooses a suitable mutex
implementation and implements a few callbacks (e.g. futex wait/wake).
The CL reduces code duplication, extends some optimizations available
only on Linux/Windows to other OSes and provides groundwork
for further optimizations. Chan finalizers are finally eliminated.

(Linux/amd64, 8 HT cores)
benchmark                      old      new
BenchmarkChanContended         83.6     77.8 ns/op
BenchmarkChanContended-2       341      328 ns/op
BenchmarkChanContended-4       382      383 ns/op
BenchmarkChanContended-8       390      374 ns/op
BenchmarkChanContended-16      313      291 ns/op

(Darwin/amd64, 2 cores)
benchmark                      old      new
BenchmarkChanContended         159      172 ns/op
BenchmarkChanContended-2       6735     263 ns/op
BenchmarkChanContended-4       10384    255 ns/op
BenchmarkChanCreation          1174     407 ns/op
BenchmarkChanCreation-2        4007     254 ns/op
BenchmarkChanCreation-4        4029     246 ns/op

R=rsc, jsing, hectorchu
CC=golang-dev
https://golang.org/cl/5140043
diff --git a/src/pkg/runtime/runtime.h b/src/pkg/runtime/runtime.h
index e45808f..685725a 100644
--- a/src/pkg/runtime/runtime.h
+++ b/src/pkg/runtime/runtime.h
@@ -47,14 +47,13 @@
 typedef	struct	Func		Func;
 typedef	struct	G		G;
 typedef	struct	Gobuf		Gobuf;
-typedef	struct	Lock		Lock;
+typedef	union	Lock		Lock;
 typedef	struct	M		M;
 typedef	struct	Mem		Mem;
 typedef	union	Note		Note;
 typedef	struct	Slice		Slice;
 typedef	struct	Stktop		Stktop;
 typedef	struct	String		String;
-typedef	struct	Usema		Usema;
 typedef	struct	SigTab		SigTab;
 typedef	struct	MCache		MCache;
 typedef	struct	FixAlloc	FixAlloc;
@@ -117,32 +116,15 @@
 /*
  * structures
  */
-struct	Lock
+union	Lock
 {
-#ifdef __WINDOWS__
-	M*	waitm;	// linked list of waiting M's
-#else
-	uint32	key;
-	uint32	sema;	// for OS X
-#endif
-};
-struct	Usema
-{
-	uint32	u;
-	uint32	k;
+	uint32	key;	// futex-based impl
+	M*	waitm;	// linked list of waiting M's (sema-based impl)
 };
 union	Note
 {
-	struct {	// Linux
-		uint32	state;
-	};
-	struct {	// Windows
-		Lock lock;
-	};
-	struct {	// OS X
-		int32	wakeup;
-		Usema	sema;
-	};
+	uint32	key;	// futex-based impl
+	M*	waitm;	// waiting M (sema-based impl)
 };
 struct String
 {
@@ -253,11 +235,13 @@
 	uint32	freglo[16];	// D[i] lsb and F[i]
 	uint32	freghi[16];	// D[i] msb and F[i+16]
 	uint32	fflag;		// floating point compare flags
-
+	M*	nextwaitm;	// next M waiting for lock
+	uintptr	waitsema;	// semaphore for parking on locks
+	uint32	waitsemacount;
+	uint32	waitsemalock;
+	
 #ifdef __WINDOWS__
 	void*	thread;		// thread handle
-	void*	event;		// event for signalling
-	M*	nextwaitm;	// next M waiting for lock
 #endif
 	uintptr	end[];
 };
@@ -409,7 +393,6 @@
 int8*	runtime·goos;
 int32	runtime·ncpu;
 extern	bool	runtime·iscgo;
-extern	void	(*runtime·destroylock)(Lock*);
 
 /*
  * common functions and data