net: add special netFD mutex
The mutex, fdMutex, handles locking and the lifetime of sysfd,
and serializes the Read and Write methods.
This allows us to strip 2 sync.Mutex.Lock calls,
2 sync.Mutex.Unlock calls, 1 defer and some amount
of miscellaneous overhead from every network operation.
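
Callers are expected to use the mutex roughly as follows (a sketch
only: error handling is elided and the fdmu field name is
illustrative; the diff below adds just the primitive itself):

	// Read path: serialize with other readers and pin the fd open.
	if fd.fdmu.RWLock(true) {
		// ... read from fd.sysfd ...
		if fd.fdmu.RWUnlock(true) {
			// fd is closed and this was the last reference:
			// destroy fd.sysfd here.
		}
	}

	// Close path: mark the fd closed and wake blocked readers/writers.
	if fd.fdmu.IncrefAndClose() {
		if fd.fdmu.Decref() {
			// Last reference: destroy fd.sysfd here.
		}
	}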

On linux/amd64, Intel E5-2690:
benchmark                             old ns/op    new ns/op    delta
BenchmarkTCP4Persistent                    9595         9454   -1.47%
BenchmarkTCP4Persistent-2                  8978         8772   -2.29%
BenchmarkTCP4ConcurrentReadWrite           4900         4625   -5.61%
BenchmarkTCP4ConcurrentReadWrite-2         2603         2500   -3.96%

In general it strips 70-500 ns from every network operation,
depending on the processor model. On my relatively new E5-2690
it accounts for ~5% of the cost of a network op.

Fixes #6074.

R=golang-dev, bradfitz, alex.brainman, iant, mikioh.mikioh
CC=golang-dev
https://golang.org/cl/12418043
diff --git a/src/pkg/net/fd_mutex.go b/src/pkg/net/fd_mutex.go
new file mode 100644
index 0000000..1caf974
--- /dev/null
+++ b/src/pkg/net/fd_mutex.go
@@ -0,0 +1,184 @@
+// Copyright 2013 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package net
+
+import "sync/atomic"
+
+// fdMutex is a specialized synchronization primitive
+// that manages the lifetime of an fd and serializes access
+// to the Read and Write methods on a netFD.
+type fdMutex struct {
+	state uint64
+	rsema uint32
+	wsema uint32
+}
+
+// fdMutex.state is organized as follows:
+// 1 bit - whether netFD is closed; if set, all subsequent lock operations will fail.
+// 1 bit - lock for read operations.
+// 1 bit - lock for write operations.
+// 20 bits - total number of references (read+write+misc).
+// 20 bits - number of outstanding read waiters.
+// 20 bits - number of outstanding write waiters.
+const (
+	mutexClosed  = 1 << 0
+	mutexRLock   = 1 << 1
+	mutexWLock   = 1 << 2
+	mutexRef     = 1 << 3
+	mutexRefMask = (1<<20 - 1) << 3
+	mutexRWait   = 1 << 23
+	mutexRMask   = (1<<20 - 1) << 23
+	mutexWWait   = 1 << 43
+	mutexWMask   = (1<<20 - 1) << 43
+)
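+
+// For example, a state value of mutexRLock|3*mutexRef|mutexWWait describes an
+// open fd that is currently locked for reading, holds 3 references in total
+// and has one goroutine waiting for the write lock.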
+
+// Read operations must do RWLock(true)/RWUnlock(true).
+// Write operations must do RWLock(false)/RWUnlock(false).
+// Misc operations must do Incref/Decref. Misc operations include functions like
+// setsockopt and setDeadline. They need to use Incref/Decref to ensure that
+// they operate on the correct fd in the presence of a concurrent Close call
+// (otherwise the fd can be closed out from under them).
+// The Close operation must do IncrefAndClose/Decref.
+
+// RWLock/Incref return whether fd is open.
+// RWUnlock/Decref return whether fd is closed and there are no remaining references.
+
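+// Incref adds a reference to mu.
+// It reports whether mu is open; if mu is closed, no reference is added.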
+func (mu *fdMutex) Incref() bool {
+	for {
+		old := atomic.LoadUint64(&mu.state)
+		if old&mutexClosed != 0 {
+			return false
+		}
+		new := old + mutexRef
+		if new&mutexRefMask == 0 {
+			panic("net: inconsistent fdMutex")
+		}
+		if atomic.CompareAndSwapUint64(&mu.state, old, new) {
+			return true
+		}
+	}
+}
+
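+// IncrefAndClose sets the closed flag, adds a reference on behalf of Close
+// and wakes all goroutines blocked in RWLock, which will then observe the
+// closed flag. It returns false if mu is already closed.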
+func (mu *fdMutex) IncrefAndClose() bool {
+	for {
+		old := atomic.LoadUint64(&mu.state)
+		if old&mutexClosed != 0 {
+			return false
+		}
+		// Mark as closed and acquire a reference.
+		new := (old | mutexClosed) + mutexRef
+		if new&mutexRefMask == 0 {
+			panic("net: inconsistent fdMutex")
+		}
+		// Remove all read and write waiters.
+		new &^= mutexRMask | mutexWMask
+		if atomic.CompareAndSwapUint64(&mu.state, old, new) {
+			// Wake all read and write waiters;
+			// they will observe the closed flag after wakeup.
+			for old&mutexRMask != 0 {
+				old -= mutexRWait
+				runtime_Semrelease(&mu.rsema)
+			}
+			for old&mutexWMask != 0 {
+				old -= mutexWWait
+				runtime_Semrelease(&mu.wsema)
+			}
+			return true
+		}
+	}
+}
+
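+// Decref removes a reference from mu.
+// It reports whether mu is closed and there are no remaining references,
+// in which case the caller must destroy the fd.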
+func (mu *fdMutex) Decref() bool {
+	for {
+		old := atomic.LoadUint64(&mu.state)
+		if old&mutexRefMask == 0 {
+			panic("net: inconsistent fdMutex")
+		}
+		new := old - mutexRef
+		if atomic.CompareAndSwapUint64(&mu.state, old, new) {
+			return new&(mutexClosed|mutexRefMask) == mutexClosed
+		}
+	}
+}
+
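+// RWLock locks mu for reading (read == true) or writing (read == false),
+// blocking until the lock is available, and on success adds a reference
+// to mu. It reports whether mu is available for use (that is, not closed).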
+func (mu *fdMutex) RWLock(read bool) bool {
+	var mutexBit, mutexWait, mutexMask uint64
+	var mutexSema *uint32
+	if read {
+		mutexBit = mutexRLock
+		mutexWait = mutexRWait
+		mutexMask = mutexRMask
+		mutexSema = &mu.rsema
+	} else {
+		mutexBit = mutexWLock
+		mutexWait = mutexWWait
+		mutexMask = mutexWMask
+		mutexSema = &mu.wsema
+	}
+	for {
+		old := atomic.LoadUint64(&mu.state)
+		if old&mutexClosed != 0 {
+			return false
+		}
+		var new uint64
+		if old&mutexBit == 0 {
+			// Lock is free, acquire it.
+			new = (old | mutexBit) + mutexRef
+			if new&mutexRefMask == 0 {
+				panic("net: inconsistent fdMutex")
+			}
+		} else {
+			// Wait for lock.
+			new = old + mutexWait
+			if new&mutexMask == 0 {
+				panic("net: inconsistent fdMutex")
+			}
+		}
+		if atomic.CompareAndSwapUint64(&mu.state, old, new) {
+			if old&mutexBit == 0 {
+				return true
+			}
+			runtime_Semacquire(mutexSema)
+			// The signaller has subtracted mutexWait.
+		}
+	}
+}
+
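+// RWUnlock releases the lock acquired by RWLock(read), drops the
+// corresponding reference and wakes one blocked waiter for that lock,
+// if any. It reports whether mu is closed and there are no remaining
+// references, in which case the caller must destroy the fd.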
+func (mu *fdMutex) RWUnlock(read bool) bool {
+	var mutexBit, mutexWait, mutexMask uint64
+	var mutexSema *uint32
+	if read {
+		mutexBit = mutexRLock
+		mutexWait = mutexRWait
+		mutexMask = mutexRMask
+		mutexSema = &mu.rsema
+	} else {
+		mutexBit = mutexWLock
+		mutexWait = mutexWWait
+		mutexMask = mutexWMask
+		mutexSema = &mu.wsema
+	}
+	for {
+		old := atomic.LoadUint64(&mu.state)
+		if old&mutexBit == 0 || old&mutexRefMask == 0 {
+			panic("net: inconsistent fdMutex")
+		}
+		// Drop lock, drop reference and wake a waiter if present.
+		new := (old &^ mutexBit) - mutexRef
+		if old&mutexMask != 0 {
+			new -= mutexWait
+		}
+		if atomic.CompareAndSwapUint64(&mu.state, old, new) {
+			if old&mutexMask != 0 {
+				runtime_Semrelease(mutexSema)
+			}
+			return new&(mutexClosed|mutexRefMask) == mutexClosed
+		}
+	}
+}
+
+// Implemented in runtime package.
+func runtime_Semacquire(sema *uint32)
+func runtime_Semrelease(sema *uint32)