syncmap: remove blocking for all operations on existing keys

In the previous version, Loads of existing keys (even keys whose values
were unchanged) would block on the Mutex whenever the map was dirty,
leading to potentially long stalls while the read-only map was being
copied to the dirty map.

This change adds a double pointer indirection to each entry and an
extra atomic load on the fast path. In exchange, it removes the need
to lock the map for nearly all operations on existing keys.

Each entry is represented by an atomically-updated pointer to an
interface{} value. The same entries are stored in both the read-only
and dirty maps. Keys deleted before the dirty map was last copied are
marked as "expunged" and omitted from the dirty map until the next
Store to that key.
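
To make those states concrete, here is a self-contained sketch
(illustrative only, not the patched code; the entry and expunged names
mirror the patch below) of how such an entry behaves:

	package main

	import (
		"fmt"
		"sync/atomic"
		"unsafe"
	)

	// expunged marks entries that were deleted while a dirty map existed.
	var expunged = unsafe.Pointer(new(interface{}))

	// entry holds an atomically-updated pointer to an interface{} value.
	type entry struct {
		p unsafe.Pointer // nil, expunged, or *interface{}
	}

	func (e *entry) load() (interface{}, bool) {
		p := atomic.LoadPointer(&e.p)
		if p == nil || p == expunged {
			return nil, false // deleted either way
		}
		return *(*interface{})(p), true
	}

	func main() {
		v := interface{}("hello")
		e := &entry{p: unsafe.Pointer(&v)}
		fmt.Println(e.load()) // hello true: live entry

		// Delete: atomically replace the value pointer with nil.
		atomic.StorePointer(&e.p, nil)
		fmt.Println(e.load()) // <nil> false

		// When the read-only map is next copied to a fresh dirty map, nil
		// entries are flipped to expunged and omitted from the dirty map.
		atomic.CompareAndSwapPointer(&e.p, nil, expunged)
		fmt.Println(e.load()) // <nil> false until the next Store to the key
	}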

Newly-stored values exist only in the dirty map. The existence of new
keys in the dirty map is indicated by an "amended" bit attached to the
read-only map, allowing Load or Delete calls for nonexistent keys to
sometimes return immediately (without acquiring the Mutex).
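
As a toy model of that fast path (plain maps standing in for the real
types; nothing here is the patched code), a miss only needs the lock when
the amended bit is set:

	package main

	import "fmt"

	// readOnly is a simplified stand-in for the struct in the patch below.
	type readOnly struct {
		m       map[string]int
		amended bool // true if the dirty map holds keys not in m
	}

	// load mimics the fast/slow path split; in the real Map the dirty
	// lookup runs with the Mutex held.
	func load(read readOnly, dirty map[string]int, key string) (int, bool) {
		if v, ok := read.m[key]; ok {
			return v, true // hit in the read-only map: no lock
		}
		if !read.amended {
			return 0, false // key cannot exist anywhere: no lock
		}
		v, ok := dirty[key] // would require the Mutex in the real Map
		return v, ok
	}

	func main() {
		read := readOnly{m: map[string]int{"a": 1}, amended: true}
		dirty := map[string]int{"a": 1, "b": 2} // "b" was newly stored

		fmt.Println(load(read, dirty, "b"))   // 2 true: found via dirty
		fmt.Println(load(read, dirty, "zzz")) // 0 false: had to check dirty

		promoted := readOnly{m: dirty, amended: false}
		fmt.Println(load(promoted, nil, "zzz")) // 0 false: immediate
	}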

This change trades a bit of steady-state throughput in the "mostly hits"
case for less contention (and lower Load tail latencies) in mixed
Load/Store/Delete usage.

Unfortunately, the latency impact is currently difficult to measure
(#19128).

Updates golang/go#19128
Updates golang/go#18177

https://perf.golang.org/search?q=upload:20170315.5

name                                               old time/op    new time/op    delta
LoadMostlyHits/*syncmap_test.DeepCopyMap             70.3ns ± 6%    70.2ns ± 4%      ~     (p=0.886 n=4+4)
LoadMostlyHits/*syncmap_test.DeepCopyMap-48          10.9ns ± 4%    13.6ns ±24%      ~     (p=0.314 n=4+4)
LoadMostlyHits/*syncmap_test.RWMutexMap              86.0ns ± 9%    86.0ns ± 4%      ~     (p=1.000 n=4+4)
LoadMostlyHits/*syncmap_test.RWMutexMap-48            140ns ± 3%     139ns ± 2%      ~     (p=0.686 n=4+4)
LoadMostlyHits/*syncmap.Map                          70.6ns ± 8%    70.3ns ± 1%      ~     (p=0.571 n=4+4)
LoadMostlyHits/*syncmap.Map-48                       10.3ns ± 7%    11.2ns ± 5%      ~     (p=0.114 n=4+4)
LoadMostlyMisses/*syncmap_test.DeepCopyMap           59.4ns ± 4%    59.4ns ± 4%      ~     (p=1.000 n=4+4)
LoadMostlyMisses/*syncmap_test.DeepCopyMap-48        10.4ns ±30%    12.6ns ±29%      ~     (p=0.486 n=4+4)
LoadMostlyMisses/*syncmap_test.RWMutexMap            64.8ns ± 6%    63.8ns ± 4%      ~     (p=0.686 n=4+4)
LoadMostlyMisses/*syncmap_test.RWMutexMap-48          138ns ± 3%     139ns ± 2%      ~     (p=0.800 n=4+4)
LoadMostlyMisses/*syncmap.Map                        51.5ns ± 7%    50.4ns ± 2%      ~     (p=0.371 n=4+4)
LoadMostlyMisses/*syncmap.Map-48                     9.37ns ± 3%    9.40ns ± 3%      ~     (p=0.886 n=4+4)
LoadOrStoreBalanced/*syncmap_test.RWMutexMap          820ns ±17%     834ns ±14%      ~     (p=1.000 n=4+4)
LoadOrStoreBalanced/*syncmap_test.RWMutexMap-48      1.38µs ± 4%    1.38µs ± 3%      ~     (p=0.886 n=4+4)
LoadOrStoreBalanced/*syncmap.Map                      709ns ±13%    1085ns ± 9%   +53.09%  (p=0.029 n=4+4)
LoadOrStoreBalanced/*syncmap.Map-48                  1.30µs ±13%    1.08µs ± 8%   -17.40%  (p=0.029 n=4+4)
LoadOrStoreUnique/*syncmap_test.RWMutexMap           1.26µs ±14%    1.16µs ±29%      ~     (p=0.343 n=4+4)
LoadOrStoreUnique/*syncmap_test.RWMutexMap-48        1.82µs ±15%    1.98µs ± 6%      ~     (p=0.143 n=4+4)
LoadOrStoreUnique/*syncmap.Map                       1.06µs ± 7%    1.86µs ±11%   +76.09%  (p=0.029 n=4+4)
LoadOrStoreUnique/*syncmap.Map-48                    2.00µs ± 4%    1.62µs ± 4%   -19.20%  (p=0.029 n=4+4)
LoadOrStoreCollision/*syncmap_test.DeepCopyMap       32.9ns ± 8%    32.6ns ± 9%      ~     (p=0.686 n=4+4)
LoadOrStoreCollision/*syncmap_test.DeepCopyMap-48   2.26ns ±136%    3.41ns ±62%      ~     (p=0.114 n=4+4)
LoadOrStoreCollision/*syncmap_test.RWMutexMap        58.0ns ±20%    54.6ns ±17%      ~     (p=0.343 n=4+4)
LoadOrStoreCollision/*syncmap_test.RWMutexMap-48      458ns ± 2%     445ns ± 6%      ~     (p=0.229 n=4+4)
LoadOrStoreCollision/*syncmap.Map                    35.8ns ± 5%    39.6ns ± 9%      ~     (p=0.114 n=4+4)
LoadOrStoreCollision/*syncmap.Map-48                 1.24ns ± 2%    1.58ns ± 4%   +27.16%  (p=0.029 n=4+4)
Range/*syncmap_test.DeepCopyMap                      19.7µs ± 4%    19.8µs ± 3%      ~     (p=0.686 n=4+4)
Range/*syncmap_test.DeepCopyMap-48                    763ns ± 1%     864ns ± 3%   +13.24%  (p=0.029 n=4+4)
Range/*syncmap_test.RWMutexMap                       20.9µs ± 3%    20.4µs ± 4%      ~     (p=0.343 n=4+4)
Range/*syncmap_test.RWMutexMap-48                     764ns ± 1%     870ns ± 1%   +13.77%  (p=0.029 n=4+4)
Range/*syncmap.Map                                   20.4µs ± 5%    22.7µs ± 5%   +10.98%  (p=0.029 n=4+4)
Range/*syncmap.Map-48                                 776ns ± 3%     954ns ± 4%   +22.95%  (p=0.029 n=4+4)
AdversarialAlloc/*syncmap_test.DeepCopyMap            206ns ± 5%     199ns ± 2%      ~     (p=0.200 n=4+4)
AdversarialAlloc/*syncmap_test.DeepCopyMap-48        8.94µs ± 5%    9.21µs ± 4%      ~     (p=0.200 n=4+4)
AdversarialAlloc/*syncmap_test.RWMutexMap            63.4ns ± 4%    63.8ns ± 3%      ~     (p=0.686 n=4+4)
AdversarialAlloc/*syncmap_test.RWMutexMap-48          184ns ±10%     198ns ±11%      ~     (p=0.343 n=4+4)
AdversarialAlloc/*syncmap.Map                         213ns ± 3%     264ns ± 3%   +23.97%  (p=0.029 n=4+4)
AdversarialAlloc/*syncmap.Map-48                      556ns ± 5%    1389ns ± 8%  +149.93%  (p=0.029 n=4+4)
AdversarialDelete/*syncmap_test.DeepCopyMap           300ns ± 6%     304ns ± 7%      ~     (p=0.886 n=4+4)
AdversarialDelete/*syncmap_test.DeepCopyMap-48        647ns ± 3%     646ns ± 3%      ~     (p=0.943 n=4+4)
AdversarialDelete/*syncmap_test.RWMutexMap           69.1ns ± 1%    69.2ns ± 6%      ~     (p=0.686 n=4+4)
AdversarialDelete/*syncmap_test.RWMutexMap-48         289ns ±15%     300ns ±17%      ~     (p=0.829 n=4+4)
AdversarialDelete/*syncmap.Map                        198ns ± 5%     264ns ± 2%   +33.17%  (p=0.029 n=4+4)
AdversarialDelete/*syncmap.Map-48                     291ns ± 9%     173ns ± 8%   -40.50%  (p=0.029 n=4+4)

name                                               old alloc/op   new alloc/op   delta
LoadMostlyHits/*syncmap_test.DeepCopyMap              7.00B ± 0%     7.00B ± 0%      ~     (all equal)
LoadMostlyHits/*syncmap_test.DeepCopyMap-48           7.00B ± 0%     7.00B ± 0%      ~     (all equal)
LoadMostlyHits/*syncmap_test.RWMutexMap               7.00B ± 0%     7.00B ± 0%      ~     (all equal)
LoadMostlyHits/*syncmap_test.RWMutexMap-48            7.00B ± 0%     7.00B ± 0%      ~     (all equal)
LoadMostlyHits/*syncmap.Map                           7.00B ± 0%     7.00B ± 0%      ~     (all equal)
LoadMostlyHits/*syncmap.Map-48                        7.00B ± 0%     7.00B ± 0%      ~     (all equal)
LoadMostlyMisses/*syncmap_test.DeepCopyMap            7.00B ± 0%     7.00B ± 0%      ~     (all equal)
LoadMostlyMisses/*syncmap_test.DeepCopyMap-48         7.00B ± 0%     7.00B ± 0%      ~     (all equal)
LoadMostlyMisses/*syncmap_test.RWMutexMap             7.00B ± 0%     7.00B ± 0%      ~     (all equal)
LoadMostlyMisses/*syncmap_test.RWMutexMap-48          7.00B ± 0%     7.00B ± 0%      ~     (all equal)
LoadMostlyMisses/*syncmap.Map                         7.00B ± 0%     7.00B ± 0%      ~     (all equal)
LoadMostlyMisses/*syncmap.Map-48                      7.00B ± 0%     7.00B ± 0%      ~     (all equal)
LoadOrStoreBalanced/*syncmap_test.RWMutexMap          95.0B ± 0%     95.0B ± 0%      ~     (all equal)
LoadOrStoreBalanced/*syncmap_test.RWMutexMap-48       95.0B ± 0%     95.0B ± 0%      ~     (all equal)
LoadOrStoreBalanced/*syncmap.Map                      95.0B ± 0%     88.0B ± 0%    -7.37%  (p=0.029 n=4+4)
LoadOrStoreBalanced/*syncmap.Map-48                   95.0B ± 0%     88.0B ± 0%    -7.37%  (p=0.029 n=4+4)
LoadOrStoreUnique/*syncmap_test.RWMutexMap             175B ± 0%      175B ± 0%      ~     (all equal)
LoadOrStoreUnique/*syncmap_test.RWMutexMap-48          175B ± 0%      175B ± 0%      ~     (all equal)
LoadOrStoreUnique/*syncmap.Map                         175B ± 0%      161B ± 0%    -8.00%  (p=0.029 n=4+4)
LoadOrStoreUnique/*syncmap.Map-48                      175B ± 0%      161B ± 0%    -8.00%  (p=0.029 n=4+4)
LoadOrStoreCollision/*syncmap_test.DeepCopyMap        0.00B          0.00B           ~     (all equal)
LoadOrStoreCollision/*syncmap_test.DeepCopyMap-48     0.00B          0.00B           ~     (all equal)
LoadOrStoreCollision/*syncmap_test.RWMutexMap         0.00B          0.00B           ~     (all equal)
LoadOrStoreCollision/*syncmap_test.RWMutexMap-48      0.00B          0.00B           ~     (all equal)
LoadOrStoreCollision/*syncmap.Map                     0.00B          0.00B           ~     (all equal)
LoadOrStoreCollision/*syncmap.Map-48                  0.00B          0.00B           ~     (all equal)
Range/*syncmap_test.DeepCopyMap                       0.00B          0.00B           ~     (all equal)
Range/*syncmap_test.DeepCopyMap-48                    0.00B          0.00B           ~     (all equal)
Range/*syncmap_test.RWMutexMap                        0.00B          0.00B           ~     (all equal)
Range/*syncmap_test.RWMutexMap-48                     0.00B          0.00B           ~     (all equal)
Range/*syncmap.Map                                    0.00B          0.00B           ~     (all equal)
Range/*syncmap.Map-48                                 0.00B          0.00B           ~     (all equal)
AdversarialAlloc/*syncmap_test.DeepCopyMap            74.0B ± 0%     74.0B ± 0%      ~     (all equal)
AdversarialAlloc/*syncmap_test.DeepCopyMap-48        3.15kB ± 0%    3.15kB ± 0%      ~     (p=1.000 n=4+4)
AdversarialAlloc/*syncmap_test.RWMutexMap             8.00B ± 0%     8.00B ± 0%      ~     (all equal)
AdversarialAlloc/*syncmap_test.RWMutexMap-48          8.00B ± 0%     8.00B ± 0%      ~     (all equal)
AdversarialAlloc/*syncmap.Map                         74.0B ± 0%     55.0B ± 0%   -25.68%  (p=0.029 n=4+4)
AdversarialAlloc/*syncmap.Map-48                      8.00B ± 0%    56.25B ± 1%  +603.12%  (p=0.029 n=4+4)
AdversarialDelete/*syncmap_test.DeepCopyMap            155B ± 0%      155B ± 0%      ~     (all equal)
AdversarialDelete/*syncmap_test.DeepCopyMap-48         156B ± 0%      156B ± 0%      ~     (p=1.000 n=4+4)
AdversarialDelete/*syncmap_test.RWMutexMap            8.00B ± 0%     8.00B ± 0%      ~     (all equal)
AdversarialDelete/*syncmap_test.RWMutexMap-48         8.00B ± 0%     8.00B ± 0%      ~     (all equal)
AdversarialDelete/*syncmap.Map                        81.0B ± 0%     65.0B ± 0%   -19.75%  (p=0.029 n=4+4)
AdversarialDelete/*syncmap.Map-48                     23.0B ± 9%     15.2B ± 5%   -33.70%  (p=0.029 n=4+4)

name                                               old allocs/op  new allocs/op  delta
LoadMostlyHits/*syncmap_test.DeepCopyMap               0.00           0.00           ~     (all equal)
LoadMostlyHits/*syncmap_test.DeepCopyMap-48            0.00           0.00           ~     (all equal)
LoadMostlyHits/*syncmap_test.RWMutexMap                0.00           0.00           ~     (all equal)
LoadMostlyHits/*syncmap_test.RWMutexMap-48             0.00           0.00           ~     (all equal)
LoadMostlyHits/*syncmap.Map                            0.00           0.00           ~     (all equal)
LoadMostlyHits/*syncmap.Map-48                         0.00           0.00           ~     (all equal)
LoadMostlyMisses/*syncmap_test.DeepCopyMap             0.00           0.00           ~     (all equal)
LoadMostlyMisses/*syncmap_test.DeepCopyMap-48          0.00           0.00           ~     (all equal)
LoadMostlyMisses/*syncmap_test.RWMutexMap              0.00           0.00           ~     (all equal)
LoadMostlyMisses/*syncmap_test.RWMutexMap-48           0.00           0.00           ~     (all equal)
LoadMostlyMisses/*syncmap.Map                          0.00           0.00           ~     (all equal)
LoadMostlyMisses/*syncmap.Map-48                       0.00           0.00           ~     (all equal)
LoadOrStoreBalanced/*syncmap_test.RWMutexMap           2.00 ± 0%      2.00 ± 0%      ~     (all equal)
LoadOrStoreBalanced/*syncmap_test.RWMutexMap-48        2.00 ± 0%      2.00 ± 0%      ~     (all equal)
LoadOrStoreBalanced/*syncmap.Map                       2.00 ± 0%      3.00 ± 0%   +50.00%  (p=0.029 n=4+4)
LoadOrStoreBalanced/*syncmap.Map-48                    2.00 ± 0%      3.00 ± 0%   +50.00%  (p=0.029 n=4+4)
LoadOrStoreUnique/*syncmap_test.RWMutexMap             2.00 ± 0%      2.00 ± 0%      ~     (all equal)
LoadOrStoreUnique/*syncmap_test.RWMutexMap-48          2.00 ± 0%      2.00 ± 0%      ~     (all equal)
LoadOrStoreUnique/*syncmap.Map                         2.00 ± 0%      4.00 ± 0%  +100.00%  (p=0.029 n=4+4)
LoadOrStoreUnique/*syncmap.Map-48                      2.00 ± 0%      4.00 ± 0%  +100.00%  (p=0.029 n=4+4)
LoadOrStoreCollision/*syncmap_test.DeepCopyMap         0.00           0.00           ~     (all equal)
LoadOrStoreCollision/*syncmap_test.DeepCopyMap-48      0.00           0.00           ~     (all equal)
LoadOrStoreCollision/*syncmap_test.RWMutexMap          0.00           0.00           ~     (all equal)
LoadOrStoreCollision/*syncmap_test.RWMutexMap-48       0.00           0.00           ~     (all equal)
LoadOrStoreCollision/*syncmap.Map                      0.00           0.00           ~     (all equal)
LoadOrStoreCollision/*syncmap.Map-48                   0.00           0.00           ~     (all equal)
Range/*syncmap_test.DeepCopyMap                        0.00           0.00           ~     (all equal)
Range/*syncmap_test.DeepCopyMap-48                     0.00           0.00           ~     (all equal)
Range/*syncmap_test.RWMutexMap                         0.00           0.00           ~     (all equal)
Range/*syncmap_test.RWMutexMap-48                      0.00           0.00           ~     (all equal)
Range/*syncmap.Map                                     0.00           0.00           ~     (all equal)
Range/*syncmap.Map-48                                  0.00           0.00           ~     (all equal)
AdversarialAlloc/*syncmap_test.DeepCopyMap             1.00 ± 0%      1.00 ± 0%      ~     (all equal)
AdversarialAlloc/*syncmap_test.DeepCopyMap-48          1.00 ± 0%      1.00 ± 0%      ~     (all equal)
AdversarialAlloc/*syncmap_test.RWMutexMap              1.00 ± 0%      1.00 ± 0%      ~     (all equal)
AdversarialAlloc/*syncmap_test.RWMutexMap-48           1.00 ± 0%      1.00 ± 0%      ~     (all equal)
AdversarialAlloc/*syncmap.Map                          1.00 ± 0%      1.00 ± 0%      ~     (all equal)
AdversarialAlloc/*syncmap.Map-48                       1.00 ± 0%      1.00 ± 0%      ~     (all equal)
AdversarialDelete/*syncmap_test.DeepCopyMap            1.00 ± 0%      1.00 ± 0%      ~     (all equal)
AdversarialDelete/*syncmap_test.DeepCopyMap-48         1.00 ± 0%      1.00 ± 0%      ~     (all equal)
AdversarialDelete/*syncmap_test.RWMutexMap             1.00 ± 0%      1.00 ± 0%      ~     (all equal)
AdversarialDelete/*syncmap_test.RWMutexMap-48          1.00 ± 0%      1.00 ± 0%      ~     (all equal)
AdversarialDelete/*syncmap.Map                         1.00 ± 0%      1.00 ± 0%      ~     (all equal)
AdversarialDelete/*syncmap.Map-48                      1.00 ± 0%      1.00 ± 0%      ~     (all equal)

Change-Id: I93c9458505d84238cc8e9016a7dfd285868af236
Reviewed-on: https://go-review.googlesource.com/37342
Reviewed-by: Russ Cox <rsc@golang.org>
diff --git a/syncmap/map.go b/syncmap/map.go
index 1efbcb2..80e1584 100644
--- a/syncmap/map.go
+++ b/syncmap/map.go
@@ -11,9 +11,10 @@
 import (
 	"sync"
 	"sync/atomic"
+	"unsafe"
 )
 
-// Map is a concurrent map with amortized-constant-time operations.
+// Map is a concurrent map with amortized-constant-time loads, stores, and deletes.
 // It is safe for multiple goroutines to call a Map's methods concurrently.
 //
 // The zero Map is valid and empty.
@@ -22,145 +23,313 @@
 type Map struct {
 	mu sync.Mutex
 
-	// clean is a copy of the map's contents that will never be overwritten, and
-	// is thus safe for concurrent lookups without additional synchronization.
+	// read contains the portion of the map's contents that are safe for
+	// concurrent access (with or without mu held).
 	//
-	// A nil clean map indicates that the current map contents are stored in the
-	// dirty field instead.
-	// If clean is non-nil, its contents are up-to-date.
+	// The read field itself is always safe to load, but must only be stored with
+	// mu held.
 	//
-	// clean is always safe to load, but must only be stored with mu held.
-	clean atomic.Value // map[interface{}]interface{}
+	// Entries stored in read may be updated concurrently without mu, but updating
+	// a previously-expunged entry requires that the entry be copied to the dirty
+	// map and unexpunged with mu held.
+	read atomic.Value // readOnly
 
-	// dirty is a copy of the map to which all writes occur.
+	// dirty contains the portion of the map's contents that require mu to be
+	// held. To ensure that the dirty map can be promoted to the read map quickly,
+	// it also includes all of the non-expunged entries in the read map.
 	//
-	// A nil dirty map indicates that the current map contents are either empty or
-	// stored in the clean field.
+	// Expunged entries are not stored in the dirty map. An expunged entry in the
+	// clean map must be unexpunged and added to the dirty map before a new value
+	// can be stored to it.
 	//
 	// If the dirty map is nil, the next write to the map will initialize it by
-	// making a deep copy of the clean map, then setting the clean map to nil.
-	dirty map[interface{}]interface{}
+	// making a shallow copy of the clean map, omitting stale entries.
+	dirty map[interface{}]*entry
 
-	// misses counts the number of Load calls for which the clean map was nil
-	// since the last write.
+	// misses counts the number of loads since the read map was last updated that
+	// needed to lock mu to determine whether the key was present.
 	//
-	// Once enough Load misses have occurred to cover the cost of a copy, the
-	// dirty map will be promoted to clean and any subsequent writes will make
-	// a new copy.
+	// Once enough misses have occurred to cover the cost of copying the dirty
+	// map, the dirty map will be promoted to the read map (in the unamended
+	// state) and the next store to the map will make a new dirty copy.
 	misses int
 }
 
+// readOnly is an immutable struct stored atomically in the Map.read field.
+type readOnly struct {
+	m       map[interface{}]*entry
+	amended bool // true if the dirty map contains some key not in m.
+}
+
+// expunged is an arbitrary pointer that marks entries which have been deleted
+// from the dirty map.
+var expunged = unsafe.Pointer(new(interface{}))
+
+// An entry is a slot in the map corresponding to a particular key.
+type entry struct {
+	// p points to the interface{} value stored for the entry.
+	//
+	// If p == nil, the entry has been deleted and m.dirty == nil.
+	//
+	// If p == expunged, the entry has been deleted, m.dirty != nil, and the entry
+	// is missing from m.dirty.
+	//
+	// Otherwise, the entry is valid and recorded in m.read.m[key] and, if m.dirty
+	// != nil, in m.dirty[key].
+	//
+	// An entry can be deleted by atomic replacement with nil: when m.dirty is
+	// next created, it will atomically replace nil with expunged and leave
+	// m.dirty[key] unset.
+	//
+	// An entry's associated value can be updated by atomic replacement, provided
+	// p != expunged. If p == expunged, an entry's associated value can be updated
+	// only after first setting m.dirty[key] = e so that lookups using the dirty
+	// map find the entry.
+	p unsafe.Pointer // *interface{}
+}
+
+func newEntry(i interface{}) *entry {
+	return &entry{p: unsafe.Pointer(&i)}
+}
+
 // Load returns the value stored in the map for a key, or nil if no
 // value is present.
 // The ok result indicates whether value was found in the map.
 func (m *Map) Load(key interface{}) (value interface{}, ok bool) {
-	clean, _ := m.clean.Load().(map[interface{}]interface{})
-	if clean != nil {
-		value, ok = clean[key]
-		return value, ok
-	}
-
-	m.mu.Lock()
-	if m.dirty == nil {
-		clean, _ := m.clean.Load().(map[interface{}]interface{})
-		if clean == nil {
-			// Completely empty — promote to clean immediately.
-			m.clean.Store(map[interface{}]interface{}{})
-		} else {
-			value, ok = clean[key]
+	read, _ := m.read.Load().(readOnly)
+	e, ok := read.m[key]
+	if !ok && read.amended {
+		m.mu.Lock()
+		// Avoid reporting a spurious miss if m.dirty got promoted while we were
+		// blocked on m.mu. (If further loads of the same key will not miss, it's
+		// not worth copying the dirty map for this key.)
+		read, _ = m.read.Load().(readOnly)
+		e, ok = read.m[key]
+		if !ok && read.amended {
+			e, ok = m.dirty[key]
+			// Regardless of whether the entry was present, record a miss: this key
+			// will take the slow path until the dirty map is promoted to the read
+			// map.
+			m.missLocked()
 		}
 		m.mu.Unlock()
-		return value, ok
 	}
-	value, ok = m.dirty[key]
-	m.missLocked()
-	m.mu.Unlock()
-	return value, ok
+	if !ok {
+		return nil, false
+	}
+	return e.load()
+}
+
+func (e *entry) load() (value interface{}, ok bool) {
+	p := atomic.LoadPointer(&e.p)
+	if p == nil || p == expunged {
+		return nil, false
+	}
+	return *(*interface{})(p), true
 }
 
 // Store sets the value for a key.
 func (m *Map) Store(key, value interface{}) {
+	read, _ := m.read.Load().(readOnly)
+	if e, ok := read.m[key]; ok && e.tryStore(&value) {
+		return
+	}
+
 	m.mu.Lock()
-	m.dirtyLocked()
-	m.dirty[key] = value
+	read, _ = m.read.Load().(readOnly)
+	if e, ok := read.m[key]; ok {
+		if e.unexpungeLocked() {
+			// The entry was previously expunged, which implies that there is a
+			// non-nil dirty map and this entry is not in it.
+			m.dirty[key] = e
+		}
+		e.storeLocked(&value)
+	} else if e, ok := m.dirty[key]; ok {
+		e.storeLocked(&value)
+	} else {
+		if !read.amended {
+			// We're adding the first new key to the dirty map.
+			// Make sure it is allocated and mark the read-only map as incomplete.
+			m.dirtyLocked()
+			m.read.Store(readOnly{m: read.m, amended: true})
+		}
+		m.dirty[key] = newEntry(value)
+	}
 	m.mu.Unlock()
 }
 
+// tryStore stores a value if the entry has not been expunged.
+//
+// If the entry is expunged, tryStore returns false and leaves the entry
+// unchanged.
+func (e *entry) tryStore(i *interface{}) bool {
+	p := atomic.LoadPointer(&e.p)
+	if p == expunged {
+		return false
+	}
+	for {
+		if atomic.CompareAndSwapPointer(&e.p, p, unsafe.Pointer(i)) {
+			return true
+		}
+		p = atomic.LoadPointer(&e.p)
+		if p == expunged {
+			return false
+		}
+	}
+}
+
+// unexpungeLocked ensures that the entry is not marked as expunged.
+//
+// If the entry was previously expunged, it must be added to the dirty map
+// before m.mu is unlocked.
+func (e *entry) unexpungeLocked() (wasExpunged bool) {
+	return atomic.CompareAndSwapPointer(&e.p, expunged, nil)
+}
+
+// storeLocked unconditionally stores a value to the entry.
+//
+// The entry must be known not to be expunged.
+func (e *entry) storeLocked(i *interface{}) {
+	atomic.StorePointer(&e.p, unsafe.Pointer(i))
+}
+
 // LoadOrStore returns the existing value for the key if present.
 // Otherwise, it stores and returns the given value.
 // The loaded result is true if the value was loaded, false if stored.
 func (m *Map) LoadOrStore(key, value interface{}) (actual interface{}, loaded bool) {
 	// Avoid locking if it's a clean hit.
-	clean, _ := m.clean.Load().(map[interface{}]interface{})
-	actual, loaded = clean[key]
-	if loaded {
-		return actual, true
+	read, _ := m.read.Load().(readOnly)
+	if e, ok := read.m[key]; ok {
+		actual, loaded, ok := e.tryLoadOrStore(value)
+		if ok {
+			return actual, loaded
+		}
 	}
 
 	m.mu.Lock()
-	if m.dirty == nil {
-		// Reload clean in case it changed while we were waiting on m.mu.
-		clean, _ := m.clean.Load().(map[interface{}]interface{})
-		actual, loaded = clean[key]
-		if loaded {
-			m.mu.Unlock()
-			return actual, true
+	read, _ = m.read.Load().(readOnly)
+	if e, ok := read.m[key]; ok {
+		if e.unexpungeLocked() {
+			m.dirty[key] = e
 		}
+		actual, loaded, _ = e.tryLoadOrStore(value)
+	} else if e, ok := m.dirty[key]; ok {
+		actual, loaded, _ = e.tryLoadOrStore(value)
+		m.missLocked()
 	} else {
-		actual, loaded = m.dirty[key]
-		if loaded {
-			m.missLocked()
-			m.mu.Unlock()
-			return actual, true
+		if !read.amended {
+			// We're adding the first new key to the dirty map.
+			// Make sure it is allocated and mark the read-only map as incomplete.
+			m.dirtyLocked()
+			m.read.Store(readOnly{m: read.m, amended: true})
 		}
+		m.dirty[key] = newEntry(value)
+		actual, loaded = value, false
+	}
+	m.mu.Unlock()
+
+	return actual, loaded
+}
+
+// tryLoadOrStore atomically loads or stores a value if the entry is not
+// expunged.
+//
+// If the entry is expunged, tryLoadOrStore leaves the entry unchanged and
+// returns with ok==false.
+func (e *entry) tryLoadOrStore(i interface{}) (actual interface{}, loaded, ok bool) {
+	p := atomic.LoadPointer(&e.p)
+	if p == expunged {
+		return nil, false, false
+	}
+	if p != nil {
+		return *(*interface{})(p), true, true
 	}
 
-	m.dirtyLocked()
-	m.dirty[key] = value
-	actual = value
-	m.mu.Unlock()
-	return actual, false
+	// Copy the interface after the first load to make this method more amenable
+	// to escape analysis: if we hit the "load" path or the entry is expunged, we
+	// shouldn't bother heap-allocating.
+	ic := i
+	for {
+		if atomic.CompareAndSwapPointer(&e.p, nil, unsafe.Pointer(&ic)) {
+			return i, false, true
+		}
+		p = atomic.LoadPointer(&e.p)
+		if p == expunged {
+			return nil, false, false
+		}
+		if p != nil {
+			return *(*interface{})(p), true, true
+		}
+	}
 }
 
 // Delete deletes the value for a key.
 func (m *Map) Delete(key interface{}) {
-	m.mu.Lock()
-	m.dirtyLocked()
-	delete(m.dirty, key)
-	m.mu.Unlock()
+	read, _ := m.read.Load().(readOnly)
+	e, ok := read.m[key]
+	if !ok && read.amended {
+		m.mu.Lock()
+		read, _ = m.read.Load().(readOnly)
+		e, ok = read.m[key]
+		if !ok && read.amended {
+			delete(m.dirty, key)
+		}
+		m.mu.Unlock()
+	}
+	if ok {
+		e.delete()
+	}
+}
+
+func (e *entry) delete() (hadValue bool) {
+	for {
+		p := atomic.LoadPointer(&e.p)
+		if p == nil || p == expunged {
+			return false
+		}
+		if atomic.CompareAndSwapPointer(&e.p, p, nil) {
+			return true
+		}
+	}
 }
 
 // Range calls f sequentially for each key and value present in the map.
 // If f returns false, range stops the iteration.
 //
-// Calls to other Map methods may block until Range returns.
-// The function f must not call any other methods on the Map.
-//
 // Range does not necessarily correspond to any consistent snapshot of the Map's
 // contents: no key will be visited more than once, but if the value for any key
 // is stored or deleted concurrently, Range may reflect any mapping for that key
 // from any point during the Range call.
+//
+// Range may be O(N) with the number of elements in the map even if f returns
+// false after a constant number of calls.
 func (m *Map) Range(f func(key, value interface{}) bool) {
-	clean, _ := m.clean.Load().(map[interface{}]interface{})
-	if clean == nil {
+	// We need to be able to iterate over all of the keys that were already
+	// present at the start of the call to Range.
+	// If read.amended is false, then read.m satisfies that property without
+	// requiring us to hold m.mu for a long time.
+	read, _ := m.read.Load().(readOnly)
+	if read.amended {
+		// m.dirty contains keys not in read.m. Fortunately, Range is already O(N)
+		// (assuming the caller does not break out early), so a call to Range
+		// amortizes an entire copy of the map: we can promote the dirty copy
+		// immediately!
 		m.mu.Lock()
-		if m.dirty == nil {
-			clean, _ = m.clean.Load().(map[interface{}]interface{})
-			if clean == nil {
-				// Completely empty — add an empty map to bypass m.mu next time.
-				m.clean.Store(map[interface{}]interface{}{})
-			}
-		} else {
-			// Range is already O(N), so a call to Range amortizes an entire copy of
-			// the map.  If it is dirty, we can promote it to clean immediately!
-			clean = m.dirty
-			m.clean.Store(clean)
+		read, _ = m.read.Load().(readOnly)
+		if read.amended {
+			read = readOnly{m: m.dirty}
+			m.read.Store(read)
 			m.dirty = nil
+			m.misses = 0
 		}
 		m.mu.Unlock()
 	}
 
-	for k, v := range clean {
+	for k, e := range read.m {
+		v, ok := e.load()
+		if !ok {
+			continue
+		}
 		if !f(k, v) {
 			break
 		}
@@ -168,25 +337,36 @@
 }
 
 func (m *Map) missLocked() {
-	if m.misses++; m.misses >= len(m.dirty) {
-		m.clean.Store(m.dirty)
-		m.dirty = nil
+	m.misses++
+	if m.misses < len(m.dirty) {
+		return
 	}
+	m.read.Store(readOnly{m: m.dirty})
+	m.dirty = nil
+	m.misses = 0
 }
 
-// dirtyLocked prepares the map for a subsequent write.
-// It ensures that the dirty field is non-nil and clean is nil by making a deep
-// copy of clean.
 func (m *Map) dirtyLocked() {
-	m.misses = 0
 	if m.dirty != nil {
 		return
 	}
 
-	clean, _ := m.clean.Load().(map[interface{}]interface{})
-	m.dirty = make(map[interface{}]interface{}, len(clean))
-	for k, v := range clean {
-		m.dirty[k] = v
+	read, _ := m.read.Load().(readOnly)
+	m.dirty = make(map[interface{}]*entry, len(read.m))
+	for k, e := range read.m {
+		if !e.tryExpungeLocked() {
+			m.dirty[k] = e
+		}
 	}
-	m.clean.Store(map[interface{}]interface{}(nil))
+}
+
+func (e *entry) tryExpungeLocked() (isExpunged bool) {
+	p := atomic.LoadPointer(&e.p)
+	for p == nil {
+		if atomic.CompareAndSwapPointer(&e.p, nil, expunged) {
+			return true
+		}
+		p = atomic.LoadPointer(&e.p)
+	}
+	return p == expunged
 }
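
For reference, a small usage sketch of the resulting API (assuming the
golang.org/x/sync/syncmap import path for this package):

	package main

	import (
		"fmt"

		"golang.org/x/sync/syncmap"
	)

	func main() {
		var m syncmap.Map // the zero Map is valid and empty

		m.Store("greeting", "hello")
		if v, ok := m.Load("greeting"); ok {
			fmt.Println(v) // hello
		}

		// The existing value wins; loaded reports that it was found.
		actual, loaded := m.LoadOrStore("greeting", "hi")
		fmt.Println(actual, loaded) // hello true

		m.Delete("greeting")

		m.Store("a", 1)
		m.Store("b", 2)
		m.Range(func(k, v interface{}) bool {
			fmt.Println(k, v) // iteration order is unspecified
			return true       // return false to stop early
		})
	}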