runtime: convert map implementation to Go.

The Go version is a bit slower than the old C implementation, but not
painfully so.  There is still room for improvement (saving enough stack
space to mark these routines nosplit, and removing the requirement for
hash/eq stubs).

benchmark                              old ns/op     new ns/op     delta
BenchmarkMegMap                        23.5          24.2          +2.98%
BenchmarkMegOneMap                     14.9          15.7          +5.37%
BenchmarkMegEqMap                      71668         72234         +0.79%
BenchmarkMegEmptyMap                   4.05          4.93          +21.73%
BenchmarkSmallStrMap                   21.9          22.5          +2.74%
BenchmarkMapStringKeysEight_16         23.1          26.3          +13.85%
BenchmarkMapStringKeysEight_32         21.9          25.0          +14.16%
BenchmarkMapStringKeysEight_64         21.9          25.1          +14.61%
BenchmarkMapStringKeysEight_1M         21.9          25.0          +14.16%
BenchmarkIntMap                        21.8          12.5          -42.66%
BenchmarkRepeatedLookupStrMapKey32     39.3          30.2          -23.16%
BenchmarkRepeatedLookupStrMapKey1M     322353        322675        +0.10%
BenchmarkNewEmptyMap                   129           136           +5.43%
BenchmarkMapIter                       137           107           -21.90%
BenchmarkMapIterEmpty                  7.14          8.71          +21.99%
BenchmarkSameLengthMap                 5.24          6.82          +30.15%
BenchmarkBigKeyMap                     34.5          35.3          +2.32%
BenchmarkBigValMap                     36.1          36.1          +0.00%
BenchmarkSmallKeyMap                   26.9          26.7          -0.74%

LGTM=rsc
R=golang-codereviews, dave, dvyukov, rsc, gobot, khr
CC=golang-codereviews
https://golang.org/cl/99380043
diff --git a/src/pkg/runtime/hashmap_fast.go b/src/pkg/runtime/hashmap_fast.go
new file mode 100644
index 0000000..5055af4
--- /dev/null
+++ b/src/pkg/runtime/hashmap_fast.go
@@ -0,0 +1,414 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime
+
+import (
+	"unsafe"
+)
+
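+// mapaccess1_fast32 is the lookup fast path for maps with 4-byte keys.
+// It returns a pointer to the value for key, or a pointer to the value
+// type's zero object if key is not present in h.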
+func mapaccess1_fast32(t *maptype, h *hmap, key uint32) unsafe.Pointer {
+	if raceenabled && h != nil {
+		callerpc := gogetcallerpc(unsafe.Pointer(&t))
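+		// The func value fn points at the function's code pointer;
+		// two dereferences through &fn yield this function's entry PC.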
+		fn := mapaccess1_fast32
+		pc := **(**uintptr)(unsafe.Pointer(&fn))
+		racereadpc(unsafe.Pointer(h), callerpc, pc)
+	}
+	if h == nil || h.count == 0 {
+		return unsafe.Pointer(t.elem.zero)
+	}
+	var b *bmap
+	if h.B == 0 {
+		// One-bucket table.  No need to hash.
+		b = (*bmap)(h.buckets)
+	} else {
+		hash := gohash(t.key.alg, unsafe.Pointer(&key), 4, uintptr(h.hash0))
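+		// The low h.B bits of the hash select one of the 1<<h.B buckets.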
+		m := uintptr(1)<<h.B - 1
+		b = (*bmap)(add(h.buckets, (hash&m)*uintptr(h.bucketsize)))
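+		// The map is growing if oldbuckets is non-nil.  If the matching
+		// old bucket has not been evacuated yet, it still holds the live data.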
+		if c := h.oldbuckets; c != nil {
+			oldb := (*bmap)(add(c, (hash&(m>>1))*uintptr(h.bucketsize)))
+			if !evacuated(oldb) {
+				b = oldb
+			}
+		}
+	}
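+	// Scan the bucket and its overflow chain for a matching key.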
+	for {
+		for i := uintptr(0); i < bucketCnt; i++ {
+			k := *((*uint32)(add(unsafe.Pointer(b), dataOffset+i*4)))
+			if k != key {
+				continue
+			}
+			t := *((*uint8)(add(unsafe.Pointer(b), i))) // b.topbits[i] without the bounds check
+			if t == empty {
+				continue
+			}
+			return add(unsafe.Pointer(b), dataOffset+bucketCnt*4+i*uintptr(h.valuesize))
+		}
+		b = b.overflow
+		if b == nil {
+			return unsafe.Pointer(t.elem.zero)
+		}
+	}
+}
+
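+// mapaccess2_fast32 is like mapaccess1_fast32, but also reports
+// whether the key was present.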
+func mapaccess2_fast32(t *maptype, h *hmap, key uint32) (unsafe.Pointer, bool) {
+	if raceenabled && h != nil {
+		callerpc := gogetcallerpc(unsafe.Pointer(&t))
+		fn := mapaccess2_fast32
+		pc := **(**uintptr)(unsafe.Pointer(&fn))
+		racereadpc(unsafe.Pointer(h), callerpc, pc)
+	}
+	if h == nil || h.count == 0 {
+		return unsafe.Pointer(t.elem.zero), false
+	}
+	var b *bmap
+	if h.B == 0 {
+		// One-bucket table.  No need to hash.
+		b = (*bmap)(h.buckets)
+	} else {
+		hash := gohash(t.key.alg, unsafe.Pointer(&key), 4, uintptr(h.hash0))
+		m := uintptr(1)<<h.B - 1
+		b = (*bmap)(add(h.buckets, (hash&m)*uintptr(h.bucketsize)))
+		if c := h.oldbuckets; c != nil {
+			oldb := (*bmap)(add(c, (hash&(m>>1))*uintptr(h.bucketsize)))
+			if !evacuated(oldb) {
+				b = oldb
+			}
+		}
+	}
+	for {
+		for i := uintptr(0); i < bucketCnt; i++ {
+			k := *((*uint32)(add(unsafe.Pointer(b), dataOffset+i*4)))
+			if k != key {
+				continue
+			}
+			t := *((*uint8)(add(unsafe.Pointer(b), i))) // b.topbits[i] without the bounds check
+			if t == empty {
+				continue
+			}
+			return add(unsafe.Pointer(b), dataOffset+bucketCnt*4+i*uintptr(h.valuesize)), true
+		}
+		b = b.overflow
+		if b == nil {
+			return unsafe.Pointer(t.elem.zero), false
+		}
+	}
+}
+
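+// mapaccess1_fast64 is the lookup fast path for maps with 8-byte keys.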
+func mapaccess1_fast64(t *maptype, h *hmap, key uint64) unsafe.Pointer {
+	if raceenabled && h != nil {
+		callerpc := gogetcallerpc(unsafe.Pointer(&t))
+		fn := mapaccess1_fast64
+		pc := **(**uintptr)(unsafe.Pointer(&fn))
+		racereadpc(unsafe.Pointer(h), callerpc, pc)
+	}
+	if h == nil || h.count == 0 {
+		return unsafe.Pointer(t.elem.zero)
+	}
+	var b *bmap
+	if h.B == 0 {
+		// One-bucket table.  No need to hash.
+		b = (*bmap)(h.buckets)
+	} else {
+		hash := gohash(t.key.alg, unsafe.Pointer(&key), 8, uintptr(h.hash0))
+		m := uintptr(1)<<h.B - 1
+		b = (*bmap)(add(h.buckets, (hash&m)*uintptr(h.bucketsize)))
+		if c := h.oldbuckets; c != nil {
+			oldb := (*bmap)(add(c, (hash&(m>>1))*uintptr(h.bucketsize)))
+			if !evacuated(oldb) {
+				b = oldb
+			}
+		}
+	}
+	for {
+		for i := uintptr(0); i < bucketCnt; i++ {
+			k := *((*uint64)(add(unsafe.Pointer(b), dataOffset+i*8)))
+			if k != key {
+				continue
+			}
+			t := *((*uint8)(add(unsafe.Pointer(b), i))) // b.topbits[i] without the bounds check
+			if t == empty {
+				continue
+			}
+			return add(unsafe.Pointer(b), dataOffset+bucketCnt*8+i*uintptr(h.valuesize))
+		}
+		b = b.overflow
+		if b == nil {
+			return unsafe.Pointer(t.elem.zero)
+		}
+	}
+}
+
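+// mapaccess2_fast64 is like mapaccess1_fast64, but also reports
+// whether the key was present.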
+func mapaccess2_fast64(t *maptype, h *hmap, key uint64) (unsafe.Pointer, bool) {
+	if raceenabled && h != nil {
+		callerpc := gogetcallerpc(unsafe.Pointer(&t))
+		fn := mapaccess2_fast64
+		pc := **(**uintptr)(unsafe.Pointer(&fn))
+		racereadpc(unsafe.Pointer(h), callerpc, pc)
+	}
+	if h == nil || h.count == 0 {
+		return unsafe.Pointer(t.elem.zero), false
+	}
+	var b *bmap
+	if h.B == 0 {
+		// One-bucket table.  No need to hash.
+		b = (*bmap)(h.buckets)
+	} else {
+		hash := gohash(t.key.alg, unsafe.Pointer(&key), 8, uintptr(h.hash0))
+		m := uintptr(1)<<h.B - 1
+		b = (*bmap)(add(h.buckets, (hash&m)*uintptr(h.bucketsize)))
+		if c := h.oldbuckets; c != nil {
+			oldb := (*bmap)(add(c, (hash&(m>>1))*uintptr(h.bucketsize)))
+			if !evacuated(oldb) {
+				b = oldb
+			}
+		}
+	}
+	for {
+		for i := uintptr(0); i < bucketCnt; i++ {
+			k := *((*uint64)(add(unsafe.Pointer(b), dataOffset+i*8)))
+			if k != key {
+				continue
+			}
+			t := *((*uint8)(add(unsafe.Pointer(b), i))) // b.topbits[i] without the bounds check
+			if t == empty {
+				continue
+			}
+			return add(unsafe.Pointer(b), dataOffset+bucketCnt*8+i*uintptr(h.valuesize)), true
+		}
+		b = b.overflow
+		if b == nil {
+			return unsafe.Pointer(t.elem.zero), false
+		}
+	}
+}
+
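+// mapaccess1_faststr is the lookup fast path for maps with string keys.
+// One-bucket maps are searched without hashing; hashing is a fallback
+// when two long keys both look like possible matches.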
+func mapaccess1_faststr(t *maptype, h *hmap, ky string) unsafe.Pointer {
+	if raceenabled && h != nil {
+		callerpc := gogetcallerpc(unsafe.Pointer(&t))
+		fn := mapaccess1_faststr
+		pc := **(**uintptr)(unsafe.Pointer(&fn))
+		racereadpc(unsafe.Pointer(h), callerpc, pc)
+	}
+	if h == nil || h.count == 0 {
+		return unsafe.Pointer(t.elem.zero)
+	}
+	key := (*stringStruct)(unsafe.Pointer(&ky))
+	if h.B == 0 {
+		// One-bucket table.
+		b := (*bmap)(h.buckets)
+		if key.len < 32 {
+			// short key, doing lots of comparisons is ok
+			for i := uintptr(0); i < bucketCnt; i++ {
+				t := *((*uint8)(add(unsafe.Pointer(b), i))) // b.topbits[i] without the bounds check
+				if t == empty {
+					continue
+				}
+				k := (*stringStruct)(add(unsafe.Pointer(b), dataOffset+i*2*ptrSize))
+				if k.len != key.len {
+					continue
+				}
+				if k.str == key.str || gomemeq(k.str, key.str, uintptr(key.len)) {
+					return add(unsafe.Pointer(b), dataOffset+bucketCnt*2*ptrSize+i*uintptr(h.valuesize))
+				}
+			}
+			return unsafe.Pointer(t.elem.zero)
+		}
+		// long key, try not to do more comparisons than necessary
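+		// keymaybe records the one slot whose key might still match;
+		// bucketCnt means no candidate has been found yet.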
+		keymaybe := uintptr(bucketCnt)
+		for i := uintptr(0); i < bucketCnt; i++ {
+			t := *((*uint8)(add(unsafe.Pointer(b), i))) // b.topbits[i] without the bounds check
+			if t == empty {
+				continue
+			}
+			k := (*stringStruct)(add(unsafe.Pointer(b), dataOffset+i*2*ptrSize))
+			if k.len != key.len {
+				continue
+			}
+			if k.str == key.str {
+				return add(unsafe.Pointer(b), dataOffset+bucketCnt*2*ptrSize+i*uintptr(h.valuesize))
+			}
+			// check first 4 bytes
+			// TODO: on amd64/386 at least, make this compile to one 4-byte comparison instead of
+			// four 1-byte comparisons.
+			if *((*[4]byte)(key.str)) != *((*[4]byte)(k.str)) {
+				continue
+			}
+			// check last 4 bytes
+			if *((*[4]byte)(add(key.str, uintptr(key.len)-4))) != *((*[4]byte)(add(k.str, uintptr(key.len)-4))) {
+				continue
+			}
+			if keymaybe != bucketCnt {
+				// Two keys are potential matches.  Use hash to distinguish them.
+				goto dohash
+			}
+			keymaybe = i
+		}
+		if keymaybe != bucketCnt {
+			k := (*stringStruct)(add(unsafe.Pointer(b), dataOffset+keymaybe*2*ptrSize))
+			if gomemeq(k.str, key.str, uintptr(key.len)) {
+				return add(unsafe.Pointer(b), dataOffset+bucketCnt*2*ptrSize+keymaybe*uintptr(h.valuesize))
+			}
+		}
+		return unsafe.Pointer(t.elem.zero)
+	}
+dohash:
+	hash := gohash(t.key.alg, unsafe.Pointer(&ky), 2*ptrSize, uintptr(h.hash0))
+	m := uintptr(1)<<h.B - 1
+	b := (*bmap)(add(h.buckets, (hash&m)*uintptr(h.bucketsize)))
+	if c := h.oldbuckets; c != nil {
+		oldb := (*bmap)(add(c, (hash&(m>>1))*uintptr(h.bucketsize)))
+		if !evacuated(oldb) {
+			b = oldb
+		}
+	}
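+	// tophash is the top byte of the hash.  Values below minTopHash are
+	// reserved for cell states, so real hashes are offset past them.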
+	top := uint8(hash >> (ptrSize*8 - 8))
+	if top < minTopHash {
+		top += minTopHash
+	}
+	for {
+		for i := uintptr(0); i < bucketCnt; i++ {
+			t := *((*uint8)(add(unsafe.Pointer(b), i))) // b.topbits[i] without the bounds check
+			if t != top {
+				continue
+			}
+			k := (*stringStruct)(add(unsafe.Pointer(b), dataOffset+i*2*ptrSize))
+			if k.len != key.len {
+				continue
+			}
+			if k.str == key.str || gomemeq(k.str, key.str, uintptr(key.len)) {
+				return add(unsafe.Pointer(b), dataOffset+bucketCnt*2*ptrSize+i*uintptr(h.valuesize))
+			}
+		}
+		b = b.overflow
+		if b == nil {
+			return unsafe.Pointer(t.elem.zero)
+		}
+	}
+}
+
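+// mapaccess2_faststr is like mapaccess1_faststr, but also reports
+// whether the key was present.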
+func mapaccess2_faststr(t *maptype, h *hmap, ky string) (unsafe.Pointer, bool) {
+	if raceenabled && h != nil {
+		callerpc := gogetcallerpc(unsafe.Pointer(&t))
+		fn := mapaccess2_faststr
+		pc := **(**uintptr)(unsafe.Pointer(&fn))
+		racereadpc(unsafe.Pointer(h), callerpc, pc)
+	}
+	if h == nil || h.count == 0 {
+		return unsafe.Pointer(t.elem.zero), false
+	}
+	key := (*stringStruct)(unsafe.Pointer(&ky))
+	if h.B == 0 {
+		// One-bucket table.
+		b := (*bmap)(h.buckets)
+		if key.len < 32 {
+			// short key, doing lots of comparisons is ok
+			for i := uintptr(0); i < bucketCnt; i++ {
+				t := *((*uint8)(add(unsafe.Pointer(b), i))) // b.topbits[i] without the bounds check
+				if t == empty {
+					continue
+				}
+				k := (*stringStruct)(add(unsafe.Pointer(b), dataOffset+i*2*ptrSize))
+				if k.len != key.len {
+					continue
+				}
+				if k.str == key.str || gomemeq(k.str, key.str, uintptr(key.len)) {
+					return add(unsafe.Pointer(b), dataOffset+bucketCnt*2*ptrSize+i*uintptr(h.valuesize)), true
+				}
+			}
+			return unsafe.Pointer(t.elem.zero), false
+		}
+		// long key, try not to do more comparisons than necessary
+		keymaybe := uintptr(bucketCnt)
+		for i := uintptr(0); i < bucketCnt; i++ {
+			t := *((*uint8)(add(unsafe.Pointer(b), i))) // b.topbits[i] without the bounds check
+			if t == empty {
+				continue
+			}
+			k := (*stringStruct)(add(unsafe.Pointer(b), dataOffset+i*2*ptrSize))
+			if k.len != key.len {
+				continue
+			}
+			if k.str == key.str {
+				return add(unsafe.Pointer(b), dataOffset+bucketCnt*2*ptrSize+i*uintptr(h.valuesize)), true
+			}
+			// check first 4 bytes
+			if *((*[4]byte)(key.str)) != *((*[4]byte)(k.str)) {
+				continue
+			}
+			// check last 4 bytes
+			if *((*[4]byte)(add(key.str, uintptr(key.len)-4))) != *((*[4]byte)(add(k.str, uintptr(key.len)-4))) {
+				continue
+			}
+			if keymaybe != bucketCnt {
+				// Two keys are potential matches.  Use hash to distinguish them.
+				goto dohash
+			}
+			keymaybe = i
+		}
+		if keymaybe != bucketCnt {
+			k := (*stringStruct)(add(unsafe.Pointer(b), dataOffset+keymaybe*2*ptrSize))
+			if gomemeq(k.str, key.str, uintptr(key.len)) {
+				return add(unsafe.Pointer(b), dataOffset+bucketCnt*2*ptrSize+keymaybe*uintptr(h.valuesize)), true
+			}
+		}
+		return unsafe.Pointer(t.elem.zero), false
+	}
+dohash:
+	hash := gohash(t.key.alg, unsafe.Pointer(&ky), 2*ptrSize, uintptr(h.hash0))
+	m := uintptr(1)<<h.B - 1
+	b := (*bmap)(add(h.buckets, (hash&m)*uintptr(h.bucketsize)))
+	if c := h.oldbuckets; c != nil {
+		oldb := (*bmap)(add(c, (hash&(m>>1))*uintptr(h.bucketsize)))
+		if !evacuated(oldb) {
+			b = oldb
+		}
+	}
+	top := uint8(hash >> (ptrSize*8 - 8))
+	if top < minTopHash {
+		top += minTopHash
+	}
+	for {
+		for i := uintptr(0); i < bucketCnt; i++ {
+			t := *((*uint8)(add(unsafe.Pointer(b), i))) // b.topbits[i] without the bounds check
+			if t != top {
+				continue
+			}
+			k := (*stringStruct)(add(unsafe.Pointer(b), dataOffset+i*2*ptrSize))
+			if k.len != key.len {
+				continue
+			}
+			if k.str == key.str || gomemeq(k.str, key.str, uintptr(key.len)) {
+				return add(unsafe.Pointer(b), dataOffset+bucketCnt*2*ptrSize+i*uintptr(h.valuesize)), true
+			}
+		}
+		b = b.overflow
+		if b == nil {
+			return unsafe.Pointer(t.elem.zero), false
+		}
+	}
+}