runtime: do not scan maps when k/v do not contain pointers

Currently we scan maps even if k/v do not contain pointers.
This is required because overflow buckets hang off the main table.
This change introduces a separate array that contains pointers to all
overflow buckets and keeps them alive. Buckets themselves are marked
as containing no pointers and are not scanned by the GC (if k/v do not
contain pointers).

This brings maps in line with slices and chans -- GC does not scan
their contents if elements do not contain pointers.

Currently, scanning a map[int]int with 2e8 entries (~8GB heap)
takes ~8 seconds. With this change, scanning takes negligible time.
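
For illustration, here is a rough sketch of the scheme in ordinary Go.
The names (bucket, hmapSketch, newOverflow) are invented for this
example; the real runtime stores the overflow pointer at the end of
the bucket's memory and keeps it out of the bucket type's pointer map.

	package sketch

	// bucket holds only pointer-free keys/values, so the GC never
	// needs to look inside it.
	type bucket struct {
		keys     [8]uint64
		values   [8]uint64
		overflow *bucket // invisible to the GC in the real layout
	}

	// hmapSketch mirrors the idea behind hmap.overflow: a side slice
	// that keeps every overflow bucket reachable.
	type hmapSketch struct {
		buckets  []bucket  // main table, not scanned by the GC
		overflow []*bucket // keeps overflow buckets alive
	}

	// newOverflow allocates an overflow bucket for b and records it in
	// h.overflow, so it stays alive even though nothing the GC scans
	// points to it.
	func (h *hmapSketch) newOverflow(b *bucket) *bucket {
		ovf := new(bucket)
		h.overflow = append(h.overflow, ovf)
		b.overflow = ovf
		return ovf
	}

An iterator additionally snapshots the overflow slices (see
hiter.overflow below), so overflow buckets stay reachable even if the
map grows while it is being walked.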

Update #9477.

Change-Id: Id8a04066a53d2f743474cad406afb9f30f00eaae
Reviewed-on: https://go-review.googlesource.com/3288
Reviewed-by: Keith Randall <khr@golang.org>
diff --git a/src/cmd/gc/reflect.c b/src/cmd/gc/reflect.c
index 8d302b5..61a63c0 100644
--- a/src/cmd/gc/reflect.c
+++ b/src/cmd/gc/reflect.c
@@ -181,6 +181,10 @@
 	valuesfield->down = overflowfield;
 	overflowfield->down = T;
 
+	// See comment on hmap.overflow in ../../runtime/hashmap.go.
+	if(!haspointers(t->type) && !haspointers(t->down))
+		bucket->haspointers = 1;  // no pointers
+
 	bucket->width = offset;
 	bucket->local = t->local;
 	t->bucket = bucket;
@@ -197,7 +201,7 @@
 hmap(Type *t)
 {
 	Type *h, *bucket;
-	Type *bucketsfield, *oldbucketsfield;
+	Type *bucketsfield, *oldbucketsfield, *overflowfield;
 	int32 offset;
 
 	if(t->hmap != T)
@@ -208,9 +212,10 @@
 	h->noalg = 1;
 
 	offset = widthint; // count
-	offset += 4;       // flags
-	offset += 4;       // hash0
+	offset += 1;       // flags
 	offset += 1;       // B
+	offset += 2;       // padding
+	offset += 4;       // hash0
 	offset = (offset + widthptr - 1) / widthptr * widthptr;
 	
 	bucketsfield = typ(TFIELD);
@@ -227,12 +232,20 @@
 	oldbucketsfield->sym->name = "oldbuckets";
 	offset += widthptr;
 
-	offset += widthptr; // nevacuate (last field in Hmap)
+	offset += widthptr; // nevacuate
+
+	overflowfield = typ(TFIELD);
+	overflowfield->type = types[TUNSAFEPTR];
+	overflowfield->width = offset;
+	overflowfield->sym = mal(sizeof(Sym));
+	overflowfield->sym->name = "overflow";
+	offset += widthptr;
 
 	// link up fields
 	h->type = bucketsfield;
 	bucketsfield->down = oldbucketsfield;
-	oldbucketsfield->down = T;
+	oldbucketsfield->down = overflowfield;
+	overflowfield->down = T;
 
 	h->width = offset;
 	h->local = t->local;
@@ -245,7 +258,7 @@
 hiter(Type *t)
 {
 	int32 n, off;
-	Type *field[7];
+	Type *field[9];
 	Type *i;
 
 	if(t->hiter != T)
@@ -259,6 +272,7 @@
 	//    h *Hmap
 	//    buckets *Bucket
 	//    bptr *Bucket
+	//    overflow unsafe.Pointer
 	//    other [4]uintptr
 	// }
 	// must match ../../runtime/hashmap.c:hash_iter.
@@ -292,29 +306,39 @@
 	field[5]->sym = mal(sizeof(Sym));
 	field[5]->sym->name = "bptr";
 	
-	// all other non-pointer fields
 	field[6] = typ(TFIELD);
-	field[6]->type = typ(TARRAY);
-	field[6]->type->type = types[TUINTPTR];
-	field[6]->type->bound = 4;
-	field[6]->type->width = 4 * widthptr;
+	field[6]->type = types[TUNSAFEPTR];
 	field[6]->sym = mal(sizeof(Sym));
-	field[6]->sym->name = "other";
+	field[6]->sym->name = "overflow0";
+
+	field[7] = typ(TFIELD);
+	field[7]->type = types[TUNSAFEPTR];
+	field[7]->sym = mal(sizeof(Sym));
+	field[7]->sym->name = "overflow1";
+
+	// all other non-pointer fields
+	field[8] = typ(TFIELD);
+	field[8]->type = typ(TARRAY);
+	field[8]->type->type = types[TUINTPTR];
+	field[8]->type->bound = 4;
+	field[8]->type->width = 4 * widthptr;
+	field[8]->sym = mal(sizeof(Sym));
+	field[8]->sym->name = "other";
 	
 	// build iterator struct holding the above fields
 	i = typ(TSTRUCT);
 	i->noalg = 1;
 	i->type = field[0];
 	off = 0;
-	for(n = 0; n < 6; n++) {
+	for(n = 0; n < nelem(field)-1; n++) {
 		field[n]->down = field[n+1];
 		field[n]->width = off;
 		off += field[n]->type->width;
 	}
-	field[6]->down = T;
-	off += field[6]->type->width;
-	if(off != 10 * widthptr)
-		yyerror("hash_iter size not correct %d %d", off, 10 * widthptr);
+	field[nelem(field)-1]->down = T;
+	off += field[nelem(field)-1]->type->width;
+	if(off != 12 * widthptr)
+		yyerror("hash_iter size not correct %d %d", off, 11 * widthptr);
 	t->hiter = i;
 	i->map = t;
 	return i;
diff --git a/src/reflect/type.go b/src/reflect/type.go
index a71d837..040b9c0 100644
--- a/src/reflect/type.go
+++ b/src/reflect/type.go
@@ -1469,9 +1469,8 @@
 
 	// Make a map type.
 	var imap interface{} = (map[unsafe.Pointer]unsafe.Pointer)(nil)
-	prototype := *(**mapType)(unsafe.Pointer(&imap))
 	mt := new(mapType)
-	*mt = *prototype
+	*mt = **(**mapType)(unsafe.Pointer(&imap))
 	mt.string = &s
 	mt.hash = fnv1(etyp.hash, 'm', byte(ktyp.hash>>24), byte(ktyp.hash>>16), byte(ktyp.hash>>8), byte(ktyp.hash))
 	mt.key = ktyp
@@ -1575,7 +1574,7 @@
 		for i := 0; i < c; i++ {
 			gc.appendProg(t.Field(i).Type.common())
 		}
-		if gc.size > oldsize + t.size {
+		if gc.size > oldsize+t.size {
 			panic("reflect: struct components are larger than the struct itself")
 		}
 		gc.size = oldsize + t.size
@@ -1650,6 +1649,12 @@
 )
 
 func bucketOf(ktyp, etyp *rtype) *rtype {
+	// See comment on hmap.overflow in ../runtime/hashmap.go.
+	var kind uint8
+	if ktyp.kind&kindNoPointers != 0 && etyp.kind&kindNoPointers != 0 {
+		kind = kindNoPointers
+	}
+
 	if ktyp.size > maxKeySize {
 		ktyp = PtrTo(ktyp).(*rtype)
 	}
@@ -1679,6 +1684,7 @@
 
 	b := new(rtype)
 	b.size = gc.size
+	b.kind = kind
 	b.gc[0], _ = gc.finalize()
 	s := "bucket(" + *ktyp.string + "," + *etyp.string + ")"
 	b.string = &s
diff --git a/src/runtime/hashmap.go b/src/runtime/hashmap.go
index f829e8f..058d1c7 100644
--- a/src/runtime/hashmap.go
+++ b/src/runtime/hashmap.go
@@ -106,13 +106,24 @@
 	// Note: the format of the Hmap is encoded in ../../cmd/gc/reflect.c and
 	// ../reflect/type.go.  Don't change this structure without also changing that code!
 	count int // # live cells == size of map.  Must be first (used by len() builtin)
-	flags uint32
-	hash0 uint32 // hash seed
+	flags uint8
 	B     uint8  // log_2 of # of buckets (can hold up to loadFactor * 2^B items)
+	hash0 uint32 // hash seed
 
 	buckets    unsafe.Pointer // array of 2^B Buckets. may be nil if count==0.
 	oldbuckets unsafe.Pointer // previous bucket array of half the size, non-nil only when growing
 	nevacuate  uintptr        // progress counter for evacuation (buckets less than this have been evacuated)
+
+	// If both key and value do not contain pointers, then we mark the bucket
+	// type as containing no pointers. This avoids scanning such maps.
+	// However, bmap.overflow is a pointer. In order to keep overflow buckets
+	// alive, we store pointers to all overflow buckets in hmap.overflow.
+	// Overflow is used only if key and value do not contain pointers.
+	// overflow[0] contains overflow buckets for hmap.buckets.
+	// overflow[1] contains overflow buckets for hmap.oldbuckets.
+	// The first indirection allows us to reduce static size of hmap.
+	// The second indirection allows us to store a pointer to the slice in hiter.
+	overflow *[2]*[]*bmap
 }
 
 // A bucket for a Go map.
@@ -135,6 +146,7 @@
 	h           *hmap
 	buckets     unsafe.Pointer // bucket ptr at hash_iter initialization time
 	bptr        *bmap          // current bucket
+	overflow    [2]*[]*bmap    // keeps overflow buckets alive
 	startBucket uintptr        // bucket iteration started at
 	offset      uint8          // intra-bucket offset to start from during iteration (should be big enough to hold bucketCnt-1)
 	wrapped     bool           // already wrapped around from end of bucket array to beginning
@@ -152,10 +164,24 @@
 func (b *bmap) overflow(t *maptype) *bmap {
 	return *(**bmap)(add(unsafe.Pointer(b), uintptr(t.bucketsize)-regSize))
 }
-func (b *bmap) setoverflow(t *maptype, ovf *bmap) {
+
+func (h *hmap) setoverflow(t *maptype, b, ovf *bmap) {
+	if t.bucket.kind&kindNoPointers != 0 {
+		h.createOverflow()
+		*h.overflow[0] = append(*h.overflow[0], ovf)
+	}
 	*(**bmap)(add(unsafe.Pointer(b), uintptr(t.bucketsize)-regSize)) = ovf
 }
 
+func (h *hmap) createOverflow() {
+	if h.overflow == nil {
+		h.overflow = new([2]*[]*bmap)
+	}
+	if h.overflow[0] == nil {
+		h.overflow[0] = new([]*bmap)
+	}
+}
+
 func makemap(t *maptype, hint int64) *hmap {
 	if sz := unsafe.Sizeof(hmap{}); sz > 48 || sz != uintptr(t.hmap.size) {
 		throw("bad hmap size")
@@ -463,7 +489,7 @@
 			memstats.next_gc = memstats.heap_alloc
 		}
 		newb := (*bmap)(newobject(t.bucket))
-		b.setoverflow(t, newb)
+		h.setoverflow(t, b, newb)
 		inserti = &newb.tophash[0]
 		insertk = add(unsafe.Pointer(newb), dataOffset)
 		insertv = add(insertk, bucketCnt*uintptr(t.keysize))
@@ -548,6 +574,8 @@
 	it.h = nil
 	it.buckets = nil
 	it.bptr = nil
+	it.overflow[0] = nil
+	it.overflow[1] = nil
 
 	if raceenabled && h != nil {
 		callerpc := getcallerpc(unsafe.Pointer(&t))
@@ -560,7 +588,7 @@
 		return
 	}
 
-	if unsafe.Sizeof(hiter{})/ptrSize != 10 {
+	if unsafe.Sizeof(hiter{})/ptrSize != 12 {
 		throw("hash_iter size incorrect") // see ../../cmd/gc/reflect.c
 	}
 	it.t = t
@@ -569,6 +597,14 @@
 	// grab snapshot of bucket state
 	it.B = h.B
 	it.buckets = h.buckets
+	if t.bucket.kind&kindNoPointers != 0 {
+		// Allocate the current slice and remember pointers to both current and old.
+		// This keeps all relevant overflow buckets alive even if
+		// the table grows and/or overflow buckets are added to the table
+		// while we are iterating.
+		h.createOverflow()
+		it.overflow = *h.overflow
+	}
 
 	// decide where to start
 	r := uintptr(fastrand1())
@@ -585,14 +621,8 @@
 
 	// Remember we have an iterator.
 	// Can run concurrently with another hash_iter_init().
-	for {
-		old := h.flags
-		if old == old|iterator|oldIterator {
-			break
-		}
-		if cas(&h.flags, old, old|iterator|oldIterator) {
-			break
-		}
+	if old := h.flags; old&(iterator|oldIterator) != iterator|oldIterator {
+		atomicor8(&h.flags, iterator|oldIterator)
 	}
 
 	mapiternext(it)
@@ -753,6 +783,15 @@
 	h.buckets = newbuckets
 	h.nevacuate = 0
 
+	if h.overflow != nil {
+		// Promote current overflow buckets to the old generation.
+		if h.overflow[1] != nil {
+			throw("overflow is not nil")
+		}
+		h.overflow[1] = h.overflow[0]
+		h.overflow[0] = nil
+	}
+
 	// the actual copying of the hash table data is done incrementally
 	// by growWork() and evacuate().
 }
@@ -836,7 +875,7 @@
 							memstats.next_gc = memstats.heap_alloc
 						}
 						newx := (*bmap)(newobject(t.bucket))
-						x.setoverflow(t, newx)
+						h.setoverflow(t, x, newx)
 						x = newx
 						xi = 0
 						xk = add(unsafe.Pointer(x), dataOffset)
@@ -863,7 +902,7 @@
 							memstats.next_gc = memstats.heap_alloc
 						}
 						newy := (*bmap)(newobject(t.bucket))
-						y.setoverflow(t, newy)
+						h.setoverflow(t, y, newy)
 						y = newy
 						yi = 0
 						yk = add(unsafe.Pointer(y), dataOffset)
@@ -899,6 +938,12 @@
 		if oldbucket+1 == newbit { // newbit == # of oldbuckets
 			// Growing is all done.  Free old main bucket array.
 			h.oldbuckets = nil
+			// Can discard old overflow buckets as well.
+			// If they are still referenced by an iterator,
+			// then the iterator holds a pointer to the slice.
+			if h.overflow != nil {
+				h.overflow[1] = nil
+			}
 		}
 	}
 }