runtime: make the GC bitmap a byte array
Half the code in the garbage collector accesses the bitmap
as an array of bytes instead of as an array of uintptrs.
This is tricky to do correctly in a portable fashion:
byte-indexing into a uintptr assumes a particular byte order,
and the mixed-width accesses break on big-endian systems.
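
For example, the old scanblock fast path (removed below) marked an
object by addressing one byte inside a bitmap word:

	((uint8*)bitp)[shift/8] = bits8 | (bitMarked<<(shift&7));

Byte shift/8 holds the quadruples for shift only when the word's
least-significant byte is at offset 0, i.e. on little-endian hosts.
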
Make the bitmap a byte array.
This simplifies markallocated, scanblock, and span sweep along the
way, since we no longer need to recalculate the bitmap position for
each word.
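
With one bitmap byte per two heap words, every lookup reduces to the
same byte arithmetic (a sketch using the mgc0.h names; the bitmap
still grows down from arena_start):

	off = (uintptr*)p - (uintptr*)arena_start;        // word offset of p
	bitp = arena_start - off/wordsPerBitmapByte - 1;  // bitmap byte for p
	shift = (off % wordsPerBitmapByte) * gcBits;      // 0 or 4 within the byte
	bits = (*bitp >> shift) & bitMask;

Marking an object concurrently with other workers likewise becomes a
single-byte atomic OR (the new runtime·atomicor8) instead of a
uintptr-wide CAS loop.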

LGTM=khr
R=golang-codereviews, khr
CC=golang-codereviews, rlh, rsc
https://golang.org/cl/125250043
diff --git a/src/pkg/runtime/asm_386.s b/src/pkg/runtime/asm_386.s
index 0607bc8..16e3f31 100644
--- a/src/pkg/runtime/asm_386.s
+++ b/src/pkg/runtime/asm_386.s
@@ -619,6 +619,14 @@
 	XADDL	AX, (SP)
 	RET
 
+// void	runtime·atomicor8(byte volatile*, byte);
+TEXT runtime·atomicor8(SB), NOSPLIT, $0-8
+	MOVL	ptr+0(FP), AX
+	MOVB	val+4(FP), BX
+	LOCK
+	ORB	BX, (AX)
+	RET
+
 // void jmpdefer(fn, sp);
 // called from deferreturn.
 // 1. pop the caller
diff --git a/src/pkg/runtime/asm_amd64.s b/src/pkg/runtime/asm_amd64.s
index d94df0b..6446b5d 100644
--- a/src/pkg/runtime/asm_amd64.s
+++ b/src/pkg/runtime/asm_amd64.s
@@ -702,6 +702,14 @@
 	XCHGQ	AX, 0(BX)
 	RET
 
+// void	runtime·atomicor8(byte volatile*, byte);
+TEXT runtime·atomicor8(SB), NOSPLIT, $0-16
+	MOVQ	ptr+0(FP), AX
+	MOVB	val+8(FP), BX
+	LOCK
+	ORB	BX, (AX)
+	RET
+
 // void jmpdefer(fn, sp);
 // called from deferreturn.
 // 1. pop the caller
diff --git a/src/pkg/runtime/atomic_arm.c b/src/pkg/runtime/atomic_arm.c
index d914475..537bf18 100644
--- a/src/pkg/runtime/atomic_arm.c
+++ b/src/pkg/runtime/atomic_arm.c
@@ -167,3 +167,19 @@
 	*addr = v;
 	runtime·unlock(LOCK(addr));
 }
+
+#pragma textflag NOSPLIT
+void
+runtime·atomicor8(byte volatile *addr, byte v)
+{
+	uint32 *addr32, old, word;
+
+	// Align down to 4 bytes and use 32-bit CAS.
+	addr32 = (uint32*)((uintptr)addr & ~3);
+	word = ((uint32)v) << (((uintptr)addr & 3) * 8);
+	for(;;) {
+		old = *addr32;
+		if(runtime·cas(addr32, old, old|word))
+			break;
+	}
+}
diff --git a/src/pkg/runtime/heapdump.c b/src/pkg/runtime/heapdump.c
index babb32f..63d80b8 100644
--- a/src/pkg/runtime/heapdump.c
+++ b/src/pkg/runtime/heapdump.c
@@ -820,7 +820,8 @@
 static BitVector
 makeheapobjbv(byte *p, uintptr size)
 {
-	uintptr off, shift, *bitp, bits, nptr, i;
+	uintptr off, nptr, i;
+	byte shift, *bitp, bits;
 	bool mw;
 
 	// Extend the temp buffer if necessary.
@@ -838,13 +839,13 @@
 	mw = false;
 	for(i = 0; i < nptr; i++) {
 		off = (uintptr*)(p + i*PtrSize) - (uintptr*)runtime·mheap.arena_start;
-		bitp = (uintptr*)runtime·mheap.arena_start - off/wordsPerBitmapWord - 1;
-		shift = (off % wordsPerBitmapWord) * gcBits;
-		bits = (*bitp >> (shift + 2)) & 3;
+		bitp = runtime·mheap.arena_start - off/wordsPerBitmapByte - 1;
+		shift = (off % wordsPerBitmapByte) * gcBits;
+		bits = (*bitp >> (shift + 2)) & BitsMask;
 		if(!mw && bits == BitsDead)
 			break;  // end of heap object
 		mw = !mw && bits == BitsMultiWord;
-		tmpbuf[i*BitsPerPointer/8] &= ~(3<<((i*BitsPerPointer)%8));
+		tmpbuf[i*BitsPerPointer/8] &= ~(BitsMask<<((i*BitsPerPointer)%8));
 		tmpbuf[i*BitsPerPointer/8] |= bits<<((i*BitsPerPointer)%8);
 	}
 	return (BitVector){i*BitsPerPointer, (uint32*)tmpbuf};
diff --git a/src/pkg/runtime/malloc.go b/src/pkg/runtime/malloc.go
index 152b3b6..8ee4607 100644
--- a/src/pkg/runtime/malloc.go
+++ b/src/pkg/runtime/malloc.go
@@ -22,8 +22,8 @@
 	pageSize  = 1 << pageShift
 	pageMask  = pageSize - 1
 
-	wordsPerBitmapWord = ptrSize * 8 / 4
 	gcBits             = 4
+	wordsPerBitmapByte = 8 / gcBits
 	bitsPerPointer     = 2
 	bitsMask           = 1<<bitsPerPointer - 1
 	pointersPerByte    = 8 / bitsPerPointer
@@ -211,8 +211,8 @@
 	{
 		arena_start := uintptr(unsafe.Pointer(mheap_.arena_start))
 		off := (uintptr(x) - arena_start) / ptrSize
-		xbits := (*uintptr)(unsafe.Pointer(arena_start - off/wordsPerBitmapWord*ptrSize - ptrSize))
-		shift := (off % wordsPerBitmapWord) * gcBits
+		xbits := (*uint8)(unsafe.Pointer(arena_start - off/wordsPerBitmapByte - 1))
+		shift := (off % wordsPerBitmapByte) * gcBits
 		if debugMalloc && ((*xbits>>shift)&(bitMask|bitPtrMask)) != bitBoundary {
 			println("runtime: bits =", (*xbits>>shift)&(bitMask|bitPtrMask))
 			gothrow("bad bits in markallocated")
@@ -260,8 +260,7 @@
 			ptrmask = (*uint8)(unsafe.Pointer(&typ.gc[0])) // embed mask
 		}
 		if size == 2*ptrSize {
-			xbitsb := (*uint8)(add(unsafe.Pointer(xbits), shift/8))
-			*xbitsb = *ptrmask | bitBoundary
+			*xbits = *ptrmask | bitBoundary
 			goto marked
 		}
 		te = uintptr(typ.size) / ptrSize
@@ -283,19 +282,12 @@
 				v &^= uint8(bitPtrMask << 4)
 			}
 
-			off := (uintptr(x) + i - arena_start) / ptrSize
-			xbits := (*uintptr)(unsafe.Pointer(arena_start - off/wordsPerBitmapWord*ptrSize - ptrSize))
-			shift := (off % wordsPerBitmapWord) * gcBits
-			xbitsb := (*uint8)(add(unsafe.Pointer(xbits), shift/8))
-			*xbitsb = v
+			*xbits = v
+			xbits = (*byte)(add(unsafe.Pointer(xbits), ^uintptr(0)))
 		}
 		if size0%(2*ptrSize) == 0 && size0 < size {
 			// Mark the word after last object's word as bitsDead.
-			off := (uintptr(x) + size0 - arena_start) / ptrSize
-			xbits := (*uintptr)(unsafe.Pointer(arena_start - off/wordsPerBitmapWord*ptrSize - ptrSize))
-			shift := (off % wordsPerBitmapWord) * gcBits
-			xbitsb := (*uint8)(add(unsafe.Pointer(xbits), shift/8))
-			*xbitsb = bitsDead << 2
+			*xbits = bitsDead << 2
 		}
 	}
 marked:
diff --git a/src/pkg/runtime/mgc0.c b/src/pkg/runtime/mgc0.c
index 60a6181..14743c2 100644
--- a/src/pkg/runtime/mgc0.c
+++ b/src/pkg/runtime/mgc0.c
@@ -212,8 +212,8 @@
 static void
 scanblock(byte *b, uintptr n, byte *ptrmask)
 {
-	byte *obj, *p, *arena_start, *arena_used, **wp, *scanbuf[8], bits8;
-	uintptr i, nobj, size, idx, *bitp, bits, xbits, shift, x, off, cached, scanbufpos;
+	byte *obj, *p, *arena_start, *arena_used, **wp, *scanbuf[8], *ptrbitp, *bitp, bits, xbits, shift, cached;
+	uintptr i, nobj, size, idx, x, off, scanbufpos;
 	intptr ncached;
 	Workbuf *wbuf;
 	String *str;
@@ -237,6 +237,10 @@
 	for(i = 0; i < nelem(scanbuf); i++)
 		scanbuf[i] = nil;
 
+	ptrbitp = nil;
+	cached = 0;
+	ncached = 0;
+
 	// ptrmask can have 3 possible values:
 	// 1. nil - obtain pointer mask from GC bitmap.
 	// 2. ScanConservatively - don't use any mask, scan conservatively.
@@ -295,8 +299,15 @@
 			}
 			ptrmask = ScanConservatively;
 		}
-		cached = 0;
-		ncached = 0;
+		// Find bits of the beginning of the object.
+		if(ptrmask == nil) {
+			off = (uintptr*)b - (uintptr*)arena_start;
+			ptrbitp = arena_start - off/wordsPerBitmapByte - 1;
+			shift = (off % wordsPerBitmapByte) * gcBits;
+			cached = *ptrbitp >> shift;
+			cached &= ~bitBoundary;
+			ncached = (8 - shift)/gcBits;
+		}
 		for(i = 0; i < n; i += PtrSize) {
 			obj = nil;
 			// Find bits for this word.
@@ -308,16 +319,13 @@
 				// Consult GC bitmap.
 				if(ncached <= 0) {
 					// Refill cache.
-					off = (uintptr*)(b+i) - (uintptr*)arena_start;
-					bitp = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
-					shift = (off % wordsPerBitmapWord) * gcBits;
-					cached = *bitp >> shift;
-					ncached = (PtrSize*8 - shift)/gcBits;
+					cached = *--ptrbitp;
+					ncached = 2;
 				}
 				bits = cached;
 				cached >>= gcBits;
 				ncached--;
-				if(i != 0 && (bits&bitBoundary) != 0)
+				if((bits&bitBoundary) != 0)
 					break; // reached beginning of the next object
 				bits = (bits>>2)&BitsMask;
 				if(bits == BitsDead)
@@ -336,11 +344,9 @@
 			// Find the next pair of bits.
 			if(ptrmask == nil) {
 				if(ncached <= 0) {
-					off = (uintptr*)(b+i+PtrSize) - (uintptr*)arena_start;
-					bitp = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
-					shift = (off % wordsPerBitmapWord) * gcBits;
-					cached = *bitp >> shift;
-					ncached = (PtrSize*8 - shift)/gcBits;
+					// Refill cache.
+					cached = *--ptrbitp;
+					ncached = 2;
 				}
 				bits = (cached>>2)&BitsMask;
 			} else
@@ -383,8 +389,14 @@
 
 			if(bits == BitsSlice) {
 				i += 2*PtrSize;
-				cached >>= 2*gcBits;
-				ncached -= 2;
+				if(ncached == 2)
+					ncached = 0;
+				else if(ptrmask == nil) {
+					// Refill cache and consume one quadruple.
+					cached = *--ptrbitp;
+					cached >>= gcBits;
+					ncached = 1;
+				}
 			} else {
 				i += PtrSize;
 				cached >>= gcBits;
@@ -398,20 +410,12 @@
 				continue;
 			// Mark the object.
 			off = (uintptr*)obj - (uintptr*)arena_start;
-			bitp = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
-			shift = (off % wordsPerBitmapWord) * gcBits;
+			bitp = arena_start - off/wordsPerBitmapByte - 1;
+			shift = (off % wordsPerBitmapByte) * gcBits;
 			xbits = *bitp;
 			bits = (xbits >> shift) & bitMask;
 			if((bits&bitBoundary) == 0) {
-				// Not a beginning of a block, check if we have block boundary in xbits.
-				while(shift > 0) {
-					obj -= PtrSize;
-					shift -= gcBits;
-					bits = (xbits >> shift) & bitMask;
-					if((bits&bitBoundary) != 0)
-						goto havebits;
-				}
-				// Otherwise consult span table to find the block beginning.
+				// Not a beginning of a block, consult span table to find the block beginning.
 				k = (uintptr)obj>>PageShift;
 				x = k;
 				x -= (uintptr)arena_start>>PageShift;
@@ -433,7 +437,6 @@
 				goto markobj;
 			}
 
-		havebits:
 			// Now we have bits, bitp, and shift correct for
 			// obj pointing at the base of the object.
 			// Only care about not marked objects.
@@ -449,22 +452,12 @@
 			// For 8-byte objects we use non-atomic store, if the other
 			// quadruple is already marked. Otherwise we resort to CAS
 			// loop for marking.
-			bits8 = xbits>>(shift&~7);
-			if((bits8&(bitMask|(bitMask<<gcBits))) != (bitBoundary|(bitBoundary<<gcBits)) ||
+			if((xbits&(bitMask|(bitMask<<gcBits))) != (bitBoundary|(bitBoundary<<gcBits)) ||
 				work.nproc == 1)
-				((uint8*)bitp)[shift/8] = bits8 | (bitMarked<<(shift&7));
-			else {
-				for(;;) {
-					if(runtime·casp((void**)bitp, (void*)xbits, (void*)(xbits|(bitMarked<<shift))))
-						break;
-					xbits = *bitp;
-					bits = (xbits>>shift) & bitMask;
-					if((bits&bitMarked) != 0)
-						break;
-				}
-				if((bits&bitMarked) != 0)
-					continue;
-			}
+				*bitp = xbits | (bitMarked<<shift);
+			else
+				runtime·atomicor8(bitp, bitMarked<<shift);
+
 			if(((xbits>>(shift+2))&BitsMask) == BitsDead)
 				continue;  // noscan object
 
@@ -901,9 +894,9 @@
 runtime·MSpan_Sweep(MSpan *s, bool preserve)
 {
 	int32 cl, n, npages, nfree;
-	uintptr size, off, *bitp, shift, xbits, bits;
+	uintptr size, off, step;
 	uint32 sweepgen;
-	byte *p;
+	byte *p, *bitp, shift, xbits, bits;
 	MCache *c;
 	byte *arena_start;
 	MLink head, *end, *link;
@@ -939,8 +932,8 @@
 	// Mark any free objects in this span so we don't collect them.
 	for(link = s->freelist; link != nil; link = link->next) {
 		off = (uintptr*)link - (uintptr*)arena_start;
-		bitp = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
-		shift = (off % wordsPerBitmapWord) * gcBits;
+		bitp = arena_start - off/wordsPerBitmapByte - 1;
+		shift = (off % wordsPerBitmapByte) * gcBits;
 		*bitp |= bitMarked<<shift;
 	}
 
@@ -951,8 +944,8 @@
 		// A finalizer can be set for an inner byte of an object, find object beginning.
 		p = (byte*)(s->start << PageShift) + special->offset/size*size;
 		off = (uintptr*)p - (uintptr*)arena_start;
-		bitp = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
-		shift = (off % wordsPerBitmapWord) * gcBits;
+		bitp = arena_start - off/wordsPerBitmapByte - 1;
+		shift = (off % wordsPerBitmapByte) * gcBits;
 		bits = (*bitp>>shift) & bitMask;
 		if((bits&bitMarked) == 0) {
 			// Find the exact byte for which the special was setup
@@ -977,10 +970,27 @@
 	// This thread owns the span now, so it can manipulate
 	// the block bitmap without atomic operations.
 	p = (byte*)(s->start << PageShift);
+	// Find bits for the beginning of the span.
+	off = (uintptr*)p - (uintptr*)arena_start;
+	bitp = arena_start - off/wordsPerBitmapByte - 1;
+	shift = 0;
+	step = size/(PtrSize*wordsPerBitmapByte);
+	// Back up one quadruple: the loop advances to the next one
+	// at the beginning of each iteration.
+	bitp += step;
+	if(step == 0) {
+		// 8-byte objects.
+		bitp++;
+		shift = gcBits;
+	}
 	for(; n > 0; n--, p += size) {
-		off = (uintptr*)p - (uintptr*)arena_start;
-		bitp = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
-		shift = (off % wordsPerBitmapWord) * gcBits;
+		bitp -= step;
+		if(step == 0) {
+			if(shift != 0)
+				bitp--;
+			shift = gcBits - shift;
+		}
+
 		xbits = *bitp;
 		bits = (xbits>>shift) & bitMask;
 
@@ -1759,8 +1769,8 @@
 static byte*
 unrollgcprog1(byte *mask, byte *prog, uintptr *ppos, bool inplace, bool sparse)
 {
-	uintptr *b, off, shift, pos, siz, i;
-	byte *arena_start, *prog1, v;
+	uintptr pos, siz, i, off;
+	byte *arena_start, *prog1, v, *bitp, shift;
 
 	arena_start = runtime·mheap.arena_start;
 	pos = *ppos;
@@ -1777,11 +1787,11 @@
 				if(inplace) {
 					// Store directly into GC bitmap.
 					off = (uintptr*)(mask+pos) - (uintptr*)arena_start;
-					b = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
-					shift = (off % wordsPerBitmapWord) * gcBits;
-					if((shift%8)==0)
-						((byte*)b)[shift/8] = 0;
-					((byte*)b)[shift/8] |= v<<((shift%8)+2);
+					bitp = arena_start - off/wordsPerBitmapByte - 1;
+					shift = (off % wordsPerBitmapByte) * gcBits;
+					if(shift==0)
+						*bitp = 0;
+					*bitp |= v<<(shift+2);
 					pos += PtrSize;
 				} else if(sparse) {
 					// 4-bits per word
@@ -1847,8 +1857,8 @@
 void
 runtime·unrollgcproginplace_m(void)
 {
-	uintptr size, size0, *b, off, shift, pos;
-	byte *arena_start, *prog;
+	uintptr size, size0, pos, off;
+	byte *arena_start, *prog, *bitp, shift;
 	Type *typ;
 	void *v;
 
@@ -1866,15 +1876,15 @@
 	// Mark first word as bitAllocated.
 	arena_start = runtime·mheap.arena_start;
 	off = (uintptr*)v - (uintptr*)arena_start;
-	b = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
-	shift = (off % wordsPerBitmapWord) * gcBits;
-	*b |= bitBoundary<<shift;
+	bitp = arena_start - off/wordsPerBitmapByte - 1;
+	shift = (off % wordsPerBitmapByte) * gcBits;
+	*bitp |= bitBoundary<<shift;
 	// Mark word after last as BitsDead.
 	if(size0 < size) {
 		off = (uintptr*)((byte*)v + size0) - (uintptr*)arena_start;
-		b = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
-		shift = (off % wordsPerBitmapWord) * gcBits;
-		*b &= ~(bitPtrMask<<shift) | ((uintptr)BitsDead<<(shift+2));
+		bitp = arena_start - off/wordsPerBitmapByte - 1;
+		shift = (off % wordsPerBitmapByte) * gcBits;
+		*bitp &= ~(bitPtrMask<<shift) | ((uintptr)BitsDead<<(shift+2));
 	}
 }
 
@@ -1916,60 +1926,67 @@
 void
 runtime·markspan(void *v, uintptr size, uintptr n, bool leftover)
 {
-	uintptr *b, *b0, off, shift, x;
-	byte *p;
+	uintptr i, off, step;
+	byte *b;
 
 	if((byte*)v+size*n > (byte*)runtime·mheap.arena_used || (byte*)v < runtime·mheap.arena_start)
 		runtime·throw("markspan: bad pointer");
 
-	p = v;
-	if(leftover)	// mark a boundary just past end of last block too
-		n++;
+	// Find bits of the beginning of the span.
+	off = (uintptr*)v - (uintptr*)runtime·mheap.arena_start;  // word offset
+	b = runtime·mheap.arena_start - off/wordsPerBitmapByte - 1;
+	if((off%wordsPerBitmapByte) != 0)
+		runtime·throw("markspan: unaligned length");
 
-	b0 = nil;
-	x = 0;
-	for(; n-- > 0; p += size) {
-		// Okay to use non-atomic ops here, because we control
-		// the entire span, and each bitmap word has bits for only
-		// one span, so no other goroutines are changing these
-		// bitmap words.
-		off = (uintptr*)p - (uintptr*)runtime·mheap.arena_start;  // word offset
-		b = (uintptr*)runtime·mheap.arena_start - off/wordsPerBitmapWord - 1;
-		shift = (off % wordsPerBitmapWord) * gcBits;
-		if(b0 != b) {
-			if(b0 != nil)
-				*b0 = x;
-			b0 = b;
-			x = 0;
-		}
-		x |= (bitBoundary<<shift) | ((uintptr)BitsDead<<(shift+2));
+	// Okay to use non-atomic ops here, because we control
+	// the entire span, and each bitmap byte has bits for only
+	// one span, so no other goroutines are changing these bitmap bytes.
+
+	if(size == PtrSize) {
+		// Possible only on 64-bit systems (the minimal size class is 8 bytes).
+		// Poor man's memset(0x11).
+		if(0x11 != ((bitBoundary+BitsDead)<<gcBits) + (bitBoundary+BitsDead))
+			runtime·throw("markspan: bad bits");
+		if((n%(wordsPerBitmapByte*PtrSize)) != 0)
+			runtime·throw("markspan: unaligned length");
+		b = b - n/wordsPerBitmapByte + 1;	// find first byte
+		if(((uintptr)b%PtrSize) != 0)
+			runtime·throw("markspan: unaligned pointer");
+		for(i = 0; i != n; i += wordsPerBitmapByte*PtrSize, b += PtrSize)
+			*(uintptr*)b = (uintptr)0x1111111111111111ULL;  // bitBoundary+BitsDead
+		return;
 	}
-	*b0 = x;
+
+	if(leftover)
+		n++;	// mark a boundary just past end of last block too
+	step = size/(PtrSize*wordsPerBitmapByte);
+	for(i = 0; i != n; i++, b -= step)
+		*b = bitBoundary|(BitsDead<<2);
 }
 
 // unmark the span of memory at v of length n bytes.
 void
 runtime·unmarkspan(void *v, uintptr n)
 {
-	uintptr *p, *b, off;
+	uintptr off;
+	byte *b;
 
 	if((byte*)v+n > (byte*)runtime·mheap.arena_used || (byte*)v < runtime·mheap.arena_start)
 		runtime·throw("markspan: bad pointer");
 
-	p = v;
-	off = p - (uintptr*)runtime·mheap.arena_start;  // word offset
-	if((off % wordsPerBitmapWord) != 0)
+	off = (uintptr*)v - (uintptr*)runtime·mheap.arena_start;  // word offset
+	if((off % (PtrSize*wordsPerBitmapByte)) != 0)
 		runtime·throw("markspan: unaligned pointer");
-	b = (uintptr*)runtime·mheap.arena_start - off/wordsPerBitmapWord - 1;
+	b = runtime·mheap.arena_start - off/wordsPerBitmapByte - 1;
 	n /= PtrSize;
-	if(n%wordsPerBitmapWord != 0)
+	if(n%(PtrSize*wordsPerBitmapByte) != 0)
 		runtime·throw("unmarkspan: unaligned length");
 	// Okay to use non-atomic ops here, because we control
 	// the entire span, and each bitmap word has bits for only
 	// one span, so no other goroutines are changing these
 	// bitmap words.
-	n /= wordsPerBitmapWord;
-	runtime·memclr((byte*)(b - n + 1), n*PtrSize);
+	n /= wordsPerBitmapByte;
+	runtime·memclr(b - n + 1, n);
 }
 
 void
@@ -1983,7 +2000,7 @@
 	};
 	uintptr n;
 
-	n = (h->arena_used - h->arena_start) / wordsPerBitmapWord;
+	n = (h->arena_used - h->arena_start) / (PtrSize*wordsPerBitmapByte);
 	n = ROUND(n, bitmapChunk);
 	n = ROUND(n, PhysPageSize);
 	if(h->bitmap_mapped >= n)
@@ -2011,8 +2028,8 @@
 runtime·getgcmask(byte *p, Type *t, byte **mask, uintptr *len)
 {
 	Stkframe frame;
-	uintptr i, n, off, bits, shift, *b;
-	byte *base;
+	uintptr i, n, off;
+	byte *base, bits, shift, *b;
 
 	*mask = nil;
 	*len = 0;
@@ -2047,8 +2064,8 @@
 		*mask = runtime·mallocgc(*len, nil, 0);
 		for(i = 0; i < n; i += PtrSize) {
 			off = (uintptr*)(base+i) - (uintptr*)runtime·mheap.arena_start;
-			b = (uintptr*)runtime·mheap.arena_start - off/wordsPerBitmapWord - 1;
-			shift = (off % wordsPerBitmapWord) * gcBits;
+			b = runtime·mheap.arena_start - off/wordsPerBitmapByte - 1;
+			shift = (off % wordsPerBitmapByte) * gcBits;
 			bits = (*b >> (shift+2))&BitsMask;
 			(*mask)[i/PtrSize] = bits;
 		}
diff --git a/src/pkg/runtime/mgc0.h b/src/pkg/runtime/mgc0.h
index 893819c..7449398 100644
--- a/src/pkg/runtime/mgc0.h
+++ b/src/pkg/runtime/mgc0.h
@@ -8,8 +8,8 @@
 	ScanStackByFrames = 1,
 
 	// Four bits per word (see #defines below).
-	wordsPerBitmapWord = sizeof(void*)*8/4,
 	gcBits = 4,
+	wordsPerBitmapByte = 8/gcBits,
 
 	// GC type info programs.
 	// The programs allow to store type info required for GC in a compact form.
diff --git a/src/pkg/runtime/runtime.h b/src/pkg/runtime/runtime.h
index 0dc60b2..867da3f 100644
--- a/src/pkg/runtime/runtime.h
+++ b/src/pkg/runtime/runtime.h
@@ -913,6 +913,7 @@
 uint64	runtime·atomicload64(uint64 volatile*);
 void*	runtime·atomicloadp(void* volatile*);
 void	runtime·atomicstorep(void* volatile*, void*);
+void	runtime·atomicor8(byte volatile*, byte);
 
 void	runtime·setg(G*);
 void	runtime·newextram(void);