runtime: use multiply instead of divide in heapBitsForObject These benchmarks show the effect of the combination of this change and Rick's pending CL 6665. Code with interior pointers is helped much more than code without, but even code without doesn't suffer too badly. benchmark old ns/op new ns/op delta BenchmarkBinaryTree17 6989407768 6851728175 -1.97% BenchmarkFannkuch11 4416250775 4405762558 -0.24% BenchmarkFmtFprintfEmpty 134 130 -2.99% BenchmarkFmtFprintfString 491 402 -18.13% BenchmarkFmtFprintfInt 430 420 -2.33% BenchmarkFmtFprintfIntInt 748 663 -11.36% BenchmarkFmtFprintfPrefixedInt 602 534 -11.30% BenchmarkFmtFprintfFloat 728 699 -3.98% BenchmarkFmtManyArgs 2528 2507 -0.83% BenchmarkGobDecode 17448191 17749756 +1.73% BenchmarkGobEncode 14579824 14370183 -1.44% BenchmarkGzip 656489990 652669348 -0.58% BenchmarkGunzip 141254147 141099278 -0.11% BenchmarkHTTPClientServer 94111 93738 -0.40% BenchmarkJSONEncode 36305013 36696440 +1.08% BenchmarkJSONDecode 124652000 128176454 +2.83% BenchmarkMandelbrot200 6009333 5997093 -0.20% BenchmarkGoParse 7651583 7623494 -0.37% BenchmarkRegexpMatchEasy0_32 213 213 +0.00% BenchmarkRegexpMatchEasy0_1K 511 494 -3.33% BenchmarkRegexpMatchEasy1_32 186 187 +0.54% BenchmarkRegexpMatchEasy1_1K 1834 1827 -0.38% BenchmarkRegexpMatchMedium_32 427 412 -3.51% BenchmarkRegexpMatchMedium_1K 154841 153086 -1.13% BenchmarkRegexpMatchHard_32 7473 7478 +0.07% BenchmarkRegexpMatchHard_1K 233587 232272 -0.56% BenchmarkRevcomp 918797689 944528032 +2.80% BenchmarkTemplate 167665081 167773121 +0.06% BenchmarkTimeParse 631 636 +0.79% BenchmarkTimeFormat 672 666 -0.89% Change-Id: Ia923de3cdb3993b640fe0a02cbe2c7babc16f32c Reviewed-on: https://go-review.googlesource.com/6782 Reviewed-by: Rick Hudson <rlh@golang.org> Reviewed-by: Austin Clements <austin@google.com>

commit: 9feb24f3ed2bf8625ba3fbb83b8784d715d37b7e [log] [tgz]
author: Russ Cox <rsc@golang.org> Wed Mar 04 11:34:50 2015 -0500
committer: Russ Cox <rsc@golang.org> Wed Mar 04 17:46:47 2015 +0000
tree: 9554cd46831782fd808a0703e808d0eef5690a4b
parent: 81d4072eb062a6af09e9a36314887a88acd50ebf [diff]
diff --git a/src/runtime/mbitmap.go b/src/runtime/mbitmap.go
index 4592044..6b46ad1 100644
--- a/src/runtime/mbitmap.go
+++ b/src/runtime/mbitmap.go

@@ -202,7 +202,19 @@
 	}
 	base = s.base()
 	if p-base >= s.elemsize {
-		base += (p - base) / s.elemsize * s.elemsize
+		// n := (p - base) / s.elemsize, using division by multiplication
+		n := uintptr(uint64(p-base) >> s.divShift * uint64(s.divMul) >> s.divShift2)
+
+		const debugMagic = false
+		if debugMagic {
+			n2 := (p - base) / s.elemsize
+			if n != n2 {
+				println("runtime: bad div magic", (p - base), s.elemsize, s.divShift, s.divMul, s.divShift2)
+				throw("bad div magic")
+			}
+		}
+
+		base += n * s.elemsize
 	}
 	if base == p {
 		print("runtime: failed to find block beginning for ", hex(p), " s=", hex(s.start*_PageSize), " s.limit=", hex(s.limit), "\n")

diff --git a/src/runtime/mheap.go b/src/runtime/mheap.go
index 94ef4de..fc4dfee 100644
--- a/src/runtime/mheap.go
+++ b/src/runtime/mheap.go

@@ -101,11 +101,14 @@
 	// if sweepgen == h->sweepgen, the span is swept and ready to use
 	// h->sweepgen is incremented by 2 after every GC
 	sweepgen    uint32
+	divMul      uint32   // for divide by elemsize - divMagic.mul
 	ref         uint16   // capacity - number of objects in freelist
 	sizeclass   uint8    // size class
 	incache     bool     // being used by an mcache
 	state       uint8    // mspaninuse etc
 	needzero    uint8    // needs to be zeroed before allocation
+	divShift    uint8    // for divide by elemsize - divMagic.shift
+	divShift2   uint8    // for divide by elemsize - divMagic.shift2
 	elemsize    uintptr  // computed from sizeclass or from npages
 	unusedsince int64    // first time spotted by gc in mspanfree state
 	npreleased  uintptr  // number of pages released to the os
@@ -385,8 +388,15 @@
 		s.sizeclass = uint8(sizeclass)
 		if sizeclass == 0 {
 			s.elemsize = s.npages << _PageShift
+			s.divShift = 0
+			s.divMul = 0
+			s.divShift2 = 0
 		} else {
 			s.elemsize = uintptr(class_to_size[sizeclass])
+			m := &class_to_divmagic[sizeclass]
+			s.divShift = m.shift
+			s.divMul = m.mul
+			s.divShift2 = m.shift2
 		}
 
 		// update stats, sweep lists

diff --git a/src/runtime/msize.go b/src/runtime/msize.go
index 370cae6..f2a7cb9 100644
--- a/src/runtime/msize.go
+++ b/src/runtime/msize.go

@@ -48,6 +48,8 @@
 
 var class_to_size [_NumSizeClasses]int32
 var class_to_allocnpages [_NumSizeClasses]int32
+var class_to_divmagic [_NumSizeClasses]divMagic
+
 var size_to_class8 [1024/8 + 1]int8
 var size_to_class128 [(_MaxSmallSize-1024)/128 + 1]int8
 
@@ -144,6 +146,11 @@
 	for i := 0; i < len(class_to_size); i++ {
 		memstats.by_size[i].size = uint32(class_to_size[i])
 	}
+
+	for i := 1; i < len(class_to_size); i++ {
+		class_to_divmagic[i] = computeDivMagic(uint32(class_to_size[i]))
+	}
+
 	return
 
 dump:
@@ -182,3 +189,55 @@
 	}
 	return round(size, _PageSize)
 }
+
+// divMagic holds magic constants to implement division
+// by a particular constant as a shift, multiply, and shift.
+// That is, given
+//	m = computeMagic(d)
+// then
+//	n/d == ((n>>m.shift) * m.mul) >> m.shift2
+//
+// The magic computation picks m such that
+//	d = d₁*d₂
+//	d₂= 2^m.shift
+//	m.mul = ⌈2^m.shift2 / d₁⌉
+//
+// The magic computation here is tailored for malloc block sizes
+// and does not handle arbitrary d correctly. Malloc block sizes d are
+// always even, so the first shift implements the factors of 2 in d
+// and then the mul and second shift implement the odd factor
+// that remains. Because the first shift divides n by at least 2 (actually 8)
+// before the multiply gets involved, the huge corner cases that
+// require additional adjustment are impossible, so the usual
+// fixup is not needed.
+//
+// For more details see Hacker's Delight, Chapter 10, and
+// http://ridiculousfish.com/blog/posts/labor-of-division-episode-i.html
+// http://ridiculousfish.com/blog/posts/labor-of-division-episode-iii.html
+type divMagic struct {
+	shift  uint8
+	mul    uint32
+	shift2 uint8
+}
+
+func computeDivMagic(d uint32) divMagic {
+	var m divMagic
+
+	// Compute pre-shift by factoring power of 2 out of d.
+	for d&1 == 0 {
+		m.shift++
+		d >>= 1
+	}
+
+	// Compute largest k such that ⌈2^k / d⌉ fits in a 32-bit int.
+	// This is always a good enough approximation.
+	// We could use smaller k for some divisors but there's no point.
+	k := uint8(63)
+	d64 := uint64(d)
+	for ((1<<k)+d64-1)/d64 >= 1<<32 {
+		k--
+	}
+	m.mul = uint32(((1 << k) + d64 - 1) / d64) //  ⌈2^k / d⌉
+	m.shift2 = k
+	return m
+}
commit	9feb24f3ed2bf8625ba3fbb83b8784d715d37b7e	[log] [tgz]
author	Russ Cox <rsc@golang.org>	Wed Mar 04 11:34:50 2015 -0500
committer	Russ Cox <rsc@golang.org>	Wed Mar 04 17:46:47 2015 +0000
tree	9554cd46831782fd808a0703e808d0eef5690a4b
parent	81d4072eb062a6af09e9a36314887a88acd50ebf [diff]