exp: use new operations from math/bits for faster generation

Code by josharian.

benchmark                        old ns/op     new ns/op     delta
BenchmarkPCGMultiply-4           3.48          2.31          -33.62%
BenchmarkSource-4                7.46          3.54          -52.55%
BenchmarkInt63Threadsafe-4       21.6          19.4          -10.19%
BenchmarkInt63Unthreadsafe-4     7.69          3.60          -53.19%
BenchmarkIntn1000-4              17.3          13.2          -23.70%
BenchmarkInt63n1000-4            17.7          13.2          -25.42%
BenchmarkInt31n1000-4            17.4          13.2          -24.14%
BenchmarkFloat32-4               13.5          8.57          -36.52%
BenchmarkFloat64-4               11.1          6.43          -42.07%
BenchmarkPerm3-4                 64.6          49.2          -23.84%
BenchmarkPerm30-4                614           430           -29.97%
BenchmarkPerm30ViaShuffle-4      556           450           -19.06%
BenchmarkShuffleOverhead-4       969           788           -18.68%
BenchmarkRead3-4                 11.7          10.1          -13.68%
BenchmarkRead64-4                127           83.0          -34.65%
BenchmarkRead1000-4              1820          1191          -34.56%

Also move the rotate out from under the 1.9 build tag, as 1.9 is old.
The new code goes under a 1.12 build tag.

Update golang/go#21835.

Change-Id: I66d5cac6578b8df2cdfc3b90ed097c5525adb6bc
Reviewed-on: https://go-review.googlesource.com/c/163258
Reviewed-by: Josh Bleecher Snyder <josharian@gmail.com>
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
diff --git a/rand/rng.go b/rand/rng.go
index 00c421e..f59fadb 100644
--- a/rand/rng.go
+++ b/rand/rng.go
@@ -4,6 +4,8 @@
 
 package rand
 
+import "math/bits"
+
 // PCGSource is an implementation of a 64-bit permuted congruential
 // generator as defined in
 //
@@ -25,6 +27,8 @@
 	maxUint32 = (1 << 32) - 1
 
 	multiplier = 47026247687942121848144207491837523525
+	mulHigh    = multiplier >> 64
+	mulLow     = multiplier & maxUint64
 
 	increment = 117397592171526113268558934119004209487
 	incHigh   = increment >> 64
@@ -42,39 +46,10 @@
 	pcg.high = seed // TODO: What is right?
 }
 
-func (pcg *PCGSource) add() {
-	old := pcg.low
-	pcg.low += incLow
-	if pcg.low < old {
-		// Carry occurred.
-		pcg.high++
-	}
-	pcg.high += incHigh
-}
-
-func (pcg *PCGSource) multiply() {
-	// Break each lower word into two separate 32-bit 'digits' each stored
-	// in a 64-bit word with 32 high zero bits.  This allows the overflow
-	// into the high word to be computed.
-	s0 := (pcg.low >> 00) & maxUint32
-	s1 := (pcg.low >> 32) & maxUint32
-
-	const (
-		m0    = (multiplier >> 00) & maxUint32
-		m1    = (multiplier >> 32) & maxUint32
-		mLow  = multiplier & (1<<64 - 1)
-		mHigh = multiplier >> 64 & (1<<64 - 1)
-	)
-
-	high := pcg.low*mHigh + pcg.high*mLow
-	s0m0 := s0 * m0
-	s0m1 := s0 * m1
-	s1m0 := s1 * m0
-	s1m1 := s1 * m1
-	high += (s0m1 >> 32) + (s1m0 >> 32)
-	carry := (s0m1 & maxUint32) + (s1m0 & maxUint32) + s0m0>>32
-	high += (carry >> 32)
-
-	pcg.low *= mLow
-	pcg.high = high + s1m1
+// Uint64 returns a pseudo-random 64-bit unsigned integer as a uint64.
+func (pcg *PCGSource) Uint64() uint64 {
+	pcg.multiply()
+	pcg.add()
+	// XOR high and low 64 bits together and rotate right by high 6 bits of state.
+	return bits.RotateLeft64(pcg.high^pcg.low, -int(pcg.high>>58))
 }
diff --git a/rand/uint64.go b/rand/uint64.go
index 58761f6..cc1fc01 100644
--- a/rand/uint64.go
+++ b/rand/uint64.go
@@ -2,16 +2,22 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build go1.9
+// +build go1.12
 
 package rand
 
 import "math/bits"
 
-// Uint64 returns a pseudo-random 64-bit unsigned integer as a uint64.
-func (pcg *PCGSource) Uint64() uint64 {
-	pcg.multiply()
-	pcg.add()
-	// XOR high and low 64 bits together and rotate right by high 6 bits of state.
-	return bits.RotateLeft64(pcg.high^pcg.low, -int(pcg.high>>58))
+func (pcg *PCGSource) add() {
+	var carry uint64
+	pcg.low, carry = bits.Add64(pcg.low, incLow, 0)
+	pcg.high, _ = bits.Add64(pcg.high, incHigh, carry)
+}
+
+func (pcg *PCGSource) multiply() {
+	hi, lo := bits.Mul64(pcg.low, mulLow)
+	hi += pcg.high * mulLow
+	hi += pcg.low * mulHigh
+	pcg.low = lo
+	pcg.high = hi
 }
diff --git a/rand/uint64_fallback.go b/rand/uint64_fallback.go
index 2e67888..58f66cf 100644
--- a/rand/uint64_fallback.go
+++ b/rand/uint64_fallback.go
@@ -2,20 +2,43 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build !go1.9
+// +build !go1.12
 
 package rand
 
-// Uint64 returns a pseudo-random 64-bit unsigned integer as a uint64.
-func (pcg *PCGSource) Uint64() uint64 {
-	pcg.multiply()
-	pcg.add()
-	// XOR high and low 64 bits together and rotate right by high 6 bits of state.
-	return rotateLeft(pcg.high^pcg.low, -int(pcg.high>>58))
+func (pcg *PCGSource) add() {
+	old := pcg.low
+	pcg.low += incLow
+	if pcg.low < old {
+		// Carry occurred.
+		pcg.high++
+	}
+	pcg.high += incHigh
 }
 
-func rotateLeft(x uint64, k int) uint64 {
-	const n = 64
-	s := uint(k) & (n - 1)
-	return x<<s | x>>(n-s)
+func (pcg *PCGSource) multiply() {
+	// Break each lower word into two separate 32-bit 'digits' each stored
+	// in a 64-bit word with 32 high zero bits.  This allows the overflow
+	// into the high word to be computed.
+	s0 := (pcg.low >> 00) & maxUint32
+	s1 := (pcg.low >> 32) & maxUint32
+
+	const (
+		m0    = (multiplier >> 00) & maxUint32
+		m1    = (multiplier >> 32) & maxUint32
+		mLow  = multiplier & (1<<64 - 1)
+		mHigh = multiplier >> 64 & (1<<64 - 1)
+	)
+
+	high := pcg.low*mHigh + pcg.high*mLow
+	s0m0 := s0 * m0
+	s0m1 := s0 * m1
+	s1m0 := s1 * m0
+	s1m1 := s1 * m1
+	high += (s0m1 >> 32) + (s1m0 >> 32)
+	carry := (s0m1 & maxUint32) + (s1m0 & maxUint32) + s0m0>>32
+	high += (carry >> 32)
+
+	pcg.low *= mLow
+	pcg.high = high + s1m1
 }