math/rand: use fastrand64 if possible

Now that the top-level math/rand functions are auto-seeded by default
(issue #54880), use the runtime fastrand64 function when 1) Seed
has not been called; 2) the GODEBUG randautoseed=0 is not used.

The benchmarks run quickly and are relatively noisy, but they show
significant improvements for parallel calls to the top-level functions.

goos: linux
goarch: amd64
pkg: math/rand
cpu: 11th Gen Intel(R) Core(TM) i7-11850H @ 2.50GHz
                           │  /tmp/foo.1   │              /tmp/foo.2               │
                           │    sec/op     │    sec/op      vs base                │
Int63Threadsafe-16            11.605n ± 1%    3.094n ±  1%  -73.34% (p=0.000 n=10)
Int63ThreadsafeParallel-16   67.8350n ± 2%   0.4000n ±  1%  -99.41% (p=0.000 n=10)
Int63Unthreadsafe-16           1.947n ± 3%    1.924n ±  2%        ~ (p=0.189 n=10)
Intn1000-16                    4.295n ± 2%    4.287n ±  3%        ~ (p=0.517 n=10)
Int63n1000-16                  4.379n ± 0%    4.192n ±  2%   -4.27% (p=0.000 n=10)
Int31n1000-16                  3.641n ± 3%    3.506n ±  0%   -3.69% (p=0.000 n=10)
Float32-16                     3.330n ± 7%    3.250n ±  2%   -2.40% (p=0.017 n=10)
Float64-16                     2.194n ± 6%    2.056n ±  4%   -6.31% (p=0.004 n=10)
Perm3-16                       43.39n ± 9%    38.28n ± 12%  -11.77% (p=0.015 n=10)
Perm30-16                      324.4n ± 6%    315.9n ± 19%        ~ (p=0.315 n=10)
Perm30ViaShuffle-16            175.4n ± 1%    143.6n ±  2%  -18.15% (p=0.000 n=10)
ShuffleOverhead-16             223.4n ± 2%    215.8n ±  1%   -3.38% (p=0.000 n=10)
Read3-16                       5.428n ± 3%    5.406n ±  2%        ~ (p=0.780 n=10)
Read64-16                      41.55n ± 5%    40.14n ±  3%   -3.38% (p=0.000 n=10)
Read1000-16                    622.9n ± 4%    594.9n ±  2%   -4.50% (p=0.000 n=10)
Concurrent-16                136.300n ± 2%    4.647n ± 26%  -96.59% (p=0.000 n=10)
geomean                        23.40n         12.15n        -48.08%

Fixes #49892

Change-Id: Iba75b326145512ab0b7ece233b98ac3d4e1fb504
Reviewed-on: https://go-review.googlesource.com/c/go/+/465037
Run-TryBot: Ian Lance Taylor <iant@golang.org>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Ian Lance Taylor <iant@google.com>
Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
Reviewed-by: Russ Cox <rsc@golang.org>
Auto-Submit: Ian Lance Taylor <iant@google.com>
diff --git a/src/math/rand/default_test.go b/src/math/rand/default_test.go
new file mode 100644
index 0000000..19fd75d
--- /dev/null
+++ b/src/math/rand/default_test.go
@@ -0,0 +1,148 @@
+// Copyright 2023 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package rand_test
+
+import (
+	"fmt"
+	"internal/race"
+	"internal/testenv"
+	. "math/rand"
+	"os"
+	"runtime"
+	"strconv"
+	"sync"
+	"testing"
+)
+
+// Test that racy access to the default functions behaves reasonably.
+func TestDefaultRace(t *testing.T) {
+	// Skip the test in short mode, but even in short mode run
+	// the test if we are using the race detector, because part
+	// of this is to see whether the race detector reports any problems.
+	if testing.Short() && !race.Enabled {
+		t.Skip("skipping starting another executable in short mode")
+	}
+
+	const env = "GO_RAND_TEST_HELPER_CODE"
+	if v := os.Getenv(env); v != "" {
+		doDefaultTest(t, v)
+		return
+	}
+
+	t.Parallel()
+
+	for i := 0; i < 6; i++ {
+		i := i
+		t.Run(strconv.Itoa(i), func(t *testing.T) {
+			t.Parallel()
+			exe, err := os.Executable()
+			if err != nil {
+				exe = os.Args[0]
+			}
+			cmd := testenv.Command(t, exe, "-test.run=TestDefaultRace")
+			cmd = testenv.CleanCmdEnv(cmd)
+			cmd.Env = append(cmd.Env, fmt.Sprintf("GO_RAND_TEST_HELPER_CODE=%d", i/2))
+			if i%2 != 0 {
+				cmd.Env = append(cmd.Env, "GODEBUG=randautoseed=0")
+			}
+			out, err := cmd.CombinedOutput()
+			if len(out) > 0 {
+				t.Logf("%s", out)
+			}
+			if err != nil {
+				t.Error(err)
+			}
+		})
+	}
+}
+
+// doDefaultTest should be run before there have been any calls to the
+// top-level math/rand functions. Make sure that we can make concurrent
+// calls to top-level functions and to Seed without any duplicate values.
+// This will also give the race detector a change to report any problems.
+func doDefaultTest(t *testing.T, v string) {
+	code, err := strconv.Atoi(v)
+	if err != nil {
+		t.Fatalf("internal error: unrecognized code %q", v)
+	}
+
+	goroutines := runtime.GOMAXPROCS(0)
+	if goroutines < 4 {
+		goroutines = 4
+	}
+
+	ch := make(chan uint64, goroutines*3)
+	var wg sync.WaitGroup
+
+	// The various tests below should not cause race detector reports
+	// and should not produce duplicate results.
+	//
+	// Note: these tests can theoretically fail when using fastrand64
+	// in that it is possible to coincidentally get the same random
+	// number twice. That could happen something like 1 / 2**64 times,
+	// which is rare enough that it may never happen. We don't worry
+	// about that case.
+
+	switch code {
+	case 0:
+		// Call Seed and Uint64 concurrently.
+		wg.Add(goroutines)
+		for i := 0; i < goroutines; i++ {
+			go func(s int64) {
+				defer wg.Done()
+				Seed(s)
+			}(int64(i) + 100)
+		}
+		wg.Add(goroutines)
+		for i := 0; i < goroutines; i++ {
+			go func() {
+				defer wg.Done()
+				ch <- Uint64()
+			}()
+		}
+	case 1:
+		// Call Uint64 concurrently with no Seed.
+		wg.Add(goroutines)
+		for i := 0; i < goroutines; i++ {
+			go func() {
+				defer wg.Done()
+				ch <- Uint64()
+			}()
+		}
+	case 2:
+		// Start with Uint64 to pick the fast source, then call
+		// Seed and Uint64 concurrently.
+		ch <- Uint64()
+		wg.Add(goroutines)
+		for i := 0; i < goroutines; i++ {
+			go func(s int64) {
+				defer wg.Done()
+				Seed(s)
+			}(int64(i) + 100)
+		}
+		wg.Add(goroutines)
+		for i := 0; i < goroutines; i++ {
+			go func() {
+				defer wg.Done()
+				ch <- Uint64()
+			}()
+		}
+	default:
+		t.Fatalf("internal error: unrecognized code %d", code)
+	}
+
+	go func() {
+		wg.Wait()
+		close(ch)
+	}()
+
+	m := make(map[uint64]bool)
+	for i := range ch {
+		if m[i] {
+			t.Errorf("saw %d twice", i)
+		}
+		m[i] = true
+	}
+}
diff --git a/src/math/rand/rand.go b/src/math/rand/rand.go
index 7448ee1..612b34d 100644
--- a/src/math/rand/rand.go
+++ b/src/math/rand/rand.go
@@ -20,6 +20,7 @@
 import (
 	"internal/godebug"
 	"sync"
+	"sync/atomic"
 	_ "unsafe" // for go:linkname
 )
 
@@ -269,8 +270,11 @@
 // always returns len(p) and a nil error.
 // Read should not be called concurrently with any other Rand method.
 func (r *Rand) Read(p []byte) (n int, err error) {
-	if lk, ok := r.src.(*lockedSource); ok {
-		return lk.read(p, &r.readVal, &r.readPos)
+	switch src := r.src.(type) {
+	case *lockedSource:
+		return src.read(p, &r.readVal, &r.readPos)
+	case *fastSource:
+		return src.read(p, &r.readVal, &r.readPos)
 	}
 	return read(p, r.src, &r.readVal, &r.readPos)
 }
@@ -301,7 +305,75 @@
  * Top-level convenience functions
  */
 
-var globalRand = New(new(lockedSource))
+// globalRandGenerator is the source of random numbers for the top-level
+// convenience functions. When possible it uses the runtime fastrand64
+// function to avoid locking. This is not possible if the user called Seed,
+// either explicitly or implicitly via GODEBUG=randautoseed=0.
+var globalRandGenerator atomic.Pointer[Rand]
+
+var randautoseed = godebug.New("randautoseed")
+
+// globalRand returns the generator to use for the top-level convenience
+// functions.
+func globalRand() *Rand {
+	if r := globalRandGenerator.Load(); r != nil {
+		return r
+	}
+
+	// This is the first call. Initialize based on GODEBUG.
+	var r *Rand
+	if randautoseed.Value() == "0" {
+		randautoseed.IncNonDefault()
+		r = New(new(lockedSource))
+		r.Seed(1)
+	} else {
+		r = &Rand{
+			src: &fastSource{},
+			s64: &fastSource{},
+		}
+	}
+
+	if !globalRandGenerator.CompareAndSwap(nil, r) {
+		// Two different goroutines called some top-level
+		// function at the same time. While the results in
+		// that case are unpredictable, if we just use r here,
+		// and we are using a seed, we will most likely return
+		// the same value for both calls. That doesn't seem ideal.
+		// Just use the first one to get in.
+		return globalRandGenerator.Load()
+	}
+
+	return r
+}
+
+//go:linkname fastrand64
+func fastrand64() uint64
+
+// fastSource is an implementation of Source64 that uses the runtime
+// fastrand functions.
+type fastSource struct {
+	// The mutex is used to avoid race conditions in Read.
+	mu sync.Mutex
+}
+
+func (*fastSource) Int63() int64 {
+	return int64(fastrand64() & rngMask)
+}
+
+func (*fastSource) Seed(int64) {
+	panic("internal error: call to fastSource.Seed")
+}
+
+func (*fastSource) Uint64() uint64 {
+	return fastrand64()
+}
+
+func (fs *fastSource) read(p []byte, readVal *int64, readPos *int8) (n int, err error) {
+	fs.mu.Lock()
+	n, err = read(p, fs, readVal, readPos)
+	fs.mu.Unlock()
+	return
+}
 
 // Seed uses the provided seed value to initialize the default Source to a
 // deterministic state. Seed values that have the same remainder when
@@ -321,65 +393,90 @@
 // from the global random source. To avoid such breakages, programs
 // that need a specific result sequence should use NewRand(NewSource(seed))
 // to obtain a random generator that other packages cannot access.
-func Seed(seed int64) { globalRand.Seed(seed) }
+func Seed(seed int64) {
+	orig := globalRandGenerator.Load()
+
+	// If we are already using a lockedSource, we can just re-seed it.
+	if orig != nil {
+		if _, ok := orig.src.(*lockedSource); ok {
+			orig.Seed(seed)
+			return
+		}
+	}
+
+	// Otherwise either
+	// 1) orig == nil, which is the normal case when Seed is the first
+	// top-level function to be called, or
+	// 2) orig is already a fastSource, in which case we need to change
+	// to a lockedSource.
+	// Either way we do the same thing.
+
+	r := New(new(lockedSource))
+	r.Seed(seed)
+
+	if !globalRandGenerator.CompareAndSwap(orig, r) {
+		// Something changed underfoot. Retry to be safe.
+		Seed(seed)
+	}
+}
 
 // Int63 returns a non-negative pseudo-random 63-bit integer as an int64
 // from the default Source.
-func Int63() int64 { return globalRand.Int63() }
+func Int63() int64 { return globalRand().Int63() }
 
 // Uint32 returns a pseudo-random 32-bit value as a uint32
 // from the default Source.
-func Uint32() uint32 { return globalRand.Uint32() }
+func Uint32() uint32 { return globalRand().Uint32() }
 
 // Uint64 returns a pseudo-random 64-bit value as a uint64
 // from the default Source.
-func Uint64() uint64 { return globalRand.Uint64() }
+func Uint64() uint64 { return globalRand().Uint64() }
 
 // Int31 returns a non-negative pseudo-random 31-bit integer as an int32
 // from the default Source.
-func Int31() int32 { return globalRand.Int31() }
+func Int31() int32 { return globalRand().Int31() }
 
 // Int returns a non-negative pseudo-random int from the default Source.
-func Int() int { return globalRand.Int() }
+func Int() int { return globalRand().Int() }
 
 // Int63n returns, as an int64, a non-negative pseudo-random number in the half-open interval [0,n)
 // from the default Source.
 // It panics if n <= 0.
-func Int63n(n int64) int64 { return globalRand.Int63n(n) }
+func Int63n(n int64) int64 { return globalRand().Int63n(n) }
 
 // Int31n returns, as an int32, a non-negative pseudo-random number in the half-open interval [0,n)
 // from the default Source.
 // It panics if n <= 0.
-func Int31n(n int32) int32 { return globalRand.Int31n(n) }
+func Int31n(n int32) int32 { return globalRand().Int31n(n) }
 
 // Intn returns, as an int, a non-negative pseudo-random number in the half-open interval [0,n)
 // from the default Source.
 // It panics if n <= 0.
-func Intn(n int) int { return globalRand.Intn(n) }
+func Intn(n int) int { return globalRand().Intn(n) }
 
 // Float64 returns, as a float64, a pseudo-random number in the half-open interval [0.0,1.0)
 // from the default Source.
-func Float64() float64 { return globalRand.Float64() }
+func Float64() float64 { return globalRand().Float64() }
 
 // Float32 returns, as a float32, a pseudo-random number in the half-open interval [0.0,1.0)
 // from the default Source.
-func Float32() float32 { return globalRand.Float32() }
+func Float32() float32 { return globalRand().Float32() }
 
 // Perm returns, as a slice of n ints, a pseudo-random permutation of the integers
 // in the half-open interval [0,n) from the default Source.
-func Perm(n int) []int { return globalRand.Perm(n) }
+func Perm(n int) []int { return globalRand().Perm(n) }
 
 // Shuffle pseudo-randomizes the order of elements using the default Source.
 // n is the number of elements. Shuffle panics if n < 0.
 // swap swaps the elements with indexes i and j.
-func Shuffle(n int, swap func(i, j int)) { globalRand.Shuffle(n, swap) }
+func Shuffle(n int, swap func(i, j int)) { globalRand().Shuffle(n, swap) }
 
 // Read generates len(p) random bytes from the default Source and
 // writes them into p. It always returns len(p) and a nil error.
 // Read, unlike the Rand.Read method, is safe for concurrent use.
 //
 // Deprecated: For almost all use cases, crypto/rand.Read is more appropriate.
-func Read(p []byte) (n int, err error) { return globalRand.Read(p) }
+func Read(p []byte) (n int, err error) { return globalRand().Read(p) }
 
 // NormFloat64 returns a normally distributed float64 in the range
 // [-math.MaxFloat64, +math.MaxFloat64] with
@@ -389,7 +486,7 @@
 // adjust the output using:
 //
 //	sample = NormFloat64() * desiredStdDev + desiredMean
-func NormFloat64() float64 { return globalRand.NormFloat64() }
+func NormFloat64() float64 { return globalRand().NormFloat64() }
 
 // ExpFloat64 returns an exponentially distributed float64 in the range
 // (0, +math.MaxFloat64] with an exponential distribution whose rate parameter
@@ -398,44 +495,23 @@
 // callers can adjust the output using:
 //
 //	sample = ExpFloat64() / desiredRateParameter
-func ExpFloat64() float64 { return globalRand.ExpFloat64() }
+func ExpFloat64() float64 { return globalRand().ExpFloat64() }
 
 type lockedSource struct {
 	lk sync.Mutex
-	s  *rngSource // nil if not yet allocated
-}
-
-//go:linkname fastrand64
-func fastrand64() uint64
-
-var randautoseed = godebug.New("randautoseed")
-
-// source returns r.s, allocating and seeding it if needed.
-// The caller must have locked r.
-func (r *lockedSource) source() *rngSource {
-	if r.s == nil {
-		var seed int64
-		if randautoseed.Value() == "0" {
-			randautoseed.IncNonDefault()
-			seed = 1
-		} else {
-			seed = int64(fastrand64())
-		}
-		r.s = newSource(seed)
-	}
-	return r.s
+	s  *rngSource
 }
 
 func (r *lockedSource) Int63() (n int64) {
 	r.lk.Lock()
-	n = r.source().Int63()
+	n = r.s.Int63()
 	r.lk.Unlock()
 	return
 }
 
 func (r *lockedSource) Uint64() (n uint64) {
 	r.lk.Lock()
-	n = r.source().Uint64()
+	n = r.s.Uint64()
 	r.lk.Unlock()
 	return
 }
@@ -467,7 +543,7 @@
 // read implements Read for a lockedSource without a race condition.
 func (r *lockedSource) read(p []byte, readVal *int64, readPos *int8) (n int, err error) {
 	r.lk.Lock()
-	n, err = read(p, r.source(), readVal, readPos)
+	n, err = read(p, r.s, readVal, readPos)
 	r.lk.Unlock()
 	return
 }
diff --git a/src/math/rand/rand_test.go b/src/math/rand/rand_test.go
index 462de8b..7eba1dc 100644
--- a/src/math/rand/rand_test.go
+++ b/src/math/rand/rand_test.go
@@ -14,6 +14,7 @@
 	. "math/rand"
 	"os"
 	"runtime"
+	"sync"
 	"testing"
 	"testing/iotest"
 )
@@ -683,3 +684,18 @@
 		r.Read(buf)
 	}
 }
+
+func BenchmarkConcurrent(b *testing.B) {
+	const goroutines = 4
+	var wg sync.WaitGroup
+	wg.Add(goroutines)
+	for i := 0; i < goroutines; i++ {
+		go func() {
+			defer wg.Done()
+			for n := b.N; n > 0; n-- {
+				Int63()
+			}
+		}()
+	}
+	wg.Wait()
+}