runtime: add pcvalue cache to improve stack scan speed

The cost of scanning large stacks is currently dominated by the time
spent looking up and decoding the pcvalue table. However, large stacks
are usually large not because they contain calls to many different
functions, but because they contain many calls to the same, small set
of recursive functions. Hence, walking large stacks tends to make the
same pcvalue queries many times.

Based on this observation, this commit adds a small, very simple, and
fast cache in front of pcvalue lookup. We thread this cache down from
operations that make many pcvalue calls, such as gentraceback, stack
scanning, and stack adjusting.

This simple cache works well because it has minimal overhead when it's
not effective. I also tried a hashed direct-map cache, CLOCK-based
replacement, round-robin replacement, and round-robin with lookups
disabled until there had been at least 16 probes, but none of these
approaches had obvious wins over the random replacement policy in this
commit.

This nearly doubles the overall performance of the deep stack test
program from issue #10898:

name        old time/op  new time/op  delta
Issue10898   16.5s ±12%    9.2s ±12%  -44.37%  (p=0.008 n=5+5)

It's a very slight win on the garbage benchmark:

name              old time/op  new time/op  delta
XBenchGarbage-12  4.92ms ± 1%  4.89ms ± 1%  -0.75%  (p=0.000 n=18+19)

It's a wash (but doesn't harm performance) on the go1 benchmarks,
which don't have particularly deep stacks:

name                      old time/op    new time/op    delta
BinaryTree17-12              3.11s ± 2%     3.20s ± 3%  +2.83%  (p=0.000 n=17+20)
Fannkuch11-12                2.51s ± 1%     2.51s ± 1%  -0.22%  (p=0.034 n=19+18)
FmtFprintfEmpty-12          50.8ns ± 3%    50.6ns ± 2%    ~     (p=0.793 n=20+20)
FmtFprintfString-12          174ns ± 0%     174ns ± 1%  +0.17%  (p=0.048 n=15+20)
FmtFprintfInt-12             177ns ± 0%     165ns ± 1%  -6.99%  (p=0.000 n=17+19)
FmtFprintfIntInt-12          283ns ± 1%     284ns ± 0%  +0.22%  (p=0.000 n=18+15)
FmtFprintfPrefixedInt-12     243ns ± 1%     244ns ± 1%  +0.40%  (p=0.000 n=20+19)
FmtFprintfFloat-12           318ns ± 0%     319ns ± 0%  +0.27%  (p=0.001 n=19+20)
FmtManyArgs-12              1.12µs ± 0%    1.14µs ± 0%  +1.74%  (p=0.000 n=19+20)
GobDecode-12                8.69ms ± 0%    8.73ms ± 1%  +0.46%  (p=0.000 n=18+18)
GobEncode-12                6.64ms ± 1%    6.61ms ± 1%  -0.46%  (p=0.000 n=20+20)
Gzip-12                      323ms ± 2%     319ms ± 1%  -1.11%  (p=0.000 n=20+20)
Gunzip-12                   42.8ms ± 0%    42.9ms ± 0%    ~     (p=0.158 n=18+20)
HTTPClientServer-12         63.3µs ± 1%    63.1µs ± 1%  -0.35%  (p=0.011 n=20+20)
JSONEncode-12               16.9ms ± 1%    17.3ms ± 1%  +2.84%  (p=0.000 n=19+20)
JSONDecode-12               59.7ms ± 0%    58.5ms ± 0%  -2.05%  (p=0.000 n=19+17)
Mandelbrot200-12            3.92ms ± 0%    3.91ms ± 0%  -0.16%  (p=0.003 n=19+19)
GoParse-12                  3.79ms ± 2%    3.75ms ± 2%  -0.91%  (p=0.005 n=20+20)
RegexpMatchEasy0_32-12       102ns ± 1%     101ns ± 1%  -0.80%  (p=0.001 n=14+20)
RegexpMatchEasy0_1K-12       337ns ± 1%     346ns ± 1%  +2.90%  (p=0.000 n=20+19)
RegexpMatchEasy1_32-12      84.4ns ± 2%    84.3ns ± 2%    ~     (p=0.743 n=20+20)
RegexpMatchEasy1_1K-12       502ns ± 1%     505ns ± 0%  +0.64%  (p=0.000 n=20+20)
RegexpMatchMedium_32-12      133ns ± 1%     132ns ± 1%  -0.85%  (p=0.000 n=20+19)
RegexpMatchMedium_1K-12     40.1µs ± 1%    39.8µs ± 1%  -0.77%  (p=0.000 n=18+18)
RegexpMatchHard_32-12       2.08µs ± 1%    2.07µs ± 1%  -0.55%  (p=0.001 n=18+19)
RegexpMatchHard_1K-12       62.4µs ± 1%    62.0µs ± 1%  -0.74%  (p=0.000 n=19+19)
Revcomp-12                   545ms ± 2%     545ms ± 3%    ~     (p=0.771 n=19+20)
Template-12                 73.7ms ± 1%    72.0ms ± 0%  -2.33%  (p=0.000 n=20+18)
TimeParse-12                 358ns ± 1%     351ns ± 1%  -2.07%  (p=0.000 n=20+20)
TimeFormat-12                369ns ± 1%     356ns ± 0%  -3.53%  (p=0.000 n=20+18)
[Geo mean]                  63.5µs         63.2µs       -0.41%

name                      old speed      new speed      delta
GobDecode-12              88.3MB/s ± 0%  87.9MB/s ± 0%  -0.43%  (p=0.000 n=18+17)
GobEncode-12               116MB/s ± 1%   116MB/s ± 1%  +0.47%  (p=0.000 n=20+20)
Gzip-12                   60.2MB/s ± 2%  60.8MB/s ± 1%  +1.13%  (p=0.000 n=20+20)
Gunzip-12                  453MB/s ± 0%   453MB/s ± 0%    ~     (p=0.160 n=18+20)
JSONEncode-12              115MB/s ± 1%   112MB/s ± 1%  -2.76%  (p=0.000 n=19+20)
JSONDecode-12             32.5MB/s ± 0%  33.2MB/s ± 0%  +2.09%  (p=0.000 n=19+17)
GoParse-12                15.3MB/s ± 2%  15.4MB/s ± 2%  +0.92%  (p=0.004 n=20+20)
RegexpMatchEasy0_32-12     311MB/s ± 1%   314MB/s ± 1%  +0.78%  (p=0.000 n=15+19)
RegexpMatchEasy0_1K-12    3.04GB/s ± 1%  2.95GB/s ± 1%  -2.90%  (p=0.000 n=19+19)
RegexpMatchEasy1_32-12     379MB/s ± 2%   380MB/s ± 2%    ~     (p=0.779 n=20+20)
RegexpMatchEasy1_1K-12    2.04GB/s ± 1%  2.02GB/s ± 0%  -0.62%  (p=0.000 n=20+20)
RegexpMatchMedium_32-12   7.46MB/s ± 1%  7.53MB/s ± 1%  +0.86%  (p=0.000 n=20+19)
RegexpMatchMedium_1K-12   25.5MB/s ± 1%  25.7MB/s ± 1%  +0.78%  (p=0.000 n=18+18)
RegexpMatchHard_32-12     15.4MB/s ± 1%  15.5MB/s ± 1%  +0.62%  (p=0.000 n=19+19)
RegexpMatchHard_1K-12     16.4MB/s ± 1%  16.5MB/s ± 1%  +0.82%  (p=0.000 n=20+19)
Revcomp-12                 466MB/s ± 2%   466MB/s ± 3%    ~     (p=0.765 n=19+20)
Template-12               26.3MB/s ± 1%  27.0MB/s ± 0%  +2.38%  (p=0.000 n=20+18)
[Geo mean]                97.8MB/s       98.0MB/s       +0.23%

Change-Id: I281044ae0b24990ba46487cacbc1069493274bc4
Reviewed-on: https://go-review.googlesource.com/13614
Reviewed-by: Keith Randall <khr@golang.org>
diff --git a/src/runtime/heapdump.go b/src/runtime/heapdump.go
index 0344330..f8f88c6 100644
--- a/src/runtime/heapdump.go
+++ b/src/runtime/heapdump.go
@@ -247,7 +247,7 @@
 	if pc != f.entry {
 		pc--
 	}
-	pcdata := pcdatavalue(f, _PCDATA_StackMapIndex, pc)
+	pcdata := pcdatavalue(f, _PCDATA_StackMapIndex, pc, nil)
 	if pcdata == -1 {
 		// We do not have a valid pcdata value but there might be a
 		// stackmap for this function.  It is likely that we are looking
diff --git a/src/runtime/mbitmap.go b/src/runtime/mbitmap.go
index 33715f2..42afdf4 100644
--- a/src/runtime/mbitmap.go
+++ b/src/runtime/mbitmap.go
@@ -1696,7 +1696,7 @@
 			if targetpc != f.entry {
 				targetpc--
 			}
-			pcdata := pcdatavalue(f, _PCDATA_StackMapIndex, targetpc)
+			pcdata := pcdatavalue(f, _PCDATA_StackMapIndex, targetpc, nil)
 			if pcdata == -1 {
 				return
 			}
diff --git a/src/runtime/mgcmark.go b/src/runtime/mgcmark.go
index 35bdda9..9301820 100644
--- a/src/runtime/mgcmark.go
+++ b/src/runtime/mgcmark.go
@@ -438,10 +438,11 @@
 		throw("scanstack in wrong phase")
 	}
 
+	var cache pcvalueCache
 	gcw := &getg().m.p.ptr().gcw
 	n := 0
 	scanframe := func(frame *stkframe, unused unsafe.Pointer) bool {
-		scanframeworker(frame, unused, gcw)
+		scanframeworker(frame, &cache, gcw)
 
 		if frame.fp > nextBarrier {
 			// We skip installing a barrier on bottom-most
@@ -474,7 +475,7 @@
 
 // Scan a stack frame: local variables and function arguments/results.
 //go:nowritebarrier
-func scanframeworker(frame *stkframe, unused unsafe.Pointer, gcw *gcWork) {
+func scanframeworker(frame *stkframe, cache *pcvalueCache, gcw *gcWork) {
 
 	f := frame.fn
 	targetpc := frame.continpc
@@ -488,7 +489,7 @@
 	if targetpc != f.entry {
 		targetpc--
 	}
-	pcdata := pcdatavalue(f, _PCDATA_StackMapIndex, targetpc)
+	pcdata := pcdatavalue(f, _PCDATA_StackMapIndex, targetpc, cache)
 	if pcdata == -1 {
 		// We do not have a valid pcdata value but there might be a
 		// stackmap for this function.  It is likely that we are looking
diff --git a/src/runtime/stack.go b/src/runtime/stack.go
index 56efc2e..e3087af 100644
--- a/src/runtime/stack.go
+++ b/src/runtime/stack.go
@@ -482,6 +482,7 @@
 type adjustinfo struct {
 	old   stack
 	delta uintptr // ptr distance from old to new stack (newbase - oldbase)
+	cache pcvalueCache
 }
 
 // Adjustpointer checks whether *vpp is in the old stack described by adjinfo.
@@ -575,7 +576,7 @@
 	if targetpc != f.entry {
 		targetpc--
 	}
-	pcdata := pcdatavalue(f, _PCDATA_StackMapIndex, targetpc)
+	pcdata := pcdatavalue(f, _PCDATA_StackMapIndex, targetpc, &adjinfo.cache)
 	if pcdata == -1 {
 		pcdata = 0 // in prologue
 	}
diff --git a/src/runtime/symtab.go b/src/runtime/symtab.go
index 4668609..c3235fa 100644
--- a/src/runtime/symtab.go
+++ b/src/runtime/symtab.go
@@ -112,6 +112,7 @@
 
 	// ftab is lookup table for function by program counter.
 	nftab := len(datap.ftab) - 1
+	var pcCache pcvalueCache
 	for i := 0; i < nftab; i++ {
 		// NOTE: ftab[nftab].entry is legal; it is the address beyond the final function.
 		if datap.ftab[i].entry > datap.ftab[i+1].entry {
@@ -147,9 +148,9 @@
 					}
 				}
 			}
-			pcvalue(f, f.pcfile, end, true)
-			pcvalue(f, f.pcln, end, true)
-			pcvalue(f, f.pcsp, end, true)
+			pcvalue(f, f.pcfile, end, &pcCache, true)
+			pcvalue(f, f.pcln, end, &pcCache, true)
+			pcvalue(f, f.pcsp, end, &pcCache, true)
 		}
 	}
 
@@ -226,10 +227,42 @@
 	return (*_func)(unsafe.Pointer(&datap.pclntable[datap.ftab[idx].funcoff]))
 }
 
-func pcvalue(f *_func, off int32, targetpc uintptr, strict bool) int32 {
+type pcvalueCache struct {
+	entries [16]pcvalueCacheEnt
+}
+
+type pcvalueCacheEnt struct {
+	// targetpc and off together are the key of this cache entry.
+	targetpc uintptr
+	off      int32
+	// val is the value of this cached pcvalue entry.
+	val int32
+}
+
+func pcvalue(f *_func, off int32, targetpc uintptr, cache *pcvalueCache, strict bool) int32 {
 	if off == 0 {
 		return -1
 	}
+
+	// Check the cache. This speeds up walks of deep stacks, which
+	// tend to have the same recursive functions over and over.
+	//
+	// This cache is small enough that full associativity is
+	// cheaper than doing the hashing for a less associative
+	// cache.
+	if cache != nil {
+		for _, ent := range cache.entries {
+			// We check off first because we're more
+			// likely to have multiple entries with
+			// different offsets for the same targetpc
+			// than the other way around, so we'll usually
+			// fail in the first clause.
+			if ent.off == off && ent.targetpc == targetpc {
+				return ent.val
+			}
+		}
+	}
+
 	datap := findmoduledatap(f.entry) // inefficient
 	if datap == nil {
 		if strict && panicking == 0 {
@@ -248,6 +281,19 @@
 			break
 		}
 		if targetpc < pc {
+			// Replace a random entry in the cache. Random
+			// replacement prevents a performance cliff if
+			// a recursive stack's cycle is slightly
+			// larger than the cache.
+			if cache != nil {
+				ci := fastrand1() % uint32(len(cache.entries))
+				cache.entries[ci] = pcvalueCacheEnt{
+					targetpc: targetpc,
+					off:      off,
+					val:      val,
+				}
+			}
+
 			return val
 		}
 	}
@@ -296,8 +342,8 @@
 	if datap == nil {
 		return "?", 0
 	}
-	fileno := int(pcvalue(f, f.pcfile, targetpc, strict))
-	line = pcvalue(f, f.pcln, targetpc, strict)
+	fileno := int(pcvalue(f, f.pcfile, targetpc, nil, strict))
+	line = pcvalue(f, f.pcln, targetpc, nil, strict)
 	if fileno == -1 || line == -1 || fileno >= len(datap.filetab) {
 		// print("looking for ", hex(targetpc), " in ", funcname(f), " got file=", fileno, " line=", lineno, "\n")
 		return "?", 0
@@ -310,20 +356,20 @@
 	return funcline1(f, targetpc, true)
 }
 
-func funcspdelta(f *_func, targetpc uintptr) int32 {
-	x := pcvalue(f, f.pcsp, targetpc, true)
+func funcspdelta(f *_func, targetpc uintptr, cache *pcvalueCache) int32 {
+	x := pcvalue(f, f.pcsp, targetpc, cache, true)
 	if x&(ptrSize-1) != 0 {
 		print("invalid spdelta ", funcname(f), " ", hex(f.entry), " ", hex(targetpc), " ", hex(f.pcsp), " ", x, "\n")
 	}
 	return x
 }
 
-func pcdatavalue(f *_func, table int32, targetpc uintptr) int32 {
+func pcdatavalue(f *_func, table int32, targetpc uintptr, cache *pcvalueCache) int32 {
 	if table < 0 || table >= f.npcdata {
 		return -1
 	}
 	off := *(*int32)(add(unsafe.Pointer(&f.nfuncdata), unsafe.Sizeof(f.nfuncdata)+uintptr(table)*4))
-	return pcvalue(f, off, targetpc, true)
+	return pcvalue(f, off, targetpc, cache, true)
 }
 
 func funcdata(f *_func, i int32) unsafe.Pointer {
diff --git a/src/runtime/traceback.go b/src/runtime/traceback.go
index 2d223ce..b99920a 100644
--- a/src/runtime/traceback.go
+++ b/src/runtime/traceback.go
@@ -198,6 +198,8 @@
 	}
 	frame.fn = f
 
+	var cache pcvalueCache
+
 	n := 0
 	for n < max {
 		// Typically:
@@ -219,7 +221,7 @@
 				sp = gp.m.curg.sched.sp
 				stkbar = gp.m.curg.stkbar[gp.m.curg.stkbarPos:]
 			}
-			frame.fp = sp + uintptr(funcspdelta(f, frame.pc))
+			frame.fp = sp + uintptr(funcspdelta(f, frame.pc, &cache))
 			if !usesLR {
 				// On x86, call instruction pushes return PC before entering new function.
 				frame.fp += regSize
@@ -403,7 +405,7 @@
 			frame.fn = f
 			if f == nil {
 				frame.pc = x
-			} else if funcspdelta(f, frame.pc) == 0 {
+			} else if funcspdelta(f, frame.pc, &cache) == 0 {
 				frame.lr = x
 			}
 		}