libgo: delay applying profile stack-frame skip until fixup

When the runtime collects a stack trace to associate it with some
profiling event (memory allocation, mutex contention, etc.), a skip
count is passed to runtime.Callers (or equivalent) to skip a known
number of frames and reach the "interesting" frame corresponding to
the profile event. Now that the profiling mechanism uses lazy fixup
(which removes compiler artifacts such as thunks, morestack calls,
etc.), the frame-skipping logic also needs to move to after the
fixup, to ensure that the skip count isn't thrown off by these
artifacts.
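For illustration, a minimal sketch (not the actual libgo code; the
isArtifact helper is hypothetical) of pruning artifact frames first
and only then applying the skip count:

    // pruneAndSkip drops compiler-artifact frames before applying the
    // profiling skip count, so removed frames cannot shift the skip.
    func pruneAndSkip(stk []uintptr, skip int, isArtifact func(uintptr) bool) []uintptr {
            canon := make([]uintptr, 0, len(stk))
            for _, pc := range stk {
                    if isArtifact(pc) {
                            continue // drop thunks, morestack calls, etc. first
                    }
                    canon = append(canon, pc)
            }
            if skip >= len(canon) {
                    return nil
            }
            return canon[skip:] // apply the skip count last
    }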

Fixes golang/go#32290.

Change-Id: I109b3e8b30d4e2955da4af4235f8965717fba7cb
Reviewed-on: https://go-review.googlesource.com/c/gofrontend/+/179740
Reviewed-by: Ian Lance Taylor <iant@golang.org>
diff --git a/libgo/go/runtime/mprof.go b/libgo/go/runtime/mprof.go
index 9238e2b..132c2ff 100644
--- a/libgo/go/runtime/mprof.go
+++ b/libgo/go/runtime/mprof.go
@@ -56,6 +56,7 @@
 	hash    uintptr
 	size    uintptr
 	nstk    uintptr
+	skip    int
 }
 
 // A memRecord is the bucket data for a bucket of type memProfile,
@@ -185,7 +186,7 @@
 }
 
 // newBucket allocates a bucket with the given type and number of stack entries.
-func newBucket(typ bucketType, nstk int) *bucket {
+func newBucket(typ bucketType, nstk int, skipCount int) *bucket {
 	size := payloadOffset(typ, uintptr(nstk))
 	switch typ {
 	default:
@@ -203,6 +204,7 @@
 	bucketmem += size
 	b.typ = typ
 	b.nstk = uintptr(nstk)
+	b.skip = skipCount
 	return b
 }
 
@@ -229,7 +231,7 @@
 }
 
 // Return the bucket for stk[0:nstk], allocating new bucket if needed.
-func stkbucket(typ bucketType, size uintptr, stk []uintptr, alloc bool) *bucket {
+func stkbucket(typ bucketType, size uintptr, skip int, stk []uintptr, alloc bool) *bucket {
 	if buckhash == nil {
 		buckhash = (*[buckHashSize]*bucket)(sysAlloc(unsafe.Sizeof(*buckhash), &memstats.buckhash_sys))
 		if buckhash == nil {
@@ -264,7 +266,7 @@
 	}
 
 	// Create new bucket.
-	b := newBucket(typ, len(stk))
+	b := newBucket(typ, len(stk), skip)
 	copy(b.stk(), stk)
 	b.hash = h
 	b.size = size
@@ -369,9 +371,10 @@
 // Called by malloc to record a profiled block.
 func mProf_Malloc(p unsafe.Pointer, size uintptr) {
 	var stk [maxStack]uintptr
-	nstk := callersRaw(1, stk[:])
+	nstk := callersRaw(stk[:])
 	lock(&proflock)
-	b := stkbucket(memProfile, size, stk[:nstk], true)
+	skip := 1
+	b := stkbucket(memProfile, size, skip, stk[:nstk], true)
 	c := mProf.cycle
 	mp := b.mp()
 	mpc := &mp.future[(c+2)%uint32(len(mp.future))]
@@ -446,14 +449,14 @@
 	var nstk int
 	var stk [maxStack]uintptr
 	if gp.m.curg == nil || gp.m.curg == gp {
-		nstk = callersRaw(skip, stk[:])
+		nstk = callersRaw(stk[:])
 	} else {
 		// FIXME: This should get a traceback of gp.m.curg.
 		// nstk = gcallers(gp.m.curg, skip, stk[:])
-		nstk = callersRaw(skip, stk[:])
+		nstk = callersRaw(stk[:])
 	}
 	lock(&proflock)
-	b := stkbucket(which, 0, stk[:nstk], true)
+	b := stkbucket(which, 0, skip, stk[:nstk], true)
 	b.bp().count++
 	b.bp().cycles += cycles
 	unlock(&proflock)
@@ -605,9 +608,12 @@
 // later. Note: there is code in go-callers.c's backtrace_full callback()
 // function that performs very similar fixups; these two code paths
 // should be kept in sync.
-func fixupStack(stk []uintptr, canonStack *[maxStack]uintptr, size uintptr) int {
+func fixupStack(stk []uintptr, skip int, canonStack *[maxStack]uintptr, size uintptr) int {
 	var cidx int
 	var termTrace bool
+	// Increase the skip count to take into account the frames corresponding
+	// to runtime.callersRaw and to the C routine that it invokes.
+	skip += 2
 	for _, pc := range stk {
 		// Subtract 1 from PC to undo the 1 we added in callback in
 		// go-callers.c.
@@ -669,6 +675,16 @@
 			break
 		}
 	}
+
+	// Apply skip count. Needs to be done after expanding inline frames.
+	if skip != 0 {
+		if skip >= cidx {
+			return 0
+		}
+		copy(canonStack[:cidx-skip], canonStack[skip:])
+		return cidx - skip
+	}
+
 	return cidx
 }
 
@@ -680,8 +696,8 @@
 // the new bucket.
 func fixupBucket(b *bucket) {
 	var canonStack [maxStack]uintptr
-	frames := fixupStack(b.stk(), &canonStack, b.size)
-	cb := stkbucket(prunedProfile, b.size, canonStack[:frames], true)
+	frames := fixupStack(b.stk(), b.skip, &canonStack, b.size)
+	cb := stkbucket(prunedProfile, b.size, 0, canonStack[:frames], true)
 	switch b.typ {
 	default:
 		throw("invalid profile bucket type")
diff --git a/libgo/go/runtime/traceback_gccgo.go b/libgo/go/runtime/traceback_gccgo.go
index b0eecf2..4134d28 100644
--- a/libgo/go/runtime/traceback_gccgo.go
+++ b/libgo/go/runtime/traceback_gccgo.go
@@ -63,11 +63,11 @@
 
 //go:noescape
 //extern runtime_callersRaw
-func c_callersRaw(skip int32, pcs *uintptr, max int32) int32
+func c_callersRaw(pcs *uintptr, max int32) int32
 
 // callersRaw returns a raw (PCs only) stack trace of the current goroutine.
-func callersRaw(skip int, pcbuf []uintptr) int {
-	n := c_callersRaw(int32(skip)+1, &pcbuf[0], int32(len(pcbuf)))
+func callersRaw(pcbuf []uintptr) int {
+	n := c_callersRaw(&pcbuf[0], int32(len(pcbuf)))
 	return int(n)
 }
 
diff --git a/libgo/runtime/go-callers.c b/libgo/runtime/go-callers.c
index 4a9c1a7..e7d53a3 100644
--- a/libgo/runtime/go-callers.c
+++ b/libgo/runtime/go-callers.c
@@ -268,7 +268,6 @@
 struct callersRaw_data
 {
   uintptr* pcbuf;
-  int skip;
   int index;
   int max;
 };
@@ -280,12 +279,6 @@
 {
   struct callersRaw_data *arg = (struct callersRaw_data *) data;
 
-  if (arg->skip > 0)
-    {
-      --arg->skip;
-      return 0;
-    }
-
   /* On the call to backtrace_simple the pc value was most likely
      decremented if there was a normal call, since the pc referred to
      the instruction where the call returned and not the call itself.
@@ -306,13 +299,12 @@
 /* runtime_callersRaw is similar to runtime_callers() above, but
    it returns raw PC values as opposed to file/func/line locations. */
 int32
-runtime_callersRaw (int32 skip, uintptr *pcbuf, int32 m)
+runtime_callersRaw (uintptr *pcbuf, int32 m)
 {
   struct callersRaw_data data;
   struct backtrace_state* state;
 
   data.pcbuf = pcbuf;
-  data.skip = skip + 1;
   data.index = 0;
   data.max = m;
   runtime_xadd (&__go_runtime_in_callers, 1);