cmd/compile,runtime: allocate defer records on the stack

When a defer is executed at most once in a function body,
we can allocate the defer record for it on the stack instead
of on the heap.

This should make such defers (which are very common) faster.
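
For example, in the sketch below (an illustration; p, mu, xs, f, and
work are all placeholder names), the first defer now gets a
stack-allocated record, while the defer in the loop still gets a
heap-allocated one:

	package p

	import "sync"

	var mu sync.Mutex
	var xs []int

	func f(int) {}

	func work() error {
		mu.Lock()
		defer mu.Unlock() // runs at most once per call: stack-allocated record

		for _, x := range xs {
			defer f(x) // may run many times: record stays heap-allocated
		}
		return nil
	}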

This optimization applies to 363 out of the 370 static defer sites
in the cmd/go binary.

name     old time/op  new time/op  delta
Defer-4  52.2ns ± 5%  36.2ns ± 3%  -30.70%  (p=0.000 n=10+10)

Fixes #6980
Updates #14939

Change-Id: I697109dd7aeef9e97a9eeba2ef65ff53d3ee1004
Reviewed-on: https://go-review.googlesource.com/c/go/+/171758
Run-TryBot: Keith Randall <khr@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Austin Clements <austin@google.com>
diff --git a/src/cmd/compile/internal/gc/esc.go b/src/cmd/compile/internal/gc/esc.go
index ded9439..c42f25e 100644
--- a/src/cmd/compile/internal/gc/esc.go
+++ b/src/cmd/compile/internal/gc/esc.go
@@ -802,6 +802,7 @@
 
 	case ODEFER:
 		if e.loopdepth == 1 { // top level
+			n.Esc = EscNever // force stack allocation of defer record (see ssa.go)
 			break
 		}
 		// arguments leak out of scope
diff --git a/src/cmd/compile/internal/gc/escape.go b/src/cmd/compile/internal/gc/escape.go
index 88dc9ef..47ce853 100644
--- a/src/cmd/compile/internal/gc/escape.go
+++ b/src/cmd/compile/internal/gc/escape.go
@@ -882,6 +882,7 @@
 	// non-transient location to keep arguments from being
 	// transiently allocated.
 	if where.Op == ODEFER && e.loopDepth == 1 {
+		where.Esc = EscNever // force stack allocation of defer record (see ssa.go)
 		// TODO(mdempsky): Eliminate redundant EscLocation allocs.
 		return e.teeHole(k, e.newLoc(nil, false).asHole())
 	}
diff --git a/src/cmd/compile/internal/gc/go.go b/src/cmd/compile/internal/gc/go.go
index 6123e6a..85718e3 100644
--- a/src/cmd/compile/internal/gc/go.go
+++ b/src/cmd/compile/internal/gc/go.go
@@ -287,6 +287,7 @@
 	assertI2I,
 	assertI2I2,
 	deferproc,
+	deferprocStack,
 	Deferreturn,
 	Duffcopy,
 	Duffzero,
diff --git a/src/cmd/compile/internal/gc/reflect.go b/src/cmd/compile/internal/gc/reflect.go
index 04707b9..0854817 100644
--- a/src/cmd/compile/internal/gc/reflect.go
+++ b/src/cmd/compile/internal/gc/reflect.go
@@ -317,6 +317,48 @@
 	return hiter
 }
 
+// deferstruct makes a runtime._defer structure, with additional space for
+// stksize bytes of args.
+func deferstruct(stksize int64) *types.Type {
+	makefield := func(name string, typ *types.Type) *types.Field {
+		f := types.NewField()
+		f.Type = typ
+		// Unlike the global makefield function, this one needs to set Pkg
+		// because these types might be compared (in SSA CSE sorting).
+		// TODO: unify this makefield and the global one above.
+		f.Sym = &types.Sym{Name: name, Pkg: localpkg}
+		return f
+	}
+	argtype := types.NewArray(types.Types[TUINT8], stksize)
+	argtype.SetNoalg(true)
+	argtype.Width = stksize
+	argtype.Align = 1
+	// These fields must match the ones in runtime/runtime2.go:_defer and
+	// cmd/compile/internal/gc/ssa.go:(*state).call.
+	fields := []*types.Field{
+		makefield("siz", types.Types[TUINT32]),
+		makefield("started", types.Types[TBOOL]),
+		makefield("heap", types.Types[TBOOL]),
+		makefield("sp", types.Types[TUINTPTR]),
+		makefield("pc", types.Types[TUINTPTR]),
+		// Note: the types here don't really matter. Defer structures
+		// are always scanned explicitly during stack copying and GC,
+		// so we give them uintptr type even though they are real pointers.
+		makefield("fn", types.Types[TUINTPTR]),
+		makefield("_panic", types.Types[TUINTPTR]),
+		makefield("link", types.Types[TUINTPTR]),
+		makefield("args", argtype),
+	}
+
+	// build struct holding the above fields
+	s := types.New(TSTRUCT)
+	s.SetNoalg(true)
+	s.SetFields(fields)
+	s.Width = widstruct(s, s, 0, 1)
+	s.Align = uint8(Widthptr)
+	return s
+}
+
 // f is method type, with receiver.
 // return function type, receiver as first argument (or not).
 func methodfunc(f *types.Type, receiver *types.Type) *types.Type {
diff --git a/src/cmd/compile/internal/gc/ssa.go b/src/cmd/compile/internal/gc/ssa.go
index 8637d72..744c37f 100644
--- a/src/cmd/compile/internal/gc/ssa.go
+++ b/src/cmd/compile/internal/gc/ssa.go
@@ -68,6 +68,7 @@
 	assertI2I = sysfunc("assertI2I")
 	assertI2I2 = sysfunc("assertI2I2")
 	deferproc = sysfunc("deferproc")
+	deferprocStack = sysfunc("deferprocStack")
 	Deferreturn = sysfunc("deferreturn")
 	Duffcopy = sysvar("duffcopy")             // asm func with special ABI
 	Duffzero = sysvar("duffzero")             // asm func with special ABI
@@ -864,7 +865,11 @@
 			}
 		}
 	case ODEFER:
-		s.call(n.Left, callDefer)
+		d := callDefer
+		if n.Esc == EscNever {
+			d = callDeferStack
+		}
+		s.call(n.Left, d)
 	case OGO:
 		s.call(n.Left, callGo)
 
@@ -2859,6 +2864,7 @@
 const (
 	callNormal callKind = iota
 	callDefer
+	callDeferStack
 	callGo
 )
 
@@ -3799,74 +3805,132 @@
 		rcvr = s.newValue1(ssa.OpIData, types.Types[TUINTPTR], i)
 	}
 	dowidth(fn.Type)
-	stksize := fn.Type.ArgWidth() // includes receiver
+	stksize := fn.Type.ArgWidth() // includes receiver, args, and results
 
 	// Run all assignments of temps.
 	// The temps are introduced to avoid overwriting argument
 	// slots when arguments themselves require function calls.
 	s.stmtList(n.List)
 
-	// Store arguments to stack, including defer/go arguments and receiver for method calls.
-	// These are written in SP-offset order.
-	argStart := Ctxt.FixedFrameSize()
-	// Defer/go args.
-	if k != callNormal {
-		// Write argsize and closure (args to newproc/deferproc).
-		argsize := s.constInt32(types.Types[TUINT32], int32(stksize))
-		addr := s.constOffPtrSP(s.f.Config.Types.UInt32Ptr, argStart)
-		s.store(types.Types[TUINT32], addr, argsize)
-		addr = s.constOffPtrSP(s.f.Config.Types.UintptrPtr, argStart+int64(Widthptr))
-		s.store(types.Types[TUINTPTR], addr, closure)
-		stksize += 2 * int64(Widthptr)
-		argStart += 2 * int64(Widthptr)
-	}
-
-	// Set receiver (for interface calls).
-	if rcvr != nil {
-		addr := s.constOffPtrSP(s.f.Config.Types.UintptrPtr, argStart)
-		s.store(types.Types[TUINTPTR], addr, rcvr)
-	}
-
-	// Write args.
-	t := n.Left.Type
-	args := n.Rlist.Slice()
-	if n.Op == OCALLMETH {
-		f := t.Recv()
-		s.storeArg(args[0], f.Type, argStart+f.Offset)
-		args = args[1:]
-	}
-	for i, n := range args {
-		f := t.Params().Field(i)
-		s.storeArg(n, f.Type, argStart+f.Offset)
-	}
-
-	// call target
 	var call *ssa.Value
-	switch {
-	case k == callDefer:
-		call = s.newValue1A(ssa.OpStaticCall, types.TypeMem, deferproc, s.mem())
-	case k == callGo:
-		call = s.newValue1A(ssa.OpStaticCall, types.TypeMem, newproc, s.mem())
-	case closure != nil:
-		// rawLoad because loading the code pointer from a
-		// closure is always safe, but IsSanitizerSafeAddr
-		// can't always figure that out currently, and it's
-		// critical that we not clobber any arguments already
-		// stored onto the stack.
-		codeptr = s.rawLoad(types.Types[TUINTPTR], closure)
-		call = s.newValue3(ssa.OpClosureCall, types.TypeMem, codeptr, closure, s.mem())
-	case codeptr != nil:
-		call = s.newValue2(ssa.OpInterCall, types.TypeMem, codeptr, s.mem())
-	case sym != nil:
-		call = s.newValue1A(ssa.OpStaticCall, types.TypeMem, sym.Linksym(), s.mem())
-	default:
-		Fatalf("bad call type %v %v", n.Op, n)
+	if k == callDeferStack {
+		// Make a defer struct d on the stack.
+		t := deferstruct(stksize)
+		d := tempAt(n.Pos, s.curfn, t)
+
+		s.vars[&memVar] = s.newValue1A(ssa.OpVarDef, types.TypeMem, d, s.mem())
+		addr := s.addr(d, false)
+
+		// Must match reflect.go:deferstruct and src/runtime/runtime2.go:_defer.
+		// 0: siz
+		s.store(types.Types[TUINT32],
+			s.newValue1I(ssa.OpOffPtr, types.Types[TUINT32].PtrTo(), t.FieldOff(0), addr),
+			s.constInt32(types.Types[TUINT32], int32(stksize)))
+		// 1: started, set in deferprocStack
+		// 2: heap, set in deferprocStack
+		// 3: sp, set in deferprocStack
+		// 4: pc, set in deferprocStack
+		// 5: fn
+		s.store(closure.Type,
+			s.newValue1I(ssa.OpOffPtr, closure.Type.PtrTo(), t.FieldOff(5), addr),
+			closure)
+		// 6: panic, set in deferprocStack
+		// 7: link, set in deferprocStack
+
+		// Then, store all the arguments of the defer call.
+		ft := fn.Type
+		off := t.FieldOff(8)
+		args := n.Rlist.Slice()
+
+		// Set receiver (for interface calls). Always a pointer.
+		if rcvr != nil {
+			p := s.newValue1I(ssa.OpOffPtr, ft.Recv().Type.PtrTo(), off, addr)
+			s.store(types.Types[TUINTPTR], p, rcvr)
+		}
+		// Set receiver (for method calls).
+		if n.Op == OCALLMETH {
+			f := ft.Recv()
+			s.storeArgWithBase(args[0], f.Type, addr, off+f.Offset)
+			args = args[1:]
+		}
+		// Set other args.
+		for _, f := range ft.Params().Fields().Slice() {
+			s.storeArgWithBase(args[0], f.Type, addr, off+f.Offset)
+			args = args[1:]
+		}
+
+		// Call runtime.deferprocStack with pointer to _defer record.
+		arg0 := s.constOffPtrSP(types.Types[TUINTPTR], Ctxt.FixedFrameSize())
+		s.store(types.Types[TUINTPTR], arg0, addr)
+		call = s.newValue1A(ssa.OpStaticCall, types.TypeMem, deferprocStack, s.mem())
+		if stksize < int64(Widthptr) {
+			// We need room for both the call to deferprocStack and the call to
+			// the deferred function.
+			stksize = int64(Widthptr)
+		}
+		call.AuxInt = stksize
+	} else {
+		// Store arguments to stack, including defer/go arguments and receiver for method calls.
+		// These are written in SP-offset order.
+		argStart := Ctxt.FixedFrameSize()
+		// Defer/go args.
+		if k != callNormal {
+			// Write argsize and closure (args to newproc/deferproc).
+			argsize := s.constInt32(types.Types[TUINT32], int32(stksize))
+			addr := s.constOffPtrSP(s.f.Config.Types.UInt32Ptr, argStart)
+			s.store(types.Types[TUINT32], addr, argsize)
+			addr = s.constOffPtrSP(s.f.Config.Types.UintptrPtr, argStart+int64(Widthptr))
+			s.store(types.Types[TUINTPTR], addr, closure)
+			stksize += 2 * int64(Widthptr)
+			argStart += 2 * int64(Widthptr)
+		}
+
+		// Set receiver (for interface calls).
+		if rcvr != nil {
+			addr := s.constOffPtrSP(s.f.Config.Types.UintptrPtr, argStart)
+			s.store(types.Types[TUINTPTR], addr, rcvr)
+		}
+
+		// Write args.
+		t := n.Left.Type
+		args := n.Rlist.Slice()
+		if n.Op == OCALLMETH {
+			f := t.Recv()
+			s.storeArg(args[0], f.Type, argStart+f.Offset)
+			args = args[1:]
+		}
+		for i, n := range args {
+			f := t.Params().Field(i)
+			s.storeArg(n, f.Type, argStart+f.Offset)
+		}
+
+		// call target
+		switch {
+		case k == callDefer:
+			call = s.newValue1A(ssa.OpStaticCall, types.TypeMem, deferproc, s.mem())
+		case k == callGo:
+			call = s.newValue1A(ssa.OpStaticCall, types.TypeMem, newproc, s.mem())
+		case closure != nil:
+			// rawLoad because loading the code pointer from a
+			// closure is always safe, but IsSanitizerSafeAddr
+			// can't always figure that out currently, and it's
+			// critical that we not clobber any arguments already
+			// stored onto the stack.
+			codeptr = s.rawLoad(types.Types[TUINTPTR], closure)
+			call = s.newValue3(ssa.OpClosureCall, types.TypeMem, codeptr, closure, s.mem())
+		case codeptr != nil:
+			call = s.newValue2(ssa.OpInterCall, types.TypeMem, codeptr, s.mem())
+		case sym != nil:
+			call = s.newValue1A(ssa.OpStaticCall, types.TypeMem, sym.Linksym(), s.mem())
+		default:
+			Fatalf("bad call type %v %v", n.Op, n)
+		}
+		call.AuxInt = stksize // Call operations carry the argsize of the callee along with them
 	}
-	call.AuxInt = stksize // Call operations carry the argsize of the callee along with them
 	s.vars[&memVar] = call
 
 	// Finish block for defers
-	if k == callDefer {
+	if k == callDefer || k == callDeferStack {
 		b := s.endBlock()
 		b.Kind = ssa.BlockDefer
 		b.SetControl(call)
@@ -4361,17 +4425,27 @@
 }
 
 func (s *state) storeArg(n *Node, t *types.Type, off int64) {
+	s.storeArgWithBase(n, t, s.sp, off)
+}
+
+func (s *state) storeArgWithBase(n *Node, t *types.Type, base *ssa.Value, off int64) {
 	pt := types.NewPtr(t)
-	sp := s.constOffPtrSP(pt, off)
+	var addr *ssa.Value
+	if base == s.sp {
+		// Use special routine that avoids allocation on duplicate offsets.
+		addr = s.constOffPtrSP(pt, off)
+	} else {
+		addr = s.newValue1I(ssa.OpOffPtr, pt, off, base)
+	}
 
 	if !canSSAType(t) {
 		a := s.addr(n, false)
-		s.move(t, sp, a)
+		s.move(t, addr, a)
 		return
 	}
 
 	a := s.expr(n)
-	s.storeType(t, sp, a, 0, false)
+	s.storeType(t, addr, a, 0, false)
 }
 
 // slice computes the slice v[i:j:k] and returns ptr, len, and cap of result.
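
In effect, the new callDeferStack path above lowers a qualifying
"defer f(args)" roughly as follows (an illustrative pseudocode sketch,
not literal compiler output):

	d := stack temp of type deferstruct(siz) // layout built in reflect.go
	d.siz = siz                              // field 0: arg+result bytes
	d.fn = f                                 // field 5: the closure
	copy args into d.args                    // starting at FieldOff(8)
	deferprocStack(&d)                       // runtime fills sp, pc, link, heap
	...
	deferreturn()                            // at exit: calls f using d.args
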
diff --git a/src/runtime/mgcmark.go b/src/runtime/mgcmark.go
index efa007a..2c63724 100644
--- a/src/runtime/mgcmark.go
+++ b/src/runtime/mgcmark.go
@@ -712,15 +712,31 @@
 
 	// Find additional pointers that point into the stack from the heap.
 	// Currently this includes defers and panics. See also function copystack.
+
+	// Find and trace all defer arguments.
 	tracebackdefers(gp, scanframe, nil)
+
+	// Find and trace other pointers in defer records.
 	for d := gp._defer; d != nil; d = d.link {
-		// tracebackdefers above does not scan the func value, which could
-		// be a stack allocated closure. See issue 30453.
 		if d.fn != nil {
+			// tracebackdefers above does not scan the func value, which could
+			// be a stack allocated closure. See issue 30453.
 			scanblock(uintptr(unsafe.Pointer(&d.fn)), sys.PtrSize, &oneptrmask[0], gcw, &state)
 		}
+		if d.link != nil {
+			// The link field of a stack-allocated defer record might point
+			// to a heap-allocated defer record. Keep that heap record live.
+			scanblock(uintptr(unsafe.Pointer(&d.link)), sys.PtrSize, &oneptrmask[0], gcw, &state)
+		}
+		// Retain defer records themselves.
+		// Defer records might not be reachable from the G through regular heap
+		// tracing because the defer linked list might weave between the stack and the heap.
+		if d.heap {
+			scanblock(uintptr(unsafe.Pointer(&d)), sys.PtrSize, &oneptrmask[0], gcw, &state)
+		}
 	}
 	if gp._panic != nil {
+		// Panics are always stack allocated.
 		state.putPtr(uintptr(unsafe.Pointer(gp._panic)))
 	}
 
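Because the _defer fields are deliberately given uintptr type (see
deferstruct in reflect.go above), neither ordinary frame scanning nor
heap tracing follows them, and the defer chain may now weave between
the stack and the heap, e.g.:

	gp._defer -> d1 (stack) -> d2 (heap) -> d3 (stack) -> nil

Here d2 is reachable only through d1.link, which lives on the stack,
so the explicit scan of d.link above keeps d2 live, and the d.heap
case marks such heap records directly. Stack records need no marking;
they are part of their goroutine's frames.
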
diff --git a/src/runtime/panic.go b/src/runtime/panic.go
index f39a4bc..ce26eb5 100644
--- a/src/runtime/panic.go
+++ b/src/runtime/panic.go
@@ -228,6 +228,46 @@
 	// been set and must not be clobbered.
 }
 
+// deferprocStack queues a new deferred function with a defer record on the stack.
+// The defer record must have its siz and fn fields initialized.
+// All other fields can contain junk.
+// The defer record must be immediately followed in memory by
+// the arguments of the defer.
+// Nosplit because the arguments on the stack won't be scanned
+// until the defer record is spliced into the gp._defer list.
+//go:nosplit
+func deferprocStack(d *_defer) {
+	gp := getg()
+	if gp.m.curg != gp {
+		// go code on the system stack can't defer
+		throw("defer on system stack")
+	}
+	// siz and fn are already set.
+	// The other fields are junk on entry to deferprocStack and
+	// are initialized here.
+	d.started = false
+	d.heap = false
+	d.sp = getcallersp()
+	d.pc = getcallerpc()
+	// The lines below implement:
+	//   d.panic = nil
+	//   d.link = gp._defer
+	//   gp._defer = d
+	// But without write barriers. The first two are writes to
+	// the stack so they don't need a write barrier, and furthermore
+	// are to uninitialized memory, so they must not use a write barrier.
+	// The third write does not require a write barrier because we
+	// explicitly mark all the defer structures, so we don't need to
+	// keep track of pointers to them with a write barrier.
+	*(*uintptr)(unsafe.Pointer(&d._panic)) = 0
+	*(*uintptr)(unsafe.Pointer(&d.link)) = uintptr(unsafe.Pointer(gp._defer))
+	*(*uintptr)(unsafe.Pointer(&gp._defer)) = uintptr(unsafe.Pointer(d))
+
+	return0()
+	// No code can go here - the C return register has
+	// been set and must not be clobbered.
+}
+
 // Small malloc size classes >= 16 are the multiples of 16: 16, 32, 48, 64, 80, 96, 112, 128, 144, ...
 // Each P holds a pool for defers with small arg sizes.
 // Assign defer allocations to pools by rounding to 16, to match malloc size classes.
@@ -349,6 +389,7 @@
 		}
 	}
 	d.siz = siz
+	d.heap = true
 	d.link = gp._defer
 	gp._defer = d
 	return d
@@ -368,6 +409,9 @@
 	if d.fn != nil {
 		freedeferfn()
 	}
+	if !d.heap {
+		return
+	}
 	sc := deferclass(uintptr(d.siz))
 	if sc >= uintptr(len(p{}.deferpool)) {
 		return
diff --git a/src/runtime/runtime2.go b/src/runtime/runtime2.go
index bc5b482..16c02cd 100644
--- a/src/runtime/runtime2.go
+++ b/src/runtime/runtime2.go
@@ -775,9 +775,16 @@
 
 // A _defer holds an entry on the list of deferred calls.
 // If you add a field here, add code to clear it in freedefer.
+// This struct must match the code in cmd/compile/internal/gc/reflect.go:deferstruct
+// and cmd/compile/internal/gc/ssa.go:(*state).call.
+// Some defers will be allocated on the stack and some on the heap.
+// All defers are logically part of the stack, so write barriers to
+// initialize them are not required. All defers must be manually scanned,
+// and for heap defers, marked.
 type _defer struct {
-	siz     int32
+	siz     int32 // includes both arguments and results
 	started bool
+	heap    bool
 	sp      uintptr // sp at time of defer
 	pc      uintptr
 	fn      *funcval
diff --git a/src/runtime/stack.go b/src/runtime/stack.go
index d5d09ba..c32d9e8 100644
--- a/src/runtime/stack.go
+++ b/src/runtime/stack.go
@@ -719,16 +719,21 @@
 }
 
 func adjustdefers(gp *g, adjinfo *adjustinfo) {
-	// Adjust defer argument blocks the same way we adjust active stack frames.
-	tracebackdefers(gp, adjustframe, noescape(unsafe.Pointer(adjinfo)))
-
 	// Adjust pointers in the Defer structs.
-	// Defer structs themselves are never on the stack.
+	// We need to do this first because we need to adjust the
+	// defer.link fields so we always work on the new stack.
+	adjustpointer(adjinfo, unsafe.Pointer(&gp._defer))
 	for d := gp._defer; d != nil; d = d.link {
 		adjustpointer(adjinfo, unsafe.Pointer(&d.fn))
 		adjustpointer(adjinfo, unsafe.Pointer(&d.sp))
 		adjustpointer(adjinfo, unsafe.Pointer(&d._panic))
+		adjustpointer(adjinfo, unsafe.Pointer(&d.link))
 	}
+
+	// Adjust defer argument blocks the same way we adjust active stack frames.
+	// Note: this code is after the loop above, so that if a defer record is
+	// stack allocated, we work on the copy in the new stack.
+	tracebackdefers(gp, adjustframe, noescape(unsafe.Pointer(adjinfo)))
 }
 
 func adjustpanics(gp *g, adjinfo *adjustinfo) {
diff --git a/src/runtime/stack_test.go b/src/runtime/stack_test.go
index df73b3a..143d3a9 100644
--- a/src/runtime/stack_test.go
+++ b/src/runtime/stack_test.go
@@ -799,3 +799,58 @@
 		t.Errorf("output:\n%s\n\nwant no output", output)
 	}
 }
+
+func TestDeferHeapAndStack(t *testing.T) {
+	P := 4     // processors
+	N := 10000 // iterations
+	D := 200   // stack depth
+
+	if testing.Short() {
+		P /= 2
+		N /= 10
+		D /= 10
+	}
+	c := make(chan bool)
+	for p := 0; p < P; p++ {
+		go func() {
+			for i := 0; i < N; i++ {
+				if deferHeapAndStack(D) != 2*D {
+					panic("bad result")
+				}
+			}
+			c <- true
+		}()
+	}
+	for p := 0; p < P; p++ {
+		<-c
+	}
+}
+
+// deferHeapAndStack(n) computes 2*n
+func deferHeapAndStack(n int) (r int) {
+	if n == 0 {
+		return 0
+	}
+	if n%2 == 0 {
+		// heap-allocated defers
+		for i := 0; i < 2; i++ {
+			defer func() {
+				r++
+			}()
+		}
+	} else {
+		// stack-allocated defers
+		defer func() {
+			r++
+		}()
+		defer func() {
+			r++
+		}()
+	}
+	r = deferHeapAndStack(n - 1)
+	escapeMe(new([1024]byte)) // force some GCs
+	return
+}
+
+// Pass a value to escapeMe to force it to escape.
+var escapeMe = func(x interface{}) {}
diff --git a/src/runtime/stubs.go b/src/runtime/stubs.go
index 18e64dd..26aaf22 100644
--- a/src/runtime/stubs.go
+++ b/src/runtime/stubs.go
@@ -248,9 +248,6 @@
 //go:noescape
 func asmcgocall(fn, arg unsafe.Pointer) int32
 
-// argp used in Defer structs when there is no argp.
-const _NoArgs = ^uintptr(0)
-
 func morestack()
 func morestack_noctxt()
 func rt0_go()
diff --git a/src/runtime/syscall_windows.go b/src/runtime/syscall_windows.go
index 36ad751..722a73d 100644
--- a/src/runtime/syscall_windows.go
+++ b/src/runtime/syscall_windows.go
@@ -112,7 +112,6 @@
 //go:nosplit
 func syscall_loadsystemlibrary(filename *uint16, absoluteFilepath *uint16) (handle, err uintptr) {
 	lockOSThread()
-	defer unlockOSThread()
 	c := &getg().m.syscall
 
 	if useLoadLibraryEx {
@@ -135,6 +134,7 @@
 	if handle == 0 {
 		err = c.err
 	}
+	unlockOSThread() // not deferred after the lockOSThread above, to save stack frame size.
 	return
 }
 
diff --git a/src/runtime/traceback.go b/src/runtime/traceback.go
index d817018..ef48c9f 100644
--- a/src/runtime/traceback.go
+++ b/src/runtime/traceback.go
@@ -148,11 +148,6 @@
 	waspanic := false
 	cgoCtxt := gp.cgoCtxt
 	printing := pcbuf == nil && callback == nil
-	_defer := gp._defer
-
-	for _defer != nil && _defer.sp == _NoArgs {
-		_defer = _defer.link
-	}
 
 	// If the PC is zero, it's likely a nil function call.
 	// Start in the caller's frame.
@@ -319,15 +314,14 @@
 		// In the latter case, use a deferreturn call site as the continuation pc.
 		frame.continpc = frame.pc
 		if waspanic {
-			// We match up defers with frames using the SP.
-			// However, if the function has an empty stack
-			// frame, then it's possible (on LR machines)
-			// for multiple call frames to have the same
-			// SP. But, since a function with no frame
-			// can't push a defer, the defer can't belong
-			// to that frame.
-			if _defer != nil && _defer.sp == frame.sp && frame.sp != frame.fp {
+			if frame.fn.deferreturn != 0 {
 				frame.continpc = frame.fn.entry + uintptr(frame.fn.deferreturn) + 1
+				// Note: this may keep return variables alive longer than
+				// strictly necessary, as we are using "function has a defer statement"
+				// as a proxy for "function actually deferred something". It seems
+				// to be a minor drawback. (We used to actually look through the
+				// gp._defer for a defer corresponding to this function, but that
+				// is hard to do with defer records on the stack during a stack copy.)
 				// Note: the +1 is to offset the -1 that
 				// stack.go:getStackMap does to back up a return
 				// address, to make sure the pc is in the CALL instruction.
@@ -336,11 +330,6 @@
 			}
 		}
 
-		// Unwind our local defer stack past this frame.
-		for _defer != nil && ((_defer.sp == frame.sp && frame.sp != frame.fp) || _defer.sp == _NoArgs) {
-			_defer = _defer.link
-		}
-
 		if callback != nil {
 			if !callback((*stkframe)(noescape(unsafe.Pointer(&frame))), v) {
 				return n
@@ -510,13 +499,6 @@
 		n = nprint
 	}
 
-	// If callback != nil, we're being called to gather stack information during
-	// garbage collection or stack growth. In that context, require that we used
-	// up the entire defer stack. If not, then there is a bug somewhere and the
-	// garbage collection or stack growth may not have seen the correct picture
-	// of the stack. Crash now instead of silently executing the garbage collection
-	// or stack copy incorrectly and setting up for a mysterious crash later.
-	//
 	// Note that panic != nil is okay here: there can be leftover panics,
 	// because the defers on the panic stack do not nest in frame order as
 	// they do on the defer stack. If you have:
@@ -557,16 +539,6 @@
 	// At other times, such as when gathering a stack for a profiling signal
 	// or when printing a traceback during a crash, everything may not be
 	// stopped nicely, and the stack walk may not be able to complete.
-	// It's okay in those situations not to use up the entire defer stack:
-	// incomplete information then is still better than nothing.
-	if callback != nil && n < max && _defer != nil {
-		print("runtime: g", gp.goid, ": leftover defer sp=", hex(_defer.sp), " pc=", hex(_defer.pc), "\n")
-		for _defer = gp._defer; _defer != nil; _defer = _defer.link {
-			print("\tdefer ", _defer, " sp=", hex(_defer.sp), " pc=", hex(_defer.pc), "\n")
-		}
-		throw("traceback has leftover defers")
-	}
-
 	if callback != nil && n < max && frame.sp != gp.stktopsp {
 		print("runtime: g", gp.goid, ": frame.sp=", hex(frame.sp), " top=", hex(gp.stktopsp), "\n")
 		print("\tstack=[", hex(gp.stack.lo), "-", hex(gp.stack.hi), "] n=", n, " max=", max, "\n")
diff --git a/test/codegen/stack.go b/test/codegen/stack.go
index ca37622..37d378a 100644
--- a/test/codegen/stack.go
+++ b/test/codegen/stack.go
@@ -109,3 +109,8 @@
 	_ = i << s   // panicShift
 	_ = i / j    // panicDivide
 }
+
+func Defer() {
+	// amd64:`CALL\truntime\.deferprocStack`
+	defer func() {}()
+}
diff --git a/test/live.go b/test/live.go
index e7134ec..ec51193 100644
--- a/test/live.go
+++ b/test/live.go
@@ -687,7 +687,7 @@
 // In particular, at printint r must be live.
 func f41(p, q *int) (r *int) { // ERROR "live at entry to f41: p q$"
 	r = p
-	defer func() { // ERROR "live at call to deferproc: q r$" "live at call to deferreturn: r$"
+	defer func() { // ERROR "live at call to deferprocStack: q r$" "live at call to deferreturn: r$"
 		recover()
 	}()
 	printint(0) // ERROR "live at call to printint: q r$"