cmd/internal/obj/arm64: save LR and SP in one instruction for small frames

When we create a thread with signals blocked. But glibc's
pthread_sigmask doesn't really allow us to block SIGSETXID. So we
may get a signal early on before the signal stack is set. If we
get a signal on the current stack, it will clobber anything below
the SP. This CL makes it to save LR and decrement SP in a single
MOVD.W instruction for small frames, so we don't write below the
SP.

We used to use a single MOVD.W instruction before CL 379075.
CL 379075 changed to use an STP instruction to save the LR and FP,
then decrementing the SP. This CL changes it back, just this part
(epilogues and large frame prologues are unchanged). For small
frames, it is the same number of instructions either way.

This decreases the size of a "small" frame from 0x1f0 to 0xf0.
For frame sizes in between, it could benefit from using an
STP instruction instead of using the prologue for the "large"
frame case. We don't bother it for now as this is a stop-gap
solution anyway.

This only addresses the issue with small frames. Luckily, all
functions from thread entry to setting up the signal stack have
samll frames.

Other possible ideas:
- Expand the unwind info metadata, separate SP delta and the
  location of the return address, so we can express "SP is
  decremented but the return address is in the LR register". Then
  we can always create the frame first then write the LR, without
  writing anything below the SP (except the frame pointer at SP-8,
  which is minor because it doesn't really affect program
  execution).
- Set up the signal stack immediately in mstart in assembly.

For Go 1.19 we do this simple fix. We plan to do the metadata fix
in Go 1.20 ( #53609 ).

Other LR architectures are addressed in CL 413428.

Fix #53374.

Change-Id: I9d6582ab14ccb06ac61ad43852943d9555e22ae5
Reviewed-on: https://go-review.googlesource.com/c/go/+/412474
Run-TryBot: Cherry Mui <cherryyz@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Austin Clements <austin@google.com>
Reviewed-by: Eric Fang <eric.fang@arm.com>
diff --git a/misc/cgo/test/cgo_linux_test.go b/misc/cgo/test/cgo_linux_test.go
index a9746b5..7c4628c 100644
--- a/misc/cgo/test/cgo_linux_test.go
+++ b/misc/cgo/test/cgo_linux_test.go
@@ -15,6 +15,14 @@
 	}
 	testSetgid(t)
 }
+
+func TestSetgidStress(t *testing.T) {
+	if runtime.GOOS == "android" {
+		t.Skip("unsupported on Android")
+	}
+	testSetgidStress(t)
+}
+
 func Test1435(t *testing.T)    { test1435(t) }
 func Test6997(t *testing.T)    { test6997(t) }
 func TestBuildID(t *testing.T) { testBuildID(t) }
diff --git a/misc/cgo/test/setgid2_linux.go b/misc/cgo/test/setgid2_linux.go
new file mode 100644
index 0000000..d239893
--- /dev/null
+++ b/misc/cgo/test/setgid2_linux.go
@@ -0,0 +1,35 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Stress test setgid and thread creation. A thread
+// can get a SIGSETXID signal early on at thread
+// initialization, causing crash. See issue 53374.
+
+package cgotest
+
+/*
+#include <sys/types.h>
+#include <unistd.h>
+*/
+import "C"
+
+import (
+	"runtime"
+	"testing"
+)
+
+func testSetgidStress(t *testing.T) {
+	const N = 1000
+	ch := make(chan int, N)
+	for i := 0; i < N; i++ {
+		go func() {
+			C.setgid(0)
+			ch <- 1
+			runtime.LockOSThread() // so every goroutine uses a new thread
+		}()
+	}
+	for i := 0; i < N; i++ {
+		<-ch
+	}
+}
diff --git a/src/cmd/internal/obj/arm64/obj7.go b/src/cmd/internal/obj/arm64/obj7.go
index 1f2625d..83ae64a 100644
--- a/src/cmd/internal/obj/arm64/obj7.go
+++ b/src/cmd/internal/obj/arm64/obj7.go
@@ -609,17 +609,17 @@
 			var prologueEnd *obj.Prog
 
 			aoffset := c.autosize
-			if aoffset > 0x1f0 {
-				// LDP offset variant range is -512 to 504, SP should be 16-byte aligned,
-				// so the maximum aoffset value is 496.
-				aoffset = 0x1f0
+			if aoffset > 0xf0 {
+				// MOVD.W offset variant range is -0x100 to 0xf8, SP should be 16-byte aligned.
+				// so the maximum aoffset value is 0xf0.
+				aoffset = 0xf0
 			}
 
 			// Frame is non-empty. Make sure to save link register, even if
 			// it is a leaf function, so that traceback works.
 			q = p
 			if c.autosize > aoffset {
-				// Frame size is too large for a STP instruction. Store the frame pointer
+				// Frame size is too large for a MOVD.W instruction. Store the frame pointer
 				// register and link register before decrementing SP, so if a signal comes
 				// during the execution of the function prologue, the traceback code will
 				// not see a half-updated stack frame.
@@ -679,50 +679,37 @@
 					q1.To.Offset = -8
 				}
 			} else {
-				// small frame, save FP and LR with one STP instruction, then update SP.
-				// Store first, so if a signal comes during the execution of the function
-				// prologue, the traceback code will not see a half-updated stack frame.
-				// STP (R29, R30), -aoffset-8(RSP)
+				// small frame, update SP and save LR in a single MOVD.W instruction.
+				// So if a signal comes during the execution of the function prologue,
+				// the traceback code will not see a half-updated stack frame.
+				// Also, on Linux, in a cgo binary we may get a SIGSETXID signal
+				// early on before the signal stack is set, as glibc doesn't allow
+				// us to block SIGSETXID. So it is important that we don't write below
+				// the SP until the signal stack is set.
+				// Luckily, all the functions from thread entry to setting the signal
+				// stack have small frames.
 				q1 = obj.Appendp(q, c.newprog)
-				q1.As = ASTP
+				q1.As = AMOVD
 				q1.Pos = p.Pos
-				q1.From.Type = obj.TYPE_REGREG
-				q1.From.Reg = REGFP
-				q1.From.Offset = REGLINK
+				q1.From.Type = obj.TYPE_REG
+				q1.From.Reg = REGLINK
 				q1.To.Type = obj.TYPE_MEM
-				q1.To.Offset = int64(-aoffset - 8)
-				q1.To.Reg = REGSP
-
-				prologueEnd = q1
-
-				q1 = c.ctxt.StartUnsafePoint(q1, c.newprog)
-				// This instruction is not async preemptible, see the above comment.
-				// SUB $aoffset, RSP, RSP
-				q1 = obj.Appendp(q1, c.newprog)
-				q1.Pos = p.Pos
-				q1.As = ASUB
-				q1.From.Type = obj.TYPE_CONST
-				q1.From.Offset = int64(aoffset)
-				q1.Reg = REGSP
-				q1.To.Type = obj.TYPE_REG
+				q1.Scond = C_XPRE
+				q1.To.Offset = int64(-aoffset)
 				q1.To.Reg = REGSP
 				q1.Spadj = aoffset
 
-				q1 = c.ctxt.EndUnsafePoint(q1, c.newprog, -1)
+				prologueEnd = q1
 
-				if buildcfg.GOOS == "ios" {
-					// See the above comment.
-					// STP (R29, R30), -8(RSP)
-					q1 = obj.Appendp(q1, c.newprog)
-					q1.As = ASTP
-					q1.Pos = p.Pos
-					q1.From.Type = obj.TYPE_REGREG
-					q1.From.Reg = REGFP
-					q1.From.Offset = REGLINK
-					q1.To.Type = obj.TYPE_MEM
-					q1.To.Offset = int64(-8)
-					q1.To.Reg = REGSP
-				}
+				// Frame pointer.
+				q1 = obj.Appendp(q1, c.newprog)
+				q1.Pos = p.Pos
+				q1.As = AMOVD
+				q1.From.Type = obj.TYPE_REG
+				q1.From.Reg = REGFP
+				q1.To.Type = obj.TYPE_MEM
+				q1.To.Reg = REGSP
+				q1.To.Offset = -8
 			}
 
 			prologueEnd.Pos = prologueEnd.Pos.WithXlogue(src.PosPrologueEnd)