cmd/internal/obj/x86: use push/pop instead of mov to store/load FP

This CL changes how the x86 compiler stores and loads the frame pointer
on each function prologue and epilogue, with the goal to reduce the
final binary size without affecting performance.

The compiler is currently using MOV instructions to load and store BP,
which can take from 5 to 8 bytes each.

This CL changes this approach so it emits PUSH/POP instructions instead,
which always take only 1 byte each (when operating with BP). It can also
avoid using the SUBQ/ADDQ to grow the stack for functions that have
frame pointer but does not have local variables.

On Windows, this CL reduces the go toolchain size from 15,697,920 bytes
to 15,584,768 bytes, a reduction of 0.7%.

Example of epilog and prologue for a function with 0x10 bytes of
local variables:

Before

===
 SUBQ    $0x18, SP
 MOVQ    BP, 0x10(SP)
 LEAQ    0x10(SP), BP

 ... function body ...

 MOVQ    0x10(SP), BP
 ADDQ    $0x18, SP
 RET
===

After

===
  PUSHQ   BP
  LEAQ    0(SP), BP
  SUBQ    $0x10, SP

  ... function body ...

  MOVQ    ADDQ $0x10, SP
  POPQ    BP
  RET
===

Updates #6853

Change-Id: Ice9e14bbf8dff083c5f69feb97e9a764c3ca7785
Reviewed-on: https://go-review.googlesource.com/c/go/+/462300
Reviewed-by: Keith Randall <khr@google.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
Run-TryBot: Quim Muntal <quimmuntal@gmail.com>
diff --git a/src/cmd/internal/obj/x86/obj6.go b/src/cmd/internal/obj/x86/obj6.go
index aa4cc22..33c02d5 100644
--- a/src/cmd/internal/obj/x86/obj6.go
+++ b/src/cmd/internal/obj/x86/obj6.go
@@ -684,37 +684,13 @@
 		p, regg = loadG(ctxt, cursym, p, newprog)
 	}
 
-	// Delve debugger would like the next instruction to be noted as the end of the function prologue.
-	// TODO: are there other cases (e.g., wrapper functions) that need marking?
-	markedPrologue := false
-
-	if autoffset != 0 {
-		if autoffset%int32(ctxt.Arch.RegSize) != 0 {
-			ctxt.Diag("unaligned stack size %d", autoffset)
-		}
-		p = obj.Appendp(p, newprog)
-		p.As = AADJSP
-		p.From.Type = obj.TYPE_CONST
-		p.From.Offset = int64(autoffset)
-		p.Spadj = autoffset
-		p.Pos = p.Pos.WithXlogue(src.PosPrologueEnd)
-		markedPrologue = true
-	}
-
 	if bpsize > 0 {
 		// Save caller's BP
 		p = obj.Appendp(p, newprog)
 
-		p.As = AMOVQ
+		p.As = APUSHQ
 		p.From.Type = obj.TYPE_REG
 		p.From.Reg = REG_BP
-		p.To.Type = obj.TYPE_MEM
-		p.To.Reg = REG_SP
-		p.To.Scale = 1
-		p.To.Offset = int64(autoffset) - int64(bpsize)
-		if !markedPrologue {
-			p.Pos = p.Pos.WithXlogue(src.PosPrologueEnd)
-		}
 
 		// Move current frame to BP
 		p = obj.Appendp(p, newprog)
@@ -723,11 +699,32 @@
 		p.From.Type = obj.TYPE_MEM
 		p.From.Reg = REG_SP
 		p.From.Scale = 1
-		p.From.Offset = int64(autoffset) - int64(bpsize)
+		p.From.Offset = 0
 		p.To.Type = obj.TYPE_REG
 		p.To.Reg = REG_BP
 	}
 
+	if autoffset%int32(ctxt.Arch.RegSize) != 0 {
+		ctxt.Diag("unaligned stack size %d", autoffset)
+	}
+
+	// localoffset is autoffset discounting the frame pointer,
+	// which has already been allocated in the stack.
+	localoffset := autoffset - int32(bpsize)
+	if localoffset != 0 {
+		p = obj.Appendp(p, newprog)
+		p.As = AADJSP
+		p.From.Type = obj.TYPE_CONST
+		p.From.Offset = int64(localoffset)
+		p.Spadj = localoffset
+	}
+
+	// Delve debugger would like the next instruction to be noted as the end of the function prologue.
+	// TODO: are there other cases (e.g., wrapper functions) that need marking?
+	if autoffset != 0 {
+		p.Pos = p.Pos.WithXlogue(src.PosPrologueEnd)
+	}
+
 	if cursym.Func().Text.From.Sym.Wrapper() {
 		// if g._panic != nil && g._panic.argp == FP {
 		//   g._panic.argp = bottom-of-frame
@@ -933,24 +930,23 @@
 		if autoffset != 0 {
 			to := p.To // Keep To attached to RET for retjmp below
 			p.To = obj.Addr{}
-			if bpsize > 0 {
-				// Restore caller's BP
-				p.As = AMOVQ
-
-				p.From.Type = obj.TYPE_MEM
-				p.From.Reg = REG_SP
-				p.From.Scale = 1
-				p.From.Offset = int64(autoffset) - int64(bpsize)
-				p.To.Type = obj.TYPE_REG
-				p.To.Reg = REG_BP
+			if localoffset != 0 {
+				p.As = AADJSP
+				p.From.Type = obj.TYPE_CONST
+				p.From.Offset = int64(-localoffset)
+				p.Spadj = -localoffset
 				p = obj.Appendp(p, newprog)
 			}
 
-			p.As = AADJSP
-			p.From.Type = obj.TYPE_CONST
-			p.From.Offset = int64(-autoffset)
-			p.Spadj = -autoffset
-			p = obj.Appendp(p, newprog)
+			if bpsize > 0 {
+				// Restore caller's BP
+				p.As = APOPQ
+				p.To.Type = obj.TYPE_REG
+				p.To.Reg = REG_BP
+				p.Spadj = -int32(bpsize)
+				p = obj.Appendp(p, newprog)
+			}
+
 			p.As = obj.ARET
 			p.To = to