[dev.link] cmd/link: emit ELF relocations in mmap

Currently, ELF relocations are generated sequentially in the heap
and flushed to output file periodically. In fact, in some cases,
the output size of the relocation records can be easily computed,
as a relocation entry has fixed size. We only need to count the
number of relocation records to compute the size.

Once the size is computed, we can mmap the output with the proper
size, and directly write relocation records in the mapped memory.
It also opens the possibility of writing relocations in parallel
(not done in this CL).

Note: on some architectures, a Go relocation may turn into
multiple ELF relocations, which makes size calculation harder.
This CL does not handle those cases, and it still writes
sequentially in the heap there.

Linking cmd/compile with external linking,

name          old time/op    new time/op    delta
Asmb2            190ms ± 2%     141ms ± 4%  -25.74%  (p=0.000 n=10+10)

name          old alloc/op   new alloc/op   delta
Asmb2_GC        66.8MB ± 0%     8.2MB ± 0%  -87.79%  (p=0.008 n=5+5)

name          old live-B     new live-B     delta
Asmb2_GC         66.9M ± 0%     55.2M ± 0%  -17.58%  (p=0.008 n=5+5)

Change-Id: If7056bbe909dc90033eef6b9c4891fcca310602c
Reviewed-on: https://go-review.googlesource.com/c/go/+/240399
Run-TryBot: Cherry Zhang <cherryyz@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Than McIntosh <thanm@google.com>
diff --git a/src/cmd/link/internal/amd64/obj.go b/src/cmd/link/internal/amd64/obj.go
index 4c52574..fcc2499 100644
--- a/src/cmd/link/internal/amd64/obj.go
+++ b/src/cmd/link/internal/amd64/obj.go
@@ -61,6 +61,7 @@
 		Archreloc:        archreloc,
 		Archrelocvariant: archrelocvariant,
 		Elfreloc1:        elfreloc1,
+		ElfrelocSize:     24,
 		Elfsetupplt:      elfsetupplt,
 		Gentext:          gentext,
 		Machoreloc1:      machoreloc1,
diff --git a/src/cmd/link/internal/arm/obj.go b/src/cmd/link/internal/arm/obj.go
index 1a57298..f25f735 100644
--- a/src/cmd/link/internal/arm/obj.go
+++ b/src/cmd/link/internal/arm/obj.go
@@ -54,6 +54,7 @@
 		Archrelocvariant: archrelocvariant,
 		Trampoline:       trampoline,
 		Elfreloc1:        elfreloc1,
+		ElfrelocSize:     8,
 		Elfsetupplt:      elfsetupplt,
 		Gentext:          gentext,
 		Machoreloc1:      machoreloc1,
diff --git a/src/cmd/link/internal/ld/data.go b/src/cmd/link/internal/ld/data.go
index be1af4b..7dc2c46 100644
--- a/src/cmd/link/internal/ld/data.go
+++ b/src/cmd/link/internal/ld/data.go
@@ -47,6 +47,7 @@
 	"strconv"
 	"strings"
 	"sync"
+	"sync/atomic"
 )
 
 // isRuntimeDepPkg reports whether pkg is the runtime package or its dependency
@@ -581,6 +582,7 @@
 	if len(extRelocs) != 0 {
 		st.finalizeExtRelocSlice(extRelocs)
 		ldr.SetExtRelocs(s, extRelocs)
+		atomic.AddUint32(&ldr.SymSect(s).Relcount, uint32(len(extRelocs)))
 	}
 }
 
diff --git a/src/cmd/link/internal/ld/elf.go b/src/cmd/link/internal/ld/elf.go
index 80612c4..bb4e171 100644
--- a/src/cmd/link/internal/ld/elf.go
+++ b/src/cmd/link/internal/ld/elf.go
@@ -1397,11 +1397,29 @@
 }
 
 func elfEmitReloc(ctxt *Link) {
-
 	for ctxt.Out.Offset()&7 != 0 {
 		ctxt.Out.Write8(0)
 	}
 
+	// Precompute the size needed for the reloc records if we can
+	// Mmap the output buffer with the proper size.
+	//
+	// TODO: on some architectures, one Go relocation may turn to
+	// multiple ELF relocations, which makes the size not fixed.
+	// Handle this case better. Maybe increment the counter by the
+	// number of external reloc records in relocsym.
+	var sz, filesz int64
+	if thearch.ElfrelocSize != 0 {
+		for _, seg := range Segments {
+			for _, sect := range seg.Sections {
+				sz += int64(thearch.ElfrelocSize * sect.Relcount)
+			}
+		}
+		filesz = ctxt.Out.Offset() + sz
+		ctxt.Out.Mmap(uint64(filesz))
+	}
+
+	// Now emits the records.
 	for _, sect := range Segtext.Sections {
 		if sect.Name == ".text" {
 			elfrelocsect(ctxt, sect, ctxt.Textp)
@@ -1428,6 +1446,11 @@
 		}
 		elfrelocsect(ctxt, sect, si.syms)
 	}
+
+	// sanity check
+	if thearch.ElfrelocSize != 0 && ctxt.Out.Offset() != filesz {
+		panic("elfEmitReloc: size mismatch")
+	}
 }
 
 func addgonote(ctxt *Link, sectionName string, tag uint32, desc []byte) {
diff --git a/src/cmd/link/internal/ld/lib.go b/src/cmd/link/internal/ld/lib.go
index fbf72f6..d160139 100644
--- a/src/cmd/link/internal/ld/lib.go
+++ b/src/cmd/link/internal/ld/lib.go
@@ -236,12 +236,13 @@
 	Asmb  func(*Link, *loader.Loader)
 	Asmb2 func(*Link, *loader.Loader)
 
-	Elfreloc1   func(*Link, *loader.Loader, loader.Sym, loader.ExtRelocView, int64) bool
-	Elfsetupplt func(ctxt *Link, plt, gotplt *loader.SymbolBuilder, dynamic loader.Sym)
-	Gentext     func(*Link, *loader.Loader)
-	Machoreloc1 func(*sys.Arch, *OutBuf, *loader.Loader, loader.Sym, loader.ExtRelocView, int64) bool
-	PEreloc1    func(*sys.Arch, *OutBuf, *loader.Loader, loader.Sym, loader.ExtRelocView, int64) bool
-	Xcoffreloc1 func(*sys.Arch, *OutBuf, *loader.Loader, loader.Sym, loader.ExtRelocView, int64) bool
+	Elfreloc1    func(*Link, *loader.Loader, loader.Sym, loader.ExtRelocView, int64) bool
+	ElfrelocSize uint32 // size of an ELF relocation record, must match Elfreloc1. Currently this can be 0, meaning that the size is not fixed (a Go reloc may turn into multiple ELF reloc).
+	Elfsetupplt  func(ctxt *Link, plt, gotplt *loader.SymbolBuilder, dynamic loader.Sym)
+	Gentext      func(*Link, *loader.Loader)
+	Machoreloc1  func(*sys.Arch, *OutBuf, *loader.Loader, loader.Sym, loader.ExtRelocView, int64) bool
+	PEreloc1     func(*sys.Arch, *OutBuf, *loader.Loader, loader.Sym, loader.ExtRelocView, int64) bool
+	Xcoffreloc1  func(*sys.Arch, *OutBuf, *loader.Loader, loader.Sym, loader.ExtRelocView, int64) bool
 
 	// TLSIEtoLE converts a TLS Initial Executable relocation to
 	// a TLS Local Executable relocation.
@@ -310,6 +311,8 @@
 	Segrelrodata sym.Segment
 	Segdata      sym.Segment
 	Segdwarf     sym.Segment
+
+	Segments = []*sym.Segment{&Segtext, &Segrodata, &Segrelrodata, &Segdata, &Segdwarf}
 )
 
 const pkgdef = "__.PKGDEF"
diff --git a/src/cmd/link/internal/ld/outbuf.go b/src/cmd/link/internal/ld/outbuf.go
index b474067..f017828 100644
--- a/src/cmd/link/internal/ld/outbuf.go
+++ b/src/cmd/link/internal/ld/outbuf.go
@@ -149,13 +149,10 @@
 	bufLen := len(out.buf)
 	heapLen := len(out.heap)
 	total := uint64(bufLen + heapLen)
-	out.munmap()
 	if heapLen != 0 {
-		if err := out.Mmap(total); err != nil {
+		if err := out.Mmap(total); err != nil { // Mmap will copy out.heap over to out.buf
 			panic(err)
 		}
-		copy(out.buf[bufLen:], out.heap[:heapLen])
-		out.heap = out.heap[:0]
 	}
 	return true
 }
diff --git a/src/cmd/link/internal/ld/outbuf_mmap.go b/src/cmd/link/internal/ld/outbuf_mmap.go
index 7280027..53b14b09 100644
--- a/src/cmd/link/internal/ld/outbuf_mmap.go
+++ b/src/cmd/link/internal/ld/outbuf_mmap.go
@@ -10,7 +10,15 @@
 	"syscall"
 )
 
+// Mmap maps the output file with the given size. It unmaps the old mapping
+// if it is already mapped. It also flushes any in-heap data to the new
+// mapping.
 func (out *OutBuf) Mmap(filesize uint64) (err error) {
+	oldlen := len(out.buf)
+	if oldlen != 0 {
+		out.munmap()
+	}
+
 	for {
 		if err = out.fallocate(filesize); err != syscall.EINTR {
 			break
@@ -29,7 +37,17 @@
 		Exitf("resize output file failed: %v", err)
 	}
 	out.buf, err = syscall.Mmap(int(out.f.Fd()), 0, int(filesize), syscall.PROT_READ|syscall.PROT_WRITE, syscall.MAP_SHARED|syscall.MAP_FILE)
-	return err
+	if err != nil {
+		return err
+	}
+
+	// copy heap to new mapping
+	if uint64(oldlen+len(out.heap)) > filesize {
+		panic("mmap size too small")
+	}
+	copy(out.buf[oldlen:], out.heap)
+	out.heap = out.heap[:0]
+	return nil
 }
 
 func (out *OutBuf) munmap() {
diff --git a/src/cmd/link/internal/ld/outbuf_nommap.go b/src/cmd/link/internal/ld/outbuf_nommap.go
index bad01dc..6b40253 100644
--- a/src/cmd/link/internal/ld/outbuf_nommap.go
+++ b/src/cmd/link/internal/ld/outbuf_nommap.go
@@ -6,9 +6,16 @@
 
 package ld
 
+// Mmap allocates an in-heap output buffer with the given size. It copies
+// any old data (if any) to the new buffer.
 func (out *OutBuf) Mmap(filesize uint64) error {
 	// We need space to put all the symbols before we apply relocations.
+	oldheap := out.heap
+	if filesize < uint64(len(oldheap)) {
+		panic("mmap size too small")
+	}
 	out.heap = make([]byte, filesize)
+	copy(out.heap, oldheap)
 	return nil
 }
 
diff --git a/src/cmd/link/internal/ld/outbuf_windows.go b/src/cmd/link/internal/ld/outbuf_windows.go
index 807c0e2..60dc1ab 100644
--- a/src/cmd/link/internal/ld/outbuf_windows.go
+++ b/src/cmd/link/internal/ld/outbuf_windows.go
@@ -10,7 +10,15 @@
 	"unsafe"
 )
 
+// Mmap maps the output file with the given size. It unmaps the old mapping
+// if it is already mapped. It also flushes any in-heap data to the new
+// mapping.
 func (out *OutBuf) Mmap(filesize uint64) error {
+	oldlen := len(out.buf)
+	if oldlen != 0 {
+		out.munmap()
+	}
+
 	err := out.f.Truncate(int64(filesize))
 	if err != nil {
 		Exitf("resize output file failed: %v", err)
@@ -28,6 +36,13 @@
 		return err
 	}
 	*(*reflect.SliceHeader)(unsafe.Pointer(&out.buf)) = reflect.SliceHeader{Data: ptr, Len: int(filesize), Cap: int(filesize)}
+
+	// copy heap to new mapping
+	if uint64(oldlen+len(out.heap)) > filesize {
+		panic("mmap size too small")
+	}
+	copy(out.buf[oldlen:], out.heap)
+	out.heap = out.heap[:0]
 	return nil
 }
 
diff --git a/src/cmd/link/internal/mips/obj.go b/src/cmd/link/internal/mips/obj.go
index cc3cc43..e59c382 100644
--- a/src/cmd/link/internal/mips/obj.go
+++ b/src/cmd/link/internal/mips/obj.go
@@ -53,6 +53,7 @@
 		Archreloc:        archreloc,
 		Archrelocvariant: archrelocvariant,
 		Elfreloc1:        elfreloc1,
+		ElfrelocSize:     8,
 		Elfsetupplt:      elfsetupplt,
 		Gentext:          gentext,
 		Machoreloc1:      machoreloc1,
diff --git a/src/cmd/link/internal/mips64/obj.go b/src/cmd/link/internal/mips64/obj.go
index 449a200..6ef27ce 100644
--- a/src/cmd/link/internal/mips64/obj.go
+++ b/src/cmd/link/internal/mips64/obj.go
@@ -52,6 +52,7 @@
 		Archreloc:        archreloc,
 		Archrelocvariant: archrelocvariant,
 		Elfreloc1:        elfreloc1,
+		ElfrelocSize:     24,
 		Elfsetupplt:      elfsetupplt,
 		Gentext:          gentext,
 		Machoreloc1:      machoreloc1,
diff --git a/src/cmd/link/internal/s390x/obj.go b/src/cmd/link/internal/s390x/obj.go
index bb62fe1..8acc1d4 100644
--- a/src/cmd/link/internal/s390x/obj.go
+++ b/src/cmd/link/internal/s390x/obj.go
@@ -51,6 +51,7 @@
 		Archreloc:        archreloc,
 		Archrelocvariant: archrelocvariant,
 		Elfreloc1:        elfreloc1,
+		ElfrelocSize:     24,
 		Elfsetupplt:      elfsetupplt,
 		Gentext:          gentext,
 		Machoreloc1:      machoreloc1,
diff --git a/src/cmd/link/internal/sym/segment.go b/src/cmd/link/internal/sym/segment.go
index 9b49c62..97853b9 100644
--- a/src/cmd/link/internal/sym/segment.go
+++ b/src/cmd/link/internal/sym/segment.go
@@ -55,6 +55,12 @@
 	Elfsect interface{} // an *ld.ElfShdr
 	Reloff  uint64
 	Rellen  uint64
-	Sym     LoaderSym // symbol for the section, if any
-	Index   uint16    // each section has a unique index, used internally
+	// Relcount is the number of *host* relocations applied to this section
+	// (when external linking).
+	// Incremented atomically on multiple goroutines.
+	// Note: this may differ from number of Go relocations, as one Go relocation
+	// may turn into multiple host relocations.
+	Relcount uint32
+	Sym      LoaderSym // symbol for the section, if any
+	Index    uint16    // each section has a unique index, used internally
 }