[dev.link] cmd/link: apply relocations while writing symbols

We can apply relocations of a symbol right after the symbol data
is copied to output buffer. This should help locality and
parallelism (parallelizing over blocks, instead of over segments).

Linking cmd/compile,
Asmb+Reloc     23.9ms ±18%    16.5ms ±11%   -30.73%  (p=0.008 n=5+5)

Linking cmd/compile with external linking,
Asmb+Reloc     74.0ms ± 3%    33.8ms ± 8%   -54.32%  (p=0.008 n=5+5)

In external linking mode, allocation goes up slightly, as we do
smaller batching now. It doesn't seem too bad.
Asmb+Reloc     15.0MB ± 0%    16.7MB ± 0%   +11.22%  (p=0.008 n=5+5)

Change-Id: Ide33d9ff86c39124c8f5cfc050d7badc753a1ced
Reviewed-on: https://go-review.googlesource.com/c/go/+/239197
Run-TryBot: Cherry Zhang <cherryyz@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Jeremy Faller <jeremy@golang.org>
Reviewed-by: Than McIntosh <thanm@google.com>
diff --git a/src/cmd/link/internal/ld/asmb.go b/src/cmd/link/internal/ld/asmb.go
index a9987ba..d4e358c 100644
--- a/src/cmd/link/internal/ld/asmb.go
+++ b/src/cmd/link/internal/ld/asmb.go
@@ -6,19 +6,23 @@
 
 import (
 	"cmd/internal/objabi"
-	"cmd/link/internal/loader"
 	"fmt"
 	"sync"
 )
 
 // Assembling the binary is broken into two steps:
-//  - writing out the code/data/dwarf Segments
+//  - writing out the code/data/dwarf Segments, applying relocations on the fly
 //  - writing out the architecture specific pieces.
 // This function handles the first part.
-func asmb(ctxt *Link, ldr *loader.Loader) {
+func asmb(ctxt *Link) {
+	ctxt.loader.InitOutData()
+	if ctxt.IsExternal() {
+		ctxt.loader.InitExtRelocs()
+	}
+
 	// TODO(jfaller): delete me.
 	if thearch.Asmb != nil {
-		thearch.Asmb(ctxt, ldr)
+		thearch.Asmb(ctxt, ctxt.loader)
 		return
 	}
 
diff --git a/src/cmd/link/internal/ld/data.go b/src/cmd/link/internal/ld/data.go
index 7300317..ed7129a 100644
--- a/src/cmd/link/internal/ld/data.go
+++ b/src/cmd/link/internal/ld/data.go
@@ -640,41 +640,6 @@
 	}
 }
 
-func (ctxt *Link) reloc() {
-	var wg sync.WaitGroup
-	ldr := ctxt.loader
-	if ctxt.IsExternal() {
-		ldr.InitExtRelocs()
-	}
-	wg.Add(3)
-	go func() {
-		if !ctxt.IsWasm() { // On Wasm, text relocations are applied in Asmb2.
-			st := ctxt.makeRelocSymState()
-			for _, s := range ctxt.Textp {
-				st.relocsym(s, ldr.OutData(s))
-			}
-		}
-		wg.Done()
-	}()
-	go func() {
-		st := ctxt.makeRelocSymState()
-		for _, s := range ctxt.datap {
-			st.relocsym(s, ldr.OutData(s))
-		}
-		wg.Done()
-	}()
-	go func() {
-		st := ctxt.makeRelocSymState()
-		for _, si := range dwarfp {
-			for _, s := range si.syms {
-				st.relocsym(s, ldr.OutData(s))
-			}
-		}
-		wg.Done()
-	}()
-	wg.Wait()
-}
-
 func windynrelocsym(ctxt *Link, rel *loader.SymbolBuilder, s loader.Sym) {
 	var su *loader.SymbolBuilder
 	relocs := ctxt.loader.Relocs(s)
@@ -801,7 +766,7 @@
 }
 
 func CodeblkPad(ctxt *Link, out *OutBuf, addr int64, size int64, pad []byte) {
-	writeBlocks(out, ctxt.outSem, ctxt.loader, ctxt.Textp, addr, size, pad)
+	writeBlocks(ctxt, out, ctxt.outSem, ctxt.loader, ctxt.Textp, addr, size, pad)
 }
 
 const blockSize = 1 << 20 // 1MB chunks written at a time.
@@ -811,7 +776,7 @@
 // as many goroutines as necessary to accomplish this task. This call then
 // blocks, waiting on the writes to complete. Note that we use the sem parameter
 // to limit the number of concurrent writes taking place.
-func writeBlocks(out *OutBuf, sem chan int, ldr *loader.Loader, syms []loader.Sym, addr, size int64, pad []byte) {
+func writeBlocks(ctxt *Link, out *OutBuf, sem chan int, ldr *loader.Loader, syms []loader.Sym, addr, size int64, pad []byte) {
 	for i, s := range syms {
 		if ldr.SymValue(s) >= addr && !ldr.AttrSubSymbol(s) {
 			syms = syms[i:]
@@ -876,12 +841,12 @@
 			sem <- 1
 			wg.Add(1)
 			go func(o *OutBuf, ldr *loader.Loader, syms []loader.Sym, addr, size int64, pad []byte) {
-				writeBlock(o, ldr, syms, addr, size, pad)
+				writeBlock(ctxt, o, ldr, syms, addr, size, pad)
 				wg.Done()
 				<-sem
 			}(o, ldr, syms, addr, length, pad)
 		} else { // output not mmaped, don't parallelize.
-			writeBlock(out, ldr, syms, addr, length, pad)
+			writeBlock(ctxt, out, ldr, syms, addr, length, pad)
 		}
 
 		// Prepare for the next loop.
@@ -894,7 +859,7 @@
 	wg.Wait()
 }
 
-func writeBlock(out *OutBuf, ldr *loader.Loader, syms []loader.Sym, addr, size int64, pad []byte) {
+func writeBlock(ctxt *Link, out *OutBuf, ldr *loader.Loader, syms []loader.Sym, addr, size int64, pad []byte) {
 	for i, s := range syms {
 		if ldr.SymValue(s) >= addr && !ldr.AttrSubSymbol(s) {
 			syms = syms[i:]
@@ -902,6 +867,8 @@
 		}
 	}
 
+	st := ctxt.makeRelocSymState()
+
 	// This doesn't distinguish the memory size from the file
 	// size, and it lays out the file based on Symbol.Value, which
 	// is the virtual address. DWARF compression changes file sizes,
@@ -924,6 +891,7 @@
 			addr = val
 		}
 		out.WriteSym(ldr, s)
+		st.relocsym(s, ldr.OutData(s))
 		addr += int64(len(ldr.Data(s)))
 		siz := ldr.SymSize(s)
 		if addr < val+siz {
@@ -973,7 +941,7 @@
 }
 
 func writeDatblkToOutBuf(ctxt *Link, out *OutBuf, addr int64, size int64) {
-	writeBlocks(out, ctxt.outSem, ctxt.loader, ctxt.datap, addr, size, zeros[:])
+	writeBlocks(ctxt, out, ctxt.outSem, ctxt.loader, ctxt.datap, addr, size, zeros[:])
 }
 
 func dwarfblk(ctxt *Link, out *OutBuf, addr int64, size int64) {
@@ -991,7 +959,7 @@
 	for i := range dwarfp {
 		syms = append(syms, dwarfp[i].syms...)
 	}
-	writeBlocks(out, ctxt.outSem, ctxt.loader, syms, addr, size, zeros[:])
+	writeBlocks(ctxt, out, ctxt.outSem, ctxt.loader, syms, addr, size, zeros[:])
 }
 
 var zeros [512]byte
diff --git a/src/cmd/link/internal/ld/main.go b/src/cmd/link/internal/ld/main.go
index e68997f..252c3c5 100644
--- a/src/cmd/link/internal/ld/main.go
+++ b/src/cmd/link/internal/ld/main.go
@@ -315,10 +315,7 @@
 	// asmb will redirect symbols to the output file mmap, and relocations
 	// will be applied directly there.
 	bench.Start("Asmb")
-	ctxt.loader.InitOutData()
-	asmb(ctxt, ctxt.loader)
-	bench.Start("reloc")
-	ctxt.reloc()
+	asmb(ctxt)
 	bench.Start("Asmb2")
 	asmb2(ctxt)