[dev.link] cmd/link: apply relocations while writing symbols
We can apply a symbol's relocations right after its data is
copied to the output buffer. This should help locality and
parallelism (we now parallelize over blocks instead of over segments).
Linking cmd/compile (time, before vs. after this change):
Asmb+Reloc 23.9ms ±18% 16.5ms ±11% -30.73% (p=0.008 n=5+5)
Linking cmd/compile with external linking (time, before vs. after this change):
Asmb+Reloc 74.0ms ± 3% 33.8ms ± 8% -54.32% (p=0.008 n=5+5)
In external linking mode, allocation goes up slightly, because we
now batch in smaller units. It doesn't seem too bad (allocated
bytes, before vs. after this change):
Asmb+Reloc 15.0MB ± 0% 16.7MB ± 0% +11.22% (p=0.008 n=5+5)
Change-Id: Ide33d9ff86c39124c8f5cfc050d7badc753a1ced
Reviewed-on: https://go-review.googlesource.com/c/go/+/239197
Run-TryBot: Cherry Zhang <cherryyz@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Jeremy Faller <jeremy@golang.org>
Reviewed-by: Than McIntosh <thanm@google.com>
diff --git a/src/cmd/link/internal/ld/asmb.go b/src/cmd/link/internal/ld/asmb.go
index a9987ba..d4e358c 100644
--- a/src/cmd/link/internal/ld/asmb.go
+++ b/src/cmd/link/internal/ld/asmb.go
@@ -6,19 +6,23 @@
import (
"cmd/internal/objabi"
- "cmd/link/internal/loader"
"fmt"
"sync"
)
// Assembling the binary is broken into two steps:
-// - writing out the code/data/dwarf Segments
+// - writing out the code/data/dwarf Segments, applying relocations on the fly
// - writing out the architecture specific pieces.
// This function handles the first part.
-func asmb(ctxt *Link, ldr *loader.Loader) {
+func asmb(ctxt *Link) {
+ ctxt.loader.InitOutData()
+ if ctxt.IsExternal() {
+ ctxt.loader.InitExtRelocs()
+ }
+
// TODO(jfaller): delete me.
if thearch.Asmb != nil {
- thearch.Asmb(ctxt, ldr)
+ thearch.Asmb(ctxt, ctxt.loader)
return
}
diff --git a/src/cmd/link/internal/ld/data.go b/src/cmd/link/internal/ld/data.go
index 7300317..ed7129a 100644
--- a/src/cmd/link/internal/ld/data.go
+++ b/src/cmd/link/internal/ld/data.go
@@ -640,41 +640,6 @@
}
}
-func (ctxt *Link) reloc() {
- var wg sync.WaitGroup
- ldr := ctxt.loader
- if ctxt.IsExternal() {
- ldr.InitExtRelocs()
- }
- wg.Add(3)
- go func() {
- if !ctxt.IsWasm() { // On Wasm, text relocations are applied in Asmb2.
- st := ctxt.makeRelocSymState()
- for _, s := range ctxt.Textp {
- st.relocsym(s, ldr.OutData(s))
- }
- }
- wg.Done()
- }()
- go func() {
- st := ctxt.makeRelocSymState()
- for _, s := range ctxt.datap {
- st.relocsym(s, ldr.OutData(s))
- }
- wg.Done()
- }()
- go func() {
- st := ctxt.makeRelocSymState()
- for _, si := range dwarfp {
- for _, s := range si.syms {
- st.relocsym(s, ldr.OutData(s))
- }
- }
- wg.Done()
- }()
- wg.Wait()
-}
-
func windynrelocsym(ctxt *Link, rel *loader.SymbolBuilder, s loader.Sym) {
var su *loader.SymbolBuilder
relocs := ctxt.loader.Relocs(s)
@@ -801,7 +766,7 @@
}
func CodeblkPad(ctxt *Link, out *OutBuf, addr int64, size int64, pad []byte) {
- writeBlocks(out, ctxt.outSem, ctxt.loader, ctxt.Textp, addr, size, pad)
+ writeBlocks(ctxt, out, ctxt.outSem, ctxt.loader, ctxt.Textp, addr, size, pad)
}
const blockSize = 1 << 20 // 1MB chunks written at a time.
@@ -811,7 +776,7 @@
// as many goroutines as necessary to accomplish this task. This call then
// blocks, waiting on the writes to complete. Note that we use the sem parameter
// to limit the number of concurrent writes taking place.
-func writeBlocks(out *OutBuf, sem chan int, ldr *loader.Loader, syms []loader.Sym, addr, size int64, pad []byte) {
+func writeBlocks(ctxt *Link, out *OutBuf, sem chan int, ldr *loader.Loader, syms []loader.Sym, addr, size int64, pad []byte) {
for i, s := range syms {
if ldr.SymValue(s) >= addr && !ldr.AttrSubSymbol(s) {
syms = syms[i:]
@@ -876,12 +841,12 @@
sem <- 1
wg.Add(1)
go func(o *OutBuf, ldr *loader.Loader, syms []loader.Sym, addr, size int64, pad []byte) {
- writeBlock(o, ldr, syms, addr, size, pad)
+ writeBlock(ctxt, o, ldr, syms, addr, size, pad)
wg.Done()
<-sem
}(o, ldr, syms, addr, length, pad)
} else { // output not mmaped, don't parallelize.
- writeBlock(out, ldr, syms, addr, length, pad)
+ writeBlock(ctxt, out, ldr, syms, addr, length, pad)
}
// Prepare for the next loop.
@@ -894,7 +859,7 @@
wg.Wait()
}
-func writeBlock(out *OutBuf, ldr *loader.Loader, syms []loader.Sym, addr, size int64, pad []byte) {
+func writeBlock(ctxt *Link, out *OutBuf, ldr *loader.Loader, syms []loader.Sym, addr, size int64, pad []byte) {
for i, s := range syms {
if ldr.SymValue(s) >= addr && !ldr.AttrSubSymbol(s) {
syms = syms[i:]
@@ -902,6 +867,8 @@
}
}
+ st := ctxt.makeRelocSymState()
+
// This doesn't distinguish the memory size from the file
// size, and it lays out the file based on Symbol.Value, which
// is the virtual address. DWARF compression changes file sizes,
@@ -924,6 +891,7 @@
addr = val
}
out.WriteSym(ldr, s)
+ st.relocsym(s, ldr.OutData(s))
addr += int64(len(ldr.Data(s)))
siz := ldr.SymSize(s)
if addr < val+siz {
@@ -973,7 +941,7 @@
}
func writeDatblkToOutBuf(ctxt *Link, out *OutBuf, addr int64, size int64) {
- writeBlocks(out, ctxt.outSem, ctxt.loader, ctxt.datap, addr, size, zeros[:])
+ writeBlocks(ctxt, out, ctxt.outSem, ctxt.loader, ctxt.datap, addr, size, zeros[:])
}
func dwarfblk(ctxt *Link, out *OutBuf, addr int64, size int64) {
@@ -991,7 +959,7 @@
for i := range dwarfp {
syms = append(syms, dwarfp[i].syms...)
}
- writeBlocks(out, ctxt.outSem, ctxt.loader, syms, addr, size, zeros[:])
+ writeBlocks(ctxt, out, ctxt.outSem, ctxt.loader, syms, addr, size, zeros[:])
}
var zeros [512]byte
diff --git a/src/cmd/link/internal/ld/main.go b/src/cmd/link/internal/ld/main.go
index e68997f..252c3c5 100644
--- a/src/cmd/link/internal/ld/main.go
+++ b/src/cmd/link/internal/ld/main.go
@@ -315,10 +315,7 @@
// asmb will redirect symbols to the output file mmap, and relocations
// will be applied directly there.
bench.Start("Asmb")
- ctxt.loader.InitOutData()
- asmb(ctxt, ctxt.loader)
- bench.Start("reloc")
- ctxt.reloc()
+ asmb(ctxt)
bench.Start("Asmb2")
asmb2(ctxt)