[dev.link] cmd/link: parallelize asmb on amd64
Introduces a parallel OutBuf implementation, and POC on amd64. Due to
some of the weird behaviors I saw on MacOS (SIGBUS while calling msync),
I will wait for feedback to port to other architectures.
On my mac, sped up Asmb by ~78% for cmd/compile (below). Will likely
have an appreciable speedup on kubelet benchmark.
Asmb 39.1ms ±11% 8.5ms ±10% -78.17% (p=0.000 n=10+9)
TotalTime 596ms ± 2% 577ms ± 8% -3.07% (p=0.034 n=8+10)
Change-Id: Id2a2577c3f4da155d8dccc862897f43b941877ac
Reviewed-on: https://go-review.googlesource.com/c/go/+/223742
Run-TryBot: Jeremy Faller <jeremy@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
Reviewed-by: Than McIntosh <thanm@google.com>
diff --git a/src/cmd/link/internal/amd64/asm.go b/src/cmd/link/internal/amd64/asm.go
index 7b92585..40ecd25 100644
--- a/src/cmd/link/internal/amd64/asm.go
+++ b/src/cmd/link/internal/amd64/asm.go
@@ -38,6 +38,7 @@
"cmd/link/internal/sym"
"debug/elf"
"log"
+ "sync"
)
func PADDR(x uint32) uint32 {
@@ -693,29 +694,32 @@
ld.Asmbelfsetup()
}
+ var wg sync.WaitGroup
sect := ld.Segtext.Sections[0]
- ctxt.Out.SeekSet(int64(sect.Vaddr - ld.Segtext.Vaddr + ld.Segtext.Fileoff))
- // 0xCC is INT $3 - breakpoint instruction
- ld.CodeblkPad(ctxt, int64(sect.Vaddr), int64(sect.Length), []byte{0xCC})
- for _, sect = range ld.Segtext.Sections[1:] {
- ctxt.Out.SeekSet(int64(sect.Vaddr - ld.Segtext.Vaddr + ld.Segtext.Fileoff))
- ld.Datblk(ctxt, int64(sect.Vaddr), int64(sect.Length))
+ offset := sect.Vaddr - ld.Segtext.Vaddr + ld.Segtext.Fileoff
+ f := func(ctxt *ld.Link, out *ld.OutBuf, start, length int64) {
+ // 0xCC is INT $3 - breakpoint instruction
+ ld.CodeblkPad(ctxt, out, start, length, []byte{0xCC})
+ }
+ ld.WriteParallel(&wg, f, ctxt, offset, sect.Vaddr, sect.Length)
+
+ for _, sect := range ld.Segtext.Sections[1:] {
+ offset := sect.Vaddr - ld.Segtext.Vaddr + ld.Segtext.Fileoff
+ ld.WriteParallel(&wg, ld.Datblk2, ctxt, offset, sect.Vaddr, sect.Length)
}
if ld.Segrodata.Filelen > 0 {
- ctxt.Out.SeekSet(int64(ld.Segrodata.Fileoff))
- ld.Datblk(ctxt, int64(ld.Segrodata.Vaddr), int64(ld.Segrodata.Filelen))
+ ld.WriteParallel(&wg, ld.Datblk2, ctxt, ld.Segrodata.Fileoff, ld.Segrodata.Vaddr, ld.Segrodata.Filelen)
}
if ld.Segrelrodata.Filelen > 0 {
- ctxt.Out.SeekSet(int64(ld.Segrelrodata.Fileoff))
- ld.Datblk(ctxt, int64(ld.Segrelrodata.Vaddr), int64(ld.Segrelrodata.Filelen))
+ ld.WriteParallel(&wg, ld.Datblk2, ctxt, ld.Segrelrodata.Fileoff, ld.Segrelrodata.Vaddr, ld.Segrelrodata.Filelen)
}
- ctxt.Out.SeekSet(int64(ld.Segdata.Fileoff))
- ld.Datblk(ctxt, int64(ld.Segdata.Vaddr), int64(ld.Segdata.Filelen))
+ ld.WriteParallel(&wg, ld.Datblk2, ctxt, ld.Segdata.Fileoff, ld.Segdata.Vaddr, ld.Segdata.Filelen)
- ctxt.Out.SeekSet(int64(ld.Segdwarf.Fileoff))
- ld.Dwarfblk(ctxt, int64(ld.Segdwarf.Vaddr), int64(ld.Segdwarf.Filelen))
+ ld.WriteParallel(&wg, ld.Dwarfblk2, ctxt, ld.Segdwarf.Fileoff, ld.Segdwarf.Vaddr, ld.Segdwarf.Filelen)
+
+ wg.Wait()
}
func asmb2(ctxt *ld.Link) {
diff --git a/src/cmd/link/internal/ld/data.go b/src/cmd/link/internal/ld/data.go
index 4174a70..7bdeb1f 100644
--- a/src/cmd/link/internal/ld/data.go
+++ b/src/cmd/link/internal/ld/data.go
@@ -730,14 +730,15 @@
}
func Codeblk(ctxt *Link, addr int64, size int64) {
- CodeblkPad(ctxt, addr, size, zeros[:])
+ CodeblkPad(ctxt, ctxt.Out, addr, size, zeros[:])
}
-func CodeblkPad(ctxt *Link, addr int64, size int64, pad []byte) {
+
+func CodeblkPad(ctxt *Link, out *OutBuf, addr int64, size int64, pad []byte) {
if *flagA {
ctxt.Logf("codeblk [%#x,%#x) at offset %#x\n", addr, addr+size, ctxt.Out.Offset())
}
- blk(ctxt.Out, ctxt.Textp, addr, size, pad)
+ writeBlocks(out, ctxt.outSem, ctxt.Textp, addr, size, pad)
/* again for printing */
if !*flagA {
@@ -795,9 +796,75 @@
}
}
-func blk(out *OutBuf, syms []*sym.Symbol, addr, size int64, pad []byte) {
+const blockSize = 1 << 20 // 1MB chunks written at a time.
+
+// writeBlocks writes a specified chunk of symbols to the output buffer. It
+// breaks the write up into ≥blockSize chunks to write them out, and schedules
+// as many goroutines as necessary to accomplish this task. This call then
+// blocks, waiting on the writes to complete. Note that we use the sem parameter
+// to limit the number of concurrent writes taking place.
+func writeBlocks(out *OutBuf, sem chan int, syms []*sym.Symbol, addr, size int64, pad []byte) {
for i, s := range syms {
- if !s.Attr.SubSymbol() && s.Value >= addr {
+ if s.Value >= addr && !s.Attr.SubSymbol() {
+ syms = syms[i:]
+ break
+ }
+ }
+
+ var wg sync.WaitGroup
+ max, lastAddr, written := int64(blockSize), addr+size, int64(0)
+ for addr < lastAddr {
+ // Find the last symbol we'd write.
+ idx := -1
+ length := int64(0)
+ for i, s := range syms {
+ // If the next symbol's size would put us out of bounds on the total length,
+ // stop looking.
+ if s.Value+s.Size > lastAddr {
+ break
+ }
+
+ // We're gonna write this symbol.
+ idx = i
+ length = s.Value + s.Size - addr
+
+ // If we cross over the max size, we've got enough symbols.
+ if s.Value+s.Size > addr+max {
+ break
+ }
+ }
+
+ // If we didn't find any symbols to write, we're done here.
+ if idx < 0 {
+ break
+ }
+
+ // Start the block output operator.
+ if o, err := out.View(uint64(out.Offset() + written)); err == nil {
+ sem <- 1
+ wg.Add(1)
+ go func(o *OutBuf, syms []*sym.Symbol, addr, size int64, pad []byte) {
+ writeBlock(o, syms, addr, size, pad)
+ wg.Done()
+ <-sem
+ }(o, syms, addr, length, pad)
+ } else { // output not mmaped, don't parallelize.
+ writeBlock(out, syms, addr, length, pad)
+ }
+
+ // Prepare for the next loop.
+ if idx != -1 {
+ syms = syms[idx+1:]
+ }
+ written += length
+ addr += length
+ }
+ wg.Wait()
+}
+
+func writeBlock(out *OutBuf, syms []*sym.Symbol, addr, size int64, pad []byte) {
+ for i, s := range syms {
+ if s.Value >= addr && !s.Attr.SubSymbol() {
syms = syms[i:]
break
}
@@ -841,13 +908,32 @@
if addr < eaddr {
out.WriteStringPad("", int(eaddr-addr), pad)
}
- out.Flush()
+}
+
+type writeFn func(*Link, *OutBuf, int64, int64)
+
+// WriteParallel handles scheduling parallel execution of data write functions.
+func WriteParallel(wg *sync.WaitGroup, fn writeFn, ctxt *Link, seek, vaddr, length uint64) {
+ if out, err := ctxt.Out.View(seek); err != nil {
+ ctxt.Out.SeekSet(int64(seek))
+ fn(ctxt, ctxt.Out, int64(vaddr), int64(length))
+ } else {
+ wg.Add(1)
+ go func() {
+ defer wg.Done()
+ fn(ctxt, out, int64(vaddr), int64(length))
+ }()
+ }
}
func Datblk(ctxt *Link, addr int64, size int64) {
writeDatblkToOutBuf(ctxt, ctxt.Out, addr, size)
}
+func Datblk2(ctxt *Link, out *OutBuf, addr, size int64) {
+ writeDatblkToOutBuf(ctxt, out, addr, size)
+}
+
// Used only on Wasm for now.
func DatblkBytes(ctxt *Link, addr int64, size int64) []byte {
buf := bytes.NewBuffer(make([]byte, 0, size))
@@ -862,7 +948,7 @@
ctxt.Logf("datblk [%#x,%#x) at offset %#x\n", addr, addr+size, ctxt.Out.Offset())
}
- blk(out, ctxt.datap, addr, size, zeros[:])
+ writeBlocks(out, ctxt.outSem, ctxt.datap, addr, size, zeros[:])
/* again for printing */
if !*flagA {
@@ -931,12 +1017,20 @@
ctxt.Logf("\t%.8x|\n", uint(eaddr))
}
+func Dwarfblk2(ctxt *Link, out *OutBuf, addr int64, size int64) {
+ if *flagA {
+ ctxt.Logf("dwarfblk [%#x,%#x) at offset %#x\n", addr, addr+size, ctxt.Out.Offset())
+ }
+
+ writeBlocks(out, ctxt.outSem, dwarfp, addr, size, zeros[:])
+}
+
func Dwarfblk(ctxt *Link, addr int64, size int64) {
if *flagA {
ctxt.Logf("dwarfblk [%#x,%#x) at offset %#x\n", addr, addr+size, ctxt.Out.Offset())
}
- blk(ctxt.Out, dwarfp, addr, size, zeros[:])
+ writeBlock(ctxt.Out, dwarfp, addr, size, zeros[:])
}
var zeros [512]byte
diff --git a/src/cmd/link/internal/ld/lib.go b/src/cmd/link/internal/ld/lib.go
index 692cf4f..0d6cdab 100644
--- a/src/cmd/link/internal/ld/lib.go
+++ b/src/cmd/link/internal/ld/lib.go
@@ -31,7 +31,6 @@
package ld
import (
- "bufio"
"bytes"
"cmd/internal/bio"
"cmd/internal/obj"
@@ -385,14 +384,11 @@
Lflag(ctxt, filepath.Join(objabi.GOROOT, "pkg", fmt.Sprintf("%s_%s%s%s", objabi.GOOS, objabi.GOARCH, suffixsep, suffix)))
mayberemoveoutfile()
- f, err := os.OpenFile(*flagOutfile, os.O_RDWR|os.O_CREATE|os.O_TRUNC, 0775)
- if err != nil {
+
+ if err := ctxt.Out.Open(*flagOutfile); err != nil {
Exitf("cannot create %s: %v", *flagOutfile, err)
}
- ctxt.Out.w = bufio.NewWriter(f)
- ctxt.Out.f = f
-
if *flagEntrySymbol == "" {
switch ctxt.BuildMode {
case BuildModeCShared, BuildModeCArchive:
@@ -1203,25 +1199,21 @@
*flagTmpdir = dir
ownTmpDir = true
AtExit(func() {
- ctxt.Out.f.Close()
+ ctxt.Out.Close()
os.RemoveAll(*flagTmpdir)
})
}
// change our output to temporary object file
- ctxt.Out.f.Close()
+ if err := ctxt.Out.Close(); err != nil {
+ Exitf("error closing output file")
+ }
mayberemoveoutfile()
p := filepath.Join(*flagTmpdir, "go.o")
- var err error
- f, err := os.OpenFile(p, os.O_RDWR|os.O_CREATE|os.O_TRUNC, 0775)
- if err != nil {
+ if err := ctxt.Out.Open(p); err != nil {
Exitf("cannot create %s: %v", p, err)
}
-
- ctxt.Out.w = bufio.NewWriter(f)
- ctxt.Out.f = f
- ctxt.Out.off = 0
}
// hostobjCopy creates a copy of the object files in hostobj in a
@@ -1305,11 +1297,9 @@
// Force the buffer to flush here so that external
// tools will see a complete file.
- ctxt.Out.Flush()
- if err := ctxt.Out.f.Close(); err != nil {
- Exitf("close: %v", err)
+ if err := ctxt.Out.Close(); err != nil {
+ Exitf("error closing %v", *flagOutfile)
}
- ctxt.Out.f = nil
argv := []string{*flagExtar, "-q", "-c", "-s"}
if ctxt.HeadType == objabi.Haix {
diff --git a/src/cmd/link/internal/ld/link.go b/src/cmd/link/internal/ld/link.go
index b32b7c8..24866d8 100644
--- a/src/cmd/link/internal/ld/link.go
+++ b/src/cmd/link/internal/ld/link.go
@@ -53,7 +53,9 @@
Target
ErrorReporter
ArchSyms
- Out *OutBuf
+
+ outSem chan int // limits the number of output writers
+ Out *OutBuf
Syms *sym.Symbols
diff --git a/src/cmd/link/internal/ld/outbuf.go b/src/cmd/link/internal/ld/outbuf.go
index f8e65ef..8e08347 100644
--- a/src/cmd/link/internal/ld/outbuf.go
+++ b/src/cmd/link/internal/ld/outbuf.go
@@ -9,6 +9,7 @@
"cmd/internal/sys"
"cmd/link/internal/sym"
"encoding/binary"
+ "errors"
"log"
"os"
)
@@ -23,19 +24,100 @@
// Second, it provides a very cheap offset counter that doesn't require
// any system calls to read the value.
//
-// It also mmaps the output file (if available). The intended usage is:
+// Third, it also mmaps the output file (if available). The intended usage is:
// - Mmap the output file
// - Write the content
// - possibly apply any edits in the output buffer
// - Munmap the output file
// - possibly write more content to the file, which will not be edited later.
+//
+// And finally, it provides a mechanism by which you can multithread the
+// writing of output files. This mechanism is accomplished by copying a OutBuf,
+// and using it in the thread/goroutine.
+//
+// Parallel OutBuf is intended to be used like:
+//
+// func write(out *OutBuf) {
+// var wg sync.WaitGroup
+// for i := 0; i < 10; i++ {
+// wg.Add(1)
+// view, err := out.View(start[i])
+// if err != nil {
+// // handle output
+// continue
+// }
+// go func(out *OutBuf, i int) {
+// // do output
+// wg.Done()
+// }(view, i)
+// }
+// wg.Wait()
+// }
type OutBuf struct {
- arch *sys.Arch
- off int64
- w *bufio.Writer
- buf []byte // backing store of mmap'd output file
- f *os.File
- encbuf [8]byte // temp buffer used by WriteN methods
+ arch *sys.Arch
+ off int64
+ w *bufio.Writer
+ buf []byte // backing store of mmap'd output file
+ name string
+ f *os.File
+ encbuf [8]byte // temp buffer used by WriteN methods
+ isView bool // true if created from View()
+ start, length uint64 // start and length mmaped data.
+}
+
+func (out *OutBuf) Open(name string) error {
+ if out.f != nil {
+ return errors.New("cannont open more than one file")
+ }
+ f, err := os.OpenFile(name, os.O_RDWR|os.O_CREATE|os.O_TRUNC, 0775)
+ if err != nil {
+ return err
+ }
+ out.off = 0
+ out.name = name
+ out.w = bufio.NewWriter(f)
+ out.f = f
+ return nil
+}
+
+func NewOutBuf(arch *sys.Arch) *OutBuf {
+ return &OutBuf{
+ arch: arch,
+ }
+}
+
+var viewError = errors.New("output not mmapped")
+
+func (out *OutBuf) View(start uint64) (*OutBuf, error) {
+ if out.buf == nil {
+ return nil, viewError
+ }
+ return &OutBuf{
+ arch: out.arch,
+ name: out.name,
+ buf: out.buf,
+ off: int64(start),
+ start: start,
+ length: out.length,
+ isView: true,
+ }, nil
+}
+
+var viewCloseError = errors.New("cannot Close OutBuf from View")
+
+func (out *OutBuf) Close() error {
+ if out.isView {
+ return viewCloseError
+ }
+ out.Flush()
+ if out.f == nil {
+ return nil
+ }
+ if err := out.f.Close(); err != nil {
+ return err
+ }
+ out.f = nil
+ return nil
}
func (out *OutBuf) SeekSet(p int64) {
@@ -45,7 +127,7 @@
if out.buf == nil {
out.Flush()
if _, err := out.f.Seek(p, 0); err != nil {
- Exitf("seeking to %d in %s: %v", p, out.f.Name(), err)
+ Exitf("seeking to %d in %s: %v", p, out.name, err)
}
}
out.off = p
@@ -154,13 +236,16 @@
// edit to the symbol content.
// If the output file is not Mmap'd, just writes the content.
func (out *OutBuf) WriteSym(s *sym.Symbol) {
+ // NB: We inline the Write call for speediness.
if out.buf != nil {
start := out.off
- out.Write(s.P)
+ n := copy(out.buf[out.off:], s.P)
+ out.off += int64(n)
s.P = out.buf[start:out.off]
s.Attr.Set(sym.AttrReadOnly, false)
} else {
- out.Write(s.P)
+ n, _ := out.w.Write(s.P)
+ out.off += int64(n)
}
}
@@ -168,10 +253,11 @@
var err error
if out.buf != nil {
err = out.Msync()
- } else {
+ }
+ if out.w != nil {
err = out.w.Flush()
}
if err != nil {
- Exitf("flushing %s: %v", out.f.Name(), err)
+ Exitf("flushing %s: %v", out.name, err)
}
}
diff --git a/src/cmd/link/internal/ld/outbuf_mmap.go b/src/cmd/link/internal/ld/outbuf_mmap.go
index 4075141..c064e96 100644
--- a/src/cmd/link/internal/ld/outbuf_mmap.go
+++ b/src/cmd/link/internal/ld/outbuf_mmap.go
@@ -16,11 +16,15 @@
if err != nil {
Exitf("resize output file failed: %v", err)
}
+ out.length = filesize
out.buf, err = syscall.Mmap(int(out.f.Fd()), 0, int(filesize), syscall.PROT_READ|syscall.PROT_WRITE, syscall.MAP_SHARED|syscall.MAP_FILE)
return err
}
func (out *OutBuf) Munmap() {
+ if out.buf == nil {
+ return
+ }
err := out.Msync()
if err != nil {
Exitf("msync output file failed: %v", err)
@@ -34,9 +38,12 @@
}
func (out *OutBuf) Msync() error {
+ if out.buf == nil || out.length <= 0 {
+ return nil
+ }
// TODO: netbsd supports mmap and msync, but the syscall package doesn't define MSYNC.
// It is excluded from the build tag for now.
- _, _, errno := syscall.Syscall(syscall.SYS_MSYNC, uintptr(unsafe.Pointer(&out.buf[0])), uintptr(len(out.buf)), syscall.MS_SYNC)
+ _, _, errno := syscall.Syscall(syscall.SYS_MSYNC, uintptr(unsafe.Pointer(&out.buf[0])), uintptr(out.length), syscall.MS_SYNC)
if errno != 0 {
return errno
}
diff --git a/src/cmd/link/internal/ld/outbuf_windows.go b/src/cmd/link/internal/ld/outbuf_windows.go
index 1cb05c3..e7cda75 100644
--- a/src/cmd/link/internal/ld/outbuf_windows.go
+++ b/src/cmd/link/internal/ld/outbuf_windows.go
@@ -36,6 +36,7 @@
return
}
err := syscall.UnmapViewOfFile(uintptr(unsafe.Pointer(&out.buf[0])))
+ out.buf = nil
if err != nil {
Exitf("UnmapViewOfFile failed: %v", err)
}
diff --git a/src/cmd/link/internal/ld/sym.go b/src/cmd/link/internal/ld/sym.go
index 62e6af2..97966ed 100644
--- a/src/cmd/link/internal/ld/sym.go
+++ b/src/cmd/link/internal/ld/sym.go
@@ -36,13 +36,15 @@
"cmd/internal/sys"
"cmd/link/internal/sym"
"log"
+ "runtime"
)
func linknew(arch *sys.Arch) *Link {
ctxt := &Link{
Target: Target{Arch: arch},
Syms: sym.NewSymbols(),
- Out: &OutBuf{arch: arch},
+ outSem: make(chan int, 2*runtime.GOMAXPROCS(0)),
+ Out: NewOutBuf(arch),
LibraryByPkg: make(map[string]*sym.Library),
}
@@ -51,8 +53,8 @@
}
AtExit(func() {
- if nerrors > 0 && ctxt.Out.f != nil {
- ctxt.Out.f.Close()
+ if nerrors > 0 {
+ ctxt.Out.Close()
mayberemoveoutfile()
}
})
diff --git a/src/cmd/link/internal/x86/asm.go b/src/cmd/link/internal/x86/asm.go
index 30ad086..0fdd730 100644
--- a/src/cmd/link/internal/x86/asm.go
+++ b/src/cmd/link/internal/x86/asm.go
@@ -626,7 +626,7 @@
sect := ld.Segtext.Sections[0]
ctxt.Out.SeekSet(int64(sect.Vaddr - ld.Segtext.Vaddr + ld.Segtext.Fileoff))
// 0xCC is INT $3 - breakpoint instruction
- ld.CodeblkPad(ctxt, int64(sect.Vaddr), int64(sect.Length), []byte{0xCC})
+ ld.CodeblkPad(ctxt, ctxt.Out, int64(sect.Vaddr), int64(sect.Length), []byte{0xCC})
for _, sect = range ld.Segtext.Sections[1:] {
ctxt.Out.SeekSet(int64(sect.Vaddr - ld.Segtext.Vaddr + ld.Segtext.Fileoff))
ld.Datblk(ctxt, int64(sect.Vaddr), int64(sect.Length))