[dev.link] cmd/link: parallelize asmb on amd64

Introduces a parallel OutBuf implementation, and POC on amd64. Due to
some of the weird behaviors I saw on MacOS (SIGBUS while calling msync),
I will wait for feedback to port to other architectures.

On my mac, sped up Asmb by ~78% for cmd/compile (below). Will likely
have an appreciable speedup on kubelet benchmark.

Asmb                      39.1ms ±11%     8.5ms ±10%     -78.17%  (p=0.000 n=10+9)
TotalTime                  596ms ± 2%     577ms ± 8%      -3.07%  (p=0.034 n=8+10)

Change-Id: Id2a2577c3f4da155d8dccc862897f43b941877ac
Reviewed-on: https://go-review.googlesource.com/c/go/+/223742
Run-TryBot: Jeremy Faller <jeremy@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
Reviewed-by: Than McIntosh <thanm@google.com>
diff --git a/src/cmd/link/internal/amd64/asm.go b/src/cmd/link/internal/amd64/asm.go
index 7b92585..40ecd25 100644
--- a/src/cmd/link/internal/amd64/asm.go
+++ b/src/cmd/link/internal/amd64/asm.go
@@ -38,6 +38,7 @@
 	"cmd/link/internal/sym"
 	"debug/elf"
 	"log"
+	"sync"
 )
 
 func PADDR(x uint32) uint32 {
@@ -693,29 +694,32 @@
 		ld.Asmbelfsetup()
 	}
 
+	var wg sync.WaitGroup
 	sect := ld.Segtext.Sections[0]
-	ctxt.Out.SeekSet(int64(sect.Vaddr - ld.Segtext.Vaddr + ld.Segtext.Fileoff))
-	// 0xCC is INT $3 - breakpoint instruction
-	ld.CodeblkPad(ctxt, int64(sect.Vaddr), int64(sect.Length), []byte{0xCC})
-	for _, sect = range ld.Segtext.Sections[1:] {
-		ctxt.Out.SeekSet(int64(sect.Vaddr - ld.Segtext.Vaddr + ld.Segtext.Fileoff))
-		ld.Datblk(ctxt, int64(sect.Vaddr), int64(sect.Length))
+	offset := sect.Vaddr - ld.Segtext.Vaddr + ld.Segtext.Fileoff
+	f := func(ctxt *ld.Link, out *ld.OutBuf, start, length int64) {
+		// 0xCC is INT $3 - breakpoint instruction
+		ld.CodeblkPad(ctxt, out, start, length, []byte{0xCC})
+	}
+	ld.WriteParallel(&wg, f, ctxt, offset, sect.Vaddr, sect.Length)
+
+	for _, sect := range ld.Segtext.Sections[1:] {
+		offset := sect.Vaddr - ld.Segtext.Vaddr + ld.Segtext.Fileoff
+		ld.WriteParallel(&wg, ld.Datblk2, ctxt, offset, sect.Vaddr, sect.Length)
 	}
 
 	if ld.Segrodata.Filelen > 0 {
-		ctxt.Out.SeekSet(int64(ld.Segrodata.Fileoff))
-		ld.Datblk(ctxt, int64(ld.Segrodata.Vaddr), int64(ld.Segrodata.Filelen))
+		ld.WriteParallel(&wg, ld.Datblk2, ctxt, ld.Segrodata.Fileoff, ld.Segrodata.Vaddr, ld.Segrodata.Filelen)
 	}
 	if ld.Segrelrodata.Filelen > 0 {
-		ctxt.Out.SeekSet(int64(ld.Segrelrodata.Fileoff))
-		ld.Datblk(ctxt, int64(ld.Segrelrodata.Vaddr), int64(ld.Segrelrodata.Filelen))
+		ld.WriteParallel(&wg, ld.Datblk2, ctxt, ld.Segrelrodata.Fileoff, ld.Segrelrodata.Vaddr, ld.Segrelrodata.Filelen)
 	}
 
-	ctxt.Out.SeekSet(int64(ld.Segdata.Fileoff))
-	ld.Datblk(ctxt, int64(ld.Segdata.Vaddr), int64(ld.Segdata.Filelen))
+	ld.WriteParallel(&wg, ld.Datblk2, ctxt, ld.Segdata.Fileoff, ld.Segdata.Vaddr, ld.Segdata.Filelen)
 
-	ctxt.Out.SeekSet(int64(ld.Segdwarf.Fileoff))
-	ld.Dwarfblk(ctxt, int64(ld.Segdwarf.Vaddr), int64(ld.Segdwarf.Filelen))
+	ld.WriteParallel(&wg, ld.Dwarfblk2, ctxt, ld.Segdwarf.Fileoff, ld.Segdwarf.Vaddr, ld.Segdwarf.Filelen)
+
+	wg.Wait()
 }
 
 func asmb2(ctxt *ld.Link) {
diff --git a/src/cmd/link/internal/ld/data.go b/src/cmd/link/internal/ld/data.go
index 4174a70..7bdeb1f 100644
--- a/src/cmd/link/internal/ld/data.go
+++ b/src/cmd/link/internal/ld/data.go
@@ -730,14 +730,15 @@
 }
 
 func Codeblk(ctxt *Link, addr int64, size int64) {
-	CodeblkPad(ctxt, addr, size, zeros[:])
+	CodeblkPad(ctxt, ctxt.Out, addr, size, zeros[:])
 }
-func CodeblkPad(ctxt *Link, addr int64, size int64, pad []byte) {
+
+func CodeblkPad(ctxt *Link, out *OutBuf, addr int64, size int64, pad []byte) {
 	if *flagA {
 		ctxt.Logf("codeblk [%#x,%#x) at offset %#x\n", addr, addr+size, ctxt.Out.Offset())
 	}
 
-	blk(ctxt.Out, ctxt.Textp, addr, size, pad)
+	writeBlocks(out, ctxt.outSem, ctxt.Textp, addr, size, pad)
 
 	/* again for printing */
 	if !*flagA {
@@ -795,9 +796,75 @@
 	}
 }
 
-func blk(out *OutBuf, syms []*sym.Symbol, addr, size int64, pad []byte) {
+const blockSize = 1 << 20 // 1MB chunks written at a time.
+
+// writeBlocks writes a specified chunk of symbols to the output buffer. It
+// breaks the write up into ≥blockSize chunks to write them out, and schedules
+// as many goroutines as necessary to accomplish this task. This call then
+// blocks, waiting on the writes to complete. Note that we use the sem parameter
+// to limit the number of concurrent writes taking place.
+func writeBlocks(out *OutBuf, sem chan int, syms []*sym.Symbol, addr, size int64, pad []byte) {
 	for i, s := range syms {
-		if !s.Attr.SubSymbol() && s.Value >= addr {
+		if s.Value >= addr && !s.Attr.SubSymbol() {
+			syms = syms[i:]
+			break
+		}
+	}
+
+	var wg sync.WaitGroup
+	max, lastAddr, written := int64(blockSize), addr+size, int64(0)
+	for addr < lastAddr {
+		// Find the last symbol we'd write.
+		idx := -1
+		length := int64(0)
+		for i, s := range syms {
+			// If the next symbol's size would put us out of bounds on the total length,
+			// stop looking.
+			if s.Value+s.Size > lastAddr {
+				break
+			}
+
+			// We're gonna write this symbol.
+			idx = i
+			length = s.Value + s.Size - addr
+
+			// If we cross over the max size, we've got enough symbols.
+			if s.Value+s.Size > addr+max {
+				break
+			}
+		}
+
+		// If we didn't find any symbols to write, we're done here.
+		if idx < 0 {
+			break
+		}
+
+		// Start the block output operator.
+		if o, err := out.View(uint64(out.Offset() + written)); err == nil {
+			sem <- 1
+			wg.Add(1)
+			go func(o *OutBuf, syms []*sym.Symbol, addr, size int64, pad []byte) {
+				writeBlock(o, syms, addr, size, pad)
+				wg.Done()
+				<-sem
+			}(o, syms, addr, length, pad)
+		} else { // output not mmaped, don't parallelize.
+			writeBlock(out, syms, addr, length, pad)
+		}
+
+		// Prepare for the next loop.
+		if idx != -1 {
+			syms = syms[idx+1:]
+		}
+		written += length
+		addr += length
+	}
+	wg.Wait()
+}
+
+func writeBlock(out *OutBuf, syms []*sym.Symbol, addr, size int64, pad []byte) {
+	for i, s := range syms {
+		if s.Value >= addr && !s.Attr.SubSymbol() {
 			syms = syms[i:]
 			break
 		}
@@ -841,13 +908,32 @@
 	if addr < eaddr {
 		out.WriteStringPad("", int(eaddr-addr), pad)
 	}
-	out.Flush()
+}
+
+type writeFn func(*Link, *OutBuf, int64, int64)
+
+// WriteParallel handles scheduling parallel execution of data write functions.
+func WriteParallel(wg *sync.WaitGroup, fn writeFn, ctxt *Link, seek, vaddr, length uint64) {
+	if out, err := ctxt.Out.View(seek); err != nil {
+		ctxt.Out.SeekSet(int64(seek))
+		fn(ctxt, ctxt.Out, int64(vaddr), int64(length))
+	} else {
+		wg.Add(1)
+		go func() {
+			defer wg.Done()
+			fn(ctxt, out, int64(vaddr), int64(length))
+		}()
+	}
 }
 
 func Datblk(ctxt *Link, addr int64, size int64) {
 	writeDatblkToOutBuf(ctxt, ctxt.Out, addr, size)
 }
 
+func Datblk2(ctxt *Link, out *OutBuf, addr, size int64) {
+	writeDatblkToOutBuf(ctxt, out, addr, size)
+}
+
 // Used only on Wasm for now.
 func DatblkBytes(ctxt *Link, addr int64, size int64) []byte {
 	buf := bytes.NewBuffer(make([]byte, 0, size))
@@ -862,7 +948,7 @@
 		ctxt.Logf("datblk [%#x,%#x) at offset %#x\n", addr, addr+size, ctxt.Out.Offset())
 	}
 
-	blk(out, ctxt.datap, addr, size, zeros[:])
+	writeBlocks(out, ctxt.outSem, ctxt.datap, addr, size, zeros[:])
 
 	/* again for printing */
 	if !*flagA {
@@ -931,12 +1017,20 @@
 	ctxt.Logf("\t%.8x|\n", uint(eaddr))
 }
 
+func Dwarfblk2(ctxt *Link, out *OutBuf, addr int64, size int64) {
+	if *flagA {
+		ctxt.Logf("dwarfblk [%#x,%#x) at offset %#x\n", addr, addr+size, ctxt.Out.Offset())
+	}
+
+	writeBlocks(out, ctxt.outSem, dwarfp, addr, size, zeros[:])
+}
+
 func Dwarfblk(ctxt *Link, addr int64, size int64) {
 	if *flagA {
 		ctxt.Logf("dwarfblk [%#x,%#x) at offset %#x\n", addr, addr+size, ctxt.Out.Offset())
 	}
 
-	blk(ctxt.Out, dwarfp, addr, size, zeros[:])
+	writeBlock(ctxt.Out, dwarfp, addr, size, zeros[:])
 }
 
 var zeros [512]byte
diff --git a/src/cmd/link/internal/ld/lib.go b/src/cmd/link/internal/ld/lib.go
index 692cf4f..0d6cdab 100644
--- a/src/cmd/link/internal/ld/lib.go
+++ b/src/cmd/link/internal/ld/lib.go
@@ -31,7 +31,6 @@
 package ld
 
 import (
-	"bufio"
 	"bytes"
 	"cmd/internal/bio"
 	"cmd/internal/obj"
@@ -385,14 +384,11 @@
 	Lflag(ctxt, filepath.Join(objabi.GOROOT, "pkg", fmt.Sprintf("%s_%s%s%s", objabi.GOOS, objabi.GOARCH, suffixsep, suffix)))
 
 	mayberemoveoutfile()
-	f, err := os.OpenFile(*flagOutfile, os.O_RDWR|os.O_CREATE|os.O_TRUNC, 0775)
-	if err != nil {
+
+	if err := ctxt.Out.Open(*flagOutfile); err != nil {
 		Exitf("cannot create %s: %v", *flagOutfile, err)
 	}
 
-	ctxt.Out.w = bufio.NewWriter(f)
-	ctxt.Out.f = f
-
 	if *flagEntrySymbol == "" {
 		switch ctxt.BuildMode {
 		case BuildModeCShared, BuildModeCArchive:
@@ -1203,25 +1199,21 @@
 		*flagTmpdir = dir
 		ownTmpDir = true
 		AtExit(func() {
-			ctxt.Out.f.Close()
+			ctxt.Out.Close()
 			os.RemoveAll(*flagTmpdir)
 		})
 	}
 
 	// change our output to temporary object file
-	ctxt.Out.f.Close()
+	if err := ctxt.Out.Close(); err != nil {
+		Exitf("error closing output file")
+	}
 	mayberemoveoutfile()
 
 	p := filepath.Join(*flagTmpdir, "go.o")
-	var err error
-	f, err := os.OpenFile(p, os.O_RDWR|os.O_CREATE|os.O_TRUNC, 0775)
-	if err != nil {
+	if err := ctxt.Out.Open(p); err != nil {
 		Exitf("cannot create %s: %v", p, err)
 	}
-
-	ctxt.Out.w = bufio.NewWriter(f)
-	ctxt.Out.f = f
-	ctxt.Out.off = 0
 }
 
 // hostobjCopy creates a copy of the object files in hostobj in a
@@ -1305,11 +1297,9 @@
 
 	// Force the buffer to flush here so that external
 	// tools will see a complete file.
-	ctxt.Out.Flush()
-	if err := ctxt.Out.f.Close(); err != nil {
-		Exitf("close: %v", err)
+	if err := ctxt.Out.Close(); err != nil {
+		Exitf("error closing %v", *flagOutfile)
 	}
-	ctxt.Out.f = nil
 
 	argv := []string{*flagExtar, "-q", "-c", "-s"}
 	if ctxt.HeadType == objabi.Haix {
diff --git a/src/cmd/link/internal/ld/link.go b/src/cmd/link/internal/ld/link.go
index b32b7c8..24866d8 100644
--- a/src/cmd/link/internal/ld/link.go
+++ b/src/cmd/link/internal/ld/link.go
@@ -53,7 +53,9 @@
 	Target
 	ErrorReporter
 	ArchSyms
-	Out *OutBuf
+
+	outSem chan int // limits the number of output writers
+	Out    *OutBuf
 
 	Syms *sym.Symbols
 
diff --git a/src/cmd/link/internal/ld/outbuf.go b/src/cmd/link/internal/ld/outbuf.go
index f8e65ef..8e08347 100644
--- a/src/cmd/link/internal/ld/outbuf.go
+++ b/src/cmd/link/internal/ld/outbuf.go
@@ -9,6 +9,7 @@
 	"cmd/internal/sys"
 	"cmd/link/internal/sym"
 	"encoding/binary"
+	"errors"
 	"log"
 	"os"
 )
@@ -23,19 +24,100 @@
 // Second, it provides a very cheap offset counter that doesn't require
 // any system calls to read the value.
 //
-// It also mmaps the output file (if available). The intended usage is:
+// Third, it also mmaps the output file (if available). The intended usage is:
 // - Mmap the output file
 // - Write the content
 // - possibly apply any edits in the output buffer
 // - Munmap the output file
 // - possibly write more content to the file, which will not be edited later.
+//
+// And finally, it provides a mechanism by which you can multithread the
+// writing of output files. This mechanism is accomplished by copying a OutBuf,
+// and using it in the thread/goroutine.
+//
+// Parallel OutBuf is intended to be used like:
+//
+//  func write(out *OutBuf) {
+//    var wg sync.WaitGroup
+//    for i := 0; i < 10; i++ {
+//      wg.Add(1)
+//      view, err := out.View(start[i])
+//      if err != nil {
+//         // handle output
+//         continue
+//      }
+//      go func(out *OutBuf, i int) {
+//        // do output
+//        wg.Done()
+//      }(view, i)
+//    }
+//    wg.Wait()
+//  }
 type OutBuf struct {
-	arch   *sys.Arch
-	off    int64
-	w      *bufio.Writer
-	buf    []byte // backing store of mmap'd output file
-	f      *os.File
-	encbuf [8]byte // temp buffer used by WriteN methods
+	arch          *sys.Arch
+	off           int64
+	w             *bufio.Writer
+	buf           []byte // backing store of mmap'd output file
+	name          string
+	f             *os.File
+	encbuf        [8]byte // temp buffer used by WriteN methods
+	isView        bool    // true if created from View()
+	start, length uint64  // start and length mmaped data.
+}
+
+func (out *OutBuf) Open(name string) error {
+	if out.f != nil {
+		return errors.New("cannont open more than one file")
+	}
+	f, err := os.OpenFile(name, os.O_RDWR|os.O_CREATE|os.O_TRUNC, 0775)
+	if err != nil {
+		return err
+	}
+	out.off = 0
+	out.name = name
+	out.w = bufio.NewWriter(f)
+	out.f = f
+	return nil
+}
+
+func NewOutBuf(arch *sys.Arch) *OutBuf {
+	return &OutBuf{
+		arch: arch,
+	}
+}
+
+var viewError = errors.New("output not mmapped")
+
+func (out *OutBuf) View(start uint64) (*OutBuf, error) {
+	if out.buf == nil {
+		return nil, viewError
+	}
+	return &OutBuf{
+		arch:   out.arch,
+		name:   out.name,
+		buf:    out.buf,
+		off:    int64(start),
+		start:  start,
+		length: out.length,
+		isView: true,
+	}, nil
+}
+
+var viewCloseError = errors.New("cannot Close OutBuf from View")
+
+func (out *OutBuf) Close() error {
+	if out.isView {
+		return viewCloseError
+	}
+	out.Flush()
+	if out.f == nil {
+		return nil
+	}
+	if err := out.f.Close(); err != nil {
+		return err
+	}
+	out.f = nil
+	return nil
 }
 
 func (out *OutBuf) SeekSet(p int64) {
@@ -45,7 +127,7 @@
 	if out.buf == nil {
 		out.Flush()
 		if _, err := out.f.Seek(p, 0); err != nil {
-			Exitf("seeking to %d in %s: %v", p, out.f.Name(), err)
+			Exitf("seeking to %d in %s: %v", p, out.name, err)
 		}
 	}
 	out.off = p
@@ -154,13 +236,16 @@
 // edit to the symbol content.
 // If the output file is not Mmap'd, just writes the content.
 func (out *OutBuf) WriteSym(s *sym.Symbol) {
+	// NB: We inline the Write call for speediness.
 	if out.buf != nil {
 		start := out.off
-		out.Write(s.P)
+		n := copy(out.buf[out.off:], s.P)
+		out.off += int64(n)
 		s.P = out.buf[start:out.off]
 		s.Attr.Set(sym.AttrReadOnly, false)
 	} else {
-		out.Write(s.P)
+		n, _ := out.w.Write(s.P)
+		out.off += int64(n)
 	}
 }
 
@@ -168,10 +253,11 @@
 	var err error
 	if out.buf != nil {
 		err = out.Msync()
-	} else {
+	}
+	if out.w != nil {
 		err = out.w.Flush()
 	}
 	if err != nil {
-		Exitf("flushing %s: %v", out.f.Name(), err)
+		Exitf("flushing %s: %v", out.name, err)
 	}
 }
diff --git a/src/cmd/link/internal/ld/outbuf_mmap.go b/src/cmd/link/internal/ld/outbuf_mmap.go
index 4075141..c064e96 100644
--- a/src/cmd/link/internal/ld/outbuf_mmap.go
+++ b/src/cmd/link/internal/ld/outbuf_mmap.go
@@ -16,11 +16,15 @@
 	if err != nil {
 		Exitf("resize output file failed: %v", err)
 	}
+	out.length = filesize
 	out.buf, err = syscall.Mmap(int(out.f.Fd()), 0, int(filesize), syscall.PROT_READ|syscall.PROT_WRITE, syscall.MAP_SHARED|syscall.MAP_FILE)
 	return err
 }
 
 func (out *OutBuf) Munmap() {
+	if out.buf == nil {
+		return
+	}
 	err := out.Msync()
 	if err != nil {
 		Exitf("msync output file failed: %v", err)
@@ -34,9 +38,12 @@
 }
 
 func (out *OutBuf) Msync() error {
+	if out.buf == nil || out.length <= 0 {
+		return nil
+	}
 	// TODO: netbsd supports mmap and msync, but the syscall package doesn't define MSYNC.
 	// It is excluded from the build tag for now.
-	_, _, errno := syscall.Syscall(syscall.SYS_MSYNC, uintptr(unsafe.Pointer(&out.buf[0])), uintptr(len(out.buf)), syscall.MS_SYNC)
+	_, _, errno := syscall.Syscall(syscall.SYS_MSYNC, uintptr(unsafe.Pointer(&out.buf[0])), uintptr(out.length), syscall.MS_SYNC)
 	if errno != 0 {
 		return errno
 	}
diff --git a/src/cmd/link/internal/ld/outbuf_windows.go b/src/cmd/link/internal/ld/outbuf_windows.go
index 1cb05c3..e7cda75 100644
--- a/src/cmd/link/internal/ld/outbuf_windows.go
+++ b/src/cmd/link/internal/ld/outbuf_windows.go
@@ -36,6 +36,7 @@
 		return
 	}
 	err := syscall.UnmapViewOfFile(uintptr(unsafe.Pointer(&out.buf[0])))
+	out.buf = nil
 	if err != nil {
 		Exitf("UnmapViewOfFile failed: %v", err)
 	}
diff --git a/src/cmd/link/internal/ld/sym.go b/src/cmd/link/internal/ld/sym.go
index 62e6af2..97966ed 100644
--- a/src/cmd/link/internal/ld/sym.go
+++ b/src/cmd/link/internal/ld/sym.go
@@ -36,13 +36,15 @@
 	"cmd/internal/sys"
 	"cmd/link/internal/sym"
 	"log"
+	"runtime"
 )
 
 func linknew(arch *sys.Arch) *Link {
 	ctxt := &Link{
 		Target:       Target{Arch: arch},
 		Syms:         sym.NewSymbols(),
-		Out:          &OutBuf{arch: arch},
+		outSem:       make(chan int, 2*runtime.GOMAXPROCS(0)),
+		Out:          NewOutBuf(arch),
 		LibraryByPkg: make(map[string]*sym.Library),
 	}
 
@@ -51,8 +53,8 @@
 	}
 
 	AtExit(func() {
-		if nerrors > 0 && ctxt.Out.f != nil {
-			ctxt.Out.f.Close()
+		if nerrors > 0 {
+			ctxt.Out.Close()
 			mayberemoveoutfile()
 		}
 	})
diff --git a/src/cmd/link/internal/x86/asm.go b/src/cmd/link/internal/x86/asm.go
index 30ad086..0fdd730 100644
--- a/src/cmd/link/internal/x86/asm.go
+++ b/src/cmd/link/internal/x86/asm.go
@@ -626,7 +626,7 @@
 	sect := ld.Segtext.Sections[0]
 	ctxt.Out.SeekSet(int64(sect.Vaddr - ld.Segtext.Vaddr + ld.Segtext.Fileoff))
 	// 0xCC is INT $3 - breakpoint instruction
-	ld.CodeblkPad(ctxt, int64(sect.Vaddr), int64(sect.Length), []byte{0xCC})
+	ld.CodeblkPad(ctxt, ctxt.Out, int64(sect.Vaddr), int64(sect.Length), []byte{0xCC})
 	for _, sect = range ld.Segtext.Sections[1:] {
 		ctxt.Out.SeekSet(int64(sect.Vaddr - ld.Segtext.Vaddr + ld.Segtext.Fileoff))
 		ld.Datblk(ctxt, int64(sect.Vaddr), int64(sect.Length))