internal/socket: reuse buffers in recv/sendMsgs

Use a pool to reuse internally allocated temporary buffers, i.e. the
mmsghdrs structures and the associated iovecs and sockaddr buffers.

The temporary buffers obtained from the pool are re-allocated when their
sizes are not sufficient for the current call.
The (hopefully reasonable) assumption is that RecvMsgs and SendMsgs are
called with a somewhat uniform number of Messages/Buffers. If not, the
buffers in the pool will likely be re-allocated a few times until they
reach the size required for the "biggest" caller.

This pooling can significantly reduce the temporary allocations made in
recv/sendMsgs. While the effectiveness of this change clearly depends
on the usage pattern, pressure on the GC, etc., some improvements can
already be observed in the micro-benchmark BenchmarkUDP: the number and
size of allocations per iteration decreases significantly and there is a
small but noticable improvement in time per iteration.

```
name             old time/op    new time/op    delta
UDP/Iter-1-8       5.50µs ± 2%    5.46µs ± 2%     ~     (p=0.055 n=20+20)
UDP/Batch-1-8      5.65µs ± 3%    5.56µs ± 4%   -1.68%  (p=0.011 n=20+20)
UDP/Iter-2-8       11.0µs ± 3%    11.0µs ± 3%     ~     (p=0.645 n=20+20)
UDP/Batch-2-8      8.40µs ± 3%    8.24µs ± 5%   -1.87%  (p=0.012 n=20+20)
UDP/Iter-4-8       22.0µs ± 1%    21.9µs ± 3%     ~     (p=0.437 n=17+20)
UDP/Batch-4-8      13.3µs ± 2%    12.7µs ± 2%   -4.69%  (p=0.000 n=20+20)
UDP/Iter-8-8       44.2µs ± 3%    44.0µs ± 2%     ~     (p=0.551 n=20+20)
UDP/Batch-8-8      24.2µs ± 4%    23.1µs ± 4%   -4.65%  (p=0.000 n=20+20)
UDP/Iter-16-8      87.9µs ± 4%    88.1µs ± 3%     ~     (p=0.708 n=19+20)
UDP/Batch-16-8     45.6µs ± 4%    44.1µs ± 5%   -3.12%  (p=0.000 n=20+20)
UDP/Iter-32-8       175µs ± 4%     176µs ± 4%     ~     (p=0.087 n=20+20)
UDP/Batch-32-8     87.9µs ± 1%    84.5µs ± 6%   -3.78%  (p=0.000 n=19+19)
UDP/Iter-64-8       353µs ± 3%     352µs ± 3%     ~     (p=0.414 n=20+20)
UDP/Batch-64-8      172µs ± 4%     172µs ±11%     ~     (p=0.157 n=20+20)
UDP/Iter-128-8      705µs ± 3%     699µs ± 4%     ~     (p=0.142 n=20+20)
UDP/Batch-128-8     345µs ± 2%     343µs ± 6%     ~     (p=0.134 n=20+20)
UDP/Iter-256-8     1.41ms ± 3%    1.41ms ± 4%     ~     (p=0.758 n=20+20)
UDP/Batch-256-8     692µs ± 4%     685µs ± 4%     ~     (p=0.114 n=20+20)
UDP/Iter-512-8     2.82ms ± 2%    2.81ms ± 2%     ~     (p=0.820 n=20+20)
UDP/Batch-512-8    1.27ms ± 3%    0.75ms ± 6%  -40.62%  (p=0.000 n=20+20)

name             old alloc/op   new alloc/op   delta
UDP/Iter-1-8         408B ± 0%      424B ± 0%   +3.92%  (p=0.000 n=20+20)
UDP/Batch-1-8        440B ± 0%      232B ± 0%  -47.27%  (p=0.000 n=20+20)
UDP/Iter-2-8         816B ± 0%      848B ± 0%   +3.92%  (p=0.000 n=20+20)
UDP/Batch-2-8        696B ± 0%      280B ± 0%  -59.77%  (p=0.000 n=20+20)
UDP/Iter-4-8       1.63kB ± 0%    1.70kB ± 0%   +3.92%  (p=0.000 n=20+20)
UDP/Batch-4-8      1.22kB ± 0%    0.38kB ± 0%  -68.42%  (p=0.000 n=20+20)
UDP/Iter-8-8       3.26kB ± 0%    3.39kB ± 0%   +3.92%  (p=0.000 n=20+20)
UDP/Batch-8-8      2.26kB ± 0%    0.59kB ± 0%  -73.76%  (p=0.000 n=20+20)
UDP/Iter-16-8      6.53kB ± 0%    6.78kB ± 0%   +3.92%  (p=0.000 n=20+20)
UDP/Batch-16-8     4.34kB ± 0%    1.01kB ± 0%  -76.75%  (p=0.000 n=20+20)
UDP/Iter-32-8      13.1kB ± 0%    13.6kB ± 0%   +3.92%  (p=0.000 n=20+20)
UDP/Batch-32-8     8.50kB ± 0%    1.84kB ± 0%  -78.34%  (p=0.000 n=20+18)
UDP/Iter-64-8      26.1kB ± 0%    27.1kB ± 0%   +3.92%  (p=0.000 n=20+20)
UDP/Batch-64-8     16.8kB ± 0%     3.5kB ± 0%  -79.16%  (p=0.000 n=20+20)
UDP/Iter-128-8     52.2kB ± 0%    54.3kB ± 0%   +3.92%  (p=0.000 n=20+20)
UDP/Batch-128-8    33.5kB ± 0%     6.8kB ± 0%  -79.56%  (p=0.000 n=20+16)
UDP/Iter-256-8      104kB ± 0%     109kB ± 0%   +3.92%  (p=0.000 n=20+17)
UDP/Batch-256-8    66.7kB ± 0%    13.5kB ± 0%  -79.77%  (p=0.000 n=20+20)
UDP/Iter-512-8      209kB ± 0%     217kB ± 0%   +3.92%  (p=0.000 n=20+20)
UDP/Batch-512-8     121kB ± 0%      15kB ± 0%  -87.89%  (p=0.000 n=20+20)

name             old allocs/op  new allocs/op  delta
UDP/Iter-1-8         14.0 ± 0%      14.0 ± 0%     ~     (all equal)
UDP/Batch-1-8        14.0 ± 0%       8.0 ± 0%  -42.86%  (p=0.000 n=20+20)
UDP/Iter-2-8         28.0 ± 0%      28.0 ± 0%     ~     (all equal)
UDP/Batch-2-8        20.0 ± 0%      10.0 ± 0%  -50.00%  (p=0.000 n=20+20)
UDP/Iter-4-8         56.0 ± 0%      56.0 ± 0%     ~     (all equal)
UDP/Batch-4-8        32.0 ± 0%      14.0 ± 0%  -56.25%  (p=0.000 n=20+20)
UDP/Iter-8-8          112 ± 0%       112 ± 0%     ~     (all equal)
UDP/Batch-8-8        56.0 ± 0%      22.0 ± 0%  -60.71%  (p=0.000 n=20+20)
UDP/Iter-16-8         224 ± 0%       224 ± 0%     ~     (all equal)
UDP/Batch-16-8        104 ± 0%        38 ± 0%  -63.46%  (p=0.000 n=20+20)
UDP/Iter-32-8         448 ± 0%       448 ± 0%     ~     (all equal)
UDP/Batch-32-8        200 ± 0%        70 ± 0%  -65.00%  (p=0.000 n=20+20)
UDP/Iter-64-8         896 ± 0%       896 ± 0%     ~     (all equal)
UDP/Batch-64-8        392 ± 0%       134 ± 0%  -65.82%  (p=0.000 n=20+20)
UDP/Iter-128-8      1.79k ± 0%     1.79k ± 0%     ~     (all equal)
UDP/Batch-128-8       776 ± 0%       262 ± 0%  -66.24%  (p=0.000 n=20+20)
UDP/Iter-256-8      3.58k ± 0%     3.58k ± 0%     ~     (all equal)
UDP/Batch-256-8     1.54k ± 0%     0.52k ± 0%  -66.45%  (p=0.000 n=20+20)
UDP/Iter-512-8      7.17k ± 0%     7.17k ± 0%     ~     (all equal)
UDP/Batch-512-8     2.61k ± 0%     0.56k ± 0%  -78.48%  (p=0.000 n=20+20)
```

Fixes golang/go#26838

Change-Id: Id34e362737455cd48df5bc751426be49fbc28094
GitHub-Last-Rev: 4e33d507bb0325fd2cd8b58b586081de1acdf2ad
GitHub-Pull-Request: golang/net#102
Reviewed-on: https://go-review.googlesource.com/c/net/+/315589
Run-TryBot: Tobias Klauser <tobias.klauser@gmail.com>
TryBot-Result: Go Bot <gobot@golang.org>
Trust: Tobias Klauser <tobias.klauser@gmail.com>
Reviewed-by: Ian Lance Taylor <iant@golang.org>
diff --git a/internal/socket/mmsghdr_unix.go b/internal/socket/mmsghdr_unix.go
index 5025a0f..42c4a0d 100644
--- a/internal/socket/mmsghdr_unix.go
+++ b/internal/socket/mmsghdr_unix.go
@@ -7,25 +7,13 @@
 
 package socket
 
-import "net"
+import (
+	"net"
+	"sync"
+)
 
 type mmsghdrs []mmsghdr
 
-func (hs mmsghdrs) pack(ms []Message, parseFn func([]byte, string) (net.Addr, error), marshalFn func(net.Addr) []byte) error {
-	for i := range hs {
-		vs := make([]iovec, len(ms[i].Buffers))
-		var sa []byte
-		if parseFn != nil {
-			sa = make([]byte, sizeofSockaddrInet6)
-		}
-		if marshalFn != nil {
-			sa = marshalFn(ms[i].Addr)
-		}
-		hs[i].Hdr.pack(vs, ms[i].Buffers, ms[i].OOB, sa)
-	}
-	return nil
-}
-
 func (hs mmsghdrs) unpack(ms []Message, parseFn func([]byte, string) (net.Addr, error), hint string) error {
 	for i := range hs {
 		ms[i].N = int(hs[i].Len)
@@ -41,3 +29,84 @@
 	}
 	return nil
 }
+
+// mmsghdrsPacker packs Message-slices into mmsghdrs (re-)using pre-allocated buffers.
+type mmsghdrsPacker struct {
+	// hs are the pre-allocated mmsghdrs.
+	hs mmsghdrs
+	// sockaddrs is the pre-allocated buffer for the Hdr.Name buffers.
+	// We use one large buffer for all messages and slice it up.
+	sockaddrs []byte
+	// vs are the pre-allocated iovecs.
+	// We allocate one large buffer for all messages and slice it up. This allows to reuse the buffer
+	// if the number of buffers per message is distributed differently between calls.
+	vs []iovec
+}
+
+func (p *mmsghdrsPacker) prepare(ms []Message) {
+	n := len(ms)
+	if n <= cap(p.hs) {
+		p.hs = p.hs[:n]
+	} else {
+		p.hs = make(mmsghdrs, n)
+	}
+	if n*sizeofSockaddrInet6 <= cap(p.sockaddrs) {
+		p.sockaddrs = p.sockaddrs[:n*sizeofSockaddrInet6]
+	} else {
+		p.sockaddrs = make([]byte, n*sizeofSockaddrInet6)
+	}
+
+	nb := 0
+	for _, m := range ms {
+		nb += len(m.Buffers)
+	}
+	if nb <= cap(p.vs) {
+		p.vs = p.vs[:nb]
+	} else {
+		p.vs = make([]iovec, nb)
+	}
+}
+
+func (p *mmsghdrsPacker) pack(ms []Message, parseFn func([]byte, string) (net.Addr, error), marshalFn func(net.Addr, []byte) int) mmsghdrs {
+	p.prepare(ms)
+	hs := p.hs
+	vsRest := p.vs
+	saRest := p.sockaddrs
+	for i := range hs {
+		nvs := len(ms[i].Buffers)
+		vs := vsRest[:nvs]
+		vsRest = vsRest[nvs:]
+
+		var sa []byte
+		if parseFn != nil {
+			sa = saRest[:sizeofSockaddrInet6]
+			saRest = saRest[sizeofSockaddrInet6:]
+		} else if marshalFn != nil {
+			n := marshalFn(ms[i].Addr, saRest)
+			sa = saRest[:n]
+			saRest = saRest[n:]
+		}
+		hs[i].Hdr.pack(vs, ms[i].Buffers, ms[i].OOB, sa)
+	}
+	return hs
+}
+
+var defaultMmsghdrsPool = mmsghdrsPool{
+	p: sync.Pool{
+		New: func() interface{} {
+			return new(mmsghdrsPacker)
+		},
+	},
+}
+
+type mmsghdrsPool struct {
+	p sync.Pool
+}
+
+func (p *mmsghdrsPool) Get() *mmsghdrsPacker {
+	return p.p.Get().(*mmsghdrsPacker)
+}
+
+func (p *mmsghdrsPool) Put(packer *mmsghdrsPacker) {
+	p.p.Put(packer)
+}
diff --git a/internal/socket/rawconn_mmsg.go b/internal/socket/rawconn_mmsg.go
index 5d90de1..d80a15c 100644
--- a/internal/socket/rawconn_mmsg.go
+++ b/internal/socket/rawconn_mmsg.go
@@ -17,14 +17,13 @@
 	for i := range ms {
 		ms[i].raceWrite()
 	}
-	hs := make(mmsghdrs, len(ms))
+	packer := defaultMmsghdrsPool.Get()
+	defer defaultMmsghdrsPool.Put(packer)
 	var parseFn func([]byte, string) (net.Addr, error)
 	if c.network != "tcp" {
 		parseFn = parseInetAddr
 	}
-	if err := hs.pack(ms, parseFn, nil); err != nil {
-		return 0, err
-	}
+	hs := packer.pack(ms, parseFn, nil)
 	var operr error
 	var n int
 	fn := func(s uintptr) bool {
@@ -50,14 +49,13 @@
 	for i := range ms {
 		ms[i].raceRead()
 	}
-	hs := make(mmsghdrs, len(ms))
-	var marshalFn func(net.Addr) []byte
+	packer := defaultMmsghdrsPool.Get()
+	defer defaultMmsghdrsPool.Put(packer)
+	var marshalFn func(net.Addr, []byte) int
 	if c.network != "tcp" {
 		marshalFn = marshalInetAddr
 	}
-	if err := hs.pack(ms, nil, marshalFn); err != nil {
-		return 0, err
-	}
+	hs := packer.pack(ms, nil, marshalFn)
 	var operr error
 	var n int
 	fn := func(s uintptr) bool {
diff --git a/internal/socket/rawconn_msg.go b/internal/socket/rawconn_msg.go
index dfed9a8..2e2d61b 100644
--- a/internal/socket/rawconn_msg.go
+++ b/internal/socket/rawconn_msg.go
@@ -55,7 +55,9 @@
 	vs := make([]iovec, len(m.Buffers))
 	var sa []byte
 	if m.Addr != nil {
-		sa = marshalInetAddr(m.Addr)
+		var a [sizeofSockaddrInet6]byte
+		n := marshalInetAddr(m.Addr, a[:])
+		sa = a[:n]
 	}
 	h.pack(vs, m.Buffers, m.OOB, sa)
 	var operr error
diff --git a/internal/socket/sys_posix.go b/internal/socket/sys_posix.go
index d8dda77..42b8f23 100644
--- a/internal/socket/sys_posix.go
+++ b/internal/socket/sys_posix.go
@@ -17,22 +17,24 @@
 	"time"
 )
 
-func marshalInetAddr(a net.Addr) []byte {
+// marshalInetAddr writes a in sockaddr format into the buffer b.
+// The buffer must be sufficiently large (sizeofSockaddrInet4/6).
+// Returns the number of bytes written.
+func marshalInetAddr(a net.Addr, b []byte) int {
 	switch a := a.(type) {
 	case *net.TCPAddr:
-		return marshalSockaddr(a.IP, a.Port, a.Zone)
+		return marshalSockaddr(a.IP, a.Port, a.Zone, b)
 	case *net.UDPAddr:
-		return marshalSockaddr(a.IP, a.Port, a.Zone)
+		return marshalSockaddr(a.IP, a.Port, a.Zone, b)
 	case *net.IPAddr:
-		return marshalSockaddr(a.IP, 0, a.Zone)
+		return marshalSockaddr(a.IP, 0, a.Zone, b)
 	default:
-		return nil
+		return 0
 	}
 }
 
-func marshalSockaddr(ip net.IP, port int, zone string) []byte {
+func marshalSockaddr(ip net.IP, port int, zone string, b []byte) int {
 	if ip4 := ip.To4(); ip4 != nil {
-		b := make([]byte, sizeofSockaddrInet4)
 		switch runtime.GOOS {
 		case "android", "illumos", "linux", "solaris", "windows":
 			NativeEndian.PutUint16(b[:2], uint16(sysAF_INET))
@@ -42,10 +44,9 @@
 		}
 		binary.BigEndian.PutUint16(b[2:4], uint16(port))
 		copy(b[4:8], ip4)
-		return b
+		return sizeofSockaddrInet4
 	}
 	if ip6 := ip.To16(); ip6 != nil && ip.To4() == nil {
-		b := make([]byte, sizeofSockaddrInet6)
 		switch runtime.GOOS {
 		case "android", "illumos", "linux", "solaris", "windows":
 			NativeEndian.PutUint16(b[:2], uint16(sysAF_INET6))
@@ -58,9 +59,9 @@
 		if zone != "" {
 			NativeEndian.PutUint32(b[24:28], uint32(zoneCache.index(zone)))
 		}
-		return b
+		return sizeofSockaddrInet6
 	}
-	return nil
+	return 0
 }
 
 func parseInetAddr(b []byte, network string) (net.Addr, error) {