internal/impl: avoid inlining fixed coderFieldInfo array

Any fixed size chosen for the inlined coderFieldInfo array will be
wrong for some messages, either under-estimating or over-estimating
the field count. The former causes worse caching behavior, while the
latter wastes memory.

As a middle ground, just pre-allocate a slice of the exact length.
The elements will have memory locality with one another, but are not
guaranteed to have memory locality with the parent coderMessageInfo.

name                            old time/op  new time/op  delta
EmptyMessage/Wire/Marshal-8     43.1ns ±11%  42.6ns ± 8%  -1.32%  (p=0.036 n=50+49)
EmptyMessage/Wire/Unmarshal-8   18.6ns ±10%  18.9ns ±12%    ~     (p=0.054 n=50+50)
EmptyMessage/Wire/Validate-8    15.0ns ± 9%  14.7ns ±10%  -2.44%  (p=0.002 n=50+45)
EmptyMessage/Clone-8             163ns ±20%   149ns ±19%  -8.58%  (p=0.000 n=48+53)
RepeatedInt32/Wire/Marshal-8    4.27µs ±12%  4.24µs ±13%    ~     (p=0.612 n=48+52)
RepeatedInt32/Wire/Unmarshal-8  3.47µs ±14%  3.50µs ±11%    ~     (p=0.217 n=50+53)
RepeatedInt32/Wire/Validate-8   2.12µs ±12%  2.09µs ± 9%    ~     (p=0.121 n=50+51)
RepeatedInt32/Clone-8           3.04µs ±18%  2.98µs ±36%    ~     (p=0.289 n=51+54)
Required/Wire/Marshal-8          281ns ±14%   276ns ±11%    ~     (p=0.059 n=48+55)
Required/Wire/Unmarshal-8        117ns ±14%   118ns ±11%    ~     (p=0.358 n=49+53)
Required/Wire/Validate-8        87.6ns ± 9%  88.0ns ±12%    ~     (p=0.373 n=48+53)
Required/Clone-8                 533ns ±12%   507ns ±15%  -4.71%  (p=0.000 n=49+54)

Change-Id: I4cf3134e424130bee728b7591127e5c80f07e2db
Reviewed-on: https://go-review.googlesource.com/c/protobuf/+/232937
Reviewed-by: Damien Neil <dneil@google.com>
diff --git a/internal/impl/codec_message.go b/internal/impl/codec_message.go
index 29ed59b..0e176d5 100644
--- a/internal/impl/codec_message.go
+++ b/internal/impl/codec_message.go
@@ -31,11 +31,6 @@
 	needsInitCheck     bool
 	isMessageSet       bool
 	numRequiredFields  uint8
-
-	// Include space for a number of coderFieldInfos to improve cache locality.
-	// The number of entries is chosen through a combination of guesswork and
-	// empirical testing.
-	coderFieldBuf [32]coderFieldInfo
 }
 
 type coderFieldInfo struct {
@@ -58,7 +53,7 @@
 
 	mi.coderFields = make(map[protowire.Number]*coderFieldInfo)
 	fields := mi.Desc.Fields()
-	preallocFields := mi.coderFieldBuf[:]
+	preallocFields := make([]coderFieldInfo, fields.Len())
 	for i := 0; i < fields.Len(); i++ {
 		fd := fields.Get(i)
 
@@ -87,13 +82,7 @@
 			fieldOffset = offsetOf(fs, mi.Exporter)
 			childMessage, funcs = fieldCoder(fd, ft)
 		}
-		var cf *coderFieldInfo
-		if len(preallocFields) > 0 {
-			cf = &preallocFields[0]
-			preallocFields = preallocFields[1:]
-		} else {
-			cf = new(coderFieldInfo)
-		}
+		cf := &preallocFields[i]
 		*cf = coderFieldInfo{
 			num:        fd.Number(),
 			offset:     fieldOffset,