sync: avoid a dynamic check in WaitGroup on 64-bit architectures

uint64 is guaranteed by the compiler to be aligned on 64-bit archs.
By using uint64+uint32 instead of [3]uint32 we can make use of the
guaranteed alignment to avoid the run-time alignment check.

On linux/amd64:

name                     old time/op    new time/op    delta
WaitGroupUncontended-4     8.84ns ± 3%    7.62ns ± 4%  -13.72%  (p=0.000 n=17+18)
WaitGroupAddDone-4         66.8ns ± 3%    45.9ns ± 2%  -31.31%  (p=0.000 n=20+18)
WaitGroupAddDoneWork-4     79.2ns ± 1%    56.6ns ± 1%  -28.54%  (p=0.000 n=17+20)
WaitGroupWait-4            2.83ns ± 2%    2.58ns ± 2%   -9.05%  (p=0.000 n=18+20)
WaitGroupWaitWork-4        16.8ns ± 6%    16.5ns ± 6%     ~     (p=0.072 n=20+18)
WaitGroupActuallyWait-4     263ns ± 2%     261ns ± 5%     ~     (p=0.063 n=18+20)

Change-Id: I314340f2ed8a47d8b9c15f8a3b07e41f252f4831
Reviewed-on: https://go-review.googlesource.com/c/go/+/189837
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: Ian Lance Taylor <iant@golang.org>
Run-TryBot: Cherry Mui <cherryyz@google.com>
TryBot-Result: Go Bot <gobot@golang.org>
diff --git a/src/sync/waitgroup.go b/src/sync/waitgroup.go
index e81a493..9c6662d 100644
--- a/src/sync/waitgroup.go
+++ b/src/sync/waitgroup.go
@@ -22,18 +22,24 @@
 
 	// 64-bit value: high 32 bits are counter, low 32 bits are waiter count.
 	// 64-bit atomic operations require 64-bit alignment, but 32-bit
-	// compilers do not ensure it. So we allocate 12 bytes and then use
-	// the aligned 8 bytes in them as state, and the other 4 as storage
-	// for the sema.
-	state1 [3]uint32
+	// compilers only guarantee that 64-bit fields are 32-bit aligned.
+	// For this reason on 32 bit architectures we need to check in state()
+	// if state1 is aligned or not, and dynamically "swap" the field order if
+	// needed.
+	state1 uint64
+	state2 uint32
 }
 
-// state returns pointers to the state and sema fields stored within wg.state1.
+// state returns pointers to the state and sema fields stored within wg.state*.
 func (wg *WaitGroup) state() (statep *uint64, semap *uint32) {
-	if uintptr(unsafe.Pointer(&wg.state1))%8 == 0 {
-		return (*uint64)(unsafe.Pointer(&wg.state1)), &wg.state1[2]
+	if unsafe.Alignof(wg.state1) == 8 || uintptr(unsafe.Pointer(&wg.state1))%8 == 0 {
+		// state1 is 64-bit aligned: nothing to do.
+		return &wg.state1, &wg.state2
 	} else {
-		return (*uint64)(unsafe.Pointer(&wg.state1[1])), &wg.state1[0]
+		// state1 is 32-bit aligned but not 64-bit aligned: this means that
+		// (&state1)+4 is 64-bit aligned.
+		state := (*[3]uint32)(unsafe.Pointer(&wg.state1))
+		return (*uint64)(unsafe.Pointer(&state[1])), &state[0]
 	}
 }