hash/fnv: use bits.Mul64 for 128-bit hash

Replace the 128-bit multiplication in 4 parts with bits.Mul64
and two single-width multiplications.  This simplifies the code
and increases throughput by ~50% on amd64.

name         old time/op   new time/op   delta
Fnv128KB-4    9.64µs ± 0%   6.09µs ± 0%  -36.89%  (p=0.016 n=4+5)
Fnv128aKB-4   9.11µs ± 0%   6.17µs ± 5%  -32.32%  (p=0.008 n=5+5)

name         old speed     new speed     delta
Fnv128KB-4   106MB/s ± 0%  168MB/s ± 0%  +58.44%  (p=0.016 n=4+5)
Fnv128aKB-4  112MB/s ± 0%  166MB/s ± 5%  +47.85%  (p=0.008 n=5+5)

Change-Id: Id752f2a20ea3de23a41e08db89eecf2bb60b7e6d
Reviewed-on: https://go-review.googlesource.com/c/133936
Run-TryBot: Matt Layher <mdlayher@gmail.com>
Run-TryBot: Robert Griesemer <gri@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Matt Layher <mdlayher@gmail.com>
Reviewed-by: Robert Griesemer <gri@golang.org>
diff --git a/src/hash/fnv/fnv.go b/src/hash/fnv/fnv.go
index 7662315..0fce177 100644
--- a/src/hash/fnv/fnv.go
+++ b/src/hash/fnv/fnv.go
@@ -15,6 +15,7 @@
 import (
 	"errors"
 	"hash"
+	"math/bits"
 )
 
 type (
@@ -137,18 +138,12 @@
 
 func (s *sum128) Write(data []byte) (int, error) {
 	for _, c := range data {
-		// Compute the multiplication in 4 parts to simplify carrying
-		s1l := (s[1] & 0xffffffff) * prime128Lower
-		s1h := (s[1] >> 32) * prime128Lower
-		s0l := (s[0]&0xffffffff)*prime128Lower + (s[1]&0xffffffff)<<prime128Shift
-		s0h := (s[0]>>32)*prime128Lower + (s[1]>>32)<<prime128Shift
-		// Carries
-		s1h += s1l >> 32
-		s0l += s1h >> 32
-		s0h += s0l >> 32
+		// Compute the multiplication
+		s0, s1 := bits.Mul64(prime128Lower, s[1])
+		s0 += s[1]<<prime128Shift + prime128Lower*s[0]
 		// Update the values
-		s[1] = (s1l & 0xffffffff) + (s1h << 32)
-		s[0] = (s0l & 0xffffffff) + (s0h << 32)
+		s[1] = s1
+		s[0] = s0
 		s[1] ^= uint64(c)
 	}
 	return len(data), nil
@@ -157,18 +152,12 @@
 func (s *sum128a) Write(data []byte) (int, error) {
 	for _, c := range data {
 		s[1] ^= uint64(c)
-		// Compute the multiplication in 4 parts to simplify carrying
-		s1l := (s[1] & 0xffffffff) * prime128Lower
-		s1h := (s[1] >> 32) * prime128Lower
-		s0l := (s[0]&0xffffffff)*prime128Lower + (s[1]&0xffffffff)<<prime128Shift
-		s0h := (s[0]>>32)*prime128Lower + (s[1]>>32)<<prime128Shift
-		// Carries
-		s1h += s1l >> 32
-		s0l += s1h >> 32
-		s0h += s0l >> 32
+		// Compute the multiplication
+		s0, s1 := bits.Mul64(prime128Lower, s[1])
+		s0 += s[1]<<prime128Shift + prime128Lower*s[0]
 		// Update the values
-		s[1] = (s1l & 0xffffffff) + (s1h << 32)
-		s[0] = (s0l & 0xffffffff) + (s0h << 32)
+		s[1] = s1
+		s[0] = s0
 	}
 	return len(data), nil
 }