chacha20: improve generic implementation performance

A few small changes that improve the throughput of the generic (noasm) chacha20 implementation by 4-11%.

name                 old time/op   new time/op   delta
ChaCha20/64-12         142ns ± 1%    132ns ± 1%   -7.09%  (p=0.000 n=20+20)
ChaCha20/256-12        485ns ± 1%    441ns ± 1%   -9.08%  (p=0.000 n=20+20)
ChaCha20/10x25-12      822ns ± 1%    785ns ± 1%   -4.42%  (p=0.000 n=20+20)
ChaCha20/4096-12      7.74µs ± 1%   7.06µs ± 1%   -8.80%  (p=0.000 n=20+19)
ChaCha20/100x40-12    8.65µs ± 1%   8.08µs ± 1%   -6.54%  (p=0.000 n=19+20)
ChaCha20/65536-12      118µs ± 1%    106µs ± 1%  -10.04%  (p=0.000 n=19+20)
ChaCha20/1000x65-12    120µs ± 1%    108µs ± 0%   -9.84%  (p=0.000 n=19+19)

name                 old speed     new speed     delta
ChaCha20/64-12       450MB/s ± 1%  484MB/s ± 1%   +7.61%  (p=0.000 n=19+19)
ChaCha20/256-12      527MB/s ± 1%  580MB/s ± 1%  +10.03%  (p=0.000 n=20+20)
ChaCha20/10x25-12    304MB/s ± 1%  318MB/s ± 1%   +4.62%  (p=0.000 n=20+20)
ChaCha20/4096-12     529MB/s ± 1%  580MB/s ± 1%   +9.70%  (p=0.000 n=19+20)
ChaCha20/100x40-12   463MB/s ± 1%  495MB/s ± 1%   +7.00%  (p=0.000 n=19+20)
ChaCha20/65536-12    556MB/s ± 1%  618MB/s ± 1%  +11.16%  (p=0.000 n=19+20)
ChaCha20/1000x65-12  542MB/s ± 1%  602MB/s ± 0%  +10.92%  (p=0.000 n=19+19)

Redundant bounds checks when loading the key and nonce as uint32s in
newUnauthenticatedCipher are eliminated.

In the main block loop, two changes were made:
1. Specialise the xor function to addXor, so the final block state variables don't need to be
separately incremented by the initial block state before XORing with the input block.
2. Change the loop condition to be bound by len(src) >= 64 and len(dst) >= 64.
This eliminates all bounds checks later in the load-xor-store stage.
Because the loop condition now depends on the remaining lengths, src and dst
are resliced in place after each block.

Change-Id: Ic3165a556bf7cb2d71349d534cdd21d06d9a7a2c
GitHub-Last-Rev: b3b1a185004c551919a8984547a5bac0283919a1
GitHub-Pull-Request: golang/crypto#131
Reviewed-on: https://go-review.googlesource.com/c/crypto/+/228618
Run-TryBot: Filippo Valsorda <filippo@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Filippo Valsorda <filippo@golang.org>
diff --git a/chacha20/chacha_generic.go b/chacha20/chacha_generic.go
index 7c498e9..18c8bc0 100644
--- a/chacha20/chacha_generic.go
+++ b/chacha20/chacha_generic.go
@@ -89,6 +89,7 @@
 		return nil, errors.New("chacha20: wrong nonce size")
 	}
 
+	key, nonce = key[:KeySize], nonce[:NonceSize] // bounds check elimination hint
 	c.key = [8]uint32{
 		binary.LittleEndian.Uint32(key[0:4]),
 		binary.LittleEndian.Uint32(key[4:8]),
@@ -260,7 +261,9 @@
 		s.precompDone = true
 	}
 
-	for i := 0; i < len(src); i += blockSize {
+	// A condition of len(src) > 0 would be sufficient, but this also
+	// acts as a bounds check elimination hint.
+	for len(src) >= 64 && len(dst) >= 64 {
 		// The remainder of the first column round.
 		fcr0, fcr4, fcr8, fcr12 := quarterRound(c0, c4, c8, s.counter)
 
@@ -285,49 +288,31 @@
 			x3, x4, x9, x14 = quarterRound(x3, x4, x9, x14)
 		}
 
-		// Finally, add back the initial state to generate the key stream.
-		x0 += c0
-		x1 += c1
-		x2 += c2
-		x3 += c3
-		x4 += c4
-		x5 += c5
-		x6 += c6
-		x7 += c7
-		x8 += c8
-		x9 += c9
-		x10 += c10
-		x11 += c11
-		x12 += s.counter
-		x13 += c13
-		x14 += c14
-		x15 += c15
+		// Add back the initial state to generate the key stream, then
+		// XOR the key stream with the source and write out the result.
+		addXor(dst[0:4], src[0:4], x0, c0)
+		addXor(dst[4:8], src[4:8], x1, c1)
+		addXor(dst[8:12], src[8:12], x2, c2)
+		addXor(dst[12:16], src[12:16], x3, c3)
+		addXor(dst[16:20], src[16:20], x4, c4)
+		addXor(dst[20:24], src[20:24], x5, c5)
+		addXor(dst[24:28], src[24:28], x6, c6)
+		addXor(dst[28:32], src[28:32], x7, c7)
+		addXor(dst[32:36], src[32:36], x8, c8)
+		addXor(dst[36:40], src[36:40], x9, c9)
+		addXor(dst[40:44], src[40:44], x10, c10)
+		addXor(dst[44:48], src[44:48], x11, c11)
+		addXor(dst[48:52], src[48:52], x12, s.counter)
+		addXor(dst[52:56], src[52:56], x13, c13)
+		addXor(dst[56:60], src[56:60], x14, c14)
+		addXor(dst[60:64], src[60:64], x15, c15)
 
 		s.counter += 1
 		if s.counter == 0 {
 			panic("chacha20: internal error: counter overflow")
 		}
 
-		in, out := src[i:], dst[i:]
-		in, out = in[:blockSize], out[:blockSize] // bounds check elimination hint
-
-		// XOR the key stream with the source and write out the result.
-		xor(out[0:], in[0:], x0)
-		xor(out[4:], in[4:], x1)
-		xor(out[8:], in[8:], x2)
-		xor(out[12:], in[12:], x3)
-		xor(out[16:], in[16:], x4)
-		xor(out[20:], in[20:], x5)
-		xor(out[24:], in[24:], x6)
-		xor(out[28:], in[28:], x7)
-		xor(out[32:], in[32:], x8)
-		xor(out[36:], in[36:], x9)
-		xor(out[40:], in[40:], x10)
-		xor(out[44:], in[44:], x11)
-		xor(out[48:], in[48:], x12)
-		xor(out[52:], in[52:], x13)
-		xor(out[56:], in[56:], x14)
-		xor(out[60:], in[60:], x15)
+		src, dst = src[blockSize:], dst[blockSize:]
 	}
 }
 
diff --git a/chacha20/xor.go b/chacha20/xor.go
index 0110c98..c2d0485 100644
--- a/chacha20/xor.go
+++ b/chacha20/xor.go
@@ -13,10 +13,10 @@
 	runtime.GOARCH == "ppc64le" ||
 	runtime.GOARCH == "s390x"
 
-// xor reads a little endian uint32 from src, XORs it with u and
+// addXor reads a little endian uint32 from src, XORs it with (a + b) and
 // places the result in little endian byte order in dst.
-func xor(dst, src []byte, u uint32) {
-	_, _ = src[3], dst[3] // eliminate bounds checks
+func addXor(dst, src []byte, a, b uint32) {
+	_, _ = src[3], dst[3] // bounds check elimination hint
 	if unaligned {
 		// The compiler should optimize this code into
 		// 32-bit unaligned little endian loads and stores.
@@ -27,15 +27,16 @@
 		v |= uint32(src[1]) << 8
 		v |= uint32(src[2]) << 16
 		v |= uint32(src[3]) << 24
-		v ^= u
+		v ^= a + b
 		dst[0] = byte(v)
 		dst[1] = byte(v >> 8)
 		dst[2] = byte(v >> 16)
 		dst[3] = byte(v >> 24)
 	} else {
-		dst[0] = src[0] ^ byte(u)
-		dst[1] = src[1] ^ byte(u>>8)
-		dst[2] = src[2] ^ byte(u>>16)
-		dst[3] = src[3] ^ byte(u>>24)
+		a += b
+		dst[0] = src[0] ^ byte(a)
+		dst[1] = src[1] ^ byte(a>>8)
+		dst[2] = src[2] ^ byte(a>>16)
+		dst[3] = src[3] ^ byte(a>>24)
 	}
 }