chacha20: improve generic implementation performance

Some small changes to improve the computation throughput of the noasm
chacha20 implementation by 4-11%.

name                 old time/op    new time/op    delta
ChaCha20/64-12       142ns ± 1%     132ns ± 1%     -7.09%  (p=0.000 n=20+20)
ChaCha20/256-12      485ns ± 1%     441ns ± 1%     -9.08%  (p=0.000 n=20+20)
ChaCha20/10x25-12    822ns ± 1%     785ns ± 1%     -4.42%  (p=0.000 n=20+20)
ChaCha20/4096-12     484ns ± 1%     442ns ± 1%     -8.80%  (p=0.000 n=20+19)
ChaCha20/100x40-12   8.65µs ± 1%    8.08µs ± 1%    -6.54%  (p=0.000 n=19+20)
ChaCha20/65536-12    118µs ± 1%     106µs ± 1%     -10.04%  (p=0.000 n=19+20)
ChaCha20/1000x65-12  120µs ± 1%     108µs ± 0%     -9.84%  (p=0.000 n=19+19)

name                 old speed      new speed      delta
ChaCha20/64-12       450MB/s ± 1%   484MB/s ± 1%   +7.61%  (p=0.000 n=19+19)
ChaCha20/256-12      527MB/s ± 1%   580MB/s ± 1%   +10.03%  (p=0.000 n=20+20)
ChaCha20/10x25-12    304MB/s ± 1%   318MB/s ± 1%   +4.62%  (p=0.000 n=20+20)
ChaCha20/4096-12     529MB/s ± 1%   580MB/s ± 1%   +9.70%  (p=0.000 n=19+20)
ChaCha20/100x40-12   463MB/s ± 1%   495MB/s ± 1%   +7.00%  (p=0.000 n=19+20)
ChaCha20/65536-12    556MB/s ± 1%   618MB/s ± 1%   +11.16%  (p=0.000 n=19+20)
ChaCha20/1000x65-12  542MB/s ± 1%   602MB/s ± 0%   +10.92%  (p=0.000 n=19+19)

Redundant bounds checks on loading the key and nonce as uint32s in
newUnauthenticatedCipher are eliminated.

In the main block loop, two changes were made:

1. Specialise the xor function to addXor, so the final block state
   variables don't need to be separately incremented by the initial block
   state before XORing with the input block.

2. Change the loop condition to be bound by len(src) >= 64 and
   len(dst) >= 64. This eliminates all bounds checks later in the
   load-xor-store stage. As a result, src and dst are resliced in-place
   after each block.
Change-Id: Ic3165a556bf7cb2d71349d534cdd21d06d9a7a2c
GitHub-Last-Rev: b3b1a185004c551919a8984547a5bac0283919a1
GitHub-Pull-Request: golang/crypto#131
Reviewed-on: https://go-review.googlesource.com/c/crypto/+/228618
Run-TryBot: Filippo Valsorda <filippo@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Filippo Valsorda <filippo@golang.org>

commit: 3c4aac89819a5fdc28d906456729d3423fd46969 [log] [tgz]
author: Andy Wang <cbeuw.andy@gmail.com> Sat Apr 18 01:52:37 2020 +0000
committer: Filippo Valsorda <filippo@golang.org> Mon Apr 20 20:11:42 2020 +0000
tree: c3d9a234c841dd434fc6000aabe118cc9a159608
parent: a76a400e302568c342acfdbe6e92c6531be3eb86 [diff]
diff --git a/chacha20/chacha_generic.go b/chacha20/chacha_generic.go
index 7c498e9..18c8bc0 100644
--- a/chacha20/chacha_generic.go
+++ b/chacha20/chacha_generic.go

@@ -89,6 +89,7 @@
 		return nil, errors.New("chacha20: wrong nonce size")
 	}
 
+	key, nonce = key[:KeySize], nonce[:NonceSize] // bounds check elimination hint
 	c.key = [8]uint32{
 		binary.LittleEndian.Uint32(key[0:4]),
 		binary.LittleEndian.Uint32(key[4:8]),
@@ -260,7 +261,9 @@
 		s.precompDone = true
 	}
 
-	for i := 0; i < len(src); i += blockSize {
+	// A condition of len(src) > 0 would be sufficient, but this also
+	// acts as a bounds check elimination hint.
+	for len(src) >= 64 && len(dst) >= 64 {
 		// The remainder of the first column round.
 		fcr0, fcr4, fcr8, fcr12 := quarterRound(c0, c4, c8, s.counter)
 
@@ -285,49 +288,31 @@
 			x3, x4, x9, x14 = quarterRound(x3, x4, x9, x14)
 		}
 
-		// Finally, add back the initial state to generate the key stream.
-		x0 += c0
-		x1 += c1
-		x2 += c2
-		x3 += c3
-		x4 += c4
-		x5 += c5
-		x6 += c6
-		x7 += c7
-		x8 += c8
-		x9 += c9
-		x10 += c10
-		x11 += c11
-		x12 += s.counter
-		x13 += c13
-		x14 += c14
-		x15 += c15
+		// Add back the initial state to generate the key stream, then
+		// XOR the key stream with the source and write out the result.
+		addXor(dst[0:4], src[0:4], x0, c0)
+		addXor(dst[4:8], src[4:8], x1, c1)
+		addXor(dst[8:12], src[8:12], x2, c2)
+		addXor(dst[12:16], src[12:16], x3, c3)
+		addXor(dst[16:20], src[16:20], x4, c4)
+		addXor(dst[20:24], src[20:24], x5, c5)
+		addXor(dst[24:28], src[24:28], x6, c6)
+		addXor(dst[28:32], src[28:32], x7, c7)
+		addXor(dst[32:36], src[32:36], x8, c8)
+		addXor(dst[36:40], src[36:40], x9, c9)
+		addXor(dst[40:44], src[40:44], x10, c10)
+		addXor(dst[44:48], src[44:48], x11, c11)
+		addXor(dst[48:52], src[48:52], x12, s.counter)
+		addXor(dst[52:56], src[52:56], x13, c13)
+		addXor(dst[56:60], src[56:60], x14, c14)
+		addXor(dst[60:64], src[60:64], x15, c15)
 
 		s.counter += 1
 		if s.counter == 0 {
 			panic("chacha20: internal error: counter overflow")
 		}
 
-		in, out := src[i:], dst[i:]
-		in, out = in[:blockSize], out[:blockSize] // bounds check elimination hint
-
-		// XOR the key stream with the source and write out the result.
-		xor(out[0:], in[0:], x0)
-		xor(out[4:], in[4:], x1)
-		xor(out[8:], in[8:], x2)
-		xor(out[12:], in[12:], x3)
-		xor(out[16:], in[16:], x4)
-		xor(out[20:], in[20:], x5)
-		xor(out[24:], in[24:], x6)
-		xor(out[28:], in[28:], x7)
-		xor(out[32:], in[32:], x8)
-		xor(out[36:], in[36:], x9)
-		xor(out[40:], in[40:], x10)
-		xor(out[44:], in[44:], x11)
-		xor(out[48:], in[48:], x12)
-		xor(out[52:], in[52:], x13)
-		xor(out[56:], in[56:], x14)
-		xor(out[60:], in[60:], x15)
+		src, dst = src[blockSize:], dst[blockSize:]
 	}
 }
 

diff --git a/chacha20/xor.go b/chacha20/xor.go
index 0110c98..c2d0485 100644
--- a/chacha20/xor.go
+++ b/chacha20/xor.go

@@ -13,10 +13,10 @@
 	runtime.GOARCH == "ppc64le" ||
 	runtime.GOARCH == "s390x"
 
-// xor reads a little endian uint32 from src, XORs it with u and
+// addXor reads a little endian uint32 from src, XORs it with (a + b) and
 // places the result in little endian byte order in dst.
-func xor(dst, src []byte, u uint32) {
-	_, _ = src[3], dst[3] // eliminate bounds checks
+func addXor(dst, src []byte, a, b uint32) {
+	_, _ = src[3], dst[3] // bounds check elimination hint
 	if unaligned {
 		// The compiler should optimize this code into
 		// 32-bit unaligned little endian loads and stores.
@@ -27,15 +27,16 @@
 		v |= uint32(src[1]) << 8
 		v |= uint32(src[2]) << 16
 		v |= uint32(src[3]) << 24
-		v ^= u
+		v ^= a + b
 		dst[0] = byte(v)
 		dst[1] = byte(v >> 8)
 		dst[2] = byte(v >> 16)
 		dst[3] = byte(v >> 24)
 	} else {
-		dst[0] = src[0] ^ byte(u)
-		dst[1] = src[1] ^ byte(u>>8)
-		dst[2] = src[2] ^ byte(u>>16)
-		dst[3] = src[3] ^ byte(u>>24)
+		a += b
+		dst[0] = src[0] ^ byte(a)
+		dst[1] = src[1] ^ byte(a>>8)
+		dst[2] = src[2] ^ byte(a>>16)
+		dst[3] = src[3] ^ byte(a>>24)
 	}
 }
commit	3c4aac89819a5fdc28d906456729d3423fd46969	[log] [tgz]
author	Andy Wang <cbeuw.andy@gmail.com>	Sat Apr 18 01:52:37 2020 +0000
committer	Filippo Valsorda <filippo@golang.org>	Mon Apr 20 20:11:42 2020 +0000
tree	c3d9a234c841dd434fc6000aabe118cc9a159608
parent	a76a400e302568c342acfdbe6e92c6531be3eb86 [diff]