internal/chacha20: cache first round across XORKeyStream invocations

name                old speed     new speed     delta
ChaCha20/64-4       428MB/s ± 1%  432MB/s ± 1%    ~     (p=0.089 n=10+10)
ChaCha20/256-4      497MB/s ± 1%  507MB/s ± 2%  +1.94%  (p=0.000 n=9+10)
ChaCha20/10x25-4    273MB/s ± 1%  285MB/s ± 3%  +4.37%  (p=0.000 n=10+10)
ChaCha20/4096-4     495MB/s ± 1%  508MB/s ± 1%  +2.51%  (p=0.000 n=8+10)
ChaCha20/100x40-4   407MB/s ± 1%  439MB/s ± 1%  +7.92%  (p=0.000 n=9+9)
ChaCha20/65536-4    521MB/s ± 2%  537MB/s ± 1%  +3.00%  (p=0.000 n=10+10)
ChaCha20/1000x65-4  498MB/s ± 2%  521MB/s ± 2%  +4.70%  (p=0.000 n=10+10)

Curiously, even if we omit the critical s.precompDone = true step, we see
a significant performance improvement across the board, maybe due to
reduced register pressure. (See below. Actually using the precomputed
values only impacts the 10x25, 100x40 and 1000x65 benchmarks, as
expected.)

name                old speed     new speed     delta
ChaCha20/64-4       428MB/s ± 1%  428MB/s ± 1%    ~     (p=0.912 n=10+10)
ChaCha20/256-4      497MB/s ± 1%  510MB/s ± 1%  +2.64%  (p=0.000 n=9+10)
ChaCha20/10x25-4    273MB/s ± 1%  277MB/s ± 2%  +1.36%  (p=0.003 n=10+10)
ChaCha20/4096-4     495MB/s ± 1%  507MB/s ± 2%  +2.28%  (p=0.000 n=8+10)
ChaCha20/100x40-4   407MB/s ± 1%  418MB/s ± 1%  +2.69%  (p=0.000 n=9+10)
ChaCha20/65536-4    521MB/s ± 2%  536MB/s ± 1%  +2.76%  (p=0.000 n=10+8)
ChaCha20/1000x65-4  498MB/s ± 2%  519MB/s ± 1%  +4.15%  (p=0.000 n=10+9)

Updates golang/go#24485

Change-Id: I117fab938787819aae1cc4371354888701e4e54b
Reviewed-on: https://go-review.googlesource.com/c/crypto/+/185440
Reviewed-by: Michael Munday <mike.munday@ibm.com>

commit: 9b708ad8e1a3fb29b81d7e722d70a4226e2acf7a [log] [tgz]
author: Filippo Valsorda <filippo@golang.org> Tue Jul 09 20:38:07 2019 -0400
committer: Filippo Valsorda <filippo@golang.org> Mon Nov 11 21:36:26 2019 +0000
tree: 548ac5a33973a14fffe1ee0396db74c79cbb7936
parent: 85e5e33df3ab4a4c7fcfc67d04bbeb97fc2d8e89 [diff]
diff --git a/internal/chacha20/chacha_generic.go b/internal/chacha20/chacha_generic.go
index 94222bf..e915a61 100644
--- a/internal/chacha20/chacha_generic.go
+++ b/internal/chacha20/chacha_generic.go

@@ -28,6 +28,13 @@
 	// computed at a time.
 	buf [bufSize]byte
 	len int
+
+	// The counter-independent results of the first round are cached after they
+	// are computed the first time.
+	precompDone      bool
+	p1, p5, p9, p13  uint32
+	p2, p6, p10, p14 uint32
+	p3, p7, p11, p15 uint32
 }
 
 var _ cipher.Stream = (*Cipher)(nil)
@@ -155,21 +162,24 @@
 	)
 
 	// Three quarters of the first round don't depend on the counter, so we can
-	// calculate them here, and reuse them for multiple blocks in the loop.
-	// TODO(filippo): experiment with reusing across XORKeyStream calls.
-	s1, s5, s9, s13 := quarterRound(c1, c5, c9, c13)
-	s2, s6, s10, s14 := quarterRound(c2, c6, c10, c14)
-	s3, s7, s11, s15 := quarterRound(c3, c7, c11, c15)
+	// calculate them here, and reuse them for multiple blocks in the loop, and
+	// for future XORKeyStream invocations.
+	if !s.precompDone {
+		s.p1, s.p5, s.p9, s.p13 = quarterRound(c1, c5, c9, c13)
+		s.p2, s.p6, s.p10, s.p14 = quarterRound(c2, c6, c10, c14)
+		s.p3, s.p7, s.p11, s.p15 = quarterRound(c3, c7, c11, c15)
+		s.precompDone = true
+	}
 
 	for i := 0; i < len(src); i += blockSize {
 		// The remainder of the first column round.
-		s0, s4, s8, s12 := quarterRound(c0, c4, c8, s.counter)
+		fcr0, fcr4, fcr8, fcr12 := quarterRound(c0, c4, c8, s.counter)
 
 		// The second diagonal round.
-		x0, x5, x10, x15 := quarterRound(s0, s5, s10, s15)
-		x1, x6, x11, x12 := quarterRound(s1, s6, s11, s12)
-		x2, x7, x8, x13 := quarterRound(s2, s7, s8, s13)
-		x3, x4, x9, x14 := quarterRound(s3, s4, s9, s14)
+		x0, x5, x10, x15 := quarterRound(fcr0, s.p5, s.p10, s.p15)
+		x1, x6, x11, x12 := quarterRound(s.p1, s.p6, s.p11, fcr12)
+		x2, x7, x8, x13 := quarterRound(s.p2, s.p7, fcr8, s.p13)
+		x3, x4, x9, x14 := quarterRound(s.p3, fcr4, s.p9, s.p14)
 
 		// The remaining 18 rounds.
 		for i := 0; i < 9; i++ {
commit	9b708ad8e1a3fb29b81d7e722d70a4226e2acf7a	[log] [tgz]
author	Filippo Valsorda <filippo@golang.org>	Tue Jul 09 20:38:07 2019 -0400
committer	Filippo Valsorda <filippo@golang.org>	Mon Nov 11 21:36:26 2019 +0000
tree	548ac5a33973a14fffe1ee0396db74c79cbb7936
parent	85e5e33df3ab4a4c7fcfc67d04bbeb97fc2d8e89 [diff]