internal/chacha20: cache first round across XORKeyStream invocations

name                old speed     new speed     delta
ChaCha20/64-4       428MB/s ± 1%  432MB/s ± 1%    ~     (p=0.089 n=10+10)
ChaCha20/256-4      497MB/s ± 1%  507MB/s ± 2%  +1.94%  (p=0.000 n=9+10)
ChaCha20/10x25-4    273MB/s ± 1%  285MB/s ± 3%  +4.37%  (p=0.000 n=10+10)
ChaCha20/4096-4     495MB/s ± 1%  508MB/s ± 1%  +2.51%  (p=0.000 n=8+10)
ChaCha20/100x40-4   407MB/s ± 1%  439MB/s ± 1%  +7.92%  (p=0.000 n=9+9)
ChaCha20/65536-4    521MB/s ± 2%  537MB/s ± 1%  +3.00%  (p=0.000 n=10+10)
ChaCha20/1000x65-4  498MB/s ± 2%  521MB/s ± 2%  +4.70%  (p=0.000 n=10+10)

Curiously, even if we omit the critical s.precompDone = true step (so
the cached values are recomputed on every call), we still see a
significant performance improvement across the board, maybe due to
reduced register pressure; see the second table below. Actually reusing
the precomputed values only affects the 10x25, 100x40 and 1000x65
benchmarks, as expected.

name                old speed     new speed     delta
ChaCha20/64-4       428MB/s ± 1%  428MB/s ± 1%    ~     (p=0.912 n=10+10)
ChaCha20/256-4      497MB/s ± 1%  510MB/s ± 1%  +2.64%  (p=0.000 n=9+10)
ChaCha20/10x25-4    273MB/s ± 1%  277MB/s ± 2%  +1.36%  (p=0.003 n=10+10)
ChaCha20/4096-4     495MB/s ± 1%  507MB/s ± 2%  +2.28%  (p=0.000 n=8+10)
ChaCha20/100x40-4   407MB/s ± 1%  418MB/s ± 1%  +2.69%  (p=0.000 n=9+10)
ChaCha20/65536-4    521MB/s ± 2%  536MB/s ± 1%  +2.76%  (p=0.000 n=10+8)
ChaCha20/1000x65-4  498MB/s ± 2%  519MB/s ± 1%  +4.15%  (p=0.000 n=10+9)

Updates golang/go#24485

Change-Id: I117fab938787819aae1cc4371354888701e4e54b
Reviewed-on: https://go-review.googlesource.com/c/crypto/+/185440
Reviewed-by: Michael Munday <mike.munday@ibm.com>
diff --git a/internal/chacha20/chacha_generic.go b/internal/chacha20/chacha_generic.go
index 94222bf..e915a61 100644
--- a/internal/chacha20/chacha_generic.go
+++ b/internal/chacha20/chacha_generic.go
@@ -28,6 +28,13 @@
 	// computed at a time.
 	buf [bufSize]byte
 	len int
+
+	// The counter-independent results of the first round are cached after they
+	// are computed the first time.
+	precompDone      bool
+	p1, p5, p9, p13  uint32
+	p2, p6, p10, p14 uint32
+	p3, p7, p11, p15 uint32
 }
 
 var _ cipher.Stream = (*Cipher)(nil)
@@ -155,21 +162,24 @@
 	)
 
 	// Three quarters of the first round don't depend on the counter, so we can
-	// calculate them here, and reuse them for multiple blocks in the loop.
-	// TODO(filippo): experiment with reusing across XORKeyStream calls.
-	s1, s5, s9, s13 := quarterRound(c1, c5, c9, c13)
-	s2, s6, s10, s14 := quarterRound(c2, c6, c10, c14)
-	s3, s7, s11, s15 := quarterRound(c3, c7, c11, c15)
+	// calculate them here, and reuse them for multiple blocks in the loop, and
+	// for future XORKeyStream invocations.
+	if !s.precompDone {
+		s.p1, s.p5, s.p9, s.p13 = quarterRound(c1, c5, c9, c13)
+		s.p2, s.p6, s.p10, s.p14 = quarterRound(c2, c6, c10, c14)
+		s.p3, s.p7, s.p11, s.p15 = quarterRound(c3, c7, c11, c15)
+		s.precompDone = true
+	}
 
 	for i := 0; i < len(src); i += blockSize {
 		// The remainder of the first column round.
-		s0, s4, s8, s12 := quarterRound(c0, c4, c8, s.counter)
+		fcr0, fcr4, fcr8, fcr12 := quarterRound(c0, c4, c8, s.counter)
 
 		// The second diagonal round.
-		x0, x5, x10, x15 := quarterRound(s0, s5, s10, s15)
-		x1, x6, x11, x12 := quarterRound(s1, s6, s11, s12)
-		x2, x7, x8, x13 := quarterRound(s2, s7, s8, s13)
-		x3, x4, x9, x14 := quarterRound(s3, s4, s9, s14)
+		x0, x5, x10, x15 := quarterRound(fcr0, s.p5, s.p10, s.p15)
+		x1, x6, x11, x12 := quarterRound(s.p1, s.p6, s.p11, fcr12)
+		x2, x7, x8, x13 := quarterRound(s.p2, s.p7, fcr8, s.p13)
+		x3, x4, x9, x14 := quarterRound(s.p3, fcr4, s.p9, s.p14)
 
 		// The remaining 18 rounds.
 		for i := 0; i < 9; i++ {