internal/chacha20: refactor for readability and consistency

Separated the complex buffering logic from key stream generation more
clearly, added plenty of comments and generally refactored the Go
implementation for readability. Made the interface with the
generic/assembly cores smaller and more consistent, according to
golang.org/wiki/TargetSpecific.

We will recover the lost performance on unaligned calls by caching 3/4
of the first round across XORKeyStream invocations, which we now have
complexity budget for.

name                old speed     new speed     delta
ChaCha20/64-4       435MB/s ± 2%  429MB/s ± 2%  -1.47%  (p=0.013 n=10+9)
ChaCha20/256-4      496MB/s ± 1%  493MB/s ± 2%    ~     (p=0.280 n=10+10)
ChaCha20/10x25-4    283MB/s ± 1%  274MB/s ± 2%  -3.13%  (p=0.000 n=10+10)
ChaCha20/4096-4     494MB/s ± 1%  493MB/s ± 5%    ~     (p=0.631 n=10+10)
ChaCha20/100x40-4   421MB/s ± 3%  408MB/s ± 1%  -3.14%  (p=0.003 n=9+9)
ChaCha20/65536-4    515MB/s ± 1%  519MB/s ± 3%    ~     (p=0.161 n=7+10)
ChaCha20/1000x65-4  501MB/s ± 2%  501MB/s ± 3%    ~     (p=0.497 n=9+10)

Also applied a fix for a lingering bug in the ppc64le assembly written
by Lynn Boger <laboger@linux.vnet.ibm.com>.

Updates golang/go#24485

Change-Id: I10cf24a7f10359b1b4ae63c9bb1946735b98ac9b
Reviewed-on: https://go-review.googlesource.com/c/crypto/+/185439
Reviewed-by: Michael Munday <mike.munday@ibm.com>
diff --git a/internal/chacha20/chacha_arm64.go b/internal/chacha20/chacha_arm64.go
index ad74e23..87f1e36 100644
--- a/internal/chacha20/chacha_arm64.go
+++ b/internal/chacha20/chacha_arm64.go
@@ -3,29 +3,15 @@
 // license that can be found in the LICENSE file.
 
 // +build go1.11
-// +build !gccgo
+// +build !gccgo,!appengine
 
 package chacha20
 
-const (
-	haveAsm = true
-	bufSize = 256
-)
+const bufSize = 256
 
 //go:noescape
 func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32)
 
-func (c *Cipher) xorKeyStreamAsm(dst, src []byte) {
-
-	if len(src) >= bufSize {
-		xorKeyStreamVX(dst, src, &c.key, &c.nonce, &c.counter)
-	}
-
-	if len(src)%bufSize != 0 {
-		i := len(src) - len(src)%bufSize
-		c.buf = [bufSize]byte{}
-		copy(c.buf[:], src[i:])
-		xorKeyStreamVX(c.buf[:], c.buf[:], &c.key, &c.nonce, &c.counter)
-		c.len = bufSize - copy(dst[i:], c.buf[:len(src)%bufSize])
-	}
+func (c *Cipher) xorKeyStreamBlocks(dst, src []byte) {
+	xorKeyStreamVX(dst, src, &c.key, &c.nonce, &c.counter)
 }
diff --git a/internal/chacha20/asm_arm64.s b/internal/chacha20/chacha_arm64.s
similarity index 100%
rename from internal/chacha20/asm_arm64.s
rename to internal/chacha20/chacha_arm64.s
diff --git a/internal/chacha20/chacha_generic.go b/internal/chacha20/chacha_generic.go
index 6570847..94222bf 100644
--- a/internal/chacha20/chacha_generic.go
+++ b/internal/chacha20/chacha_generic.go
@@ -2,57 +2,68 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// Package ChaCha20 implements the core ChaCha20 function as specified
-// in https://tools.ietf.org/html/rfc7539#section-2.3.
+// Package chacha20 implements the ChaCha20 encryption algorithm
+// as specified in RFC 8439.
 package chacha20
 
 import (
 	"crypto/cipher"
 	"encoding/binary"
+	"math/bits"
 
 	"golang.org/x/crypto/internal/subtle"
 )
 
-// assert that *Cipher implements cipher.Stream
-var _ cipher.Stream = (*Cipher)(nil)
-
 // Cipher is a stateful instance of ChaCha20 using a particular key
 // and nonce. A *Cipher implements the cipher.Stream interface.
 type Cipher struct {
+	// The ChaCha20 state is 16 words: 4 constant, 8 of key, 1 of counter
+	// (incremented after each block), and 3 of nonce.
 	key     [8]uint32
-	counter uint32 // incremented after each block
+	counter uint32
 	nonce   [3]uint32
-	buf     [bufSize]byte // buffer for unused keystream bytes
-	len     int           // number of unused keystream bytes at end of buf
+
+	// The last len bytes of buf are leftover key stream bytes from the previous
+	// XORKeyStream invocation. The size of buf depends on how many blocks are
+	// computed at a time.
+	buf [bufSize]byte
+	len int
 }
 
+var _ cipher.Stream = (*Cipher)(nil)
+
 // New creates a new ChaCha20 stream cipher with the given key and nonce.
 // The initial counter value is set to 0.
 func New(key [8]uint32, nonce [3]uint32) *Cipher {
 	return &Cipher{key: key, nonce: nonce}
 }
 
-// ChaCha20 constants spelling "expand 32-byte k"
+// The constant first 4 words of the ChaCha20 state.
 const (
-	j0 uint32 = 0x61707865
-	j1 uint32 = 0x3320646e
-	j2 uint32 = 0x79622d32
-	j3 uint32 = 0x6b206574
+	j0 uint32 = 0x61707865 // expa
+	j1 uint32 = 0x3320646e // nd 3
+	j2 uint32 = 0x79622d32 // 2-by
+	j3 uint32 = 0x6b206574 // te k
 )
 
+const blockSize = 64
+
+// quarterRound is the core of ChaCha20. It shuffles the bits of 4 state words.
+// It's executed 4 times for each of the 20 ChaCha20 rounds, operating on all 16
+// words each round, in columnar or diagonal groups of 4 at a time.
 func quarterRound(a, b, c, d uint32) (uint32, uint32, uint32, uint32) {
 	a += b
 	d ^= a
-	d = (d << 16) | (d >> 16)
+	d = bits.RotateLeft32(d, 16)
 	c += d
 	b ^= c
-	b = (b << 12) | (b >> 20)
+	b = bits.RotateLeft32(b, 12)
 	a += b
 	d ^= a
-	d = (d << 8) | (d >> 24)
+	d = bits.RotateLeft32(d, 8)
 	c += d
 	b ^= c
-	b = (b << 7) | (b >> 25)
+	b = bits.RotateLeft32(b, 7)
 	return a, b, c, d
 }
 
@@ -67,116 +78,141 @@
 // the src buffers was passed in a single run. That is, Cipher
 // maintains state and does not reset at each XORKeyStream call.
 func (s *Cipher) XORKeyStream(dst, src []byte) {
-	if len(dst) < len(src) {
-		panic("chacha20: output smaller than input")
-	}
-	if subtle.InexactOverlap(dst[:len(src)], src) {
-		panic("chacha20: invalid buffer overlap")
-	}
-
-	// xor src with buffered keystream first
-	if s.len != 0 {
-		buf := s.buf[len(s.buf)-s.len:]
-		if len(src) < len(buf) {
-			buf = buf[:len(src)]
-		}
-		td, ts := dst[:len(buf)], src[:len(buf)] // BCE hint
-		for i, b := range buf {
-			td[i] = ts[i] ^ b
-		}
-		s.len -= len(buf)
-		if s.len != 0 {
-			return
-		}
-		s.buf = [len(s.buf)]byte{} // zero the empty buffer
-		src = src[len(buf):]
-		dst = dst[len(buf):]
-	}
-
 	if len(src) == 0 {
 		return
 	}
-	if haveAsm {
-		if uint64(len(src))+uint64(s.counter)*64 > (1<<38)-64 {
-			panic("chacha20: counter overflow")
+	if len(dst) < len(src) {
+		panic("chacha20: output smaller than input")
+	}
+	dst = dst[:len(src)]
+	if subtle.InexactOverlap(dst, src) {
+		panic("chacha20: invalid buffer overlap")
+	}
+
+	// First, drain any remaining key stream from a previous XORKeyStream.
+	if s.len != 0 {
+		keyStream := s.buf[bufSize-s.len:]
+		if len(src) < len(keyStream) {
+			keyStream = keyStream[:len(src)]
 		}
-		s.xorKeyStreamAsm(dst, src)
-		return
+		_ = src[len(keyStream)-1] // bounds check elimination hint
+		for i, b := range keyStream {
+			dst[i] = src[i] ^ b
+		}
+		s.len -= len(keyStream)
+		src = src[len(keyStream):]
+		dst = dst[len(keyStream):]
 	}
 
-	// set up a 64-byte buffer to pad out the final block if needed
-	// (hoisted out of the main loop to avoid spills)
-	rem := len(src) % 64  // length of final block
-	fin := len(src) - rem // index of final block
+	const blocksPerBuf = bufSize / blockSize
+	numBufs := (uint64(len(src)) + bufSize - 1) / bufSize
+	if uint64(s.counter)+numBufs*blocksPerBuf >= 1<<32 {
+		panic("chacha20: counter overflow")
+	}
+
+	// xorKeyStreamBlocks implementations expect input lengths that are a
+	// multiple of bufSize. Platform-specific ones process multiple blocks at a
+	// time, so have bufSizes that are a multiple of blockSize.
+
+	rem := len(src) % bufSize
+	full := len(src) - rem
+
+	if full > 0 {
+		s.xorKeyStreamBlocks(dst[:full], src[:full])
+	}
+
+	// If we have a partial (multi-)block, pad it for xorKeyStreamBlocks, and
+	// keep the leftover keystream for the next XORKeyStream invocation.
 	if rem > 0 {
-		copy(s.buf[len(s.buf)-64:], src[fin:])
+		s.buf = [bufSize]byte{}
+		copy(s.buf[:], src[full:])
+		s.xorKeyStreamBlocks(s.buf[:], s.buf[:])
+		s.len = bufSize - copy(dst[full:], s.buf[:])
+	}
+}
+
+func (s *Cipher) xorKeyStreamBlocksGeneric(dst, src []byte) {
+	if len(dst) != len(src) || len(dst)%blockSize != 0 {
+		panic("chacha20: internal error: wrong dst and/or src length")
 	}
 
-	// pre-calculate most of the first round
-	s1, s5, s9, s13 := quarterRound(j1, s.key[1], s.key[5], s.nonce[0])
-	s2, s6, s10, s14 := quarterRound(j2, s.key[2], s.key[6], s.nonce[1])
-	s3, s7, s11, s15 := quarterRound(j3, s.key[3], s.key[7], s.nonce[2])
+	// To generate each block of key stream, the initial cipher state
+	// (represented below) is passed through 20 rounds of shuffling,
+	// alternately applying quarterRounds by columns (like 1, 5, 9, 13)
+	// or by diagonals (like 1, 6, 11, 12).
+	//
+	//      0:cccccccc   1:cccccccc   2:cccccccc   3:cccccccc
+	//      4:kkkkkkkk   5:kkkkkkkk   6:kkkkkkkk   7:kkkkkkkk
+	//      8:kkkkkkkk   9:kkkkkkkk  10:kkkkkkkk  11:kkkkkkkk
+	//     12:bbbbbbbb  13:nnnnnnnn  14:nnnnnnnn  15:nnnnnnnn
+	//
+	//            c=constant k=key b=blockcount n=nonce
+	var (
+		c0, c1, c2, c3   = j0, j1, j2, j3
+		c4, c5, c6, c7   = s.key[0], s.key[1], s.key[2], s.key[3]
+		c8, c9, c10, c11 = s.key[4], s.key[5], s.key[6], s.key[7]
+		_, c13, c14, c15 = s.counter, s.nonce[0], s.nonce[1], s.nonce[2]
+	)
 
-	n := len(src)
-	src, dst = src[:n:n], dst[:n:n] // BCE hint
-	for i := 0; i < n; i += 64 {
-		// calculate the remainder of the first round
-		s0, s4, s8, s12 := quarterRound(j0, s.key[0], s.key[4], s.counter)
+	// Three quarters of the first round don't depend on the counter, so we can
+	// calculate them here, and reuse them for multiple blocks in the loop.
+	// TODO(filippo): experiment with reusing across XORKeyStream calls.
+	s1, s5, s9, s13 := quarterRound(c1, c5, c9, c13)
+	s2, s6, s10, s14 := quarterRound(c2, c6, c10, c14)
+	s3, s7, s11, s15 := quarterRound(c3, c7, c11, c15)
 
-		// execute the second round
+	for i := 0; i < len(src); i += blockSize {
+		// The remainder of the first column round.
+		s0, s4, s8, s12 := quarterRound(c0, c4, c8, s.counter)
+
+		// The second round, which is the first diagonal round.
 		x0, x5, x10, x15 := quarterRound(s0, s5, s10, s15)
 		x1, x6, x11, x12 := quarterRound(s1, s6, s11, s12)
 		x2, x7, x8, x13 := quarterRound(s2, s7, s8, s13)
 		x3, x4, x9, x14 := quarterRound(s3, s4, s9, s14)
 
-		// execute the remaining 18 rounds
+		// The remaining 18 rounds.
 		for i := 0; i < 9; i++ {
+			// Column round.
 			x0, x4, x8, x12 = quarterRound(x0, x4, x8, x12)
 			x1, x5, x9, x13 = quarterRound(x1, x5, x9, x13)
 			x2, x6, x10, x14 = quarterRound(x2, x6, x10, x14)
 			x3, x7, x11, x15 = quarterRound(x3, x7, x11, x15)
 
+			// Diagonal round.
 			x0, x5, x10, x15 = quarterRound(x0, x5, x10, x15)
 			x1, x6, x11, x12 = quarterRound(x1, x6, x11, x12)
 			x2, x7, x8, x13 = quarterRound(x2, x7, x8, x13)
 			x3, x4, x9, x14 = quarterRound(x3, x4, x9, x14)
 		}
 
-		x0 += j0
-		x1 += j1
-		x2 += j2
-		x3 += j3
-
-		x4 += s.key[0]
-		x5 += s.key[1]
-		x6 += s.key[2]
-		x7 += s.key[3]
-		x8 += s.key[4]
-		x9 += s.key[5]
-		x10 += s.key[6]
-		x11 += s.key[7]
-
+		// Finally, add back the initial state to generate the key stream.
+		x0 += c0
+		x1 += c1
+		x2 += c2
+		x3 += c3
+		x4 += c4
+		x5 += c5
+		x6 += c6
+		x7 += c7
+		x8 += c8
+		x9 += c9
+		x10 += c10
+		x11 += c11
 		x12 += s.counter
-		x13 += s.nonce[0]
-		x14 += s.nonce[1]
-		x15 += s.nonce[2]
+		x13 += c13
+		x14 += c14
+		x15 += c15
 
-		// increment the counter
 		s.counter += 1
 		if s.counter == 0 {
-			panic("chacha20: counter overflow")
+			panic("chacha20: internal error: counter overflow")
 		}
 
-		// pad to 64 bytes if needed
 		in, out := src[i:], dst[i:]
-		if i == fin {
-			// src[fin:] has already been copied into s.buf before
-			// the main loop
-			in, out = s.buf[len(s.buf)-64:], s.buf[len(s.buf)-64:]
-		}
-		in, out = in[:64], out[:64] // BCE hint
+		in, out = in[:blockSize], out[:blockSize] // bounds check elimination hint
 
-		// XOR the key stream with the source and write out the result
+		// XOR the key stream with the source and write out the result.
 		xor(out[0:], in[0:], x0)
 		xor(out[4:], in[4:], x1)
 		xor(out[8:], in[8:], x2)
@@ -194,22 +230,13 @@
 		xor(out[56:], in[56:], x14)
 		xor(out[60:], in[60:], x15)
 	}
-	// copy any trailing bytes out of the buffer and into dst
-	if rem != 0 {
-		s.len = 64 - rem
-		copy(dst[fin:], s.buf[len(s.buf)-64:])
-	}
 }
 
 // Advance discards bytes in the key stream until the next 64 byte block
-// boundary is reached and updates the counter accordingly. If the key
-// stream is already at a block boundary no bytes will be discarded and
-// the counter will be unchanged.
+// boundary is reached. If the key stream is already at a block boundary no
+// bytes will be discarded.
 func (s *Cipher) Advance() {
-	s.len -= s.len % 64
-	if s.len == 0 {
-		s.buf = [len(s.buf)]byte{}
-	}
+	s.len -= s.len % blockSize
 }
 
 // XORKeyStream crypts bytes from in to out using the given key and counters.
@@ -246,11 +273,13 @@
 	x12, x13, x14, x15 := nonce[0], nonce[1], nonce[2], nonce[3]
 
 	for i := 0; i < 10; i++ {
+		// Column round.
 		x0, x4, x8, x12 = quarterRound(x0, x4, x8, x12)
 		x1, x5, x9, x13 = quarterRound(x1, x5, x9, x13)
 		x2, x6, x10, x14 = quarterRound(x2, x6, x10, x14)
 		x3, x7, x11, x15 = quarterRound(x3, x7, x11, x15)
 
+		// Diagonal round.
 		x0, x5, x10, x15 = quarterRound(x0, x5, x10, x15)
 		x1, x6, x11, x12 = quarterRound(x1, x6, x11, x12)
 		x2, x7, x8, x13 = quarterRound(x2, x7, x8, x13)
diff --git a/internal/chacha20/chacha_noasm.go b/internal/chacha20/chacha_noasm.go
index fc26825..ec609ed 100644
--- a/internal/chacha20/chacha_noasm.go
+++ b/internal/chacha20/chacha_noasm.go
@@ -6,11 +6,8 @@
 
 package chacha20
 
-const (
-	bufSize = 64
-	haveAsm = false
-)
+const bufSize = blockSize
 
-func (*Cipher) xorKeyStreamAsm(dst, src []byte) {
-	panic("not implemented")
+func (s *Cipher) xorKeyStreamBlocks(dst, src []byte) {
+	s.xorKeyStreamBlocksGeneric(dst, src)
 }
diff --git a/internal/chacha20/chacha_ppc64le.go b/internal/chacha20/chacha_ppc64le.go
index 8d832b3..d0ec61f 100644
--- a/internal/chacha20/chacha_ppc64le.go
+++ b/internal/chacha20/chacha_ppc64le.go
@@ -2,53 +2,15 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build ppc64le,!gccgo,!appengine
+// +build !gccgo,!appengine
 
 package chacha20
 
-import (
-	"encoding/binary"
-)
-
-const (
-	bufSize = 256
-	haveAsm = true
-)
+const bufSize = 256
 
 //go:noescape
 func chaCha20_ctr32_vsx(out, inp *byte, len int, key *[8]uint32, counter *uint32)
 
-func (c *Cipher) xorKeyStreamAsm(dst, src []byte) {
-	// This implementation can handle buffers that aren't multiples of
-	// 256.
-	if len(src) >= bufSize {
-		chaCha20_ctr32_vsx(&dst[0], &src[0], len(src), &c.key, &c.counter)
-	} else if len(src)%bufSize != 0 {
-		chaCha20_ctr32_vsx(&c.buf[0], &c.buf[0], bufSize, &c.key, &c.counter)
-		start := len(src) - len(src)%bufSize
-		ts, td, tb := src[start:], dst[start:], c.buf[:]
-		// Unroll loop to XOR 32 bytes per iteration.
-		for i := 0; i < len(ts)-32; i += 32 {
-			td, tb = td[:len(ts)], tb[:len(ts)] // bounds check elimination
-			s0 := binary.LittleEndian.Uint64(ts[0:8])
-			s1 := binary.LittleEndian.Uint64(ts[8:16])
-			s2 := binary.LittleEndian.Uint64(ts[16:24])
-			s3 := binary.LittleEndian.Uint64(ts[24:32])
-			b0 := binary.LittleEndian.Uint64(tb[0:8])
-			b1 := binary.LittleEndian.Uint64(tb[8:16])
-			b2 := binary.LittleEndian.Uint64(tb[16:24])
-			b3 := binary.LittleEndian.Uint64(tb[24:32])
-			binary.LittleEndian.PutUint64(td[0:8], s0^b0)
-			binary.LittleEndian.PutUint64(td[8:16], s1^b1)
-			binary.LittleEndian.PutUint64(td[16:24], s2^b2)
-			binary.LittleEndian.PutUint64(td[24:32], s3^b3)
-			ts, td, tb = ts[32:], td[32:], tb[32:]
-		}
-		td, tb = td[:len(ts)], tb[:len(ts)] // bounds check elimination
-		for i, v := range ts {
-			td[i] = tb[i] ^ v
-		}
-		c.len = bufSize - (len(src) % bufSize)
-	}
-
+func (c *Cipher) xorKeyStreamBlocks(dst, src []byte) {
+	chaCha20_ctr32_vsx(&dst[0], &src[0], len(src), &c.key, &c.counter)
 }
diff --git a/internal/chacha20/asm_ppc64le.s b/internal/chacha20/chacha_ppc64le.s
similarity index 96%
rename from internal/chacha20/asm_ppc64le.s
rename to internal/chacha20/chacha_ppc64le.s
index bc9b562..533014e 100644
--- a/internal/chacha20/asm_ppc64le.s
+++ b/internal/chacha20/chacha_ppc64le.s
@@ -19,7 +19,7 @@
 // The differences in this and the original implementation are
 // due to the calling conventions and initialization of constants.
 
-// +build ppc64le,!gccgo,!appengine
+// +build !gccgo,!appengine
 
 #include "textflag.h"
 
@@ -31,24 +31,7 @@
 #define TMP  R15
 
 #define CONSTBASE  R16
-
-#define X0   R11
-#define X1   R12
-#define X2   R14
-#define X3   R15
-#define X4   R16
-#define X5   R17
-#define X6   R18
-#define X7   R19
-#define X8   R20
-#define X9   R21
-#define X10  R22
-#define X11  R23
-#define X12  R24
-#define X13  R25
-#define X14  R26
-#define X15  R27
-
+#define BLOCKS R17
 
 DATA consts<>+0x00(SB)/8, $0x3320646e61707865
 DATA consts<>+0x08(SB)/8, $0x6b20657479622d32
@@ -86,6 +69,7 @@
 	MOVD $32, R9
 	MOVD $48, R10
 	MOVD $64, R11
+	SRD $6, LEN, BLOCKS
 	// V16
 	LXVW4X (CONSTBASE)(R0), VS48
 	ADD $80,CONSTBASE
@@ -429,9 +413,9 @@
 	BNE  loop_outer_vsx
 
 done_vsx:
-	// Increment counter by 4
+	// Increment counter by number of 64 byte blocks
 	MOVD (CNT), R14
-	ADD  $4, R14
+	ADD  BLOCKS, R14
 	MOVD R14, (CNT)
 	RET
 
diff --git a/internal/chacha20/chacha_s390x.go b/internal/chacha20/chacha_s390x.go
index aad645b..cd55f45 100644
--- a/internal/chacha20/chacha_s390x.go
+++ b/internal/chacha20/chacha_s390x.go
@@ -2,28 +2,25 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build s390x,!gccgo,!appengine
+// +build !gccgo,!appengine
 
 package chacha20
 
-import (
-	"golang.org/x/sys/cpu"
-)
+import "golang.org/x/sys/cpu"
 
 var haveAsm = cpu.S390X.HasVX
 
 const bufSize = 256
 
 // xorKeyStreamVX is an assembly implementation of XORKeyStream. It must only
-// be called when the vector facility is available.
-// Implementation in asm_s390x.s.
+// be called when the vector facility is available. Implementation in chacha_s390x.s.
 //go:noescape
-func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32, buf *[256]byte, len *int)
+func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32)
 
-func (c *Cipher) xorKeyStreamAsm(dst, src []byte) {
-	xorKeyStreamVX(dst, src, &c.key, &c.nonce, &c.counter, &c.buf, &c.len)
+func (c *Cipher) xorKeyStreamBlocks(dst, src []byte) {
+	if cpu.S390X.HasVX {
+		xorKeyStreamVX(dst, src, &c.key, &c.nonce, &c.counter)
+	} else {
+		c.xorKeyStreamBlocksGeneric(dst, src)
+	}
 }
-
-// EXRL targets, DO NOT CALL!
-func mvcSrcToBuf()
-func mvcBufToDst()
diff --git a/internal/chacha20/chacha_s390x.s b/internal/chacha20/chacha_s390x.s
index 57df404..de52a2e 100644
--- a/internal/chacha20/chacha_s390x.s
+++ b/internal/chacha20/chacha_s390x.s
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build s390x,!gccgo,!appengine
+// +build !gccgo,!appengine
 
 #include "go_asm.h"
 #include "textflag.h"
@@ -24,15 +24,6 @@
 DATA ·constants<>+0x18(SB)/4, $0x79622d32
 DATA ·constants<>+0x1c(SB)/4, $0x6b206574
 
-// EXRL targets:
-TEXT ·mvcSrcToBuf(SB), NOFRAME|NOSPLIT, $0
-	MVC $1, (R1), (R8)
-	RET
-
-TEXT ·mvcBufToDst(SB), NOFRAME|NOSPLIT, $0
-	MVC $1, (R8), (R9)
-	RET
-
 #define BSWAP V5
 #define J0    V6
 #define KEY0  V7
@@ -144,7 +135,7 @@
 	VMRHF v, w, c \ // c = {a[2], b[2], c[2], d[2]}
 	VMRLF v, w, d // d = {a[3], b[3], c[3], d[3]}
 
-// func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32, buf *[256]byte, len *int)
+// func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32)
 TEXT ·xorKeyStreamVX(SB), NOSPLIT, $0
 	MOVD $·constants<>(SB), R1
 	MOVD dst+0(FP), R2         // R2=&dst[0]
@@ -152,25 +143,10 @@
 	MOVD key+48(FP), R5        // R5=key
 	MOVD nonce+56(FP), R6      // R6=nonce
 	MOVD counter+64(FP), R7    // R7=counter
-	MOVD buf+72(FP), R8        // R8=buf
-	MOVD len+80(FP), R9        // R9=len
 
 	// load BSWAP and J0
 	VLM (R1), BSWAP, J0
 
-	// set up tail buffer
-	ADD     $-1, R4, R12
-	MOVBZ   R12, R12
-	CMPUBEQ R12, $255, aligned
-	MOVD    R4, R1
-	AND     $~255, R1
-	MOVD    $(R3)(R1*1), R1
-	EXRL    $·mvcSrcToBuf(SB), R12
-	MOVD    $255, R0
-	SUB     R12, R0
-	MOVD    R0, (R9)               // update len
-
-aligned:
 	// setup
 	MOVD  $95, R0
 	VLM   (R5), KEY0, KEY1
@@ -217,9 +193,7 @@
 
 	// decrement length
 	ADD $-256, R4
-	BLT tail
 
-continue:
 	// rearrange vectors
 	SHUFFLE(X0, X1, X2, X3, M0, M1, M2, M3)
 	ADDV(J0, X0, X1, X2, X3)
@@ -245,16 +219,6 @@
 	MOVD $256(R3), R3
 
 	CMPBNE  R4, $0, chacha
-	CMPUBEQ R12, $255, return
-	EXRL    $·mvcBufToDst(SB), R12 // len was updated during setup
 
-return:
 	VSTEF $0, CTR, (R7)
 	RET
-
-tail:
-	MOVD R2, R9
-	MOVD R8, R2
-	MOVD R8, R3
-	MOVD $0, R4
-	JMP  continue
diff --git a/internal/chacha20/chacha_test.go b/internal/chacha20/chacha_test.go
index 9a7a099..913af4f 100644
--- a/internal/chacha20/chacha_test.go
+++ b/internal/chacha20/chacha_test.go
@@ -7,11 +7,16 @@
 import (
 	"encoding/binary"
 	"encoding/hex"
-	"fmt"
 	"math/rand"
 	"testing"
 )
 
+func _() {
+	// Assert that bufSize is a multiple of blockSize.
+	var b [1]byte
+	_ = b[bufSize%blockSize]
+}
+
 func TestCore(t *testing.T) {
 	// This is just a smoke test that checks the example from
 	// https://tools.ietf.org/html/rfc7539#section-2.3.2. The
@@ -121,6 +126,8 @@
 		}
 		// finish the encryption
 		s.XORKeyStream(output[i:], input[i:])
+		// ensure we tolerate a call with an empty input
+		s.XORKeyStream(output[len(output):], input[len(input):])
 
 		got := hex.EncodeToString(output)
 		if got != c.output {
@@ -170,24 +177,44 @@
 	}
 }
 
-func BenchmarkChaCha20(b *testing.B) {
-	sizes := []int{32, 63, 64, 256, 1024, 1350, 65536}
-	for _, size := range sizes {
-		s := size
-		b.Run(fmt.Sprint(s), func(b *testing.B) {
-			k := [32]byte{}
-			c := [16]byte{}
-			src := make([]byte, s)
-			dst := make([]byte, s)
-			b.SetBytes(int64(s))
-			b.ResetTimer()
-			for i := 0; i < b.N; i++ {
-				XORKeyStream(dst, src, &c, &k)
-			}
-		})
+func benchmarkChaCha20(b *testing.B, step, count int) {
+	tot := step * count
+	src := make([]byte, tot)
+	dst := make([]byte, tot)
+	b.SetBytes(int64(tot))
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		c := New([8]uint32{}, [3]uint32{})
+		for i := 0; i < tot; i += step {
+			c.XORKeyStream(dst[i:], src[i:i+step])
+		}
 	}
 }
 
+// BenchmarkChaCha20 measures XORKeyStream throughput. Sub-benchmarks are named
+// after the total number of bytes processed, or STEPxCOUNT when those bytes
+// are fed to a single Cipher in COUNT successive calls of STEP bytes each
+// (steps that are not a multiple of bufSize leave key stream buffered).
+func BenchmarkChaCha20(b *testing.B) {
+	for _, bm := range []struct {
+		name        string
+		step, count int
+	}{
+		{"64", 64, 1},
+		{"256", 256, 1},
+		{"10x25", 10, 25},
+		{"4096", 4096, 1}, // was (256, 1), which re-measured the "256" case
+		{"100x40", 100, 40},
+		{"65536", 65536, 1},
+		{"1000x65", 1000, 65},
+	} {
+		bm := bm // per-iteration copy for the closure (pre-Go 1.22 semantics)
+		b.Run(bm.name, func(b *testing.B) {
+			benchmarkChaCha20(b, bm.step, bm.count)
+		})
+	}
+}
+
 func TestHChaCha20(t *testing.T) {
 	// See draft-paragon-paseto-rfc-00 §7.2.1.
 	key := []byte{0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
diff --git a/internal/chacha20/xor.go b/internal/chacha20/xor.go
index 9c5ba0b..0110c98 100644
--- a/internal/chacha20/xor.go
+++ b/internal/chacha20/xor.go
@@ -4,9 +4,7 @@
 
 package chacha20
 
-import (
-	"runtime"
-)
+import "runtime"
 
 // Platforms that have fast unaligned 32-bit little endian accesses.
 const unaligned = runtime.GOARCH == "386" ||