internal/chacha20: refactor for readability and consistency
Separated the complex buffering logic from key stream generation more
clearly, added plenty of comments and generally refactored the Go
implementation for readability. Made the interface with the
generic/assembly cores smaller and more consistent, according to
golang.org/wiki/TargetSpecific.
We will recover the lost performance on unaligned calls by caching 3/4
of the first round across XORKeyStream invocations, which we now have
complexity budget for.
name old speed new speed delta
ChaCha20/64-4 435MB/s ± 2% 429MB/s ± 2% -1.47% (p=0.013 n=10+9)
ChaCha20/256-4 496MB/s ± 1% 493MB/s ± 2% ~ (p=0.280 n=10+10)
ChaCha20/10x25-4 283MB/s ± 1% 274MB/s ± 2% -3.13% (p=0.000 n=10+10)
ChaCha20/4096-4 494MB/s ± 1% 493MB/s ± 5% ~ (p=0.631 n=10+10)
ChaCha20/100x40-4 421MB/s ± 3% 408MB/s ± 1% -3.14% (p=0.003 n=9+9)
ChaCha20/65536-4 515MB/s ± 1% 519MB/s ± 3% ~ (p=0.161 n=7+10)
ChaCha20/1000x65-4 501MB/s ± 2% 501MB/s ± 3% ~ (p=0.497 n=9+10)
Also applied a fix for a lingering bug in the ppc64le assembly written
by Lynn Boger <laboger@linux.vnet.ibm.com>.
Updates golang/go#24485
Change-Id: I10cf24a7f10359b1b4ae63c9bb1946735b98ac9b
Reviewed-on: https://go-review.googlesource.com/c/crypto/+/185439
Reviewed-by: Michael Munday <mike.munday@ibm.com>
diff --git a/internal/chacha20/chacha_arm64.go b/internal/chacha20/chacha_arm64.go
index ad74e23..87f1e36 100644
--- a/internal/chacha20/chacha_arm64.go
+++ b/internal/chacha20/chacha_arm64.go
@@ -3,29 +3,15 @@
// license that can be found in the LICENSE file.
// +build go1.11
-// +build !gccgo
+// +build !gccgo,!appengine
package chacha20
-const (
- haveAsm = true
- bufSize = 256
-)
+const bufSize = 256
//go:noescape
func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32)
-func (c *Cipher) xorKeyStreamAsm(dst, src []byte) {
-
- if len(src) >= bufSize {
- xorKeyStreamVX(dst, src, &c.key, &c.nonce, &c.counter)
- }
-
- if len(src)%bufSize != 0 {
- i := len(src) - len(src)%bufSize
- c.buf = [bufSize]byte{}
- copy(c.buf[:], src[i:])
- xorKeyStreamVX(c.buf[:], c.buf[:], &c.key, &c.nonce, &c.counter)
- c.len = bufSize - copy(dst[i:], c.buf[:len(src)%bufSize])
- }
+func (c *Cipher) xorKeyStreamBlocks(dst, src []byte) {
+ xorKeyStreamVX(dst, src, &c.key, &c.nonce, &c.counter)
}
diff --git a/internal/chacha20/asm_arm64.s b/internal/chacha20/chacha_arm64.s
similarity index 100%
rename from internal/chacha20/asm_arm64.s
rename to internal/chacha20/chacha_arm64.s
diff --git a/internal/chacha20/chacha_generic.go b/internal/chacha20/chacha_generic.go
index 6570847..94222bf 100644
--- a/internal/chacha20/chacha_generic.go
+++ b/internal/chacha20/chacha_generic.go
@@ -2,57 +2,68 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// Package ChaCha20 implements the core ChaCha20 function as specified
-// in https://tools.ietf.org/html/rfc7539#section-2.3.
+// Package chacha20 implements the ChaCha20 encryption algorithm
+// as specified in RFC 8439.
package chacha20
import (
"crypto/cipher"
"encoding/binary"
+ "math/bits"
"golang.org/x/crypto/internal/subtle"
)
-// assert that *Cipher implements cipher.Stream
-var _ cipher.Stream = (*Cipher)(nil)
-
// Cipher is a stateful instance of ChaCha20 using a particular key
// and nonce. A *Cipher implements the cipher.Stream interface.
type Cipher struct {
+ // The ChaCha20 state is 16 words: 4 constant, 8 of key, 1 of counter
+ // (incremented after each block), and 3 of nonce.
key [8]uint32
- counter uint32 // incremented after each block
+ counter uint32
nonce [3]uint32
- buf [bufSize]byte // buffer for unused keystream bytes
- len int // number of unused keystream bytes at end of buf
+
+ // The last len bytes of buf are leftover key stream bytes from the previous
+ // XORKeyStream invocation. The size of buf depends on how many blocks are
+ // computed at a time.
+ buf [bufSize]byte
+ len int
}
+var _ cipher.Stream = (*Cipher)(nil)
+
// New creates a new ChaCha20 stream cipher with the given key and nonce.
// The initial counter value is set to 0.
func New(key [8]uint32, nonce [3]uint32) *Cipher {
return &Cipher{key: key, nonce: nonce}
}
-// ChaCha20 constants spelling "expand 32-byte k"
+// The constant first 4 words of the ChaCha20 state.
const (
- j0 uint32 = 0x61707865
- j1 uint32 = 0x3320646e
- j2 uint32 = 0x79622d32
- j3 uint32 = 0x6b206574
+ j0 uint32 = 0x61707865 // expa
+ j1 uint32 = 0x3320646e // nd 3
+ j2 uint32 = 0x79622d32 // 2-by
+ j3 uint32 = 0x6b206574 // te k
)
+const blockSize = 64
+
+// quarterRound is the core of ChaCha20. It shuffles the bits of 4 state words.
+// It's executed 4 times for each of the 20 ChaCha20 rounds, operating on all 16
+// words each round, in columnar or diagonal groups of 4 at a time.
func quarterRound(a, b, c, d uint32) (uint32, uint32, uint32, uint32) {
a += b
d ^= a
- d = (d << 16) | (d >> 16)
+ d = bits.RotateLeft32(d, 16)
c += d
b ^= c
- b = (b << 12) | (b >> 20)
+ b = bits.RotateLeft32(b, 12)
a += b
d ^= a
- d = (d << 8) | (d >> 24)
+ d = bits.RotateLeft32(d, 8)
c += d
b ^= c
- b = (b << 7) | (b >> 25)
+ b = bits.RotateLeft32(b, 7)
return a, b, c, d
}
@@ -67,116 +78,141 @@
// the src buffers was passed in a single run. That is, Cipher
// maintains state and does not reset at each XORKeyStream call.
func (s *Cipher) XORKeyStream(dst, src []byte) {
- if len(dst) < len(src) {
- panic("chacha20: output smaller than input")
- }
- if subtle.InexactOverlap(dst[:len(src)], src) {
- panic("chacha20: invalid buffer overlap")
- }
-
- // xor src with buffered keystream first
- if s.len != 0 {
- buf := s.buf[len(s.buf)-s.len:]
- if len(src) < len(buf) {
- buf = buf[:len(src)]
- }
- td, ts := dst[:len(buf)], src[:len(buf)] // BCE hint
- for i, b := range buf {
- td[i] = ts[i] ^ b
- }
- s.len -= len(buf)
- if s.len != 0 {
- return
- }
- s.buf = [len(s.buf)]byte{} // zero the empty buffer
- src = src[len(buf):]
- dst = dst[len(buf):]
- }
-
if len(src) == 0 {
return
}
- if haveAsm {
- if uint64(len(src))+uint64(s.counter)*64 > (1<<38)-64 {
- panic("chacha20: counter overflow")
+ if len(dst) < len(src) {
+ panic("chacha20: output smaller than input")
+ }
+ dst = dst[:len(src)]
+ if subtle.InexactOverlap(dst, src) {
+ panic("chacha20: invalid buffer overlap")
+ }
+
+ // First, drain any remaining key stream from a previous XORKeyStream.
+ if s.len != 0 {
+ keyStream := s.buf[bufSize-s.len:]
+ if len(src) < len(keyStream) {
+ keyStream = keyStream[:len(src)]
}
- s.xorKeyStreamAsm(dst, src)
- return
+ _ = src[len(keyStream)-1] // bounds check elimination hint
+ for i, b := range keyStream {
+ dst[i] = src[i] ^ b
+ }
+ s.len -= len(keyStream)
+ src = src[len(keyStream):]
+ dst = dst[len(keyStream):]
}
- // set up a 64-byte buffer to pad out the final block if needed
- // (hoisted out of the main loop to avoid spills)
- rem := len(src) % 64 // length of final block
- fin := len(src) - rem // index of final block
+ const blocksPerBuf = bufSize / blockSize
+ numBufs := (uint64(len(src)) + bufSize - 1) / bufSize
+ if uint64(s.counter)+numBufs*blocksPerBuf >= 1<<32 {
+ panic("chacha20: counter overflow")
+ }
+
+ // xorKeyStreamBlocks implementations expect input lengths that are a
+ // multiple of bufSize. Platform-specific ones process multiple blocks at a
+ // time, so have bufSizes that are a multiple of blockSize.
+
+ rem := len(src) % bufSize
+ full := len(src) - rem
+
+ if full > 0 {
+ s.xorKeyStreamBlocks(dst[:full], src[:full])
+ }
+
+ // If we have a partial (multi-)block, pad it for xorKeyStreamBlocks, and
+ // keep the leftover keystream for the next XORKeyStream invocation.
if rem > 0 {
- copy(s.buf[len(s.buf)-64:], src[fin:])
+ s.buf = [bufSize]byte{}
+ copy(s.buf[:], src[full:])
+ s.xorKeyStreamBlocks(s.buf[:], s.buf[:])
+ s.len = bufSize - copy(dst[full:], s.buf[:])
+ }
+}
+
+func (s *Cipher) xorKeyStreamBlocksGeneric(dst, src []byte) {
+ if len(dst) != len(src) || len(dst)%blockSize != 0 {
+ panic("chacha20: internal error: wrong dst and/or src length")
}
- // pre-calculate most of the first round
- s1, s5, s9, s13 := quarterRound(j1, s.key[1], s.key[5], s.nonce[0])
- s2, s6, s10, s14 := quarterRound(j2, s.key[2], s.key[6], s.nonce[1])
- s3, s7, s11, s15 := quarterRound(j3, s.key[3], s.key[7], s.nonce[2])
+ // To generate each block of key stream, the initial cipher state
+ // (represented below) is passed through 20 rounds of shuffling,
+ // alternately applying quarterRounds by columns (like 1, 5, 9, 13)
+ // or by diagonals (like 1, 6, 11, 12).
+ //
+ // 0:cccccccc 1:cccccccc 2:cccccccc 3:cccccccc
+ // 4:kkkkkkkk 5:kkkkkkkk 6:kkkkkkkk 7:kkkkkkkk
+ // 8:kkkkkkkk 9:kkkkkkkk 10:kkkkkkkk 11:kkkkkkkk
+ // 12:bbbbbbbb 13:nnnnnnnn 14:nnnnnnnn 15:nnnnnnnn
+ //
+ // c=constant k=key b=blockcount n=nonce
+ var (
+ c0, c1, c2, c3 = j0, j1, j2, j3
+ c4, c5, c6, c7 = s.key[0], s.key[1], s.key[2], s.key[3]
+ c8, c9, c10, c11 = s.key[4], s.key[5], s.key[6], s.key[7]
+ _, c13, c14, c15 = s.counter, s.nonce[0], s.nonce[1], s.nonce[2]
+ )
- n := len(src)
- src, dst = src[:n:n], dst[:n:n] // BCE hint
- for i := 0; i < n; i += 64 {
- // calculate the remainder of the first round
- s0, s4, s8, s12 := quarterRound(j0, s.key[0], s.key[4], s.counter)
+ // Three quarters of the first round don't depend on the counter, so we can
+ // calculate them here, and reuse them for multiple blocks in the loop.
+ // TODO(filippo): experiment with reusing across XORKeyStream calls.
+ s1, s5, s9, s13 := quarterRound(c1, c5, c9, c13)
+ s2, s6, s10, s14 := quarterRound(c2, c6, c10, c14)
+ s3, s7, s11, s15 := quarterRound(c3, c7, c11, c15)
- // execute the second round
+ for i := 0; i < len(src); i += blockSize {
+ // The remainder of the first column round.
+ s0, s4, s8, s12 := quarterRound(c0, c4, c8, s.counter)
+
+ // The second diagonal round.
x0, x5, x10, x15 := quarterRound(s0, s5, s10, s15)
x1, x6, x11, x12 := quarterRound(s1, s6, s11, s12)
x2, x7, x8, x13 := quarterRound(s2, s7, s8, s13)
x3, x4, x9, x14 := quarterRound(s3, s4, s9, s14)
- // execute the remaining 18 rounds
+ // The remaining 18 rounds.
for i := 0; i < 9; i++ {
+ // Column round.
x0, x4, x8, x12 = quarterRound(x0, x4, x8, x12)
x1, x5, x9, x13 = quarterRound(x1, x5, x9, x13)
x2, x6, x10, x14 = quarterRound(x2, x6, x10, x14)
x3, x7, x11, x15 = quarterRound(x3, x7, x11, x15)
+ // Diagonal round.
x0, x5, x10, x15 = quarterRound(x0, x5, x10, x15)
x1, x6, x11, x12 = quarterRound(x1, x6, x11, x12)
x2, x7, x8, x13 = quarterRound(x2, x7, x8, x13)
x3, x4, x9, x14 = quarterRound(x3, x4, x9, x14)
}
- x0 += j0
- x1 += j1
- x2 += j2
- x3 += j3
-
- x4 += s.key[0]
- x5 += s.key[1]
- x6 += s.key[2]
- x7 += s.key[3]
- x8 += s.key[4]
- x9 += s.key[5]
- x10 += s.key[6]
- x11 += s.key[7]
-
+ // Finally, add back the initial state to generate the key stream.
+ x0 += c0
+ x1 += c1
+ x2 += c2
+ x3 += c3
+ x4 += c4
+ x5 += c5
+ x6 += c6
+ x7 += c7
+ x8 += c8
+ x9 += c9
+ x10 += c10
+ x11 += c11
x12 += s.counter
- x13 += s.nonce[0]
- x14 += s.nonce[1]
- x15 += s.nonce[2]
+ x13 += c13
+ x14 += c14
+ x15 += c15
- // increment the counter
s.counter += 1
if s.counter == 0 {
- panic("chacha20: counter overflow")
+ panic("chacha20: internal error: counter overflow")
}
- // pad to 64 bytes if needed
in, out := src[i:], dst[i:]
- if i == fin {
- // src[fin:] has already been copied into s.buf before
- // the main loop
- in, out = s.buf[len(s.buf)-64:], s.buf[len(s.buf)-64:]
- }
- in, out = in[:64], out[:64] // BCE hint
+ in, out = in[:blockSize], out[:blockSize] // bounds check elimination hint
- // XOR the key stream with the source and write out the result
+ // XOR the key stream with the source and write out the result.
xor(out[0:], in[0:], x0)
xor(out[4:], in[4:], x1)
xor(out[8:], in[8:], x2)
@@ -194,22 +230,13 @@
xor(out[56:], in[56:], x14)
xor(out[60:], in[60:], x15)
}
- // copy any trailing bytes out of the buffer and into dst
- if rem != 0 {
- s.len = 64 - rem
- copy(dst[fin:], s.buf[len(s.buf)-64:])
- }
}
// Advance discards bytes in the key stream until the next 64 byte block
-// boundary is reached and updates the counter accordingly. If the key
-// stream is already at a block boundary no bytes will be discarded and
-// the counter will be unchanged.
+// boundary is reached. If the key stream is already at a block boundary no
+// bytes will be discarded.
func (s *Cipher) Advance() {
- s.len -= s.len % 64
- if s.len == 0 {
- s.buf = [len(s.buf)]byte{}
- }
+ s.len -= s.len % blockSize
}
// XORKeyStream crypts bytes from in to out using the given key and counters.
@@ -246,11 +273,13 @@
x12, x13, x14, x15 := nonce[0], nonce[1], nonce[2], nonce[3]
for i := 0; i < 10; i++ {
+ // Column round.
x0, x4, x8, x12 = quarterRound(x0, x4, x8, x12)
x1, x5, x9, x13 = quarterRound(x1, x5, x9, x13)
x2, x6, x10, x14 = quarterRound(x2, x6, x10, x14)
x3, x7, x11, x15 = quarterRound(x3, x7, x11, x15)
+ // Diagonal round.
x0, x5, x10, x15 = quarterRound(x0, x5, x10, x15)
x1, x6, x11, x12 = quarterRound(x1, x6, x11, x12)
x2, x7, x8, x13 = quarterRound(x2, x7, x8, x13)
diff --git a/internal/chacha20/chacha_noasm.go b/internal/chacha20/chacha_noasm.go
index fc26825..ec609ed 100644
--- a/internal/chacha20/chacha_noasm.go
+++ b/internal/chacha20/chacha_noasm.go
@@ -6,11 +6,8 @@
package chacha20
-const (
- bufSize = 64
- haveAsm = false
-)
+const bufSize = blockSize
-func (*Cipher) xorKeyStreamAsm(dst, src []byte) {
- panic("not implemented")
+func (s *Cipher) xorKeyStreamBlocks(dst, src []byte) {
+ s.xorKeyStreamBlocksGeneric(dst, src)
}
diff --git a/internal/chacha20/chacha_ppc64le.go b/internal/chacha20/chacha_ppc64le.go
index 8d832b3..d0ec61f 100644
--- a/internal/chacha20/chacha_ppc64le.go
+++ b/internal/chacha20/chacha_ppc64le.go
@@ -2,53 +2,15 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// +build ppc64le,!gccgo,!appengine
+// +build !gccgo,!appengine
package chacha20
-import (
- "encoding/binary"
-)
-
-const (
- bufSize = 256
- haveAsm = true
-)
+const bufSize = 256
//go:noescape
func chaCha20_ctr32_vsx(out, inp *byte, len int, key *[8]uint32, counter *uint32)
-func (c *Cipher) xorKeyStreamAsm(dst, src []byte) {
- // This implementation can handle buffers that aren't multiples of
- // 256.
- if len(src) >= bufSize {
- chaCha20_ctr32_vsx(&dst[0], &src[0], len(src), &c.key, &c.counter)
- } else if len(src)%bufSize != 0 {
- chaCha20_ctr32_vsx(&c.buf[0], &c.buf[0], bufSize, &c.key, &c.counter)
- start := len(src) - len(src)%bufSize
- ts, td, tb := src[start:], dst[start:], c.buf[:]
- // Unroll loop to XOR 32 bytes per iteration.
- for i := 0; i < len(ts)-32; i += 32 {
- td, tb = td[:len(ts)], tb[:len(ts)] // bounds check elimination
- s0 := binary.LittleEndian.Uint64(ts[0:8])
- s1 := binary.LittleEndian.Uint64(ts[8:16])
- s2 := binary.LittleEndian.Uint64(ts[16:24])
- s3 := binary.LittleEndian.Uint64(ts[24:32])
- b0 := binary.LittleEndian.Uint64(tb[0:8])
- b1 := binary.LittleEndian.Uint64(tb[8:16])
- b2 := binary.LittleEndian.Uint64(tb[16:24])
- b3 := binary.LittleEndian.Uint64(tb[24:32])
- binary.LittleEndian.PutUint64(td[0:8], s0^b0)
- binary.LittleEndian.PutUint64(td[8:16], s1^b1)
- binary.LittleEndian.PutUint64(td[16:24], s2^b2)
- binary.LittleEndian.PutUint64(td[24:32], s3^b3)
- ts, td, tb = ts[32:], td[32:], tb[32:]
- }
- td, tb = td[:len(ts)], tb[:len(ts)] // bounds check elimination
- for i, v := range ts {
- td[i] = tb[i] ^ v
- }
- c.len = bufSize - (len(src) % bufSize)
- }
-
+func (c *Cipher) xorKeyStreamBlocks(dst, src []byte) {
+ chaCha20_ctr32_vsx(&dst[0], &src[0], len(src), &c.key, &c.counter)
}
diff --git a/internal/chacha20/asm_ppc64le.s b/internal/chacha20/chacha_ppc64le.s
similarity index 96%
rename from internal/chacha20/asm_ppc64le.s
rename to internal/chacha20/chacha_ppc64le.s
index bc9b562..533014e 100644
--- a/internal/chacha20/asm_ppc64le.s
+++ b/internal/chacha20/chacha_ppc64le.s
@@ -19,7 +19,7 @@
// The differences in this and the original implementation are
// due to the calling conventions and initialization of constants.
-// +build ppc64le,!gccgo,!appengine
+// +build !gccgo,!appengine
#include "textflag.h"
@@ -31,24 +31,7 @@
#define TMP R15
#define CONSTBASE R16
-
-#define X0 R11
-#define X1 R12
-#define X2 R14
-#define X3 R15
-#define X4 R16
-#define X5 R17
-#define X6 R18
-#define X7 R19
-#define X8 R20
-#define X9 R21
-#define X10 R22
-#define X11 R23
-#define X12 R24
-#define X13 R25
-#define X14 R26
-#define X15 R27
-
+#define BLOCKS R17
DATA consts<>+0x00(SB)/8, $0x3320646e61707865
DATA consts<>+0x08(SB)/8, $0x6b20657479622d32
@@ -86,6 +69,7 @@
MOVD $32, R9
MOVD $48, R10
MOVD $64, R11
+ SRD $6, LEN, BLOCKS
// V16
LXVW4X (CONSTBASE)(R0), VS48
ADD $80,CONSTBASE
@@ -429,9 +413,9 @@
BNE loop_outer_vsx
done_vsx:
- // Increment counter by 4
+ // Increment counter by number of 64 byte blocks
MOVD (CNT), R14
- ADD $4, R14
+ ADD BLOCKS, R14
MOVD R14, (CNT)
RET
diff --git a/internal/chacha20/chacha_s390x.go b/internal/chacha20/chacha_s390x.go
index aad645b..cd55f45 100644
--- a/internal/chacha20/chacha_s390x.go
+++ b/internal/chacha20/chacha_s390x.go
@@ -2,28 +2,25 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// +build s390x,!gccgo,!appengine
+// +build !gccgo,!appengine
package chacha20
-import (
- "golang.org/x/sys/cpu"
-)
+import "golang.org/x/sys/cpu"
var haveAsm = cpu.S390X.HasVX
const bufSize = 256
// xorKeyStreamVX is an assembly implementation of XORKeyStream. It must only
-// be called when the vector facility is available.
-// Implementation in asm_s390x.s.
+// be called when the vector facility is available. Implementation in chacha_s390x.s.
//go:noescape
-func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32, buf *[256]byte, len *int)
+func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32)
-func (c *Cipher) xorKeyStreamAsm(dst, src []byte) {
- xorKeyStreamVX(dst, src, &c.key, &c.nonce, &c.counter, &c.buf, &c.len)
+func (c *Cipher) xorKeyStreamBlocks(dst, src []byte) {
+ if cpu.S390X.HasVX {
+ xorKeyStreamVX(dst, src, &c.key, &c.nonce, &c.counter)
+ } else {
+ c.xorKeyStreamBlocksGeneric(dst, src)
+ }
}
-
-// EXRL targets, DO NOT CALL!
-func mvcSrcToBuf()
-func mvcBufToDst()
diff --git a/internal/chacha20/chacha_s390x.s b/internal/chacha20/chacha_s390x.s
index 57df404..de52a2e 100644
--- a/internal/chacha20/chacha_s390x.s
+++ b/internal/chacha20/chacha_s390x.s
@@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// +build s390x,!gccgo,!appengine
+// +build !gccgo,!appengine
#include "go_asm.h"
#include "textflag.h"
@@ -24,15 +24,6 @@
DATA ·constants<>+0x18(SB)/4, $0x79622d32
DATA ·constants<>+0x1c(SB)/4, $0x6b206574
-// EXRL targets:
-TEXT ·mvcSrcToBuf(SB), NOFRAME|NOSPLIT, $0
- MVC $1, (R1), (R8)
- RET
-
-TEXT ·mvcBufToDst(SB), NOFRAME|NOSPLIT, $0
- MVC $1, (R8), (R9)
- RET
-
#define BSWAP V5
#define J0 V6
#define KEY0 V7
@@ -144,7 +135,7 @@
VMRHF v, w, c \ // c = {a[2], b[2], c[2], d[2]}
VMRLF v, w, d // d = {a[3], b[3], c[3], d[3]}
-// func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32, buf *[256]byte, len *int)
+// func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32)
TEXT ·xorKeyStreamVX(SB), NOSPLIT, $0
MOVD $·constants<>(SB), R1
MOVD dst+0(FP), R2 // R2=&dst[0]
@@ -152,25 +143,10 @@
MOVD key+48(FP), R5 // R5=key
MOVD nonce+56(FP), R6 // R6=nonce
MOVD counter+64(FP), R7 // R7=counter
- MOVD buf+72(FP), R8 // R8=buf
- MOVD len+80(FP), R9 // R9=len
// load BSWAP and J0
VLM (R1), BSWAP, J0
- // set up tail buffer
- ADD $-1, R4, R12
- MOVBZ R12, R12
- CMPUBEQ R12, $255, aligned
- MOVD R4, R1
- AND $~255, R1
- MOVD $(R3)(R1*1), R1
- EXRL $·mvcSrcToBuf(SB), R12
- MOVD $255, R0
- SUB R12, R0
- MOVD R0, (R9) // update len
-
-aligned:
// setup
MOVD $95, R0
VLM (R5), KEY0, KEY1
@@ -217,9 +193,7 @@
// decrement length
ADD $-256, R4
- BLT tail
-continue:
// rearrange vectors
SHUFFLE(X0, X1, X2, X3, M0, M1, M2, M3)
ADDV(J0, X0, X1, X2, X3)
@@ -245,16 +219,6 @@
MOVD $256(R3), R3
CMPBNE R4, $0, chacha
- CMPUBEQ R12, $255, return
- EXRL $·mvcBufToDst(SB), R12 // len was updated during setup
-return:
VSTEF $0, CTR, (R7)
RET
-
-tail:
- MOVD R2, R9
- MOVD R8, R2
- MOVD R8, R3
- MOVD $0, R4
- JMP continue
diff --git a/internal/chacha20/chacha_test.go b/internal/chacha20/chacha_test.go
index 9a7a099..913af4f 100644
--- a/internal/chacha20/chacha_test.go
+++ b/internal/chacha20/chacha_test.go
@@ -7,11 +7,16 @@
import (
"encoding/binary"
"encoding/hex"
- "fmt"
"math/rand"
"testing"
)
+func _() {
+ // Assert that bufSize is a multiple of blockSize.
+ var b [1]byte
+ _ = b[bufSize%blockSize]
+}
+
func TestCore(t *testing.T) {
// This is just a smoke test that checks the example from
// https://tools.ietf.org/html/rfc7539#section-2.3.2. The
@@ -121,6 +126,8 @@
}
// finish the encryption
s.XORKeyStream(output[i:], input[i:])
+ // ensure we tolerate a call with an empty input
+ s.XORKeyStream(output[len(output):], input[len(input):])
got := hex.EncodeToString(output)
if got != c.output {
@@ -170,24 +177,44 @@
}
}
-func BenchmarkChaCha20(b *testing.B) {
- sizes := []int{32, 63, 64, 256, 1024, 1350, 65536}
- for _, size := range sizes {
- s := size
- b.Run(fmt.Sprint(s), func(b *testing.B) {
- k := [32]byte{}
- c := [16]byte{}
- src := make([]byte, s)
- dst := make([]byte, s)
- b.SetBytes(int64(s))
- b.ResetTimer()
- for i := 0; i < b.N; i++ {
- XORKeyStream(dst, src, &c, &k)
- }
- })
+func benchmarkChaCha20(b *testing.B, step, count int) {
+ tot := step * count
+ src := make([]byte, tot)
+ dst := make([]byte, tot)
+ b.SetBytes(int64(tot))
+ b.ResetTimer()
+ for i := 0; i < b.N; i++ {
+ c := New([8]uint32{}, [3]uint32{})
+ for i := 0; i < tot; i += step {
+ c.XORKeyStream(dst[i:], src[i:i+step])
+ }
}
}
+func BenchmarkChaCha20(b *testing.B) {
+ b.Run("64", func(b *testing.B) {
+ benchmarkChaCha20(b, 64, 1)
+ })
+ b.Run("256", func(b *testing.B) {
+ benchmarkChaCha20(b, 256, 1)
+ })
+ b.Run("10x25", func(b *testing.B) {
+ benchmarkChaCha20(b, 10, 25)
+ })
+ b.Run("4096", func(b *testing.B) {
+ benchmarkChaCha20(b, 4096, 1) // one 4096-byte call, matching the sub-benchmark name
+ })
+ b.Run("100x40", func(b *testing.B) {
+ benchmarkChaCha20(b, 100, 40)
+ })
+ b.Run("65536", func(b *testing.B) {
+ benchmarkChaCha20(b, 65536, 1)
+ })
+ b.Run("1000x65", func(b *testing.B) {
+ benchmarkChaCha20(b, 1000, 65)
+ })
+}
+
func TestHChaCha20(t *testing.T) {
// See draft-paragon-paseto-rfc-00 §7.2.1.
key := []byte{0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
diff --git a/internal/chacha20/xor.go b/internal/chacha20/xor.go
index 9c5ba0b..0110c98 100644
--- a/internal/chacha20/xor.go
+++ b/internal/chacha20/xor.go
@@ -4,9 +4,7 @@
package chacha20
-import (
- "runtime"
-)
+import "runtime"
// Platforms that have fast unaligned 32-bit little endian accesses.
const unaligned = runtime.GOARCH == "386" ||