poly1305: add (*MAC).Verify API and use it in chacha20poly1305

Also, make sure New is inlined so that it does not cause an allocation.
With this, we have a zero-allocation poly1305 flow and a zero-allocation
generic chacha20poly1305 composition! \o/

While at it, remove some redundant code, and prepare to drop some
complexity once the last assembly implementation of sum is dropped.

Benchstat results with "-tags purego" on amd64:

name              old time/op    new time/op    delta
Open-64-8         461ns ± 2%     415ns ± 1%    -9.93%  (p=0.000 n=10+8)
Seal-64-8         450ns ± 3%     412ns ± 3%    -8.41%  (p=0.000 n=10+10)
Open-64-X-8       603ns ± 2%     544ns ± 2%    -9.84%  (p=0.000 n=10+10)
Seal-64-X-8       580ns ± 3%     553ns ± 1%    -4.56%  (p=0.000 n=9+8)
Open-1350-8      3.98µs ± 2%    3.65µs ± 2%    -8.28%  (p=0.000 n=9+10)
Seal-1350-8      3.95µs ± 2%    3.64µs ± 1%    -7.93%  (p=0.000 n=9+10)
Open-1350-X-8    4.06µs ± 1%    3.68µs ± 3%    -9.31%  (p=0.000 n=9+10)
Seal-1350-X-8    4.08µs ± 4%    3.64µs ± 1%   -10.71%  (p=0.000 n=9+10)
Open-8192-8      21.7µs ± 3%    18.9µs ± 2%   -13.10%  (p=0.000 n=9+10)
Seal-8192-8      21.5µs ± 3%    18.8µs ± 1%   -12.51%  (p=0.000 n=9+9)
Open-8192-X-8    21.4µs ± 1%    19.1µs ± 2%   -10.88%  (p=0.000 n=10+10)
Seal-8192-X-8    21.3µs ± 2%    19.0µs ± 3%   -10.92%  (p=0.000 n=10+10)

name              old speed      new speed      delta
Open-64-8       139MB/s ± 2%   154MB/s ± 2%   +11.05%  (p=0.000 n=10+8)
Seal-64-8       142MB/s ± 3%   155MB/s ± 3%    +9.11%  (p=0.000 n=10+10)
Open-64-X-8     106MB/s ± 2%   118MB/s ± 2%   +10.93%  (p=0.000 n=10+10)
Seal-64-X-8     110MB/s ± 3%   116MB/s ± 1%    +4.75%  (p=0.000 n=9+8)
Open-1350-8     339MB/s ± 2%   370MB/s ± 2%    +9.04%  (p=0.000 n=9+10)
Seal-1350-8     342MB/s ± 2%   371MB/s ± 1%    +8.60%  (p=0.000 n=9+10)
Open-1350-X-8   333MB/s ± 1%   367MB/s ± 3%   +10.30%  (p=0.000 n=9+10)
Seal-1350-X-8   331MB/s ± 4%   371MB/s ± 2%   +11.96%  (p=0.000 n=9+10)
Open-8192-8     377MB/s ± 3%   434MB/s ± 2%   +15.07%  (p=0.000 n=9+10)
Seal-8192-8     381MB/s ± 3%   436MB/s ± 1%   +14.29%  (p=0.000 n=9+9)
Open-8192-X-8   383MB/s ± 1%   429MB/s ± 2%   +12.21%  (p=0.000 n=10+10)
Seal-8192-X-8   385MB/s ± 2%   432MB/s ± 3%   +12.26%  (p=0.000 n=10+10)

name              old alloc/op   new alloc/op   delta
Open-64-8         96.0B ± 0%      0.0B       -100.00%  (p=0.000 n=10+10)
Seal-64-8         96.0B ± 0%      0.0B       -100.00%  (p=0.000 n=10+10)
Open-64-X-8       96.0B ± 0%      0.0B       -100.00%  (p=0.000 n=10+10)
Seal-64-X-8       96.0B ± 0%      0.0B       -100.00%  (p=0.000 n=10+10)
Open-1350-8      1.41kB ± 0%    0.00kB       -100.00%  (p=0.000 n=10+10)
Seal-1350-8      1.41kB ± 0%    0.00kB       -100.00%  (p=0.000 n=10+10)
Open-1350-X-8    1.41kB ± 0%    0.00kB       -100.00%  (p=0.000 n=10+10)
Seal-1350-X-8    1.41kB ± 0%    0.00kB       -100.00%  (p=0.000 n=10+10)
Open-8192-8      9.47kB ± 0%    0.00kB       -100.00%  (p=0.000 n=10+10)
Seal-8192-8      9.47kB ± 0%    0.00kB       -100.00%  (p=0.000 n=10+10)
Open-8192-X-8    9.47kB ± 0%    0.00kB       -100.00%  (p=0.000 n=10+10)
Seal-8192-X-8    9.47kB ± 0%    0.00kB       -100.00%  (p=0.000 n=10+10)

name              old allocs/op  new allocs/op  delta
Open-64-8          1.00 ± 0%      0.00       -100.00%  (p=0.000 n=10+10)
Seal-64-8          1.00 ± 0%      0.00       -100.00%  (p=0.000 n=10+10)
Open-64-X-8        1.00 ± 0%      0.00       -100.00%  (p=0.000 n=10+10)
Seal-64-X-8        1.00 ± 0%      0.00       -100.00%  (p=0.000 n=10+10)
Open-1350-8        1.00 ± 0%      0.00       -100.00%  (p=0.000 n=10+10)
Seal-1350-8        1.00 ± 0%      0.00       -100.00%  (p=0.000 n=10+10)
Open-1350-X-8      1.00 ± 0%      0.00       -100.00%  (p=0.000 n=10+10)
Seal-1350-X-8      1.00 ± 0%      0.00       -100.00%  (p=0.000 n=10+10)
Open-8192-8        1.00 ± 0%      0.00       -100.00%  (p=0.000 n=10+10)
Seal-8192-8        1.00 ± 0%      0.00       -100.00%  (p=0.000 n=10+10)
Open-8192-X-8      1.00 ± 0%      0.00       -100.00%  (p=0.000 n=10+10)
Seal-8192-X-8      1.00 ± 0%      0.00       -100.00%  (p=0.000 n=10+10)

Change-Id: I2c30ddc960a889b49c8ee8ff8073ffc4e75f43af
Reviewed-on: https://go-review.googlesource.com/c/crypto/+/206977
Run-TryBot: Filippo Valsorda <filippo@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Katie Hockman <katie@golang.org>
diff --git a/chacha20poly1305/chacha20poly1305_generic.go b/chacha20poly1305/chacha20poly1305_generic.go
index 91b3856..fe191d3 100644
--- a/chacha20poly1305/chacha20poly1305_generic.go
+++ b/chacha20poly1305/chacha20poly1305_generic.go
@@ -12,56 +12,64 @@
 	"golang.org/x/crypto/poly1305"
 )
 
-func roundTo16(n int) int {
-	return 16 * ((n + 15) / 16)
+func writeWithPadding(p *poly1305.MAC, b []byte) {
+	p.Write(b)
+	if rem := len(b) % 16; rem != 0 {
+		var buf [16]byte
+		padLen := 16 - rem
+		p.Write(buf[:padLen])
+	}
+}
+
+func writeUint64(p *poly1305.MAC, n int) {
+	var buf [8]byte
+	binary.LittleEndian.PutUint64(buf[:], uint64(n))
+	p.Write(buf[:])
 }
 
 func (c *chacha20poly1305) sealGeneric(dst, nonce, plaintext, additionalData []byte) []byte {
 	ret, out := sliceForAppend(dst, len(plaintext)+poly1305.TagSize)
+	ciphertext, tag := out[:len(plaintext)], out[len(plaintext):]
 	if subtle.InexactOverlap(out, plaintext) {
 		panic("chacha20poly1305: invalid buffer overlap")
 	}
 
-	var polyKey, discardBuf [32]byte
+	var polyKey [32]byte
 	s, _ := chacha20.NewUnauthenticatedCipher(c.key[:], nonce)
 	s.XORKeyStream(polyKey[:], polyKey[:])
-	s.XORKeyStream(discardBuf[:], discardBuf[:]) // skip the next 32 bytes
-	s.XORKeyStream(out, plaintext)
+	s.SetCounter(1) // set the counter to 1, skipping 32 bytes
+	s.XORKeyStream(ciphertext, plaintext)
 
-	polyInput := make([]byte, roundTo16(len(additionalData))+roundTo16(len(plaintext))+8+8)
-	copy(polyInput, additionalData)
-	copy(polyInput[roundTo16(len(additionalData)):], out[:len(plaintext)])
-	binary.LittleEndian.PutUint64(polyInput[len(polyInput)-16:], uint64(len(additionalData)))
-	binary.LittleEndian.PutUint64(polyInput[len(polyInput)-8:], uint64(len(plaintext)))
-
-	var tag [poly1305.TagSize]byte
-	poly1305.Sum(&tag, polyInput, &polyKey)
-	copy(out[len(plaintext):], tag[:])
+	p := poly1305.New(&polyKey)
+	writeWithPadding(p, additionalData)
+	writeWithPadding(p, ciphertext)
+	writeUint64(p, len(additionalData))
+	writeUint64(p, len(plaintext))
+	p.Sum(tag[:0])
 
 	return ret
 }
 
 func (c *chacha20poly1305) openGeneric(dst, nonce, ciphertext, additionalData []byte) ([]byte, error) {
-	var tag [poly1305.TagSize]byte
-	copy(tag[:], ciphertext[len(ciphertext)-16:])
+	tag := ciphertext[len(ciphertext)-16:]
 	ciphertext = ciphertext[:len(ciphertext)-16]
 
-	var polyKey, discardBuf [32]byte
+	var polyKey [32]byte
 	s, _ := chacha20.NewUnauthenticatedCipher(c.key[:], nonce)
 	s.XORKeyStream(polyKey[:], polyKey[:])
-	s.XORKeyStream(discardBuf[:], discardBuf[:]) // skip the next 32 bytes
+	s.SetCounter(1) // set the counter to 1, skipping 32 bytes
 
-	polyInput := make([]byte, roundTo16(len(additionalData))+roundTo16(len(ciphertext))+8+8)
-	copy(polyInput, additionalData)
-	copy(polyInput[roundTo16(len(additionalData)):], ciphertext)
-	binary.LittleEndian.PutUint64(polyInput[len(polyInput)-16:], uint64(len(additionalData)))
-	binary.LittleEndian.PutUint64(polyInput[len(polyInput)-8:], uint64(len(ciphertext)))
+	p := poly1305.New(&polyKey)
+	writeWithPadding(p, additionalData)
+	writeWithPadding(p, ciphertext)
+	writeUint64(p, len(additionalData))
+	writeUint64(p, len(ciphertext))
 
 	ret, out := sliceForAppend(dst, len(ciphertext))
 	if subtle.InexactOverlap(out, ciphertext) {
 		panic("chacha20poly1305: invalid buffer overlap")
 	}
-	if !poly1305.Verify(&tag, polyInput, &polyKey) {
+	if !p.Verify(tag) {
 		for i := range out {
 			out[i] = 0
 		}
diff --git a/poly1305/mac_noasm.go b/poly1305/mac_noasm.go
index b0c2cd0..347c8b1 100644
--- a/poly1305/mac_noasm.go
+++ b/poly1305/mac_noasm.go
@@ -7,5 +7,3 @@
 package poly1305
 
 type mac struct{ macGeneric }
-
-func newMAC(key *[32]byte) mac { return mac{newMACGeneric(key)} }
diff --git a/poly1305/poly1305.go b/poly1305/poly1305.go
index 066159b..3c75c2a 100644
--- a/poly1305/poly1305.go
+++ b/poly1305/poly1305.go
@@ -46,10 +46,9 @@
 // two different messages with the same key allows an attacker
 // to forge messages at will.
 func New(key *[32]byte) *MAC {
-	return &MAC{
-		mac:       newMAC(key),
-		finalized: false,
-	}
+	m := &MAC{}
+	initialize(key, &m.macState)
+	return m
 }
 
 // MAC is an io.Writer computing an authentication tag
@@ -58,7 +57,7 @@
 // MAC cannot be used like common hash.Hash implementations,
 // because using a poly1305 key twice breaks its security.
 // Therefore writing data to a running MAC after calling
-// Sum causes it to panic.
+// Sum or Verify causes it to panic.
 type MAC struct {
 	mac // platform-dependent implementation
 
@@ -71,10 +70,10 @@
 // Write adds more data to the running message authentication code.
 // It never returns an error.
 //
-// It must not be called after the first call of Sum.
+// It must not be called after the first call of Sum or Verify.
 func (h *MAC) Write(p []byte) (n int, err error) {
 	if h.finalized {
-		panic("poly1305: write to MAC after Sum")
+		panic("poly1305: write to MAC after Sum or Verify")
 	}
 	return h.mac.Write(p)
 }
@@ -87,3 +86,12 @@
 	h.finalized = true
 	return append(b, mac[:]...)
 }
+
+// Verify returns whether the authenticator of all data written to
+// the message authentication code matches the expected value.
+func (h *MAC) Verify(expected []byte) bool {
+	var mac [TagSize]byte
+	h.mac.Sum(&mac)
+	h.finalized = true
+	return subtle.ConstantTimeCompare(expected, mac[:]) == 1
+}
diff --git a/poly1305/poly1305_test.go b/poly1305/poly1305_test.go
index b258eed..721a262 100644
--- a/poly1305/poly1305_test.go
+++ b/poly1305/poly1305_test.go
@@ -60,6 +60,30 @@
 		if tag != v.Tag() {
 			t.Errorf("%d: expected %x, got %x", i, v.Tag(), tag[:])
 		}
+		if !Verify(&tag, in, &key) {
+			t.Errorf("%d: tag didn't verify", i)
+		}
+		// If the key is zero, the tag will always be zero, independent of the input.
+		if len(in) > 0 && key != [32]byte{} {
+			in[0] ^= 0xff
+			if Verify(&tag, in, &key) {
+				t.Errorf("%d: tag verified after altering the input", i)
+			}
+			in[0] ^= 0xff
+		}
+		// If the input is empty, the tag only depends on the second half of the key.
+		if len(in) > 0 {
+			key[0] ^= 0xff
+			if Verify(&tag, in, &key) {
+				t.Errorf("%d: tag verified after altering the key", i)
+			}
+			key[0] ^= 0xff
+		}
+		tag[0] ^= 0xff
+		if Verify(&tag, in, &key) {
+			t.Errorf("%d: tag verified after altering the tag", i)
+		}
+		tag[0] ^= 0xff
 	}
 }
 
@@ -150,9 +174,17 @@
 			t.Errorf("#%d: unexpected Write results: n = %d, err = %v", i, n, err)
 		}
 		h.Sum(out[:0])
-		if tag := v.Tag(); out != tag {
+		tag := v.Tag()
+		if out != tag {
 			t.Errorf("%d: expected %x, got %x", i, tag[:], out[:])
 		}
+		if !h.Verify(tag[:]) {
+			t.Errorf("%d: Verify failed", i)
+		}
+		tag[0] ^= 0xff
+		if h.Verify(tag[:]) {
+			t.Errorf("%d: Verify succeeded after modifying the tag", i)
+		}
 	}
 }
 
diff --git a/poly1305/sum_amd64.go b/poly1305/sum_amd64.go
index 35b9e38..99e5a1d 100644
--- a/poly1305/sum_amd64.go
+++ b/poly1305/sum_amd64.go
@@ -9,17 +9,6 @@
 //go:noescape
 func update(state *macState, msg []byte)
 
-func sum(out *[16]byte, m []byte, key *[32]byte) {
-	h := newMAC(key)
-	h.Write(m)
-	h.Sum(out)
-}
-
-func newMAC(key *[32]byte) (h mac) {
-	initialize(key, &h.r, &h.s)
-	return
-}
-
 // mac is a wrapper for macGeneric that redirects calls that would have gone to
 // updateGeneric to update.
 //
diff --git a/poly1305/sum_generic.go b/poly1305/sum_generic.go
index 1187eab..c77ff17 100644
--- a/poly1305/sum_generic.go
+++ b/poly1305/sum_generic.go
@@ -31,9 +31,10 @@
 	h.Sum(out)
 }
 
-func newMACGeneric(key *[32]byte) (h macGeneric) {
-	initialize(key, &h.r, &h.s)
-	return
+func newMACGeneric(key *[32]byte) macGeneric {
+	m := macGeneric{}
+	initialize(key, &m.macState)
+	return m
 }
 
 // macState holds numbers in saturated 64-bit little-endian limbs. That is,
@@ -97,11 +98,12 @@
 	rMask1 = 0x0FFFFFFC0FFFFFFC
 )
 
-func initialize(key *[32]byte, r, s *[2]uint64) {
-	r[0] = binary.LittleEndian.Uint64(key[0:8]) & rMask0
-	r[1] = binary.LittleEndian.Uint64(key[8:16]) & rMask1
-	s[0] = binary.LittleEndian.Uint64(key[16:24])
-	s[1] = binary.LittleEndian.Uint64(key[24:32])
+// initialize loads the 256-bit key into the two 128-bit secret values r and s.
+func initialize(key *[32]byte, m *macState) {
+	m.r[0] = binary.LittleEndian.Uint64(key[0:8]) & rMask0
+	m.r[1] = binary.LittleEndian.Uint64(key[8:16]) & rMask1
+	m.s[0] = binary.LittleEndian.Uint64(key[16:24])
+	m.s[1] = binary.LittleEndian.Uint64(key[24:32])
 }
 
 // uint128 holds a 128-bit number as two 64-bit limbs, for use with the
diff --git a/poly1305/sum_noasm.go b/poly1305/sum_noasm.go
index 2e3ae34..2b55a29 100644
--- a/poly1305/sum_noasm.go
+++ b/poly1305/sum_noasm.go
@@ -2,12 +2,17 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build s390x,!go1.11 !amd64,!s390x,!ppc64le gccgo purego
+// At this point only s390x has an assembly implementation of sum. All other
+// platforms either have an assembly implementation of mac or use the generic
+// one, and just define sum in terms of it through New. Once s390x is ported,
+// this file can be deleted and the body of sum moved into Sum.
+
+// +build !go1.11 !s390x gccgo purego
 
 package poly1305
 
 func sum(out *[TagSize]byte, msg []byte, key *[32]byte) {
-	h := newMAC(key)
+	h := New(key)
 	h.Write(msg)
-	h.Sum(out)
+	h.Sum(out[:0])
 }
diff --git a/poly1305/sum_ppc64le.go b/poly1305/sum_ppc64le.go
index 92597bb..2e7a120 100644
--- a/poly1305/sum_ppc64le.go
+++ b/poly1305/sum_ppc64le.go
@@ -9,17 +9,6 @@
 //go:noescape
 func update(state *macState, msg []byte)
 
-func sum(out *[16]byte, m []byte, key *[32]byte) {
-	h := newMAC(key)
-	h.Write(m)
-	h.Sum(out)
-}
-
-func newMAC(key *[32]byte) (h mac) {
-	initialize(key, &h.r, &h.s)
-	return
-}
-
 // mac is a wrapper for macGeneric that redirects calls that would have gone to
 // updateGeneric to update.
 //