sha3: use unaligned reads and xors on x86 and x64 Speedup of about 1.4x on x64. Added benchmarks that use the ShakeHash interface, which doesn't require copying the state. Unaligned or generic xorIn and copyOut functions chosen via buildline, but both are tested. Substantial contributions from Eric Eisner. See golang.org/cl/151630044 for the previous CR. (There are also some minor edits/additions to the documentation.) Change-Id: I9500c25682457c82487512b9b8c66df7d75bff5d Reviewed-on: https://go-review.googlesource.com/2132 Reviewed-by: Adam Langley <agl@golang.org>

commit: 4ed45ec682102c643324fae5dff8dab085b6c300 [log] [tgz]
author: David Leon Gil <coruus@gmail.com> Fri Dec 26 19:04:10 2014 -0800
committer: Adam Langley <agl@golang.org> Mon Jan 12 22:01:33 2015 +0000
tree: 450955bdb881d2f98175acd54c44e26bcad29293
parent: 160b2e156e1c2c1e7d8db6b5e3f6ccf75e9dd34f [diff] [blame]
diff --git a/sha3/sha3.go b/sha3/sha3.go
index 8d77568..c8fd31c 100644
--- a/sha3/sha3.go
+++ b/sha3/sha3.go

@@ -4,10 +4,6 @@
 
 package sha3
 
-import (
-	"encoding/binary"
-)
-
 // spongeDirection indicates the direction bytes are flowing through the sponge.
 type spongeDirection int
 
@@ -30,25 +26,25 @@
 	buf  []byte     // points into storage
 	rate int        // the number of bytes of state to use
 
-	// dsbyte contains the "domain separation" value and the first bit of
-	// the padding. In sections 6.1 and 6.2 of [1], the SHA-3 and SHAKE
-	// functions are defined with bits appended to the message: SHA-3
-	// functions have 01 and SHAKE functions have 1111. Because of the way
-	// that bits are numbered from the LSB upwards, that ends up as
-	// 00000010b and 00001111b, respectively. Then the padding rule from
-	// section 5.1 is applied to pad to a multiple of the rate, which
-	// involves adding a 1 bit, zero or more zero bits and then a final one
-	// bit. The first one bit from the padding is merged into the dsbyte
-	// value giving 00000110b (0x06) and 00011111b (0x1f), respectively.
-	//
-	// [1] http://csrc.nist.gov/publications/drafts/fips-202/fips_202_draft.pdf,
+	// dsbyte contains the "domain separation" bits and the first bit of
+	// the padding. Sections 6.1 and 6.2 of [1] separate the outputs of the
+	// SHA-3 and SHAKE functions by appending bitstrings to the message.
+	// Using a little-endian bit-ordering convention, these are "01" for SHA-3
+	// and "1111" for SHAKE, or 00000010b and 00001111b, respectively. Then the
+	// padding rule from section 5.1 is applied to pad the message to a multiple
+	// of the rate, which involves adding a "1" bit, zero or more "0" bits, and
+	// a final "1" bit. We merge the first "1" bit from the padding into dsbyte,
+	// giving 00000110b (0x06) and 00011111b (0x1f).
+	// [1] http://csrc.nist.gov/publications/drafts/fips-202/fips_202_draft.pdf
+	//     "Draft FIPS 202: SHA-3 Standard: Permutation-Based Hash and
+	//      Extendable-Output Functions (May 2014)"
 	dsbyte  byte
 	storage [maxRate]byte
 
 	// Specific to SHA-3 and SHAKE.
 	fixedOutput bool            // whether this is a fixed-ouput-length instance
 	outputLen   int             // the default output size in bytes
-	state       spongeDirection // current direction of the sponge
+	state       spongeDirection // whether the sponge is absorbing or squeezing
 }
 
 // BlockSize returns the rate of sponge underlying this hash function.
@@ -79,35 +75,6 @@
 	return &ret
 }
 
-// xorIn xors a buffer into the state, byte-swapping to
-// little-endian as necessary; it returns the number of bytes
-// copied, including any zeros appended to the bytestring.
-func (d *state) xorIn(buf []byte) {
-	n := len(buf) / 8
-
-	for i := 0; i < n; i++ {
-		a := binary.LittleEndian.Uint64(buf)
-		d.a[i] ^= a
-		buf = buf[8:]
-	}
-	if len(buf) != 0 {
-		// XOR in the last partial ulint64.
-		a := uint64(0)
-		for i, v := range buf {
-			a |= uint64(v) << uint64(8*i)
-		}
-		d.a[n] ^= a
-	}
-}
-
-// copyOut copies ulint64s to a byte buffer.
-func (d *state) copyOut(b []byte) {
-	for i := 0; len(b) >= 8; i++ {
-		binary.LittleEndian.PutUint64(b, d.a[i])
-		b = b[8:]
-	}
-}
-
 // permute applies the KeccakF-1600 permutation. It handles
 // any input-output buffering.
 func (d *state) permute() {
@@ -115,7 +82,7 @@
 	case spongeAbsorbing:
 		// If we're absorbing, we need to xor the input into the state
 		// before applying the permutation.
-		d.xorIn(d.buf)
+		xorIn(d, d.buf)
 		d.buf = d.storage[:0]
 		keccakF1600(&d.a)
 	case spongeSqueezing:
@@ -123,7 +90,7 @@
 		// copying more output.
 		keccakF1600(&d.a)
 		d.buf = d.storage[:d.rate]
-		d.copyOut(d.buf)
+		copyOut(d, d.buf)
 	}
 }
 
@@ -151,7 +118,7 @@
 	d.permute()
 	d.state = spongeSqueezing
 	d.buf = d.storage[:d.rate]
-	d.copyOut(d.buf)
+	copyOut(d, d.buf)
 }
 
 // Write absorbs more data into the hash's state. It produces an error
@@ -168,7 +135,7 @@
 	for len(p) > 0 {
 		if len(d.buf) == 0 && len(p) >= d.rate {
 			// The fast path; absorb a full "rate" bytes of input and apply the permutation.
-			d.xorIn(p[:d.rate])
+			xorIn(d, p[:d.rate])
 			p = p[d.rate:]
 			keccakF1600(&d.a)
 		} else {
commit	4ed45ec682102c643324fae5dff8dab085b6c300	[log] [tgz]
author	David Leon Gil <coruus@gmail.com>	Fri Dec 26 19:04:10 2014 -0800
committer	Adam Langley <agl@golang.org>	Mon Jan 12 22:01:33 2015 +0000
tree	450955bdb881d2f98175acd54c44e26bcad29293
parent	160b2e156e1c2c1e7d8db6b5e3f6ccf75e9dd34f [diff] [blame]