go.crypto/sha3: new package
Added a pure Go implementation of SHA3 (Keccak) which implements the hash.Hash interface.
A test file is included with performance benchmarks and standard test vectors.

R=agl, nigeltao
CC=golang-dev
https://golang.org/cl/7760044
diff --git a/sha3/keccakf.go b/sha3/keccakf.go
new file mode 100644
index 0000000..107156c
--- /dev/null
+++ b/sha3/keccakf.go
@@ -0,0 +1,171 @@
+// Copyright 2013 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package sha3
+
+// This file implements the core Keccak permutation function necessary for computing SHA3.
+// This is implemented in a separate file to allow for replacement by an optimized implementation.
+// Nothing in this package is exported.
+// For the detailed specification, refer to the Keccak web site (http://keccak.noekeon.org/).
+
+// rc stores the 24 round constants of Keccak-f[1600], one per round, which the
+// ι step xors into lane a[0]. Values are from the Keccak reference specification.
+var rc = [...]uint64{
+	0x0000000000000001,
+	0x0000000000008082,
+	0x800000000000808A,
+	0x8000000080008000,
+	0x000000000000808B,
+	0x0000000080000001,
+	0x8000000080008081,
+	0x8000000000008009,
+	0x000000000000008A,
+	0x0000000000000088,
+	0x0000000080008009,
+	0x000000008000000A,
+	0x000000008000808B,
+	0x800000000000008B,
+	0x8000000000008089,
+	0x8000000000008003,
+	0x8000000000008002,
+	0x8000000000000080,
+	0x000000000000800A,
+	0x800000008000000A,
+	0x8000000080008081,
+	0x8000000000008080,
+	0x0000000080000001,
+	0x8000000080008008,
+}
+
+// ro_xx represent the per-lane rotation offsets for use in the ρ step
+// (applied in the combined "ρ and π steps" section of keccakF below).
+// Defining them as const instead of in an array allows the compiler to insert constant shifts.
+const (
+	ro_00 = 0
+	ro_01 = 36
+	ro_02 = 3
+	ro_03 = 41
+	ro_04 = 18
+	ro_05 = 1
+	ro_06 = 44
+	ro_07 = 10
+	ro_08 = 45
+	ro_09 = 2
+	ro_10 = 62
+	ro_11 = 6
+	ro_12 = 43
+	ro_13 = 15
+	ro_14 = 61
+	ro_15 = 28
+	ro_16 = 55
+	ro_17 = 25
+	ro_18 = 21
+	ro_19 = 56
+	ro_20 = 27
+	ro_21 = 20
+	ro_22 = 39
+	ro_23 = 8
+	ro_24 = 14
+)
+
+// keccakF computes the complete Keccak-f permutation: 24 rounds, each xoring in a different
+// round constant (rc). The round function is fully unrolled to avoid inner loops, and all
+// rotation offsets are compile-time constants so the compiler can emit constant shifts.
+// Throughout, the pattern x<<n ^ x>>(64-n) is a left rotation of the 64-bit lane x by n bits.
+func (d *digest) keccakF() {
+	for _, roundConstant := range rc {
+		// θ step: d.c[i] is the parity (xor) of the five lanes in column i
+		// (columns are strided by 5 in d.a).
+		d.c[0] = d.a[0] ^ d.a[5] ^ d.a[10] ^ d.a[15] ^ d.a[20]
+		d.c[1] = d.a[1] ^ d.a[6] ^ d.a[11] ^ d.a[16] ^ d.a[21]
+		d.c[2] = d.a[2] ^ d.a[7] ^ d.a[12] ^ d.a[17] ^ d.a[22]
+		d.c[3] = d.a[3] ^ d.a[8] ^ d.a[13] ^ d.a[18] ^ d.a[23]
+		d.c[4] = d.a[4] ^ d.a[9] ^ d.a[14] ^ d.a[19] ^ d.a[24]
+
+		// d.d[i] combines the parity of one neighboring column with the
+		// other neighbor's parity rotated left by 1.
+		d.d[0] = d.c[4] ^ (d.c[1]<<1 ^ d.c[1]>>63)
+		d.d[1] = d.c[0] ^ (d.c[2]<<1 ^ d.c[2]>>63)
+		d.d[2] = d.c[1] ^ (d.c[3]<<1 ^ d.c[3]>>63)
+		d.d[3] = d.c[2] ^ (d.c[4]<<1 ^ d.c[4]>>63)
+		d.d[4] = d.c[3] ^ (d.c[0]<<1 ^ d.c[0]>>63)
+
+		// Each lane is xored with the d value for its column.
+		d.a[0] ^= d.d[0]
+		d.a[1] ^= d.d[1]
+		d.a[2] ^= d.d[2]
+		d.a[3] ^= d.d[3]
+		d.a[4] ^= d.d[4]
+		d.a[5] ^= d.d[0]
+		d.a[6] ^= d.d[1]
+		d.a[7] ^= d.d[2]
+		d.a[8] ^= d.d[3]
+		d.a[9] ^= d.d[4]
+		d.a[10] ^= d.d[0]
+		d.a[11] ^= d.d[1]
+		d.a[12] ^= d.d[2]
+		d.a[13] ^= d.d[3]
+		d.a[14] ^= d.d[4]
+		d.a[15] ^= d.d[0]
+		d.a[16] ^= d.d[1]
+		d.a[17] ^= d.d[2]
+		d.a[18] ^= d.d[3]
+		d.a[19] ^= d.d[4]
+		d.a[20] ^= d.d[0]
+		d.a[21] ^= d.d[1]
+		d.a[22] ^= d.d[2]
+		d.a[23] ^= d.d[3]
+		d.a[24] ^= d.d[4]
+
+		// ρ and π steps: rotate each lane left by its ro_xx offset (ρ) while
+		// permuting the lanes into new positions in the scratch array d.b (π).
+		d.b[0] = d.a[0]
+		d.b[1] = d.a[6]<<ro_06 ^ d.a[6]>>(64-ro_06)
+		d.b[2] = d.a[12]<<ro_12 ^ d.a[12]>>(64-ro_12)
+		d.b[3] = d.a[18]<<ro_18 ^ d.a[18]>>(64-ro_18)
+		d.b[4] = d.a[24]<<ro_24 ^ d.a[24]>>(64-ro_24)
+		d.b[5] = d.a[3]<<ro_15 ^ d.a[3]>>(64-ro_15)
+		d.b[6] = d.a[9]<<ro_21 ^ d.a[9]>>(64-ro_21)
+		d.b[7] = d.a[10]<<ro_02 ^ d.a[10]>>(64-ro_02)
+		d.b[8] = d.a[16]<<ro_08 ^ d.a[16]>>(64-ro_08)
+		d.b[9] = d.a[22]<<ro_14 ^ d.a[22]>>(64-ro_14)
+		d.b[10] = d.a[1]<<ro_05 ^ d.a[1]>>(64-ro_05)
+		d.b[11] = d.a[7]<<ro_11 ^ d.a[7]>>(64-ro_11)
+		d.b[12] = d.a[13]<<ro_17 ^ d.a[13]>>(64-ro_17)
+		d.b[13] = d.a[19]<<ro_23 ^ d.a[19]>>(64-ro_23)
+		d.b[14] = d.a[20]<<ro_04 ^ d.a[20]>>(64-ro_04)
+		d.b[15] = d.a[4]<<ro_20 ^ d.a[4]>>(64-ro_20)
+		d.b[16] = d.a[5]<<ro_01 ^ d.a[5]>>(64-ro_01)
+		d.b[17] = d.a[11]<<ro_07 ^ d.a[11]>>(64-ro_07)
+		d.b[18] = d.a[17]<<ro_13 ^ d.a[17]>>(64-ro_13)
+		d.b[19] = d.a[23]<<ro_19 ^ d.a[23]>>(64-ro_19)
+		d.b[20] = d.a[2]<<ro_10 ^ d.a[2]>>(64-ro_10)
+		d.b[21] = d.a[8]<<ro_16 ^ d.a[8]>>(64-ro_16)
+		d.b[22] = d.a[14]<<ro_22 ^ d.a[14]>>(64-ro_22)
+		d.b[23] = d.a[15]<<ro_03 ^ d.a[15]>>(64-ro_03)
+		d.b[24] = d.a[21]<<ro_09 ^ d.a[21]>>(64-ro_09)
+
+		// χ step: the only nonlinear step; each lane is combined with the next
+		// two lanes of its row as b ^ (^b' & b''), writing back into d.a.
+		d.a[0] = d.b[0] ^ (^d.b[1] & d.b[2])
+		d.a[1] = d.b[1] ^ (^d.b[2] & d.b[3])
+		d.a[2] = d.b[2] ^ (^d.b[3] & d.b[4])
+		d.a[3] = d.b[3] ^ (^d.b[4] & d.b[0])
+		d.a[4] = d.b[4] ^ (^d.b[0] & d.b[1])
+		d.a[5] = d.b[5] ^ (^d.b[6] & d.b[7])
+		d.a[6] = d.b[6] ^ (^d.b[7] & d.b[8])
+		d.a[7] = d.b[7] ^ (^d.b[8] & d.b[9])
+		d.a[8] = d.b[8] ^ (^d.b[9] & d.b[5])
+		d.a[9] = d.b[9] ^ (^d.b[5] & d.b[6])
+		d.a[10] = d.b[10] ^ (^d.b[11] & d.b[12])
+		d.a[11] = d.b[11] ^ (^d.b[12] & d.b[13])
+		d.a[12] = d.b[12] ^ (^d.b[13] & d.b[14])
+		d.a[13] = d.b[13] ^ (^d.b[14] & d.b[10])
+		d.a[14] = d.b[14] ^ (^d.b[10] & d.b[11])
+		d.a[15] = d.b[15] ^ (^d.b[16] & d.b[17])
+		d.a[16] = d.b[16] ^ (^d.b[17] & d.b[18])
+		d.a[17] = d.b[17] ^ (^d.b[18] & d.b[19])
+		d.a[18] = d.b[18] ^ (^d.b[19] & d.b[15])
+		d.a[19] = d.b[19] ^ (^d.b[15] & d.b[16])
+		d.a[20] = d.b[20] ^ (^d.b[21] & d.b[22])
+		d.a[21] = d.b[21] ^ (^d.b[22] & d.b[23])
+		d.a[22] = d.b[22] ^ (^d.b[23] & d.b[24])
+		d.a[23] = d.b[23] ^ (^d.b[24] & d.b[20])
+		d.a[24] = d.b[24] ^ (^d.b[20] & d.b[21])
+
+		// ι step: break symmetry between rounds by xoring in the round constant.
+		d.a[0] ^= roundConstant
+	}
+}
diff --git a/sha3/sha3.go b/sha3/sha3.go
new file mode 100644
index 0000000..22df0ef
--- /dev/null
+++ b/sha3/sha3.go
@@ -0,0 +1,216 @@
+// Copyright 2013 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package sha3 implements the SHA3 hash algorithm (formerly called Keccak) chosen by NIST in 2012.
+// This file provides a SHA3 implementation which implements the standard hash.Hash interface.
+// Writing input data, including padding, and reading output data are computed in this file.
+// Note that the current implementation can compute the hash of an integral number of bytes only.
+// This is a consequence of the hash interface in which a buffer of bytes is passed in.
+// The internals of the Keccak-f function are computed in keccakf.go.
+// For the detailed specification, refer to the Keccak web site (http://keccak.noekeon.org/).
+package sha3
+
+import (
+	"encoding/binary"
+	"hash"
+)
+
+// laneSize is the size in bytes of each "lane" of the internal state of SHA3 (a uint64).
+// Note that changing this size would require using a type other than uint64 to store each lane.
+const laneSize = 8
+
+// sliceSize represents the dimensions of the internal state, a square matrix of
+// sliceSize ** 2 lanes. This is the size of both the "rows" and "columns" dimensions in the
+// terminology of the SHA3 specification.
+const sliceSize = 5
+
+// numLanes represents the total number of lanes in the state.
+const numLanes = sliceSize * sliceSize
+
+// stateSize is the size in bytes of the internal state of SHA3 (laneSize * numLanes = 200).
+const stateSize = laneSize * numLanes
+
+// digest represents the partial evaluation of a checksum.
+// Note that capacity, and not outputSize, is the critical security parameter, as SHA3 can output
+// an arbitrary number of bytes for any given capacity. The Keccak proposal recommends that
+// capacity = 2*outputSize to ensure that finding a collision of size outputSize requires
+// O(2^{outputSize/2}) computations (the birthday lower bound). Future standards may modify the
+// capacity/outputSize ratio to allow for more output with lower cryptographic security.
+type digest struct {
+	a          [numLanes]uint64  // main state of the hash, as 25 uint64 lanes
+	b          [numLanes]uint64  // scratch: ρ/π-step output inside keccakF
+	c          [sliceSize]uint64 // scratch: θ-step column parities inside keccakF
+	d          [sliceSize]uint64 // scratch: θ-step offsets inside keccakF
+	outputSize int               // desired output size in bytes
+	capacity   int               // number of bytes to leave untouched during squeeze/absorb
+	absorbed   int               // number of bytes absorbed thus far
+}
+
+// minInt returns the smaller of its two integer arguments; used to bound how
+// many bytes the absorption routines consume in one step.
+func minInt(v1, v2 int) int {
+	if v2 < v1 {
+		return v2
+	}
+	return v1
+}
+
+// rate returns the number of bytes of the internal state which can be absorbed or squeezed
+// in between calls to the permutation function: the full state size minus the sponge capacity.
+func (d *digest) rate() int {
+	return stateSize - d.capacity
+}
+
+// Reset returns the digest to its initial state by clearing the absorbed-byte
+// counter and zeroing the main state array. A freshly allocated digest does not
+// need this: its zero value is already the correct initial state. The scratch
+// arrays b, c, and d are not cleared; keccakF overwrites them before reading.
+func (d *digest) Reset() {
+	d.absorbed = 0
+	d.a = [numLanes]uint64{}
+}
+
+// BlockSize, required by the hash.Hash interface, does not have a standard interpretation
+// for a sponge-based construction like SHA3. We return the data rate: the number of bytes
+// absorbed per invocation of the permutation function. For Merkle-Damgård based hashes
+// (ie SHA1, SHA2, MD5) the output size of the internal compression function is returned.
+// We consider this roughly equivalent because it represents the number of bytes of output
+// produced per cryptographic operation.
+func (d *digest) BlockSize() int {
+	return d.rate()
+}
+
+// Size returns the number of bytes Sum will append, i.e. the hash output size in bytes.
+func (d *digest) Size() int { return d.outputSize }
+
+// unalignedAbsorb is a helper for Write which absorbs data that is not aligned with an
+// 8-byte lane boundary. The bytes are packed little-endian into a uint64, shifted to the
+// correct byte position within the target lane, and xored into the state.
+// Callers only pass spans that fit within a single lane (fewer than laneSize bytes).
+func (d *digest) unalignedAbsorb(p []byte) {
+	offset := d.absorbed % d.rate()
+	var lane uint64
+	for i, v := range p {
+		lane |= uint64(v) << uint(8*i)
+	}
+	lane <<= 8 * uint(offset%laneSize)
+	d.a[offset/laneSize] ^= lane
+	d.absorbed += len(p)
+}
+
+// Write "absorbs" bytes into the state of the SHA3 hash, updating as needed when the sponge
+// "fills up" with rate() bytes. Since lanes are stored internally as type uint64, this requires
+// converting the incoming bytes into uint64s using a little endian interpretation. This
+// implementation is optimized for large, aligned writes of multiples of 8 bytes (laneSize).
+// Non-aligned or uneven numbers of bytes require shifting and are slower.
+func (d *digest) Write(p []byte) (int, error) {
+	// An initial offset is needed if the we aren't absorbing to the first lane initially.
+	offset := d.absorbed % d.rate()
+	toWrite := len(p)
+
+	// The first lane may need to absorb unaligned and/or incomplete data.
+	if (offset%laneSize != 0 || len(p) < 8) && len(p) > 0 {
+		toAbsorb := minInt(laneSize-(offset%laneSize), len(p))
+		d.unalignedAbsorb(p[:toAbsorb])
+		p = p[toAbsorb:]
+		offset = (d.absorbed) % d.rate()
+
+		// For every rate() bytes absorbed, the state must be permuted via the F Function.
+		if (d.absorbed)%d.rate() == 0 {
+			d.keccakF()
+		}
+	}
+
+	// This loop should absorb the bulk of the data into full, aligned lanes.
+	// It will call the update function as necessary.
+	for len(p) > 7 {
+		firstLane := offset / laneSize
+		lastLane := minInt(d.rate()/laneSize, firstLane+len(p)/laneSize)
+
+		// This inner loop absorbs input bytes into the state in groups of 8, converted to uint64s.
+		for lane := firstLane; lane < lastLane; lane++ {
+			d.a[lane] ^= binary.LittleEndian.Uint64(p[:laneSize])
+			p = p[laneSize:]
+		}
+		d.absorbed += (lastLane - firstLane) * laneSize
+		// For every rate() bytes absorbed, the state must be permuted via the F Function.
+		if (d.absorbed)%d.rate() == 0 {
+			d.keccakF()
+		}
+
+		offset = 0
+	}
+
+	// If there are insufficient bytes to fill the final lane, an unaligned absorption.
+	// This should always start at a correct lane boundary though, or else it would be caught
+	// by the uneven opening lane case above.
+	if len(p) > 0 {
+		d.unalignedAbsorb(p)
+	}
+
+	return toWrite, nil
+}
+
+// pad applies the Keccak pad10*1 padding scheme: a 1 bit, then enough 0 bits, then a
+// closing 1 bit, so that the input plus padding fills a whole number of rate() blocks.
+// Both marker bits are simply xored into the appropriate lanes of the state.
+func (d *digest) pad() {
+	offset := d.absorbed % d.rate()
+	// The opening bit lands in the byte immediately after the last absorbed byte.
+	openLane := offset / laneSize
+	openShift := uint(8 * (offset % laneSize))
+	d.a[openLane] ^= uint64(1) << openShift
+	// The closing bit is always the final bit of the final lane of the block.
+	closeLane := d.rate()/laneSize - 1
+	d.a[closeLane] ^= uint64(1) << 63
+}
+
+// finalize transitions the sponge from absorbing to squeezing: it applies the padding
+// and runs one final permutation so the first block of output is ready to be read.
+func (d *digest) finalize() {
+	d.pad()
+	d.keccakF()
+}
+
+// squeeze appends toSqueeze bytes of output from the hash state to in and returns the result.
+// Squeezing can require multiple calls to the F function (one per rate() bytes squeezed),
+// although this is not the case for standard SHA3 parameters. This implementation only supports
+// squeezing a single time, subsequent squeezes may lose alignment. Future implementations
+// may wish to support multiple squeeze calls, for example to support use as a PRNG.
+func (d *digest) squeeze(in []byte, toSqueeze int) []byte {
+	// Because we read in blocks of laneSize, we need enough room to read
+	// an integral number of lanes; needed is toSqueeze rounded up to a lane multiple.
+	needed := toSqueeze + (laneSize-toSqueeze%laneSize)%laneSize
+	if cap(in)-len(in) < needed {
+		// Not enough spare capacity: reallocate, preserving the existing contents of in.
+		newIn := make([]byte, len(in), len(in)+needed)
+		copy(newIn, in)
+		in = newIn
+	}
+	// out is the (possibly over-long) window we will fill with whole lanes.
+	out := in[len(in) : len(in)+needed]
+
+	for len(out) > 0 {
+		// Copy out whole lanes, little-endian, up to one rate() block at a time.
+		for i := 0; i < d.rate() && len(out) > 0; i += laneSize {
+			binary.LittleEndian.PutUint64(out[:], d.a[i/laneSize])
+			out = out[laneSize:]
+		}
+		// More output than one block requires another permutation.
+		if len(out) > 0 {
+			d.keccakF()
+		}
+	}
+	return in[:len(in)+toSqueeze] // Re-slice in case we wrote extra data.
+}
+
+// Sum appends the hash of all data absorbed so far to in and returns the resulting slice.
+// The receiver is left untouched — padding and squeezing are applied to a copy of the
+// state — so the caller may continue writing and summing afterwards.
+func (d *digest) Sum(in []byte) []byte {
+	state := *d
+	state.finalize()
+	return state.squeeze(in, state.outputSize)
+}
+
+// newKeccak builds a digest for the given output size in bits, applying the
+// capacity = 2*outputSize rule recommended by the Keccak proposal.
+func newKeccak(outputBits int) hash.Hash {
+	return &digest{outputSize: outputBits / 8, capacity: 2 * outputBits / 8}
+}
+
+// The NewKeccakX constructors enable initializing a hash in any of the four recommended
+// sizes from the Keccak specification. Note that the final NIST standard for SHA3 may
+// specify different input/output lengths.
+func NewKeccak224() hash.Hash { return newKeccak(224) }
+func NewKeccak256() hash.Hash { return newKeccak(256) }
+func NewKeccak384() hash.Hash { return newKeccak(384) }
+func NewKeccak512() hash.Hash { return newKeccak(512) }
diff --git a/sha3/sha3_test.go b/sha3/sha3_test.go
new file mode 100644
index 0000000..bad4076
--- /dev/null
+++ b/sha3/sha3_test.go
@@ -0,0 +1,276 @@
+// Copyright 2013 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package sha3
+
+// These tests are a subset of those provided by the Keccak web site (http://keccak.noekeon.org/).
+
+import (
+	"bytes"
+	"encoding/hex"
+	"fmt"
+	"hash"
+	"strings"
+	"testing"
+)
+
+// testDigests maintains a digest state of each standard type. The states are shared
+// across tests and benchmarks, so each must call Reset before writing.
+var testDigests = map[string]*digest{
+	"Keccak224": {outputSize: 224 / 8, capacity: 2 * 224 / 8},
+	"Keccak256": {outputSize: 256 / 8, capacity: 2 * 256 / 8},
+	"Keccak384": {outputSize: 384 / 8, capacity: 2 * 384 / 8},
+	"Keccak512": {outputSize: 512 / 8, capacity: 2 * 512 / 8},
+}
+
+// testVector represents a test input and expected outputs from multiple algorithm variants.
+type testVector struct {
+	desc   string // human-readable name of the test case
+	input  []byte // raw input bytes
+	repeat int    // number of times input is written to the digest before summing
+	want   map[string]string // expected upper-case hex output, keyed by testDigests name
+}
+
+// decodeHex converts a hex-encoded string into raw bytes, panicking on malformed
+// input — acceptable here because every input is a compile-time test constant.
+func decodeHex(s string) []byte {
+	raw, err := hex.DecodeString(s)
+	if err != nil {
+		panic(err)
+	}
+	return raw
+}
+
+// shortKeccakTestVectors stores a series of short testVectors.
+// Inputs of 8, 248, and 264 bits from http://keccak.noekeon.org/ are included below.
+// The standard defines additional test inputs of all sizes between 0 and 2047 bits.
+// Because the current implementation can only handle an integral number of bytes,
+// most of the standard test inputs can't be used.
+var shortKeccakTestVectors = []testVector{
+	{
+		desc:   "short-8b",
+		input:  decodeHex("CC"),
+		repeat: 1,
+		want: map[string]string{
+			"Keccak224": "A9CAB59EB40A10B246290F2D6086E32E3689FAF1D26B470C899F2802",
+			"Keccak256": "EEAD6DBFC7340A56CAEDC044696A168870549A6A7F6F56961E84A54BD9970B8A",
+			"Keccak384": "1B84E62A46E5A201861754AF5DC95C4A1A69CAF4A796AE405680161E29572641F5FA1E8641D7958336EE7B11C58F73E9",
+			"Keccak512": "8630C13CBD066EA74BBE7FE468FEC1DEE10EDC1254FB4C1B7C5FD69B646E44160B8CE01D05A0908CA790DFB080F4B513BC3B6225ECE7A810371441A5AC666EB9",
+		},
+	},
+	{
+		desc:   "short-248b",
+		input:  decodeHex("84FB51B517DF6C5ACCB5D022F8F28DA09B10232D42320FFC32DBECC3835B29"),
+		repeat: 1,
+		want: map[string]string{
+			"Keccak224": "81AF3A7A5BD4C1F948D6AF4B96F93C3B0CF9C0E7A6DA6FCD71EEC7F6",
+			"Keccak256": "D477FB02CAAA95B3280EC8EE882C29D9E8A654B21EF178E0F97571BF9D4D3C1C",
+			"Keccak384": "503DCAA4ADDA5A9420B2E436DD62D9AB2E0254295C2982EF67FCE40F117A2400AB492F7BD5D133C6EC2232268BC27B42",
+			"Keccak512": "9D8098D8D6EDBBAA2BCFC6FB2F89C3EAC67FEC25CDFE75AA7BD570A648E8C8945FF2EC280F6DCF73386109155C5BBC444C707BB42EAB873F5F7476657B1BC1A8",
+		},
+	},
+	{
+		desc:   "short-264b",
+		input:  decodeHex("DE8F1B3FAA4B7040ED4563C3B8E598253178E87E4D0DF75E4FF2F2DEDD5A0BE046"),
+		repeat: 1,
+		want: map[string]string{
+			"Keccak224": "F217812E362EC64D4DC5EACFABC165184BFA456E5C32C2C7900253D0",
+			"Keccak256": "E78C421E6213AFF8DE1F025759A4F2C943DB62BBDE359C8737E19B3776ED2DD2",
+			"Keccak384": "CF38764973F1EC1C34B5433AE75A3AAD1AAEF6AB197850C56C8617BCD6A882F6666883AC17B2DCCDBAA647075D0972B5",
+			"Keccak512": "9A7688E31AAF40C15575FC58C6B39267AAD3722E696E518A9945CF7F7C0FEA84CB3CB2E9F0384A6B5DC671ADE7FB4D2B27011173F3EEEAF17CB451CF26542031",
+		},
+	},
+}
+
+// longKeccakTestVectors stores longer testVectors (currently only one).
+// The input is 64 MiB long (a 64-byte pattern repeated 1024*1024 times), a truncated
+// version of the ExtremelyLongMsgKAT taken from http://keccak.noekeon.org/.
+var longKeccakTestVectors = []testVector{
+	{
+		desc:   "long-1GiB",
+		input:  []byte("abcdefghbcdefghicdefghijdefghijkefghijklfghijklmghijklmnhijklmno"),
+		repeat: 1024 * 1024,
+		want: map[string]string{
+			"Keccak224": "50E35E40980FEEFF1EA490957B0E970257F75EA0D410EE0F0B8A7A58",
+			"Keccak256": "5015A4935F0B51E091C6550A94DCD262C08998232CCAA22E7F0756DEAC0DC0D0",
+			"Keccak384": "7907A8D0FAA7BC6A90FE14C6C958C956A0877E751455D8F13ACDB96F144B5896E716C06EC0CB56557A94EF5C3355F6F3",
+			"Keccak512": "3EC327D6759F769DEB74E80CA70C831BC29CAB048A4BF4190E4A1DD5C6507CF2B4B58937FDE81D36014E7DFE1B1DD8B0F27CB7614F9A645FEC114F1DAAEFC056",
+		},
+	},
+}
+
+// TestKeccakVectors checks that correct output is produced for a set of known testVectors.
+// The long vectors are skipped under -short.
+func TestKeccakVectors(t *testing.T) {
+	testCases := append([]testVector{}, shortKeccakTestVectors...)
+	if !testing.Short() {
+		testCases = append(testCases, longKeccakTestVectors...)
+	}
+	for _, tc := range testCases {
+		for alg, want := range tc.want {
+			testDigests[alg].Reset()
+			// Write the input to the digest, repeated as many times as the vector specifies.
+			for i := 0; i < tc.repeat; i++ {
+				testDigests[alg].Write(tc.input)
+			}
+			// Verify that each algorithm version produced the expected output.
+			got := strings.ToUpper(hex.EncodeToString(testDigests[alg].Sum(nil)))
+			if got != want {
+				t.Errorf("%s, alg=%s\ngot %q, want %q", tc.desc, alg, got, want)
+			}
+		}
+	}
+}
+
+// dumpState is a debugging helper that pretty-prints the parameters and the 5x5
+// lane matrix of the hash's internal state to standard output.
+func (d *digest) dumpState() {
+	fmt.Printf("SHA3 hash, %d B output, %d B capacity (%d B rate)\n", d.outputSize, d.capacity, d.rate())
+	fmt.Printf("Internal state after absorbing %d B:\n", d.absorbed)
+
+	for row := 0; row < sliceSize; row++ {
+		for col := 0; col < sliceSize; col++ {
+			fmt.Printf("%v, ", d.a[row*sliceSize+col])
+		}
+		fmt.Println("")
+	}
+}
+
+// TestUnalignedWrite checks that writing the same data in small, irregularly sized
+// chunks produces the same digest as a single large write.
+func TestUnalignedWrite(t *testing.T) {
+	buf := sequentialBytes(0x10000)
+	for alg, d := range testDigests {
+		// Compute the reference digest with one aligned bulk write.
+		d.Reset()
+		d.Write(buf)
+		want := d.Sum(nil)
+		d.Reset()
+		for i := 0; i < len(buf); {
+			// Cycle through offsets which make a 137 byte sequence.
+			// Because 137 is prime this sequence should exercise all corner cases.
+			offsets := [17]int{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1}
+			for _, j := range offsets {
+				// Clamp the final chunk so we never write past the end of buf.
+				j = minInt(j, len(buf)-i)
+				d.Write(buf[i : i+j])
+				i += j
+			}
+		}
+		got := d.Sum(nil)
+		if !bytes.Equal(got, want) {
+			t.Errorf("Unaligned writes, alg=%s\ngot %q, want %q", alg, got, want)
+		}
+	}
+}
+
+// TestAppend checks that Sum appends its output to the provided slice, both when the
+// slice must be reallocated and when it already has enough spare capacity.
+func TestAppend(t *testing.T) {
+	d := NewKeccak224()
+
+	// Two iterations: capacity 2 forces Sum to reallocate; capacity 66 is large
+	// enough (2 bytes + 32-byte squeeze buffer) for Sum to append in place.
+	// The original bound of `capacity < 64` ran the loop only once, so the
+	// no-reallocation path was never exercised.
+	for capacity := 2; capacity <= 66; capacity += 64 {
+		buf := make([]byte, 2, capacity)
+		d.Reset()
+		d.Write([]byte{0xcc})
+		buf = d.Sum(buf)
+		expected := "0000A9CAB59EB40A10B246290F2D6086E32E3689FAF1D26B470C899F2802"
+		if got := strings.ToUpper(hex.EncodeToString(buf)); got != expected {
+			t.Errorf("got %s, want %s", got, expected)
+		}
+	}
+}
+
+// TestAppendNoRealloc checks that Sum appends in place when the destination slice
+// already has ample spare capacity (cap 200 vs. the 33 bytes needed here).
+func TestAppendNoRealloc(t *testing.T) {
+	buf := make([]byte, 1, 200)
+	d := NewKeccak224()
+	d.Write([]byte{0xcc})
+	buf = d.Sum(buf)
+	// Leading "00" is the untouched original byte of buf.
+	expected := "00A9CAB59EB40A10B246290F2D6086E32E3689FAF1D26B470C899F2802"
+	if got := strings.ToUpper(hex.EncodeToString(buf)); got != expected {
+		t.Errorf("got %s, want %s", got, expected)
+	}
+}
+
+// sequentialBytes produces a test buffer of size consecutive bytes
+// 0x00, 0x01, ..., wrapping around after 0xFF.
+func sequentialBytes(size int) []byte {
+	buf := make([]byte, size)
+	for i := 0; i < size; i++ {
+		buf[i] = byte(i)
+	}
+	return buf
+}
+
+// benchmarkBlockWrite tests the speed of writing data and never calling the permutation function.
+func benchmarkBlockWrite(b *testing.B, d *digest) {
+	b.StopTimer()
+	d.Reset()
+	// Write all but the last byte of a block, to ensure that the permutation is not called.
+	data := sequentialBytes(d.rate() - 1)
+	b.SetBytes(int64(len(data)))
+	b.StartTimer()
+	for i := 0; i < b.N; i++ {
+		d.absorbed = 0 // Reset absorbed to avoid ever calling the permutation function
+		d.Write(data)
+	}
+	b.StopTimer()
+	d.Reset()
+}
+
+// BenchmarkPermutationFunction measures the raw speed of keccakF, with no input
+// handling overhead; throughput is reported per full 200-byte state.
+func BenchmarkPermutationFunction(b *testing.B) {
+	b.StopTimer()
+	d := testDigests["Keccak512"]
+	d.Reset()
+	b.SetBytes(int64(stateSize))
+	b.StartTimer()
+	for i := 0; i < b.N; i++ {
+		d.keccakF()
+	}
+	b.StopTimer()
+	d.Reset()
+}
+
+// BenchmarkSingleByteWrite tests the latency of writing data one byte at a time.
+func BenchmarkSingleByteWrite(b *testing.B) {
+	b.StopTimer()
+	d := testDigests["Keccak512"]
+	d.Reset()
+	data := sequentialBytes(1) // 1-byte buffer
+	// Each outer iteration writes rate()-1 bytes in total, matching SetBytes.
+	b.SetBytes(int64(d.rate()) - 1)
+	b.StartTimer()
+	for i := 0; i < b.N; i++ {
+		d.absorbed = 0 // Reset absorbed to avoid ever calling the permutation function
+
+		// Write all but the last byte of a block, one byte at a time.
+		for j := 0; j < d.rate()-1; j++ {
+			d.Write(data)
+		}
+	}
+	b.StopTimer()
+	d.Reset()
+}
+
+// The BenchmarkBlockWriteX functions measure the block write speed for each size of the digest.
+func BenchmarkBlockWrite512(b *testing.B) { benchmarkBlockWrite(b, testDigests["Keccak512"]) }
+func BenchmarkBlockWrite384(b *testing.B) { benchmarkBlockWrite(b, testDigests["Keccak384"]) }
+func BenchmarkBlockWrite256(b *testing.B) { benchmarkBlockWrite(b, testDigests["Keccak256"]) }
+func BenchmarkBlockWrite224(b *testing.B) { benchmarkBlockWrite(b, testDigests["Keccak224"]) }
+
+// benchmarkBulkHash measures the end-to-end speed (write plus sum) of hashing a
+// 16 KiB buffer through the generic hash.Hash interface.
+func benchmarkBulkHash(b *testing.B, h hash.Hash) {
+	b.StopTimer()
+	h.Reset()
+	size := 1 << 14
+	data := sequentialBytes(size)
+	b.SetBytes(int64(size))
+	b.StartTimer()
+
+	// Reuse one output buffer across iterations; note the local is named sum
+	// rather than digest to avoid shadowing the package's digest type.
+	var sum []byte
+	for i := 0; i < b.N; i++ {
+		h.Write(data)
+		sum = h.Sum(sum[:0])
+	}
+	b.StopTimer()
+	h.Reset()
+}
+
+// The BenchmarkBulkKeccakX functions test the speed of hashing a 16 KiB buffer for each
+// digest size, by calling benchmarkBulkHash.
+func BenchmarkBulkKeccak512(b *testing.B) { benchmarkBulkHash(b, NewKeccak512()) }
+func BenchmarkBulkKeccak384(b *testing.B) { benchmarkBulkHash(b, NewKeccak384()) }
+func BenchmarkBulkKeccak256(b *testing.B) { benchmarkBulkHash(b, NewKeccak256()) }
+func BenchmarkBulkKeccak224(b *testing.B) { benchmarkBulkHash(b, NewKeccak224()) }