argon2: add new package implementing the Argon2 PBKDF family

This CL adds the package argon2. The argon2 package implements
the Argon2 PBKDF family (Argon2i, Argon2d, Argon2id).

Argon2 is memory-hard key derivation function and is specified
at https://github.com/P-H-C/phc-winner-argon2/blob/master/argon2-specs.pdf
It can be used to derive cryptographic keys with high entropy from low
entropy passwords.

Fixes golang/go#19896

Change-Id: I5b099682a8e3d7569ad18400cebddefc99a7e22f
Reviewed-on: https://go-review.googlesource.com/82575
Run-TryBot: Adam Langley <agl@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Adam Langley <agl@golang.org>
diff --git a/argon2/argon2.go b/argon2/argon2.go
new file mode 100644
index 0000000..61216e8
--- /dev/null
+++ b/argon2/argon2.go
@@ -0,0 +1,219 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package argon2 implements the key derivation function Argon2.
+// Argon2 was selected as the winner of the Password Hashing Competition and can
+// be used to derive cryptographic keys from passwords.
+// Argon2 is specfifed at https://github.com/P-H-C/phc-winner-argon2/blob/master/argon2-specs.pdf
+package argon2
+
+import (
+	"encoding/binary"
+	"sync"
+
+	"golang.org/x/crypto/blake2b"
+)
+
+// The Argon2 version implemented by this package.
+const Version = 0x13
+
+const (
+	argon2d = iota
+	argon2i
+	argon2id
+)
+
+// Key derives a key from the password, salt, and cost parameters using Argon2i
+// returning a byte slice of length keyLen that can be used as cryptographic key.
+// The CPU cost and parallism degree must be greater than zero.
+//
+// For example, you can get a derived key for e.g. AES-256 (which needs a 32-byte key) by doing:
+// `key := argon2.Key([]byte("some password"), salt, 4, 32*1024, 4, 32)`
+//
+// The recommended parameters for interactive logins as of 2017 are time=4, memory=32*1024.
+// The number of threads can be adjusted to the numbers of available CPUs.
+// The time parameter specifies the number of passes over the memory and the memory
+// parameter specifies the size of the memory in KiB. For example memory=32*1024 sets the
+// memory cost to ~32 MB.
+// The cost parameters should be increased as memory latency and CPU parallelism increases.
+// Remember to get a good random salt.
+func Key(password, salt []byte, time, memory uint32, threads uint8, keyLen uint32) []byte {
+	return deriveKey(argon2i, password, salt, nil, nil, time, memory, threads, keyLen)
+}
+
+func deriveKey(mode int, password, salt, secret, data []byte, time, memory uint32, threads uint8, keyLen uint32) []byte {
+	if time < 1 {
+		panic("argon2: number of rounds too small")
+	}
+	if threads < 1 {
+		panic("argon2: paralisim degree too low")
+	}
+	mem := memory / (4 * uint32(threads)) * (4 * uint32(threads))
+	if mem < 8*uint32(threads) {
+		mem = 8 * uint32(threads)
+	}
+	B := initBlocks(password, salt, secret, data, time, mem, uint32(threads), keyLen, mode)
+	processBlocks(B, time, mem, uint32(threads), mode)
+	return extractKey(B, mem, uint32(threads), keyLen)
+}
+
+const blockLength = 128
+
+type block [blockLength]uint64
+
+func initBlocks(password, salt, key, data []byte, time, memory, threads, keyLen uint32, mode int) []block {
+	var (
+		block0 [1024]byte
+		h0     [blake2b.Size + 8]byte
+		params [24]byte
+		tmp    [4]byte
+	)
+
+	b2, _ := blake2b.New512(nil)
+	binary.LittleEndian.PutUint32(params[0:4], threads)
+	binary.LittleEndian.PutUint32(params[4:8], keyLen)
+	binary.LittleEndian.PutUint32(params[8:12], memory)
+	binary.LittleEndian.PutUint32(params[12:16], time)
+	binary.LittleEndian.PutUint32(params[16:20], uint32(Version))
+	binary.LittleEndian.PutUint32(params[20:24], uint32(mode))
+	b2.Write(params[:])
+	binary.LittleEndian.PutUint32(tmp[:], uint32(len(password)))
+	b2.Write(tmp[:])
+	b2.Write(password)
+	binary.LittleEndian.PutUint32(tmp[:], uint32(len(salt)))
+	b2.Write(tmp[:])
+	b2.Write(salt)
+	binary.LittleEndian.PutUint32(tmp[:], uint32(len(key)))
+	b2.Write(tmp[:])
+	b2.Write(key)
+	binary.LittleEndian.PutUint32(tmp[:], uint32(len(data)))
+	b2.Write(tmp[:])
+	b2.Write(data)
+	b2.Sum(h0[:0])
+
+	B := make([]block, memory)
+	for lane := uint32(0); lane < threads; lane++ {
+		j := lane * (memory / threads)
+		binary.LittleEndian.PutUint32(h0[blake2b.Size+4:], lane)
+
+		binary.LittleEndian.PutUint32(h0[blake2b.Size:], 0)
+		blake2bHash(block0[:], h0[:])
+		for i := range B[0] {
+			B[j+0][i] = binary.LittleEndian.Uint64(block0[i*8:])
+		}
+
+		binary.LittleEndian.PutUint32(h0[blake2b.Size:], 1)
+		blake2bHash(block0[:], h0[:])
+		for i := range B[0] {
+			B[j+1][i] = binary.LittleEndian.Uint64(block0[i*8:])
+		}
+	}
+	return B
+}
+
+func processBlocks(B []block, time, memory, threads uint32, mode int) {
+	const syncPoints = 4
+	lanes := memory / threads
+	segments := lanes / syncPoints
+
+	processSegment := func(n, slice, lane uint32, wg *sync.WaitGroup) {
+		var addresses, in, zero block
+		if mode == argon2i || (mode == argon2id && n == 0 && slice < syncPoints/2) {
+			in[0] = uint64(n)
+			in[1] = uint64(lane)
+			in[2] = uint64(slice)
+			in[3] = uint64(memory)
+			in[4] = uint64(time)
+			in[5] = uint64(mode)
+		}
+
+		index := uint32(0)
+		if n == 0 && slice == 0 {
+			index = 2 // we have already generated the first two blocks
+			if mode == argon2i || (mode == argon2id && n == 0 && slice < syncPoints/2) {
+				in[6]++
+				processBlock(&addresses, &in, &zero)
+				processBlock(&addresses, &addresses, &zero)
+			}
+		}
+
+		offset := lane*lanes + slice*segments + index
+		var random uint64
+		for index < segments {
+			prev := offset - 1
+			if index == 0 && slice == 0 {
+				prev = lane*lanes + lanes - 1 // last block in lane
+			}
+			if mode == argon2i || (mode == argon2id && n == 0 && slice < syncPoints/2) {
+				if index%blockLength == 0 {
+					in[6]++
+					processBlock(&addresses, &in, &zero)
+					processBlock(&addresses, &addresses, &zero)
+				}
+				random = addresses[index%blockLength]
+			} else {
+				random = B[prev][0]
+			}
+			newOffset := indexAlpha(random, lanes, segments, threads, n, slice, lane, index)
+			processBlockXOR(&B[offset], &B[prev], &B[newOffset])
+			index, offset = index+1, offset+1
+		}
+		wg.Done()
+	}
+
+	for n := uint32(0); n < time; n++ {
+		for slice := uint32(0); slice < syncPoints; slice++ {
+			var wg sync.WaitGroup
+			for lane := uint32(0); lane < threads; lane++ {
+				wg.Add(1)
+				go processSegment(n, slice, lane, &wg)
+			}
+			wg.Wait()
+		}
+	}
+
+}
+
+func extractKey(B []block, memory, threads, keyLen uint32) []byte {
+	lanes := memory / threads
+	for lane := uint32(0); lane < threads-1; lane++ {
+		for i, v := range B[(lane*lanes)+lanes-1] {
+			B[memory-1][i] ^= v
+		}
+	}
+
+	var block [1024]byte
+	for i, v := range B[memory-1] {
+		binary.LittleEndian.PutUint64(block[i*8:], v)
+	}
+	key := make([]byte, keyLen)
+	blake2bHash(key, block[:])
+	return key
+}
+
+func indexAlpha(rand uint64, lanes, segments, threads, n, slice, lane, index uint32) uint32 {
+	refLane := uint32(rand>>32) % threads
+
+	m, s := 3*segments, (slice+1)%4*segments
+	if lane == refLane {
+		m += index
+	}
+	if n == 0 {
+		m, s = slice*segments, 0
+		if slice == 0 || lane == refLane {
+			m += index
+		}
+	}
+	if index == 0 || lane == refLane {
+		m--
+	}
+	return phi(rand, uint64(m), uint64(s), refLane, lanes)
+}
+
+func phi(rand, m, s uint64, lane, lanes uint32) uint32 {
+	p := rand & 0xFFFFFFFF
+	p = (p * p) >> 32
+	p = (p * m) >> 32
+	return lane*lanes + uint32((s+m-(p+1))%uint64(lanes))
+}
diff --git a/argon2/argon2_test.go b/argon2/argon2_test.go
new file mode 100644
index 0000000..3f72c75
--- /dev/null
+++ b/argon2/argon2_test.go
@@ -0,0 +1,113 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+package argon2
+
+import (
+	"bytes"
+	"encoding/hex"
+	"testing"
+)
+
+var (
+	genKatPassword = []byte{
+		0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+		0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+		0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+		0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+	}
+	genKatSalt   = []byte{0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02}
+	genKatSecret = []byte{0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03}
+	genKatAAD    = []byte{0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04}
+)
+
+func TestArgon2(t *testing.T) {
+	defer func(sse4 bool) { useSSE4 = sse4 }(useSSE4)
+
+	if useSSE4 {
+		t.Log("SSE4.1 version")
+		testArgon2i(t)
+		testArgon2d(t)
+		testArgon2id(t)
+		useSSE4 = false
+	}
+	t.Log("generic version")
+	testArgon2i(t)
+	testArgon2d(t)
+	testArgon2id(t)
+}
+
+func testArgon2d(t *testing.T) {
+	want := []byte{
+		0x51, 0x2b, 0x39, 0x1b, 0x6f, 0x11, 0x62, 0x97,
+		0x53, 0x71, 0xd3, 0x09, 0x19, 0x73, 0x42, 0x94,
+		0xf8, 0x68, 0xe3, 0xbe, 0x39, 0x84, 0xf3, 0xc1,
+		0xa1, 0x3a, 0x4d, 0xb9, 0xfa, 0xbe, 0x4a, 0xcb,
+	}
+	hash := deriveKey(argon2d, genKatPassword, genKatSalt, genKatSecret, genKatAAD, 3, 32, 4, 32)
+	if !bytes.Equal(hash, want) {
+		t.Errorf("derived key does not match - got: %s , want: %s", hex.EncodeToString(hash), hex.EncodeToString(want))
+	}
+}
+
+func testArgon2i(t *testing.T) {
+	want := []byte{
+		0xc8, 0x14, 0xd9, 0xd1, 0xdc, 0x7f, 0x37, 0xaa,
+		0x13, 0xf0, 0xd7, 0x7f, 0x24, 0x94, 0xbd, 0xa1,
+		0xc8, 0xde, 0x6b, 0x01, 0x6d, 0xd3, 0x88, 0xd2,
+		0x99, 0x52, 0xa4, 0xc4, 0x67, 0x2b, 0x6c, 0xe8,
+	}
+	hash := deriveKey(argon2i, genKatPassword, genKatSalt, genKatSecret, genKatAAD, 3, 32, 4, 32)
+	if !bytes.Equal(hash, want) {
+		t.Errorf("derived key does not match - got: %s , want: %s", hex.EncodeToString(hash), hex.EncodeToString(want))
+	}
+}
+
+func testArgon2id(t *testing.T) {
+	want := []byte{
+		0x0d, 0x64, 0x0d, 0xf5, 0x8d, 0x78, 0x76, 0x6c,
+		0x08, 0xc0, 0x37, 0xa3, 0x4a, 0x8b, 0x53, 0xc9,
+		0xd0, 0x1e, 0xf0, 0x45, 0x2d, 0x75, 0xb6, 0x5e,
+		0xb5, 0x25, 0x20, 0xe9, 0x6b, 0x01, 0xe6, 0x59,
+	}
+	hash := deriveKey(argon2id, genKatPassword, genKatSalt, genKatSecret, genKatAAD, 3, 32, 4, 32)
+	if !bytes.Equal(hash, want) {
+		t.Errorf("derived key does not match - got: %s , want: %s", hex.EncodeToString(hash), hex.EncodeToString(want))
+	}
+}
+
+func benchmarkArgon2(mode int, time, memory uint32, threads uint8, keyLen uint32, b *testing.B) {
+	password := []byte("password")
+	salt := []byte("choosing random salts is hard")
+	b.ReportAllocs()
+	for i := 0; i < b.N; i++ {
+		deriveKey(mode, password, salt, nil, nil, time, memory, threads, keyLen)
+	}
+}
+
+func BenchmarkArgon2i(b *testing.B) {
+	b.Run(" Time: 3 Memory: 32 MB, Threads: 1", func(b *testing.B) { benchmarkArgon2(argon2i, 3, 32*1024, 1, 32, b) })
+	b.Run(" Time: 4 Memory: 32 MB, Threads: 1", func(b *testing.B) { benchmarkArgon2(argon2i, 4, 32*1024, 1, 32, b) })
+	b.Run(" Time: 5 Memory: 32 MB, Threads: 1", func(b *testing.B) { benchmarkArgon2(argon2i, 5, 32*1024, 1, 32, b) })
+	b.Run(" Time: 3 Memory: 64 MB, Threads: 4", func(b *testing.B) { benchmarkArgon2(argon2i, 3, 64*1024, 4, 32, b) })
+	b.Run(" Time: 4 Memory: 64 MB, Threads: 4", func(b *testing.B) { benchmarkArgon2(argon2i, 4, 64*1024, 4, 32, b) })
+	b.Run(" Time: 5 Memory: 64 MB, Threads: 4", func(b *testing.B) { benchmarkArgon2(argon2i, 5, 64*1024, 4, 32, b) })
+}
+
+func BenchmarkArgon2d(b *testing.B) {
+	b.Run(" Time: 3, Memory: 32 MB, Threads: 1", func(b *testing.B) { benchmarkArgon2(argon2d, 3, 32*1024, 1, 32, b) })
+	b.Run(" Time: 4, Memory: 32 MB, Threads: 1", func(b *testing.B) { benchmarkArgon2(argon2d, 4, 32*1024, 1, 32, b) })
+	b.Run(" Time: 5, Memory: 32 MB, Threads: 1", func(b *testing.B) { benchmarkArgon2(argon2d, 5, 32*1024, 1, 32, b) })
+	b.Run(" Time: 3, Memory: 64 MB, Threads: 4", func(b *testing.B) { benchmarkArgon2(argon2d, 3, 64*1024, 4, 32, b) })
+	b.Run(" Time: 4, Memory: 64 MB, Threads: 4", func(b *testing.B) { benchmarkArgon2(argon2d, 4, 64*1024, 4, 32, b) })
+	b.Run(" Time: 5, Memory: 64 MB, Threads: 4", func(b *testing.B) { benchmarkArgon2(argon2d, 5, 64*1024, 4, 32, b) })
+}
+
+func BenchmarkArgon2id(b *testing.B) {
+	b.Run(" Time: 3, Memory: 32 MB, Threads: 1", func(b *testing.B) { benchmarkArgon2(argon2id, 3, 32*1024, 1, 32, b) })
+	b.Run(" Time: 4, Memory: 32 MB, Threads: 1", func(b *testing.B) { benchmarkArgon2(argon2id, 4, 32*1024, 1, 32, b) })
+	b.Run(" Time: 5, Memory: 32 MB, Threads: 1", func(b *testing.B) { benchmarkArgon2(argon2id, 5, 32*1024, 1, 32, b) })
+	b.Run(" Time: 3, Memory: 64 MB, Threads: 4", func(b *testing.B) { benchmarkArgon2(argon2id, 3, 64*1024, 4, 32, b) })
+	b.Run(" Time: 4, Memory: 64 MB, Threads: 4", func(b *testing.B) { benchmarkArgon2(argon2id, 4, 64*1024, 4, 32, b) })
+	b.Run(" Time: 5, Memory: 64 MB, Threads: 4", func(b *testing.B) { benchmarkArgon2(argon2id, 5, 64*1024, 4, 32, b) })
+}
diff --git a/argon2/blake2b.go b/argon2/blake2b.go
new file mode 100644
index 0000000..10f4694
--- /dev/null
+++ b/argon2/blake2b.go
@@ -0,0 +1,53 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package argon2
+
+import (
+	"encoding/binary"
+	"hash"
+
+	"golang.org/x/crypto/blake2b"
+)
+
+// blake2bHash computes an arbitrary long hash value of in
+// and writes the hash to out.
+func blake2bHash(out []byte, in []byte) {
+	var b2 hash.Hash
+	if n := len(out); n < blake2b.Size {
+		b2, _ = blake2b.New(n, nil)
+	} else {
+		b2, _ = blake2b.New512(nil)
+	}
+
+	var buffer [blake2b.Size]byte
+	binary.LittleEndian.PutUint32(buffer[:4], uint32(len(out)))
+	b2.Write(buffer[:4])
+	b2.Write(in)
+
+	if len(out) <= blake2b.Size {
+		b2.Sum(out[:0])
+		return
+	}
+
+	outLen := len(out)
+	b2.Sum(buffer[:0])
+	b2.Reset()
+	copy(out, buffer[:32])
+	out = out[32:]
+	for len(out) > blake2b.Size {
+		b2.Write(buffer[:])
+		b2.Sum(buffer[:0])
+		copy(out, buffer[:32])
+		out = out[32:]
+		b2.Reset()
+	}
+
+	if outLen%blake2b.Size > 0 { // outLen > 64
+		r := ((outLen + 31) / 32) - 2 // ⌈τ /32⌉-2
+		b2, _ = blake2b.New(outLen-32*r, nil)
+	}
+	b2.Write(buffer[:])
+	b2.Sum(out[:0])
+}
diff --git a/argon2/blamka_amd64.go b/argon2/blamka_amd64.go
new file mode 100644
index 0000000..583ac4b
--- /dev/null
+++ b/argon2/blamka_amd64.go
@@ -0,0 +1,59 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package argon2
+
+func init() {
+	useSSE4 = supportsSSE4()
+}
+
+//go:noescape
+func supportsSSE4() bool
+
+//go:noescape
+func mixBlocksSSE2(out, a, b, c *block)
+
+//go:noescape
+func xorBlocksSSE2(out, a, b, c *block)
+
+//go:noescape
+func blamkaSSE4(b *block)
+
+func processBlockSSE(out, in1, in2 *block, xor bool) {
+	var t block
+	mixBlocksSSE2(&t, in1, in2, &t)
+	if useSSE4 {
+		blamkaSSE4(&t)
+	} else {
+		for i := 0; i < blockLength; i += 16 {
+			blamkaGeneric(
+				&t[i+0], &t[i+1], &t[i+2], &t[i+3],
+				&t[i+4], &t[i+5], &t[i+6], &t[i+7],
+				&t[i+8], &t[i+9], &t[i+10], &t[i+11],
+				&t[i+12], &t[i+13], &t[i+14], &t[i+15],
+			)
+		}
+		for i := 0; i < blockLength/8; i += 2 {
+			blamkaGeneric(
+				&t[i], &t[i+1], &t[16+i], &t[16+i+1],
+				&t[32+i], &t[32+i+1], &t[48+i], &t[48+i+1],
+				&t[64+i], &t[64+i+1], &t[80+i], &t[80+i+1],
+				&t[96+i], &t[96+i+1], &t[112+i], &t[112+i+1],
+			)
+		}
+	}
+	if xor {
+		xorBlocksSSE2(out, in1, in2, &t)
+	} else {
+		mixBlocksSSE2(out, in1, in2, &t)
+	}
+}
+
+func processBlock(out, in1, in2 *block) {
+	processBlockSSE(out, in1, in2, false)
+}
+
+func processBlockXOR(out, in1, in2 *block) {
+	processBlockSSE(out, in1, in2, true)
+}
diff --git a/argon2/blamka_amd64.s b/argon2/blamka_amd64.s
new file mode 100644
index 0000000..8a83f7c
--- /dev/null
+++ b/argon2/blamka_amd64.s
@@ -0,0 +1,252 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build amd64,!gccgo,!appengine
+
+#include "textflag.h"
+
+DATA ·c40<>+0x00(SB)/8, $0x0201000706050403
+DATA ·c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b
+GLOBL ·c40<>(SB), (NOPTR+RODATA), $16
+
+DATA ·c48<>+0x00(SB)/8, $0x0100070605040302
+DATA ·c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a
+GLOBL ·c48<>(SB), (NOPTR+RODATA), $16
+
+#define SHUFFLE(v2, v3, v4, v5, v6, v7, t1, t2) \
+	MOVO       v4, t1; \
+	MOVO       v5, v4; \
+	MOVO       t1, v5; \
+	MOVO       v6, t1; \
+	PUNPCKLQDQ v6, t2; \
+	PUNPCKHQDQ v7, v6; \
+	PUNPCKHQDQ t2, v6; \
+	PUNPCKLQDQ v7, t2; \
+	MOVO       t1, v7; \
+	MOVO       v2, t1; \
+	PUNPCKHQDQ t2, v7; \
+	PUNPCKLQDQ v3, t2; \
+	PUNPCKHQDQ t2, v2; \
+	PUNPCKLQDQ t1, t2; \
+	PUNPCKHQDQ t2, v3
+
+#define SHUFFLE_INV(v2, v3, v4, v5, v6, v7, t1, t2) \
+	MOVO       v4, t1; \
+	MOVO       v5, v4; \
+	MOVO       t1, v5; \
+	MOVO       v2, t1; \
+	PUNPCKLQDQ v2, t2; \
+	PUNPCKHQDQ v3, v2; \
+	PUNPCKHQDQ t2, v2; \
+	PUNPCKLQDQ v3, t2; \
+	MOVO       t1, v3; \
+	MOVO       v6, t1; \
+	PUNPCKHQDQ t2, v3; \
+	PUNPCKLQDQ v7, t2; \
+	PUNPCKHQDQ t2, v6; \
+	PUNPCKLQDQ t1, t2; \
+	PUNPCKHQDQ t2, v7
+
+#define HALF_ROUND(v0, v1, v2, v3, v4, v5, v6, v7, t0, c40, c48) \
+	MOVO    v0, t0;        \
+	PMULULQ v2, t0;        \
+	PADDQ   v2, v0;        \
+	PADDQ   t0, v0;        \
+	PADDQ   t0, v0;        \
+	PXOR    v0, v6;        \
+	PSHUFD  $0xB1, v6, v6; \
+	MOVO    v4, t0;        \
+	PMULULQ v6, t0;        \
+	PADDQ   v6, v4;        \
+	PADDQ   t0, v4;        \
+	PADDQ   t0, v4;        \
+	PXOR    v4, v2;        \
+	PSHUFB  c40, v2;       \
+	MOVO    v0, t0;        \
+	PMULULQ v2, t0;        \
+	PADDQ   v2, v0;        \
+	PADDQ   t0, v0;        \
+	PADDQ   t0, v0;        \
+	PXOR    v0, v6;        \
+	PSHUFB  c48, v6;       \
+	MOVO    v4, t0;        \
+	PMULULQ v6, t0;        \
+	PADDQ   v6, v4;        \
+	PADDQ   t0, v4;        \
+	PADDQ   t0, v4;        \
+	PXOR    v4, v2;        \
+	MOVO    v2, t0;        \
+	PADDQ   v2, t0;        \
+	PSRLQ   $63, v2;       \
+	PXOR    t0, v2;        \
+	MOVO    v1, t0;        \
+	PMULULQ v3, t0;        \
+	PADDQ   v3, v1;        \
+	PADDQ   t0, v1;        \
+	PADDQ   t0, v1;        \
+	PXOR    v1, v7;        \
+	PSHUFD  $0xB1, v7, v7; \
+	MOVO    v5, t0;        \
+	PMULULQ v7, t0;        \
+	PADDQ   v7, v5;        \
+	PADDQ   t0, v5;        \
+	PADDQ   t0, v5;        \
+	PXOR    v5, v3;        \
+	PSHUFB  c40, v3;       \
+	MOVO    v1, t0;        \
+	PMULULQ v3, t0;        \
+	PADDQ   v3, v1;        \
+	PADDQ   t0, v1;        \
+	PADDQ   t0, v1;        \
+	PXOR    v1, v7;        \
+	PSHUFB  c48, v7;       \
+	MOVO    v5, t0;        \
+	PMULULQ v7, t0;        \
+	PADDQ   v7, v5;        \
+	PADDQ   t0, v5;        \
+	PADDQ   t0, v5;        \
+	PXOR    v5, v3;        \
+	MOVO    v3, t0;        \
+	PADDQ   v3, t0;        \
+	PSRLQ   $63, v3;       \
+	PXOR    t0, v3
+
+#define LOAD_MSG_0(block, off) \
+	MOVOU 8*(off+0)(block), X0;  \
+	MOVOU 8*(off+2)(block), X1;  \
+	MOVOU 8*(off+4)(block), X2;  \
+	MOVOU 8*(off+6)(block), X3;  \
+	MOVOU 8*(off+8)(block), X4;  \
+	MOVOU 8*(off+10)(block), X5; \
+	MOVOU 8*(off+12)(block), X6; \
+	MOVOU 8*(off+14)(block), X7
+
+#define STORE_MSG_0(block, off) \
+	MOVOU X0, 8*(off+0)(block);  \
+	MOVOU X1, 8*(off+2)(block);  \
+	MOVOU X2, 8*(off+4)(block);  \
+	MOVOU X3, 8*(off+6)(block);  \
+	MOVOU X4, 8*(off+8)(block);  \
+	MOVOU X5, 8*(off+10)(block); \
+	MOVOU X6, 8*(off+12)(block); \
+	MOVOU X7, 8*(off+14)(block)
+
+#define LOAD_MSG_1(block, off) \
+	MOVOU 8*off+0*8(block), X0;  \
+	MOVOU 8*off+16*8(block), X1; \
+	MOVOU 8*off+32*8(block), X2; \
+	MOVOU 8*off+48*8(block), X3; \
+	MOVOU 8*off+64*8(block), X4; \
+	MOVOU 8*off+80*8(block), X5; \
+	MOVOU 8*off+96*8(block), X6; \
+	MOVOU 8*off+112*8(block), X7
+
+#define STORE_MSG_1(block, off) \
+	MOVOU X0, 8*off+0*8(block);  \
+	MOVOU X1, 8*off+16*8(block); \
+	MOVOU X2, 8*off+32*8(block); \
+	MOVOU X3, 8*off+48*8(block); \
+	MOVOU X4, 8*off+64*8(block); \
+	MOVOU X5, 8*off+80*8(block); \
+	MOVOU X6, 8*off+96*8(block); \
+	MOVOU X7, 8*off+112*8(block)
+
+#define BLAMKA_ROUND_0(block, off, t0, t1, c40, c48) \
+	LOAD_MSG_0(block, off);                                   \
+	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48); \
+	SHUFFLE(X2, X3, X4, X5, X6, X7, t0, t1);                  \
+	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48); \
+	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, t0, t1);              \
+	STORE_MSG_0(block, off)
+
+#define BLAMKA_ROUND_1(block, off, t0, t1, c40, c48) \
+	LOAD_MSG_1(block, off);                                   \
+	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48); \
+	SHUFFLE(X2, X3, X4, X5, X6, X7, t0, t1);                  \
+	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48); \
+	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, t0, t1);              \
+	STORE_MSG_1(block, off)
+
+// func blamkaSSE4(b *block)
+TEXT ·blamkaSSE4(SB), 4, $0-8
+	MOVQ b+0(FP), AX
+
+	MOVOU ·c40<>(SB), X10
+	MOVOU ·c48<>(SB), X11
+
+	BLAMKA_ROUND_0(AX, 0, X8, X9, X10, X11)
+	BLAMKA_ROUND_0(AX, 16, X8, X9, X10, X11)
+	BLAMKA_ROUND_0(AX, 32, X8, X9, X10, X11)
+	BLAMKA_ROUND_0(AX, 48, X8, X9, X10, X11)
+	BLAMKA_ROUND_0(AX, 64, X8, X9, X10, X11)
+	BLAMKA_ROUND_0(AX, 80, X8, X9, X10, X11)
+	BLAMKA_ROUND_0(AX, 96, X8, X9, X10, X11)
+	BLAMKA_ROUND_0(AX, 112, X8, X9, X10, X11)
+
+	BLAMKA_ROUND_1(AX, 0, X8, X9, X10, X11)
+	BLAMKA_ROUND_1(AX, 2, X8, X9, X10, X11)
+	BLAMKA_ROUND_1(AX, 4, X8, X9, X10, X11)
+	BLAMKA_ROUND_1(AX, 6, X8, X9, X10, X11)
+	BLAMKA_ROUND_1(AX, 8, X8, X9, X10, X11)
+	BLAMKA_ROUND_1(AX, 10, X8, X9, X10, X11)
+	BLAMKA_ROUND_1(AX, 12, X8, X9, X10, X11)
+	BLAMKA_ROUND_1(AX, 14, X8, X9, X10, X11)
+	RET
+
+// func mixBlocksSSE2(out, a, b, c *block)
+TEXT ·mixBlocksSSE2(SB), 4, $0-32
+	MOVQ out+0(FP), DX
+	MOVQ a+8(FP), AX
+	MOVQ b+16(FP), BX
+	MOVQ a+24(FP), CX
+	MOVQ $128, BP
+
+loop:
+	MOVOU 0(AX), X0
+	MOVOU 0(BX), X1
+	MOVOU 0(CX), X2
+	PXOR  X1, X0
+	PXOR  X2, X0
+	MOVOU X0, 0(DX)
+	ADDQ  $16, AX
+	ADDQ  $16, BX
+	ADDQ  $16, CX
+	ADDQ  $16, DX
+	SUBQ  $2, BP
+	JA    loop
+	RET
+
+// func xorBlocksSSE2(out, a, b, c *block)
+TEXT ·xorBlocksSSE2(SB), 4, $0-32
+	MOVQ out+0(FP), DX
+	MOVQ a+8(FP), AX
+	MOVQ b+16(FP), BX
+	MOVQ a+24(FP), CX
+	MOVQ $128, BP
+
+loop:
+	MOVOU 0(AX), X0
+	MOVOU 0(BX), X1
+	MOVOU 0(CX), X2
+	MOVOU 0(DX), X3
+	PXOR  X1, X0
+	PXOR  X2, X0
+	PXOR  X3, X0
+	MOVOU X0, 0(DX)
+	ADDQ  $16, AX
+	ADDQ  $16, BX
+	ADDQ  $16, CX
+	ADDQ  $16, DX
+	SUBQ  $2, BP
+	JA    loop
+	RET
+
+// func supportsSSE4() bool
+TEXT ·supportsSSE4(SB), 4, $0-1
+	MOVL $1, AX
+	CPUID
+	SHRL $19, CX       // Bit 19 indicates SSE4 support
+	ANDL $1, CX        // CX != 0 if support SSE4
+	MOVB CX, ret+0(FP)
+	RET
diff --git a/argon2/blamka_generic.go b/argon2/blamka_generic.go
new file mode 100644
index 0000000..a481b22
--- /dev/null
+++ b/argon2/blamka_generic.go
@@ -0,0 +1,163 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package argon2
+
+var useSSE4 bool
+
+func processBlockGeneric(out, in1, in2 *block, xor bool) {
+	var t block
+	for i := range t {
+		t[i] = in1[i] ^ in2[i]
+	}
+	for i := 0; i < blockLength; i += 16 {
+		blamkaGeneric(
+			&t[i+0], &t[i+1], &t[i+2], &t[i+3],
+			&t[i+4], &t[i+5], &t[i+6], &t[i+7],
+			&t[i+8], &t[i+9], &t[i+10], &t[i+11],
+			&t[i+12], &t[i+13], &t[i+14], &t[i+15],
+		)
+	}
+	for i := 0; i < blockLength/8; i += 2 {
+		blamkaGeneric(
+			&t[i], &t[i+1], &t[16+i], &t[16+i+1],
+			&t[32+i], &t[32+i+1], &t[48+i], &t[48+i+1],
+			&t[64+i], &t[64+i+1], &t[80+i], &t[80+i+1],
+			&t[96+i], &t[96+i+1], &t[112+i], &t[112+i+1],
+		)
+	}
+	if xor {
+		for i := range t {
+			out[i] ^= in1[i] ^ in2[i] ^ t[i]
+		}
+	} else {
+		for i := range t {
+			out[i] = in1[i] ^ in2[i] ^ t[i]
+		}
+	}
+}
+
+func blamkaGeneric(t00, t01, t02, t03, t04, t05, t06, t07, t08, t09, t10, t11, t12, t13, t14, t15 *uint64) {
+	v00, v01, v02, v03 := *t00, *t01, *t02, *t03
+	v04, v05, v06, v07 := *t04, *t05, *t06, *t07
+	v08, v09, v10, v11 := *t08, *t09, *t10, *t11
+	v12, v13, v14, v15 := *t12, *t13, *t14, *t15
+
+	v00 += v04 + 2*uint64(uint32(v00))*uint64(uint32(v04))
+	v12 ^= v00
+	v12 = v12>>32 | v12<<32
+	v08 += v12 + 2*uint64(uint32(v08))*uint64(uint32(v12))
+	v04 ^= v08
+	v04 = v04>>24 | v04<<40
+
+	v00 += v04 + 2*uint64(uint32(v00))*uint64(uint32(v04))
+	v12 ^= v00
+	v12 = v12>>16 | v12<<48
+	v08 += v12 + 2*uint64(uint32(v08))*uint64(uint32(v12))
+	v04 ^= v08
+	v04 = v04>>63 | v04<<1
+
+	v01 += v05 + 2*uint64(uint32(v01))*uint64(uint32(v05))
+	v13 ^= v01
+	v13 = v13>>32 | v13<<32
+	v09 += v13 + 2*uint64(uint32(v09))*uint64(uint32(v13))
+	v05 ^= v09
+	v05 = v05>>24 | v05<<40
+
+	v01 += v05 + 2*uint64(uint32(v01))*uint64(uint32(v05))
+	v13 ^= v01
+	v13 = v13>>16 | v13<<48
+	v09 += v13 + 2*uint64(uint32(v09))*uint64(uint32(v13))
+	v05 ^= v09
+	v05 = v05>>63 | v05<<1
+
+	v02 += v06 + 2*uint64(uint32(v02))*uint64(uint32(v06))
+	v14 ^= v02
+	v14 = v14>>32 | v14<<32
+	v10 += v14 + 2*uint64(uint32(v10))*uint64(uint32(v14))
+	v06 ^= v10
+	v06 = v06>>24 | v06<<40
+
+	v02 += v06 + 2*uint64(uint32(v02))*uint64(uint32(v06))
+	v14 ^= v02
+	v14 = v14>>16 | v14<<48
+	v10 += v14 + 2*uint64(uint32(v10))*uint64(uint32(v14))
+	v06 ^= v10
+	v06 = v06>>63 | v06<<1
+
+	v03 += v07 + 2*uint64(uint32(v03))*uint64(uint32(v07))
+	v15 ^= v03
+	v15 = v15>>32 | v15<<32
+	v11 += v15 + 2*uint64(uint32(v11))*uint64(uint32(v15))
+	v07 ^= v11
+	v07 = v07>>24 | v07<<40
+
+	v03 += v07 + 2*uint64(uint32(v03))*uint64(uint32(v07))
+	v15 ^= v03
+	v15 = v15>>16 | v15<<48
+	v11 += v15 + 2*uint64(uint32(v11))*uint64(uint32(v15))
+	v07 ^= v11
+	v07 = v07>>63 | v07<<1
+
+	v00 += v05 + 2*uint64(uint32(v00))*uint64(uint32(v05))
+	v15 ^= v00
+	v15 = v15>>32 | v15<<32
+	v10 += v15 + 2*uint64(uint32(v10))*uint64(uint32(v15))
+	v05 ^= v10
+	v05 = v05>>24 | v05<<40
+
+	v00 += v05 + 2*uint64(uint32(v00))*uint64(uint32(v05))
+	v15 ^= v00
+	v15 = v15>>16 | v15<<48
+	v10 += v15 + 2*uint64(uint32(v10))*uint64(uint32(v15))
+	v05 ^= v10
+	v05 = v05>>63 | v05<<1
+
+	v01 += v06 + 2*uint64(uint32(v01))*uint64(uint32(v06))
+	v12 ^= v01
+	v12 = v12>>32 | v12<<32
+	v11 += v12 + 2*uint64(uint32(v11))*uint64(uint32(v12))
+	v06 ^= v11
+	v06 = v06>>24 | v06<<40
+
+	v01 += v06 + 2*uint64(uint32(v01))*uint64(uint32(v06))
+	v12 ^= v01
+	v12 = v12>>16 | v12<<48
+	v11 += v12 + 2*uint64(uint32(v11))*uint64(uint32(v12))
+	v06 ^= v11
+	v06 = v06>>63 | v06<<1
+
+	v02 += v07 + 2*uint64(uint32(v02))*uint64(uint32(v07))
+	v13 ^= v02
+	v13 = v13>>32 | v13<<32
+	v08 += v13 + 2*uint64(uint32(v08))*uint64(uint32(v13))
+	v07 ^= v08
+	v07 = v07>>24 | v07<<40
+
+	v02 += v07 + 2*uint64(uint32(v02))*uint64(uint32(v07))
+	v13 ^= v02
+	v13 = v13>>16 | v13<<48
+	v08 += v13 + 2*uint64(uint32(v08))*uint64(uint32(v13))
+	v07 ^= v08
+	v07 = v07>>63 | v07<<1
+
+	v03 += v04 + 2*uint64(uint32(v03))*uint64(uint32(v04))
+	v14 ^= v03
+	v14 = v14>>32 | v14<<32
+	v09 += v14 + 2*uint64(uint32(v09))*uint64(uint32(v14))
+	v04 ^= v09
+	v04 = v04>>24 | v04<<40
+
+	v03 += v04 + 2*uint64(uint32(v03))*uint64(uint32(v04))
+	v14 ^= v03
+	v14 = v14>>16 | v14<<48
+	v09 += v14 + 2*uint64(uint32(v09))*uint64(uint32(v14))
+	v04 ^= v09
+	v04 = v04>>63 | v04<<1
+
+	*t00, *t01, *t02, *t03 = v00, v01, v02, v03
+	*t04, *t05, *t06, *t07 = v04, v05, v06, v07
+	*t08, *t09, *t10, *t11 = v08, v09, v10, v11
+	*t12, *t13, *t14, *t15 = v12, v13, v14, v15
+}
diff --git a/argon2/blamka_ref.go b/argon2/blamka_ref.go
new file mode 100644
index 0000000..baf7b55
--- /dev/null
+++ b/argon2/blamka_ref.go
@@ -0,0 +1,15 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !amd64 appengine gccgo
+
+package argon2
+
+func processBlock(out, in1, in2 *block) {
+	processBlockGeneric(out, in1, in2, false)
+}
+
+func processBlockXOR(out, in1, in2 *block) {
+	processBlockGeneric(out, in1, in2, true)
+}
diff --git a/blake2b/blake2b.go b/blake2b/blake2b.go
index 7f0a86e..6dedb89 100644
--- a/blake2b/blake2b.go
+++ b/blake2b/blake2b.go
@@ -39,7 +39,10 @@
 	useSSE4 bool
 )
 
-var errKeySize = errors.New("blake2b: invalid key size")
+var (
+	errKeySize  = errors.New("blake2b: invalid key size")
+	errHashSize = errors.New("blake2b: invalid hash size")
+)
 
 var iv = [8]uint64{
 	0x6a09e667f3bcc908, 0xbb67ae8584caa73b, 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1,
@@ -83,7 +86,18 @@
 // key turns the hash into a MAC. The key must between zero and 64 bytes long.
 func New256(key []byte) (hash.Hash, error) { return newDigest(Size256, key) }
 
+// New returns a new hash.Hash computing the BLAKE2b checksum with a custom length.
+// A non-nil key turns the hash into a MAC. The key must between zero and 64 bytes long.
+// The hash size can be a value between 1 and 64 but it is highly recommended to use
+// values equal or greater than:
+// - 32 if BLAKE2b is used as a hash function (The key is zero bytes long).
+// - 16 if BLAKE2b is used as a MAC function (The key is at least 16 bytes long).
+func New(size int, key []byte) (hash.Hash, error) { return newDigest(size, key) }
+
 func newDigest(hashSize int, key []byte) (*digest, error) {
+	if hashSize < 1 || hashSize > Size {
+		return nil, errHashSize
+	}
 	if len(key) > Size {
 		return nil, errKeySize
 	}