sha3: add optimized implementation for s390x

Message-security-assist extension 6 adds support for the SHA-3 and
SHAKE algorithms. This CL allows the sha3 package to use these new
features.

name                 old speed     new speed      delta
PermutationFunction  328MB/s ± 0%   385MB/s ± 0%    +17.28%  (p=0.000 n=9+10)
Sha3_512_MTU         108MB/s ± 0%  2011MB/s ± 0%  +1768.56%  (p=0.000 n=10+10)
Sha3_384_MTU         149MB/s ± 0%  2437MB/s ± 0%  +1534.22%  (p=0.000 n=10+10)
Sha3_256_MTU         185MB/s ± 0%  2739MB/s ± 0%  +1379.93%  (p=0.000 n=10+10)
Sha3_224_MTU         195MB/s ± 0%  2782MB/s ± 0%  +1326.05%  (p=0.000 n=10+10)
Shake128_MTU         225MB/s ± 0%  4436MB/s ± 0%  +1873.18%  (p=0.000 n=9+9)
Shake256_MTU         209MB/s ± 0%  4521MB/s ± 0%  +2059.86%  (p=0.000 n=8+10)
Shake256_16x         188MB/s ± 0%  1366MB/s ± 0%   +624.70%  (p=0.000 n=9+10)
Shake256_1MiB        212MB/s ± 0%  5861MB/s ± 0%  +2666.67%  (p=0.000 n=10+10)
Sha3_512_1MiB        116MB/s ± 0%  4328MB/s ± 0%  +3628.33%  (p=0.000 n=10+10)

Change-Id: I8ebc503ca2b9eda2ebb361dffdbfe79dd97e1975
Reviewed-on: https://go-review.googlesource.com/59391
Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
diff --git a/sha3/hashes.go b/sha3/hashes.go
index 2b51cf4..f35a42f 100644
--- a/sha3/hashes.go
+++ b/sha3/hashes.go
@@ -15,22 +15,42 @@
 // New224 creates a new SHA3-224 hash.
 // Its generic security strength is 224 bits against preimage attacks,
 // and 112 bits against collision attacks.
-func New224() hash.Hash { return &state{rate: 144, outputLen: 28, dsbyte: 0x06} }
+func New224() hash.Hash {
+	if h := new224Asm(); h != nil {
+		return h
+	}
+	return &state{rate: 144, outputLen: 28, dsbyte: 0x06}
+}
 
 // New256 creates a new SHA3-256 hash.
 // Its generic security strength is 256 bits against preimage attacks,
 // and 128 bits against collision attacks.
-func New256() hash.Hash { return &state{rate: 136, outputLen: 32, dsbyte: 0x06} }
+func New256() hash.Hash {
+	if h := new256Asm(); h != nil {
+		return h
+	}
+	return &state{rate: 136, outputLen: 32, dsbyte: 0x06}
+}
 
 // New384 creates a new SHA3-384 hash.
 // Its generic security strength is 384 bits against preimage attacks,
 // and 192 bits against collision attacks.
-func New384() hash.Hash { return &state{rate: 104, outputLen: 48, dsbyte: 0x06} }
+func New384() hash.Hash {
+	if h := new384Asm(); h != nil {
+		return h
+	}
+	return &state{rate: 104, outputLen: 48, dsbyte: 0x06}
+}
 
 // New512 creates a new SHA3-512 hash.
 // Its generic security strength is 512 bits against preimage attacks,
 // and 256 bits against collision attacks.
-func New512() hash.Hash { return &state{rate: 72, outputLen: 64, dsbyte: 0x06} }
+func New512() hash.Hash {
+	if h := new512Asm(); h != nil {
+		return h
+	}
+	return &state{rate: 72, outputLen: 64, dsbyte: 0x06}
+}
 
 // Sum224 returns the SHA3-224 digest of the data.
 func Sum224(data []byte) (digest [28]byte) {
diff --git a/sha3/hashes_generic.go b/sha3/hashes_generic.go
new file mode 100644
index 0000000..c4ff3f6
--- /dev/null
+++ b/sha3/hashes_generic.go
@@ -0,0 +1,27 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//+build gccgo appengine !s390x
+
+package sha3
+
+import (
+	"hash"
+)
+
+// new224Asm returns an assembly implementation of SHA3-224 if available,
+// otherwise it returns nil.
+func new224Asm() hash.Hash { return nil }
+
+// new256Asm returns an assembly implementation of SHA3-256 if available,
+// otherwise it returns nil.
+func new256Asm() hash.Hash { return nil }
+
+// new384Asm returns an assembly implementation of SHA3-384 if available,
+// otherwise it returns nil.
+func new384Asm() hash.Hash { return nil }
+
+// new512Asm returns an assembly implementation of SHA3-512 if available,
+// otherwise it returns nil.
+func new512Asm() hash.Hash { return nil }
diff --git a/sha3/sha3_s390x.go b/sha3/sha3_s390x.go
new file mode 100644
index 0000000..f1fb79c
--- /dev/null
+++ b/sha3/sha3_s390x.go
@@ -0,0 +1,289 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//+build !gccgo,!appengine
+
+package sha3
+
+// This file contains code for using the 'compute intermediate
+// message digest' (KIMD) and 'compute last message digest' (KLMD)
+// instructions to compute SHA-3 and SHAKE hashes on IBM Z.
+
+import (
+	"hash"
+)
+
+// codes represent 7-bit KIMD/KLMD function codes as defined in
+// the Principles of Operation.
+type code uint64
+
+const (
+	// function codes for KIMD/KLMD
+	sha3_224  code = 32
+	sha3_256       = 33
+	sha3_384       = 34
+	sha3_512       = 35
+	shake_128      = 36
+	shake_256      = 37
+	nopad          = 0x100
+)
+
+// hasMSA6 reports whether the machine supports the SHA-3 and SHAKE function
+// codes, as defined in message-security-assist extension 6.
+func hasMSA6() bool
+
+// hasAsm caches the result of hasMSA6 (which might be expensive to call).
+var hasAsm = hasMSA6()
+
+// kimd is a wrapper for the 'compute intermediate message digest' instruction.
+// src must be a multiple of the rate for the given function code.
+//go:noescape
+func kimd(function code, chain *[200]byte, src []byte)
+
+// klmd is a wrapper for the 'compute last message digest' instruction.
+// src padding is handled by the instruction.
+//go:noescape
+func klmd(function code, chain *[200]byte, dst, src []byte)
+
+type asmState struct {
+	a         [200]byte       // 1600 bit state
+	buf       []byte          // care must be taken to ensure cap(buf) is a multiple of rate
+	rate      int             // equivalent to block size
+	storage   [3072]byte      // underlying storage for buf
+	outputLen int             // output length if fixed, 0 if not
+	function  code            // KIMD/KLMD function code
+	state     spongeDirection // whether the sponge is absorbing or squeezing
+}
+
+func newAsmState(function code) *asmState {
+	var s asmState
+	s.function = function
+	switch function {
+	case sha3_224:
+		s.rate = 144
+		s.outputLen = 28
+	case sha3_256:
+		s.rate = 136
+		s.outputLen = 32
+	case sha3_384:
+		s.rate = 104
+		s.outputLen = 48
+	case sha3_512:
+		s.rate = 72
+		s.outputLen = 64
+	case shake_128:
+		s.rate = 168
+	case shake_256:
+		s.rate = 136
+	default:
+		panic("sha3: unrecognized function code")
+	}
+
+	// limit s.buf size to a multiple of s.rate
+	s.resetBuf()
+	return &s
+}
+
+func (s *asmState) clone() *asmState {
+	c := *s
+	c.buf = c.storage[:len(s.buf):cap(s.buf)]
+	return &c
+}
+
+// copyIntoBuf copies b into buf. It will panic if there is not enough space to
+// store all of b.
+func (s *asmState) copyIntoBuf(b []byte) {
+	bufLen := len(s.buf)
+	s.buf = s.buf[:len(s.buf)+len(b)]
+	copy(s.buf[bufLen:], b)
+}
+
+// resetBuf points buf at storage, sets the length to 0 and sets cap to be a
+// multiple of the rate.
+func (s *asmState) resetBuf() {
+	max := (cap(s.storage) / s.rate) * s.rate
+	s.buf = s.storage[:0:max]
+}
+
+// Write (via the embedded io.Writer interface) adds more data to the running hash.
+// It never returns an error.
+func (s *asmState) Write(b []byte) (int, error) {
+	if s.state != spongeAbsorbing {
+		panic("sha3: write to sponge after read")
+	}
+	length := len(b)
+	for len(b) > 0 {
+		if len(s.buf) == 0 && len(b) >= cap(s.buf) {
+			// Hash the data directly and push any remaining bytes
+			// into the buffer.
+			remainder := len(s.buf) % s.rate
+			kimd(s.function, &s.a, b[:len(b)-remainder])
+			if remainder != 0 {
+				s.copyIntoBuf(b[len(b)-remainder:])
+			}
+			return length, nil
+		}
+
+		if len(s.buf) == cap(s.buf) {
+			// flush the buffer
+			kimd(s.function, &s.a, s.buf)
+			s.buf = s.buf[:0]
+		}
+
+		// copy as much as we can into the buffer
+		n := len(b)
+		if len(b) > cap(s.buf)-len(s.buf) {
+			n = cap(s.buf) - len(s.buf)
+		}
+		s.copyIntoBuf(b[:n])
+		b = b[n:]
+	}
+	return length, nil
+}
+
+// Read squeezes an arbitrary number of bytes from the sponge.
+func (s *asmState) Read(out []byte) (n int, err error) {
+	n = len(out)
+
+	// need to pad if we were absorbing
+	if s.state == spongeAbsorbing {
+		s.state = spongeSqueezing
+
+		// write hash directly into out if possible
+		if len(out)%s.rate == 0 {
+			klmd(s.function, &s.a, out, s.buf) // len(out) may be 0
+			s.buf = s.buf[:0]
+			return
+		}
+
+		// write hash into buffer
+		max := cap(s.buf)
+		if max > len(out) {
+			max = (len(out)/s.rate)*s.rate + s.rate
+		}
+		klmd(s.function, &s.a, s.buf[:max], s.buf)
+		s.buf = s.buf[:max]
+	}
+
+	for len(out) > 0 {
+		// flush the buffer
+		if len(s.buf) != 0 {
+			c := copy(out, s.buf)
+			out = out[c:]
+			s.buf = s.buf[c:]
+			continue
+		}
+
+		// write hash directly into out if possible
+		if len(out)%s.rate == 0 {
+			klmd(s.function|nopad, &s.a, out, nil)
+			return
+		}
+
+		// write hash into buffer
+		s.resetBuf()
+		if cap(s.buf) > len(out) {
+			s.buf = s.buf[:(len(out)/s.rate)*s.rate+s.rate]
+		}
+		klmd(s.function|nopad, &s.a, s.buf, nil)
+	}
+	return
+}
+
+// Sum appends the current hash to b and returns the resulting slice.
+// It does not change the underlying hash state.
+func (s *asmState) Sum(b []byte) []byte {
+	if s.outputLen == 0 {
+		panic("sha3: cannot call Sum on SHAKE functions")
+	}
+
+	// Copy the state to preserve the original.
+	a := s.a
+
+	// Hash the buffer. Note that we don't clear it because we
+	// aren't updating the state.
+	klmd(s.function, &a, nil, s.buf)
+	return append(b, a[:s.outputLen]...)
+}
+
+// Reset resets the Hash to its initial state.
+func (s *asmState) Reset() {
+	for i := range s.a {
+		s.a[i] = 0
+	}
+	s.resetBuf()
+	s.state = spongeAbsorbing
+}
+
+// Size returns the number of bytes Sum will return.
+func (s *asmState) Size() int {
+	return s.outputLen
+}
+
+// BlockSize returns the hash's underlying block size.
+// The Write method must be able to accept any amount
+// of data, but it may operate more efficiently if all writes
+// are a multiple of the block size.
+func (s *asmState) BlockSize() int {
+	return s.rate
+}
+
+// Clone returns a copy of the ShakeHash in its current state.
+func (s *asmState) Clone() ShakeHash {
+	return s.clone()
+}
+
+// new224Asm returns an assembly implementation of SHA3-224 if available,
+// otherwise it returns nil.
+func new224Asm() hash.Hash {
+	if hasAsm {
+		return newAsmState(sha3_224)
+	}
+	return nil
+}
+
+// new256Asm returns an assembly implementation of SHA3-256 if available,
+// otherwise it returns nil.
+func new256Asm() hash.Hash {
+	if hasAsm {
+		return newAsmState(sha3_256)
+	}
+	return nil
+}
+
+// new384Asm returns an assembly implementation of SHA3-384 if available,
+// otherwise it returns nil.
+func new384Asm() hash.Hash {
+	if hasAsm {
+		return newAsmState(sha3_384)
+	}
+	return nil
+}
+
+// new512Asm returns an assembly implementation of SHA3-512 if available,
+// otherwise it returns nil.
+func new512Asm() hash.Hash {
+	if hasAsm {
+		return newAsmState(sha3_512)
+	}
+	return nil
+}
+
+// newShake128Asm returns an assembly implementation of SHAKE-128 if available,
+// otherwise it returns nil.
+func newShake128Asm() ShakeHash {
+	if hasAsm {
+		return newAsmState(shake_128)
+	}
+	return nil
+}
+
+// newShake256Asm returns an assembly implementation of SHAKE-256 if available,
+// otherwise it returns nil.
+func newShake256Asm() ShakeHash {
+	if hasAsm {
+		return newAsmState(shake_256)
+	}
+	return nil
+}
diff --git a/sha3/sha3_s390x.s b/sha3/sha3_s390x.s
new file mode 100644
index 0000000..20978fc
--- /dev/null
+++ b/sha3/sha3_s390x.s
@@ -0,0 +1,49 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//+build !gccgo,!appengine
+
+#include "textflag.h"
+
+TEXT ·hasMSA6(SB), NOSPLIT, $16-1
+	MOVD $0, R0          // KIMD-Query function code
+	MOVD $tmp-16(SP), R1 // parameter block
+	XC   $16, (R1), (R1) // clear the parameter block
+	WORD $0xB93E0002     // KIMD --, --
+	WORD $0x91FC1004     // TM 4(R1), 0xFC (test bits [32-37])
+	BVS  yes
+
+no:
+	MOVB $0, ret+0(FP)
+	RET
+
+yes:
+	MOVB $1, ret+0(FP)
+	RET
+
+// func kimd(function code, params *[200]byte, src []byte)
+TEXT ·kimd(SB), NOFRAME|NOSPLIT, $0-40
+	MOVD function+0(FP), R0
+	MOVD params+8(FP), R1
+	LMG  src+16(FP), R2, R3 // R2=base, R3=len
+
+continue:
+	WORD $0xB93E0002 // KIMD --, R2
+	BVS  continue    // continue if interrupted
+	MOVD $0, R0      // reset R0 for pre-go1.8 compilers
+	RET
+
+// func klmd(function code, params *[200]byte, dst, src []byte)
+TEXT ·klmd(SB), NOFRAME|NOSPLIT, $0-64
+	// TODO: SHAKE support
+	MOVD function+0(FP), R0
+	MOVD params+8(FP), R1
+	LMG  dst+16(FP), R2, R3 // R2=base, R3=len
+	LMG  src+40(FP), R4, R5 // R4=base, R5=len
+
+continue:
+	WORD $0xB93F0024 // KLMD R2, R4
+	BVS  continue    // continue if interrupted
+	MOVD $0, R0      // reset R0 for pre-go1.8 compilers
+	RET
diff --git a/sha3/shake.go b/sha3/shake.go
index 5a027d2..97c9b06 100644
--- a/sha3/shake.go
+++ b/sha3/shake.go
@@ -38,12 +38,22 @@
 // NewShake128 creates a new SHAKE128 variable-output-length ShakeHash.
 // Its generic security strength is 128 bits against all attacks if at
 // least 32 bytes of its output are used.
-func NewShake128() ShakeHash { return &state{rate: 168, dsbyte: 0x1f} }
+func NewShake128() ShakeHash {
+	if h := newShake128Asm(); h != nil {
+		return h
+	}
+	return &state{rate: 168, dsbyte: 0x1f}
+}
 
 // NewShake256 creates a new SHAKE256 variable-output-length ShakeHash.
 // Its generic security strength is 256 bits against all attacks if
 // at least 64 bytes of its output are used.
-func NewShake256() ShakeHash { return &state{rate: 136, dsbyte: 0x1f} }
+func NewShake256() ShakeHash {
+	if h := newShake256Asm(); h != nil {
+		return h
+	}
+	return &state{rate: 136, dsbyte: 0x1f}
+}
 
 // ShakeSum128 writes an arbitrary-length digest of data into hash.
 func ShakeSum128(hash, data []byte) {
diff --git a/sha3/shake_generic.go b/sha3/shake_generic.go
new file mode 100644
index 0000000..73d0c90
--- /dev/null
+++ b/sha3/shake_generic.go
@@ -0,0 +1,19 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//+build gccgo appengine !s390x
+
+package sha3
+
+// newShake128Asm returns an assembly implementation of SHAKE-128 if available,
+// otherwise it returns nil.
+func newShake128Asm() ShakeHash {
+	return nil
+}
+
+// newShake256Asm returns an assembly implementation of SHAKE-256 if available,
+// otherwise it returns nil.
+func newShake256Asm() ShakeHash {
+	return nil
+}