scrypt: 2x faster. Work on uint32 slices instead of bytes. Replace usage of Salsa20/8 from salsa package with the specialized version. benchmark old ns/op new ns/op delta BenchmarkKey 266430525 126657130 -52.46% R=agl CC=golang-dev https://golang.org/cl/7139050
diff --git a/scrypt/scrypt.go b/scrypt/scrypt.go index 1c01fa5..f7a9fe1 100644 --- a/scrypt/scrypt.go +++ b/scrypt/scrypt.go
@@ -9,69 +9,203 @@ import ( "crypto/sha256" - "encoding/binary" "errors" "code.google.com/p/go.crypto/pbkdf2" - "code.google.com/p/go.crypto/salsa20/salsa" ) const maxInt = int(^uint(0) >> 1) -// blockCopy copies n bytes from src into dst. -func blockCopy(dst, src []byte, n int) { +// blockCopy copies n numbers from src into dst. +func blockCopy(dst, src []uint32, n int) { copy(dst, src[:n]) } -// blockXOR XORs bytes from dst with n bytes from src. -func blockXOR(dst, src []byte, n int) { +// blockXOR XORs numbers from dst with n numbers from src. +func blockXOR(dst, src []uint32, n int) { for i, v := range src[:n] { dst[i] ^= v } } -func blockMix(b, y []byte, r int) { - var x [64]byte +// salsaXOR applies Salsa20/8 to the XOR of 16 numbers from tmp and in, +// and puts the result into both both tmp and out. +func salsaXOR(tmp *[16]uint32, in, out []uint32) { + w0 := tmp[0] ^ in[0] + w1 := tmp[1] ^ in[1] + w2 := tmp[2] ^ in[2] + w3 := tmp[3] ^ in[3] + w4 := tmp[4] ^ in[4] + w5 := tmp[5] ^ in[5] + w6 := tmp[6] ^ in[6] + w7 := tmp[7] ^ in[7] + w8 := tmp[8] ^ in[8] + w9 := tmp[9] ^ in[9] + w10 := tmp[10] ^ in[10] + w11 := tmp[11] ^ in[11] + w12 := tmp[12] ^ in[12] + w13 := tmp[13] ^ in[13] + w14 := tmp[14] ^ in[14] + w15 := tmp[15] ^ in[15] - blockCopy(x[:], b[(2*r-1)*64:], 64) + x0, x1, x2, x3, x4, x5, x6, x7, x8 := w0, w1, w2, w3, w4, w5, w6, w7, w8 + x9, x10, x11, x12, x13, x14, x15 := w9, w10, w11, w12, w13, w14, w15 - for i := 0; i < 2*r*64; i += 64 { - blockXOR(x[:], b[i:], 64) - salsa.Core208(&x, &x) - blockCopy(y[i:], x[:], 64) + for i := 0; i < 8; i += 2 { + u := x0 + x12 + x4 ^= u<<7 | u>>(32-7) + u = x4 + x0 + x8 ^= u<<9 | u>>(32-9) + u = x8 + x4 + x12 ^= u<<13 | u>>(32-13) + u = x12 + x8 + x0 ^= u<<18 | u>>(32-18) + + u = x5 + x1 + x9 ^= u<<7 | u>>(32-7) + u = x9 + x5 + x13 ^= u<<9 | u>>(32-9) + u = x13 + x9 + x1 ^= u<<13 | u>>(32-13) + u = x1 + x13 + x5 ^= u<<18 | u>>(32-18) + + u = x10 + x6 + x14 ^= u<<7 | u>>(32-7) + u = x14 + x10 + x2 ^= u<<9 | u>>(32-9) + u = x2 + x14 + x6 ^= u<<13 | u>>(32-13) + u = x6 + x2 + x10 ^= u<<18 | u>>(32-18) + + u = x15 + x11 + x3 ^= u<<7 | u>>(32-7) + u = x3 + x15 + x7 ^= u<<9 | u>>(32-9) + u = x7 + x3 + x11 ^= u<<13 | u>>(32-13) + u = x11 + x7 + x15 ^= u<<18 | u>>(32-18) + + u = x0 + x3 + x1 ^= u<<7 | u>>(32-7) + u = x1 + x0 + x2 ^= u<<9 | u>>(32-9) + u = x2 + x1 + x3 ^= u<<13 | u>>(32-13) + u = x3 + x2 + x0 ^= u<<18 | u>>(32-18) + + u = x5 + x4 + x6 ^= u<<7 | u>>(32-7) + u = x6 + x5 + x7 ^= u<<9 | u>>(32-9) + u = x7 + x6 + x4 ^= u<<13 | u>>(32-13) + u = x4 + x7 + x5 ^= u<<18 | u>>(32-18) + + u = x10 + x9 + x11 ^= u<<7 | u>>(32-7) + u = x11 + x10 + x8 ^= u<<9 | u>>(32-9) + u = x8 + x11 + x9 ^= u<<13 | u>>(32-13) + u = x9 + x8 + x10 ^= u<<18 | u>>(32-18) + + u = x15 + x14 + x12 ^= u<<7 | u>>(32-7) + u = x12 + x15 + x13 ^= u<<9 | u>>(32-9) + u = x13 + x12 + x14 ^= u<<13 | u>>(32-13) + u = x14 + x13 + x15 ^= u<<18 | u>>(32-18) } + x0 += w0 + x1 += w1 + x2 += w2 + x3 += w3 + x4 += w4 + x5 += w5 + x6 += w6 + x7 += w7 + x8 += w8 + x9 += w9 + x10 += w10 + x11 += w11 + x12 += w12 + x13 += w13 + x14 += w14 + x15 += w15 - for i := 0; i < r; i++ { - blockCopy(b[i*64:], y[i*2*64:], 64) - } + out[0], tmp[0] = x0, x0 + out[1], tmp[1] = x1, x1 + out[2], tmp[2] = x2, x2 + out[3], tmp[3] = x3, x3 + out[4], tmp[4] = x4, x4 + out[5], tmp[5] = x5, x5 + out[6], tmp[6] = x6, x6 + out[7], tmp[7] = x7, x7 + out[8], tmp[8] = x8, x8 + out[9], tmp[9] = x9, x9 + out[10], tmp[10] = x10, x10 + out[11], tmp[11] = x11, x11 + out[12], tmp[12] = x12, x12 + out[13], tmp[13] = x13, x13 + out[14], tmp[14] = x14, x14 + out[15], tmp[15] = x15, x15 +} - for i := 0; i < r; i++ { - blockCopy(b[(i+r)*64:], y[(i*2+1)*64:], 64) +func blockMix(tmp *[16]uint32, in, out []uint32, r int) { + blockCopy(tmp[:], in[(2*r-1)*16:], 16) + for i := 0; i < 2*r; i += 2 { + salsaXOR(tmp, in[i*16:], out[i*8:]) + salsaXOR(tmp, in[i*16+16:], out[i*8+r*16:]) } } -func integer(b []byte, r int) uint64 { - return binary.LittleEndian.Uint64(b[(2*r-1)*64:]) +func integer(b []uint32, r int) uint64 { + j := (2*r - 1) * 16 + return uint64(b[j]) | uint64(b[j+1])<<32 } -func smix(b []byte, r, N int, v, xy []byte) { +func smix(b []byte, r, N int, v, xy []uint32) { + var tmp [16]uint32 x := xy - y := xy[128*r:] + y := xy[32*r:] - blockCopy(x, b, 128*r) - - for i := 0; i < N; i++ { - blockCopy(v[i*128*r:], x, 128*r) - blockMix(x, y, r) + j := 0 + for i := 0; i < 32*r; i++ { + x[i] = uint32(b[j]) | uint32(b[j+1])<<8 | uint32(b[j+2])<<16 | uint32(b[j+3])<<24 + j += 4 } + for i := 0; i < N; i += 2 { + blockCopy(v[i*(32*r):], x, 32*r) + blockMix(&tmp, x, y, r) - for i := 0; i < N; i++ { + blockCopy(v[(i+1)*(32*r):], y, 32*r) + blockMix(&tmp, y, x, r) + } + for i := 0; i < N; i += 2 { j := int(integer(x, r) & uint64(N-1)) - blockXOR(x, v[j*128*r:], 128*r) - blockMix(x, y, r) - } + blockXOR(x, v[j*(32*r):], 32*r) + blockMix(&tmp, x, y, r) - blockCopy(b, x, 128*r) + j = int(integer(y, r) & uint64(N-1)) + blockXOR(y, v[j*(32*r):], 32*r) + blockMix(&tmp, y, x, r) + } + j = 0 + for _, v := range x[:32*r] { + b[j+0] = byte(v >> 0) + b[j+1] = byte(v >> 8) + b[j+2] = byte(v >> 16) + b[j+3] = byte(v >> 24) + j += 4 + } } // Key derives a key from the password, salt, and cost parameters, returning @@ -97,8 +231,8 @@ return nil, errors.New("scrypt: parameters are too large") } - xy := make([]byte, 256*r) - v := make([]byte, 128*r*N) + xy := make([]uint32, 64*r) + v := make([]uint32, 32*N*r) b := pbkdf2.Key(password, salt, 1, p*128*r, sha256.New) for i := 0; i < p; i++ {
diff --git a/scrypt/scrypt_test.go b/scrypt/scrypt_test.go index 60b73ec..e096c3a 100644 --- a/scrypt/scrypt_test.go +++ b/scrypt/scrypt_test.go
@@ -18,6 +18,54 @@ var good = []testVector{ { + "password", + "salt", + 2, 10, 10, + []byte{ + 0x48, 0x2c, 0x85, 0x8e, 0x22, 0x90, 0x55, 0xe6, 0x2f, + 0x41, 0xe0, 0xec, 0x81, 0x9a, 0x5e, 0xe1, 0x8b, 0xdb, + 0x87, 0x25, 0x1a, 0x53, 0x4f, 0x75, 0xac, 0xd9, 0x5a, + 0xc5, 0xe5, 0xa, 0xa1, 0x5f, + }, + }, + { + "password", + "salt", + 16, 100, 100, + []byte{ + 0x88, 0xbd, 0x5e, 0xdb, 0x52, 0xd1, 0xdd, 0x0, 0x18, + 0x87, 0x72, 0xad, 0x36, 0x17, 0x12, 0x90, 0x22, 0x4e, + 0x74, 0x82, 0x95, 0x25, 0xb1, 0x8d, 0x73, 0x23, 0xa5, + 0x7f, 0x91, 0x96, 0x3c, 0x37, + }, + }, + { + "this is a long \000 password", + "and this is a long \000 salt", + 16384, 8, 1, + []byte{ + 0xc3, 0xf1, 0x82, 0xee, 0x2d, 0xec, 0x84, 0x6e, 0x70, + 0xa6, 0x94, 0x2f, 0xb5, 0x29, 0x98, 0x5a, 0x3a, 0x09, + 0x76, 0x5e, 0xf0, 0x4c, 0x61, 0x29, 0x23, 0xb1, 0x7f, + 0x18, 0x55, 0x5a, 0x37, 0x07, 0x6d, 0xeb, 0x2b, 0x98, + 0x30, 0xd6, 0x9d, 0xe5, 0x49, 0x26, 0x51, 0xe4, 0x50, + 0x6a, 0xe5, 0x77, 0x6d, 0x96, 0xd4, 0x0f, 0x67, 0xaa, + 0xee, 0x37, 0xe1, 0x77, 0x7b, 0x8a, 0xd5, 0xc3, 0x11, + 0x14, 0x32, 0xbb, 0x3b, 0x6f, 0x7e, 0x12, 0x64, 0x40, + 0x18, 0x79, 0xe6, 0x41, 0xae, + }, + }, + { + "p", + "s", + 2, 1, 1, + []byte{ + 0x48, 0xb0, 0xd2, 0xa8, 0xa3, 0x27, 0x26, 0x11, 0x98, + 0x4c, 0x50, 0xeb, 0xd6, 0x30, 0xaf, 0x52, + }, + }, + + { "", "", 16, 1, 1,