curve25519: improve cswap

Simplify the constant swap function.

On amd64: Replace the CMOVQEQ scheme with SSE2 code similar to the non-amd64 code.
On non-amd64: Avoid unnecessary loop iterations.

The result is less and slightly faster code.

name 			old time/op 	new time/op 	delta
ScalarBaseMult-4   	653µs ± 0%   	636µs ± 0%   	~     (p=0.100 n=3+3)

name 			old time/op 	new time/op 	delta
ConstantSwap-4  	10.4ns ± 1%   	6.2ns ± 0%  	-39.86%  (p=0.029 n=4+4)

On an i7-65000U

Change-Id: Ia5eea92e0b3eabb6c291d25229aa582b51278552
Reviewed-on: https://go-review.googlesource.com/39693
Reviewed-by: Adam Langley <agl@golang.org>
Run-TryBot: Adam Langley <agl@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
diff --git a/curve25519/cswap_amd64.s b/curve25519/cswap_amd64.s
index 45484d1..cd793a5 100644
--- a/curve25519/cswap_amd64.s
+++ b/curve25519/cswap_amd64.s
@@ -2,87 +2,64 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// This code was translated into a form compatible with 6a from the public
-// domain sources in SUPERCOP: http://bench.cr.yp.to/supercop.html
-
 // +build amd64,!gccgo,!appengine
 
-// func cswap(inout *[5]uint64, v uint64)
+// func cswap(inout *[4][5]uint64, v uint64)
 TEXT ·cswap(SB),7,$0
 	MOVQ inout+0(FP),DI
 	MOVQ v+8(FP),SI
 
-	CMPQ SI,$1
-	MOVQ 0(DI),SI
-	MOVQ 80(DI),DX
-	MOVQ 8(DI),CX
-	MOVQ 88(DI),R8
-	MOVQ SI,R9
-	CMOVQEQ DX,SI
-	CMOVQEQ R9,DX
-	MOVQ CX,R9
-	CMOVQEQ R8,CX
-	CMOVQEQ R9,R8
-	MOVQ SI,0(DI)
-	MOVQ DX,80(DI)
-	MOVQ CX,8(DI)
-	MOVQ R8,88(DI)
-	MOVQ 16(DI),SI
-	MOVQ 96(DI),DX
-	MOVQ 24(DI),CX
-	MOVQ 104(DI),R8
-	MOVQ SI,R9
-	CMOVQEQ DX,SI
-	CMOVQEQ R9,DX
-	MOVQ CX,R9
-	CMOVQEQ R8,CX
-	CMOVQEQ R9,R8
-	MOVQ SI,16(DI)
-	MOVQ DX,96(DI)
-	MOVQ CX,24(DI)
-	MOVQ R8,104(DI)
-	MOVQ 32(DI),SI
-	MOVQ 112(DI),DX
-	MOVQ 40(DI),CX
-	MOVQ 120(DI),R8
-	MOVQ SI,R9
-	CMOVQEQ DX,SI
-	CMOVQEQ R9,DX
-	MOVQ CX,R9
-	CMOVQEQ R8,CX
-	CMOVQEQ R9,R8
-	MOVQ SI,32(DI)
-	MOVQ DX,112(DI)
-	MOVQ CX,40(DI)
-	MOVQ R8,120(DI)
-	MOVQ 48(DI),SI
-	MOVQ 128(DI),DX
-	MOVQ 56(DI),CX
-	MOVQ 136(DI),R8
-	MOVQ SI,R9
-	CMOVQEQ DX,SI
-	CMOVQEQ R9,DX
-	MOVQ CX,R9
-	CMOVQEQ R8,CX
-	CMOVQEQ R9,R8
-	MOVQ SI,48(DI)
-	MOVQ DX,128(DI)
-	MOVQ CX,56(DI)
-	MOVQ R8,136(DI)
-	MOVQ 64(DI),SI
-	MOVQ 144(DI),DX
-	MOVQ 72(DI),CX
-	MOVQ 152(DI),R8
-	MOVQ SI,R9
-	CMOVQEQ DX,SI
-	CMOVQEQ R9,DX
-	MOVQ CX,R9
-	CMOVQEQ R8,CX
-	CMOVQEQ R9,R8
-	MOVQ SI,64(DI)
-	MOVQ DX,144(DI)
-	MOVQ CX,72(DI)
-	MOVQ R8,152(DI)
-	MOVQ DI,AX
-	MOVQ SI,DX
+	SUBQ $1, SI
+	NOTQ SI
+	MOVQ SI, X15
+	PSHUFD $0x44, X15, X15
+
+	MOVOU 0(DI), X0
+	MOVOU 16(DI), X2
+	MOVOU 32(DI), X4
+	MOVOU 48(DI), X6
+	MOVOU 64(DI), X8
+	MOVOU 80(DI), X1
+	MOVOU 96(DI), X3
+	MOVOU 112(DI), X5
+	MOVOU 128(DI), X7
+	MOVOU 144(DI), X9
+
+	MOVO X1, X10
+	MOVO X3, X11
+	MOVO X5, X12
+	MOVO X7, X13
+	MOVO X9, X14
+
+	PXOR X0, X10
+	PXOR X2, X11
+	PXOR X4, X12
+	PXOR X6, X13
+	PXOR X8, X14
+	PAND X15, X10
+	PAND X15, X11
+	PAND X15, X12
+	PAND X15, X13
+	PAND X15, X14
+	PXOR X10, X0
+	PXOR X10, X1
+	PXOR X11, X2
+	PXOR X11, X3
+	PXOR X12, X4
+	PXOR X12, X5
+	PXOR X13, X6
+	PXOR X13, X7
+	PXOR X14, X8
+	PXOR X14, X9
+
+	MOVOU X0, 0(DI)
+	MOVOU X2, 16(DI)
+	MOVOU X4, 32(DI)
+	MOVOU X6, 48(DI)
+	MOVOU X8, 64(DI)
+	MOVOU X1, 80(DI)
+	MOVOU X3, 96(DI)
+	MOVOU X5, 112(DI)
+	MOVOU X7, 128(DI)
+	MOVOU X9, 144(DI)
 	RET
diff --git a/curve25519/curve25519.go b/curve25519/curve25519.go
index 6918c47..2d14c2a 100644
--- a/curve25519/curve25519.go
+++ b/curve25519/curve25519.go
@@ -8,6 +8,10 @@
 
 package curve25519
 
+import (
+	"encoding/binary"
+)
+
 // This code is a port of the public domain, "ref10" implementation of
 // curve25519 from SUPERCOP 20130419 by D. J. Bernstein.
 
@@ -50,17 +54,11 @@
 //
 // Preconditions: b in {0,1}.
 func feCSwap(f, g *fieldElement, b int32) {
-	var x fieldElement
 	b = -b
-	for i := range x {
-		x[i] = b & (f[i] ^ g[i])
-	}
-
 	for i := range f {
-		f[i] ^= x[i]
-	}
-	for i := range g {
-		g[i] ^= x[i]
+		t := b & (f[i] ^ g[i])
+		f[i] ^= t
+		g[i] ^= t
 	}
 }
 
@@ -75,12 +73,7 @@
 
 // load4 reads a 32-bit, little-endian value from in.
 func load4(in []byte) int64 {
-	var r int64
-	r = int64(in[0])
-	r |= int64(in[1]) << 8
-	r |= int64(in[2]) << 16
-	r |= int64(in[3]) << 24
-	return r
+	return int64(binary.LittleEndian.Uint32(in))
 }
 
 func feFromBytes(dst *fieldElement, src *[32]byte) {
diff --git a/curve25519/curve25519_test.go b/curve25519/curve25519_test.go
index 14b0ee8..051a830 100644
--- a/curve25519/curve25519_test.go
+++ b/curve25519/curve25519_test.go
@@ -27,3 +27,13 @@
 		t.Errorf("incorrect result: got %s, want %s", result, expectedHex)
 	}
 }
+
+func BenchmarkScalarBaseMult(b *testing.B) {
+	var in, out [32]byte
+	in[0] = 1
+
+	b.SetBytes(32)
+	for i := 0; i < b.N; i++ {
+		ScalarBaseMult(&out, &in)
+	}
+}