crypto/rsa,crypto/internal/bigmod: improve verify/encrypt performance

Most libraries don't consider N secret, but it's arguably useful for
privacy applications. However, E should generally be fixed, and there is
a lot of performance to be gained by using variable-time exponentiation.

The threshold trick is from BoringSSL.

goos: linux
goarch: amd64
pkg: crypto/rsa
cpu: Intel(R) Core(TM) i5-7400 CPU @ 3.00GHz
                       │     old      │                 new                 │
                       │    sec/op    │   sec/op     vs base                │
DecryptPKCS1v15/2048-4    1.398m ± 0%   1.396m ± 4%        ~ (p=0.853 n=10)
DecryptPKCS1v15/3072-4    3.640m ± 0%   3.652m ± 1%        ~ (p=0.063 n=10)
DecryptPKCS1v15/4096-4    7.756m ± 0%   7.764m ± 0%        ~ (p=0.853 n=10)
EncryptPKCS1v15/2048-4   175.50µ ± 0%   39.37µ ± 0%  -77.57% (p=0.000 n=10)
DecryptOAEP/2048-4        1.375m ± 0%   1.371m ± 1%        ~ (p=0.089 n=10)
EncryptOAEP/2048-4       177.64µ ± 0%   41.17µ ± 1%  -76.82% (p=0.000 n=10)
SignPKCS1v15/2048-4       1.419m ± 0%   1.393m ± 1%   -1.84% (p=0.000 n=10)
VerifyPKCS1v15/2048-4    173.70µ ± 1%   38.28µ ± 2%  -77.96% (p=0.000 n=10)
SignPSS/2048-4            1.437m ± 1%   1.413m ± 0%   -1.64% (p=0.000 n=10)
VerifyPSS/2048-4         176.83µ ± 1%   43.08µ ± 5%  -75.64% (p=0.000 n=10)

This finally makes everything in crypto/rsa faster than it was in Go 1.19.

goos: linux
goarch: amd64
pkg: crypto/rsa
cpu: Intel(R) Core(TM) i5-7400 CPU @ 3.00GHz
                       │ go1.19.txt  │              go1.20.txt               │              go1.21.txt               │               new.txt               │
                       │   sec/op    │    sec/op     vs base                 │    sec/op     vs base                 │   sec/op     vs base                │
DecryptPKCS1v15/2048-4   1.458m ± 0%    1.597m ± 1%    +9.50% (p=0.000 n=10)    1.395m ± 1%    -4.30% (p=0.000 n=10)   1.396m ± 4%   -4.25% (p=0.002 n=10)
DecryptPKCS1v15/3072-4   4.023m ± 1%    5.332m ± 1%   +32.53% (p=0.000 n=10)    3.649m ± 1%    -9.30% (p=0.000 n=10)   3.652m ± 1%   -9.23% (p=0.000 n=10)
DecryptPKCS1v15/4096-4   8.710m ± 1%   11.937m ± 1%   +37.05% (p=0.000 n=10)    7.564m ± 1%   -13.16% (p=0.000 n=10)   7.764m ± 0%  -10.86% (p=0.000 n=10)
EncryptPKCS1v15/2048-4   51.79µ ± 0%   267.68µ ± 0%  +416.90% (p=0.000 n=10)   176.42µ ± 0%  +240.67% (p=0.000 n=10)   39.37µ ± 0%  -23.98% (p=0.000 n=10)
DecryptOAEP/2048-4       1.461m ± 0%    1.613m ± 1%   +10.37% (p=0.000 n=10)    1.415m ± 0%    -3.13% (p=0.000 n=10)   1.371m ± 1%   -6.18% (p=0.000 n=10)
EncryptOAEP/2048-4       54.24µ ± 0%   269.19µ ± 0%  +396.28% (p=0.000 n=10)   177.31µ ± 0%  +226.89% (p=0.000 n=10)   41.17µ ± 1%  -24.10% (p=0.000 n=10)
SignPKCS1v15/2048-4      1.510m ± 0%    1.705m ± 0%   +12.93% (p=0.000 n=10)    1.423m ± 1%    -5.78% (p=0.000 n=10)   1.393m ± 1%   -7.76% (p=0.000 n=10)
VerifyPKCS1v15/2048-4    50.87µ ± 0%   266.41µ ± 1%  +423.71% (p=0.000 n=10)   174.38µ ± 0%  +242.79% (p=0.000 n=10)   38.28µ ± 2%  -24.75% (p=0.000 n=10)
SignPSS/2048-4           1.513m ± 1%    1.709m ± 0%   +12.97% (p=0.000 n=10)    1.461m ± 0%    -3.42% (p=0.000 n=10)   1.413m ± 0%   -6.58% (p=0.000 n=10)
VerifyPSS/2048-4         53.45µ ± 1%   268.56µ ± 0%  +402.48% (p=0.000 n=10)   177.29µ ± 0%  +231.72% (p=0.000 n=10)   43.08µ ± 5%  -19.39% (p=0.000 n=10)
geomean                  514.6µ         1.094m       +112.65%                   801.6µ        +55.77%                  442.1µ       -14.08%

Fixes #63516

Change-Id: If40e596a2e4b3ab7a202ff34591cf9cffecfcc1b
Reviewed-on: https://go-review.googlesource.com/c/go/+/552935
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Than McIntosh <thanm@google.com>
Reviewed-by: Roland Shoemaker <roland@golang.org>
diff --git a/src/crypto/internal/bigmod/nat.go b/src/crypto/internal/bigmod/nat.go
index 5605e9f..7fdd8ef 100644
--- a/src/crypto/internal/bigmod/nat.go
+++ b/src/crypto/internal/bigmod/nat.go
@@ -318,14 +318,48 @@
 // rr returns R*R with R = 2^(_W * n) and n = len(m.nat.limbs).
 func rr(m *Modulus) *Nat {
 	rr := NewNat().ExpandFor(m)
-	// R*R is 2^(2 * _W * n). We can safely get 2^(_W * (n - 1)) by setting the
-	// most significant limb to 1. We then get to R*R by shifting left by _W
-	// n + 1 times.
-	n := len(rr.limbs)
-	rr.limbs[n-1] = 1
-	for i := n - 1; i < 2*n; i++ {
-		rr.shiftIn(0, m) // x = x * 2^_W mod m
+	n := uint(len(rr.limbs))
+	mLen := uint(m.BitLen())
+	logR := _W * n
+
+	// We start by computing R = 2^(_W * n) mod m. We can get pretty close, to
+	// 2^⌊log₂m⌋, by setting the highest bit we can without having to reduce.
+	rr.limbs[n-1] = 1 << ((mLen - 1) % _W)
+	// Then we double until we reach 2^(_W * n).
+	for i := mLen - 1; i < logR; i++ {
+		rr.Add(rr, m)
 	}
+
+	// Next we need to get from R to 2^(_W * n) R mod m (aka from one to R in
+	// the Montgomery domain, meaning we can use Montgomery multiplication now).
+	// We could do that by doubling _W * n times, or with a square-and-double
+	// chain log2(_W * n) long. Turns out the fastest thing is to start out with
+	// doublings, and switch to square-and-double once the exponent is large
+	// enough to justify the cost of the multiplications.
+
+	// The threshold is selected experimentally as a linear function of n.
+	threshold := n / 4
+
+	// We calculate how many of the most-significant bits of the exponent we can
+	// compute before crossing the threshold, and we do it with doublings.
+	i := bits.UintSize
+	for logR>>i <= threshold {
+		i--
+	}
+	for k := uint(0); k < logR>>i; k++ {
+		rr.Add(rr, m)
+	}
+
+	// Then we process the remaining bits of the exponent with a
+	// square-and-double chain.
+	for i > 0 {
+		rr.montgomeryMul(rr, rr, m)
+		i--
+		if logR>>i&1 != 0 {
+			rr.Add(rr, m)
+		}
+	}
+
 	return rr
 }
 
@@ -745,26 +779,21 @@
 	return out.montgomeryReduction(m)
 }
 
-// ExpShort calculates out = x^e mod m.
+// ExpShortVarTime calculates out = x^e mod m.
 //
 // The output will be resized to the size of m and overwritten. x must already
-// be reduced modulo m. This leaks the exact bit size of the exponent.
-func (out *Nat) ExpShort(x *Nat, e uint, m *Modulus) *Nat {
-	xR := NewNat().set(x).montgomeryRepresentation(m)
-
-	out.resetFor(m)
-	out.limbs[0] = 1
-	out.montgomeryRepresentation(m)
-
+// be reduced modulo m. This leaks the exponent through timing side-channels.
+func (out *Nat) ExpShortVarTime(x *Nat, e uint, m *Modulus) *Nat {
 	// For short exponents, precomputing a table and using a window like in Exp
-	// doesn't pay off. Instead, we do a simple constant-time conditional
-	// square-and-multiply chain, skipping the initial run of zeroes.
-	tmp := NewNat().ExpandFor(m)
-	for i := bits.UintSize - bitLen(e); i < bits.UintSize; i++ {
+	// doesn't pay off. Instead, we do a simple conditional square-and-multiply
+	// chain, skipping the initial run of zeroes.
+	xR := NewNat().set(x).montgomeryRepresentation(m)
+	out.set(xR)
+	for i := bits.UintSize - bitLen(e) + 1; i < bits.UintSize; i++ {
 		out.montgomeryMul(out, out, m)
-		k := (e >> (bits.UintSize - i - 1)) & 1
-		tmp.montgomeryMul(out, xR, m)
-		out.assign(ctEq(k, 1), tmp)
+		if k := (e >> (bits.UintSize - i - 1)) & 1; k != 0 {
+			out.montgomeryMul(out, xR, m)
+		}
 	}
 	return out.montgomeryReduction(m)
 }
diff --git a/src/crypto/internal/bigmod/nat_test.go b/src/crypto/internal/bigmod/nat_test.go
index 76e5570..7a956e3 100644
--- a/src/crypto/internal/bigmod/nat_test.go
+++ b/src/crypto/internal/bigmod/nat_test.go
@@ -303,7 +303,7 @@
 	m := modulusFromBytes([]byte{13})
 	x := &Nat{[]uint{3}}
 	out := &Nat{[]uint{0}}
-	out.ExpShort(x, 12, m)
+	out.ExpShortVarTime(x, 12, m)
 	expected := &Nat{[]uint{1}}
 	if out.Equal(expected) != 1 {
 		t.Errorf("%+v != %+v", out, expected)
diff --git a/src/crypto/rsa/rsa.go b/src/crypto/rsa/rsa.go
index 0715421..9342930 100644
--- a/src/crypto/rsa/rsa.go
+++ b/src/crypto/rsa/rsa.go
@@ -43,6 +43,10 @@
 var bigOne = big.NewInt(1)
 
 // A PublicKey represents the public part of an RSA key.
+//
+// The value of the modulus N is considered secret by this library and protected
+// from leaking through timing side-channels. However, neither the value of the
+// exponent E nor the precise bit size of N are similarly protected.
 type PublicKey struct {
 	N *big.Int // modulus
 	E int      // public exponent
@@ -478,10 +482,6 @@
 func encrypt(pub *PublicKey, plaintext []byte) ([]byte, error) {
 	boring.Unreachable()
 
-	// Most of the CPU time for encryption and verification is spent in this
-	// NewModulusFromBig call, because PublicKey doesn't have a Precomputed
-	// field. If performance becomes an issue, consider placing a private
-	// sync.Once on PublicKey to compute this.
 	N, err := bigmod.NewModulusFromBig(pub.N)
 	if err != nil {
 		return nil, err
@@ -492,7 +492,7 @@
 	}
 	e := uint(pub.E)
 
-	return bigmod.NewNat().ExpShort(m, e, N).Bytes(N), nil
+	return bigmod.NewNat().ExpShortVarTime(m, e, N).Bytes(N), nil
 }
 
 // EncryptOAEP encrypts the given message with RSA-OAEP.
@@ -686,7 +686,7 @@
 	}
 
 	if check {
-		c1 := bigmod.NewNat().ExpShort(m, uint(priv.E), N)
+		c1 := bigmod.NewNat().ExpShortVarTime(m, uint(priv.E), N)
 		if c1.Equal(c) != 1 {
 			return nil, ErrDecryption
 		}