math/big: use optimized formula in ModSqrt for 3 mod 4 primes

For primes which are 3 mod 4, using Tonelli-Shanks is slower
and more complicated than using the identity

     a**((p+1)/4) mod p == sqrt(a)

For 2^450-2^225-1 and 2^10860-2^5430-1, which are 3 mod 4:

BenchmarkModSqrt225_TonelliTri      1000     1135375 ns/op
BenchmarkModSqrt225_3Mod4          10000      156009 ns/op
BenchmarkModSqrt5430_Tonelli           1  3448851386 ns/op
BenchmarkModSqrt5430_3Mod4             2   914616710 ns/op

~2.6x to 7x faster.

Fixes #11437 (which is a prime choice of issues to fix)

Change-Id: I813fb29454160483ec29825469e0370d517850c2
Reviewed-on: https://go-review.googlesource.com/11522
Reviewed-by: Adam Langley <agl@golang.org>
diff --git a/src/math/big/int_test.go b/src/math/big/int_test.go
index 9787462..5b80509 100644
--- a/src/math/big/int_test.go
+++ b/src/math/big/int_test.go
@@ -1185,6 +1185,53 @@
 	}
 }
 
+// tri generates the trinomial 2**(n*2) - 2**n - 1, which is always 3 mod 4 and
+// 7 mod 8, so that 2 is always a quadratic residue.
+func tri(n uint) *Int {
+	x := NewInt(1)
+	x.Lsh(x, n)
+	x2 := new(Int).Lsh(x, n)
+	x2.Sub(x2, x)
+	x2.Sub(x2, intOne)
+	return x2
+}
+
+func BenchmarkModSqrt225_Tonelli(b *testing.B) {
+	p := tri(225)
+	x := NewInt(2)
+	for i := 0; i < b.N; i++ {
+		x.SetUint64(2)
+		x.modSqrtTonelliShanks(x, p)
+	}
+}
+
+func BenchmarkModSqrt224_3Mod4(b *testing.B) {
+	p := tri(225)
+	x := new(Int).SetUint64(2)
+	for i := 0; i < b.N; i++ {
+		x.SetUint64(2)
+		x.modSqrt3Mod4Prime(x, p)
+	}
+}
+
+func BenchmarkModSqrt5430_Tonelli(b *testing.B) {
+	p := tri(5430)
+	x := new(Int).SetUint64(2)
+	for i := 0; i < b.N; i++ {
+		x.SetUint64(2)
+		x.modSqrtTonelliShanks(x, p)
+	}
+}
+
+func BenchmarkModSqrt5430_3Mod4(b *testing.B) {
+	p := tri(5430)
+	x := new(Int).SetUint64(2)
+	for i := 0; i < b.N; i++ {
+		x.SetUint64(2)
+		x.modSqrt3Mod4Prime(x, p)
+	}
+}
+
 func TestBitwise(t *testing.T) {
 	x := new(Int)
 	y := new(Int)