crypto/ecdh,crypto/internal/nistec: enable pruning of unused curves

If a program only uses ecdh.P256(), the implementation of the other
curves shouldn't end up in the binary. This mostly required moving some
operations from init() time. Small performance hit in uncompressed
Bytes/SetBytes, but not big enough to show up in higher-level
benchmarks. If it becomes a problem, we can fix it by pregenerating the
p-1 bytes representation in generate.go.

For #52182
Updates #52221

Change-Id: I64460973b59ee3df787d7e967a6c2bcbc114ba65
Reviewed-on: https://go-review.googlesource.com/c/go/+/402555
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Fernando Lobato Meeser <felobato@google.com>
Reviewed-by: Roland Shoemaker <roland@golang.org>
Run-TryBot: Filippo Valsorda <filippo@golang.org>
diff --git a/src/crypto/ecdh/ecdh_test.go b/src/crypto/ecdh/ecdh_test.go
index b27d6c9..5fd690b 100644
--- a/src/crypto/ecdh/ecdh_test.go
+++ b/src/crypto/ecdh/ecdh_test.go
@@ -12,7 +12,13 @@
 	"crypto/rand"
 	"encoding/hex"
 	"fmt"
+	"internal/testenv"
 	"io"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"regexp"
+	"strings"
 	"testing"
 
 	"golang.org/x/crypto/chacha20"
@@ -253,3 +259,71 @@
 }
 
 var zeroReader = zr{}
+
+const linkerTestProgram = `
+package main
+import "crypto/ecdh"
+import "crypto/rand"
+func main() {
+	curve := ecdh.P384()
+	key, err := curve.GenerateKey(rand.Reader)
+	if err != nil { panic(err) }
+	_, err = curve.NewPublicKey(key.PublicKey().Bytes())
+	if err != nil { panic(err) }
+	_, err = curve.NewPrivateKey(key.Bytes())
+	if err != nil { panic(err) }
+	_, err = curve.ECDH(key, key.PublicKey())
+	if err != nil { panic(err) }
+	println("OK")
+}
+`
+
+// TestLinker ensures that using one curve does not bring all other
+// implementations into the binary. This also guarantees that govulncheck can
+// avoid warning about a curve-specific vulnerability if that curve is not used.
+func TestLinker(t *testing.T) {
+	if testing.Short() {
+		t.Skip("test requires running 'go build'")
+	}
+	testenv.MustHaveGoBuild(t)
+
+	dir := t.TempDir()
+	hello := filepath.Join(dir, "hello.go")
+	err := os.WriteFile(hello, []byte(linkerTestProgram), 0664)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	run := func(args ...string) string {
+		cmd := exec.Command(args[0], args[1:]...)
+		cmd.Dir = dir
+		out, err := cmd.CombinedOutput()
+		if err != nil {
+			t.Fatalf("%v: %v\n%s", args, err, string(out))
+		}
+		return string(out)
+	}
+
+	goBin := testenv.GoToolPath(t)
+	run(goBin, "build", "-o", "hello.exe", "hello.go")
+	if out := run("./hello.exe"); out != "OK\n" {
+		t.Error("unexpected output:", out)
+	}
+
+	// List all text symbols under crypto/... and make sure there are some for
+	// P384, but none for the other curves.
+	var consistent bool
+	nm := run(goBin, "tool", "nm", "hello.exe")
+	for _, match := range regexp.MustCompile(`(?m)T (crypto/.*)$`).FindAllStringSubmatch(nm, -1) {
+		symbol := strings.ToLower(match[1])
+		if strings.Contains(symbol, "p384") {
+			consistent = true
+		}
+		if strings.Contains(symbol, "p224") || strings.Contains(symbol, "p256") || strings.Contains(symbol, "p521") {
+			t.Errorf("unexpected symbol in program using only ecdh.P384: %s", match[1])
+		}
+	}
+	if !consistent {
+		t.Error("no P384 symbols found in program using ecdh.P384, test is broken")
+	}
+}
diff --git a/src/crypto/internal/nistec/fiat/generate.go b/src/crypto/internal/nistec/fiat/generate.go
index 3b97307..db57021 100644
--- a/src/crypto/internal/nistec/fiat/generate.go
+++ b/src/crypto/internal/nistec/fiat/generate.go
@@ -182,12 +182,11 @@
 	return subtle.ConstantTimeCompare(eBytes, tBytes)
 }
 
-var {{ .Prefix }}ZeroEncoding = new({{ .Element }}).Bytes()
-
 // IsZero returns 1 if e == 0, and zero otherwise.
 func (e *{{ .Element }}) IsZero() int {
+	zero := make([]byte, {{ .Prefix }}ElementLen)
 	eBytes := e.Bytes()
-	return subtle.ConstantTimeCompare(eBytes, {{ .Prefix }}ZeroEncoding)
+	return subtle.ConstantTimeCompare(eBytes, zero)
 }
 
 // Set sets e = t, and returns e.
@@ -212,12 +211,6 @@
 	return out[:]
 }
 
-// {{ .Prefix }}MinusOneEncoding is the encoding of -1 mod p, so p - 1, the
-// highest canonical encoding. It is used by SetBytes to check for non-canonical
-// encodings such as p + k, 2p + k, etc.
-var {{ .Prefix }}MinusOneEncoding = new({{ .Element }}).Sub(
-	new({{ .Element }}), new({{ .Element }}).One()).Bytes()
-
 // SetBytes sets e = v, where v is a big-endian {{ .BytesLen }}-byte encoding, and returns e.
 // If v is not {{ .BytesLen }} bytes or it encodes a value higher than {{ .Prime }},
 // SetBytes returns nil and an error, and e is unchanged.
@@ -225,14 +218,20 @@
 	if len(v) != {{ .Prefix }}ElementLen {
 		return nil, errors.New("invalid {{ .Element }} encoding")
 	}
+
+	// Check for non-canonical encodings (p + k, 2p + k, etc.) by comparing to
+	// the encoding of -1 mod p, so p - 1, the highest canonical encoding.
+	var minusOneEncoding = new({{ .Element }}).Sub(
+		new({{ .Element }}), new({{ .Element }}).One()).Bytes()
 	for i := range v {
-		if v[i] < {{ .Prefix }}MinusOneEncoding[i] {
+		if v[i] < minusOneEncoding[i] {
 			break
 		}
-		if v[i] > {{ .Prefix }}MinusOneEncoding[i] {
+		if v[i] > minusOneEncoding[i] {
 			return nil, errors.New("invalid {{ .Element }} encoding")
 		}
 	}
+
 	var in [{{ .Prefix }}ElementLen]byte
 	copy(in[:], v)
 	{{ .Prefix }}InvertEndianness(in[:])
diff --git a/src/crypto/internal/nistec/fiat/p224.go b/src/crypto/internal/nistec/fiat/p224.go
index 4dddeb0..e1a78db 100644
--- a/src/crypto/internal/nistec/fiat/p224.go
+++ b/src/crypto/internal/nistec/fiat/p224.go
@@ -37,12 +37,11 @@
 	return subtle.ConstantTimeCompare(eBytes, tBytes)
 }
 
-var p224ZeroEncoding = new(P224Element).Bytes()
-
 // IsZero returns 1 if e == 0, and zero otherwise.
 func (e *P224Element) IsZero() int {
+	zero := make([]byte, p224ElementLen)
 	eBytes := e.Bytes()
-	return subtle.ConstantTimeCompare(eBytes, p224ZeroEncoding)
+	return subtle.ConstantTimeCompare(eBytes, zero)
 }
 
 // Set sets e = t, and returns e.
@@ -67,12 +66,6 @@
 	return out[:]
 }
 
-// p224MinusOneEncoding is the encoding of -1 mod p, so p - 1, the
-// highest canonical encoding. It is used by SetBytes to check for non-canonical
-// encodings such as p + k, 2p + k, etc.
-var p224MinusOneEncoding = new(P224Element).Sub(
-	new(P224Element), new(P224Element).One()).Bytes()
-
 // SetBytes sets e = v, where v is a big-endian 28-byte encoding, and returns e.
 // If v is not 28 bytes or it encodes a value higher than 2^224 - 2^96 + 1,
 // SetBytes returns nil and an error, and e is unchanged.
@@ -80,14 +73,20 @@
 	if len(v) != p224ElementLen {
 		return nil, errors.New("invalid P224Element encoding")
 	}
+
+	// Check for non-canonical encodings (p + k, 2p + k, etc.) by comparing to
+	// the encoding of -1 mod p, so p - 1, the highest canonical encoding.
+	var minusOneEncoding = new(P224Element).Sub(
+		new(P224Element), new(P224Element).One()).Bytes()
 	for i := range v {
-		if v[i] < p224MinusOneEncoding[i] {
+		if v[i] < minusOneEncoding[i] {
 			break
 		}
-		if v[i] > p224MinusOneEncoding[i] {
+		if v[i] > minusOneEncoding[i] {
 			return nil, errors.New("invalid P224Element encoding")
 		}
 	}
+
 	var in [p224ElementLen]byte
 	copy(in[:], v)
 	p224InvertEndianness(in[:])
diff --git a/src/crypto/internal/nistec/fiat/p224_invert.go b/src/crypto/internal/nistec/fiat/p224_invert.go
index 4163ed0..3cf5286 100644
--- a/src/crypto/internal/nistec/fiat/p224_invert.go
+++ b/src/crypto/internal/nistec/fiat/p224_invert.go
@@ -12,7 +12,7 @@
 func (e *P224Element) Invert(x *P224Element) *P224Element {
 	// Inversion is implemented as exponentiation with exponent p − 2.
 	// The sequence of 11 multiplications and 223 squarings is derived from the
-	// following addition chain generated with github.com/mmcloughlin/addchain v0.3.0.
+	// following addition chain generated with github.com/mmcloughlin/addchain v0.4.0.
 	//
 	//	_10     = 2*1
 	//	_11     = 1 + _10
diff --git a/src/crypto/internal/nistec/fiat/p256.go b/src/crypto/internal/nistec/fiat/p256.go
index dfdd0a7..7705904 100644
--- a/src/crypto/internal/nistec/fiat/p256.go
+++ b/src/crypto/internal/nistec/fiat/p256.go
@@ -37,12 +37,11 @@
 	return subtle.ConstantTimeCompare(eBytes, tBytes)
 }
 
-var p256ZeroEncoding = new(P256Element).Bytes()
-
 // IsZero returns 1 if e == 0, and zero otherwise.
 func (e *P256Element) IsZero() int {
+	zero := make([]byte, p256ElementLen)
 	eBytes := e.Bytes()
-	return subtle.ConstantTimeCompare(eBytes, p256ZeroEncoding)
+	return subtle.ConstantTimeCompare(eBytes, zero)
 }
 
 // Set sets e = t, and returns e.
@@ -67,12 +66,6 @@
 	return out[:]
 }
 
-// p256MinusOneEncoding is the encoding of -1 mod p, so p - 1, the
-// highest canonical encoding. It is used by SetBytes to check for non-canonical
-// encodings such as p + k, 2p + k, etc.
-var p256MinusOneEncoding = new(P256Element).Sub(
-	new(P256Element), new(P256Element).One()).Bytes()
-
 // SetBytes sets e = v, where v is a big-endian 32-byte encoding, and returns e.
 // If v is not 32 bytes or it encodes a value higher than 2^256 - 2^224 + 2^192 + 2^96 - 1,
 // SetBytes returns nil and an error, and e is unchanged.
@@ -80,14 +73,20 @@
 	if len(v) != p256ElementLen {
 		return nil, errors.New("invalid P256Element encoding")
 	}
+
+	// Check for non-canonical encodings (p + k, 2p + k, etc.) by comparing to
+	// the encoding of -1 mod p, so p - 1, the highest canonical encoding.
+	var minusOneEncoding = new(P256Element).Sub(
+		new(P256Element), new(P256Element).One()).Bytes()
 	for i := range v {
-		if v[i] < p256MinusOneEncoding[i] {
+		if v[i] < minusOneEncoding[i] {
 			break
 		}
-		if v[i] > p256MinusOneEncoding[i] {
+		if v[i] > minusOneEncoding[i] {
 			return nil, errors.New("invalid P256Element encoding")
 		}
 	}
+
 	var in [p256ElementLen]byte
 	copy(in[:], v)
 	p256InvertEndianness(in[:])
diff --git a/src/crypto/internal/nistec/fiat/p384.go b/src/crypto/internal/nistec/fiat/p384.go
index 5474d77..aed0c01 100644
--- a/src/crypto/internal/nistec/fiat/p384.go
+++ b/src/crypto/internal/nistec/fiat/p384.go
@@ -37,12 +37,11 @@
 	return subtle.ConstantTimeCompare(eBytes, tBytes)
 }
 
-var p384ZeroEncoding = new(P384Element).Bytes()
-
 // IsZero returns 1 if e == 0, and zero otherwise.
 func (e *P384Element) IsZero() int {
+	zero := make([]byte, p384ElementLen)
 	eBytes := e.Bytes()
-	return subtle.ConstantTimeCompare(eBytes, p384ZeroEncoding)
+	return subtle.ConstantTimeCompare(eBytes, zero)
 }
 
 // Set sets e = t, and returns e.
@@ -67,12 +66,6 @@
 	return out[:]
 }
 
-// p384MinusOneEncoding is the encoding of -1 mod p, so p - 1, the
-// highest canonical encoding. It is used by SetBytes to check for non-canonical
-// encodings such as p + k, 2p + k, etc.
-var p384MinusOneEncoding = new(P384Element).Sub(
-	new(P384Element), new(P384Element).One()).Bytes()
-
 // SetBytes sets e = v, where v is a big-endian 48-byte encoding, and returns e.
 // If v is not 48 bytes or it encodes a value higher than 2^384 - 2^128 - 2^96 + 2^32 - 1,
 // SetBytes returns nil and an error, and e is unchanged.
@@ -80,14 +73,20 @@
 	if len(v) != p384ElementLen {
 		return nil, errors.New("invalid P384Element encoding")
 	}
+
+	// Check for non-canonical encodings (p + k, 2p + k, etc.) by comparing to
+	// the encoding of -1 mod p, so p - 1, the highest canonical encoding.
+	var minusOneEncoding = new(P384Element).Sub(
+		new(P384Element), new(P384Element).One()).Bytes()
 	for i := range v {
-		if v[i] < p384MinusOneEncoding[i] {
+		if v[i] < minusOneEncoding[i] {
 			break
 		}
-		if v[i] > p384MinusOneEncoding[i] {
+		if v[i] > minusOneEncoding[i] {
 			return nil, errors.New("invalid P384Element encoding")
 		}
 	}
+
 	var in [p384ElementLen]byte
 	copy(in[:], v)
 	p384InvertEndianness(in[:])
diff --git a/src/crypto/internal/nistec/fiat/p384_invert.go b/src/crypto/internal/nistec/fiat/p384_invert.go
index 24169e9..31591ac 100644
--- a/src/crypto/internal/nistec/fiat/p384_invert.go
+++ b/src/crypto/internal/nistec/fiat/p384_invert.go
@@ -12,7 +12,7 @@
 func (e *P384Element) Invert(x *P384Element) *P384Element {
 	// Inversion is implemented as exponentiation with exponent p − 2.
 	// The sequence of 15 multiplications and 383 squarings is derived from the
-	// following addition chain generated with github.com/mmcloughlin/addchain v0.3.0.
+	// following addition chain generated with github.com/mmcloughlin/addchain v0.4.0.
 	//
 	//	_10     = 2*1
 	//	_11     = 1 + _10
diff --git a/src/crypto/internal/nistec/fiat/p521.go b/src/crypto/internal/nistec/fiat/p521.go
index 3d12117..43ac7d0 100644
--- a/src/crypto/internal/nistec/fiat/p521.go
+++ b/src/crypto/internal/nistec/fiat/p521.go
@@ -37,12 +37,11 @@
 	return subtle.ConstantTimeCompare(eBytes, tBytes)
 }
 
-var p521ZeroEncoding = new(P521Element).Bytes()
-
 // IsZero returns 1 if e == 0, and zero otherwise.
 func (e *P521Element) IsZero() int {
+	zero := make([]byte, p521ElementLen)
 	eBytes := e.Bytes()
-	return subtle.ConstantTimeCompare(eBytes, p521ZeroEncoding)
+	return subtle.ConstantTimeCompare(eBytes, zero)
 }
 
 // Set sets e = t, and returns e.
@@ -67,12 +66,6 @@
 	return out[:]
 }
 
-// p521MinusOneEncoding is the encoding of -1 mod p, so p - 1, the
-// highest canonical encoding. It is used by SetBytes to check for non-canonical
-// encodings such as p + k, 2p + k, etc.
-var p521MinusOneEncoding = new(P521Element).Sub(
-	new(P521Element), new(P521Element).One()).Bytes()
-
 // SetBytes sets e = v, where v is a big-endian 66-byte encoding, and returns e.
 // If v is not 66 bytes or it encodes a value higher than 2^521 - 1,
 // SetBytes returns nil and an error, and e is unchanged.
@@ -80,14 +73,20 @@
 	if len(v) != p521ElementLen {
 		return nil, errors.New("invalid P521Element encoding")
 	}
+
+	// Check for non-canonical encodings (p + k, 2p + k, etc.) by comparing to
+	// the encoding of -1 mod p, so p - 1, the highest canonical encoding.
+	var minusOneEncoding = new(P521Element).Sub(
+		new(P521Element), new(P521Element).One()).Bytes()
 	for i := range v {
-		if v[i] < p521MinusOneEncoding[i] {
+		if v[i] < minusOneEncoding[i] {
 			break
 		}
-		if v[i] > p521MinusOneEncoding[i] {
+		if v[i] > minusOneEncoding[i] {
 			return nil, errors.New("invalid P521Element encoding")
 		}
 	}
+
 	var in [p521ElementLen]byte
 	copy(in[:], v)
 	p521InvertEndianness(in[:])
diff --git a/src/crypto/internal/nistec/fiat/p521_invert.go b/src/crypto/internal/nistec/fiat/p521_invert.go
index 407711a..16c53e1 100644
--- a/src/crypto/internal/nistec/fiat/p521_invert.go
+++ b/src/crypto/internal/nistec/fiat/p521_invert.go
@@ -12,7 +12,7 @@
 func (e *P521Element) Invert(x *P521Element) *P521Element {
 	// Inversion is implemented as exponentiation with exponent p − 2.
 	// The sequence of 13 multiplications and 520 squarings is derived from the
-	// following addition chain generated with github.com/mmcloughlin/addchain v0.3.0.
+	// following addition chain generated with github.com/mmcloughlin/addchain v0.4.0.
 	//
 	//	_10       = 2*1
 	//	_11       = 1 + _10
diff --git a/src/crypto/internal/nistec/generate.go b/src/crypto/internal/nistec/generate.go
index 2c42eb9..0204bc1 100644
--- a/src/crypto/internal/nistec/generate.go
+++ b/src/crypto/internal/nistec/generate.go
@@ -73,7 +73,8 @@
 		p := strings.ToLower(c.P)
 		elementLen := (c.Params.BitSize + 7) / 8
 		B := fmt.Sprintf("%#v", c.Params.B.FillBytes(make([]byte, elementLen)))
-		G := fmt.Sprintf("%#v", elliptic.Marshal(c.Params, c.Params.Gx, c.Params.Gy))
+		Gx := fmt.Sprintf("%#v", c.Params.Gx.FillBytes(make([]byte, elementLen)))
+		Gy := fmt.Sprintf("%#v", c.Params.Gy.FillBytes(make([]byte, elementLen)))
 
 		log.Printf("Generating %s.go...", p)
 		f, err := os.Create(p + ".go")
@@ -83,7 +84,7 @@
 		defer f.Close()
 		buf := &bytes.Buffer{}
 		if err := t.Execute(buf, map[string]interface{}{
-			"P": c.P, "p": p, "B": B, "G": G,
+			"P": c.P, "p": p, "B": B, "Gx": Gx, "Gy": Gy,
 			"Element": c.Element, "ElementLen": elementLen,
 			"BuildTags": c.BuildTags,
 		}); err != nil {
@@ -157,10 +158,6 @@
 	"sync"
 )
 
-var {{.p}}B, _ = new({{.Element}}).SetBytes({{.B}})
-
-var {{.p}}G, _ = New{{.P}}Point().SetBytes({{.G}})
-
 // {{.p}}ElementLength is the length of an element of the base or scalar field,
 // which have the same bytes length for all NIST P curves.
 const {{.p}}ElementLength = {{ .ElementLen }}
@@ -181,13 +178,12 @@
 	}
 }
 
-// New{{.P}}Generator returns a new {{.P}}Point set to the canonical generator.
-func New{{.P}}Generator() *{{.P}}Point {
-	return (&{{.P}}Point{
-		x: new({{.Element}}),
-		y: new({{.Element}}),
-		z: new({{.Element}}),
-	}).Set({{.p}}G)
+// SetGenerator sets p to the canonical generator and returns p.
+func (p *{{.P}}Point) SetGenerator() *{{.P}}Point {
+	p.x.SetBytes({{.Gx}})
+	p.y.SetBytes({{.Gy}})
+	p.z.One()
+	return p
 }
 
 // Set sets p = q and returns p.
@@ -256,6 +252,17 @@
 	}
 }
 
+
+var _{{.p}}B *{{.Element}}
+var _{{.p}}BOnce sync.Once
+
+func {{.p}}B() *{{.Element}} {
+	_{{.p}}BOnce.Do(func() {
+		_{{.p}}B, _ = new({{.Element}}).SetBytes({{.B}})
+	})
+	return _{{.p}}B
+}
+
 // {{.p}}Polynomial sets y2 to x³ - 3x + b, and returns y2.
 func {{.p}}Polynomial(y2, x *{{.Element}}) *{{.Element}} {
 	y2.Square(x)
@@ -263,9 +270,9 @@
 
 	threeX := new({{.Element}}).Add(x, x)
 	threeX.Add(threeX, x)
-
 	y2.Sub(y2, threeX)
-	return y2.Add(y2, {{.p}}B)
+
+	return y2.Add(y2, {{.p}}B())
 }
 
 func {{.p}}CheckOnCurve(x, y *{{.Element}}) error {
@@ -373,13 +380,13 @@
 	x3.Mul(x3, y3)                            // X3 := X3 * Y3
 	y3.Add(t0, t2)                            // Y3 := t0 + t2
 	y3.Sub(x3, y3)                            // Y3 := X3 - Y3
-	z3 := new({{.Element}}).Mul({{.p}}B, t2)  // Z3 := b * t2
+	z3 := new({{.Element}}).Mul({{.p}}B(), t2)  // Z3 := b * t2
 	x3.Sub(y3, z3)                            // X3 := Y3 - Z3
 	z3.Add(x3, x3)                            // Z3 := X3 + X3
 	x3.Add(x3, z3)                            // X3 := X3 + Z3
 	z3.Sub(t1, x3)                            // Z3 := t1 - X3
 	x3.Add(t1, x3)                            // X3 := t1 + X3
-	y3.Mul({{.p}}B, y3)                       // Y3 := b * Y3
+	y3.Mul({{.p}}B(), y3)                     // Y3 := b * Y3
 	t1.Add(t2, t2)                            // t1 := t2 + t2
 	t2.Add(t1, t2)                            // t2 := t1 + t2
 	y3.Sub(y3, t2)                            // Y3 := Y3 - t2
@@ -417,7 +424,7 @@
 	t3.Add(t3, t3)                           // t3 := t3 + t3
 	z3 := new({{.Element}}).Mul(p.x, p.z)    // Z3 := X * Z
 	z3.Add(z3, z3)                           // Z3 := Z3 + Z3
-	y3 := new({{.Element}}).Mul({{.p}}B, t2) // Y3 := b * t2
+	y3 := new({{.Element}}).Mul({{.p}}B(), t2) // Y3 := b * t2
 	y3.Sub(y3, z3)                           // Y3 := Y3 - Z3
 	x3 := new({{.Element}}).Add(y3, y3)      // X3 := Y3 + Y3
 	y3.Add(x3, y3)                           // Y3 := X3 + Y3
@@ -427,7 +434,7 @@
 	x3.Mul(x3, t3)                           // X3 := X3 * t3
 	t3.Add(t2, t2)                           // t3 := t2 + t2
 	t2.Add(t2, t3)                           // t2 := t2 + t3
-	z3.Mul({{.p}}B, z3)                      // Z3 := b * Z3
+	z3.Mul({{.p}}B(), z3)                    // Z3 := b * Z3
 	z3.Sub(z3, t2)                           // Z3 := Z3 - t2
 	z3.Sub(z3, t0)                           // Z3 := Z3 - t0
 	t3.Add(z3, z3)                           // t3 := Z3 + Z3
@@ -531,7 +538,7 @@
 func (p *{{.P}}Point) generatorTable() *[{{.p}}ElementLength * 2]{{.p}}Table {
 	{{.p}}GeneratorTableOnce.Do(func() {
 		{{.p}}GeneratorTable = new([{{.p}}ElementLength * 2]{{.p}}Table)
-		base := New{{.P}}Generator()
+		base := New{{.P}}Point().SetGenerator()
 		for i := 0; i < {{.p}}ElementLength*2; i++ {
 			{{.p}}GeneratorTable[i][0] = New{{.P}}Point().Set(base)
 			for j := 1; j < 15; j++ {
diff --git a/src/crypto/internal/nistec/nistec_test.go b/src/crypto/internal/nistec/nistec_test.go
index adddab2..309f68b 100644
--- a/src/crypto/internal/nistec/nistec_test.go
+++ b/src/crypto/internal/nistec/nistec_test.go
@@ -19,7 +19,7 @@
 
 	t.Run("P224", func(t *testing.T) {
 		if allocs := testing.AllocsPerRun(100, func() {
-			p := nistec.NewP224Generator()
+			p := nistec.NewP224Point().SetGenerator()
 			scalar := make([]byte, 28)
 			rand.Read(scalar)
 			p.ScalarBaseMult(scalar)
@@ -38,7 +38,7 @@
 	})
 	t.Run("P256", func(t *testing.T) {
 		if allocs := testing.AllocsPerRun(100, func() {
-			p := nistec.NewP256Generator()
+			p := nistec.NewP256Point().SetGenerator()
 			scalar := make([]byte, 32)
 			rand.Read(scalar)
 			p.ScalarBaseMult(scalar)
@@ -57,7 +57,7 @@
 	})
 	t.Run("P384", func(t *testing.T) {
 		if allocs := testing.AllocsPerRun(100, func() {
-			p := nistec.NewP384Generator()
+			p := nistec.NewP384Point().SetGenerator()
 			scalar := make([]byte, 48)
 			rand.Read(scalar)
 			p.ScalarBaseMult(scalar)
@@ -76,7 +76,7 @@
 	})
 	t.Run("P521", func(t *testing.T) {
 		if allocs := testing.AllocsPerRun(100, func() {
-			p := nistec.NewP521Generator()
+			p := nistec.NewP521Point().SetGenerator()
 			scalar := make([]byte, 66)
 			rand.Read(scalar)
 			p.ScalarBaseMult(scalar)
@@ -97,6 +97,7 @@
 
 type nistPoint[T any] interface {
 	Bytes() []byte
+	SetGenerator() T
 	SetBytes([]byte) (T, error)
 	Add(T, T) T
 	Double(T) T
@@ -106,21 +107,21 @@
 
 func TestEquivalents(t *testing.T) {
 	t.Run("P224", func(t *testing.T) {
-		testEquivalents(t, nistec.NewP224Point, nistec.NewP224Generator, elliptic.P224())
+		testEquivalents(t, nistec.NewP224Point, elliptic.P224())
 	})
 	t.Run("P256", func(t *testing.T) {
-		testEquivalents(t, nistec.NewP256Point, nistec.NewP256Generator, elliptic.P256())
+		testEquivalents(t, nistec.NewP256Point, elliptic.P256())
 	})
 	t.Run("P384", func(t *testing.T) {
-		testEquivalents(t, nistec.NewP384Point, nistec.NewP384Generator, elliptic.P384())
+		testEquivalents(t, nistec.NewP384Point, elliptic.P384())
 	})
 	t.Run("P521", func(t *testing.T) {
-		testEquivalents(t, nistec.NewP521Point, nistec.NewP521Generator, elliptic.P521())
+		testEquivalents(t, nistec.NewP521Point, elliptic.P521())
 	})
 }
 
-func testEquivalents[P nistPoint[P]](t *testing.T, newPoint, newGenerator func() P, c elliptic.Curve) {
-	p := newGenerator()
+func testEquivalents[P nistPoint[P]](t *testing.T, newPoint func() P, c elliptic.Curve) {
+	p := newPoint().SetGenerator()
 
 	elementSize := (c.Params().BitSize + 7) / 8
 	two := make([]byte, elementSize)
@@ -166,16 +167,16 @@
 
 func BenchmarkScalarMult(b *testing.B) {
 	b.Run("P224", func(b *testing.B) {
-		benchmarkScalarMult(b, nistec.NewP224Generator(), 28)
+		benchmarkScalarMult(b, nistec.NewP224Point().SetGenerator(), 28)
 	})
 	b.Run("P256", func(b *testing.B) {
-		benchmarkScalarMult(b, nistec.NewP256Generator(), 32)
+		benchmarkScalarMult(b, nistec.NewP256Point().SetGenerator(), 32)
 	})
 	b.Run("P384", func(b *testing.B) {
-		benchmarkScalarMult(b, nistec.NewP384Generator(), 48)
+		benchmarkScalarMult(b, nistec.NewP384Point().SetGenerator(), 48)
 	})
 	b.Run("P521", func(b *testing.B) {
-		benchmarkScalarMult(b, nistec.NewP521Generator(), 66)
+		benchmarkScalarMult(b, nistec.NewP521Point().SetGenerator(), 66)
 	})
 }
 
@@ -191,16 +192,16 @@
 
 func BenchmarkScalarBaseMult(b *testing.B) {
 	b.Run("P224", func(b *testing.B) {
-		benchmarkScalarBaseMult(b, nistec.NewP224Generator(), 28)
+		benchmarkScalarBaseMult(b, nistec.NewP224Point().SetGenerator(), 28)
 	})
 	b.Run("P256", func(b *testing.B) {
-		benchmarkScalarBaseMult(b, nistec.NewP256Generator(), 32)
+		benchmarkScalarBaseMult(b, nistec.NewP256Point().SetGenerator(), 32)
 	})
 	b.Run("P384", func(b *testing.B) {
-		benchmarkScalarBaseMult(b, nistec.NewP384Generator(), 48)
+		benchmarkScalarBaseMult(b, nistec.NewP384Point().SetGenerator(), 48)
 	})
 	b.Run("P521", func(b *testing.B) {
-		benchmarkScalarBaseMult(b, nistec.NewP521Generator(), 66)
+		benchmarkScalarBaseMult(b, nistec.NewP521Point().SetGenerator(), 66)
 	})
 }
 
diff --git a/src/crypto/internal/nistec/p224.go b/src/crypto/internal/nistec/p224.go
index 18b43ea..faa971d 100644
--- a/src/crypto/internal/nistec/p224.go
+++ b/src/crypto/internal/nistec/p224.go
@@ -13,10 +13,6 @@
 	"sync"
 )
 
-var p224B, _ = new(fiat.P224Element).SetBytes([]byte{0xb4, 0x5, 0xa, 0x85, 0xc, 0x4, 0xb3, 0xab, 0xf5, 0x41, 0x32, 0x56, 0x50, 0x44, 0xb0, 0xb7, 0xd7, 0xbf, 0xd8, 0xba, 0x27, 0xb, 0x39, 0x43, 0x23, 0x55, 0xff, 0xb4})
-
-var p224G, _ = NewP224Point().SetBytes([]byte{0x4, 0xb7, 0xe, 0xc, 0xbd, 0x6b, 0xb4, 0xbf, 0x7f, 0x32, 0x13, 0x90, 0xb9, 0x4a, 0x3, 0xc1, 0xd3, 0x56, 0xc2, 0x11, 0x22, 0x34, 0x32, 0x80, 0xd6, 0x11, 0x5c, 0x1d, 0x21, 0xbd, 0x37, 0x63, 0x88, 0xb5, 0xf7, 0x23, 0xfb, 0x4c, 0x22, 0xdf, 0xe6, 0xcd, 0x43, 0x75, 0xa0, 0x5a, 0x7, 0x47, 0x64, 0x44, 0xd5, 0x81, 0x99, 0x85, 0x0, 0x7e, 0x34})
-
 // p224ElementLength is the length of an element of the base or scalar field,
 // which have the same bytes length for all NIST P curves.
 const p224ElementLength = 28
@@ -37,13 +33,12 @@
 	}
 }
 
-// NewP224Generator returns a new P224Point set to the canonical generator.
-func NewP224Generator() *P224Point {
-	return (&P224Point{
-		x: new(fiat.P224Element),
-		y: new(fiat.P224Element),
-		z: new(fiat.P224Element),
-	}).Set(p224G)
+// SetGenerator sets p to the canonical generator and returns p.
+func (p *P224Point) SetGenerator() *P224Point {
+	p.x.SetBytes([]byte{0xb7, 0xe, 0xc, 0xbd, 0x6b, 0xb4, 0xbf, 0x7f, 0x32, 0x13, 0x90, 0xb9, 0x4a, 0x3, 0xc1, 0xd3, 0x56, 0xc2, 0x11, 0x22, 0x34, 0x32, 0x80, 0xd6, 0x11, 0x5c, 0x1d, 0x21})
+	p.y.SetBytes([]byte{0xbd, 0x37, 0x63, 0x88, 0xb5, 0xf7, 0x23, 0xfb, 0x4c, 0x22, 0xdf, 0xe6, 0xcd, 0x43, 0x75, 0xa0, 0x5a, 0x7, 0x47, 0x64, 0x44, 0xd5, 0x81, 0x99, 0x85, 0x0, 0x7e, 0x34})
+	p.z.One()
+	return p
 }
 
 // Set sets p = q and returns p.
@@ -112,6 +107,16 @@
 	}
 }
 
+var _p224B *fiat.P224Element
+var _p224BOnce sync.Once
+
+func p224B() *fiat.P224Element {
+	_p224BOnce.Do(func() {
+		_p224B, _ = new(fiat.P224Element).SetBytes([]byte{0xb4, 0x5, 0xa, 0x85, 0xc, 0x4, 0xb3, 0xab, 0xf5, 0x41, 0x32, 0x56, 0x50, 0x44, 0xb0, 0xb7, 0xd7, 0xbf, 0xd8, 0xba, 0x27, 0xb, 0x39, 0x43, 0x23, 0x55, 0xff, 0xb4})
+	})
+	return _p224B
+}
+
 // p224Polynomial sets y2 to x³ - 3x + b, and returns y2.
 func p224Polynomial(y2, x *fiat.P224Element) *fiat.P224Element {
 	y2.Square(x)
@@ -119,9 +124,9 @@
 
 	threeX := new(fiat.P224Element).Add(x, x)
 	threeX.Add(threeX, x)
-
 	y2.Sub(y2, threeX)
-	return y2.Add(y2, p224B)
+
+	return y2.Add(y2, p224B())
 }
 
 func p224CheckOnCurve(x, y *fiat.P224Element) error {
@@ -211,49 +216,49 @@
 	// Complete addition formula for a = -3 from "Complete addition formulas for
 	// prime order elliptic curves" (https://eprint.iacr.org/2015/1060), §A.2.
 
-	t0 := new(fiat.P224Element).Mul(p1.x, p2.x) // t0 := X1 * X2
-	t1 := new(fiat.P224Element).Mul(p1.y, p2.y) // t1 := Y1 * Y2
-	t2 := new(fiat.P224Element).Mul(p1.z, p2.z) // t2 := Z1 * Z2
-	t3 := new(fiat.P224Element).Add(p1.x, p1.y) // t3 := X1 + Y1
-	t4 := new(fiat.P224Element).Add(p2.x, p2.y) // t4 := X2 + Y2
-	t3.Mul(t3, t4)                              // t3 := t3 * t4
-	t4.Add(t0, t1)                              // t4 := t0 + t1
-	t3.Sub(t3, t4)                              // t3 := t3 - t4
-	t4.Add(p1.y, p1.z)                          // t4 := Y1 + Z1
-	x3 := new(fiat.P224Element).Add(p2.y, p2.z) // X3 := Y2 + Z2
-	t4.Mul(t4, x3)                              // t4 := t4 * X3
-	x3.Add(t1, t2)                              // X3 := t1 + t2
-	t4.Sub(t4, x3)                              // t4 := t4 - X3
-	x3.Add(p1.x, p1.z)                          // X3 := X1 + Z1
-	y3 := new(fiat.P224Element).Add(p2.x, p2.z) // Y3 := X2 + Z2
-	x3.Mul(x3, y3)                              // X3 := X3 * Y3
-	y3.Add(t0, t2)                              // Y3 := t0 + t2
-	y3.Sub(x3, y3)                              // Y3 := X3 - Y3
-	z3 := new(fiat.P224Element).Mul(p224B, t2)  // Z3 := b * t2
-	x3.Sub(y3, z3)                              // X3 := Y3 - Z3
-	z3.Add(x3, x3)                              // Z3 := X3 + X3
-	x3.Add(x3, z3)                              // X3 := X3 + Z3
-	z3.Sub(t1, x3)                              // Z3 := t1 - X3
-	x3.Add(t1, x3)                              // X3 := t1 + X3
-	y3.Mul(p224B, y3)                           // Y3 := b * Y3
-	t1.Add(t2, t2)                              // t1 := t2 + t2
-	t2.Add(t1, t2)                              // t2 := t1 + t2
-	y3.Sub(y3, t2)                              // Y3 := Y3 - t2
-	y3.Sub(y3, t0)                              // Y3 := Y3 - t0
-	t1.Add(y3, y3)                              // t1 := Y3 + Y3
-	y3.Add(t1, y3)                              // Y3 := t1 + Y3
-	t1.Add(t0, t0)                              // t1 := t0 + t0
-	t0.Add(t1, t0)                              // t0 := t1 + t0
-	t0.Sub(t0, t2)                              // t0 := t0 - t2
-	t1.Mul(t4, y3)                              // t1 := t4 * Y3
-	t2.Mul(t0, y3)                              // t2 := t0 * Y3
-	y3.Mul(x3, z3)                              // Y3 := X3 * Z3
-	y3.Add(y3, t2)                              // Y3 := Y3 + t2
-	x3.Mul(t3, x3)                              // X3 := t3 * X3
-	x3.Sub(x3, t1)                              // X3 := X3 - t1
-	z3.Mul(t4, z3)                              // Z3 := t4 * Z3
-	t1.Mul(t3, t0)                              // t1 := t3 * t0
-	z3.Add(z3, t1)                              // Z3 := Z3 + t1
+	t0 := new(fiat.P224Element).Mul(p1.x, p2.x)  // t0 := X1 * X2
+	t1 := new(fiat.P224Element).Mul(p1.y, p2.y)  // t1 := Y1 * Y2
+	t2 := new(fiat.P224Element).Mul(p1.z, p2.z)  // t2 := Z1 * Z2
+	t3 := new(fiat.P224Element).Add(p1.x, p1.y)  // t3 := X1 + Y1
+	t4 := new(fiat.P224Element).Add(p2.x, p2.y)  // t4 := X2 + Y2
+	t3.Mul(t3, t4)                               // t3 := t3 * t4
+	t4.Add(t0, t1)                               // t4 := t0 + t1
+	t3.Sub(t3, t4)                               // t3 := t3 - t4
+	t4.Add(p1.y, p1.z)                           // t4 := Y1 + Z1
+	x3 := new(fiat.P224Element).Add(p2.y, p2.z)  // X3 := Y2 + Z2
+	t4.Mul(t4, x3)                               // t4 := t4 * X3
+	x3.Add(t1, t2)                               // X3 := t1 + t2
+	t4.Sub(t4, x3)                               // t4 := t4 - X3
+	x3.Add(p1.x, p1.z)                           // X3 := X1 + Z1
+	y3 := new(fiat.P224Element).Add(p2.x, p2.z)  // Y3 := X2 + Z2
+	x3.Mul(x3, y3)                               // X3 := X3 * Y3
+	y3.Add(t0, t2)                               // Y3 := t0 + t2
+	y3.Sub(x3, y3)                               // Y3 := X3 - Y3
+	z3 := new(fiat.P224Element).Mul(p224B(), t2) // Z3 := b * t2
+	x3.Sub(y3, z3)                               // X3 := Y3 - Z3
+	z3.Add(x3, x3)                               // Z3 := X3 + X3
+	x3.Add(x3, z3)                               // X3 := X3 + Z3
+	z3.Sub(t1, x3)                               // Z3 := t1 - X3
+	x3.Add(t1, x3)                               // X3 := t1 + X3
+	y3.Mul(p224B(), y3)                          // Y3 := b * Y3
+	t1.Add(t2, t2)                               // t1 := t2 + t2
+	t2.Add(t1, t2)                               // t2 := t1 + t2
+	y3.Sub(y3, t2)                               // Y3 := Y3 - t2
+	y3.Sub(y3, t0)                               // Y3 := Y3 - t0
+	t1.Add(y3, y3)                               // t1 := Y3 + Y3
+	y3.Add(t1, y3)                               // Y3 := t1 + Y3
+	t1.Add(t0, t0)                               // t1 := t0 + t0
+	t0.Add(t1, t0)                               // t0 := t1 + t0
+	t0.Sub(t0, t2)                               // t0 := t0 - t2
+	t1.Mul(t4, y3)                               // t1 := t4 * Y3
+	t2.Mul(t0, y3)                               // t2 := t0 * Y3
+	y3.Mul(x3, z3)                               // Y3 := X3 * Z3
+	y3.Add(y3, t2)                               // Y3 := Y3 + t2
+	x3.Mul(t3, x3)                               // X3 := t3 * X3
+	x3.Sub(x3, t1)                               // X3 := X3 - t1
+	z3.Mul(t4, z3)                               // Z3 := t4 * Z3
+	t1.Mul(t3, t0)                               // t1 := t3 * t0
+	z3.Add(z3, t1)                               // Z3 := Z3 + t1
 
 	q.x.Set(x3)
 	q.y.Set(y3)
@@ -266,40 +271,40 @@
 	// Complete addition formula for a = -3 from "Complete addition formulas for
 	// prime order elliptic curves" (https://eprint.iacr.org/2015/1060), §A.2.
 
-	t0 := new(fiat.P224Element).Square(p.x)    // t0 := X ^ 2
-	t1 := new(fiat.P224Element).Square(p.y)    // t1 := Y ^ 2
-	t2 := new(fiat.P224Element).Square(p.z)    // t2 := Z ^ 2
-	t3 := new(fiat.P224Element).Mul(p.x, p.y)  // t3 := X * Y
-	t3.Add(t3, t3)                             // t3 := t3 + t3
-	z3 := new(fiat.P224Element).Mul(p.x, p.z)  // Z3 := X * Z
-	z3.Add(z3, z3)                             // Z3 := Z3 + Z3
-	y3 := new(fiat.P224Element).Mul(p224B, t2) // Y3 := b * t2
-	y3.Sub(y3, z3)                             // Y3 := Y3 - Z3
-	x3 := new(fiat.P224Element).Add(y3, y3)    // X3 := Y3 + Y3
-	y3.Add(x3, y3)                             // Y3 := X3 + Y3
-	x3.Sub(t1, y3)                             // X3 := t1 - Y3
-	y3.Add(t1, y3)                             // Y3 := t1 + Y3
-	y3.Mul(x3, y3)                             // Y3 := X3 * Y3
-	x3.Mul(x3, t3)                             // X3 := X3 * t3
-	t3.Add(t2, t2)                             // t3 := t2 + t2
-	t2.Add(t2, t3)                             // t2 := t2 + t3
-	z3.Mul(p224B, z3)                          // Z3 := b * Z3
-	z3.Sub(z3, t2)                             // Z3 := Z3 - t2
-	z3.Sub(z3, t0)                             // Z3 := Z3 - t0
-	t3.Add(z3, z3)                             // t3 := Z3 + Z3
-	z3.Add(z3, t3)                             // Z3 := Z3 + t3
-	t3.Add(t0, t0)                             // t3 := t0 + t0
-	t0.Add(t3, t0)                             // t0 := t3 + t0
-	t0.Sub(t0, t2)                             // t0 := t0 - t2
-	t0.Mul(t0, z3)                             // t0 := t0 * Z3
-	y3.Add(y3, t0)                             // Y3 := Y3 + t0
-	t0.Mul(p.y, p.z)                           // t0 := Y * Z
-	t0.Add(t0, t0)                             // t0 := t0 + t0
-	z3.Mul(t0, z3)                             // Z3 := t0 * Z3
-	x3.Sub(x3, z3)                             // X3 := X3 - Z3
-	z3.Mul(t0, t1)                             // Z3 := t0 * t1
-	z3.Add(z3, z3)                             // Z3 := Z3 + Z3
-	z3.Add(z3, z3)                             // Z3 := Z3 + Z3
+	t0 := new(fiat.P224Element).Square(p.x)      // t0 := X ^ 2
+	t1 := new(fiat.P224Element).Square(p.y)      // t1 := Y ^ 2
+	t2 := new(fiat.P224Element).Square(p.z)      // t2 := Z ^ 2
+	t3 := new(fiat.P224Element).Mul(p.x, p.y)    // t3 := X * Y
+	t3.Add(t3, t3)                               // t3 := t3 + t3
+	z3 := new(fiat.P224Element).Mul(p.x, p.z)    // Z3 := X * Z
+	z3.Add(z3, z3)                               // Z3 := Z3 + Z3
+	y3 := new(fiat.P224Element).Mul(p224B(), t2) // Y3 := b * t2
+	y3.Sub(y3, z3)                               // Y3 := Y3 - Z3
+	x3 := new(fiat.P224Element).Add(y3, y3)      // X3 := Y3 + Y3
+	y3.Add(x3, y3)                               // Y3 := X3 + Y3
+	x3.Sub(t1, y3)                               // X3 := t1 - Y3
+	y3.Add(t1, y3)                               // Y3 := t1 + Y3
+	y3.Mul(x3, y3)                               // Y3 := X3 * Y3
+	x3.Mul(x3, t3)                               // X3 := X3 * t3
+	t3.Add(t2, t2)                               // t3 := t2 + t2
+	t2.Add(t2, t3)                               // t2 := t2 + t3
+	z3.Mul(p224B(), z3)                          // Z3 := b * Z3
+	z3.Sub(z3, t2)                               // Z3 := Z3 - t2
+	z3.Sub(z3, t0)                               // Z3 := Z3 - t0
+	t3.Add(z3, z3)                               // t3 := Z3 + Z3
+	z3.Add(z3, t3)                               // Z3 := Z3 + t3
+	t3.Add(t0, t0)                               // t3 := t0 + t0
+	t0.Add(t3, t0)                               // t0 := t3 + t0
+	t0.Sub(t0, t2)                               // t0 := t0 - t2
+	t0.Mul(t0, z3)                               // t0 := t0 * Z3
+	y3.Add(y3, t0)                               // Y3 := Y3 + t0
+	t0.Mul(p.y, p.z)                             // t0 := Y * Z
+	t0.Add(t0, t0)                               // t0 := t0 + t0
+	z3.Mul(t0, z3)                               // Z3 := t0 * Z3
+	x3.Sub(x3, z3)                               // X3 := X3 - Z3
+	z3.Mul(t0, t1)                               // Z3 := t0 * t1
+	z3.Add(z3, z3)                               // Z3 := Z3 + Z3
+	z3.Add(z3, z3)                               // Z3 := Z3 + Z3
 
 	q.x.Set(x3)
 	q.y.Set(y3)
@@ -387,7 +392,7 @@
 func (p *P224Point) generatorTable() *[p224ElementLength * 2]p224Table {
 	p224GeneratorTableOnce.Do(func() {
 		p224GeneratorTable = new([p224ElementLength * 2]p224Table)
-		base := NewP224Generator()
+		base := NewP224Point().SetGenerator()
 		for i := 0; i < p224ElementLength*2; i++ {
 			p224GeneratorTable[i][0] = NewP224Point().Set(base)
 			for j := 1; j < 15; j++ {
diff --git a/src/crypto/internal/nistec/p224_sqrt.go b/src/crypto/internal/nistec/p224_sqrt.go
index 9a35cea..0c77579 100644
--- a/src/crypto/internal/nistec/p224_sqrt.go
+++ b/src/crypto/internal/nistec/p224_sqrt.go
@@ -12,9 +12,6 @@
 var p224GG *[96]fiat.P224Element
 var p224GGOnce sync.Once
 
-var p224MinusOne = new(fiat.P224Element).Sub(
-	new(fiat.P224Element), new(fiat.P224Element).One())
-
 // p224SqrtCandidate sets r to a square root candidate for x. r and x must not overlap.
 func p224SqrtCandidate(r, x *fiat.P224Element) {
 	// Since p = 1 mod 4, we can't use the exponentiation by (p + 1) / 4 like
@@ -120,6 +117,9 @@
 	//         v <- v*GG[n-i]
 	//         r <- r*GG[n-i-1]
 
+	var p224MinusOne = new(fiat.P224Element).Sub(
+		new(fiat.P224Element), new(fiat.P224Element).One())
+
 	for i := 96 - 1; i >= 1; i-- {
 		w := new(fiat.P224Element).Set(v)
 		for j := 0; j < i-1; j++ {
diff --git a/src/crypto/internal/nistec/p256.go b/src/crypto/internal/nistec/p256.go
index c836c2a..3cfa5fb 100644
--- a/src/crypto/internal/nistec/p256.go
+++ b/src/crypto/internal/nistec/p256.go
@@ -15,10 +15,6 @@
 	"sync"
 )
 
-var p256B, _ = new(fiat.P256Element).SetBytes([]byte{0x5a, 0xc6, 0x35, 0xd8, 0xaa, 0x3a, 0x93, 0xe7, 0xb3, 0xeb, 0xbd, 0x55, 0x76, 0x98, 0x86, 0xbc, 0x65, 0x1d, 0x6, 0xb0, 0xcc, 0x53, 0xb0, 0xf6, 0x3b, 0xce, 0x3c, 0x3e, 0x27, 0xd2, 0x60, 0x4b})
-
-var p256G, _ = NewP256Point().SetBytes([]byte{0x4, 0x6b, 0x17, 0xd1, 0xf2, 0xe1, 0x2c, 0x42, 0x47, 0xf8, 0xbc, 0xe6, 0xe5, 0x63, 0xa4, 0x40, 0xf2, 0x77, 0x3, 0x7d, 0x81, 0x2d, 0xeb, 0x33, 0xa0, 0xf4, 0xa1, 0x39, 0x45, 0xd8, 0x98, 0xc2, 0x96, 0x4f, 0xe3, 0x42, 0xe2, 0xfe, 0x1a, 0x7f, 0x9b, 0x8e, 0xe7, 0xeb, 0x4a, 0x7c, 0xf, 0x9e, 0x16, 0x2b, 0xce, 0x33, 0x57, 0x6b, 0x31, 0x5e, 0xce, 0xcb, 0xb6, 0x40, 0x68, 0x37, 0xbf, 0x51, 0xf5})
-
 // p256ElementLength is the length of an element of the base or scalar field,
 // which have the same bytes length for all NIST P curves.
 const p256ElementLength = 32
@@ -39,13 +35,12 @@
 	}
 }
 
-// NewP256Generator returns a new P256Point set to the canonical generator.
-func NewP256Generator() *P256Point {
-	return (&P256Point{
-		x: new(fiat.P256Element),
-		y: new(fiat.P256Element),
-		z: new(fiat.P256Element),
-	}).Set(p256G)
+// SetGenerator sets p to the canonical generator and returns p.
+func (p *P256Point) SetGenerator() *P256Point {
+	p.x.SetBytes([]byte{0x6b, 0x17, 0xd1, 0xf2, 0xe1, 0x2c, 0x42, 0x47, 0xf8, 0xbc, 0xe6, 0xe5, 0x63, 0xa4, 0x40, 0xf2, 0x77, 0x3, 0x7d, 0x81, 0x2d, 0xeb, 0x33, 0xa0, 0xf4, 0xa1, 0x39, 0x45, 0xd8, 0x98, 0xc2, 0x96})
+	p.y.SetBytes([]byte{0x4f, 0xe3, 0x42, 0xe2, 0xfe, 0x1a, 0x7f, 0x9b, 0x8e, 0xe7, 0xeb, 0x4a, 0x7c, 0xf, 0x9e, 0x16, 0x2b, 0xce, 0x33, 0x57, 0x6b, 0x31, 0x5e, 0xce, 0xcb, 0xb6, 0x40, 0x68, 0x37, 0xbf, 0x51, 0xf5})
+	p.z.One()
+	return p
 }
 
 // Set sets p = q and returns p.
@@ -114,6 +109,16 @@
 	}
 }
 
+var _p256B *fiat.P256Element
+var _p256BOnce sync.Once
+
+func p256B() *fiat.P256Element {
+	_p256BOnce.Do(func() {
+		_p256B, _ = new(fiat.P256Element).SetBytes([]byte{0x5a, 0xc6, 0x35, 0xd8, 0xaa, 0x3a, 0x93, 0xe7, 0xb3, 0xeb, 0xbd, 0x55, 0x76, 0x98, 0x86, 0xbc, 0x65, 0x1d, 0x6, 0xb0, 0xcc, 0x53, 0xb0, 0xf6, 0x3b, 0xce, 0x3c, 0x3e, 0x27, 0xd2, 0x60, 0x4b})
+	})
+	return _p256B
+}
+
 // p256Polynomial sets y2 to x³ - 3x + b, and returns y2.
 func p256Polynomial(y2, x *fiat.P256Element) *fiat.P256Element {
 	y2.Square(x)
@@ -121,9 +126,9 @@
 
 	threeX := new(fiat.P256Element).Add(x, x)
 	threeX.Add(threeX, x)
-
 	y2.Sub(y2, threeX)
-	return y2.Add(y2, p256B)
+
+	return y2.Add(y2, p256B())
 }
 
 func p256CheckOnCurve(x, y *fiat.P256Element) error {
@@ -213,49 +218,49 @@
 	// Complete addition formula for a = -3 from "Complete addition formulas for
 	// prime order elliptic curves" (https://eprint.iacr.org/2015/1060), §A.2.
 
-	t0 := new(fiat.P256Element).Mul(p1.x, p2.x) // t0 := X1 * X2
-	t1 := new(fiat.P256Element).Mul(p1.y, p2.y) // t1 := Y1 * Y2
-	t2 := new(fiat.P256Element).Mul(p1.z, p2.z) // t2 := Z1 * Z2
-	t3 := new(fiat.P256Element).Add(p1.x, p1.y) // t3 := X1 + Y1
-	t4 := new(fiat.P256Element).Add(p2.x, p2.y) // t4 := X2 + Y2
-	t3.Mul(t3, t4)                              // t3 := t3 * t4
-	t4.Add(t0, t1)                              // t4 := t0 + t1
-	t3.Sub(t3, t4)                              // t3 := t3 - t4
-	t4.Add(p1.y, p1.z)                          // t4 := Y1 + Z1
-	x3 := new(fiat.P256Element).Add(p2.y, p2.z) // X3 := Y2 + Z2
-	t4.Mul(t4, x3)                              // t4 := t4 * X3
-	x3.Add(t1, t2)                              // X3 := t1 + t2
-	t4.Sub(t4, x3)                              // t4 := t4 - X3
-	x3.Add(p1.x, p1.z)                          // X3 := X1 + Z1
-	y3 := new(fiat.P256Element).Add(p2.x, p2.z) // Y3 := X2 + Z2
-	x3.Mul(x3, y3)                              // X3 := X3 * Y3
-	y3.Add(t0, t2)                              // Y3 := t0 + t2
-	y3.Sub(x3, y3)                              // Y3 := X3 - Y3
-	z3 := new(fiat.P256Element).Mul(p256B, t2)  // Z3 := b * t2
-	x3.Sub(y3, z3)                              // X3 := Y3 - Z3
-	z3.Add(x3, x3)                              // Z3 := X3 + X3
-	x3.Add(x3, z3)                              // X3 := X3 + Z3
-	z3.Sub(t1, x3)                              // Z3 := t1 - X3
-	x3.Add(t1, x3)                              // X3 := t1 + X3
-	y3.Mul(p256B, y3)                           // Y3 := b * Y3
-	t1.Add(t2, t2)                              // t1 := t2 + t2
-	t2.Add(t1, t2)                              // t2 := t1 + t2
-	y3.Sub(y3, t2)                              // Y3 := Y3 - t2
-	y3.Sub(y3, t0)                              // Y3 := Y3 - t0
-	t1.Add(y3, y3)                              // t1 := Y3 + Y3
-	y3.Add(t1, y3)                              // Y3 := t1 + Y3
-	t1.Add(t0, t0)                              // t1 := t0 + t0
-	t0.Add(t1, t0)                              // t0 := t1 + t0
-	t0.Sub(t0, t2)                              // t0 := t0 - t2
-	t1.Mul(t4, y3)                              // t1 := t4 * Y3
-	t2.Mul(t0, y3)                              // t2 := t0 * Y3
-	y3.Mul(x3, z3)                              // Y3 := X3 * Z3
-	y3.Add(y3, t2)                              // Y3 := Y3 + t2
-	x3.Mul(t3, x3)                              // X3 := t3 * X3
-	x3.Sub(x3, t1)                              // X3 := X3 - t1
-	z3.Mul(t4, z3)                              // Z3 := t4 * Z3
-	t1.Mul(t3, t0)                              // t1 := t3 * t0
-	z3.Add(z3, t1)                              // Z3 := Z3 + t1
+	t0 := new(fiat.P256Element).Mul(p1.x, p2.x)  // t0 := X1 * X2
+	t1 := new(fiat.P256Element).Mul(p1.y, p2.y)  // t1 := Y1 * Y2
+	t2 := new(fiat.P256Element).Mul(p1.z, p2.z)  // t2 := Z1 * Z2
+	t3 := new(fiat.P256Element).Add(p1.x, p1.y)  // t3 := X1 + Y1
+	t4 := new(fiat.P256Element).Add(p2.x, p2.y)  // t4 := X2 + Y2
+	t3.Mul(t3, t4)                               // t3 := t3 * t4
+	t4.Add(t0, t1)                               // t4 := t0 + t1
+	t3.Sub(t3, t4)                               // t3 := t3 - t4
+	t4.Add(p1.y, p1.z)                           // t4 := Y1 + Z1
+	x3 := new(fiat.P256Element).Add(p2.y, p2.z)  // X3 := Y2 + Z2
+	t4.Mul(t4, x3)                               // t4 := t4 * X3
+	x3.Add(t1, t2)                               // X3 := t1 + t2
+	t4.Sub(t4, x3)                               // t4 := t4 - X3
+	x3.Add(p1.x, p1.z)                           // X3 := X1 + Z1
+	y3 := new(fiat.P256Element).Add(p2.x, p2.z)  // Y3 := X2 + Z2
+	x3.Mul(x3, y3)                               // X3 := X3 * Y3
+	y3.Add(t0, t2)                               // Y3 := t0 + t2
+	y3.Sub(x3, y3)                               // Y3 := X3 - Y3
+	z3 := new(fiat.P256Element).Mul(p256B(), t2) // Z3 := b * t2
+	x3.Sub(y3, z3)                               // X3 := Y3 - Z3
+	z3.Add(x3, x3)                               // Z3 := X3 + X3
+	x3.Add(x3, z3)                               // X3 := X3 + Z3
+	z3.Sub(t1, x3)                               // Z3 := t1 - X3
+	x3.Add(t1, x3)                               // X3 := t1 + X3
+	y3.Mul(p256B(), y3)                          // Y3 := b * Y3
+	t1.Add(t2, t2)                               // t1 := t2 + t2
+	t2.Add(t1, t2)                               // t2 := t1 + t2
+	y3.Sub(y3, t2)                               // Y3 := Y3 - t2
+	y3.Sub(y3, t0)                               // Y3 := Y3 - t0
+	t1.Add(y3, y3)                               // t1 := Y3 + Y3
+	y3.Add(t1, y3)                               // Y3 := t1 + Y3
+	t1.Add(t0, t0)                               // t1 := t0 + t0
+	t0.Add(t1, t0)                               // t0 := t1 + t0
+	t0.Sub(t0, t2)                               // t0 := t0 - t2
+	t1.Mul(t4, y3)                               // t1 := t4 * Y3
+	t2.Mul(t0, y3)                               // t2 := t0 * Y3
+	y3.Mul(x3, z3)                               // Y3 := X3 * Z3
+	y3.Add(y3, t2)                               // Y3 := Y3 + t2
+	x3.Mul(t3, x3)                               // X3 := t3 * X3
+	x3.Sub(x3, t1)                               // X3 := X3 - t1
+	z3.Mul(t4, z3)                               // Z3 := t4 * Z3
+	t1.Mul(t3, t0)                               // t1 := t3 * t0
+	z3.Add(z3, t1)                               // Z3 := Z3 + t1
 
 	q.x.Set(x3)
 	q.y.Set(y3)
@@ -268,40 +273,40 @@
 	// Complete addition formula for a = -3 from "Complete addition formulas for
 	// prime order elliptic curves" (https://eprint.iacr.org/2015/1060), §A.2.
 
-	t0 := new(fiat.P256Element).Square(p.x)    // t0 := X ^ 2
-	t1 := new(fiat.P256Element).Square(p.y)    // t1 := Y ^ 2
-	t2 := new(fiat.P256Element).Square(p.z)    // t2 := Z ^ 2
-	t3 := new(fiat.P256Element).Mul(p.x, p.y)  // t3 := X * Y
-	t3.Add(t3, t3)                             // t3 := t3 + t3
-	z3 := new(fiat.P256Element).Mul(p.x, p.z)  // Z3 := X * Z
-	z3.Add(z3, z3)                             // Z3 := Z3 + Z3
-	y3 := new(fiat.P256Element).Mul(p256B, t2) // Y3 := b * t2
-	y3.Sub(y3, z3)                             // Y3 := Y3 - Z3
-	x3 := new(fiat.P256Element).Add(y3, y3)    // X3 := Y3 + Y3
-	y3.Add(x3, y3)                             // Y3 := X3 + Y3
-	x3.Sub(t1, y3)                             // X3 := t1 - Y3
-	y3.Add(t1, y3)                             // Y3 := t1 + Y3
-	y3.Mul(x3, y3)                             // Y3 := X3 * Y3
-	x3.Mul(x3, t3)                             // X3 := X3 * t3
-	t3.Add(t2, t2)                             // t3 := t2 + t2
-	t2.Add(t2, t3)                             // t2 := t2 + t3
-	z3.Mul(p256B, z3)                          // Z3 := b * Z3
-	z3.Sub(z3, t2)                             // Z3 := Z3 - t2
-	z3.Sub(z3, t0)                             // Z3 := Z3 - t0
-	t3.Add(z3, z3)                             // t3 := Z3 + Z3
-	z3.Add(z3, t3)                             // Z3 := Z3 + t3
-	t3.Add(t0, t0)                             // t3 := t0 + t0
-	t0.Add(t3, t0)                             // t0 := t3 + t0
-	t0.Sub(t0, t2)                             // t0 := t0 - t2
-	t0.Mul(t0, z3)                             // t0 := t0 * Z3
-	y3.Add(y3, t0)                             // Y3 := Y3 + t0
-	t0.Mul(p.y, p.z)                           // t0 := Y * Z
-	t0.Add(t0, t0)                             // t0 := t0 + t0
-	z3.Mul(t0, z3)                             // Z3 := t0 * Z3
-	x3.Sub(x3, z3)                             // X3 := X3 - Z3
-	z3.Mul(t0, t1)                             // Z3 := t0 * t1
-	z3.Add(z3, z3)                             // Z3 := Z3 + Z3
-	z3.Add(z3, z3)                             // Z3 := Z3 + Z3
+	t0 := new(fiat.P256Element).Square(p.x)      // t0 := X ^ 2
+	t1 := new(fiat.P256Element).Square(p.y)      // t1 := Y ^ 2
+	t2 := new(fiat.P256Element).Square(p.z)      // t2 := Z ^ 2
+	t3 := new(fiat.P256Element).Mul(p.x, p.y)    // t3 := X * Y
+	t3.Add(t3, t3)                               // t3 := t3 + t3
+	z3 := new(fiat.P256Element).Mul(p.x, p.z)    // Z3 := X * Z
+	z3.Add(z3, z3)                               // Z3 := Z3 + Z3
+	y3 := new(fiat.P256Element).Mul(p256B(), t2) // Y3 := b * t2
+	y3.Sub(y3, z3)                               // Y3 := Y3 - Z3
+	x3 := new(fiat.P256Element).Add(y3, y3)      // X3 := Y3 + Y3
+	y3.Add(x3, y3)                               // Y3 := X3 + Y3
+	x3.Sub(t1, y3)                               // X3 := t1 - Y3
+	y3.Add(t1, y3)                               // Y3 := t1 + Y3
+	y3.Mul(x3, y3)                               // Y3 := X3 * Y3
+	x3.Mul(x3, t3)                               // X3 := X3 * t3
+	t3.Add(t2, t2)                               // t3 := t2 + t2
+	t2.Add(t2, t3)                               // t2 := t2 + t3
+	z3.Mul(p256B(), z3)                          // Z3 := b * Z3
+	z3.Sub(z3, t2)                               // Z3 := Z3 - t2
+	z3.Sub(z3, t0)                               // Z3 := Z3 - t0
+	t3.Add(z3, z3)                               // t3 := Z3 + Z3
+	z3.Add(z3, t3)                               // Z3 := Z3 + t3
+	t3.Add(t0, t0)                               // t3 := t0 + t0
+	t0.Add(t3, t0)                               // t0 := t3 + t0
+	t0.Sub(t0, t2)                               // t0 := t0 - t2
+	t0.Mul(t0, z3)                               // t0 := t0 * Z3
+	y3.Add(y3, t0)                               // Y3 := Y3 + t0
+	t0.Mul(p.y, p.z)                             // t0 := Y * Z
+	t0.Add(t0, t0)                               // t0 := t0 + t0
+	z3.Mul(t0, z3)                               // Z3 := t0 * Z3
+	x3.Sub(x3, z3)                               // X3 := X3 - Z3
+	z3.Mul(t0, t1)                               // Z3 := t0 * t1
+	z3.Add(z3, z3)                               // Z3 := Z3 + Z3
+	z3.Add(z3, z3)                               // Z3 := Z3 + Z3
 
 	q.x.Set(x3)
 	q.y.Set(y3)
@@ -389,7 +394,7 @@
 func (p *P256Point) generatorTable() *[p256ElementLength * 2]p256Table {
 	p256GeneratorTableOnce.Do(func() {
 		p256GeneratorTable = new([p256ElementLength * 2]p256Table)
-		base := NewP256Generator()
+		base := NewP256Point().SetGenerator()
 		for i := 0; i < p256ElementLength*2; i++ {
 			p256GeneratorTable[i][0] = NewP256Point().Set(base)
 			for j := 1; j < 15; j++ {
diff --git a/src/crypto/internal/nistec/p256_asm.go b/src/crypto/internal/nistec/p256_asm.go
index 90f0279..6ea161e 100644
--- a/src/crypto/internal/nistec/p256_asm.go
+++ b/src/crypto/internal/nistec/p256_asm.go
@@ -52,15 +52,14 @@
 	}
 }
 
-// NewP256Generator returns a new P256Point set to the canonical generator.
-func NewP256Generator() *P256Point {
-	return &P256Point{
-		x: p256Element{0x79e730d418a9143c, 0x75ba95fc5fedb601,
-			0x79fb732b77622510, 0x18905f76a53755c6},
-		y: p256Element{0xddf25357ce95560a, 0x8b4ab8e4ba19e45c,
-			0xd2e88688dd21f325, 0x8571ff1825885d85},
-		z: p256One,
-	}
+// SetGenerator sets p to the canonical generator and returns p.
+func (p *P256Point) SetGenerator() *P256Point {
+	p.x = p256Element{0x79e730d418a9143c, 0x75ba95fc5fedb601,
+		0x79fb732b77622510, 0x18905f76a53755c6}
+	p.y = p256Element{0xddf25357ce95560a, 0x8b4ab8e4ba19e45c,
+		0xd2e88688dd21f325, 0x8571ff1825885d85}
+	p.z = p256One
+	return p
 }
 
 // Set sets p = q and returns p.
diff --git a/src/crypto/internal/nistec/p256_asm_table_test.go b/src/crypto/internal/nistec/p256_asm_table_test.go
index 8707aeb..5b7246b 100644
--- a/src/crypto/internal/nistec/p256_asm_table_test.go
+++ b/src/crypto/internal/nistec/p256_asm_table_test.go
@@ -12,7 +12,7 @@
 )
 
 func TestP256PrecomputedTable(t *testing.T) {
-	base := NewP256Generator()
+	base := NewP256Point().SetGenerator()
 
 	for i := 0; i < 43; i++ {
 		t.Run(fmt.Sprintf("table[%d]", i), func(t *testing.T) {
diff --git a/src/crypto/internal/nistec/p384.go b/src/crypto/internal/nistec/p384.go
index 40bff73..b452ec9 100644
--- a/src/crypto/internal/nistec/p384.go
+++ b/src/crypto/internal/nistec/p384.go
@@ -13,10 +13,6 @@
 	"sync"
 )
 
-var p384B, _ = new(fiat.P384Element).SetBytes([]byte{0xb3, 0x31, 0x2f, 0xa7, 0xe2, 0x3e, 0xe7, 0xe4, 0x98, 0x8e, 0x5, 0x6b, 0xe3, 0xf8, 0x2d, 0x19, 0x18, 0x1d, 0x9c, 0x6e, 0xfe, 0x81, 0x41, 0x12, 0x3, 0x14, 0x8, 0x8f, 0x50, 0x13, 0x87, 0x5a, 0xc6, 0x56, 0x39, 0x8d, 0x8a, 0x2e, 0xd1, 0x9d, 0x2a, 0x85, 0xc8, 0xed, 0xd3, 0xec, 0x2a, 0xef})
-
-var p384G, _ = NewP384Point().SetBytes([]byte{0x4, 0xaa, 0x87, 0xca, 0x22, 0xbe, 0x8b, 0x5, 0x37, 0x8e, 0xb1, 0xc7, 0x1e, 0xf3, 0x20, 0xad, 0x74, 0x6e, 0x1d, 0x3b, 0x62, 0x8b, 0xa7, 0x9b, 0x98, 0x59, 0xf7, 0x41, 0xe0, 0x82, 0x54, 0x2a, 0x38, 0x55, 0x2, 0xf2, 0x5d, 0xbf, 0x55, 0x29, 0x6c, 0x3a, 0x54, 0x5e, 0x38, 0x72, 0x76, 0xa, 0xb7, 0x36, 0x17, 0xde, 0x4a, 0x96, 0x26, 0x2c, 0x6f, 0x5d, 0x9e, 0x98, 0xbf, 0x92, 0x92, 0xdc, 0x29, 0xf8, 0xf4, 0x1d, 0xbd, 0x28, 0x9a, 0x14, 0x7c, 0xe9, 0xda, 0x31, 0x13, 0xb5, 0xf0, 0xb8, 0xc0, 0xa, 0x60, 0xb1, 0xce, 0x1d, 0x7e, 0x81, 0x9d, 0x7a, 0x43, 0x1d, 0x7c, 0x90, 0xea, 0xe, 0x5f})
-
 // p384ElementLength is the length of an element of the base or scalar field,
 // which have the same bytes length for all NIST P curves.
 const p384ElementLength = 48
@@ -37,13 +33,12 @@
 	}
 }
 
-// NewP384Generator returns a new P384Point set to the canonical generator.
-func NewP384Generator() *P384Point {
-	return (&P384Point{
-		x: new(fiat.P384Element),
-		y: new(fiat.P384Element),
-		z: new(fiat.P384Element),
-	}).Set(p384G)
+// SetGenerator sets p to the canonical generator and returns p.
+func (p *P384Point) SetGenerator() *P384Point {
+	p.x.SetBytes([]byte{0xaa, 0x87, 0xca, 0x22, 0xbe, 0x8b, 0x5, 0x37, 0x8e, 0xb1, 0xc7, 0x1e, 0xf3, 0x20, 0xad, 0x74, 0x6e, 0x1d, 0x3b, 0x62, 0x8b, 0xa7, 0x9b, 0x98, 0x59, 0xf7, 0x41, 0xe0, 0x82, 0x54, 0x2a, 0x38, 0x55, 0x2, 0xf2, 0x5d, 0xbf, 0x55, 0x29, 0x6c, 0x3a, 0x54, 0x5e, 0x38, 0x72, 0x76, 0xa, 0xb7})
+	p.y.SetBytes([]byte{0x36, 0x17, 0xde, 0x4a, 0x96, 0x26, 0x2c, 0x6f, 0x5d, 0x9e, 0x98, 0xbf, 0x92, 0x92, 0xdc, 0x29, 0xf8, 0xf4, 0x1d, 0xbd, 0x28, 0x9a, 0x14, 0x7c, 0xe9, 0xda, 0x31, 0x13, 0xb5, 0xf0, 0xb8, 0xc0, 0xa, 0x60, 0xb1, 0xce, 0x1d, 0x7e, 0x81, 0x9d, 0x7a, 0x43, 0x1d, 0x7c, 0x90, 0xea, 0xe, 0x5f})
+	p.z.One()
+	return p
 }
 
 // Set sets p = q and returns p.
@@ -112,6 +107,16 @@
 	}
 }
 
+var _p384B *fiat.P384Element
+var _p384BOnce sync.Once
+
+func p384B() *fiat.P384Element {
+	_p384BOnce.Do(func() {
+		_p384B, _ = new(fiat.P384Element).SetBytes([]byte{0xb3, 0x31, 0x2f, 0xa7, 0xe2, 0x3e, 0xe7, 0xe4, 0x98, 0x8e, 0x5, 0x6b, 0xe3, 0xf8, 0x2d, 0x19, 0x18, 0x1d, 0x9c, 0x6e, 0xfe, 0x81, 0x41, 0x12, 0x3, 0x14, 0x8, 0x8f, 0x50, 0x13, 0x87, 0x5a, 0xc6, 0x56, 0x39, 0x8d, 0x8a, 0x2e, 0xd1, 0x9d, 0x2a, 0x85, 0xc8, 0xed, 0xd3, 0xec, 0x2a, 0xef})
+	})
+	return _p384B
+}
+
 // p384Polynomial sets y2 to x³ - 3x + b, and returns y2.
 func p384Polynomial(y2, x *fiat.P384Element) *fiat.P384Element {
 	y2.Square(x)
@@ -119,9 +124,9 @@
 
 	threeX := new(fiat.P384Element).Add(x, x)
 	threeX.Add(threeX, x)
-
 	y2.Sub(y2, threeX)
-	return y2.Add(y2, p384B)
+
+	return y2.Add(y2, p384B())
 }
 
 func p384CheckOnCurve(x, y *fiat.P384Element) error {
@@ -211,49 +216,49 @@
 	// Complete addition formula for a = -3 from "Complete addition formulas for
 	// prime order elliptic curves" (https://eprint.iacr.org/2015/1060), §A.2.
 
-	t0 := new(fiat.P384Element).Mul(p1.x, p2.x) // t0 := X1 * X2
-	t1 := new(fiat.P384Element).Mul(p1.y, p2.y) // t1 := Y1 * Y2
-	t2 := new(fiat.P384Element).Mul(p1.z, p2.z) // t2 := Z1 * Z2
-	t3 := new(fiat.P384Element).Add(p1.x, p1.y) // t3 := X1 + Y1
-	t4 := new(fiat.P384Element).Add(p2.x, p2.y) // t4 := X2 + Y2
-	t3.Mul(t3, t4)                              // t3 := t3 * t4
-	t4.Add(t0, t1)                              // t4 := t0 + t1
-	t3.Sub(t3, t4)                              // t3 := t3 - t4
-	t4.Add(p1.y, p1.z)                          // t4 := Y1 + Z1
-	x3 := new(fiat.P384Element).Add(p2.y, p2.z) // X3 := Y2 + Z2
-	t4.Mul(t4, x3)                              // t4 := t4 * X3
-	x3.Add(t1, t2)                              // X3 := t1 + t2
-	t4.Sub(t4, x3)                              // t4 := t4 - X3
-	x3.Add(p1.x, p1.z)                          // X3 := X1 + Z1
-	y3 := new(fiat.P384Element).Add(p2.x, p2.z) // Y3 := X2 + Z2
-	x3.Mul(x3, y3)                              // X3 := X3 * Y3
-	y3.Add(t0, t2)                              // Y3 := t0 + t2
-	y3.Sub(x3, y3)                              // Y3 := X3 - Y3
-	z3 := new(fiat.P384Element).Mul(p384B, t2)  // Z3 := b * t2
-	x3.Sub(y3, z3)                              // X3 := Y3 - Z3
-	z3.Add(x3, x3)                              // Z3 := X3 + X3
-	x3.Add(x3, z3)                              // X3 := X3 + Z3
-	z3.Sub(t1, x3)                              // Z3 := t1 - X3
-	x3.Add(t1, x3)                              // X3 := t1 + X3
-	y3.Mul(p384B, y3)                           // Y3 := b * Y3
-	t1.Add(t2, t2)                              // t1 := t2 + t2
-	t2.Add(t1, t2)                              // t2 := t1 + t2
-	y3.Sub(y3, t2)                              // Y3 := Y3 - t2
-	y3.Sub(y3, t0)                              // Y3 := Y3 - t0
-	t1.Add(y3, y3)                              // t1 := Y3 + Y3
-	y3.Add(t1, y3)                              // Y3 := t1 + Y3
-	t1.Add(t0, t0)                              // t1 := t0 + t0
-	t0.Add(t1, t0)                              // t0 := t1 + t0
-	t0.Sub(t0, t2)                              // t0 := t0 - t2
-	t1.Mul(t4, y3)                              // t1 := t4 * Y3
-	t2.Mul(t0, y3)                              // t2 := t0 * Y3
-	y3.Mul(x3, z3)                              // Y3 := X3 * Z3
-	y3.Add(y3, t2)                              // Y3 := Y3 + t2
-	x3.Mul(t3, x3)                              // X3 := t3 * X3
-	x3.Sub(x3, t1)                              // X3 := X3 - t1
-	z3.Mul(t4, z3)                              // Z3 := t4 * Z3
-	t1.Mul(t3, t0)                              // t1 := t3 * t0
-	z3.Add(z3, t1)                              // Z3 := Z3 + t1
+	t0 := new(fiat.P384Element).Mul(p1.x, p2.x)  // t0 := X1 * X2
+	t1 := new(fiat.P384Element).Mul(p1.y, p2.y)  // t1 := Y1 * Y2
+	t2 := new(fiat.P384Element).Mul(p1.z, p2.z)  // t2 := Z1 * Z2
+	t3 := new(fiat.P384Element).Add(p1.x, p1.y)  // t3 := X1 + Y1
+	t4 := new(fiat.P384Element).Add(p2.x, p2.y)  // t4 := X2 + Y2
+	t3.Mul(t3, t4)                               // t3 := t3 * t4
+	t4.Add(t0, t1)                               // t4 := t0 + t1
+	t3.Sub(t3, t4)                               // t3 := t3 - t4
+	t4.Add(p1.y, p1.z)                           // t4 := Y1 + Z1
+	x3 := new(fiat.P384Element).Add(p2.y, p2.z)  // X3 := Y2 + Z2
+	t4.Mul(t4, x3)                               // t4 := t4 * X3
+	x3.Add(t1, t2)                               // X3 := t1 + t2
+	t4.Sub(t4, x3)                               // t4 := t4 - X3
+	x3.Add(p1.x, p1.z)                           // X3 := X1 + Z1
+	y3 := new(fiat.P384Element).Add(p2.x, p2.z)  // Y3 := X2 + Z2
+	x3.Mul(x3, y3)                               // X3 := X3 * Y3
+	y3.Add(t0, t2)                               // Y3 := t0 + t2
+	y3.Sub(x3, y3)                               // Y3 := X3 - Y3
+	z3 := new(fiat.P384Element).Mul(p384B(), t2) // Z3 := b * t2
+	x3.Sub(y3, z3)                               // X3 := Y3 - Z3
+	z3.Add(x3, x3)                               // Z3 := X3 + X3
+	x3.Add(x3, z3)                               // X3 := X3 + Z3
+	z3.Sub(t1, x3)                               // Z3 := t1 - X3
+	x3.Add(t1, x3)                               // X3 := t1 + X3
+	y3.Mul(p384B(), y3)                          // Y3 := b * Y3
+	t1.Add(t2, t2)                               // t1 := t2 + t2
+	t2.Add(t1, t2)                               // t2 := t1 + t2
+	y3.Sub(y3, t2)                               // Y3 := Y3 - t2
+	y3.Sub(y3, t0)                               // Y3 := Y3 - t0
+	t1.Add(y3, y3)                               // t1 := Y3 + Y3
+	y3.Add(t1, y3)                               // Y3 := t1 + Y3
+	t1.Add(t0, t0)                               // t1 := t0 + t0
+	t0.Add(t1, t0)                               // t0 := t1 + t0
+	t0.Sub(t0, t2)                               // t0 := t0 - t2
+	t1.Mul(t4, y3)                               // t1 := t4 * Y3
+	t2.Mul(t0, y3)                               // t2 := t0 * Y3
+	y3.Mul(x3, z3)                               // Y3 := X3 * Z3
+	y3.Add(y3, t2)                               // Y3 := Y3 + t2
+	x3.Mul(t3, x3)                               // X3 := t3 * X3
+	x3.Sub(x3, t1)                               // X3 := X3 - t1
+	z3.Mul(t4, z3)                               // Z3 := t4 * Z3
+	t1.Mul(t3, t0)                               // t1 := t3 * t0
+	z3.Add(z3, t1)                               // Z3 := Z3 + t1
 
 	q.x.Set(x3)
 	q.y.Set(y3)
@@ -266,40 +271,40 @@
 	// Complete addition formula for a = -3 from "Complete addition formulas for
 	// prime order elliptic curves" (https://eprint.iacr.org/2015/1060), §A.2.
 
-	t0 := new(fiat.P384Element).Square(p.x)    // t0 := X ^ 2
-	t1 := new(fiat.P384Element).Square(p.y)    // t1 := Y ^ 2
-	t2 := new(fiat.P384Element).Square(p.z)    // t2 := Z ^ 2
-	t3 := new(fiat.P384Element).Mul(p.x, p.y)  // t3 := X * Y
-	t3.Add(t3, t3)                             // t3 := t3 + t3
-	z3 := new(fiat.P384Element).Mul(p.x, p.z)  // Z3 := X * Z
-	z3.Add(z3, z3)                             // Z3 := Z3 + Z3
-	y3 := new(fiat.P384Element).Mul(p384B, t2) // Y3 := b * t2
-	y3.Sub(y3, z3)                             // Y3 := Y3 - Z3
-	x3 := new(fiat.P384Element).Add(y3, y3)    // X3 := Y3 + Y3
-	y3.Add(x3, y3)                             // Y3 := X3 + Y3
-	x3.Sub(t1, y3)                             // X3 := t1 - Y3
-	y3.Add(t1, y3)                             // Y3 := t1 + Y3
-	y3.Mul(x3, y3)                             // Y3 := X3 * Y3
-	x3.Mul(x3, t3)                             // X3 := X3 * t3
-	t3.Add(t2, t2)                             // t3 := t2 + t2
-	t2.Add(t2, t3)                             // t2 := t2 + t3
-	z3.Mul(p384B, z3)                          // Z3 := b * Z3
-	z3.Sub(z3, t2)                             // Z3 := Z3 - t2
-	z3.Sub(z3, t0)                             // Z3 := Z3 - t0
-	t3.Add(z3, z3)                             // t3 := Z3 + Z3
-	z3.Add(z3, t3)                             // Z3 := Z3 + t3
-	t3.Add(t0, t0)                             // t3 := t0 + t0
-	t0.Add(t3, t0)                             // t0 := t3 + t0
-	t0.Sub(t0, t2)                             // t0 := t0 - t2
-	t0.Mul(t0, z3)                             // t0 := t0 * Z3
-	y3.Add(y3, t0)                             // Y3 := Y3 + t0
-	t0.Mul(p.y, p.z)                           // t0 := Y * Z
-	t0.Add(t0, t0)                             // t0 := t0 + t0
-	z3.Mul(t0, z3)                             // Z3 := t0 * Z3
-	x3.Sub(x3, z3)                             // X3 := X3 - Z3
-	z3.Mul(t0, t1)                             // Z3 := t0 * t1
-	z3.Add(z3, z3)                             // Z3 := Z3 + Z3
-	z3.Add(z3, z3)                             // Z3 := Z3 + Z3
+	t0 := new(fiat.P384Element).Square(p.x)      // t0 := X ^ 2
+	t1 := new(fiat.P384Element).Square(p.y)      // t1 := Y ^ 2
+	t2 := new(fiat.P384Element).Square(p.z)      // t2 := Z ^ 2
+	t3 := new(fiat.P384Element).Mul(p.x, p.y)    // t3 := X * Y
+	t3.Add(t3, t3)                               // t3 := t3 + t3
+	z3 := new(fiat.P384Element).Mul(p.x, p.z)    // Z3 := X * Z
+	z3.Add(z3, z3)                               // Z3 := Z3 + Z3
+	y3 := new(fiat.P384Element).Mul(p384B(), t2) // Y3 := b * t2
+	y3.Sub(y3, z3)                               // Y3 := Y3 - Z3
+	x3 := new(fiat.P384Element).Add(y3, y3)      // X3 := Y3 + Y3
+	y3.Add(x3, y3)                               // Y3 := X3 + Y3
+	x3.Sub(t1, y3)                               // X3 := t1 - Y3
+	y3.Add(t1, y3)                               // Y3 := t1 + Y3
+	y3.Mul(x3, y3)                               // Y3 := X3 * Y3
+	x3.Mul(x3, t3)                               // X3 := X3 * t3
+	t3.Add(t2, t2)                               // t3 := t2 + t2
+	t2.Add(t2, t3)                               // t2 := t2 + t3
+	z3.Mul(p384B(), z3)                          // Z3 := b * Z3
+	z3.Sub(z3, t2)                               // Z3 := Z3 - t2
+	z3.Sub(z3, t0)                               // Z3 := Z3 - t0
+	t3.Add(z3, z3)                               // t3 := Z3 + Z3
+	z3.Add(z3, t3)                               // Z3 := Z3 + t3
+	t3.Add(t0, t0)                               // t3 := t0 + t0
+	t0.Add(t3, t0)                               // t0 := t3 + t0
+	t0.Sub(t0, t2)                               // t0 := t0 - t2
+	t0.Mul(t0, z3)                               // t0 := t0 * Z3
+	y3.Add(y3, t0)                               // Y3 := Y3 + t0
+	t0.Mul(p.y, p.z)                             // t0 := Y * Z
+	t0.Add(t0, t0)                               // t0 := t0 + t0
+	z3.Mul(t0, z3)                               // Z3 := t0 * Z3
+	x3.Sub(x3, z3)                               // X3 := X3 - Z3
+	z3.Mul(t0, t1)                               // Z3 := t0 * t1
+	z3.Add(z3, z3)                               // Z3 := Z3 + Z3
+	z3.Add(z3, z3)                               // Z3 := Z3 + Z3
 
 	q.x.Set(x3)
 	q.y.Set(y3)
@@ -387,7 +392,7 @@
 func (p *P384Point) generatorTable() *[p384ElementLength * 2]p384Table {
 	p384GeneratorTableOnce.Do(func() {
 		p384GeneratorTable = new([p384ElementLength * 2]p384Table)
-		base := NewP384Generator()
+		base := NewP384Point().SetGenerator()
 		for i := 0; i < p384ElementLength*2; i++ {
 			p384GeneratorTable[i][0] = NewP384Point().Set(base)
 			for j := 1; j < 15; j++ {
diff --git a/src/crypto/internal/nistec/p521.go b/src/crypto/internal/nistec/p521.go
index b137d27..a57ad24 100644
--- a/src/crypto/internal/nistec/p521.go
+++ b/src/crypto/internal/nistec/p521.go
@@ -13,10 +13,6 @@
 	"sync"
 )
 
-var p521B, _ = new(fiat.P521Element).SetBytes([]byte{0x0, 0x51, 0x95, 0x3e, 0xb9, 0x61, 0x8e, 0x1c, 0x9a, 0x1f, 0x92, 0x9a, 0x21, 0xa0, 0xb6, 0x85, 0x40, 0xee, 0xa2, 0xda, 0x72, 0x5b, 0x99, 0xb3, 0x15, 0xf3, 0xb8, 0xb4, 0x89, 0x91, 0x8e, 0xf1, 0x9, 0xe1, 0x56, 0x19, 0x39, 0x51, 0xec, 0x7e, 0x93, 0x7b, 0x16, 0x52, 0xc0, 0xbd, 0x3b, 0xb1, 0xbf, 0x7, 0x35, 0x73, 0xdf, 0x88, 0x3d, 0x2c, 0x34, 0xf1, 0xef, 0x45, 0x1f, 0xd4, 0x6b, 0x50, 0x3f, 0x0})
-
-var p521G, _ = NewP521Point().SetBytes([]byte{0x4, 0x0, 0xc6, 0x85, 0x8e, 0x6, 0xb7, 0x4, 0x4, 0xe9, 0xcd, 0x9e, 0x3e, 0xcb, 0x66, 0x23, 0x95, 0xb4, 0x42, 0x9c, 0x64, 0x81, 0x39, 0x5, 0x3f, 0xb5, 0x21, 0xf8, 0x28, 0xaf, 0x60, 0x6b, 0x4d, 0x3d, 0xba, 0xa1, 0x4b, 0x5e, 0x77, 0xef, 0xe7, 0x59, 0x28, 0xfe, 0x1d, 0xc1, 0x27, 0xa2, 0xff, 0xa8, 0xde, 0x33, 0x48, 0xb3, 0xc1, 0x85, 0x6a, 0x42, 0x9b, 0xf9, 0x7e, 0x7e, 0x31, 0xc2, 0xe5, 0xbd, 0x66, 0x1, 0x18, 0x39, 0x29, 0x6a, 0x78, 0x9a, 0x3b, 0xc0, 0x4, 0x5c, 0x8a, 0x5f, 0xb4, 0x2c, 0x7d, 0x1b, 0xd9, 0x98, 0xf5, 0x44, 0x49, 0x57, 0x9b, 0x44, 0x68, 0x17, 0xaf, 0xbd, 0x17, 0x27, 0x3e, 0x66, 0x2c, 0x97, 0xee, 0x72, 0x99, 0x5e, 0xf4, 0x26, 0x40, 0xc5, 0x50, 0xb9, 0x1, 0x3f, 0xad, 0x7, 0x61, 0x35, 0x3c, 0x70, 0x86, 0xa2, 0x72, 0xc2, 0x40, 0x88, 0xbe, 0x94, 0x76, 0x9f, 0xd1, 0x66, 0x50})
-
 // p521ElementLength is the length of an element of the base or scalar field,
 // which have the same bytes length for all NIST P curves.
 const p521ElementLength = 66
@@ -37,13 +33,12 @@
 	}
 }
 
-// NewP521Generator returns a new P521Point set to the canonical generator.
-func NewP521Generator() *P521Point {
-	return (&P521Point{
-		x: new(fiat.P521Element),
-		y: new(fiat.P521Element),
-		z: new(fiat.P521Element),
-	}).Set(p521G)
+// SetGenerator sets p to the canonical generator and returns p.
+func (p *P521Point) SetGenerator() *P521Point {
+	p.x.SetBytes([]byte{0x0, 0xc6, 0x85, 0x8e, 0x6, 0xb7, 0x4, 0x4, 0xe9, 0xcd, 0x9e, 0x3e, 0xcb, 0x66, 0x23, 0x95, 0xb4, 0x42, 0x9c, 0x64, 0x81, 0x39, 0x5, 0x3f, 0xb5, 0x21, 0xf8, 0x28, 0xaf, 0x60, 0x6b, 0x4d, 0x3d, 0xba, 0xa1, 0x4b, 0x5e, 0x77, 0xef, 0xe7, 0x59, 0x28, 0xfe, 0x1d, 0xc1, 0x27, 0xa2, 0xff, 0xa8, 0xde, 0x33, 0x48, 0xb3, 0xc1, 0x85, 0x6a, 0x42, 0x9b, 0xf9, 0x7e, 0x7e, 0x31, 0xc2, 0xe5, 0xbd, 0x66})
+	p.y.SetBytes([]byte{0x1, 0x18, 0x39, 0x29, 0x6a, 0x78, 0x9a, 0x3b, 0xc0, 0x4, 0x5c, 0x8a, 0x5f, 0xb4, 0x2c, 0x7d, 0x1b, 0xd9, 0x98, 0xf5, 0x44, 0x49, 0x57, 0x9b, 0x44, 0x68, 0x17, 0xaf, 0xbd, 0x17, 0x27, 0x3e, 0x66, 0x2c, 0x97, 0xee, 0x72, 0x99, 0x5e, 0xf4, 0x26, 0x40, 0xc5, 0x50, 0xb9, 0x1, 0x3f, 0xad, 0x7, 0x61, 0x35, 0x3c, 0x70, 0x86, 0xa2, 0x72, 0xc2, 0x40, 0x88, 0xbe, 0x94, 0x76, 0x9f, 0xd1, 0x66, 0x50})
+	p.z.One()
+	return p
 }
 
 // Set sets p = q and returns p.
@@ -112,6 +107,16 @@
 	}
 }
 
+var _p521B *fiat.P521Element
+var _p521BOnce sync.Once
+
+func p521B() *fiat.P521Element {
+	_p521BOnce.Do(func() {
+		_p521B, _ = new(fiat.P521Element).SetBytes([]byte{0x0, 0x51, 0x95, 0x3e, 0xb9, 0x61, 0x8e, 0x1c, 0x9a, 0x1f, 0x92, 0x9a, 0x21, 0xa0, 0xb6, 0x85, 0x40, 0xee, 0xa2, 0xda, 0x72, 0x5b, 0x99, 0xb3, 0x15, 0xf3, 0xb8, 0xb4, 0x89, 0x91, 0x8e, 0xf1, 0x9, 0xe1, 0x56, 0x19, 0x39, 0x51, 0xec, 0x7e, 0x93, 0x7b, 0x16, 0x52, 0xc0, 0xbd, 0x3b, 0xb1, 0xbf, 0x7, 0x35, 0x73, 0xdf, 0x88, 0x3d, 0x2c, 0x34, 0xf1, 0xef, 0x45, 0x1f, 0xd4, 0x6b, 0x50, 0x3f, 0x0})
+	})
+	return _p521B
+}
+
 // p521Polynomial sets y2 to x³ - 3x + b, and returns y2.
 func p521Polynomial(y2, x *fiat.P521Element) *fiat.P521Element {
 	y2.Square(x)
@@ -119,9 +124,9 @@
 
 	threeX := new(fiat.P521Element).Add(x, x)
 	threeX.Add(threeX, x)
-
 	y2.Sub(y2, threeX)
-	return y2.Add(y2, p521B)
+
+	return y2.Add(y2, p521B())
 }
 
 func p521CheckOnCurve(x, y *fiat.P521Element) error {
@@ -211,49 +216,49 @@
 	// Complete addition formula for a = -3 from "Complete addition formulas for
 	// prime order elliptic curves" (https://eprint.iacr.org/2015/1060), §A.2.
 
-	t0 := new(fiat.P521Element).Mul(p1.x, p2.x) // t0 := X1 * X2
-	t1 := new(fiat.P521Element).Mul(p1.y, p2.y) // t1 := Y1 * Y2
-	t2 := new(fiat.P521Element).Mul(p1.z, p2.z) // t2 := Z1 * Z2
-	t3 := new(fiat.P521Element).Add(p1.x, p1.y) // t3 := X1 + Y1
-	t4 := new(fiat.P521Element).Add(p2.x, p2.y) // t4 := X2 + Y2
-	t3.Mul(t3, t4)                              // t3 := t3 * t4
-	t4.Add(t0, t1)                              // t4 := t0 + t1
-	t3.Sub(t3, t4)                              // t3 := t3 - t4
-	t4.Add(p1.y, p1.z)                          // t4 := Y1 + Z1
-	x3 := new(fiat.P521Element).Add(p2.y, p2.z) // X3 := Y2 + Z2
-	t4.Mul(t4, x3)                              // t4 := t4 * X3
-	x3.Add(t1, t2)                              // X3 := t1 + t2
-	t4.Sub(t4, x3)                              // t4 := t4 - X3
-	x3.Add(p1.x, p1.z)                          // X3 := X1 + Z1
-	y3 := new(fiat.P521Element).Add(p2.x, p2.z) // Y3 := X2 + Z2
-	x3.Mul(x3, y3)                              // X3 := X3 * Y3
-	y3.Add(t0, t2)                              // Y3 := t0 + t2
-	y3.Sub(x3, y3)                              // Y3 := X3 - Y3
-	z3 := new(fiat.P521Element).Mul(p521B, t2)  // Z3 := b * t2
-	x3.Sub(y3, z3)                              // X3 := Y3 - Z3
-	z3.Add(x3, x3)                              // Z3 := X3 + X3
-	x3.Add(x3, z3)                              // X3 := X3 + Z3
-	z3.Sub(t1, x3)                              // Z3 := t1 - X3
-	x3.Add(t1, x3)                              // X3 := t1 + X3
-	y3.Mul(p521B, y3)                           // Y3 := b * Y3
-	t1.Add(t2, t2)                              // t1 := t2 + t2
-	t2.Add(t1, t2)                              // t2 := t1 + t2
-	y3.Sub(y3, t2)                              // Y3 := Y3 - t2
-	y3.Sub(y3, t0)                              // Y3 := Y3 - t0
-	t1.Add(y3, y3)                              // t1 := Y3 + Y3
-	y3.Add(t1, y3)                              // Y3 := t1 + Y3
-	t1.Add(t0, t0)                              // t1 := t0 + t0
-	t0.Add(t1, t0)                              // t0 := t1 + t0
-	t0.Sub(t0, t2)                              // t0 := t0 - t2
-	t1.Mul(t4, y3)                              // t1 := t4 * Y3
-	t2.Mul(t0, y3)                              // t2 := t0 * Y3
-	y3.Mul(x3, z3)                              // Y3 := X3 * Z3
-	y3.Add(y3, t2)                              // Y3 := Y3 + t2
-	x3.Mul(t3, x3)                              // X3 := t3 * X3
-	x3.Sub(x3, t1)                              // X3 := X3 - t1
-	z3.Mul(t4, z3)                              // Z3 := t4 * Z3
-	t1.Mul(t3, t0)                              // t1 := t3 * t0
-	z3.Add(z3, t1)                              // Z3 := Z3 + t1
+	t0 := new(fiat.P521Element).Mul(p1.x, p2.x)  // t0 := X1 * X2
+	t1 := new(fiat.P521Element).Mul(p1.y, p2.y)  // t1 := Y1 * Y2
+	t2 := new(fiat.P521Element).Mul(p1.z, p2.z)  // t2 := Z1 * Z2
+	t3 := new(fiat.P521Element).Add(p1.x, p1.y)  // t3 := X1 + Y1
+	t4 := new(fiat.P521Element).Add(p2.x, p2.y)  // t4 := X2 + Y2
+	t3.Mul(t3, t4)                               // t3 := t3 * t4
+	t4.Add(t0, t1)                               // t4 := t0 + t1
+	t3.Sub(t3, t4)                               // t3 := t3 - t4
+	t4.Add(p1.y, p1.z)                           // t4 := Y1 + Z1
+	x3 := new(fiat.P521Element).Add(p2.y, p2.z)  // X3 := Y2 + Z2
+	t4.Mul(t4, x3)                               // t4 := t4 * X3
+	x3.Add(t1, t2)                               // X3 := t1 + t2
+	t4.Sub(t4, x3)                               // t4 := t4 - X3
+	x3.Add(p1.x, p1.z)                           // X3 := X1 + Z1
+	y3 := new(fiat.P521Element).Add(p2.x, p2.z)  // Y3 := X2 + Z2
+	x3.Mul(x3, y3)                               // X3 := X3 * Y3
+	y3.Add(t0, t2)                               // Y3 := t0 + t2
+	y3.Sub(x3, y3)                               // Y3 := X3 - Y3
+	z3 := new(fiat.P521Element).Mul(p521B(), t2) // Z3 := b * t2
+	x3.Sub(y3, z3)                               // X3 := Y3 - Z3
+	z3.Add(x3, x3)                               // Z3 := X3 + X3
+	x3.Add(x3, z3)                               // X3 := X3 + Z3
+	z3.Sub(t1, x3)                               // Z3 := t1 - X3
+	x3.Add(t1, x3)                               // X3 := t1 + X3
+	y3.Mul(p521B(), y3)                          // Y3 := b * Y3
+	t1.Add(t2, t2)                               // t1 := t2 + t2
+	t2.Add(t1, t2)                               // t2 := t1 + t2
+	y3.Sub(y3, t2)                               // Y3 := Y3 - t2
+	y3.Sub(y3, t0)                               // Y3 := Y3 - t0
+	t1.Add(y3, y3)                               // t1 := Y3 + Y3
+	y3.Add(t1, y3)                               // Y3 := t1 + Y3
+	t1.Add(t0, t0)                               // t1 := t0 + t0
+	t0.Add(t1, t0)                               // t0 := t1 + t0
+	t0.Sub(t0, t2)                               // t0 := t0 - t2
+	t1.Mul(t4, y3)                               // t1 := t4 * Y3
+	t2.Mul(t0, y3)                               // t2 := t0 * Y3
+	y3.Mul(x3, z3)                               // Y3 := X3 * Z3
+	y3.Add(y3, t2)                               // Y3 := Y3 + t2
+	x3.Mul(t3, x3)                               // X3 := t3 * X3
+	x3.Sub(x3, t1)                               // X3 := X3 - t1
+	z3.Mul(t4, z3)                               // Z3 := t4 * Z3
+	t1.Mul(t3, t0)                               // t1 := t3 * t0
+	z3.Add(z3, t1)                               // Z3 := Z3 + t1
 
 	q.x.Set(x3)
 	q.y.Set(y3)
@@ -266,40 +271,40 @@
 	// Complete addition formula for a = -3 from "Complete addition formulas for
 	// prime order elliptic curves" (https://eprint.iacr.org/2015/1060), §A.2.
 
-	t0 := new(fiat.P521Element).Square(p.x)    // t0 := X ^ 2
-	t1 := new(fiat.P521Element).Square(p.y)    // t1 := Y ^ 2
-	t2 := new(fiat.P521Element).Square(p.z)    // t2 := Z ^ 2
-	t3 := new(fiat.P521Element).Mul(p.x, p.y)  // t3 := X * Y
-	t3.Add(t3, t3)                             // t3 := t3 + t3
-	z3 := new(fiat.P521Element).Mul(p.x, p.z)  // Z3 := X * Z
-	z3.Add(z3, z3)                             // Z3 := Z3 + Z3
-	y3 := new(fiat.P521Element).Mul(p521B, t2) // Y3 := b * t2
-	y3.Sub(y3, z3)                             // Y3 := Y3 - Z3
-	x3 := new(fiat.P521Element).Add(y3, y3)    // X3 := Y3 + Y3
-	y3.Add(x3, y3)                             // Y3 := X3 + Y3
-	x3.Sub(t1, y3)                             // X3 := t1 - Y3
-	y3.Add(t1, y3)                             // Y3 := t1 + Y3
-	y3.Mul(x3, y3)                             // Y3 := X3 * Y3
-	x3.Mul(x3, t3)                             // X3 := X3 * t3
-	t3.Add(t2, t2)                             // t3 := t2 + t2
-	t2.Add(t2, t3)                             // t2 := t2 + t3
-	z3.Mul(p521B, z3)                          // Z3 := b * Z3
-	z3.Sub(z3, t2)                             // Z3 := Z3 - t2
-	z3.Sub(z3, t0)                             // Z3 := Z3 - t0
-	t3.Add(z3, z3)                             // t3 := Z3 + Z3
-	z3.Add(z3, t3)                             // Z3 := Z3 + t3
-	t3.Add(t0, t0)                             // t3 := t0 + t0
-	t0.Add(t3, t0)                             // t0 := t3 + t0
-	t0.Sub(t0, t2)                             // t0 := t0 - t2
-	t0.Mul(t0, z3)                             // t0 := t0 * Z3
-	y3.Add(y3, t0)                             // Y3 := Y3 + t0
-	t0.Mul(p.y, p.z)                           // t0 := Y * Z
-	t0.Add(t0, t0)                             // t0 := t0 + t0
-	z3.Mul(t0, z3)                             // Z3 := t0 * Z3
-	x3.Sub(x3, z3)                             // X3 := X3 - Z3
-	z3.Mul(t0, t1)                             // Z3 := t0 * t1
-	z3.Add(z3, z3)                             // Z3 := Z3 + Z3
-	z3.Add(z3, z3)                             // Z3 := Z3 + Z3
+	t0 := new(fiat.P521Element).Square(p.x)      // t0 := X ^ 2
+	t1 := new(fiat.P521Element).Square(p.y)      // t1 := Y ^ 2
+	t2 := new(fiat.P521Element).Square(p.z)      // t2 := Z ^ 2
+	t3 := new(fiat.P521Element).Mul(p.x, p.y)    // t3 := X * Y
+	t3.Add(t3, t3)                               // t3 := t3 + t3
+	z3 := new(fiat.P521Element).Mul(p.x, p.z)    // Z3 := X * Z
+	z3.Add(z3, z3)                               // Z3 := Z3 + Z3
+	y3 := new(fiat.P521Element).Mul(p521B(), t2) // Y3 := b * t2
+	y3.Sub(y3, z3)                               // Y3 := Y3 - Z3
+	x3 := new(fiat.P521Element).Add(y3, y3)      // X3 := Y3 + Y3
+	y3.Add(x3, y3)                               // Y3 := X3 + Y3
+	x3.Sub(t1, y3)                               // X3 := t1 - Y3
+	y3.Add(t1, y3)                               // Y3 := t1 + Y3
+	y3.Mul(x3, y3)                               // Y3 := X3 * Y3
+	x3.Mul(x3, t3)                               // X3 := X3 * t3
+	t3.Add(t2, t2)                               // t3 := t2 + t2
+	t2.Add(t2, t3)                               // t2 := t2 + t3
+	z3.Mul(p521B(), z3)                          // Z3 := b * Z3
+	z3.Sub(z3, t2)                               // Z3 := Z3 - t2
+	z3.Sub(z3, t0)                               // Z3 := Z3 - t0
+	t3.Add(z3, z3)                               // t3 := Z3 + Z3
+	z3.Add(z3, t3)                               // Z3 := Z3 + t3
+	t3.Add(t0, t0)                               // t3 := t0 + t0
+	t0.Add(t3, t0)                               // t0 := t3 + t0
+	t0.Sub(t0, t2)                               // t0 := t0 - t2
+	t0.Mul(t0, z3)                               // t0 := t0 * Z3
+	y3.Add(y3, t0)                               // Y3 := Y3 + t0
+	t0.Mul(p.y, p.z)                             // t0 := Y * Z
+	t0.Add(t0, t0)                               // t0 := t0 + t0
+	z3.Mul(t0, z3)                               // Z3 := t0 * Z3
+	x3.Sub(x3, z3)                               // X3 := X3 - Z3
+	z3.Mul(t0, t1)                               // Z3 := t0 * t1
+	z3.Add(z3, z3)                               // Z3 := Z3 + Z3
+	z3.Add(z3, z3)                               // Z3 := Z3 + Z3
 
 	q.x.Set(x3)
 	q.y.Set(y3)
@@ -387,7 +392,7 @@
 func (p *P521Point) generatorTable() *[p521ElementLength * 2]p521Table {
 	p521GeneratorTableOnce.Do(func() {
 		p521GeneratorTable = new([p521ElementLength * 2]p521Table)
-		base := NewP521Generator()
+		base := NewP521Point().SetGenerator()
 		for i := 0; i < p521ElementLength*2; i++ {
 			p521GeneratorTable[i][0] = NewP521Point().Set(base)
 			for j := 1; j < 15; j++ {