src/crypto/elliptic/p256_asm.go - go - Git at Google

 // Copyright 2015 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

 // This file contains the Go wrapper for the constant-time, 64-bit assembly
 // implementation of P256. The optimizations performed here are described in
 // detail in:
 // S.Gueron and V.Krasnov, "Fast prime field elliptic-curve cryptography with
 //                          256-bit primes"
 // https://link.springer.com/article/10.1007%2Fs13389-014-0090-x
 // https://eprint.iacr.org/2013/816.pdf

 //go:build amd64 || arm64
 // +build amd64 arm64

 package elliptic

 import (
 	"math/big"
 )

 type (
 	p256Curve struct {
 		*CurveParams
 	}

 	p256Point struct {
 		xyz [12]uint64
 	}
 )

 var p256 p256Curve

 func initP256() {
 	// See FIPS 186-3, section D.2.3
 	p256.CurveParams = &CurveParams{Name: "P-256"}
 	p256.P, _ = new(big.Int).SetString("115792089210356248762697446949407573530086143415290314195533631308867097853951", 10)
 	p256.N, _ = new(big.Int).SetString("115792089210356248762697446949407573529996955224135760342422259061068512044369", 10)
 	p256.B, _ = new(big.Int).SetString("5ac635d8aa3a93e7b3ebbd55769886bc651d06b0cc53b0f63bce3c3e27d2604b", 16)
 	p256.Gx, _ = new(big.Int).SetString("6b17d1f2e12c4247f8bce6e563a440f277037d812deb33a0f4a13945d898c296", 16)
 	p256.Gy, _ = new(big.Int).SetString("4fe342e2fe1a7f9b8ee7eb4a7c0f9e162bce33576b315ececbb6406837bf51f5", 16)
 	p256.BitSize = 256
 }

 func (curve p256Curve) Params() *CurveParams {
 	return curve.CurveParams
 }

 // Functions implemented in p256_asm_*64.s
 // Montgomery multiplication modulo P256
 //go:noescape
 func p256Mul(res, in1, in2 []uint64)

 // Montgomery square modulo P256, repeated n times (n >= 1)
 //go:noescape
 func p256Sqr(res, in []uint64, n int)

 // Montgomery multiplication by 1
 //go:noescape
 func p256FromMont(res, in []uint64)

 // iff cond == 1  val <- -val
 //go:noescape
 func p256NegCond(val []uint64, cond int)

 // if cond == 0 res <- b; else res <- a
 //go:noescape
 func p256MovCond(res, a, b []uint64, cond int)

 // Endianness swap
 //go:noescape
 func p256BigToLittle(res []uint64, in []byte)

 //go:noescape
 func p256LittleToBig(res []byte, in []uint64)

 // Constant time table access
 //go:noescape
 func p256Select(point, table []uint64, idx int)

 //go:noescape
 func p256SelectBase(point, table []uint64, idx int)

 // Montgomery multiplication modulo Ord(G)
 //go:noescape
 func p256OrdMul(res, in1, in2 []uint64)

 // Montgomery square modulo Ord(G), repeated n times
 //go:noescape
 func p256OrdSqr(res, in []uint64, n int)

 // Point add with in2 being affine point
 // If sign == 1 -> in2 = -in2
 // If sel == 0 -> res = in1
 // if zero == 0 -> res = in2
 //go:noescape
 func p256PointAddAffineAsm(res, in1, in2 []uint64, sign, sel, zero int)

 // Point add. Returns one if the two input points were equal and zero
 // otherwise. (Note that, due to the way that the equations work out, some
 // representations of ∞ are considered equal to everything by this function.)
 //go:noescape
 func p256PointAddAsm(res, in1, in2 []uint64) int

 // Point double
 //go:noescape
 func p256PointDoubleAsm(res, in []uint64)

 func (curve p256Curve) Inverse(k *big.Int) *big.Int {
 	if k.Sign() < 0 {
 		// This should never happen.
 		k = new(big.Int).Neg(k)
 	}

 	if k.Cmp(p256.N) >= 0 {
 		// This should never happen.
 		k = new(big.Int).Mod(k, p256.N)
 	}

 	// table will store precomputed powers of x.
 	var table [4 * 9]uint64
 	var (
 		_1      = table[4*0 : 4*1]
 		_11     = table[4*1 : 4*2]
 		_101    = table[4*2 : 4*3]
 		_111    = table[4*3 : 4*4]
 		_1111   = table[4*4 : 4*5]
 		_10101  = table[4*5 : 4*6]
 		_101111 = table[4*6 : 4*7]
 		x       = table[4*7 : 4*8]
 		t       = table[4*8 : 4*9]
 	)

 	fromBig(x[:], k)
 	// This code operates in the Montgomery domain where R = 2^256 mod n
 	// and n is the order of the scalar field. (See initP256 for the
 	// value.) Elements in the Montgomery domain take the form a×R and
 	// multiplication of x and y in the calculates (x × y × R^-1) mod n. RR
 	// is R×R mod n thus the Montgomery multiplication x and RR gives x×R,
 	// i.e. converts x into the Montgomery domain.
 	// Window values borrowed from https://briansmith.org/ecc-inversion-addition-chains-01#p256_scalar_inversion
 	RR := []uint64{0x83244c95be79eea2, 0x4699799c49bd6fa6, 0x2845b2392b6bec59, 0x66e12d94f3d95620}
 	p256OrdMul(_1, x, RR)      // _1
 	p256OrdSqr(x, _1, 1)       // _10
 	p256OrdMul(_11, x, _1)     // _11
 	p256OrdMul(_101, x, _11)   // _101
 	p256OrdMul(_111, x, _101)  // _111
 	p256OrdSqr(x, _101, 1)     // _1010
 	p256OrdMul(_1111, _101, x) // _1111

 	p256OrdSqr(t, x, 1)          // _10100
 	p256OrdMul(_10101, t, _1)    // _10101
 	p256OrdSqr(x, _10101, 1)     // _101010
 	p256OrdMul(_101111, _101, x) // _101111
 	p256OrdMul(x, _10101, x)     // _111111 = x6
 	p256OrdSqr(t, x, 2)          // _11111100
 	p256OrdMul(t, t, _11)        // _11111111 = x8
 	p256OrdSqr(x, t, 8)          // _ff00
 	p256OrdMul(x, x, t)          // _ffff = x16
 	p256OrdSqr(t, x, 16)         // _ffff0000
 	p256OrdMul(t, t, x)          // _ffffffff = x32

 	p256OrdSqr(x, t, 64)
 	p256OrdMul(x, x, t)
 	p256OrdSqr(x, x, 32)
 	p256OrdMul(x, x, t)

 	sqrs := []uint8{
 		6, 5, 4, 5, 5,
 		4, 3, 3, 5, 9,
 		6, 2, 5, 6, 5,
 		4, 5, 5, 3, 10,
 		2, 5, 5, 3, 7, 6}
 	muls := [][]uint64{
 		_101111, _111, _11, _1111, _10101,
 		_101, _101, _101, _111, _101111,
 		_1111, _1, _1, _1111, _111,
 		_111, _111, _101, _11, _101111,
 		_11, _11, _11, _1, _10101, _1111}

 	for i, s := range sqrs {
 		p256OrdSqr(x, x, int(s))
 		p256OrdMul(x, x, muls[i])
 	}

 	// Multiplying by one in the Montgomery domain converts a Montgomery
 	// value out of the domain.
 	one := []uint64{1, 0, 0, 0}
 	p256OrdMul(x, x, one)

 	xOut := make([]byte, 32)
 	p256LittleToBig(xOut, x)
 	return new(big.Int).SetBytes(xOut)
 }

 // fromBig converts a *big.Int into a format used by this code.
 func fromBig(out []uint64, big *big.Int) {
 	for i := range out {
 		out[i] = 0
 	}

 	for i, v := range big.Bits() {
 		out[i] = uint64(v)
 	}
 }

 // p256GetScalar endian-swaps the big-endian scalar value from in and writes it
 // to out. If the scalar is equal or greater than the order of the group, it's
 // reduced modulo that order.
 func p256GetScalar(out []uint64, in []byte) {
 	n := new(big.Int).SetBytes(in)

 	if n.Cmp(p256.N) >= 0 {
 		n.Mod(n, p256.N)
 	}
 	fromBig(out, n)
 }

 // p256Mul operates in a Montgomery domain with R = 2^256 mod p, where p is the
 // underlying field of the curve. (See initP256 for the value.) Thus rr here is
 // R×R mod p. See comment in Inverse about how this is used.
 var rr = []uint64{0x0000000000000003, 0xfffffffbffffffff, 0xfffffffffffffffe, 0x00000004fffffffd}

 func maybeReduceModP(in *big.Int) *big.Int {
 	if in.Cmp(p256.P) < 0 {
 		return in
 	}
 	return new(big.Int).Mod(in, p256.P)
 }

 func (curve p256Curve) CombinedMult(bigX, bigY *big.Int, baseScalar, scalar []byte) (x, y *big.Int) {
 	scalarReversed := make([]uint64, 4)
 	var r1, r2 p256Point
 	p256GetScalar(scalarReversed, baseScalar)
 	r1IsInfinity := scalarIsZero(scalarReversed)
 	r1.p256BaseMult(scalarReversed)

 	p256GetScalar(scalarReversed, scalar)
 	r2IsInfinity := scalarIsZero(scalarReversed)
 	fromBig(r2.xyz[0:4], maybeReduceModP(bigX))
 	fromBig(r2.xyz[4:8], maybeReduceModP(bigY))
 	p256Mul(r2.xyz[0:4], r2.xyz[0:4], rr[:])
 	p256Mul(r2.xyz[4:8], r2.xyz[4:8], rr[:])

 	// This sets r2's Z value to 1, in the Montgomery domain.
 	r2.xyz[8] = 0x0000000000000001
 	r2.xyz[9] = 0xffffffff00000000
 	r2.xyz[10] = 0xffffffffffffffff
 	r2.xyz[11] = 0x00000000fffffffe

 	r2.p256ScalarMult(scalarReversed)

 	var sum, double p256Point
 	pointsEqual := p256PointAddAsm(sum.xyz[:], r1.xyz[:], r2.xyz[:])
 	p256PointDoubleAsm(double.xyz[:], r1.xyz[:])
 	sum.CopyConditional(&double, pointsEqual)
 	sum.CopyConditional(&r1, r2IsInfinity)
 	sum.CopyConditional(&r2, r1IsInfinity)

 	return sum.p256PointToAffine()
 }

 func (curve p256Curve) ScalarBaseMult(scalar []byte) (x, y *big.Int) {
 	scalarReversed := make([]uint64, 4)
 	p256GetScalar(scalarReversed, scalar)

 	var r p256Point
 	r.p256BaseMult(scalarReversed)
 	return r.p256PointToAffine()
 }

 func (curve p256Curve) ScalarMult(bigX, bigY *big.Int, scalar []byte) (x, y *big.Int) {
 	scalarReversed := make([]uint64, 4)
 	p256GetScalar(scalarReversed, scalar)

 	var r p256Point
 	fromBig(r.xyz[0:4], maybeReduceModP(bigX))
 	fromBig(r.xyz[4:8], maybeReduceModP(bigY))
 	p256Mul(r.xyz[0:4], r.xyz[0:4], rr[:])
 	p256Mul(r.xyz[4:8], r.xyz[4:8], rr[:])
 	// This sets r2's Z value to 1, in the Montgomery domain.
 	r.xyz[8] = 0x0000000000000001
 	r.xyz[9] = 0xffffffff00000000
 	r.xyz[10] = 0xffffffffffffffff
 	r.xyz[11] = 0x00000000fffffffe

 	r.p256ScalarMult(scalarReversed)
 	return r.p256PointToAffine()
 }

 // uint64IsZero returns 1 if x is zero and zero otherwise.
 func uint64IsZero(x uint64) int {
 	x = ^x
 	x &= x >> 32
 	x &= x >> 16
 	x &= x >> 8
 	x &= x >> 4
 	x &= x >> 2
 	x &= x >> 1
 	return int(x & 1)
 }

 // scalarIsZero returns 1 if scalar represents the zero value, and zero
 // otherwise.
 func scalarIsZero(scalar []uint64) int {
 	return uint64IsZero(scalar[0] | scalar[1] | scalar[2] | scalar[3])
 }

 func (p *p256Point) p256PointToAffine() (x, y *big.Int) {
 	zInv := make([]uint64, 4)
 	zInvSq := make([]uint64, 4)
 	p256Inverse(zInv, p.xyz[8:12])
 	p256Sqr(zInvSq, zInv, 1)
 	p256Mul(zInv, zInv, zInvSq)

 	p256Mul(zInvSq, p.xyz[0:4], zInvSq)
 	p256Mul(zInv, p.xyz[4:8], zInv)

 	p256FromMont(zInvSq, zInvSq)
 	p256FromMont(zInv, zInv)

 	xOut := make([]byte, 32)
 	yOut := make([]byte, 32)
 	p256LittleToBig(xOut, zInvSq)
 	p256LittleToBig(yOut, zInv)

 	return new(big.Int).SetBytes(xOut), new(big.Int).SetBytes(yOut)
 }

 // CopyConditional copies overwrites p with src if v == 1, and leaves p
 // unchanged if v == 0.
 func (p *p256Point) CopyConditional(src *p256Point, v int) {
 	pMask := uint64(v) - 1
 	srcMask := ^pMask

 	for i, n := range p.xyz {
 		p.xyz[i] = (n & pMask) | (src.xyz[i] & srcMask)
 	}
 }

 // p256Inverse sets out to in^-1 mod p.
 func p256Inverse(out, in []uint64) {
 	var stack [6 * 4]uint64
 	p2 := stack[4*0 : 4*0+4]
 	p4 := stack[4*1 : 4*1+4]
 	p8 := stack[4*2 : 4*2+4]
 	p16 := stack[4*3 : 4*3+4]
 	p32 := stack[4*4 : 4*4+4]

 	p256Sqr(out, in, 1)
 	p256Mul(p2, out, in) // 3*p

 	p256Sqr(out, p2, 2)
 	p256Mul(p4, out, p2) // f*p

 	p256Sqr(out, p4, 4)
 	p256Mul(p8, out, p4) // ff*p

 	p256Sqr(out, p8, 8)
 	p256Mul(p16, out, p8) // ffff*p

 	p256Sqr(out, p16, 16)
 	p256Mul(p32, out, p16) // ffffffff*p

 	p256Sqr(out, p32, 32)
 	p256Mul(out, out, in)

 	p256Sqr(out, out, 128)
 	p256Mul(out, out, p32)

 	p256Sqr(out, out, 32)
 	p256Mul(out, out, p32)

 	p256Sqr(out, out, 16)
 	p256Mul(out, out, p16)

 	p256Sqr(out, out, 8)
 	p256Mul(out, out, p8)

 	p256Sqr(out, out, 4)
 	p256Mul(out, out, p4)

 	p256Sqr(out, out, 2)
 	p256Mul(out, out, p2)

 	p256Sqr(out, out, 2)
 	p256Mul(out, out, in)
 }

 func (p *p256Point) p256StorePoint(r *[16 * 4 * 3]uint64, index int) {
 	copy(r[index*12:], p.xyz[:])
 }

 func boothW5(in uint) (int, int) {
 	var s uint = ^((in >> 5) - 1)
 	var d uint = (1 << 6) - in - 1
 	d = (d & s) | (in & (^s))
 	d = (d >> 1) + (d & 1)
 	return int(d), int(s & 1)
 }

 func boothW6(in uint) (int, int) {
 	var s uint = ^((in >> 6) - 1)
 	var d uint = (1 << 7) - in - 1
 	d = (d & s) | (in & (^s))
 	d = (d >> 1) + (d & 1)
 	return int(d), int(s & 1)
 }

 func (p *p256Point) p256BaseMult(scalar []uint64) {
 	wvalue := (scalar[0] << 1) & 0x7f
 	sel, sign := boothW6(uint(wvalue))
 	p256SelectBase(p.xyz[0:8], p256Precomputed[0][0:], sel)
 	p256NegCond(p.xyz[4:8], sign)

 	// (This is one, in the Montgomery domain.)
 	p.xyz[8] = 0x0000000000000001
 	p.xyz[9] = 0xffffffff00000000
 	p.xyz[10] = 0xffffffffffffffff
 	p.xyz[11] = 0x00000000fffffffe

 	var t0 p256Point
 	// (This is one, in the Montgomery domain.)
 	t0.xyz[8] = 0x0000000000000001
 	t0.xyz[9] = 0xffffffff00000000
 	t0.xyz[10] = 0xffffffffffffffff
 	t0.xyz[11] = 0x00000000fffffffe

 	index := uint(5)
 	zero := sel

 	for i := 1; i < 43; i++ {
 		if index < 192 {
 			wvalue = ((scalar[index/64] >> (index % 64)) + (scalar[index/64+1] << (64 - (index % 64)))) & 0x7f
 		} else {
 			wvalue = (scalar[index/64] >> (index % 64)) & 0x7f
 		}
 		index += 6
 		sel, sign = boothW6(uint(wvalue))
 		p256SelectBase(t0.xyz[0:8], p256Precomputed[i][0:], sel)
 		p256PointAddAffineAsm(p.xyz[0:12], p.xyz[0:12], t0.xyz[0:8], sign, sel, zero)
 		zero |= sel
 	}
 }

 func (p *p256Point) p256ScalarMult(scalar []uint64) {
 	// precomp is a table of precomputed points that stores powers of p
 	// from p^1 to p^16.
 	var precomp [16 * 4 * 3]uint64
 	var t0, t1, t2, t3 p256Point

 	// Prepare the table
 	p.p256StorePoint(&precomp, 0) // 1

 	p256PointDoubleAsm(t0.xyz[:], p.xyz[:])
 	p256PointDoubleAsm(t1.xyz[:], t0.xyz[:])
 	p256PointDoubleAsm(t2.xyz[:], t1.xyz[:])
 	p256PointDoubleAsm(t3.xyz[:], t2.xyz[:])
 	t0.p256StorePoint(&precomp, 1)  // 2
 	t1.p256StorePoint(&precomp, 3)  // 4
 	t2.p256StorePoint(&precomp, 7)  // 8
 	t3.p256StorePoint(&precomp, 15) // 16

 	p256PointAddAsm(t0.xyz[:], t0.xyz[:], p.xyz[:])
 	p256PointAddAsm(t1.xyz[:], t1.xyz[:], p.xyz[:])
 	p256PointAddAsm(t2.xyz[:], t2.xyz[:], p.xyz[:])
 	t0.p256StorePoint(&precomp, 2) // 3
 	t1.p256StorePoint(&precomp, 4) // 5
 	t2.p256StorePoint(&precomp, 8) // 9

 	p256PointDoubleAsm(t0.xyz[:], t0.xyz[:])
 	p256PointDoubleAsm(t1.xyz[:], t1.xyz[:])
 	t0.p256StorePoint(&precomp, 5) // 6
 	t1.p256StorePoint(&precomp, 9) // 10

 	p256PointAddAsm(t2.xyz[:], t0.xyz[:], p.xyz[:])
 	p256PointAddAsm(t1.xyz[:], t1.xyz[:], p.xyz[:])
 	t2.p256StorePoint(&precomp, 6)  // 7
 	t1.p256StorePoint(&precomp, 10) // 11

 	p256PointDoubleAsm(t0.xyz[:], t0.xyz[:])
 	p256PointDoubleAsm(t2.xyz[:], t2.xyz[:])
 	t0.p256StorePoint(&precomp, 11) // 12
 	t2.p256StorePoint(&precomp, 13) // 14

 	p256PointAddAsm(t0.xyz[:], t0.xyz[:], p.xyz[:])
 	p256PointAddAsm(t2.xyz[:], t2.xyz[:], p.xyz[:])
 	t0.p256StorePoint(&precomp, 12) // 13
 	t2.p256StorePoint(&precomp, 14) // 15

 	// Start scanning the window from top bit
 	index := uint(254)
 	var sel, sign int

 	wvalue := (scalar[index/64] >> (index % 64)) & 0x3f
 	sel, _ = boothW5(uint(wvalue))

 	p256Select(p.xyz[0:12], precomp[0:], sel)
 	zero := sel

 	for index > 4 {
 		index -= 5
 		p256PointDoubleAsm(p.xyz[:], p.xyz[:])
 		p256PointDoubleAsm(p.xyz[:], p.xyz[:])
 		p256PointDoubleAsm(p.xyz[:], p.xyz[:])
 		p256PointDoubleAsm(p.xyz[:], p.xyz[:])
 		p256PointDoubleAsm(p.xyz[:], p.xyz[:])

 		if index < 192 {
 			wvalue = ((scalar[index/64] >> (index % 64)) + (scalar[index/64+1] << (64 - (index % 64)))) & 0x3f
 		} else {
 			wvalue = (scalar[index/64] >> (index % 64)) & 0x3f
 		}

 		sel, sign = boothW5(uint(wvalue))

 		p256Select(t0.xyz[0:], precomp[0:], sel)
 		p256NegCond(t0.xyz[4:8], sign)
 		p256PointAddAsm(t1.xyz[:], p.xyz[:], t0.xyz[:])
 		p256MovCond(t1.xyz[0:12], t1.xyz[0:12], p.xyz[0:12], sel)
 		p256MovCond(p.xyz[0:12], t1.xyz[0:12], t0.xyz[0:12], zero)
 		zero |= sel
 	}

 	p256PointDoubleAsm(p.xyz[:], p.xyz[:])
 	p256PointDoubleAsm(p.xyz[:], p.xyz[:])
 	p256PointDoubleAsm(p.xyz[:], p.xyz[:])
 	p256PointDoubleAsm(p.xyz[:], p.xyz[:])
 	p256PointDoubleAsm(p.xyz[:], p.xyz[:])

 	wvalue = (scalar[0] << 1) & 0x3f
 	sel, sign = boothW5(uint(wvalue))

 	p256Select(t0.xyz[0:], precomp[0:], sel)
 	p256NegCond(t0.xyz[4:8], sign)
 	p256PointAddAsm(t1.xyz[:], p.xyz[:], t0.xyz[:])
 	p256MovCond(t1.xyz[0:12], t1.xyz[0:12], p.xyz[0:12], sel)
 	p256MovCond(p.xyz[0:12], t1.xyz[0:12], t0.xyz[0:12], zero)
 }
	// Copyright 2015 The Go Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style
	// license that can be found in the LICENSE file.

	// This file contains the Go wrapper for the constant-time, 64-bit assembly
	// implementation of P256. The optimizations performed here are described in
	// detail in:
	// S.Gueron and V.Krasnov, "Fast prime field elliptic-curve cryptography with
	// 256-bit primes"
	// https://link.springer.com/article/10.1007%2Fs13389-014-0090-x
	// https://eprint.iacr.org/2013/816.pdf

	//go:build amd64 \|\| arm64
	// +build amd64 arm64

	package elliptic

	import (
	"math/big"
	)

	type (
	p256Curve struct {
	*CurveParams
	}

	p256Point struct {
	xyz [12]uint64
	}
	)

	var p256 p256Curve

	func initP256() {
	// See FIPS 186-3, section D.2.3
	p256.CurveParams = &CurveParams{Name: "P-256"}
	p256.P, _ = new(big.Int).SetString("115792089210356248762697446949407573530086143415290314195533631308867097853951", 10)
	p256.N, _ = new(big.Int).SetString("115792089210356248762697446949407573529996955224135760342422259061068512044369", 10)
	p256.B, _ = new(big.Int).SetString("5ac635d8aa3a93e7b3ebbd55769886bc651d06b0cc53b0f63bce3c3e27d2604b", 16)
	p256.Gx, _ = new(big.Int).SetString("6b17d1f2e12c4247f8bce6e563a440f277037d812deb33a0f4a13945d898c296", 16)
	p256.Gy, _ = new(big.Int).SetString("4fe342e2fe1a7f9b8ee7eb4a7c0f9e162bce33576b315ececbb6406837bf51f5", 16)
	p256.BitSize = 256
	}

	func (curve p256Curve) Params() *CurveParams {
	return curve.CurveParams
	}

	// Functions implemented in p256_asm_*64.s
	// Montgomery multiplication modulo P256
	//go:noescape
	func p256Mul(res, in1, in2 []uint64)

	// Montgomery square modulo P256, repeated n times (n >= 1)
	//go:noescape
	func p256Sqr(res, in []uint64, n int)

	// Montgomery multiplication by 1
	//go:noescape
	func p256FromMont(res, in []uint64)

	// iff cond == 1 val <- -val
	//go:noescape
	func p256NegCond(val []uint64, cond int)

	// if cond == 0 res <- b; else res <- a
	//go:noescape
	func p256MovCond(res, a, b []uint64, cond int)

	// Endianness swap
	//go:noescape
	func p256BigToLittle(res []uint64, in []byte)

	//go:noescape
	func p256LittleToBig(res []byte, in []uint64)

	// Constant time table access
	//go:noescape
	func p256Select(point, table []uint64, idx int)

	//go:noescape
	func p256SelectBase(point, table []uint64, idx int)

	// Montgomery multiplication modulo Ord(G)
	//go:noescape
	func p256OrdMul(res, in1, in2 []uint64)

	// Montgomery square modulo Ord(G), repeated n times
	//go:noescape
	func p256OrdSqr(res, in []uint64, n int)

	// Point add with in2 being affine point
	// If sign == 1 -> in2 = -in2
	// If sel == 0 -> res = in1
	// if zero == 0 -> res = in2
	//go:noescape
	func p256PointAddAffineAsm(res, in1, in2 []uint64, sign, sel, zero int)

	// Point add. Returns one if the two input points were equal and zero
	// otherwise. (Note that, due to the way that the equations work out, some
	// representations of ∞ are considered equal to everything by this function.)
	//go:noescape
	func p256PointAddAsm(res, in1, in2 []uint64) int

	// Point double
	//go:noescape
	func p256PointDoubleAsm(res, in []uint64)

	func (curve p256Curve) Inverse(k big.Int) big.Int {
	if k.Sign() < 0 {
	// This should never happen.
	k = new(big.Int).Neg(k)
	}

	if k.Cmp(p256.N) >= 0 {
	// This should never happen.
	k = new(big.Int).Mod(k, p256.N)
	}

	// table will store precomputed powers of x.
	var table [4 * 9]uint64
	var (
	_1 = table[40 : 41]
	_11 = table[41 : 42]
	_101 = table[42 : 43]
	_111 = table[43 : 44]
	_1111 = table[44 : 45]
	_10101 = table[45 : 46]
	_101111 = table[46 : 47]
	x = table[47 : 48]
	t = table[48 : 49]
	)

	fromBig(x[:], k)
	// This code operates in the Montgomery domain where R = 2^256 mod n
	// and n is the order of the scalar field. (See initP256 for the
	// value.) Elements in the Montgomery domain take the form a×R and
	// multiplication of x and y in the calculates (x × y × R^-1) mod n. RR
	// is R×R mod n thus the Montgomery multiplication x and RR gives x×R,
	// i.e. converts x into the Montgomery domain.
	// Window values borrowed from https://briansmith.org/ecc-inversion-addition-chains-01#p256_scalar_inversion
	RR := []uint64{0x83244c95be79eea2, 0x4699799c49bd6fa6, 0x2845b2392b6bec59, 0x66e12d94f3d95620}
	p256OrdMul(_1, x, RR) // _1
	p256OrdSqr(x, _1, 1) // _10
	p256OrdMul(_11, x, _1) // _11
	p256OrdMul(_101, x, _11) // _101
	p256OrdMul(_111, x, _101) // _111
	p256OrdSqr(x, _101, 1) // _1010
	p256OrdMul(_1111, _101, x) // _1111

	p256OrdSqr(t, x, 1) // _10100
	p256OrdMul(_10101, t, _1) // _10101
	p256OrdSqr(x, _10101, 1) // _101010
	p256OrdMul(_101111, _101, x) // _101111
	p256OrdMul(x, _10101, x) // _111111 = x6
	p256OrdSqr(t, x, 2) // _11111100
	p256OrdMul(t, t, _11) // _11111111 = x8
	p256OrdSqr(x, t, 8) // _ff00
	p256OrdMul(x, x, t) // _ffff = x16
	p256OrdSqr(t, x, 16) // _ffff0000
	p256OrdMul(t, t, x) // _ffffffff = x32

	p256OrdSqr(x, t, 64)
	p256OrdMul(x, x, t)
	p256OrdSqr(x, x, 32)
	p256OrdMul(x, x, t)

	sqrs := []uint8{
	6, 5, 4, 5, 5,
	4, 3, 3, 5, 9,
	6, 2, 5, 6, 5,
	4, 5, 5, 3, 10,
	2, 5, 5, 3, 7, 6}
	muls := [][]uint64{
	_101111, _111, _11, _1111, _10101,
	_101, _101, _101, _111, _101111,
	_1111, _1, _1, _1111, _111,
	_111, _111, _101, _11, _101111,
	_11, _11, _11, _1, _10101, _1111}

	for i, s := range sqrs {
	p256OrdSqr(x, x, int(s))
	p256OrdMul(x, x, muls[i])
	}

	// Multiplying by one in the Montgomery domain converts a Montgomery
	// value out of the domain.
	one := []uint64{1, 0, 0, 0}
	p256OrdMul(x, x, one)

	xOut := make([]byte, 32)
	p256LittleToBig(xOut, x)
	return new(big.Int).SetBytes(xOut)
	}

	// fromBig converts a *big.Int into a format used by this code.
	func fromBig(out []uint64, big *big.Int) {
	for i := range out {
	out[i] = 0
	}

	for i, v := range big.Bits() {
	out[i] = uint64(v)
	}
	}

	// p256GetScalar endian-swaps the big-endian scalar value from in and writes it
	// to out. If the scalar is equal or greater than the order of the group, it's
	// reduced modulo that order.
	func p256GetScalar(out []uint64, in []byte) {
	n := new(big.Int).SetBytes(in)

	if n.Cmp(p256.N) >= 0 {
	n.Mod(n, p256.N)
	}
	fromBig(out, n)
	}

	// p256Mul operates in a Montgomery domain with R = 2^256 mod p, where p is the
	// underlying field of the curve. (See initP256 for the value.) Thus rr here is
	// R×R mod p. See comment in Inverse about how this is used.
	var rr = []uint64{0x0000000000000003, 0xfffffffbffffffff, 0xfffffffffffffffe, 0x00000004fffffffd}

	func maybeReduceModP(in big.Int) big.Int {
	if in.Cmp(p256.P) < 0 {
	return in
	}
	return new(big.Int).Mod(in, p256.P)
	}

	func (curve p256Curve) CombinedMult(bigX, bigY big.Int, baseScalar, scalar []byte) (x, y big.Int) {
	scalarReversed := make([]uint64, 4)
	var r1, r2 p256Point
	p256GetScalar(scalarReversed, baseScalar)
	r1IsInfinity := scalarIsZero(scalarReversed)
	r1.p256BaseMult(scalarReversed)

	p256GetScalar(scalarReversed, scalar)
	r2IsInfinity := scalarIsZero(scalarReversed)
	fromBig(r2.xyz[0:4], maybeReduceModP(bigX))
	fromBig(r2.xyz[4:8], maybeReduceModP(bigY))
	p256Mul(r2.xyz[0:4], r2.xyz[0:4], rr[:])
	p256Mul(r2.xyz[4:8], r2.xyz[4:8], rr[:])

	// This sets r2's Z value to 1, in the Montgomery domain.
	r2.xyz[8] = 0x0000000000000001
	r2.xyz[9] = 0xffffffff00000000
	r2.xyz[10] = 0xffffffffffffffff
	r2.xyz[11] = 0x00000000fffffffe

	r2.p256ScalarMult(scalarReversed)

	var sum, double p256Point
	pointsEqual := p256PointAddAsm(sum.xyz[:], r1.xyz[:], r2.xyz[:])
	p256PointDoubleAsm(double.xyz[:], r1.xyz[:])
	sum.CopyConditional(&double, pointsEqual)
	sum.CopyConditional(&r1, r2IsInfinity)
	sum.CopyConditional(&r2, r1IsInfinity)

	return sum.p256PointToAffine()
	}

	func (curve p256Curve) ScalarBaseMult(scalar []byte) (x, y *big.Int) {
	scalarReversed := make([]uint64, 4)
	p256GetScalar(scalarReversed, scalar)

	var r p256Point
	r.p256BaseMult(scalarReversed)
	return r.p256PointToAffine()
	}

	func (curve p256Curve) ScalarMult(bigX, bigY big.Int, scalar []byte) (x, y big.Int) {
	scalarReversed := make([]uint64, 4)
	p256GetScalar(scalarReversed, scalar)

	var r p256Point
	fromBig(r.xyz[0:4], maybeReduceModP(bigX))
	fromBig(r.xyz[4:8], maybeReduceModP(bigY))
	p256Mul(r.xyz[0:4], r.xyz[0:4], rr[:])
	p256Mul(r.xyz[4:8], r.xyz[4:8], rr[:])
	// This sets r2's Z value to 1, in the Montgomery domain.
	r.xyz[8] = 0x0000000000000001
	r.xyz[9] = 0xffffffff00000000
	r.xyz[10] = 0xffffffffffffffff
	r.xyz[11] = 0x00000000fffffffe

	r.p256ScalarMult(scalarReversed)
	return r.p256PointToAffine()
	}

	// uint64IsZero returns 1 if x is zero and zero otherwise.
	func uint64IsZero(x uint64) int {
	x = ^x
	x &= x >> 32
	x &= x >> 16
	x &= x >> 8
	x &= x >> 4
	x &= x >> 2
	x &= x >> 1
	return int(x & 1)
	}

	// scalarIsZero returns 1 if scalar represents the zero value, and zero
	// otherwise.
	func scalarIsZero(scalar []uint64) int {
	return uint64IsZero(scalar[0] \| scalar[1] \| scalar[2] \| scalar[3])
	}

	func (p p256Point) p256PointToAffine() (x, y big.Int) {
	zInv := make([]uint64, 4)
	zInvSq := make([]uint64, 4)
	p256Inverse(zInv, p.xyz[8:12])
	p256Sqr(zInvSq, zInv, 1)
	p256Mul(zInv, zInv, zInvSq)

	p256Mul(zInvSq, p.xyz[0:4], zInvSq)
	p256Mul(zInv, p.xyz[4:8], zInv)

	p256FromMont(zInvSq, zInvSq)
	p256FromMont(zInv, zInv)

	xOut := make([]byte, 32)
	yOut := make([]byte, 32)
	p256LittleToBig(xOut, zInvSq)
	p256LittleToBig(yOut, zInv)

	return new(big.Int).SetBytes(xOut), new(big.Int).SetBytes(yOut)
	}

	// CopyConditional copies overwrites p with src if v == 1, and leaves p
	// unchanged if v == 0.
	func (p p256Point) CopyConditional(src p256Point, v int) {
	pMask := uint64(v) - 1
	srcMask := ^pMask

	for i, n := range p.xyz {
	p.xyz[i] = (n & pMask) \| (src.xyz[i] & srcMask)
	}
	}

	// p256Inverse sets out to in^-1 mod p.
	func p256Inverse(out, in []uint64) {
	var stack [6 * 4]uint64
	p2 := stack[40 : 40+4]
	p4 := stack[41 : 41+4]
	p8 := stack[42 : 42+4]
	p16 := stack[43 : 43+4]
	p32 := stack[44 : 44+4]

	p256Sqr(out, in, 1)
	p256Mul(p2, out, in) // 3*p

	p256Sqr(out, p2, 2)
	p256Mul(p4, out, p2) // f*p

	p256Sqr(out, p4, 4)
	p256Mul(p8, out, p4) // ff*p

	p256Sqr(out, p8, 8)
	p256Mul(p16, out, p8) // ffff*p

	p256Sqr(out, p16, 16)
	p256Mul(p32, out, p16) // ffffffff*p

	p256Sqr(out, p32, 32)
	p256Mul(out, out, in)

	p256Sqr(out, out, 128)
	p256Mul(out, out, p32)

	p256Sqr(out, out, 32)
	p256Mul(out, out, p32)

	p256Sqr(out, out, 16)
	p256Mul(out, out, p16)

	p256Sqr(out, out, 8)
	p256Mul(out, out, p8)

	p256Sqr(out, out, 4)
	p256Mul(out, out, p4)

	p256Sqr(out, out, 2)
	p256Mul(out, out, p2)

	p256Sqr(out, out, 2)
	p256Mul(out, out, in)
	}

	func (p p256Point) p256StorePoint(r [16 * 4 * 3]uint64, index int) {
	copy(r[index*12:], p.xyz[:])
	}

	func boothW5(in uint) (int, int) {
	var s uint = ^((in >> 5) - 1)
	var d uint = (1 << 6) - in - 1
	d = (d & s) \| (in & (^s))
	d = (d >> 1) + (d & 1)
	return int(d), int(s & 1)
	}

	func boothW6(in uint) (int, int) {
	var s uint = ^((in >> 6) - 1)
	var d uint = (1 << 7) - in - 1
	d = (d & s) \| (in & (^s))
	d = (d >> 1) + (d & 1)
	return int(d), int(s & 1)
	}

	func (p *p256Point) p256BaseMult(scalar []uint64) {
	wvalue := (scalar[0] << 1) & 0x7f
	sel, sign := boothW6(uint(wvalue))
	p256SelectBase(p.xyz[0:8], p256Precomputed[0][0:], sel)
	p256NegCond(p.xyz[4:8], sign)

	// (This is one, in the Montgomery domain.)
	p.xyz[8] = 0x0000000000000001
	p.xyz[9] = 0xffffffff00000000
	p.xyz[10] = 0xffffffffffffffff
	p.xyz[11] = 0x00000000fffffffe

	var t0 p256Point
	// (This is one, in the Montgomery domain.)
	t0.xyz[8] = 0x0000000000000001
	t0.xyz[9] = 0xffffffff00000000
	t0.xyz[10] = 0xffffffffffffffff
	t0.xyz[11] = 0x00000000fffffffe

	index := uint(5)
	zero := sel

	for i := 1; i < 43; i++ {
	if index < 192 {
	wvalue = ((scalar[index/64] >> (index % 64)) + (scalar[index/64+1] << (64 - (index % 64)))) & 0x7f
	} else {
	wvalue = (scalar[index/64] >> (index % 64)) & 0x7f
	}
	index += 6
	sel, sign = boothW6(uint(wvalue))
	p256SelectBase(t0.xyz[0:8], p256Precomputed[i][0:], sel)
	p256PointAddAffineAsm(p.xyz[0:12], p.xyz[0:12], t0.xyz[0:8], sign, sel, zero)
	zero \|= sel
	}
	}

	func (p *p256Point) p256ScalarMult(scalar []uint64) {
	// precomp is a table of precomputed points that stores powers of p
	// from p^1 to p^16.
	var precomp [16 * 4 * 3]uint64
	var t0, t1, t2, t3 p256Point

	// Prepare the table
	p.p256StorePoint(&precomp, 0) // 1

	p256PointDoubleAsm(t0.xyz[:], p.xyz[:])
	p256PointDoubleAsm(t1.xyz[:], t0.xyz[:])
	p256PointDoubleAsm(t2.xyz[:], t1.xyz[:])
	p256PointDoubleAsm(t3.xyz[:], t2.xyz[:])
	t0.p256StorePoint(&precomp, 1) // 2
	t1.p256StorePoint(&precomp, 3) // 4
	t2.p256StorePoint(&precomp, 7) // 8
	t3.p256StorePoint(&precomp, 15) // 16

	p256PointAddAsm(t0.xyz[:], t0.xyz[:], p.xyz[:])
	p256PointAddAsm(t1.xyz[:], t1.xyz[:], p.xyz[:])
	p256PointAddAsm(t2.xyz[:], t2.xyz[:], p.xyz[:])
	t0.p256StorePoint(&precomp, 2) // 3
	t1.p256StorePoint(&precomp, 4) // 5
	t2.p256StorePoint(&precomp, 8) // 9

	p256PointDoubleAsm(t0.xyz[:], t0.xyz[:])
	p256PointDoubleAsm(t1.xyz[:], t1.xyz[:])
	t0.p256StorePoint(&precomp, 5) // 6
	t1.p256StorePoint(&precomp, 9) // 10

	p256PointAddAsm(t2.xyz[:], t0.xyz[:], p.xyz[:])
	p256PointAddAsm(t1.xyz[:], t1.xyz[:], p.xyz[:])
	t2.p256StorePoint(&precomp, 6) // 7
	t1.p256StorePoint(&precomp, 10) // 11

	p256PointDoubleAsm(t0.xyz[:], t0.xyz[:])
	p256PointDoubleAsm(t2.xyz[:], t2.xyz[:])
	t0.p256StorePoint(&precomp, 11) // 12
	t2.p256StorePoint(&precomp, 13) // 14

	p256PointAddAsm(t0.xyz[:], t0.xyz[:], p.xyz[:])
	p256PointAddAsm(t2.xyz[:], t2.xyz[:], p.xyz[:])
	t0.p256StorePoint(&precomp, 12) // 13
	t2.p256StorePoint(&precomp, 14) // 15

	// Start scanning the window from top bit
	index := uint(254)
	var sel, sign int

	wvalue := (scalar[index/64] >> (index % 64)) & 0x3f
	sel, _ = boothW5(uint(wvalue))

	p256Select(p.xyz[0:12], precomp[0:], sel)
	zero := sel

	for index > 4 {
	index -= 5
	p256PointDoubleAsm(p.xyz[:], p.xyz[:])
	p256PointDoubleAsm(p.xyz[:], p.xyz[:])
	p256PointDoubleAsm(p.xyz[:], p.xyz[:])
	p256PointDoubleAsm(p.xyz[:], p.xyz[:])
	p256PointDoubleAsm(p.xyz[:], p.xyz[:])

	if index < 192 {
	wvalue = ((scalar[index/64] >> (index % 64)) + (scalar[index/64+1] << (64 - (index % 64)))) & 0x3f
	} else {
	wvalue = (scalar[index/64] >> (index % 64)) & 0x3f
	}

	sel, sign = boothW5(uint(wvalue))

	p256Select(t0.xyz[0:], precomp[0:], sel)
	p256NegCond(t0.xyz[4:8], sign)
	p256PointAddAsm(t1.xyz[:], p.xyz[:], t0.xyz[:])
	p256MovCond(t1.xyz[0:12], t1.xyz[0:12], p.xyz[0:12], sel)
	p256MovCond(p.xyz[0:12], t1.xyz[0:12], t0.xyz[0:12], zero)
	zero \|= sel
	}

	p256PointDoubleAsm(p.xyz[:], p.xyz[:])
	p256PointDoubleAsm(p.xyz[:], p.xyz[:])
	p256PointDoubleAsm(p.xyz[:], p.xyz[:])
	p256PointDoubleAsm(p.xyz[:], p.xyz[:])
	p256PointDoubleAsm(p.xyz[:], p.xyz[:])

	wvalue = (scalar[0] << 1) & 0x3f
	sel, sign = boothW5(uint(wvalue))

	p256Select(t0.xyz[0:], precomp[0:], sel)
	p256NegCond(t0.xyz[4:8], sign)
	p256PointAddAsm(t1.xyz[:], p.xyz[:], t0.xyz[:])
	p256MovCond(t1.xyz[0:12], t1.xyz[0:12], p.xyz[0:12], sel)
	p256MovCond(p.xyz[0:12], t1.xyz[0:12], t0.xyz[0:12], zero)
	}