|  | // Copyright 2009 The Go Authors. All rights reserved. | 
|  | // Use of this source code is governed by a BSD-style | 
|  | // license that can be found in the LICENSE file. | 
|  |  | 
|  | // This file implements unsigned multi-precision integers (natural | 
|  | // numbers). They are the building blocks for the implementation | 
|  | // of signed integers, rationals, and floating-point numbers. | 
|  | // | 
|  | // Caution: This implementation relies on the function "alias" | 
|  | //          which assumes that (nat) slice capacities are never | 
|  | //          changed (no 3-operand slice expressions). If that | 
|  | //          changes, alias needs to be updated for correctness. | 
|  |  | 
|  | package big | 
|  |  | 
|  | import ( | 
|  | "encoding/binary" | 
|  | "math/bits" | 
|  | "math/rand" | 
|  | "sync" | 
|  | ) | 
|  |  | 
|  | // An unsigned integer x of the form | 
|  | // | 
|  | //   x = x[n-1]*_B^(n-1) + x[n-2]*_B^(n-2) + ... + x[1]*_B + x[0] | 
|  | // | 
|  | // with 0 <= x[i] < _B and 0 <= i < n is stored in a slice of length n, | 
|  | // with the digits x[i] as the slice elements. | 
|  | // | 
|  | // A number is normalized if the slice contains no leading 0 digits. | 
|  | // During arithmetic operations, denormalized values may occur but are | 
|  | // always normalized before returning the final result. The normalized | 
|  | // representation of 0 is the empty or nil slice (length = 0). | 
|  | // | 
|  | type nat []Word | 
|  |  | 
|  | var ( | 
|  | natOne  = nat{1} | 
|  | natTwo  = nat{2} | 
|  | natFive = nat{5} | 
|  | natTen  = nat{10} | 
|  | ) | 
|  |  | 
|  | func (z nat) clear() { | 
|  | for i := range z { | 
|  | z[i] = 0 | 
|  | } | 
|  | } | 
|  |  | 
|  | func (z nat) norm() nat { | 
|  | i := len(z) | 
|  | for i > 0 && z[i-1] == 0 { | 
|  | i-- | 
|  | } | 
|  | return z[0:i] | 
|  | } | 
|  |  | 
|  | func (z nat) make(n int) nat { | 
|  | if n <= cap(z) { | 
|  | return z[:n] // reuse z | 
|  | } | 
|  | if n == 1 { | 
|  | // Most nats start small and stay that way; don't over-allocate. | 
|  | return make(nat, 1) | 
|  | } | 
|  | // Choosing a good value for e has significant performance impact | 
|  | // because it increases the chance that a value can be reused. | 
|  | const e = 4 // extra capacity | 
|  | return make(nat, n, n+e) | 
|  | } | 
|  |  | 
|  | func (z nat) setWord(x Word) nat { | 
|  | if x == 0 { | 
|  | return z[:0] | 
|  | } | 
|  | z = z.make(1) | 
|  | z[0] = x | 
|  | return z | 
|  | } | 
|  |  | 
|  | func (z nat) setUint64(x uint64) nat { | 
|  | // single-word value | 
|  | if w := Word(x); uint64(w) == x { | 
|  | return z.setWord(w) | 
|  | } | 
|  | // 2-word value | 
|  | z = z.make(2) | 
|  | z[1] = Word(x >> 32) | 
|  | z[0] = Word(x) | 
|  | return z | 
|  | } | 
|  |  | 
|  | func (z nat) set(x nat) nat { | 
|  | z = z.make(len(x)) | 
|  | copy(z, x) | 
|  | return z | 
|  | } | 
|  |  | 
|  | func (z nat) add(x, y nat) nat { | 
|  | m := len(x) | 
|  | n := len(y) | 
|  |  | 
|  | switch { | 
|  | case m < n: | 
|  | return z.add(y, x) | 
|  | case m == 0: | 
|  | // n == 0 because m >= n; result is 0 | 
|  | return z[:0] | 
|  | case n == 0: | 
|  | // result is x | 
|  | return z.set(x) | 
|  | } | 
|  | // m > 0 | 
|  |  | 
|  | z = z.make(m + 1) | 
|  | c := addVV(z[0:n], x, y) | 
|  | if m > n { | 
|  | c = addVW(z[n:m], x[n:], c) | 
|  | } | 
|  | z[m] = c | 
|  |  | 
|  | return z.norm() | 
|  | } | 
|  |  | 
|  | func (z nat) sub(x, y nat) nat { | 
|  | m := len(x) | 
|  | n := len(y) | 
|  |  | 
|  | switch { | 
|  | case m < n: | 
|  | panic("underflow") | 
|  | case m == 0: | 
|  | // n == 0 because m >= n; result is 0 | 
|  | return z[:0] | 
|  | case n == 0: | 
|  | // result is x | 
|  | return z.set(x) | 
|  | } | 
|  | // m > 0 | 
|  |  | 
|  | z = z.make(m) | 
|  | c := subVV(z[0:n], x, y) | 
|  | if m > n { | 
|  | c = subVW(z[n:], x[n:], c) | 
|  | } | 
|  | if c != 0 { | 
|  | panic("underflow") | 
|  | } | 
|  |  | 
|  | return z.norm() | 
|  | } | 
|  |  | 
|  | func (x nat) cmp(y nat) (r int) { | 
|  | m := len(x) | 
|  | n := len(y) | 
|  | if m != n || m == 0 { | 
|  | switch { | 
|  | case m < n: | 
|  | r = -1 | 
|  | case m > n: | 
|  | r = 1 | 
|  | } | 
|  | return | 
|  | } | 
|  |  | 
|  | i := m - 1 | 
|  | for i > 0 && x[i] == y[i] { | 
|  | i-- | 
|  | } | 
|  |  | 
|  | switch { | 
|  | case x[i] < y[i]: | 
|  | r = -1 | 
|  | case x[i] > y[i]: | 
|  | r = 1 | 
|  | } | 
|  | return | 
|  | } | 
|  |  | 
|  | func (z nat) mulAddWW(x nat, y, r Word) nat { | 
|  | m := len(x) | 
|  | if m == 0 || y == 0 { | 
|  | return z.setWord(r) // result is r | 
|  | } | 
|  | // m > 0 | 
|  |  | 
|  | z = z.make(m + 1) | 
|  | z[m] = mulAddVWW(z[0:m], x, y, r) | 
|  |  | 
|  | return z.norm() | 
|  | } | 
|  |  | 
|  | // basicMul multiplies x and y and leaves the result in z. | 
|  | // The (non-normalized) result is placed in z[0 : len(x) + len(y)]. | 
|  | func basicMul(z, x, y nat) { | 
|  | z[0 : len(x)+len(y)].clear() // initialize z | 
|  | for i, d := range y { | 
|  | if d != 0 { | 
|  | z[len(x)+i] = addMulVVW(z[i:i+len(x)], x, d) | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | // montgomery computes z mod m = x*y*2**(-n*_W) mod m, | 
|  | // assuming k = -1/m mod 2**_W. | 
|  | // z is used for storing the result which is returned; | 
|  | // z must not alias x, y or m. | 
|  | // See Gueron, "Efficient Software Implementations of Modular Exponentiation". | 
|  | // https://eprint.iacr.org/2011/239.pdf | 
|  | // In the terminology of that paper, this is an "Almost Montgomery Multiplication": | 
|  | // x and y are required to satisfy 0 <= z < 2**(n*_W) and then the result | 
|  | // z is guaranteed to satisfy 0 <= z < 2**(n*_W), but it may not be < m. | 
|  | func (z nat) montgomery(x, y, m nat, k Word, n int) nat { | 
|  | // This code assumes x, y, m are all the same length, n. | 
|  | // (required by addMulVVW and the for loop). | 
|  | // It also assumes that x, y are already reduced mod m, | 
|  | // or else the result will not be properly reduced. | 
|  | if len(x) != n || len(y) != n || len(m) != n { | 
|  | panic("math/big: mismatched montgomery number lengths") | 
|  | } | 
|  | z = z.make(n * 2) | 
|  | z.clear() | 
|  | var c Word | 
|  | for i := 0; i < n; i++ { | 
|  | d := y[i] | 
|  | c2 := addMulVVW(z[i:n+i], x, d) | 
|  | t := z[i] * k | 
|  | c3 := addMulVVW(z[i:n+i], m, t) | 
|  | cx := c + c2 | 
|  | cy := cx + c3 | 
|  | z[n+i] = cy | 
|  | if cx < c2 || cy < c3 { | 
|  | c = 1 | 
|  | } else { | 
|  | c = 0 | 
|  | } | 
|  | } | 
|  | if c != 0 { | 
|  | subVV(z[:n], z[n:], m) | 
|  | } else { | 
|  | copy(z[:n], z[n:]) | 
|  | } | 
|  | return z[:n] | 
|  | } | 
|  |  | 
|  | // Fast version of z[0:n+n>>1].add(z[0:n+n>>1], x[0:n]) w/o bounds checks. | 
|  | // Factored out for readability - do not use outside karatsuba. | 
|  | func karatsubaAdd(z, x nat, n int) { | 
|  | if c := addVV(z[0:n], z, x); c != 0 { | 
|  | addVW(z[n:n+n>>1], z[n:], c) | 
|  | } | 
|  | } | 
|  |  | 
|  | // Like karatsubaAdd, but does subtract. | 
|  | func karatsubaSub(z, x nat, n int) { | 
|  | if c := subVV(z[0:n], z, x); c != 0 { | 
|  | subVW(z[n:n+n>>1], z[n:], c) | 
|  | } | 
|  | } | 
|  |  | 
|  | // Operands that are shorter than karatsubaThreshold are multiplied using | 
|  | // "grade school" multiplication; for longer operands the Karatsuba algorithm | 
|  | // is used. | 
|  | var karatsubaThreshold = 40 // computed by calibrate_test.go | 
|  |  | 
|  | // karatsuba multiplies x and y and leaves the result in z. | 
|  | // Both x and y must have the same length n and n must be a | 
|  | // power of 2. The result vector z must have len(z) >= 6*n. | 
|  | // The (non-normalized) result is placed in z[0 : 2*n]. | 
|  | func karatsuba(z, x, y nat) { | 
|  | n := len(y) | 
|  |  | 
|  | // Switch to basic multiplication if numbers are odd or small. | 
|  | // (n is always even if karatsubaThreshold is even, but be | 
|  | // conservative) | 
|  | if n&1 != 0 || n < karatsubaThreshold || n < 2 { | 
|  | basicMul(z, x, y) | 
|  | return | 
|  | } | 
|  | // n&1 == 0 && n >= karatsubaThreshold && n >= 2 | 
|  |  | 
|  | // Karatsuba multiplication is based on the observation that | 
|  | // for two numbers x and y with: | 
|  | // | 
|  | //   x = x1*b + x0 | 
|  | //   y = y1*b + y0 | 
|  | // | 
|  | // the product x*y can be obtained with 3 products z2, z1, z0 | 
|  | // instead of 4: | 
|  | // | 
|  | //   x*y = x1*y1*b*b + (x1*y0 + x0*y1)*b + x0*y0 | 
|  | //       =    z2*b*b +              z1*b +    z0 | 
|  | // | 
|  | // with: | 
|  | // | 
|  | //   xd = x1 - x0 | 
|  | //   yd = y0 - y1 | 
|  | // | 
|  | //   z1 =      xd*yd                    + z2 + z0 | 
|  | //      = (x1-x0)*(y0 - y1)             + z2 + z0 | 
|  | //      = x1*y0 - x1*y1 - x0*y0 + x0*y1 + z2 + z0 | 
|  | //      = x1*y0 -    z2 -    z0 + x0*y1 + z2 + z0 | 
|  | //      = x1*y0                 + x0*y1 | 
|  |  | 
|  | // split x, y into "digits" | 
|  | n2 := n >> 1              // n2 >= 1 | 
|  | x1, x0 := x[n2:], x[0:n2] // x = x1*b + y0 | 
|  | y1, y0 := y[n2:], y[0:n2] // y = y1*b + y0 | 
|  |  | 
|  | // z is used for the result and temporary storage: | 
|  | // | 
|  | //   6*n     5*n     4*n     3*n     2*n     1*n     0*n | 
|  | // z = [z2 copy|z0 copy| xd*yd | yd:xd | x1*y1 | x0*y0 ] | 
|  | // | 
|  | // For each recursive call of karatsuba, an unused slice of | 
|  | // z is passed in that has (at least) half the length of the | 
|  | // caller's z. | 
|  |  | 
|  | // compute z0 and z2 with the result "in place" in z | 
|  | karatsuba(z, x0, y0)     // z0 = x0*y0 | 
|  | karatsuba(z[n:], x1, y1) // z2 = x1*y1 | 
|  |  | 
|  | // compute xd (or the negative value if underflow occurs) | 
|  | s := 1 // sign of product xd*yd | 
|  | xd := z[2*n : 2*n+n2] | 
|  | if subVV(xd, x1, x0) != 0 { // x1-x0 | 
|  | s = -s | 
|  | subVV(xd, x0, x1) // x0-x1 | 
|  | } | 
|  |  | 
|  | // compute yd (or the negative value if underflow occurs) | 
|  | yd := z[2*n+n2 : 3*n] | 
|  | if subVV(yd, y0, y1) != 0 { // y0-y1 | 
|  | s = -s | 
|  | subVV(yd, y1, y0) // y1-y0 | 
|  | } | 
|  |  | 
|  | // p = (x1-x0)*(y0-y1) == x1*y0 - x1*y1 - x0*y0 + x0*y1 for s > 0 | 
|  | // p = (x0-x1)*(y0-y1) == x0*y0 - x0*y1 - x1*y0 + x1*y1 for s < 0 | 
|  | p := z[n*3:] | 
|  | karatsuba(p, xd, yd) | 
|  |  | 
|  | // save original z2:z0 | 
|  | // (ok to use upper half of z since we're done recursing) | 
|  | r := z[n*4:] | 
|  | copy(r, z[:n*2]) | 
|  |  | 
|  | // add up all partial products | 
|  | // | 
|  | //   2*n     n     0 | 
|  | // z = [ z2  | z0  ] | 
|  | //   +    [ z0  ] | 
|  | //   +    [ z2  ] | 
|  | //   +    [  p  ] | 
|  | // | 
|  | karatsubaAdd(z[n2:], r, n) | 
|  | karatsubaAdd(z[n2:], r[n:], n) | 
|  | if s > 0 { | 
|  | karatsubaAdd(z[n2:], p, n) | 
|  | } else { | 
|  | karatsubaSub(z[n2:], p, n) | 
|  | } | 
|  | } | 
|  |  | 
|  | // alias reports whether x and y share the same base array. | 
|  | // Note: alias assumes that the capacity of underlying arrays | 
|  | //       is never changed for nat values; i.e. that there are | 
|  | //       no 3-operand slice expressions in this code (or worse, | 
|  | //       reflect-based operations to the same effect). | 
|  | func alias(x, y nat) bool { | 
|  | return cap(x) > 0 && cap(y) > 0 && &x[0:cap(x)][cap(x)-1] == &y[0:cap(y)][cap(y)-1] | 
|  | } | 
|  |  | 
|  | // addAt implements z += x<<(_W*i); z must be long enough. | 
|  | // (we don't use nat.add because we need z to stay the same | 
|  | // slice, and we don't need to normalize z after each addition) | 
|  | func addAt(z, x nat, i int) { | 
|  | if n := len(x); n > 0 { | 
|  | if c := addVV(z[i:i+n], z[i:], x); c != 0 { | 
|  | j := i + n | 
|  | if j < len(z) { | 
|  | addVW(z[j:], z[j:], c) | 
|  | } | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | func max(x, y int) int { | 
|  | if x > y { | 
|  | return x | 
|  | } | 
|  | return y | 
|  | } | 
|  |  | 
|  | // karatsubaLen computes an approximation to the maximum k <= n such that | 
|  | // k = p<<i for a number p <= threshold and an i >= 0. Thus, the | 
|  | // result is the largest number that can be divided repeatedly by 2 before | 
|  | // becoming about the value of threshold. | 
|  | func karatsubaLen(n, threshold int) int { | 
|  | i := uint(0) | 
|  | for n > threshold { | 
|  | n >>= 1 | 
|  | i++ | 
|  | } | 
|  | return n << i | 
|  | } | 
|  |  | 
|  | func (z nat) mul(x, y nat) nat { | 
|  | m := len(x) | 
|  | n := len(y) | 
|  |  | 
|  | switch { | 
|  | case m < n: | 
|  | return z.mul(y, x) | 
|  | case m == 0 || n == 0: | 
|  | return z[:0] | 
|  | case n == 1: | 
|  | return z.mulAddWW(x, y[0], 0) | 
|  | } | 
|  | // m >= n > 1 | 
|  |  | 
|  | // determine if z can be reused | 
|  | if alias(z, x) || alias(z, y) { | 
|  | z = nil // z is an alias for x or y - cannot reuse | 
|  | } | 
|  |  | 
|  | // use basic multiplication if the numbers are small | 
|  | if n < karatsubaThreshold { | 
|  | z = z.make(m + n) | 
|  | basicMul(z, x, y) | 
|  | return z.norm() | 
|  | } | 
|  | // m >= n && n >= karatsubaThreshold && n >= 2 | 
|  |  | 
|  | // determine Karatsuba length k such that | 
|  | // | 
|  | //   x = xh*b + x0  (0 <= x0 < b) | 
|  | //   y = yh*b + y0  (0 <= y0 < b) | 
|  | //   b = 1<<(_W*k)  ("base" of digits xi, yi) | 
|  | // | 
|  | k := karatsubaLen(n, karatsubaThreshold) | 
|  | // k <= n | 
|  |  | 
|  | // multiply x0 and y0 via Karatsuba | 
|  | x0 := x[0:k]              // x0 is not normalized | 
|  | y0 := y[0:k]              // y0 is not normalized | 
|  | z = z.make(max(6*k, m+n)) // enough space for karatsuba of x0*y0 and full result of x*y | 
|  | karatsuba(z, x0, y0) | 
|  | z = z[0 : m+n]  // z has final length but may be incomplete | 
|  | z[2*k:].clear() // upper portion of z is garbage (and 2*k <= m+n since k <= n <= m) | 
|  |  | 
|  | // If xh != 0 or yh != 0, add the missing terms to z. For | 
|  | // | 
|  | //   xh = xi*b^i + ... + x2*b^2 + x1*b (0 <= xi < b) | 
|  | //   yh =                         y1*b (0 <= y1 < b) | 
|  | // | 
|  | // the missing terms are | 
|  | // | 
|  | //   x0*y1*b and xi*y0*b^i, xi*y1*b^(i+1) for i > 0 | 
|  | // | 
|  | // since all the yi for i > 1 are 0 by choice of k: If any of them | 
|  | // were > 0, then yh >= b^2 and thus y >= b^2. Then k' = k*2 would | 
|  | // be a larger valid threshold contradicting the assumption about k. | 
|  | // | 
|  | if k < n || m != n { | 
|  | tp := getNat(3 * k) | 
|  | t := *tp | 
|  |  | 
|  | // add x0*y1*b | 
|  | x0 := x0.norm() | 
|  | y1 := y[k:]       // y1 is normalized because y is | 
|  | t = t.mul(x0, y1) // update t so we don't lose t's underlying array | 
|  | addAt(z, t, k) | 
|  |  | 
|  | // add xi*y0<<i, xi*y1*b<<(i+k) | 
|  | y0 := y0.norm() | 
|  | for i := k; i < len(x); i += k { | 
|  | xi := x[i:] | 
|  | if len(xi) > k { | 
|  | xi = xi[:k] | 
|  | } | 
|  | xi = xi.norm() | 
|  | t = t.mul(xi, y0) | 
|  | addAt(z, t, i) | 
|  | t = t.mul(xi, y1) | 
|  | addAt(z, t, i+k) | 
|  | } | 
|  |  | 
|  | putNat(tp) | 
|  | } | 
|  |  | 
|  | return z.norm() | 
|  | } | 
|  |  | 
|  | // basicSqr sets z = x*x and is asymptotically faster than basicMul | 
|  | // by about a factor of 2, but slower for small arguments due to overhead. | 
|  | // Requirements: len(x) > 0, len(z) == 2*len(x) | 
|  | // The (non-normalized) result is placed in z. | 
|  | func basicSqr(z, x nat) { | 
|  | n := len(x) | 
|  | tp := getNat(2 * n) | 
|  | t := *tp // temporary variable to hold the products | 
|  | t.clear() | 
|  | z[1], z[0] = mulWW(x[0], x[0]) // the initial square | 
|  | for i := 1; i < n; i++ { | 
|  | d := x[i] | 
|  | // z collects the squares x[i] * x[i] | 
|  | z[2*i+1], z[2*i] = mulWW(d, d) | 
|  | // t collects the products x[i] * x[j] where j < i | 
|  | t[2*i] = addMulVVW(t[i:2*i], x[0:i], d) | 
|  | } | 
|  | t[2*n-1] = shlVU(t[1:2*n-1], t[1:2*n-1], 1) // double the j < i products | 
|  | addVV(z, z, t)                              // combine the result | 
|  | putNat(tp) | 
|  | } | 
|  |  | 
|  | // karatsubaSqr squares x and leaves the result in z. | 
|  | // len(x) must be a power of 2 and len(z) >= 6*len(x). | 
|  | // The (non-normalized) result is placed in z[0 : 2*len(x)]. | 
|  | // | 
|  | // The algorithm and the layout of z are the same as for karatsuba. | 
|  | func karatsubaSqr(z, x nat) { | 
|  | n := len(x) | 
|  |  | 
|  | if n&1 != 0 || n < karatsubaSqrThreshold || n < 2 { | 
|  | basicSqr(z[:2*n], x) | 
|  | return | 
|  | } | 
|  |  | 
|  | n2 := n >> 1 | 
|  | x1, x0 := x[n2:], x[0:n2] | 
|  |  | 
|  | karatsubaSqr(z, x0) | 
|  | karatsubaSqr(z[n:], x1) | 
|  |  | 
|  | // s = sign(xd*yd) == -1 for xd != 0; s == 1 for xd == 0 | 
|  | xd := z[2*n : 2*n+n2] | 
|  | if subVV(xd, x1, x0) != 0 { | 
|  | subVV(xd, x0, x1) | 
|  | } | 
|  |  | 
|  | p := z[n*3:] | 
|  | karatsubaSqr(p, xd) | 
|  |  | 
|  | r := z[n*4:] | 
|  | copy(r, z[:n*2]) | 
|  |  | 
|  | karatsubaAdd(z[n2:], r, n) | 
|  | karatsubaAdd(z[n2:], r[n:], n) | 
|  | karatsubaSub(z[n2:], p, n) // s == -1 for p != 0; s == 1 for p == 0 | 
|  | } | 
|  |  | 
|  | // Operands that are shorter than basicSqrThreshold are squared using | 
|  | // "grade school" multiplication; for operands longer than karatsubaSqrThreshold | 
|  | // we use the Karatsuba algorithm optimized for x == y. | 
|  | var basicSqrThreshold = 20      // computed by calibrate_test.go | 
|  | var karatsubaSqrThreshold = 260 // computed by calibrate_test.go | 
|  |  | 
|  | // z = x*x | 
|  | func (z nat) sqr(x nat) nat { | 
|  | n := len(x) | 
|  | switch { | 
|  | case n == 0: | 
|  | return z[:0] | 
|  | case n == 1: | 
|  | d := x[0] | 
|  | z = z.make(2) | 
|  | z[1], z[0] = mulWW(d, d) | 
|  | return z.norm() | 
|  | } | 
|  |  | 
|  | if alias(z, x) { | 
|  | z = nil // z is an alias for x - cannot reuse | 
|  | } | 
|  |  | 
|  | if n < basicSqrThreshold { | 
|  | z = z.make(2 * n) | 
|  | basicMul(z, x, x) | 
|  | return z.norm() | 
|  | } | 
|  | if n < karatsubaSqrThreshold { | 
|  | z = z.make(2 * n) | 
|  | basicSqr(z, x) | 
|  | return z.norm() | 
|  | } | 
|  |  | 
|  | // Use Karatsuba multiplication optimized for x == y. | 
|  | // The algorithm and layout of z are the same as for mul. | 
|  |  | 
|  | // z = (x1*b + x0)^2 = x1^2*b^2 + 2*x1*x0*b + x0^2 | 
|  |  | 
|  | k := karatsubaLen(n, karatsubaSqrThreshold) | 
|  |  | 
|  | x0 := x[0:k] | 
|  | z = z.make(max(6*k, 2*n)) | 
|  | karatsubaSqr(z, x0) // z = x0^2 | 
|  | z = z[0 : 2*n] | 
|  | z[2*k:].clear() | 
|  |  | 
|  | if k < n { | 
|  | tp := getNat(2 * k) | 
|  | t := *tp | 
|  | x0 := x0.norm() | 
|  | x1 := x[k:] | 
|  | t = t.mul(x0, x1) | 
|  | addAt(z, t, k) | 
|  | addAt(z, t, k) // z = 2*x1*x0*b + x0^2 | 
|  | t = t.sqr(x1) | 
|  | addAt(z, t, 2*k) // z = x1^2*b^2 + 2*x1*x0*b + x0^2 | 
|  | putNat(tp) | 
|  | } | 
|  |  | 
|  | return z.norm() | 
|  | } | 
|  |  | 
|  | // mulRange computes the product of all the unsigned integers in the | 
|  | // range [a, b] inclusively. If a > b (empty range), the result is 1. | 
|  | func (z nat) mulRange(a, b uint64) nat { | 
|  | switch { | 
|  | case a == 0: | 
|  | // cut long ranges short (optimization) | 
|  | return z.setUint64(0) | 
|  | case a > b: | 
|  | return z.setUint64(1) | 
|  | case a == b: | 
|  | return z.setUint64(a) | 
|  | case a+1 == b: | 
|  | return z.mul(nat(nil).setUint64(a), nat(nil).setUint64(b)) | 
|  | } | 
|  | m := (a + b) / 2 | 
|  | return z.mul(nat(nil).mulRange(a, m), nat(nil).mulRange(m+1, b)) | 
|  | } | 
|  |  | 
|  | // q = (x-r)/y, with 0 <= r < y | 
|  | func (z nat) divW(x nat, y Word) (q nat, r Word) { | 
|  | m := len(x) | 
|  | switch { | 
|  | case y == 0: | 
|  | panic("division by zero") | 
|  | case y == 1: | 
|  | q = z.set(x) // result is x | 
|  | return | 
|  | case m == 0: | 
|  | q = z[:0] // result is 0 | 
|  | return | 
|  | } | 
|  | // m > 0 | 
|  | z = z.make(m) | 
|  | r = divWVW(z, 0, x, y) | 
|  | q = z.norm() | 
|  | return | 
|  | } | 
|  |  | 
|  | func (z nat) div(z2, u, v nat) (q, r nat) { | 
|  | if len(v) == 0 { | 
|  | panic("division by zero") | 
|  | } | 
|  |  | 
|  | if u.cmp(v) < 0 { | 
|  | q = z[:0] | 
|  | r = z2.set(u) | 
|  | return | 
|  | } | 
|  |  | 
|  | if len(v) == 1 { | 
|  | var r2 Word | 
|  | q, r2 = z.divW(u, v[0]) | 
|  | r = z2.setWord(r2) | 
|  | return | 
|  | } | 
|  |  | 
|  | q, r = z.divLarge(z2, u, v) | 
|  | return | 
|  | } | 
|  |  | 
|  | // getNat returns a *nat of len n. The contents may not be zero. | 
|  | // The pool holds *nat to avoid allocation when converting to interface{}. | 
|  | func getNat(n int) *nat { | 
|  | var z *nat | 
|  | if v := natPool.Get(); v != nil { | 
|  | z = v.(*nat) | 
|  | } | 
|  | if z == nil { | 
|  | z = new(nat) | 
|  | } | 
|  | *z = z.make(n) | 
|  | return z | 
|  | } | 
|  |  | 
|  | func putNat(x *nat) { | 
|  | natPool.Put(x) | 
|  | } | 
|  |  | 
|  | var natPool sync.Pool | 
|  |  | 
|  | // q = (uIn-r)/vIn, with 0 <= r < vIn | 
|  | // Uses z as storage for q, and u as storage for r if possible. | 
|  | // See Knuth, Volume 2, section 4.3.1, Algorithm D. | 
|  | // Preconditions: | 
|  | //    len(vIn) >= 2 | 
|  | //    len(uIn) >= len(vIn) | 
|  | //    u must not alias z | 
|  | func (z nat) divLarge(u, uIn, vIn nat) (q, r nat) { | 
|  | n := len(vIn) | 
|  | m := len(uIn) - n | 
|  |  | 
|  | // D1. | 
|  | shift := nlz(vIn[n-1]) | 
|  | // do not modify vIn, it may be used by another goroutine simultaneously | 
|  | vp := getNat(n) | 
|  | v := *vp | 
|  | shlVU(v, vIn, shift) | 
|  |  | 
|  | // u may safely alias uIn or vIn, the value of uIn is used to set u and vIn was already used | 
|  | u = u.make(len(uIn) + 1) | 
|  | u[len(uIn)] = shlVU(u[0:len(uIn)], uIn, shift) | 
|  |  | 
|  | // z may safely alias uIn or vIn, both values were used already | 
|  | if alias(z, u) { | 
|  | z = nil // z is an alias for u - cannot reuse | 
|  | } | 
|  | q = z.make(m + 1) | 
|  |  | 
|  | if n < divRecursiveThreshold { | 
|  | q.divBasic(u, v) | 
|  | } else { | 
|  | q.divRecursive(u, v) | 
|  | } | 
|  | putNat(vp) | 
|  |  | 
|  | q = q.norm() | 
|  | shrVU(u, u, shift) | 
|  | r = u.norm() | 
|  |  | 
|  | return q, r | 
|  | } | 
|  |  | 
|  | // divBasic performs word-by-word division of u by v. | 
|  | // The quotient is written in pre-allocated q. | 
|  | // The remainder overwrites input u. | 
|  | // | 
|  | // Precondition: | 
|  | // - len(q) >= len(u)-len(v) | 
|  | func (q nat) divBasic(u, v nat) { | 
|  | n := len(v) | 
|  | m := len(u) - n | 
|  |  | 
|  | qhatvp := getNat(n + 1) | 
|  | qhatv := *qhatvp | 
|  |  | 
|  | // D2. | 
|  | vn1 := v[n-1] | 
|  | for j := m; j >= 0; j-- { | 
|  | // D3. | 
|  | qhat := Word(_M) | 
|  | var ujn Word | 
|  | if j+n < len(u) { | 
|  | ujn = u[j+n] | 
|  | } | 
|  | if ujn != vn1 { | 
|  | var rhat Word | 
|  | qhat, rhat = divWW(ujn, u[j+n-1], vn1) | 
|  |  | 
|  | // x1 | x2 = q̂v_{n-2} | 
|  | vn2 := v[n-2] | 
|  | x1, x2 := mulWW(qhat, vn2) | 
|  | // test if q̂v_{n-2} > br̂ + u_{j+n-2} | 
|  | ujn2 := u[j+n-2] | 
|  | for greaterThan(x1, x2, rhat, ujn2) { | 
|  | qhat-- | 
|  | prevRhat := rhat | 
|  | rhat += vn1 | 
|  | // v[n-1] >= 0, so this tests for overflow. | 
|  | if rhat < prevRhat { | 
|  | break | 
|  | } | 
|  | x1, x2 = mulWW(qhat, vn2) | 
|  | } | 
|  | } | 
|  |  | 
|  | // D4. | 
|  | qhatv[n] = mulAddVWW(qhatv[0:n], v, qhat, 0) | 
|  | qhl := len(qhatv) | 
|  | if j+qhl > len(u) && qhatv[n] == 0 { | 
|  | qhl-- | 
|  | } | 
|  | c := subVV(u[j:j+qhl], u[j:], qhatv) | 
|  | if c != 0 { | 
|  | c := addVV(u[j:j+n], u[j:], v) | 
|  | u[j+n] += c | 
|  | qhat-- | 
|  | } | 
|  |  | 
|  | if j == m && m == len(q) && qhat == 0 { | 
|  | continue | 
|  | } | 
|  | q[j] = qhat | 
|  | } | 
|  |  | 
|  | putNat(qhatvp) | 
|  | } | 
|  |  | 
|  | const divRecursiveThreshold = 100 | 
|  |  | 
|  | // divRecursive performs word-by-word division of u by v. | 
|  | // The quotient is written in pre-allocated z. | 
|  | // The remainder overwrites input u. | 
|  | // | 
|  | // Precondition: | 
|  | // - len(z) >= len(u)-len(v) | 
|  | // | 
|  | // See Burnikel, Ziegler, "Fast Recursive Division", Algorithm 1 and 2. | 
|  | func (z nat) divRecursive(u, v nat) { | 
|  | // Recursion depth is less than 2 log2(len(v)) | 
|  | // Allocate a slice of temporaries to be reused across recursion. | 
|  | recDepth := 2 * bits.Len(uint(len(v))) | 
|  | // large enough to perform Karatsuba on operands as large as v | 
|  | tmp := getNat(3 * len(v)) | 
|  | temps := make([]*nat, recDepth) | 
|  | z.clear() | 
|  | z.divRecursiveStep(u, v, 0, tmp, temps) | 
|  | for _, n := range temps { | 
|  | if n != nil { | 
|  | putNat(n) | 
|  | } | 
|  | } | 
|  | putNat(tmp) | 
|  | } | 
|  |  | 
|  | func (z nat) divRecursiveStep(u, v nat, depth int, tmp *nat, temps []*nat) { | 
|  | u = u.norm() | 
|  | v = v.norm() | 
|  |  | 
|  | if len(u) == 0 { | 
|  | z.clear() | 
|  | return | 
|  | } | 
|  | n := len(v) | 
|  | if n < divRecursiveThreshold { | 
|  | z.divBasic(u, v) | 
|  | return | 
|  | } | 
|  | m := len(u) - n | 
|  | if m < 0 { | 
|  | return | 
|  | } | 
|  |  | 
|  | // Produce the quotient by blocks of B words. | 
|  | // Division by v (length n) is done using a length n/2 division | 
|  | // and a length n/2 multiplication for each block. The final | 
|  | // complexity is driven by multiplication complexity. | 
|  | B := n / 2 | 
|  |  | 
|  | // Allocate a nat for qhat below. | 
|  | if temps[depth] == nil { | 
|  | temps[depth] = getNat(n) | 
|  | } else { | 
|  | *temps[depth] = temps[depth].make(B + 1) | 
|  | } | 
|  |  | 
|  | j := m | 
|  | for j > B { | 
|  | // Divide u[j-B:j+n] by vIn. Keep remainder in u | 
|  | // for next block. | 
|  | // | 
|  | // The following property will be used (Lemma 2): | 
|  | // if u = u1 << s + u0 | 
|  | //    v = v1 << s + v0 | 
|  | // then floor(u1/v1) >= floor(u/v) | 
|  | // | 
|  | // Moreover, the difference is at most 2 if len(v1) >= len(u/v) | 
|  | // We choose s = B-1 since len(v)-B >= B+1 >= len(u/v) | 
|  | s := (B - 1) | 
|  | // Except for the first step, the top bits are always | 
|  | // a division remainder, so the quotient length is <= n. | 
|  | uu := u[j-B:] | 
|  |  | 
|  | qhat := *temps[depth] | 
|  | qhat.clear() | 
|  | qhat.divRecursiveStep(uu[s:B+n], v[s:], depth+1, tmp, temps) | 
|  | qhat = qhat.norm() | 
|  | // Adjust the quotient: | 
|  | //    u = u_h << s + u_l | 
|  | //    v = v_h << s + v_l | 
|  | //  u_h = q̂ v_h + rh | 
|  | //    u = q̂ (v - v_l) + rh << s + u_l | 
|  | // After the above step, u contains a remainder: | 
|  | //    u = rh << s + u_l | 
|  | // and we need to subtract q̂ v_l | 
|  | // | 
|  | // But it may be a bit too large, in which case q̂ needs to be smaller. | 
|  | qhatv := tmp.make(3 * n) | 
|  | qhatv.clear() | 
|  | qhatv = qhatv.mul(qhat, v[:s]) | 
|  | for i := 0; i < 2; i++ { | 
|  | e := qhatv.cmp(uu.norm()) | 
|  | if e <= 0 { | 
|  | break | 
|  | } | 
|  | subVW(qhat, qhat, 1) | 
|  | c := subVV(qhatv[:s], qhatv[:s], v[:s]) | 
|  | if len(qhatv) > s { | 
|  | subVW(qhatv[s:], qhatv[s:], c) | 
|  | } | 
|  | addAt(uu[s:], v[s:], 0) | 
|  | } | 
|  | if qhatv.cmp(uu.norm()) > 0 { | 
|  | panic("impossible") | 
|  | } | 
|  | c := subVV(uu[:len(qhatv)], uu[:len(qhatv)], qhatv) | 
|  | if c > 0 { | 
|  | subVW(uu[len(qhatv):], uu[len(qhatv):], c) | 
|  | } | 
|  | addAt(z, qhat, j-B) | 
|  | j -= B | 
|  | } | 
|  |  | 
|  | // Now u < (v<<B), compute lower bits in the same way. | 
|  | // Choose shift = B-1 again. | 
|  | s := B | 
|  | qhat := *temps[depth] | 
|  | qhat.clear() | 
|  | qhat.divRecursiveStep(u[s:].norm(), v[s:], depth+1, tmp, temps) | 
|  | qhat = qhat.norm() | 
|  | qhatv := tmp.make(3 * n) | 
|  | qhatv.clear() | 
|  | qhatv = qhatv.mul(qhat, v[:s]) | 
|  | // Set the correct remainder as before. | 
|  | for i := 0; i < 2; i++ { | 
|  | if e := qhatv.cmp(u.norm()); e > 0 { | 
|  | subVW(qhat, qhat, 1) | 
|  | c := subVV(qhatv[:s], qhatv[:s], v[:s]) | 
|  | if len(qhatv) > s { | 
|  | subVW(qhatv[s:], qhatv[s:], c) | 
|  | } | 
|  | addAt(u[s:], v[s:], 0) | 
|  | } | 
|  | } | 
|  | if qhatv.cmp(u.norm()) > 0 { | 
|  | panic("impossible") | 
|  | } | 
|  | c := subVV(u[0:len(qhatv)], u[0:len(qhatv)], qhatv) | 
|  | if c > 0 { | 
|  | c = subVW(u[len(qhatv):], u[len(qhatv):], c) | 
|  | } | 
|  | if c > 0 { | 
|  | panic("impossible") | 
|  | } | 
|  |  | 
|  | // Done! | 
|  | addAt(z, qhat.norm(), 0) | 
|  | } | 
|  |  | 
|  | // Length of x in bits. x must be normalized. | 
|  | func (x nat) bitLen() int { | 
|  | if i := len(x) - 1; i >= 0 { | 
|  | return i*_W + bits.Len(uint(x[i])) | 
|  | } | 
|  | return 0 | 
|  | } | 
|  |  | 
|  | // trailingZeroBits returns the number of consecutive least significant zero | 
|  | // bits of x. | 
|  | func (x nat) trailingZeroBits() uint { | 
|  | if len(x) == 0 { | 
|  | return 0 | 
|  | } | 
|  | var i uint | 
|  | for x[i] == 0 { | 
|  | i++ | 
|  | } | 
|  | // x[i] != 0 | 
|  | return i*_W + uint(bits.TrailingZeros(uint(x[i]))) | 
|  | } | 
|  |  | 
|  | func same(x, y nat) bool { | 
|  | return len(x) == len(y) && len(x) > 0 && &x[0] == &y[0] | 
|  | } | 
|  |  | 
|  | // z = x << s | 
|  | func (z nat) shl(x nat, s uint) nat { | 
|  | if s == 0 { | 
|  | if same(z, x) { | 
|  | return z | 
|  | } | 
|  | if !alias(z, x) { | 
|  | return z.set(x) | 
|  | } | 
|  | } | 
|  |  | 
|  | m := len(x) | 
|  | if m == 0 { | 
|  | return z[:0] | 
|  | } | 
|  | // m > 0 | 
|  |  | 
|  | n := m + int(s/_W) | 
|  | z = z.make(n + 1) | 
|  | z[n] = shlVU(z[n-m:n], x, s%_W) | 
|  | z[0 : n-m].clear() | 
|  |  | 
|  | return z.norm() | 
|  | } | 
|  |  | 
|  | // z = x >> s | 
|  | func (z nat) shr(x nat, s uint) nat { | 
|  | if s == 0 { | 
|  | if same(z, x) { | 
|  | return z | 
|  | } | 
|  | if !alias(z, x) { | 
|  | return z.set(x) | 
|  | } | 
|  | } | 
|  |  | 
|  | m := len(x) | 
|  | n := m - int(s/_W) | 
|  | if n <= 0 { | 
|  | return z[:0] | 
|  | } | 
|  | // n > 0 | 
|  |  | 
|  | z = z.make(n) | 
|  | shrVU(z, x[m-n:], s%_W) | 
|  |  | 
|  | return z.norm() | 
|  | } | 
|  |  | 
|  | func (z nat) setBit(x nat, i uint, b uint) nat { | 
|  | j := int(i / _W) | 
|  | m := Word(1) << (i % _W) | 
|  | n := len(x) | 
|  | switch b { | 
|  | case 0: | 
|  | z = z.make(n) | 
|  | copy(z, x) | 
|  | if j >= n { | 
|  | // no need to grow | 
|  | return z | 
|  | } | 
|  | z[j] &^= m | 
|  | return z.norm() | 
|  | case 1: | 
|  | if j >= n { | 
|  | z = z.make(j + 1) | 
|  | z[n:].clear() | 
|  | } else { | 
|  | z = z.make(n) | 
|  | } | 
|  | copy(z, x) | 
|  | z[j] |= m | 
|  | // no need to normalize | 
|  | return z | 
|  | } | 
|  | panic("set bit is not 0 or 1") | 
|  | } | 
|  |  | 
|  | // bit returns the value of the i'th bit, with lsb == bit 0. | 
|  | func (x nat) bit(i uint) uint { | 
|  | j := i / _W | 
|  | if j >= uint(len(x)) { | 
|  | return 0 | 
|  | } | 
|  | // 0 <= j < len(x) | 
|  | return uint(x[j] >> (i % _W) & 1) | 
|  | } | 
|  |  | 
|  | // sticky returns 1 if there's a 1 bit within the | 
|  | // i least significant bits, otherwise it returns 0. | 
|  | func (x nat) sticky(i uint) uint { | 
|  | j := i / _W | 
|  | if j >= uint(len(x)) { | 
|  | if len(x) == 0 { | 
|  | return 0 | 
|  | } | 
|  | return 1 | 
|  | } | 
|  | // 0 <= j < len(x) | 
|  | for _, x := range x[:j] { | 
|  | if x != 0 { | 
|  | return 1 | 
|  | } | 
|  | } | 
|  | if x[j]<<(_W-i%_W) != 0 { | 
|  | return 1 | 
|  | } | 
|  | return 0 | 
|  | } | 
|  |  | 
|  | func (z nat) and(x, y nat) nat { | 
|  | m := len(x) | 
|  | n := len(y) | 
|  | if m > n { | 
|  | m = n | 
|  | } | 
|  | // m <= n | 
|  |  | 
|  | z = z.make(m) | 
|  | for i := 0; i < m; i++ { | 
|  | z[i] = x[i] & y[i] | 
|  | } | 
|  |  | 
|  | return z.norm() | 
|  | } | 
|  |  | 
|  | func (z nat) andNot(x, y nat) nat { | 
|  | m := len(x) | 
|  | n := len(y) | 
|  | if n > m { | 
|  | n = m | 
|  | } | 
|  | // m >= n | 
|  |  | 
|  | z = z.make(m) | 
|  | for i := 0; i < n; i++ { | 
|  | z[i] = x[i] &^ y[i] | 
|  | } | 
|  | copy(z[n:m], x[n:m]) | 
|  |  | 
|  | return z.norm() | 
|  | } | 
|  |  | 
|  | func (z nat) or(x, y nat) nat { | 
|  | m := len(x) | 
|  | n := len(y) | 
|  | s := x | 
|  | if m < n { | 
|  | n, m = m, n | 
|  | s = y | 
|  | } | 
|  | // m >= n | 
|  |  | 
|  | z = z.make(m) | 
|  | for i := 0; i < n; i++ { | 
|  | z[i] = x[i] | y[i] | 
|  | } | 
|  | copy(z[n:m], s[n:m]) | 
|  |  | 
|  | return z.norm() | 
|  | } | 
|  |  | 
|  | func (z nat) xor(x, y nat) nat { | 
|  | m := len(x) | 
|  | n := len(y) | 
|  | s := x | 
|  | if m < n { | 
|  | n, m = m, n | 
|  | s = y | 
|  | } | 
|  | // m >= n | 
|  |  | 
|  | z = z.make(m) | 
|  | for i := 0; i < n; i++ { | 
|  | z[i] = x[i] ^ y[i] | 
|  | } | 
|  | copy(z[n:m], s[n:m]) | 
|  |  | 
|  | return z.norm() | 
|  | } | 
|  |  | 
|  | // greaterThan reports whether (x1<<_W + x2) > (y1<<_W + y2) | 
|  | func greaterThan(x1, x2, y1, y2 Word) bool { | 
|  | return x1 > y1 || x1 == y1 && x2 > y2 | 
|  | } | 
|  |  | 
|  | // modW returns x % d. | 
|  | func (x nat) modW(d Word) (r Word) { | 
|  | // TODO(agl): we don't actually need to store the q value. | 
|  | var q nat | 
|  | q = q.make(len(x)) | 
|  | return divWVW(q, 0, x, d) | 
|  | } | 
|  |  | 
|  | // random creates a random integer in [0..limit), using the space in z if | 
|  | // possible. n is the bit length of limit. | 
|  | func (z nat) random(rand *rand.Rand, limit nat, n int) nat { | 
|  | if alias(z, limit) { | 
|  | z = nil // z is an alias for limit - cannot reuse | 
|  | } | 
|  | z = z.make(len(limit)) | 
|  |  | 
|  | bitLengthOfMSW := uint(n % _W) | 
|  | if bitLengthOfMSW == 0 { | 
|  | bitLengthOfMSW = _W | 
|  | } | 
|  | mask := Word((1 << bitLengthOfMSW) - 1) | 
|  |  | 
|  | for { | 
|  | switch _W { | 
|  | case 32: | 
|  | for i := range z { | 
|  | z[i] = Word(rand.Uint32()) | 
|  | } | 
|  | case 64: | 
|  | for i := range z { | 
|  | z[i] = Word(rand.Uint32()) | Word(rand.Uint32())<<32 | 
|  | } | 
|  | default: | 
|  | panic("unknown word size") | 
|  | } | 
|  | z[len(limit)-1] &= mask | 
|  | if z.cmp(limit) < 0 { | 
|  | break | 
|  | } | 
|  | } | 
|  |  | 
|  | return z.norm() | 
|  | } | 
|  |  | 
|  | // If m != 0 (i.e., len(m) != 0), expNN sets z to x**y mod m; | 
|  | // otherwise it sets z to x**y. The result is the value of z. | 
|  | func (z nat) expNN(x, y, m nat) nat { | 
|  | if alias(z, x) || alias(z, y) { | 
|  | // We cannot allow in-place modification of x or y. | 
|  | z = nil | 
|  | } | 
|  |  | 
|  | // x**y mod 1 == 0 | 
|  | if len(m) == 1 && m[0] == 1 { | 
|  | return z.setWord(0) | 
|  | } | 
|  | // m == 0 || m > 1 | 
|  |  | 
|  | // x**0 == 1 | 
|  | if len(y) == 0 { | 
|  | return z.setWord(1) | 
|  | } | 
|  | // y > 0 | 
|  |  | 
|  | // x**1 mod m == x mod m | 
|  | if len(y) == 1 && y[0] == 1 && len(m) != 0 { | 
|  | _, z = nat(nil).div(z, x, m) | 
|  | return z | 
|  | } | 
|  | // y > 1 | 
|  |  | 
|  | if len(m) != 0 { | 
|  | // We likely end up being as long as the modulus. | 
|  | z = z.make(len(m)) | 
|  | } | 
|  | z = z.set(x) | 
|  |  | 
|  | // If the base is non-trivial and the exponent is large, we use | 
|  | // 4-bit, windowed exponentiation. This involves precomputing 14 values | 
|  | // (x^2...x^15) but then reduces the number of multiply-reduces by a | 
|  | // third. Even for a 32-bit exponent, this reduces the number of | 
|  | // operations. Uses Montgomery method for odd moduli. | 
|  | if x.cmp(natOne) > 0 && len(y) > 1 && len(m) > 0 { | 
|  | if m[0]&1 == 1 { | 
|  | return z.expNNMontgomery(x, y, m) | 
|  | } | 
|  | return z.expNNWindowed(x, y, m) | 
|  | } | 
|  |  | 
|  | v := y[len(y)-1] // v > 0 because y is normalized and y > 0 | 
|  | shift := nlz(v) + 1 | 
|  | v <<= shift | 
|  | var q nat | 
|  |  | 
|  | const mask = 1 << (_W - 1) | 
|  |  | 
|  | // We walk through the bits of the exponent one by one. Each time we | 
|  | // see a bit, we square, thus doubling the power. If the bit is a one, | 
|  | // we also multiply by x, thus adding one to the power. | 
|  |  | 
|  | w := _W - int(shift) | 
|  | // zz and r are used to avoid allocating in mul and div as | 
|  | // otherwise the arguments would alias. | 
|  | var zz, r nat | 
|  | for j := 0; j < w; j++ { | 
|  | zz = zz.sqr(z) | 
|  | zz, z = z, zz | 
|  |  | 
|  | if v&mask != 0 { | 
|  | zz = zz.mul(z, x) | 
|  | zz, z = z, zz | 
|  | } | 
|  |  | 
|  | if len(m) != 0 { | 
|  | zz, r = zz.div(r, z, m) | 
|  | zz, r, q, z = q, z, zz, r | 
|  | } | 
|  |  | 
|  | v <<= 1 | 
|  | } | 
|  |  | 
|  | for i := len(y) - 2; i >= 0; i-- { | 
|  | v = y[i] | 
|  |  | 
|  | for j := 0; j < _W; j++ { | 
|  | zz = zz.sqr(z) | 
|  | zz, z = z, zz | 
|  |  | 
|  | if v&mask != 0 { | 
|  | zz = zz.mul(z, x) | 
|  | zz, z = z, zz | 
|  | } | 
|  |  | 
|  | if len(m) != 0 { | 
|  | zz, r = zz.div(r, z, m) | 
|  | zz, r, q, z = q, z, zz, r | 
|  | } | 
|  |  | 
|  | v <<= 1 | 
|  | } | 
|  | } | 
|  |  | 
|  | return z.norm() | 
|  | } | 
|  |  | 
|  | // expNNWindowed calculates x**y mod m using a fixed, 4-bit window. | 
|  | func (z nat) expNNWindowed(x, y, m nat) nat { | 
|  | // zz and r are used to avoid allocating in mul and div as otherwise | 
|  | // the arguments would alias. | 
|  | var zz, r nat | 
|  |  | 
|  | const n = 4 | 
|  | // powers[i] contains x^i. | 
|  | var powers [1 << n]nat | 
|  | powers[0] = natOne | 
|  | powers[1] = x | 
|  | for i := 2; i < 1<<n; i += 2 { | 
|  | p2, p, p1 := &powers[i/2], &powers[i], &powers[i+1] | 
|  | *p = p.sqr(*p2) | 
|  | zz, r = zz.div(r, *p, m) | 
|  | *p, r = r, *p | 
|  | *p1 = p1.mul(*p, x) | 
|  | zz, r = zz.div(r, *p1, m) | 
|  | *p1, r = r, *p1 | 
|  | } | 
|  |  | 
|  | z = z.setWord(1) | 
|  |  | 
|  | for i := len(y) - 1; i >= 0; i-- { | 
|  | yi := y[i] | 
|  | for j := 0; j < _W; j += n { | 
|  | if i != len(y)-1 || j != 0 { | 
|  | // Unrolled loop for significant performance | 
|  | // gain. Use go test -bench=".*" in crypto/rsa | 
|  | // to check performance before making changes. | 
|  | zz = zz.sqr(z) | 
|  | zz, z = z, zz | 
|  | zz, r = zz.div(r, z, m) | 
|  | z, r = r, z | 
|  |  | 
|  | zz = zz.sqr(z) | 
|  | zz, z = z, zz | 
|  | zz, r = zz.div(r, z, m) | 
|  | z, r = r, z | 
|  |  | 
|  | zz = zz.sqr(z) | 
|  | zz, z = z, zz | 
|  | zz, r = zz.div(r, z, m) | 
|  | z, r = r, z | 
|  |  | 
|  | zz = zz.sqr(z) | 
|  | zz, z = z, zz | 
|  | zz, r = zz.div(r, z, m) | 
|  | z, r = r, z | 
|  | } | 
|  |  | 
|  | zz = zz.mul(z, powers[yi>>(_W-n)]) | 
|  | zz, z = z, zz | 
|  | zz, r = zz.div(r, z, m) | 
|  | z, r = r, z | 
|  |  | 
|  | yi <<= n | 
|  | } | 
|  | } | 
|  |  | 
|  | return z.norm() | 
|  | } | 
|  |  | 
|  | // expNNMontgomery calculates x**y mod m using a fixed, 4-bit window. | 
|  | // Uses Montgomery representation. | 
|  | func (z nat) expNNMontgomery(x, y, m nat) nat { | 
|  | numWords := len(m) | 
|  |  | 
|  | // We want the lengths of x and m to be equal. | 
|  | // It is OK if x >= m as long as len(x) == len(m). | 
|  | if len(x) > numWords { | 
|  | _, x = nat(nil).div(nil, x, m) | 
|  | // Note: now len(x) <= numWords, not guaranteed ==. | 
|  | } | 
|  | if len(x) < numWords { | 
|  | rr := make(nat, numWords) | 
|  | copy(rr, x) | 
|  | x = rr | 
|  | } | 
|  |  | 
|  | // Ideally the precomputations would be performed outside, and reused | 
|  | // k0 = -m**-1 mod 2**_W. Algorithm from: Dumas, J.G. "On Newton–Raphson | 
|  | // Iteration for Multiplicative Inverses Modulo Prime Powers". | 
|  | k0 := 2 - m[0] | 
|  | t := m[0] - 1 | 
|  | for i := 1; i < _W; i <<= 1 { | 
|  | t *= t | 
|  | k0 *= (t + 1) | 
|  | } | 
|  | k0 = -k0 | 
|  |  | 
|  | // RR = 2**(2*_W*len(m)) mod m | 
|  | RR := nat(nil).setWord(1) | 
|  | zz := nat(nil).shl(RR, uint(2*numWords*_W)) | 
|  | _, RR = nat(nil).div(RR, zz, m) | 
|  | if len(RR) < numWords { | 
|  | zz = zz.make(numWords) | 
|  | copy(zz, RR) | 
|  | RR = zz | 
|  | } | 
|  | // one = 1, with equal length to that of m | 
|  | one := make(nat, numWords) | 
|  | one[0] = 1 | 
|  |  | 
|  | const n = 4 | 
|  | // powers[i] contains x^i | 
|  | var powers [1 << n]nat | 
|  | powers[0] = powers[0].montgomery(one, RR, m, k0, numWords) | 
|  | powers[1] = powers[1].montgomery(x, RR, m, k0, numWords) | 
|  | for i := 2; i < 1<<n; i++ { | 
|  | powers[i] = powers[i].montgomery(powers[i-1], powers[1], m, k0, numWords) | 
|  | } | 
|  |  | 
|  | // initialize z = 1 (Montgomery 1) | 
|  | z = z.make(numWords) | 
|  | copy(z, powers[0]) | 
|  |  | 
|  | zz = zz.make(numWords) | 
|  |  | 
|  | // same windowed exponent, but with Montgomery multiplications | 
|  | for i := len(y) - 1; i >= 0; i-- { | 
|  | yi := y[i] | 
|  | for j := 0; j < _W; j += n { | 
|  | if i != len(y)-1 || j != 0 { | 
|  | zz = zz.montgomery(z, z, m, k0, numWords) | 
|  | z = z.montgomery(zz, zz, m, k0, numWords) | 
|  | zz = zz.montgomery(z, z, m, k0, numWords) | 
|  | z = z.montgomery(zz, zz, m, k0, numWords) | 
|  | } | 
|  | zz = zz.montgomery(z, powers[yi>>(_W-n)], m, k0, numWords) | 
|  | z, zz = zz, z | 
|  | yi <<= n | 
|  | } | 
|  | } | 
|  | // convert to regular number | 
|  | zz = zz.montgomery(z, one, m, k0, numWords) | 
|  |  | 
|  | // One last reduction, just in case. | 
|  | // See golang.org/issue/13907. | 
|  | if zz.cmp(m) >= 0 { | 
|  | // Common case is m has high bit set; in that case, | 
|  | // since zz is the same length as m, there can be just | 
|  | // one multiple of m to remove. Just subtract. | 
|  | // We think that the subtract should be sufficient in general, | 
|  | // so do that unconditionally, but double-check, | 
|  | // in case our beliefs are wrong. | 
|  | // The div is not expected to be reached. | 
|  | zz = zz.sub(zz, m) | 
|  | if zz.cmp(m) >= 0 { | 
|  | _, zz = nat(nil).div(nil, zz, m) | 
|  | } | 
|  | } | 
|  |  | 
|  | return zz.norm() | 
|  | } | 
|  |  | 
|  | // bytes writes the value of z into buf using big-endian encoding. | 
|  | // len(buf) must be >= len(z)*_S. The value of z is encoded in the | 
|  | // slice buf[i:]. The number i of unused bytes at the beginning of | 
|  | // buf is returned as result. | 
|  | func (z nat) bytes(buf []byte) (i int) { | 
|  | i = len(buf) | 
|  | for _, d := range z { | 
|  | for j := 0; j < _S; j++ { | 
|  | i-- | 
|  | buf[i] = byte(d) | 
|  | d >>= 8 | 
|  | } | 
|  | } | 
|  |  | 
|  | for i < len(buf) && buf[i] == 0 { | 
|  | i++ | 
|  | } | 
|  |  | 
|  | return | 
|  | } | 
|  |  | 
|  | // bigEndianWord returns the contents of buf interpreted as a big-endian encoded Word value. | 
|  | func bigEndianWord(buf []byte) Word { | 
|  | if _W == 64 { | 
|  | return Word(binary.BigEndian.Uint64(buf)) | 
|  | } | 
|  | return Word(binary.BigEndian.Uint32(buf)) | 
|  | } | 
|  |  | 
|  | // setBytes interprets buf as the bytes of a big-endian unsigned | 
|  | // integer, sets z to that value, and returns z. | 
|  | func (z nat) setBytes(buf []byte) nat { | 
|  | z = z.make((len(buf) + _S - 1) / _S) | 
|  |  | 
|  | i := len(buf) | 
|  | for k := 0; i >= _S; k++ { | 
|  | z[k] = bigEndianWord(buf[i-_S : i]) | 
|  | i -= _S | 
|  | } | 
|  | if i > 0 { | 
|  | var d Word | 
|  | for s := uint(0); i > 0; s += 8 { | 
|  | d |= Word(buf[i-1]) << s | 
|  | i-- | 
|  | } | 
|  | z[len(z)-1] = d | 
|  | } | 
|  |  | 
|  | return z.norm() | 
|  | } | 
|  |  | 
|  | // sqrt sets z = ⌊√x⌋ | 
|  | func (z nat) sqrt(x nat) nat { | 
|  | if x.cmp(natOne) <= 0 { | 
|  | return z.set(x) | 
|  | } | 
|  | if alias(z, x) { | 
|  | z = nil | 
|  | } | 
|  |  | 
|  | // Start with value known to be too large and repeat "z = ⌊(z + ⌊x/z⌋)/2⌋" until it stops getting smaller. | 
|  | // See Brent and Zimmermann, Modern Computer Arithmetic, Algorithm 1.13 (SqrtInt). | 
|  | // https://members.loria.fr/PZimmermann/mca/pub226.html | 
|  | // If x is one less than a perfect square, the sequence oscillates between the correct z and z+1; | 
|  | // otherwise it converges to the correct z and stays there. | 
|  | var z1, z2 nat | 
|  | z1 = z | 
|  | z1 = z1.setUint64(1) | 
|  | z1 = z1.shl(z1, uint(x.bitLen()+1)/2) // must be ≥ √x | 
|  | for n := 0; ; n++ { | 
|  | z2, _ = z2.div(nil, x, z1) | 
|  | z2 = z2.add(z2, z1) | 
|  | z2 = z2.shr(z2, 1) | 
|  | if z2.cmp(z1) >= 0 { | 
|  | // z1 is answer. | 
|  | // Figure out whether z1 or z2 is currently aliased to z by looking at loop count. | 
|  | if n&1 == 0 { | 
|  | return z1 | 
|  | } | 
|  | return z.set(z1) | 
|  | } | 
|  | z1, z2 = z2, z1 | 
|  | } | 
|  | } |