internal/poly1305/sum_generic.go - crypto - Git at Google

 // Copyright 2018 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

 // This file provides the generic implementation of Sum and MAC. Other files
 // might provide optimized assembly implementations of some of this code.

 package poly1305

 import "encoding/binary"

 // Poly1305 [RFC 7539] is a relatively simple algorithm: the authentication tag
 // for a 64-byte message is approximately
 //
 //     s + m[0:16] * r⁴ + m[16:32] * r³ + m[32:48] * r² + m[48:64] * r  mod  2¹³⁰ - 5
 //
 // for some secret r and s. It can be computed sequentially like
 //
 //     for len(msg) > 0:
 //         h += read(msg, 16)
 //         h *= r
 //         h %= 2¹³⁰ - 5
 //     return h + s
 //
 // All the complexity is about doing performant constant-time math on numbers
 // larger than any available numeric type.

 // sumGeneric computes the Poly1305 tag of msg under key in one shot and
 // stores it in out. It sets up a fresh macGeneric from key, feeds it the
 // whole message with Write, and emits the tag with Sum.
 func sumGeneric(out *[TagSize]byte, msg []byte, key *[32]byte) {
 	var mac macGeneric
 	initialize(key, &mac.macState)
 	mac.Write(msg)
 	mac.Sum(out)
 }

 // newMACGeneric returns a macGeneric whose r and s have been loaded from key
 // by initialize. The accumulator and the chunk buffer start out zeroed.
 func newMACGeneric(key *[32]byte) macGeneric {
 	var mac macGeneric
 	initialize(key, &mac.macState)
 	return mac
 }

 // macState is the core Poly1305 state: the running accumulator plus the two
 // halves of the key.
 //
 // macState holds numbers in saturated 64-bit little-endian limbs. That is,
 // the value of [x0, x1, x2] is x[0] + x[1] * 2⁶⁴ + x[2] * 2¹²⁸.
 type macState struct {
 	// h is the main accumulator. It is to be interpreted modulo 2¹³⁰ - 5, but
 	// can grow larger during and after rounds. It must, however, remain below
 	// 2 * (2¹³⁰ - 5).
 	h [3]uint64
 	// r and s are the private key components. initialize loads r from the
 	// first 16 key bytes (clamped with rMask0 and rMask1) and s from the last
 	// 16 key bytes.
 	r [2]uint64
 	s [2]uint64
 }

 // macGeneric is the streaming form of the MAC: a macState plus the buffering
 // needed to accept input in pieces that are not multiples of TagSize.
 type macGeneric struct {
 	macState

 	// buffer holds the bytes of a chunk that is not complete yet.
 	buffer [TagSize]byte
 	// offset is the number of bytes currently stored in buffer. Write keeps
 	// it below TagSize.
 	offset int
 }

 // Write feeds p into the MAC. Complete TagSize-byte chunks are handed to
 // updateGeneric; whatever is left over is kept in h.buffer until more data
 // arrives or Sum is called. It always reports len(p) bytes consumed and a nil
 // error.
 func (h *macGeneric) Write(p []byte) (int, error) {
 	total := len(p)

 	// Top up a partially filled buffer before looking at the rest of p.
 	if h.offset > 0 {
 		copied := copy(h.buffer[h.offset:], p)
 		h.offset += copied
 		if h.offset < TagSize {
 			// Still not a full chunk: everything in p has been buffered.
 			return total, nil
 		}
 		p = p[copied:]
 		h.offset = 0
 		updateGeneric(&h.macState, h.buffer[:])
 	}

 	// Absorb as many whole chunks as possible straight from the input.
 	if whole := len(p) / TagSize * TagSize; whole > 0 {
 		updateGeneric(&h.macState, p[:whole])
 		p = p[whole:]
 	}

 	// Keep the incomplete tail for later.
 	if len(p) > 0 {
 		h.offset += copy(h.buffer[h.offset:], p)
 	}
 	return total, nil
 }

 // Sum writes the tag for everything written so far into out. Any buffered
 // incomplete chunk is absorbed as the final chunk. All of this happens on a
 // copy of the state, so the receiver is left untouched and Sum can be called
 // repeatedly.
 func (h *macGeneric) Sum(out *[TagSize]byte) {
 	snapshot := h.macState
 	if h.offset > 0 {
 		pending := h.buffer[:h.offset]
 		updateGeneric(&snapshot, pending)
 	}
 	finalize(out, &snapshot.h, &snapshot.s)
 }

 // [rMask0, rMask1] is the specified Poly1305 clamping mask in little-endian. It
 // clears some bits of the secret coefficient to make it possible to implement
 // multiplication more efficiently.
 //
 // initialize applies rMask0 to the low 64-bit limb of r and rMask1 to the high
 // one. Both masks clear the top four bits of each 32-bit half; rMask1 also
 // clears the two lowest bits of each half, while rMask0 clears them only in
 // its upper half.
 const (
 	rMask0 = 0x0FFFFFFC0FFFFFFF
 	rMask1 = 0x0FFFFFFC0FFFFFFC
 )

 // initialize splits the 256-bit key into its two 128-bit halves and stores
 // them in m: the first 16 bytes become r after clamping with rMask0 and
 // rMask1, the last 16 bytes become s unchanged. Both are read as
 // little-endian 64-bit limbs. m.h is not touched.
 func initialize(key *[32]byte, m *macState) {
 	le := binary.LittleEndian
 	m.r = [2]uint64{
 		le.Uint64(key[0:8]) & rMask0,
 		le.Uint64(key[8:16]) & rMask1,
 	}
 	m.s = [2]uint64{
 		le.Uint64(key[16:24]),
 		le.Uint64(key[24:32]),
 	}
 }

 // uint128 holds a 128-bit number as two 64-bit limbs, for use with the
 // bits.Mul64 and bits.Add64 intrinsics.
 //
 // lo is the least significant limb and hi the most significant one, i.e. the
 // value is lo + hi * 2⁶⁴.
 type uint128 struct {
 	lo, hi uint64
 }

 // mul64 returns the full 128-bit product of a and b.
 func mul64(a, b uint64) uint128 {
 	var product uint128
 	product.hi, product.lo = bitsMul64(a, b)
 	return product
 }

 // add128 returns a + b. It panics if the sum does not fit in 128 bits, which
 // callers are expected to rule out.
 func add128(a, b uint128) uint128 {
 	var (
 		sum   uint128
 		carry uint64
 	)
 	sum.lo, carry = bitsAdd64(a.lo, b.lo, 0)
 	sum.hi, carry = bitsAdd64(a.hi, b.hi, carry)
 	if carry != 0 {
 		panic("poly1305: unexpected overflow")
 	}
 	return sum
 }

 // shiftRightBy2 returns a >> 2: the two lowest bits of a.hi move into the two
 // highest bits of the low limb, and the two lowest bits of a.lo are dropped.
 func shiftRightBy2(a uint128) uint128 {
 	return uint128{
 		lo: a.lo>>2 | (a.hi&3)<<62,
 		hi: a.hi >> 2,
 	}
 }

 // updateGeneric absorbs msg into the state.h accumulator. For each chunk m of
 // 128 bits of message, it computes
 //
 //	h₊ = (h + m) * r  mod  2¹³⁰ - 5
 //
 // If the msg length is not a multiple of TagSize, it assumes the last
 // incomplete chunk is the final one.
 //
 // state.r is only read. state.h is copied into locals on entry and written
 // back once after the last chunk, so an empty msg leaves state unchanged.
 func updateGeneric(state *macState, msg []byte) {
 	h0, h1, h2 := state.h[0], state.h[1], state.h[2]
 	r0, r1 := state.r[0], state.r[1]

 	for len(msg) > 0 {
 		var c uint64

 		// For the first step, h + m, we use a chain of bits.Add64 intrinsics.
 		// The resulting value of h might exceed 2¹³⁰ - 5, but will be partially
 		// reduced at the end of the multiplication below.
 		//
 		// The spec requires us to set a bit just above the message size, not to
 		// hide leading zeroes. For full chunks, that's 1 << 128, so we can just
 		// add 1 to the most significant (2¹²⁸) limb, h2.
 		if len(msg) >= TagSize {
 			h0, c = bitsAdd64(h0, binary.LittleEndian.Uint64(msg[0:8]), 0)
 			h1, c = bitsAdd64(h1, binary.LittleEndian.Uint64(msg[8:16]), c)
 			h2 += c + 1

 			msg = msg[TagSize:]
 		} else {
 			// Short chunk: copy it into a zeroed TagSize buffer and put the
 			// marker bit in the byte right after the message, so that only the
 			// carry (and no extra 1) is added to h2 below.
 			var buf [TagSize]byte
 			copy(buf[:], msg)
 			buf[len(msg)] = 1

 			h0, c = bitsAdd64(h0, binary.LittleEndian.Uint64(buf[0:8]), 0)
 			h1, c = bitsAdd64(h1, binary.LittleEndian.Uint64(buf[8:16]), c)
 			h2 += c

 			// A short chunk is treated as the last one: end the loop after
 			// this round.
 			msg = nil
 		}

 		// Multiplication of big number limbs is similar to elementary school
 		// columnar multiplication. Instead of digits, there are 64-bit limbs.
 		//
 		// We are multiplying a 3 limbs number, h, by a 2 limbs number, r.
 		//
 		//                        h2    h1    h0  x
 		//                              r1    r0  =
 		//                       ----------------
 		//                      h2r0  h1r0  h0r0     <-- individual 128-bit products
 		//            +   h2r1  h1r1  h0r1
 		//               ------------------------
 		//                 m3    m2    m1    m0      <-- result in 128-bit overlapping limbs
 		//               ------------------------
 		//         m3.hi m2.hi m1.hi m0.hi           <-- carry propagation
 		//     +         m3.lo m2.lo m1.lo m0.lo
 		//        -------------------------------
 		//           t4    t3    t2    t1    t0      <-- final result in 64-bit limbs
 		//
 		// The main difference from pen-and-paper multiplication is that we do
 		// carry propagation in a separate step, as if we wrote two digit sums
 		// at first (the 128-bit limbs), and then carried the tens all at once.

 		h0r0 := mul64(h0, r0)
 		h1r0 := mul64(h1, r0)
 		h2r0 := mul64(h2, r0)
 		h0r1 := mul64(h0, r1)
 		h1r1 := mul64(h1, r1)
 		h2r1 := mul64(h2, r1)

 		// Since h2 is known to be at most 7 (5 + 1 + 1), and r0 and r1 have their
 		// top 4 bits cleared by rMask{0,1}, we know that their product is not going
 		// to overflow 64 bits, so we can ignore the high part of the products.
 		//
 		// This also means that the product doesn't have a fifth limb (t4).
 		//
 		// The checks below turn a violation of that assumption into a panic
 		// instead of a silently wrong result.
 		if h2r0.hi != 0 {
 			panic("poly1305: unexpected overflow")
 		}
 		if h2r1.hi != 0 {
 			panic("poly1305: unexpected overflow")
 		}

 		m0 := h0r0
 		m1 := add128(h1r0, h0r1) // These two additions don't overflow thanks again
 		m2 := add128(h2r0, h1r1) // to the 4 masked bits at the top of r0 and r1.
 		m3 := h2r1

 		// Carry propagation: each limb is the low half of its column plus the
 		// high half of the column to its right. The carry out of t3 is
 		// discarded because, as argued above, there is no fifth limb.
 		t0 := m0.lo
 		t1, c := bitsAdd64(m1.lo, m0.hi, 0)
 		t2, c := bitsAdd64(m2.lo, m1.hi, c)
 		t3, _ := bitsAdd64(m3.lo, m2.hi, c)

 		// Now we have the result as 4 64-bit limbs, and we need to reduce it
 		// modulo 2¹³⁰ - 5. The special shape of this Crandall prime lets us do
 		// a cheap partial reduction according to the reduction identity
 		//
 		//     c * 2¹³⁰ + n  =  c * 5 + n  mod  2¹³⁰ - 5
 		//
 		// because 2¹³⁰ = 5 mod 2¹³⁰ - 5. Partial reduction since the result is
 		// likely to be larger than 2¹³⁰ - 5, but still small enough to fit the
 		// assumptions we make about h in the rest of the code.
 		//
 		// See also https://speakerdeck.com/gtank/engineering-prime-numbers?slide=23

 		// We split the final result at the 2¹³⁰ mark into h and cc, the carry.
 		// Note that the carry bits are effectively shifted left by 2, in other
 		// words, cc = c * 4 for the c in the reduction identity.
 		h0, h1, h2 = t0, t1, t2&maskLow2Bits
 		cc := uint128{t2 & maskNotLow2Bits, t3}

 		// To add c * 5 to h, we first add cc = c * 4, and then add (cc >> 2) = c.

 		h0, c = bitsAdd64(h0, cc.lo, 0)
 		h1, c = bitsAdd64(h1, cc.hi, c)
 		h2 += c

 		cc = shiftRightBy2(cc)

 		h0, c = bitsAdd64(h0, cc.lo, 0)
 		h1, c = bitsAdd64(h1, cc.hi, c)
 		h2 += c

 		// h2 is at most 3 + 1 + 1 = 5, making the whole of h at most
 		//
 		//     5 * 2¹²⁸ + (2¹²⁸ - 1) = 6 * 2¹²⁸ - 1
 	}

 	state.h[0], state.h[1], state.h[2] = h0, h1, h2
 }

 // maskLow2Bits selects the two least significant bits of a limb and
 // maskNotLow2Bits selects all the others. updateGeneric uses them to split the
 // 2¹²⁸ limb of a product at the 2¹³⁰ mark.
 const (
 	maskLow2Bits    uint64 = 0x0000000000000003
 	maskNotLow2Bits uint64 = ^maskLow2Bits
 )

 // select64 returns x if v == 1 and y if v == 0, in constant time.
 //
 // It builds a mask from v (all zeroes for v == 1, all ones for v == 0) and
 // combines x and y with it, so no branch depends on v.
 func select64(v, x, y uint64) uint64 {
 	mask := v - 1
 	return ^mask&x | mask&y
 }

 // [p0, p1, p2] is 2¹³⁰ - 5 in little endian order: p0 is the least
 // significant 64-bit limb and p2 the 2¹²⁸ limb. finalize subtracts it from h
 // limb by limb to complete the modular reduction.
 const (
 	p0 = 0xFFFFFFFFFFFFFFFB
 	p1 = 0xFFFFFFFFFFFFFFFF
 	p2 = 0x0000000000000003
 )

 // finalize completes the modular reduction of h and computes
 //
 //	out = h + s  mod  2¹²⁸
 //
 // The 16-byte result is stored in out as two little-endian 64-bit words. h and
 // s are only read.
 func finalize(out *[TagSize]byte, h *[3]uint64, s *[2]uint64) {
 	lo, mid, top := h[0], h[1], h[2]

 	// updateGeneric only reduces partially: h may exceed 2¹³⁰ - 5 but stays
 	// below twice that. Compute h - (2¹³⁰ - 5) unconditionally and let the
 	// final borrow decide, without branching, which of the two values to keep.
 	var (
 		diffLo, diffMid uint64
 		borrow          uint64
 	)
 	diffLo, borrow = bitsSub64(lo, p0, 0)
 	diffMid, borrow = bitsSub64(mid, p1, borrow)
 	_, borrow = bitsSub64(top, p2, borrow)

 	// A borrow means h < p, so h is already reduced; otherwise take h - p.
 	// Only the low 128 bits are needed from here on.
 	lo = select64(borrow, lo, diffLo)
 	mid = select64(borrow, mid, diffMid)

 	// Last Poly1305 step: tag = h + s mod 2¹²⁸, i.e. a 128-bit addition whose
 	// carry out is thrown away.
 	var carry uint64
 	lo, carry = bitsAdd64(lo, s[0], 0)
 	mid, _ = bitsAdd64(mid, s[1], carry)

 	binary.LittleEndian.PutUint64(out[0:8], lo)
 	binary.LittleEndian.PutUint64(out[8:16], mid)
 }
	// Copyright 2018 The Go Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style
	// license that can be found in the LICENSE file.

	// This file provides the generic implementation of Sum and MAC. Other files
	// might provide optimized assembly implementations of some of this code.

	package poly1305

	import "encoding/binary"

	// Poly1305 [RFC 7539] is a relatively simple algorithm: the authentication tag
	// for a 64-byte message is approximately
	//
	//     s + m[0:16] * r⁴ + m[16:32] * r³ + m[32:48] * r² + m[48:64] * r  mod  2¹³⁰ - 5
	//
	// for some secret r and s. It can be computed sequentially like
	//
	//     for len(msg) > 0:
	//         h += read(msg, 16)
	//         h *= r
	//         h %= 2¹³⁰ - 5
	//     return h + s
	//
	// All the complexity is about doing performant constant-time math on numbers
	// larger than any available numeric type.

	// sumGeneric computes the Poly1305 tag of msg under key in one shot and
	// stores it in out.
	//
	// out and key are pointers: key is passed straight to newMACGeneric, which
	// takes a *[32]byte, and out must be a pointer for the tag to reach the
	// caller rather than a by-value copy of the array.
	func sumGeneric(out *[TagSize]byte, msg []byte, key *[32]byte) {
		h := newMACGeneric(key)
		h.Write(msg)
		h.Sum(out)
	}

	// newMACGeneric returns a macGeneric whose r and s have been loaded from key
	// by initialize. The accumulator and the chunk buffer start out zeroed.
	func newMACGeneric(key *[32]byte) macGeneric {
		var mac macGeneric
		initialize(key, &mac.macState)
		return mac
	}

	// macState is the core Poly1305 state: the running accumulator plus the two
	// halves of the key.
	//
	// macState holds numbers in saturated 64-bit little-endian limbs. That is,
	// the value of [x0, x1, x2] is x[0] + x[1] * 2⁶⁴ + x[2] * 2¹²⁸.
	type macState struct {
	// h is the main accumulator. It is to be interpreted modulo 2¹³⁰ - 5, but
	// can grow larger during and after rounds. It must, however, remain below
	// 2 * (2¹³⁰ - 5).
	h [3]uint64
	// r and s are the private key components. initialize fills r from the
	// first 16 key bytes (masked with rMask0 and rMask1) and s from the last
	// 16 key bytes.
	r [2]uint64
	s [2]uint64
	}

	// macGeneric is the streaming form of the MAC: a macState plus the buffering
	// needed to accept input in pieces that are not multiples of TagSize.
	type macGeneric struct {
	macState

	// buffer holds the bytes of a chunk that is not complete yet.
	buffer [TagSize]byte
	// offset is the number of bytes currently stored in buffer. Write keeps
	// it below TagSize.
	offset int
	}

	// Write feeds p into the MAC. Complete TagSize-byte chunks are handed to
	// updateGeneric; whatever is left over is kept in h.buffer until more data
	// arrives or Sum is called. It always reports len(p) bytes consumed and a nil
	// error.
	func (h *macGeneric) Write(p []byte) (int, error) {
		total := len(p)

		// Top up a partially filled buffer before looking at the rest of p.
		if h.offset > 0 {
			copied := copy(h.buffer[h.offset:], p)
			h.offset += copied
			if h.offset < TagSize {
				// Still not a full chunk: everything in p has been buffered.
				return total, nil
			}
			p = p[copied:]
			h.offset = 0
			updateGeneric(&h.macState, h.buffer[:])
		}

		// Absorb as many whole chunks as possible straight from the input.
		if whole := len(p) / TagSize * TagSize; whole > 0 {
			updateGeneric(&h.macState, p[:whole])
			p = p[whole:]
		}

		// Keep the incomplete tail for later.
		if len(p) > 0 {
			h.offset += copy(h.buffer[h.offset:], p)
		}
		return total, nil
	}

	// Sum flushes the last incomplete chunk from the buffer, if any, and generates
	// the MAC output. It does not modify its state, in order to allow for multiple
	// calls to Sum, even if no Write is allowed after Sum.
	//
	// out is a pointer so that the tag is written into the caller's array rather
	// than into a by-value copy. The receiver is a pointer to match Write; Sum
	// still works on a copy of the macState and leaves the receiver untouched.
	func (h *macGeneric) Sum(out *[TagSize]byte) {
		state := h.macState
		if h.offset > 0 {
			// The buffered bytes form a short, final chunk.
			updateGeneric(&state, h.buffer[:h.offset])
		}
		finalize(out, &state.h, &state.s)
	}

	// [rMask0, rMask1] is the specified Poly1305 clamping mask in little-endian. It
	// clears some bits of the secret coefficient to make it possible to implement
	// multiplication more efficiently.
	//
	// initialize applies rMask0 to the low 64-bit limb of r and rMask1 to the high
	// one. Both masks clear the top four bits of each 32-bit half; rMask1 also
	// clears the two lowest bits of each half, while rMask0 clears them only in
	// its upper half.
	const (
	rMask0 = 0x0FFFFFFC0FFFFFFF
	rMask1 = 0x0FFFFFFC0FFFFFFC
	)

	// initialize loads the 256-bit key into the two 128-bit secret values r and s.
	//
	// The first 16 key bytes become r after clamping with rMask0 and rMask1; the
	// last 16 bytes become s unchanged. Both are read as little-endian 64-bit
	// limbs. m.h is not touched.
	//
	// m is a pointer so that the loaded values land in the caller's macState
	// instead of a local copy, and key is a pointer to match newMACGeneric, which
	// calls initialize(key, &m.macState) with a *[32]byte.
	func initialize(key *[32]byte, m *macState) {
		m.r[0] = binary.LittleEndian.Uint64(key[0:8]) & rMask0
		m.r[1] = binary.LittleEndian.Uint64(key[8:16]) & rMask1
		m.s[0] = binary.LittleEndian.Uint64(key[16:24])
		m.s[1] = binary.LittleEndian.Uint64(key[24:32])
	}

	// uint128 holds a 128-bit number as two 64-bit limbs, for use with the
	// bits.Mul64 and bits.Add64 intrinsics.
	//
	// lo is the least significant limb and hi the most significant one, i.e. the
	// value is lo + hi * 2⁶⁴.
	type uint128 struct {
	lo, hi uint64
	}

	// mul64 returns the full 128-bit product of a and b.
	func mul64(a, b uint64) uint128 {
		var product uint128
		product.hi, product.lo = bitsMul64(a, b)
		return product
	}

	// add128 returns a + b. It panics if the sum does not fit in 128 bits, which
	// callers are expected to rule out.
	func add128(a, b uint128) uint128 {
		var (
			sum   uint128
			carry uint64
		)
		sum.lo, carry = bitsAdd64(a.lo, b.lo, 0)
		sum.hi, carry = bitsAdd64(a.hi, b.hi, carry)
		if carry != 0 {
			panic("poly1305: unexpected overflow")
		}
		return sum
	}

	func shiftRightBy2(a uint128) uint128 {
	a.lo = a.lo>>2 \| (a.hi&3)<<62
	a.hi = a.hi >> 2
	return a
	}

	// updateGeneric absorbs msg into the state.h accumulator. For each chunk m of
	// 128 bits of message, it computes
	//
	//	h₊ = (h + m) * r  mod  2¹³⁰ - 5
	//
	// If the msg length is not a multiple of TagSize, it assumes the last
	// incomplete chunk is the final one.
	//
	// state.r is only read. state.h is copied into locals on entry and written
	// back once after the last chunk, so an empty msg leaves state unchanged.
	func updateGeneric(state *macState, msg []byte) {
	h0, h1, h2 := state.h[0], state.h[1], state.h[2]
	r0, r1 := state.r[0], state.r[1]

	for len(msg) > 0 {
	var c uint64

	// For the first step, h + m, we use a chain of bits.Add64 intrinsics.
	// The resulting value of h might exceed 2¹³⁰ - 5, but will be partially
	// reduced at the end of the multiplication below.
	//
	// The spec requires us to set a bit just above the message size, not to
	// hide leading zeroes. For full chunks, that's 1 << 128, so we can just
	// add 1 to the most significant (2¹²⁸) limb, h2.
	if len(msg) >= TagSize {
	h0, c = bitsAdd64(h0, binary.LittleEndian.Uint64(msg[0:8]), 0)
	h1, c = bitsAdd64(h1, binary.LittleEndian.Uint64(msg[8:16]), c)
	h2 += c + 1

	msg = msg[TagSize:]
	} else {
	// Short chunk: copy it into a zeroed TagSize buffer and put the
	// marker bit in the byte right after the message, so that only the
	// carry (and no extra 1) is added to h2 below.
	var buf [TagSize]byte
	copy(buf[:], msg)
	buf[len(msg)] = 1

	h0, c = bitsAdd64(h0, binary.LittleEndian.Uint64(buf[0:8]), 0)
	h1, c = bitsAdd64(h1, binary.LittleEndian.Uint64(buf[8:16]), c)
	h2 += c

	// A short chunk is treated as the last one: end the loop after
	// this round.
	msg = nil
	}

	// Multiplication of big number limbs is similar to elementary school
	// columnar multiplication. Instead of digits, there are 64-bit limbs.
	//
	// We are multiplying a 3 limbs number, h, by a 2 limbs number, r.
	//
	//                        h2    h1    h0  x
	//                              r1    r0  =
	//                       ----------------
	//                      h2r0  h1r0  h0r0     <-- individual 128-bit products
	//            +   h2r1  h1r1  h0r1
	//               ------------------------
	//                 m3    m2    m1    m0      <-- result in 128-bit overlapping limbs
	//               ------------------------
	//         m3.hi m2.hi m1.hi m0.hi           <-- carry propagation
	//     +         m3.lo m2.lo m1.lo m0.lo
	//        -------------------------------
	//           t4    t3    t2    t1    t0      <-- final result in 64-bit limbs
	//
	// The main difference from pen-and-paper multiplication is that we do
	// carry propagation in a separate step, as if we wrote two digit sums
	// at first (the 128-bit limbs), and then carried the tens all at once.

	h0r0 := mul64(h0, r0)
	h1r0 := mul64(h1, r0)
	h2r0 := mul64(h2, r0)
	h0r1 := mul64(h0, r1)
	h1r1 := mul64(h1, r1)
	h2r1 := mul64(h2, r1)

	// Since h2 is known to be at most 7 (5 + 1 + 1), and r0 and r1 have their
	// top 4 bits cleared by rMask{0,1}, we know that their product is not going
	// to overflow 64 bits, so we can ignore the high part of the products.
	//
	// This also means that the product doesn't have a fifth limb (t4).
	//
	// The checks below turn a violation of that assumption into a panic
	// instead of a silently wrong result.
	if h2r0.hi != 0 {
	panic("poly1305: unexpected overflow")
	}
	if h2r1.hi != 0 {
	panic("poly1305: unexpected overflow")
	}

	m0 := h0r0
	m1 := add128(h1r0, h0r1) // These two additions don't overflow thanks again
	m2 := add128(h2r0, h1r1) // to the 4 masked bits at the top of r0 and r1.
	m3 := h2r1

	// Carry propagation: each limb is the low half of its column plus the
	// high half of the column to its right. The carry out of t3 is
	// discarded because, as argued above, there is no fifth limb.
	t0 := m0.lo
	t1, c := bitsAdd64(m1.lo, m0.hi, 0)
	t2, c := bitsAdd64(m2.lo, m1.hi, c)
	t3, _ := bitsAdd64(m3.lo, m2.hi, c)

	// Now we have the result as 4 64-bit limbs, and we need to reduce it
	// modulo 2¹³⁰ - 5. The special shape of this Crandall prime lets us do
	// a cheap partial reduction according to the reduction identity
	//
	//     c * 2¹³⁰ + n  =  c * 5 + n  mod  2¹³⁰ - 5
	//
	// because 2¹³⁰ = 5 mod 2¹³⁰ - 5. Partial reduction since the result is
	// likely to be larger than 2¹³⁰ - 5, but still small enough to fit the
	// assumptions we make about h in the rest of the code.
	//
	// See also https://speakerdeck.com/gtank/engineering-prime-numbers?slide=23

	// We split the final result at the 2¹³⁰ mark into h and cc, the carry.
	// Note that the carry bits are effectively shifted left by 2, in other
	// words, cc = c * 4 for the c in the reduction identity.
	h0, h1, h2 = t0, t1, t2&maskLow2Bits
	cc := uint128{t2 & maskNotLow2Bits, t3}

	// To add c * 5 to h, we first add cc = c * 4, and then add (cc >> 2) = c.

	h0, c = bitsAdd64(h0, cc.lo, 0)
	h1, c = bitsAdd64(h1, cc.hi, c)
	h2 += c

	cc = shiftRightBy2(cc)

	h0, c = bitsAdd64(h0, cc.lo, 0)
	h1, c = bitsAdd64(h1, cc.hi, c)
	h2 += c

	// h2 is at most 3 + 1 + 1 = 5, making the whole of h at most
	//
	//     5 * 2¹²⁸ + (2¹²⁸ - 1) = 6 * 2¹²⁸ - 1
	}

	state.h[0], state.h[1], state.h[2] = h0, h1, h2
	}

	// maskLow2Bits selects the two least significant bits of a limb and
	// maskNotLow2Bits selects all the others. updateGeneric uses them to split the
	// 2¹²⁸ limb of a product at the 2¹³⁰ mark.
	const (
	maskLow2Bits uint64 = 0x0000000000000003
	maskNotLow2Bits uint64 = ^maskLow2Bits
	)

	// select64 returns x if v == 1 and y if v == 0, in constant time.
	//
	// v-1 is all zeroes for v == 1 and all ones for v == 0, so it serves as a
	// bit mask that picks x or y without any branch depending on v.
	func select64(v, x, y uint64) uint64 { return ^(v-1)&x | (v-1)&y }

	// [p0, p1, p2] is 2¹³⁰ - 5 in little endian order: p0 is the least
	// significant 64-bit limb and p2 the 2¹²⁸ limb. finalize subtracts it from h
	// limb by limb to complete the modular reduction.
	const (
	p0 = 0xFFFFFFFFFFFFFFFB
	p1 = 0xFFFFFFFFFFFFFFFF
	p2 = 0x0000000000000003
	)

	// finalize completes the modular reduction of h and computes
	//
	//	out = h + s  mod  2¹²⁸
	//
	// The 16-byte result is stored in out as two little-endian 64-bit words. h and
	// s are only read.
	//
	// out must be a pointer so that the tag is written into the caller's array
	// rather than into a local copy; h is a pointer to match the call in Sum,
	// which passes &state.h and &state.s.
	func finalize(out *[TagSize]byte, h *[3]uint64, s *[2]uint64) {
		h0, h1, h2 := h[0], h[1], h[2]

		// After the partial reduction in updateGeneric, h might be more than
		// 2¹³⁰ - 5, but will be less than 2 * (2¹³⁰ - 5). To complete the reduction
		// in constant time, we compute t = h - (2¹³⁰ - 5), and select h as the
		// result if the subtraction underflows, and t otherwise.

		hMinusP0, b := bitsSub64(h0, p0, 0)
		hMinusP1, b := bitsSub64(h1, p1, b)
		_, b = bitsSub64(h2, p2, b)

		// h = h if h < p else h - p
		//
		// Only the low 128 bits are selected: the 2¹²⁸ limb does not take part
		// in the final addition below.
		h0 = select64(b, h0, hMinusP0)
		h1 = select64(b, h1, hMinusP1)

		// Finally, we compute the last Poly1305 step
		//
		//     tag = h + s  mod  2¹²⁸
		//
		// by just doing a wide addition with the 128 low bits of h and discarding
		// the overflow.
		h0, c := bitsAdd64(h0, s[0], 0)
		h1, _ = bitsAdd64(h1, s[1], c)

		binary.LittleEndian.PutUint64(out[0:8], h0)
		binary.LittleEndian.PutUint64(out[8:16], h1)
	}