blob: cea9365dcca98845622818ffefde7cca81fc08d6 [file] [log] [blame]
// Copyright 2022 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package main
import (
. "github.com/mmcloughlin/avo/build"
. "github.com/mmcloughlin/avo/operand"
. "github.com/mmcloughlin/avo/reg"
)
//go:generate go run . -out ../nat_amd64.s -stubs ../nat_amd64.go -pkg bigmod

// main emits, via avo, the amd64 assembly implementation of montgomeryLoop
// for package crypto/internal/bigmod (build constraint "amd64,gc,!purego").
//
// The generated routine reads the parameters d, a, b, m and m0inv, runs a
// doubly nested loop bounded by len(d), updates d in place, and stores the
// final overflow value as its first result. Every word written back to d has
// bit 63 cleared, and carries are taken from bit 63 upwards.
//
// NOTE(review): a, b and m are indexed up to len(d) without any length check
// here; presumably the caller guarantees they are at least that long.
func main() {
	Package("crypto/internal/bigmod")
	ConstraintExpr("amd64,gc,!purego")
	Implement("montgomeryLoop")

	// Load the arguments: the loop bound comes from len(d); d, b and m are
	// kept as base-pointer memory operands (i.e. they address element 0).
	size := Load(Param("d").Len(), GP64())
	d := Mem{Base: Load(Param("d").Base(), GP64())}
	b := Mem{Base: Load(Param("b").Base(), GP64())}
	m := Mem{Base: Load(Param("m").Base(), GP64())}
	m0inv := Load(Param("m0inv"), GP64())

	// overflow persists across outer iterations; i is the outer loop index.
	overflow := zero()
	i := zero()
	Label("outerLoop")

	// ai = a[i]. The base pointer of a is reloaded on every iteration because
	// the same register is immediately overwritten with the loaded word.
	ai := Load(Param("a").Base(), GP64())
	MOVQ(Mem{Base: ai}.Idx(i, 8), ai)

	// z = d[0] + a[i] * b[0]
	z := uint128{GP64(), GP64()}
	mul64(z, b, ai)
	add64(z, d)
	// f = (m0inv * z_lo) with bit 63 cleared
	f := GP64()
	MOVQ(m0inv, f)
	IMULQ(z.lo, f)
	_MASK(f)
	// z += f * m[0]
	addMul64(z, m, f)
	// carry = z >> 63 (held in the register that was z.lo)
	carry := shiftBy63(z)

	// The inner loop starts at j = 1 since index 0 was handled above; jump
	// straight to the condition so that size == 1 skips the body.
	j := zero()
	INCQ(j)
	JMP(LabelRef("innerLoopCondition"))
	Label("innerLoop")
	// z = d[j] + a[i] * b[j] + f * m[j] + carry
	// Fresh virtual registers are needed because shiftBy63 handed the previous
	// z.lo out as carry.
	z = uint128{GP64(), GP64()}
	mul64(z, b.Idx(j, 8), ai)
	addMul64(z, m.Idx(j, 8), f)
	add64(z, d.Idx(j, 8))
	add64(z, carry)
	// d[j-1] = z_lo & _MASK
	storeMasked(z.lo, d.Idx(j, 8).Offset(-8))
	// carry = z_hi<<1 | z_lo>>_W
	MOVQ(shiftBy63(z), carry)
	INCQ(j)
	Label("innerLoopCondition")
	// Continue the inner loop while j < size.
	CMPQ(size, j)
	JGT(LabelRef("innerLoop"))

	// Fold the last carry into overflow, write its low 63 bits to d[size-1],
	// and keep only bit 63 as the overflow for the next outer iteration.
	ADDQ(carry, overflow)
	storeMasked(overflow, d.Idx(size, 8).Offset(-8))
	SHRQ(Imm(63), overflow)

	// Continue the outer loop while i < size.
	INCQ(i)
	CMPQ(size, i)
	JGT(LabelRef("outerLoop"))

	// Return the final overflow as the first result.
	Store(overflow, ReturnIndex(0))
	RET()
	Generate()
}
// zero allocates a fresh 64-bit virtual register, emits an XORQ of that
// register with itself so it holds 0, and hands the register back.
func zero() Register {
	reg := GP64()
	XORQ(reg, reg)
	return reg
}
// _MASK emits a BTRQ that clears bit 63 (the most significant bit) of r,
// leaving the low 63 bits untouched.
func _MASK(r Register) {
	const topBit = 63
	BTRQ(Imm(topBit), r)
}
// uint128 is a 128-bit value held in a pair of 64-bit virtual registers.
// The field order (hi, then lo) matters: values are built with positional
// composite literals.
type uint128 struct {
	hi GPVirtual // upper 64 bits
	lo GPVirtual // lower 64 bits
}
// storeMasked writes src with its top bit cleared to dst. The masking is done
// on a scratch register, so src itself is left unchanged.
func storeMasked(src, dst Op) {
	scratch := GP64()
	MOVQ(src, scratch)
	_MASK(scratch)
	MOVQ(scratch, dst)
}
// shiftBy63 emits a double-register right shift of z by 63 bits and returns
// the register holding the low 64 bits of z >> 63.
//
// The result is produced in place in z.lo (z.hi supplies the bits shifted in),
// so the returned register IS z.lo. After this call z no longer holds its
// original 128-bit value and callers must not keep using it as one. Because z
// is passed by value, this function cannot invalidate the caller's copy; the
// contract is by convention only.
func shiftBy63(z uint128) Register {
	SHRQ(Imm(63), z.hi, z.lo)
	return z.lo
}
// add64 emits r += a for a 64-bit operand a: a is added to the low half and
// the resulting carry flag is propagated into the high half.
func add64(r uint128, a Op) {
	lo, hi := r.lo, r.hi
	ADDQ(a, lo)
	ADCQ(Imm(0), hi) // hi += carry out of the low-half addition
}
// mul64 emits r = a * b as a full 64x64 -> 128-bit product. The multiply goes
// through RAX and RDX, so both of those registers are overwritten.
func mul64(r uint128, a, b Op) {
	lo, hi := r.lo, r.hi
	MOVQ(a, RAX)
	MULQ(b) // RDX:RAX = RAX * b
	MOVQ(RAX, lo)
	MOVQ(RDX, hi)
}
// addMul64 emits r += a * b, where a * b is the full 128-bit product. The
// multiply goes through RAX and RDX, so both of those registers are
// overwritten.
func addMul64(r uint128, a, b Op) {
	lo, hi := r.lo, r.hi
	MOVQ(a, RAX)
	MULQ(b) // RDX:RAX = RAX * b
	ADDQ(RAX, lo)
	ADCQ(RDX, hi) // high half plus the carry out of the low-half addition
}