src/math/big/internal/asmgen/mul.go - go - Git at Google

 // Copyright 2025 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

 package asmgen

 // mulAddVWW generates mulAddVWW, which does z, c = x*m + a.
 func mulAddVWW(a *Asm) {
 	f := a.Func("func mulAddVWW(z, x []Word, m, a Word) (c Word)")

 	if a.AltCarry().Valid() {
 		addMulVirtualCarry(f, 0)
 		return
 	}
 	addMul(f, "", "x", 0)
 }

 // addMulVVWW generates addMulVVWW which does z, c = x + y*m + a.
 // (A more pedantic name would be addMulAddVVWW.)
 func addMulVVWW(a *Asm) {
 	f := a.Func("func addMulVVWW(z, x, y []Word, m, a Word) (c Word)")

 	// If the architecture has virtual carries, emit that version unconditionally.
 	if a.AltCarry().Valid() {
 		addMulVirtualCarry(f, 1)
 		return
 	}

 	// If the architecture optionally has two carries, test and emit both versions.
 	if a.JmpEnable(OptionAltCarry, "altcarry") {
 		regs := a.RegsUsed()
 		addMul(f, "x", "y", 1)
 		a.Label("altcarry")
 		a.SetOption(OptionAltCarry, true)
 		a.SetRegsUsed(regs)
 		addMulAlt(f)
 		a.SetOption(OptionAltCarry, false)
 		return
 	}

 	// Otherwise emit the one-carry form.
 	addMul(f, "x", "y", 1)
 }

 // Computing z = addsrc + m*mulsrc + a, we need:
 //
 //	for i := range z {
 //		lo, hi := m * mulsrc[i]
 //		lo, carry = bits.Add(lo, a, 0)
 //		lo, carryAlt = bits.Add(lo, addsrc[i], 0)
 //		z[i] = lo
 //		a = hi + carry + carryAlt  // cannot overflow
 //	}
 //
 // The final addition cannot overflow because after processing N words,
 // the maximum possible value is (for a 64-bit system):
 //
 //	  (2**64N - 1) + (2**64 - 1)*(2**64N - 1) + (2**64 - 1)
 //	= (2**64)*(2**64N - 1) + (2**64 - 1)
 //	= 2**64(N+1) - 1,
 //
 // which fits in N+1 words (the high order one being the new value of a).
 //
 // (For example, with 3 decimal words, 999 + 9*999 + 9 = 999*10 + 9 = 9999.)
 //
 // If we unroll the loop a bit, then we can chain the carries in two passes.
 // Consider:
 //
 //	lo0, hi0 := m * mulsrc[i]
 //	lo0, carry = bits.Add(lo0, a, 0)
 //	lo0, carryAlt = bits.Add(lo0, addsrc[i], 0)
 //	z[i] = lo0
 //	a = hi + carry + carryAlt // cannot overflow
 //
 //	lo1, hi1 := m * mulsrc[i]
 //	lo1, carry = bits.Add(lo1, a, 0)
 //	lo1, carryAlt = bits.Add(lo1, addsrc[i], 0)
 //	z[i] = lo1
 //	a = hi + carry + carryAlt // cannot overflow
 //
 //	lo2, hi2 := m * mulsrc[i]
 //	lo2, carry = bits.Add(lo2, a, 0)
 //	lo2, carryAlt = bits.Add(lo2, addsrc[i], 0)
 //	z[i] = lo2
 //	a = hi + carry + carryAlt // cannot overflow
 //
 //	lo3, hi3 := m * mulsrc[i]
 //	lo3, carry = bits.Add(lo3, a, 0)
 //	lo3, carryAlt = bits.Add(lo3, addsrc[i], 0)
 //	z[i] = lo3
 //	a = hi + carry + carryAlt // cannot overflow
 //
 // There are three ways we can optimize this sequence.
 //
 // (1) Reordering, we can chain carries so that we can use one hardware carry flag
 // but amortize the cost of saving and restoring it across multiple instructions:
 //
 //	// multiply
 //	lo0, hi0 := m * mulsrc[i]
 //	lo1, hi1 := m * mulsrc[i+1]
 //	lo2, hi2 := m * mulsrc[i+2]
 //	lo3, hi3 := m * mulsrc[i+3]
 //
 //	lo0, carry = bits.Add(lo0, a, 0)
 //	lo1, carry = bits.Add(lo1, hi0, carry)
 //	lo2, carry = bits.Add(lo2, hi1, carry)
 //	lo3, carry = bits.Add(lo3, hi2, carry)
 //	a = hi3 + carry // cannot overflow
 //
 //	// add
 //	lo0, carryAlt = bits.Add(lo0, addsrc[i], 0)
 //	lo1, carryAlt = bits.Add(lo1, addsrc[i+1], carryAlt)
 //	lo2, carryAlt = bits.Add(lo2, addsrc[i+2], carryAlt)
 //	lo3, carryAlt = bits.Add(lo3, addrsc[i+3], carryAlt)
 //	a = a + carryAlt // cannot overflow
 //
 //	z[i] = lo0
 //	z[i+1] = lo1
 //	z[i+2] = lo2
 //	z[i+3] = lo3
 //
 // addMul takes this approach, using the hardware carry flag
 // first for carry and then for carryAlt.
 //
 // (2) addMulAlt assumes there are two hardware carry flags available.
 // It dedicates one each to carry and carryAlt, so that a multi-block
 // unrolling can keep the flags in hardware across all the blocks.
 // So even if the block size is 1, the code can do:
 //
 //	// multiply and add
 //	lo0, hi0 := m * mulsrc[i]
 //	lo0, carry = bits.Add(lo0, a, 0)
 //	lo0, carryAlt = bits.Add(lo0, addsrc[i], 0)
 //	z[i] = lo0
 //
 //	lo1, hi1 := m * mulsrc[i+1]
 //	lo1, carry = bits.Add(lo1, hi0, carry)
 //	lo1, carryAlt = bits.Add(lo1, addsrc[i+1], carryAlt)
 //	z[i+1] = lo1
 //
 //	lo2, hi2 := m * mulsrc[i+2]
 //	lo2, carry = bits.Add(lo2, hi1, carry)
 //	lo2, carryAlt = bits.Add(lo2, addsrc[i+2], carryAlt)
 //	z[i+2] = lo2
 //
 //	lo3, hi3 := m * mulsrc[i+3]
 //	lo3, carry = bits.Add(lo3, hi2, carry)
 //	lo3, carryAlt = bits.Add(lo3, addrsc[i+3], carryAlt)
 //	z[i+3] = lo2
 //
 //	a = hi3 + carry + carryAlt // cannot overflow
 //
 // (3) addMulVirtualCarry optimizes for systems with explicitly computed carry bits
 // (loong64, mips, riscv64), cutting the number of actual instructions almost by half.
 // Look again at the original word-at-a-time version:
 //
 //	lo1, hi1 := m * mulsrc[i]
 //	lo1, carry = bits.Add(lo1, a, 0)
 //	lo1, carryAlt = bits.Add(lo1, addsrc[i], 0)
 //	z[i] = lo1
 //	a = hi + carry + carryAlt // cannot overflow
 //
 // Although it uses four adds per word, those are cheap adds: the two bits.Add adds
 // use two instructions each (ADD+SLTU) and the final + adds only use one ADD each,
 // for a total of 6 instructions per word. In contrast, the middle stanzas in (2) use
 // only two “adds” per word, but these are SetCarry|UseCarry adds, which compile to
 // five instruction each, for a total of 10 instructions per word. So the word-at-a-time
 // loop is actually better. And we can reorder things slightly to use only a single carry bit:
 //
 //	lo1, hi1 := m * mulsrc[i]
 //	lo1, carry = bits.Add(lo1, a, 0)
 //	a = hi + carry
 //	lo1, carry = bits.Add(lo1, addsrc[i], 0)
 //	a = a + carry
 //	z[i] = lo1
 func addMul(f *Func, addsrc, mulsrc string, mulIndex int) {
 	a := f.Asm
 	mh := HintNone
 	if a.Arch == Arch386 && addsrc != "" {
 		mh = HintMemOK // too few registers otherwise
 	}
 	m := f.ArgHint("m", mh)
 	c := f.Arg("a")
 	n := f.Arg("z_len")

 	p := f.Pipe()
 	if addsrc != "" {
 		p.SetHint(addsrc, HintMemOK)
 	}
 	p.SetHint(mulsrc, HintMulSrc)
 	unroll := []int{1, 4}
 	switch a.Arch {
 	case Arch386:
 		unroll = []int{1} // too few registers
 	case ArchARM:
 		p.SetMaxColumns(2) // too few registers (but more than 386)
 	case ArchARM64:
 		unroll = []int{1, 8} // 5% speedup on c4as16
 	}

 	// See the large comment above for an explanation of the code being generated.
 	// This is optimization strategy 1.
 	p.Start(n, unroll...)
 	p.Loop(func(in, out [][]Reg) {
 		a.Comment("multiply")
 		prev := c
 		flag := SetCarry
 		for i, x := range in[mulIndex] {
 			hi := a.RegHint(HintMulHi)
 			a.MulWide(m, x, x, hi)
 			a.Add(prev, x, x, flag)
 			flag = UseCarry | SetCarry
 			if prev != c {
 				a.Free(prev)
 			}
 			out[0][i] = x
 			prev = hi
 		}
 		a.Add(a.Imm(0), prev, c, UseCarry|SmashCarry)
 		if addsrc != "" {
 			a.Comment("add")
 			flag := SetCarry
 			for i, x := range in[0] {
 				a.Add(x, out[0][i], out[0][i], flag)
 				flag = UseCarry | SetCarry
 			}
 			a.Add(a.Imm(0), c, c, UseCarry|SmashCarry)
 		}
 		p.StoreN(out)
 	})

 	f.StoreArg(c, "c")
 	a.Ret()
 }

 func addMulAlt(f *Func) {
 	a := f.Asm
 	m := f.ArgHint("m", HintMulSrc)
 	c := f.Arg("a")
 	n := f.Arg("z_len")

 	// On amd64, we need a non-immediate for the AtUnrollEnd adds.
 	r0 := a.ZR()
 	if !r0.Valid() {
 		r0 = a.Reg()
 		a.Mov(a.Imm(0), r0)
 	}

 	p := f.Pipe()
 	p.SetLabel("alt")
 	p.SetHint("x", HintMemOK)
 	p.SetHint("y", HintMemOK)
 	if a.Arch == ArchAMD64 {
 		p.SetMaxColumns(2)
 	}

 	// See the large comment above for an explanation of the code being generated.
 	// This is optimization strategy (2).
 	var hi Reg
 	prev := c
 	p.Start(n, 1, 8)
 	p.AtUnrollStart(func() {
 		a.Comment("multiply and add")
 		a.ClearCarry(AddCarry | AltCarry)
 		a.ClearCarry(AddCarry)
 		hi = a.Reg()
 	})
 	p.AtUnrollEnd(func() {
 		a.Add(r0, prev, c, UseCarry|SmashCarry)
 		a.Add(r0, c, c, UseCarry|SmashCarry|AltCarry)
 		prev = c
 	})
 	p.Loop(func(in, out [][]Reg) {
 		for i, y := range in[1] {
 			x := in[0][i]
 			lo := y
 			if lo.IsMem() {
 				lo = a.Reg()
 			}
 			a.MulWide(m, y, lo, hi)
 			a.Add(prev, lo, lo, UseCarry|SetCarry)
 			a.Add(x, lo, lo, UseCarry|SetCarry|AltCarry)
 			out[0][i] = lo
 			prev, hi = hi, prev
 		}
 		p.StoreN(out)
 	})

 	f.StoreArg(c, "c")
 	a.Ret()
 }

 func addMulVirtualCarry(f *Func, mulIndex int) {
 	a := f.Asm
 	m := f.Arg("m")
 	c := f.Arg("a")
 	n := f.Arg("z_len")

 	// See the large comment above for an explanation of the code being generated.
 	// This is optimization strategy (3).
 	p := f.Pipe()
 	p.Start(n, 1, 4)
 	p.Loop(func(in, out [][]Reg) {
 		a.Comment("synthetic carry, one column at a time")
 		lo, hi := a.Reg(), a.Reg()
 		for i, x := range in[mulIndex] {
 			a.MulWide(m, x, lo, hi)
 			if mulIndex == 1 {
 				a.Add(in[0][i], lo, lo, SetCarry)
 				a.Add(a.Imm(0), hi, hi, UseCarry|SmashCarry)
 			}
 			a.Add(c, lo, x, SetCarry)
 			a.Add(a.Imm(0), hi, c, UseCarry|SmashCarry)
 			out[0][i] = x
 		}
 		p.StoreN(out)
 	})
 	f.StoreArg(c, "c")
 	a.Ret()
 }
	// Copyright 2025 The Go Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style
	// license that can be found in the LICENSE file.

	package asmgen

	// mulAddVWW generates mulAddVWW, which does z, c = x*m + a.
	func mulAddVWW(a *Asm) {
	f := a.Func("func mulAddVWW(z, x []Word, m, a Word) (c Word)")

	if a.AltCarry().Valid() {
	addMulVirtualCarry(f, 0)
	return
	}
	addMul(f, "", "x", 0)
	}

	// addMulVVWW generates addMulVVWW which does z, c = x + y*m + a.
	// (A more pedantic name would be addMulAddVVWW.)
	func addMulVVWW(a *Asm) {
	f := a.Func("func addMulVVWW(z, x, y []Word, m, a Word) (c Word)")

	// If the architecture has virtual carries, emit that version unconditionally.
	if a.AltCarry().Valid() {
	addMulVirtualCarry(f, 1)
	return
	}

	// If the architecture optionally has two carries, test and emit both versions.
	if a.JmpEnable(OptionAltCarry, "altcarry") {
	regs := a.RegsUsed()
	addMul(f, "x", "y", 1)
	a.Label("altcarry")
	a.SetOption(OptionAltCarry, true)
	a.SetRegsUsed(regs)
	addMulAlt(f)
	a.SetOption(OptionAltCarry, false)
	return
	}

	// Otherwise emit the one-carry form.
	addMul(f, "x", "y", 1)
	}

	// Computing z = addsrc + m*mulsrc + a, we need:
	//
	// for i := range z {
	// lo, hi := m * mulsrc[i]
	// lo, carry = bits.Add(lo, a, 0)
	// lo, carryAlt = bits.Add(lo, addsrc[i], 0)
	// z[i] = lo
	// a = hi + carry + carryAlt // cannot overflow
	// }
	//
	// The final addition cannot overflow because after processing N words,
	// the maximum possible value is (for a 64-bit system):
	//
	// (264N - 1) + (264 - 1)(264N - 1) + (2*64 - 1)
	// = (2*64)(264N - 1) + (264 - 1)
	// = 2**64(N+1) - 1,
	//
	// which fits in N+1 words (the high order one being the new value of a).
	//
	// (For example, with 3 decimal words, 999 + 9999 + 9 = 99910 + 9 = 9999.)
	//
	// If we unroll the loop a bit, then we can chain the carries in two passes.
	// Consider:
	//
	// lo0, hi0 := m * mulsrc[i]
	// lo0, carry = bits.Add(lo0, a, 0)
	// lo0, carryAlt = bits.Add(lo0, addsrc[i], 0)
	// z[i] = lo0
	// a = hi + carry + carryAlt // cannot overflow
	//
	// lo1, hi1 := m * mulsrc[i]
	// lo1, carry = bits.Add(lo1, a, 0)
	// lo1, carryAlt = bits.Add(lo1, addsrc[i], 0)
	// z[i] = lo1
	// a = hi + carry + carryAlt // cannot overflow
	//
	// lo2, hi2 := m * mulsrc[i]
	// lo2, carry = bits.Add(lo2, a, 0)
	// lo2, carryAlt = bits.Add(lo2, addsrc[i], 0)
	// z[i] = lo2
	// a = hi + carry + carryAlt // cannot overflow
	//
	// lo3, hi3 := m * mulsrc[i]
	// lo3, carry = bits.Add(lo3, a, 0)
	// lo3, carryAlt = bits.Add(lo3, addsrc[i], 0)
	// z[i] = lo3
	// a = hi + carry + carryAlt // cannot overflow
	//
	// There are three ways we can optimize this sequence.
	//
	// (1) Reordering, we can chain carries so that we can use one hardware carry flag
	// but amortize the cost of saving and restoring it across multiple instructions:
	//
	// // multiply
	// lo0, hi0 := m * mulsrc[i]
	// lo1, hi1 := m * mulsrc[i+1]
	// lo2, hi2 := m * mulsrc[i+2]
	// lo3, hi3 := m * mulsrc[i+3]
	//
	// lo0, carry = bits.Add(lo0, a, 0)
	// lo1, carry = bits.Add(lo1, hi0, carry)
	// lo2, carry = bits.Add(lo2, hi1, carry)
	// lo3, carry = bits.Add(lo3, hi2, carry)
	// a = hi3 + carry // cannot overflow
	//
	// // add
	// lo0, carryAlt = bits.Add(lo0, addsrc[i], 0)
	// lo1, carryAlt = bits.Add(lo1, addsrc[i+1], carryAlt)
	// lo2, carryAlt = bits.Add(lo2, addsrc[i+2], carryAlt)
	// lo3, carryAlt = bits.Add(lo3, addrsc[i+3], carryAlt)
	// a = a + carryAlt // cannot overflow
	//
	// z[i] = lo0
	// z[i+1] = lo1
	// z[i+2] = lo2
	// z[i+3] = lo3
	//
	// addMul takes this approach, using the hardware carry flag
	// first for carry and then for carryAlt.
	//
	// (2) addMulAlt assumes there are two hardware carry flags available.
	// It dedicates one each to carry and carryAlt, so that a multi-block
	// unrolling can keep the flags in hardware across all the blocks.
	// So even if the block size is 1, the code can do:
	//
	// // multiply and add
	// lo0, hi0 := m * mulsrc[i]
	// lo0, carry = bits.Add(lo0, a, 0)
	// lo0, carryAlt = bits.Add(lo0, addsrc[i], 0)
	// z[i] = lo0
	//
	// lo1, hi1 := m * mulsrc[i+1]
	// lo1, carry = bits.Add(lo1, hi0, carry)
	// lo1, carryAlt = bits.Add(lo1, addsrc[i+1], carryAlt)
	// z[i+1] = lo1
	//
	// lo2, hi2 := m * mulsrc[i+2]
	// lo2, carry = bits.Add(lo2, hi1, carry)
	// lo2, carryAlt = bits.Add(lo2, addsrc[i+2], carryAlt)
	// z[i+2] = lo2
	//
	// lo3, hi3 := m * mulsrc[i+3]
	// lo3, carry = bits.Add(lo3, hi2, carry)
	// lo3, carryAlt = bits.Add(lo3, addrsc[i+3], carryAlt)
	// z[i+3] = lo2
	//
	// a = hi3 + carry + carryAlt // cannot overflow
	//
	// (3) addMulVirtualCarry optimizes for systems with explicitly computed carry bits
	// (loong64, mips, riscv64), cutting the number of actual instructions almost by half.
	// Look again at the original word-at-a-time version:
	//
	// lo1, hi1 := m * mulsrc[i]
	// lo1, carry = bits.Add(lo1, a, 0)
	// lo1, carryAlt = bits.Add(lo1, addsrc[i], 0)
	// z[i] = lo1
	// a = hi + carry + carryAlt // cannot overflow
	//
	// Although it uses four adds per word, those are cheap adds: the two bits.Add adds
	// use two instructions each (ADD+SLTU) and the final + adds only use one ADD each,
	// for a total of 6 instructions per word. In contrast, the middle stanzas in (2) use
	// only two “adds” per word, but these are SetCarry\|UseCarry adds, which compile to
	// five instruction each, for a total of 10 instructions per word. So the word-at-a-time
	// loop is actually better. And we can reorder things slightly to use only a single carry bit:
	//
	// lo1, hi1 := m * mulsrc[i]
	// lo1, carry = bits.Add(lo1, a, 0)
	// a = hi + carry
	// lo1, carry = bits.Add(lo1, addsrc[i], 0)
	// a = a + carry
	// z[i] = lo1
	func addMul(f *Func, addsrc, mulsrc string, mulIndex int) {
	a := f.Asm
	mh := HintNone
	if a.Arch == Arch386 && addsrc != "" {
	mh = HintMemOK // too few registers otherwise
	}
	m := f.ArgHint("m", mh)
	c := f.Arg("a")
	n := f.Arg("z_len")

	p := f.Pipe()
	if addsrc != "" {
	p.SetHint(addsrc, HintMemOK)
	}
	p.SetHint(mulsrc, HintMulSrc)
	unroll := []int{1, 4}
	switch a.Arch {
	case Arch386:
	unroll = []int{1} // too few registers
	case ArchARM:
	p.SetMaxColumns(2) // too few registers (but more than 386)
	case ArchARM64:
	unroll = []int{1, 8} // 5% speedup on c4as16
	}

	// See the large comment above for an explanation of the code being generated.
	// This is optimization strategy 1.
	p.Start(n, unroll...)
	p.Loop(func(in, out [][]Reg) {
	a.Comment("multiply")
	prev := c
	flag := SetCarry
	for i, x := range in[mulIndex] {
	hi := a.RegHint(HintMulHi)
	a.MulWide(m, x, x, hi)
	a.Add(prev, x, x, flag)
	flag = UseCarry \| SetCarry
	if prev != c {
	a.Free(prev)
	}
	out[0][i] = x
	prev = hi
	}
	a.Add(a.Imm(0), prev, c, UseCarry\|SmashCarry)
	if addsrc != "" {
	a.Comment("add")
	flag := SetCarry
	for i, x := range in[0] {
	a.Add(x, out[0][i], out[0][i], flag)
	flag = UseCarry \| SetCarry
	}
	a.Add(a.Imm(0), c, c, UseCarry\|SmashCarry)
	}
	p.StoreN(out)
	})

	f.StoreArg(c, "c")
	a.Ret()
	}

	func addMulAlt(f *Func) {
	a := f.Asm
	m := f.ArgHint("m", HintMulSrc)
	c := f.Arg("a")
	n := f.Arg("z_len")

	// On amd64, we need a non-immediate for the AtUnrollEnd adds.
	r0 := a.ZR()
	if !r0.Valid() {
	r0 = a.Reg()
	a.Mov(a.Imm(0), r0)
	}

	p := f.Pipe()
	p.SetLabel("alt")
	p.SetHint("x", HintMemOK)
	p.SetHint("y", HintMemOK)
	if a.Arch == ArchAMD64 {
	p.SetMaxColumns(2)
	}

	// See the large comment above for an explanation of the code being generated.
	// This is optimization strategy (2).
	var hi Reg
	prev := c
	p.Start(n, 1, 8)
	p.AtUnrollStart(func() {
	a.Comment("multiply and add")
	a.ClearCarry(AddCarry \| AltCarry)
	a.ClearCarry(AddCarry)
	hi = a.Reg()
	})
	p.AtUnrollEnd(func() {
	a.Add(r0, prev, c, UseCarry\|SmashCarry)
	a.Add(r0, c, c, UseCarry\|SmashCarry\|AltCarry)
	prev = c
	})
	p.Loop(func(in, out [][]Reg) {
	for i, y := range in[1] {
	x := in[0][i]
	lo := y
	if lo.IsMem() {
	lo = a.Reg()
	}
	a.MulWide(m, y, lo, hi)
	a.Add(prev, lo, lo, UseCarry\|SetCarry)
	a.Add(x, lo, lo, UseCarry\|SetCarry\|AltCarry)
	out[0][i] = lo
	prev, hi = hi, prev
	}
	p.StoreN(out)
	})

	f.StoreArg(c, "c")
	a.Ret()
	}

	func addMulVirtualCarry(f *Func, mulIndex int) {
	a := f.Asm
	m := f.Arg("m")
	c := f.Arg("a")
	n := f.Arg("z_len")

	// See the large comment above for an explanation of the code being generated.
	// This is optimization strategy (3).
	p := f.Pipe()
	p.Start(n, 1, 4)
	p.Loop(func(in, out [][]Reg) {
	a.Comment("synthetic carry, one column at a time")
	lo, hi := a.Reg(), a.Reg()
	for i, x := range in[mulIndex] {
	a.MulWide(m, x, lo, hi)
	if mulIndex == 1 {
	a.Add(in[0][i], lo, lo, SetCarry)
	a.Add(a.Imm(0), hi, hi, UseCarry\|SmashCarry)
	}
	a.Add(c, lo, x, SetCarry)
	a.Add(a.Imm(0), hi, c, UseCarry\|SmashCarry)
	out[0][i] = x
	}
	p.StoreN(out)
	})
	f.StoreArg(c, "c")
	a.Ret()
	}