| // Copyright 2025 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| package asmgen |
| |
| // shiftVU generates lshVU and rshVU, which do |
| // z, c = x << s and z, c = x >> s, for 0 < s < _W. |
| func shiftVU(a *Asm, name string) { |
| // Because these routines can be called for z.Lsh(z, N) and z.Rsh(z, N), |
| // the input and output slices may be aliased at different offsets. |
| // For example (on 64-bit systems), during z.Lsh(z, 65), &z[0] == &x[1], |
| // and during z.Rsh(z, 65), &z[1] == &x[0]. |
| // For left shift, we must process the slices from len(z)-1 down to 0, |
| // so that we don't overwrite a word before we need to read it. |
| // For right shift, we must process the slices from 0 up to len(z)-1. |
| // The different traversals at least make the two cases more consistent, |
| // since we're always delaying the output by one word compared |
| // to the input. |
| |
| f := a.Func("func " + name + "(z, x []Word, s uint) (c Word)") |
| |
| // Check for no input early, since we need to start by reading 1 word. |
| n := f.Arg("z_len") |
| a.JmpZero(n, "ret0") |
| |
| // Start loop by reading first input word. |
| s := f.ArgHint("s", HintShiftCount) |
| p := f.Pipe() |
| if name == "lshVU" { |
| p.SetBackward() |
| } |
| unroll := []int{1, 4} |
| if a.Arch == Arch386 { |
| unroll = []int{1} // too few registers for more |
| p.SetUseIndexCounter() |
| } |
| p.LoadPtrs(n) |
| a.Comment("shift first word into carry") |
| prev := p.LoadN(1)[0][0] |
| |
| // Decide how to shift. On systems with a wide shift (x86), use that. |
| // Otherwise, we need shift by s and negative (reverse) shift by 64-s or 32-s. |
| shift := a.Lsh |
| shiftWide := a.LshWide |
| negShift := a.Rsh |
| negShiftReg := a.RshReg |
| if name == "rshVU" { |
| shift = a.Rsh |
| shiftWide = a.RshWide |
| negShift = a.Lsh |
| negShiftReg = a.LshReg |
| } |
| if a.Arch.HasShiftWide() { |
| // Use wide shift to avoid needing negative shifts. |
| // The invariant is that prev holds the previous word (not shifted at all), |
| // to be used as input into the wide shift. |
| // After the loop finishes, prev holds the final output word to be written. |
| c := a.Reg() |
| shiftWide(s, prev, a.Imm(0), c) |
| f.StoreArg(c, "c") |
| a.Free(c) |
| a.Comment("shift remaining words") |
| p.Start(n, unroll...) |
| p.Loop(func(in [][]Reg, out [][]Reg) { |
| // We reuse the input registers as output, delayed one cycle; prev is the first output. |
| // After writing the outputs to memory, we can copy the final x value into prev |
| // for the next iteration. |
| old := prev |
| for i, x := range in[0] { |
| shiftWide(s, x, old, old) |
| out[0][i] = old |
| old = x |
| } |
| p.StoreN(out) |
| a.Mov(old, prev) |
| }) |
| a.Comment("store final shifted bits") |
| shift(s, prev, prev) |
| } else { |
| // Construct values from x << s and x >> (64-s). |
| // After the first word has been processed, the invariant is that |
| // prev holds x << s, to be used as the high bits of the next output word, |
| // once we find the low bits after reading the next input word. |
| // After the loop finishes, prev holds the final output word to be written. |
| sNeg := a.Reg() |
| a.Mov(a.Imm(a.Arch.WordBits), sNeg) |
| a.Sub(s, sNeg, sNeg, SmashCarry) |
| c := a.Reg() |
| negShift(sNeg, prev, c) |
| shift(s, prev, prev) |
| f.StoreArg(c, "c") |
| a.Free(c) |
| a.Comment("shift remaining words") |
| p.Start(n, unroll...) |
| p.Loop(func(in, out [][]Reg) { |
| if a.HasRegShift() { |
| // ARM (32-bit) allows shifts in most arithmetic expressions, |
| // including OR, letting us combine the negShift and a.Or. |
| // The simplest way to manage the registers is to do StoreN for |
| // one output at a time, and since we don't use multi-register |
| // stores on ARM, that doesn't hurt us. |
| out[0] = out[0][:1] |
| for _, x := range in[0] { |
| a.Or(negShiftReg(sNeg, x), prev, prev) |
| out[0][0] = prev |
| p.StoreN(out) |
| shift(s, x, prev) |
| } |
| return |
| } |
| // We reuse the input registers as output, delayed one cycle; z0 is the first output. |
| z0 := a.Reg() |
| z := z0 |
| for i, x := range in[0] { |
| negShift(sNeg, x, z) |
| a.Or(prev, z, z) |
| shift(s, x, prev) |
| out[0][i] = z |
| z = x |
| } |
| p.StoreN(out) |
| }) |
| a.Comment("store final shifted bits") |
| } |
| p.StoreN([][]Reg{{prev}}) |
| p.Done() |
| a.Free(s) |
| a.Ret() |
| |
| // Return 0, used from above. |
| a.Label("ret0") |
| f.StoreArg(a.Imm(0), "c") |
| a.Ret() |
| } |