| // Copyright 2025 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| //go:build goexperiment.simd && amd64 |
| |
| package archsimd |
| |
| // These constants represent the source pattern for the four parameters |
| // (a, b, c, d) passed to SelectFromPair and SelectFromPairGrouped. |
| // L means the element comes from the 'x' vector (Low), and |
| // H means it comes from the 'y' vector (High). |
| // The order of the letters corresponds to elements a, b, c, d. |
| // The underlying integer value is a bitmask where: |
| // Bit 0: Source of element 'a' (0 for x, 1 for y) |
| // Bit 1: Source of element 'b' (0 for x, 1 for y) |
| // Bit 2: Source of element 'c' (0 for x, 1 for y) |
| // Bit 3: Source of element 'd' (0 for x, 1 for y) |
| // Note that the least-significant bit is on the LEFT in this encoding. |
| const ( |
| _LLLL = iota // a:x, b:x, c:x, d:x |
| _HLLL // a:y, b:x, c:x, d:x |
| _LHLL // a:x, b:y, c:x, d:x |
| _HHLL // a:y, b:y, c:x, d:x |
| _LLHL // a:x, b:x, c:y, d:x |
| _HLHL // a:y, b:x, c:y, d:x |
| _LHHL // a:x, b:y, c:y, d:x |
| _HHHL // a:y, b:y, c:y, d:x |
| _LLLH // a:x, b:x, c:x, d:y |
| _HLLH // a:y, b:x, c:x, d:y |
| _LHLH // a:x, b:y, c:x, d:y |
| _HHLH // a:y, b:y, c:x, d:y |
| _LLHH // a:x, b:x, c:y, d:y |
| _HLHH // a:y, b:x, c:y, d:y |
| _LHHH // a:x, b:y, c:y, d:y |
| _HHHH // a:y, b:y, c:y, d:y |
| ) |
| |
| // These constants represent the source pattern for the four parameters |
| // (a, b, c, d) passed to SelectFromPair and SelectFromPairGrouped for |
| // two-element vectors. |
| const ( |
| _LL = iota |
| _HL |
| _LH |
| _HH |
| ) |
| |
| // SelectFromPair returns the selection of four elements from the two |
| // vectors x and y, where selector values in the range 0-3 specify |
| // elements from x and values in the range 4-7 specify the 0-3 elements |
| // of y. When the selectors are constants and the selection can be |
| // implemented in a single instruction, it will be, otherwise it |
| // requires two. a is the source index of the least element in the |
| // output, and b, c, and d are the indices of the 2nd, 3rd, and 4th |
| // elements in the output. For example, |
| // {1,2,4,8}.SelectFromPair(2,3,5,7,{9,25,49,81}) returns {4,8,25,81} |
| // |
| // If the selectors are not constant this will translate to a function |
| // call. |
| // |
| // Asm: VSHUFPS, CPU Feature: AVX |
| func (x Int32x4) SelectFromPair(a, b, c, d uint8, y Int32x4) Int32x4 { |
| // pattern gets the concatenation of "x or y?" bits |
| // (0 == x, 1 == y) |
| // This will determine operand choice/order and whether a second |
| // instruction is needed. |
| pattern := a>>2 + (b&4)>>1 + (c & 4) + (d&4)<<1 |
| |
| // a-d are masked down to their offsets within x or y |
| // this is not necessary for x, but this is easier on the |
| // eyes and reduces the risk of an error now or later. |
| a, b, c, d = a&3, b&3, c&3, d&3 |
| |
| switch pattern { |
| case _LLLL: |
| return x.concatSelectedConstant(cscimm4(a, b, c, d), x) |
| case _HHHH: |
| return y.concatSelectedConstant(cscimm4(a, b, c, d), y) |
| case _LLHH: |
| return x.concatSelectedConstant(cscimm4(a, b, c, d), y) |
| case _HHLL: |
| return y.concatSelectedConstant(cscimm4(a, b, c, d), x) |
| |
| case _HLLL: |
| z := y.concatSelectedConstant(cscimm4(a, a, b, b), x) |
| return z.concatSelectedConstant(cscimm4(0, 2, c, d), x) |
| case _LHLL: |
| z := x.concatSelectedConstant(cscimm4(a, a, b, b), y) |
| return z.concatSelectedConstant(cscimm4(0, 2, c, d), x) |
| |
| case _HLHH: |
| z := y.concatSelectedConstant(cscimm4(a, a, b, b), x) |
| return z.concatSelectedConstant(cscimm4(0, 2, c, d), y) |
| case _LHHH: |
| z := x.concatSelectedConstant(cscimm4(a, a, b, b), y) |
| return z.concatSelectedConstant(cscimm4(0, 2, c, d), y) |
| |
| case _LLLH: |
| z := x.concatSelectedConstant(cscimm4(c, c, d, d), y) |
| return x.concatSelectedConstant(cscimm4(a, b, 0, 2), z) |
| case _LLHL: |
| z := y.concatSelectedConstant(cscimm4(c, c, d, d), x) |
| return x.concatSelectedConstant(cscimm4(a, b, 0, 2), z) |
| case _HHLH: |
| z := x.concatSelectedConstant(cscimm4(c, c, d, d), y) |
| return y.concatSelectedConstant(cscimm4(a, b, 0, 2), z) |
| case _HHHL: |
| z := y.concatSelectedConstant(cscimm4(c, c, d, d), x) |
| return y.concatSelectedConstant(cscimm4(a, b, 0, 2), z) |
| |
| case _LHLH: |
| z := x.concatSelectedConstant(cscimm4(a, c, b, d), y) |
| return z.concatSelectedConstant(0b11_01_10_00 /* =cscimm4(0, 2, 1, 3) */, z) |
| case _HLHL: |
| z := x.concatSelectedConstant(cscimm4(b, d, a, c), y) |
| return z.concatSelectedConstant(0b01_11_00_10 /* =cscimm4(2, 0, 3, 1) */, z) |
| case _HLLH: |
| z := x.concatSelectedConstant(cscimm4(b, c, a, d), y) |
| return z.concatSelectedConstant(0b11_01_00_10 /* =cscimm4(2, 0, 1, 3) */, z) |
| case _LHHL: |
| z := x.concatSelectedConstant(cscimm4(a, d, b, c), y) |
| return z.concatSelectedConstant(0b01_11_10_00 /* =cscimm4(0, 2, 3, 1) */, z) |
| } |
| panic("missing case, switch should be exhaustive") |
| } |
| |
| // SelectFromPair returns the selection of four elements from the two |
| // vectors x and y, where selector values in the range 0-3 specify |
| // elements from x and values in the range 4-7 specify the 0-3 elements |
| // of y. When the selectors are constants and can be the selection |
| // can be implemented in a single instruction, it will be, otherwise |
| // it requires two. a is the source index of the least element in the |
| // output, and b, c, and d are the indices of the 2nd, 3rd, and 4th |
| // elements in the output. For example, |
| // {1,2,4,8}.SelectFromPair(2,3,5,7,{9,25,49,81}) returns {4,8,25,81} |
| // |
| // If the selectors are not constant this will translate to a function |
| // call. |
| // |
| // Asm: VSHUFPS, CPU Feature: AVX |
| func (x Uint32x4) SelectFromPair(a, b, c, d uint8, y Uint32x4) Uint32x4 { |
| pattern := a>>2 + (b&4)>>1 + (c & 4) + (d&4)<<1 |
| |
| a, b, c, d = a&3, b&3, c&3, d&3 |
| |
| switch pattern { |
| case _LLLL: |
| return x.concatSelectedConstant(cscimm4(a, b, c, d), x) |
| case _HHHH: |
| return y.concatSelectedConstant(cscimm4(a, b, c, d), y) |
| case _LLHH: |
| return x.concatSelectedConstant(cscimm4(a, b, c, d), y) |
| case _HHLL: |
| return y.concatSelectedConstant(cscimm4(a, b, c, d), x) |
| |
| case _HLLL: |
| z := y.concatSelectedConstant(cscimm4(a, a, b, b), x) |
| return z.concatSelectedConstant(cscimm4(0, 2, c, d), x) |
| case _LHLL: |
| z := x.concatSelectedConstant(cscimm4(a, a, b, b), y) |
| return z.concatSelectedConstant(cscimm4(0, 2, c, d), x) |
| |
| case _HLHH: |
| z := y.concatSelectedConstant(cscimm4(a, a, b, b), x) |
| return z.concatSelectedConstant(cscimm4(0, 2, c, d), y) |
| case _LHHH: |
| z := x.concatSelectedConstant(cscimm4(a, a, b, b), y) |
| return z.concatSelectedConstant(cscimm4(0, 2, c, d), y) |
| |
| case _LLLH: |
| z := x.concatSelectedConstant(cscimm4(c, c, d, d), y) |
| return x.concatSelectedConstant(cscimm4(a, b, 0, 2), z) |
| case _LLHL: |
| z := y.concatSelectedConstant(cscimm4(c, c, d, d), x) |
| return x.concatSelectedConstant(cscimm4(a, b, 0, 2), z) |
| case _HHLH: |
| z := x.concatSelectedConstant(cscimm4(c, c, d, d), y) |
| return y.concatSelectedConstant(cscimm4(a, b, 0, 2), z) |
| case _HHHL: |
| z := y.concatSelectedConstant(cscimm4(c, c, d, d), x) |
| return y.concatSelectedConstant(cscimm4(a, b, 0, 2), z) |
| |
| case _LHLH: |
| z := x.concatSelectedConstant(cscimm4(a, c, b, d), y) |
| return z.concatSelectedConstant(0b11_01_10_00 /* =cscimm4(0, 2, 1, 3) */, z) |
| case _HLHL: |
| z := x.concatSelectedConstant(cscimm4(b, d, a, c), y) |
| return z.concatSelectedConstant(0b01_11_00_10 /* =cscimm4(2, 0, 3, 1) */, z) |
| case _HLLH: |
| z := x.concatSelectedConstant(cscimm4(b, c, a, d), y) |
| return z.concatSelectedConstant(0b11_01_00_10 /* =cscimm4(2, 0, 1, 3) */, z) |
| case _LHHL: |
| z := x.concatSelectedConstant(cscimm4(a, d, b, c), y) |
| return z.concatSelectedConstant(0b01_11_10_00 /* =cscimm4(0, 2, 3, 1) */, z) |
| } |
| panic("missing case, switch should be exhaustive") |
| } |
| |
| // SelectFromPair returns the selection of four elements from the two |
| // vectors x and y, where selector values in the range 0-3 specify |
| // elements from x and values in the range 4-7 specify the 0-3 elements |
| // of y. When the selectors are constants and can be the selection |
| // can be implemented in a single instruction, it will be, otherwise |
| // it requires two. a is the source index of the least element in the |
| // output, and b, c, and d are the indices of the 2nd, 3rd, and 4th |
| // elements in the output. For example, |
| // {1,2,4,8}.SelectFromPair(2,3,5,7,{9,25,49,81}) returns {4,8,25,81} |
| // |
| // If the selectors are not constant this will translate to a function |
| // call. |
| // |
| // Asm: VSHUFPS, CPU Feature: AVX |
| func (x Float32x4) SelectFromPair(a, b, c, d uint8, y Float32x4) Float32x4 { |
| pattern := a>>2 + (b&4)>>1 + (c & 4) + (d&4)<<1 |
| |
| a, b, c, d = a&3, b&3, c&3, d&3 |
| |
| switch pattern { |
| case _LLLL: |
| return x.concatSelectedConstant(cscimm4(a, b, c, d), x) |
| case _HHHH: |
| return y.concatSelectedConstant(cscimm4(a, b, c, d), y) |
| case _LLHH: |
| return x.concatSelectedConstant(cscimm4(a, b, c, d), y) |
| case _HHLL: |
| return y.concatSelectedConstant(cscimm4(a, b, c, d), x) |
| |
| case _HLLL: |
| z := y.concatSelectedConstant(cscimm4(a, a, b, b), x) |
| return z.concatSelectedConstant(cscimm4(0, 2, c, d), x) |
| case _LHLL: |
| z := x.concatSelectedConstant(cscimm4(a, a, b, b), y) |
| return z.concatSelectedConstant(cscimm4(0, 2, c, d), x) |
| |
| case _HLHH: |
| z := y.concatSelectedConstant(cscimm4(a, a, b, b), x) |
| return z.concatSelectedConstant(cscimm4(0, 2, c, d), y) |
| case _LHHH: |
| z := x.concatSelectedConstant(cscimm4(a, a, b, b), y) |
| return z.concatSelectedConstant(cscimm4(0, 2, c, d), y) |
| |
| case _LLLH: |
| z := x.concatSelectedConstant(cscimm4(c, c, d, d), y) |
| return x.concatSelectedConstant(cscimm4(a, b, 0, 2), z) |
| case _LLHL: |
| z := y.concatSelectedConstant(cscimm4(c, c, d, d), x) |
| return x.concatSelectedConstant(cscimm4(a, b, 0, 2), z) |
| case _HHLH: |
| z := x.concatSelectedConstant(cscimm4(c, c, d, d), y) |
| return y.concatSelectedConstant(cscimm4(a, b, 0, 2), z) |
| case _HHHL: |
| z := y.concatSelectedConstant(cscimm4(c, c, d, d), x) |
| return y.concatSelectedConstant(cscimm4(a, b, 0, 2), z) |
| |
| case _LHLH: |
| z := x.concatSelectedConstant(cscimm4(a, c, b, d), y) |
| return z.concatSelectedConstant(0b11_01_10_00 /* =cscimm4(0, 2, 1, 3) */, z) |
| case _HLHL: |
| z := x.concatSelectedConstant(cscimm4(b, d, a, c), y) |
| return z.concatSelectedConstant(0b01_11_00_10 /* =cscimm4(2, 0, 3, 1) */, z) |
| case _HLLH: |
| z := x.concatSelectedConstant(cscimm4(b, c, a, d), y) |
| return z.concatSelectedConstant(0b11_01_00_10 /* =cscimm4(2, 0, 1, 3) */, z) |
| case _LHHL: |
| z := x.concatSelectedConstant(cscimm4(a, d, b, c), y) |
| return z.concatSelectedConstant(0b01_11_10_00 /* =cscimm4(0, 2, 3, 1) */, z) |
| } |
| panic("missing case, switch should be exhaustive") |
| } |
| |
| // SelectFromPairGrouped returns, for each of the two 128-bit halves of |
| // the vectors x and y, the selection of four elements from x and y, |
| // where selector values in the range 0-3 specify elements from x and |
| // values in the range 4-7 specify the 0-3 elements of y. |
| // When the selectors are constants and can be the selection |
| // can be implemented in a single instruction, it will be, otherwise |
| // it requires two. a is the source index of the least element in the |
| // output, and b, c, and d are the indices of the 2nd, 3rd, and 4th |
| // elements in the output. For example, |
| // {1,2,4,8,16,32,64,128}.SelectFromPair(2,3,5,7,{9,25,49,81,121,169,225,289}) |
| // |
| // returns {4,8,25,81,64,128,169,289} |
| // |
| // If the selectors are not constant this will translate to a function |
| // call. |
| // |
| // Asm: VSHUFPS, CPU Feature: AVX |
| func (x Int32x8) SelectFromPairGrouped(a, b, c, d uint8, y Int32x8) Int32x8 { |
| pattern := a>>2 + (b&4)>>1 + (c & 4) + (d&4)<<1 |
| |
| a, b, c, d = a&3, b&3, c&3, d&3 |
| |
| switch pattern { |
| case _LLLL: |
| return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x) |
| case _HHHH: |
| return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y) |
| case _LLHH: |
| return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y) |
| case _HHLL: |
| return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x) |
| |
| case _HLLL: |
| z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x) |
| return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x) |
| case _LHLL: |
| z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y) |
| return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x) |
| |
| case _HLHH: |
| z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x) |
| return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y) |
| case _LHHH: |
| z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y) |
| return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y) |
| |
| case _LLLH: |
| z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y) |
| return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z) |
| case _LLHL: |
| z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x) |
| return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z) |
| case _HHLH: |
| z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y) |
| return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z) |
| case _HHHL: |
| z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x) |
| return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z) |
| |
| case _LHLH: |
| z := x.concatSelectedConstantGrouped(cscimm4(a, c, b, d), y) |
| return z.concatSelectedConstantGrouped(0b11_01_10_00 /* =cscimm4(0, 2, 1, 3) */, z) |
| case _HLHL: |
| z := x.concatSelectedConstantGrouped(cscimm4(b, d, a, c), y) |
| return z.concatSelectedConstantGrouped(0b01_11_00_10 /* =cscimm4(2, 0, 3, 1) */, z) |
| case _HLLH: |
| z := x.concatSelectedConstantGrouped(cscimm4(b, c, a, d), y) |
| return z.concatSelectedConstantGrouped(0b11_01_00_10 /* =cscimm4(2, 0, 1, 3) */, z) |
| case _LHHL: |
| z := x.concatSelectedConstantGrouped(cscimm4(a, d, b, c), y) |
| return z.concatSelectedConstantGrouped(0b01_11_10_00 /* =cscimm4(0, 2, 3, 1) */, z) |
| } |
| panic("missing case, switch should be exhaustive") |
| } |
| |
| // SelectFromPairGrouped returns, for each of the two 128-bit halves of |
| // the vectors x and y, the selection of four elements from x and y, |
| // where selector values in the range 0-3 specify elements from x and |
| // values in the range 4-7 specify the 0-3 elements of y. |
| // When the selectors are constants and can be the selection |
| // can be implemented in a single instruction, it will be, otherwise |
| // it requires two. a is the source index of the least element in the |
| // output, and b, c, and d are the indices of the 2nd, 3rd, and 4th |
| // elements in the output. For example, |
| // {1,2,4,8,16,32,64,128}.SelectFromPair(2,3,5,7,{9,25,49,81,121,169,225,289}) |
| // |
| // returns {4,8,25,81,64,128,169,289} |
| // |
| // If the selectors are not constant this will translate to a function |
| // call. |
| // |
| // Asm: VSHUFPS, CPU Feature: AVX |
| func (x Uint32x8) SelectFromPairGrouped(a, b, c, d uint8, y Uint32x8) Uint32x8 { |
| pattern := a>>2 + (b&4)>>1 + (c & 4) + (d&4)<<1 |
| |
| a, b, c, d = a&3, b&3, c&3, d&3 |
| |
| switch pattern { |
| case _LLLL: |
| return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x) |
| case _HHHH: |
| return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y) |
| case _LLHH: |
| return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y) |
| case _HHLL: |
| return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x) |
| |
| case _HLLL: |
| z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x) |
| return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x) |
| case _LHLL: |
| z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y) |
| return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x) |
| |
| case _HLHH: |
| z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x) |
| return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y) |
| case _LHHH: |
| z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y) |
| return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y) |
| |
| case _LLLH: |
| z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y) |
| return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z) |
| case _LLHL: |
| z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x) |
| return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z) |
| case _HHLH: |
| z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y) |
| return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z) |
| case _HHHL: |
| z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x) |
| return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z) |
| |
| case _LHLH: |
| z := x.concatSelectedConstantGrouped(cscimm4(a, c, b, d), y) |
| return z.concatSelectedConstantGrouped(0b11_01_10_00 /* =cscimm4(0, 2, 1, 3) */, z) |
| case _HLHL: |
| z := x.concatSelectedConstantGrouped(cscimm4(b, d, a, c), y) |
| return z.concatSelectedConstantGrouped(0b01_11_00_10 /* =cscimm4(2, 0, 3, 1) */, z) |
| case _HLLH: |
| z := x.concatSelectedConstantGrouped(cscimm4(b, c, a, d), y) |
| return z.concatSelectedConstantGrouped(0b11_01_00_10 /* =cscimm4(2, 0, 1, 3) */, z) |
| case _LHHL: |
| z := x.concatSelectedConstantGrouped(cscimm4(a, d, b, c), y) |
| return z.concatSelectedConstantGrouped(0b01_11_10_00 /* =cscimm4(0, 2, 3, 1) */, z) |
| } |
| panic("missing case, switch should be exhaustive") |
| } |
| |
| // SelectFromPairGrouped returns, for each of the two 128-bit halves of |
| // the vectors x and y, the selection of four elements from x and y, |
| // where selector values in the range 0-3 specify elements from x and |
| // values in the range 4-7 specify the 0-3 elements of y. |
| // When the selectors are constants and can be the selection |
| // can be implemented in a single instruction, it will be, otherwise |
| // it requires two. a is the source index of the least element in the |
| // output, and b, c, and d are the indices of the 2nd, 3rd, and 4th |
| // elements in the output. For example, |
| // {1,2,4,8,16,32,64,128}.SelectFromPair(2,3,5,7,{9,25,49,81,121,169,225,289}) |
| // |
| // returns {4,8,25,81,64,128,169,289} |
| // |
| // If the selectors are not constant this will translate to a function |
| // call. |
| // |
| // Asm: VSHUFPS, CPU Feature: AVX |
| func (x Float32x8) SelectFromPairGrouped(a, b, c, d uint8, y Float32x8) Float32x8 { |
| pattern := a>>2 + (b&4)>>1 + (c & 4) + (d&4)<<1 |
| |
| a, b, c, d = a&3, b&3, c&3, d&3 |
| |
| switch pattern { |
| case _LLLL: |
| return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x) |
| case _HHHH: |
| return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y) |
| case _LLHH: |
| return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y) |
| case _HHLL: |
| return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x) |
| |
| case _HLLL: |
| z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x) |
| return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x) |
| case _LHLL: |
| z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y) |
| return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x) |
| |
| case _HLHH: |
| z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x) |
| return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y) |
| case _LHHH: |
| z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y) |
| return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y) |
| |
| case _LLLH: |
| z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y) |
| return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z) |
| case _LLHL: |
| z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x) |
| return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z) |
| case _HHLH: |
| z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y) |
| return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z) |
| case _HHHL: |
| z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x) |
| return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z) |
| |
| case _LHLH: |
| z := x.concatSelectedConstantGrouped(cscimm4(a, c, b, d), y) |
| return z.concatSelectedConstantGrouped(0b11_01_10_00 /* =cscimm4(0, 2, 1, 3) */, z) |
| case _HLHL: |
| z := x.concatSelectedConstantGrouped(cscimm4(b, d, a, c), y) |
| return z.concatSelectedConstantGrouped(0b01_11_00_10 /* =cscimm4(2, 0, 3, 1) */, z) |
| case _HLLH: |
| z := x.concatSelectedConstantGrouped(cscimm4(b, c, a, d), y) |
| return z.concatSelectedConstantGrouped(0b11_01_00_10 /* =cscimm4(2, 0, 1, 3) */, z) |
| case _LHHL: |
| z := x.concatSelectedConstantGrouped(cscimm4(a, d, b, c), y) |
| return z.concatSelectedConstantGrouped(0b01_11_10_00 /* =cscimm4(0, 2, 3, 1) */, z) |
| } |
| panic("missing case, switch should be exhaustive") |
| } |
| |
| // SelectFromPairGrouped returns, for each of the four 128-bit subvectors |
| // of the vectors x and y, the selection of four elements from x and y, |
| // where selector values in the range 0-3 specify elements from x and |
| // values in the range 4-7 specify the 0-3 elements of y. |
| // When the selectors are constants and can be the selection |
| // can be implemented in a single instruction, it will be, otherwise |
| // it requires two. |
| // |
| // If the selectors are not constant this will translate to a function |
| // call. |
| // |
| // Asm: VSHUFPS, CPU Feature: AVX512 |
| func (x Int32x16) SelectFromPairGrouped(a, b, c, d uint8, y Int32x16) Int32x16 { |
| pattern := a>>2 + (b&4)>>1 + (c & 4) + (d&4)<<1 |
| |
| a, b, c, d = a&3, b&3, c&3, d&3 |
| |
| switch pattern { |
| case _LLLL: |
| return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x) |
| case _HHHH: |
| return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y) |
| case _LLHH: |
| return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y) |
| case _HHLL: |
| return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x) |
| |
| case _HLLL: |
| z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x) |
| return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x) |
| case _LHLL: |
| z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y) |
| return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x) |
| |
| case _HLHH: |
| z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x) |
| return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y) |
| case _LHHH: |
| z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y) |
| return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y) |
| |
| case _LLLH: |
| z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y) |
| return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z) |
| case _LLHL: |
| z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x) |
| return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z) |
| case _HHLH: |
| z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y) |
| return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z) |
| case _HHHL: |
| z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x) |
| return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z) |
| |
| case _LHLH: |
| z := x.concatSelectedConstantGrouped(cscimm4(a, c, b, d), y) |
| return z.concatSelectedConstantGrouped(0b11_01_10_00 /* =cscimm4(0, 2, 1, 3) */, z) |
| case _HLHL: |
| z := x.concatSelectedConstantGrouped(cscimm4(b, d, a, c), y) |
| return z.concatSelectedConstantGrouped(0b01_11_00_10 /* =cscimm4(2, 0, 3, 1) */, z) |
| case _HLLH: |
| z := x.concatSelectedConstantGrouped(cscimm4(b, c, a, d), y) |
| return z.concatSelectedConstantGrouped(0b11_01_00_10 /* =cscimm4(2, 0, 1, 3) */, z) |
| case _LHHL: |
| z := x.concatSelectedConstantGrouped(cscimm4(a, d, b, c), y) |
| return z.concatSelectedConstantGrouped(0b01_11_10_00 /* =cscimm4(0, 2, 3, 1) */, z) |
| } |
| panic("missing case, switch should be exhaustive") |
| } |
| |
| // SelectFromPairGrouped returns, for each of the four 128-bit subvectors |
| // of the vectors x and y, the selection of four elements from x and y, |
| // where selector values in the range 0-3 specify elements from x and |
| // values in the range 4-7 specify the 0-3 elements of y. |
| // When the selectors are constants and can be the selection |
| // can be implemented in a single instruction, it will be, otherwise |
| // it requires two. |
| // |
| // If the selectors are not constant this will translate to a function |
| // call. |
| // |
| // Asm: VSHUFPS, CPU Feature: AVX512 |
| func (x Uint32x16) SelectFromPairGrouped(a, b, c, d uint8, y Uint32x16) Uint32x16 { |
| pattern := a>>2 + (b&4)>>1 + (c & 4) + (d&4)<<1 |
| |
| a, b, c, d = a&3, b&3, c&3, d&3 |
| |
| switch pattern { |
| case _LLLL: |
| return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x) |
| case _HHHH: |
| return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y) |
| case _LLHH: |
| return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y) |
| case _HHLL: |
| return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x) |
| |
| case _HLLL: |
| z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x) |
| return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x) |
| case _LHLL: |
| z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y) |
| return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x) |
| |
| case _HLHH: |
| z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x) |
| return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y) |
| case _LHHH: |
| z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y) |
| return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y) |
| |
| case _LLLH: |
| z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y) |
| return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z) |
| case _LLHL: |
| z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x) |
| return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z) |
| case _HHLH: |
| z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y) |
| return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z) |
| case _HHHL: |
| z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x) |
| return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z) |
| |
| case _LHLH: |
| z := x.concatSelectedConstantGrouped(cscimm4(a, c, b, d), y) |
| return z.concatSelectedConstantGrouped(0b11_01_10_00 /* =cscimm4(0, 2, 1, 3) */, z) |
| case _HLHL: |
| z := x.concatSelectedConstantGrouped(cscimm4(b, d, a, c), y) |
| return z.concatSelectedConstantGrouped(0b01_11_00_10 /* =cscimm4(2, 0, 3, 1) */, z) |
| case _HLLH: |
| z := x.concatSelectedConstantGrouped(cscimm4(b, c, a, d), y) |
| return z.concatSelectedConstantGrouped(0b11_01_00_10 /* =cscimm4(2, 0, 1, 3) */, z) |
| case _LHHL: |
| z := x.concatSelectedConstantGrouped(cscimm4(a, d, b, c), y) |
| return z.concatSelectedConstantGrouped(0b01_11_10_00 /* =cscimm4(0, 2, 3, 1) */, z) |
| } |
| panic("missing case, switch should be exhaustive") |
| } |
| |
| // SelectFromPairGrouped returns, for each of the four 128-bit subvectors |
| // of the vectors x and y, the selection of four elements from x and y, |
| // where selector values in the range 0-3 specify elements from x and |
| // values in the range 4-7 specify the 0-3 elements of y. |
| // When the selectors are constants and can be the selection |
| // can be implemented in a single instruction, it will be, otherwise |
| // it requires two. |
| // |
| // If the selectors are not constant this will translate to a function |
| // call. |
| // |
| // Asm: VSHUFPS, CPU Feature: AVX512 |
| func (x Float32x16) SelectFromPairGrouped(a, b, c, d uint8, y Float32x16) Float32x16 { |
| pattern := a>>2 + (b&4)>>1 + (c & 4) + (d&4)<<1 |
| |
| a, b, c, d = a&3, b&3, c&3, d&3 |
| |
| switch pattern { |
| case _LLLL: |
| return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x) |
| case _HHHH: |
| return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y) |
| case _LLHH: |
| return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y) |
| case _HHLL: |
| return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x) |
| |
| case _HLLL: |
| z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x) |
| return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x) |
| case _LHLL: |
| z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y) |
| return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x) |
| |
| case _HLHH: |
| z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x) |
| return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y) |
| case _LHHH: |
| z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y) |
| return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y) |
| |
| case _LLLH: |
| z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y) |
| return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z) |
| case _LLHL: |
| z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x) |
| return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z) |
| case _HHLH: |
| z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y) |
| return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z) |
| case _HHHL: |
| z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x) |
| return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z) |
| |
| case _LHLH: |
| z := x.concatSelectedConstantGrouped(cscimm4(a, c, b, d), y) |
| return z.concatSelectedConstantGrouped(0b11_01_10_00 /* =cscimm4(0, 2, 1, 3) */, z) |
| case _HLHL: |
| z := x.concatSelectedConstantGrouped(cscimm4(b, d, a, c), y) |
| return z.concatSelectedConstantGrouped(0b01_11_00_10 /* =cscimm4(2, 0, 3, 1) */, z) |
| case _HLLH: |
| z := x.concatSelectedConstantGrouped(cscimm4(b, c, a, d), y) |
| return z.concatSelectedConstantGrouped(0b11_01_00_10 /* =cscimm4(2, 0, 1, 3) */, z) |
| case _LHHL: |
| z := x.concatSelectedConstantGrouped(cscimm4(a, d, b, c), y) |
| return z.concatSelectedConstantGrouped(0b01_11_10_00 /* =cscimm4(0, 2, 3, 1) */, z) |
| } |
| panic("missing case, switch should be exhaustive") |
| } |
| |
| // cscimm4 converts the 4 vector element indices into a single |
| // uint8 for use as an immediate. |
| func cscimm4(a, b, c, d uint8) uint8 { |
| return uint8(a + b<<2 + c<<4 + d<<6) |
| } |
| |
| // cscimm2 converts the 2 vector element indices into a single |
| // uint8 for use as an immediate. |
| func cscimm2(a, b uint8) uint8 { |
| return uint8(a + b<<1) |
| } |
| |
| // cscimm2g2 converts the 2 vector element indices into a single |
| // uint8 for use as an immediate, but duplicated for VSHUFPD |
| // to emulate grouped behavior of VSHUFPS |
| func cscimm2g2(a, b uint8) uint8 { |
| g := cscimm2(a, b) |
| return g + g<<2 |
| } |
| |
| // cscimm2g4 converts the 2 vector element indices into a single |
| // uint8 for use as an immediate, but with four copies for VSHUFPD |
| // to emulate grouped behavior of VSHUFPS |
| func cscimm2g4(a, b uint8) uint8 { |
| g := cscimm2g2(a, b) |
| return g + g<<4 |
| } |
| |
| // SelectFromPair returns the selection of two elements from the two |
| // vectors x and y, where selector values in the range 0-1 specify |
| // elements from x and values in the range 2-3 specify the 0-1 elements |
| // of y. When the selectors are constants the selection can be |
| // implemented in a single instruction. |
| // |
| // If the selectors are not constant this will translate to a function |
| // call. |
| // |
| // Asm: VSHUFPD, CPU Feature: AVX |
| func (x Uint64x2) SelectFromPair(a, b uint8, y Uint64x2) Uint64x2 { |
| pattern := (a&2)>>1 + (b & 2) |
| |
| a, b = a&1, b&1 |
| |
| switch pattern { |
| case _LL: |
| return x.concatSelectedConstant(cscimm2(a, b), x) |
| case _HH: |
| return y.concatSelectedConstant(cscimm2(a, b), y) |
| case _LH: |
| return x.concatSelectedConstant(cscimm2(a, b), y) |
| case _HL: |
| return y.concatSelectedConstant(cscimm2(a, b), x) |
| } |
| panic("missing case, switch should be exhaustive") |
| } |
| |
| // SelectFromPairGrouped returns, for each of the two 128-bit halves of |
| // the vectors x and y, the selection of two elements from the two |
| // vectors x and y, where selector values in the range 0-1 specify |
| // elements from x and values in the range 2-3 specify the 0-1 elements |
| // of y. When the selectors are constants the selection can be |
| // implemented in a single instruction. |
| // |
| // If the selectors are not constant this will translate to a function |
| // call. |
| // |
| // Asm: VSHUFPD, CPU Feature: AVX |
| func (x Uint64x4) SelectFromPairGrouped(a, b uint8, y Uint64x4) Uint64x4 { |
| pattern := (a&2)>>1 + (b & 2) |
| |
| a, b = a&1, b&1 |
| |
| switch pattern { |
| case _LL: |
| return x.concatSelectedConstantGrouped(cscimm2g2(a, b), x) |
| case _HH: |
| return y.concatSelectedConstantGrouped(cscimm2g2(a, b), y) |
| case _LH: |
| return x.concatSelectedConstantGrouped(cscimm2g2(a, b), y) |
| case _HL: |
| return y.concatSelectedConstantGrouped(cscimm2g2(a, b), x) |
| } |
| panic("missing case, switch should be exhaustive") |
| } |
| |
| // SelectFromPairGrouped returns, for each of the four 128-bit subvectors |
| // of the vectors x and y, the selection of two elements from the two |
| // vectors x and y, where selector values in the range 0-1 specify |
| // elements from x and values in the range 2-3 specify the 0-1 elements |
| // of y. When the selectors are constants the selection can be |
| // implemented in a single instruction. |
| // |
| // If the selectors are not constant this will translate to a function |
| // call. |
| // |
| // Asm: VSHUFPD, CPU Feature: AVX512 |
| func (x Uint64x8) SelectFromPairGrouped(a, b uint8, y Uint64x8) Uint64x8 { |
| pattern := (a&2)>>1 + (b & 2) |
| |
| a, b = a&1, b&1 |
| |
| switch pattern { |
| case _LL: |
| return x.concatSelectedConstantGrouped(cscimm2g4(a, b), x) |
| case _HH: |
| return y.concatSelectedConstantGrouped(cscimm2g4(a, b), y) |
| case _LH: |
| return x.concatSelectedConstantGrouped(cscimm2g4(a, b), y) |
| case _HL: |
| return y.concatSelectedConstantGrouped(cscimm2g4(a, b), x) |
| } |
| panic("missing case, switch should be exhaustive") |
| } |
| |
| // SelectFromPair returns the selection of two elements from the two |
| // vectors x and y, where selector values in the range 0-1 specify |
| // elements from x and values in the range 2-3 specify the 0-1 elements |
| // of y. When the selectors are constants the selection can be |
| // implemented in a single instruction. |
| // |
| // If the selectors are not constant this will translate to a function |
| // call. |
| // |
| // Asm: VSHUFPD, CPU Feature: AVX |
| func (x Float64x2) SelectFromPair(a, b uint8, y Float64x2) Float64x2 { |
| pattern := (a&2)>>1 + (b & 2) |
| |
| a, b = a&1, b&1 |
| |
| switch pattern { |
| case _LL: |
| return x.concatSelectedConstant(cscimm2(a, b), x) |
| case _HH: |
| return y.concatSelectedConstant(cscimm2(a, b), y) |
| case _LH: |
| return x.concatSelectedConstant(cscimm2(a, b), y) |
| case _HL: |
| return y.concatSelectedConstant(cscimm2(a, b), x) |
| } |
| panic("missing case, switch should be exhaustive") |
| } |
| |
| // SelectFromPairGrouped returns, for each of the two 128-bit halves of |
| // the vectors x and y, the selection of two elements from the two |
| // vectors x and y, where selector values in the range 0-1 specify |
| // elements from x and values in the range 2-3 specify the 0-1 elements |
| // of y. When the selectors are constants the selection can be |
| // implemented in a single instruction. |
| // |
| // If the selectors are not constant this will translate to a function |
| // call. |
| // |
| // Asm: VSHUFPD, CPU Feature: AVX |
| func (x Float64x4) SelectFromPairGrouped(a, b uint8, y Float64x4) Float64x4 { |
| pattern := (a&2)>>1 + (b & 2) |
| |
| a, b = a&1, b&1 |
| |
| switch pattern { |
| case _LL: |
| return x.concatSelectedConstantGrouped(cscimm2g2(a, b), x) |
| case _HH: |
| return y.concatSelectedConstantGrouped(cscimm2g2(a, b), y) |
| case _LH: |
| return x.concatSelectedConstantGrouped(cscimm2g2(a, b), y) |
| case _HL: |
| return y.concatSelectedConstantGrouped(cscimm2g2(a, b), x) |
| } |
| panic("missing case, switch should be exhaustive") |
| } |
| |
| // SelectFromPairGrouped returns, for each of the four 128-bit subvectors |
| // of the vectors x and y, the selection of two elements from the two |
| // vectors x and y, where selector values in the range 0-1 specify |
| // elements from x and values in the range 2-3 specify the 0-1 elements |
| // of y. When the selectors are constants the selection can be |
| // implemented in a single instruction. |
| // |
| // If the selectors are not constant this will translate to a function |
| // call. |
| // |
| // Asm: VSHUFPD, CPU Feature: AVX512 |
| func (x Float64x8) SelectFromPairGrouped(a, b uint8, y Float64x8) Float64x8 { |
| pattern := (a&2)>>1 + (b & 2) |
| |
| a, b = a&1, b&1 |
| |
| switch pattern { |
| case _LL: |
| return x.concatSelectedConstantGrouped(cscimm2g4(a, b), x) |
| case _HH: |
| return y.concatSelectedConstantGrouped(cscimm2g4(a, b), y) |
| case _LH: |
| return x.concatSelectedConstantGrouped(cscimm2g4(a, b), y) |
| case _HL: |
| return y.concatSelectedConstantGrouped(cscimm2g4(a, b), x) |
| } |
| panic("missing case, switch should be exhaustive") |
| } |
| |
| // SelectFromPair returns the selection of two elements from the two |
| // vectors x and y, where selector values in the range 0-1 specify |
| // elements from x and values in the range 2-3 specify the 0-1 elements |
| // of y. When the selectors are constants the selection can be |
| // implemented in a single instruction. |
| // |
| // If the selectors are not constant this will translate to a function |
| // call. |
| // |
| // Asm: VSHUFPD, CPU Feature: AVX |
| func (x Int64x2) SelectFromPair(a, b uint8, y Int64x2) Int64x2 { |
| pattern := (a&2)>>1 + (b & 2) |
| |
| a, b = a&1, b&1 |
| |
| switch pattern { |
| case _LL: |
| return x.concatSelectedConstant(cscimm2(a, b), x) |
| case _HH: |
| return y.concatSelectedConstant(cscimm2(a, b), y) |
| case _LH: |
| return x.concatSelectedConstant(cscimm2(a, b), y) |
| case _HL: |
| return y.concatSelectedConstant(cscimm2(a, b), x) |
| } |
| panic("missing case, switch should be exhaustive") |
| } |
| |
| // SelectFromPairGrouped returns, for each of the two 128-bit halves of |
| // the vectors x and y, the selection of two elements from the two |
| // vectors x and y, where selector values in the range 0-1 specify |
| // elements from x and values in the range 2-3 specify the 0-1 elements |
| // of y. When the selectors are constants the selection can be |
| // implemented in a single instruction. |
| // |
| // If the selectors are not constant this will translate to a function |
| // call. |
| // |
| // Asm: VSHUFPD, CPU Feature: AVX |
| func (x Int64x4) SelectFromPairGrouped(a, b uint8, y Int64x4) Int64x4 { |
| pattern := (a&2)>>1 + (b & 2) |
| |
| a, b = a&1, b&1 |
| |
| switch pattern { |
| case _LL: |
| return x.concatSelectedConstantGrouped(cscimm2g2(a, b), x) |
| case _HH: |
| return y.concatSelectedConstantGrouped(cscimm2g2(a, b), y) |
| case _LH: |
| return x.concatSelectedConstantGrouped(cscimm2g2(a, b), y) |
| case _HL: |
| return y.concatSelectedConstantGrouped(cscimm2g2(a, b), x) |
| } |
| panic("missing case, switch should be exhaustive") |
| } |
| |
| // SelectFromPairGrouped returns, for each of the four 128-bit subvectors |
| // of the vectors x and y, the selection of two elements from the two |
| // vectors x and y, where selector values in the range 0-1 specify |
| // elements from x and values in the range 2-3 specify the 0-1 elements |
| // of y. When the selectors are constants the selection can be |
| // implemented in a single instruction. |
| // |
| // If the selectors are not constant this will translate to a function |
| // call. |
| // |
| // Asm: VSHUFPD, CPU Feature: AVX512 |
| func (x Int64x8) SelectFromPairGrouped(a, b uint8, y Int64x8) Int64x8 { |
| pattern := (a&2)>>1 + (b & 2) |
| |
| a, b = a&1, b&1 |
| |
| switch pattern { |
| case _LL: |
| return x.concatSelectedConstantGrouped(cscimm2g4(a, b), x) |
| case _HH: |
| return y.concatSelectedConstantGrouped(cscimm2g4(a, b), y) |
| case _LH: |
| return x.concatSelectedConstantGrouped(cscimm2g4(a, b), y) |
| case _HL: |
| return y.concatSelectedConstantGrouped(cscimm2g4(a, b), x) |
| } |
| panic("missing case, switch should be exhaustive") |
| } |
| |
| /* PermuteScalars */ |
| |
| // PermuteScalars performs a permutation of vector x's elements using the supplied indices: |
| // |
| // result = {x[a], x[b], x[c], x[d]} |
| // |
| // Parameters a,b,c,d should have values between 0 and 3. |
| // If a through d are constants, then an instruction will be inlined, otherwise |
| // a jump table may be generated. |
| // |
| // Asm: VPSHUFD, CPU Feature: AVX |
| func (x Int32x4) PermuteScalars(a, b, c, d uint8) Int32x4 { |
| return x.permuteScalars(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6) |
| } |
| |
| // PermuteScalars performs a permutation of vector x's elements using the supplied indices: |
| // |
| // result = {x[a], x[b], x[c], x[d]} |
| // |
| // Parameters a,b,c,d should have values between 0 and 3. |
| // If a through d are constants, then an instruction will be inlined, otherwise |
| // a jump table may be generated. |
| // |
| // Asm: VPSHUFD, CPU Feature: AVX |
| func (x Uint32x4) PermuteScalars(a, b, c, d uint8) Uint32x4 { |
| return x.permuteScalars(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6) |
| } |
| |
| /* PermuteScalarsGrouped */ |
| |
| // PermuteScalarsGrouped performs a grouped permutation of vector x using the supplied indices: |
| // |
| // result = {x[a], x[b], x[c], x[d], x[a+4], x[b+4], x[c+4], x[d+4]} |
| // |
| // Parameters a,b,c,d should have values between 0 and 3. |
| // If a through d are constants, then an instruction will be inlined, otherwise |
| // a jump table may be generated. |
| // |
| // Asm: VPSHUFD, CPU Feature: AVX2 |
| func (x Int32x8) PermuteScalarsGrouped(a, b, c, d uint8) Int32x8 { |
| return x.permuteScalarsGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6) |
| } |
| |
| // PermuteScalarsGrouped performs a grouped permutation of vector x using the supplied indices: |
| // |
| // result = |
| // { x[a], x[b], x[c], x[d], x[a+4], x[b+4], x[c+4], x[d+4], |
| // x[a+8], x[b+8], x[c+8], x[d+8], x[a+12], x[b+12], x[c+12], x[d+12]} |
| // |
| // Parameters a,b,c,d should have values between 0 and 3. |
| // If a through d are constants, then an instruction will be inlined, otherwise |
| // a jump table may be generated. |
| // |
| // Asm: VPSHUFD, CPU Feature: AVX512 |
| func (x Int32x16) PermuteScalarsGrouped(a, b, c, d uint8) Int32x16 { |
| return x.permuteScalarsGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6) |
| } |
| |
| // PermuteScalarsGrouped performs a grouped permutation of vector x using the supplied indices: |
| // |
| // result = {x[a], x[b], x[c], x[d], x[a+4], x[b+4], x[c+4], x[d+4]} |
| // |
| // Parameters a,b,c,d should have values between 0 and 3. |
| // If a through d are constants, then an instruction will be inlined, otherwise |
| // a jump table is generated. |
| // |
| // Asm: VPSHUFD, CPU Feature: AVX2 |
| func (x Uint32x8) PermuteScalarsGrouped(a, b, c, d uint8) Uint32x8 { |
| return x.permuteScalarsGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6) |
| } |
| |
| // PermuteScalarsGrouped performs a grouped permutation of vector x using the supplied indices: |
| // |
| // result = |
| // { x[a], x[b], x[c], x[d], x[a+4], x[b+4], x[c+4], x[d+4], |
| // x[a+8], x[b+8], x[c+8], x[d+8], x[a+12], x[b+12], x[c+12], x[d+12]} |
| // |
| // Parameters a,b,c,d should have values between 0 and 3. |
| // If a through d are constants, then an instruction will be inlined, otherwise |
| // a jump table is generated. |
| // |
| // Asm: VPSHUFD, CPU Feature: AVX512 |
| func (x Uint32x16) PermuteScalarsGrouped(a, b, c, d uint8) Uint32x16 { |
| return x.permuteScalarsGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6) |
| } |
| |
| /* PermuteScalarsHi */ |
| |
| // PermuteScalarsHi performs a permutation of vector x using the supplied indices: |
| // |
| // result = {x[0], x[1], x[2], x[3], x[a+4], x[b+4], x[c+4], x[d+4]} |
| // |
| // Parameters a,b,c,d should have values between 0 and 3. |
| // If a through d are constants, then an instruction will be inlined, otherwise |
| // a jump table is generated. |
| // |
| // Asm: VPSHUFHW, CPU Feature: AVX512 |
| func (x Int16x8) PermuteScalarsHi(a, b, c, d uint8) Int16x8 { |
| return x.permuteScalarsHi(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6) |
| } |
| |
| // PermuteScalarsHi performs a permutation of vector x using the supplied indices: |
| // |
| // result = {x[0], x[1], x[2], x[3], x[a+4], x[b+4], x[c+4], x[d+4]} |
| // |
| // Parameters a,b,c,d should have values between 0 and 3. |
| // If a through d are constants, then an instruction will be inlined, otherwise |
| // a jump table is generated. |
| // |
| // Asm: VPSHUFHW, CPU Feature: AVX512 |
| func (x Uint16x8) PermuteScalarsHi(a, b, c, d uint8) Uint16x8 { |
| return x.permuteScalarsHi(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6) |
| } |
| |
| /* PermuteScalarsHiGrouped */ |
| |
| // PermuteScalarsHiGrouped performs a grouped permutation of vector x using the supplied indices: |
| // |
| // result = |
| // {x[0], x[1], x[2], x[3], x[a+4], x[b+4], x[c+4], x[d+4], |
| // x[8], x[9], x[10], x[11], x[a+12], x[b+12], x[c+12], x[d+12]} |
| // |
| // Parameters a,b,c,d should have values between 0 and 3. |
| // If a through d are constants, then an instruction will be inlined, otherwise |
| // a jump table is generated. |
| // |
| // Asm: VPSHUFHW, CPU Feature: AVX2 |
| func (x Int16x16) PermuteScalarsHiGrouped(a, b, c, d uint8) Int16x16 { |
| return x.permuteScalarsHiGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6) |
| } |
| |
| // PermuteScalarsHiGrouped performs a grouped permutation of vector x using the supplied indices: |
| // |
| // result = |
| // {x[0], x[1], x[2], x[3], x[a+4], x[b+4], x[c+4], x[d+4], |
| // x[8], x[9], x[10], x[11], x[a+12], x[b+12], x[c+12], x[d+12], |
| // x[16], x[17], x[18], x[19], x[a+20], x[b+20], x[c+20], x[d+20], |
| // x[24], x[25], x[26], x[27], x[a+28], x[b+28], x[c+28], x[d+28]} |
| // |
| // Parameters a,b,c,d should have values between 0 and 3. |
| // If a through d are constants, then an instruction will be inlined, otherwise |
| // a jump table is generated. |
| // |
| // Asm: VPSHUFHW, CPU Feature: AVX512 |
| func (x Int16x32) PermuteScalarsHiGrouped(a, b, c, d uint8) Int16x32 { |
| return x.permuteScalarsHiGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6) |
| } |
| |
| // PermuteScalarsHiGrouped performs a grouped permutation of vector x using the supplied indices: |
| // |
| // result = |
| // {x[0], x[1], x[2], x[3], x[a+4], x[b+4], x[c+4], x[d+4], |
| // x[8], x[9], x[10], x[11], x[a+12], x[b+12], x[c+12], x[d+12]} |
| // |
| // Each group is of size 128-bit. |
| // |
| // Parameters a,b,c,d should have values between 0 and 3. |
| // If a through d are constants, then an instruction will be inlined, otherwise |
| // a jump table is generated. |
| // |
| // Asm: VPSHUFHW, CPU Feature: AVX2 |
| func (x Uint16x16) PermuteScalarsHiGrouped(a, b, c, d uint8) Uint16x16 { |
| return x.permuteScalarsHiGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6) |
| } |
| |
| // PermuteScalarsHiGrouped performs a grouped permutation of vector x using the supplied indices: |
| // |
| // result = |
| // { x[0], x[1], x[2], x[3], x[a+4], x[b+4], x[c+4], x[d+4], |
| // x[8], x[9], x[10], x[11], x[a+12], x[b+12], x[c+12], x[d+12], |
| // x[16], x[17], x[18], x[19], x[a+20], x[b+20], x[c+20], x[d+20], |
| // x[24], x[25], x[26], x[27], x[a+28], x[b+28], x[c+28], x[d+28]} |
| // |
| // Parameters a,b,c,d should have values between 0 and 3. |
| // If a through d are constants, then an instruction will be inlined, otherwise |
| // a jump table is generated. |
| // |
| // Asm: VPSHUFHW, CPU Feature: AVX512 |
| func (x Uint16x32) PermuteScalarsHiGrouped(a, b, c, d uint8) Uint16x32 { |
| return x.permuteScalarsHiGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6) |
| } |
| |
| /* PermuteScalarsLo */ |
| |
| // PermuteScalarsLo performs a permutation of vector x using the supplied indices: |
| // |
| // result = {x[a], x[b], x[c], x[d], x[4], x[5], x[6], x[7]} |
| // |
| // Parameters a,b,c,d should have values between 0 and 3. |
| // If a through d are constants, then an instruction will be inlined, otherwise |
| // a jump table is generated. |
| // |
| // Asm: VPSHUFLW, CPU Feature: AVX512 |
| func (x Int16x8) PermuteScalarsLo(a, b, c, d uint8) Int16x8 { |
| return x.permuteScalarsLo(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6) |
| } |
| |
| // PermuteScalarsLo performs a permutation of vector x using the supplied indices: |
| // |
| // result = {x[a], x[b], x[c], x[d], x[4], x[5], x[6], x[7]} |
| // |
| // Parameters a,b,c,d should have values between 0 and 3. |
| // If a through d are constants, then an instruction will be inlined, otherwise |
| // a jump table is generated. |
| // |
| // Asm: VPSHUFLW, CPU Feature: AVX512 |
| func (x Uint16x8) PermuteScalarsLo(a, b, c, d uint8) Uint16x8 { |
| return x.permuteScalarsLo(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6) |
| } |
| |
| /* PermuteScalarsLoGrouped */ |
| |
| // PermuteScalarsLoGrouped performs a grouped permutation of vector x using the supplied indices: |
| // |
| // result = |
| // {x[a], x[b], x[c], x[d], x[4], x[5], x[6], x[7], |
| // x[a+8], x[b+8], x[c+8], x[d+8], x[12], x[13], x[14], x[15]} |
| // |
| // Parameters a,b,c,d should have values between 0 and 3. |
| // If a through d are constants, then an instruction will be inlined, otherwise |
| // a jump table is generated. |
| // |
| // Asm: VPSHUFLW, CPU Feature: AVX2 |
| func (x Int16x16) PermuteScalarsLoGrouped(a, b, c, d uint8) Int16x16 { |
| return x.permuteScalarsLoGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6) |
| } |
| |
| // PermuteScalarsLoGrouped performs a grouped permutation of vector x using the supplied indices: |
| // |
| // result = |
| // {x[a], x[b], x[c], x[d], x[4], x[5], x[6], x[7], |
| // x[a+8], x[b+8], x[c+8], x[d+8], x[12], x[13], x[14], x[15], |
| // x[a+16], x[b+16], x[c+16], x[d+16], x[20], x[21], x[22], x[23], |
| // x[a+24], x[b+24], x[c+24], x[d+24], x[28], x[29], x[30], x[31]} |
| // |
| // Parameters a,b,c,d should have values between 0 and 3. |
| // If a through d are constants, then an instruction will be inlined, otherwise |
| // a jump table is generated. |
| // |
| // Asm: VPSHUFLW, CPU Feature: AVX512 |
| func (x Int16x32) PermuteScalarsLoGrouped(a, b, c, d uint8) Int16x32 { |
| return x.permuteScalarsLoGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6) |
| } |
| |
| // PermuteScalarsLoGrouped performs a grouped permutation of vector x using the supplied indices: |
| // |
| // result = {x[a], x[b], x[c], x[d], x[4], x[5], x[6], x[7], |
| // x[a+8], x[b+8], x[c+8], x[d+8], x[12], x[13], x[14], x[15]} |
| // |
| // Parameters a,b,c,d should have values between 0 and 3. |
| // If a through d are constants, then an instruction will be inlined, otherwise |
| // a jump table is generated. |
| // |
| // Asm: VPSHUFLW, CPU Feature: AVX2 |
| func (x Uint16x16) PermuteScalarsLoGrouped(a, b, c, d uint8) Uint16x16 { |
| return x.permuteScalarsLoGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6) |
| } |
| |
| // PermuteScalarsLoGrouped performs a grouped permutation of vector x using the supplied indices: |
| // |
| // result = |
| // {x[a], x[b], x[c], x[d], x[4], x[5], x[6], x[7], |
| // x[a+8], x[b+8], x[c+8], x[d+8], x[12], x[13], x[14], x[15], |
| // x[a+16], x[b+16], x[c+16], x[d+16], x[20], x[21], x[22], x[23], |
| // x[a+24], x[b+24], x[c+24], x[d+24], x[28], x[29], x[30], x[31]} |
| // |
| // Each group is of size 128-bit. |
| // |
| // Parameters a,b,c,d should have values between 0 and 3. |
| // If a through d are constants, then an instruction will be inlined, otherwise |
| // a jump table is generated. |
| // |
| // Asm: VPSHUFLW, CPU Feature: AVX512 |
| func (x Uint16x32) PermuteScalarsLoGrouped(a, b, c, d uint8) Uint16x32 { |
| return x.permuteScalarsLoGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6) |
| } |
| |
| // CarrylessMultiply computes one of four possible carryless |
| // multiplications of selected high and low halves of x and y, |
| // depending on the values of a and b, returning the 128-bit |
| // product in the concatenated two elements of the result. |
| // a selects the low (0) or high (1) element of x and |
| // b selects the low (0) or high (1) element of y. |
| // |
| // A carryless multiplication uses bitwise XOR instead of |
| // add-with-carry, for example (in base two): |
| // 11 * 11 = 11 * (10 ^ 1) = (11 * 10) ^ (11 * 1) = 110 ^ 11 = 101 |
| // |
| // This also models multiplication of polynomials with coefficients |
| // from GF(2) -- 11 * 11 models (x+1)*(x+1) = x**2 + (1^1)x + 1 = |
| // x**2 + 0x + 1 = x**2 + 1 modeled by 101. (Note that "+" adds |
| // polynomial terms, but coefficients "add" with XOR.) |
| // |
| // constant values of a and b will result in better performance, |
| // otherwise the intrinsic may translate into a jump table. |
| // |
| // Asm: VPCLMULQDQ, CPU Feature: AVX |
| func (x Uint64x2) CarrylessMultiply(a, b uint8, y Uint64x2) Uint64x2 { |
| return x.carrylessMultiply(a&1+((b&1)<<4), y) |
| } |
| |
| // CarrylessMultiplyGrouped computes one of four possible carryless |
| // multiplications of selected high and low halves of each of the two |
| // 128-bit lanes of x and y, depending on the values of a and b, |
| // and returns the four 128-bit products in the result's lanes. |
| // a selects the low (0) or high (1) elements of x's lanes and |
| // b selects the low (0) or high (1) elements of y's lanes. |
| // |
| // A carryless multiplication uses bitwise XOR instead of |
| // add-with-carry, for example (in base two): |
| // 11 * 11 = 11 * (10 ^ 1) = (11 * 10) ^ (11 * 1) = 110 ^ 11 = 101 |
| // |
| // This also models multiplication of polynomials with coefficients |
| // from GF(2) -- 11 * 11 models (x+1)*(x+1) = x**2 + (1^1)x + 1 = |
| // x**2 + 0x + 1 = x**2 + 1 modeled by 101. (Note that "+" adds |
| // polynomial terms, but coefficients "add" with XOR.) |
| // |
| // constant values of a and b will result in better performance, |
| // otherwise the intrinsic may translate into a jump table. |
| // |
| // Asm: VPCLMULQDQ, CPU Feature: AVX512VPCLMULQDQ |
| func (x Uint64x4) CarrylessMultiplyGrouped(a, b uint8, y Uint64x4) Uint64x4 { |
| return x.carrylessMultiply(a&1+((b&1)<<4), y) |
| } |
| |
| // CarrylessMultiplyGrouped computes one of four possible carryless |
| // multiplications of selected high and low halves of each of the four |
| // 128-bit lanes of x and y, depending on the values of a and b, |
| // and returns the four 128-bit products in the result's lanes. |
| // a selects the low (0) or high (1) elements of x's lanes and |
| // b selects the low (0) or high (1) elements of y's lanes. |
| // |
| // A carryless multiplication uses bitwise XOR instead of |
| // add-with-carry, for example (in base two): |
| // 11 * 11 = 11 * (10 ^ 1) = (11 * 10) ^ (11 * 1) = 110 ^ 11 = 101 |
| // |
| // This also models multiplication of polynomials with coefficients |
| // from GF(2) -- 11 * 11 models (x+1)*(x+1) = x**2 + (1^1)x + 1 = |
| // x**2 + 0x + 1 = x**2 + 1 modeled by 101. (Note that "+" adds |
| // polynomial terms, but coefficients "add" with XOR.) |
| // |
| // constant values of a and b will result in better performance, |
| // otherwise the intrinsic may translate into a jump table. |
| // |
| // Asm: VPCLMULQDQ, CPU Feature: AVX512VPCLMULQDQ |
| func (x Uint64x8) CarrylessMultiplyGrouped(a, b uint8, y Uint64x8) Uint64x8 { |
| return x.carrylessMultiply(a&1+((b&1)<<4), y) |
| } |