| // Code generated by x/arch/internal/simdgen using 'go run . -xedPath $XED_PATH -o godefs -goroot $GOROOT go.yaml types.yaml categories.yaml'; DO NOT EDIT. |
| |
| //go:build goexperiment.simd |
| |
| package archsimd |
| |
| /* blend */ |
| |
| // blend blends two vectors based on mask values: each element of the result |
| // is taken from x (the first vector) where the corresponding element of mask |
| // (the third vector) is false, and from y (the second vector) where it is true. |
| // |
| // Asm: VPBLENDVB, CPU Feature: AVX |
| func (x Int8x16) blend(y Int8x16, mask Int8x16) Int8x16 |
| |
| // blend blends two vectors based on mask values: each element of the result |
| // is taken from x (the first vector) where the corresponding element of mask |
| // (the third vector) is false, and from y (the second vector) where it is true. |
| // |
| // Asm: VPBLENDVB, CPU Feature: AVX2 |
| func (x Int8x32) blend(y Int8x32, mask Int8x32) Int8x32 |
| |
| /* blendMasked */ |
| |
| // blendMasked blends two vectors based on mask values: each element of the |
| // result is taken from x (the first vector) where the corresponding element of |
| // mask (the third operand) is false, and from y (the second vector) where it is true. |
| // |
| // This operation is applied selectively under a write mask. |
| // |
| // Asm: VPBLENDMB, CPU Feature: AVX512 |
| func (x Int8x64) blendMasked(y Int8x64, mask Mask8x64) Int8x64 |
| |
| // blendMasked blends two vectors based on mask values: each element of the |
| // result is taken from x (the first vector) where the corresponding element of |
| // mask (the third operand) is false, and from y (the second vector) where it is true. |
| // |
| // This operation is applied selectively under a write mask. |
| // |
| // Asm: VPBLENDMW, CPU Feature: AVX512 |
| func (x Int16x32) blendMasked(y Int16x32, mask Mask16x32) Int16x32 |
| |
| // blendMasked blends two vectors based on mask values: each element of the |
| // result is taken from x (the first vector) where the corresponding element of |
| // mask (the third operand) is false, and from y (the second vector) where it is true. |
| // |
| // This operation is applied selectively under a write mask. |
| // |
| // Asm: VPBLENDMD, CPU Feature: AVX512 |
| func (x Int32x16) blendMasked(y Int32x16, mask Mask32x16) Int32x16 |
| |
| // blendMasked blends two vectors based on mask values: each element of the |
| // result is taken from x (the first vector) where the corresponding element of |
| // mask (the third operand) is false, and from y (the second vector) where it is true. |
| // |
| // This operation is applied selectively under a write mask. |
| // |
| // Asm: VPBLENDMQ, CPU Feature: AVX512 |
| func (x Int64x8) blendMasked(y Int64x8, mask Mask64x8) Int64x8 |
| |
| /* carrylessMultiply */ |
| |
| // carrylessMultiply computes one of four possible Galois polynomial |
| // products of selected high and low halves of x and y, |
| // depending on the value of xyHiLo, returning the 128-bit |
| // product in the concatenated two elements of the result. |
| // Bit 0 of xyHiLo selects the low (0) or high (1) element of x and |
| // bit 4 of xyHiLo selects the low (0x00) or high (0x10) element of y. |
| // |
| // xyHiLo results in better performance when it's a constant; a non-constant value will be translated into a jump table. |
| // |
| // Asm: VPCLMULQDQ, CPU Feature: AVX |
| func (x Uint64x2) carrylessMultiply(xyHiLo uint8, y Uint64x2) Uint64x2 |
| |
| // carrylessMultiply computes one of four possible Galois polynomial |
| // products of selected high and low halves of each of the two |
| // 128-bit lanes of x and y, depending on the value of xyHiLo, |
| // and returns the two 128-bit products in the result's lanes. |
| // Bit 0 of xyHiLo selects the low (0) or high (1) elements of x's lanes and |
| // bit 4 of xyHiLo selects the low (0x00) or high (0x10) elements of y's lanes. |
| // |
| // xyHiLo results in better performance when it's a constant; a non-constant value will be translated into a jump table. |
| // |
| // Asm: VPCLMULQDQ, CPU Feature: AVX512VPCLMULQDQ |
| func (x Uint64x4) carrylessMultiply(xyHiLo uint8, y Uint64x4) Uint64x4 |
| |
| // carrylessMultiply computes one of four possible Galois polynomial |
| // products of selected high and low halves of each of the four |
| // 128-bit lanes of x and y, depending on the value of xyHiLo, |
| // and returns the four 128-bit products in the result's lanes. |
| // Bit 0 of xyHiLo selects the low (0) or high (1) elements of x's lanes and |
| // bit 4 of xyHiLo selects the low (0x00) or high (0x10) elements of y's lanes. |
| // |
| // xyHiLo results in better performance when it's a constant; a non-constant value will be translated into a jump table. |
| // |
| // Asm: VPCLMULQDQ, CPU Feature: AVX512VPCLMULQDQ |
| func (x Uint64x8) carrylessMultiply(xyHiLo uint8, y Uint64x8) Uint64x8 |
| |
| /* concatSelectedConstant */ |
| |
| // concatSelectedConstant concatenates selected elements from x and y into the lower and upper |
| // halves of the output. The selection is chosen by the constant parameter h1h0l1l0 |
| // where each {h,l}{1,0} is two bits specifying which element from y or x to select. |
| // For example, {0,1,2,3}.concatSelectedConstant(0b_11_01_00_10, {4,5,6,7}) returns |
| // {2, 0, 5, 7} (don't forget that the binary constant is written big-endian). |
| // |
| // h1h0l1l0 results in better performance when it's a constant; a non-constant value will be translated into a jump table. |
| // |
| // Asm: VSHUFPS, CPU Feature: AVX |
| func (x Float32x4) concatSelectedConstant(h1h0l1l0 uint8, y Float32x4) Float32x4 |
| |
| // concatSelectedConstant concatenates selected elements from x and y into the lower and upper |
| // halves of the output. The selection is chosen by the constant parameter hilo |
| // where hi and lo are each one bit specifying which 64-bit element to select |
| // from y and x. For example, {4,5}.concatSelectedConstant(0b10, {6,7}) |
| // returns {4,7}: bit 0, selecting from x, is zero and selects 4, and bit 1, |
| // selecting from y, is one and selects 7. |
| // |
| // hilo results in better performance when it's a constant; a non-constant value will be translated into a jump table. |
| // |
| // Asm: VSHUFPD, CPU Feature: AVX |
| func (x Float64x2) concatSelectedConstant(hilo uint8, y Float64x2) Float64x2 |
| |
| // concatSelectedConstant concatenates selected elements from x and y into the lower and upper |
| // halves of the output. The selection is chosen by the constant parameter h1h0l1l0 |
| // where each {h,l}{1,0} is two bits specifying which element from y or x to select. |
| // For example, {0,1,2,3}.concatSelectedConstant(0b_11_01_00_10, {4,5,6,7}) returns |
| // {2, 0, 5, 7} (don't forget that the binary constant is written big-endian). |
| // |
| // h1h0l1l0 results in better performance when it's a constant; a non-constant value will be translated into a jump table. |
| // |
| // Asm: VSHUFPS, CPU Feature: AVX |
| func (x Int32x4) concatSelectedConstant(h1h0l1l0 uint8, y Int32x4) Int32x4 |
| |
| // concatSelectedConstant concatenates selected elements from x and y into the lower and upper |
| // halves of the output. The selection is chosen by the constant parameter hilo |
| // where hi and lo are each one bit specifying which 64-bit element to select |
| // from y and x. For example, {4,5}.concatSelectedConstant(0b10, {6,7}) |
| // returns {4,7}: bit 0, selecting from x, is zero and selects 4, and bit 1, |
| // selecting from y, is one and selects 7. |
| // |
| // hilo results in better performance when it's a constant; a non-constant value will be translated into a jump table. |
| // |
| // Asm: VSHUFPD, CPU Feature: AVX |
| func (x Int64x2) concatSelectedConstant(hilo uint8, y Int64x2) Int64x2 |
| |
| // concatSelectedConstant concatenates selected elements from x and y into the lower and upper |
| // halves of the output. The selection is chosen by the constant parameter h1h0l1l0 |
| // where each {h,l}{1,0} is two bits specifying which element from y or x to select. |
| // For example, {0,1,2,3}.concatSelectedConstant(0b_11_01_00_10, {4,5,6,7}) returns |
| // {2, 0, 5, 7} (don't forget that the binary constant is written big-endian). |
| // |
| // h1h0l1l0 results in better performance when it's a constant; a non-constant value will be translated into a jump table. |
| // |
| // Asm: VSHUFPS, CPU Feature: AVX |
| func (x Uint32x4) concatSelectedConstant(h1h0l1l0 uint8, y Uint32x4) Uint32x4 |
| |
| // concatSelectedConstant concatenates selected elements from x and y into the lower and upper |
| // halves of the output. The selection is chosen by the constant parameter hilo |
| // where hi and lo are each one bit specifying which 64-bit element to select |
| // from y and x. For example, {4,5}.concatSelectedConstant(0b10, {6,7}) |
| // returns {4,7}: bit 0, selecting from x, is zero and selects 4, and bit 1, |
| // selecting from y, is one and selects 7. |
| // |
| // hilo results in better performance when it's a constant; a non-constant value will be translated into a jump table. |
| // |
| // Asm: VSHUFPD, CPU Feature: AVX |
| func (x Uint64x2) concatSelectedConstant(hilo uint8, y Uint64x2) Uint64x2 |
| |
| /* concatSelectedConstantGrouped */ |
| |
| // concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y |
| // into the lower and upper halves of corresponding subvectors of the output. |
| // The selection is chosen by the constant parameter h1h0l1l0 |
| // where each {h,l}{1,0} is two bits specifying which element from y or x to select. |
| // The same selection is applied to every 128-bit subvector. |
| // For example, |
| // {0,1,2,3,8,9,10,11}.concatSelectedConstantGrouped(0b_11_01_00_10, {4,5,6,7,12,13,14,15}) |
| // returns {2,0,5,7,10,8,13,15} |
| // (don't forget that the binary constant is written big-endian). |
| // |
| // h1h0l1l0 results in better performance when it's a constant; a non-constant value will be translated into a jump table. |
| // |
| // Asm: VSHUFPS, CPU Feature: AVX |
| func (x Float32x8) concatSelectedConstantGrouped(h1h0l1l0 uint8, y Float32x8) Float32x8 |
| |
| // concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y |
| // into the lower and upper halves of corresponding subvectors of the output. |
| // The selection is chosen by the constant parameter h1h0l1l0 |
| // where each {h,l}{1,0} is two bits specifying which element from y or x to select. |
| // The same selection is applied to every 128-bit subvector. |
| // For example, |
| // |
| // {0,1,2,3,8,9,10,11, 20,21,22,23,28,29,210,211}.concatSelectedConstantGrouped( |
| // 0b_11_01_00_10, {4,5,6,7,12,13,14,15, 24,25,26,27,212,213,214,215}) |
| // |
| // returns {2,0,5,7,10,8,13,15, 22,20,25,27,210,28,213,215} |
| // |
| // (don't forget that the binary constant is written big-endian). |
| // |
| // h1h0l1l0 results in better performance when it's a constant; a non-constant value will be translated into a jump table. |
| // |
| // Asm: VSHUFPS, CPU Feature: AVX512 |
| func (x Float32x16) concatSelectedConstantGrouped(h1h0l1l0 uint8, y Float32x16) Float32x16 |
| |
| // concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y |
| // into the lower and upper halves of corresponding subvectors of the output. |
| // The selections are specified by the constant parameter hilos where each |
| // hi and lo pair selects 64-bit elements from the corresponding 128-bit |
| // subvectors of x and y. |
| // |
| // For example, {4,5,8,9}.concatSelectedConstantGrouped(0b_11_10, {6,7,10,11}) |
| // returns {4,7,9,11}: bit 0 is zero, selecting element 0 from x's lower |
| // 128 bits (4); bit 1 is one, selecting element 1 from y's lower 128 bits (7); |
| // bit 2 is one, selecting element 1 from x's upper 128 bits (9); and bit 3 is one, |
| // selecting element 1 from y's upper 128 bits (11). |
| // This differs from the same method applied to a 32x8 vector, where |
| // the 8-bit constant performs the same selection on both subvectors. |
| // |
| // hilos results in better performance when it's a constant; a non-constant value will be translated into a jump table. |
| // |
| // Asm: VSHUFPD, CPU Feature: AVX |
| func (x Float64x4) concatSelectedConstantGrouped(hilos uint8, y Float64x4) Float64x4 |
| |
| // concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y |
| // into the lower and upper halves of corresponding subvectors of the output. |
| // The selections are specified by the constant parameter hilos where each |
| // hi and lo pair selects 64-bit elements from the corresponding 128-bit |
| // subvectors of x and y. |
| // |
| // For example, {4,5,8,9,12,13,16,17}.concatSelectedConstantGrouped(0b11_00_11_10, {6,7,10,11,14,15,18,19}) |
| // returns {4,7,9,11,12,14,17,19}: bit 0 is zero, selecting element 0 from x's |
| // first 128 bits (4); bit 1 is one, selecting element 1 from y's first 128 bits (7); |
| // bit 2 is one, selecting element 1 from x's second 128 bits (9); bit 3 is one, |
| // selecting element 1 from y's second 128 bits (11). The next two zero bits select |
| // the lower elements from x's and y's third 128-bit groups (12, 14), and the last two |
| // one bits select the upper elements from x's and y's last 128 bits (17, 19). |
| // This differs from the same method applied to a 32x8 or 32x16 vector, where |
| // the 8-bit constant performs the same selection on all the subvectors. |
| // |
| // hilos results in better performance when it's a constant; a non-constant value will be translated into a jump table. |
| // |
| // Asm: VSHUFPD, CPU Feature: AVX512 |
| func (x Float64x8) concatSelectedConstantGrouped(hilos uint8, y Float64x8) Float64x8 |
| |
| // concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y |
| // into the lower and upper halves of corresponding subvectors of the output. |
| // The selection is chosen by the constant parameter h1h0l1l0 |
| // where each {h,l}{1,0} is two bits specifying which element from y or x to select. |
| // The same selection is applied to every 128-bit subvector. |
| // For example, |
| // {0,1,2,3,8,9,10,11}.concatSelectedConstantGrouped(0b_11_01_00_10, {4,5,6,7,12,13,14,15}) |
| // returns {2,0,5,7,10,8,13,15} |
| // (don't forget that the binary constant is written big-endian). |
| // |
| // h1h0l1l0 results in better performance when it's a constant; a non-constant value will be translated into a jump table. |
| // |
| // Asm: VSHUFPS, CPU Feature: AVX |
| func (x Int32x8) concatSelectedConstantGrouped(h1h0l1l0 uint8, y Int32x8) Int32x8 |
| |
| // concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y |
| // into the lower and upper halves of corresponding subvectors of the output. |
| // The selection is chosen by the constant parameter h1h0l1l0 |
| // where each {h,l}{1,0} is two bits specifying which element from y or x to select. |
| // The same selection is applied to every 128-bit subvector. |
| // For example, |
| // |
| // {0,1,2,3,8,9,10,11, 20,21,22,23,28,29,210,211}.concatSelectedConstantGrouped( |
| // 0b_11_01_00_10, {4,5,6,7,12,13,14,15, 24,25,26,27,212,213,214,215}) |
| // |
| // returns {2,0,5,7,10,8,13,15, 22,20,25,27,210,28,213,215} |
| // |
| // (don't forget that the binary constant is written big-endian). |
| // |
| // h1h0l1l0 results in better performance when it's a constant; a non-constant value will be translated into a jump table. |
| // |
| // Asm: VSHUFPS, CPU Feature: AVX512 |
| func (x Int32x16) concatSelectedConstantGrouped(h1h0l1l0 uint8, y Int32x16) Int32x16 |
| |
| // concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y |
| // into the lower and upper halves of corresponding subvectors of the output. |
| // The selections are specified by the constant parameter hilos where each |
| // hi and lo pair selects 64-bit elements from the corresponding 128-bit |
| // subvectors of x and y. |
| // |
| // For example, {4,5,8,9}.concatSelectedConstantGrouped(0b_11_10, {6,7,10,11}) |
| // returns {4,7,9,11}: bit 0 is zero, selecting element 0 from x's lower |
| // 128 bits (4); bit 1 is one, selecting element 1 from y's lower 128 bits (7); |
| // bit 2 is one, selecting element 1 from x's upper 128 bits (9); and bit 3 is one, |
| // selecting element 1 from y's upper 128 bits (11). |
| // This differs from the same method applied to a 32x8 vector, where |
| // the 8-bit constant performs the same selection on both subvectors. |
| // |
| // hilos results in better performance when it's a constant; a non-constant value will be translated into a jump table. |
| // |
| // Asm: VSHUFPD, CPU Feature: AVX |
| func (x Int64x4) concatSelectedConstantGrouped(hilos uint8, y Int64x4) Int64x4 |
| |
| // concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y |
| // into the lower and upper halves of corresponding subvectors of the output. |
| // The selections are specified by the constant parameter hilos where each |
| // hi and lo pair selects 64-bit elements from the corresponding 128-bit |
| // subvectors of x and y. |
| // |
| // For example, {4,5,8,9,12,13,16,17}.concatSelectedConstantGrouped(0b11_00_11_10, {6,7,10,11,14,15,18,19}) |
| // returns {4,7,9,11,12,14,17,19}: bit 0 is zero, selecting element 0 from x's |
| // first 128 bits (4); bit 1 is one, selecting element 1 from y's first 128 bits (7); |
| // bit 2 is one, selecting element 1 from x's second 128 bits (9); bit 3 is one, |
| // selecting element 1 from y's second 128 bits (11). The next two zero bits select |
| // the lower elements from x's and y's third 128-bit groups (12, 14), and the last two |
| // one bits select the upper elements from x's and y's last 128 bits (17, 19). |
| // This differs from the same method applied to a 32x8 or 32x16 vector, where |
| // the 8-bit constant performs the same selection on all the subvectors. |
| // |
| // hilos results in better performance when it's a constant; a non-constant value will be translated into a jump table. |
| // |
| // Asm: VSHUFPD, CPU Feature: AVX512 |
| func (x Int64x8) concatSelectedConstantGrouped(hilos uint8, y Int64x8) Int64x8 |
| |
| // concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y |
| // into the lower and upper halves of corresponding subvectors of the output. |
| // The selection is chosen by the constant parameter h1h0l1l0 |
| // where each {h,l}{1,0} is two bits specifying which element from y or x to select. |
| // The same selection is applied to every 128-bit subvector. |
| // For example, |
| // {0,1,2,3,8,9,10,11}.concatSelectedConstantGrouped(0b_11_01_00_10, {4,5,6,7,12,13,14,15}) |
| // returns {2,0,5,7,10,8,13,15} |
| // (don't forget that the binary constant is written big-endian). |
| // |
| // h1h0l1l0 results in better performance when it's a constant; a non-constant value will be translated into a jump table. |
| // |
| // Asm: VSHUFPS, CPU Feature: AVX |
| func (x Uint32x8) concatSelectedConstantGrouped(h1h0l1l0 uint8, y Uint32x8) Uint32x8 |
| |
| // concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y |
| // into the lower and upper halves of corresponding subvectors of the output. |
| // The selection is chosen by the constant parameter h1h0l1l0 |
| // where each {h,l}{1,0} is two bits specifying which element from y or x to select. |
| // The same selection is applied to every 128-bit subvector. |
| // For example, |
| // |
| // {0,1,2,3,8,9,10,11, 20,21,22,23,28,29,210,211}.concatSelectedConstantGrouped( |
| // 0b_11_01_00_10, {4,5,6,7,12,13,14,15, 24,25,26,27,212,213,214,215}) |
| // |
| // returns {2,0,5,7,10,8,13,15, 22,20,25,27,210,28,213,215} |
| // |
| // (don't forget that the binary constant is written big-endian). |
| // |
| // h1h0l1l0 results in better performance when it's a constant; a non-constant value will be translated into a jump table. |
| // |
| // Asm: VSHUFPS, CPU Feature: AVX512 |
| func (x Uint32x16) concatSelectedConstantGrouped(h1h0l1l0 uint8, y Uint32x16) Uint32x16 |
| |
| // concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y |
| // into the lower and upper halves of corresponding subvectors of the output. |
| // The selections are specified by the constant parameter hilos where each |
| // hi and lo pair selects 64-bit elements from the corresponding 128-bit |
| // subvectors of x and y. |
| // |
| // For example, {4,5,8,9}.concatSelectedConstantGrouped(0b_11_10, {6,7,10,11}) |
| // returns {4,7,9,11}: bit 0 is zero, selecting element 0 from x's lower |
| // 128 bits (4); bit 1 is one, selecting element 1 from y's lower 128 bits (7); |
| // bit 2 is one, selecting element 1 from x's upper 128 bits (9); and bit 3 is one, |
| // selecting element 1 from y's upper 128 bits (11). |
| // This differs from the same method applied to a 32x8 vector, where |
| // the 8-bit constant performs the same selection on both subvectors. |
| // |
| // hilos results in better performance when it's a constant; a non-constant value will be translated into a jump table. |
| // |
| // Asm: VSHUFPD, CPU Feature: AVX |
| func (x Uint64x4) concatSelectedConstantGrouped(hilos uint8, y Uint64x4) Uint64x4 |
| |
| // concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y |
| // into the lower and upper halves of corresponding subvectors of the output. |
| // The selections are specified by the constant parameter hilos where each |
| // hi and lo pair selects 64-bit elements from the corresponding 128-bit |
| // subvectors of x and y. |
| // |
| // For example, {4,5,8,9,12,13,16,17}.concatSelectedConstantGrouped(0b11_00_11_10, {6,7,10,11,14,15,18,19}) |
| // returns {4,7,9,11,12,14,17,19}: bit 0 is zero, selecting element 0 from x's |
| // first 128 bits (4); bit 1 is one, selecting element 1 from y's first 128 bits (7); |
| // bit 2 is one, selecting element 1 from x's second 128 bits (9); bit 3 is one, |
| // selecting element 1 from y's second 128 bits (11). The next two zero bits select |
| // the lower elements from x's and y's third 128-bit groups (12, 14), and the last two |
| // one bits select the upper elements from x's and y's last 128 bits (17, 19). |
| // This differs from the same method applied to a 32x8 or 32x16 vector, where |
| // the 8-bit constant performs the same selection on all the subvectors. |
| // |
| // hilos results in better performance when it's a constant; a non-constant value will be translated into a jump table. |
| // |
| // Asm: VSHUFPD, CPU Feature: AVX512 |
| func (x Uint64x8) concatSelectedConstantGrouped(hilos uint8, y Uint64x8) Uint64x8 |
| |
| /* permuteScalars */ |
| |
| // permuteScalars performs a permutation of vector x using constant indices: |
| // result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]} |
| // The indices parameter holds four 2-bit values packed into a byte; indices[0:2] is the first index. |
| // |
| // indices results in better performance when it's a constant; a non-constant value will be translated into a jump table. |
| // |
| // Asm: VPSHUFD, CPU Feature: AVX |
| func (x Int32x4) permuteScalars(indices uint8) Int32x4 |
| |
| // permuteScalars performs a permutation of vector x using constant indices: |
| // result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]} |
| // The indices parameter holds four 2-bit values packed into a byte; indices[0:2] is the first index. |
| // |
| // indices results in better performance when it's a constant; a non-constant value will be translated into a jump table. |
| // |
| // Asm: VPSHUFD, CPU Feature: AVX |
| func (x Uint32x4) permuteScalars(indices uint8) Uint32x4 |
| |
| /* permuteScalarsGrouped */ |
| |
| // permuteScalarsGrouped performs a grouped permutation of vector x using constant indices: |
| // result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...} |
| // The indices parameter holds four 2-bit values packed into a byte; indices[0:2] is the first index. |
| // Each group is 128 bits wide, and the same indices are applied to every group. |
| // |
| // indices results in better performance when it's a constant; a non-constant value will be translated into a jump table. |
| // |
| // Asm: VPSHUFD, CPU Feature: AVX2 |
| func (x Int32x8) permuteScalarsGrouped(indices uint8) Int32x8 |
| |
| // permuteScalarsGrouped performs a grouped permutation of vector x using constant indices: |
| // result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...} |
| // The indices parameter holds four 2-bit values packed into a byte; indices[0:2] is the first index. |
| // Each group is 128 bits wide, and the same indices are applied to every group. |
| // |
| // indices results in better performance when it's a constant; a non-constant value will be translated into a jump table. |
| // |
| // Asm: VPSHUFD, CPU Feature: AVX512 |
| func (x Int32x16) permuteScalarsGrouped(indices uint8) Int32x16 |
| |
| // permuteScalarsGrouped performs a grouped permutation of vector x using constant indices: |
| // result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...} |
| // The indices parameter holds four 2-bit values packed into a byte; indices[0:2] is the first index. |
| // Each group is 128 bits wide, and the same indices are applied to every group. |
| // |
| // indices results in better performance when it's a constant; a non-constant value will be translated into a jump table. |
| // |
| // Asm: VPSHUFD, CPU Feature: AVX2 |
| func (x Uint32x8) permuteScalarsGrouped(indices uint8) Uint32x8 |
| |
| // permuteScalarsGrouped performs a grouped permutation of vector x using constant indices: |
| // result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...} |
| // The indices parameter holds four 2-bit values packed into a byte; indices[0:2] is the first index. |
| // Each group is 128 bits wide, and the same indices are applied to every group. |
| // |
| // indices results in better performance when it's a constant; a non-constant value will be translated into a jump table. |
| // |
| // Asm: VPSHUFD, CPU Feature: AVX512 |
| func (x Uint32x16) permuteScalarsGrouped(indices uint8) Uint32x16 |
| |
| /* permuteScalarsHi */ |
| |
| // permuteScalarsHi performs a permutation of the upper four elements of vector x |
| // using constant indices, leaving the lower four elements unchanged: |
| // result = {x[0], x[1], x[2], x[3], x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]} |
| // The indices parameter holds four 2-bit values packed into a byte; indices[0:2] is the first index. |
| // |
| // indices results in better performance when it's a constant; a non-constant value will be translated into a jump table. |
| // |
| // Asm: VPSHUFHW, CPU Feature: AVX512 |
| func (x Int16x8) permuteScalarsHi(indices uint8) Int16x8 |
| |
| // permuteScalarsHi performs a permutation of the upper four elements of vector x |
| // using constant indices, leaving the lower four elements unchanged: |
| // result = {x[0], x[1], x[2], x[3], x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]} |
| // The indices parameter holds four 2-bit values packed into a byte; indices[0:2] is the first index. |
| // |
| // indices results in better performance when it's a constant; a non-constant value will be translated into a jump table. |
| // |
| // Asm: VPSHUFHW, CPU Feature: AVX512 |
| func (x Uint16x8) permuteScalarsHi(indices uint8) Uint16x8 |
| |
| /* permuteScalarsHiGrouped */ |
| |
| // permuteScalarsHiGrouped performs a grouped permutation of vector x using constant indices, |
| // permuting the upper four elements of each group and leaving the lower four unchanged: |
| // result = |
| // |
| // {x_group0[0], x_group0[1], x_group0[2], x_group0[3], x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4], |
| // x_group1[0], x_group1[1], x_group1[2], x_group1[3], x_group1[indices[0:2]+4], ...} |
| // |
| // The indices parameter holds four 2-bit values packed into a byte; indices[0:2] is the first index. |
| // Each group is 128 bits wide. |
| // |
| // indices results in better performance when it's a constant; a non-constant value will be translated into a jump table. |
| // |
| // Asm: VPSHUFHW, CPU Feature: AVX2 |
| func (x Int16x16) permuteScalarsHiGrouped(indices uint8) Int16x16 |
| |
| // permuteScalarsHiGrouped performs a grouped permutation of vector x using constant indices, |
| // permuting the upper four elements of each group and leaving the lower four unchanged: |
| // result = |
| // |
| // {x_group0[0], x_group0[1], x_group0[2], x_group0[3], x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4], |
| // x_group1[0], x_group1[1], x_group1[2], x_group1[3], x_group1[indices[0:2]+4], ...} |
| // |
| // The indices parameter holds four 2-bit values packed into a byte; indices[0:2] is the first index. |
| // Each group is 128 bits wide. |
| // |
| // indices results in better performance when it's a constant; a non-constant value will be translated into a jump table. |
| // |
| // Asm: VPSHUFHW, CPU Feature: AVX512 |
| func (x Int16x32) permuteScalarsHiGrouped(indices uint8) Int16x32 |
| |
| // permuteScalarsHiGrouped performs a grouped permutation of vector x using constant indices, |
| // permuting the upper four elements of each group and leaving the lower four unchanged: |
| // result = |
| // |
| // {x_group0[0], x_group0[1], x_group0[2], x_group0[3], x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4], |
| // x_group1[0], x_group1[1], x_group1[2], x_group1[3], x_group1[indices[0:2]+4], ...} |
| // |
| // The indices parameter holds four 2-bit values packed into a byte; indices[0:2] is the first index. |
| // Each group is 128 bits wide. |
| // |
| // indices results in better performance when it's a constant; a non-constant value will be translated into a jump table. |
| // |
| // Asm: VPSHUFHW, CPU Feature: AVX2 |
| func (x Uint16x16) permuteScalarsHiGrouped(indices uint8) Uint16x16 |
| |
| // permuteScalarsHiGrouped performs a grouped permutation of vector x using constant indices, |
| // permuting the upper four elements of each group and leaving the lower four unchanged: |
| // result = |
| // |
| // {x_group0[0], x_group0[1], x_group0[2], x_group0[3], x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4], |
| // x_group1[0], x_group1[1], x_group1[2], x_group1[3], x_group1[indices[0:2]+4], ...} |
| // |
| // The indices parameter holds four 2-bit values packed into a byte; indices[0:2] is the first index. |
| // Each group is 128 bits wide. |
| // |
| // indices results in better performance when it's a constant; a non-constant value will be translated into a jump table. |
| // |
| // Asm: VPSHUFHW, CPU Feature: AVX512 |
| func (x Uint16x32) permuteScalarsHiGrouped(indices uint8) Uint16x32 |
| |
| /* permuteScalarsLo */ |
| |
| // permuteScalarsLo performs a permutation of the lower four elements of vector x |
| // using constant indices, leaving the upper four elements unchanged: |
| // result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]], x[4], x[5], x[6], x[7]} |
| // The indices parameter holds four 2-bit values packed into a byte; indices[0:2] is the first index. |
| // |
| // indices results in better performance when it's a constant; a non-constant value will be translated into a jump table. |
| // |
| // Asm: VPSHUFLW, CPU Feature: AVX512 |
| func (x Int16x8) permuteScalarsLo(indices uint8) Int16x8 |
| |
| // permuteScalarsLo performs a permutation of the lower four elements of vector x |
| // using constant indices, leaving the upper four elements unchanged: |
| // result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]], x[4], x[5], x[6], x[7]} |
| // The indices parameter holds four 2-bit values packed into a byte; indices[0:2] is the first index. |
| // |
| // indices results in better performance when it's a constant; a non-constant value will be translated into a jump table. |
| // |
| // Asm: VPSHUFLW, CPU Feature: AVX512 |
| func (x Uint16x8) permuteScalarsLo(indices uint8) Uint16x8 |
| |
| /* permuteScalarsLoGrouped */ |
| |
| // permuteScalarsLoGrouped performs a grouped permutation of vector x using constant indices, |
| // permuting the lower four elements of each group and leaving the upper four unchanged: |
| // |
| // result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group0[4], x_group0[5], x_group0[6], x_group0[7], |
| // x_group1[indices[0:2]], ...} |
| // |
| // The indices parameter holds four 2-bit values packed into a byte; indices[0:2] is the first index. |
| // Each group is 128 bits wide. |
| // |
| // indices results in better performance when it's a constant; a non-constant value will be translated into a jump table. |
| // |
| // Asm: VPSHUFLW, CPU Feature: AVX2 |
| func (x Int16x16) permuteScalarsLoGrouped(indices uint8) Int16x16 |
| |
| // permuteScalarsLoGrouped performs a grouped permutation of vector x using constant indices, |
| // permuting the lower four elements of each group and leaving the upper four unchanged: |
| // |
| // result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group0[4], x_group0[5], x_group0[6], x_group0[7], |
| // x_group1[indices[0:2]], ...} |
| // |
| // The indices parameter holds four 2-bit values packed into a byte; indices[0:2] is the first index. |
| // Each group is 128 bits wide. |
| // |
| // indices results in better performance when it's a constant; a non-constant value will be translated into a jump table. |
| // |
| // Asm: VPSHUFLW, CPU Feature: AVX512 |
| func (x Int16x32) permuteScalarsLoGrouped(indices uint8) Int16x32 |
| |
| // permuteScalarsLoGrouped performs a grouped permutation of vector x using constant indices, |
| // permuting the lower four elements of each group and leaving the upper four unchanged: |
| // |
| // result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group0[4], x_group0[5], x_group0[6], x_group0[7], |
| // x_group1[indices[0:2]], ...} |
| // |
| // The indices parameter holds four 2-bit values packed into a byte; indices[0:2] is the first index. |
| // Each group is 128 bits wide. |
| // |
| // indices results in better performance when it's a constant; a non-constant value will be translated into a jump table. |
| // |
| // Asm: VPSHUFLW, CPU Feature: AVX2 |
| func (x Uint16x16) permuteScalarsLoGrouped(indices uint8) Uint16x16 |
| |
| // permuteScalarsLoGrouped permutes, within each 128-bit group of x, the four low |
| // elements as selected by indices and passes the group's four high elements |
| // through unchanged: |
| // |
| //     result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], |
| //               x_group0[4], x_group0[5], x_group0[6], x_group0[7], |
| //               x_group1[indices[0:2]], ...} |
| // |
| // The same indices value is applied to every group. |
| // indices packs four 2-bit element selectors into one byte; indices[0:2] denotes |
| // its two lowest bits, which select the first result element of each group. |
| // |
| // For best performance indices should be a constant; a non-constant value is |
| // translated into a jump table. |
| // |
| // Asm: VPSHUFLW, CPU Feature: AVX512 |
| func (x Uint16x32) permuteScalarsLoGrouped(indices uint8) Uint16x32 |
| |
| /* tern */ |
| |
| // tern computes a bitwise three-input logic function of x, y and z that is |
| // selected by the 8-bit truth table. |
| // For every bit position, with x, y and z standing for the corresponding bits of |
| // the three operands, the result bit is |
| // |
| //     1 & (table >> (x<<2 + y<<1 + z)) |
| // |
| // that is, bit number x<<2 + y<<1 + z of table. For example, table 0x80 yields |
| // x & y & z and table 0xfe yields x | y | z. |
| // |
| // For best performance table should be a constant; a non-constant value is |
| // translated into a jump table. |
| // |
| // Asm: VPTERNLOGD, CPU Feature: AVX512 |
| func (x Int32x4) tern(table uint8, y Int32x4, z Int32x4) Int32x4 |
| |
| // tern computes a bitwise three-input logic function of x, y and z that is |
| // selected by the 8-bit truth table. |
| // For every bit position, with x, y and z standing for the corresponding bits of |
| // the three operands, the result bit is |
| // |
| //     1 & (table >> (x<<2 + y<<1 + z)) |
| // |
| // that is, bit number x<<2 + y<<1 + z of table. For example, table 0x80 yields |
| // x & y & z and table 0xfe yields x | y | z. |
| // |
| // For best performance table should be a constant; a non-constant value is |
| // translated into a jump table. |
| // |
| // Asm: VPTERNLOGD, CPU Feature: AVX512 |
| func (x Int32x8) tern(table uint8, y Int32x8, z Int32x8) Int32x8 |
| |
| // tern computes a bitwise three-input logic function of x, y and z that is |
| // selected by the 8-bit truth table. |
| // For every bit position, with x, y and z standing for the corresponding bits of |
| // the three operands, the result bit is |
| // |
| //     1 & (table >> (x<<2 + y<<1 + z)) |
| // |
| // that is, bit number x<<2 + y<<1 + z of table. For example, table 0x80 yields |
| // x & y & z and table 0xfe yields x | y | z. |
| // |
| // For best performance table should be a constant; a non-constant value is |
| // translated into a jump table. |
| // |
| // Asm: VPTERNLOGD, CPU Feature: AVX512 |
| func (x Int32x16) tern(table uint8, y Int32x16, z Int32x16) Int32x16 |
| |
| // tern computes a bitwise three-input logic function of x, y and z that is |
| // selected by the 8-bit truth table. |
| // For every bit position, with x, y and z standing for the corresponding bits of |
| // the three operands, the result bit is |
| // |
| //     1 & (table >> (x<<2 + y<<1 + z)) |
| // |
| // that is, bit number x<<2 + y<<1 + z of table. For example, table 0x80 yields |
| // x & y & z and table 0xfe yields x | y | z. |
| // |
| // For best performance table should be a constant; a non-constant value is |
| // translated into a jump table. |
| // |
| // Asm: VPTERNLOGQ, CPU Feature: AVX512 |
| func (x Int64x2) tern(table uint8, y Int64x2, z Int64x2) Int64x2 |
| |
| // tern computes a bitwise three-input logic function of x, y and z that is |
| // selected by the 8-bit truth table. |
| // For every bit position, with x, y and z standing for the corresponding bits of |
| // the three operands, the result bit is |
| // |
| //     1 & (table >> (x<<2 + y<<1 + z)) |
| // |
| // that is, bit number x<<2 + y<<1 + z of table. For example, table 0x80 yields |
| // x & y & z and table 0xfe yields x | y | z. |
| // |
| // For best performance table should be a constant; a non-constant value is |
| // translated into a jump table. |
| // |
| // Asm: VPTERNLOGQ, CPU Feature: AVX512 |
| func (x Int64x4) tern(table uint8, y Int64x4, z Int64x4) Int64x4 |
| |
| // tern computes a bitwise three-input logic function of x, y and z that is |
| // selected by the 8-bit truth table. |
| // For every bit position, with x, y and z standing for the corresponding bits of |
| // the three operands, the result bit is |
| // |
| //     1 & (table >> (x<<2 + y<<1 + z)) |
| // |
| // that is, bit number x<<2 + y<<1 + z of table. For example, table 0x80 yields |
| // x & y & z and table 0xfe yields x | y | z. |
| // |
| // For best performance table should be a constant; a non-constant value is |
| // translated into a jump table. |
| // |
| // Asm: VPTERNLOGQ, CPU Feature: AVX512 |
| func (x Int64x8) tern(table uint8, y Int64x8, z Int64x8) Int64x8 |
| |
| // tern computes a bitwise three-input logic function of x, y and z that is |
| // selected by the 8-bit truth table. |
| // For every bit position, with x, y and z standing for the corresponding bits of |
| // the three operands, the result bit is |
| // |
| //     1 & (table >> (x<<2 + y<<1 + z)) |
| // |
| // that is, bit number x<<2 + y<<1 + z of table. For example, table 0x80 yields |
| // x & y & z and table 0xfe yields x | y | z. |
| // |
| // For best performance table should be a constant; a non-constant value is |
| // translated into a jump table. |
| // |
| // Asm: VPTERNLOGD, CPU Feature: AVX512 |
| func (x Uint32x4) tern(table uint8, y Uint32x4, z Uint32x4) Uint32x4 |
| |
| // tern computes a bitwise three-input logic function of x, y and z that is |
| // selected by the 8-bit truth table. |
| // For every bit position, with x, y and z standing for the corresponding bits of |
| // the three operands, the result bit is |
| // |
| //     1 & (table >> (x<<2 + y<<1 + z)) |
| // |
| // that is, bit number x<<2 + y<<1 + z of table. For example, table 0x80 yields |
| // x & y & z and table 0xfe yields x | y | z. |
| // |
| // For best performance table should be a constant; a non-constant value is |
| // translated into a jump table. |
| // |
| // Asm: VPTERNLOGD, CPU Feature: AVX512 |
| func (x Uint32x8) tern(table uint8, y Uint32x8, z Uint32x8) Uint32x8 |
| |
| // tern computes a bitwise three-input logic function of x, y and z that is |
| // selected by the 8-bit truth table. |
| // For every bit position, with x, y and z standing for the corresponding bits of |
| // the three operands, the result bit is |
| // |
| //     1 & (table >> (x<<2 + y<<1 + z)) |
| // |
| // that is, bit number x<<2 + y<<1 + z of table. For example, table 0x80 yields |
| // x & y & z and table 0xfe yields x | y | z. |
| // |
| // For best performance table should be a constant; a non-constant value is |
| // translated into a jump table. |
| // |
| // Asm: VPTERNLOGD, CPU Feature: AVX512 |
| func (x Uint32x16) tern(table uint8, y Uint32x16, z Uint32x16) Uint32x16 |
| |
| // tern computes a bitwise three-input logic function of x, y and z that is |
| // selected by the 8-bit truth table. |
| // For every bit position, with x, y and z standing for the corresponding bits of |
| // the three operands, the result bit is |
| // |
| //     1 & (table >> (x<<2 + y<<1 + z)) |
| // |
| // that is, bit number x<<2 + y<<1 + z of table. For example, table 0x80 yields |
| // x & y & z and table 0xfe yields x | y | z. |
| // |
| // For best performance table should be a constant; a non-constant value is |
| // translated into a jump table. |
| // |
| // Asm: VPTERNLOGQ, CPU Feature: AVX512 |
| func (x Uint64x2) tern(table uint8, y Uint64x2, z Uint64x2) Uint64x2 |
| |
| // tern computes a bitwise three-input logic function of x, y and z that is |
| // selected by the 8-bit truth table. |
| // For every bit position, with x, y and z standing for the corresponding bits of |
| // the three operands, the result bit is |
| // |
| //     1 & (table >> (x<<2 + y<<1 + z)) |
| // |
| // that is, bit number x<<2 + y<<1 + z of table. For example, table 0x80 yields |
| // x & y & z and table 0xfe yields x | y | z. |
| // |
| // For best performance table should be a constant; a non-constant value is |
| // translated into a jump table. |
| // |
| // Asm: VPTERNLOGQ, CPU Feature: AVX512 |
| func (x Uint64x4) tern(table uint8, y Uint64x4, z Uint64x4) Uint64x4 |
| |
| // tern computes a bitwise three-input logic function of x, y and z that is |
| // selected by the 8-bit truth table. |
| // For every bit position, with x, y and z standing for the corresponding bits of |
| // the three operands, the result bit is |
| // |
| //     1 & (table >> (x<<2 + y<<1 + z)) |
| // |
| // that is, bit number x<<2 + y<<1 + z of table. For example, table 0x80 yields |
| // x & y & z and table 0xfe yields x | y | z. |
| // |
| // For best performance table should be a constant; a non-constant value is |
| // translated into a jump table. |
| // |
| // Asm: VPTERNLOGQ, CPU Feature: AVX512 |
| func (x Uint64x8) tern(table uint8, y Uint64x8, z Uint64x8) Uint64x8 |