AVX-512 Vector Bit Manipulation Instructions (VBMI) - x86

AVX-512 Vector Bit Manipulation Instructions (AVX512_VBMI) is an x86 extension and part of the AVX-512 SIMD instruction set. A second extension, VBMI2 (AVX512_VBMI2), adds compress/expand and funnel shift instructions.

Overview

VPERMB
Permutes the bytes of the second source operand using element indices. For each element of the destination vector the instruction obtains a source index, modulo vector size in elements, from the corresponding element of the first source operand.
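
As a minimal sketch (not part of the original article; variable and array names are arbitrary), the following C fragment uses the VPERMB intrinsic to reverse the byte order of a 512-bit vector. It assumes a CPU with AVX512_VBMI and, with GCC or Clang, compilation with -mavx512vbmi.

#include <immintrin.h>
#include <stdio.h>

// Reverse the 64 bytes of a 512-bit vector with VPERMB:
// destination byte i receives source byte idx[i] = 63 - i.
int main(void)
{
    unsigned char src[64], idx[64], out[64];
    for (int i = 0; i < 64; i++) {
        src[i] = (unsigned char)i;
        idx[i] = (unsigned char)(63 - i);
    }

    __m512i data    = _mm512_loadu_si512(src);
    __m512i indices = _mm512_loadu_si512(idx);
    __m512i rev     = _mm512_permutexvar_epi8(indices, data);  // VPERMB

    _mm512_storeu_si512(out, rev);
    printf("%d %d\n", out[0], out[63]);  // prints "63 0"
    return 0;
}
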
VPERMI2B, VPERMT2B
The "I" variant concatenates the second and first source operands and permutes their bytes using element indices. For each element of the destination vector it obtains a source index, modulo twice the vector size in elements, from the corresponding byte of the destination operand, which initially holds the indices, and overwrites that byte with the selected source byte.
The "T" variant concatenates the second source and destination operands and obtains the source indices from the first source operand. In other words, the two instructions perform the same operation: one overwrites the index vector, the other overwrites one half of the data table. The destination and first source operands are vector registers. The second source operand can be a vector register or a vector in memory.
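
A common use of the two-source permute is a byte lookup table larger than one register. The sketch below is illustrative only (the helper name lookup128 is made up): because _mm512_permutex2var_epi8 indexes the concatenation of its two data operands, index bytes 0..63 select from the first table and 64..127 from the second.

#include <immintrin.h>

// 128-entry byte table lookup (hypothetical helper): index bytes 0..63
// select from table_lo, 64..127 from table_hi; only the low 7 bits of
// each index byte are used by the 512-bit form.
static inline __m512i lookup128(__m512i idx, __m512i table_lo, __m512i table_hi)
{
    return _mm512_permutex2var_epi8(table_lo, idx, table_hi);  // VPERMI2B/VPERMT2B
}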

Note that the AVX-512 Foundation extension already provides instructions to permute doublewords, quadwords, and single and double precision floating point values in the same ways, and the AVX-512 Byte and Word Instructions (BW) extension provides the corresponding 16-bit word permutes.

VPMULTISHIFTQB
Copies 8 consecutive bits from the second source operand into each byte of the destination vector using bit indices. For each destination byte the instruction obtains a bit index from the corresponding byte of the first source operand. The operation is confined to 64-bit quadwords, so an index can only address bits in the same 64-bit lane as the index and destination byte; the index is incremented for each of the 8 bits and wraps around modulo 64. In other words, the operation for each destination byte is:
dest.byte[i] = bitwise_rotate_right(source2.quadword[i / 8], source1.byte[i] and 63) and 255
The destination and first source operands are vector registers. The second source operand can be a vector register, a vector in memory, or one quadword broadcast to all 64-bit lanes of the vector. Write masking is supported with byte granularity.
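
To make the formula concrete, here is a hedged scalar reference model of the per-byte operation (the helper names ror64 and multishift_byte are made up; this mirrors the pseudo-code above rather than any official implementation):

#include <stdint.h>

// Rotate a 64-bit value right by n bit positions (n taken modulo 64).
static inline uint64_t ror64(uint64_t x, unsigned n)
{
    n &= 63;
    return n ? (x >> n) | (x << (64 - n)) : x;
}

// Scalar model of one VPMULTISHIFTQB destination byte: take 8 consecutive
// bits of the source quadword, starting at the bit index in the control
// byte, wrapping around within the quadword.
static inline uint8_t multishift_byte(uint64_t src2_qword, uint8_t ctrl_byte)
{
    return (uint8_t)(ror64(src2_qword, ctrl_byte & 63) & 0xFF);
}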

All these instructions can operate on 128-, 256-, or 512-bit wide vectors. If the vector size is less than 512 bits the instructions zero the unused higher bits in the destination register to avoid a dependency on earlier instructions writing those bits.

Detection

Support for these instructions is indicated by the AVX512_VBMI feature flag. 128- and 256-bit vectors are supported if the AVX512VL flag is set as well.

CPUID Input       CPUID Output   Instruction Set
EAX=07H, ECX=0    EBX[bit 31]    AVX512VL
EAX=07H, ECX=0    ECX[bit 01]    AVX512_VBMI
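
A detection sketch in C, assuming GCC or Clang and their <cpuid.h> helper __get_cpuid_count; the bit positions follow the table above. A complete runtime check would also verify OS support for the ZMM register state via XGETBV, which is omitted here.

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
    unsigned int eax, ebx, ecx, edx;

    // Structured extended feature flags: EAX=07H, ECX=0
    if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
        return 1;  // leaf 07H not supported

    int has_avx512vl   = (ebx >> 31) & 1;  // EBX[bit 31] = AVX512VL
    int has_avx512vbmi = (ecx >> 1)  & 1;  // ECX[bit 01] = AVX512_VBMI

    printf("AVX512VL: %d, AVX512_VBMI: %d\n", has_avx512vl, has_avx512vbmi);
    return 0;
}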

Microarchitecture support

Support level is tracked per extension: F, CD, ER, PF, BW, DQ, VL, FP16, IFMA, VBMI, VBMI2, BITALG, VPOPCNTDQ, VP2INTERSECT, 4VNNIW, 4FMAPS, VNNI, BF16.

Designer   Microarchitecture   Year
Intel      Knights Landing     2016
Intel      Knights Mill        2017
Intel      Skylake (server)    2017
Intel      Cannon Lake         2018
Intel      Cascade Lake        2019
Intel      Cooper Lake         2020
Intel      Tiger Lake          2020
Intel      Rocket Lake         2021
Intel      Alder Lake          2021
Intel      Ice Lake (server)   2021
Intel      Sapphire Rapids     2023
AMD        Zen 4               2022
Centaur    CHA

Intrinsic functions

// VPERMB
__m512i _mm512_permutexvar_epi8( __m512i idx, __m512i a);
__m512i _mm512_mask_permutexvar_epi8(__m512i s, __mmask64 k, __m512i idx, __m512i a);
__m512i _mm512_maskz_permutexvar_epi8( __mmask64 k, __m512i idx, __m512i a);
__m256i _mm256_permutexvar_epi8( __m256i idx, __m256i a);
__m256i _mm256_mask_permutexvar_epi8(__m256i s, __mmask32 k, __m256i idx, __m256i a);
__m256i _mm256_maskz_permutexvar_epi8( __mmask32 k, __m256i idx, __m256i a);
__m128i _mm_permutexvar_epi8( __m128i idx, __m128i a);
__m128i _mm_mask_permutexvar_epi8(__m128i s, __mmask16 k, __m128i idx, __m128i a);
__m128i _mm_maskz_permutexvar_epi8( __mmask16 k, __m128i idx, __m128i a);
// VPERMI2B
__m512i _mm512_permutex2var_epi8(__m512i a, __m512i idx, __m512i b);
__m512i _mm512_mask2_permutex2var_epi8(__m512i a, __m512i idx, __mmask64 k, __m512i b);
__m512i _mm512_maskz_permutex2var_epi8(__mmask64 k, __m512i a, __m512i idx, __m512i b);
__m256i _mm256_permutex2var_epi8(__m256i a, __m256i idx, __m256i b);
__m256i _mm256_mask2_permutex2var_epi8(__m256i a, __m256i idx, __mmask32 k, __m256i b);
__m256i _mm256_maskz_permutex2var_epi8(__mmask32 k, __m256i a, __m256i idx, __m256i b);
__m128i _mm_permutex2var_epi8(__m128i a, __m128i idx, __m128i b);
__m128i _mm_mask2_permutex2var_epi8(__m128i a, __m128i idx, __mmask16 k, __m128i b);
__m128i _mm_maskz_permutex2var_epi8(__mmask16 k, __m128i a, __m128i idx, __m128i b);
// VPERMT2B
__m512i _mm512_permutex2var_epi8(__m512i a, __m512i idx, __m512i b);
__m512i _mm512_mask_permutex2var_epi8(__m512i a, __mmask64 k, __m512i idx, __m512i b);
__m512i _mm512_maskz_permutex2var_epi8(__mmask64 k, __m512i a, __m512i idx, __m512i b);
__m256i _mm256_permutex2var_epi8(__m256i a, __m256i idx, __m256i b);
__m256i _mm256_mask_permutex2var_epi8(__m256i a, __mmask32 k, __m256i idx, __m256i b);
__m256i _mm256_maskz_permutex2var_epi8(__mmask32 k, __m256i a, __m256i idx, __m256i b);
__m128i _mm_permutex2var_epi8(__m128i a, __m128i idx, __m128i b);
__m128i _mm_mask_permutex2var_epi8(__m128i a, __mmask16 k, __m128i idx, __m128i b);
__m128i _mm_maskz_permutex2var_epi8(__mmask16 k, __m128i a, __m128i idx, __m128i b);
// VPMULTISHIFTQB
__m512i _mm512_multishift_epi64_epi8( __m512i a, __m512i b);
__m512i _mm512_mask_multishift_epi64_epi8(__m512i s, __mmask64 k, __m512i a, __m512i b);
__m512i _mm512_maskz_multishift_epi64_epi8( __mmask64 k, __m512i a, __m512i b);
__m256i _mm256_multishift_epi64_epi8( __m256i a, __m256i b);
__m256i _mm256_mask_multishift_epi64_epi8(__m256i s, __mmask32 k, __m256i a, __m256i b);
__m256i _mm256_maskz_multishift_epi64_epi8( __mmask32 k, __m256i a, __m256i b);
__m128i _mm_multishift_epi64_epi8( __m128i a, __m128i b);
__m128i _mm_mask_multishift_epi64_epi8(__m128i s, __mmask16 k, __m128i a, __m128i b);
__m128i _mm_maskz_multishift_epi64_epi8( __mmask16 k, __m128i a, __m128i b);
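
As a brief usage sketch (not from the original page; the helper name permute_masked is made up), the following fragment contrasts merge masking and zero masking with the VPERMB intrinsics. With GCC or Clang such code is typically compiled with -mavx512vbmi, plus -mavx512vl for the 128- and 256-bit forms.

#include <immintrin.h>

// Contrast merge masking and zero masking with VPERMB (hypothetical helper):
// where a mask bit is 0, the merge form keeps the corresponding byte of
// 'fallback', while the zero form writes a zero byte instead.
static inline void permute_masked(__m512i idx, __m512i data, __m512i fallback,
                                  __mmask64 k, __m512i *merged, __m512i *zeroed)
{
    *merged = _mm512_mask_permutexvar_epi8(fallback, k, idx, data);
    *zeroed = _mm512_maskz_permutexvar_epi8(k, idx, data);
}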
