AVX2 expand contiguous elements to a sparse vector based on a condition? (like AVX512 VPEXPANDD)
Does anyone know how to vectorize the following code?
uint32_t r[8];
uint16_t* ptr;
for (int j = 0; j < 8; ++j)
    if (r[j] < C)
        r[j] = *(ptr++);
It is basically a masked gather operation. The auto-vectorizer can't handle this. If ptr were a uint32_t*, it should be directly doable with _mm256_mask_i32gather_epi32. But even then, how would you generate the right index vector? And wouldn't it be faster anyway to just do a packed load and shuffle the result (which requires a similar index vector)?
Updated answer: the main piece of code has been rewritten as a function, and a solution suitable for AMD processors has been added.
As Peter Cordes mentioned in the comments, the AVX-512 instruction vpexpandd would be useful here.
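For comparison only (this sketch is not part of the AVX2 solution below, and the function name is made up): on an AVX-512VL target the whole operation reduces to a compare-into-mask, a widening load, and one vpexpandd, assuming the eight uint16_t insert values can always be loaded.

/* Hypothetical AVX-512VL sketch (compile with e.g. -mavx512vl -mavx512f). */
static inline __m256i mask_expand_epi32_avx512vl(__m256i src, uint32_t C,
                                                 const uint16_t* ptr, int* nonz){
    __mmask8 k  = _mm256_cmplt_epu32_mask(src, _mm256_set1_epi32(C));       /* r[j] < C */
    __m256i ins = _mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i*)ptr));
    *nonz = _mm_popcnt_u32(k);                      /* number of inserted values */
    return _mm256_mask_expand_epi32(src, k, ins);   /* vpexpandd */
}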
The functions _mm256_mask_expand_epi32_AVX2_BMI() and _mm256_mask_expand_epi32_AVX2() below more or less emulate this instruction. The AVX2_BMI variant is suitable for Intel Haswell and newer processors.
The _mm256_mask_expand_epi32_AVX2() function is suitable for AMD processors with a slow or missing pdep instruction, such as the Ryzen processors. In that function, a few instructions with high throughput, such as shifts and simple arithmetic operations, are used instead of the pdep instruction.
Another possibility for AMD processors would be to process only 4 elements at a time and use a tiny (16-entry) lookup table to retrieve the shuf_mask; a rough sketch of that idea follows.
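This sketch is not part of the original answer; the names expand4_lut and _mm_mask_expand_epi32_LUT are made up. For mask nibble m, lane i of a LUT row holds popcount(m & ((1u << i) - 1)) when mask bit i is set, and the unselected lanes are don't-cares (set to 0 here) because they are blended away afterwards.

/* Hypothetical 4-elements-at-a-time variant with a 16-entry LUT (sketch only). */
static const uint32_t expand4_lut[16][4] = {
    {0,0,0,0}, {0,0,0,0}, {0,0,0,0}, {0,1,0,0},   /* m = 0..3   */
    {0,0,0,0}, {0,0,1,0}, {0,0,1,0}, {0,1,2,0},   /* m = 4..7   */
    {0,0,0,0}, {0,0,0,1}, {0,0,0,1}, {0,1,0,2},   /* m = 8..11  */
    {0,0,0,1}, {0,0,1,2}, {0,0,1,2}, {0,1,2,3}    /* m = 12..15 */
};

__m128i _mm_mask_expand_epi32_LUT(__m128i src, __m128i mask, __m128i insert_vals, int* nonz){
    uint32_t m = _mm_movemask_ps(_mm_castsi128_ps(mask));              /* 4-bit mask */
    *nonz = _mm_popcnt_u32(m);
    __m128i shuf_mask = _mm_loadu_si128((const __m128i*)expand4_lut[m]);
    __m128 ins_exp = _mm_permutevar_ps(_mm_castsi128_ps(insert_vals), shuf_mask);
    return _mm_blendv_epi8(src, _mm_castps_si128(ins_exp), mask);      /* blend src with expanded values */
}

Two of these 128-bit steps would replace one 256-bit step, so this only pays off where pdep is slow.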
Below the two functions it is shown how they can be used to vectorize your scalar code.
This answer uses a similar idea to Peter Cordes' answer on left packing based on a mask. In that answer the BMI2 instruction pext is used to compute the permutation vector; here the pdep instruction is used for that purpose instead.
The _mm256_mask_expand_epi32_AVX2() function determines the permutation vector in a different way: by computing a prefix sum on the r<C mask. For example, with mask lanes [-1,-1,0,-1,0,0,0,0] the 16-bit prefix sum is [-1,-2,-2,-3,-3,-3,-3,-3], and -1 - prefix_sum gives [0,1,1,2,2,2,2,2]: the selected lanes 0, 1 and 3 pick insert values 0, 1 and 2, while the remaining lanes are don't-cares because they are blended away. Since the uint32_t elements are unsigned, Paul R's idea is used for the epu32 unsigned comparison.
/* gcc -O3 -m64 -Wall -mavx2 -march=broadwell mask_expand_avx.c */
#include <immintrin.h>
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>   /* malloc */
__m256i _mm256_mask_expand_epi32_AVX2_BMI(__m256i src, __m256i mask, __m256i insert_vals, int* nonz){
    /* Scatter the insert_vals to the positions indicated by mask. */
    /* Blend the src with these scattered insert_vals. */
    /* Return also the number of nonzeros in mask (which is inexpensive here */
    /* because _mm256_movemask_epi8(mask) has to be computed anyway.) */
    /* This code is suitable for Intel Haswell and newer processors. */
    /* This code is less suitable for AMD Ryzen processors, due to the */
    /* slow pdep instruction on those processors, see _mm256_mask_expand_epi32_AVX2 */
    uint32_t all_indx = 0x76543210;
    uint32_t mask_int32 = _mm256_movemask_epi8(mask);                              /* Packed mask of 8 nibbles */
    uint32_t wanted_indx = _pdep_u32(all_indx, mask_int32);                        /* Select the right nibbles from all_indx */
    uint64_t expand_indx = _pdep_u64(wanted_indx, 0x0F0F0F0F0F0F0F0F);             /* Expand the nibbles to bytes */
    __m128i shuf_mask_8bit = _mm_cvtsi64_si128(expand_indx);                       /* Move to AVX-128 register */
    __m256i shuf_mask = _mm256_cvtepu8_epi32(shuf_mask_8bit);                      /* Expand bytes to 32-bit integers */
    __m256i insert_vals_exp = _mm256_permutevar8x32_epi32(insert_vals, shuf_mask); /* Expand insert_vals to the right positions */
    __m256i dst = _mm256_blendv_epi8(src, insert_vals_exp, mask);                  /* src is replaced by insert_vals_exp at the positions indicated by mask */
    *nonz = _mm_popcnt_u32(mask_int32) >> 2;
    return dst;
}
__m256i _mm256_mask_expand_epi32_AVX2(__m256i src, __m256i mask, __m256i insert_vals, int* nonz){
    /* Scatter the insert_vals to the positions indicated by mask. */
    /* Blend the src with these scattered insert_vals. */
    /* Return also the number of nonzeros in mask. */
    /* This code is an alternative for the _mm256_mask_expand_epi32_AVX2_BMI function. */
    /* In contrast to that code, this code doesn't use the BMI instruction pdep. */
    /* Therefore, this code is suitable for AMD processors. */
    __m128i mask_lo = _mm256_castsi256_si128(mask);
    __m128i mask_hi = _mm256_extracti128_si256(mask, 1);
    __m128i mask_hi_lo = _mm_packs_epi32(mask_lo, mask_hi);                        /* Compressed 128-bits (8 x 16-bits) mask */
    *nonz = _mm_popcnt_u32(_mm_movemask_epi8(mask_hi_lo)) >> 1;
    __m128i prefix_sum = mask_hi_lo;
    __m128i prefix_sum_shft = _mm_slli_si128(prefix_sum, 2);                       /* The permutation vector is based on the */
    prefix_sum = _mm_add_epi16(prefix_sum, prefix_sum_shft);                       /* prefix sum of the mask. */
    prefix_sum_shft = _mm_slli_si128(prefix_sum, 4);
    prefix_sum = _mm_add_epi16(prefix_sum, prefix_sum_shft);
    prefix_sum_shft = _mm_slli_si128(prefix_sum, 8);
    prefix_sum = _mm_add_epi16(prefix_sum, prefix_sum_shft);
    __m128i shuf_mask_16bit = _mm_sub_epi16(_mm_set1_epi16(-1), prefix_sum);
    __m256i shuf_mask = _mm256_cvtepu16_epi32(shuf_mask_16bit);                    /* Expand 16-bit integers to 32-bit integers */
    __m256i insert_vals_exp = _mm256_permutevar8x32_epi32(insert_vals, shuf_mask); /* Expand insert_vals to the right positions */
    __m256i dst = _mm256_blendv_epi8(src, insert_vals_exp, mask);                  /* src is replaced by insert_vals_exp at the positions indicated by mask */
    return dst;
}
/* Unsigned integer compare _mm256_cmplt_epu32 doesn't exist */
/* The next two lines are based on Paul R's answer */
#define _mm256_cmpge_epu32(a, b) _mm256_cmpeq_epi32(_mm256_max_epu32(a, b), a)
#define _mm256_cmplt_epu32(a, b) _mm256_xor_si256(_mm256_cmpge_epu32(a, b), _mm256_set1_epi32(-1))
int print_input(uint32_t* r, uint32_t C, uint16_t* ptr);
int print_output(uint32_t* r, uint16_t* ptr);
int main(){
    int nonz;
    uint32_t r[8] = {6, 3, 1001, 2, 1002, 7, 5, 1003};
    uint32_t r_new[8];
    uint32_t C = 9;
    uint16_t* ptr = malloc(8*2);   /* allocate 16 bytes for 8 uint16_t's */
    ptr[0] = 11; ptr[1] = 12; ptr[2] = 13; ptr[3] = 14; ptr[4] = 15; ptr[5] = 16; ptr[6] = 17; ptr[7] = 18;
    uint16_t* ptr_new;

    printf("Test values:\n");
    print_input(r,C,ptr);

    __m256i src = _mm256_loadu_si256((__m256i *)r);
    __m128i ins = _mm_loadu_si128((__m128i *)ptr);
    __m256i insert_vals = _mm256_cvtepu16_epi32(ins);
    __m256i mask_C = _mm256_cmplt_epu32(src,_mm256_set1_epi32(C));

    printf("Output _mm256_mask_expand_epi32_AVX2_BMI:\n");
    __m256i output = _mm256_mask_expand_epi32_AVX2_BMI(src, mask_C, insert_vals, &nonz);
    _mm256_storeu_si256((__m256i *)r_new,output);
    ptr_new = ptr + nonz;
    print_output(r_new,ptr_new);

    printf("Output _mm256_mask_expand_epi32_AVX2:\n");
    output = _mm256_mask_expand_epi32_AVX2(src, mask_C, insert_vals, &nonz);
    _mm256_storeu_si256((__m256i *)r_new,output);
    ptr_new = ptr + nonz;
    print_output(r_new,ptr_new);

    printf("Output scalar loop:\n");
    for (int j = 0; j < 8; ++j)
        if (r[j] < C)
            r[j] = *(ptr++);
    print_output(r,ptr);

    return 0;
}
int print_input(uint32_t* r, uint32_t C, uint16_t* ptr){
    printf("r[0]..r[7] = %4u %4u %4u %4u %4u %4u %4u %4u \n",r[0],r[1],r[2],r[3],r[4],r[5],r[6],r[7]);
    printf("Threshold value C = %4u %4u %4u %4u %4u %4u %4u %4u \n",C,C,C,C,C,C,C,C);
    printf("ptr[0]..ptr[7] = %4hu %4hu %4hu %4hu %4hu %4hu %4hu %4hu \n\n",ptr[0],ptr[1],ptr[2],ptr[3],ptr[4],ptr[5],ptr[6],ptr[7]);
    return 0;
}

int print_output(uint32_t* r, uint16_t* ptr){
    printf("r[0]..r[7] = %4u %4u %4u %4u %4u %4u %4u %4u \n",r[0],r[1],r[2],r[3],r[4],r[5],r[6],r[7]);
    printf("ptr = %p \n\n",ptr);
    return 0;
}
The output is:
$ ./a.out
Test values:
r[0]..r[7] = 6 3 1001 2 1002 7 5 1003
Threshold value C = 9 9 9 9 9 9 9 9
ptr[0]..ptr[7] = 11 12 13 14 15 16 17 18
Output _mm256_mask_expand_epi32_AVX2_BMI:
r[0]..r[7] = 11 12 1001 13 1002 14 15 1003
ptr = 0x92c01a
Output _mm256_mask_expand_epi32_AVX2:
r[0]..r[7] = 11 12 1001 13 1002 14 15 1003
ptr = 0x92c01a
Output scalar loop:
r[0]..r[7] = 11 12 1001 13 1002 14 15 1003
ptr = 0x92c01a
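Not part of the original answer, but to illustrate why nonz is returned: in a loop over a longer array, ptr is advanced by nonz after each block of 8 elements. A hypothetical sketch building on the functions and the _mm256_cmplt_epu32 macro above (process_array is a made-up name; it assumes n is a multiple of 8 and that 8 uint16_t values can always be read from ptr, since the vector load reads them unconditionally):

/* Hypothetical usage sketch: apply the expand to a longer array, 8 elements at a time. */
void process_array(uint32_t* r, size_t n, const uint16_t* ptr, uint32_t C){
    int nonz;
    for (size_t j = 0; j < n; j += 8){
        __m256i src         = _mm256_loadu_si256((const __m256i*)&r[j]);
        __m256i insert_vals = _mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i*)ptr));
        __m256i mask_C      = _mm256_cmplt_epu32(src, _mm256_set1_epi32(C));
        __m256i dst = _mm256_mask_expand_epi32_AVX2_BMI(src, mask_C, insert_vals, &nonz);
        _mm256_storeu_si256((__m256i*)&r[j], dst);
        ptr += nonz;   /* consume only the values that were actually inserted */
    }
}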