是否可以通过使用输入寄存器来寻址输出 SIMD 寄存器?
Is it possible to address the output SIMD register by using an input register?
是否可以使用输入向量的标量值来索引输出向量?我尝试在 SIMD 中实现以下功能,但找不到任何解决方案。
/*
 * Scalar reference implementation: record which indices (mod 16) occur in a.
 *
 * a: 16 input bytes; each one is reduced modulo 16 to pick an output slot.
 * r: 16 output bytes; on return r[k] is 1 if some a[i] % 16 == k, else 0.
 */
void shuffle(unsigned char * a,   // input a
             unsigned char * r){  // output r
    /* Clear every output slot first. */
    for (int i = 0; i < 16; i++) {
        r[i] = 0;
    }
    /* Mark each slot addressed by an input byte; duplicates are harmless. */
    for (int i = 0; i < 16; i++) {
        r[a[i] % 16] = 1;
    }
}
示例输入/输出向量如下所示
/* Example input: the index bytes that select which output slots get a 1. */
unsigned char a[] = { 0,  0,  0, 10,  0,  0,  0,  2,  0,  0,  0,  0,  3,  1,  0,  0};
/* ... do SIMD magic ... */
/* Expected output: slots 0, 1, 2, 3 and 10 occur in a, so they hold 1. */
/*           index:   0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15 */
unsigned char r[] = { 1,  1,  1,  1,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0};
我找不到任何能够动态寻址赋值语句左侧(目标)的合适指令。也许这个功能可以通过移位操作来实现?有人实现过类似的功能吗?
看来 _mm_shuffle_epi8 确实是解决问题的关键。思路是根据输入向量 a 的值设置各个比特位,
然后通过水平 OR(按位或)把这些位合并,并分布到 128 位宽
寄存器的各个字节上。
#include <stdio.h>
#include <immintrin.h>
/* gcc -O3 -Wall -mavx test4.c */
/* gcc -O3 -Wall -msse2 -mssse3 -msse4.1 test4.c */
int print_char128(__m128i * x);

/*
 * Print the 16 bytes of *x as unsigned decimal numbers on one line,
 * lowest byte first, in four groups of four separated by '|'.
 *
 * x: pointer to the 128-bit vector to print.
 * Returns 0 unconditionally.
 */
int print_char128(__m128i * x){
    unsigned char lane[16];

    /* Spill the vector to a byte array so single lanes can be read. */
    _mm_storeu_si128((__m128i *)lane, *x);

    printf("%4u %4u %4u %4u | %4u %4u %4u %4u | %4u %4u %4u %4u | %4u %4u %4u %4u \n",
           lane[0],  lane[1],  lane[2],  lane[3],
           lane[4],  lane[5],  lane[6],  lane[7],
           lane[8],  lane[9],  lane[10], lane[11],
           lane[12], lane[13], lane[14], lane[15]);
    return 0;
}
/*
 * Demo of the SIMD "scatter a 1" trick: for the 16 index bytes in a_v, build
 * r such that r[k] == 1 if k occurs among the indices and r[k] == 0 otherwise,
 * then print r with print_char128().
 *
 * Index handling follows PSHUFB (_mm_shuffle_epi8): only the low 4 bits of an
 * index byte select a slot (so 30 acts like 14), and an index byte with bit 7
 * set (>= 128) marks nothing.
 *
 * Needs SSSE3 (_mm_shuffle_epi8, _mm_alignr_epi8) and SSE4.1
 * (_mm_blend_epi16). The immediates are written in hex because 0b... binary
 * literals are a compiler extension before C23.
 *
 * Returns 0.
 */
int main(void)
{
    unsigned char a_v[] = {0, 0, 0, 10, 0, 0, 0, 2, 0, 0, 0, 0, 3, 1, 0, 0 };
    /*unsigned char a_v[] = {13, 30, 0, 10, 0, 6, 0, 2, 0, 0, 7, 0, 3, 11, 0, 0 };*/
    __m128i t0, t1, t2, t3;
    __m128i a, r, msk0, msk1, msk0_1, zero, bin_ones, one_epi8;

    /* set some constants */
    /* msk0: byte k (k < 8) holds the single bit 1 << k; bytes 8..15 are 0. */
    unsigned char msk0_v[] ={1, 2, 4, 8, 16, 32, 64, 128, 0, 0, 0, 0, 0, 0, 0, 0};
    msk0=_mm_loadu_si128((__m128i *)msk0_v);
    /* 0x4E selects dwords (2,3,0,1): swap the 64-bit halves, so msk1 has the
       bit table in bytes 8..15 and zeros in bytes 0..7. */
    msk1=_mm_shuffle_epi32(msk0,0x4E);
    /* 0xF0: low four words from msk0, high four from msk1, giving
       {1,2,...,128, 1,2,...,128}. */
    msk0_1=_mm_blend_epi16(msk0,msk1,0xF0);
    zero=_mm_setzero_si128();
    bin_ones=_mm_cmpeq_epi32(zero,zero);    /* all bits set                  */
    one_epi8=_mm_sub_epi8(zero,bin_ones);   /* 0 - (-1) = 1 in every byte    */

    /* load indices */
    a=_mm_loadu_si128((__m128i *)a_v);

    /* start of 'SIMD magic' */
    /* index a_i sets the a_i -th bit within a byte of t0 if 0<=a_i<8  */
    /* or set (a_i-8)-th bit within a byte of t1 if 8<=a_i<16          */
    t0=_mm_shuffle_epi8(msk0,a);
    t1=_mm_shuffle_epi8(msk1,a);

    /* horizontal OR of the bytes in t0 and t1: */
    /* t2 = [t0.lo, t1.hi], t3 = [t0.hi, t1.lo]; after the OR the low 64 bits
       carry only t0 data and the high 64 bits only t1 data. */
    t2=_mm_blend_epi16(t0,t1,0xF0);
    t3=_mm_alignr_epi8(t1,t0,8);
    t0=_mm_or_si128(t2,t3);
    /* 0xB1 selects dwords (1,0,3,2): OR each dword with its neighbour inside
       the same 64-bit half. */
    t1=_mm_shuffle_epi32(t0,0xB1);
    t0=_mm_or_si128(t0,t1);
    /* Byte shifts by 2 and then 1 fold the four bytes of each dword together.
       The shifts leak low-half bytes into dword 2, but dwords 1 and 3 stay
       clean and each of their bytes now holds the complete OR of its half. */
    t1=_mm_slli_si128(t0,2);
    t0=_mm_or_si128(t0,t1);
    t1=_mm_slli_si128(t0,1);
    t0=_mm_or_si128(t0,t1);
    /* 0xF5 selects dwords (1,1,3,3): broadcast the clean dwords over each half. */
    t0=_mm_shuffle_epi32(t0,0xF5);          /* end of horizontal OR */

    /* filter out the relevant bits */
    /* Byte k keeps only bit (k % 8): non-zero exactly when index k occurred. */
    t0=_mm_and_si128(t0,msk0_1);
    t0=_mm_cmpeq_epi8(t0,zero);             /* 0xFF where index k is absent  */
    r=_mm_andnot_si128(t0,one_epi8);        /* the result is in r */

    print_char128(&r);
    return 0;
}
这应该运行得相当快:除了设置常量和加载数据的指令外,它只有 15 条 SSEx 指令。在当今的处理器上,这些指令都只有 1 个周期的延迟。
(倒数)吞吐量甚至更小:1/2 或 1/3 周期。
内联函数(intrinsic)_mm_blend_epi16 需要 SSE4.1,其余的最高只需要 SSSE3。
是否可以使用输入向量的标量值来索引输出向量?我尝试在 SIMD 中实现以下功能,但找不到任何解决方案。
/*
 * Scalar reference implementation: record which indices (mod 16) occur in a.
 *
 * a: 16 input bytes; each one is reduced modulo 16 to pick an output slot.
 * r: 16 output bytes; on return r[k] is 1 if some a[i] % 16 == k, else 0.
 */
void shuffle(unsigned char * a,   // input a
             unsigned char * r){  // output r
    /* Clear every output slot first. */
    for (int i = 0; i < 16; i++) {
        r[i] = 0;
    }
    /* Mark each slot addressed by an input byte; duplicates are harmless. */
    for (int i = 0; i < 16; i++) {
        r[a[i] % 16] = 1;
    }
}
示例输入/输出向量如下所示
/* Example input: the index bytes that select which output slots get a 1. */
unsigned char a[] = { 0,  0,  0, 10,  0,  0,  0,  2,  0,  0,  0,  0,  3,  1,  0,  0};
/* ... do SIMD magic ... */
/* Expected output: slots 0, 1, 2, 3 and 10 occur in a, so they hold 1. */
/*           index:   0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15 */
unsigned char r[] = { 1,  1,  1,  1,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0};
我找不到任何能够动态寻址赋值语句左侧(目标)的合适指令。也许这个功能可以通过移位操作来实现?有人实现过类似的功能吗?
看来 _mm_shuffle_epi8 确实是解决问题的关键。思路是根据输入向量 a 的值设置各个比特位,然后通过水平 OR(按位或)把这些位合并,并分布到 128 位宽寄存器的各个字节上。
#include <stdio.h>
#include <immintrin.h>
/* gcc -O3 -Wall -mavx test4.c */
/* gcc -O3 -Wall -msse2 -mssse3 -msse4.1 test4.c */
int print_char128(__m128i * x);

/*
 * Print the 16 bytes of *x as unsigned decimal numbers on one line,
 * lowest byte first, in four groups of four separated by '|'.
 *
 * x: pointer to the 128-bit vector to print.
 * Returns 0 unconditionally.
 */
int print_char128(__m128i * x){
    unsigned char lane[16];

    /* Spill the vector to a byte array so single lanes can be read. */
    _mm_storeu_si128((__m128i *)lane, *x);

    printf("%4u %4u %4u %4u | %4u %4u %4u %4u | %4u %4u %4u %4u | %4u %4u %4u %4u \n",
           lane[0],  lane[1],  lane[2],  lane[3],
           lane[4],  lane[5],  lane[6],  lane[7],
           lane[8],  lane[9],  lane[10], lane[11],
           lane[12], lane[13], lane[14], lane[15]);
    return 0;
}
/*
 * Demo of the SIMD "scatter a 1" trick: for the 16 index bytes in a_v, build
 * r such that r[k] == 1 if k occurs among the indices and r[k] == 0 otherwise,
 * then print r with print_char128().
 *
 * Index handling follows PSHUFB (_mm_shuffle_epi8): only the low 4 bits of an
 * index byte select a slot (so 30 acts like 14), and an index byte with bit 7
 * set (>= 128) marks nothing.
 *
 * Needs SSSE3 (_mm_shuffle_epi8, _mm_alignr_epi8) and SSE4.1
 * (_mm_blend_epi16). The immediates are written in hex because 0b... binary
 * literals are a compiler extension before C23.
 *
 * Returns 0.
 */
int main(void)
{
    unsigned char a_v[] = {0, 0, 0, 10, 0, 0, 0, 2, 0, 0, 0, 0, 3, 1, 0, 0 };
    /*unsigned char a_v[] = {13, 30, 0, 10, 0, 6, 0, 2, 0, 0, 7, 0, 3, 11, 0, 0 };*/
    __m128i t0, t1, t2, t3;
    __m128i a, r, msk0, msk1, msk0_1, zero, bin_ones, one_epi8;

    /* set some constants */
    /* msk0: byte k (k < 8) holds the single bit 1 << k; bytes 8..15 are 0. */
    unsigned char msk0_v[] ={1, 2, 4, 8, 16, 32, 64, 128, 0, 0, 0, 0, 0, 0, 0, 0};
    msk0=_mm_loadu_si128((__m128i *)msk0_v);
    /* 0x4E selects dwords (2,3,0,1): swap the 64-bit halves, so msk1 has the
       bit table in bytes 8..15 and zeros in bytes 0..7. */
    msk1=_mm_shuffle_epi32(msk0,0x4E);
    /* 0xF0: low four words from msk0, high four from msk1, giving
       {1,2,...,128, 1,2,...,128}. */
    msk0_1=_mm_blend_epi16(msk0,msk1,0xF0);
    zero=_mm_setzero_si128();
    bin_ones=_mm_cmpeq_epi32(zero,zero);    /* all bits set                  */
    one_epi8=_mm_sub_epi8(zero,bin_ones);   /* 0 - (-1) = 1 in every byte    */

    /* load indices */
    a=_mm_loadu_si128((__m128i *)a_v);

    /* start of 'SIMD magic' */
    /* index a_i sets the a_i -th bit within a byte of t0 if 0<=a_i<8  */
    /* or set (a_i-8)-th bit within a byte of t1 if 8<=a_i<16          */
    t0=_mm_shuffle_epi8(msk0,a);
    t1=_mm_shuffle_epi8(msk1,a);

    /* horizontal OR of the bytes in t0 and t1: */
    /* t2 = [t0.lo, t1.hi], t3 = [t0.hi, t1.lo]; after the OR the low 64 bits
       carry only t0 data and the high 64 bits only t1 data. */
    t2=_mm_blend_epi16(t0,t1,0xF0);
    t3=_mm_alignr_epi8(t1,t0,8);
    t0=_mm_or_si128(t2,t3);
    /* 0xB1 selects dwords (1,0,3,2): OR each dword with its neighbour inside
       the same 64-bit half. */
    t1=_mm_shuffle_epi32(t0,0xB1);
    t0=_mm_or_si128(t0,t1);
    /* Byte shifts by 2 and then 1 fold the four bytes of each dword together.
       The shifts leak low-half bytes into dword 2, but dwords 1 and 3 stay
       clean and each of their bytes now holds the complete OR of its half. */
    t1=_mm_slli_si128(t0,2);
    t0=_mm_or_si128(t0,t1);
    t1=_mm_slli_si128(t0,1);
    t0=_mm_or_si128(t0,t1);
    /* 0xF5 selects dwords (1,1,3,3): broadcast the clean dwords over each half. */
    t0=_mm_shuffle_epi32(t0,0xF5);          /* end of horizontal OR */

    /* filter out the relevant bits */
    /* Byte k keeps only bit (k % 8): non-zero exactly when index k occurred. */
    t0=_mm_and_si128(t0,msk0_1);
    t0=_mm_cmpeq_epi8(t0,zero);             /* 0xFF where index k is absent  */
    r=_mm_andnot_si128(t0,one_epi8);        /* the result is in r */

    print_char128(&r);
    return 0;
}
这应该运行得相当快:除了设置常量和加载数据的指令外,它只有 15 条 SSEx 指令。在当今的处理器上,这些指令都只有 1 个周期的延迟。 (倒数)吞吐量甚至更小:1/2 或 1/3 周期。 内联函数(intrinsic)_mm_blend_epi16 需要 SSE4.1,其余的最高只需要 SSSE3。