SIMD - 如何将 2 个元素宽度不同的向量中对应的值相加(把 char 或 uint8_t 加到 int 上)
SIMD - how to add corresponding values from 2 vectors of different element widths (char or uint8_t adding to int)
请告诉我如何将同一类型的 SIMD 向量中的值相加,而这些向量中的值各自占用的字节数不同。
这是一个例子:
int main()
{
    // Sixteen ints do not fit in one 128-bit register (four per register,
    // assuming 32-bit int), so the array is read as four consecutive
    // 16-byte blocks using unaligned loads.
    int my_int_sequence[16] = { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 };
    const __m128i* int_blocks = reinterpret_cast<const __m128i*>(my_int_sequence);
    __m128i my_int_sequence_m128i_1 = _mm_loadu_si128(int_blocks + 0);  // elements 0..3
    __m128i my_int_sequence_m128i_2 = _mm_loadu_si128(int_blocks + 1);  // elements 4..7
    __m128i my_int_sequence_m128i_3 = _mm_loadu_si128(int_blocks + 2);  // elements 8..11
    __m128i my_int_sequence_m128i_4 = _mm_loadu_si128(int_blocks + 3);  // elements 12..15

    // Sixteen one-byte mask values fill exactly one 128-bit register.
    char my_char_mask[16] = { 1,0,1,1,0,1,0,1,1,1,0,1,0,1,0,1 };
    __m128i my_char_mask_m128i =
        _mm_loadu_si128(reinterpret_cast<const __m128i*>(my_char_mask));
}
也就是说,我在 my_int_sequence 数组中有一组 int 值——由于 16 个 int 值无法全部放入一个 __m128i 向量,所以我将它们每 4 个一组分别加载到 4 个 __m128i 向量中。
我还有一个 16 字节的数组,我同样将其加载到 my_char_mask_m128i 向量中。
现在我想把 my_char_mask_m128i 向量中对应的单字节值,加到 my_int_sequence_m128i_x 向量的每个 4 字节值上。
显然,问题在于我需要把宽度不同的元素相加。这可能吗?
也许我需要把 my_char_mask_m128i 向量的每个字节转换为 4 个字节——该怎么做?
Perhaps I need to widen each byte of the vector my_char_mask_m128i to 4 bytes - how can I do that?
您正在寻找 SSE4.1 内部函数 _mm_cvtepi8_epi32(),它将 SSE 向量中的前 4 个(有符号)8 位整数符号扩展(sign-extend)为 32 位整数。再配合字节移位,把接下来的 4 个字节移到低位以供下一次扩展,就会得到类似下面的代码:
#include <iostream>
#include <cstdint>
#include <emmintrin.h>
#include <smmintrin.h>
/// Writes the four 32-bit lanes of `vec` to std::cout in the form
/// "[a, b, c, d]", lowest lane first, with no trailing newline.
void print_int4(__m128i vec) {
    // The aligned store needs a 16-byte-aligned destination.
    alignas(16) std::int32_t lanes[4];
    _mm_store_si128(reinterpret_cast<__m128i*>(lanes), vec);

    std::cout << '[';
    for (int i = 0; i < 4; ++i) {
        if (i != 0) {
            std::cout << ", ";
        }
        std::cout << lanes[i];
    }
    std::cout << ']';
}
// Demo: adds each signed 8-bit mask value to the matching 32-bit int,
// four lanes at a time, printing "ints + widened mask = sum" per group.
int main() {
    // Both arrays are 16-byte aligned so the aligned loads below are valid.
    alignas(16) std::int32_t my_int_sequence[16] = {
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
    alignas(16) std::int8_t my_char_mask[16] = {
        1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1};

    // All 16 mask bytes fit in one register; each pass consumes the low four.
    __m128i pending_mask =
        _mm_load_si128(reinterpret_cast<__m128i*>(&my_char_mask[0]));

    for (int offset = 0; offset < 16; offset += 4) {
        // Four consecutive ints starting at `offset`.
        const __m128i ints =
            _mm_load_si128(reinterpret_cast<__m128i*>(&my_int_sequence[offset]));

        // SSE4.1: sign-extend the low four 8-bit lanes to four 32-bit lanes.
        const __m128i widened = _mm_cvtepi8_epi32(pending_mask);
        const __m128i total = _mm_add_epi32(ints, widened);

        // Byte-shift right by 4 so the next four mask bytes become the low lanes.
        pending_mask = _mm_srli_si128(pending_mask, 4);

        print_int4(ints);
        std::cout << " + ";
        print_int4(widened);
        std::cout << " = ";
        print_int4(total);
        std::cout << '\n';
    }
}
示例(请注意,您通常必须告诉编译器生成 SSE4.1 指令——对于 g++ 和 clang++,请使用合适的 -march=XXXX 选项或 -msse4.1):
$ g++ -O -Wall -Wextra -std=gnu++11 -msse4.1 demo.cc
$ ./a.out
[0, 1, 2, 3] + [1, 0, 1, 1] = [1, 1, 3, 4]
[4, 5, 6, 7] + [0, 1, 0, 1] = [4, 6, 6, 8]
[8, 9, 10, 11] + [1, 1, 0, 1] = [9, 10, 10, 12]
[12, 13, 14, 15] + [0, 1, 0, 1] = [12, 14, 14, 16]
Peter Cordes 建议的替代版本(要求编译器足够新,支持 _mm_loadu_si32()):
// Loop through the 32-bit int array 4 at a time; n indexes both
// my_int_sequence and my_char_mask, which are declared outside this snippet
for (int n = 0; n < 16; n += 4) {
// Load the next 4 ints (aligned load: the address must be 16-byte aligned)
__m128i vec =
_mm_load_si128(reinterpret_cast<__m128i*>(my_int_sequence + n));
// Load only the next 4 chars (32 bits) instead of shifting a full 16-byte register
__m128i char_mask = _mm_loadu_si32(my_char_mask + n);
// Convert them to ints: sign-extend the low four 8-bit lanes to 32 bits (SSE4.1)
__m128i chars_to_add = _mm_cvtepi8_epi32(char_mask);
// And add together, lane by lane
__m128i sum = _mm_add_epi32(vec, chars_to_add);
// Do more stuff
}
请告诉我如何将同一类型的 SIMD 向量中的值相加,而这些向量中的值各自占用的字节数不同。
这是一个例子:
int main()
{
    // Sixteen ints do not fit in one 128-bit register (four per register,
    // assuming 32-bit int), so the array is read as four consecutive
    // 16-byte blocks using unaligned loads.
    int my_int_sequence[16] = { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 };
    const __m128i* int_blocks = reinterpret_cast<const __m128i*>(my_int_sequence);
    __m128i my_int_sequence_m128i_1 = _mm_loadu_si128(int_blocks + 0);  // elements 0..3
    __m128i my_int_sequence_m128i_2 = _mm_loadu_si128(int_blocks + 1);  // elements 4..7
    __m128i my_int_sequence_m128i_3 = _mm_loadu_si128(int_blocks + 2);  // elements 8..11
    __m128i my_int_sequence_m128i_4 = _mm_loadu_si128(int_blocks + 3);  // elements 12..15

    // Sixteen one-byte mask values fill exactly one 128-bit register.
    char my_char_mask[16] = { 1,0,1,1,0,1,0,1,1,1,0,1,0,1,0,1 };
    __m128i my_char_mask_m128i =
        _mm_loadu_si128(reinterpret_cast<const __m128i*>(my_char_mask));
}
也就是说,我在 my_int_sequence 数组中有一组 int 值——由于 16 个 int 值无法全部放入一个 __m128i 向量,所以我将它们每 4 个一组分别加载到 4 个 __m128i 向量中。
我还有一个 16 字节的数组,我同样将其加载到 my_char_mask_m128i 向量中。
现在我想把 my_char_mask_m128i 向量中对应的单字节值,加到 my_int_sequence_m128i_x 向量的每个 4 字节值上。
显然,问题在于我需要把宽度不同的元素相加。这可能吗?
也许我需要把 my_char_mask_m128i 向量的每个字节转换为 4 个字节——该怎么做?
Perhaps I need to widen each byte of the vector my_char_mask_m128i to 4 bytes - how can I do that?
您正在寻找 SSE4.1 内部函数 _mm_cvtepi8_epi32(),它将 SSE 向量中的前 4 个(有符号)8 位整数符号扩展(sign-extend)为 32 位整数。再配合字节移位,把接下来的 4 个字节移到低位以供下一次扩展,就会得到类似下面的代码:
#include <iostream>
#include <cstdint>
#include <emmintrin.h>
#include <smmintrin.h>
/// Writes the four 32-bit lanes of `vec` to std::cout in the form
/// "[a, b, c, d]", lowest lane first, with no trailing newline.
void print_int4(__m128i vec) {
    // The aligned store needs a 16-byte-aligned destination.
    alignas(16) std::int32_t lanes[4];
    _mm_store_si128(reinterpret_cast<__m128i*>(lanes), vec);

    std::cout << '[';
    for (int i = 0; i < 4; ++i) {
        if (i != 0) {
            std::cout << ", ";
        }
        std::cout << lanes[i];
    }
    std::cout << ']';
}
// Demo: adds each signed 8-bit mask value to the matching 32-bit int,
// four lanes at a time, printing "ints + widened mask = sum" per group.
int main() {
    // Both arrays are 16-byte aligned so the aligned loads below are valid.
    alignas(16) std::int32_t my_int_sequence[16] = {
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
    alignas(16) std::int8_t my_char_mask[16] = {
        1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1};

    // All 16 mask bytes fit in one register; each pass consumes the low four.
    __m128i pending_mask =
        _mm_load_si128(reinterpret_cast<__m128i*>(&my_char_mask[0]));

    for (int offset = 0; offset < 16; offset += 4) {
        // Four consecutive ints starting at `offset`.
        const __m128i ints =
            _mm_load_si128(reinterpret_cast<__m128i*>(&my_int_sequence[offset]));

        // SSE4.1: sign-extend the low four 8-bit lanes to four 32-bit lanes.
        const __m128i widened = _mm_cvtepi8_epi32(pending_mask);
        const __m128i total = _mm_add_epi32(ints, widened);

        // Byte-shift right by 4 so the next four mask bytes become the low lanes.
        pending_mask = _mm_srli_si128(pending_mask, 4);

        print_int4(ints);
        std::cout << " + ";
        print_int4(widened);
        std::cout << " = ";
        print_int4(total);
        std::cout << '\n';
    }
}
示例(请注意,您通常必须告诉编译器生成 SSE4.1 指令——对于 g++ 和 clang++,请使用合适的 -march=XXXX 选项或 -msse4.1):
$ g++ -O -Wall -Wextra -std=gnu++11 -msse4.1 demo.cc
$ ./a.out
[0, 1, 2, 3] + [1, 0, 1, 1] = [1, 1, 3, 4]
[4, 5, 6, 7] + [0, 1, 0, 1] = [4, 6, 6, 8]
[8, 9, 10, 11] + [1, 1, 0, 1] = [9, 10, 10, 12]
[12, 13, 14, 15] + [0, 1, 0, 1] = [12, 14, 14, 16]
Peter Cordes 建议的替代版本(要求编译器足够新,支持 _mm_loadu_si32()):
// Loop through the 32-bit int array 4 at a time; n indexes both
// my_int_sequence and my_char_mask, which are declared outside this snippet
for (int n = 0; n < 16; n += 4) {
// Load the next 4 ints (aligned load: the address must be 16-byte aligned)
__m128i vec =
_mm_load_si128(reinterpret_cast<__m128i*>(my_int_sequence + n));
// Load only the next 4 chars (32 bits) instead of shifting a full 16-byte register
__m128i char_mask = _mm_loadu_si32(my_char_mask + n);
// Convert them to ints: sign-extend the low four 8-bit lanes to 32 bits (SSE4.1)
__m128i chars_to_add = _mm_cvtepi8_epi32(char_mask);
// And add together, lane by lane
__m128i sum = _mm_add_epi32(vec, chars_to_add);
// Do more stuff
}