C/C++ 中使用 AVX2 的两个无符号字节向量的内积
Inner product of two unsigned byte vectors using AVX2 in C/C++
我想使用 SSE/AVX2 实现快速相关系数计算。操作数是两个 unsigned char
向量。该函数应该大致等同于:
float correlate_simple(const unsigned char* vec1, const unsigned char* vec2, size_t length)
{
int sum1 = 0;
int sum2 = 0;
int sum11 = 0;
int sum22 = 0;
int sum12 = 0;
for (size_t i = length; i > 0; --i, ++vec1, ++vec2) {
sum1 += *vec1;
sum2 += *vec2;
sum11 += *vec1 * *vec1;
sum22 += *vec2 * *vec2;
sum12 += *vec1 * *vec2;
}
double mean1 = double(sum1) / double(length);
double mean2 = double(sum2) / double(length);
double mean11 = double(sum11) / double(length);
double mean22 = double(sum22) / double(length);
double mean12 = double(sum12) / double(length);
double b = (mean11 - mean1 * mean1) * (mean22 - mean2 * mean2);
if (b <= 0.0)
return 0.0f;
double a = (mean12 - mean1 * mean2);
return float(a / sqrt(b));
}
参数length
取值范围为1到小于1000
为此,我研究了如何实现两个无符号字节数组的内积。但是,我无法想出一个不涉及将所有无符号 8 位值转换为有符号 16 位值的解决方案。
内在 _mm256_maddubs_epi16(a, b)
期望 b
是一个带符号的字节。在这种情况下这不是问题,因为从 b 中减去某个常数(此处:127)不会改变相关系数。不幸的是,我找不到一个允许我从无符号字节中减去 127 产生有符号字节的内在函数(不依赖于某些二进制补码魔法)。
// vec: const unsigned char*
auto x = _mm256_load_si256((const __m256i*) vec);
auto v = _mm256_set1_epi8(127);
// wrong if vec[i] is less than 127:
auto x_centered = _mm256_sub_epi8 (x, v);
计算内积(以及最终的相关系数)的最佳方法是什么?
附录:
下面是我目前实现的一个纯内积。我决定转换为 16 位整数以避免溢出错误。
更新:从一次读取 128 位更改为 256 位。
int accumulate_i32(__m256i x)
{
auto tmp1 = _mm256_srli_si256(x, 8);
x = _mm256_add_epi32(x, tmp1);
auto tmp2 = _mm256_extractf128_si256(x, 1);
tmp2 = _mm_add_epi32(tmp2, _mm256_castsi256_si128(x));
return _mm_cvtsi128_si32(tmp2) + _mm_extract_epi32(tmp2, 1);
}
int inner_product_avx(const unsigned char* vec1, const unsigned char* vec2, unsigned int length)
{
constexpr unsigned int memoryAlignmentBytes = 32;
constexpr unsigned int bytesPerPack = 256 / 8;
assert((reinterpret_cast<std::uintptr_t>(vec1) % memoryAlignmentBytes) == 0);
assert((reinterpret_cast<std::uintptr_t>(vec2) % memoryAlignmentBytes) == 0);
// compute middle part via AVX2
unsigned int packCount = length / bytesPerPack;
const __m256i zeros = _mm256_setzero_si256();
auto sumlo = _mm256_setzero_si256();
auto sumhi = _mm256_setzero_si256();
for (unsigned int packIdx = 0; packIdx < packCount; ++packIdx) {
auto x1 = _mm256_load_si256((const __m256i*)vec1);
auto x2 = _mm256_load_si256((const __m256i*)vec2);
auto x1lo = _mm256_unpacklo_epi8(x1, zeros);
auto x1hi = _mm256_unpackhi_epi8(x1, zeros);
auto x2lo = _mm256_unpacklo_epi8(x2, zeros);
auto x2hi = _mm256_unpackhi_epi8(x2, zeros);
auto tmplo = _mm256_madd_epi16(x1lo, x2lo);
auto tmphi = _mm256_madd_epi16(x1hi, x2hi);
sumlo = _mm256_add_epi32(sumlo, tmplo);
sumhi = _mm256_add_epi32(sumhi, tmphi);
vec1 += bytesPerPack;
vec2 += bytesPerPack;
}
int sum = accumulate_i32(sumlo) + accumulate_i32(sumhi);
// compute remaining part that cannot be represented as a
// whole packed integer
unsigned int packRestCount = length % bytesPerPack;
for (size_t i = packRestCount; i > 0; --i, ++vec1, ++vec2)
sum += int(*vec1) * int(*vec2);
return sum;
}
这大约占简单 C++ 实现时间的 20%(见下文)。考虑到 AVX 代码同时处理 16 个 16 位整数这一事实,我本以为会有更高的增益。 - 这是合理的还是我错过了什么?
展开 AVX 代码中的最后一个循环并没有减少计算时间。
int inner_product_simple(const unsigned char* vec1, const unsigned char* vec2, size_t length)
{
int sum = 0;
for (size_t i = length; i > 0; --i, ++vec1, ++vec2)
sum += int(*vec1) * int(*vec2);
return sum;
}
Unfortunately I could not find an intrinsic that would allow me to subtract 127 from unsigned bytes producing signed bytes (not relying on some two's complement magic).
Intel x86 CPU 使用 2 的有符号数的补码表示,这就是 signed/unsigned 压缩整数没有单独版本的 SIMD 内在函数的原因。英特尔 SIMD 内部函数不在 C++ 标准的范围内,并且具有特定的 well-defined 行为。
我会从类似的事情开始。它像您当前的代码一样使用 32 位累加器。未经测试。
namespace
{
// Compute sum of the 64-bit lanes, convert to double
inline double hadd_epi64( const __m256i i64 )
{
__m128i res = _mm256_castsi256_si128( i64 );
res = _mm_add_epi64( res, _mm256_extractf128_si256( i64, 1 ) );
res = _mm_add_epi64( res, _mm_unpackhi_epi64( res, res ) );
return (double)_mm_cvtsi128_si64( res );
}
// Convert 32-bit lanes into 64-bit, compute sum of the 8, convert to double
inline double hadd_epu32( __m256i x )
{
const __m256i zero = _mm256_setzero_si256();
__m256i i64 = _mm256_unpacklo_epi32( x, zero );
i64 = _mm256_add_epi64( i64, _mm256_unpackhi_epi32( x, zero ) );
return hadd_epi64( i64 );
};
}
class InnerProduct
{
// These fields are interpreted as 64-bit integers
__m256i a, b;
// These fields are interpreted as 32-bit integers
__m256i aa, bb, ab;
// Accumulate products of 16-bit numbers, 16 of them at once
inline void add16( __m256i x, __m256i y )
{
const __m256i x2 = _mm256_madd_epi16( x, x );
const __m256i y2 = _mm256_madd_epi16( y, y );
const __m256i prod = _mm256_madd_epi16( x, y );
aa = _mm256_add_epi32( aa, x2 );
bb = _mm256_add_epi32( bb, y2 );
ab = _mm256_add_epi32( ab, prod );
}
public:
InnerProduct()
{
a = b = aa = bb = ab = _mm256_setzero_si256();
}
// Handle 32 bytes
inline void addBytes( __m256i x, __m256i y )
{
// Accumulate values
const __m256i zero = _mm256_setzero_si256();
a = _mm256_add_epi64( a, _mm256_sad_epu8( x, zero ) );
b = _mm256_add_epi64( b, _mm256_sad_epu8( y, zero ) );
// Split the vectors into 2 sets of 16-bit numbers, accumulate products
const __m256i z = _mm256_unpacklo_epi8( x, zero );
const __m256i w = _mm256_unpacklo_epi8( y, zero );
add16( z, w );
x = _mm256_unpackhi_epi8( x, zero );
y = _mm256_unpackhi_epi8( y, zero );
add16( x, y );
}
// Compute the result
float compute( size_t count ) const
{
const double div = (double)count;
const double mean1 = hadd_epi64( a ) / div;
const double mean2 = hadd_epi64( b ) / div;
const double mean11 = hadd_epu32( aa ) / div;
const double mean22 = hadd_epu32( bb ) / div;
const double mean12 = hadd_epu32( ab ) / div;
const double b = ( mean11 - mean1 * mean1 ) * ( mean22 - mean2 * mean2 );
if( b <= 0 )
return 0;
const double a = ( mean12 - mean1 * mean2 );
return float( a / sqrt( b ) );
}
};
// Load 1-31 bytes into AVX register, zero out unused higher bytes
inline __m256i loadPartial( const uint8_t* p, size_t length )
{
alignas( 32 ) std::array<uint8_t, 32> arr{};
memcpy( arr.data(), p, length );
return _mm256_load_si256( ( const __m256i* )arr.data() );
}
float correlate_simple( const uint8_t* vec1, const uint8_t* vec2, const size_t length )
{
InnerProduct ip;
const uint8_t* const vec1End = vec1 + ( ( length / 32 ) * 32 );
for( ; vec1 < vec1End; vec1 += 32, vec2 += 32 )
{
const __m256i x = _mm256_loadu_si256( ( const __m256i * )vec1 );
const __m256i y = _mm256_loadu_si256( ( const __m256i * )vec2 );
ip.addBytes( x, y );
}
const size_t remainder = length % 32;
if( remainder > 0 )
{
const __m256i x = loadPartial( vec1, remainder );
const __m256i y = loadPartial( vec2, remainder );
ip.addBytes( x, y );
}
return ip.compute( length );
}
我想使用 SSE/AVX2 实现快速相关系数计算。操作数是两个 unsigned char
向量。该函数应该大致等同于:
float correlate_simple(const unsigned char* vec1, const unsigned char* vec2, size_t length)
{
int sum1 = 0;
int sum2 = 0;
int sum11 = 0;
int sum22 = 0;
int sum12 = 0;
for (size_t i = length; i > 0; --i, ++vec1, ++vec2) {
sum1 += *vec1;
sum2 += *vec2;
sum11 += *vec1 * *vec1;
sum22 += *vec2 * *vec2;
sum12 += *vec1 * *vec2;
}
double mean1 = double(sum1) / double(length);
double mean2 = double(sum2) / double(length);
double mean11 = double(sum11) / double(length);
double mean22 = double(sum22) / double(length);
double mean12 = double(sum12) / double(length);
double b = (mean11 - mean1 * mean1) * (mean22 - mean2 * mean2);
if (b <= 0.0)
return 0.0f;
double a = (mean12 - mean1 * mean2);
return float(a / sqrt(b));
}
参数length
取值范围为1到小于1000
为此,我研究了如何实现两个无符号字节数组的内积。但是,我无法想出一个不涉及将所有无符号 8 位值转换为有符号 16 位值的解决方案。
内在 _mm256_maddubs_epi16(a, b)
期望 b
是一个带符号的字节。在这种情况下这不是问题,因为从 b 中减去某个常数(此处:127)不会改变相关系数。不幸的是,我找不到一个允许我从无符号字节中减去 127 产生有符号字节的内在函数(不依赖于某些二进制补码魔法)。
// vec: const unsigned char*
auto x = _mm256_load_si256((const __m256i*) vec);
auto v = _mm256_set1_epi8(127);
// wrong if vec[i] is less than 127:
auto x_centered = _mm256_sub_epi8 (x, v);
计算内积(以及最终的相关系数)的最佳方法是什么?
附录:
下面是我目前实现的一个纯内积。我决定转换为 16 位整数以避免溢出错误。 更新:从一次读取 128 位更改为 256 位。
int accumulate_i32(__m256i x)
{
auto tmp1 = _mm256_srli_si256(x, 8);
x = _mm256_add_epi32(x, tmp1);
auto tmp2 = _mm256_extractf128_si256(x, 1);
tmp2 = _mm_add_epi32(tmp2, _mm256_castsi256_si128(x));
return _mm_cvtsi128_si32(tmp2) + _mm_extract_epi32(tmp2, 1);
}
int inner_product_avx(const unsigned char* vec1, const unsigned char* vec2, unsigned int length)
{
constexpr unsigned int memoryAlignmentBytes = 32;
constexpr unsigned int bytesPerPack = 256 / 8;
assert((reinterpret_cast<std::uintptr_t>(vec1) % memoryAlignmentBytes) == 0);
assert((reinterpret_cast<std::uintptr_t>(vec2) % memoryAlignmentBytes) == 0);
// compute middle part via AVX2
unsigned int packCount = length / bytesPerPack;
const __m256i zeros = _mm256_setzero_si256();
auto sumlo = _mm256_setzero_si256();
auto sumhi = _mm256_setzero_si256();
for (unsigned int packIdx = 0; packIdx < packCount; ++packIdx) {
auto x1 = _mm256_load_si256((const __m256i*)vec1);
auto x2 = _mm256_load_si256((const __m256i*)vec2);
auto x1lo = _mm256_unpacklo_epi8(x1, zeros);
auto x1hi = _mm256_unpackhi_epi8(x1, zeros);
auto x2lo = _mm256_unpacklo_epi8(x2, zeros);
auto x2hi = _mm256_unpackhi_epi8(x2, zeros);
auto tmplo = _mm256_madd_epi16(x1lo, x2lo);
auto tmphi = _mm256_madd_epi16(x1hi, x2hi);
sumlo = _mm256_add_epi32(sumlo, tmplo);
sumhi = _mm256_add_epi32(sumhi, tmphi);
vec1 += bytesPerPack;
vec2 += bytesPerPack;
}
int sum = accumulate_i32(sumlo) + accumulate_i32(sumhi);
// compute remaining part that cannot be represented as a
// whole packed integer
unsigned int packRestCount = length % bytesPerPack;
for (size_t i = packRestCount; i > 0; --i, ++vec1, ++vec2)
sum += int(*vec1) * int(*vec2);
return sum;
}
这大约占简单 C++ 实现时间的 20%(见下文)。考虑到 AVX 代码同时处理 16 个 16 位整数这一事实,我本以为会有更高的增益。 - 这是合理的还是我错过了什么?
展开 AVX 代码中的最后一个循环并没有减少计算时间。
int inner_product_simple(const unsigned char* vec1, const unsigned char* vec2, size_t length)
{
int sum = 0;
for (size_t i = length; i > 0; --i, ++vec1, ++vec2)
sum += int(*vec1) * int(*vec2);
return sum;
}
Unfortunately I could not find an intrinsic that would allow me to subtract 127 from unsigned bytes producing signed bytes (not relying on some two's complement magic).
Intel x86 CPU 使用 2 的有符号数的补码表示,这就是 signed/unsigned 压缩整数没有单独版本的 SIMD 内在函数的原因。英特尔 SIMD 内部函数不在 C++ 标准的范围内,并且具有特定的 well-defined 行为。
我会从类似的事情开始。它像您当前的代码一样使用 32 位累加器。未经测试。
namespace
{
// Compute sum of the 64-bit lanes, convert to double
inline double hadd_epi64( const __m256i i64 )
{
__m128i res = _mm256_castsi256_si128( i64 );
res = _mm_add_epi64( res, _mm256_extractf128_si256( i64, 1 ) );
res = _mm_add_epi64( res, _mm_unpackhi_epi64( res, res ) );
return (double)_mm_cvtsi128_si64( res );
}
// Convert 32-bit lanes into 64-bit, compute sum of the 8, convert to double
inline double hadd_epu32( __m256i x )
{
const __m256i zero = _mm256_setzero_si256();
__m256i i64 = _mm256_unpacklo_epi32( x, zero );
i64 = _mm256_add_epi64( i64, _mm256_unpackhi_epi32( x, zero ) );
return hadd_epi64( i64 );
};
}
class InnerProduct
{
// These fields are interpreted as 64-bit integers
__m256i a, b;
// These fields are interpreted as 32-bit integers
__m256i aa, bb, ab;
// Accumulate products of 16-bit numbers, 16 of them at once
inline void add16( __m256i x, __m256i y )
{
const __m256i x2 = _mm256_madd_epi16( x, x );
const __m256i y2 = _mm256_madd_epi16( y, y );
const __m256i prod = _mm256_madd_epi16( x, y );
aa = _mm256_add_epi32( aa, x2 );
bb = _mm256_add_epi32( bb, y2 );
ab = _mm256_add_epi32( ab, prod );
}
public:
InnerProduct()
{
a = b = aa = bb = ab = _mm256_setzero_si256();
}
// Handle 32 bytes
inline void addBytes( __m256i x, __m256i y )
{
// Accumulate values
const __m256i zero = _mm256_setzero_si256();
a = _mm256_add_epi64( a, _mm256_sad_epu8( x, zero ) );
b = _mm256_add_epi64( b, _mm256_sad_epu8( y, zero ) );
// Split the vectors into 2 sets of 16-bit numbers, accumulate products
const __m256i z = _mm256_unpacklo_epi8( x, zero );
const __m256i w = _mm256_unpacklo_epi8( y, zero );
add16( z, w );
x = _mm256_unpackhi_epi8( x, zero );
y = _mm256_unpackhi_epi8( y, zero );
add16( x, y );
}
// Compute the result
float compute( size_t count ) const
{
const double div = (double)count;
const double mean1 = hadd_epi64( a ) / div;
const double mean2 = hadd_epi64( b ) / div;
const double mean11 = hadd_epu32( aa ) / div;
const double mean22 = hadd_epu32( bb ) / div;
const double mean12 = hadd_epu32( ab ) / div;
const double b = ( mean11 - mean1 * mean1 ) * ( mean22 - mean2 * mean2 );
if( b <= 0 )
return 0;
const double a = ( mean12 - mean1 * mean2 );
return float( a / sqrt( b ) );
}
};
// Load 1-31 bytes into AVX register, zero out unused higher bytes
inline __m256i loadPartial( const uint8_t* p, size_t length )
{
alignas( 32 ) std::array<uint8_t, 32> arr{};
memcpy( arr.data(), p, length );
return _mm256_load_si256( ( const __m256i* )arr.data() );
}
float correlate_simple( const uint8_t* vec1, const uint8_t* vec2, const size_t length )
{
InnerProduct ip;
const uint8_t* const vec1End = vec1 + ( ( length / 32 ) * 32 );
for( ; vec1 < vec1End; vec1 += 32, vec2 += 32 )
{
const __m256i x = _mm256_loadu_si256( ( const __m256i * )vec1 );
const __m256i y = _mm256_loadu_si256( ( const __m256i * )vec2 );
ip.addBytes( x, y );
}
const size_t remainder = length % 32;
if( remainder > 0 )
{
const __m256i x = loadPartial( vec1, remainder );
const __m256i y = loadPartial( vec2, remainder );
ip.addBytes( x, y );
}
return ip.compute( length );
}