SSE4.1 考虑溢出的无符号整数比较
SSE4.1 unsigned integer comparison with overflow
考虑到 16 位无符号加法(_mm_add_epi16())可能溢出,有没有办法用 SSE2/4.1 指令完成 C >= (A + B) 的比较?
代码片段大致如下:
#define _mm_cmpge_epu16(a, b) _mm_cmpeq_epi16(_mm_max_epu16(a, b), a)
__m128i *a = (__m128i *)&ptr1;
__m128i *b = (__m128i *)&ptr2;
__m128i *c = (__m128i *)&ptr3;
_m128i xa = _mm_lddqu_si128(a);
_m128i xb = _mm_lddqu_si128(b);
_m128i xc = _mm_lddqu_si128(c);
_m128i res = _mm_add_epi16(xa, xb);
_m128i xmm3 = _mm_cmpge_epu16(xc, res);
问题在于,当 16 位加法溢出(回绕)时,大于等于比较会产生误报(假阳性)。出于我的需求,我不能使用饱和加法。我在 "SSE2 integer overflow checking" 这个问题中看到了检测无符号加法溢出的方法,但我该如何把它用于大于等于比较呢?
这里有一些合理的方法:
#include <cstdint>
// GCC/Clang vector extension type: a 16-byte vector, i.e. eight uint16_t lanes.
typedef uint16_t v8u16 __attribute__((vector_size(16)));
// Lane-wise unsigned test of c >= a + b that never forms the sum.
// Where c >= a holds, c - a cannot wrap and can be compared against b directly;
// lanes with c < a are masked off by the first comparison.
// Each result lane is all-ones when the test holds and zero otherwise.
v8u16 lthsum1(v8u16 a, v8u16 b, v8u16 c) {
    const auto coversA = c >= a;
    const auto coversB = (c - a) >= b;
    return coversA & coversB;
}
// Lane-wise unsigned test of c >= a + b using the wrapped sum.
// The sum wrapped around exactly when it is smaller than a, so those lanes are
// rejected; the remaining lanes are compared against c.
// Each result lane is all-ones when the test holds and zero otherwise.
v8u16 lthsum2(v8u16 a, v8u16 b, v8u16 c) {
    const v8u16 sum = a + b;
    const auto noWrap = sum >= a;
    const auto withinC = sum <= c;
    return noWrap & withinC;
}
你可以在 godbolt 上查看它们的编译结果。两种方法大体等价;使用 gcc 时,加上 -msse4.1 并没有带来明显变化,但 AVX2 及更高的指令集确实能改进生成的代码。对于第二种写法,clang 在启用 sse4.1 后也有小幅改进。而在 AVX512BW 下,clang 生成的代码本身就已经很好。
你可以利用指令集中已有的指令来构建缺失的原语。
下面是一种可能的实现方式(未经测试),另附反汇编(Disassembly)。
// Compare uint16_t lanes for a >= b (unsigned).
// Returns 0xFFFF in every lane where a >= b and 0 in all other lanes.
// Needs only SSE2: the saturating difference b - a clamps to zero exactly when b <= a.
// `static inline` gives the helper internal linkage, so a C build never needs a
// separate external definition when a call is not inlined.
static inline __m128i cmpge_epu16( __m128i a, __m128i b )
{
    const __m128i deficit = _mm_subs_epu16( b, a );
    return _mm_cmpeq_epi16( deficit, _mm_setzero_si128() );
}
// Lane-wise unsigned 16-bit test of c >= a + b, where an overflowing a + b
// always counts as "greater than c".
// A result lane is set only when the sum did not overflow and c is at least the sum.
__m128i cmpgeSum( __m128i a, __m128i b, __m128i c )
{
    // Wrapping and saturating sums agree exactly in the lanes that did not overflow
    const __m128i wrapped = _mm_add_epi16( a, b );
    const __m128i saturated = _mm_adds_epu16( a, b );
    const __m128i noOverflow = _mm_cmpeq_epi16( wrapped, saturated );
    // Plain comparison against the wrapped sum; only meaningful where noOverflow is set
    const __m128i cCoversSum = cmpge_epu16( c, wrapped );
    // Keep the comparison result only for the lanes that did not overflow
    return _mm_and_si128( noOverflow, cCoversSum );
}
考虑到 16 位无符号加法(_mm_add_epi16())可能溢出,有没有办法用 SSE2/4.1 指令完成 C >= (A + B) 的比较?
代码片段大致如下:
#define _mm_cmpge_epu16(a, b) _mm_cmpeq_epi16(_mm_max_epu16(a, b), a)
__m128i *a = (__m128i *)&ptr1;
__m128i *b = (__m128i *)&ptr2;
__m128i *c = (__m128i *)&ptr3;
_m128i xa = _mm_lddqu_si128(a);
_m128i xb = _mm_lddqu_si128(b);
_m128i xc = _mm_lddqu_si128(c);
_m128i res = _mm_add_epi16(xa, xb);
_m128i xmm3 = _mm_cmpge_epu16(xc, res);
问题在于,当 16 位加法溢出(回绕)时,大于等于比较会产生误报(假阳性)。出于我的需求,我不能使用饱和加法。我在 "SSE2 integer overflow checking" 这个问题中看到了检测无符号加法溢出的方法,但我该如何把它用于大于等于比较呢?
这里有一些合理的方法:
#include <cstdint>
// GCC/Clang vector extension type: a 16-byte vector, i.e. eight uint16_t lanes.
typedef uint16_t v8u16 __attribute__((vector_size(16)));
// Lane-wise unsigned test of c >= a + b that never forms the sum.
// Where c >= a holds, c - a cannot wrap and can be compared against b directly;
// lanes with c < a are masked off by the first comparison.
// Each result lane is all-ones when the test holds and zero otherwise.
v8u16 lthsum1(v8u16 a, v8u16 b, v8u16 c) {
    const auto coversA = c >= a;
    const auto coversB = (c - a) >= b;
    return coversA & coversB;
}
// Lane-wise unsigned test of c >= a + b using the wrapped sum.
// The sum wrapped around exactly when it is smaller than a, so those lanes are
// rejected; the remaining lanes are compared against c.
// Each result lane is all-ones when the test holds and zero otherwise.
v8u16 lthsum2(v8u16 a, v8u16 b, v8u16 c) {
    const v8u16 sum = a + b;
    const auto noWrap = sum >= a;
    const auto withinC = sum <= c;
    return noWrap & withinC;
}
你可以在 godbolt 上查看它们的编译结果。两种方法大体等价;使用 gcc 时,加上 -msse4.1 并没有带来明显变化,但 AVX2 及更高的指令集确实能改进生成的代码。对于第二种写法,clang 在启用 sse4.1 后也有小幅改进。而在 AVX512BW 下,clang 生成的代码本身就已经很好。
你可以利用指令集中已有的指令来构建缺失的原语。下面是一种可能的实现方式(未经测试),另附反汇编(Disassembly)。
// Compare uint16_t lanes for a >= b (unsigned).
// Returns 0xFFFF in every lane where a >= b and 0 in all other lanes.
// Needs only SSE2: the saturating difference b - a clamps to zero exactly when b <= a.
// `static inline` gives the helper internal linkage, so a C build never needs a
// separate external definition when a call is not inlined.
static inline __m128i cmpge_epu16( __m128i a, __m128i b )
{
    const __m128i deficit = _mm_subs_epu16( b, a );
    return _mm_cmpeq_epi16( deficit, _mm_setzero_si128() );
}
// Lane-wise unsigned 16-bit test of c >= a + b, where an overflowing a + b
// always counts as "greater than c".
// A result lane is set only when the sum did not overflow and c is at least the sum.
__m128i cmpgeSum( __m128i a, __m128i b, __m128i c )
{
    // Wrapping and saturating sums agree exactly in the lanes that did not overflow
    const __m128i wrapped = _mm_add_epi16( a, b );
    const __m128i saturated = _mm_adds_epu16( a, b );
    const __m128i noOverflow = _mm_cmpeq_epi16( wrapped, saturated );
    // Plain comparison against the wrapped sum; only meaningful where noOverflow is set
    const __m128i cCoversSum = cmpge_epu16( c, wrapped );
    // Keep the comparison result only for the lanes that did not overflow
    return _mm_and_si128( noOverflow, cCoversSum );
}