如何在sse2上模拟pcmpgtq?

How to simulate pcmpgtq on sse2?

PCMPGTQ 是在 sse4.2 中引入的,它为产生掩码的 64 位数字提供大于符号的比较。

如何在早于 sse4.2 的指令集上支持这一功能?

更新:同样的问题适用于带有 Neon 的 ARMv7,它也缺少 64 位比较器。可以在这里找到姐妹问题:

下面的代码演示了可以使用预先存在的 SSE 指令通过第一原则来模拟所需的功能:带符号的“小于”谓词为真相当于溢出标志与减法后的符号标志不同。显然我们没有标志,因此也需要模拟标志,将结果谓词留在每个四字的最高有效位 (MSB) 中。在第二步中,我们可以将 MSB 扩展为四字大小的掩码。

由此产生的相当冗长的代码很好地表明,从性能的角度来看,这可能不是模拟所需功能的最佳方式,即使它在功能上是正确的。 Compiler Explorer (godbolt.org) 在使用 Clang 11 编译时显示十二条指令(请参阅下面的汇编列表)。

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include "nmmintrin.h"

#define  USE_SSE42_REF (0)
#define  NBR_OF_TESTS  (1000000000)

#if USE_SSE42_REF
__m128i pcmpgtq_ref (__m128i a, __m128i b)
{
    return _mm_cmpgt_epi64 (a, b);
}
#else // USE_SSE42_REF
__m128i pcmpgtq_ref (__m128i a, __m128i b)
{
    __m128i r;
    struct {
        int64_t lo;
        int64_t hi;
    } hilo_a, hilo_b, hilo_r;
    memcpy (&hilo_a, &a, sizeof hilo_a);
    memcpy (&hilo_b, &b, sizeof hilo_b);
    hilo_r.lo = hilo_a.lo > hilo_b.lo ? (-1LL) : 0LL;
    hilo_r.hi = hilo_a.hi > hilo_b.hi ? (-1LL) : 0LL;
    memcpy (&r, &hilo_r, sizeof r);
    return r;
}
#endif // USE_SSE42_REF

/* "signed less than" == (OF != SF); compute predicate in MSB of each byte */
__m128i ltq_core (__m128i a, __m128i b)
{
    __m128i m = _mm_set1_epi64x (0x7fffffffffffffffULL);
    __m128i c = _mm_and_si128 (b, m);
    __m128i d = _mm_andnot_si128 (a, m);
    __m128i t = _mm_add_epi64 (c, d);
    __m128i s = _mm_xor_si128 (a, b);
    __m128i x = _mm_xor_si128 (a, t);
    __m128i y = _mm_and_si128 (x, s);
    __m128i r = _mm_xor_si128 (y, t);
    return r;
}

/* extend sign bits into mask, quadword-wise */
__m128i q_sign_to_mask (__m128i a)
{
    __m128i q = _mm_set1_epi64x (0);
    __m128i s = _mm_srli_epi64 (a, 63);
    __m128i r = _mm_sub_epi64 (q, s);
    return r;
}

__m128i pcmpltq (__m128i a, __m128i b)
{
    return q_sign_to_mask (ltq_core (a, b));
}

__m128i pcmpgtq (__m128i a, __m128i b)
{
    return pcmpltq (b, a);
}

/*
  https://groups.google.com/forum/#!original/comp.lang.c/qFv18ql_WlU/IK8KGZZFJx4J
  From: geo <gmars...@gmail.com>
  Newsgroups: sci.math,comp.lang.c,comp.lang.fortran
  Subject: 64-bit KISS RNGs
  Date: Sat, 28 Feb 2009 04:30:48 -0800 (PST)

  This 64-bit KISS RNG has three components, each nearly
  good enough to serve alone.    The components are:
  Multiply-With-Carry (MWC), period (2^121+2^63-1)
  Xorshift (XSH), period 2^64-1
  Congruential (CNG), period 2^64
*/

static uint64_t kiss64_x = 1234567890987654321ULL;
static uint64_t kiss64_c = 123456123456123456ULL;
static uint64_t kiss64_y = 362436362436362436ULL;
static uint64_t kiss64_z = 1066149217761810ULL;
static uint64_t kiss64_t;

#define MWC64  (kiss64_t = (kiss64_x << 58) + kiss64_c, \
                kiss64_c = (kiss64_x >> 6), kiss64_x += kiss64_t, \
                kiss64_c += (kiss64_x < kiss64_t), kiss64_x)
#define XSH64  (kiss64_y ^= (kiss64_y << 13), kiss64_y ^= (kiss64_y >> 17), \
                kiss64_y ^= (kiss64_y << 43))
#define CNG64  (kiss64_z = 6906969069ULL * kiss64_z + 1234567ULL)
#define KISS64 (MWC64 + XSH64 + CNG64)

int main (void)
{
    struct {
        uint64_t lo;
        uint64_t hi;
    } hilo_a, hilo_b, hilo_res, hilo_ref;

    for (int i = 0; i < NBR_OF_TESTS; i++) {
        uint64_t al = KISS64;
        uint64_t ah = KISS64;
        uint64_t bl = KISS64;
        uint64_t bh = KISS64;

        if ((i & 0xff) == 0x00) bl = al; // increase chance of equality
        if ((i & 0xff) == 0xff) bh = ah; // increase chance of equality

        __m128i a = _mm_set_epi64x (ah, al);
        __m128i b = _mm_set_epi64x (bh, bl);

        __m128i res = pcmpgtq (a, b);
        __m128i ref = pcmpgtq_ref (a, b);
        
        memcpy (&hilo_res, &res, sizeof hilo_res);
        memcpy (&hilo_ref, &ref, sizeof hilo_ref);

        if ((hilo_res.hi != hilo_ref.hi) || (hilo_res.lo != hilo_ref.lo)) {
            memcpy (&hilo_a, &a, sizeof hilo_a);
            memcpy (&hilo_b, &b, sizeof hilo_b); 
            printf ("error: a=%016llx_%016llx  b=%016llx_%016llx  res=%016llx_%016llx  ref=%016llx_%016llx\n",
                    hilo_a.hi, hilo_a.lo, hilo_b.hi, hilo_b.lo,
                    hilo_res.hi, hilo_res.lo, hilo_ref.hi, hilo_ref.lo);
            return EXIT_FAILURE;
        }
    }
    return EXIT_SUCCESS;
}

使用 -msse2 编译时,Clang 11 的输出如下所示:

.LCPI4_0:
        .quad   9223372036854775807             # 0x7fffffffffffffff
        .quad   9223372036854775807             # 0x7fffffffffffffff
pcmpgtq:                                # @pcmpgtq
        movdqa  xmm2, xmmword ptr [rip + .LCPI4_0] # xmm2 = [9223372036854775807,9223372036854775807]
        movdqa  xmm3, xmm1
        pxor    xmm3, xmm0
        pand    xmm0, xmm2
        movdqa  xmm4, xmm1
        pandn   xmm4, xmm2
        paddq   xmm4, xmm0
        pand    xmm1, xmm3
        pandn   xmm3, xmm4
        por     xmm3, xmm1
        psrad   xmm3, 31
        pshufd  xmm0, xmm3, 245                 # xmm0 = xmm3[1,1,3,3]
        ret

我不确定这是否是最佳输出,但这是来自 Clang 的 x64 输出。我也采用了相同的实现并将其转换为支持带有 neon 的 armv7。

查看 Godbolt 和下面的程序集:

.LCPI0_0:
        .quad   2147483648                      # 0x80000000
        .quad   2147483648                      # 0x80000000
cmpgtq_sim(long __vector(2), long __vector(2)):                  # @cmpgtq_sim(long __vector(2), long __vector(2))
        movdqa  xmm2, xmmword ptr [rip + .LCPI0_0] # xmm2 = [2147483648,2147483648]
        pxor    xmm1, xmm2
        pxor    xmm0, xmm2
        movdqa  xmm2, xmm0
        pcmpgtd xmm2, xmm1
        pshufd  xmm3, xmm2, 160                 # xmm3 = xmm2[0,0,2,2]
        pcmpeqd xmm0, xmm1
        pshufd  xmm1, xmm0, 245                 # xmm1 = xmm0[1,1,3,3]
        pand    xmm1, xmm3
        pshufd  xmm0, xmm2, 245                 # xmm0 = xmm2[1,1,3,3]
        por     xmm0, xmm1

A​​RMv7+霓虹灯

cmpgtq_sim(__simd128_int64_t, __simd128_int64_t):
        vldr    d16, .L3
        vldr    d17, .L3+8
        veor    q0, q0, q8
        veor    q1, q1, q8
        vcgt.s32        q8, q0, q1
        vceq.i32        q1, q0, q1
        vmov    q9, q8  @ v4si
        vmov    q10, q1  @ v4si
        vtrn.32 q9, q8
        vmov    q0, q8  @ v4si
        vmov    q8, q1  @ v4si
        vtrn.32 q10, q8
        vand    q8, q8, q9
        vorr    q0, q8, q0
        bx      lr
.L3:
        .word   -2147483648
        .word   0
        .word   -2147483648
        .word   0
__m128i pcmpgtq_sse2 (__m128i a, __m128i b) {
    __m128i r = _mm_and_si128(_mm_cmpeq_epi32(a, b), _mm_sub_epi64(b, a));
    r = _mm_or_si128(r, _mm_cmpgt_epi32(a, b));
    return _mm_shuffle_epi32(r, _MM_SHUFFLE(3,3,1,1));
}

我们有 32 位带符号的比较内在函数,因此将打包的 qword 拆分为 dword 对。

如果 a 中的高位双字大于 b 中的高位双字,则无需比较低位双字。

if (a.hi > b.hi) { r.hi = 0xFFFFFFFF; }
if (a.hi <= b.hi) { r.hi = 0x00000000; }

如果 a 中的高位双字等于 b 中的高位双字,则 64 位减法将清除或设置结果的所有 32 位高位(如果高位双字相等,然后它们相互“抵消”,实际上是低位双字的无符号比较,将结果放在高位双字中。

if (a.hi == b.hi) { r = (b - a) & 0xFFFFFFFF00000000; }

将高32位的比较掩码复制到低32位。

r.lo = r.hi

更新:这是 SSE2 和 ARMv7+Neon 的 Godbolt