通过 avx 指令向量化间接访问

Question

我最近了解到矢量指令（理论上），我对如何使用它们来加速我的应用程序感到很兴奋。

我想改进的一个领域是一个非常热的循环：

__declspec(noinline) void pleaseVectorize(int* arr, int* someGlobalArray, int* output)
{
    for (int i = 0; i < 16; ++i)
    {
        auto someIndex = arr[i];
        output[i] = someGlobalArray[someIndex];
    }

    for (int i = 0; i < 16; ++i)
    {
         if (output[i] == 1)
         {
             return i;
         }
    }

    return -1;
}

但是，当然，所有 3 个主要编译器（msvc、gcc、clang）都拒绝对其进行矢量化。我能理解为什么，但我想得到确认。

如果我必须手动对其进行矢量化，它将是：

(1) VectorLoad "arr"，这会带来 16 个 4 字节整数，让我们假设为 zmm0

(2) 16 内存从zmm0[0..3]指向的地址加载到zmm1[0..3]，从zmm0[4..7]指向的地址加载到zmm1[4 ..7]等等等等

(3)比较zmm0和zmm1

(4) 向量 popcnt 到输出中以找出最高有效位并将其除以 8 以获得匹配的索引

首先，矢量指令可以做这些事情吗？就像他们可以执行此 "gathering" 操作，即从指向 zmm0 的地址进行加载吗？

这是 clang 生成的内容：

0000000000400530 <_Z5superPiS_S_>:
  400530:       48 63 07                movslq (%rdi),%rax
  400533:       8b 04 86                mov    (%rsi,%rax,4),%eax
  400536:       89 02                   mov    %eax,(%rdx)
  400538:       48 63 47 04             movslq 0x4(%rdi),%rax
  40053c:       8b 04 86                mov    (%rsi,%rax,4),%eax
  40053f:       89 42 04                mov    %eax,0x4(%rdx)
  400542:       48 63 47 08             movslq 0x8(%rdi),%rax
  400546:       8b 04 86                mov    (%rsi,%rax,4),%eax
  400549:       89 42 08                mov    %eax,0x8(%rdx)
  40054c:       48 63 47 0c             movslq 0xc(%rdi),%rax
  400550:       8b 04 86                mov    (%rsi,%rax,4),%eax
  400553:       89 42 0c                mov    %eax,0xc(%rdx)
  400556:       48 63 47 10             movslq 0x10(%rdi),%rax
  40055a:       8b 04 86                mov    (%rsi,%rax,4),%eax
  40055d:       89 42 10                mov    %eax,0x10(%rdx)
  400560:       48 63 47 14             movslq 0x14(%rdi),%rax
  400564:       8b 04 86                mov    (%rsi,%rax,4),%eax
  400567:       89 42 14                mov    %eax,0x14(%rdx)
  40056a:       48 63 47 18             movslq 0x18(%rdi),%rax
  40056e:       8b 04 86                mov    (%rsi,%rax,4),%eax
  400571:       89 42 18                mov    %eax,0x18(%rdx)
  400574:       48 63 47 1c             movslq 0x1c(%rdi),%rax
  400578:       8b 04 86                mov    (%rsi,%rax,4),%eax
  40057b:       89 42 1c                mov    %eax,0x1c(%rdx)
  40057e:       48 63 47 20             movslq 0x20(%rdi),%rax
  400582:       8b 04 86                mov    (%rsi,%rax,4),%eax
  400585:       89 42 20                mov    %eax,0x20(%rdx)
  400588:       48 63 47 24             movslq 0x24(%rdi),%rax
  40058c:       8b 04 86                mov    (%rsi,%rax,4),%eax
  40058f:       89 42 24                mov    %eax,0x24(%rdx)
  400592:       48 63 47 28             movslq 0x28(%rdi),%rax
  400596:       8b 04 86                mov    (%rsi,%rax,4),%eax
  400599:       89 42 28                mov    %eax,0x28(%rdx)
  40059c:       48 63 47 2c             movslq 0x2c(%rdi),%rax
  4005a0:       8b 04 86                mov    (%rsi,%rax,4),%eax
  4005a3:       89 42 2c                mov    %eax,0x2c(%rdx)
  4005a6:       48 63 47 30             movslq 0x30(%rdi),%rax
  4005aa:       8b 04 86                mov    (%rsi,%rax,4),%eax
  4005ad:       89 42 30                mov    %eax,0x30(%rdx)
  4005b0:       48 63 47 34             movslq 0x34(%rdi),%rax
  4005b4:       8b 04 86                mov    (%rsi,%rax,4),%eax
  4005b7:       89 42 34                mov    %eax,0x34(%rdx)
  4005ba:       48 63 47 38             movslq 0x38(%rdi),%rax
  4005be:       8b 04 86                mov    (%rsi,%rax,4),%eax
  4005c1:       89 42 38                mov    %eax,0x38(%rdx)
  4005c4:       48 63 47 3c             movslq 0x3c(%rdi),%rax
  4005c8:       8b 04 86                mov    (%rsi,%rax,4),%eax
  4005cb:       89 42 3c                mov    %eax,0x3c(%rdx)
  4005ce:       c3                      retq
  4005cf:       90                      nop

Answer 1

你对它如何工作的想法很接近，除了你想要一个 bit-scan / find-first-set-bit (x86 BSF or TZCNT) 比较位图，而不是人口计数（number 位集).

AVX2 / AVX512 有 vpgatherdd，它确实使用了带符号的 32 位缩放索引的向量。它几乎不值得在 Haswell 上使用，在 Broadwell 上有所改进，在 Skylake 上非常好。（http://agner.org/optimize/, and see other links in the x86 tag wiki，比如intel的优化手册里面有一节是关于gather performance的）。相比之下，SIMD compare 和 bitscan 非常便宜；单 uop 和完全流水线。

gcc8.1 可以自动矢量化你的收集，if 它可以证明你的输入不与你的 output 函数 arg。有时在内联后可能，但对于非内联版本，您可以使用 int * __restrict output 来保证这一点。或者，如果您使 output 成为局部临时参数而不是函数参数。（一般规则：通过非 _restrict 指针存储通常会抑制自动矢量化，特别是如果它是可以别名的 char*。）

gcc 和 clang 从不向量化搜索循环；仅在进入循环 之前可以计算行程计数的循环。但是ICC可以；它进行标量收集并存储结果（即使 output[] 是本地的，所以它没有作为运行的副作用宁功能), 然后使用 SIMD 打包比较 + 位扫描。

Compiler output for a __restrict version。请注意，在针对 Skylake-AVX512 进行调整时，gcc8.1 和 ICC 默认会避免使用 512 位向量。 512 位向量可以限制最大涡轮增压，并且当它们在管道中时始终关闭端口 1 上的向量 ALU，因此将 AVX512 或 AVX2 与 256 位向量一起使用是有意义的，以防此功能仅一个大程序的一小部分。（编译器不知道这个函数在你的程序中非常热门。）

如果 output[] 是本地的，更好的代码生成策略可能是在收集时进行比较，因此早期命中会跳过其余的负载。完全标量的编译器（clang 和 MSVC）都错过了这个优化。事实上，它们甚至存储到本地数组，即使 clang 大多不会重新读取它（将结果保存在寄存器中）。在第一个循环内使用比较编写源代码可以获得更好的标量代码。（取决于来自收集的缓存未命中与来自非 SIMD 搜索的分支错误预测，标量可能是一个很好的策略。特别是如果前几个元素的命中很常见。当前的收集硬件无法利用来自相同的缓存行，因此硬限制仍然是每个时钟周期加载 2 个元素。但是，如果您的数据大部分在缓存中很热，则使用索引的宽矢量加载来提供收集可以显着降低加载端口/缓存访问压力。）

编译器可能已将代码的 __restrict 版本自动向量化为类似这样的代码。（gcc 管理收集部分，ICC 管理 SIMD 比较部分）

;; Windows x64 calling convention: rcx,rdx, r8,r9
; but of course you'd actually inline this
; only uses ZMM16..31, so vzeroupper not required

vmovdqu32   zmm16, [rcx/arr]   ; You def. want to reach an alignment boundary if you can for ZMM loads, vmovdqa32 will enforce that

kxnorw      k1, k0,k0      ; k1 = -1.  k0 false dep is likely not a problem.
  ; optional: vpxord  xmm17, xmm17, xmm17   ; break merge-masking false dep
vpgatherdd  zmm17{k1}, [rdx + zmm16 * 4]    ; GlobalArray + scaled-vector-index
; sets k1 = 0 when done

vmovdqu32   [r8/output], zmm17

vpcmpd      k1, zmm17, zmm31, 0    ; 0->EQ.  Outside the loop, do zmm31=set1_epi32(1)
                                   ; k1 = compare bitmap
kortestw    k1, k1
jz         .not_found      ; early check for not-found

kmovw       edx, k1

           ; tzcnt doesn't have a false dep on the output on Skylake
           ; so no AVX512 CPUs need to worry about that HSW/BDW issue
tzcnt       eax, edx       ; bit-scan for the first (lowest-address) set element
                           ; input=0 produces output=32
      ; or avoid the branch and let 32 be the not-found return value.
      ; or do a branchless kortestw / cmov if -1 is directly useful without branching
ret

.not_found:
   mov eax, -1
   ret

你可以自己用内部函数来做这个:

Intel 的指令集参考手册（HTML 摘自 http://felixcloutier.com/x86/index.html) includes C/C++ intrinsic names for each instruction, or search for them in https://software.intel.com/sites/landingpage/IntrinsicsGuide/

我将 output 类型更改为 __m512i。如果您不手动矢量化调用者，则可以将其更改回数组。 您肯定希望此函数内联。

#include <immintrin.h>

//__declspec(noinline)  // I *hope* this was just to see the stand-alone asm version
                        // but it means the output array can't optimize away at all

//static inline
int find_first_1(const int *__restrict arr, const int *__restrict someGlobalArray, __m512i *__restrict output)
{
    __m512i vindex = _mm512_load_si512(arr);
    __m512i gather = _mm512_i32gather_epi32(vindex, someGlobalArray, 4);  // indexing by 4-byte int
    *output = gather;  

    __mmask16 cmp = _mm512_cmpeq_epi32_mask(gather, _mm512_set1_epi32(1));
       // Intrinsics make masks freely convert to integer
       // even though it costs a `kmov` instruction either way.
    int onepos =  _tzcnt_u32(cmp);
    if (onepos >= 16){
        return -1;
    }
    return onepos;
}

所有 4 个 x86 编译器都生成与我建议的类似的 asm (see it on the Godbolt compiler explorer)，但当然它们必须实际实现 set1_epi32(1) 向量常量，或使用（广播）内存操作数。 Clang 实际上使用来自常量的 {1to16} 广播负载进行比较：vpcmpeqd k0, zmm1, dword ptr [rip + .LCPI0_0]{1to16}。（当然他们会在内联到循环时做出不同的选择。）其他人使用 mov eax,1 / vpbroadcastd zmm0, eax.

gcc8.1 -O3 -march=skylake-avx512 有两个冗余的 mov eax, -1 指令：一个用于为收集提供 kmov，另一个用于 return-value东西。愚蠢的编译器应该保留它并为 1.

使用不同的寄存器

他们都使用 zmm0..15，因此无法避免 vzeroupper。（xmm16.31 不能通过 legacy-SSE 访问，所以 doesn't exist if the only wide vector registers you use are y/zmm16..31). There may still be tiny possible advantages to vzeroupper, like cheaper context switches when the upper halves of ymm or zmm regs are known to be zero (）。如果你无论如何都要使用它，没有理由避免 xmm0..15.

哦，在 Windows 调用约定中，xmm6..15 是调用保留的。（不是 ymm/zmm，只是低 128 位），所以如果你运行 out of xmm0..5 regs.

，那么 zmm16..31 是一个不错的选择

通过 avx 指令向量化间接访问

Vectorizing indirect access through avx instructions

c++

simd

vectorization

compiler-optimization

avx512