AVX2 code cannot be faster than gcc base optimization

I am learning AVX by writing AVX code with inline assembly. In this case, I try to implement AVX in a simple function. The function I made is named lower_all_chars_base.

Its behavior: apply a logical OR with 0x20 to every character in a std::string.

So the function makes the characters lowercase (for example, 'A' is 0x41, and 0x41 | 0x20 = 0x61, which is 'a').

I tried making an AVX version of the function with unaligned store instructions, and there was no speedup at all.

Then I thought that if the memory accesses were aligned, it would surely be faster. After that, I tried making an AVX version with aligned stores, but gcc's base optimization -O3 still beats my hand-written vectorized code. What am I doing wrong here?

Function summary

  1. lower_all_chars_base: the simple baseline function.
  2. lower_all_chars_avx_aligned: AVX2 version with aligned moves.
  3. lower_all_chars_avx_unaligned: AVX2 version with unaligned moves.

Questions

  1. Why does gcc base optimization -O3 beat my hand-written optimization?
  2. What am I doing wrong?
  3. What is the proper AVX operation for this?

Benchmark results

root@esteh:/tmp# g++ --version
g++ (Ubuntu 9.3.0-17ubuntu1~20.04) 9.3.0
Copyright (C) 2019 Free Software Foundation, Inc.
This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

root@esteh:/tmp# g++ -Wall -Wextra -std=c++2a -O3 test.cpp -o test
root@esteh:/tmp# nice -n -20 ./test
lower_all_chars_base
Min   =  0.00662300
Max   =  0.00793100
Avg   =  0.00717280
Total =  0.07172800

lower_all_chars_avx_aligned
Min   =  0.00650200
Max   =  0.00785100
Avg   =  0.00726220
Total =  0.07262200

lower_all_chars_avx_unaligned
Min   =  0.00623600
Max   =  0.00835000
Avg   =  0.00701360
Total =  0.07013600

Code

Edit: use N - 1 for the memset (the last byte of the static buffer stays zero as the string terminator).

Godbolt link: https://godbolt.org/z/a16cGK


#include <ctime>
#include <cstdio>
#include <cassert>
#include <cstring>
#include <iostream>
using std::string;

void lower_all_chars_base(string &str);
void lower_all_chars_avx_aligned(string &str);
void lower_all_chars_avx_unaligned(string &str);
void do_benchmark(std::string &x, void (*fx)(string &));
void mem_flush(const void *p, unsigned int allocation_size);

#define N (size_t)(1024u * 1024 * 40)

#define BENCHMARK(STR, FX) do { \
 puts(#FX); \
 do_benchmark(STR, FX); \
} while(0)

int main() {
  static char x[N];
  memset(x, 'A', N - 1);
  string a(x), b(x), c(x);
  
  BENCHMARK(a, lower_all_chars_base);
  BENCHMARK(b, lower_all_chars_avx_aligned);
  BENCHMARK(c, lower_all_chars_avx_unaligned);

  assert(a == b);
  assert(b == c);

  memset(x, 'a', N - 1);
  assert(memcmp(c.c_str(), x, N - 1) == 0);
}

void do_benchmark(std::string &x, void (*fx)(string &)) {
  const size_t n = 10;
  double min, max, avg, c, total = 0;
  for (size_t i = 0; i < n; i++) {
    clock_t time0 = clock();
    fx(x);
    clock_t time1 = clock();

    c = (double)(time1 - time0) / CLOCKS_PER_SEC;
    total += c;
    if (i == 0) {
      min = max = c;
    } else {
      if (c > max) max = c;
      if (c < min) min = c;
    }
    mem_flush(x.c_str(), x.size());
  }
  avg = total / (double)n;
  printf("Min   =  %.8f\n", min);
  printf("Max   =  %.8f\n", max);
  printf("Avg   =  %.8f\n", avg);
  printf("Total =  %.8f\n\n", total);
}

__attribute__((noinline))
void lower_all_chars_base(string &str) {
  char *cs = (char *)str.c_str();
  size_t len = str.size();
  while (len--) {
    *cs++ |= 0x20;
  }
}

static const uint64_t mask[] __attribute__((aligned(32))) = {
  0x2020202020202020ull, 0x2020202020202020ull,
  0x2020202020202020ull, 0x2020202020202020ull
};

__attribute__((noinline))
void lower_all_chars_avx_aligned(string &str) {
  char *cs = (char *)str.c_str();
  size_t len = str.size();

  /* Only use AVX for data bigger than 4K. */
  if (len > 4096) {
    /* Handle unaligned data from the head. */
    uint8_t n = (uintptr_t)cs & 0b11111u;
    for (uint8_t i = 0; i < n; i++) {
      *cs++ |= 0x20;
    }

    len -= n;

    /* Prevent AVX from processing data beyond the array. */
    size_t vlen = len - 288;
    size_t j;

    /* Process the aligned memory with AVX. */
    asm volatile("vmovdqa %[mask], %%ymm0"::[mask]"m"(mask):"ymm0");
    for (j = 0; j < vlen; j += 288) {
      asm volatile(
        "vpor\t(%[cs],%[j]), %%ymm0, %%ymm1\n\t"
        "vpor\t32(%[cs],%[j]), %%ymm0, %%ymm2\n\t"
        "vpor\t64(%[cs],%[j]), %%ymm0, %%ymm3\n\t"
        "vpor\t96(%[cs],%[j]), %%ymm0, %%ymm4\n\t"
        "vpor\t128(%[cs],%[j]), %%ymm0, %%ymm5\n\t"
        "vpor\t160(%[cs],%[j]), %%ymm0, %%ymm6\n\t"
        "vpor\t192(%[cs],%[j]), %%ymm0, %%ymm7\n\t"
        "vpor\t224(%[cs],%[j]), %%ymm0, %%ymm8\n\t"
        "vpor\t256(%[cs],%[j]), %%ymm0, %%ymm9\n\t"
        "vmovdqa\t%%ymm1, (%[cs],%[j])\n\t"
        "vmovdqa\t%%ymm2, 32(%[cs],%[j])\n\t"
        "vmovdqa\t%%ymm3, 64(%[cs],%[j])\n\t"
        "vmovdqa\t%%ymm4, 96(%[cs],%[j])\n\t"
        "vmovdqa\t%%ymm5, 128(%[cs],%[j])\n\t"
        "vmovdqa\t%%ymm6, 160(%[cs],%[j])\n\t"
        "vmovdqa\t%%ymm7, 192(%[cs],%[j])\n\t"
        "vmovdqa\t%%ymm8, 224(%[cs],%[j])\n\t"
        "vmovdqa\t%%ymm9, 256(%[cs],%[j])"
        :
        : [cs]"p"(cs), [j]"r"(j)
        : "memory", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5",
          "ymm6", "ymm7", "ymm8", "ymm9"
      );
    }
    asm volatile("vzeroupper":::
      "ymm0", "ymm1", "ymm2", "ymm3",
      "ymm4", "ymm5", "ymm6", "ymm7",
      "ymm8", "ymm9", "ymm10", "ymm11",
      "ymm12","ymm13","ymm14","ymm15"
    );
    cs  += j;
    len -= j;
  }

  /* Fallback: handle the elements remaining after the AVX loop. */
  for (size_t i = 0; i < len; i++) {
    *cs++ |= 0x20;
  }
}

__attribute__((noinline))
void lower_all_chars_avx_unaligned(string &str) {
  char *cs = (char *)str.c_str();
  size_t len = str.size();

  /* Only use AVX for data bigger than 4K. */
  if (len > 4096) {
    size_t j;
    size_t vlen  = len - 288;
    asm volatile("vmovdqa %[mask], %%ymm0"::[mask]"m"(mask):"ymm0");
    for (j = 0; j < vlen; j += 288) {
      asm volatile(
        "vpor\t(%[cs],%[j]), %%ymm0, %%ymm1\n\t"
        "vpor\t32(%[cs],%[j]), %%ymm0, %%ymm2\n\t"
        "vpor\t64(%[cs],%[j]), %%ymm0, %%ymm3\n\t"
        "vpor\t96(%[cs],%[j]), %%ymm0, %%ymm4\n\t"
        "vpor\t128(%[cs],%[j]), %%ymm0, %%ymm5\n\t"
        "vpor\t160(%[cs],%[j]), %%ymm0, %%ymm6\n\t"
        "vpor\t192(%[cs],%[j]), %%ymm0, %%ymm7\n\t"
        "vpor\t224(%[cs],%[j]), %%ymm0, %%ymm8\n\t"
        "vpor\t256(%[cs],%[j]), %%ymm0, %%ymm9\n\t"
        "vmovdqu\t%%ymm1, (%[cs],%[j])\n\t"
        "vmovdqu\t%%ymm2, 32(%[cs],%[j])\n\t"
        "vmovdqu\t%%ymm3, 64(%[cs],%[j])\n\t"
        "vmovdqu\t%%ymm4, 96(%[cs],%[j])\n\t"
        "vmovdqu\t%%ymm5, 128(%[cs],%[j])\n\t"
        "vmovdqu\t%%ymm6, 160(%[cs],%[j])\n\t"
        "vmovdqu\t%%ymm7, 192(%[cs],%[j])\n\t"
        "vmovdqu\t%%ymm8, 224(%[cs],%[j])\n\t"
        "vmovdqu\t%%ymm9, 256(%[cs],%[j])"
        :
        : [cs]"p"(cs), [j]"r"(j)
        : "memory", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5",
          "ymm6", "ymm7", "ymm8", "ymm9"
      );
    }
    asm volatile("vzeroupper":::
      "ymm0", "ymm1", "ymm2", "ymm3",
      "ymm4", "ymm5", "ymm6", "ymm7",
      "ymm8", "ymm9", "ymm10", "ymm11",
      "ymm12","ymm13","ymm14","ymm15"
    );
    cs  += j;
    len -= j;
  }

  /* Fallback: handle the elements remaining after the AVX loop. */
  for (size_t i = 0; i < len; i++) {
    *cs++ |= 0x20;
  }
}

void mem_flush(const void *p, unsigned int allocation_size) {
  const size_t cache_line = 64;
  const char *cp = (const char *)p;
  size_t i = 0;
  if (p == NULL || allocation_size <= 0)
    return;

  for (i = 0; i < allocation_size; i += cache_line) {
    asm volatile("clflush (%0)"::"r"(&cp[i]):"memory");
  }
  asm volatile("sfence"::: "memory");
}

I tried to apply some of the suggestions from the comments.

However, I do not use intrinsics. Now the hand-coded assembly version is about 1.02x faster than gcc's optimization with the -O3 -mavx2 flags. That is not a significant speedup, but I learned a lot about inline assembly. I am still waiting for other answers; I hope there is a better answer than this one.
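
For comparison only, and not part of the benchmark above: an intrinsics version of the same OR-with-0x20 loop might look roughly like the sketch below. The function name and the loop structure are illustrative, not taken from the benchmark code; it needs to be compiled with -mavx2.

#include <immintrin.h>
#include <cstddef>

/* Sketch: lowercase len bytes at cs using AVX2 intrinsics.
   Assumes every byte is an ASCII letter, like the benchmark data. */
static void lower_all_chars_avx2_intrin(char *cs, size_t len) {
  const __m256i mask = _mm256_set1_epi8(0x20);
  size_t i = 0;
  /* Main loop: 32 bytes per iteration, unaligned load/store. */
  for (; i + 32 <= len; i += 32) {
    __m256i v = _mm256_loadu_si256((const __m256i *)(cs + i));
    v = _mm256_or_si256(v, mask);
    _mm256_storeu_si256((__m256i *)(cs + i), v);
  }
  /* Scalar tail. */
  for (; i < len; i++)
    cs[i] |= 0x20;
}

With intrinsics the compiler picks the registers and addressing modes itself, so for a loop this simple the result is usually at least as good as hand-written asm.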


Summary of changes to lower_all_chars_avx_aligned:

  1. Load the mask with vbroadcastsd, the way clang does.
  2. Replace volatile and the "memory" clobber in the inline assembly with a dummy memory output (see the sketch after this list).
  3. Lower the AVX2 threshold to 1024 bytes.
  4. Handle the unaligned head/tail with AVX2.
  5. Write the AVX2 loop entirely in inline assembly.
  6. Use the ymm0 through ymm15 registers.
  7. Add __AVX__ and __AVX2__ checks to prevent vzeroupper from being emitted twice.
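
A minimal sketch of the dummy memory-operand idea from item 2; the function and operand names here are illustrative and not taken from the code below. Declaring the touched bytes as a "+m" operand tells the compiler exactly which memory the asm may read and write, so neither volatile nor a blanket "memory" clobber is needed:

#include <cstddef>

/* Sketch: the asm only ORs the first byte, but the "+m" operand declares
   that up to len bytes starting at buf may be read and written. The
   char (*)[len] cast is a GNU extension, the same one used by the
   dummy_mem_output operand in the code below. */
static void or_first_byte(char *buf, size_t len) {
  asm("orb\t$0x20, (%[p])"
      : [buf_bytes]"+m"(*(char (*)[len])buf)
      : [p]"r"(buf));
}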

Summary of benchmark changes:

  1. The data length to process is 3.5 GB (was only 40 MB).
  2. Each function is run 30 times (was only 10 times).
  3. Added -mavx2 to improve gcc's base optimization.
  4. lower_all_chars_avx_unaligned has been removed.
  5. Use heap memory from malloc instead of a static char[] variable to handle the larger data.
  6. Run on the Skylake microarchitecture (was on Broadwell).

Benchmark information:

  • Min is the minimum time of the 30 function calls.
  • Max is the maximum time of the 30 function calls.
  • Avg is the average time of the 30 function calls.
  • Total is the total time of the 30 function calls.

Benchmark results:

  • CPU: Intel(R) Xeon(R) Gold 6140 CPU @ 2.30GHz
  • Microarchitecture: Skylake
root@yukii-hpc2:/tmp# g++ -Wall -Wextra -std=c++2a -O3 -mavx2 test.cpp -o test
root@yukii-hpc2:/tmp# nice -n -20 ./test
lower_all_chars_avx_aligned
Min   =  0.31086600
Max   =  0.31319800
Avg   =  0.31159833
Total =  9.34795000

lower_all_chars_base
Min   =  0.31823400
Max   =  0.32902100
Avg   =  0.31904893
Total =  9.57146800

root@yukii-hpc2:/tmp# g++ --version
g++ (Ubuntu 9.3.0-17ubuntu1~20.04) 9.3.0
Copyright (C) 2019 Free Software Foundation, Inc.
This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

root@yukii-hpc2:/tmp# 

Code


#include <ctime>
#include <cstdio>
#include <cassert>
#include <cstring>
#include <cstdlib>
#include <iostream>
using std::string;

void lower_all_chars_base(string &str);
void lower_all_chars_avx_aligned(string &str);
void do_benchmark(string &x, void (*fx)(string &));
void mem_flush(const void *p, unsigned int allocation_size);

#define _M(N) (size_t)(1024ull * 1024 * (N))
#define _G(N) (size_t)(1024ull * 1024 * 1024 * (N))

#define N (_G(3) + _M(512) + 1234) /* 3.5 G + 1234 */
/*
   1234 is just to make it odd,
   so it likely will jump to the tail of AVX aligned loop.
*/

#define BENCHMARK(STR, FX) do { \
 puts(#FX); \
 do_benchmark(STR, FX); \
} while(0)

int main() {
  char *x = (char *)malloc(N + 1);
  memset(x, 'A', N);
  x[N] = '\0';

  {
    string a(x);
    memset(x, 'a', N);
    BENCHMARK(a, lower_all_chars_avx_aligned);
    assert(memcmp(a.c_str(), x, N) == 0);
  }

  /* Restore value for the next benchmark. */
  memset(x, 'A', N);

  {
    string a(x);
    memset(x, 'a', N);
    BENCHMARK(a, lower_all_chars_base);
    assert(memcmp(a.c_str(), x, N) == 0);
  }

  free(x);
}

inline static void lower_all_chars_b1024(char *cs, uint16_t len) {
  while (len--) {
    *cs++ |= 0x20;
  }
}

/* Aligned memory for mask for performance. */
static const uint64_t mask[] __attribute__((aligned(32))) = {
  0x2020202020202020ull
};

__attribute__((noinline))
void lower_all_chars_avx_aligned(string &str) {
  char   *cs = (char *)str.c_str();
  size_t len = str.size();

  /* Only use AVX for data bigger than or equal to 1K. */
  if (len >= 1024) {

    size_t    avx_size  = 0x1e0; /* Bytes per AVX main iteration. */
    char      *end      = &(cs[len]);

    /* End of aligned process iteration. */
    char      *end_avx  = &(end[-avx_size]);

    /* Dummy variable, to let the compiler choose the best GP register. */
    uintptr_t rem_bytes;

    asm(
      /* Prepare %[rem_bytes] initial value. */
      "movq\t%[end], %[rem_bytes]\n\t"

      /* Load the mask. */
      "vbroadcastsd\t%[mask], %%ymm0\n\t"

      /* Handle unaligned memory from the head. */
      "vpor\t(%[cs]), %%ymm0, %%ymm1\n\t"
      "vmovdqu\t%%ymm1, (%[cs])\n\t"
      "addq\t[=11=]x20, %[cs]\n\t" /* Move to the next 32 bytes. */


      /* Handle aligned memory part.

        Use `vmovdqa` to make sure that the memory is
        aligned properly.

        Note that ORing is idempotent: you can OR the same
        byte multiple times without changing it further. So
        %[cs] can partially overlap with the `vmovdqu` operation
        before this point.

        #comment115632279_65404362
      */
      "andq\t$~0b11111ull, %[cs]\n\t" /* Clear 5-bit LSB. */
      "1:\n\t"

      "vpor\t0x000(%[cs]), %%ymm0, %%ymm1\n\t"
      "vpor\t0x020(%[cs]), %%ymm0, %%ymm2\n\t"
      "vpor\t0x040(%[cs]), %%ymm0, %%ymm3\n\t"
      "vpor\t0x060(%[cs]), %%ymm0, %%ymm4\n\t"
      "vpor\t0x080(%[cs]), %%ymm0, %%ymm5\n\t"
      "vpor\t0x0a0(%[cs]), %%ymm0, %%ymm6\n\t"
      "vpor\t0x0c0(%[cs]), %%ymm0, %%ymm7\n\t"
      "vpor\t0x0e0(%[cs]), %%ymm0, %%ymm8\n\t"
      "vpor\t0x100(%[cs]), %%ymm0, %%ymm9\n\t"
      "vpor\t0x120(%[cs]), %%ymm0, %%ymm10\n\t"
      "vpor\t0x140(%[cs]), %%ymm0, %%ymm11\n\t"
      "vpor\t0x160(%[cs]), %%ymm0, %%ymm12\n\t"
      "vpor\t0x180(%[cs]), %%ymm0, %%ymm13\n\t"
      "vpor\t0x1a0(%[cs]), %%ymm0, %%ymm14\n\t"
      "vpor\t0x1c0(%[cs]), %%ymm0, %%ymm15\n\t"

      /* Plug the result to aligned memory.  */
      "vmovdqa\t%%ymm1, 0x000(%[cs])\n\t"
      "vmovdqa\t%%ymm2, 0x020(%[cs])\n\t"
      "vmovdqa\t%%ymm3, 0x040(%[cs])\n\t"
      "vmovdqa\t%%ymm4, 0x060(%[cs])\n\t"
      "vmovdqa\t%%ymm5, 0x080(%[cs])\n\t"
      "vmovdqa\t%%ymm6, 0x0a0(%[cs])\n\t"
      "vmovdqa\t%%ymm7, 0x0c0(%[cs])\n\t"
      "vmovdqa\t%%ymm8, 0x0e0(%[cs])\n\t"
      "vmovdqa\t%%ymm9, 0x100(%[cs])\n\t"
      "vmovdqa\t%%ymm10, 0x120(%[cs])\n\t"
      "vmovdqa\t%%ymm11, 0x140(%[cs])\n\t"
      "vmovdqa\t%%ymm12, 0x160(%[cs])\n\t"
      "vmovdqa\t%%ymm13, 0x180(%[cs])\n\t"
      "vmovdqa\t%%ymm14, 0x1a0(%[cs])\n\t"
      "vmovdqa\t%%ymm15, 0x1c0(%[cs])\n\t"

      "addq\t%[avx_size], %[cs]\n\t"
      "cmpq\t%[end_avx], %[cs]\n\t"
      "jb\t1b\n\t"

      "subq\t%[cs], %[rem_bytes]\n\t"

      /* Now, %[rem_bytes] contains the remaining bytes. */
      "testq\t%[rem_bytes], %[rem_bytes]\n\t"
      "jz\t3f\n\t"
      /* There are no remaining bytes if `jz` is taken. */


      /* Handle the tail; we may back off a few bytes so that
         the remaining bytes become a multiple of 32.
       */
      "leaq\t0b11111(%[rem_bytes]), %[dec_avx]\n\t"
      "andq\t$~0b11111ull, %[dec_avx]\n\t"
      "subq\t%[rem_bytes], %[dec_avx]\n\t"
      "subq\t%[dec_avx], %[cs]\n\t"

      "2:\n\t"
      "vpor\t(%[cs]), %%ymm0, %%ymm1\n\t"
      "vmovdqu\t%%ymm1, (%[cs])\n\t"
      "addq\t[=11=]x20, %[cs]\n\t"
      "cmpq\t%[end], %[cs]\n\t"
      "jb\t2b\n\t"

      "3:\n\t"
      #if !defined(__AVX__) && !defined(__AVX2__)
      "vzeroupper"
      #endif
      /* Output */
      : [cs]"+r"(cs),
        [end]"+r"(end),
        [end_avx]"+r"(end_avx),
        [dec_avx]"=r"(end_avx), /* May reuse end_avx if needed. */
        [rem_bytes]"=r"(rem_bytes),

        /* Tell the compiler that this inline assembly is
           going to read/write `len` bytes from `cs`. */
        [dummy_mem_output]"+m"(*(char (*)[len])cs)

      /* Input */
      : [mask]"m"(mask),
        [avx_size]"n"(avx_size)

      /* Clobbers */
      : "ymm0", "ymm1", "ymm2", "ymm3",
        "ymm4", "ymm5", "ymm6", "ymm7",
        "ymm8", "ymm9", "ymm10", "ymm11",
        "ymm12", "ymm13", "ymm14", "ymm15"
    );
  } else {
    /* Let the compiler use its own optimization here. */
    lower_all_chars_b1024(cs, len);
  }
}

__attribute__((noinline))
void lower_all_chars_base(string &str) {
  char *cs = (char *)str.c_str();
  size_t len = str.size();
  while (len--) {
    *cs++ |= 0x20;
  }
}

void do_benchmark(string &x, void (*fx)(string &)) {
  const size_t n = 30;
  double min = 0, max = 0, avg, c, total = 0;
  for (size_t i = 0; i < n; i++) {

    mem_flush(x.c_str(), x.size());

    clock_t time0 = clock();
    fx(x);
    clock_t time1 = clock();

    c = (double)(time1 - time0) / CLOCKS_PER_SEC;
    total += c;
    if (i == 0) {
      min = max = c;
    } else {
      if (c > max) max = c;
      if (c < min) min = c;
    }
  }
  avg = total / (double)n;
  printf("Min   =  %.8f\n", min);
  printf("Max   =  %.8f\n", max);
  printf("Avg   =  %.8f\n", avg);
  printf("Total =  %.8f\n\n", total);
}

void mem_flush(const void *p, unsigned int allocation_size) {
  const size_t cache_line = 64;
  const char *cp = (const char *)p;
  size_t i = 0;
  if (p == NULL || allocation_size <= 0)
    return;

  for (i = 0; i < allocation_size; i += cache_line) {
    asm volatile("clflush (%0)"::"r"(&cp[i]):"memory");
  }
  asm volatile("sfence"::: "memory");
}

From personal experience, your solution needs more awareness of the overall architecture of the processor you are using, in particular the L1 cache line size and what triggers an L1 cache line load. Try writing and benchmarking a read-only loop first (for example a sum_of_bytes or strlen) instead of a read-modify-write loop, and optimize that read-only loop. You will find that the code shown above stalls every time it crosses a cache line boundary, waiting for data to be transferred from the next cache level (L1, L2, L3) to where your code needs it. Depending on the virtual memory page size used by your operating system and processor, similar stalls can appear at 4 kB or 64 kB boundaries. It helps if you give the processor a prefetch hint of the form "at the end of the current inner loop we will need cursor + 1024" (or whatever offset matches the cache or paging granularity). Also keep the inner loop below roughly 1024 micro-ops so the CPU's instruction decode pipeline can be used to the full.

In addition, once the input/output buffers reach a certain minimum size, it actually becomes worthwhile to multi-thread the code and process the data in parallel, with trade-offs such as the per-thread loop setup cost and the NUMA affinity of the threads to the data buffers. Overall, this is not a simple problem and is usually not worth the effort, unless you are highly optimizing for a single CPU model in some embedded application or want to shine in an HPC benchmark.
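
To illustrate the prefetch suggestion above, here is a sketch only: the 1024-byte prefetch distance and the once-per-cache-line stride are assumptions to be tuned, not measured values.

#include <cstddef>

/* Sketch: the same OR-with-0x20 loop with a software prefetch hint a fixed
   distance ahead. __builtin_prefetch(addr, rw, locality): rw=1 marks an
   intent to write, locality=3 keeps the line in all cache levels. */
static void lower_with_prefetch(char *cs, size_t len) {
  const size_t prefetch_dist = 1024;   /* Example distance; tune per CPU. */
  for (size_t i = 0; i < len; i++) {
    if ((i % 64) == 0 && i + prefetch_dist < len)  /* once per cache line */
      __builtin_prefetch(cs + i + prefetch_dist, 1, 3);
    cs[i] |= 0x20;
  }
}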