AVX2 code cannot be faster than gcc base optimization
I am learning AVX by writing AVX code with inline assembly. In this case, I tried to implement AVX in a simple function. The function I made is called `lower_all_chars_base`.
Its behavior is: apply a bitwise OR with `0x20` to every character in a `std::string`.
- Let `c` be every character in the input.
- Assume the input only contains characters in the set `'A' <= c && c <= 'Z'`.
So the function makes the characters lowercase (for example, 'A' is 0x41, and 0x41 | 0x20 is 0x61, which is 'a').
I tried to make an AVX version of the function with unaligned store instructions, and there was no speedup at all. Then I thought: if the memory accesses were aligned, it would surely be faster. After that, I tried to make an AVX version with aligned stores, but gcc base optimization `-O3` still beats my hand-vectorized code. What am I doing wrong here?
Function summary
`lower_all_chars_base`: a simple scalar function.
`lower_all_chars_avx_aligned`: the AVX2 aligned-move version:
- Handle the first unaligned bytes with the scalar operation.
- Then process the aligned part of the memory with AVX2 and aligned moves.
- Then handle the remainder with the scalar operation.
`lower_all_chars_avx_unaligned`: the AVX2 unaligned-move version:
- Process the data with AVX2 and unaligned moves.
- Then handle the remainder with the scalar operation.
Questions
- Why does gcc base optimization `-O3` beat my optimization?
- What am I doing wrong?
- What is the proper AVX operation to do this?
Benchmark results
- CPU: Intel(R) Xeon(R) CPU E5-2650 v4 (2.20GHz)
- Microarchitecture: Broadwell
root@esteh:/tmp# g++ --version
g++ (Ubuntu 9.3.0-17ubuntu1~20.04) 9.3.0
Copyright (C) 2019 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
root@esteh:/tmp# g++ -Wall -Wextra -std=c++2a -O3 test.cpp -o test
root@esteh:/tmp# nice -n -20 ./test
lower_all_chars_base
Min = 0.00662300
Max = 0.00793100
Avg = 0.00717280
Total = 0.07172800
lower_all_chars_avx_aligned
Min = 0.00650200
Max = 0.00785100
Avg = 0.00726220
Total = 0.07262200
lower_all_chars_avx_unaligned
Min = 0.00623600
Max = 0.00835000
Avg = 0.00701360
Total = 0.07013600
Code
Edit: `N - 1` is used for the memset.
Godbolt link: https://godbolt.org/z/a16cGK
#include <ctime>
#include <cstdio>
#include <cassert>
#include <cstring>
#include <cstdint>
#include <iostream>
using std::string;
void lower_all_chars_base(string &str);
void lower_all_chars_avx_aligned(string &str);
void lower_all_chars_avx_unaligned(string &str);
void do_benchmark(std::string &x, void (*fx)(string &));
void mem_flush(const void *p, unsigned int allocation_size);
#define N (size_t)(1024u * 1024 * 40)
#define BENCHMARK(STR, FX) do { \
puts(#FX); \
do_benchmark(STR, FX); \
} while(0)
int main() {
static char x[N];
memset(x, 'A', N - 1);
string a(x), b(x), c(x);
BENCHMARK(a, lower_all_chars_base);
BENCHMARK(b, lower_all_chars_avx_aligned);
BENCHMARK(c, lower_all_chars_avx_unaligned);
assert(a == b);
assert(b == c);
memset(x, 'a', N - 1);
assert(memcmp(c.c_str(), x, N - 1) == 0);
}
void do_benchmark(std::string &x, void (*fx)(string &)) {
const size_t n = 10;
double min, max, avg, c, total = 0;
for (size_t i = 0; i < n; i++) {
clock_t time0 = clock();
fx(x);
clock_t time1 = clock();
c = (double)(time1 - time0) / CLOCKS_PER_SEC;
total += c;
if (i == 0) {
min = max = c;
} else {
if (c > max) max = c;
if (c < min) min = c;
}
mem_flush(x.c_str(), x.size());
}
avg = total / (double)n;
printf("Min = %.8f\n", min);
printf("Max = %.8f\n", max);
printf("Avg = %.8f\n", avg);
printf("Total = %.8f\n\n", total);
}
__attribute__((noinline))
void lower_all_chars_base(string &str) {
char *cs = (char *)str.c_str();
size_t len = str.size();
while (len--) {
*cs++ |= 0x20;
}
}
static const uint64_t mask[] __attribute__((aligned(32))) = {
0x2020202020202020ull, 0x2020202020202020ull,
0x2020202020202020ull, 0x2020202020202020ull
};
__attribute__((noinline))
void lower_all_chars_avx_aligned(string &str) {
char *cs = (char *)str.c_str();
size_t len = str.size();
/* Only use AVX for data bigger than 4K. */
if (len > 4096) {
/* Handle unaligned data from the head. */
uint8_t n = (uintptr_t)cs & 0b11111u;
for (uint8_t i = 0; i < n; i++) {
*cs++ |= 0x20;
}
len -= n;
/* Prevent AVX from processing data beyond the array. */
size_t vlen = len - 288;
size_t j;
/* Process the aligned memory with AVX. */
asm volatile("vmovdqa %[mask], %%ymm0"::[mask]"m"(mask):"ymm0");
for (j = 0; j < vlen; j += 288) {
asm volatile(
"vpor\t(%[cs],%[j]), %%ymm0, %%ymm1\n\t"
"vpor\t32(%[cs],%[j]), %%ymm0, %%ymm2\n\t"
"vpor\t64(%[cs],%[j]), %%ymm0, %%ymm3\n\t"
"vpor\t96(%[cs],%[j]), %%ymm0, %%ymm4\n\t"
"vpor\t128(%[cs],%[j]), %%ymm0, %%ymm5\n\t"
"vpor\t160(%[cs],%[j]), %%ymm0, %%ymm6\n\t"
"vpor\t192(%[cs],%[j]), %%ymm0, %%ymm7\n\t"
"vpor\t224(%[cs],%[j]), %%ymm0, %%ymm8\n\t"
"vpor\t256(%[cs],%[j]), %%ymm0, %%ymm9\n\t"
"vmovdqa\t%%ymm1, (%[cs],%[j])\n\t"
"vmovdqa\t%%ymm2, 32(%[cs],%[j])\n\t"
"vmovdqa\t%%ymm3, 64(%[cs],%[j])\n\t"
"vmovdqa\t%%ymm4, 96(%[cs],%[j])\n\t"
"vmovdqa\t%%ymm5, 128(%[cs],%[j])\n\t"
"vmovdqa\t%%ymm6, 160(%[cs],%[j])\n\t"
"vmovdqa\t%%ymm7, 192(%[cs],%[j])\n\t"
"vmovdqa\t%%ymm8, 224(%[cs],%[j])\n\t"
"vmovdqa\t%%ymm9, 256(%[cs],%[j])"
:
: [cs]"p"(cs), [j]"r"(j)
: "memory", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5",
"ymm6", "ymm7", "ymm8", "ymm9"
);
}
asm volatile("vzeroupper":::
"ymm0", "ymm1", "ymm2", "ymm3",
"ymm4", "ymm5", "ymm6", "ymm7",
"ymm8", "ymm9", "ymm10", "ymm11",
"ymm12","ymm13","ymm14","ymm15"
);
cs += j;
len -= j;
}
/* Handle the elements remaining after the AVX loop. */
for (size_t i = 0; i < len; i++) {
*cs++ |= 0x20;
}
}
__attribute__((noinline))
void lower_all_chars_avx_unaligned(string &str) {
char *cs = (char *)str.c_str();
size_t len = str.size();
/* Only use AVX for data bigger than 4K. */
if (len > 4096) {
size_t j;
size_t vlen = len - 288;
asm volatile("vmovdqa %[mask], %%ymm0"::[mask]"m"(mask):"ymm0");
for (j = 0; j < vlen; j += 288) {
asm volatile(
"vpor\t(%[cs],%[j]), %%ymm0, %%ymm1\n\t"
"vpor\t32(%[cs],%[j]), %%ymm0, %%ymm2\n\t"
"vpor\t64(%[cs],%[j]), %%ymm0, %%ymm3\n\t"
"vpor\t96(%[cs],%[j]), %%ymm0, %%ymm4\n\t"
"vpor\t128(%[cs],%[j]), %%ymm0, %%ymm5\n\t"
"vpor\t160(%[cs],%[j]), %%ymm0, %%ymm6\n\t"
"vpor\t192(%[cs],%[j]), %%ymm0, %%ymm7\n\t"
"vpor\t224(%[cs],%[j]), %%ymm0, %%ymm8\n\t"
"vpor\t256(%[cs],%[j]), %%ymm0, %%ymm9\n\t"
"vmovdqu\t%%ymm1, (%[cs],%[j])\n\t"
"vmovdqu\t%%ymm2, 32(%[cs],%[j])\n\t"
"vmovdqu\t%%ymm3, 64(%[cs],%[j])\n\t"
"vmovdqu\t%%ymm4, 96(%[cs],%[j])\n\t"
"vmovdqu\t%%ymm5, 128(%[cs],%[j])\n\t"
"vmovdqu\t%%ymm6, 160(%[cs],%[j])\n\t"
"vmovdqu\t%%ymm7, 192(%[cs],%[j])\n\t"
"vmovdqu\t%%ymm8, 224(%[cs],%[j])\n\t"
"vmovdqu\t%%ymm9, 256(%[cs],%[j])"
:
: [cs]"p"(cs), [j]"r"(j)
: "memory", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5",
"ymm6", "ymm7", "ymm8", "ymm9"
);
}
asm volatile("vzeroupper":::
"ymm0", "ymm1", "ymm2", "ymm3",
"ymm4", "ymm5", "ymm6", "ymm7",
"ymm8", "ymm9", "ymm10", "ymm11",
"ymm12","ymm13","ymm14","ymm15"
);
cs += j;
len -= j;
}
/* Handle the elements remaining after the AVX loop. */
for (size_t i = 0; i < len; i++) {
*cs++ |= 0x20;
}
}
void mem_flush(const void *p, unsigned int allocation_size) {
/* Flush the buffer from all cache levels so every benchmark run starts cold. */
const size_t cache_line = 64;
const char *cp = (const char *)p;
size_t i = 0;
if (p == NULL || allocation_size <= 0)
return;
for (i = 0; i < allocation_size; i += cache_line) {
asm volatile("clflush (%0)"::"r"(&cp[i]):"memory");
}
asm volatile("sfence"::: "memory");
}
I tried to apply some suggestions from the comments. However, I am not using intrinsics. For now, the hand-coded assembly version is about 1.02x faster than the gcc optimization with the `-O3 -mavx2` flags. That is not a significant speedup, but I learned a lot about inline assembly. I am still waiting for other answers; I hope there will be better answers than this one.
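For reference only (this is not what is benchmarked below, since this answer deliberately sticks to inline assembly): the same operation can also be written with AVX2 intrinsics, leaving the loop scheduling to the compiler. A minimal sketch, assuming the input precondition from the question; `lower_all_chars_intrin` is an illustrative name:
#include <immintrin.h>
#include <cstddef>
/* Minimal intrinsics sketch (illustrative, not benchmarked here):
   OR each 32-byte chunk with 0x20, then a scalar tail. */
static void lower_all_chars_intrin(char *cs, size_t len) {
    const __m256i mask = _mm256_set1_epi8(0x20);
    size_t i = 0;
    for (; i + 32 <= len; i += 32) {
        __m256i v = _mm256_loadu_si256((const __m256i *)(cs + i));
        _mm256_storeu_si256((__m256i *)(cs + i), _mm256_or_si256(v, mask));
    }
    for (; i < len; i++)
        cs[i] |= 0x20; /* Scalar tail for the last len % 32 bytes. */
}
Compiled with `-O3 -mavx2`, gcc turns this into essentially the same kind of `vpor` loop as the hand-written version.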
Summary of changes to the `lower_all_chars_avx_aligned` function:
- Use `vbroadcastsd` to load the mask, just like clang does.
- Replace `volatile` and the `"memory"` clobber in the inline assembly with a dummy memory output (see the sketch after this list).
- Lower the AVX2 threshold to 1024 bytes.
- Handle the unaligned head/tail with AVX2.
- The AVX2 loop is now written entirely in inline assembly.
- Make use of registers `ymm0` through `ymm15`.
- Add `__AVX__` and `__AVX2__` constant checks to prevent `vzeroupper` from being emitted twice.
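Here is the dummy-memory-output idiom in isolation, as a minimal sketch (the function and operand names are illustrative): casting the pointer to `char (*)[len]` and passing the dereferenced result as a `"+m"` operand tells gcc exactly which bytes the asm statement may read and write, so neither `volatile` nor a `"memory"` clobber is needed, and the optimizer may keep unrelated memory in registers across the asm.
#include <cstddef>
/* Minimal sketch of the dummy-memory-output idiom (illustrative names).
   The "+m"(*(char (*)[len])p) operand declares the whole len-byte
   region at p as an input/output of this asm statement. */
static void or_first_byte(char *p, size_t len) {
    asm("orb\t$0x20, (%[p])"
        : [mem] "+m"(*(char (*)[len])p)
        : [p] "r"(p));
}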
Summary of benchmark changes:
- The length of the data to process is 3.5 GB (it was only 40 MB).
- Each function is run 30 times (it was only 10 times).
- Added `-mavx2` to improve gcc base optimization.
- `lower_all_chars_avx_unaligned` has been removed.
- Use heap memory from `malloc` instead of a `static char[]` variable to handle the bigger data.
- Run on a Skylake microarchitecture (it was Broadwell).
Benchmark info:
- `Min` is the minimum time out of 30 function calls.
- `Max` is the maximum time out of 30 function calls.
- `Avg` is the average time of 30 function calls.
- `Total` is the total time of 30 function calls.
Benchmark results:
- CPU: Intel(R) Xeon(R) Gold 6140 CPU @ 2.30GHz
- Microarchitecture: Skylake
root@yukii-hpc2:/tmp# g++ -Wall -Wextra -std=c++2a -O3 -mavx2 test.cpp -o test
root@yukii-hpc2:/tmp# nice -n -20 ./test
lower_all_chars_avx_aligned
Min = 0.31086600
Max = 0.31319800
Avg = 0.31159833
Total = 9.34795000
lower_all_chars_base
Min = 0.31823400
Max = 0.32902100
Avg = 0.31904893
Total = 9.57146800
root@yukii-hpc2:/tmp# g++ --version
g++ (Ubuntu 9.3.0-17ubuntu1~20.04) 9.3.0
Copyright (C) 2019 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
root@yukii-hpc2:/tmp#
Code
- Godbolt link: https://godbolt.org/z/ox7hjT
#include <ctime>
#include <cstdio>
#include <cassert>
#include <cstring>
#include <cstdlib>
#include <cstdint>
#include <iostream>
using std::string;
void lower_all_chars_base(string &str);
void lower_all_chars_avx_aligned(string &str);
void do_benchmark(string &x, void (*fx)(string &));
void mem_flush(const void *p, unsigned int allocation_size);
#define _M(N) (size_t)(1024ull * 1024 * (N))
#define _G(N) (size_t)(1024ull * 1024 * 1024 * (N))
#define N (_G(3) + _M(512) + 1234) /* 3.5 G + 1234 */
/*
1234 is just to make it odd,
so it will likely exercise the tail of the AVX aligned loop.
*/
#define BENCHMARK(STR, FX) do { \
puts(#FX); \
do_benchmark(STR, FX); \
} while(0)
int main() {
char *x = (char *)malloc(N + 1);
memset(x, 'A', N);
x[N] = '\0';
{
string a(x);
memset(x, 'a', N);
BENCHMARK(a, lower_all_chars_avx_aligned);
assert(memcmp(a.c_str(), x, N) == 0);
}
/* Restore value for the next benchmark. */
memset(x, 'A', N);
{
string a(x);
memset(x, 'a', N);
BENCHMARK(a, lower_all_chars_base);
assert(memcmp(a.c_str(), x, N) == 0);
}
free(x);
}
inline static void lower_all_chars_b1024(char *cs, uint16_t len) {
while (len--) {
*cs++ |= 0x20;
}
}
/* Aligned memory for mask for performance. */
static const uint64_t mask[] __attribute__((aligned(32))) = {
0x2020202020202020ull
};
__attribute__((noinline))
void lower_all_chars_avx_aligned(string &str) {
char *cs = (char *)str.c_str();
size_t len = str.size();
/* Only use AVX for data bigger than or equal to 1K. */
if (len >= 1024) {
size_t avx_size = 0x1e0; /* Bytes per AVX main iteration. */
char *end = &(cs[len]);
/* End of aligned process iteration. */
char *end_avx = &(end[-avx_size]);
/* Dummy variable, to let the compiler choose the best GP register. */
uintptr_t rem_bytes;
asm(
/* Prepare %[rem_bytes] initial value. */
"movq\t%[end], %[rem_bytes]\n\t"
/* Load the mask. */
"vbroadcastsd\t%[mask], %%ymm0\n\t"
/* Handle unaligned memory from the head. */
"vpor\t(%[cs]), %%ymm0, %%ymm1\n\t"
"vmovdqu\t%%ymm1, (%[cs])\n\t"
"addq\t[=11=]x20, %[cs]\n\t" /* Move to the next 32 bytes. */
/* Handle aligned memory part.
Use `vmovdqa` to make sure that the memory is
aligned properly.
Note that ORing is idempotent: you can OR the same
byte multiple times without changing it further. So
%[cs] can partially overlap with `vmovdqu` operation
before this point.
#comment115632279_65404362
*/
"andq\t$~0b11111ull, %[cs]\n\t" /* Clear 5-bit LSB. */
"1:\n\t"
"vpor\t0x000(%[cs]), %%ymm0, %%ymm1\n\t"
"vpor\t0x020(%[cs]), %%ymm0, %%ymm2\n\t"
"vpor\t0x040(%[cs]), %%ymm0, %%ymm3\n\t"
"vpor\t0x060(%[cs]), %%ymm0, %%ymm4\n\t"
"vpor\t0x080(%[cs]), %%ymm0, %%ymm5\n\t"
"vpor\t0x0a0(%[cs]), %%ymm0, %%ymm6\n\t"
"vpor\t0x0c0(%[cs]), %%ymm0, %%ymm7\n\t"
"vpor\t0x0e0(%[cs]), %%ymm0, %%ymm8\n\t"
"vpor\t0x100(%[cs]), %%ymm0, %%ymm9\n\t"
"vpor\t0x120(%[cs]), %%ymm0, %%ymm10\n\t"
"vpor\t0x140(%[cs]), %%ymm0, %%ymm11\n\t"
"vpor\t0x160(%[cs]), %%ymm0, %%ymm12\n\t"
"vpor\t0x180(%[cs]), %%ymm0, %%ymm13\n\t"
"vpor\t0x1a0(%[cs]), %%ymm0, %%ymm14\n\t"
"vpor\t0x1c0(%[cs]), %%ymm0, %%ymm15\n\t"
/* Plug the result to aligned memory. */
"vmovdqa\t%%ymm1, 0x000(%[cs])\n\t"
"vmovdqa\t%%ymm2, 0x020(%[cs])\n\t"
"vmovdqa\t%%ymm3, 0x040(%[cs])\n\t"
"vmovdqa\t%%ymm4, 0x060(%[cs])\n\t"
"vmovdqa\t%%ymm5, 0x080(%[cs])\n\t"
"vmovdqa\t%%ymm6, 0x0a0(%[cs])\n\t"
"vmovdqa\t%%ymm7, 0x0c0(%[cs])\n\t"
"vmovdqa\t%%ymm8, 0x0e0(%[cs])\n\t"
"vmovdqa\t%%ymm9, 0x100(%[cs])\n\t"
"vmovdqa\t%%ymm10, 0x120(%[cs])\n\t"
"vmovdqa\t%%ymm11, 0x140(%[cs])\n\t"
"vmovdqa\t%%ymm12, 0x160(%[cs])\n\t"
"vmovdqa\t%%ymm13, 0x180(%[cs])\n\t"
"vmovdqa\t%%ymm14, 0x1a0(%[cs])\n\t"
"vmovdqa\t%%ymm15, 0x1c0(%[cs])\n\t"
"addq\t%[avx_size], %[cs]\n\t"
"cmpq\t%[end_avx], %[cs]\n\t"
"jb\t1b\n\t"
"subq\t%[cs], %[rem_bytes]\n\t"
/* Now, %[rem_bytes] contains the remaining bytes. */
"testq\t%[rem_bytes], %[rem_bytes]\n\t"
"jz\t3f\n\t"
/* There's no remaining bytes if `jz` is taken. */
/* Handle the tail; we may back off several bytes
so that the remaining byte count becomes a multiple of 32.
*/
"leaq\t0b11111(%[rem_bytes]), %[dec_avx]\n\t"
"andq\t$~0b11111ull, %[dec_avx]\n\t"
"subq\t%[rem_bytes], %[dec_avx]\n\t"
"subq\t%[dec_avx], %[cs]\n\t"
"2:\n\t"
"vpor\t(%[cs]), %%ymm0, %%ymm1\n\t"
"vmovdqu\t%%ymm1, (%[cs])\n\t"
"addq\t[=11=]x20, %[cs]\n\t"
"cmpq\t%[end], %[cs]\n\t"
"jb\t2b\n\t"
"3:\n\t"
#if !defined(__AVX__) && !defined(__AVX2__)
"vzeroupper"
#endif
/* Output */
: [cs]"+r"(cs),
[end]"+r"(end),
[end_avx]"+r"(end_avx),
[dec_avx]"=r"(end_avx), /* May reuse end_avx if needed. */
[rem_bytes]"=r"(rem_bytes),
/* Tell the compiler that this inline assembly is
going to read/write `len` bytes from `cs`. */
[dummy_mem_output]"+m"(*(char (*)[len])cs)
/* Input */
: [mask]"m"(mask),
[avx_size]"n"(avx_size)
/* Clobbers */
: "ymm0", "ymm1", "ymm2", "ymm3",
"ymm4", "ymm5", "ymm6", "ymm7",
"ymm8", "ymm9", "ymm10", "ymm11",
"ymm12", "ymm13", "ymm14", "ymm15"
);
} else {
/* Let the compiler use its own optimization here. */
lower_all_chars_b1024(cs, len);
}
}
__attribute__((noinline))
void lower_all_chars_base(string &str) {
char *cs = (char *)str.c_str();
size_t len = str.size();
while (len--) {
*cs++ |= 0x20;
}
}
void do_benchmark(string &x, void (*fx)(string &)) {
const size_t n = 30;
double min = 0, max = 0, avg, c, total = 0;
for (size_t i = 0; i < n; i++) {
mem_flush(x.c_str(), x.size());
clock_t time0 = clock();
fx(x);
clock_t time1 = clock();
c = (double)(time1 - time0) / CLOCKS_PER_SEC;
total += c;
if (i == 0) {
min = max = c;
} else {
if (c > max) max = c;
if (c < min) min = c;
}
}
avg = total / (double)n;
printf("Min = %.8f\n", min);
printf("Max = %.8f\n", max);
printf("Avg = %.8f\n", avg);
printf("Total = %.8f\n\n", total);
}
void mem_flush(const void *p, unsigned int allocation_size) {
/* Flush the buffer from all cache levels so every benchmark run starts cold. */
const size_t cache_line = 64;
const char *cp = (const char *)p;
size_t i = 0;
if (p == NULL || allocation_size <= 0)
return;
for (i = 0; i < allocation_size; i += cache_line) {
asm volatile("clflush (%0)"::"r"(&cp[i]):"memory");
}
asm volatile("sfence"::: "memory");
}
From personal experience, your solution needs more awareness of the overall architecture of the processor you are using, in particular the L1 cache line size and what triggers an L1 cache line load. Instead of read-modify-write, try first writing read-only loops [e.g. sum_of_bytes or strlen] and benchmarking those, then optimize the read-only loop. You will find that the code shown above stalls every time it crosses a cache line boundary, waiting for data to be transferred from the next cache level (L1, L2, L3) to where your code needs it. Similar stalls can occur at 4 kB or 64 kB boundaries, depending on the virtual memory page size your OS and processor use. It helps if you give the processor "prefetch" hints, such as "by the end of the current inner loop we will need cursor + 1024 [or a suitable cache or paging offset]"; a sketch follows below. Also, keep the inner loop below 1024 micro-ops to allow full use of the CPU instruction-decode pipeline. Furthermore, once the input/output buffers reach some minimum size, it actually becomes worthwhile to multi-thread the code and process in parallel, with trade-offs such as the time to set up the loop for each thread, and the NUMA affinity of the threads to the data buffers.
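To make the prefetch idea concrete, here is a minimal sketch using gcc's `__builtin_prefetch` (the function name and the 1024-byte prefetch distance are illustrative, not tuned):
#include <cstddef>
/* Minimal prefetch sketch (illustrative, untuned). The second argument
   (1) requests a prefetch for writing; the third (3) means high
   temporal locality. */
static void lower_with_prefetch(char *cs, size_t len) {
    for (size_t i = 0; i < len; i++) {
        /* Once per cache line, hint the line we will need soon. */
        if ((i & 63) == 0 && i + 1024 < len)
            __builtin_prefetch(cs + i + 1024, 1, 3);
        cs[i] |= 0x20;
    }
}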
Overall, this is not a simple problem, and it is usually not worth the effort unless you are highly optimizing for one CPU model for some kind of embedded application, or you want to shine in an HPC benchmark.