为什么使用全局变量会使多线程执行速度减慢 2 倍,而在其他环境中却使其速度提高 2 倍?
Why does using global variables make multi-threaded execution 2x slower, while in another environment it makes it 2x faster?
以下代码取自用g++编译的示例。多线程比单线程快 2 倍。
我在 Visual Studio 2019 中执行它,结果恰恰相反:单线程比多线程快 2 倍。
#include<thread>
#include<iostream>
#include<chrono>
using namespace std;
using ll = long long;

// Global accumulators, declared side by side. par() writes only `even`,
// impar() writes only `odd`.
ll odd, even;

// Adds every even value in the closed range [ini, fim] to the global `even`.
// The global is updated directly on every matching iteration.
void par(const ll ini, const ll fim)
{
    ll value = ini;
    while (value <= fim) {
        if ((value & 1) == 0) {
            even += value;
        }
        value++;
    }
}

// Adds every odd value in the closed range [ini, fim] to the global `odd`.
// The global is updated directly on every matching iteration.
void impar(const ll ini, const ll fim)
{
    ll value = ini;
    while (value <= fim) {
        if ((value & 1) != 0) {
            odd += value;
        }
        value++;
    }
}
// Benchmark driver: runs par() and impar() over the same range, first
// back-to-back on the calling thread and then on two concurrent threads,
// printing the sums and the elapsed wall time of each variant.
int main()
{
    // steady_clock is monotonic; high_resolution_clock may be an alias of
    // system_clock, which can be adjusted while the benchmark is running.
    using bench_clock = std::chrono::steady_clock;
    const auto micros_between = [](bench_clock::time_point from, bench_clock::time_point to) {
        return std::chrono::duration_cast<std::chrono::microseconds>(to - from).count();
    };

    const ll start = 0;
    const ll end = 190000000;

    /* SINGLE THREADED */
    const auto start_single = bench_clock::now();
    par(start, end);
    impar(start, end);
    const auto end_single = bench_clock::now();
    const auto single_duration = micros_between(start_single, end_single);
    // The measured value is in microseconds, so label it "us" (not "ms").
    std::cout << "SINGLE THREADED\nEven sum: " << even << "\nOdd sum: " << odd
              << "\nTime: " << single_duration << "us\n\n\n";
    /* END OF SINGLE */

    /* MULTI THREADED */
    // Reset the accumulators so the second run starts from zero.
    even = odd = 0;
    // The timed region includes thread creation and joining.
    const auto start_multi = bench_clock::now();
    std::thread t(par, start, end);
    std::thread t2(impar, start, end);
    t.join();
    t2.join();
    const auto end_multi = bench_clock::now();
    const auto multi_duration = micros_between(start_multi, end_multi);
    std::cout << "MULTI THREADED\nEven sum: " << even << "\nOdd sum: " << odd
              << "\nTime: " << multi_duration << "us\n";
    /* END OF MULTI */

    std::cout << "\n\nIs multi faster than single? => " << std::boolalpha
              << (multi_duration < single_duration) << '\n';
}
但是,如果我对我的函数进行如下所示的小修改:
void par(const ll ini, const ll fim)
{
ll temp = 0;
for (auto i = ini; i <= fim; i++)
if (!(i & 1))
temp += i;
even = temp;
}
void impar(const ll ini, const ll fim)
{
ll temp = 0;
for (auto i = ini; i <= fim; i++)
if (i & 1)
temp += i;
odd = temp;
}
多线程性能更好。我想知道是什么导致了这种行为(解释它的实现中可能存在哪些差异)。
此外,我使用 www.onlinegdb.com 的 gcc 进行了编译,结果与我机器中 Visual Studio 的结果相似。
您是 false sharing 的受害者。
`odd` 和 `even` 在内存中相邻,从两个线程分别访问它们会导致 L3 缓存行争用(也就是 false sharing,伪共享)。
您可以让它们在内存中相隔 64 个字节来修复这个问题,以确保它们位于不同的缓存行中,例如:
// alignas(64) applies to both declared variables, so each starts on its own
// 64-byte boundary (this separates them only if cache lines are <= 64 bytes —
// confirm for the target CPU).
alignas(64) ll odd, even;
有了这个改变,我用 2 个线程得到了很好的加速:
SINGLE THREADED
Even sum: 9025000095000000
Odd sum: 9025000000000000
Time: 825954ms
MULTI THREADED
Even sum: 9025000095000000
Odd sum: 9025000000000000
Time: 532420ms
至于 G++ 的性能——它可能已经替您自动完成了您手动所做的那种优化。MSVC 在优化全局变量时更加谨慎。
以下代码取自用g++编译的示例。多线程比单线程快 2 倍。
我在 Visual Studio 2019 中执行它,结果恰恰相反:单线程比多线程快 2 倍。
#include<thread>
#include<iostream>
#include<chrono>
using namespace std;
using ll = long long;

// Global accumulators, declared side by side. par() writes only `even`,
// impar() writes only `odd`.
ll odd, even;

// Adds every even value in the closed range [ini, fim] to the global `even`.
// The global is updated directly on every matching iteration.
void par(const ll ini, const ll fim)
{
    ll value = ini;
    while (value <= fim) {
        if ((value & 1) == 0) {
            even += value;
        }
        value++;
    }
}

// Adds every odd value in the closed range [ini, fim] to the global `odd`.
// The global is updated directly on every matching iteration.
void impar(const ll ini, const ll fim)
{
    ll value = ini;
    while (value <= fim) {
        if ((value & 1) != 0) {
            odd += value;
        }
        value++;
    }
}
// Benchmark driver: runs par() and impar() over the same range, first
// back-to-back on the calling thread and then on two concurrent threads,
// printing the sums and the elapsed wall time of each variant.
int main()
{
    // steady_clock is monotonic; high_resolution_clock may be an alias of
    // system_clock, which can be adjusted while the benchmark is running.
    using bench_clock = std::chrono::steady_clock;
    const auto micros_between = [](bench_clock::time_point from, bench_clock::time_point to) {
        return std::chrono::duration_cast<std::chrono::microseconds>(to - from).count();
    };

    const ll start = 0;
    const ll end = 190000000;

    /* SINGLE THREADED */
    const auto start_single = bench_clock::now();
    par(start, end);
    impar(start, end);
    const auto end_single = bench_clock::now();
    const auto single_duration = micros_between(start_single, end_single);
    // The measured value is in microseconds, so label it "us" (not "ms").
    std::cout << "SINGLE THREADED\nEven sum: " << even << "\nOdd sum: " << odd
              << "\nTime: " << single_duration << "us\n\n\n";
    /* END OF SINGLE */

    /* MULTI THREADED */
    // Reset the accumulators so the second run starts from zero.
    even = odd = 0;
    // The timed region includes thread creation and joining.
    const auto start_multi = bench_clock::now();
    std::thread t(par, start, end);
    std::thread t2(impar, start, end);
    t.join();
    t2.join();
    const auto end_multi = bench_clock::now();
    const auto multi_duration = micros_between(start_multi, end_multi);
    std::cout << "MULTI THREADED\nEven sum: " << even << "\nOdd sum: " << odd
              << "\nTime: " << multi_duration << "us\n";
    /* END OF MULTI */

    std::cout << "\n\nIs multi faster than single? => " << std::boolalpha
              << (multi_duration < single_duration) << '\n';
}
但是,如果我对我的函数进行如下所示的小修改:
void par(const ll ini, const ll fim)
{
ll temp = 0;
for (auto i = ini; i <= fim; i++)
if (!(i & 1))
temp += i;
even = temp;
}
void impar(const ll ini, const ll fim)
{
ll temp = 0;
for (auto i = ini; i <= fim; i++)
if (i & 1)
temp += i;
odd = temp;
}
多线程性能更好。我想知道是什么导致了这种行为(解释它的实现中可能存在哪些差异)。
此外,我使用 www.onlinegdb.com 的 gcc 进行了编译,结果与我机器中 Visual Studio 的结果相似。
您是 false sharing 的受害者。
`odd` 和 `even` 在内存中相邻,从两个线程分别访问它们会导致 L3 缓存行争用(也就是 false sharing,伪共享)。
您可以让它们在内存中相隔 64 个字节来修复这个问题,以确保它们位于不同的缓存行中,例如:
// alignas(64) applies to both declared variables, so each starts on its own
// 64-byte boundary (this separates them only if cache lines are <= 64 bytes —
// confirm for the target CPU).
alignas(64) ll odd, even;
有了这个改变,我用 2 个线程得到了很好的加速:
SINGLE THREADED
Even sum: 9025000095000000
Odd sum: 9025000000000000
Time: 825954ms
MULTI THREADED
Even sum: 9025000095000000
Odd sum: 9025000000000000
Time: 532420ms
至于 G++ 的性能——它可能已经替您自动完成了您手动所做的那种优化。MSVC 在优化全局变量时更加谨慎。