使用静态变量还是自动变量,并不会影响运行时性能
Using static variables Vs automatic doesn't impact the run-time performance
GCC 的一个奇怪的优化结果让我感到困惑:以下两个函数(calculate_with_static_vars 和 calculate_with_stack_vars)的执行速度没有任何有意义的差异。
这是 MRE 代码:
#include <iostream>
#include <cstddef>
#include <cmath>
#include <chrono>
// Minimal scope-based stopwatch: records the time of construction and, on
// destruction, writes the elapsed time in milliseconds to std::clog.
struct ScopedTimer
{
    // Captured when the object is constructed; never modified afterwards.
    const std::chrono::time_point< std::chrono::steady_clock > start { std::chrono::steady_clock::now( ) };
    // Filled in by the destructor just before the report is written.
    std::chrono::time_point< std::chrono::steady_clock > end;

    ScopedTimer( ) = default;

    // Copying is disabled; every instance prints its own report on destruction.
    ScopedTimer( const ScopedTimer& ) = delete;
    ScopedTimer& operator=( const ScopedTimer& ) = delete;

    ~ScopedTimer( )
    {
        end = std::chrono::steady_clock::now( );
        const std::chrono::duration< double, std::milli > elapsed { end - start };
        std::clog << "\nTimer took " << elapsed.count( ) << " ms\n";
    }
};
// Plain aggregate holding the three float coordinates of a point.
struct Point3D
{
    float x;
    float y;
    float z;
};
// Candidate 1: the four fixed points are function-local `static constexpr` objects.
// Returns the sum of the four consecutive distances
// point1 -> point2 -> point3 -> point4 -> point5, each computed with the
// three-argument std::hypot.
float calculate_with_static_vars( const Point3D point5 )
{
    static constexpr Point3D point1 { 1.5f, 4.83f, 2.01f };
    static constexpr Point3D point2 { 2.5f, 5.83f, 3.01f };
    static constexpr Point3D point3 { 3.5f, 6.83f, 4.01f };
    static constexpr Point3D point4 { 4.5f, 7.83f, 5.01f };

    // The first three legs depend only on the constants above.
    float total { std::hypot( point1.x - point2.x, point1.y - point2.y, point1.z - point2.z ) };
    total += std::hypot( point2.x - point3.x, point2.y - point3.y, point2.z - point3.z );
    total += std::hypot( point3.x - point4.x, point3.y - point4.y, point3.z - point4.z );

    // Only the last leg involves the caller-supplied point.
    total += std::hypot( point4.x - point5.x, point4.y - point5.y, point4.z - point5.z );
    return total;
}
// Candidate 2: the four fixed points are automatic (non-static) `constexpr` objects.
// Returns the sum of the four consecutive distances
// point1 -> point2 -> point3 -> point4 -> point5, each computed with the
// three-argument std::hypot.
float calculate_with_stack_vars( const Point3D point5 )
{
    constexpr Point3D point1 { 1.5f, 4.83f, 2.01f };
    constexpr Point3D point2 { 2.5f, 5.83f, 3.01f };
    constexpr Point3D point3 { 3.5f, 6.83f, 4.01f };
    constexpr Point3D point4 { 4.5f, 7.83f, 5.01f };

    // The first three legs depend only on the constants above.
    float total { std::hypot( point1.x - point2.x, point1.y - point2.y, point1.z - point2.z ) };
    total += std::hypot( point2.x - point3.x, point2.y - point3.y, point2.z - point3.z );
    total += std::hypot( point3.x - point4.x, point3.y - point4.y, point3.z - point4.z );

    // Only the last leg involves the caller-supplied point.
    total += std::hypot( point4.x - point5.x, point4.y - point5.y, point4.z - point5.z );
    return total;
}
// a function that decides which of the above functions to call based on the branch_flag
inline float testFunc( const bool branch_flag, const bool arg_flag )
{
bool isStatic { branch_flag };
Point3D point2;
if ( arg_flag ) { point2 = { 3.5f, 7.33f, 9.04f }; }
else { point2 = { 2.5f, 6.33f, 8.04f }; }
float dist;
constexpr size_t numOfIterations { 1'000'000'000 };
if ( isStatic )
{
for ( size_t counter { }; counter < numOfIterations; ++counter )
{
dist = calculate_with_static_vars( point2 );
}
}
else
{
for ( size_t counter { }; counter < numOfIterations; ++counter )
{
dist = calculate_with_stack_vars( point2 );
}
}
return dist;
}
// Entry point: reads the two flags from standard input, times one testFunc
// run with a ScopedTimer, and prints the returned sum to standard output.
// Returns 1 without running the benchmark if the flags cannot be read.
int main( )
{
    // Reads one flag, accepting the textual form (true/false) as well as the
    // numeric form (1/0). A plain `std::cin >> bool` only understands 0/1: fed
    // "true" it fails, and the following extraction then leaves its target
    // unmodified, so the second flag would be read while uninitialized.
    const auto read_flag = [ ]( bool& flag )
    {
        if ( std::cin >> std::boolalpha >> flag ) { return true; }
        // A non-matching first character such as '1' is left in the stream,
        // so the numeric form can still be parsed after clearing the error.
        std::cin.clear( );
        return static_cast< bool >( std::cin >> std::noboolalpha >> flag );
    };

    bool branch_flag { };
    bool arg_flag { };
    if ( !read_flag( branch_flag ) || !read_flag( arg_flag ) )
    {
        std::cerr << "Expected two boolean flags (true/false or 1/0)\n";
        return 1;
    }

    float dist { };
    {
        ScopedTimer timer; // reports when this inner scope ends
        dist = testFunc( branch_flag, arg_flag );
    }
    std::cout << "Sum of the distances of the four points: " << dist << '\n';
}
这两个函数做的是同样的工作(计算 4 段相邻点之间的距离并返回它们的和),唯一的区别是一个使用静态变量,另一个使用栈变量(即自动变量)。
用户需要在控制台输入两个布尔值:第一个决定运行哪个函数;第二个无关紧要,只用来决定把哪个点作为参数传给被调用的函数。例如:
true // runs the function with static vars
true // passes the first point to it
或
false // runs the function with automatic vars
true // passes the first point to it
然后,testFunc 内的循环会调用所选函数 10 亿次。
现在有人可能想知道这段代码为什么如此臃肿。原因是我想阻止 GCC 进行激进的编译期优化;否则,它会把这两个函数当作隐式的 consteval 函数(即在编译期直接求值),这将破坏我测试的目的。
所以问题是:这两个函数的运行时间为什么相同(在我的旧机器上大约 22 秒)?静态版本只分配一次存储、只初始化一次变量,不应该快得多吗?
So the question is how are these functions taking the same amount of time to run (~22 sec on my old machine)?
因为它们可以被编译成完全相同的汇编代码。
Shouldn't the static version be considerably faster since it allocates storage and then initializes its variables only once?
不会。这些变量是编译期常量,编译器实际上可以不为它们分配任何存储空间。
经过常量折叠(constant folding)优化后,两个函数实际上都等价于:
// dist1 + dist2 + dist3 folded into a single constant; the `f` suffix keeps
// the whole sum in float, matching the float arithmetic of both functions.
return 5.19615269f
     + std::hypot(
           4.5f - point5.x,
           7.83f - point5.y,
           5.01f - point5.z);
GCC 的一个奇怪的优化结果让我感到困惑:以下两个函数(calculate_with_static_vars 和 calculate_with_stack_vars)的执行速度没有任何有意义的差异。
这是 MRE 代码:
#include <iostream>
#include <cstddef>
#include <cmath>
#include <chrono>
// Minimal scope-based stopwatch: records the time of construction and, on
// destruction, writes the elapsed time in milliseconds to std::clog.
struct ScopedTimer
{
    // Captured when the object is constructed; never modified afterwards.
    const std::chrono::time_point< std::chrono::steady_clock > start { std::chrono::steady_clock::now( ) };
    // Filled in by the destructor just before the report is written.
    std::chrono::time_point< std::chrono::steady_clock > end;

    ScopedTimer( ) = default;

    // Copying is disabled; every instance prints its own report on destruction.
    ScopedTimer( const ScopedTimer& ) = delete;
    ScopedTimer& operator=( const ScopedTimer& ) = delete;

    ~ScopedTimer( )
    {
        end = std::chrono::steady_clock::now( );
        const std::chrono::duration< double, std::milli > elapsed { end - start };
        std::clog << "\nTimer took " << elapsed.count( ) << " ms\n";
    }
};
// Plain aggregate holding the three float coordinates of a point.
struct Point3D
{
    float x;
    float y;
    float z;
};
// Candidate 1: the four fixed points are function-local `static constexpr` objects.
// Returns the sum of the four consecutive distances
// point1 -> point2 -> point3 -> point4 -> point5, each computed with the
// three-argument std::hypot.
float calculate_with_static_vars( const Point3D point5 )
{
    static constexpr Point3D point1 { 1.5f, 4.83f, 2.01f };
    static constexpr Point3D point2 { 2.5f, 5.83f, 3.01f };
    static constexpr Point3D point3 { 3.5f, 6.83f, 4.01f };
    static constexpr Point3D point4 { 4.5f, 7.83f, 5.01f };

    // The first three legs depend only on the constants above.
    float total { std::hypot( point1.x - point2.x, point1.y - point2.y, point1.z - point2.z ) };
    total += std::hypot( point2.x - point3.x, point2.y - point3.y, point2.z - point3.z );
    total += std::hypot( point3.x - point4.x, point3.y - point4.y, point3.z - point4.z );

    // Only the last leg involves the caller-supplied point.
    total += std::hypot( point4.x - point5.x, point4.y - point5.y, point4.z - point5.z );
    return total;
}
// Candidate 2: the four fixed points are automatic (non-static) `constexpr` objects.
// Returns the sum of the four consecutive distances
// point1 -> point2 -> point3 -> point4 -> point5, each computed with the
// three-argument std::hypot.
float calculate_with_stack_vars( const Point3D point5 )
{
    constexpr Point3D point1 { 1.5f, 4.83f, 2.01f };
    constexpr Point3D point2 { 2.5f, 5.83f, 3.01f };
    constexpr Point3D point3 { 3.5f, 6.83f, 4.01f };
    constexpr Point3D point4 { 4.5f, 7.83f, 5.01f };

    // The first three legs depend only on the constants above.
    float total { std::hypot( point1.x - point2.x, point1.y - point2.y, point1.z - point2.z ) };
    total += std::hypot( point2.x - point3.x, point2.y - point3.y, point2.z - point3.z );
    total += std::hypot( point3.x - point4.x, point3.y - point4.y, point3.z - point4.z );

    // Only the last leg involves the caller-supplied point.
    total += std::hypot( point4.x - point5.x, point4.y - point5.y, point4.z - point5.z );
    return total;
}
// a function that decides which of the above functions to call based on the branch_flag
inline float testFunc( const bool branch_flag, const bool arg_flag )
{
bool isStatic { branch_flag };
Point3D point2;
if ( arg_flag ) { point2 = { 3.5f, 7.33f, 9.04f }; }
else { point2 = { 2.5f, 6.33f, 8.04f }; }
float dist;
constexpr size_t numOfIterations { 1'000'000'000 };
if ( isStatic )
{
for ( size_t counter { }; counter < numOfIterations; ++counter )
{
dist = calculate_with_static_vars( point2 );
}
}
else
{
for ( size_t counter { }; counter < numOfIterations; ++counter )
{
dist = calculate_with_stack_vars( point2 );
}
}
return dist;
}
// Entry point: reads the two flags from standard input, times one testFunc
// run with a ScopedTimer, and prints the returned sum to standard output.
// Returns 1 without running the benchmark if the flags cannot be read.
int main( )
{
    // Reads one flag, accepting the textual form (true/false) as well as the
    // numeric form (1/0). A plain `std::cin >> bool` only understands 0/1: fed
    // "true" it fails, and the following extraction then leaves its target
    // unmodified, so the second flag would be read while uninitialized.
    const auto read_flag = [ ]( bool& flag )
    {
        if ( std::cin >> std::boolalpha >> flag ) { return true; }
        // A non-matching first character such as '1' is left in the stream,
        // so the numeric form can still be parsed after clearing the error.
        std::cin.clear( );
        return static_cast< bool >( std::cin >> std::noboolalpha >> flag );
    };

    bool branch_flag { };
    bool arg_flag { };
    if ( !read_flag( branch_flag ) || !read_flag( arg_flag ) )
    {
        std::cerr << "Expected two boolean flags (true/false or 1/0)\n";
        return 1;
    }

    float dist { };
    {
        ScopedTimer timer; // reports when this inner scope ends
        dist = testFunc( branch_flag, arg_flag );
    }
    std::cout << "Sum of the distances of the four points: " << dist << '\n';
}
这两个函数做的是同样的工作(计算 4 段相邻点之间的距离并返回它们的和),唯一的区别是一个使用静态变量,另一个使用栈变量(即自动变量)。
用户需要在控制台输入两个布尔值:第一个决定运行哪个函数;第二个无关紧要,只用来决定把哪个点作为参数传给被调用的函数。例如:
true // runs the function with static vars
true // passes the first point to it
或
false // runs the function with automatic vars
true // passes the first point to it
然后,testFunc 内的循环会调用所选函数 10 亿次。
现在有人可能想知道这段代码为什么如此臃肿。原因是我想阻止 GCC 进行激进的编译期优化;否则,它会把这两个函数当作隐式的 consteval 函数(即在编译期直接求值),这将破坏我测试的目的。
所以问题是:这两个函数的运行时间为什么相同(在我的旧机器上大约 22 秒)?静态版本只分配一次存储、只初始化一次变量,不应该快得多吗?
So the question is how are these functions taking the same amount of time to run (~22 sec on my old machine)?
因为它们可以被编译成完全相同的汇编代码。
Shouldn't the static version be considerably faster since it allocates storage and then initializes its variables only once?
不会。这些变量是编译期常量,编译器实际上可以不为它们分配任何存储空间。
经过常量折叠(constant folding)优化后,两个函数实际上都等价于:
// dist1 + dist2 + dist3 folded into a single constant; the `f` suffix keeps
// the whole sum in float, matching the float arithmetic of both functions.
return 5.19615269f
     + std::hypot(
           4.5f - point5.x,
           7.83f - point5.y,
           5.01f - point5.z);