<cmath> 中的基准函数与 GCC 和 MSVS
Benchmark functions in <cmath> with GCC and MSVS
我的任务是对 cmath
中几乎每个函数的 64 位整数和 double
的时间成本进行基准测试。这是我的源代码:
#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstdint>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <numeric>
#include <random>
#include <string>
#include <unordered_map>
#include <vector>
// Minimal stopwatch on top of std::chrono::steady_clock.
// `timetype` is the std::chrono duration type used to express the elapsed
// time reported by tok() (e.g. std::chrono::nanoseconds).
template<typename timetype>
struct tiktok
{
    // Instant recorded by the most recent call to tik().
    std::chrono::time_point<std::chrono::steady_clock> start;

    // Starts (or restarts) the stopwatch. Always returns 0.
    std::size_t tik()
    {
        start = std::chrono::steady_clock::now();
        return 0;
    }

    // Returns the time elapsed since the last tik(), as a count of
    // `timetype` units.
    std::size_t tok()
    {
        const auto now = std::chrono::steady_clock::now();
        const auto elapsed = std::chrono::duration_cast<timetype>(now - start);
        return elapsed.count();
    }
};
/**
 * Rough micro-benchmark of 64-bit integer arithmetic, double arithmetic and
 * most <cmath> functions.
 *
 * Each benchmark fills a working array with random operands, times a single
 * pass over that array with tiktok<std::chrono::nanoseconds>, and stores the
 * elapsed nanoseconds in `M` under a descriptive key such as "exp ( double )".
 *
 * The timed loops are deliberately written out inline instead of being routed
 * through a generic helper, so that the measured code does not pick up extra
 * call layers in unoptimized builds.
 *
 * @param M        Output map: benchmark name -> elapsed nanoseconds (never
 *                 negative).
 * @param rngSeed  Seed of the std::mt19937 generator that produces operands.
 * @param maxIter  Number of operands per benchmark. Rounded down to an even
 *                 number; negative values are treated as 0.
 * @return Sum of every benchmark's results. The value has no numerical
 *         meaning; it exists so the results are consumed and the timed loops
 *         cannot be discarded by the optimizer.
 */
double mathHspeed(std::unordered_map<std::string, int64_t>& M,
    int rngSeed, int maxIter = 100000)
{
    std::mt19937 rng(rngSeed);
    std::uniform_real_distribution<double> U(-5, 5);
    std::uniform_int_distribution<int32_t> Uint(-2147483647, 2147483647);
    std::uniform_real_distribution<double> UhalfPi(-3.14 / 2, 3.14 / 2);
    std::uniform_real_distribution<double> U_11(-0.99, 0.99);
    std::uniform_real_distribution<double> U1_10(1.1, 10);
    std::uniform_real_distribution<double> betaU(0.001, 30);
    std::uniform_real_distribution<double> expintU(-30, 30);
    tiktok<std::chrono::nanoseconds> timer;
    double S = 0;

    // Even, non-negative element count. The pairwise samplers below write
    // two elements per step and therefore rely on the count being even.
    const int n = (std::max(maxIter, 0) / 2) * 2;
    const int endPair = n - 1;    // loop bound for kernels reading v[i + 1]
    const int endTriple = n - 2;  // loop bound for kernels reading v[i + 2]
    std::vector<int64_t> u(n);
    std::vector<double> v(n);

    // Time cost of reading and writing 8 bytes. Treated as 0 because only
    // rough numbers are needed.
    const int64_t loadingCost = 0;

    // ---- operand samplers (never timed) -----------------------------------

    // u[i] = difference of two int32 draws. The draws are taken in separate
    // statements so their order is fixed; as operands of a single `a - b`
    // expression the evaluation order would be unspecified.
    const auto sampleInt = [&] {
        for (int64_t& x : u)
        {
            const int64_t a = Uint(rng);
            const int64_t b = Uint(rng);
            x = a - b;
        }
    };
    // v[i] drawn from an arbitrary real distribution.
    const auto sampleFrom = [&](std::uniform_real_distribution<double>& dist) {
        for (double& x : v) x = dist(rng);
    };
    // v[i] uniform in [-5, 5).
    const auto sampleDouble = [&] { sampleFrom(U); };
    // v[i] = |U| + offset, for functions that need non-negative input.
    const auto sampleMagnitude = [&](double offset) {
        for (double& x : v) x = std::abs(U(rng)) + offset;
    };
    // Even slots get a product of two draws, odd slots a single draw.
    const auto samplePairs = [&] {
        for (int i = 0; i < endPair; i += 2)
        {
            const double a = U(rng);
            const double b = U(rng);
            v[i] = a * b;
            v[i + 1] = U(rng);
        }
    };

    // Stops the timer, folds `data` into S, stores the clamped duration under
    // `key` and returns it. `correction` is an extra cost (in ns) to subtract,
    // used by kernels that perform an additional addition per element.
    const auto record = [&](const char* key, const auto& data,
                            int64_t correction = 0) -> int64_t {
        const int64_t elapsed = static_cast<int64_t>(timer.tok());
        S += std::accumulate(data.begin(), data.end(), 0.0);
        const int64_t cost =
            std::max<int64_t>(elapsed - loadingCost - correction, 0);
        M[key] = cost;
        return cost;
    };

    // ---- integer arithmetic -----------------------------------------------

    sampleInt();
    timer.tik();
    for (int i = 0; i < endPair; ++i)
        u[i] += u[i + 1];
    const int64_t addIntCost = record("+ ( int64 )", u);

    sampleInt();
    timer.tik();
    for (int i = 0; i < endPair; ++i)
    {
        // Operands reach about +/-4.29e9, so the product can exceed
        // INT64_MAX. Multiply in the unsigned domain, where wrap-around is
        // well defined, instead of overflowing a signed integer.
        u[i] = static_cast<int64_t>(
            static_cast<uint64_t>(u[i]) * static_cast<uint64_t>(u[i + 1]));
    }
    record("x ( int64 )", u);

    for (int i = 0; i < endPair; i += 2)
    {
        const int64_t a = Uint(rng);
        const int64_t b = Uint(rng);
        u[i] = a * b;
        u[i + 1] = Uint(rng);
    }
    // Every element from index 1 on is used as a divisor; a zero draw (or a
    // zero product) would be a division by zero.
    for (int64_t& x : u)
        if (x == 0) x = 1;
    timer.tik();
    for (int i = 0; i < endPair; ++i)
        u[i] = u[i] / u[i + 1] + u[i] % u[i + 1];
    record("/% ( int64 )", u, addIntCost);  // remove the cost of the extra '+'

    // ---- double arithmetic ------------------------------------------------

    sampleDouble();
    timer.tik();
    for (int i = 0; i < endPair; ++i)
        v[i] += v[i + 1];
    const int64_t addDoubleCost = record("+ ( double )", v);

    sampleDouble();
    timer.tik();
    for (int i = 0; i < endPair; ++i)
        v[i] *= v[i + 1];
    record("x ( double )", v);

    sampleDouble();
    timer.tik();
    for (int i = 0; i < endPair; ++i)
        v[i] /= v[i + 1];
    record("/ ( double )", v);

    // ---- abs / remainders -------------------------------------------------

    sampleInt();
    timer.tik();
    for (int i = 0; i < n; ++i)
        u[i] = std::abs(u[i]);
    record("abs ( int64 )", u);

    sampleDouble();
    timer.tik();
    for (int i = 0; i < n; ++i)
        v[i] = std::abs(v[i]);
    record("abs ( double )", v);

    samplePairs();
    timer.tik();
    for (int i = 0; i < endPair; ++i)
        v[i] = std::fmod(v[i], v[i + 1]);
    record("fmod ( double, double )", v);

    samplePairs();
    timer.tik();
    for (int i = 0; i < endPair; ++i)
        v[i] = std::remainder(v[i], v[i + 1]);
    record("remainder ( double, double )", v);

    samplePairs();
    timer.tik();
    for (int i = 0; i < endPair; ++i)
    {
        int quotient = 0;
        v[i] = std::remquo(v[i], v[i + 1], &quotient) + quotient;
    }
    record("remquo ( double )", v, addDoubleCost);  // remove the extra '+'

    // ---- fma / min / max / dim --------------------------------------------

    sampleDouble();
    timer.tik();
    for (int i = 0; i < endTriple; ++i)
        v[i] = std::fma(v[i], v[i + 1], v[i + 2]);
    record("fma ( double, double, double )", v);

    sampleDouble();
    timer.tik();
    for (int i = 0; i < endPair; ++i)
        v[i] = std::fmax(v[i], v[i + 1]);
    record("fmax ( double, double )", v);

    sampleDouble();
    timer.tik();
    for (int i = 0; i < endPair; ++i)
        v[i] = std::fmin(v[i], v[i + 1]);
    record("fmin ( double, double )", v);

    sampleDouble();
    timer.tik();
    for (int i = 0; i < endPair; ++i)
        v[i] = std::fdim(v[i], v[i + 1]);
    record("fdim ( double, double )", v);

    // ---- exponentials and logarithms --------------------------------------

    sampleDouble();
    timer.tik();
    for (int i = 0; i < n; ++i)
        v[i] = std::exp(v[i]);
    record("exp ( double )", v);

    sampleDouble();
    timer.tik();
    for (int i = 0; i < n; ++i)
        v[i] = std::exp2(v[i]);
    record("exp2 ( double )", v);

    sampleDouble();
    timer.tik();
    for (int i = 0; i < n; ++i)
        v[i] = std::expm1(v[i]);
    record("expm1 ( double )", v);

    sampleMagnitude(1e-6);  // strictly positive input
    timer.tik();
    for (int i = 0; i < n; ++i)
        v[i] = std::log(v[i]);
    record("log ( double )", v);

    sampleMagnitude(1e-6);
    timer.tik();
    for (int i = 0; i < n; ++i)
        v[i] = std::log10(v[i]);
    record("log10 ( double )", v);

    sampleMagnitude(1e-6);
    timer.tik();
    for (int i = 0; i < n; ++i)
        v[i] = std::log2(v[i]);
    record("log2 ( double )", v);

    sampleMagnitude(1e-6);
    timer.tik();
    for (int i = 0; i < n; ++i)
        v[i] = std::log1p(v[i]);
    record("log1p ( double )", v);

    // ---- powers and roots -------------------------------------------------

    sampleMagnitude(0.0);  // non-negative base and exponent
    timer.tik();
    for (int i = 0; i < endPair; ++i)
        v[i] = std::pow(v[i], v[i + 1]);
    record("pow ( double, double )", v);

    sampleMagnitude(0.0);
    timer.tik();
    for (int i = 0; i < n; ++i)
        v[i] = std::sqrt(v[i]);
    record("sqrt ( double )", v);

    sampleDouble();
    timer.tik();
    for (int i = 0; i < n; ++i)
        v[i] = std::cbrt(v[i]);
    record("cbrt ( double )", v);

    sampleDouble();
    timer.tik();
    for (int i = 0; i < endPair; ++i)
        v[i] = std::hypot(v[i], v[i + 1]);
    record("hypot ( double, double )", v);

    sampleDouble();
    timer.tik();
    for (int i = 0; i < endTriple; ++i)
        v[i] = std::hypot(v[i], v[i + 1], v[i + 2]);  // 3-argument form: C++17
    record("hypot ( double, double, double )", v);

    // ---- trigonometric ----------------------------------------------------

    sampleDouble();
    timer.tik();
    for (int i = 0; i < n; ++i)
        v[i] = std::sin(v[i]);
    record("sin ( double )", v);

    sampleDouble();
    timer.tik();
    for (int i = 0; i < n; ++i)
        v[i] = std::cos(v[i]);
    record("cos ( double )", v);

    sampleFrom(UhalfPi);
    timer.tik();
    for (int i = 0; i < n; ++i)
        v[i] = std::tan(v[i]);
    record("tan ( double )", v);

    sampleFrom(U_11);
    timer.tik();
    for (int i = 0; i < n; ++i)
        v[i] = std::asin(v[i]);
    record("asin ( double )", v);

    sampleFrom(U_11);
    timer.tik();
    for (int i = 0; i < n; ++i)
        v[i] = std::acos(v[i]);
    record("acos ( double )", v);

    sampleDouble();
    timer.tik();
    for (int i = 0; i < n; ++i)
        v[i] = std::atan(v[i]);
    record("atan ( double )", v);

    sampleDouble();
    timer.tik();
    for (int i = 0; i < endPair; ++i)
        v[i] = std::atan2(v[i], v[i + 1]);
    record("atan2 ( double, double )", v);

    // ---- hyperbolic -------------------------------------------------------

    sampleDouble();
    timer.tik();
    for (int i = 0; i < n; ++i)
        v[i] = std::sinh(v[i]);
    record("sinh ( double )", v);

    sampleDouble();
    timer.tik();
    for (int i = 0; i < n; ++i)
        v[i] = std::cosh(v[i]);
    record("cosh ( double )", v);

    sampleDouble();
    timer.tik();
    for (int i = 0; i < n; ++i)
        v[i] = std::tanh(v[i]);
    record("tanh ( double )", v);

    sampleDouble();
    timer.tik();
    for (int i = 0; i < n; ++i)
        v[i] = std::asinh(v[i]);
    record("asinh ( double )", v);

    sampleFrom(U1_10);
    timer.tik();
    for (int i = 0; i < n; ++i)
        v[i] = std::acosh(v[i]);
    record("acosh ( double )", v);

    sampleFrom(U_11);
    timer.tik();
    for (int i = 0; i < n; ++i)
        v[i] = std::atanh(v[i]);
    record("atanh ( double )", v);

    // ---- error and gamma functions ----------------------------------------

    sampleMagnitude(0.0);
    timer.tik();
    for (int i = 0; i < n; ++i)
        v[i] = std::erf(v[i]);
    record("erf ( double )", v);

    sampleDouble();
    timer.tik();
    for (int i = 0; i < n; ++i)
        v[i] = std::erfc(v[i]);
    record("erfc ( double )", v);

    sampleMagnitude(1e-3);
    timer.tik();
    for (int i = 0; i < n; ++i)
        v[i] = std::tgamma(v[i]);
    record("tgamma ( double )", v);

    sampleMagnitude(1e-3);
    timer.tik();
    for (int i = 0; i < n; ++i)
        v[i] = std::lgamma(v[i]);
    record("lgamma ( double )", v);

    // ---- rounding ---------------------------------------------------------

    sampleDouble();
    timer.tik();
    for (int i = 0; i < n; ++i)
        v[i] = std::ceil(v[i]);
    record("ceil ( double )", v);

    sampleDouble();
    timer.tik();
    for (int i = 0; i < n; ++i)
        v[i] = std::floor(v[i]);
    record("floor ( double )", v);

    sampleDouble();
    timer.tik();
    for (int i = 0; i < n; ++i)
        v[i] = std::trunc(v[i]);
    record("trunc ( double )", v);

    sampleDouble();
    timer.tik();
    for (int i = 0; i < n; ++i)
        v[i] = std::round(v[i]);
    record("round ( double )", v);

    sampleDouble();
    timer.tik();
    for (int i = 0; i < n; ++i)
        v[i] = std::nearbyint(v[i]);
    record("nearbyint ( double )", v);

    sampleDouble();
    timer.tik();
    for (int i = 0; i < n; ++i)
        v[i] = std::rint(v[i]);
    record("rint ( double )", v);

    // ---- floating-point manipulation --------------------------------------

    sampleDouble();
    timer.tik();
    for (int i = 0; i < n; ++i)
    {
        int exponent = 0;
        v[i] = std::frexp(v[i], &exponent) + exponent;
    }
    record("frexp ( double, int* )", v, addDoubleCost);  // remove the extra '+'

    sampleDouble();
    timer.tik();
    for (int i = 0; i < endPair; ++i)
    {
        const int exponent = static_cast<int>(v[i + 1]);
        v[i] = std::ldexp(v[i], exponent);
    }
    // The key says "int*" although ldexp takes a plain int; it is kept as is
    // because callers look the result up under this exact string.
    record("ldexp ( double, int* )", v);

    sampleDouble();
    timer.tik();
    for (int i = 0; i < n; ++i)
    {
        double integral = 0.0;
        v[i] = std::modf(v[i], &integral) + integral;
    }
    record("modf ( double, double* )", v);

    sampleDouble();
    timer.tik();
    for (int i = 0; i < endPair; ++i)
        v[i] = std::copysign(v[i], v[i + 1]);
    record("copysign ( double, double )", v);

    // ---- special functions (C++17) ----------------------------------------

    sampleFrom(betaU);  // x, y in [0.001, 30]
    timer.tik();
    for (int i = 0; i < endPair; ++i)
        v[i] = std::beta(v[i], v[i + 1]);
    record("beta ( double, double )", v);

    // x in [-30, 30]; values within 0.01 of zero are replaced by 0.01.
    for (double& x : v)
    {
        x = expintU(rng);
        if (x > -0.01 && x < 0.01) x = 0.01;
    }
    timer.tik();
    for (int i = 0; i < n; ++i)
        v[i] = std::expint(v[i]);
    record("expint ( double )", v);

    // A std::riemann_zeta benchmark (x in [-1, 2]) is intentionally left
    // disabled; no "riemann_zeta ( double )" key is written.

    return S;
}
/**
 * Interactive driver: reads a random seed, an iteration count and an output
 * path from standard input, runs mathHspeed(), and writes one CSV row per
 * benchmark (name, time in microseconds, time relative to int64 addition).
 *
 * @return 0 on success, 1 if the input is malformed or the output file cannot
 *         be opened.
 */
int main()
{
    int randomSeed = 0;
    std::cout << "Random seed: ";
    std::cin >> randomSeed;

    int maxIter = 0;
    std::cout << "Max iteration: ";
    std::cin >> maxIter;

    std::string save;
    std::cout << "Save file path: ";
    std::cin >> save;

    // A failed extraction leaves the stream in a failed state, so a single
    // check after the last read covers all three inputs.
    if (!std::cin)
    {
        std::cerr << "Invalid input: expected two integers and a file path.\n";
        return 1;
    }

    // Open the output before benchmarking so a bad path is reported at once.
    std::ofstream out(save);
    if (!out)
    {
        std::cerr << "Cannot open output file: " << save << '\n';
        return 1;
    }

    std::unordered_map<std::string, int64_t> M;
    const double S = mathHspeed(M, randomSeed, maxIter);

    // Report order. Each entry must match a key written by mathHspeed().
    const std::vector<std::string> funNames = {
        "+ ( int64 )", "x ( int64 )", "/% ( int64 )",
        "+ ( double )", "x ( double )", "/ ( double )",
        "abs ( int64 )", "abs ( double )", "fmod ( double, double )",
        "remainder ( double, double )", "remquo ( double )",
        "fma ( double, double, double )", "fmax ( double, double )",
        "fmin ( double, double )", "fdim ( double, double )",
        "exp ( double )", "exp2 ( double )", "expm1 ( double )",
        "log ( double )", "log10 ( double )", "log2 ( double )",
        "log1p ( double )", "pow ( double, double )", "sqrt ( double )",
        "cbrt ( double )", "hypot ( double, double )",
        "hypot ( double, double, double )", "sin ( double )",
        "cos ( double )", "tan ( double )", "asin ( double )",
        "acos ( double )", "atan ( double )", "atan2 ( double, double )",
        "sinh ( double )", "cosh ( double )", "tanh ( double )",
        "asinh ( double )", "acosh ( double )", "atanh ( double )",
        "erf ( double )", "erfc ( double )", "tgamma ( double )",
        "lgamma ( double )", "ceil ( double )", "floor ( double )",
        "trunc ( double )", "round ( double )", "nearbyint ( double )",
        "rint ( double )", "frexp ( double, int* )", "ldexp ( double, int* )",
        "modf ( double, double* )", "copysign ( double, double )",
        "beta ( double, double )", "expint ( double )"//, "riemann_zeta ( double )"
    };

    // Baseline for the "Relative" column. Clamped to 1 ns so that a zero
    // measurement (tiny maxIter or a coarse clock) cannot produce inf/NaN.
    const double baseline =
        static_cast<double>(std::max<int64_t>(M["+ ( int64 )"], 1));

    constexpr int nameWidth = 32;
    constexpr int realtimewd = 16;
    constexpr int relatimewd = 16;

    // Durations are stored in nanoseconds and divided by 1000 below, so the
    // reported unit is microseconds.
    out << std::setw(nameWidth) << "Function name,"
        << std::setw(realtimewd) << "Time cost (us),"
        << std::setw(relatimewd) << "Relative" << '\n';

    for (const std::string& name : funNames)
    {
        const int64_t nanos = M[name];
        const double relative = std::round(nanos / baseline * 10) / 10.0;

        // Commas inside a name would split the CSV field; strip them.
        std::string csvName = name;
        csvName.erase(std::remove(csvName.begin(), csvName.end(), ','),
                      csvName.end());

        out << std::setw(nameWidth) << csvName << ","
            << std::setw(realtimewd) << std::round(nanos / 1000.0) << ","
            << std::setw(relatimewd) << relative << '\n';
    }

    std::cout << "\nDummy sum = " << S << std::endl;
    return 0;
}
在 Windows 64 位笔记本电脑上使用 GCC-8.3(-O0
或 -O3
)和 MSVS Community 2019(/O2
)编译代码。该笔记本电脑配备 Intel i9-9980 处理器、512KB L1 缓存、2MB L2 缓存和 16MB L3 缓存。然后输入以下参数:
下面的表格显示了结果:
更具体地说,MSVS 中的优化菜单如下所示:
我无法将“全程序优化”设置为“是”,因为 MSVS 一直抱怨“/ZI 和 /GL 不兼容”。
我的问题:
(1) 我是 MSVS 工具链的新手。为什么 MSVS 构建的可执行文件对于大多数函数来说都这么慢? MSVS /O2
甚至无法胜过 GCC -O0
。如何让 MSVS 产生同样快速的代码?我注意到 GCC 产生了一个大约 3MB 的 .exe
,但是 MSVS 产生了一个大约 154KB 的 .exe
和一个 2.9MB 的 .pdb
。
(2) 有趣的是,有一些函数,例如 sin(x)
、cos(x)
和 exp(x)
,其中 MSVS 代码要快得多。除了可能的不同库实现之外还有什么原因吗?
(3) 为什么 exp2(x)
在 MSVS 中比 exp(x)
慢 5 倍?我尝试交换源文件中两个函数的代码块。没区别。
谢谢!
要找到瓶颈,您必须首先确定时间浪费在哪里(糟糕的代码/编译结果、内存访问/吞吐量)。显然,您已经尝试过这样做。请记住,性能分析本身也会消耗大量资源。 MSVC 提供内置的指令级分析功能,这可能有助于快速识别与指令相关的热点。您可以测量整个程序,或仅测量 2 个断点之间的部分。
不确定您分享的屏幕截图上的 whole program optimization
是否出于特定原因设置为 no
。
其他性能测量工具,如 xperf
或各种芯片制造商工具(取决于使用的硬件)可以帮助测量其他资源(内存,包括缓存未命中等)。
有一些编译器设置可以帮助针对特定场景进行优化。无论如何,您必须弄清楚为什么 性能不是最佳的。
有各种编译器开关(@njuffa 指出了一些),用于修改编译器行为,例如 fp:fast
的浮点严格性。显然你已经尝试过了。
/arch
允许使用 SSE/AVX(2)/AVX-512 的指定指令集进行自动矢量化。这是 CPU 相关的,所以首先检查支持的指令集,例如使用 CPU-Z (http://www.cpuid.com)。这可以通过自动并行化/SIMD 化提高性能。
您可能还希望支持针对特定 CPU 指令集 /favor:AMD64
、/favor:INTEL64
、/favor:ATOM
的一般优化,因为这有助于编译器考虑芯片特定的指令延迟和吞吐量。
所有提到的函数都存在 CPU 指令,所以我猜这取决于实现,因为编译器似乎不清楚,只要硬件相同。
您可以尝试使用其他编译器,例如 LLVM (clang)。在这里查看 https://docs.microsoft.com/en-us/cpp/build/clang-support-msbuild?view=msvc-170
我体验过各种编译器的优点和缺点。例如,在涉及条件移动(conditional move)指令时,让 MSVC 生成对流水线友好的代码似乎并不那么容易。
我的任务是对 cmath
中几乎每个函数的 64 位整数和 double
的时间成本进行基准测试。这是我的源代码:
#include <unordered_map>
#include <string>
#include <cmath>
#include <cstdint>
#include <vector>
#include <random>
#include <iostream>
#include <fstream>
#include <iomanip>
#include <chrono>
#include <numeric>
// Minimal stopwatch on top of std::chrono::steady_clock.
// `timetype` is the std::chrono duration type used to express the elapsed
// time reported by tok() (e.g. std::chrono::nanoseconds).
template<typename timetype>
struct tiktok
{
    // Instant recorded by the most recent call to tik().
    std::chrono::time_point<std::chrono::steady_clock> start;

    // Starts (or restarts) the stopwatch. Always returns 0.
    std::size_t tik()
    {
        start = std::chrono::steady_clock::now();
        return 0;
    }

    // Returns the time elapsed since the last tik(), as a count of
    // `timetype` units.
    std::size_t tok()
    {
        const auto now = std::chrono::steady_clock::now();
        const auto elapsed = std::chrono::duration_cast<timetype>(now - start);
        return elapsed.count();
    }
};
double mathHspeed(std::unordered_map<std::string, int64_t>& M,
int rngSeed, int maxIter = 100000)
{
std::mt19937 rng(rngSeed);
std::uniform_real_distribution<double> U(-5, 5);
std::uniform_int_distribution<int32_t> Uint(-2147483647, 2147483647);
tiktok<std::chrono::nanoseconds> timer;
double S = 0;
int64_t duration = 0;
maxIter = (maxIter / 2) * 2; // Make sure maxIter is even.
std::vector<int64_t> u(maxIter);
std::vector<double> v(maxIter);
int64_t loadingCost = 0;
// Time cost of reading and writing 8-bytes = `loadingCost`
// Let loadingCost just be 0 since we only need rough numbers.
if (false)
{
for (int i = 0, iend = v.size(); i < iend; ++i) v[i] = U(rng);
timer.tik();
for (int i = 0, iend = v.size() - 2; i < iend; ++i) v[i] += v[i + 1];
duration = timer.tok();
timer.tik();
for (int i = 0, iend = v.size() - 2; i < iend; ++i) // one more addition.
v[i] += v[i + 1] + v[i + 2];
std::size_t duration2 = timer.tok();
loadingCost = std::max<int64_t>(
0, (int64_t)duration - ((int64_t)duration2 - (int64_t)duration));
S += std::accumulate(v.begin(), v.end(), 0.0);
}
#define sampleDouble for(int i = 0, iend = v.size(); i < iend; ++i) v[i] = U(rng);
#define sampleInt for(int i = 0, iend = u.size(); i < iend; ++i) u[i] = (int64_t)Uint(rng) - Uint(rng);
sampleInt; timer.tik();
for (int i = 0, iend = u.size() - 1; i < iend; ++i)
u[i] += u[i + 1];
duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
S += std::accumulate(u.begin(), u.end(), 0.0);
M["+ ( int64 )"] = duration;
sampleInt; timer.tik();
for (int i = 0, iend = u.size() - 1; i < iend; ++i)
u[i] *= u[i + 1];
duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
S += std::accumulate(u.begin(), u.end(), 0.0);
M["x ( int64 )"] = duration;
for (int i = 0, iend = u.size() - 1; i < iend; i += 2)
{
u[i] = (int64_t)Uint(rng) * Uint(rng); u[i + 1] = Uint(rng);
}
timer.tik();
for (int i = 0, iend = u.size() - 1; i < iend; ++i)
u[i] = u[i] / u[i + 1] + u[i] % u[i + 1];
duration = std::max<int64_t>(timer.tok() - loadingCost - M["+ ( int64 )"], 0);
S += std::accumulate(u.begin(), u.end(), 0.0);
M["/% ( int64 )"] = duration;
sampleDouble; timer.tik();
for (int i = 0, iend = v.size() - 1; i < iend; ++i)
v[i] += v[i + 1];
duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
S += std::accumulate(v.begin(), v.end(), 0.0);
M["+ ( double )"] = duration;
sampleDouble; timer.tik();
for (int i = 0, iend = v.size() - 1; i < iend; ++i)
v[i] *= v[i + 1];
duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
S += std::accumulate(v.begin(), v.end(), 0.0);
M["x ( double )"] = duration;
sampleDouble; timer.tik();
for (int i = 0, iend = v.size() - 1; i < iend; ++i)
v[i] /= v[i + 1];
duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
S += std::accumulate(v.begin(), v.end(), 0.0);
M["/ ( double )"] = duration;
for (int i = 0, iend = u.size(); i < iend; ++i) u[i] = (int64_t)Uint(rng) - Uint(rng);
timer.tik();
for (int i = 0, iend = u.size(); i < iend; ++i)
u[i] = std::abs(u[i]);
duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
S += std::accumulate(u.begin(), u.end(), 0.0);
M["abs ( int64 )"] = duration;
sampleDouble; timer.tik();
for (int i = 0, iend = v.size(); i < iend; ++i)
v[i] = std::abs(v[i]);
duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
S += std::accumulate(v.begin(), v.end(), 0.0);
M["abs ( double )"] = duration;
for (int i = 0, iend = v.size() - 1; i < iend; i += 2)
{
v[i] = U(rng) * U(rng); v[i + 1] = U(rng);
}
timer.tik();
for (int i = 0, iend = v.size() - 1; i < iend; ++i)
v[i] = std::fmod(v[i], v[i + 1]);
duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
S += std::accumulate(v.begin(), v.end(), 0.0);
M["fmod ( double, double )"] = duration;
for (int i = 0, iend = v.size() - 1; i < iend; i += 2)
{
v[i] = U(rng) * U(rng); v[i + 1] = U(rng);
}
timer.tik();
for (int i = 0, iend = v.size() - 1; i < iend; ++i)
v[i] = std::remainder(v[i], v[i + 1]);
duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
S += std::accumulate(v.begin(), v.end(), 0.0);
M["remainder ( double, double )"] = duration;
for (int i = 0, iend = v.size() - 1; i < iend; i += 2)
{
v[i] = U(rng) * U(rng); v[i + 1] = U(rng);
}
timer.tik();
for (int i = 0, iend = v.size() - 1; i < iend; ++i)
{
int tmp = 0;
v[i] = std::remquo(v[i], v[i + 1], &tmp) + tmp;
}
duration = std::max<int64_t>(timer.tok() - loadingCost - M["+ ( double )"], 0);
S += std::accumulate(v.begin(), v.end(), 0.0);
M["remquo ( double )"] = duration;
sampleDouble; timer.tik();
for (int i = 0, iend = v.size() - 2; i < iend; ++i)
v[i] = std::fma(v[i], v[i + 1], v[i + 2]);
duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
S += std::accumulate(v.begin(), v.end(), 0.0);
M["fma ( double, double, double )"] = duration;
sampleDouble; timer.tik();
for (int i = 0, iend = v.size() - 1; i < iend; ++i)
v[i] = std::fmax(v[i], v[i + 1]);
duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
S += std::accumulate(v.begin(), v.end(), 0.0);
M["fmax ( double, double )"] = duration;
sampleDouble; timer.tik();
for (int i = 0, iend = v.size() - 1; i < iend; ++i)
v[i] = std::fmin(v[i], v[i + 1]);
duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
S += std::accumulate(v.begin(), v.end(), 0.0);
M["fmin ( double, double )"] = duration;
sampleDouble; timer.tik();
for (int i = 0, iend = v.size() - 1; i < iend; ++i)
v[i] = std::fdim(v[i], v[i + 1]);
duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
S += std::accumulate(v.begin(), v.end(), 0.0);
M["fdim ( double, double )"] = duration;
sampleDouble; timer.tik();
for (int i = 0, iend = v.size(); i < iend; ++i)
v[i] = std::exp(v[i]);
duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
S += std::accumulate(v.begin(), v.end(), 0.0);
M["exp ( double )"] = duration;
sampleDouble; timer.tik();
for (int i = 0, iend = v.size(); i < iend; ++i)
v[i] = std::exp2(v[i]);
duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
S += std::accumulate(v.begin(), v.end(), 0.0);
M["exp2 ( double )"] = duration;
sampleDouble; timer.tik();
for (int i = 0, iend = v.size(); i < iend; ++i)
v[i] = std::expm1(v[i]);
duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
S += std::accumulate(v.begin(), v.end(), 0.0);
M["expm1 ( double )"] = duration;
for (int i = 0, iend = v.size(); i < iend; ++i) v[i] = std::abs(U(rng)) + 1e-6;
timer.tik();
for (int i = 0, iend = v.size(); i < iend; ++i)
v[i] = std::log(v[i]);
duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
S += std::accumulate(v.begin(), v.end(), 0.0);
M["log ( double )"] = duration;
for (int i = 0, iend = v.size(); i < iend; ++i) v[i] = std::abs(U(rng)) + 1e-6;
timer.tik();
for (int i = 0, iend = v.size(); i < iend; ++i)
v[i] = std::log10(v[i]);
duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
S += std::accumulate(v.begin(), v.end(), 0.0);
M["log10 ( double )"] = duration;
for (int i = 0, iend = v.size(); i < iend; ++i) v[i] = std::abs(U(rng)) + 1e-6;
timer.tik();
for (int i = 0, iend = v.size(); i < iend; ++i)
v[i] = std::log2(v[i]);
duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
S += std::accumulate(v.begin(), v.end(), 0.0);
M["log2 ( double )"] = duration;
for (int i = 0, iend = v.size(); i < iend; ++i) v[i] = std::abs(U(rng)) + 1e-6;
timer.tik();
for (int i = 0, iend = v.size(); i < iend; ++i)
v[i] = std::log1p(v[i]);
duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
S += std::accumulate(v.begin(), v.end(), 0.0);
M["log1p ( double )"] = duration;
for (int i = 0, iend = v.size(); i < iend; ++i) v[i] = std::abs(U(rng));
timer.tik();
for (int i = 0, iend = v.size() - 1; i < iend; ++i)
v[i] = std::pow(v[i], v[i + 1]);
duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
S += std::accumulate(v.begin(), v.end(), 0.0);
M["pow ( double, double )"] = duration;
for (int i = 0, iend = v.size(); i < iend; ++i) v[i] = std::abs(U(rng));
timer.tik();
for (int i = 0, iend = v.size(); i < iend; ++i)
v[i] = std::sqrt(v[i]);
duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
S += std::accumulate(v.begin(), v.end(), 0.0);
M["sqrt ( double )"] = duration;
sampleDouble; timer.tik();
for (int i = 0, iend = v.size(); i < iend; ++i)
v[i] = std::cbrt(v[i]);
duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
S += std::accumulate(v.begin(), v.end(), 0.0);
M["cbrt ( double )"] = duration;
sampleDouble; timer.tik();
for (int i = 0, iend = v.size() - 1; i < iend; ++i)
v[i] = std::hypot(v[i], v[i + 1]);
duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
S += std::accumulate(v.begin(), v.end(), 0.0);
M["hypot ( double, double )"] = duration;
sampleDouble; timer.tik();
for (int i = 0, iend = v.size() - 2; i < iend; ++i)
v[i] = std::hypot(v[i], v[i + 1], v[i + 2]);
duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
S += std::accumulate(v.begin(), v.end(), 0.0);
M["hypot ( double, double, double )"] = duration;
sampleDouble; timer.tik();
for (int i = 0, iend = v.size(); i < iend; ++i)
v[i] = std::sin(v[i]);
duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
S += std::accumulate(v.begin(), v.end(), 0.0);
M["sin ( double )"] = duration;
sampleDouble; timer.tik();
for (int i = 0, iend = v.size(); i < iend; ++i)
v[i] = std::cos(v[i]);
duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
S += std::accumulate(v.begin(), v.end(), 0.0);
M["cos ( double )"] = duration;
std::uniform_real_distribution<double> UhalfPi(-3.14 / 2, 3.14 / 2);
for (int i = 0, iend = v.size(); i < iend; ++i) v[i] = UhalfPi(rng);
timer.tik();
for (int i = 0, iend = v.size(); i < iend; ++i)
v[i] = std::tan(v[i]);
duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
S += std::accumulate(v.begin(), v.end(), 0.0);
M["tan ( double )"] = duration;
std::uniform_real_distribution<double> U_11(-0.99, 0.99);
for (int i = 0, iend = v.size(); i < iend; ++i) v[i] = U_11(rng);
timer.tik();
for (int i = 0, iend = v.size(); i < iend; ++i)
v[i] = std::asin(v[i]);
duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
S += std::accumulate(v.begin(), v.end(), 0.0);
M["asin ( double )"] = duration;
for (int i = 0, iend = v.size(); i < iend; ++i) v[i] = U_11(rng);
timer.tik();
for (int i = 0, iend = v.size(); i < iend; ++i) v[i] =
std::acos(v[i]);
duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
S += std::accumulate(v.begin(), v.end(), 0.0);
M["acos ( double )"] = duration;
sampleDouble;
timer.tik();
for (int i = 0, iend = v.size(); i < iend; ++i)
v[i] = std::atan(v[i]);
duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
S += std::accumulate(v.begin(), v.end(), 0.0);
M["atan ( double )"] = duration;
sampleDouble;
timer.tik();
for (int i = 0, iend = v.size() - 1; i < iend; ++i)
v[i] = std::atan2(v[i], v[i + 1]);
duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
S += std::accumulate(v.begin(), v.end(), 0.0);
M["atan2 ( double, double )"] = duration;
sampleDouble;
timer.tik();
for (int i = 0, iend = v.size(); i < iend; ++i)
v[i] = std::sinh(v[i]);
duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
S += std::accumulate(v.begin(), v.end(), 0.0);
M["sinh ( double )"] = duration;
sampleDouble;
timer.tik();
for (int i = 0, iend = v.size(); i < iend; ++i)
v[i] = std::cosh(v[i]);
duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
S += std::accumulate(v.begin(), v.end(), 0.0);
M["cosh ( double )"] = duration;
sampleDouble;
timer.tik();
for (int i = 0, iend = v.size(); i < iend; ++i)
v[i] = std::tanh(v[i]);
duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
S += std::accumulate(v.begin(), v.end(), 0.0);
M["tanh ( double )"] = duration;
sampleDouble;
timer.tik();
for (int i = 0, iend = v.size(); i < iend; ++i)
v[i] = std::asinh(v[i]);
duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
S += std::accumulate(v.begin(), v.end(), 0.0);
M["asinh ( double )"] = duration;
std::uniform_real_distribution<double> U1_10(1.1, 10);
for (int i = 0, iend = v.size(); i < iend; ++i) v[i] = U1_10(rng);
timer.tik();
for (int i = 0, iend = v.size(); i < iend; ++i)
v[i] = std::acosh(v[i]);
duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
S += std::accumulate(v.begin(), v.end(), 0.0);
M["acosh ( double )"] = duration;
for (int i = 0, iend = v.size(); i < iend; ++i) v[i] = U_11(rng);
timer.tik();
for (int i = 0, iend = v.size(); i < iend; ++i)
v[i] = std::atanh(v[i]);
duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
S += std::accumulate(v.begin(), v.end(), 0.0);
M["atanh ( double )"] = duration;
for (int i = 0, iend = v.size(); i < iend; ++i) v[i] = std::abs(U(rng));
timer.tik();
for (int i = 0, iend = v.size(); i < iend; ++i)
v[i] = std::erf(v[i]);
duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
S += std::accumulate(v.begin(), v.end(), 0.0);
M["erf ( double )"] = duration;
sampleDouble;
timer.tik();
for (int i = 0, iend = v.size(); i < iend; ++i)
v[i] = std::erfc(v[i]);
duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
S += std::accumulate(v.begin(), v.end(), 0.0);
M["erfc ( double )"] = duration;
for (int i = 0, iend = v.size(); i < iend; ++i) v[i] = std::abs(U(rng)) + 1e-3;
timer.tik();
for (int i = 0, iend = v.size(); i < iend; ++i)
v[i] = std::tgamma(v[i]);
duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
S += std::accumulate(v.begin(), v.end(), 0.0);
M["tgamma ( double )"] = duration;
for (int i = 0, iend = v.size(); i < iend; ++i) v[i] = std::abs(U(rng)) + 1e-3;
timer.tik();
for (int i = 0, iend = v.size(); i < iend; ++i)
v[i] = std::lgamma(v[i]);
duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
S += std::accumulate(v.begin(), v.end(), 0.0);
M["lgamma ( double )"] = duration;
sampleDouble;
timer.tik();
for (int i = 0, iend = v.size(); i < iend; ++i)
v[i] = std::ceil(v[i]);
duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
S += std::accumulate(v.begin(), v.end(), 0.0);
M["ceil ( double )"] = duration;
sampleDouble;
timer.tik();
for (int i = 0, iend = v.size(); i < iend; ++i)
v[i] = std::floor(v[i]);
duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
S += std::accumulate(v.begin(), v.end(), 0.0);
M["floor ( double )"] = duration;
sampleDouble;
timer.tik();
for (int i = 0, iend = v.size(); i < iend; ++i)
v[i] = std::trunc(v[i]);
duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
S += std::accumulate(v.begin(), v.end(), 0.0);
M["trunc ( double )"] = duration;
sampleDouble;
timer.tik();
for (int i = 0, iend = v.size(); i < iend; ++i)
v[i] = std::round(v[i]);
duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
S += std::accumulate(v.begin(), v.end(), 0.0);
M["round ( double )"] = duration;
sampleDouble;
timer.tik();
for (int i = 0, iend = v.size(); i < iend; ++i)
v[i] = std::nearbyint(v[i]);
duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
S += std::accumulate(v.begin(), v.end(), 0.0);
M["nearbyint ( double )"] = duration;
sampleDouble;
timer.tik();
for (int i = 0, iend = v.size(); i < iend; ++i)
v[i] = std::rint(v[i]);
duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
S += std::accumulate(v.begin(), v.end(), 0.0);
M["rint ( double )"] = duration;
sampleDouble;
timer.tik();
for (int i = 0, iend = v.size(); i < iend; ++i)
{
int tmp;
v[i] = std::frexp(v[i], &tmp) + tmp;
}
duration = std::max<int64_t>(timer.tok() - loadingCost - M["+ ( double )"], 0);
S += std::accumulate(v.begin(), v.end(), 0.0);
M["frexp ( double, int* )"] = duration;
sampleDouble;
timer.tik();
for (int i = 0, iend = v.size() - 1; i < iend; ++i)
{
int tmp = (int)v[i + 1];
v[i] = std::ldexp(v[i], tmp);
}
duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
S += std::accumulate(v.begin(), v.end(), 0.0);
M["ldexp ( double, int* )"] = duration;
sampleDouble;
timer.tik();
for (int i = 0, iend = v.size(); i < iend; ++i)
{
double tmp;
v[i] = std::modf(v[i], &tmp) + tmp;
}
duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
S += std::accumulate(v.begin(), v.end(), 0.0);
M["modf ( double, double* )"] = duration;
sampleDouble;
timer.tik();
for (int i = 0, iend = v.size() - 1; i < iend; ++i)
v[i] = std::copysign(v[i], v[i + 1]);
duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
S += std::accumulate(v.begin(), v.end(), 0.0);
M["copysign ( double, double )"] = duration;
std::uniform_real_distribution<double> betaU(0.001, 30);
for (int i = 0, iend = v.size(); i < iend; ++i) v[i] = betaU(rng);
timer.tik();
for (int i = 0, iend = v.size() - 1; i < iend; ++i)
v[i] = std::beta(v[i], v[i + 1]);
duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
S += std::accumulate(v.begin(), v.end(), 0.0);
M["beta ( double, double )"] = duration; // test x, y in [0.001, 30]
std::uniform_real_distribution<double> expintU(-30, 30);
for (int i = 0, iend = v.size(); i < iend; ++i)
{
v[i] = expintU(rng);
if (v[i] > -0.01 and v[i] < 0.01) v[i] = 0.01;
}
timer.tik();
for (int i = 0, iend = v.size(); i < iend; ++i)
v[i] = std::expint(v[i]);
duration = std::max<int64_t>(timer.tok() - loadingCost, 0);
S += std::accumulate(v.begin(), v.end(), 0.0);
M["expint ( double )"] = duration; // x != 0, test x in [-30, 30] & x != 0
// std::uniform_real_distribution<double> riemann_zetaU(-1, 2);
// for(int i = 0, iend = v.size(); i < iend; ++i) v[i] = riemann_zetaU(rng);
// timer.tik();
// for(int i = 0, iend = v.size(); i < iend; ++i)
// v[i] = std::riemann_zeta(v[i]);
// duration = std::max<int64_t> (timer.tok() - loadingCost, 0);
// S += std::accumulate(v.begin(), v.end(), 0.0);
// M["riemann_zeta ( double )"] = duration; // test x in [-1, 2].
return S;
}
// Entry point of the benchmark driver.
//
// Prompts on stdin for a random seed, an iteration count and an output file
// path, runs mathHspeed() with those settings, and writes one CSV-like row per
// benchmarked operation: the operation name (commas stripped so the name does
// not split the row), its time cost in microseconds, and its cost relative to
// the "+ ( int64 )" baseline, rounded to one decimal place.
//
// Returns 0 on success and 1 when the input is invalid or the output file
// cannot be opened or written.
int main()
{
    std::cout << "Random seed: ";
    int randomSeed = 0;
    if (!(std::cin >> randomSeed))
    {
        std::cerr << "Error: the random seed must be an integer." << std::endl;
        return 1;
    }

    std::cout << "Max iteration: ";
    int maxIter = 0;
    // mathHspeed() rounds maxIter down to an even number, so any value below 2
    // would leave nothing to time (and a negative value is not a valid size).
    if (!(std::cin >> maxIter) || maxIter < 2)
    {
        std::cerr << "Error: max iteration must be an integer >= 2." << std::endl;
        return 1;
    }

    std::cout << "Save file path: ";
    std::string save;
    if (!(std::cin >> save))
    {
        std::cerr << "Error: no save file path given." << std::endl;
        return 1;
    }

    // Open the output before benchmarking so a bad path fails fast instead of
    // silently discarding the results after the whole run.
    std::ofstream out(save.c_str());
    if (!out)
    {
        std::cerr << "Error: cannot open '" << save << "' for writing." << std::endl;
        return 1;
    }

    std::unordered_map<std::string, int64_t> M;
    const double S = mathHspeed(M, randomSeed, maxIter);

    // Keys under which mathHspeed() stores its measurements, in report order.
    // The number of rows follows from this table; there is no separate count
    // to keep in sync.
    const std::string funNames[] = {
        "+ ( int64 )", "x ( int64 )", "/% ( int64 )",
        "+ ( double )", "x ( double )", "/ ( double )",
        "abs ( int64 )", "abs ( double )", "fmod ( double, double )",
        "remainder ( double, double )", "remquo ( double )",
        "fma ( double, double, double )", "fmax ( double, double )",
        "fmin ( double, double )", "fdim ( double, double )",
        "exp ( double )", "exp2 ( double )", "expm1 ( double )",
        "log ( double )", "log10 ( double )", "log2 ( double )",
        "log1p ( double )", "pow ( double, double )", "sqrt ( double )",
        "cbrt ( double )", "hypot ( double, double )",
        "hypot ( double, double, double )", "sin ( double )",
        "cos ( double )", "tan ( double )", "asin ( double )",
        "acos ( double )", "atan ( double )", "atan2 ( double, double )",
        "sinh ( double )", "cosh ( double )", "tanh ( double )",
        "asinh ( double )", "acosh ( double )", "atanh ( double )",
        "erf ( double )", "erfc ( double )", "tgamma ( double )",
        "lgamma ( double )", "ceil ( double )", "floor ( double )",
        "trunc ( double )", "round ( double )", "nearbyint ( double )",
        "rint ( double )", "frexp ( double, int* )", "ldexp ( double, int* )",
        "modf ( double, double* )", "copysign ( double, double )",
        "beta ( double, double )", "expint ( double )"//, "riemann_zeta ( double )"
    };

    // Read-only lookup: a missing key reads as 0 without inserting into M.
    const auto costNs = [&M](const std::string& key) -> int64_t
    {
        const auto it = M.find(key);
        return it == M.end() ? 0 : it->second;
    };

    // Every relative figure is expressed against 64-bit integer addition.
    const int64_t baselineNs = costNs("+ ( int64 )");
    if (baselineNs <= 0)
    {
        std::cerr << "Warning: baseline '+ ( int64 )' measured as 0 ns; "
                     "relative times are reported as n/a." << std::endl;
    }

    constexpr int nameWidth = 32;
    constexpr int realtimewd = 16;
    constexpr int relatimewd = 16;

    // The timer counts nanoseconds and the values below are divided by 1000,
    // so the unit of the middle column is microseconds.
    out << std::setw(nameWidth) << "Function name,"
        << std::setw(realtimewd) << "Time cost (us),"
        << std::setw(relatimewd) << "Relative" << '\n';

    for (const std::string& name : funNames)
    {
        // Commas inside a name would be read as field separators; drop them.
        std::string label;
        label.reserve(name.size());
        for (const char c : name)
        {
            if (c != ',') label += c;
        }

        const int64_t ns = costNs(name);
        out << std::setw(nameWidth) << label << ","
            << std::setw(realtimewd) << std::round(ns / 1000.0) << ","
            << std::setw(relatimewd);
        if (baselineNs > 0)
        {
            // Round to one decimal place.
            out << std::round(ns / static_cast<double>(baselineNs) * 10) / 10.0;
        }
        else
        {
            // Dividing by a zero baseline would print inf/nan.
            out << "n/a";
        }
        out << '\n';
    }

    out.flush();
    if (!out)
    {
        std::cerr << "Error: failed while writing '" << save << "'." << std::endl;
        return 1;
    }

    // Printing the accumulated sum uses the value returned by mathHspeed().
    std::cout << "\nDummy sum = " << S << std::endl;
    return 0;
}
在 Windows 64 位笔记本电脑(Intel i9-9980,512KB L1 缓存、2MB L2 缓存、16MB L3 缓存)上,使用 GCC-8.3(-O0
或 -O3
)和 MSVS Community 2019(/O2
)编译代码,然后输入以下参数:
下表显示了结果:
更具体地说,MSVS 中的优化菜单如下所示:
我无法将“全程序优化”设置为“是”,因为 MSVS 一直抱怨“/ZI 和 /GL 不兼容”。
我的问题:
(1) 我是 MSVS 工具链的新手。为什么 MSVS 构建的可执行文件对于大多数函数来说都这么慢? MSVS /O2
甚至无法胜过 GCC -O0
。如何让 MSVS 产生同样快速的代码?我注意到 GCC 产生了一个大约 3MB 的 .exe
,但是 MSVS 产生了一个大约 154KB 的 .exe
和一个 2.9MB 的 .pdb
。
(2) 有趣的是,有一些函数,例如 sin(x)
、cos(x)
和 exp(x)
,其中 MSVS 代码要快得多。除了可能的不同库实现之外还有什么原因吗?
(3) 为什么 exp2(x)
在 MSVS 中比 exp(x)
慢 5 倍?我尝试交换源文件中两个函数的代码块。没区别。
谢谢!
要找到瓶颈,您必须首先确定时间浪费在哪里(糟糕的代码/编译结果、内存访问/吞吐量)。显然,您已经在尝试这样做了。请记住,性能分析本身也会消耗大量资源。 MSVC 提供内置的指令级分析,这可能有助于快速识别与指令相关的热点。您可以测量整个程序,也可以只测量两个断点之间的部分。
不确定您分享的屏幕截图中的 whole program optimization
是否出于特定原因被设置为 no
。
其他性能测量工具,如 xperf
或各种芯片制造商工具(取决于使用的硬件)可以帮助测量其他资源(内存,包括缓存未命中等)。
有一些编译器设置可以帮助针对特定场景进行优化。无论如何,您必须弄清楚为什么 性能不是最佳的。
有各种编译器开关(@njuffa 指出了一些),用于修改编译器行为,例如 fp:fast
的浮点严格性。显然你已经尝试过了。
/arch
允许使用 SSE/AVX(2)/AVX-512 等指定指令集进行自动矢量化。这取决于 CPU,所以请先检查其支持的指令集,例如使用 CPU-Z (http://www.cpuid.com)。这可以通过自动并行化/SIMD 化来提高性能。
您可能还希望支持针对特定 CPU 指令集 /favor:AMD64
、/favor:INTEL64
、/favor:ATOM
的一般优化,因为这有助于编译器考虑特定芯片的指令延迟/吞吐量。
所有提到的函数都存在 CPU 指令,所以我猜这取决于实现,因为编译器似乎不清楚,只要硬件相同。
您可以尝试使用其他编译器,例如 LLVM (clang)。请参阅 https://docs.microsoft.com/en-us/cpp/build/clang-support-msbuild?view=msvc-170 。我体会过各种编译器的优点和缺点。例如,要让 MSVC 使用条件移动(conditional move)指令生成对流水线友好的代码,似乎并不那么容易。