<cmath> 中的基准函数与 GCC 和 MSVS

Question

我的任务是对 cmath 中几乎每个函数的 64 位整数和 double 的时间成本进行基准测试。这是我的源代码：

#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstdint>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <numeric>
#include <random>
#include <string>
#include <unordered_map>
#include <vector>


// Minimal stopwatch on top of std::chrono::steady_clock.
// `timetype` is the std::chrono duration type used to report elapsed time
// (e.g. std::chrono::nanoseconds).
template<typename timetype>
struct tiktok
{
  // Reference instant set by the most recent call to tik().
  std::chrono::time_point<std::chrono::steady_clock> start;

  // Start (or restart) the stopwatch. Always returns 0.
  std::size_t tik()
  {
    start = std::chrono::steady_clock::now();
    return 0;
  }

  // Return the time elapsed since the last tik(), counted in `timetype` units.
  std::size_t tok()
  {
    const auto elapsed = std::chrono::steady_clock::now() - start;
    return std::chrono::duration_cast<timetype>(elapsed).count();
  }
};


// Micro-benchmark of the basic arithmetic operators and the <cmath> functions.
//
// For every benchmarked operation a buffer of `maxIter` random operands is
// generated (untimed), the operation is applied to the buffer inside a timed
// loop, and the elapsed time in nanoseconds is stored in `M` under a
// descriptive key such as "exp ( double )".
//
// Parameters:
//   M        output map: benchmark name -> elapsed nanoseconds (clamped at 0).
//   rngSeed  seed of the std::mt19937 generator that produces the operands.
//   maxIter  number of operands per benchmark; rounded down to an even number,
//            negative values are treated as 0.
//
// Returns the sum of the buffer contents after every benchmark. The value is
// meaningless by itself; the caller should consume it (e.g. print it) so the
// optimizer cannot discard the timed loops.
double mathHspeed(std::unordered_map<std::string, int64_t>& M,
                  int rngSeed, int maxIter = 100000)
{
  std::mt19937 rng(rngSeed);
  std::uniform_real_distribution<double> U(-5, 5);
  std::uniform_int_distribution<int32_t> Uint(-2147483647, 2147483647);
  tiktok<std::chrono::nanoseconds> timer;
  double S = 0;

  // Even, non-negative element count: the pairwise samplers below fill the
  // buffers two elements at a time, and a negative count must not be turned
  // into a huge unsigned vector size.
  maxIter = (std::max(maxIter, 0) / 2) * 2;
  std::vector<int64_t> u(maxIter);
  std::vector<double> v(maxIter);
  const int n = maxIter;


  // Time cost of reading and writing 8 bytes per element = `loadingCost`.
  // It is left at 0 since only rough numbers are needed; set
  // `estimateLoadingCost` to true to enable the estimate below.
  int64_t loadingCost = 0;
  const bool estimateLoadingCost = false;
  if (estimateLoadingCost)
  {
    for (double& x : v) x = U(rng);

    timer.tik();
    for (int i = 0; i < n - 2; ++i) v[i] += v[i + 1];
    const int64_t oneAdd = static_cast<int64_t>(timer.tok());

    timer.tik();
    for (int i = 0; i < n - 2; ++i) // one more addition.
      v[i] += v[i + 1] + v[i + 2];
    const int64_t twoAdds = static_cast<int64_t>(timer.tok());

    // (twoAdds - oneAdd) approximates one addition; the rest of `oneAdd`
    // is attributed to memory traffic.
    loadingCost = std::max<int64_t>(0, oneAdd - (twoAdds - oneAdd));
    S += std::accumulate(v.begin(), v.end(), 0.0);
  }


  // ---- Untimed operand generators (lambdas instead of leaking macros). ----

  // v[i] ~ U(-5, 5).
  const auto sampleDouble = [&] { for (double& x : v) x = U(rng); };

  // v[i] = |U(-5, 5)| + offset, for functions that need positive arguments.
  const auto samplePositive = [&](double offset)
  {
    for (double& x : v) x = std::abs(U(rng)) + offset;
  };

  // v[i] drawn from an arbitrary distribution.
  const auto sampleFrom = [&](auto& dist) { for (double& x : v) x = dist(rng); };

  // Even slots hold a product of two draws, odd slots a single draw.
  const auto sampleDoublePairs = [&]
  {
    for (int i = 0; i < n - 1; i += 2)
    {
      v[i] = U(rng) * U(rng);
      v[i + 1] = U(rng);
    }
  };

  // u[i] = difference of two 32-bit draws. The draws are sequenced explicitly
  // so the operands do not depend on the compiler's evaluation order.
  const auto sampleInt = [&]
  {
    for (int64_t& x : u)
    {
      const int64_t a = Uint(rng);
      const int64_t b = Uint(rng);
      x = a - b;
    }
  };


  // ---- Result recorders: store the clamped time and consume the buffer. ----
  // `elapsed` is the raw timer reading, `overhead` an optional cost (ns) of
  // helper operations inside the timed loop that should not be attributed to
  // the benchmarked function.
  const auto recordInt = [&](const char* key, std::size_t elapsed,
                             int64_t overhead = 0)
  {
    M[key] = std::max<int64_t>(
      static_cast<int64_t>(elapsed) - loadingCost - overhead, 0);
    S += std::accumulate(u.begin(), u.end(), 0.0);
  };
  const auto recordDouble = [&](const char* key, std::size_t elapsed,
                                int64_t overhead = 0)
  {
    M[key] = std::max<int64_t>(
      static_cast<int64_t>(elapsed) - loadingCost - overhead, 0);
    S += std::accumulate(v.begin(), v.end(), 0.0);
  };


  sampleInt();
  timer.tik();
  for (int i = 0; i < n - 1; ++i)
    u[i] += u[i + 1];
  recordInt("+ ( int64 )", timer.tok());
  const int64_t intAddCost = M["+ ( int64 )"];


  // Operands reach about +-2^32, so their product can exceed the int64 range.
  // Multiply in unsigned arithmetic (same machine instruction, well-defined
  // wrap-around) to avoid signed-overflow undefined behaviour.
  sampleInt();
  timer.tik();
  for (int i = 0; i < n - 1; ++i)
    u[i] = static_cast<int64_t>(
      static_cast<uint64_t>(u[i]) * static_cast<uint64_t>(u[i + 1]));
  recordInt("x ( int64 )", timer.tok());


  for (int i = 0; i < n - 1; i += 2)
  {
    u[i] = static_cast<int64_t>(Uint(rng)) * Uint(rng);
    u[i + 1] = Uint(rng);
  }
  // Every element is used as a divisor below; a zero draw (or zero product)
  // would make the division/modulo undefined.
  for (int64_t& x : u)
    if (x == 0) x = 1;
  timer.tik();
  for (int i = 0; i < n - 1; ++i)
    u[i] = u[i] / u[i + 1] + u[i] % u[i + 1];
  // The extra addition that combines quotient and remainder is subtracted.
  recordInt("/% ( int64 )", timer.tok(), intAddCost);


  sampleDouble();
  timer.tik();
  for (int i = 0; i < n - 1; ++i)
    v[i] += v[i + 1];
  recordDouble("+ ( double )", timer.tok());
  const int64_t doubleAddCost = M["+ ( double )"];


  sampleDouble();
  timer.tik();
  for (int i = 0; i < n - 1; ++i)
    v[i] *= v[i + 1];
  recordDouble("x ( double )", timer.tok());


  sampleDouble();
  timer.tik();
  for (int i = 0; i < n - 1; ++i)
    v[i] /= v[i + 1];
  recordDouble("/ ( double )", timer.tok());


  sampleInt();
  timer.tik();
  for (int i = 0; i < n; ++i)
    u[i] = std::abs(u[i]);
  recordInt("abs ( int64 )", timer.tok());


  sampleDouble();
  timer.tik();
  for (int i = 0; i < n; ++i)
    v[i] = std::abs(v[i]);
  recordDouble("abs ( double )", timer.tok());


  sampleDoublePairs();
  timer.tik();
  for (int i = 0; i < n - 1; ++i)
    v[i] = std::fmod(v[i], v[i + 1]);
  recordDouble("fmod ( double, double )", timer.tok());


  sampleDoublePairs();
  timer.tik();
  for (int i = 0; i < n - 1; ++i)
    v[i] = std::remainder(v[i], v[i + 1]);
  recordDouble("remainder ( double, double )", timer.tok());


  sampleDoublePairs();
  timer.tik();
  for (int i = 0; i < n - 1; ++i)
  {
    int quo = 0;
    // Adding the quotient keeps the out-parameter alive; its cost is
    // subtracted below.
    v[i] = std::remquo(v[i], v[i + 1], &quo) + quo;
  }
  recordDouble("remquo ( double )", timer.tok(), doubleAddCost);


  sampleDouble();
  timer.tik();
  for (int i = 0; i < n - 2; ++i)
    v[i] = std::fma(v[i], v[i + 1], v[i + 2]);
  recordDouble("fma ( double, double, double )", timer.tok());


  sampleDouble();
  timer.tik();
  for (int i = 0; i < n - 1; ++i)
    v[i] = std::fmax(v[i], v[i + 1]);
  recordDouble("fmax ( double, double )", timer.tok());


  sampleDouble();
  timer.tik();
  for (int i = 0; i < n - 1; ++i)
    v[i] = std::fmin(v[i], v[i + 1]);
  recordDouble("fmin ( double, double )", timer.tok());


  sampleDouble();
  timer.tik();
  for (int i = 0; i < n - 1; ++i)
    v[i] = std::fdim(v[i], v[i + 1]);
  recordDouble("fdim ( double, double )", timer.tok());


  sampleDouble();
  timer.tik();
  for (int i = 0; i < n; ++i)
    v[i] = std::exp(v[i]);
  recordDouble("exp ( double )", timer.tok());


  sampleDouble();
  timer.tik();
  for (int i = 0; i < n; ++i)
    v[i] = std::exp2(v[i]);
  recordDouble("exp2 ( double )", timer.tok());


  sampleDouble();
  timer.tik();
  for (int i = 0; i < n; ++i)
    v[i] = std::expm1(v[i]);
  recordDouble("expm1 ( double )", timer.tok());


  // Logarithms: strictly positive arguments.
  samplePositive(1e-6);
  timer.tik();
  for (int i = 0; i < n; ++i)
    v[i] = std::log(v[i]);
  recordDouble("log ( double )", timer.tok());


  samplePositive(1e-6);
  timer.tik();
  for (int i = 0; i < n; ++i)
    v[i] = std::log10(v[i]);
  recordDouble("log10 ( double )", timer.tok());


  samplePositive(1e-6);
  timer.tik();
  for (int i = 0; i < n; ++i)
    v[i] = std::log2(v[i]);
  recordDouble("log2 ( double )", timer.tok());


  samplePositive(1e-6);
  timer.tik();
  for (int i = 0; i < n; ++i)
    v[i] = std::log1p(v[i]);
  recordDouble("log1p ( double )", timer.tok());


  // Non-negative base and exponent.
  samplePositive(0.0);
  timer.tik();
  for (int i = 0; i < n - 1; ++i)
    v[i] = std::pow(v[i], v[i + 1]);
  recordDouble("pow ( double, double )", timer.tok());


  samplePositive(0.0);
  timer.tik();
  for (int i = 0; i < n; ++i)
    v[i] = std::sqrt(v[i]);
  recordDouble("sqrt ( double )", timer.tok());


  sampleDouble();
  timer.tik();
  for (int i = 0; i < n; ++i)
    v[i] = std::cbrt(v[i]);
  recordDouble("cbrt ( double )", timer.tok());


  sampleDouble();
  timer.tik();
  for (int i = 0; i < n - 1; ++i)
    v[i] = std::hypot(v[i], v[i + 1]);
  recordDouble("hypot ( double, double )", timer.tok());


  sampleDouble();
  timer.tik();
  for (int i = 0; i < n - 2; ++i)
    v[i] = std::hypot(v[i], v[i + 1], v[i + 2]);
  recordDouble("hypot ( double, double, double )", timer.tok());


  sampleDouble();
  timer.tik();
  for (int i = 0; i < n; ++i)
    v[i] = std::sin(v[i]);
  recordDouble("sin ( double )", timer.tok());


  sampleDouble();
  timer.tik();
  for (int i = 0; i < n; ++i)
    v[i] = std::cos(v[i]);
  recordDouble("cos ( double )", timer.tok());


  std::uniform_real_distribution<double> UhalfPi(-3.14 / 2, 3.14 / 2);
  sampleFrom(UhalfPi);
  timer.tik();
  for (int i = 0; i < n; ++i)
    v[i] = std::tan(v[i]);
  recordDouble("tan ( double )", timer.tok());


  std::uniform_real_distribution<double> U_11(-0.99, 0.99);
  sampleFrom(U_11);
  timer.tik();
  for (int i = 0; i < n; ++i)
    v[i] = std::asin(v[i]);
  recordDouble("asin ( double )", timer.tok());


  sampleFrom(U_11);
  timer.tik();
  for (int i = 0; i < n; ++i)
    v[i] = std::acos(v[i]);
  recordDouble("acos ( double )", timer.tok());


  sampleDouble();
  timer.tik();
  for (int i = 0; i < n; ++i)
    v[i] = std::atan(v[i]);
  recordDouble("atan ( double )", timer.tok());


  sampleDouble();
  timer.tik();
  for (int i = 0; i < n - 1; ++i)
    v[i] = std::atan2(v[i], v[i + 1]);
  recordDouble("atan2 ( double, double )", timer.tok());


  sampleDouble();
  timer.tik();
  for (int i = 0; i < n; ++i)
    v[i] = std::sinh(v[i]);
  recordDouble("sinh ( double )", timer.tok());


  sampleDouble();
  timer.tik();
  for (int i = 0; i < n; ++i)
    v[i] = std::cosh(v[i]);
  recordDouble("cosh ( double )", timer.tok());


  sampleDouble();
  timer.tik();
  for (int i = 0; i < n; ++i)
    v[i] = std::tanh(v[i]);
  recordDouble("tanh ( double )", timer.tok());


  sampleDouble();
  timer.tik();
  for (int i = 0; i < n; ++i)
    v[i] = std::asinh(v[i]);
  recordDouble("asinh ( double )", timer.tok());


  std::uniform_real_distribution<double> U1_10(1.1, 10);
  sampleFrom(U1_10);
  timer.tik();
  for (int i = 0; i < n; ++i)
    v[i] = std::acosh(v[i]);
  recordDouble("acosh ( double )", timer.tok());


  sampleFrom(U_11);
  timer.tik();
  for (int i = 0; i < n; ++i)
    v[i] = std::atanh(v[i]);
  recordDouble("atanh ( double )", timer.tok());


  samplePositive(0.0);
  timer.tik();
  for (int i = 0; i < n; ++i)
    v[i] = std::erf(v[i]);
  recordDouble("erf ( double )", timer.tok());


  sampleDouble();
  timer.tik();
  for (int i = 0; i < n; ++i)
    v[i] = std::erfc(v[i]);
  recordDouble("erfc ( double )", timer.tok());


  samplePositive(1e-3);
  timer.tik();
  for (int i = 0; i < n; ++i)
    v[i] = std::tgamma(v[i]);
  recordDouble("tgamma ( double )", timer.tok());


  samplePositive(1e-3);
  timer.tik();
  for (int i = 0; i < n; ++i)
    v[i] = std::lgamma(v[i]);
  recordDouble("lgamma ( double )", timer.tok());


  sampleDouble();
  timer.tik();
  for (int i = 0; i < n; ++i)
    v[i] = std::ceil(v[i]);
  recordDouble("ceil ( double )", timer.tok());


  sampleDouble();
  timer.tik();
  for (int i = 0; i < n; ++i)
    v[i] = std::floor(v[i]);
  recordDouble("floor ( double )", timer.tok());


  sampleDouble();
  timer.tik();
  for (int i = 0; i < n; ++i)
    v[i] = std::trunc(v[i]);
  recordDouble("trunc ( double )", timer.tok());


  sampleDouble();
  timer.tik();
  for (int i = 0; i < n; ++i)
    v[i] = std::round(v[i]);
  recordDouble("round ( double )", timer.tok());


  sampleDouble();
  timer.tik();
  for (int i = 0; i < n; ++i)
    v[i] = std::nearbyint(v[i]);
  recordDouble("nearbyint ( double )", timer.tok());


  sampleDouble();
  timer.tik();
  for (int i = 0; i < n; ++i)
    v[i] = std::rint(v[i]);
  recordDouble("rint ( double )", timer.tok());


  sampleDouble();
  timer.tik();
  for (int i = 0; i < n; ++i)
  {
    int exponent = 0;
    v[i] = std::frexp(v[i], &exponent) + exponent;
  }
  recordDouble("frexp ( double, int* )", timer.tok(), doubleAddCost);


  sampleDouble();
  timer.tik();
  for (int i = 0; i < n - 1; ++i)
  {
    const int exponent = (int)v[i + 1];
    v[i] = std::ldexp(v[i], exponent);
  }
  // std::ldexp takes a plain int; the "int*" in the key is kept unchanged
  // because the key is part of the output looked up by callers.
  recordDouble("ldexp ( double, int* )", timer.tok());


  sampleDouble();
  timer.tik();
  for (int i = 0; i < n; ++i)
  {
    double integral = 0;
    v[i] = std::modf(v[i], &integral) + integral;
  }
  // Same extra addition as in the remquo/frexp loops, so its cost is
  // subtracted here as well for consistency.
  recordDouble("modf ( double, double* )", timer.tok(), doubleAddCost);


  sampleDouble();
  timer.tik();
  for (int i = 0; i < n - 1; ++i)
    v[i] = std::copysign(v[i], v[i + 1]);
  recordDouble("copysign ( double, double )", timer.tok());


  // test x, y in [0.001, 30]
  std::uniform_real_distribution<double> betaU(0.001, 30);
  sampleFrom(betaU);
  timer.tik();
  for (int i = 0; i < n - 1; ++i)
    v[i] = std::beta(v[i], v[i + 1]);
  recordDouble("beta ( double, double )", timer.tok());


  // x != 0, test x in [-30, 30] with a small band around 0 moved to 0.01.
  std::uniform_real_distribution<double> expintU(-30, 30);
  for (double& x : v)
  {
    x = expintU(rng);
    if (x > -0.01 && x < 0.01) x = 0.01;
  }
  timer.tik();
  for (int i = 0; i < n; ++i)
    v[i] = std::expint(v[i]);
  recordDouble("expint ( double )", timer.tok());


  // Disabled benchmark, test x in [-1, 2]:
  // std::uniform_real_distribution<double> riemann_zetaU(-1, 2);
  // sampleFrom(riemann_zetaU);
  // timer.tik();
  // for (int i = 0; i < n; ++i)
  //   v[i] = std::riemann_zeta(v[i]);
  // recordDouble("riemann_zeta ( double )", timer.tok());


  return S;
}


// Interactive driver for the benchmark.
//
// Reads a random seed, an iteration count and an output path from standard
// input, runs mathHspeed(), and writes one CSV row per benchmarked function:
// name (commas stripped so it stays a single CSV field), time in microseconds,
// and time relative to the "+ ( int64 )" benchmark rounded to one decimal.
// Returns 0 on success and 1 if the input is malformed or the output file
// cannot be opened.
int main()
{
  int randomSeed = 0;
  int maxIter = 0;
  std::string save;

  // Every read is checked: after a failed extraction the stream stays in a
  // failed state and later reads would leave their targets untouched.
  std::cout << "Random seed: ";
  if (!(std::cin >> randomSeed))
  {
    std::cerr << "Invalid random seed." << std::endl;
    return 1;
  }
  std::cout << "Max iteration: ";
  if (!(std::cin >> maxIter))
  {
    std::cerr << "Invalid max iteration." << std::endl;
    return 1;
  }
  std::cout << "Save file path: ";
  if (!(std::cin >> save))
  {
    std::cerr << "Invalid save file path." << std::endl;
    return 1;
  }


  // Open the output first so a bad path is reported before the (long)
  // benchmark runs instead of silently producing no file.
  std::ofstream out(save.c_str());
  if (!out)
  {
    std::cerr << "Cannot open \"" << save << "\" for writing." << std::endl;
    return 1;
  }


  std::unordered_map<std::string, int64_t> M;
  const double S = mathHspeed(M, randomSeed, maxIter);


  // Keys produced by mathHspeed(), in report order. The row count is taken
  // from this table, so it cannot go stale.
  const std::vector<std::string> funNames = {
    "+ ( int64 )", "x ( int64 )", "/% ( int64 )",
    "+ ( double )", "x ( double )", "/ ( double )",
    "abs ( int64 )", "abs ( double )", "fmod ( double, double )",
    "remainder ( double, double )", "remquo ( double )",
    "fma ( double, double, double )", "fmax ( double, double )",
    "fmin ( double, double )", "fdim ( double, double )",
    "exp ( double )", "exp2 ( double )", "expm1 ( double )",
    "log ( double )", "log10 ( double )", "log2 ( double )",
    "log1p ( double )", "pow ( double, double )", "sqrt ( double )",
    "cbrt ( double )", "hypot ( double, double )",
    "hypot ( double, double, double )", "sin ( double )",
    "cos ( double )", "tan ( double )", "asin ( double )",
    "acos ( double )", "atan ( double )", "atan2 ( double, double )",
    "sinh ( double )", "cosh ( double )", "tanh ( double )",
    "asinh ( double )", "acosh ( double )", "atanh ( double )",
    "erf ( double )", "erfc ( double )", "tgamma ( double )",
    "lgamma ( double )", "ceil ( double )", "floor ( double )",
    "trunc ( double )", "round ( double )", "nearbyint ( double )",
    "rint ( double )", "frexp ( double, int* )", "ldexp ( double, int* )",
    "modf ( double, double* )", "copysign ( double, double )",
    "beta ( double, double )", "expint ( double )"//, "riemann_zeta ( double )"
  };


  // Reference time for the "Relative" column. If it is 0 (e.g. a tiny
  // iteration count) the floating-point division yields inf/nan, which is
  // written as such.
  const double baseline = static_cast<double>(M["+ ( int64 )"]);


  constexpr int nameWidth = 32;
  constexpr int realtimewd = 16;
  constexpr int relatimewd = 16;
  // Timings are nanoseconds divided by 1000, i.e. microseconds.
  out << std::setw(nameWidth) << "Function name,"
      << std::setw(realtimewd) << "Time cost (us),"
      << std::setw(relatimewd) << "Relative" << '\n';


  for (const std::string& name : funNames)
  {
    // Strip commas from the label so it does not split the CSV field.
    std::string label = name;
    label.erase(std::remove(label.begin(), label.end(), ','), label.end());

    const int64_t nanoseconds = M[name];
    const double relative = std::round(nanoseconds / baseline * 10) / 10.0;

    // llround prints an exact integer; a rounded double would switch to
    // scientific notation (and lose digits) from 1e6 us upwards.
    out << std::setw(nameWidth) << label << ","
        << std::setw(realtimewd) << std::llround(nanoseconds / 1000.0) << ","
        << std::setw(relatimewd) << relative << '\n';
  }


  // Printing the checksum keeps the benchmark loops observable.
  std::cout << "\nDummy sum = " << S << std::endl;


  return 0;
}

在一台 64 位 Windows 笔记本电脑（Intel i9-9980，512KB L1 缓存，2MB L2 缓存，16MB L3 缓存）上，分别使用 GCC-8.3（-O0 或 -O3）和 MSVS Community 2019（/O2）编译代码，然后输入以下参数：

下表显示了结果：

更具体地说，MSVS 中的优化菜单如下所示：

我无法将“全程序优化”设置为“是”，因为 MSVS 一直抱怨“/ZI 和 /GL 不兼容”。

我的问题：

(1) 我是 MSVS 工具链的新手。为什么 MSVS 构建的可执行文件对于大多数函数来说都这么慢？MSVS /O2 甚至无法胜过 GCC -O0。如何让 MSVS 产生同样快速的代码？我注意到 GCC 产生了一个大约 3MB 的 .exe，但是 MSVS 产生了一个大约 154KB 的 .exe 和一个 2.9MB 的 .pdb。

(2) 有趣的是，有一些函数，例如 sin(x)、cos(x) 和 exp(x)，其中 MSVS 代码要快得多。除了可能的不同库实现之外还有什么原因吗？

(3) 为什么 exp2(x) 在 MSVS 中比 exp(x) 慢 5 倍？我尝试交换源文件中两个函数的代码块。没区别。

谢谢！

Answer 1

要找到瓶颈，您必须首先确定时间浪费在哪里（糟糕的代码/编译结果、内存访问/吞吐量）。显然，您已经在尝试这样做了。请记住，性能分析本身也会消耗大量资源。MSVC 提供内置的指令级分析，这可能有助于快速识别与指令相关的热点。您可以测量整个程序，也可以只测量两个断点之间的部分。

不确定您分享的屏幕截图中的“全程序优化”（whole program optimization）是否出于特定原因被设置为“否”。

其他性能测量工具，如 xperf 或各种芯片制造商工具（取决于使用的硬件）可以帮助测量其他资源（内存，包括缓存未命中等）。

有一些编译器设置可以帮助针对特定场景进行优化。无论如何，您必须弄清楚为什么性能不是最佳的。

有各种编译器开关（@njuffa 指出了一些）可用于修改编译器行为，例如控制浮点严格性的 /fp:fast。显然您已经尝试过了。/arch 允许使用指定的指令集（SSE/AVX(2)/AVX-512）进行自动矢量化。这取决于 CPU，所以请先检查支持的指令集，例如使用 CPU-Z (http://www.cpuid.com)。这可以通过自动并行化/SIMD 化来提高性能。您可能还希望启用针对特定 CPU 的一般性优化（/favor:AMD64、/favor:INTEL64、/favor:ATOM），因为这有助于编译器考虑特定芯片的指令延迟/吞吐量。

所有提到的函数都存在 CPU 指令，所以我猜这取决于实现，因为编译器似乎不清楚，只要硬件相同。

您可以尝试使用其他编译器，例如 LLVM (clang)，请参阅 https://docs.microsoft.com/en-us/cpp/build/clang-support-msbuild?view=msvc-170 。我体验过各种编译器的优点和缺点。例如，要让 MSVC 使用条件移动（conditional move）指令生成对流水线友好的代码，似乎并不那么容易。

<cmath> 中的基准函数与 GCC 和 MSVS

Benchmark functions in <cmath> with GCC and MSVS

c++

gcc

cmath

visual-studio