快速精确地实现 pow(complex<double>, 2)

Question

我有一个代码，我在其中执行许多形式的操作

double z_sq = pow(abs(std::complex<double> z), 2);

计算复数 z 的 |z|²，其中性能和精度是我的主要关注点。我读到 C++ 的 pow() 函数将 int 指数转换为 double，因此引入了数值错误，而且性能通常也比 x*x (discussed here for real numbers) 差。同样，<complex> 的 std::pow() 方法不必要地将指数转换为 std::complex<double>。

我的第一个问题是如何以最佳性能和精度实现复数平方 Re²+Im² 或 zz*。我考虑过做类似

的事情

double complex_square1(std::complex<double> z) {
    double abs_z = abs(z);
    return abs_z * abs_z;
}

或（这给出了类型错误，但这是我能想到的最直接的方法）

double complex_square2(std::complex<double> z) {
    return z * conj(z);
}

我的第二个问题是，对于许多 (10^14) 个这样的操作，函数调用是否会显着降低性能。那么在 complex_square1 的情况下，将 abs(z) * abs(z) 写入 .cpp 文件而不是为其定义函数会更好吗？还有其他更好的方法吗？

Answer 1

我认为仅取虚部和实部的平方和是很难击败的。

下面我测量它比实际计算幅度和平方快大约 5 倍：

#include <complex>
#include <chrono>
#include <random>
#include <vector>
#include <iostream>

double square_of_magnitude_1( std::complex<double> z) {
    auto magnitude = std::abs(z);
    return magnitude * magnitude;
}

double square_of_magnitude_2( std::complex<double> z) {
    return z.imag() * z.imag() + z.real() * z.real();
}


volatile double v; // assign to volatile so calls do not get optimized away.
std::random_device rd;
std::mt19937 e2(rd());
std::uniform_real_distribution<double> dist(0, 10);

int main() {
    using std::chrono::high_resolution_clock;
    using std::chrono::duration_cast;
    using std::chrono::duration;
    using std::chrono::microseconds;

    std::vector<std::complex<double>> numbers(10000000);
    std::generate(numbers.begin(), numbers.end(), []() {return std::complex<double>(dist(e2), dist(e2)); });

    auto t1 = high_resolution_clock::now();
    for (const auto& z : numbers) {
        v = square_of_magnitude_1(z);
    }
    auto t2 = high_resolution_clock::now();
    std::cout << "square_of_magnitude_1 => " << duration_cast<microseconds>(t2 - t1).count() << "\n";


    t1 = high_resolution_clock::now();
    for (const auto& z : numbers) {
        v = square_of_magnitude_2(z);
    }
    t2 = high_resolution_clock::now();
    std::cout << "square_of_magnitude_2 => " << duration_cast<microseconds>(t2 - t1).count() << "\n";

}

上述产量的典型运行

square_of_magnitude_1 => 54948
square_of_magnitude_2 => 9730

Answer 2

根据 Peter 的回答（再次感谢），这是建议方法中最快的，我补充了我的比较代码供感兴趣的人使用：

#include <complex>
#include <chrono>

using complex = std::complex<double>;

double complex_square1(complex z) {
    return z.real()*z.real() + z.imag()*z.imag();
}


double complex_square2(complex z) {
    double abs_z = abs(z);
    return abs_z * abs_z;
}


complex z = complex(0.5, 0.5);

unsigned int N = (int)1e7;
double z_sq;

int main() {
    auto start = std::chrono::high_resolution_clock::now();

    for (size_t i = 0; i < N; ++i) {
        z_sq = z.real()*z.real() + z.imag()*z.imag();       // fastest
        // z_sq = complex_square1(z);                       // slower by a factor 1.6 WITHOUT OPTIMIZATION
        // z_sq = complex_square2(z);                       // slower by a factor 5.5
        // z_sq = pow(abs(z), 2);                           // slower by a factor 6
        // z_sq = norm(z);                                  // slower by a factor 5
    }

    auto end = std::chrono::high_resolution_clock::now();
    auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();

    printf("Duration: %d ms\n", duration);
    printf("z_sq = %f", z_sq);
}

这也回答了我的第二个问题：在没有优化的情况下进行编译时，函数调用确实变得很明显，但计算时间只增加了大约 60%，这是在我的 Windows 机器上使用 GCC 测得的编译器。

经过优化，基准测试中的前两行与 Patrick Roberts 指出的一样快。

Answer 3

根据@Azure27 的回答，我将代码重新整理成一个使用Google Benchmark.

的解决方案

#include <benchmark/benchmark.h>
#include <complex>
#include <chrono>

using complex = std::complex<double>;

double complex_square1(complex z) {
    return z.real()*z.real() + z.imag()*z.imag();
}


double complex_square2(complex z) {
    const double abs_z = abs(z);
    return abs_z * abs_z;
}


complex z{0.5, 0.5};
unsigned int N{(int)1e7};
volatile double z_sq;

int ExplicitSquareCalculation() 
{
    for (size_t i = 0; i < N; ++i) {
        z_sq = z.real()*z.real() + z.imag()*z.imag();       // fastest
    }

    // printf("z_sq = %f", z_sq);
    return 0;
}

static void BM_ExplicitSquareCalculation(benchmark::State& state) {
  for (auto _ : state) {
    ExplicitSquareCalculation();
  }
}

int ComplexSquare1() 
{
    for (size_t i = 0; i < N; ++i) {
        z_sq = complex_square1(z);                       // slower by a factor 1.6 WITHOUT OPTIMIZATION
    }

    // printf("z_sq = %f", z_sq);
    return 0;
}

static void BM_ComplexSquare1(benchmark::State& state) {
  for (auto _ : state) {
    ComplexSquare1();
  }
}

int ComplexSquare2() 
{
    for (size_t i = 0; i < N; ++i) {
        z_sq = complex_square2(z);                       // slower by a factor 5.5
    }

    // printf("z_sq = %f", z_sq);
    return 0;
}

static void BM_ComplexSquare2(benchmark::State& state) {
  for (auto _ : state) {
    ComplexSquare2();
  }
}

int SquareOfAbsolute() 
{
    for (size_t i = 0; i < N; ++i) {
        z_sq = pow(abs(z), 2);                           // slower by a factor 6
    }

    // printf("z_sq = %f", z_sq);
    return 0;
}

static void BM_SquareOfAbsolute(benchmark::State& state) {
  for (auto _ : state) {
    SquareOfAbsolute();
  }
}

int StdNormalize() 
{
    for (size_t i = 0; i < N; ++i) {
        z_sq = norm(z);                                  // slower by a factor 5
    }

    // printf("z_sq = %f", z_sq);
    return 0;
}

static void BM_StdNormalize(benchmark::State& state) {
  for (auto _ : state) {
    StdNormalize();
  }
}

BENCHMARK(BM_ExplicitSquareCalculation);
BENCHMARK(BM_ComplexSquare1);
BENCHMARK(BM_ComplexSquare2);
BENCHMARK(BM_SquareOfAbsolute);
BENCHMARK(BM_StdNormalize);

BENCHMARK_MAIN();

在 Compiler Explorer，我得到了奇怪的结果。 YMMV.

快速精确地实现 pow(complex<double>, 2)

Fast and precise implementation of pow(complex<double>, 2)

c++

performance

complex-numbers

pow