OpenMP performance optimization with a large array (matrix)

I'm new to OpenMP, so apologies if this question seems redundant. Here is my sequential code; it applies do_something() to every element of a row and stores the result in the next row:

    for (int i = column; i < row * column; i++)
        A[i] = do_something(A[i - column]);

I tried to parallelize it with a simple parallel for:

for (int i = 1; i < row; i++)
{
    // I put the column iteration in the inner loop
    // because of cache misses, and I tried to schedule the
    // same thread on the same cache line to avoid false
    // sharing
    #pragma omp parallel for schedule(static, cacheline_size)
    for (int j = 0; j < column; j++)
    {
        A[i * column + j] = do_something(A[(i - 1) * column + j]);
    }
}

However, I only get roughly a 1.5x speedup per thread, and the speedup tops out at 4 threads with about 2.5x (I have plenty of cores).

I tried putting the column loop on the outside, but that was even slower than the sequential code. I suspect the thread creation in every iteration of the loop is to blame; is there any way to improve this?

Edit: I'm using g++ on Ubuntu 20.04.

The loop over j is the problem. Starting the thread team and the synchronization at the end of the parallel section are quite expensive, especially with a large number of threads.
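To illustrate just that fork/join cost, here is a minimal sketch (my own illustration, not part of this answer; the function name row_by_row is made up) that creates the thread team once and keeps only the worksharing construct inside the row loop. The implicit barrier at the end of each omp for still enforces the row-to-row dependence, so this removes the repeated team start-up, but it does not fix the locality problem discussed next.

#include <omp.h>

double do_something(double v) { return v * 2; }  // stand-in for the real work

void row_by_row(double* A, const int column, const int row)
{
    #pragma omp parallel             // the team is created once, not once per row
    for (int i = 1; i < row; i++)    // every thread executes the full i loop
    {
        #pragma omp for schedule(static)
        for (int j = 0; j < column; j++)
            A[i * column + j] = do_something(A[(i - 1) * column + j]);
        // implicit barrier here: row i is complete before any thread starts row i + 1
    }
}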

Moving the loop to the outside destroys locality of reference. The classic solution to this problem is tiling: split the problematic loop into two, one stepping by cacheline_size over the whole range and one stepping by 1 over the range 0 to cacheline_size - 1. The outer loop exists to exploit parallelization and should be moved outside, while the inner loop exploits locality of reference and should stay inside.

#pragma omp parallel for
for (int j0 = 0; j0 < column; j0 += cacheline_size) {
    for (int i = 1; i < row; i++) {
        int j1 = MIN(j0 + cacheline_size, column);  // MIN(a, b): ((a) < (b) ? (a) : (b))
        for (int j = j0; j < j1; ++j)
            A[i * column + j] = do_something(A[(i - 1) * column + j]);
    }
}

Choosing the best step size usually requires some experimentation.
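For example, one could simply time the tiled kernel for a handful of candidate step sizes and keep the fastest. The sketch below is my own illustration (the name run_tiled and the candidate list are arbitrary assumptions); it uses omp_get_wtime() for the timing.

#include <omp.h>
#include <algorithm>
#include <cstdio>
#include <vector>

double do_something(double v) { return v * 2; }  // stand-in for the real work

void run_tiled(double* A, const int column, const int row, const int tile)
{
    #pragma omp parallel for
    for (int j0 = 0; j0 < column; j0 += tile)
    {
        const int j1 = std::min(j0 + tile, column);
        for (int i = 1; i < row; i++)
            for (int j = j0; j < j1; ++j)
                A[i * column + j] = do_something(A[(i - 1) * column + j]);
    }
}

int main()
{
    const int rows = 1024, cols = 1024;
    std::vector<double> A(static_cast<size_t>(rows) * cols, 1.0);

    // Try a few step sizes and report the wall-clock time for each.
    for (int tile : {8, 16, 32, 64, 128, 256})
    {
        const double t0 = omp_get_wtime();
        run_tiled(A.data(), cols, rows, tile);
        const double t1 = omp_get_wtime();
        std::printf("step %4d: %.3f ms\n", tile, (t1 - t0) * 1e3);
    }
}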

Using this benchmark: indeed, @tstanisl's solution is much better.

#include <benchmark/benchmark.h>
#include <algorithm>
#include <memory>
#include <omp.h>

constexpr int cacheline_size{64};

double do_something(const double v)
{
    return v * 2;
}

void do_matrix_calculation(double* A, const int column, const int row)
{
    for (int i = 1; i < row; i++)
    {   
        // I put column iteration in the inside loop
        // because of cache miss and I tried to schedule the
        // same thread on the same cache line to avoid false 
        // sharing
        #pragma omp parallel for schedule(static, cacheline_size)
        for (int j = 0; j < column; j++)
        {
            A[i * column + j] = do_something(A[(i - 1) * column + j]);
        }
    }
}

void BM_matrix_calculation(benchmark::State& state)
{
    const int n_threads = state.range(0);
    const int cols = state.range(1);
    const int rows = state.range(2);
    std::unique_ptr<double[]> arr = std::make_unique<double[]>(cols * rows);

    omp_set_num_threads(n_threads);

    for (auto _ : state)
    {
        do_matrix_calculation(arr.get(), cols, rows);
    }
}

void do_matrix_calculation2(double* A, const int column, const int row)
{
    // Note: collapse(2) shares out the whole (i, j) iteration space at once,
    // so the dependence of row i on row i - 1 is not enforced here.
    #pragma omp parallel for schedule(static, cacheline_size) collapse(2)
    for (int i = 1; i < row; i++)
    {   
        for (int j = 0; j < column; j++)
        {
            A[i * column + j] = do_something(A[(i - 1) * column + j]);
        }
    }
}

void BM_matrix_calculation2(benchmark::State& state)
{
    const int n_threads = state.range(0);
    const int cols = state.range(1);
    const int rows = state.range(2);
    std::unique_ptr<double[]> arr = std::make_unique<double[]>(cols * rows);

    omp_set_num_threads(n_threads);

    for (auto _ : state)
    {
        do_matrix_calculation2(arr.get(), cols, rows);
    }
}

void do_matrix_calculation3(double* A, const int column, const int row)
{
    #pragma omp parallel for
    for (int j0 = 0; j0 < column; j0+=cacheline_size) 
    {
        for (int i = 1; i < row; i++) 
        {
            int j1 = std::min(j0 + cacheline_size, column);
            for (int j = j0; j < j1; ++j)
                A[i * column + j] = do_something(A[(i - 1) * column + j]);
        }
    }
}

void BM_matrix_calculation3(benchmark::State& state)
{
    const int n_threads = state.range(0);
    const int cols = state.range(1);
    const int rows = state.range(2);
    std::unique_ptr<double[]> arr = std::make_unique<double[]>(cols * rows);

    omp_set_num_threads(n_threads);

    for (auto _ : state)
    {
        do_matrix_calculation3(arr.get(), cols, rows);
    }
}

BENCHMARK(BM_matrix_calculation)
    ->Args({1, 1024, 1024})
    ->Args({2, 1024, 1024})
    ->Args({4, 1024, 1024})
    ->Args({8, 1024, 1024});

BENCHMARK(BM_matrix_calculation2)
    ->Args({1, 1024, 1024})
    ->Args({2, 1024, 1024})
    ->Args({4, 1024, 1024})
    ->Args({8, 1024, 1024});

BENCHMARK(BM_matrix_calculation3)
    ->Args({1, 1024, 1024})
    ->Args({2, 1024, 1024})
    ->Args({4, 1024, 1024})
    ->Args({8, 1024, 1024});

BENCHMARK_MAIN();

The performance difference between BM_matrix_calculation and BM_matrix_calculation2 seems to be sensitive to the cache line size. Of course, this would need to be tuned for your particular machine.
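If tuning by hand, it can also help to know the actual cache line size rather than hard-coding 64. The following small sketch is Linux/glibc specific and is my own assumption, not part of the answers above; note that sysconf() reports bytes, while the schedule chunk and the tile step are counted in array elements.

#include <unistd.h>
#include <cstdio>

int main()
{
    // _SC_LEVEL1_DCACHE_LINESIZE is a glibc extension; it may return 0 or -1
    // on some systems, so fall back to a common value.
    long line_bytes = sysconf(_SC_LEVEL1_DCACHE_LINESIZE);
    if (line_bytes <= 0)
        line_bytes = 64;

    const long doubles_per_line = line_bytes / static_cast<long>(sizeof(double));
    std::printf("L1 line: %ld bytes = %ld doubles\n", line_bytes, doubles_per_line);
}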