OpenMP performance optimization with a large array (matrix)
I'm new to OpenMP, so apologies if this question seems redundant.
Here is my sequential code; it applies do_something() to every element of a row and stores the result into the next row:
for (int n = column; n < row * column; n++)
    A[n] = do_something(A[n - column]);
I tried to parallelize it with a simple parallel for:
for (int i = 1; i < row; i++)
{
    // I put the column iteration in the inner loop
    // because of cache misses, and I tried to schedule the
    // same thread on the same cache line to avoid false
    // sharing
    #pragma omp parallel for schedule(static, cacheline_size)
    for (int j = 0; j < column; j++)
    {
        A[i * column + j] = do_something(A[(i - 1) * column + j]);
    }
}
However, I only get about a 1.5x speedup per thread, and the speedup caps out at about 2.5x with 4 threads (I have plenty of cores).
I also tried moving the column loop to the outside, but that was even slower than the sequential code. I suspect the problem is the thread creation inside every loop iteration. Is there a way to improve this?
Edit: I'm using g++ on Ubuntu 20.04.
The loop over j is the problem. Starting the thread team and the synchronization at the end of the parallel region are quite expensive, especially for a large number of threads.
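To make that cost concrete, here is a sketch (only an illustration of hoisting the fork/join out of the row loop, not the tiling solution below) that creates the thread team once and reuses it; the implicit barrier at the end of each omp for still preserves the row-to-row dependency:
#pragma omp parallel
for (int i = 1; i < row; i++)
{
    // The team is created once, before the i loop. Each "omp for" only
    // distributes the j iterations and ends with an implicit barrier, so
    // row i is finished before any thread starts row i + 1.
    #pragma omp for schedule(static)
    for (int j = 0; j < column; j++)
    {
        A[i * column + j] = do_something(A[(i - 1) * column + j]);
    }
}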
Simply moving the loop outside would destroy locality of reference. The classic solution to this problem is tiling: split the problematic loop into two, one stepping by cacheline_size over the whole range, and another with step 1 running from 0 to cacheline_size - 1. The outer loop exists to exploit parallelization, so it should be moved outside, while the inner loop exploits locality of reference, so it should stay inside.
#pragma omp parallel for
for (int j0 = 0; j0 < column; j0 += cacheline_size) {
    for (int i = 1; i < row; i++) {
        int j1 = MIN(j0 + cacheline_size, column);  // MIN is the usual minimum, e.g. std::min
        for (int j = j0; j < j1; ++j)
            A[i * column + j] = do_something(A[(i - 1) * column + j]);
    }
}
Picking the best tile size usually requires some experimentation.
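For instance, a minimal sketch of that experimentation, assuming a hypothetical calc_tiled kernel that takes the tile width as a runtime parameter and timing it with omp_get_wtime(), might look like this:
#include <algorithm>
#include <cstdio>
#include <vector>
#include <omp.h>

// Hypothetical helper, standing in for the real per-element work.
static double do_something(double v) { return v * 2; }

// Same tiled kernel as above, but with the tile width as a runtime parameter.
static void calc_tiled(double* A, int column, int row, int tile)
{
    #pragma omp parallel for
    for (int j0 = 0; j0 < column; j0 += tile)
    {
        int j1 = std::min(j0 + tile, column);
        for (int i = 1; i < row; i++)
            for (int j = j0; j < j1; ++j)
                A[i * column + j] = do_something(A[(i - 1) * column + j]);
    }
}

int main()
{
    const int rows = 1024, cols = 1024;
    std::vector<double> A(static_cast<size_t>(rows) * cols, 1.0);
    for (int tile : {16, 32, 64, 128, 256})
    {
        double t0 = omp_get_wtime();
        calc_tiled(A.data(), cols, rows, tile);
        double t1 = omp_get_wtime();
        std::printf("tile = %3d : %.3f ms\n", tile, (t1 - t0) * 1e3);
    }
}
Compile with something like g++ -O2 -fopenmp. A proper comparison should also repeat each measurement several times, which the Google Benchmark code below does automatically.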
With this benchmark, @tstanisl's solution is indeed much better.
#include <algorithm>
#include <benchmark/benchmark.h>
#include <memory>
#include <omp.h>
constexpr int cacheline_size{64};  // chunk / tile width, counted in doubles

double do_something(const double v)
{
    return v * 2;
}
void do_matrix_calculation(double* A, const int column, const int row)
{
    for (int i = 1; i < row; i++)
    {
        // I put column iteration in the inside loop
        // because of cache miss and I tried to schedule the
        // same thread on the same cache line to avoid false
        // sharing
        #pragma omp parallel for schedule(static, cacheline_size)
        for (int j = 0; j < column; j++)
        {
            A[i * column + j] = do_something(A[(i - 1) * column + j]);
        }
    }
}
void BM_matrix_calculation(benchmark::State& state)
{
    const int n_threads = state.range(0);
    const int cols = state.range(1);
    const int rows = state.range(2);
    std::unique_ptr<double[]> arr = std::make_unique<double[]>(cols * rows);
    omp_set_num_threads(n_threads);
    for (auto _ : state)
    {
        do_matrix_calculation(arr.get(), cols, rows);
    }
}
void do_matrix_calculation2(double* A, const int column, const int row)
{
    // Both loops merged into one parallel region via collapse(2)
    #pragma omp parallel for schedule(static, cacheline_size) collapse(2)
    for (int i = 1; i < row; i++)
    {
        for (int j = 0; j < column; j++)
        {
            A[i * column + j] = do_something(A[(i - 1) * column + j]);
        }
    }
}
void BM_matrix_calculation2(benchmark::State& state)
{
    const int n_threads = state.range(0);
    const int cols = state.range(1);
    const int rows = state.range(2);
    std::unique_ptr<double[]> arr = std::make_unique<double[]>(cols * rows);
    omp_set_num_threads(n_threads);
    for (auto _ : state)
    {
        do_matrix_calculation2(arr.get(), cols, rows);
    }
}
void do_matrix_calculation3(double* A, const int column, const int row)
{
    // @tstanisl's tiled version: each thread works on its own stripe of columns
    #pragma omp parallel for
    for (int j0 = 0; j0 < column; j0 += cacheline_size)
    {
        for (int i = 1; i < row; i++)
        {
            int j1 = std::min(j0 + cacheline_size, column);
            for (int j = j0; j < j1; ++j)
                A[i * column + j] = do_something(A[(i - 1) * column + j]);
        }
    }
}
void BM_matrix_calculation3(benchmark::State& state)
{
    const int n_threads = state.range(0);
    const int cols = state.range(1);
    const int rows = state.range(2);
    std::unique_ptr<double[]> arr = std::make_unique<double[]>(cols * rows);
    omp_set_num_threads(n_threads);
    for (auto _ : state)
    {
        do_matrix_calculation3(arr.get(), cols, rows);
    }
}
// Args are {number of threads, columns, rows}
BENCHMARK(BM_matrix_calculation)
    ->Args({1, 1024, 1024})
    ->Args({2, 1024, 1024})
    ->Args({4, 1024, 1024})
    ->Args({8, 1024, 1024});
BENCHMARK(BM_matrix_calculation2)
    ->Args({1, 1024, 1024})
    ->Args({2, 1024, 1024})
    ->Args({4, 1024, 1024})
    ->Args({8, 1024, 1024});
BENCHMARK(BM_matrix_calculation3)
    ->Args({1, 1024, 1024})
    ->Args({2, 1024, 1024})
    ->Args({4, 1024, 1024})
    ->Args({8, 1024, 1024});
BENCHMARK_MAIN();
The performance difference between BM_matrix_calculation and BM_matrix_calculation2 seems to be sensitive to the cache line size. Of course, this needs to be tuned for your particular machine.
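As an aside on that tuning: rather than hard-coding 64, on Linux (the question mentions g++ on Ubuntu 20.04) the L1 data cache line size can be queried at runtime. This is only a sketch, and note that cacheline_size in the benchmark above counts double elements, not bytes:
#include <cstdio>
#include <unistd.h>

int main()
{
    // Bytes per L1 data cache line as reported by glibc; the call may
    // return 0 or -1 if the value is unknown on a given system.
    long line_bytes = sysconf(_SC_LEVEL1_DCACHE_LINESIZE);
    if (line_bytes <= 0)
        line_bytes = 64;  // common fallback on x86-64

    // The schedule chunk / tile width in the benchmark is counted in doubles.
    long line_doubles = line_bytes / static_cast<long>(sizeof(double));
    std::printf("cache line: %ld bytes = %ld doubles\n", line_bytes, line_doubles);
}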