启用 OpenACC 后 C++ 矩阵乘法变慢
C++ Matrix Multiply Slower OpenACC
当我在使用 GCC 编译期间启用 OpenACC 时,我的代码速度变慢是否有原因?我目前在 Windows 10 上使用 GCC 6.3.0。我真的不确定为什么会这样。
这是我正在编译的命令:g++ -fopenacc -o a Example.cpp
这是我的 C++ 代码:
#include <stdio.h>
#include <stdlib.h>
#include <cassert>
#include <chrono>
// N x N matrices stored as flat arrays indexed [row * N + col].
// A and B are the inputs; C receives the product A * B.
double *A, *B, *C;

/// Benchmark driver: fills A and B with pseudo-random values, then times ten
/// runs of an N x N matrix multiplication annotated with OpenACC directives
/// and prints the wall-clock duration of each run.
///
/// @param argc unused.
/// @param argv unused.
/// @return 0 on completion.
int main(int argc, char* argv[]) {
    const long long N = 100;
    A = new double[N * N];
    B = new double[N * N];
    C = new double[N * N];

    // Fixed seed so every execution multiplies the same input values.
    srand(42);
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            A[i * N + j] = rand();
            B[i * N + j] = rand();
        }
    }

    for (int x = 0; x < 10; x++) {
        auto start_time = std::chrono::high_resolution_clock::now();
        // NOTE(review): no data clauses are given for A, B and C here; the
        // answer that follows this listing adds an explicit `acc data` region.
        #pragma acc kernels
        {
            #pragma acc loop independent
            for (int i = 0; i < N; i++) {
                #pragma acc loop independent
                for (int j = 0; j < N; j++) {
                    double total = 0;
                    #pragma acc loop independent reduction (+: total)
                    for (int k = 0; k < N; k++) {
                        // Row i of A times column j of B. A must be indexed
                        // by k; indexing it by j does not yield a matrix
                        // product.
                        total += A[i * N + k] * B[k * N + j];
                    }
                    C[i * N + j] = total;
                }
            }
        }
        auto end_time = std::chrono::high_resolution_clock::now();
        std::chrono::duration<double> duration = end_time - start_time;
        printf("%f seconds\n", duration.count());
    }

    // Release the buffers allocated with new[] above.
    delete[] A;
    delete[] B;
    delete[] C;
    return 0;
}
我不确定 GNU 编译器(GCC)6.3 是否支持 OpenACC——即便支持,也不够完善;我也不清楚它在 Windows 上是否可用。我在 Linux 上使用的是 GNU 10.2,它对 OpenACC 的支持要好得多。
不过,GNU 目前仍然不能很好地处理 `kernels` 指令,所以我建议改用 `parallel` 指令。此外,您的代码缺少数据区域(data region),因此一旦真正把这段代码卸载(offload)到加速器上运行,就会出现运行时错误。
例如:
% g++ --version
g++ (GCC) 10.2.0
Copyright (C) 2020 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
%
% cat mm.cpp
#include <stdlib.h>
#include <stdio.h>
#include <cassert>
#include <chrono>
// Selects the OpenACC compute construct used by `#pragma acc ACC_TYPE` below:
// `parallel` when USE_PARALLEL is defined (e.g. -DUSE_PARALLEL), otherwise
// `kernels`.
#ifdef USE_PARALLEL
#define ACC_TYPE parallel
#else
#define ACC_TYPE kernels
#endif
// N x N matrices stored as flat arrays indexed [row * N + col].
// A and B are the inputs; C receives the product A * B.
double *A, *B, *C;

/// Benchmark driver: fills A and B with pseudo-random values, then times ten
/// runs of an N x N matrix multiplication inside a single OpenACC data region
/// and prints the wall-clock duration of each run. The compute construct is
/// chosen by the ACC_TYPE macro.
///
/// @param argc unused.
/// @param argv unused.
/// @return 0 on completion.
int main(int argc, char* argv[]) {
    const long long N = 100;
    A = new double[N * N];
    B = new double[N * N];
    C = new double[N * N];

    // Fixed seed so every execution multiplies the same input values.
    srand(42);
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            A[i * N + j] = rand();
            B[i * N + j] = rand();
        }
    }

    // One data region around all ten timed runs: A and B are declared copyin,
    // C is declared copyout.
    #pragma acc data copyin(A[:N*N],B[:N*N]) copyout(C[:N*N])
    {
        for (int x = 0; x < 10; x++) {
            auto start_time = std::chrono::high_resolution_clock::now();
            #pragma acc ACC_TYPE
            {
                #pragma acc loop independent
                for (int i = 0; i < N; i++) {
                    #pragma acc loop independent
                    for (int j = 0; j < N; j++) {
                        double total = 0;
                        #pragma acc loop independent reduction (+: total)
                        for (int k = 0; k < N; k++) {
                            // Row i of A times column j of B. A must be
                            // indexed by k; indexing it by j does not yield a
                            // matrix product.
                            total += A[i * N + k] * B[k * N + j];
                        }
                        C[i * N + j] = total;
                    }
                }
            }
            auto end_time = std::chrono::high_resolution_clock::now();
            std::chrono::duration<double> duration = end_time - start_time;
            printf("%f seconds\n", duration.count());
        }
    }

    // Release the buffers allocated with new[] above, after the data region
    // has ended.
    delete[] A;
    delete[] B;
    delete[] C;
    return 0;
}
% g++ -fopenacc mm.cpp -Ofast ; a.out
0.020057 seconds
0.020025 seconds
0.020022 seconds
0.020021 seconds
0.019538 seconds
0.018271 seconds
0.018270 seconds
0.018264 seconds
0.018274 seconds
0.018270 seconds
% g++ -fopenacc mm.cpp -Ofast -DUSE_PARALLEL ; a.out
0.000123 seconds
0.000086 seconds
0.000081 seconds
0.000078 seconds
0.000078 seconds
0.000077 seconds
0.000077 seconds
0.000076 seconds
0.000076 seconds
0.000076 seconds
当我在使用 GCC 编译期间启用 OpenACC 时,我的代码速度变慢是否有原因?我目前在 Windows 10 上使用 GCC 6.3.0。我真的不确定为什么会这样。
这是我正在编译的命令:g++ -fopenacc -o a Example.cpp
这是我的 C++ 代码:
#include <stdlib.h>
#include <cassert>
#include <chrono>
// N x N matrices stored as flat arrays indexed [row * N + col].
// A and B are the inputs; C receives the product A * B.
double *A, *B, *C;

/// Benchmark driver: fills A and B with pseudo-random values, then times ten
/// runs of an N x N matrix multiplication annotated with OpenACC directives
/// and prints the wall-clock duration of each run.
///
/// @param argc unused.
/// @param argv unused.
/// @return 0 on completion.
int main(int argc, char* argv[]) {
    const long long N = 100;
    A = new double[N * N];
    B = new double[N * N];
    C = new double[N * N];

    // Fixed seed so every execution multiplies the same input values.
    srand(42);
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            A[i * N + j] = rand();
            B[i * N + j] = rand();
        }
    }

    for (int x = 0; x < 10; x++) {
        auto start_time = std::chrono::high_resolution_clock::now();
        // NOTE(review): no data clauses are given for A, B and C here; the
        // answer that follows this listing adds an explicit `acc data` region.
        #pragma acc kernels
        {
            #pragma acc loop independent
            for (int i = 0; i < N; i++) {
                #pragma acc loop independent
                for (int j = 0; j < N; j++) {
                    double total = 0;
                    #pragma acc loop independent reduction (+: total)
                    for (int k = 0; k < N; k++) {
                        // Row i of A times column j of B. A must be indexed
                        // by k; indexing it by j does not yield a matrix
                        // product.
                        total += A[i * N + k] * B[k * N + j];
                    }
                    C[i * N + j] = total;
                }
            }
        }
        auto end_time = std::chrono::high_resolution_clock::now();
        std::chrono::duration<double> duration = end_time - start_time;
        printf("%f seconds\n", duration.count());
    }

    // Release the buffers allocated with new[] above.
    delete[] A;
    delete[] B;
    delete[] C;
    return 0;
}
我不确定 GNU 编译器(GCC)6.3 是否支持 OpenACC——即便支持,也不够完善;我也不清楚它在 Windows 上是否可用。我在 Linux 上使用的是 GNU 10.2,它对 OpenACC 的支持要好得多。
不过,GNU 目前仍然不能很好地处理 `kernels` 指令,所以我建议改用 `parallel` 指令。此外,您的代码缺少数据区域(data region),因此一旦真正把这段代码卸载(offload)到加速器上运行,就会出现运行时错误。
例如:
% g++ --version
g++ (GCC) 10.2.0
Copyright (C) 2020 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
%
% cat mm.cpp
#include <stdlib.h>
#include <stdio.h>
#include <cassert>
#include <chrono>
// Selects the OpenACC compute construct used by `#pragma acc ACC_TYPE` below:
// `parallel` when USE_PARALLEL is defined (e.g. -DUSE_PARALLEL), otherwise
// `kernels`.
#ifdef USE_PARALLEL
#define ACC_TYPE parallel
#else
#define ACC_TYPE kernels
#endif
// N x N matrices stored as flat arrays indexed [row * N + col].
// A and B are the inputs; C receives the product A * B.
double *A, *B, *C;

/// Benchmark driver: fills A and B with pseudo-random values, then times ten
/// runs of an N x N matrix multiplication inside a single OpenACC data region
/// and prints the wall-clock duration of each run. The compute construct is
/// chosen by the ACC_TYPE macro.
///
/// @param argc unused.
/// @param argv unused.
/// @return 0 on completion.
int main(int argc, char* argv[]) {
    const long long N = 100;
    A = new double[N * N];
    B = new double[N * N];
    C = new double[N * N];

    // Fixed seed so every execution multiplies the same input values.
    srand(42);
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            A[i * N + j] = rand();
            B[i * N + j] = rand();
        }
    }

    // One data region around all ten timed runs: A and B are declared copyin,
    // C is declared copyout.
    #pragma acc data copyin(A[:N*N],B[:N*N]) copyout(C[:N*N])
    {
        for (int x = 0; x < 10; x++) {
            auto start_time = std::chrono::high_resolution_clock::now();
            #pragma acc ACC_TYPE
            {
                #pragma acc loop independent
                for (int i = 0; i < N; i++) {
                    #pragma acc loop independent
                    for (int j = 0; j < N; j++) {
                        double total = 0;
                        #pragma acc loop independent reduction (+: total)
                        for (int k = 0; k < N; k++) {
                            // Row i of A times column j of B. A must be
                            // indexed by k; indexing it by j does not yield a
                            // matrix product.
                            total += A[i * N + k] * B[k * N + j];
                        }
                        C[i * N + j] = total;
                    }
                }
            }
            auto end_time = std::chrono::high_resolution_clock::now();
            std::chrono::duration<double> duration = end_time - start_time;
            printf("%f seconds\n", duration.count());
        }
    }

    // Release the buffers allocated with new[] above, after the data region
    // has ended.
    delete[] A;
    delete[] B;
    delete[] C;
    return 0;
}
% g++ -fopenacc mm.cpp -Ofast ; a.out
0.020057 seconds
0.020025 seconds
0.020022 seconds
0.020021 seconds
0.019538 seconds
0.018271 seconds
0.018270 seconds
0.018264 seconds
0.018274 seconds
0.018270 seconds
% g++ -fopenacc mm.cpp -Ofast -DUSE_PARALLEL ; a.out
0.000123 seconds
0.000086 seconds
0.000081 seconds
0.000078 seconds
0.000078 seconds
0.000077 seconds
0.000077 seconds
0.000076 seconds
0.000076 seconds
0.000076 seconds