通过 openmp 在 C++ 中并行化 three_for_loop
Parallelize three_for_loop in C++ by openmp
我有一段代码。其中,A、B、C、A1、B1、C1 都是三维向量。A、B、C 相互独立,A1、B1、C1 也相互独立。我想使用 OpenMP 对它进行并行计算。但是,当我用 OpenMP 运行它时,出现了 "Segmentation fault" 错误。您能帮我解决这个问题吗?先谢谢您。
#include <omp.h>
#include<math.h>
#include<cmath>
#include<vector>
#include<iostream>
using namespace std;
// Question's original attempt: fills the grids A, B, C with trigonometric
// values, then derives A1, B1, C1 from them, putting an OpenMP
// `parallel for` on the outer loop of each triple loop nest.
int main ()
{
int NX=801; // Number of grid points in X direction
int NY=501; // Number of grid points in Y direction
int NZ=401; // Number of grid points in Z direction
float PI=3.14159265358979323846;
// The loop counters are declared once here, outside the parallel regions,
// so their data-sharing is decided by the clauses on each pragma below.
unsigned int i,j,k;
// Six NX x NY x NZ grids of float, every element initialised to zero.
vector<vector<vector<float> > > A (NX,vector<vector<float> >(NY,vector <float>(NZ,0.0)));
vector<vector<vector<float> > > B (NX,vector<vector<float> >(NY,vector <float>(NZ,0.0)));
vector<vector<vector<float> > > C (NX,vector<vector<float> >(NY,vector <float>(NZ,0.0)));
vector<vector<vector<float> > > A1 (NX,vector<vector<float> >(NY,vector <float>(NZ,0.0)));
vector<vector<vector<float> > > B1 (NX,vector<vector<float> >(NY,vector <float>(NZ,0.0)));
vector<vector<vector<float> > > C1 (NX,vector<vector<float> >(NY,vector <float>(NZ,0.0)));
cout<<"start"<<endl;
// NOTE(review): only `j` is private here. `k` is listed in shared() even
// though it is the counter of the innermost loop, so all threads read and
// write the same `k` concurrently -- a data race that can leave `k` out of
// range when it is used as an index (consistent with the segmentation fault
// reported for this code).
// NOTE(review): `i` is also listed in shared() although it is the counter of
// the parallelised loop itself -- confirm how the compiler treats that.
#pragma omp parallel for private (j) shared(A,B,C,i,k,NX,NY,NZ)
for (i=0;i<NX;i++)
for (j=0;j<NY;j++)
for (k=0;k<NZ;k++)
{
A[i][j][k]=sin(2.0*PI/float(NX*NY*NZ)*float(i*j*k));
B[i][j][k]=cos(5.0*PI/float(NX*NY*NZ)*float(i*j*k));
C[i][j][k]=sin(2.0*PI/float(NX*NY*NZ))*cos(5.0*PI/float(NX*NY*NZ)*float(i*j*k));
}
// Second pass over interior points only: every index runs from 1 to N-2, so
// the C[i+1] read stays inside the grid. A, B and C are only read here.
// NOTE(review): same clause list as above -- `k` is shared across threads
// here too, with the same data race.
#pragma omp parallel for private (j) shared(A1,B1,C1,A,B,C,i,k,NX,NY,NZ)
for (i=1;i<NX-1;i++)
for (j=1;j<NY-1;j++)
for (k=1;k<NZ-1;k++)
{
A1[i][j][k]=C[i+1][j][k]*cos(5.0*PI/float(NX*NY*NZ)*float(i*j*k));
B1[i][j][k]=A[i][j][k]+B[i][j][k]+C[i][j][k]*cos(5.0*PI/float(NX*NY*NZ)*float(i*j*k));
C1[i][j][k]=16.0*A[i][j][k]*cos(5.0*PI/float(NX*NY*NZ)*float(i*j*k));
}
cout<<"finish"<<endl;
return 0;
}
这段代码很容易用 OpenMP 并行化。但是,您在自己的尝试中犯了一些错误,特别是试图把 `i` 和 `k` 声明为 `shared`,而实际上它们应该是 `private`。更好的做法是不要提前声明这些变量,而是直接在 `for` 循环内声明它们。这样,它们会自动具有正确的作用域,避免您把它们弄混。
按这种方式修改后的代码如下:
#include <omp.h>
#include<math.h>
#include<cmath>
#include<vector>
#include<iostream>
using namespace std;
// Corrected version: every loop counter is declared in its own `for`
// header, so each thread gets its own copy and no data-sharing clauses are
// needed on the pragmas.
int main()
{
    // Number of grid points along X, Y and Z.
    int NX = 801;
    int NY = 501;
    int NZ = 401;
    float PI = 3.14159265358979323846;

    typedef std::vector<float> Line;   // NZ values along Z
    typedef std::vector<Line> Slab;    // NY lines
    typedef std::vector<Slab> Grid;    // NX slabs

    // Six NX x NY x NZ grids, every element initialised to zero.
    Grid A(NX, Slab(NY, Line(NZ, 0.0f)));
    Grid B(NX, Slab(NY, Line(NZ, 0.0f)));
    Grid C(NX, Slab(NY, Line(NZ, 0.0f)));
    Grid A1(NX, Slab(NY, Line(NZ, 0.0f)));
    Grid B1(NX, Slab(NY, Line(NZ, 0.0f)));
    Grid C1(NX, Slab(NY, Line(NZ, 0.0f)));

    std::cout << "start" << std::endl;

    // First pass: fill A, B and C at every grid point. The outer (X) loop
    // is the one distributed over the threads.
    #pragma omp parallel for
    for (int ix = 0; ix < NX; ++ix)
    {
        for (int iy = 0; iy < NY; ++iy)
        {
            for (int iz = 0; iz < NZ; ++iz)
            {
                const float cells = float(NX * NY * NZ);
                const float ijk = float(ix * iy * iz);
                A[ix][iy][iz] = std::sin(2.0 * PI / cells * ijk);
                B[ix][iy][iz] = std::cos(5.0 * PI / cells * ijk);
                C[ix][iy][iz] = std::sin(2.0 * PI / cells) * std::cos(5.0 * PI / cells * ijk);
            }
        }
    }

    // Second pass: interior points only (indices 1 .. N-2), so the
    // C[ix + 1] read stays inside the grid. A, B and C are only read.
    #pragma omp parallel for
    for (int ix = 1; ix < NX - 1; ++ix)
    {
        for (int iy = 1; iy < NY - 1; ++iy)
        {
            for (int iz = 1; iz < NZ - 1; ++iz)
            {
                const float cells = float(NX * NY * NZ);
                const float ijk = float(ix * iy * iz);
                A1[ix][iy][iz] = C[ix + 1][iy][iz] * std::cos(5.0 * PI / cells * ijk);
                B1[ix][iy][iz] = A[ix][iy][iz] + B[ix][iy][iz] + C[ix][iy][iz] * std::cos(5.0 * PI / cells * ijk);
                C1[ix][iy][iz] = 16.0 * A[ix][iy][iz] * std::cos(5.0 * PI / cells * ijk);
            }
        }
    }

    std::cout << "finish" << std::endl;
    return 0;
}
现在,由于您要求并行化此代码,我猜您对性能感兴趣。因此,没有什么能阻止您像这样实施一两个非常基本的性能优化:
#include <omp.h>
#include<math.h>
#include<cmath>
#include<vector>
#include<iostream>
using namespace std;
// Optimised version: the loop-invariant factors are computed once, both
// loop nests share a single parallel region, and the elapsed wall-clock
// time is reported.
int main()
{
    // Number of grid points along X, Y and Z.
    int NX = 801;
    int NY = 501;
    int NZ = 401;
    float PI = 3.14159265358979323846;

    typedef std::vector<float> Line;   // NZ values along Z
    typedef std::vector<Line> Slab;    // NY lines
    typedef std::vector<Slab> Grid;    // NX slabs

    // Six NX x NY x NZ grids, every element initialised to zero.
    Grid A(NX, Slab(NY, Line(NZ, 0.0f)));
    Grid B(NX, Slab(NY, Line(NZ, 0.0f)));
    Grid C(NX, Slab(NY, Line(NZ, 0.0f)));
    Grid A1(NX, Slab(NY, Line(NZ, 0.0f)));
    Grid B1(NX, Slab(NY, Line(NZ, 0.0f)));
    Grid C1(NX, Slab(NY, Line(NZ, 0.0f)));

    // Factors that do not depend on the loop indices.
    const float piPerCell = PI / (NX * NY * NZ);
    const float sinTwoPiPerCell = std::sin(2.0f * piPerCell);

    std::cout << "start" << std::endl;
    const double startTime = omp_get_wtime();

    // One parallel region hosts both work-shared loops.
    #pragma omp parallel
    {
        // First pass: fill A, B and C at every grid point.
        #pragma omp for
        for (int ix = 0; ix < NX; ++ix)
        {
            for (int iy = 0; iy < NY; ++iy)
            {
                // Part of the argument that is constant along Z.
                const float rowPhase = ix * iy * piPerCell;
                for (int iz = 0; iz < NZ; ++iz)
                {
                    const float cosFive = std::cos(5.0f * rowPhase * iz);
                    A[ix][iy][iz] = std::sin(2.0f * rowPhase * iz);
                    B[ix][iy][iz] = cosFive;
                    C[ix][iy][iz] = sinTwoPiPerCell * cosFive;
                }
            }
        }

        // Second pass: interior points only (indices 1 .. N-2), so the
        // C[ix + 1] read stays inside the grid. A, B and C are only read.
        #pragma omp for
        for (int ix = 1; ix < NX - 1; ++ix)
        {
            for (int iy = 1; iy < NY - 1; ++iy)
            {
                const float rowPhase = ix * iy * piPerCell;
                for (int iz = 1; iz < NZ - 1; ++iz)
                {
                    const float cosFive = std::cos(5.0f * rowPhase * iz);
                    A1[ix][iy][iz] = C[ix + 1][iy][iz] * cosFive;
                    B1[ix][iy][iz] = A[ix][iy][iz] + B[ix][iy][iz] + C[ix][iy][iz] * cosFive;
                    C1[ix][iy][iz] = 16.0f * A[ix][iy][iz] * cosFive;
                }
            }
        }
    }

    const double elapsed = omp_get_wtime() - startTime;
    std::cout << "finish in " << elapsed << " seconds" << std::endl;
    return 0;
}
有了这个,您的代码应该已经快得多了。
我有一段代码。其中,A、B、C、A1、B1、C1 都是三维向量。A、B、C 相互独立,A1、B1、C1 也相互独立。我想使用 OpenMP 对它进行并行计算。但是,当我用 OpenMP 运行它时,出现了 "Segmentation fault" 错误。您能帮我解决这个问题吗?先谢谢您。
#include <omp.h>
#include<math.h>
#include<cmath>
#include<vector>
#include<iostream>
using namespace std;
// Question's original attempt: fills the grids A, B, C with trigonometric
// values, then derives A1, B1, C1 from them, putting an OpenMP
// `parallel for` on the outer loop of each triple loop nest.
int main ()
{
int NX=801; // Number of grid points in X direction
int NY=501; // Number of grid points in Y direction
int NZ=401; // Number of grid points in Z direction
float PI=3.14159265358979323846;
// The loop counters are declared once here, outside the parallel regions,
// so their data-sharing is decided by the clauses on each pragma below.
unsigned int i,j,k;
// Six NX x NY x NZ grids of float, every element initialised to zero.
vector<vector<vector<float> > > A (NX,vector<vector<float> >(NY,vector <float>(NZ,0.0)));
vector<vector<vector<float> > > B (NX,vector<vector<float> >(NY,vector <float>(NZ,0.0)));
vector<vector<vector<float> > > C (NX,vector<vector<float> >(NY,vector <float>(NZ,0.0)));
vector<vector<vector<float> > > A1 (NX,vector<vector<float> >(NY,vector <float>(NZ,0.0)));
vector<vector<vector<float> > > B1 (NX,vector<vector<float> >(NY,vector <float>(NZ,0.0)));
vector<vector<vector<float> > > C1 (NX,vector<vector<float> >(NY,vector <float>(NZ,0.0)));
cout<<"start"<<endl;
// NOTE(review): only `j` is private here. `k` is listed in shared() even
// though it is the counter of the innermost loop, so all threads read and
// write the same `k` concurrently -- a data race that can leave `k` out of
// range when it is used as an index (consistent with the segmentation fault
// reported for this code).
// NOTE(review): `i` is also listed in shared() although it is the counter of
// the parallelised loop itself -- confirm how the compiler treats that.
#pragma omp parallel for private (j) shared(A,B,C,i,k,NX,NY,NZ)
for (i=0;i<NX;i++)
for (j=0;j<NY;j++)
for (k=0;k<NZ;k++)
{
A[i][j][k]=sin(2.0*PI/float(NX*NY*NZ)*float(i*j*k));
B[i][j][k]=cos(5.0*PI/float(NX*NY*NZ)*float(i*j*k));
C[i][j][k]=sin(2.0*PI/float(NX*NY*NZ))*cos(5.0*PI/float(NX*NY*NZ)*float(i*j*k));
}
// Second pass over interior points only: every index runs from 1 to N-2, so
// the C[i+1] read stays inside the grid. A, B and C are only read here.
// NOTE(review): same clause list as above -- `k` is shared across threads
// here too, with the same data race.
#pragma omp parallel for private (j) shared(A1,B1,C1,A,B,C,i,k,NX,NY,NZ)
for (i=1;i<NX-1;i++)
for (j=1;j<NY-1;j++)
for (k=1;k<NZ-1;k++)
{
A1[i][j][k]=C[i+1][j][k]*cos(5.0*PI/float(NX*NY*NZ)*float(i*j*k));
B1[i][j][k]=A[i][j][k]+B[i][j][k]+C[i][j][k]*cos(5.0*PI/float(NX*NY*NZ)*float(i*j*k));
C1[i][j][k]=16.0*A[i][j][k]*cos(5.0*PI/float(NX*NY*NZ)*float(i*j*k));
}
cout<<"finish"<<endl;
return 0;
}
这段代码很容易用 OpenMP 并行化。但是,您在自己的尝试中犯了一些错误,特别是试图把 `i` 和 `k` 声明为 `shared`,而实际上它们应该是 `private`。更好的做法是不要提前声明这些变量,而是直接在 `for` 循环内声明它们。这样,它们会自动具有正确的作用域,避免您把它们弄混。
按这种方式修改后的代码如下:
#include <omp.h>
#include<math.h>
#include<cmath>
#include<vector>
#include<iostream>
using namespace std;
// Corrected version: every loop counter is declared in its own `for`
// header, so each thread gets its own copy and no data-sharing clauses are
// needed on the pragmas.
int main()
{
    // Number of grid points along X, Y and Z.
    int NX = 801;
    int NY = 501;
    int NZ = 401;
    float PI = 3.14159265358979323846;

    typedef std::vector<float> Line;   // NZ values along Z
    typedef std::vector<Line> Slab;    // NY lines
    typedef std::vector<Slab> Grid;    // NX slabs

    // Six NX x NY x NZ grids, every element initialised to zero.
    Grid A(NX, Slab(NY, Line(NZ, 0.0f)));
    Grid B(NX, Slab(NY, Line(NZ, 0.0f)));
    Grid C(NX, Slab(NY, Line(NZ, 0.0f)));
    Grid A1(NX, Slab(NY, Line(NZ, 0.0f)));
    Grid B1(NX, Slab(NY, Line(NZ, 0.0f)));
    Grid C1(NX, Slab(NY, Line(NZ, 0.0f)));

    std::cout << "start" << std::endl;

    // First pass: fill A, B and C at every grid point. The outer (X) loop
    // is the one distributed over the threads.
    #pragma omp parallel for
    for (int ix = 0; ix < NX; ++ix)
    {
        for (int iy = 0; iy < NY; ++iy)
        {
            for (int iz = 0; iz < NZ; ++iz)
            {
                const float cells = float(NX * NY * NZ);
                const float ijk = float(ix * iy * iz);
                A[ix][iy][iz] = std::sin(2.0 * PI / cells * ijk);
                B[ix][iy][iz] = std::cos(5.0 * PI / cells * ijk);
                C[ix][iy][iz] = std::sin(2.0 * PI / cells) * std::cos(5.0 * PI / cells * ijk);
            }
        }
    }

    // Second pass: interior points only (indices 1 .. N-2), so the
    // C[ix + 1] read stays inside the grid. A, B and C are only read.
    #pragma omp parallel for
    for (int ix = 1; ix < NX - 1; ++ix)
    {
        for (int iy = 1; iy < NY - 1; ++iy)
        {
            for (int iz = 1; iz < NZ - 1; ++iz)
            {
                const float cells = float(NX * NY * NZ);
                const float ijk = float(ix * iy * iz);
                A1[ix][iy][iz] = C[ix + 1][iy][iz] * std::cos(5.0 * PI / cells * ijk);
                B1[ix][iy][iz] = A[ix][iy][iz] + B[ix][iy][iz] + C[ix][iy][iz] * std::cos(5.0 * PI / cells * ijk);
                C1[ix][iy][iz] = 16.0 * A[ix][iy][iz] * std::cos(5.0 * PI / cells * ijk);
            }
        }
    }

    std::cout << "finish" << std::endl;
    return 0;
}
现在,由于您要求并行化此代码,我猜您对性能感兴趣。因此,没有什么能阻止您像这样实施一两个非常基本的性能优化:
#include <omp.h>
#include<math.h>
#include<cmath>
#include<vector>
#include<iostream>
using namespace std;
// Optimised version: the loop-invariant factors are computed once, both
// loop nests share a single parallel region, and the elapsed wall-clock
// time is reported.
int main()
{
    // Number of grid points along X, Y and Z.
    int NX = 801;
    int NY = 501;
    int NZ = 401;
    float PI = 3.14159265358979323846;

    typedef std::vector<float> Line;   // NZ values along Z
    typedef std::vector<Line> Slab;    // NY lines
    typedef std::vector<Slab> Grid;    // NX slabs

    // Six NX x NY x NZ grids, every element initialised to zero.
    Grid A(NX, Slab(NY, Line(NZ, 0.0f)));
    Grid B(NX, Slab(NY, Line(NZ, 0.0f)));
    Grid C(NX, Slab(NY, Line(NZ, 0.0f)));
    Grid A1(NX, Slab(NY, Line(NZ, 0.0f)));
    Grid B1(NX, Slab(NY, Line(NZ, 0.0f)));
    Grid C1(NX, Slab(NY, Line(NZ, 0.0f)));

    // Factors that do not depend on the loop indices.
    const float piPerCell = PI / (NX * NY * NZ);
    const float sinTwoPiPerCell = std::sin(2.0f * piPerCell);

    std::cout << "start" << std::endl;
    const double startTime = omp_get_wtime();

    // One parallel region hosts both work-shared loops.
    #pragma omp parallel
    {
        // First pass: fill A, B and C at every grid point.
        #pragma omp for
        for (int ix = 0; ix < NX; ++ix)
        {
            for (int iy = 0; iy < NY; ++iy)
            {
                // Part of the argument that is constant along Z.
                const float rowPhase = ix * iy * piPerCell;
                for (int iz = 0; iz < NZ; ++iz)
                {
                    const float cosFive = std::cos(5.0f * rowPhase * iz);
                    A[ix][iy][iz] = std::sin(2.0f * rowPhase * iz);
                    B[ix][iy][iz] = cosFive;
                    C[ix][iy][iz] = sinTwoPiPerCell * cosFive;
                }
            }
        }

        // Second pass: interior points only (indices 1 .. N-2), so the
        // C[ix + 1] read stays inside the grid. A, B and C are only read.
        #pragma omp for
        for (int ix = 1; ix < NX - 1; ++ix)
        {
            for (int iy = 1; iy < NY - 1; ++iy)
            {
                const float rowPhase = ix * iy * piPerCell;
                for (int iz = 1; iz < NZ - 1; ++iz)
                {
                    const float cosFive = std::cos(5.0f * rowPhase * iz);
                    A1[ix][iy][iz] = C[ix + 1][iy][iz] * cosFive;
                    B1[ix][iy][iz] = A[ix][iy][iz] + B[ix][iy][iz] + C[ix][iy][iz] * cosFive;
                    C1[ix][iy][iz] = 16.0f * A[ix][iy][iz] * cosFive;
                }
            }
        }
    }

    const double elapsed = omp_get_wtime() - startTime;
    std::cout << "finish in " << elapsed << " seconds" << std::endl;
    return 0;
}
有了这个,您的代码应该已经快得多了。