OpenACC 和 CUDA 感知 MPI
OpenACC and CUDA aware MPI
我想在设备上移动 main 中的整个 while 循环。当我将 #pragma acc host_data use_device(err)
添加到 MPI_Allreduce (&err, &err, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
时出现问题。
错误是 err 的减少不起作用,因此代码在循环一步后退出。
在MPI_Allreduce()
之后,即使使用#pragma acc update self(err)
,err仍然为零。
我正在编译 mpicc -acc -ta=tesla:managed -Minfo=accel -w jacobi.c
并且 运行 mpirun -np 2 -mca pml ^ucx ./a.out
你能帮我找出错误吗?
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#define PARALLEL
#define NX_GLOB 128 /* Global number of interior points */
#define NY_GLOB 128 /* Global number of interior points */
#define NGHOST 1
#define NDIM 2
#ifdef PARALLEL
#include <mpi.h>
MPI_Comm MPI_COMM_CART;
#endif
typedef struct MPI_Decomp_{
int nprocs[NDIM]; /* Number of processors in each dimension */
int periods[NDIM]; /* Periodicity flag in each dimension */
int coords[NDIM]; /* Cartesian coordinate in the MPI topology */
int gsize[NDIM]; /* Global domain size (no ghosts) */
int lsize[NDIM]; /* Local domain size (no ghosts) */
int start[NDIM]; /* Local start index in each dimension */
int procL[NDIM]; /* Rank of left-lying process in each direction */
int procR[NDIM]; /* Rank of right-lying process in each direction */
int rank; /* Local process rank */
int size; /* Communicator size */
} MPI_Decomp;
void BoundaryConditions(double **, double *, double *, int, int, MPI_Decomp *);
void DomainDecomposition(MPI_Decomp *);
void WriteSolution (double **, int, int, MPI_Decomp *);
double **Allocate_2DdblArray(int, int);
int **Allocate_2DintArray(int, int);
void Show_2DdblArray(double **, int, int, const char *);
void Show_2DintArray(int **, int, int, const char *);
int nx_tot, ny_tot;
int main(int argc, char ** argv)
{
int nx, i, ibeg, iend;
int ny, j, jbeg, jend;
int k, rank=0, size=1;
double xbeg = 0.0, xend = 1.0;
double ybeg = 0.0, yend = 1.0;
double dx = (xend - xbeg)/(NX_GLOB + 1);
double dy = (yend - ybeg)/(NY_GLOB + 1);
double *xg, *yg, *x, *y, **phi, **phi0;
double err, tol;
MPI_Decomp mpi_decomp;
double err_glob;
int procL[NDIM] = {-1,-1};
int procR[NDIM] = {-1,-1};
/* --------------------------------------------------------
0. Initialize the MPI execution environment
-------------------------------------------------------- */
#ifdef PARALLEL
MPI_Datatype row_type, col_type;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &size);
DomainDecomposition(&mpi_decomp);
nx = mpi_decomp.lsize[0];
ny = mpi_decomp.lsize[1];
#else
mpi_decomp.gsize[0] = mpi_decomp.lsize[0] = nx = NX_GLOB;
mpi_decomp.gsize[1] = mpi_decomp.lsize[1] = ny = NY_GLOB;
mpi_decomp.procL[0] = mpi_decomp.procL[1] = -1;
mpi_decomp.procR[0] = mpi_decomp.procR[1] = -1;
#endif
/* --------------------------------------------------------
1. Set local grid indices
-------------------------------------------------------- */
ibeg = NGHOST;
iend = ibeg + nx - 1;
nx = iend - ibeg + 1;
nx_tot = nx + 2*NGHOST;
jbeg = NGHOST;
jend = jbeg + ny - 1;
ny = jend - jbeg + 1;
ny_tot = ny + 2*NGHOST;
/* --------------------------------------------------------
2. Generate global and local grids
-------------------------------------------------------- */
xg = (double *) malloc ( (NX_GLOB+2*NGHOST)*sizeof(double));
yg = (double *) malloc ( (NY_GLOB+2*NGHOST)*sizeof(double));
for (i = 0; i < (NX_GLOB+2*NGHOST); i++) xg[i] = xbeg + (i-ibeg+1)*dx;
for (j = 0; j < (NY_GLOB+2*NGHOST); j++) yg[j] = ybeg + (j-jbeg+1)*dy;
#ifdef PARALLEL
x = xg + mpi_decomp.start[0];
y = yg + mpi_decomp.start[1];
#else
x = xg;
y = yg;
#endif
/* --------------------------------------------------------
3. Allocate memory on local processor and
assign initial conditions.
-------------------------------------------------------- */
phi = Allocate_2DdblArray(ny_tot, nx_tot);
phi0 = Allocate_2DdblArray(ny_tot, nx_tot);
for (j = jbeg; j <= jend; j++){
for (i = ibeg; i <= iend; i++){
phi0[j][i] = 0.0;
}}
#ifdef PARALLEL
MPI_Type_contiguous (nx_tot, MPI_DOUBLE, &row_type);
MPI_Type_vector (ny_tot, 1, nx_tot, MPI_DOUBLE, &col_type);
MPI_Type_commit (&row_type);
MPI_Type_commit (&col_type);
#endif
/* --------------------------------------------------------
4. Main iteration cycle
-------------------------------------------------------- */
tol = 1.e-5;
err = 1.0;
k = 0;
#pragma acc enter data copyin(phi[:ny_tot][:nx_tot], phi0[:ny_tot][:nx_tot], x[:NX_GLOB+2*NGHOST], y[NX_GLOB+2*NGHOST], err, err_glob)
while (err > tol){
/* -- 4a. Set boundary conditions first -- */
BoundaryConditions(phi0, x, y, nx, ny, &mpi_decomp);
/* -- 4b. Jacobi's method and residual (interior points) -- */
err = 0.0;
#pragma acc parallel loop collapse(2) reduction(+:err) present(phi[:ny_tot][:nx_tot], phi0[:ny_tot][:nx_tot])
for (j = jbeg; j <= jend; j++){
for (i = ibeg; i <= iend; i++){
phi[j][i] = 0.25*( phi0[j][i-1] + phi0[j][i+1]
+ phi0[j-1][i] + phi0[j+1][i] );
err += dx*dy*fabs(phi[j][i] - phi0[j][i]);
}}
#pragma acc parallel loop collapse(2) present(phi[:ny_tot][:nx_tot], phi0[:ny_tot][:nx_tot])
for (j = jbeg; j <= jend; j++){
for (i = ibeg; i <= iend; i++){
phi0[j][i] = phi[j][i];
}}
#ifdef PARALLEL
// double err_glob;
#pragma acc host_data use_device(err, err_glob)
{
MPI_Allreduce (&err, &err_glob, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
}
err = err_glob;
#endif
// #pragma acc update host(err)
if (rank == 0){
printf ("k = %d; err = %8.3e\n",k, err);
}
k++;
}
#pragma acc exit data copyout(phi[:ny_tot][:nx_tot], phi0[:ny_tot][:nx_tot], err, err_glob)
WriteSolution (phi, nx, ny, &mpi_decomp);
#ifdef PARALLEL
MPI_Finalize();
#endif
return 0;
}
#ifdef PARALLEL
/* ********************************************************************* */
void DomainDecomposition(MPI_Decomp *mpi_decomp)
/*
*
*********************************************************************** */
{
int dim, i;
int rank, size;
int *coords = mpi_decomp->coords;
int *gsize = mpi_decomp->gsize;
int *lsize = mpi_decomp->lsize;
int *nprocs = mpi_decomp->nprocs;
int *periods = mpi_decomp->periods;
int *procL = mpi_decomp->procL;
int *procR = mpi_decomp->procR;
int *start = mpi_decomp->start;
int new_coords[NDIM];
/* --------------------------------------------------------
1. Get rank & size
-------------------------------------------------------- */
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &size);
mpi_decomp->rank = rank;
mpi_decomp->size = size;
/* --------------------------------------------------------
2. Obtain number of processor along each dimension.
Use maximally squared decomp.
-------------------------------------------------------- */
nprocs[0] = (int)sqrt(size);
nprocs[1] = size/nprocs[0];
if (nprocs[0]*nprocs[1] != size){
if (rank == 0) printf ("! Cannot decompose\n");
MPI_Finalize();
exit(1);
}
if (rank == 0){
printf ("Decomposition achieved with %d X %d procs\n",nprocs[0],nprocs[1]);
}
periods[0] = 0;
periods[1] = 0;
/* --------------------------------------------------------
3. Create Cartesian topology
-------------------------------------------------------- */
MPI_Cart_create(MPI_COMM_WORLD, NDIM, nprocs, periods,
0, &MPI_COMM_CART);
MPI_Cart_get(MPI_COMM_CART, NDIM, nprocs, periods, coords);
/* --------------------------------------------------------
4. Fill structure members
-------------------------------------------------------- */
gsize[0] = NX_GLOB;
gsize[1] = NY_GLOB;
lsize[0] = NX_GLOB/nprocs[0];
lsize[1] = NY_GLOB/nprocs[1];
start[0] = coords[0]*lsize[0];
start[1] = coords[1]*lsize[1];
/* --------------------------------------------------------
5. Determine ranks of neighbour processors
-------------------------------------------------------- */
for (dim = 0; dim < NDIM; dim++) {
for (i = 0; i < NDIM; i++) new_coords[i] = coords[i];
new_coords[dim] = coords[dim] + 1;
if (new_coords[dim] < nprocs[dim]) {
MPI_Cart_rank ( MPI_COMM_CART, new_coords, &(procR[dim]) );
} else {
procR[dim] = MPI_PROC_NULL;
}
new_coords[dim] = coords[dim] - 1;
if (new_coords[dim] >= 0) {
MPI_Cart_rank ( MPI_COMM_CART, new_coords, &(procL[dim]) );
} else {
procL[dim] = MPI_PROC_NULL;
}
}
/* --------------------------------------------------------
6. Print processor information.
(Use MPI_Bcast() to print in sequence)
-------------------------------------------------------- */
int proc, go;
for (proc = 0; proc < size; proc++){
go = proc;
MPI_Bcast(&go, 1, MPI_INT, 0, MPI_COMM_WORLD);
if (rank == go) {
printf ("[Rank %d]\n",rank);
printf (" coords = [%d, %d], lsize = [%d, %d]\n",
coords[0], coords[1], lsize[0], lsize[1]);
for (dim = 0; dim < NDIM; dim++){
printf (" (procL, procR)[%d] = %d, %d\n", dim, procL[dim], procR[dim]);
}
}
MPI_Barrier(MPI_COMM_WORLD);
}
return;
}
#endif
/* ********************************************************************* */
void BoundaryConditions(double **phi, double *x, double *y,
int nx, int ny, MPI_Decomp *mpi_decomp)
/*
*********************************************************************** */
{
int i,j;
int ibeg = NGHOST;
int iend = ibeg + nx - 1;
int jbeg = NGHOST;
int jend = jbeg + ny - 1;
int *procL = mpi_decomp->procL;
int *procR = mpi_decomp->procR;
#ifdef PARALLEL
int rank = mpi_decomp->rank;
int size = mpi_decomp->size;
double send_buf[NX_GLOB + 2*NGHOST];
double recv_buf[NX_GLOB + 2*NGHOST];
/* Used for testing
for (j = 0; j <= jend+1; j++){
for (i = 0; i <= iend+1; i++){
phi[j][i] = -1;
}}
for (j = jbeg; j <= jend; j++){
for (i = ibeg; i <= iend; i++){
phi[j][i] = rank;
}}
*/
#pragma acc enter data create(send_buf[:NX_GLOB+2*NGHOST], recv_buf[NX_GLOB+2*NGHOST])
// Left buffer
i = ibeg;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], send_buf[:NX_GLOB+2*NGHOST])
for (j = jbeg; j <= jend; j++) send_buf[j] = phi[j][i];
#pragma acc host_data use_device(send_buf, recv_buf)
{
MPI_Sendrecv (send_buf, jend+1, MPI_DOUBLE, procL[0], 0,
recv_buf, jend+1, MPI_DOUBLE, procL[0], 0,
MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], recv_buf[NX_GLOB+2*NGHOST])
for (j = jbeg; j <= jend; j++) phi[j][i-1] = recv_buf[j];
// Right buffer
i = iend;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], send_buf[:NX_GLOB+2*NGHOST])
for (j = jbeg; j <= jend; j++) send_buf[j] = phi[j][i];
#pragma acc host_data use_device(send_buf, recv_buf)
{
MPI_Sendrecv (send_buf, jend+1, MPI_DOUBLE, procR[0], 0,
recv_buf, jend+1, MPI_DOUBLE, procR[0], 0,
MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], recv_buf[NX_GLOB+2*NGHOST])
for (j = jbeg; j <= jend; j++) phi[j][i+1] = recv_buf[j];
// Bottom buffer
j = jbeg;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], send_buf[:NX_GLOB+2*NGHOST])
for (i = ibeg; i <= iend; i++) send_buf[i] = phi[j][i];
// #pragma acc update self(send_buf[:NX_GLOB+2*NGHOST])
#pragma acc host_data use_device(send_buf, recv_buf)
{
MPI_Sendrecv (send_buf, iend+1, MPI_DOUBLE, procL[1], 0,
recv_buf, iend+1, MPI_DOUBLE, procL[1], 0,
MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], recv_buf[NX_GLOB+2*NGHOST])
for (i = ibeg; i <= iend; i++) phi[j-1][i] = recv_buf[i];
// Top buffer
j = jend;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], send_buf[:NX_GLOB+2*NGHOST])
for (i = ibeg; i <= iend; i++) send_buf[i] = phi[j][i];
#pragma acc host_data use_device(send_buf, recv_buf)
{
MPI_Sendrecv (send_buf, iend+1, MPI_DOUBLE, procR[1], 0,
recv_buf, iend+1, MPI_DOUBLE, procR[1], 0,
MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], recv_buf[NX_GLOB+2*NGHOST])
for (i = ibeg; i <= iend; i++) phi[j+1][i] = recv_buf[i];
#pragma acc exit data copyout(send_buf[:NX_GLOB+2*NGHOST], recv_buf[NX_GLOB+2*NGHOST])
#endif
/* -- Left -- */
if (procL[0] < 0){
i = ibeg-1;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], y[:NY_GLOB+2*NGHOST])
for (j = jbeg; j <= jend; j++) phi[j][i] = 1.0-y[j];
}
/* -- Right -- */
if (procR[0] < 0){
i = iend+1;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], y[:NY_GLOB+2*NGHOST])
for (j = jbeg; j <= jend; j++) phi[j][i] = y[j]*y[j];
}
/* -- Bottom -- */
if (procL[1] < 0){
j = jbeg-1;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], x[:NX_GLOB+2*NGHOST])
for (i = ibeg; i <= iend; i++) phi[j][i] = 1.0-x[i];
}
/* -- Top -- */
if (procR[1] < 0){
j = jend+1;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], x[:NX_GLOB+2*NGHOST])
for (i = ibeg; i <= iend; i++) phi[j][i] = x[i];
}
return;
#ifdef PARALLEL
// Print
MPI_Barrier(MPI_COMM_WORLD);
int go, proc;
for (proc = 0; proc < size; proc++){
go = proc;
MPI_Bcast(&go, 1, MPI_INT, 0, MPI_COMM_WORLD);
if (rank == go) {
printf ("Boundary [Rank %d]\n",rank);
for (j = jend+1; j >= 0; j--){
for (i = 0; i <= iend+1; i++){
printf ("%6.2f ", phi[j][i]);
}
printf ("\n");
}
}
}
MPI_Finalize();
exit(0);
#endif
}
/* ********************************************************************* */
void WriteSolution (double **phi, int nx, int ny, MPI_Decomp *md)
/*
*********************************************************************** */
{
int i,j;
int ibeg = NGHOST;
int iend = ibeg + nx - 1;
int jbeg = NGHOST;
int jend = jbeg + ny - 1;
static int nfile = 0;
char fname[32];
sprintf (fname,"laplace2D_MPIACC.txt",nfile);
/*
for (j = jbeg-1; j <= jend+1; j++) for (i = ibeg-1; i <= iend+1; i++) {
phi[j][i] = -1;
}
for (j = jbeg; j <= jend; j++) for (i = ibeg; i <= iend; i++) {
phi[j][i] = md->rank;
}
*/
#ifdef PARALLEL
MPI_File fh;
MPI_Datatype type_local, type_domain;
int amode = MPI_MODE_CREATE | MPI_MODE_WRONLY;
int gsize[2], lsize[2], start[2];
/* --------------------------------------------------------
1. Create a local array type without the ghost zones
This datatype will be passed to MPI_File_write()
-------------------------------------------------------- */
gsize[0] = md->lsize[0] + 2*NGHOST;
gsize[1] = md->lsize[1] + 2*NGHOST;
lsize[0] = md->lsize[0];
lsize[1] = md->lsize[1];
start[0] = NGHOST;
start[1] = NGHOST;
MPI_Type_create_subarray (NDIM, gsize, lsize, start,
MPI_ORDER_FORTRAN, MPI_DOUBLE, &type_local);
MPI_Type_commit (&type_local);
/* --------------------------------------------------------
2. Create the subarry in the global domain.
This datatype is used to set the file view.
-------------------------------------------------------- */
gsize[0] = NX_GLOB;
gsize[1] = NY_GLOB;
lsize[0] = md->lsize[0];
lsize[1] = md->lsize[1];
start[0] = lsize[0]*md->coords[0]; // equal to md->start[0]
start[1] = lsize[1]*md->coords[1]; // equal to md->start[1]
MPI_Type_create_subarray (NDIM, gsize, lsize, start,
MPI_ORDER_FORTRAN, MPI_DOUBLE, &type_domain);
MPI_Type_commit (&type_domain);
/* --------------------------------------------------------
3. Write to disk
-------------------------------------------------------- */
MPI_File_delete(fname, MPI_INFO_NULL);
MPI_File_open(MPI_COMM_CART, fname, amode, MPI_INFO_NULL, &fh);
MPI_File_set_view(fh, 0, MPI_DOUBLE, type_domain, "native", MPI_INFO_NULL);
MPI_File_write_all(fh, phi[0], 1, type_local, MPI_STATUS_IGNORE);
MPI_File_close(&fh);
MPI_Type_free (&type_local);
MPI_Type_free (&type_domain);
#else
FILE *fp;
printf ("> Writing %s\n",fname);
fp = fopen(fname, "wb");
for (j = jbeg; j <= jend; j++){
fwrite (phi[j] + ibeg, sizeof(double), nx, fp);
}
fclose(fp);
#endif
nfile++;
}
/* ********************************************************************* */
double **Allocate_2DdblArray(int nx, int ny)
/*
* Allocate memory for a double precision array with
* nx rows and ny columns
*********************************************************************** */
{
int i,j;
double **buf;
buf = (double **)malloc (nx*sizeof(double *));
buf[0] = (double *) malloc (nx*ny*sizeof(double));
for (j = 1; j < nx; j++) buf[j] = buf[j-1] + ny;
return buf;
}
/* ********************************************************************* */
int **Allocate_2DintArray(int nx, int ny)
/*
* Allocate memory for an integer-type array with
* nx rows and ny columns
*********************************************************************** */
{
int i,j;
int **buf;
buf = (int **)malloc (nx*sizeof(int *));
buf[0] = (int *) malloc (nx*ny*sizeof(int));
for (j = 1; j < nx; j++) buf[j] = buf[j-1] + ny;
return buf;
}
/* ********************************************************************* */
void Show_2DdblArray(double **A, int nx, int ny, const char *string)
/*
*********************************************************************** */
{
int i, j;
printf ("%s\n",string);
printf ("------------------------------\n");
for (i = 0; i < nx; i++) {
for (j = 0; j < ny; j++) {
printf ("%8.2f ", A[i][j]);
}
printf ("\n");
}
printf ("------------------------------\n");
}
/* ********************************************************************* */
void Show_2DintArray(int **A, int nx, int ny, const char *string)
/*
*********************************************************************** */
{
int i, j;
printf ("%s\n",string);
for (j = 0; j < ny; j++) printf ("-----");
printf ("\n");
for (i = 0; i < nx; i++) {
for (j = 0; j < ny; j++) {
printf ("%03d ", A[i][j]);
}
printf ("\n");
}
for (j = 0; j < ny; j++) printf ("-----");
printf ("\n");
}
感谢您更新示例。这里有一些问题。
首先,对于“err”和“err_glob”。在循环开始时,您在主机上设置“err=0”但不在设备上更新它。然后在 MPI_AllReduce 调用之后,您再次在主机上设置“err=err_glob”,因此需要更新“err_glob”。
第二个问题是,当 运行 具有多个等级时,代码会部分出现“y”错误。问题是您使用的是全局大小而不是“x”和“y”的局部大小,因此当您复制“y”时,由于偏移量,它会与“x”重叠。我通过将“xg”和“yg”复制到设备来解决这个问题。
至于相对于 CPU 的性能,这里的主要问题是尺寸小,因此代码严重利用 GPU。我将 GLOB 大小增加到 4096 并看到更好的相对性能,尽管代码收敛得更快。
我还冒昧地添加了一些用于设备分配排名的样板代码,以便代码可以利用多个 GPU。
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#define PARALLEL
#define NX_GLOB 128 /* Global number of interior points */
#define NY_GLOB 128 /* Global number of interior points */
#define NGHOST 1
#define NDIM 2
#ifdef PARALLEL
#include <mpi.h>
MPI_Comm MPI_COMM_CART;
#endif
#ifdef _OPENACC
#include <openacc.h>
#endif
typedef struct MPI_Decomp_{
int nprocs[NDIM]; /* Number of processors in each dimension */
int periods[NDIM]; /* Periodicity flag in each dimension */
int coords[NDIM]; /* Cartesian coordinate in the MPI topology */
int gsize[NDIM]; /* Global domain size (no ghosts) */
int lsize[NDIM]; /* Local domain size (no ghosts) */
int start[NDIM]; /* Local start index in each dimension */
int procL[NDIM]; /* Rank of left-lying process in each direction */
int procR[NDIM]; /* Rank of right-lying process in each direction */
int rank; /* Local process rank */
int size; /* Communicator size */
} MPI_Decomp;
void BoundaryConditions(double **, double *, double *, int, int, MPI_Decomp *);
void DomainDecomposition(MPI_Decomp *);
void WriteSolution (double **, int, int, MPI_Decomp *);
double **Allocate_2DdblArray(int, int);
int **Allocate_2DintArray(int, int);
void Show_2DdblArray(double **, int, int, const char *);
void Show_2DintArray(int **, int, int, const char *);
int nx_tot, ny_tot;
int main(int argc, char ** argv)
{
int nx, i, ibeg, iend;
int ny, j, jbeg, jend;
int k, rank=0, size=1;
int xsize,ysize;
double xbeg = 0.0, xend = 1.0;
double ybeg = 0.0, yend = 1.0;
double dx = (xend - xbeg)/(NX_GLOB + 1);
double dy = (yend - ybeg)/(NY_GLOB + 1);
double *xg, *yg, *x, *y, **phi, **phi0;
double err, tol;
MPI_Decomp mpi_decomp;
double err_glob;
int procL[NDIM] = {-1,-1};
int procR[NDIM] = {-1,-1};
/* --------------------------------------------------------
0. Initialize the MPI execution environment
-------------------------------------------------------- */
#ifdef PARALLEL
MPI_Datatype row_type, col_type;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &size);
DomainDecomposition(&mpi_decomp);
nx = mpi_decomp.lsize[0];
ny = mpi_decomp.lsize[1];
#else
mpi_decomp.gsize[0] = mpi_decomp.lsize[0] = nx = NX_GLOB;
mpi_decomp.gsize[1] = mpi_decomp.lsize[1] = ny = NY_GLOB;
mpi_decomp.procL[0] = mpi_decomp.procL[1] = -1;
mpi_decomp.procR[0] = mpi_decomp.procR[1] = -1;
#endif
#ifdef _OPENACC
/* -------------------------------------------------------
0. Set the device for each rank
------------------------------------------------------- */
int device_type, num_devices;
int gpuId;
MPI_Comm shmcomm;
int local_rank;
// Get the local rank number
MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0,
MPI_INFO_NULL, &shmcomm);
MPI_Comm_rank(shmcomm, &local_rank);
// Device num = local rank mod number of devices on the node
device_type = acc_get_device_type();
num_devices = acc_get_num_devices(device_type);
gpuId = local_rank % num_devices;
acc_set_device_num(gpuId, device_type);
acc_init(device_type);
#endif
/* --------------------------------------------------------
1. Set local grid indices
-------------------------------------------------------- */
ibeg = NGHOST;
iend = ibeg + nx - 1;
nx = iend - ibeg + 1;
nx_tot = nx + 2*NGHOST;
jbeg = NGHOST;
jend = jbeg + ny - 1;
ny = jend - jbeg + 1;
ny_tot = ny + 2*NGHOST;
/* --------------------------------------------------------
2. Generate global and local grids
-------------------------------------------------------- */
xg = (double *) malloc ( (NX_GLOB+2*NGHOST)*sizeof(double));
yg = (double *) malloc ( (NY_GLOB+2*NGHOST)*sizeof(double));
for (i = 0; i < (NX_GLOB+2*NGHOST); i++) xg[i] = xbeg + (i-ibeg+1)*dx;
for (j = 0; j < (NY_GLOB+2*NGHOST); j++) yg[j] = ybeg + (j-jbeg+1)*dy;
#pragma acc enter data copyin(xg[:NX_GLOB+2*NGHOST],yg[:NY_GLOB+2*NGHOST])
#ifdef PARALLEL
x = xg + mpi_decomp.start[0];
y = yg + mpi_decomp.start[1];
#else
x = xg;
y = yg;
#endif
/* --------------------------------------------------------
3. Allocate memory on local processor and
assign initial conditions.
-------------------------------------------------------- */
phi = Allocate_2DdblArray(ny_tot, nx_tot);
phi0 = Allocate_2DdblArray(ny_tot, nx_tot);
for (j = jbeg; j <= jend; j++){
for (i = ibeg; i <= iend; i++){
phi0[j][i] = 0.0;
}}
#ifdef PARALLEL
MPI_Type_contiguous (nx_tot, MPI_DOUBLE, &row_type);
MPI_Type_vector (ny_tot, 1, nx_tot, MPI_DOUBLE, &col_type);
MPI_Type_commit (&row_type);
MPI_Type_commit (&col_type);
#endif
/* --------------------------------------------------------
4. Main iteration cycle
-------------------------------------------------------- */
tol = 1.e-5;
err = 1.0;
k = 0;
//#pragma acc enter data copyin(phi[:ny_tot][:nx_tot], phi0[:ny_tot][:nx_tot], x[:NX_GLOB+2*NGHOST], y[:NX_GLOB+2*NGHOST])
#pragma acc enter data copyin(phi[:ny_tot][:nx_tot], phi0[:ny_tot][:nx_tot],err,err_glob)
while (err > tol){
/* -- 4a. Set boundary conditions first -- */
BoundaryConditions(phi0, x, y, nx, ny, &mpi_decomp);
/* -- 4b. Jacobi's method and residual (interior points) -- */
err = 0.0;
#pragma acc update device(err)
#pragma acc parallel loop collapse(2) reduction(+:err) present(phi[:ny_tot][:nx_tot], phi0[:ny_tot][:nx_tot])
for (j = jbeg; j <= jend; j++){
for (i = ibeg; i <= iend; i++){
phi[j][i] = 0.25*( phi0[j][i-1] + phi0[j][i+1]
+ phi0[j-1][i] + phi0[j+1][i] );
err += dx*dy*fabs(phi[j][i] - phi0[j][i]);
}}
#pragma acc parallel loop collapse(2) present(phi[:ny_tot][:nx_tot], phi0[:ny_tot][:nx_tot])
for (j = jbeg; j <= jend; j++){
for (i = ibeg; i <= iend; i++){
phi0[j][i] = phi[j][i];
}}
#ifdef PARALLEL
// double err_glob;
#pragma acc host_data use_device(err, err_glob)
{
MPI_Allreduce (&err, &err_glob, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
}
#pragma acc update host(err_glob)
err = err_glob;
#endif
if (rank == 0){
printf ("k = %d; err = %8.3e\n",k, err);
}
k++;
}
#pragma acc exit data copyout(phi[:ny_tot][:nx_tot], phi0[:ny_tot][:nx_tot],err,err_glob)
WriteSolution (phi, nx, ny, &mpi_decomp);
#ifdef PARALLEL
MPI_Finalize();
#endif
return 0;
}
#ifdef PARALLEL
/* ********************************************************************* */
void DomainDecomposition(MPI_Decomp *mpi_decomp)
/*
*
*********************************************************************** */
{
int dim, i;
int rank, size;
int *coords = mpi_decomp->coords;
int *gsize = mpi_decomp->gsize;
int *lsize = mpi_decomp->lsize;
int *nprocs = mpi_decomp->nprocs;
int *periods = mpi_decomp->periods;
int *procL = mpi_decomp->procL;
int *procR = mpi_decomp->procR;
int *start = mpi_decomp->start;
int new_coords[NDIM];
/* --------------------------------------------------------
1. Get rank & size
-------------------------------------------------------- */
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &size);
mpi_decomp->rank = rank;
mpi_decomp->size = size;
/* --------------------------------------------------------
2. Obtain number of processor along each dimension.
Use maximally squared decomp.
-------------------------------------------------------- */
nprocs[0] = (int)sqrt(size);
nprocs[1] = size/nprocs[0];
if (nprocs[0]*nprocs[1] != size){
if (rank == 0) printf ("! Cannot decompose\n");
MPI_Finalize();
exit(1);
}
if (rank == 0){
printf ("Decomposition achieved with %d X %d procs\n",nprocs[0],nprocs[1]);
}
periods[0] = 0;
periods[1] = 0;
/* --------------------------------------------------------
3. Create Cartesian topology
-------------------------------------------------------- */
MPI_Cart_create(MPI_COMM_WORLD, NDIM, nprocs, periods,
0, &MPI_COMM_CART);
MPI_Cart_get(MPI_COMM_CART, NDIM, nprocs, periods, coords);
/* --------------------------------------------------------
4. Fill structure members
-------------------------------------------------------- */
gsize[0] = NX_GLOB;
gsize[1] = NY_GLOB;
lsize[0] = NX_GLOB/nprocs[0];
lsize[1] = NY_GLOB/nprocs[1];
start[0] = coords[0]*lsize[0];
start[1] = coords[1]*lsize[1];
/* --------------------------------------------------------
5. Determine ranks of neighbour processors
-------------------------------------------------------- */
for (dim = 0; dim < NDIM; dim++) {
for (i = 0; i < NDIM; i++) new_coords[i] = coords[i];
new_coords[dim] = coords[dim] + 1;
if (new_coords[dim] < nprocs[dim]) {
MPI_Cart_rank ( MPI_COMM_CART, new_coords, &(procR[dim]) );
} else {
procR[dim] = MPI_PROC_NULL;
}
new_coords[dim] = coords[dim] - 1;
if (new_coords[dim] >= 0) {
MPI_Cart_rank ( MPI_COMM_CART, new_coords, &(procL[dim]) );
} else {
procL[dim] = MPI_PROC_NULL;
}
}
/* --------------------------------------------------------
6. Print processor information.
(Use MPI_Bcast() to print in sequence)
-------------------------------------------------------- */
int proc, go;
for (proc = 0; proc < size; proc++){
go = proc;
MPI_Bcast(&go, 1, MPI_INT, 0, MPI_COMM_WORLD);
if (rank == go) {
printf ("[Rank %d]\n",rank);
printf (" coords = [%d, %d], lsize = [%d, %d]\n",
coords[0], coords[1], lsize[0], lsize[1]);
for (dim = 0; dim < NDIM; dim++){
printf (" (procL, procR)[%d] = %d, %d\n", dim, procL[dim], procR[dim]);
}
}
MPI_Barrier(MPI_COMM_WORLD);
}
return;
}
#endif
/* ********************************************************************* */
void BoundaryConditions(double **phi, double *x, double *y,
int nx, int ny, MPI_Decomp *mpi_decomp)
/*
*********************************************************************** */
{
int i,j;
int ibeg = NGHOST;
int iend = ibeg + nx - 1;
int jbeg = NGHOST;
int jend = jbeg + ny - 1;
int *procL = mpi_decomp->procL;
int *procR = mpi_decomp->procR;
#ifdef PARALLEL
int rank = mpi_decomp->rank;
int size = mpi_decomp->size;
double send_buf[NX_GLOB + 2*NGHOST];
double recv_buf[NX_GLOB + 2*NGHOST];
/* Used for testing
for (j = 0; j <= jend+1; j++){
for (i = 0; i <= iend+1; i++){
phi[j][i] = -1;
}}
for (j = jbeg; j <= jend; j++){
for (i = ibeg; i <= iend; i++){
phi[j][i] = rank;
}}
*/
#pragma acc enter data create(send_buf[:NX_GLOB+2*NGHOST], recv_buf[NX_GLOB+2*NGHOST])
// Left buffer
i = ibeg;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], send_buf[:NX_GLOB+2*NGHOST])
for (j = jbeg; j <= jend; j++) send_buf[j] = phi[j][i];
#pragma acc host_data use_device(send_buf, recv_buf)
{
MPI_Sendrecv (send_buf, jend+1, MPI_DOUBLE, procL[0], 0,
recv_buf, jend+1, MPI_DOUBLE, procL[0], 0,
MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], recv_buf[NX_GLOB+2*NGHOST])
for (j = jbeg; j <= jend; j++) phi[j][i-1] = recv_buf[j];
// Right buffer
i = iend;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], send_buf[:NX_GLOB+2*NGHOST])
for (j = jbeg; j <= jend; j++) send_buf[j] = phi[j][i];
#pragma acc host_data use_device(send_buf, recv_buf)
{
MPI_Sendrecv (send_buf, jend+1, MPI_DOUBLE, procR[0], 0,
recv_buf, jend+1, MPI_DOUBLE, procR[0], 0,
MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], recv_buf[NX_GLOB+2*NGHOST])
for (j = jbeg; j <= jend; j++) phi[j][i+1] = recv_buf[j];
// Bottom buffer
j = jbeg;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], send_buf[:NX_GLOB+2*NGHOST])
for (i = ibeg; i <= iend; i++) send_buf[i] = phi[j][i];
// #pragma acc update self(send_buf[:NX_GLOB+2*NGHOST])
#pragma acc host_data use_device(send_buf, recv_buf)
{
MPI_Sendrecv (send_buf, iend+1, MPI_DOUBLE, procL[1], 0,
recv_buf, iend+1, MPI_DOUBLE, procL[1], 0,
MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], recv_buf[NX_GLOB+2*NGHOST])
for (i = ibeg; i <= iend; i++) phi[j-1][i] = recv_buf[i];
// Top buffer
j = jend;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], send_buf[:NX_GLOB+2*NGHOST])
for (i = ibeg; i <= iend; i++) send_buf[i] = phi[j][i];
#pragma acc host_data use_device(send_buf, recv_buf)
{
MPI_Sendrecv (send_buf, iend+1, MPI_DOUBLE, procR[1], 0,
recv_buf, iend+1, MPI_DOUBLE, procR[1], 0,
MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], recv_buf[NX_GLOB+2*NGHOST])
for (i = ibeg; i <= iend; i++) phi[j+1][i] = recv_buf[i];
#pragma acc exit data copyout(send_buf[:NX_GLOB+2*NGHOST], recv_buf[NX_GLOB+2*NGHOST])
#endif
/* -- Left -- */
if (procL[0] < 0){
i = ibeg-1;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], y)
for (j = jbeg; j <= jend; j++) phi[j][i] = 1.0-y[j];
}
/* -- Right -- */
if (procR[0] < 0){
i = iend+1;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], y)
for (j = jbeg; j <= jend; j++) phi[j][i] = y[j]*y[j];
}
/* -- Bottom -- */
if (procL[1] < 0){
j = jbeg-1;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], x)
for (i = ibeg; i <= iend; i++) phi[j][i] = 1.0-x[i];
}
/* -- Top -- */
if (procR[1] < 0){
j = jend+1;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], x)
for (i = ibeg; i <= iend; i++) phi[j][i] = x[i];
}
return;
#ifdef PARALLEL
// Print
MPI_Barrier(MPI_COMM_WORLD);
int go, proc;
for (proc = 0; proc < size; proc++){
go = proc;
MPI_Bcast(&go, 1, MPI_INT, 0, MPI_COMM_WORLD);
if (rank == go) {
printf ("Boundary [Rank %d]\n",rank);
for (j = jend+1; j >= 0; j--){
for (i = 0; i <= iend+1; i++){
printf ("%6.2f ", phi[j][i]);
}
printf ("\n");
}
}
}
MPI_Finalize();
exit(0);
#endif
}
/* ********************************************************************* */
void WriteSolution (double **phi, int nx, int ny, MPI_Decomp *md)
/*
*********************************************************************** */
{
int i,j;
int ibeg = NGHOST;
int iend = ibeg + nx - 1;
int jbeg = NGHOST;
int jend = jbeg + ny - 1;
static int nfile = 0;
char fname[32];
sprintf (fname,"laplace2D_MPIACC.txt",nfile);
/*
for (j = jbeg-1; j <= jend+1; j++) for (i = ibeg-1; i <= iend+1; i++) {
phi[j][i] = -1;
}
for (j = jbeg; j <= jend; j++) for (i = ibeg; i <= iend; i++) {
phi[j][i] = md->rank;
}
*/
#ifdef PARALLEL
MPI_File fh;
MPI_Datatype type_local, type_domain;
int amode = MPI_MODE_CREATE | MPI_MODE_WRONLY;
int gsize[2], lsize[2], start[2];
/* --------------------------------------------------------
1. Create a local array type without the ghost zones
This datatype will be passed to MPI_File_write()
-------------------------------------------------------- */
gsize[0] = md->lsize[0] + 2*NGHOST;
gsize[1] = md->lsize[1] + 2*NGHOST;
lsize[0] = md->lsize[0];
lsize[1] = md->lsize[1];
start[0] = NGHOST;
start[1] = NGHOST;
MPI_Type_create_subarray (NDIM, gsize, lsize, start,
MPI_ORDER_FORTRAN, MPI_DOUBLE, &type_local);
MPI_Type_commit (&type_local);
/* --------------------------------------------------------
2. Create the subarry in the global domain.
This datatype is used to set the file view.
-------------------------------------------------------- */
gsize[0] = NX_GLOB;
gsize[1] = NY_GLOB;
lsize[0] = md->lsize[0];
lsize[1] = md->lsize[1];
start[0] = lsize[0]*md->coords[0]; // equal to md->start[0]
start[1] = lsize[1]*md->coords[1]; // equal to md->start[1]
MPI_Type_create_subarray (NDIM, gsize, lsize, start,
MPI_ORDER_FORTRAN, MPI_DOUBLE, &type_domain);
MPI_Type_commit (&type_domain);
/* --------------------------------------------------------
3. Write to disk
-------------------------------------------------------- */
MPI_File_delete(fname, MPI_INFO_NULL);
MPI_File_open(MPI_COMM_CART, fname, amode, MPI_INFO_NULL, &fh);
MPI_File_set_view(fh, 0, MPI_DOUBLE, type_domain, "native", MPI_INFO_NULL);
MPI_File_write_all(fh, phi[0], 1, type_local, MPI_STATUS_IGNORE);
MPI_File_close(&fh);
MPI_Type_free (&type_local);
MPI_Type_free (&type_domain);
#else
FILE *fp;
printf ("> Writing %s\n",fname);
fp = fopen(fname, "wb");
for (j = jbeg; j <= jend; j++){
fwrite (phi[j] + ibeg, sizeof(double), nx, fp);
}
fclose(fp);
#endif
nfile++;
}
/* ********************************************************************* */
double **Allocate_2DdblArray(int nx, int ny)
/*
* Allocate memory for a double precision array with
* nx rows and ny columns
*********************************************************************** */
{
int i,j;
double **buf;
buf = (double **)malloc (nx*sizeof(double *));
buf[0] = (double *) malloc (nx*ny*sizeof(double));
for (j = 1; j < nx; j++) buf[j] = buf[j-1] + ny;
return buf;
}
/* ********************************************************************* */
int **Allocate_2DintArray(int nx, int ny)
/*
* Allocate memory for an integer-type array with
* nx rows and ny columns
*********************************************************************** */
{
int i,j;
int **buf;
buf = (int **)malloc (nx*sizeof(int *));
buf[0] = (int *) malloc (nx*ny*sizeof(int));
for (j = 1; j < nx; j++) buf[j] = buf[j-1] + ny;
return buf;
}
/* ********************************************************************* */
void Show_2DdblArray(double **A, int nx, int ny, const char *string)
/*
*********************************************************************** */
{
int i, j;
printf ("%s\n",string);
printf ("------------------------------\n");
for (i = 0; i < nx; i++) {
for (j = 0; j < ny; j++) {
printf ("%8.2f ", A[i][j]);
}
printf ("\n");
}
printf ("------------------------------\n");
}
/* ********************************************************************* */
void Show_2DintArray(int **A, int nx, int ny, const char *string)
/*
*********************************************************************** */
{
int i, j;
printf ("%s\n",string);
for (j = 0; j < ny; j++) printf ("-----");
printf ("\n");
for (i = 0; i < nx; i++) {
for (j = 0; j < ny; j++) {
printf ("%03d ", A[i][j]);
}
printf ("\n");
}
for (j = 0; j < ny; j++) printf ("-----");
printf ("\n");
}
我想在设备上移动 main 中的整个 while 循环。当我将 #pragma acc host_data use_device(err)
添加到 MPI_Allreduce (&err, &err, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
时出现问题。
错误是 err 的减少不起作用,因此代码在循环一步后退出。
在MPI_Allreduce()
之后,即使使用#pragma acc update self(err)
,err仍然为零。
我正在编译 mpicc -acc -ta=tesla:managed -Minfo=accel -w jacobi.c
并且 运行 mpirun -np 2 -mca pml ^ucx ./a.out
你能帮我找出错误吗?
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#define PARALLEL
#define NX_GLOB 128 /* Global number of interior points */
#define NY_GLOB 128 /* Global number of interior points */
#define NGHOST 1
#define NDIM 2
#ifdef PARALLEL
#include <mpi.h>
MPI_Comm MPI_COMM_CART;
#endif
typedef struct MPI_Decomp_{
int nprocs[NDIM]; /* Number of processors in each dimension */
int periods[NDIM]; /* Periodicity flag in each dimension */
int coords[NDIM]; /* Cartesian coordinate in the MPI topology */
int gsize[NDIM]; /* Global domain size (no ghosts) */
int lsize[NDIM]; /* Local domain size (no ghosts) */
int start[NDIM]; /* Local start index in each dimension */
int procL[NDIM]; /* Rank of left-lying process in each direction */
int procR[NDIM]; /* Rank of right-lying process in each direction */
int rank; /* Local process rank */
int size; /* Communicator size */
} MPI_Decomp;
void BoundaryConditions(double **, double *, double *, int, int, MPI_Decomp *);
void DomainDecomposition(MPI_Decomp *);
void WriteSolution (double **, int, int, MPI_Decomp *);
double **Allocate_2DdblArray(int, int);
int **Allocate_2DintArray(int, int);
void Show_2DdblArray(double **, int, int, const char *);
void Show_2DintArray(int **, int, int, const char *);
int nx_tot, ny_tot;
int main(int argc, char ** argv)
{
int nx, i, ibeg, iend;
int ny, j, jbeg, jend;
int k, rank=0, size=1;
double xbeg = 0.0, xend = 1.0;
double ybeg = 0.0, yend = 1.0;
double dx = (xend - xbeg)/(NX_GLOB + 1);
double dy = (yend - ybeg)/(NY_GLOB + 1);
double *xg, *yg, *x, *y, **phi, **phi0;
double err, tol;
MPI_Decomp mpi_decomp;
double err_glob;
int procL[NDIM] = {-1,-1};
int procR[NDIM] = {-1,-1};
/* --------------------------------------------------------
0. Initialize the MPI execution environment
-------------------------------------------------------- */
#ifdef PARALLEL
MPI_Datatype row_type, col_type;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &size);
DomainDecomposition(&mpi_decomp);
nx = mpi_decomp.lsize[0];
ny = mpi_decomp.lsize[1];
#else
mpi_decomp.gsize[0] = mpi_decomp.lsize[0] = nx = NX_GLOB;
mpi_decomp.gsize[1] = mpi_decomp.lsize[1] = ny = NY_GLOB;
mpi_decomp.procL[0] = mpi_decomp.procL[1] = -1;
mpi_decomp.procR[0] = mpi_decomp.procR[1] = -1;
#endif
/* --------------------------------------------------------
1. Set local grid indices
-------------------------------------------------------- */
ibeg = NGHOST;
iend = ibeg + nx - 1;
nx = iend - ibeg + 1;
nx_tot = nx + 2*NGHOST;
jbeg = NGHOST;
jend = jbeg + ny - 1;
ny = jend - jbeg + 1;
ny_tot = ny + 2*NGHOST;
/* --------------------------------------------------------
2. Generate global and local grids
-------------------------------------------------------- */
xg = (double *) malloc ( (NX_GLOB+2*NGHOST)*sizeof(double));
yg = (double *) malloc ( (NY_GLOB+2*NGHOST)*sizeof(double));
for (i = 0; i < (NX_GLOB+2*NGHOST); i++) xg[i] = xbeg + (i-ibeg+1)*dx;
for (j = 0; j < (NY_GLOB+2*NGHOST); j++) yg[j] = ybeg + (j-jbeg+1)*dy;
#ifdef PARALLEL
x = xg + mpi_decomp.start[0];
y = yg + mpi_decomp.start[1];
#else
x = xg;
y = yg;
#endif
/* --------------------------------------------------------
3. Allocate memory on local processor and
assign initial conditions.
-------------------------------------------------------- */
phi = Allocate_2DdblArray(ny_tot, nx_tot);
phi0 = Allocate_2DdblArray(ny_tot, nx_tot);
for (j = jbeg; j <= jend; j++){
for (i = ibeg; i <= iend; i++){
phi0[j][i] = 0.0;
}}
#ifdef PARALLEL
MPI_Type_contiguous (nx_tot, MPI_DOUBLE, &row_type);
MPI_Type_vector (ny_tot, 1, nx_tot, MPI_DOUBLE, &col_type);
MPI_Type_commit (&row_type);
MPI_Type_commit (&col_type);
#endif
/* --------------------------------------------------------
4. Main iteration cycle
-------------------------------------------------------- */
tol = 1.e-5;
err = 1.0;
k = 0;
#pragma acc enter data copyin(phi[:ny_tot][:nx_tot], phi0[:ny_tot][:nx_tot], x[:NX_GLOB+2*NGHOST], y[NX_GLOB+2*NGHOST], err, err_glob)
while (err > tol){
/* -- 4a. Set boundary conditions first -- */
BoundaryConditions(phi0, x, y, nx, ny, &mpi_decomp);
/* -- 4b. Jacobi's method and residual (interior points) -- */
err = 0.0;
#pragma acc parallel loop collapse(2) reduction(+:err) present(phi[:ny_tot][:nx_tot], phi0[:ny_tot][:nx_tot])
for (j = jbeg; j <= jend; j++){
for (i = ibeg; i <= iend; i++){
phi[j][i] = 0.25*( phi0[j][i-1] + phi0[j][i+1]
+ phi0[j-1][i] + phi0[j+1][i] );
err += dx*dy*fabs(phi[j][i] - phi0[j][i]);
}}
#pragma acc parallel loop collapse(2) present(phi[:ny_tot][:nx_tot], phi0[:ny_tot][:nx_tot])
for (j = jbeg; j <= jend; j++){
for (i = ibeg; i <= iend; i++){
phi0[j][i] = phi[j][i];
}}
#ifdef PARALLEL
// double err_glob;
#pragma acc host_data use_device(err, err_glob)
{
MPI_Allreduce (&err, &err_glob, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
}
err = err_glob;
#endif
// #pragma acc update host(err)
if (rank == 0){
printf ("k = %d; err = %8.3e\n",k, err);
}
k++;
}
#pragma acc exit data copyout(phi[:ny_tot][:nx_tot], phi0[:ny_tot][:nx_tot], err, err_glob)
WriteSolution (phi, nx, ny, &mpi_decomp);
#ifdef PARALLEL
MPI_Finalize();
#endif
return 0;
}
#ifdef PARALLEL
/* ********************************************************************* */
void DomainDecomposition(MPI_Decomp *mpi_decomp)
/*
*
*********************************************************************** */
{
int dim, i;
int rank, size;
int *coords = mpi_decomp->coords;
int *gsize = mpi_decomp->gsize;
int *lsize = mpi_decomp->lsize;
int *nprocs = mpi_decomp->nprocs;
int *periods = mpi_decomp->periods;
int *procL = mpi_decomp->procL;
int *procR = mpi_decomp->procR;
int *start = mpi_decomp->start;
int new_coords[NDIM];
/* --------------------------------------------------------
1. Get rank & size
-------------------------------------------------------- */
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &size);
mpi_decomp->rank = rank;
mpi_decomp->size = size;
/* --------------------------------------------------------
2. Obtain number of processor along each dimension.
Use maximally squared decomp.
-------------------------------------------------------- */
nprocs[0] = (int)sqrt(size);
nprocs[1] = size/nprocs[0];
if (nprocs[0]*nprocs[1] != size){
if (rank == 0) printf ("! Cannot decompose\n");
MPI_Finalize();
exit(1);
}
if (rank == 0){
printf ("Decomposition achieved with %d X %d procs\n",nprocs[0],nprocs[1]);
}
periods[0] = 0;
periods[1] = 0;
/* --------------------------------------------------------
3. Create Cartesian topology
-------------------------------------------------------- */
MPI_Cart_create(MPI_COMM_WORLD, NDIM, nprocs, periods,
0, &MPI_COMM_CART);
MPI_Cart_get(MPI_COMM_CART, NDIM, nprocs, periods, coords);
/* --------------------------------------------------------
4. Fill structure members
-------------------------------------------------------- */
gsize[0] = NX_GLOB;
gsize[1] = NY_GLOB;
lsize[0] = NX_GLOB/nprocs[0];
lsize[1] = NY_GLOB/nprocs[1];
start[0] = coords[0]*lsize[0];
start[1] = coords[1]*lsize[1];
/* --------------------------------------------------------
5. Determine ranks of neighbour processors
-------------------------------------------------------- */
for (dim = 0; dim < NDIM; dim++) {
for (i = 0; i < NDIM; i++) new_coords[i] = coords[i];
new_coords[dim] = coords[dim] + 1;
if (new_coords[dim] < nprocs[dim]) {
MPI_Cart_rank ( MPI_COMM_CART, new_coords, &(procR[dim]) );
} else {
procR[dim] = MPI_PROC_NULL;
}
new_coords[dim] = coords[dim] - 1;
if (new_coords[dim] >= 0) {
MPI_Cart_rank ( MPI_COMM_CART, new_coords, &(procL[dim]) );
} else {
procL[dim] = MPI_PROC_NULL;
}
}
/* --------------------------------------------------------
6. Print processor information.
(Use MPI_Bcast() to print in sequence)
-------------------------------------------------------- */
int proc, go;
for (proc = 0; proc < size; proc++){
go = proc;
MPI_Bcast(&go, 1, MPI_INT, 0, MPI_COMM_WORLD);
if (rank == go) {
printf ("[Rank %d]\n",rank);
printf (" coords = [%d, %d], lsize = [%d, %d]\n",
coords[0], coords[1], lsize[0], lsize[1]);
for (dim = 0; dim < NDIM; dim++){
printf (" (procL, procR)[%d] = %d, %d\n", dim, procL[dim], procR[dim]);
}
}
MPI_Barrier(MPI_COMM_WORLD);
}
return;
}
#endif
/* ********************************************************************* */
void BoundaryConditions(double **phi, double *x, double *y,
int nx, int ny, MPI_Decomp *mpi_decomp)
/*
*********************************************************************** */
{
int i,j;
int ibeg = NGHOST;
int iend = ibeg + nx - 1;
int jbeg = NGHOST;
int jend = jbeg + ny - 1;
int *procL = mpi_decomp->procL;
int *procR = mpi_decomp->procR;
#ifdef PARALLEL
int rank = mpi_decomp->rank;
int size = mpi_decomp->size;
double send_buf[NX_GLOB + 2*NGHOST];
double recv_buf[NX_GLOB + 2*NGHOST];
/* Used for testing
for (j = 0; j <= jend+1; j++){
for (i = 0; i <= iend+1; i++){
phi[j][i] = -1;
}}
for (j = jbeg; j <= jend; j++){
for (i = ibeg; i <= iend; i++){
phi[j][i] = rank;
}}
*/
#pragma acc enter data create(send_buf[:NX_GLOB+2*NGHOST], recv_buf[NX_GLOB+2*NGHOST])
// Left buffer
i = ibeg;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], send_buf[:NX_GLOB+2*NGHOST])
for (j = jbeg; j <= jend; j++) send_buf[j] = phi[j][i];
#pragma acc host_data use_device(send_buf, recv_buf)
{
MPI_Sendrecv (send_buf, jend+1, MPI_DOUBLE, procL[0], 0,
recv_buf, jend+1, MPI_DOUBLE, procL[0], 0,
MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], recv_buf[NX_GLOB+2*NGHOST])
for (j = jbeg; j <= jend; j++) phi[j][i-1] = recv_buf[j];
// Right buffer
i = iend;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], send_buf[:NX_GLOB+2*NGHOST])
for (j = jbeg; j <= jend; j++) send_buf[j] = phi[j][i];
#pragma acc host_data use_device(send_buf, recv_buf)
{
MPI_Sendrecv (send_buf, jend+1, MPI_DOUBLE, procR[0], 0,
recv_buf, jend+1, MPI_DOUBLE, procR[0], 0,
MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], recv_buf[NX_GLOB+2*NGHOST])
for (j = jbeg; j <= jend; j++) phi[j][i+1] = recv_buf[j];
// Bottom buffer
j = jbeg;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], send_buf[:NX_GLOB+2*NGHOST])
for (i = ibeg; i <= iend; i++) send_buf[i] = phi[j][i];
// #pragma acc update self(send_buf[:NX_GLOB+2*NGHOST])
#pragma acc host_data use_device(send_buf, recv_buf)
{
MPI_Sendrecv (send_buf, iend+1, MPI_DOUBLE, procL[1], 0,
recv_buf, iend+1, MPI_DOUBLE, procL[1], 0,
MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], recv_buf[NX_GLOB+2*NGHOST])
for (i = ibeg; i <= iend; i++) phi[j-1][i] = recv_buf[i];
// Top buffer
j = jend;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], send_buf[:NX_GLOB+2*NGHOST])
for (i = ibeg; i <= iend; i++) send_buf[i] = phi[j][i];
#pragma acc host_data use_device(send_buf, recv_buf)
{
MPI_Sendrecv (send_buf, iend+1, MPI_DOUBLE, procR[1], 0,
recv_buf, iend+1, MPI_DOUBLE, procR[1], 0,
MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], recv_buf[NX_GLOB+2*NGHOST])
for (i = ibeg; i <= iend; i++) phi[j+1][i] = recv_buf[i];
#pragma acc exit data copyout(send_buf[:NX_GLOB+2*NGHOST], recv_buf[NX_GLOB+2*NGHOST])
#endif
/* -- Left -- */
if (procL[0] < 0){
i = ibeg-1;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], y[:NY_GLOB+2*NGHOST])
for (j = jbeg; j <= jend; j++) phi[j][i] = 1.0-y[j];
}
/* -- Right -- */
if (procR[0] < 0){
i = iend+1;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], y[:NY_GLOB+2*NGHOST])
for (j = jbeg; j <= jend; j++) phi[j][i] = y[j]*y[j];
}
/* -- Bottom -- */
if (procL[1] < 0){
j = jbeg-1;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], x[:NX_GLOB+2*NGHOST])
for (i = ibeg; i <= iend; i++) phi[j][i] = 1.0-x[i];
}
/* -- Top -- */
if (procR[1] < 0){
j = jend+1;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], x[:NX_GLOB+2*NGHOST])
for (i = ibeg; i <= iend; i++) phi[j][i] = x[i];
}
return;
#ifdef PARALLEL
// Print
MPI_Barrier(MPI_COMM_WORLD);
int go, proc;
for (proc = 0; proc < size; proc++){
go = proc;
MPI_Bcast(&go, 1, MPI_INT, 0, MPI_COMM_WORLD);
if (rank == go) {
printf ("Boundary [Rank %d]\n",rank);
for (j = jend+1; j >= 0; j--){
for (i = 0; i <= iend+1; i++){
printf ("%6.2f ", phi[j][i]);
}
printf ("\n");
}
}
}
MPI_Finalize();
exit(0);
#endif
}
/* ********************************************************************* */
void WriteSolution (double **phi, int nx, int ny, MPI_Decomp *md)
/*
*********************************************************************** */
{
int i,j;
int ibeg = NGHOST;
int iend = ibeg + nx - 1;
int jbeg = NGHOST;
int jend = jbeg + ny - 1;
static int nfile = 0;
char fname[32];
sprintf (fname,"laplace2D_MPIACC.txt",nfile);
/*
for (j = jbeg-1; j <= jend+1; j++) for (i = ibeg-1; i <= iend+1; i++) {
phi[j][i] = -1;
}
for (j = jbeg; j <= jend; j++) for (i = ibeg; i <= iend; i++) {
phi[j][i] = md->rank;
}
*/
#ifdef PARALLEL
MPI_File fh;
MPI_Datatype type_local, type_domain;
int amode = MPI_MODE_CREATE | MPI_MODE_WRONLY;
int gsize[2], lsize[2], start[2];
/* --------------------------------------------------------
1. Create a local array type without the ghost zones
This datatype will be passed to MPI_File_write()
-------------------------------------------------------- */
gsize[0] = md->lsize[0] + 2*NGHOST;
gsize[1] = md->lsize[1] + 2*NGHOST;
lsize[0] = md->lsize[0];
lsize[1] = md->lsize[1];
start[0] = NGHOST;
start[1] = NGHOST;
MPI_Type_create_subarray (NDIM, gsize, lsize, start,
MPI_ORDER_FORTRAN, MPI_DOUBLE, &type_local);
MPI_Type_commit (&type_local);
/* --------------------------------------------------------
2. Create the subarry in the global domain.
This datatype is used to set the file view.
-------------------------------------------------------- */
gsize[0] = NX_GLOB;
gsize[1] = NY_GLOB;
lsize[0] = md->lsize[0];
lsize[1] = md->lsize[1];
start[0] = lsize[0]*md->coords[0]; // equal to md->start[0]
start[1] = lsize[1]*md->coords[1]; // equal to md->start[1]
MPI_Type_create_subarray (NDIM, gsize, lsize, start,
MPI_ORDER_FORTRAN, MPI_DOUBLE, &type_domain);
MPI_Type_commit (&type_domain);
/* --------------------------------------------------------
3. Write to disk
-------------------------------------------------------- */
MPI_File_delete(fname, MPI_INFO_NULL);
MPI_File_open(MPI_COMM_CART, fname, amode, MPI_INFO_NULL, &fh);
MPI_File_set_view(fh, 0, MPI_DOUBLE, type_domain, "native", MPI_INFO_NULL);
MPI_File_write_all(fh, phi[0], 1, type_local, MPI_STATUS_IGNORE);
MPI_File_close(&fh);
MPI_Type_free (&type_local);
MPI_Type_free (&type_domain);
#else
FILE *fp;
printf ("> Writing %s\n",fname);
fp = fopen(fname, "wb");
for (j = jbeg; j <= jend; j++){
fwrite (phi[j] + ibeg, sizeof(double), nx, fp);
}
fclose(fp);
#endif
nfile++;
}
/* ********************************************************************* */
double **Allocate_2DdblArray(int nx, int ny)
/*
* Allocate memory for a double precision array with
* nx rows and ny columns
*********************************************************************** */
{
int i,j;
double **buf;
buf = (double **)malloc (nx*sizeof(double *));
buf[0] = (double *) malloc (nx*ny*sizeof(double));
for (j = 1; j < nx; j++) buf[j] = buf[j-1] + ny;
return buf;
}
/* ********************************************************************* */
int **Allocate_2DintArray(int nx, int ny)
/*
* Allocate memory for an integer-type array with
* nx rows and ny columns
*********************************************************************** */
{
int i,j;
int **buf;
buf = (int **)malloc (nx*sizeof(int *));
buf[0] = (int *) malloc (nx*ny*sizeof(int));
for (j = 1; j < nx; j++) buf[j] = buf[j-1] + ny;
return buf;
}
/* ********************************************************************* */
void Show_2DdblArray(double **A, int nx, int ny, const char *string)
/*
*********************************************************************** */
{
int i, j;
printf ("%s\n",string);
printf ("------------------------------\n");
for (i = 0; i < nx; i++) {
for (j = 0; j < ny; j++) {
printf ("%8.2f ", A[i][j]);
}
printf ("\n");
}
printf ("------------------------------\n");
}
/* ********************************************************************* */
void Show_2DintArray(int **A, int nx, int ny, const char *string)
/*
*********************************************************************** */
{
int i, j;
printf ("%s\n",string);
for (j = 0; j < ny; j++) printf ("-----");
printf ("\n");
for (i = 0; i < nx; i++) {
for (j = 0; j < ny; j++) {
printf ("%03d ", A[i][j]);
}
printf ("\n");
}
for (j = 0; j < ny; j++) printf ("-----");
printf ("\n");
}
感谢您更新示例。这里有一些问题。
首先,对于“err”和“err_glob”。在循环开始时,您在主机上设置“err=0”但不在设备上更新它。然后在 MPI_AllReduce 调用之后,您再次在主机上设置“err=err_glob”,因此需要更新“err_glob”。
第二个问题是,当 运行 具有多个等级时,代码会部分出现“y”错误。问题是您使用的是全局大小而不是“x”和“y”的局部大小,因此当您复制“y”时,由于偏移量,它会与“x”重叠。我通过将“xg”和“yg”复制到设备来解决这个问题。
至于相对于 CPU 的性能,这里的主要问题是尺寸小,因此代码严重利用 GPU。我将 GLOB 大小增加到 4096 并看到更好的相对性能,尽管代码收敛得更快。
我还冒昧地添加了一些用于设备分配排名的样板代码,以便代码可以利用多个 GPU。
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#define PARALLEL
#define NX_GLOB 128 /* Global number of interior points */
#define NY_GLOB 128 /* Global number of interior points */
#define NGHOST 1
#define NDIM 2
#ifdef PARALLEL
#include <mpi.h>
MPI_Comm MPI_COMM_CART;
#endif
#ifdef _OPENACC
#include <openacc.h>
#endif
typedef struct MPI_Decomp_{
int nprocs[NDIM]; /* Number of processors in each dimension */
int periods[NDIM]; /* Periodicity flag in each dimension */
int coords[NDIM]; /* Cartesian coordinate in the MPI topology */
int gsize[NDIM]; /* Global domain size (no ghosts) */
int lsize[NDIM]; /* Local domain size (no ghosts) */
int start[NDIM]; /* Local start index in each dimension */
int procL[NDIM]; /* Rank of left-lying process in each direction */
int procR[NDIM]; /* Rank of right-lying process in each direction */
int rank; /* Local process rank */
int size; /* Communicator size */
} MPI_Decomp;
void BoundaryConditions(double **, double *, double *, int, int, MPI_Decomp *);
void DomainDecomposition(MPI_Decomp *);
void WriteSolution (double **, int, int, MPI_Decomp *);
double **Allocate_2DdblArray(int, int);
int **Allocate_2DintArray(int, int);
void Show_2DdblArray(double **, int, int, const char *);
void Show_2DintArray(int **, int, int, const char *);
int nx_tot, ny_tot;
int main(int argc, char ** argv)
{
int nx, i, ibeg, iend;
int ny, j, jbeg, jend;
int k, rank=0, size=1;
int xsize,ysize;
double xbeg = 0.0, xend = 1.0;
double ybeg = 0.0, yend = 1.0;
double dx = (xend - xbeg)/(NX_GLOB + 1);
double dy = (yend - ybeg)/(NY_GLOB + 1);
double *xg, *yg, *x, *y, **phi, **phi0;
double err, tol;
MPI_Decomp mpi_decomp;
double err_glob;
int procL[NDIM] = {-1,-1};
int procR[NDIM] = {-1,-1};
/* --------------------------------------------------------
0. Initialize the MPI execution environment
-------------------------------------------------------- */
#ifdef PARALLEL
MPI_Datatype row_type, col_type;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &size);
DomainDecomposition(&mpi_decomp);
nx = mpi_decomp.lsize[0];
ny = mpi_decomp.lsize[1];
#else
mpi_decomp.gsize[0] = mpi_decomp.lsize[0] = nx = NX_GLOB;
mpi_decomp.gsize[1] = mpi_decomp.lsize[1] = ny = NY_GLOB;
mpi_decomp.procL[0] = mpi_decomp.procL[1] = -1;
mpi_decomp.procR[0] = mpi_decomp.procR[1] = -1;
#endif
#ifdef _OPENACC
/* -------------------------------------------------------
0. Set the device for each rank
------------------------------------------------------- */
int device_type, num_devices;
int gpuId;
MPI_Comm shmcomm;
int local_rank;
// Get the local rank number
MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0,
MPI_INFO_NULL, &shmcomm);
MPI_Comm_rank(shmcomm, &local_rank);
// Device num = local rank mod number of devices on the node
device_type = acc_get_device_type();
num_devices = acc_get_num_devices(device_type);
gpuId = local_rank % num_devices;
acc_set_device_num(gpuId, device_type);
acc_init(device_type);
#endif
/* --------------------------------------------------------
1. Set local grid indices
-------------------------------------------------------- */
ibeg = NGHOST;
iend = ibeg + nx - 1;
nx = iend - ibeg + 1;
nx_tot = nx + 2*NGHOST;
jbeg = NGHOST;
jend = jbeg + ny - 1;
ny = jend - jbeg + 1;
ny_tot = ny + 2*NGHOST;
/* --------------------------------------------------------
2. Generate global and local grids
-------------------------------------------------------- */
xg = (double *) malloc ( (NX_GLOB+2*NGHOST)*sizeof(double));
yg = (double *) malloc ( (NY_GLOB+2*NGHOST)*sizeof(double));
for (i = 0; i < (NX_GLOB+2*NGHOST); i++) xg[i] = xbeg + (i-ibeg+1)*dx;
for (j = 0; j < (NY_GLOB+2*NGHOST); j++) yg[j] = ybeg + (j-jbeg+1)*dy;
#pragma acc enter data copyin(xg[:NX_GLOB+2*NGHOST],yg[:NY_GLOB+2*NGHOST])
#ifdef PARALLEL
x = xg + mpi_decomp.start[0];
y = yg + mpi_decomp.start[1];
#else
x = xg;
y = yg;
#endif
/* --------------------------------------------------------
3. Allocate memory on local processor and
assign initial conditions.
-------------------------------------------------------- */
phi = Allocate_2DdblArray(ny_tot, nx_tot);
phi0 = Allocate_2DdblArray(ny_tot, nx_tot);
for (j = jbeg; j <= jend; j++){
for (i = ibeg; i <= iend; i++){
phi0[j][i] = 0.0;
}}
#ifdef PARALLEL
MPI_Type_contiguous (nx_tot, MPI_DOUBLE, &row_type);
MPI_Type_vector (ny_tot, 1, nx_tot, MPI_DOUBLE, &col_type);
MPI_Type_commit (&row_type);
MPI_Type_commit (&col_type);
#endif
/* --------------------------------------------------------
4. Main iteration cycle
-------------------------------------------------------- */
tol = 1.e-5;
err = 1.0;
k = 0;
//#pragma acc enter data copyin(phi[:ny_tot][:nx_tot], phi0[:ny_tot][:nx_tot], x[:NX_GLOB+2*NGHOST], y[:NX_GLOB+2*NGHOST])
#pragma acc enter data copyin(phi[:ny_tot][:nx_tot], phi0[:ny_tot][:nx_tot],err,err_glob)
while (err > tol){
/* -- 4a. Set boundary conditions first -- */
BoundaryConditions(phi0, x, y, nx, ny, &mpi_decomp);
/* -- 4b. Jacobi's method and residual (interior points) -- */
err = 0.0;
#pragma acc update device(err)
#pragma acc parallel loop collapse(2) reduction(+:err) present(phi[:ny_tot][:nx_tot], phi0[:ny_tot][:nx_tot])
for (j = jbeg; j <= jend; j++){
for (i = ibeg; i <= iend; i++){
phi[j][i] = 0.25*( phi0[j][i-1] + phi0[j][i+1]
+ phi0[j-1][i] + phi0[j+1][i] );
err += dx*dy*fabs(phi[j][i] - phi0[j][i]);
}}
#pragma acc parallel loop collapse(2) present(phi[:ny_tot][:nx_tot], phi0[:ny_tot][:nx_tot])
for (j = jbeg; j <= jend; j++){
for (i = ibeg; i <= iend; i++){
phi0[j][i] = phi[j][i];
}}
#ifdef PARALLEL
// double err_glob;
#pragma acc host_data use_device(err, err_glob)
{
MPI_Allreduce (&err, &err_glob, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
}
#pragma acc update host(err_glob)
err = err_glob;
#endif
if (rank == 0){
printf ("k = %d; err = %8.3e\n",k, err);
}
k++;
}
#pragma acc exit data copyout(phi[:ny_tot][:nx_tot], phi0[:ny_tot][:nx_tot],err,err_glob)
WriteSolution (phi, nx, ny, &mpi_decomp);
#ifdef PARALLEL
MPI_Finalize();
#endif
return 0;
}
#ifdef PARALLEL
/* ********************************************************************* */
void DomainDecomposition(MPI_Decomp *mpi_decomp)
/*
*
*********************************************************************** */
{
int dim, i;
int rank, size;
int *coords = mpi_decomp->coords;
int *gsize = mpi_decomp->gsize;
int *lsize = mpi_decomp->lsize;
int *nprocs = mpi_decomp->nprocs;
int *periods = mpi_decomp->periods;
int *procL = mpi_decomp->procL;
int *procR = mpi_decomp->procR;
int *start = mpi_decomp->start;
int new_coords[NDIM];
/* --------------------------------------------------------
1. Get rank & size
-------------------------------------------------------- */
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &size);
mpi_decomp->rank = rank;
mpi_decomp->size = size;
/* --------------------------------------------------------
2. Obtain number of processor along each dimension.
Use maximally squared decomp.
-------------------------------------------------------- */
nprocs[0] = (int)sqrt(size);
nprocs[1] = size/nprocs[0];
if (nprocs[0]*nprocs[1] != size){
if (rank == 0) printf ("! Cannot decompose\n");
MPI_Finalize();
exit(1);
}
if (rank == 0){
printf ("Decomposition achieved with %d X %d procs\n",nprocs[0],nprocs[1]);
}
periods[0] = 0;
periods[1] = 0;
/* --------------------------------------------------------
3. Create Cartesian topology
-------------------------------------------------------- */
MPI_Cart_create(MPI_COMM_WORLD, NDIM, nprocs, periods,
0, &MPI_COMM_CART);
MPI_Cart_get(MPI_COMM_CART, NDIM, nprocs, periods, coords);
/* --------------------------------------------------------
4. Fill structure members
-------------------------------------------------------- */
gsize[0] = NX_GLOB;
gsize[1] = NY_GLOB;
lsize[0] = NX_GLOB/nprocs[0];
lsize[1] = NY_GLOB/nprocs[1];
start[0] = coords[0]*lsize[0];
start[1] = coords[1]*lsize[1];
/* --------------------------------------------------------
5. Determine ranks of neighbour processors
-------------------------------------------------------- */
for (dim = 0; dim < NDIM; dim++) {
for (i = 0; i < NDIM; i++) new_coords[i] = coords[i];
new_coords[dim] = coords[dim] + 1;
if (new_coords[dim] < nprocs[dim]) {
MPI_Cart_rank ( MPI_COMM_CART, new_coords, &(procR[dim]) );
} else {
procR[dim] = MPI_PROC_NULL;
}
new_coords[dim] = coords[dim] - 1;
if (new_coords[dim] >= 0) {
MPI_Cart_rank ( MPI_COMM_CART, new_coords, &(procL[dim]) );
} else {
procL[dim] = MPI_PROC_NULL;
}
}
/* --------------------------------------------------------
6. Print processor information.
(Use MPI_Bcast() to print in sequence)
-------------------------------------------------------- */
int proc, go;
for (proc = 0; proc < size; proc++){
go = proc;
MPI_Bcast(&go, 1, MPI_INT, 0, MPI_COMM_WORLD);
if (rank == go) {
printf ("[Rank %d]\n",rank);
printf (" coords = [%d, %d], lsize = [%d, %d]\n",
coords[0], coords[1], lsize[0], lsize[1]);
for (dim = 0; dim < NDIM; dim++){
printf (" (procL, procR)[%d] = %d, %d\n", dim, procL[dim], procR[dim]);
}
}
MPI_Barrier(MPI_COMM_WORLD);
}
return;
}
#endif
/* ********************************************************************* */
void BoundaryConditions(double **phi, double *x, double *y,
int nx, int ny, MPI_Decomp *mpi_decomp)
/*
*********************************************************************** */
{
int i,j;
int ibeg = NGHOST;
int iend = ibeg + nx - 1;
int jbeg = NGHOST;
int jend = jbeg + ny - 1;
int *procL = mpi_decomp->procL;
int *procR = mpi_decomp->procR;
#ifdef PARALLEL
int rank = mpi_decomp->rank;
int size = mpi_decomp->size;
double send_buf[NX_GLOB + 2*NGHOST];
double recv_buf[NX_GLOB + 2*NGHOST];
/* Used for testing
for (j = 0; j <= jend+1; j++){
for (i = 0; i <= iend+1; i++){
phi[j][i] = -1;
}}
for (j = jbeg; j <= jend; j++){
for (i = ibeg; i <= iend; i++){
phi[j][i] = rank;
}}
*/
#pragma acc enter data create(send_buf[:NX_GLOB+2*NGHOST], recv_buf[NX_GLOB+2*NGHOST])
// Left buffer
i = ibeg;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], send_buf[:NX_GLOB+2*NGHOST])
for (j = jbeg; j <= jend; j++) send_buf[j] = phi[j][i];
#pragma acc host_data use_device(send_buf, recv_buf)
{
MPI_Sendrecv (send_buf, jend+1, MPI_DOUBLE, procL[0], 0,
recv_buf, jend+1, MPI_DOUBLE, procL[0], 0,
MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], recv_buf[NX_GLOB+2*NGHOST])
for (j = jbeg; j <= jend; j++) phi[j][i-1] = recv_buf[j];
// Right buffer
i = iend;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], send_buf[:NX_GLOB+2*NGHOST])
for (j = jbeg; j <= jend; j++) send_buf[j] = phi[j][i];
#pragma acc host_data use_device(send_buf, recv_buf)
{
MPI_Sendrecv (send_buf, jend+1, MPI_DOUBLE, procR[0], 0,
recv_buf, jend+1, MPI_DOUBLE, procR[0], 0,
MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], recv_buf[NX_GLOB+2*NGHOST])
for (j = jbeg; j <= jend; j++) phi[j][i+1] = recv_buf[j];
// Bottom buffer
j = jbeg;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], send_buf[:NX_GLOB+2*NGHOST])
for (i = ibeg; i <= iend; i++) send_buf[i] = phi[j][i];
// #pragma acc update self(send_buf[:NX_GLOB+2*NGHOST])
#pragma acc host_data use_device(send_buf, recv_buf)
{
MPI_Sendrecv (send_buf, iend+1, MPI_DOUBLE, procL[1], 0,
recv_buf, iend+1, MPI_DOUBLE, procL[1], 0,
MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], recv_buf[NX_GLOB+2*NGHOST])
for (i = ibeg; i <= iend; i++) phi[j-1][i] = recv_buf[i];
// Top buffer
j = jend;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], send_buf[:NX_GLOB+2*NGHOST])
for (i = ibeg; i <= iend; i++) send_buf[i] = phi[j][i];
#pragma acc host_data use_device(send_buf, recv_buf)
{
MPI_Sendrecv (send_buf, iend+1, MPI_DOUBLE, procR[1], 0,
recv_buf, iend+1, MPI_DOUBLE, procR[1], 0,
MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], recv_buf[NX_GLOB+2*NGHOST])
for (i = ibeg; i <= iend; i++) phi[j+1][i] = recv_buf[i];
#pragma acc exit data copyout(send_buf[:NX_GLOB+2*NGHOST], recv_buf[NX_GLOB+2*NGHOST])
#endif
/* -- Left -- */
if (procL[0] < 0){
i = ibeg-1;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], y)
for (j = jbeg; j <= jend; j++) phi[j][i] = 1.0-y[j];
}
/* -- Right -- */
if (procR[0] < 0){
i = iend+1;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], y)
for (j = jbeg; j <= jend; j++) phi[j][i] = y[j]*y[j];
}
/* -- Bottom -- */
if (procL[1] < 0){
j = jbeg-1;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], x)
for (i = ibeg; i <= iend; i++) phi[j][i] = 1.0-x[i];
}
/* -- Top -- */
if (procR[1] < 0){
j = jend+1;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], x)
for (i = ibeg; i <= iend; i++) phi[j][i] = x[i];
}
return;
#ifdef PARALLEL
// Print
MPI_Barrier(MPI_COMM_WORLD);
int go, proc;
for (proc = 0; proc < size; proc++){
go = proc;
MPI_Bcast(&go, 1, MPI_INT, 0, MPI_COMM_WORLD);
if (rank == go) {
printf ("Boundary [Rank %d]\n",rank);
for (j = jend+1; j >= 0; j--){
for (i = 0; i <= iend+1; i++){
printf ("%6.2f ", phi[j][i]);
}
printf ("\n");
}
}
}
MPI_Finalize();
exit(0);
#endif
}
/* ********************************************************************* */
void WriteSolution (double **phi, int nx, int ny, MPI_Decomp *md)
/*
*********************************************************************** */
{
int i,j;
int ibeg = NGHOST;
int iend = ibeg + nx - 1;
int jbeg = NGHOST;
int jend = jbeg + ny - 1;
static int nfile = 0;
char fname[32];
sprintf (fname,"laplace2D_MPIACC.txt",nfile);
/*
for (j = jbeg-1; j <= jend+1; j++) for (i = ibeg-1; i <= iend+1; i++) {
phi[j][i] = -1;
}
for (j = jbeg; j <= jend; j++) for (i = ibeg; i <= iend; i++) {
phi[j][i] = md->rank;
}
*/
#ifdef PARALLEL
MPI_File fh;
MPI_Datatype type_local, type_domain;
int amode = MPI_MODE_CREATE | MPI_MODE_WRONLY;
int gsize[2], lsize[2], start[2];
/* --------------------------------------------------------
1. Create a local array type without the ghost zones
This datatype will be passed to MPI_File_write()
-------------------------------------------------------- */
gsize[0] = md->lsize[0] + 2*NGHOST;
gsize[1] = md->lsize[1] + 2*NGHOST;
lsize[0] = md->lsize[0];
lsize[1] = md->lsize[1];
start[0] = NGHOST;
start[1] = NGHOST;
MPI_Type_create_subarray (NDIM, gsize, lsize, start,
MPI_ORDER_FORTRAN, MPI_DOUBLE, &type_local);
MPI_Type_commit (&type_local);
/* --------------------------------------------------------
2. Create the subarry in the global domain.
This datatype is used to set the file view.
-------------------------------------------------------- */
gsize[0] = NX_GLOB;
gsize[1] = NY_GLOB;
lsize[0] = md->lsize[0];
lsize[1] = md->lsize[1];
start[0] = lsize[0]*md->coords[0]; // equal to md->start[0]
start[1] = lsize[1]*md->coords[1]; // equal to md->start[1]
MPI_Type_create_subarray (NDIM, gsize, lsize, start,
MPI_ORDER_FORTRAN, MPI_DOUBLE, &type_domain);
MPI_Type_commit (&type_domain);
/* --------------------------------------------------------
3. Write to disk
-------------------------------------------------------- */
MPI_File_delete(fname, MPI_INFO_NULL);
MPI_File_open(MPI_COMM_CART, fname, amode, MPI_INFO_NULL, &fh);
MPI_File_set_view(fh, 0, MPI_DOUBLE, type_domain, "native", MPI_INFO_NULL);
MPI_File_write_all(fh, phi[0], 1, type_local, MPI_STATUS_IGNORE);
MPI_File_close(&fh);
MPI_Type_free (&type_local);
MPI_Type_free (&type_domain);
#else
FILE *fp;
printf ("> Writing %s\n",fname);
fp = fopen(fname, "wb");
for (j = jbeg; j <= jend; j++){
fwrite (phi[j] + ibeg, sizeof(double), nx, fp);
}
fclose(fp);
#endif
nfile++;
}
/* ********************************************************************* */
double **Allocate_2DdblArray(int nx, int ny)
/*
* Allocate memory for a double precision array with
* nx rows and ny columns
*********************************************************************** */
{
int i,j;
double **buf;
buf = (double **)malloc (nx*sizeof(double *));
buf[0] = (double *) malloc (nx*ny*sizeof(double));
for (j = 1; j < nx; j++) buf[j] = buf[j-1] + ny;
return buf;
}
/* ********************************************************************* */
int **Allocate_2DintArray(int nx, int ny)
/*
* Allocate memory for an integer-type array with
* nx rows and ny columns
*********************************************************************** */
{
int i,j;
int **buf;
buf = (int **)malloc (nx*sizeof(int *));
buf[0] = (int *) malloc (nx*ny*sizeof(int));
for (j = 1; j < nx; j++) buf[j] = buf[j-1] + ny;
return buf;
}
/* ********************************************************************* */
void Show_2DdblArray(double **A, int nx, int ny, const char *string)
/*
*********************************************************************** */
{
int i, j;
printf ("%s\n",string);
printf ("------------------------------\n");
for (i = 0; i < nx; i++) {
for (j = 0; j < ny; j++) {
printf ("%8.2f ", A[i][j]);
}
printf ("\n");
}
printf ("------------------------------\n");
}
/* ********************************************************************* */
void Show_2DintArray(int **A, int nx, int ny, const char *string)
/*
*********************************************************************** */
{
int i, j;
printf ("%s\n",string);
for (j = 0; j < ny; j++) printf ("-----");
printf ("\n");
for (i = 0; i < nx; i++) {
for (j = 0; j < ny; j++) {
printf ("%03d ", A[i][j]);
}
printf ("\n");
}
for (j = 0; j < ny; j++) printf ("-----");
printf ("\n");
}