OpenACC and CUDA aware MPI

I want to move the entire while loop in main onto the device. The problem appears when I add #pragma acc host_data use_device(err) to the MPI_Allreduce (&err, &err, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); call.
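
My understanding of host_data use_device is that it exposes the device address of a mapped variable to host code, which is what lets a CUDA-aware MPI operate directly on GPU memory. A minimal standalone sketch of just that directive (not part of the Jacobi code below):

#include <stdio.h>

int main(void)
{
  double a = 3.0;

  #pragma acc enter data copyin(a)

  #pragma acc host_data use_device(a)
  {
    /* When compiled with OpenACC enabled, &a inside this block is the
       device copy's address, which is what a CUDA-aware MPI call receives. */
    printf("address handed to MPI would be: %p\n", (void *)&a);
  }

  #pragma acc exit data delete(a)
  return 0;
}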

The problem is that the reduction on err no longer works, so the code exits after a single iteration of the loop.

After MPI_Allreduce(), err is still zero, even with #pragma acc update self(err).

I compile with mpicc -acc -ta=tesla:managed -Minfo=accel -w jacobi.c and run with mpirun -np 2 -mca pml ^ucx ./a.out

Can you help me find the error?

#include <stdio.h>
#include <stdlib.h>
#include <math.h>

#define PARALLEL
#define NX_GLOB     128   /* Global number of interior points */
#define NY_GLOB     128  /* Global number of interior points */
#define NGHOST   1
#define NDIM     2

#ifdef PARALLEL
  #include <mpi.h>
  MPI_Comm MPI_COMM_CART;
#endif

typedef struct MPI_Decomp_{
  int nprocs[NDIM];     /*  Number of processors in each dimension */
  int periods[NDIM];    /*  Periodicity flag in each dimension     */
  int coords[NDIM];     /*  Cartesian coordinate in the MPI topology */
  int gsize[NDIM];      /*  Global domain size (no ghosts)  */
  int lsize[NDIM];      /*  Local domain size (no ghosts)   */
  int start[NDIM];      /*  Local start index in each dimension           */
  int procL[NDIM];      /*  Rank of left-lying  process in each direction */
  int procR[NDIM];      /*  Rank of right-lying process in each direction */
  int rank;             /*  Local process rank */
  int size;             /*  Communicator size  */
} MPI_Decomp;



void BoundaryConditions(double **, double *, double *, int, int, MPI_Decomp *);
void DomainDecomposition(MPI_Decomp *);
void WriteSolution (double **, int, int, MPI_Decomp *);
double **Allocate_2DdblArray(int, int);
int    **Allocate_2DintArray(int, int);
void   Show_2DdblArray(double **, int, int, const char *);
void   Show_2DintArray(int **, int, int, const char *);

int nx_tot, ny_tot;

int main(int argc, char ** argv)
{
  int    nx, i, ibeg, iend;
  int    ny, j, jbeg, jend;
  int    k, rank=0, size=1;
  double xbeg = 0.0, xend = 1.0;
  double ybeg = 0.0, yend = 1.0;
  double dx = (xend - xbeg)/(NX_GLOB + 1);
  double dy = (yend - ybeg)/(NY_GLOB + 1);
  double *xg, *yg, *x, *y, **phi, **phi0;
  double err, tol;
  MPI_Decomp  mpi_decomp;
  double err_glob;
  int procL[NDIM] = {-1,-1};
  int procR[NDIM] = {-1,-1};

/* --------------------------------------------------------
   0. Initialize the MPI execution environment
   -------------------------------------------------------- */

#ifdef PARALLEL
  MPI_Datatype row_type, col_type;

  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  DomainDecomposition(&mpi_decomp);
  nx = mpi_decomp.lsize[0];
  ny = mpi_decomp.lsize[1];
#else
  mpi_decomp.gsize[0] = mpi_decomp.lsize[0] = nx = NX_GLOB;
  mpi_decomp.gsize[1] = mpi_decomp.lsize[1] = ny = NY_GLOB;
  mpi_decomp.procL[0] = mpi_decomp.procL[1] = -1;
  mpi_decomp.procR[0] = mpi_decomp.procR[1] = -1;
#endif

/* --------------------------------------------------------
   1. Set local grid indices
   -------------------------------------------------------- */

  ibeg   = NGHOST;
  iend   = ibeg + nx - 1;
  nx     = iend - ibeg + 1;
  nx_tot = nx + 2*NGHOST;

  jbeg   = NGHOST;
  jend   = jbeg + ny - 1;
  ny     = jend - jbeg + 1;
  ny_tot = ny + 2*NGHOST;

/* --------------------------------------------------------
   2. Generate global and local grids
   -------------------------------------------------------- */

  xg = (double *) malloc ( (NX_GLOB+2*NGHOST)*sizeof(double));
  yg = (double *) malloc ( (NY_GLOB+2*NGHOST)*sizeof(double));
  for (i = 0; i < (NX_GLOB+2*NGHOST); i++) xg[i] = xbeg + (i-ibeg+1)*dx;
  for (j = 0; j < (NY_GLOB+2*NGHOST); j++) yg[j] = ybeg + (j-jbeg+1)*dy;
  #ifdef PARALLEL
  x = xg + mpi_decomp.start[0];
  y = yg + mpi_decomp.start[1];
  #else
  x = xg;
  y = yg;
  #endif

/* --------------------------------------------------------
   3. Allocate memory on local processor and
      assign initial conditions.
   -------------------------------------------------------- */

  phi  = Allocate_2DdblArray(ny_tot, nx_tot);
  phi0 = Allocate_2DdblArray(ny_tot, nx_tot);

  for (j = jbeg; j <= jend; j++){
  for (i = ibeg; i <= iend; i++){
    phi0[j][i] = 0.0;
  }}

#ifdef PARALLEL
  MPI_Type_contiguous (nx_tot, MPI_DOUBLE, &row_type);
  MPI_Type_vector     (ny_tot, 1, nx_tot, MPI_DOUBLE, &col_type);
  MPI_Type_commit (&row_type);
  MPI_Type_commit (&col_type);
#endif

/* --------------------------------------------------------
   4. Main iteration cycle
   -------------------------------------------------------- */

  tol = 1.e-5;
  err = 1.0;
  k   = 0;

  #pragma acc enter data copyin(phi[:ny_tot][:nx_tot], phi0[:ny_tot][:nx_tot], x[:NX_GLOB+2*NGHOST], y[NX_GLOB+2*NGHOST], err, err_glob)
  while  (err > tol){

  /* -- 4a. Set boundary conditions first -- */

    BoundaryConditions(phi0, x, y, nx, ny, &mpi_decomp);

  /* -- 4b. Jacobi's method and residual (interior points) -- */

    err = 0.0;

    #pragma acc parallel loop collapse(2) reduction(+:err) present(phi[:ny_tot][:nx_tot], phi0[:ny_tot][:nx_tot])
    for (j = jbeg; j <= jend; j++){
    for (i = ibeg; i <= iend; i++){
      phi[j][i] = 0.25*(  phi0[j][i-1] + phi0[j][i+1]
                        + phi0[j-1][i] + phi0[j+1][i] );

      err += dx*dy*fabs(phi[j][i] - phi0[j][i]);
    }}


    #pragma acc parallel loop collapse(2) present(phi[:ny_tot][:nx_tot], phi0[:ny_tot][:nx_tot])
    for (j = jbeg; j <= jend; j++){
    for (i = ibeg; i <= iend; i++){
      phi0[j][i] = phi[j][i];
    }}

    #ifdef PARALLEL
    // double err_glob;
    #pragma acc host_data use_device(err, err_glob)
    {
    MPI_Allreduce (&err, &err_glob, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    }
    err = err_glob;
    #endif

    // #pragma acc update host(err)
    if (rank == 0){
      printf ("k = %d; err = %8.3e\n",k, err);
    }
    k++;
  }
  #pragma acc exit data copyout(phi[:ny_tot][:nx_tot], phi0[:ny_tot][:nx_tot], err, err_glob)

  WriteSolution (phi, nx, ny, &mpi_decomp);

  #ifdef PARALLEL
  MPI_Finalize();
  #endif
  return 0;
}

#ifdef PARALLEL
/* ********************************************************************* */
void DomainDecomposition(MPI_Decomp *mpi_decomp)
/*
 *
 *********************************************************************** */
{
  int dim, i;
  int rank, size;
  int *coords  = mpi_decomp->coords;
  int *gsize   = mpi_decomp->gsize;
  int *lsize   = mpi_decomp->lsize;
  int *nprocs  = mpi_decomp->nprocs;
  int *periods = mpi_decomp->periods;
  int *procL   = mpi_decomp->procL;
  int *procR   = mpi_decomp->procR;
  int *start   = mpi_decomp->start;
  int new_coords[NDIM];

/* --------------------------------------------------------
   1. Get rank & size
   -------------------------------------------------------- */

  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  mpi_decomp->rank = rank;
  mpi_decomp->size = size;

/* --------------------------------------------------------
   2. Obtain the number of processors along each dimension.
      Use maximally squared decomp.
   -------------------------------------------------------- */

  nprocs[0] = (int)sqrt(size);
  nprocs[1] = size/nprocs[0];
  if (nprocs[0]*nprocs[1] != size){
    if (rank == 0) printf ("! Cannot decompose\n");
    MPI_Finalize();
    exit(1);
  }
  if (rank == 0){
    printf ("Decomposition achieved with %d X %d procs\n",nprocs[0],nprocs[1]);
  }

  periods[0] = 0;
  periods[1] = 0;

/* --------------------------------------------------------
   3. Create Cartesian topology
   -------------------------------------------------------- */

  MPI_Cart_create(MPI_COMM_WORLD, NDIM, nprocs, periods,
                                        0, &MPI_COMM_CART);
  MPI_Cart_get(MPI_COMM_CART, NDIM, nprocs, periods, coords);

/* --------------------------------------------------------
   4. Fill structure members
   -------------------------------------------------------- */

  gsize[0] = NX_GLOB;
  gsize[1] = NY_GLOB;
  lsize[0] = NX_GLOB/nprocs[0];
  lsize[1] = NY_GLOB/nprocs[1];
  start[0] = coords[0]*lsize[0];
  start[1] = coords[1]*lsize[1];

/* --------------------------------------------------------
   5. Determine ranks of neighbour processors
   -------------------------------------------------------- */

  for (dim = 0; dim < NDIM; dim++) {
    for (i = 0; i < NDIM; i++) new_coords[i] = coords[i];

    new_coords[dim] = coords[dim] + 1;
    if (new_coords[dim] < nprocs[dim]) {
      MPI_Cart_rank ( MPI_COMM_CART, new_coords, &(procR[dim]) );
    } else {
      procR[dim] = MPI_PROC_NULL;
    }

    new_coords[dim] = coords[dim] - 1;
    if (new_coords[dim] >= 0) {
     MPI_Cart_rank ( MPI_COMM_CART, new_coords, &(procL[dim]) );
    } else {
      procL[dim] = MPI_PROC_NULL;
    }
  }

/* --------------------------------------------------------
   6. Print processor information.
      (Use MPI_Bcast() to print in sequence)
   -------------------------------------------------------- */

  int proc, go;
  for (proc = 0; proc < size; proc++){
    go = proc;
    MPI_Bcast(&go, 1, MPI_INT, 0, MPI_COMM_WORLD);
    if (rank == go) {
      printf ("[Rank %d]\n",rank);
      printf ("  coords = [%d, %d], lsize = [%d, %d]\n",
                 coords[0], coords[1], lsize[0], lsize[1]);
      for (dim = 0; dim < NDIM; dim++){
        printf ("  (procL, procR)[%d] = %d, %d\n", dim, procL[dim], procR[dim]);
      }
    }
    MPI_Barrier(MPI_COMM_WORLD);
  }

  return;
}
#endif

/* ********************************************************************* */
void BoundaryConditions(double **phi, double *x, double *y,
                        int nx, int ny, MPI_Decomp *mpi_decomp)
/*
 *********************************************************************** */
{
  int i,j;
  int ibeg   = NGHOST;
  int iend   = ibeg + nx - 1;

  int jbeg   = NGHOST;
  int jend   = jbeg + ny - 1;

  int *procL = mpi_decomp->procL;
  int *procR = mpi_decomp->procR;
#ifdef PARALLEL
  int rank = mpi_decomp->rank;
  int size = mpi_decomp->size;
  double send_buf[NX_GLOB + 2*NGHOST];
  double recv_buf[NX_GLOB + 2*NGHOST];

/*  Used for testing
    for (j = 0; j <= jend+1; j++){
    for (i = 0; i <= iend+1; i++){
      phi[j][i] = -1;
    }}

    for (j = jbeg; j <= jend; j++){
    for (i = ibeg; i <= iend; i++){
      phi[j][i] = rank;
    }}
*/

  #pragma acc enter data create(send_buf[:NX_GLOB+2*NGHOST], recv_buf[NX_GLOB+2*NGHOST])

  // Left buffer
  i = ibeg;

  #pragma acc parallel loop present(phi[:ny_tot][:nx_tot], send_buf[:NX_GLOB+2*NGHOST])
  for (j = jbeg; j <= jend; j++) send_buf[j] = phi[j][i];
  #pragma acc host_data use_device(send_buf, recv_buf)
  {
  MPI_Sendrecv (send_buf, jend+1, MPI_DOUBLE, procL[0], 0,
                recv_buf, jend+1, MPI_DOUBLE, procL[0], 0,
                MPI_COMM_WORLD, MPI_STATUS_IGNORE);
  }
  #pragma acc parallel loop present(phi[:ny_tot][:nx_tot], recv_buf[NX_GLOB+2*NGHOST])
  for (j = jbeg; j <= jend; j++) phi[j][i-1] = recv_buf[j];

  // Right buffer
  i = iend;
  #pragma acc parallel loop present(phi[:ny_tot][:nx_tot], send_buf[:NX_GLOB+2*NGHOST])
  for (j = jbeg; j <= jend; j++) send_buf[j] = phi[j][i];
  #pragma acc host_data use_device(send_buf, recv_buf)
  {
  MPI_Sendrecv (send_buf, jend+1, MPI_DOUBLE, procR[0], 0,
                recv_buf, jend+1, MPI_DOUBLE, procR[0], 0,
                MPI_COMM_WORLD, MPI_STATUS_IGNORE);
  }
  #pragma acc parallel loop present(phi[:ny_tot][:nx_tot], recv_buf[NX_GLOB+2*NGHOST])
  for (j = jbeg; j <= jend; j++) phi[j][i+1] = recv_buf[j];


  // Bottom buffer
  j = jbeg;
  #pragma acc parallel loop present(phi[:ny_tot][:nx_tot], send_buf[:NX_GLOB+2*NGHOST])
  for (i = ibeg; i <= iend; i++) send_buf[i] = phi[j][i];
  // #pragma acc update self(send_buf[:NX_GLOB+2*NGHOST])
  #pragma acc host_data use_device(send_buf, recv_buf)
  {
  MPI_Sendrecv (send_buf, iend+1, MPI_DOUBLE, procL[1], 0,
                  recv_buf, iend+1, MPI_DOUBLE, procL[1], 0,
                  MPI_COMM_WORLD, MPI_STATUS_IGNORE);
  }
  #pragma acc parallel loop present(phi[:ny_tot][:nx_tot], recv_buf[NX_GLOB+2*NGHOST])
  for (i = ibeg; i <= iend; i++) phi[j-1][i] = recv_buf[i];

  // Top buffer
  j = jend;
  #pragma acc parallel loop present(phi[:ny_tot][:nx_tot], send_buf[:NX_GLOB+2*NGHOST])
  for (i = ibeg; i <= iend; i++) send_buf[i] = phi[j][i];
  #pragma acc host_data use_device(send_buf, recv_buf)
  {
  MPI_Sendrecv (send_buf, iend+1, MPI_DOUBLE, procR[1], 0,
                recv_buf, iend+1, MPI_DOUBLE, procR[1], 0,
                MPI_COMM_WORLD, MPI_STATUS_IGNORE);
  }
  #pragma acc parallel loop present(phi[:ny_tot][:nx_tot], recv_buf[NX_GLOB+2*NGHOST])
  for (i = ibeg; i <= iend; i++) phi[j+1][i] = recv_buf[i];

  #pragma acc exit data copyout(send_buf[:NX_GLOB+2*NGHOST], recv_buf[NX_GLOB+2*NGHOST])

#endif

/* -- Left -- */

  if (procL[0] < 0){
    i = ibeg-1;
    #pragma acc parallel loop present(phi[:ny_tot][:nx_tot], y[:NY_GLOB+2*NGHOST])
    for (j = jbeg; j <= jend; j++) phi[j][i] = 1.0-y[j];
  }

/* -- Right -- */

  if (procR[0] < 0){
    i = iend+1;
    #pragma acc parallel loop present(phi[:ny_tot][:nx_tot], y[:NY_GLOB+2*NGHOST])
    for (j = jbeg; j <= jend; j++) phi[j][i] = y[j]*y[j];
  }

/* -- Bottom -- */

  if (procL[1] < 0){
    j = jbeg-1;
    #pragma acc parallel loop present(phi[:ny_tot][:nx_tot], x[:NX_GLOB+2*NGHOST])
    for (i = ibeg; i <= iend; i++) phi[j][i] = 1.0-x[i];
  }

/* -- Top -- */

  if (procR[1] < 0){
    j = jend+1;
    #pragma acc parallel loop present(phi[:ny_tot][:nx_tot], x[:NX_GLOB+2*NGHOST])
    for (i = ibeg; i <= iend; i++) phi[j][i] = x[i];
  }

  return;

#ifdef PARALLEL
// Print
  MPI_Barrier(MPI_COMM_WORLD);
  int go, proc;
  for (proc = 0; proc < size; proc++){
    go = proc;
    MPI_Bcast(&go, 1, MPI_INT, 0, MPI_COMM_WORLD);

    if (rank == go) {
      printf ("Boundary [Rank %d]\n",rank);
      for (j = jend+1; j >= 0; j--){
        for (i = 0; i <= iend+1; i++){
          printf ("%6.2f  ", phi[j][i]);
        }
        printf ("\n");
      }
    }
  }

MPI_Finalize();
exit(0);
#endif
}

/* ********************************************************************* */
void WriteSolution (double **phi, int nx, int ny, MPI_Decomp *md)
/*
 *********************************************************************** */
{
  int i,j;
  int ibeg   = NGHOST;
  int iend   = ibeg + nx - 1;

  int jbeg   = NGHOST;
  int jend   = jbeg + ny - 1;

  static int nfile = 0;
  char fname[32];

  sprintf (fname,"laplace2D_MPIACC.txt",nfile);

/*
for (j = jbeg-1; j <= jend+1; j++) for (i = ibeg-1; i <= iend+1; i++) {
  phi[j][i] = -1;
}
for (j = jbeg; j <= jend; j++) for (i = ibeg; i <= iend; i++) {
  phi[j][i] = md->rank;
}
*/
#ifdef PARALLEL
  MPI_File fh;
  MPI_Datatype type_local, type_domain;
  int amode = MPI_MODE_CREATE | MPI_MODE_WRONLY;
  int gsize[2], lsize[2], start[2];

/* --------------------------------------------------------
   1. Create a local array type without the ghost zones
      This datatype will be passed to MPI_File_write()
   -------------------------------------------------------- */

  gsize[0] = md->lsize[0] + 2*NGHOST;
  gsize[1] = md->lsize[1] + 2*NGHOST;

  lsize[0] = md->lsize[0];
  lsize[1] = md->lsize[1];

  start[0] = NGHOST;
  start[1] = NGHOST;

  MPI_Type_create_subarray (NDIM, gsize, lsize, start,
                            MPI_ORDER_FORTRAN, MPI_DOUBLE, &type_local);
  MPI_Type_commit (&type_local);

/* --------------------------------------------------------
   2. Create the subarray in the global domain.
      This datatype is used to set the file view.
   -------------------------------------------------------- */

  gsize[0] = NX_GLOB;
  gsize[1] = NY_GLOB;

  lsize[0] = md->lsize[0];
  lsize[1] = md->lsize[1];

  start[0] = lsize[0]*md->coords[0];  // equal to md->start[0]
  start[1] = lsize[1]*md->coords[1];  // equal to md->start[1]

  MPI_Type_create_subarray (NDIM, gsize, lsize, start,
                            MPI_ORDER_FORTRAN, MPI_DOUBLE, &type_domain);
  MPI_Type_commit (&type_domain);

/* --------------------------------------------------------
   3. Write to disk
   -------------------------------------------------------- */

  MPI_File_delete(fname, MPI_INFO_NULL);
  MPI_File_open(MPI_COMM_CART, fname, amode, MPI_INFO_NULL, &fh);
  MPI_File_set_view(fh, 0, MPI_DOUBLE, type_domain, "native", MPI_INFO_NULL);
  MPI_File_write_all(fh, phi[0], 1, type_local, MPI_STATUS_IGNORE);
  MPI_File_close(&fh);
  MPI_Type_free (&type_local);
  MPI_Type_free (&type_domain);
#else
  FILE *fp;
  printf ("> Writing %s\n",fname);
  fp = fopen(fname, "wb");

  for (j = jbeg; j <= jend; j++){
    fwrite (phi[j] + ibeg, sizeof(double), nx, fp);
  }

  fclose(fp);
#endif

  nfile++;
}

/* ********************************************************************* */
double **Allocate_2DdblArray(int nx, int ny)
/*
 * Allocate memory for a double precision array with
 * nx rows and ny columns
 *********************************************************************** */
{
  int i,j;
  double **buf;

  buf    = (double **)malloc (nx*sizeof(double *));
  buf[0] = (double *) malloc (nx*ny*sizeof(double));
  for (j = 1; j < nx; j++) buf[j] = buf[j-1] + ny;

  return buf;
}
/* ********************************************************************* */
int **Allocate_2DintArray(int nx, int ny)
/*
 * Allocate memory for an integer-type array with
 * nx rows and ny columns
 *********************************************************************** */
{
  int i,j;
  int **buf;

  buf    = (int **)malloc (nx*sizeof(int *));
  buf[0] = (int *) malloc (nx*ny*sizeof(int));
  for (j = 1; j < nx; j++) buf[j] = buf[j-1] + ny;

  return buf;
}


/* ********************************************************************* */
void Show_2DdblArray(double **A, int nx, int ny, const char *string)
/*
 *********************************************************************** */
{
  int i, j;

  printf ("%s\n",string);
  printf ("------------------------------\n");
  for (i = 0; i < nx; i++) {
    for (j = 0; j < ny; j++) {
      printf ("%8.2f  ", A[i][j]);
    }
    printf ("\n");
  }
  printf ("------------------------------\n");
}
/* ********************************************************************* */
void Show_2DintArray(int **A, int nx, int ny, const char *string)
/*
 *********************************************************************** */
{
  int i, j;

  printf ("%s\n",string);
  for (j = 0; j < ny; j++) printf ("-----");
  printf ("\n");

  for (i = 0; i < nx; i++) {
    for (j = 0; j < ny; j++) {
      printf ("%03d  ", A[i][j]);
    }
    printf ("\n");
  }

  for (j = 0; j < ny; j++) printf ("-----");
  printf ("\n");
}

Thanks for updating the example. There are a few issues here.

First, for "err" and "err_glob": at the beginning of the loop you set "err=0" on the host but don't update it on the device. Then, after the MPI_Allreduce call, you set "err=err_glob", again on the host, so "err_glob" needs to be updated on the host as well.
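
In isolation the pattern looks like this; a minimal sketch assuming a CUDA-aware MPI and an OpenACC compiler (the toy loop is made up, only the directives mirror the full code below):

#include <stdio.h>
#include <mpi.h>

int main(int argc, char **argv)
{
  int    i, rank, n = 1000;
  double err = 0.0, err_glob = 0.0;

  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  /* Keep both scalars resident on the device. */
  #pragma acc enter data copyin(err, err_glob)

  err = 0.0;
  #pragma acc update device(err)         /* mirror the host-side reset */

  #pragma acc parallel loop reduction(+:err)
  for (i = 0; i < n; i++) err += 1.0;    /* result combines into the present device copy */

  /* Hand the device addresses to the CUDA-aware MPI library. */
  #pragma acc host_data use_device(err, err_glob)
  {
    MPI_Allreduce(&err, &err_glob, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
  }

  #pragma acc update host(err_glob)      /* bring the reduced value back */
  if (rank == 0) printf("err_glob = %g\n", err_glob);

  #pragma acc exit data delete(err, err_glob)
  MPI_Finalize();
  return 0;
}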

The second issue is that when running with multiple ranks, the code was getting partially wrong values for "y". The problem is that you're using the global size rather than the local size for "x" and "y", so when you copy "y" it overlaps with "x" because of the offset. I worked around this by copying "xg" and "yg" to the device instead.
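
The reason this works is that the present() lookup is done on address ranges, so an offset pointer into a mapped parent array is found automatically. A minimal sketch with made-up sizes:

#include <stdio.h>
#include <stdlib.h>

#define NG 16                   /* stand-in global size, for illustration only */

int main(void)
{
  double *yg = (double *) malloc(NG*sizeof(double));
  double *y;
  double  sum = 0.0;
  int     j, start = 4;         /* stand-in for mpi_decomp.start[1] */

  for (j = 0; j < NG; j++) yg[j] = 0.1*j;

  /* Map the full global array; do not map the offset view separately. */
  #pragma acc enter data copyin(yg[:NG])

  y = yg + start;               /* local view into the global grid */

  /* present(y) succeeds because y points inside the mapped yg range. */
  #pragma acc parallel loop present(y) reduction(+:sum)
  for (j = 0; j < NG-start; j++) sum += y[j];

  printf("sum = %g\n", sum);

  #pragma acc exit data delete(yg[:NG])
  free(yg);
  return 0;
}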

As for performance relative to the CPU, the main problem here is that the size is small, so the code severely underutilizes the GPU. I increased the GLOB sizes to 4096 and saw much better relative performance, although the code then converges faster.

I also took the liberty of adding some boilerplate code for rank-to-device assignment so the code can take advantage of multiple GPUs.

#include <stdio.h>
#include <stdlib.h>
#include <math.h>

#define PARALLEL
#define NX_GLOB     128   /* Global number of interior points */
#define NY_GLOB     128  /* Global number of interior points */
#define NGHOST   1
#define NDIM     2

#ifdef PARALLEL
  #include <mpi.h>
  MPI_Comm MPI_COMM_CART;
#endif

#ifdef _OPENACC
  #include <openacc.h>
#endif

typedef struct MPI_Decomp_{
  int nprocs[NDIM];     /*  Number of processors in each dimension */
  int periods[NDIM];    /*  Periodicity flag in each dimension     */
  int coords[NDIM];     /*  Cartesian coordinate in the MPI topology */
  int gsize[NDIM];      /*  Global domain size (no ghosts)  */
  int lsize[NDIM];      /*  Local domain size (no ghosts)   */
  int start[NDIM];      /*  Local start index in each dimension           */
  int procL[NDIM];      /*  Rank of left-lying  process in each direction */
  int procR[NDIM];      /*  Rank of right-lying process in each direction */
  int rank;             /*  Local process rank */
  int size;             /*  Communicator size  */
} MPI_Decomp;



void BoundaryConditions(double **, double *, double *, int, int, MPI_Decomp *);
void DomainDecomposition(MPI_Decomp *);
void WriteSolution (double **, int, int, MPI_Decomp *);
double **Allocate_2DdblArray(int, int);
int    **Allocate_2DintArray(int, int);
void   Show_2DdblArray(double **, int, int, const char *);
void   Show_2DintArray(int **, int, int, const char *);

int nx_tot, ny_tot;

int main(int argc, char ** argv)
{
  int    nx, i, ibeg, iend;
  int    ny, j, jbeg, jend;
  int    k, rank=0, size=1;
  int xsize,ysize;
  double xbeg = 0.0, xend = 1.0;
  double ybeg = 0.0, yend = 1.0;
  double dx = (xend - xbeg)/(NX_GLOB + 1);
  double dy = (yend - ybeg)/(NY_GLOB + 1);
  double *xg, *yg, *x, *y, **phi, **phi0;
  double err, tol;
  MPI_Decomp  mpi_decomp;
  double err_glob;
  int procL[NDIM] = {-1,-1};
  int procR[NDIM] = {-1,-1};

/* --------------------------------------------------------
   0. Initialize the MPI execution environment
   -------------------------------------------------------- */

#ifdef PARALLEL
  MPI_Datatype row_type, col_type;

  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  DomainDecomposition(&mpi_decomp);
  nx = mpi_decomp.lsize[0];
  ny = mpi_decomp.lsize[1];
#else
  mpi_decomp.gsize[0] = mpi_decomp.lsize[0] = nx = NX_GLOB;
  mpi_decomp.gsize[1] = mpi_decomp.lsize[1] = ny = NY_GLOB;
  mpi_decomp.procL[0] = mpi_decomp.procL[1] = -1;
  mpi_decomp.procR[0] = mpi_decomp.procR[1] = -1;
#endif

#ifdef _OPENACC
/* -------------------------------------------------------
   0. Set the device for each rank
   ------------------------------------------------------- */
  int device_type, num_devices;
  int gpuId;
  MPI_Comm shmcomm;
  int local_rank;

// Get the local rank number
  MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0,
                      MPI_INFO_NULL, &shmcomm);
  MPI_Comm_rank(shmcomm, &local_rank);

// Device num = local rank mod number of devices on the node
  device_type = acc_get_device_type();
  num_devices = acc_get_num_devices(device_type);
  gpuId = local_rank % num_devices;
  acc_set_device_num(gpuId, device_type);
  acc_init(device_type);
#endif

/* --------------------------------------------------------
   1. Set local grid indices
   -------------------------------------------------------- */

  ibeg   = NGHOST;
  iend   = ibeg + nx - 1;
  nx     = iend - ibeg + 1;
  nx_tot = nx + 2*NGHOST;

  jbeg   = NGHOST;
  jend   = jbeg + ny - 1;
  ny     = jend - jbeg + 1;
  ny_tot = ny + 2*NGHOST;

/* --------------------------------------------------------
   2. Generate global and local grids
   -------------------------------------------------------- */

  xg = (double *) malloc ( (NX_GLOB+2*NGHOST)*sizeof(double));
  yg = (double *) malloc ( (NY_GLOB+2*NGHOST)*sizeof(double));
  for (i = 0; i < (NX_GLOB+2*NGHOST); i++) xg[i] = xbeg + (i-ibeg+1)*dx;
  for (j = 0; j < (NY_GLOB+2*NGHOST); j++) yg[j] = ybeg + (j-jbeg+1)*dy;
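/* Map the full global grids once; x and y below are just offset pointers into
   xg/yg, so present() lookups on x and y resolve inside these allocations.   */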
#pragma acc enter data copyin(xg[:NX_GLOB+2*NGHOST],yg[:NY_GLOB+2*NGHOST])
  #ifdef PARALLEL
  x = xg + mpi_decomp.start[0];
  y = yg + mpi_decomp.start[1];
  #else
  x = xg;
  y = yg;
  #endif

/* --------------------------------------------------------
   3. Allocate memory on local processor and
      assign initial conditions.
   -------------------------------------------------------- */

  phi  = Allocate_2DdblArray(ny_tot, nx_tot);
  phi0 = Allocate_2DdblArray(ny_tot, nx_tot);

  for (j = jbeg; j <= jend; j++){
  for (i = ibeg; i <= iend; i++){
    phi0[j][i] = 0.0;
  }}

#ifdef PARALLEL
  MPI_Type_contiguous (nx_tot, MPI_DOUBLE, &row_type);
  MPI_Type_vector     (ny_tot, 1, nx_tot, MPI_DOUBLE, &col_type);
  MPI_Type_commit (&row_type);
  MPI_Type_commit (&col_type);
#endif

/* --------------------------------------------------------
   4. Main iteration cycle
   -------------------------------------------------------- */

  tol = 1.e-5;
  err = 1.0;
  k   = 0;

  //#pragma acc enter data copyin(phi[:ny_tot][:nx_tot], phi0[:ny_tot][:nx_tot], x[:NX_GLOB+2*NGHOST], y[:NX_GLOB+2*NGHOST])
  #pragma acc enter data copyin(phi[:ny_tot][:nx_tot], phi0[:ny_tot][:nx_tot],err,err_glob)
 while  (err > tol){

  /* -- 4a. Set boundary conditions first -- */

    BoundaryConditions(phi0, x, y, nx, ny, &mpi_decomp);

  /* -- 4b. Jacobi's method and residual (interior points) -- */

    err = 0.0;
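/* Mirror the host-side reset of err to its device copy, so the device
   reduction below starts from zero.                                    */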
#pragma acc update device(err)

    #pragma acc parallel loop collapse(2) reduction(+:err) present(phi[:ny_tot][:nx_tot], phi0[:ny_tot][:nx_tot])
    for (j = jbeg; j <= jend; j++){
    for (i = ibeg; i <= iend; i++){
      phi[j][i] = 0.25*(  phi0[j][i-1] + phi0[j][i+1]
                        + phi0[j-1][i] + phi0[j+1][i] );

      err += dx*dy*fabs(phi[j][i] - phi0[j][i]);
    }}


    #pragma acc parallel loop collapse(2) present(phi[:ny_tot][:nx_tot], phi0[:ny_tot][:nx_tot])
    for (j = jbeg; j <= jend; j++){
    for (i = ibeg; i <= iend; i++){
      phi0[j][i] = phi[j][i];
    }}

    #ifdef PARALLEL
    // double err_glob;
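    /* host_data use_device passes the device addresses of err and err_glob
       to the (CUDA-aware) MPI call.                                         */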
    #pragma acc host_data use_device(err, err_glob)
    {
    MPI_Allreduce (&err, &err_glob, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    }
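    /* Copy the reduced value back to the host for the convergence test and print. */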
    #pragma acc update host(err_glob)
    err = err_glob;
    #endif

    if (rank == 0){
      printf ("k = %d; err = %8.3e\n",k, err);
    }
    k++;
  }
  #pragma acc exit data copyout(phi[:ny_tot][:nx_tot], phi0[:ny_tot][:nx_tot],err,err_glob)

  WriteSolution (phi, nx, ny, &mpi_decomp);

  #ifdef PARALLEL
  MPI_Finalize();
  #endif
  return 0;
}

#ifdef PARALLEL
/* ********************************************************************* */
void DomainDecomposition(MPI_Decomp *mpi_decomp)
/*
 *
 *********************************************************************** */
{
  int dim, i;
  int rank, size;
  int *coords  = mpi_decomp->coords;
  int *gsize   = mpi_decomp->gsize;
  int *lsize   = mpi_decomp->lsize;
  int *nprocs  = mpi_decomp->nprocs;
  int *periods = mpi_decomp->periods;
  int *procL   = mpi_decomp->procL;
  int *procR   = mpi_decomp->procR;
  int *start   = mpi_decomp->start;
  int new_coords[NDIM];

/* --------------------------------------------------------
   1. Get rank & size
   -------------------------------------------------------- */

  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  mpi_decomp->rank = rank;
  mpi_decomp->size = size;

/* --------------------------------------------------------
   2. Obtain the number of processors along each dimension.
      Use maximally squared decomp.
   -------------------------------------------------------- */

  nprocs[0] = (int)sqrt(size);
  nprocs[1] = size/nprocs[0];
  if (nprocs[0]*nprocs[1] != size){
    if (rank == 0) printf ("! Cannot decompose\n");
    MPI_Finalize();
    exit(1);
  }
  if (rank == 0){
    printf ("Decomposition achieved with %d X %d procs\n",nprocs[0],nprocs[1]);
  }

  periods[0] = 0;
  periods[1] = 0;

/* --------------------------------------------------------
   3. Create Cartesian topology
   -------------------------------------------------------- */

  MPI_Cart_create(MPI_COMM_WORLD, NDIM, nprocs, periods,
                                        0, &MPI_COMM_CART);
  MPI_Cart_get(MPI_COMM_CART, NDIM, nprocs, periods, coords);

/* --------------------------------------------------------
   4. Fill structure members
   -------------------------------------------------------- */

  gsize[0] = NX_GLOB;
  gsize[1] = NY_GLOB;
  lsize[0] = NX_GLOB/nprocs[0];
  lsize[1] = NY_GLOB/nprocs[1];
  start[0] = coords[0]*lsize[0];
  start[1] = coords[1]*lsize[1];

/* --------------------------------------------------------
   5. Determine ranks of neighbour processors
   -------------------------------------------------------- */

  for (dim = 0; dim < NDIM; dim++) {
    for (i = 0; i < NDIM; i++) new_coords[i] = coords[i];

    new_coords[dim] = coords[dim] + 1;
    if (new_coords[dim] < nprocs[dim]) {
      MPI_Cart_rank ( MPI_COMM_CART, new_coords, &(procR[dim]) );
    } else {
      procR[dim] = MPI_PROC_NULL;
    }

    new_coords[dim] = coords[dim] - 1;
    if (new_coords[dim] >= 0) {
     MPI_Cart_rank ( MPI_COMM_CART, new_coords, &(procL[dim]) );
    } else {
      procL[dim] = MPI_PROC_NULL;
    }
  }

/* --------------------------------------------------------
   6. Print processor information.
      (Use MPI_Bcast() to print in sequence)
   -------------------------------------------------------- */

  int proc, go;
  for (proc = 0; proc < size; proc++){
    go = proc;
    MPI_Bcast(&go, 1, MPI_INT, 0, MPI_COMM_WORLD);
    if (rank == go) {
      printf ("[Rank %d]\n",rank);
      printf ("  coords = [%d, %d], lsize = [%d, %d]\n",
                 coords[0], coords[1], lsize[0], lsize[1]);
      for (dim = 0; dim < NDIM; dim++){
        printf ("  (procL, procR)[%d] = %d, %d\n", dim, procL[dim], procR[dim]);
      }
    }
    MPI_Barrier(MPI_COMM_WORLD);
  }

  return;
}
#endif

/* ********************************************************************* */
void BoundaryConditions(double **phi, double *x, double *y,
                        int nx, int ny, MPI_Decomp *mpi_decomp)
/*
 *********************************************************************** */
{
  int i,j;
  int ibeg   = NGHOST;
  int iend   = ibeg + nx - 1;

  int jbeg   = NGHOST;
  int jend   = jbeg + ny - 1;

  int *procL = mpi_decomp->procL;
  int *procR = mpi_decomp->procR;
#ifdef PARALLEL
  int rank = mpi_decomp->rank;
  int size = mpi_decomp->size;
  double send_buf[NX_GLOB + 2*NGHOST];
  double recv_buf[NX_GLOB + 2*NGHOST];

/*  Used for testing
    for (j = 0; j <= jend+1; j++){
    for (i = 0; i <= iend+1; i++){
      phi[j][i] = -1;
    }}

    for (j = jbeg; j <= jend; j++){
    for (i = ibeg; i <= iend; i++){
      phi[j][i] = rank;
    }}
*/

  #pragma acc enter data create(send_buf[:NX_GLOB+2*NGHOST], recv_buf[NX_GLOB+2*NGHOST])

  // Left buffer
  i = ibeg;

  #pragma acc parallel loop present(phi[:ny_tot][:nx_tot], send_buf[:NX_GLOB+2*NGHOST])
  for (j = jbeg; j <= jend; j++) send_buf[j] = phi[j][i];
  #pragma acc host_data use_device(send_buf, recv_buf)
  {
  MPI_Sendrecv (send_buf, jend+1, MPI_DOUBLE, procL[0], 0,
                recv_buf, jend+1, MPI_DOUBLE, procL[0], 0,
                MPI_COMM_WORLD, MPI_STATUS_IGNORE);
  }
  #pragma acc parallel loop present(phi[:ny_tot][:nx_tot], recv_buf[NX_GLOB+2*NGHOST])
  for (j = jbeg; j <= jend; j++) phi[j][i-1] = recv_buf[j];

  // Right buffer
  i = iend;
  #pragma acc parallel loop present(phi[:ny_tot][:nx_tot], send_buf[:NX_GLOB+2*NGHOST])
  for (j = jbeg; j <= jend; j++) send_buf[j] = phi[j][i];
  #pragma acc host_data use_device(send_buf, recv_buf)
  {
  MPI_Sendrecv (send_buf, jend+1, MPI_DOUBLE, procR[0], 0,
                recv_buf, jend+1, MPI_DOUBLE, procR[0], 0,
                MPI_COMM_WORLD, MPI_STATUS_IGNORE);
  }
  #pragma acc parallel loop present(phi[:ny_tot][:nx_tot], recv_buf[NX_GLOB+2*NGHOST])
  for (j = jbeg; j <= jend; j++) phi[j][i+1] = recv_buf[j];


  // Bottom buffer
  j = jbeg;
  #pragma acc parallel loop present(phi[:ny_tot][:nx_tot], send_buf[:NX_GLOB+2*NGHOST])
  for (i = ibeg; i <= iend; i++) send_buf[i] = phi[j][i];
  // #pragma acc update self(send_buf[:NX_GLOB+2*NGHOST])
  #pragma acc host_data use_device(send_buf, recv_buf)
  {
  MPI_Sendrecv (send_buf, iend+1, MPI_DOUBLE, procL[1], 0,
                  recv_buf, iend+1, MPI_DOUBLE, procL[1], 0,
                  MPI_COMM_WORLD, MPI_STATUS_IGNORE);
  }
  #pragma acc parallel loop present(phi[:ny_tot][:nx_tot], recv_buf[NX_GLOB+2*NGHOST])
  for (i = ibeg; i <= iend; i++) phi[j-1][i] = recv_buf[i];

  // Top buffer
  j = jend;
  #pragma acc parallel loop present(phi[:ny_tot][:nx_tot], send_buf[:NX_GLOB+2*NGHOST])
  for (i = ibeg; i <= iend; i++) send_buf[i] = phi[j][i];
  #pragma acc host_data use_device(send_buf, recv_buf)
  {
  MPI_Sendrecv (send_buf, iend+1, MPI_DOUBLE, procR[1], 0,
                recv_buf, iend+1, MPI_DOUBLE, procR[1], 0,
                MPI_COMM_WORLD, MPI_STATUS_IGNORE);
  }
  #pragma acc parallel loop present(phi[:ny_tot][:nx_tot], recv_buf[NX_GLOB+2*NGHOST])
  for (i = ibeg; i <= iend; i++) phi[j+1][i] = recv_buf[i];

  #pragma acc exit data copyout(send_buf[:NX_GLOB+2*NGHOST], recv_buf[NX_GLOB+2*NGHOST])

#endif

/* -- Left -- */

  if (procL[0] < 0){
    i = ibeg-1;
    #pragma acc parallel loop present(phi[:ny_tot][:nx_tot], y)
    for (j = jbeg; j <= jend; j++) phi[j][i] = 1.0-y[j];
  }

/* -- Right -- */

  if (procR[0] < 0){
    i = iend+1;
    #pragma acc parallel loop present(phi[:ny_tot][:nx_tot], y)
    for (j = jbeg; j <= jend; j++) phi[j][i] = y[j]*y[j];
  }

/* -- Bottom -- */

  if (procL[1] < 0){
    j = jbeg-1;
    #pragma acc parallel loop present(phi[:ny_tot][:nx_tot], x)
    for (i = ibeg; i <= iend; i++) phi[j][i] = 1.0-x[i];
  }

/* -- Top -- */

  if (procR[1] < 0){
    j = jend+1;
    #pragma acc parallel loop present(phi[:ny_tot][:nx_tot], x)
    for (i = ibeg; i <= iend; i++) phi[j][i] = x[i];
  }

  return;

#ifdef PARALLEL
// Print
  MPI_Barrier(MPI_COMM_WORLD);
  int go, proc;
  for (proc = 0; proc < size; proc++){
    go = proc;
    MPI_Bcast(&go, 1, MPI_INT, 0, MPI_COMM_WORLD);

    if (rank == go) {
      printf ("Boundary [Rank %d]\n",rank);
      for (j = jend+1; j >= 0; j--){
        for (i = 0; i <= iend+1; i++){
          printf ("%6.2f  ", phi[j][i]);
        }
        printf ("\n");
      }
    }
  }

MPI_Finalize();
exit(0);
#endif
}

/* ********************************************************************* */
void WriteSolution (double **phi, int nx, int ny, MPI_Decomp *md)
/*
 *********************************************************************** */
{
  int i,j;
  int ibeg   = NGHOST;
  int iend   = ibeg + nx - 1;

  int jbeg   = NGHOST;
  int jend   = jbeg + ny - 1;

  static int nfile = 0;
  char fname[32];

  sprintf (fname,"laplace2D_MPIACC.txt",nfile);

/*
for (j = jbeg-1; j <= jend+1; j++) for (i = ibeg-1; i <= iend+1; i++) {
  phi[j][i] = -1;
}
for (j = jbeg; j <= jend; j++) for (i = ibeg; i <= iend; i++) {
  phi[j][i] = md->rank;
}
*/
#ifdef PARALLEL
  MPI_File fh;
  MPI_Datatype type_local, type_domain;
  int amode = MPI_MODE_CREATE | MPI_MODE_WRONLY;
  int gsize[2], lsize[2], start[2];

/* --------------------------------------------------------
   1. Create a local array type without the ghost zones
      This datatype will be passed to MPI_File_write()
   -------------------------------------------------------- */

  gsize[0] = md->lsize[0] + 2*NGHOST;
  gsize[1] = md->lsize[1] + 2*NGHOST;

  lsize[0] = md->lsize[0];
  lsize[1] = md->lsize[1];

  start[0] = NGHOST;
  start[1] = NGHOST;

  MPI_Type_create_subarray (NDIM, gsize, lsize, start,
                            MPI_ORDER_FORTRAN, MPI_DOUBLE, &type_local);
  MPI_Type_commit (&type_local);

/* --------------------------------------------------------
   2. Create the subarray in the global domain.
      This datatype is used to set the file view.
   -------------------------------------------------------- */

  gsize[0] = NX_GLOB;
  gsize[1] = NY_GLOB;

  lsize[0] = md->lsize[0];
  lsize[1] = md->lsize[1];

  start[0] = lsize[0]*md->coords[0];  // equal to md->start[0]
  start[1] = lsize[1]*md->coords[1];  // equal to md->start[1]

  MPI_Type_create_subarray (NDIM, gsize, lsize, start,
                            MPI_ORDER_FORTRAN, MPI_DOUBLE, &type_domain);
  MPI_Type_commit (&type_domain);

/* --------------------------------------------------------
   3. Write to disk
   -------------------------------------------------------- */

  MPI_File_delete(fname, MPI_INFO_NULL);
  MPI_File_open(MPI_COMM_CART, fname, amode, MPI_INFO_NULL, &fh);
  MPI_File_set_view(fh, 0, MPI_DOUBLE, type_domain, "native", MPI_INFO_NULL);
  MPI_File_write_all(fh, phi[0], 1, type_local, MPI_STATUS_IGNORE);
  MPI_File_close(&fh);
  MPI_Type_free (&type_local);
  MPI_Type_free (&type_domain);
#else
  FILE *fp;
  printf ("> Writing %s\n",fname);
  fp = fopen(fname, "wb");

  for (j = jbeg; j <= jend; j++){
    fwrite (phi[j] + ibeg, sizeof(double), nx, fp);
  }

  fclose(fp);
#endif

  nfile++;
}

/* ********************************************************************* */
double **Allocate_2DdblArray(int nx, int ny)
/*
 * Allocate memory for a double precision array with
 * nx rows and ny columns
 *********************************************************************** */
{
  int i,j;
  double **buf;

  buf    = (double **)malloc (nx*sizeof(double *));
  buf[0] = (double *) malloc (nx*ny*sizeof(double));
  for (j = 1; j < nx; j++) buf[j] = buf[j-1] + ny;

  return buf;
}
/* ********************************************************************* */
int **Allocate_2DintArray(int nx, int ny)
/*
 * Allocate memory for an integer-type array with
 * nx rows and ny columns
 *********************************************************************** */
{
  int i,j;
  int **buf;

  buf    = (int **)malloc (nx*sizeof(int *));
  buf[0] = (int *) malloc (nx*ny*sizeof(int));
  for (j = 1; j < nx; j++) buf[j] = buf[j-1] + ny;

  return buf;
}


/* ********************************************************************* */
void Show_2DdblArray(double **A, int nx, int ny, const char *string)
/*
 *********************************************************************** */
{
  int i, j;

  printf ("%s\n",string);
  printf ("------------------------------\n");
  for (i = 0; i < nx; i++) {
    for (j = 0; j < ny; j++) {
      printf ("%8.2f  ", A[i][j]);
    }
    printf ("\n");
  }
  printf ("------------------------------\n");
}
/* ********************************************************************* */
void Show_2DintArray(int **A, int nx, int ny, const char *string)
/*
 *********************************************************************** */
{
  int i, j;

  printf ("%s\n",string);
  for (j = 0; j < ny; j++) printf ("-----");
  printf ("\n");

  for (i = 0; i < nx; i++) {
    for (j = 0; j < ny; j++) {
      printf ("%03d  ", A[i][j]);
    }
    printf ("\n");
  }

  for (j = 0; j < ny; j++) printf ("-----");
  printf ("\n");
}