在并行循环之前将静态数组声明为 "private" 是否完全等同于在循环内声明数组?

Is declaring a static array as "private" before a parallel loop perfectly equivalent to declaring the array inside the loop?

我遇到了这样一种情况:在索引 i 的循环内部定义数组(案例 #1),与在 i 循环之外声明数组并使用 private 子句(案例 #2),代码会产生不同的结果。只有案例 #2 产生的结果与代码在 CPU 上运行时的结果相同。

案例 #1

  /* Case #1: the work arrays are declared in the body of the inner
     i loop, i.e. as block-scope locals of that loop body.
     No private clause is given on either loop. */
  #pragma acc parallel loop
  for (j = jbeg; j <= jend; j++){
  
    #pragma acc loop 
    for (i = ibeg; i <= iend; i++){
      double Rc[NFLX][NFLX];
      double eta[NFLX], um[NFLX], dv[NFLX];
      double lambda[NFLX], alambda[NFLX];
      double fL[NFLX], fR[NFLX];
      .
      .
      .
    }
  }}

案例 #2

  /* Case #2: the same work arrays are declared in the body of the
     outer j loop and listed explicitly in the private clause of the
     inner i loop. */
  #pragma acc parallel loop
  for (j = jbeg; j <= jend; j++){

    double Rc[NFLX][NFLX];
    double eta[NFLX], um[NFLX], dv[NFLX];
    double lambda[NFLX], alambda[NFLX];
    double fL[NFLX], fR[NFLX];

    #pragma acc loop private(Rc[:NFLX][:NFLX], eta[:NFLX],  \
                             um[:NFLX], lambda[:NFLX], alambda[:NFLX], \
                             dv[:NFLX], fL[:NFLX], fR[:NFLX])
    for (i = ibeg; i <= iend; i++){
      .
      .
      .
    }
  }}

我有以下值:

NFLX = 8;
jbeg = 3, jend = 258;
ibeg = 3, iend = 1026;

在哪些情况下这两种技术是等效的,什么时候选择一种比另一种更好?

这是我使用 -Minfo=accel 编译时看到的输出:

案例 #1:

71, Local memory used for Rc,dv,fR,um,lambda,alambda,fL,eta

案例 #2:

71, Local memory used for Rc,dv,fR,lambda,alambda,fL,eta
     CUDA shared memory used for Rc,eta
     Local memory used for um
     CUDA shared memory used for um,lambda,alambda,dv,fL,fR

函数:

/* ********************************************************************* */
void Roe_Solver (Data *d, timeStep *Dts, Grid *grid, RBox *box)
/*
 * Solve the Riemann problem between L/R states using a
 * Rusanov-Lax Friedrichs flux.
 *
 * For every interface (i,j,k) inside the index ranges taken from box,
 * the routine
 *   - reads the left/right states d->sweep.vL / d->sweep.vR,
 *   - calls PrimToCons() and Flux() on each side,
 *   - forms an averaged velocity um[] and sound speed a from the two
 *     states (weights s, c derived from sqrt(rho_R/rho_L)),
 *   - writes d->sweep.SL, d->sweep.SR and Dts->cmax,
 *   - writes d->sweep.flux with the Rusanov flux, replaced by the HLL
 *     flux when DIMENSIONS > 1 and the strong-shock switch triggers.
 *
 * \param [in,out] d     data structure holding the sweep arrays
 * \param [out]    Dts   time-step structure; cmax is written here
 * \param [in]     grid  grid structure; grid->dir selects the sweep
 *                       direction used for index cycling and Flux()
 * \param [in]     box   index ranges of the loops
 *
 * The present() clause requires d, Dts and grid to be on the device
 * already when this routine is entered.
 *
 * NOTE(review): despite the routine name, no eigenvector decomposition
 * is performed in this body; only the averaged state is used.
 *********************************************************************** */
{
  int  i, j, k;
  int  ibeg = *(box->nbeg)-1, iend = *(box->nend);
  int  jbeg = *(box->tbeg),   jend = *(box->tend);
  int  kbeg = *(box->bbeg),   kend = *(box->bend);
  int  VXn = VX1, VXt = VX2, VXb = VX3;
  int  MXn = MX1, MXt = MX2, MXb = MX3;
  int ni, nj;
  double gmm = GAMMA_EOS;
  double gmm1 = gmm - 1.0;
  double gmm1_inv  = 1.0/gmm1;

  ARRAY_OFFSET (grid, ni, nj);

  INDEX_CYCLE (grid->dir, VXn, VXt, VXb);
  INDEX_CYCLE (grid->dir, MXn, MXt, MXb);

  #pragma acc parallel loop collapse(2) present(d, Dts, grid)
  for (k = kbeg; k <= kend; k++){
  for (j = jbeg; j <= jend; j++){

    /* Linear offset of the (j,k) row; i is added inside the inner loop. */
    long int offset = ni*(j + nj*k);

    double  * __restrict__ cmax    = &Dts->cmax     [offset];
    double  * __restrict__ SL      = &d->sweep.SL   [offset];
    double  * __restrict__ SR      = &d->sweep.SR   [offset];

    /* Work arrays are declared here, outside the i loop, and listed in
       the private clause below so that every i iteration operates on
       its own copy. */
    double um[NFLX];
    double fL[NFLX], fR[NFLX];

    #pragma acc loop private(um[:NFLX], fL[:NFLX], fR[:NFLX])
    for (i = ibeg; i <= iend; i++){
      int nv;
      double  scrh, vel2;
      double  a2, a, h;
      double s, c, hl, hr;
      double bmin, bmax, scrh1;

      double * __restrict__ vL   = d->sweep.vL  [offset + i];
      double * __restrict__ vR   = d->sweep.vR  [offset + i];
      double * __restrict__ uL   = d->sweep.uL  [offset + i];
      double * __restrict__ uR   = d->sweep.uR  [offset + i];
      double * __restrict__ flux = d->sweep.flux[offset + i];

      double a2L = SoundSpeed2 (vL);
      double a2R = SoundSpeed2 (vR);

      /* uL/uR and fL/fR are only read below, so these calls are
         expected to fill them from the primitive states. */
      PrimToCons (vL, uL);
      PrimToCons (vR, uR);

      Flux (vL, uL, fL, grid->dir);
      Flux (vR, uR, fR, grid->dir);

    /* ----------------------------------------------------
       0. averaged state: weights s and c = 1 - s are built
          from sqrt(rho_R/rho_L)
       ---------------------------------------------------- */

      s       = sqrt(vR[RHO]/vL[RHO]);
      um[RHO]  = vL[RHO]*s;
      s       = 1.0/(1.0 + s);
      c       = 1.0 - s;

      um[VX1] = s*vL[VX1] + c*vR[VX1];
      um[VX2] = s*vL[VX2] + c*vR[VX2];
      um[VX3] = s*vL[VX3] + c*vR[VX3];

      vel2 = um[VX1]*um[VX1] + um[VX2]*um[VX2] + um[VX3]*um[VX3];

      hl  = 0.5*(vL[VX1]*vL[VX1] + vL[VX2]*vL[VX2] + vL[VX3]*vL[VX3]);
      hl += a2L*gmm1_inv;

      hr = 0.5*(vR[VX1]*vR[VX1] + vR[VX2]*vR[VX2] + vR[VX3]*vR[VX3]);
      hr += a2R*gmm1_inv;

      h = s*hl + c*hr;

    /* ----------------------------------------------------
       1. the following should be  equivalent to

         scrh = dv[VX1]*dv[VX1] + dv[VX2]*dv[VX2] + dv[VX3]*dv[VX3];

         a2 = s*a2L + c*a2R + 0.5*gmm1*s*c*scrh;

         and therefore always positive.
       ---------------------------------------------------- */

      a2 = gmm1*(h - 0.5*vel2);
      a  = sqrt(a2);

    /* ----------------------------------------------------------------
       2. signal speeds from the averaged state and Rusanov flux
        ----------------------------------------------------------------  */

      /*  ---- (u - c_s)  ----  */

      SL[i] = um[VXn] - a;

      /*  ---- (u + c_s)  ----  */

      SR[i] = um[VXn] + a;

     /*  ----  get max eigenvalue  ----  */

      cmax[i] = fabs(um[VXn]) + a;

      /* Central flux average minus dissipation proportional to cmax. */
      #pragma acc loop seq
      NFLX_LOOP(nv) flux[nv] =   0.5*(fL[nv] + fR[nv]) - 0.5*cmax[i]*(uR[nv] - uL[nv]);

      #if DIMENSIONS > 1

      /* ---------------------------------------------
         3. use the HLL flux function if the interface
            lies within a strong shock.
            The effect of this switch is visible
            in the Mach reflection test.
        --------------------------------------------- */

      /* Relative pressure jump across the interface. */
      scrh  = fabs(vL[PRS] - vR[PRS]);
      scrh /= MIN(vL[PRS],vR[PRS]);

      if (scrh > 0.5 && (vR[VXn] < vL[VXn])){   /* -- tunable parameter -- */
        bmin = MIN(0.0, SL[i]);
        bmax = MAX(0.0, SR[i]);
        scrh1 = 1.0/(bmax - bmin);

        #pragma acc loop seq
        for (nv = 0; nv < NFLX; nv++){
          flux[nv]  = bmin*bmax*(uR[nv] - uL[nv])
                      + bmax*fL[nv] - bmin*fR[nv];
          flux[nv] *= scrh1;
        }
      }
      #endif  /* DIMENSIONS > 1 */

    } /* End loop on i */
  }} /* End loop on j,k */
}

从技术上讲它们是等价的,但在实践中是不同的。发生的事情是编译器会将这些数组的声明提升到循环之外。这是编译器的标准做法,发生在应用 OpenACC 指令之前。应该发生的是,这些数组在它们声明的作用域单元内被隐式私有化。但是,编译器目前不跟踪这一点,因此数组作为共享数组被隐式复制到计算区域中。如果添加标志“-Minfo=accel”,您将看到指示隐式副本的编译器反馈消息。

我已经提交了一个请求此项支持的问题报告(TPR #31360),目前尚未解决;但实现它一直是一个挑战,因此还没有在已发布的编译器中实现。因此,在我们修复该行为之前(如果能够修复的话),您需要手动将这些数组的声明提升到循环之外,然后将它们添加到 "private" 子句中。