如何使用 OpenMP 并行化在矩阵上具有迭代的 while 循环?

How to parallelise a while loop that has iterations on a matrix with OpenMP?

我正在尝试并行化此代码的 while 循环。

#include <stdio.h>
#include <stdlib.h>
#include <math.h>

#define n 100

float max_dif(float w_new[n][n], float w[n][n]);

int main()
{
    int i,j;
    float w[n][n],w_new[n][n]={0},max=100;

#pragma omp parallel for private(j)
    for (i = 0; i < n; i++)
    {
        for (j = 0; j < n; j++)
        {
            w[i][j]=75;
        }
    }

#pragma omp parallel for
    for (i = 0; i < n; i++)
    {
        w[0][i]=0;
        w[n-1][i]=100;
        w[i][0]=100;
        w[i][n-1]=100;

        w_new[0][i]=0;
        w_new[n-1][i]=100;
        w_new[i][0]=100;
        w_new[i][n-1]=100;
    }

    w[0][n-1]=100;
    w_new[0][n-1]=100;

    int counter=0;

    while(max > 0.0001)
    {   
        for (i = 1; i < n-1; i++)
        {
            for (j = 1; j < n-1; j++)
            {
                w_new[i][j]=(float)((w[i+1][j]+w[i-1][j]+w[i][j+1]+w[i][j-1])/4);
            }
        }
        max = max_dif(w_new,w);
        for (i = 0; i < n; i++)
        {
            for (j = 0; j < n; j++)
            {
                w[i][j]=w_new[i][j];
            }
        }
        counter++;
    }

    printf("Counter is: %d\n", counter);
    return 0;
}

float max_dif(float w_new[n][n], float w[n][n])
{
    int i,j;
    float max=0;
    for (i = 1; i < n-1; i++)
    {
        for (j = 1; j < n-1; j++)
        {
            if (max < fabs(w_new[i][j]-w[i][j]))
                max = fabs(w_new[i][j]-w[i][j]);
        }
    }
    return max;
}

我当时正在考虑构造的基本并行,但我的计数器变量预计在 5000 左右。所以我希望它是并行的。但是我需要 w_new 复制回 w 以便我可以计算 w_new 的下一次迭代。所以我认为任务或部分不会起作用,因为我需要一个 w 代替另一个。

直接并行化 while(max > 0.0001)复杂 与 OpenMP 并且可能也不那么高效。在 OpenMP 中,您有两种主要的代码并行化方式,使用基于循环的并行化或基于任务的并行化。后者不适合当前代码,前者不能应用于 loop,在编译时无法确定该循环执行的迭代次数。

但是,您可以并行化 while 中执行的计算,例如:

  #pragma omp parallel
  {
      #pragma omp for
      for (int i = 1; i < n-1; i++){
        for (int j = 1; j < n-1; j++){
            w_new[i][j]=(float)((w[i+1][j]+w[i-1][j]+w[i][j+1]+w[i][j-1])/4);
        }
      }

max_dif 绝对是并行化的良好候选者:

#pragma omp for reduction(max: max)
for (int i = 1; i < n-1; i++){
    for (int j = 1; j < n-1; j++){
        if (max < fabs(w_new[i][j]-w[i][j]))
            max = fabs(w_new[i][j]-w[i][j]);
    }
}  

最后是 while:

中的最后一个循环
    #pragma omp for
    for (int i = 0; i < n; i++){
        for (int j = 0; j < n; j++){
            w[i][j]=w_new[i][j];
        }
    }

不幸的是,要应用此方法,您必须内联函数 max_dif,因为要在 reduction 子句中使用的变量必须 shared,并且由于max 分配在并行区域内,它将对每个线程私有。不允许在 #pragma omp for 中使用 shared 子句。

完整示例:

#include <stdio.h>
#include <stdlib.h>
#include <math.h>

#define n 100
int main()
{
    float w[n][n],w_new[n][n]={0},max=100;

    #pragma omp parallel for
    for (int i = 0; i < n; i++)
        for (int j = 0; j < n; j++)
            w[i][j]=75;

    #pragma omp parallel for
    for (int i = 0; i < n; i++){
        w[0][i]=0;
        w[n-1][i]=100;
        w[i][0]=100;
        w[i][n-1]=100;

        w_new[0][i]=0;
        w_new[n-1][i]=100;
        w_new[i][0]=100;
        w_new[i][n-1]=100;
    }

    w[0][n-1]=100;
    w_new[0][n-1]=100;

    int counter=0;

    while(max > 0.0001){   
      #pragma omp parallel
      {
          max = 0;
          #pragma omp for    
          for (int i = 1; i < n-1; i++)
             for (int j = 1; j < n-1; j++)
                w_new[i][j]=(float)((w[i+1][j]+w[i-1][j]+w[i][j+1]+w[i][j-1])/4);
        
          #pragma omp for reduction(max: max)
          for (int i = 1; i < n-1; i++){
             for (int j = 1; j < n-1; j++){
               if (max < fabs(w_new[i][j]-w[i][j]))
                  max = fabs(w_new[i][j]-w[i][j]);
             }
          }       
          #pragma omp for nowait
          for (int i = 0; i < n; i++)
              for (int j = 0; j < n; j++)
                 w[i][j]=w_new[i][j];
      }
      counter++;
    }

    printf("Counter is: %d\n", counter);
    return 0;
}

输出:

Counter is: 4894

I was thinking of the basic parallel for construct inside the while, but my counter variable is expected to be around 5000. So I would like that to be in parallel.

这是可能的,但需要进行一些更改。我在代码中解释它们。实际上我们可以为整个代码使用一个单独的并行区域:

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#define n 100

int main()
{
   float w[n][n],w_new[n][n]={0},max=100;
   int counter = 0;
   #pragma omp parallel
   {
      #pragma omp for
      for (int i = 0; i < n; i++)
        for (int j = 0; j < n; j++)
             w[i][j]=75;

      #pragma omp for
      for (int i = 0; i < n; i++){
        w[0][i]=0;
        w[n-1][i]=100;
        w[i][0]=100;
        w[i][n-1]=100;

        w_new[0][i]=0;
        w_new[n-1][i]=100;
        w_new[i][0]=100;
        w_new[i][n-1]=100;
     }

     w[0][n-1]=100;
     w_new[0][n-1]=100;

     
    while(max > 0.0001){
       // We need this barrier to ensure that we don't have a
       // race condition between the master updating max to zero
       // and the other threads reading max > 0.0001 
       #pragma omp barrier
       #pragma omp master
       #pragma omp atomic write
        max = 0;
       #pragma omp barrier
       
        #pragma omp for reduction(max:max)
        for (int i = 1; i < n-1; i++){
           for (int j = 1; j < n-1; j++){
               w_new[i][j]=(float)((w[i+1][j]+w[i-1][j]+w[i][j+1]+w[i][j-1])/4);
               if (max < fabs(w_new[i][j]-w[i][j]))
                   max = fabs(w_new[i][j]-w[i][j]);
           }
        }      
          
        #pragma omp for nowait
        for (int i = 0; i < n; i++)
            for (int j = 0; j < n; j++)
               w[i][j]=w_new[i][j];
          
        #pragma omp master
        counter++;
     } 
   }
   printf("Counter is: %d\n", counter);
   return 0;
}

我进行了表面测试,对于 4 核,此版本对于 1000 的输入有 2.8x 的加速,并不算太好。可以通过尝试对其部分进行矢量化来进一步改进。