OpenACC - 复杂循环携带 a->,c->,b-> 的依赖性阻止并行化

OpenACC - Complex loop carried dependence of a->,c->,b-> prevents parallelization

我正在使用 OpenACC 比较 CPU 上使用 PGI Community Edition 19.10(在 Windows 上)的并行矩阵乘法与非并行矩阵乘法运算的执行时间。我使用的代码是:

#include <time.h>
#include <stdlib.h>

int main()
{
    // seed the random number generator
    srand(42);

    // Pick some arbitrary constraints to make the problem harder
    const int SIZE_XY = 1000;
    const int MIN_VAL = 5000;   
    const int MAX_VAL = 7000000;

    int i, j, k; // iterators

    double time_spent = 0.0;
    clock_t begin = clock();

    // Generate two 2D arrays to be filled with random numbers
    // and an array, c, with all 0s
    int *a[SIZE_XY];
    int *b[SIZE_XY];
    int *c[SIZE_XY];
    for (i = 0; i < SIZE_XY; i++)
    {
        a[i] = (int *)malloc(SIZE_XY * sizeof(int));
        b[i] = (int *)malloc(SIZE_XY * sizeof(int));
        c[i] = (int *)malloc(SIZE_XY * sizeof(int));
    }

    #pragma acc kernels
    {
        for (i = 0; i < SIZE_XY; i++)
        {
            for (j = 0; j < SIZE_XY; j++)
            {
                a[i][j] = (rand() % MAX_VAL) + MIN_VAL;
                b[i][j] = (rand() % MAX_VAL) + MIN_VAL;
                c[i][j] = 0;
            }
        }
    }

    printf("Array A allocated and filled with random numbers ...\n");
    printf("Array B allocated and filled with random numbers ...\n");
    printf("Array C initialized ...\n");

    // Dot product the two arrays together into c
    #pragma acc kernels //HERE
    {
        for (i = 0; i < SIZE_XY; i++)
        {
            for (j = 0; j < SIZE_XY; j++)
            {
                for (k = 0; k < SIZE_XY; k++)
                {
                    c[i][j] = c[i][j] + a[i][k] * b[k][j];
                }
            }
        }
    }

    printf("Matrices multiplied ...\n");
    printf("The first three values of A x B are %d, %d, %d\n", c[0][0], c[0][1], c[0][2]);

    clock_t end = clock();

    time_spent += (double)(end - begin) / CLOCKS_PER_SEC;

    printf("Time elpased is %f seconds", time_spent);
}

当我 运行 在 PGI CMD 中执行以下命令时: pgcc -acc -ta=multicore -Minfo=all,accel matrixACC.c 我收到以下内容:

59, Complex loop carried dependence of a->,c->,b-> prevents parallelization
62, Complex loop carried dependence of a->,c->,b-> prevents parallelization
64, Complex loop carried dependence of a->,c->,b-> prevents parallelization
    Loop carried dependence due to exposed use of c[i1][i2] prevents parallelization 

我能否得到一些帮助来理解为什么会发生这种情况以及我如何能够并行化计算矩阵乘法的循环。

谢谢

编译器无法确定您的 3 个指针变量(abc)是否可能 alias 彼此。如果它们以某种方式彼此别名,则无法确定计算任何特定 c[i][j] 的独立性,并且无法正确并行化(任何)循环。

解决此问题的一种可能方法是通知编译器,您作为程序员保证(例如)第一个循环代表独立的 activity(在其各种迭代中)。您可以通过在第一个 for 循环语句之前立即放置 #pragma acc loop independent 来做到这一点。对于您在此处选择的矩阵大小(以及多核目标),这将为您提供大量公开的并行性。 (编译器仍然会发出 Minfo 关于其他循环的非并行化的消息,但这没关系,很可能。对于多核目标,拥有 1000 个并行工作项应该足以获得良好的性能)。

请注意,对于您选择的初始化范围,您的计算很容易溢出 int 存储空间。你会得到毫无意义的结果。

以下代码可以解决上述问题:

$ cat t1.c
#include <time.h>
#include <stdlib.h>
#include <stdio.h>
int main()
{
    // seed the random number generator
    srand(42);

    // Pick some arbitrary constraints to make the problem harder
    const int SIZE_XY = 1000;
    const int MIN_VAL = 5000;
    const int MAX_VAL = 7000000;

    int i, j, k; // iterators

    double time_spent = 0.0;
    clock_t begin = clock();

    // Generate two 2D arrays to be filled with random numbers
    // and an array, c, with all 0s
    int * restrict a[SIZE_XY];
    int * restrict b[SIZE_XY];
    int * restrict c[SIZE_XY];
    for (i = 0; i < SIZE_XY; i++)
    {
        a[i] = (int *)malloc(SIZE_XY * sizeof(int));
        b[i] = (int *)malloc(SIZE_XY * sizeof(int));
        c[i] = (int *)malloc(SIZE_XY * sizeof(int));
    }

    #pragma acc kernels
    {
        for (i = 0; i < SIZE_XY; i++)
        {
            for (j = 0; j < SIZE_XY; j++)
            {
                a[i][j] = 1; //(rand() % MAX_VAL) + MIN_VAL;
                b[i][j] = 1; //(rand() % MAX_VAL) + MIN_VAL;
                c[i][j] = 0;
            }
        }
    }

    printf("Array A allocated and filled with random numbers ...\n");
    printf("Array B allocated and filled with random numbers ...\n");
    printf("Array C initialized ...\n");

    // Dot product the two arrays together into c
    #pragma acc kernels //HERE
    {
        #pragma acc loop independent
        for (i = 0; i < SIZE_XY; i++)
        {
            for (j = 0; j < SIZE_XY; j++)
            {
                for (k = 0; k < SIZE_XY; k++)
                {
                    c[i][j] = c[i][j] + a[i][k] * b[k][j];
                }
            }
        }
    }

    printf("Matrices multiplied ...\n");
    printf("The first three values of A x B are %d, %d, %d\n", c[0][0], c[0][1], c[0][2]);

    clock_t end = clock();

    time_spent += (double)(end - begin) / CLOCKS_PER_SEC;

    printf("Time elpased is %f seconds", time_spent);
}
$ gcc -o t1 t1.c -std=c99
$ pgcc -acc -ta=multicore -Minfo=all,accel t1.c -o t1p
"t1.c", line 21: warning: use of a const variable in a constant expression is
          nonstandard in C
      int * restrict a[SIZE_XY];
                       ^

"t1.c", line 22: warning: use of a const variable in a constant expression is
          nonstandard in C
      int * restrict b[SIZE_XY];
                       ^

"t1.c", line 23: warning: use of a const variable in a constant expression is
          nonstandard in C
      int * restrict c[SIZE_XY];
                       ^

"t1.c", line 11: warning: variable "MIN_VAL" was declared but never referenced
      const int MIN_VAL = 5000;
                ^

"t1.c", line 12: warning: variable "MAX_VAL" was declared but never referenced
      const int MAX_VAL = 7000000;
                ^

main:
     33, Loop is parallelizable
         Generating Multicore code
         33, #pragma acc loop gang
     35, Loop is parallelizable
     52, Loop is parallelizable
         Generating Multicore code
         52, #pragma acc loop gang
     54, Complex loop carried dependence of a->,c->,b-> prevents parallelization
     56, Complex loop carried dependence of a->,c->,b-> prevents parallelization
         Loop carried dependence of c-> prevents parallelization
         Loop carried backward dependence of c-> prevents vectorization
$ time ./t1
Array A allocated and filled with random numbers ...
Array B allocated and filled with random numbers ...
Array C initialized ...
Matrices multiplied ...
The first three values of A x B are 1000, 1000, 1000
Time elpased is 9.010000 seconds
real    0m9.079s
user    0m9.019s
sys     0m0.061s
$ time ./t1p
Array A allocated and filled with random numbers ...
Array B allocated and filled with random numbers ...
Array C initialized ...
Matrices multiplied ...
The first three values of A x B are 1000, 1000, 1000
Time elpased is 20.140000 seconds
real    0m0.563s
user    0m20.053s
sys     0m0.132s
$

在我的机器上,使用 gcc 编译的代码大约需要 9 秒,而使用 PGI OpenACC 编译器编译的代码大约需要 0.5 秒。

顺便说一句,我个人通常会避免使用您选择的数组分配方法,因为不保证各种 malloc 操作会导致 adjacent/contiguous 分配。然而,对于 multicore 目标,代码可以正常工作。

为了解决这个问题,我建议您像这样对代码进行一些更改:

$ cat t1.c
#include <time.h>
#include <stdlib.h>
#include <stdio.h>

typedef int mt;
#define SIZE_XY 1000
typedef mt mat[SIZE_XY];

int main()
{
    // seed the random number generator
    srand(42);

    // Pick some arbitrary constraints to make the problem harder

    int i, j, k; // iterators

    double time_spent = 0.0;
    clock_t begin = clock();

    // Generate two 2D arrays to be filled with random numbers
    // and an array, c, with all 0s
    mat * restrict a;
    mat * restrict b;
    mat * restrict c;
    a = (mat *)malloc(SIZE_XY*SIZE_XY * sizeof(mt));
    b = (mat *)malloc(SIZE_XY*SIZE_XY * sizeof(mt));
    c = (mat *)malloc(SIZE_XY*SIZE_XY * sizeof(mt));

    #pragma acc kernels
    {
        for (i = 0; i < SIZE_XY; i++)
        {
            for (j = 0; j < SIZE_XY; j++)
            {
                a[i][j] = 1; //(rand() % MAX_VAL) + MIN_VAL;
                b[i][j] = 1; //(rand() % MAX_VAL) + MIN_VAL;
                c[i][j] = 0;
            }
        }
    }

    printf("Array A allocated and filled with random numbers ...\n");
    printf("Array B allocated and filled with random numbers ...\n");
    printf("Array C initialized ...\n");

    // Dot product the two arrays together into c
    #pragma acc kernels
    {
        for (i = 0; i < SIZE_XY; i++)
        {
            for (j = 0; j < SIZE_XY; j++)
            {
                for (k = 0; k < SIZE_XY; k++)
                {
                    c[i][j] = c[i][j] + a[i][k] * b[k][j];
                }
            }
        }
    }

    printf("Matrices multiplied ...\n");
    printf("The first three values of A x B are %d, %d, %d\n", c[0][0], c[0][1], c[0][2]);

    clock_t end = clock();

    time_spent += (double)(end - begin) / CLOCKS_PER_SEC;

    printf("Time elpased is %f seconds", time_spent);
}
$ gcc -o t1 t1.c -std=c99 -O3
$ pgcc -acc -ta=multicore -Minfo=all,accel t1.c -o t1p
main:
     32, Loop is parallelizable
         Generating Multicore code
         32, #pragma acc loop gang
     34, Loop is parallelizable
     51, Loop is parallelizable
         Generating Multicore code
         51, #pragma acc loop gang
     53, Loop is parallelizable
     55, Complex loop carried dependence of c-> prevents parallelization
         Loop carried dependence of c-> prevents parallelization
         Loop carried backward dependence of c-> prevents vectorization
$ time ./t1
Array A allocated and filled with random numbers ...
Array B allocated and filled with random numbers ...
Array C initialized ...
Matrices multiplied ...
The first three values of A x B are 1000, 1000, 1000
Time elpased is 0.650000 seconds
real    0m0.708s
user    0m0.663s
sys     0m0.047s
$ time ./t1p
Array A allocated and filled with random numbers ...
Array B allocated and filled with random numbers ...
Array C initialized ...
Matrices multiplied ...
The first three values of A x B are 1000, 1000, 1000
Time elpased is 17.510000 seconds
real    0m0.499s
user    0m17.466s
sys     0m0.093s
$

(gcc 4.8.5,pgcc 20.5-0,Xeon E5-2690 v2,总共 40 个内核)

有几个优点:

  1. 我们可以使用 c99 restrict 关键字将我们的意图传达给编译器,而无需使用额外的 pragma
  2. 这将是 abc 的连续分配,如果您决定从 multicore 切换到加速器目标
  3. OpenACC 编译器现在无需额外帮助即可处理前两个循环嵌套。
  4. gnu 编译器也喜欢这种级别的通信。在我的机器上,“普通”gnu 编译器 (gcc) 生成的代码运行速度几乎与 OpenACC 代码一样快。 (~0.7s 对比~0.5s)