OpenACC - 复杂循环携带 a->,c->,b-> 的依赖性阻止并行化
OpenACC - Complex loop carried dependence of a->,c->,b-> prevents parallelization
我正在使用 OpenACC 比较 CPU 上使用 PGI Community Edition 19.10(在 Windows 上)的并行矩阵乘法与非并行矩阵乘法运算的执行时间。我使用的代码是:
#include <time.h>
#include <stdlib.h>
int main()
{
// seed the random number generator
srand(42);
// Pick some arbitrary constraints to make the problem harder
const int SIZE_XY = 1000;
const int MIN_VAL = 5000;
const int MAX_VAL = 7000000;
int i, j, k; // iterators
double time_spent = 0.0;
clock_t begin = clock();
// Generate two 2D arrays to be filled with random numbers
// and an array, c, with all 0s
int *a[SIZE_XY];
int *b[SIZE_XY];
int *c[SIZE_XY];
for (i = 0; i < SIZE_XY; i++)
{
a[i] = (int *)malloc(SIZE_XY * sizeof(int));
b[i] = (int *)malloc(SIZE_XY * sizeof(int));
c[i] = (int *)malloc(SIZE_XY * sizeof(int));
}
#pragma acc kernels
{
for (i = 0; i < SIZE_XY; i++)
{
for (j = 0; j < SIZE_XY; j++)
{
a[i][j] = (rand() % MAX_VAL) + MIN_VAL;
b[i][j] = (rand() % MAX_VAL) + MIN_VAL;
c[i][j] = 0;
}
}
}
printf("Array A allocated and filled with random numbers ...\n");
printf("Array B allocated and filled with random numbers ...\n");
printf("Array C initialized ...\n");
// Dot product the two arrays together into c
#pragma acc kernels //HERE
{
for (i = 0; i < SIZE_XY; i++)
{
for (j = 0; j < SIZE_XY; j++)
{
for (k = 0; k < SIZE_XY; k++)
{
c[i][j] = c[i][j] + a[i][k] * b[k][j];
}
}
}
}
printf("Matrices multiplied ...\n");
printf("The first three values of A x B are %d, %d, %d\n", c[0][0], c[0][1], c[0][2]);
clock_t end = clock();
time_spent += (double)(end - begin) / CLOCKS_PER_SEC;
printf("Time elpased is %f seconds", time_spent);
}
当我 运行 在 PGI CMD 中执行以下命令时: pgcc -acc -ta=multicore -Minfo=all,accel matrixACC.c
我收到以下内容:
59, Complex loop carried dependence of a->,c->,b-> prevents parallelization
62, Complex loop carried dependence of a->,c->,b-> prevents parallelization
64, Complex loop carried dependence of a->,c->,b-> prevents parallelization
Loop carried dependence due to exposed use of c[i1][i2] prevents parallelization
我能否得到一些帮助来理解为什么会发生这种情况以及我如何能够并行化计算矩阵乘法的循环。
谢谢
编译器无法确定您的 3 个指针变量(a
、b
、c
)是否可能 alias 彼此。如果它们以某种方式彼此别名,则无法确定计算任何特定 c[i][j]
的独立性,并且无法正确并行化(任何)循环。
解决此问题的一种可能方法是通知编译器,您作为程序员保证(例如)第一个循环代表独立的 activity(在其各种迭代中)。您可以通过在第一个 for
循环语句之前立即放置 #pragma acc loop independent
来做到这一点。对于您在此处选择的矩阵大小(以及多核目标),这将为您提供大量公开的并行性。 (编译器仍然会发出 Minfo
关于其他循环的非并行化的消息,但这没关系,很可能。对于多核目标,拥有 1000 个并行工作项应该足以获得良好的性能)。
请注意,对于您选择的初始化范围,您的计算很容易溢出 int
存储空间。你会得到毫无意义的结果。
以下代码可以解决上述问题:
$ cat t1.c
#include <time.h>
#include <stdlib.h>
#include <stdio.h>
int main()
{
// seed the random number generator
srand(42);
// Pick some arbitrary constraints to make the problem harder
const int SIZE_XY = 1000;
const int MIN_VAL = 5000;
const int MAX_VAL = 7000000;
int i, j, k; // iterators
double time_spent = 0.0;
clock_t begin = clock();
// Generate two 2D arrays to be filled with random numbers
// and an array, c, with all 0s
int * restrict a[SIZE_XY];
int * restrict b[SIZE_XY];
int * restrict c[SIZE_XY];
for (i = 0; i < SIZE_XY; i++)
{
a[i] = (int *)malloc(SIZE_XY * sizeof(int));
b[i] = (int *)malloc(SIZE_XY * sizeof(int));
c[i] = (int *)malloc(SIZE_XY * sizeof(int));
}
#pragma acc kernels
{
for (i = 0; i < SIZE_XY; i++)
{
for (j = 0; j < SIZE_XY; j++)
{
a[i][j] = 1; //(rand() % MAX_VAL) + MIN_VAL;
b[i][j] = 1; //(rand() % MAX_VAL) + MIN_VAL;
c[i][j] = 0;
}
}
}
printf("Array A allocated and filled with random numbers ...\n");
printf("Array B allocated and filled with random numbers ...\n");
printf("Array C initialized ...\n");
// Dot product the two arrays together into c
#pragma acc kernels //HERE
{
#pragma acc loop independent
for (i = 0; i < SIZE_XY; i++)
{
for (j = 0; j < SIZE_XY; j++)
{
for (k = 0; k < SIZE_XY; k++)
{
c[i][j] = c[i][j] + a[i][k] * b[k][j];
}
}
}
}
printf("Matrices multiplied ...\n");
printf("The first three values of A x B are %d, %d, %d\n", c[0][0], c[0][1], c[0][2]);
clock_t end = clock();
time_spent += (double)(end - begin) / CLOCKS_PER_SEC;
printf("Time elpased is %f seconds", time_spent);
}
$ gcc -o t1 t1.c -std=c99
$ pgcc -acc -ta=multicore -Minfo=all,accel t1.c -o t1p
"t1.c", line 21: warning: use of a const variable in a constant expression is
nonstandard in C
int * restrict a[SIZE_XY];
^
"t1.c", line 22: warning: use of a const variable in a constant expression is
nonstandard in C
int * restrict b[SIZE_XY];
^
"t1.c", line 23: warning: use of a const variable in a constant expression is
nonstandard in C
int * restrict c[SIZE_XY];
^
"t1.c", line 11: warning: variable "MIN_VAL" was declared but never referenced
const int MIN_VAL = 5000;
^
"t1.c", line 12: warning: variable "MAX_VAL" was declared but never referenced
const int MAX_VAL = 7000000;
^
main:
33, Loop is parallelizable
Generating Multicore code
33, #pragma acc loop gang
35, Loop is parallelizable
52, Loop is parallelizable
Generating Multicore code
52, #pragma acc loop gang
54, Complex loop carried dependence of a->,c->,b-> prevents parallelization
56, Complex loop carried dependence of a->,c->,b-> prevents parallelization
Loop carried dependence of c-> prevents parallelization
Loop carried backward dependence of c-> prevents vectorization
$ time ./t1
Array A allocated and filled with random numbers ...
Array B allocated and filled with random numbers ...
Array C initialized ...
Matrices multiplied ...
The first three values of A x B are 1000, 1000, 1000
Time elpased is 9.010000 seconds
real 0m9.079s
user 0m9.019s
sys 0m0.061s
$ time ./t1p
Array A allocated and filled with random numbers ...
Array B allocated and filled with random numbers ...
Array C initialized ...
Matrices multiplied ...
The first three values of A x B are 1000, 1000, 1000
Time elpased is 20.140000 seconds
real 0m0.563s
user 0m20.053s
sys 0m0.132s
$
在我的机器上,使用 gcc 编译的代码大约需要 9 秒,而使用 PGI OpenACC 编译器编译的代码大约需要 0.5 秒。
顺便说一句,我个人通常会避免使用您选择的数组分配方法,因为不保证各种 malloc
操作会导致 adjacent/contiguous 分配。然而,对于 multicore
目标,代码可以正常工作。
为了解决这个问题,我建议您像这样对代码进行一些更改:
$ cat t1.c
#include <time.h>
#include <stdlib.h>
#include <stdio.h>
typedef int mt;
#define SIZE_XY 1000
typedef mt mat[SIZE_XY];
int main()
{
// seed the random number generator
srand(42);
// Pick some arbitrary constraints to make the problem harder
int i, j, k; // iterators
double time_spent = 0.0;
clock_t begin = clock();
// Generate two 2D arrays to be filled with random numbers
// and an array, c, with all 0s
mat * restrict a;
mat * restrict b;
mat * restrict c;
a = (mat *)malloc(SIZE_XY*SIZE_XY * sizeof(mt));
b = (mat *)malloc(SIZE_XY*SIZE_XY * sizeof(mt));
c = (mat *)malloc(SIZE_XY*SIZE_XY * sizeof(mt));
#pragma acc kernels
{
for (i = 0; i < SIZE_XY; i++)
{
for (j = 0; j < SIZE_XY; j++)
{
a[i][j] = 1; //(rand() % MAX_VAL) + MIN_VAL;
b[i][j] = 1; //(rand() % MAX_VAL) + MIN_VAL;
c[i][j] = 0;
}
}
}
printf("Array A allocated and filled with random numbers ...\n");
printf("Array B allocated and filled with random numbers ...\n");
printf("Array C initialized ...\n");
// Dot product the two arrays together into c
#pragma acc kernels
{
for (i = 0; i < SIZE_XY; i++)
{
for (j = 0; j < SIZE_XY; j++)
{
for (k = 0; k < SIZE_XY; k++)
{
c[i][j] = c[i][j] + a[i][k] * b[k][j];
}
}
}
}
printf("Matrices multiplied ...\n");
printf("The first three values of A x B are %d, %d, %d\n", c[0][0], c[0][1], c[0][2]);
clock_t end = clock();
time_spent += (double)(end - begin) / CLOCKS_PER_SEC;
printf("Time elpased is %f seconds", time_spent);
}
$ gcc -o t1 t1.c -std=c99 -O3
$ pgcc -acc -ta=multicore -Minfo=all,accel t1.c -o t1p
main:
32, Loop is parallelizable
Generating Multicore code
32, #pragma acc loop gang
34, Loop is parallelizable
51, Loop is parallelizable
Generating Multicore code
51, #pragma acc loop gang
53, Loop is parallelizable
55, Complex loop carried dependence of c-> prevents parallelization
Loop carried dependence of c-> prevents parallelization
Loop carried backward dependence of c-> prevents vectorization
$ time ./t1
Array A allocated and filled with random numbers ...
Array B allocated and filled with random numbers ...
Array C initialized ...
Matrices multiplied ...
The first three values of A x B are 1000, 1000, 1000
Time elpased is 0.650000 seconds
real 0m0.708s
user 0m0.663s
sys 0m0.047s
$ time ./t1p
Array A allocated and filled with random numbers ...
Array B allocated and filled with random numbers ...
Array C initialized ...
Matrices multiplied ...
The first three values of A x B are 1000, 1000, 1000
Time elpased is 17.510000 seconds
real 0m0.499s
user 0m17.466s
sys 0m0.093s
$
(gcc 4.8.5,pgcc 20.5-0,Xeon E5-2690 v2,总共 40 个内核)
有几个优点:
- 我们可以使用 c99
restrict
关键字将我们的意图传达给编译器,而无需使用额外的 pragma
- 这将是
a
、b
和 c
的连续分配,如果您决定从 multicore
切换到加速器目标
- OpenACC 编译器现在无需额外帮助即可处理前两个循环嵌套。
- gnu 编译器也喜欢这种级别的通信。在我的机器上,“普通”gnu 编译器 (
gcc
) 生成的代码运行速度几乎与 OpenACC 代码一样快。 (~0.7s 对比~0.5s)
我正在使用 OpenACC 比较 CPU 上使用 PGI Community Edition 19.10(在 Windows 上)的并行矩阵乘法与非并行矩阵乘法运算的执行时间。我使用的代码是:
#include <time.h>
#include <stdlib.h>
int main()
{
// seed the random number generator
srand(42);
// Pick some arbitrary constraints to make the problem harder
const int SIZE_XY = 1000;
const int MIN_VAL = 5000;
const int MAX_VAL = 7000000;
int i, j, k; // iterators
double time_spent = 0.0;
clock_t begin = clock();
// Generate two 2D arrays to be filled with random numbers
// and an array, c, with all 0s
int *a[SIZE_XY];
int *b[SIZE_XY];
int *c[SIZE_XY];
for (i = 0; i < SIZE_XY; i++)
{
a[i] = (int *)malloc(SIZE_XY * sizeof(int));
b[i] = (int *)malloc(SIZE_XY * sizeof(int));
c[i] = (int *)malloc(SIZE_XY * sizeof(int));
}
#pragma acc kernels
{
for (i = 0; i < SIZE_XY; i++)
{
for (j = 0; j < SIZE_XY; j++)
{
a[i][j] = (rand() % MAX_VAL) + MIN_VAL;
b[i][j] = (rand() % MAX_VAL) + MIN_VAL;
c[i][j] = 0;
}
}
}
printf("Array A allocated and filled with random numbers ...\n");
printf("Array B allocated and filled with random numbers ...\n");
printf("Array C initialized ...\n");
// Dot product the two arrays together into c
#pragma acc kernels //HERE
{
for (i = 0; i < SIZE_XY; i++)
{
for (j = 0; j < SIZE_XY; j++)
{
for (k = 0; k < SIZE_XY; k++)
{
c[i][j] = c[i][j] + a[i][k] * b[k][j];
}
}
}
}
printf("Matrices multiplied ...\n");
printf("The first three values of A x B are %d, %d, %d\n", c[0][0], c[0][1], c[0][2]);
clock_t end = clock();
time_spent += (double)(end - begin) / CLOCKS_PER_SEC;
printf("Time elpased is %f seconds", time_spent);
}
当我 运行 在 PGI CMD 中执行以下命令时: pgcc -acc -ta=multicore -Minfo=all,accel matrixACC.c
我收到以下内容:
59, Complex loop carried dependence of a->,c->,b-> prevents parallelization
62, Complex loop carried dependence of a->,c->,b-> prevents parallelization
64, Complex loop carried dependence of a->,c->,b-> prevents parallelization
Loop carried dependence due to exposed use of c[i1][i2] prevents parallelization
我能否得到一些帮助来理解为什么会发生这种情况以及我如何能够并行化计算矩阵乘法的循环。
谢谢
编译器无法确定您的 3 个指针变量(a
、b
、c
)是否可能 alias 彼此。如果它们以某种方式彼此别名,则无法确定计算任何特定 c[i][j]
的独立性,并且无法正确并行化(任何)循环。
解决此问题的一种可能方法是通知编译器,您作为程序员保证(例如)第一个循环代表独立的 activity(在其各种迭代中)。您可以通过在第一个 for
循环语句之前立即放置 #pragma acc loop independent
来做到这一点。对于您在此处选择的矩阵大小(以及多核目标),这将为您提供大量公开的并行性。 (编译器仍然会发出 Minfo
关于其他循环的非并行化的消息,但这没关系,很可能。对于多核目标,拥有 1000 个并行工作项应该足以获得良好的性能)。
请注意,对于您选择的初始化范围,您的计算很容易溢出 int
存储空间。你会得到毫无意义的结果。
以下代码可以解决上述问题:
$ cat t1.c
#include <time.h>
#include <stdlib.h>
#include <stdio.h>
int main()
{
// seed the random number generator
srand(42);
// Pick some arbitrary constraints to make the problem harder
const int SIZE_XY = 1000;
const int MIN_VAL = 5000;
const int MAX_VAL = 7000000;
int i, j, k; // iterators
double time_spent = 0.0;
clock_t begin = clock();
// Generate two 2D arrays to be filled with random numbers
// and an array, c, with all 0s
int * restrict a[SIZE_XY];
int * restrict b[SIZE_XY];
int * restrict c[SIZE_XY];
for (i = 0; i < SIZE_XY; i++)
{
a[i] = (int *)malloc(SIZE_XY * sizeof(int));
b[i] = (int *)malloc(SIZE_XY * sizeof(int));
c[i] = (int *)malloc(SIZE_XY * sizeof(int));
}
#pragma acc kernels
{
for (i = 0; i < SIZE_XY; i++)
{
for (j = 0; j < SIZE_XY; j++)
{
a[i][j] = 1; //(rand() % MAX_VAL) + MIN_VAL;
b[i][j] = 1; //(rand() % MAX_VAL) + MIN_VAL;
c[i][j] = 0;
}
}
}
printf("Array A allocated and filled with random numbers ...\n");
printf("Array B allocated and filled with random numbers ...\n");
printf("Array C initialized ...\n");
// Dot product the two arrays together into c
#pragma acc kernels //HERE
{
#pragma acc loop independent
for (i = 0; i < SIZE_XY; i++)
{
for (j = 0; j < SIZE_XY; j++)
{
for (k = 0; k < SIZE_XY; k++)
{
c[i][j] = c[i][j] + a[i][k] * b[k][j];
}
}
}
}
printf("Matrices multiplied ...\n");
printf("The first three values of A x B are %d, %d, %d\n", c[0][0], c[0][1], c[0][2]);
clock_t end = clock();
time_spent += (double)(end - begin) / CLOCKS_PER_SEC;
printf("Time elpased is %f seconds", time_spent);
}
$ gcc -o t1 t1.c -std=c99
$ pgcc -acc -ta=multicore -Minfo=all,accel t1.c -o t1p
"t1.c", line 21: warning: use of a const variable in a constant expression is
nonstandard in C
int * restrict a[SIZE_XY];
^
"t1.c", line 22: warning: use of a const variable in a constant expression is
nonstandard in C
int * restrict b[SIZE_XY];
^
"t1.c", line 23: warning: use of a const variable in a constant expression is
nonstandard in C
int * restrict c[SIZE_XY];
^
"t1.c", line 11: warning: variable "MIN_VAL" was declared but never referenced
const int MIN_VAL = 5000;
^
"t1.c", line 12: warning: variable "MAX_VAL" was declared but never referenced
const int MAX_VAL = 7000000;
^
main:
33, Loop is parallelizable
Generating Multicore code
33, #pragma acc loop gang
35, Loop is parallelizable
52, Loop is parallelizable
Generating Multicore code
52, #pragma acc loop gang
54, Complex loop carried dependence of a->,c->,b-> prevents parallelization
56, Complex loop carried dependence of a->,c->,b-> prevents parallelization
Loop carried dependence of c-> prevents parallelization
Loop carried backward dependence of c-> prevents vectorization
$ time ./t1
Array A allocated and filled with random numbers ...
Array B allocated and filled with random numbers ...
Array C initialized ...
Matrices multiplied ...
The first three values of A x B are 1000, 1000, 1000
Time elpased is 9.010000 seconds
real 0m9.079s
user 0m9.019s
sys 0m0.061s
$ time ./t1p
Array A allocated and filled with random numbers ...
Array B allocated and filled with random numbers ...
Array C initialized ...
Matrices multiplied ...
The first three values of A x B are 1000, 1000, 1000
Time elpased is 20.140000 seconds
real 0m0.563s
user 0m20.053s
sys 0m0.132s
$
在我的机器上,使用 gcc 编译的代码大约需要 9 秒,而使用 PGI OpenACC 编译器编译的代码大约需要 0.5 秒。
顺便说一句,我个人通常会避免使用您选择的数组分配方法,因为不保证各种 malloc
操作会导致 adjacent/contiguous 分配。然而,对于 multicore
目标,代码可以正常工作。
为了解决这个问题,我建议您像这样对代码进行一些更改:
$ cat t1.c
#include <time.h>
#include <stdlib.h>
#include <stdio.h>
typedef int mt;
#define SIZE_XY 1000
typedef mt mat[SIZE_XY];
int main()
{
// seed the random number generator
srand(42);
// Pick some arbitrary constraints to make the problem harder
int i, j, k; // iterators
double time_spent = 0.0;
clock_t begin = clock();
// Generate two 2D arrays to be filled with random numbers
// and an array, c, with all 0s
mat * restrict a;
mat * restrict b;
mat * restrict c;
a = (mat *)malloc(SIZE_XY*SIZE_XY * sizeof(mt));
b = (mat *)malloc(SIZE_XY*SIZE_XY * sizeof(mt));
c = (mat *)malloc(SIZE_XY*SIZE_XY * sizeof(mt));
#pragma acc kernels
{
for (i = 0; i < SIZE_XY; i++)
{
for (j = 0; j < SIZE_XY; j++)
{
a[i][j] = 1; //(rand() % MAX_VAL) + MIN_VAL;
b[i][j] = 1; //(rand() % MAX_VAL) + MIN_VAL;
c[i][j] = 0;
}
}
}
printf("Array A allocated and filled with random numbers ...\n");
printf("Array B allocated and filled with random numbers ...\n");
printf("Array C initialized ...\n");
// Dot product the two arrays together into c
#pragma acc kernels
{
for (i = 0; i < SIZE_XY; i++)
{
for (j = 0; j < SIZE_XY; j++)
{
for (k = 0; k < SIZE_XY; k++)
{
c[i][j] = c[i][j] + a[i][k] * b[k][j];
}
}
}
}
printf("Matrices multiplied ...\n");
printf("The first three values of A x B are %d, %d, %d\n", c[0][0], c[0][1], c[0][2]);
clock_t end = clock();
time_spent += (double)(end - begin) / CLOCKS_PER_SEC;
printf("Time elpased is %f seconds", time_spent);
}
$ gcc -o t1 t1.c -std=c99 -O3
$ pgcc -acc -ta=multicore -Minfo=all,accel t1.c -o t1p
main:
32, Loop is parallelizable
Generating Multicore code
32, #pragma acc loop gang
34, Loop is parallelizable
51, Loop is parallelizable
Generating Multicore code
51, #pragma acc loop gang
53, Loop is parallelizable
55, Complex loop carried dependence of c-> prevents parallelization
Loop carried dependence of c-> prevents parallelization
Loop carried backward dependence of c-> prevents vectorization
$ time ./t1
Array A allocated and filled with random numbers ...
Array B allocated and filled with random numbers ...
Array C initialized ...
Matrices multiplied ...
The first three values of A x B are 1000, 1000, 1000
Time elpased is 0.650000 seconds
real 0m0.708s
user 0m0.663s
sys 0m0.047s
$ time ./t1p
Array A allocated and filled with random numbers ...
Array B allocated and filled with random numbers ...
Array C initialized ...
Matrices multiplied ...
The first three values of A x B are 1000, 1000, 1000
Time elpased is 17.510000 seconds
real 0m0.499s
user 0m17.466s
sys 0m0.093s
$
(gcc 4.8.5,pgcc 20.5-0,Xeon E5-2690 v2,总共 40 个内核)
有几个优点:
- 我们可以使用 c99
restrict
关键字将我们的意图传达给编译器,而无需使用额外的 pragma - 这将是
a
、b
和c
的连续分配,如果您决定从multicore
切换到加速器目标 - OpenACC 编译器现在无需额外帮助即可处理前两个循环嵌套。
- gnu 编译器也喜欢这种级别的通信。在我的机器上,“普通”gnu 编译器 (
gcc
) 生成的代码运行速度几乎与 OpenACC 代码一样快。 (~0.7s 对比~0.5s)