使用 OpenACC 时输出为零
zero output when OpenACC is used
我使用 PGI 社区版 17.10 编译并运行以下代码。为什么添加 OpenACC 指令后输出就不正确了?
你能帮我看看为什么会这样吗?
提前致谢,
萨贾德
#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <assert.h>
#include <openacc.h>
#include<time.h>
#include <string.h>
#include <malloc.h>
#include <cuda_runtime_api.h>
#define NX 2
#define NY 2
#define NZ 2
/*
 * Reproducer from the question: assigns loop indices to C, D and E inside an
 * OpenACC kernels region, then writes every (C[i], D[j], E[k]) combination
 * to "BB-and-A.csv".
 *
 * NOTE(review): this is the version that misbehaves; the comments below mark
 * the problems visible in the code itself.
 */
int main(void)
{
/* NOTE(review): the loop indices are static and are also listed in both the
 * data copy clause and the private clause below. */
static int i, j, k;
/* NOTE(review): the arrays hold NX/NY/NZ elements, but every loop below runs
 * its index from 0 through N inclusive, so index N is one past the end. */
static double A[NX][NY][NZ]=2 ,B[NX][NY][NZ]=10.,C[NX]=10.,D[NY]=10.,E[NZ]=10.;
FILE *file;
/* NOTE(review): the fopen result is not checked before it is used. */
file = fopen("BB-and-A.csv", "w");
#pragma acc data copy( A ,B,C,D,E,i, j, k)
{
/* NOTE(review): C[i] is rewritten for every (j, k) pair, D[j] for every
 * (i, k) pair and E[k] for every (i, j) pair, so many iterations of the
 * nest store to the same element. */
#pragma acc kernels loop private(i, j, k)
for (i = 0; i <= NX; i++) {
for (j =0; j <= NY ; j++) {
for (k =0; k <= NZ ; k++) {
C[i]=i;
D[j]=j;
E[k]=k;
}
}
}
}
/* Host-side dump of all index combinations; uses the same inclusive bounds
 * as the nest above, so it also reads index N of each array. */
for (i = 0; i <= NX; i++) {
for (j =0; j <= NY ; j++) {
for (k =0; k <= NZ ; k++) {
fprintf(file, "%e, %e, %e \n", C[i], D[j],E[k] );
}
}
}
fclose(file);
}
这段代码有很多问题。
1) 你的数组边界不正确。循环从 0 运行到 <= N(共 N+1 次迭代),但数组只有 N 个元素,因此会越过数组末尾进行写入。
2) 您的循环不可并行化,因为多个循环迭代会写入同一个数组元素。要修复这一点,我会把它拆成三个独立的循环。
3) 循环索引变量不应设为静态。这将它们置于全局存储中,从而导致依赖性。虽然您可以通过将它们放在 private 子句中来解决此问题,但最好删除 static 并让编译器隐式地将它们私有化。
4) 无需复制循环索引变量。
尝试如下操作:
% cat test2.c
#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <assert.h>
#include <openacc.h>
#include<time.h>
#include <string.h>
#include <malloc.h>
// #include <cuda_runtime_api.h>
#define NX 2
#define NY 2
#define NZ 2
/*
 * Fill C, D and E with their own indices inside an OpenACC kernels region,
 * then write every (C[i], D[j], E[k]) combination to "BB-and-A.csv".
 *
 * Returns EXIT_SUCCESS on success, or EXIT_FAILURE when the output file
 * cannot be opened or closed cleanly.
 */
int main(void)
{
    /* Plain automatic indices: no static storage, no private/copy clause. */
    int i, j, k;

    /*
     * Each dimension has N+1 elements because every loop below runs its
     * index from 0 through N inclusive.  Standard C requires brace-enclosed
     * initializers for arrays; the listed value initializes the first
     * element and the remaining elements are zero.
     */
    static double A[NX + 1][NY + 1][NZ + 1] = {{{2.}}};
    static double B[NX + 1][NY + 1][NZ + 1] = {{{10.}}};
    static double C[NX + 1] = {10.};
    static double D[NY + 1] = {10.};
    static double E[NZ + 1] = {10.};

    FILE *file = fopen("BB-and-A.csv", "w");
    if (file == NULL) {
        perror("BB-and-A.csv");
        return EXIT_FAILURE;
    }

    /* A and B are not used by the kernels below; they stay in the data
     * clause so the example keeps the same set of copied arrays. */
    #pragma acc data copy(A,B,C,D,E)
    {
        /* Three independent loops: each element is written by exactly one
         * iteration, so no two iterations store to the same location. */
        #pragma acc kernels
        {
            for (i = 0; i <= NX; i++) {
                C[i] = i;
            }
            for (j = 0; j <= NY; j++) {
                D[j] = j;
            }
            for (k = 0; k <= NZ; k++) {
                E[k] = k;
            }
        }
    }

    /* Host-side dump of all index combinations, one CSV row per triple. */
    for (i = 0; i <= NX; i++) {
        for (j = 0; j <= NY; j++) {
            for (k = 0; k <= NZ; k++) {
                fprintf(file, "%e, %e, %e \n", C[i], D[j], E[k]);
            }
        }
    }

    /* fclose flushes buffered rows; a failure here means lost output. */
    if (fclose(file) != 0) {
        perror("BB-and-A.csv");
        return EXIT_FAILURE;
    }
    return EXIT_SUCCESS;
}
% pgcc test2.c -ta=tesla:cc60 -Minfo=accel
main:
23, Generating copy(A[:][:][:],B[:][:][:],C[:],E[:],D[:])
27, Loop is parallelizable
Accelerator kernel generated
Generating Tesla code
27, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
28, Loop is parallelizable
Accelerator kernel generated
Generating Tesla code
28, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
29, Loop is parallelizable
Accelerator kernel generated
Generating Tesla code
29, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
% a.out
% cat BB-and-A.csv
0.000000e+00, 0.000000e+00, 0.000000e+00
0.000000e+00, 0.000000e+00, 1.000000e+00
0.000000e+00, 0.000000e+00, 2.000000e+00
0.000000e+00, 1.000000e+00, 0.000000e+00
0.000000e+00, 1.000000e+00, 1.000000e+00
0.000000e+00, 1.000000e+00, 2.000000e+00
0.000000e+00, 2.000000e+00, 0.000000e+00
0.000000e+00, 2.000000e+00, 1.000000e+00
0.000000e+00, 2.000000e+00, 2.000000e+00
1.000000e+00, 0.000000e+00, 0.000000e+00
1.000000e+00, 0.000000e+00, 1.000000e+00
1.000000e+00, 0.000000e+00, 2.000000e+00
1.000000e+00, 1.000000e+00, 0.000000e+00
1.000000e+00, 1.000000e+00, 1.000000e+00
1.000000e+00, 1.000000e+00, 2.000000e+00
1.000000e+00, 2.000000e+00, 0.000000e+00
1.000000e+00, 2.000000e+00, 1.000000e+00
1.000000e+00, 2.000000e+00, 2.000000e+00
2.000000e+00, 0.000000e+00, 0.000000e+00
2.000000e+00, 0.000000e+00, 1.000000e+00
2.000000e+00, 0.000000e+00, 2.000000e+00
2.000000e+00, 1.000000e+00, 0.000000e+00
2.000000e+00, 1.000000e+00, 1.000000e+00
2.000000e+00, 1.000000e+00, 2.000000e+00
2.000000e+00, 2.000000e+00, 0.000000e+00
2.000000e+00, 2.000000e+00, 1.000000e+00
2.000000e+00, 2.000000e+00, 2.000000e+00
我使用 PGI 社区版 17.10 编译并运行以下代码。为什么添加 OpenACC 指令后输出就不正确了?你能帮我看看为什么会这样吗?提前致谢,萨贾德
#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <assert.h>
#include <openacc.h>
#include<time.h>
#include <string.h>
#include <malloc.h>
#include <cuda_runtime_api.h>
#define NX 2
#define NY 2
#define NZ 2
/*
 * Reproducer from the question: assigns loop indices to C, D and E inside an
 * OpenACC kernels region, then writes every (C[i], D[j], E[k]) combination
 * to "BB-and-A.csv".
 *
 * NOTE(review): this is the version that misbehaves; the comments below mark
 * the problems visible in the code itself.
 */
int main(void)
{
/* NOTE(review): the loop indices are static and are also listed in both the
 * data copy clause and the private clause below. */
static int i, j, k;
/* NOTE(review): the arrays hold NX/NY/NZ elements, but every loop below runs
 * its index from 0 through N inclusive, so index N is one past the end. */
static double A[NX][NY][NZ]=2 ,B[NX][NY][NZ]=10.,C[NX]=10.,D[NY]=10.,E[NZ]=10.;
FILE *file;
/* NOTE(review): the fopen result is not checked before it is used. */
file = fopen("BB-and-A.csv", "w");
#pragma acc data copy( A ,B,C,D,E,i, j, k)
{
/* NOTE(review): C[i] is rewritten for every (j, k) pair, D[j] for every
 * (i, k) pair and E[k] for every (i, j) pair, so many iterations of the
 * nest store to the same element. */
#pragma acc kernels loop private(i, j, k)
for (i = 0; i <= NX; i++) {
for (j =0; j <= NY ; j++) {
for (k =0; k <= NZ ; k++) {
C[i]=i;
D[j]=j;
E[k]=k;
}
}
}
}
/* Host-side dump of all index combinations; uses the same inclusive bounds
 * as the nest above, so it also reads index N of each array. */
for (i = 0; i <= NX; i++) {
for (j =0; j <= NY ; j++) {
for (k =0; k <= NZ ; k++) {
fprintf(file, "%e, %e, %e \n", C[i], D[j],E[k] );
}
}
}
fclose(file);
}
这段代码有很多问题。
1) 你的数组边界不正确。循环从 0 运行到 <= N(共 N+1 次迭代),但数组只有 N 个元素,因此会越过数组末尾进行写入。
2) 您的循环不可并行化,因为多个循环迭代会写入同一个数组元素。要修复这一点,我会把它拆成三个独立的循环。
3) 循环索引变量不应设为静态。这将它们置于全局存储中,从而导致依赖性。虽然您可以通过将它们放在 private 子句中来解决此问题,但最好删除 static 并让编译器隐式地将它们私有化。
4) 无需复制循环索引变量。
尝试如下操作:
% cat test2.c
#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <assert.h>
#include <openacc.h>
#include<time.h>
#include <string.h>
#include <malloc.h>
// #include <cuda_runtime_api.h>
#define NX 2
#define NY 2
#define NZ 2
/*
 * Fill C, D and E with their own indices inside an OpenACC kernels region,
 * then write every (C[i], D[j], E[k]) combination to "BB-and-A.csv".
 *
 * Returns EXIT_SUCCESS on success, or EXIT_FAILURE when the output file
 * cannot be opened or closed cleanly.
 */
int main(void)
{
    /* Plain automatic indices: no static storage, no private/copy clause. */
    int i, j, k;

    /*
     * Each dimension has N+1 elements because every loop below runs its
     * index from 0 through N inclusive.  Standard C requires brace-enclosed
     * initializers for arrays; the listed value initializes the first
     * element and the remaining elements are zero.
     */
    static double A[NX + 1][NY + 1][NZ + 1] = {{{2.}}};
    static double B[NX + 1][NY + 1][NZ + 1] = {{{10.}}};
    static double C[NX + 1] = {10.};
    static double D[NY + 1] = {10.};
    static double E[NZ + 1] = {10.};

    FILE *file = fopen("BB-and-A.csv", "w");
    if (file == NULL) {
        perror("BB-and-A.csv");
        return EXIT_FAILURE;
    }

    /* A and B are not used by the kernels below; they stay in the data
     * clause so the example keeps the same set of copied arrays. */
    #pragma acc data copy(A,B,C,D,E)
    {
        /* Three independent loops: each element is written by exactly one
         * iteration, so no two iterations store to the same location. */
        #pragma acc kernels
        {
            for (i = 0; i <= NX; i++) {
                C[i] = i;
            }
            for (j = 0; j <= NY; j++) {
                D[j] = j;
            }
            for (k = 0; k <= NZ; k++) {
                E[k] = k;
            }
        }
    }

    /* Host-side dump of all index combinations, one CSV row per triple. */
    for (i = 0; i <= NX; i++) {
        for (j = 0; j <= NY; j++) {
            for (k = 0; k <= NZ; k++) {
                fprintf(file, "%e, %e, %e \n", C[i], D[j], E[k]);
            }
        }
    }

    /* fclose flushes buffered rows; a failure here means lost output. */
    if (fclose(file) != 0) {
        perror("BB-and-A.csv");
        return EXIT_FAILURE;
    }
    return EXIT_SUCCESS;
}
% pgcc test2.c -ta=tesla:cc60 -Minfo=accel
main:
23, Generating copy(A[:][:][:],B[:][:][:],C[:],E[:],D[:])
27, Loop is parallelizable
Accelerator kernel generated
Generating Tesla code
27, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
28, Loop is parallelizable
Accelerator kernel generated
Generating Tesla code
28, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
29, Loop is parallelizable
Accelerator kernel generated
Generating Tesla code
29, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
% a.out
% cat BB-and-A.csv
0.000000e+00, 0.000000e+00, 0.000000e+00
0.000000e+00, 0.000000e+00, 1.000000e+00
0.000000e+00, 0.000000e+00, 2.000000e+00
0.000000e+00, 1.000000e+00, 0.000000e+00
0.000000e+00, 1.000000e+00, 1.000000e+00
0.000000e+00, 1.000000e+00, 2.000000e+00
0.000000e+00, 2.000000e+00, 0.000000e+00
0.000000e+00, 2.000000e+00, 1.000000e+00
0.000000e+00, 2.000000e+00, 2.000000e+00
1.000000e+00, 0.000000e+00, 0.000000e+00
1.000000e+00, 0.000000e+00, 1.000000e+00
1.000000e+00, 0.000000e+00, 2.000000e+00
1.000000e+00, 1.000000e+00, 0.000000e+00
1.000000e+00, 1.000000e+00, 1.000000e+00
1.000000e+00, 1.000000e+00, 2.000000e+00
1.000000e+00, 2.000000e+00, 0.000000e+00
1.000000e+00, 2.000000e+00, 1.000000e+00
1.000000e+00, 2.000000e+00, 2.000000e+00
2.000000e+00, 0.000000e+00, 0.000000e+00
2.000000e+00, 0.000000e+00, 1.000000e+00
2.000000e+00, 0.000000e+00, 2.000000e+00
2.000000e+00, 1.000000e+00, 0.000000e+00
2.000000e+00, 1.000000e+00, 1.000000e+00
2.000000e+00, 1.000000e+00, 2.000000e+00
2.000000e+00, 2.000000e+00, 0.000000e+00
2.000000e+00, 2.000000e+00, 1.000000e+00
2.000000e+00, 2.000000e+00, 2.000000e+00