如何使用 PGI 编译器声明带有 C/OpenACC 的全局动态数组
How to declare a global dynamic array with C/OpenACC with PGI compiler
我正在尝试运行一个简单的测试用例,其中动态分配的数组 A 在外部定义,并使用 OpenACC 上传到 GPU。全部使用 PGI 编译器。
我的 header.h
文件:
extern int *A; /* global pointer; the definition lives in header.c */
#pragma acc declare create(A) /* declares the pointer A itself on the device, not the array it will point to */
然后,我的 header.c
实现:
int *A; /* definition of the global pointer declared extern in header.h */
#pragma acc declare copyin(A) /* second declare for A; header.h already carries declare create(A) */
然后,在 main.c
我有
#include "header.h"
int main(int argc, char* argv[]){ /* Fills A[0..N-1] with 0..N-1 and sums it in an OpenACC parallel loop; N comes from argv[1]. */
printf("main() start\n");
int sum=0;
int N=0; /* remains 0 when no argument is supplied */
if(argc==1){
printf("usage: ./main.exe N"); /* prints the usage text but does not exit; execution continues with N == 0 */
}else{
N=atoi(argv[1]);
}
printf("N =%d\n", N);
A=(int*)malloc(N*sizeof(int)); /* host allocation only; nothing in this function creates A[0:N] on the device. NOTE(review): the quoted PGC-W-0155 warning presumably refers to this call lacking a visible malloc prototype - confirm */
for(int i=0;i<N;i++){A[i]=i;} /* initialise the host array */
printf("almost data region\n");
#pragma acc data copy(sum)
{ /* data region that lists only sum */
printf("inside data region\n");
#pragma acc update device(A[0:N]) /* the runtime log reports the fatal error for this clause: A not found on the device */
#pragma acc parallel loop reduction(+:sum)
for(int i=0;i<N;i++){
sum+=A[i];
}
}
printf("sum = %d\n",sum);
}
我使用以下命令编译代码:
$ cc -g -lnvToolsExt -O2 -acc -ta=tesla:cc60 -c11 -mp -Minfo -Mlarge_arrays -c -o header.o header.c
$ cc -g -lnvToolsExt -O2 -acc -ta=tesla:cc60 -c11 -mp -Minfo -Mlarge_arrays -c -o main.o main.c
PGC-W-0155-Pointer value created from a nonlong integral type (main.c: 12)
main:
13, Generated 2 alternate versions of the loop
Generated vector simd code for the loop
17, Generating copy(sum)
21, Generating update device(A[:N])
Accelerator kernel generated
Generating Tesla code
21, Generating reduction(+:sum)
22, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
PGC/x86-64 Linux 17.5-0: compilation completed with warnings
$ cc -g -lnvToolsExt -O2 -acc -ta=tesla:cc60 -c11 -mp -Minfo -Mlarge_arrays header.o main.o -o main.exe
我的PGI
编译器版本是:
$ cc -v
Export PGI=/opt/pgi/17.5.0
执行代码:
$ ACC_NOTIFY=3 srun cuda-memcheck --show-backtrace yes main.exe 10000
upload CUDA data file=/scratch/snx3000/ragagnin/2017/prova/main.c function=main line=17 device=0 threadid=1 variable=A bytes=8
upload CUDA data file=/scratch/snx3000/ragagnin/2017/prova/main.c function=main line=17 device=0 threadid=1 variable=sum bytes=4
Present table dump for device[1]: NVIDIA Tesla GPU 0, compute capability 6.0, threadid=1
host:0x606780 device:0x10216200000 size:8 presentcount:0+1 line:-1 name:A
host:0x7fffffff67ac device:0x1021a400000 size:4 presentcount:1+0 line:17 name:sum
allocated block device:0x1021a400000 size:512 thread:1
FATAL ERROR: data in update device clause was not found on device 1: name=A
file:/scratch/snx3000/ragagnin/2017/prova/main.c main line:21
main() start
N =10000
almost data region
inside data region
========= CUDA-MEMCHECK
========= Program hit CUDA_ERROR_INVALID_DEVICE (error 101) due to "invalid device ordinal" on CUDA API call to cuDevicePrimaryCtxRetain.
========= Saved host backtrace up to driver entry point at error
========= Host Frame:/opt/cray/nvidia/default/lib64/libcuda.so (cuDevicePrimaryCtxRetain + 0x15c) [0x1e497c]
========= Host Frame:/opt/pgi/17.5.0/linux86-64/17.5/lib/libaccnmp.so (__pgi_uacc_cuda_initdev + 0x962) [0x140e1]
========= Host Frame:/opt/pgi/17.5.0/linux86-64/17.5/lib/libaccgmp.so (__pgi_uacc_enumerate + 0x173) [0x12e31]
========= Host Frame:/opt/pgi/17.5.0/linux86-64/17.5/lib/libaccgmp.so (__pgi_uacc_initialize + 0x9b) [0x1340d]
========= Host Frame:/opt/pgi/17.5.0/linux86-64/17.5/lib/libaccgmp.so (__pgi_uacc_dataenterstart + 0x50) [0x9de1]
========= Host Frame:main.exe [0x16a5]
========= Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xf5) [0x206e5]
========= Host Frame:main.exe [0x11c9]
=========
========= ERROR SUMMARY: 1 error
srun: error: nid03948: task 0: Exited with exit code 1
srun: Terminating job step 4066800.15
我认为问题在于 PGI 编译器只发送了 variable=A bytes=8,因而忽略了我发送 A[0:N] 的请求。
那么,如何使用 PGI 编译器 C/OpenACC 声明全局动态数组?
当您将 "declare" 用于指针时,您创建的是一个全局设备指针,而不是该指针所指向的数组。因此,当您尝试更新数组时,它在设备上并不存在,这就是出现运行时错误的原因。
要修复这个问题,您还需要把数组本身加入数据区域,例如使用如下所示的 "enter data" 指令。当您把数组放入数据区域时,运行时除了为数组分配空间之外,还会回过头来将其 "attach"(附加)到 "A",即用正确的设备指针值填写 "A" 的设备副本。
您还应该在计算区域上加上 "present(A)",以告诉编译器 "A" 已经存在于设备上。
请注意,不需要第二个 "declare copyin"。此外,"create" 创建的设备数据不会被初始化,而 "copyin" 会用主机上的值初始化变量。但由于主机上的值是一个主机指针,它在设备上仍然是无效值。所以这样写不一定是错的,只是没有必要。
% cat header.h
#include <stdio.h>
#include <stdlib.h>
extern int *A; /* global pointer; the definition lives in header.c */
#pragma acc declare create(A) /* declares the pointer A itself on the device, not the array it will point to */
% cat header.c
#include <header.h>
int *A; /* definition of the global pointer declared extern in header.h */
% cat test.c
#include "header.h"
int main(int argc, char* argv[]){ /* Fills A[0..N-1] with 0..N-1 and sums it in an OpenACC parallel loop; N comes from argv[1]. */
printf("main() start\n");
int sum=0;
int N=0; /* remains 0 when no argument is supplied */
if(argc==1){
printf("usage: ./main.exe N"); /* prints the usage text but does not exit; execution continues with N == 0 */
}else{
N=atoi(argv[1]);
}
printf("N =%d\n", N);
A=(int*)malloc(N*sizeof(int)); /* host allocation; the result is not checked */
#pragma acc enter data create(A[0:N]) /* puts the N-element array itself into device data; create does not initialise the device copy */
for(int i=0;i<N;i++){A[i]=i;} /* initialise the host array */
printf("almost data region\n");
#pragma acc data copy(sum)
{ /* data region that lists only sum */
printf("inside data region\n");
#pragma acc update device(A[0:N]) /* send the host values of A[0:N] to the device copy */
#pragma acc parallel loop present(A) reduction(+:sum) /* present(A): tells the compiler A is already on the device */
for(int i=0;i<N;i++){
sum+=A[i];
}
}
printf("sum = %d\n",sum);
#pragma acc exit data delete(A) /* drop A from device data before the host buffer is freed; the compiler log shows this as delete(A[:1]) */
free(A);
exit(0);
}
% pgcc -I./ test.c header.c -ta=tesla:cc60 -Minfo=accel
test.c:
main:
13, Generating enter data create(A[:N])
17, Generating copy(sum)
21, Generating update device(A[:N])
Accelerator kernel generated
Generating Tesla code
21, Generating reduction(+:sum)
22, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
27, Generating exit data delete(A[:1])
header.c:
% setenv PGI_ACC_TIME 1
% a.out 1024
main() start
N =1024
almost data region
inside data region
sum = 523776
Accelerator Kernel Timing data
test.c
main NVIDIA devicenum=0
time(us): 124
13: upload reached 1 time
13: data copyin transfers: 1
device time(us): total=33 max=33 min=33 avg=33
13: data region reached 1 time
13: data copyin transfers: 1
device time(us): total=9 max=9 min=9 avg=9
17: data region reached 2 times
17: data copyin transfers: 1
device time(us): total=33 max=33 min=33 avg=33
26: data copyout transfers: 1
device time(us): total=22 max=22 min=22 avg=22
21: update directive reached 1 time
21: data copyin transfers: 1
device time(us): total=10 max=10 min=10 avg=10
21: compute region reached 1 time
21: kernel launched 1 time
grid: [8] block: [128]
device time(us): total=4 max=4 min=4 avg=4
elapsed time(us): total=589 max=589 min=589 avg=589
21: reduction kernel launched 1 time
grid: [1] block: [256]
device time(us): total=4 max=4 min=4 avg=4
elapsed time(us): total=27 max=27 min=27 avg=27
27: data region reached 1 time
27: data copyin transfers: 1
device time(us): total=9 max=9 min=9 avg=9
我正在尝试运行一个简单的测试用例,其中动态分配的数组 A 在外部定义,并使用 OpenACC 上传到 GPU。全部使用 PGI 编译器。
我的 header.h
文件:
extern int *A; /* global pointer; the definition lives in header.c */
#pragma acc declare create(A) /* declares the pointer A itself on the device, not the array it will point to */
然后,我的 header.c
实现:
int *A; /* definition of the global pointer declared extern in header.h */
#pragma acc declare copyin(A) /* second declare for A; header.h already carries declare create(A) */
然后,在 main.c
我有
#include "header.h"
int main(int argc, char* argv[]){ /* Fills A[0..N-1] with 0..N-1 and sums it in an OpenACC parallel loop; N comes from argv[1]. */
printf("main() start\n");
int sum=0;
int N=0; /* remains 0 when no argument is supplied */
if(argc==1){
printf("usage: ./main.exe N"); /* prints the usage text but does not exit; execution continues with N == 0 */
}else{
N=atoi(argv[1]);
}
printf("N =%d\n", N);
A=(int*)malloc(N*sizeof(int)); /* host allocation only; nothing in this function creates A[0:N] on the device. NOTE(review): the quoted PGC-W-0155 warning presumably refers to this call lacking a visible malloc prototype - confirm */
for(int i=0;i<N;i++){A[i]=i;} /* initialise the host array */
printf("almost data region\n");
#pragma acc data copy(sum)
{ /* data region that lists only sum */
printf("inside data region\n");
#pragma acc update device(A[0:N]) /* the runtime log reports the fatal error for this clause: A not found on the device */
#pragma acc parallel loop reduction(+:sum)
for(int i=0;i<N;i++){
sum+=A[i];
}
}
printf("sum = %d\n",sum);
}
我使用以下命令编译代码:
$ cc -g -lnvToolsExt -O2 -acc -ta=tesla:cc60 -c11 -mp -Minfo -Mlarge_arrays -c -o header.o header.c
$ cc -g -lnvToolsExt -O2 -acc -ta=tesla:cc60 -c11 -mp -Minfo -Mlarge_arrays -c -o main.o main.c
PGC-W-0155-Pointer value created from a nonlong integral type (main.c: 12)
main:
13, Generated 2 alternate versions of the loop
Generated vector simd code for the loop
17, Generating copy(sum)
21, Generating update device(A[:N])
Accelerator kernel generated
Generating Tesla code
21, Generating reduction(+:sum)
22, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
PGC/x86-64 Linux 17.5-0: compilation completed with warnings
$ cc -g -lnvToolsExt -O2 -acc -ta=tesla:cc60 -c11 -mp -Minfo -Mlarge_arrays header.o main.o -o main.exe
我的PGI
编译器版本是:
$ cc -v
Export PGI=/opt/pgi/17.5.0
执行代码:
$ ACC_NOTIFY=3 srun cuda-memcheck --show-backtrace yes main.exe 10000
upload CUDA data file=/scratch/snx3000/ragagnin/2017/prova/main.c function=main line=17 device=0 threadid=1 variable=A bytes=8
upload CUDA data file=/scratch/snx3000/ragagnin/2017/prova/main.c function=main line=17 device=0 threadid=1 variable=sum bytes=4
Present table dump for device[1]: NVIDIA Tesla GPU 0, compute capability 6.0, threadid=1
host:0x606780 device:0x10216200000 size:8 presentcount:0+1 line:-1 name:A
host:0x7fffffff67ac device:0x1021a400000 size:4 presentcount:1+0 line:17 name:sum
allocated block device:0x1021a400000 size:512 thread:1
FATAL ERROR: data in update device clause was not found on device 1: name=A
file:/scratch/snx3000/ragagnin/2017/prova/main.c main line:21
main() start
N =10000
almost data region
inside data region
========= CUDA-MEMCHECK
========= Program hit CUDA_ERROR_INVALID_DEVICE (error 101) due to "invalid device ordinal" on CUDA API call to cuDevicePrimaryCtxRetain.
========= Saved host backtrace up to driver entry point at error
========= Host Frame:/opt/cray/nvidia/default/lib64/libcuda.so (cuDevicePrimaryCtxRetain + 0x15c) [0x1e497c]
========= Host Frame:/opt/pgi/17.5.0/linux86-64/17.5/lib/libaccnmp.so (__pgi_uacc_cuda_initdev + 0x962) [0x140e1]
========= Host Frame:/opt/pgi/17.5.0/linux86-64/17.5/lib/libaccgmp.so (__pgi_uacc_enumerate + 0x173) [0x12e31]
========= Host Frame:/opt/pgi/17.5.0/linux86-64/17.5/lib/libaccgmp.so (__pgi_uacc_initialize + 0x9b) [0x1340d]
========= Host Frame:/opt/pgi/17.5.0/linux86-64/17.5/lib/libaccgmp.so (__pgi_uacc_dataenterstart + 0x50) [0x9de1]
========= Host Frame:main.exe [0x16a5]
========= Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xf5) [0x206e5]
========= Host Frame:main.exe [0x11c9]
=========
========= ERROR SUMMARY: 1 error
srun: error: nid03948: task 0: Exited with exit code 1
srun: Terminating job step 4066800.15
我认为问题在于 PGI 编译器只发送了 variable=A bytes=8,因而忽略了我发送 A[0:N] 的请求。
那么,如何使用 PGI 编译器 C/OpenACC 声明全局动态数组?
当您将 "declare" 用于指针时,您创建的是一个全局设备指针,而不是该指针所指向的数组。因此,当您尝试更新数组时,它在设备上并不存在,这就是出现运行时错误的原因。
要修复这个问题,您还需要把数组本身加入数据区域,例如使用如下所示的 "enter data" 指令。当您把数组放入数据区域时,运行时除了为数组分配空间之外,还会回过头来将其 "attach"(附加)到 "A",即用正确的设备指针值填写 "A" 的设备副本。
您还应该在计算区域上加上 "present(A)",以告诉编译器 "A" 已经存在于设备上。
请注意,不需要第二个 "declare copyin"。此外,"create" 创建的设备数据不会被初始化,而 "copyin" 会用主机上的值初始化变量。但由于主机上的值是一个主机指针,它在设备上仍然是无效值。所以这样写不一定是错的,只是没有必要。
% cat header.h
#include <stdio.h>
#include <stdlib.h>
extern int *A; /* global pointer; the definition lives in header.c */
#pragma acc declare create(A) /* declares the pointer A itself on the device, not the array it will point to */
% cat header.c
#include <header.h>
int *A; /* definition of the global pointer declared extern in header.h */
% cat test.c
#include "header.h"
int main(int argc, char* argv[]){ /* Fills A[0..N-1] with 0..N-1 and sums it in an OpenACC parallel loop; N comes from argv[1]. */
printf("main() start\n");
int sum=0;
int N=0; /* remains 0 when no argument is supplied */
if(argc==1){
printf("usage: ./main.exe N"); /* prints the usage text but does not exit; execution continues with N == 0 */
}else{
N=atoi(argv[1]);
}
printf("N =%d\n", N);
A=(int*)malloc(N*sizeof(int)); /* host allocation; the result is not checked */
#pragma acc enter data create(A[0:N]) /* puts the N-element array itself into device data; create does not initialise the device copy */
for(int i=0;i<N;i++){A[i]=i;} /* initialise the host array */
printf("almost data region\n");
#pragma acc data copy(sum)
{ /* data region that lists only sum */
printf("inside data region\n");
#pragma acc update device(A[0:N]) /* send the host values of A[0:N] to the device copy */
#pragma acc parallel loop present(A) reduction(+:sum) /* present(A): tells the compiler A is already on the device */
for(int i=0;i<N;i++){
sum+=A[i];
}
}
printf("sum = %d\n",sum);
#pragma acc exit data delete(A) /* drop A from device data before the host buffer is freed; the compiler log shows this as delete(A[:1]) */
free(A);
exit(0);
}
% pgcc -I./ test.c header.c -ta=tesla:cc60 -Minfo=accel
test.c:
main:
13, Generating enter data create(A[:N])
17, Generating copy(sum)
21, Generating update device(A[:N])
Accelerator kernel generated
Generating Tesla code
21, Generating reduction(+:sum)
22, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
27, Generating exit data delete(A[:1])
header.c:
% setenv PGI_ACC_TIME 1
% a.out 1024
main() start
N =1024
almost data region
inside data region
sum = 523776
Accelerator Kernel Timing data
test.c
main NVIDIA devicenum=0
time(us): 124
13: upload reached 1 time
13: data copyin transfers: 1
device time(us): total=33 max=33 min=33 avg=33
13: data region reached 1 time
13: data copyin transfers: 1
device time(us): total=9 max=9 min=9 avg=9
17: data region reached 2 times
17: data copyin transfers: 1
device time(us): total=33 max=33 min=33 avg=33
26: data copyout transfers: 1
device time(us): total=22 max=22 min=22 avg=22
21: update directive reached 1 time
21: data copyin transfers: 1
device time(us): total=10 max=10 min=10 avg=10
21: compute region reached 1 time
21: kernel launched 1 time
grid: [8] block: [128]
device time(us): total=4 max=4 min=4 avg=4
elapsed time(us): total=589 max=589 min=589 avg=589
21: reduction kernel launched 1 time
grid: [1] block: [256]
device time(us): total=4 max=4 min=4 avg=4
elapsed time(us): total=27 max=27 min=27 avg=27
27: data region reached 1 time
27: data copyin transfers: 1
device time(us): total=9 max=9 min=9 avg=9