在 Cuda Fortran 中使用来自其他文件的托管模块变量的问题
Problem with using managed module variables from other files in Cuda Fortran
我在文件 p2.f95 的模块 a1 中定义了一个托管(managed)变量,并把它作为参数传给文件 p6.f95 中的内核。在 p6.f95 的主机代码中可以正常访问该变量,但在内核中无法访问——也就是说,我无法在设备端打印出该变量的值。谁能解释一下原因?
文件 1:p2.f95
! Module a1 (file p2.f95): holds the scalar shared with the host program.
module a1
implicit none
! lnode is a statically declared (non-allocatable) module variable that
! carries the CUDA Fortran "managed" attribute.
integer(kind=4),managed::lnode
end module a1
文件 2:p6.f95
! Module kernelsubroutine (file p6.f95): contains the device kernel.
module kernelsubroutine
contains
! Global (device) kernel that prints the integer it receives.
! lnode is declared without the "value" attribute, so it is not passed
! by value.
attributes(global) subroutine kernel(lnode)
implicit none
integer(kind=4)::lnode
! The only read of lnode in this kernel; the cuda-memcheck log in this
! post reports an invalid global read of size 4 inside this kernel.
print*,"on device",lnode
end subroutine kernel
end module kernelsubroutine
! Host driver (file p6.f95): assigns the managed module variable lnode,
! prints it on the host, then passes it to the kernel.
program main
use a1
use cudafor
use kernelsubroutine
implicit none
integer(kind=4)::n
lnode = 1
print*,"on host",lnode
! Launch the kernel with a 1 x 1 launch configuration.
call kernel<<<1,1>>>(lnode)
! n receives the status returned by cudaDeviceSynchronize; it is never
! inspected afterwards.
n = cudaDeviceSynchronize()
end program main
编译命令1:$pgf95 -Mcuda=rdc p2.f95 p6.f95
编译命令2:
$pgf95 -Mcuda=rdc -c p2.f95
$pgf95 -Mcuda=rdc p6.f95 p2.o
p6.f95:
两组编译命令的输出保持不变
输出命令:$ cuda-memcheck ./a.out
输出:
========= CUDA-MEMCHECK
on host 1
========= Invalid __global__ read of size 4
========= at 0x000010c8 in /home/vsriram/Documents/fortran_programs/p6.f95:11:kernelsubroutine_kernel_
========= by thread (0,0,0) in block (0,0,0)
========= Address 0x0060cf40 is out of bounds
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so [0x25428a]
========= Host Frame:/opt/nvidia/hpc_sdk/Linux_x86_64/21.7/cuda/11.4/lib64/libcudart.so.11.0 [0x1402c]
========= Host Frame:/opt/nvidia/hpc_sdk/Linux_x86_64/21.7/cuda/11.4/lib64/libcudart.so.11.0 (cudaLaunchKernel + 0x1d8) [0x67e58]
========= Host Frame:/opt/nvidia/hpc_sdk/Linux_x86_64/21.7/compilers/lib/libcudafor.so (__pgiLaunchKernel + 0x1a6) [0x11194]
========= Host Frame:./a.out [0x14f4]
========= Host Frame:./a.out [0x1393]
=========
========= Program hit cudaErrorLaunchFailure (error 719) due to "unspecified launch failure" on CUDA API call to cudaDeviceSynchronize.
========= Saved host backtrace up to driver entry point at error
========= Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so [0x355b43]
========= Host Frame:/opt/nvidia/hpc_sdk/Linux_x86_64/21.7/cuda/11.4/lib64/libcudart.so.11.0 (cudaDeviceSynchronize + 0x127) [0x43217]
========= Host Frame:/opt/nvidia/hpc_sdk/Linux_x86_64/21.7/compilers/lib/libcudafor_114.so (cudadevicesynchronize_ + 0x11) [0x7aca1]
========= Host Frame:./a.out [0x14fb]
========= Host Frame:./a.out [0x1393]
=========
========= ERROR SUMMARY: 2 errors
使用 'managed'(托管)属性时启用的 CUDA 统一内存目前仅适用于动态分配的内存。虽然我们希望将来能够支持静态数据,但目前您无法在 'lnode' 这样的静态变量上使用 'managed'。
最简单(也可能是最好)的解决方案是按值传递 lnode。不过,您也可以将 'lnode' 声明为可分配的(allocatable),或者分别使用主机端的 lnode 和设备端('device')的 lnode。
% cat sol1.cuf
!> Module a1 (sol1): plain host integer shared with the main program.
!> It carries no device or managed attribute.
module a1
  implicit none
  integer(4) :: lnode
end module a1
!> Device code for the pass-by-value variant (sol1) of the example.
module kernelsubroutine
  implicit none
contains
  !> Kernel that prints the integer it was launched with.
  !>
  !> lnode is a scalar received by value and only read here.
  attributes(global) subroutine kernel(lnode)
    implicit none
    ! intent(in) documents that the kernel never modifies its argument;
    ! the standard permits combining it with the value attribute.
    integer(kind=4), value, intent(in) :: lnode
    print*,"on device",lnode
  end subroutine kernel
end module kernelsubroutine
!> Host driver (sol1): sets lnode, prints it on the host, launches the
!> kernel with the value and waits for the device to finish.
program main
  use a1, only: lnode
  use cudafor
  use kernelsubroutine, only: kernel
  implicit none
  integer(kind=4) :: istat
  lnode = 1
  print*,"on host",lnode
  call kernel<<<1,1>>>(lnode)
  ! cudaDeviceSynchronize returns a status code; report a failure instead
  ! of silently discarding it.
  istat = cudaDeviceSynchronize()
  if (istat /= cudaSuccess) then
    print*,"CUDA error: ",trim(cudaGetErrorString(istat))
    error stop 1
  end if
end program main
% nvfortran sol1.cuf; a.out
on host 1
on device 1
% cat sol2.cuf
!> Module a1 (sol2): the shared integer is an allocatable scalar with the
!> managed attribute; the main program allocates it before use.
module a1
  implicit none
  integer(4), managed, allocatable :: lnode
end module a1
!> Device code for the allocatable managed variant (sol2) of the example.
module kernelsubroutine
  implicit none
contains
  !> Kernel that prints the integer it receives.
  !>
  !> lnode has no value attribute here; the caller passes the allocated
  !> managed scalar from module a1.
  attributes(global) subroutine kernel(lnode)
    implicit none
    ! The kernel only reads lnode, so declare it intent(in).
    integer(kind=4), intent(in) :: lnode
    print*,"on device",lnode
  end subroutine kernel
end module kernelsubroutine
!> Host driver (sol2): allocates the managed scalar, sets and prints it on
!> the host, launches the kernel, waits for the device, then frees it.
program main
  use a1, only: lnode
  use cudafor
  use kernelsubroutine, only: kernel
  implicit none
  integer(kind=4) :: istat
  allocate(lnode)
  lnode = 1
  print*,"on host",lnode
  call kernel<<<1,1>>>(lnode)
  ! cudaDeviceSynchronize returns a status code; report a failure instead
  ! of silently discarding it.
  istat = cudaDeviceSynchronize()
  if (istat /= cudaSuccess) then
    print*,"CUDA error: ",trim(cudaGetErrorString(istat))
    error stop 1
  end if
  deallocate(lnode)
end program main
% nvfortran sol2.cuf ; a.out
on host 1
on device 1
% cat sol3.cuf
!> Module a1 (sol3): separate copies of the value, one with the device
!> attribute (lnode_d) and one ordinary host variable (lnode_h).
module a1
  implicit none
  integer(4), device :: lnode_d
  integer(4) :: lnode_h
end module a1
!> Device code for the host/device pair variant (sol3) of the example.
module kernelsubroutine
  implicit none
contains
  !> Kernel that prints the integer it receives.
  !>
  !> lnode has no value attribute here; the caller passes the module
  !> variable that is declared with the device attribute.
  attributes(global) subroutine kernel(lnode)
    implicit none
    ! The kernel only reads lnode, so declare it intent(in).
    integer(kind=4), intent(in) :: lnode
    print*,"on device",lnode
  end subroutine kernel
end module kernelsubroutine
!> Host driver (sol3): sets and prints the host copy, assigns it to the
!> device-attributed copy, launches the kernel on that copy and waits.
program main
  use a1, only: lnode_h, lnode_d
  use cudafor
  use kernelsubroutine, only: kernel
  implicit none
  integer(kind=4) :: istat
  lnode_h = 1
  print*,"on host",lnode_h
  ! Assignment from the host variable to the device-attributed variable.
  lnode_d = lnode_h
  call kernel<<<1,1>>>(lnode_d)
  ! cudaDeviceSynchronize returns a status code; report a failure instead
  ! of silently discarding it.
  istat = cudaDeviceSynchronize()
  if (istat /= cudaSuccess) then
    print*,"CUDA error: ",trim(cudaGetErrorString(istat))
    error stop 1
  end if
end program main
% nvfortran sol3.cuf ; a.out
on host 1
on device 1
我在文件 p2.f95 的模块 a1 中定义了一个托管(managed)变量,并把它作为参数传给文件 p6.f95 中的内核。在 p6.f95 的主机代码中可以正常访问该变量,但在内核中无法访问——也就是说,我无法在设备端打印出该变量的值。谁能解释一下原因?
文件 1:p2.f95
! Module a1 (file p2.f95): holds the scalar shared with the host program.
module a1
implicit none
! lnode is a statically declared (non-allocatable) module variable that
! carries the CUDA Fortran "managed" attribute.
integer(kind=4),managed::lnode
end module a1
文件 2:p6.f95
! Module kernelsubroutine (file p6.f95): contains the device kernel.
module kernelsubroutine
contains
! Global (device) kernel that prints the integer it receives.
! lnode is declared without the "value" attribute, so it is not passed
! by value.
attributes(global) subroutine kernel(lnode)
implicit none
integer(kind=4)::lnode
! The only read of lnode in this kernel; the cuda-memcheck log in this
! post reports an invalid global read of size 4 inside this kernel.
print*,"on device",lnode
end subroutine kernel
end module kernelsubroutine
! Host driver (file p6.f95): assigns the managed module variable lnode,
! prints it on the host, then passes it to the kernel.
program main
use a1
use cudafor
use kernelsubroutine
implicit none
integer(kind=4)::n
lnode = 1
print*,"on host",lnode
! Launch the kernel with a 1 x 1 launch configuration.
call kernel<<<1,1>>>(lnode)
! n receives the status returned by cudaDeviceSynchronize; it is never
! inspected afterwards.
n = cudaDeviceSynchronize()
end program main
编译命令1:$pgf95 -Mcuda=rdc p2.f95 p6.f95
编译命令2:
$pgf95 -Mcuda=rdc -c p2.f95
$pgf95 -Mcuda=rdc p6.f95 p2.o
p6.f95:
两组编译命令的输出保持不变
输出命令:$ cuda-memcheck ./a.out
输出:
========= CUDA-MEMCHECK
on host 1
========= Invalid __global__ read of size 4
========= at 0x000010c8 in /home/vsriram/Documents/fortran_programs/p6.f95:11:kernelsubroutine_kernel_
========= by thread (0,0,0) in block (0,0,0)
========= Address 0x0060cf40 is out of bounds
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so [0x25428a]
========= Host Frame:/opt/nvidia/hpc_sdk/Linux_x86_64/21.7/cuda/11.4/lib64/libcudart.so.11.0 [0x1402c]
========= Host Frame:/opt/nvidia/hpc_sdk/Linux_x86_64/21.7/cuda/11.4/lib64/libcudart.so.11.0 (cudaLaunchKernel + 0x1d8) [0x67e58]
========= Host Frame:/opt/nvidia/hpc_sdk/Linux_x86_64/21.7/compilers/lib/libcudafor.so (__pgiLaunchKernel + 0x1a6) [0x11194]
========= Host Frame:./a.out [0x14f4]
========= Host Frame:./a.out [0x1393]
=========
========= Program hit cudaErrorLaunchFailure (error 719) due to "unspecified launch failure" on CUDA API call to cudaDeviceSynchronize.
========= Saved host backtrace up to driver entry point at error
========= Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so [0x355b43]
========= Host Frame:/opt/nvidia/hpc_sdk/Linux_x86_64/21.7/cuda/11.4/lib64/libcudart.so.11.0 (cudaDeviceSynchronize + 0x127) [0x43217]
========= Host Frame:/opt/nvidia/hpc_sdk/Linux_x86_64/21.7/compilers/lib/libcudafor_114.so (cudadevicesynchronize_ + 0x11) [0x7aca1]
========= Host Frame:./a.out [0x14fb]
========= Host Frame:./a.out [0x1393]
=========
========= ERROR SUMMARY: 2 errors
使用 'managed'(托管)属性时启用的 CUDA 统一内存目前仅适用于动态分配的内存。虽然我们希望将来能够支持静态数据,但目前您无法在 'lnode' 这样的静态变量上使用 'managed'。
最简单(也可能是最好)的解决方案是按值传递 lnode。不过,您也可以将 'lnode' 声明为可分配的(allocatable),或者分别使用主机端的 lnode 和设备端('device')的 lnode。
% cat sol1.cuf
!> Module a1 (sol1): plain host integer shared with the main program.
!> It carries no device or managed attribute.
module a1
  implicit none
  integer(4) :: lnode
end module a1
!> Device code for the pass-by-value variant (sol1) of the example.
module kernelsubroutine
  implicit none
contains
  !> Kernel that prints the integer it was launched with.
  !>
  !> lnode is a scalar received by value and only read here.
  attributes(global) subroutine kernel(lnode)
    implicit none
    ! intent(in) documents that the kernel never modifies its argument;
    ! the standard permits combining it with the value attribute.
    integer(kind=4), value, intent(in) :: lnode
    print*,"on device",lnode
  end subroutine kernel
end module kernelsubroutine
!> Host driver (sol1): sets lnode, prints it on the host, launches the
!> kernel with the value and waits for the device to finish.
program main
  use a1, only: lnode
  use cudafor
  use kernelsubroutine, only: kernel
  implicit none
  integer(kind=4) :: istat
  lnode = 1
  print*,"on host",lnode
  call kernel<<<1,1>>>(lnode)
  ! cudaDeviceSynchronize returns a status code; report a failure instead
  ! of silently discarding it.
  istat = cudaDeviceSynchronize()
  if (istat /= cudaSuccess) then
    print*,"CUDA error: ",trim(cudaGetErrorString(istat))
    error stop 1
  end if
end program main
% nvfortran sol1.cuf; a.out
on host 1
on device 1
% cat sol2.cuf
!> Module a1 (sol2): the shared integer is an allocatable scalar with the
!> managed attribute; the main program allocates it before use.
module a1
  implicit none
  integer(4), managed, allocatable :: lnode
end module a1
!> Device code for the allocatable managed variant (sol2) of the example.
module kernelsubroutine
  implicit none
contains
  !> Kernel that prints the integer it receives.
  !>
  !> lnode has no value attribute here; the caller passes the allocated
  !> managed scalar from module a1.
  attributes(global) subroutine kernel(lnode)
    implicit none
    ! The kernel only reads lnode, so declare it intent(in).
    integer(kind=4), intent(in) :: lnode
    print*,"on device",lnode
  end subroutine kernel
end module kernelsubroutine
!> Host driver (sol2): allocates the managed scalar, sets and prints it on
!> the host, launches the kernel, waits for the device, then frees it.
program main
  use a1, only: lnode
  use cudafor
  use kernelsubroutine, only: kernel
  implicit none
  integer(kind=4) :: istat
  allocate(lnode)
  lnode = 1
  print*,"on host",lnode
  call kernel<<<1,1>>>(lnode)
  ! cudaDeviceSynchronize returns a status code; report a failure instead
  ! of silently discarding it.
  istat = cudaDeviceSynchronize()
  if (istat /= cudaSuccess) then
    print*,"CUDA error: ",trim(cudaGetErrorString(istat))
    error stop 1
  end if
  deallocate(lnode)
end program main
% nvfortran sol2.cuf ; a.out
on host 1
on device 1
% cat sol3.cuf
!> Module a1 (sol3): separate copies of the value, one with the device
!> attribute (lnode_d) and one ordinary host variable (lnode_h).
module a1
  implicit none
  integer(4), device :: lnode_d
  integer(4) :: lnode_h
end module a1
!> Device code for the host/device pair variant (sol3) of the example.
module kernelsubroutine
  implicit none
contains
  !> Kernel that prints the integer it receives.
  !>
  !> lnode has no value attribute here; the caller passes the module
  !> variable that is declared with the device attribute.
  attributes(global) subroutine kernel(lnode)
    implicit none
    ! The kernel only reads lnode, so declare it intent(in).
    integer(kind=4), intent(in) :: lnode
    print*,"on device",lnode
  end subroutine kernel
end module kernelsubroutine
!> Host driver (sol3): sets and prints the host copy, assigns it to the
!> device-attributed copy, launches the kernel on that copy and waits.
program main
  use a1, only: lnode_h, lnode_d
  use cudafor
  use kernelsubroutine, only: kernel
  implicit none
  integer(kind=4) :: istat
  lnode_h = 1
  print*,"on host",lnode_h
  ! Assignment from the host variable to the device-attributed variable.
  lnode_d = lnode_h
  call kernel<<<1,1>>>(lnode_d)
  ! cudaDeviceSynchronize returns a status code; report a failure instead
  ! of silently discarding it.
  istat = cudaDeviceSynchronize()
  if (istat /= cudaSuccess) then
    print*,"CUDA error: ",trim(cudaGetErrorString(istat))
    error stop 1
  end if
end program main
% nvfortran sol3.cuf ; a.out
on host 1
on device 1