Inter GPU communication in MPI+OpenACC programming
I am trying to learn how to perform inter-GPU data communication using the toy code below. The task of the program is to send the data of array 'a' residing on gpu-0 into the memory of gpu-1. I took the following route, which involves four steps:
After initializing array 'a' on gpu0:
- step1: send the data from gpu0 to cpu0 (using the !$acc update self() directive)
- step2: send the data from cpu0 to cpu1 (using MPI_SEND())
- step3: receive the data from cpu0 into cpu1 (using MPI_RECV())
- step4: update the device memory of gpu1 (using the !$acc update device() directive)
This works fine, but it looks like a long route, and I think there must be a better way of doing it. I tried to read up on the !$acc host_data use_device construct suggested in the following post, but could not get it to work:
I would like to know how to use !$acc host_data use_device to perform the task shown below in an efficient way.
PROGRAM TOY_MPI_OpenACC
implicit none
include 'mpif.h'
integer :: rank, nprocs, ierr, i, dest_rank, tag, from
integer :: status(MPI_STATUS_SIZE)
integer, parameter :: N = 10000
double precision, dimension(N) :: a
call MPI_INIT(ierr)
call MPI_COMM_RANK(MPI_COMM_WORLD,rank,ierr)
call MPI_COMM_SIZE(MPI_COMM_WORLD,nprocs,ierr)
print*, 'Process ', rank, ' of', nprocs, ' is alive'
!$acc data create(a)
! initialize 'a' on gpu0 (not cpu0)
IF (rank == 0) THEN
!$acc parallel loop default(present)
DO i = 1,N
a(i) = 1
ENDDO
ENDIF
! step1: send data from gpu0 to cpu0
!$acc update self(a)
print*, 'a in rank', rank, ' before communication is ', a(N/2)
IF (rank == 0) THEN
! step2: send from cpu0
dest_rank = 1; tag = 1999
call MPI_SEND(a, N, MPI_DOUBLE_PRECISION, dest_rank, tag, MPI_COMM_WORLD, ierr)
ELSEIF (rank == 1) THEN
! step3: receive into cpu1
from = MPI_ANY_SOURCE; tag = MPI_ANY_TAG;
call MPI_RECV(a, N, MPI_DOUBLE_PRECISION, from, tag, MPI_COMM_WORLD, status, ierr)
! step4: send the data into gpu1 from cpu1
!$acc update device(a)
ENDIF
call MPI_BARRIER(MPI_COMM_WORLD, ierr)
print*, 'a in rank', rank, ' after communication is ', a(N/2)
!$acc end data
call MPI_BARRIER(MPI_COMM_WORLD, ierr)
END
Compilation: mpif90 -acc -ta=tesla toycode.f90
(mpif90 from the NVIDIA HPC SDK 21.9)
Execution: mpirun -np 2 ./a.out
Here's an example. Note that I've also added some boilerplate code to do the local-node-rank-to-device assignment. I also prefer using unstructured data regions since they're better suited for more complex codes, though here they are semantically equivalent to the structured data region you used above. I guard the host_data constructs under a CUDA_AWARE_MPI macro, since not all MPI builds have CUDA-aware support enabled; for those, you need to fall back to copying the data between the host and device before/after the MPI calls.
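The key mechanism is the host_data use_device construct: inside that region, references to 'a' resolve to its device address, so a CUDA-aware MPI can read and write GPU memory directly instead of going through the host copy. Reduced to just the send side, the pattern from the full listing below is:
#ifdef CUDA_AWARE_MPI
!$acc host_data use_device(a)
#endif
      ! with CUDA_AWARE_MPI defined, 'a' here is the device address and
      ! MPI_SEND reads GPU memory directly; otherwise 'a' is the host copy
      ! and the explicit update directives are still required
      call MPI_SEND(a, N, MPI_DOUBLE_PRECISION, dest_rank, tag, MPI_COMM_WORLD, ierr)
#ifdef CUDA_AWARE_MPI
!$acc end host_data
#endif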
% cat mpi_acc.F90
PROGRAM TOY_MPI_OpenACC
use mpi
#ifdef _OPENACC
use openacc
#endif
implicit none
integer :: rank, nprocs, ierr, i, dest_rank, tag, from
integer :: status(MPI_STATUS_SIZE)
integer, parameter :: N = 10000
double precision, dimension(N) :: a
#ifdef _OPENACC
integer :: dev, devNum, local_rank, local_comm
integer(acc_device_kind) :: devtype
#endif
call MPI_INIT(ierr)
call MPI_COMM_RANK(MPI_COMM_WORLD,rank,ierr)
call MPI_COMM_SIZE(MPI_COMM_WORLD,nprocs,ierr)
print*, 'Process ', rank, ' of', nprocs, ' is alive'
#ifdef _OPENACC
! set the MPI rank to device mapping
! 1) Get the local node's rank number
! 2) Get the number of devices on the node
! 3) Round-Robin assignment of rank to device
call MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, &
MPI_INFO_NULL, local_comm,ierr)
call MPI_Comm_rank(local_comm, local_rank,ierr)
devtype = acc_get_device_type()
devNum = acc_get_num_devices(devtype)
dev = mod(local_rank,devNum)
call acc_set_device_num(dev, devtype)
print*, "Process ",rank," Using device ",dev
#endif
a = 0
!$acc enter data copyin(a)
! initialize 'a' on gpu0 (not cpu0)
IF (rank == 0) THEN
!$acc parallel loop default(present)
DO i = 1,N
a(i) = 1
ENDDO
!$acc update self(a)
ENDIF
! step1: send data from gpu0 to cpu0
print*, 'a in rank', rank, ' before communication is ', a(N/2)
IF (rank == 0) THEN
! step2: send from cpu0
dest_rank = 1; tag = 1999
#ifdef CUDA_AWARE_MPI
!$acc host_data use_device(a)
#endif
call MPI_SEND(a, N, MPI_DOUBLE_PRECISION, dest_rank, tag, MPI_COMM_WORLD, ierr)
#ifdef CUDA_AWARE_MPI
!$acc end host_data
#endif
ELSEIF (rank == 1) THEN
! step3: receive into cpu1
from = MPI_ANY_SOURCE; tag = MPI_ANY_TAG;
#ifdef CUDA_AWARE_MPI
!$acc host_data use_device(a)
#endif
call MPI_RECV(a, N, MPI_DOUBLE_PRECISION, from, tag, MPI_COMM_WORLD, status, ierr)
#ifdef CUDA_AWARE_MPI
!$acc end host_data
#else
! step4: send the data into gpu1 from cpu1
!$acc update device(a)
#endif
ENDIF
call MPI_BARRIER(MPI_COMM_WORLD, ierr)
!$acc update self(a)
print*, 'a in rank', rank, ' after communication is ', a(N/2)
!$acc exit data delete(a)
call MPI_BARRIER(MPI_COMM_WORLD, ierr)
END
% which mpif90
/proj/nv/Linux_x86_64/21.9/comm_libs/mpi/bin//mpif90
% mpif90 -V
nvfortran 21.9-0 64-bit target on x86-64 Linux -tp skylake
NVIDIA Compilers and Tools
Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
% mpif90 -acc -Minfo=accel mpi_acc.F90
toy_mpi_openacc:
38, Generating enter data copyin(a(:))
42, Generating Tesla code
43, !$acc loop gang, vector(128) ! blockidx%x threadidx%x
42, Generating default present(a(:))
46, Generating update self(a(:))
76, Generating update device(a(:))
82, Generating update self(a(:))
85, Generating exit data delete(a(:))
% mpirun -np 2 ./a.out
Process 1 of 2 is alive
Process 0 of 2 is alive
Process 0 Using device 0
Process 1 Using device 1
a in rank 1 before communication is 0.000000000000000
a in rank 0 before communication is 1.000000000000000
a in rank 0 after communication is 1.000000000000000
a in rank 1 after communication is 1.000000000000000
% mpif90 -acc -Minfo=accel mpi_acc.F90 -DCUDA_AWARE_MPI=1
toy_mpi_openacc:
38, Generating enter data copyin(a(:))
42, Generating Tesla code
43, !$acc loop gang, vector(128) ! blockidx%x threadidx%x
42, Generating default present(a(:))
46, Generating update self(a(:))
82, Generating update self(a(:))
85, Generating exit data delete(a(:))
% mpirun -np 2 ./a.out
Process 0 of 2 is alive
Process 1 of 2 is alive
Process 1 Using device 1
Process 0 Using device 0
a in rank 1 before communication is 0.000000000000000
a in rank 0 before communication is 1.000000000000000
a in rank 1 after communication is 1.000000000000000
a in rank 0 after communication is 1.000000000000000
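If you're not sure whether the MPI library you're linking against was built with CUDA-aware support, and it is an Open MPI build (the MPI bundled with the NVIDIA HPC SDK is), one way to check is to query ompi_info, for example:
% ompi_info --parsable --all | grep mpi_built_with_cuda_support:value
If this reports a value of true, the -DCUDA_AWARE_MPI build above should be safe to use; otherwise keep the host-staging fallback path.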