Variable 2D Array delivered to a variable number of nodes
I have to face the following situation: given N MPI nodes and a 2D real array of dimensions [N_ROWS, N_COLS], in order to speed up the computation I have to split the array and give each node a piece of it, exploiting the number of nodes.
Since the data is stored in memory the Fortran way, with the array indexed by the fastest-varying variable first, every [:,i] column of the array is "logically" separated from the others.
I have looked around and found very instructive questions like this one: Sending 2D arrays in Fortran with MPI_Gather.
I already had the idea of using mpi_scatterv and mpi_gatherv, but I am up against the fact that, within the constraints of the problem, there is no guarantee that each MPI node is given the same amount of data, or, in pseudocode:
#Number_of_MPI_nodes != N_ROWS*N_COLS
I wanted to use vectors because each "column" has its own "independent" series of data; by "independent" I mean that I have to do some operations on the data belonging to one column without affecting the other columns.
Obviously, given the inequality above, some MPI nodes will have a different number of "columns" to analyze.
After doing some math, I need to gather the data back using mpi_gatherv.
I will update the question with a working example in a few hours!
Thanks a lot to everybody!
Code:
program main

    use mpi

    implicit none

    integer:: N_COLS=100, N_ROWS=200
    integer:: i, j
    integer:: ID_mpi, COM_mpi, ERROR_mpi
    integer:: master = 0, SIZE_mpi=0
    integer:: to_each_cpu=0, to_each_cpu_oddment=0
    integer:: sub_matrix_size=0
    integer:: nans=0, infs=0, array_split =0, my_type=0
    integer ,dimension(:), allocatable :: elem_to_each_cpu
    integer ,dimension(:), allocatable :: displacements
    integer,parameter:: seed = 12345
    character*160:: message
    real :: tot_sum = 0.0
    real ,dimension(:,:), allocatable:: Data_Matrix
    real ,dimension(:,:), allocatable:: sub_split_Data_Matrix

    call srand(seed)
    call MPI_INIT(ERROR_mpi)
    COM_mpi = MPI_COMM_WORLD
    call MPI_COMM_RANK(COM_mpi,ID_mpi,ERROR_mpi)
    call MPI_COMM_SIZE(COM_mpi,SIZE_mpi,ERROR_mpi)

    !! allocation Data_Matrix
    i = 1; j = 1
    if (ID_mpi .eq. master) then
        i = N_ROWS; j = N_COLS
    end if
    allocate(Data_Matrix(i, j))

    do j = 1, N_COLS
        do i = 1, N_ROWS
            Data_Matrix(i, j) = rand()
            tot_sum = tot_sum + Data_Matrix(i, j)
        enddo
    enddo

    write(message,*) "N_COLS:",N_COLS, "N_ROWS:", N_ROWS, " TOTAL_SUM:", tot_sum
    write(*,*) message

    !! SINCE THERE ARE NO RESTRICTIONS ON MPI NUMBER OR CPUS OR
    !! SIZE OR Data_Matrix I NEED TO DO THIS
    to_each_cpu =N_COLS / SIZE_mpi
    to_each_cpu_oddment = N_COLS -( to_each_cpu * SIZE_mpi )

    allocate(elem_to_each_cpu(SIZE_mpi))
    elem_to_each_cpu = to_each_cpu
    allocate(displacements(SIZE_mpi))
    displacements = 0

    !! I CHOOSE TO SPLIT THE DATA IN THIS WAY
    if (ID_mpi .eq. master) then
        write(message,*) "N_COLS:",N_COLS, "mpisize:", SIZE_mpi, "to_each_cpu\oddment:", to_each_cpu, " \ ", to_each_cpu_oddment
        write(*,*) message

        j=1
        do i = 1 , to_each_cpu_oddment
            elem_to_each_cpu(j) = elem_to_each_cpu(j) + 1
            j = j + 1
            if(j .gt. SIZE_mpi) j = 1
        enddo

        do j = 2, SIZE_mpi
            displacements(j) = elem_to_each_cpu(j-1) + displacements(j-1)
        enddo

        do i = 1 , SIZE_mpi
            write(message,*)i, " to_each_cpu:", &
                elem_to_each_cpu(i), " sub_split_buff_displ:",displacements(i), "=",elem_to_each_cpu(i)+displacements(i)
            write(*,*) message
        enddo
    end if

    call MPI_BCAST(elem_to_each_cpu, SIZE_mpi, MPI_INT, 0, COM_mpi, ERROR_mpi)
    call MPI_BCAST(displacements, SIZE_mpi, MPI_INT, 0, COM_mpi, ERROR_mpi)

    allocate( sub_split_Data_Matrix(N_ROWS,elem_to_each_cpu(ID_mpi+1)) )

    call MPI_TYPE_VECTOR(N_COLS,N_ROWS,N_ROWS,MPI_FLOAT,my_type,ERROR_mpi)
    call MPI_TYPE_COMMIT(my_type, ERROR_mpi)

    sub_split_Data_Matrix=0
    sub_matrix_size = N_ROWS*elem_to_each_cpu(ID_mpi+1)

    call MPI_scatterv( Data_Matrix,elem_to_each_cpu,displacements,&
        MPI_FLOAT, sub_split_Data_Matrix, sub_matrix_size ,MPI_FLOAT, &
        0, COM_mpi, ERROR_mpi)

    !!! DOING SOME MATH ON SCATTERED MATRIX

    call MPI_gatherv(&
        sub_split_Data_Matrix, sub_matrix_size,MPI_FLOAT ,&
        Data_Matrix, elem_to_each_cpu, displacements, &
        MPI_FLOAT, 0, COM_mpi, ERROR_mpi)

    !!! DOING SOME MATH ON GATHERED MATRIX
    tot_sum = 0.0
    do j = 1, N_COLS
        do i = 1, N_ROWS
            tot_sum = tot_sum + Data_Matrix(i, j)
        enddo
    enddo

    write(message,*) "N_COLS:",N_COLS, "N_ROWS:", N_ROWS, " TOTAL_SUM:", tot_sum
    write(*,*) message

    deallocate(Data_Matrix)
    if (ID_mpi .eq. master) then
        deallocate(elem_to_each_cpu )
        deallocate(displacements )
    endif
    deallocate(sub_split_Data_Matrix)

end
Result:
An error occurred in MPI_Gatherv
on communicator MPI_COMM_WORLD
Invalid memory reference
Question:
Can you help me find the error? Or, even better, can you tell me whether the approach I am using is the appropriate one?
Thanks a lot!
I had a look at your code and made some changes to fix it:
- Not important: a few style/cosmetic elements here and there which (debatably, from my point of view) improve readability. Sorry if you don't like them.
- Process 0 does not need to be the only one computing the lengths and displacements for the MPI_Scatterv()/MPI_Gatherv() calls. All processes should compute them, since they all have the data necessary to do so. Moreover, this spares you two MPI_Bcast() calls, which is good.
- The computation of the lengths was strange. I suspect it was wrong, but I am not sure, since it was so convoluted that I simply rewrote it.
- The main issue was a confusion between the vector type and the scalar type: your lengths and displacements were computed for your vector type, but you were calling MPI_Scatterv()/MPI_Gatherv() with a scalar type. Moreover, for Fortran this scalar type is MPI_REAL, not MPI_FLOAT. In the code posted below I computed the lengths and displacements for MPI_REAL, but if you prefer, you can divide them all by N_ROWS and use the result of MPI_Type_contiguous( N_ROWS, MPI_REAL, my_type ) instead of MPI_REAL in the scatter/gather, and you will get the same result (a short sketch of this alternative follows right after this list).
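For illustration only, here is a minimal sketch of that MPI_Type_contiguous alternative. It is my addition, not part of the original fix, and it assumes the variables declared in the modified program below, with elem_to_each_cpu, displacements and sub_matrix_size already computed in MPI_REAL elements:

    ! Sketch: scatter/gather whole columns through a contiguous derived type.
    ! Counts and displacements are converted from MPI_REAL elements to columns
    ! by dividing by N_ROWS, as suggested above.
    call MPI_Type_contiguous( N_ROWS, MPI_REAL, my_type, ERROR_mpi )
    call MPI_Type_commit( my_type, ERROR_mpi )

    call MPI_Scatterv( Data_Matrix, elem_to_each_cpu/N_ROWS, displacements/N_ROWS, my_type, &
                       sub_split_Data_Matrix, sub_matrix_size/N_ROWS, my_type, &
                       master, COM_mpi, ERROR_mpi )

    ! ... per-column work on sub_split_Data_Matrix ...

    call MPI_Gatherv( sub_split_Data_Matrix, sub_matrix_size/N_ROWS, my_type, &
                      Data_Matrix, elem_to_each_cpu/N_ROWS, displacements/N_ROWS, my_type, &
                      master, COM_mpi, ERROR_mpi )

    call MPI_Type_free( my_type, ERROR_mpi )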
Here is the modified code:
program main

    use mpi

    implicit none

    integer, parameter :: N_COLS=100, N_ROWS=200, master=0
    integer :: i, j
    integer :: ID_mpi, SIZE_mpi, COM_mpi, ERROR_mpi, my_type
    integer :: to_each_cpu, to_each_cpu_oddment, sub_matrix_size
    integer, allocatable :: elem_to_each_cpu(:), displacements(:)
    real :: tot_sum = 0.0
    real, allocatable :: Data_Matrix(:,:), sub_split_Data_Matrix(:,:)

    call MPI_Init( ERROR_mpi )
    COM_mpi = MPI_COMM_WORLD
    call MPI_Comm_rank( COM_mpi, ID_mpi, ERROR_mpi )
    call MPI_Comm_size( COM_mpi, SIZE_mpi, ERROR_mpi )

    !! allocation Data_Matrix
    if ( ID_mpi == master ) then
        allocate( Data_Matrix( N_ROWS, N_COLS ) )
        call random_number( Data_Matrix )
        do j = 1, N_COLS
            do i = 1, N_ROWS
                tot_sum = tot_sum + Data_Matrix(i, j)
            enddo
        enddo
        print *, "N_COLS:", N_COLS, "N_ROWS:", N_ROWS, " TOTAL_SUM:", tot_sum
    end if

    !! SINCE THERE ARE NO RESTRICTIONS ON MPI NUMBER OR CPUS OR
    !! SIZE OR Data_Matrix I NEED TO DO THIS
    to_each_cpu = N_COLS / SIZE_mpi
    to_each_cpu_oddment = N_COLS - ( to_each_cpu * SIZE_mpi )

    allocate( elem_to_each_cpu(SIZE_mpi) )
    elem_to_each_cpu = to_each_cpu * N_ROWS
    allocate( displacements(SIZE_mpi) )
    displacements = 0

    !! I CHOOSE TO SPLIT THE DATA IN THIS WAY
    if ( ID_mpi == master ) then
        print *, "N_COLS:", N_COLS, "mpisize:", SIZE_mpi, "to_each_cpu\oddment:", to_each_cpu, " \ ", to_each_cpu_oddment
    end if

    do i = 1, to_each_cpu_oddment
        elem_to_each_cpu(i) = elem_to_each_cpu(i) + N_ROWS
    enddo

    do i = 1, SIZE_mpi-1
        displacements(i+1) = displacements(i) + elem_to_each_cpu(i)
    enddo

    if ( ID_mpi == master ) then
        do i = 1, SIZE_mpi
            print *, i, " to_each_cpu:", &
                elem_to_each_cpu(i), " sub_split_buff_displ:", displacements(i), &
                "=", elem_to_each_cpu(i) + displacements(i)
        enddo
    end if

    allocate( sub_split_Data_Matrix(N_ROWS, elem_to_each_cpu(ID_mpi+1)/N_ROWS) )
    sub_split_Data_Matrix = 0
    sub_matrix_size = elem_to_each_cpu(ID_mpi+1)

    call MPI_scatterv( Data_Matrix, elem_to_each_cpu, displacements, MPI_REAL, &
                       sub_split_Data_Matrix, sub_matrix_size, MPI_REAL, &
                       master, COM_mpi, ERROR_mpi )

    !!! DOING SOME MATH ON SCATTERED MATRIX

    call MPI_gatherv( sub_split_Data_Matrix, sub_matrix_size, MPI_REAL, &
                      Data_Matrix, elem_to_each_cpu, displacements, MPI_REAL, &
                      master, COM_mpi, ERROR_mpi )

    !!! DOING SOME MATH ON GATHERED MATRIX
    if ( ID_mpi == master ) then
        tot_sum = 0.0
        do j = 1, N_COLS
            do i = 1, N_ROWS
                tot_sum = tot_sum + Data_Matrix(i, j)
            enddo
        enddo
        print *, "N_COLS:", N_COLS, "N_ROWS:", N_ROWS, " TOTAL_SUM:", tot_sum
        deallocate( Data_Matrix )
    endif

    deallocate( elem_to_each_cpu )
    deallocate( displacements )
    deallocate( sub_split_Data_Matrix )

end program main
With these modifications, the code works as expected:
$ mpif90 scat_gath2.f90
$ mpirun -n 3 ./a.out
N_COLS: 100 N_ROWS: 200 TOTAL_SUM: 10004.4443
N_COLS: 100 mpisize: 3 to_each_cpu\oddment: 33 \ 1
1 to_each_cpu: 6800 sub_split_buff_displ: 0 = 6800
2 to_each_cpu: 6600 sub_split_buff_displ: 6800 = 13400
3 to_each_cpu: 6600 sub_split_buff_displ: 13400 = 20000
N_COLS: 100 N_ROWS: 200 TOTAL_SUM: 10004.4443
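As a final note, here is a minimal sketch (my addition, not part of the original answer) of what the "!!! DOING SOME MATH ON SCATTERED MATRIX" step could look like: each rank works on its local columns of sub_split_Data_Matrix independently, with no communication, which is exactly the "independent columns" requirement of the question. The per-column mean and the local_cols/col_mean names are purely illustrative:

    ! Illustrative per-column work on the local block. Assumes the declarations
    ! of the modified program above, plus two hypothetical locals:
    !     integer :: local_cols
    !     real, allocatable :: col_mean(:)
    local_cols = sub_matrix_size / N_ROWS        ! number of columns owned by this rank
    allocate( col_mean(local_cols) )
    do j = 1, local_cols
        ! Every column is self-contained, so it can be processed in isolation.
        col_mean(j) = sum( sub_split_Data_Matrix(:, j) ) / real( N_ROWS )
    enddo
    ! ... use col_mean as needed ...
    deallocate( col_mean )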