MPI spawn(派生进程)与 merge(合并)的问题
Issue with MPI spawn and merge
我正在尝试开始在 MPI 中创建动态进程。我有一个 parent 代码 (main.c) 试图生成新的 worker/child 进程 (worker.c) 并将两者合并到一个内部通信器中。 parent 代码 (main.c) 是
#include<stdio.h>
#include "mpi.h"
/*
 * Parent-side snippet from the question: rank 0 spawns two "worker"
 * processes, merges the resulting inter-communicator into intra_comm and
 * prints three communicator sizes.
 * NOTE(review): as posted, the snippet has no main() and no MPI_Init() call.
 */
MPI_Comm child_comm;
int rank, size;
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &size);
/* Only rank 0 of MPI_COMM_WORLD enters the spawn/merge section below. */
if(rank == 0 )
{
int num_processes_to_spawn = 2;
/* The spawn is issued on MPI_COMM_SELF by rank 0 alone, so the other parent
   ranks are left out of the later merge. */
MPI_Comm_spawn("worker", MPI_ARGV_NULL, num_processes_to_spawn, MPI_INFO_NULL, 0, MPI_COMM_SELF, &child_comm, MPI_ERRCODES_IGNORE );
MPI_Comm intra_comm;
MPI_Intercomm_merge(child_comm,0, &intra_comm);
MPI_Barrier(child_comm);
int tmp_size;
MPI_Comm_size(intra_comm, &tmp_size);
printf("size of intra comm world = %d\n", tmp_size);
/* child_comm is an inter-communicator: MPI_Comm_size() reports the size of
   its local group, not the number of spawned workers
   (MPI_Comm_remote_size() reports the remote group's size). */
MPI_Comm_size(child_comm, &tmp_size);
printf("size of child comm world = %d\n", tmp_size);
MPI_Comm_size(MPI_COMM_WORLD, &tmp_size);
printf("size of parent comm world = %d\n", tmp_size);
}
MPI_Finalize();
worker(child)代码是:
#include<stdio.h>
#include "mpi.h"
/*
 * Worker-side program from the question: obtains the parent
 * inter-communicator, merges it into intra_comm, and lets rank 0 print three
 * communicator sizes.
 */
int main( int argc, char *argv[] )
{
int numprocs, myrank;
MPI_Comm parentcomm;
MPI_Comm intra_comm;
MPI_Init( &argc, &argv );
/* numprocs is queried here but not read again in this function. */
MPI_Comm_size( MPI_COMM_WORLD, &numprocs );
MPI_Comm_rank( MPI_COMM_WORLD, &myrank );
MPI_Comm_get_parent( &parentcomm );
MPI_Intercomm_merge(parentcomm, 1, &intra_comm);
MPI_Barrier(parentcomm);
if(myrank == 0)
{
int tmp_size;
/* parentcomm is an inter-communicator: MPI_Comm_size() reports the size of
   the local group, not the number of parent processes
   (MPI_Comm_remote_size() reports the remote group's size). */
MPI_Comm_size(parentcomm, &tmp_size);
printf("child size of parent comm world = %d\n", tmp_size);
MPI_Comm_size(MPI_COMM_WORLD, &tmp_size);
printf("child size of child comm world = %d\n", tmp_size);
MPI_Comm_size(intra_comm, &tmp_size);
printf("child size of intra comm world = %d\n", tmp_size);
/* NOTE(review): MPI_Finalize() and the return sit inside the myrank == 0
   branch, so every other rank reaches the end of main() without calling
   MPI_Finalize(). */
MPI_Finalize( );
return 0;
}
}
我运行此代码使用
mpirun -np 12 main.c
在 spawn 和 merge 之后,我希望输出为
size of intra comm world = 14
size of child comm world = 2
size of parent comm world = 12
child size of parent comm world = 12
child size of child comm world = 2
child size of intra comm world = 14
但我得到以下不正确的输出。
size of intra comm world = 3
size of child comm world = 1
size of parent comm world = 12
child size of parent comm world = 2
child size of child comm world = 2
child size of intra comm world = 3
我不明白哪里错了,哪位好心人告诉我错在哪里。
谢谢,
克里斯
您的代码存在一些问题,我将在此处列出:
- 在 master 部分,只有进程 0 调用 MPI_Comm_spawn()。这本身并不是一个错误(特别是因为您使用 MPI_COMM_SELF 作为父通信器),但它实际上把所有其他进程都排除在了后续的合并之外。
- 在 master 和 worker 部分,您使用 MPI_Comm_size() 而不是 MPI_Comm_remote_size() 来获取远程通信器的大小。因此,您得到的只是互连通信器(inter-communicator)中 local 通信器的大小,而不是 remote 通信器的大小。
- 在 master 代码中,只有进程 0 调用 MPI_Finalize()(更不用说还缺少 main() 和 MPI_Init())。
以下是您的代码修正后的版本:
master.c
#include <stdio.h>
#include <mpi.h>
/*
 * Master side of the spawn-and-merge example.
 *
 * Every rank of MPI_COMM_WORLD takes part in the MPI_Comm_spawn() call
 * (root 0) that starts the "./worker" processes, and in the merge of the
 * resulting inter-communicator into one intra-communicator. Rank 0 alone
 * prints the three sizes.
 *
 * Returns 0 after MPI_Finalize().
 */
int main( int argc, char *argv[] ) {
    const int spawn_root = 0;
    const int worker_count = 2;
    MPI_Comm workers;   /* inter-communicator returned by the spawn */
    MPI_Comm merged;    /* result of merging the inter-communicator */
    int world_rank;

    MPI_Init( &argc, &argv );
    MPI_Comm_rank( MPI_COMM_WORLD, &world_rank );

    MPI_Comm_spawn( "./worker", MPI_ARGV_NULL, worker_count, MPI_INFO_NULL,
                    spawn_root, MPI_COMM_WORLD, &workers, MPI_ERRCODES_IGNORE );
    MPI_Intercomm_merge( workers, 0, &merged );

    /* Ranks other than 0 have nothing to report. */
    if ( world_rank != 0 ) {
        MPI_Finalize();
        return 0;
    }

    int merged_size;
    MPI_Comm_size( merged, &merged_size );
    printf( "size of intra comm world = %d\n", merged_size );

    /* The remote group of the inter-communicator is the set of workers. */
    int remote_size;
    MPI_Comm_remote_size( workers, &remote_size );
    printf( "size of child comm world = %d\n", remote_size );

    int world_size;
    MPI_Comm_size( MPI_COMM_WORLD, &world_size );
    printf( "size of parent comm world = %d\n", world_size );

    MPI_Finalize();
    return 0;
}
worker.c
#include <stdio.h>
#include <mpi.h>
int main( int argc, char *argv[] ) {
MPI_Init( &argc, &argv );
int myrank;
MPI_Comm_rank( MPI_COMM_WORLD, &myrank );
MPI_Comm parentcomm;
MPI_Comm_get_parent( &parentcomm );
MPI_Comm intra_comm;
MPI_Intercomm_merge( parentcomm, 1, &intra_comm );
if ( myrank == 0 ) {
int tmp_size;
MPI_Comm_remote_size( parentcomm, &tmp_size );
printf( "child size of parent comm world = %d\n", tmp_size );
MPI_Comm_size( MPI_COMM_WORLD, &tmp_size );
printf( "child size of child comm world = %d\n", tmp_size );
MPI_Comm_size( intra_comm, &tmp_size );
printf( "child size of intra comm world = %d\n", tmp_size );
}
MPI_Finalize();
return 0;
}
在我的笔记本电脑上显示:
~> mpirun -n 12 ./master
child size of parent comm world = 12
child size of child comm world = 2
child size of intra comm world = 14
size of intra comm world = 14
size of child comm world = 2
size of parent comm world = 12
我正在尝试开始在 MPI 中创建动态进程。我有一个 parent 代码 (main.c) 试图生成新的 worker/child 进程 (worker.c) 并将两者合并到一个内部通信器中。 parent 代码 (main.c) 是
#include<stdio.h>
#include "mpi.h"
/*
 * Parent-side snippet from the question: rank 0 spawns two "worker"
 * processes, merges the resulting inter-communicator into intra_comm and
 * prints three communicator sizes.
 * NOTE(review): as posted, the snippet has no main() and no MPI_Init() call.
 */
MPI_Comm child_comm;
int rank, size;
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &size);
/* Only rank 0 of MPI_COMM_WORLD enters the spawn/merge section below. */
if(rank == 0 )
{
int num_processes_to_spawn = 2;
/* The spawn is issued on MPI_COMM_SELF by rank 0 alone, so the other parent
   ranks are left out of the later merge. */
MPI_Comm_spawn("worker", MPI_ARGV_NULL, num_processes_to_spawn, MPI_INFO_NULL, 0, MPI_COMM_SELF, &child_comm, MPI_ERRCODES_IGNORE );
MPI_Comm intra_comm;
MPI_Intercomm_merge(child_comm,0, &intra_comm);
MPI_Barrier(child_comm);
int tmp_size;
MPI_Comm_size(intra_comm, &tmp_size);
printf("size of intra comm world = %d\n", tmp_size);
/* child_comm is an inter-communicator: MPI_Comm_size() reports the size of
   its local group, not the number of spawned workers
   (MPI_Comm_remote_size() reports the remote group's size). */
MPI_Comm_size(child_comm, &tmp_size);
printf("size of child comm world = %d\n", tmp_size);
MPI_Comm_size(MPI_COMM_WORLD, &tmp_size);
printf("size of parent comm world = %d\n", tmp_size);
}
MPI_Finalize();
worker(child)代码是:
#include<stdio.h>
#include "mpi.h"
/*
 * Worker-side program from the question: obtains the parent
 * inter-communicator, merges it into intra_comm, and lets rank 0 print three
 * communicator sizes.
 */
int main( int argc, char *argv[] )
{
int numprocs, myrank;
MPI_Comm parentcomm;
MPI_Comm intra_comm;
MPI_Init( &argc, &argv );
/* numprocs is queried here but not read again in this function. */
MPI_Comm_size( MPI_COMM_WORLD, &numprocs );
MPI_Comm_rank( MPI_COMM_WORLD, &myrank );
MPI_Comm_get_parent( &parentcomm );
MPI_Intercomm_merge(parentcomm, 1, &intra_comm);
MPI_Barrier(parentcomm);
if(myrank == 0)
{
int tmp_size;
/* parentcomm is an inter-communicator: MPI_Comm_size() reports the size of
   the local group, not the number of parent processes
   (MPI_Comm_remote_size() reports the remote group's size). */
MPI_Comm_size(parentcomm, &tmp_size);
printf("child size of parent comm world = %d\n", tmp_size);
MPI_Comm_size(MPI_COMM_WORLD, &tmp_size);
printf("child size of child comm world = %d\n", tmp_size);
MPI_Comm_size(intra_comm, &tmp_size);
printf("child size of intra comm world = %d\n", tmp_size);
/* NOTE(review): MPI_Finalize() and the return sit inside the myrank == 0
   branch, so every other rank reaches the end of main() without calling
   MPI_Finalize(). */
MPI_Finalize( );
return 0;
}
}
我运行此代码使用
mpirun -np 12 main.c
在 spawn 和 merge 之后,我希望输出为
size of intra comm world = 14
size of child comm world = 2
size of parent comm world = 12
child size of parent comm world = 12
child size of child comm world = 2
child size of intra comm world = 14
但我得到以下不正确的输出。
size of intra comm world = 3
size of child comm world = 1
size of parent comm world = 12
child size of parent comm world = 2
child size of child comm world = 2
child size of intra comm world = 3
我不明白哪里错了,哪位好心人告诉我错在哪里。
谢谢, 克里斯
您的代码存在一些问题,我将在此处列出:
- 在 master 部分,只有进程 0 调用 MPI_Comm_spawn()。这本身并不是一个错误(特别是因为您使用 MPI_COMM_SELF 作为父通信器),但它实际上把所有其他进程都排除在了后续的合并之外。
- 在 master 和 worker 部分,您使用 MPI_Comm_size() 而不是 MPI_Comm_remote_size() 来获取远程通信器的大小。因此,您得到的只是互连通信器(inter-communicator)中 local 通信器的大小,而不是 remote 通信器的大小。
- 在 master 代码中,只有进程 0 调用 MPI_Finalize()(更不用说还缺少 main() 和 MPI_Init())。
以下是您的代码修正后的版本:
master.c
#include <stdio.h>
#include <mpi.h>
/*
 * Master side of the spawn-and-merge example.
 *
 * Every rank of MPI_COMM_WORLD takes part in the MPI_Comm_spawn() call
 * (root 0) that starts the "./worker" processes, and in the merge of the
 * resulting inter-communicator into one intra-communicator. Rank 0 alone
 * prints the three sizes.
 *
 * Returns 0 after MPI_Finalize().
 */
int main( int argc, char *argv[] ) {
    const int spawn_root = 0;
    const int worker_count = 2;
    MPI_Comm workers;   /* inter-communicator returned by the spawn */
    MPI_Comm merged;    /* result of merging the inter-communicator */
    int world_rank;

    MPI_Init( &argc, &argv );
    MPI_Comm_rank( MPI_COMM_WORLD, &world_rank );

    MPI_Comm_spawn( "./worker", MPI_ARGV_NULL, worker_count, MPI_INFO_NULL,
                    spawn_root, MPI_COMM_WORLD, &workers, MPI_ERRCODES_IGNORE );
    MPI_Intercomm_merge( workers, 0, &merged );

    /* Ranks other than 0 have nothing to report. */
    if ( world_rank != 0 ) {
        MPI_Finalize();
        return 0;
    }

    int merged_size;
    MPI_Comm_size( merged, &merged_size );
    printf( "size of intra comm world = %d\n", merged_size );

    /* The remote group of the inter-communicator is the set of workers. */
    int remote_size;
    MPI_Comm_remote_size( workers, &remote_size );
    printf( "size of child comm world = %d\n", remote_size );

    int world_size;
    MPI_Comm_size( MPI_COMM_WORLD, &world_size );
    printf( "size of parent comm world = %d\n", world_size );

    MPI_Finalize();
    return 0;
}
worker.c
#include <stdio.h>
#include <mpi.h>
int main( int argc, char *argv[] ) {
MPI_Init( &argc, &argv );
int myrank;
MPI_Comm_rank( MPI_COMM_WORLD, &myrank );
MPI_Comm parentcomm;
MPI_Comm_get_parent( &parentcomm );
MPI_Comm intra_comm;
MPI_Intercomm_merge( parentcomm, 1, &intra_comm );
if ( myrank == 0 ) {
int tmp_size;
MPI_Comm_remote_size( parentcomm, &tmp_size );
printf( "child size of parent comm world = %d\n", tmp_size );
MPI_Comm_size( MPI_COMM_WORLD, &tmp_size );
printf( "child size of child comm world = %d\n", tmp_size );
MPI_Comm_size( intra_comm, &tmp_size );
printf( "child size of intra comm world = %d\n", tmp_size );
}
MPI_Finalize();
return 0;
}
在我的笔记本电脑上显示:
~> mpirun -n 12 ./master
child size of parent comm world = 12
child size of child comm world = 2
child size of intra comm world = 14
size of intra comm world = 14
size of child comm world = 2
size of parent comm world = 12