MPI 进程生成(spawn)与合并(merge)的问题

Issue with MPI spawn and merge

我刚开始学习在 MPI 中动态创建进程。我有一个 parent 代码 (main.c),它尝试生成新的 worker/child 进程 (worker.c),并将两者合并为一个组内通信器(intra-communicator)。parent 代码 (main.c) 如下:

#include<stdio.h>
#include "mpi.h"

/*
 * Parent-side fragment exactly as posted in the question. No main() and no
 * MPI_Init() call are shown; the statements below appear at file scope.
 */
MPI_Comm child_comm;
int rank, size;
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &size);

/* Only world rank 0 enters this branch, so it alone spawns, merges and prints. */
if(rank == 0 )
{
   int  num_processes_to_spawn = 2;
   /* Spawns the "worker" executable using MPI_COMM_SELF as the spawning
      communicator; child_comm receives the resulting intercommunicator. */
   MPI_Comm_spawn("worker", MPI_ARGV_NULL, num_processes_to_spawn, MPI_INFO_NULL, 0, MPI_COMM_SELF, &child_comm, MPI_ERRCODES_IGNORE );

/* Merge the intercommunicator into intra_comm, passing high = 0 on this side. */
MPI_Comm intra_comm;
MPI_Intercomm_merge(child_comm,0, &intra_comm);
MPI_Barrier(child_comm);


int tmp_size;
MPI_Comm_size(intra_comm, &tmp_size);
printf("size of intra comm world = %d\n", tmp_size);

/* child_comm is an intercommunicator: MPI_Comm_size reports the size of the
   local group, whereas MPI_Comm_remote_size reports the spawned group. */
MPI_Comm_size(child_comm, &tmp_size);
printf("size of child comm world = %d\n", tmp_size);

MPI_Comm_size(MPI_COMM_WORLD, &tmp_size);
printf("size of parent comm world = %d\n", tmp_size);

}

MPI_Finalize();

worker(child)代码是:

    #include<stdio.h> 
    #include "mpi.h"
    /*
     * Worker-side program exactly as posted in the question: obtains the
     * intercommunicator to the parent, merges it into intra_comm, and has
     * local rank 0 print three communicator sizes.
     */
    int main( int argc, char *argv[] )
    {
    /* numprocs is filled in below but never read afterwards. */
    int numprocs, myrank;
    MPI_Comm parentcomm;
    MPI_Comm intra_comm;

    MPI_Init( &argc, &argv );
    MPI_Comm_size( MPI_COMM_WORLD, &numprocs );
    MPI_Comm_rank( MPI_COMM_WORLD, &myrank );

    MPI_Comm_get_parent( &parentcomm );

    /* Merge with high = 1 on this side, then barrier on the intercommunicator. */
    MPI_Intercomm_merge(parentcomm, 1, &intra_comm);
    MPI_Barrier(parentcomm);

    if(myrank == 0)
    {
    int tmp_size;
    /* parentcomm is an intercommunicator: MPI_Comm_size reports the size of the
       local group, whereas MPI_Comm_remote_size reports the parent group. */
    MPI_Comm_size(parentcomm, &tmp_size);
    printf("child size of parent comm world = %d\n", tmp_size);

    MPI_Comm_size(MPI_COMM_WORLD, &tmp_size);
    printf("child size of child comm world = %d\n", tmp_size);

    MPI_Comm_size(intra_comm, &tmp_size);
    printf("child size of intra comm world = %d\n", tmp_size);

    /* As braced, MPI_Finalize() and the return sit inside the if body, so
       only local rank 0 executes them. */
    MPI_Finalize( );
    return 0;
  }
 } 

我运行此代码使用

mpirun -np 12 main.c

在生成(spawn)并合并(merge)之后,我希望输出为

size of intra comm world = 14
size of child comm world = 2
size of parent comm world = 12
child size of parent comm world = 12
child size of child comm world = 2
child size of intra comm world = 14

但我得到以下不正确的输出。

   size of intra comm world = 3
    size of child comm world = 1
    size of parent comm world = 12
    child size of parent comm world = 2
    child size of child comm world = 2
    child size of intra comm world = 3

我不明白哪里错了,哪位好心人告诉我错在哪里。

谢谢, 克里斯

您的代码存在一些问题,我将在此处列出:

  • 在master部分,只有进程0调用MPI_Comm_spawn()。这本身并不是一个错误(特别是因为您使用 MPI_COMM_SELF 作为父通信器),但它实际上从后续合并中排除了所有其他进程。
  • 在 master 和 worker 部分,您使用 MPI_Comm_size() 而不是 MPI_Comm_remote_size() 来获取远程通信器的大小。因此,您得到的只是组间通信器(inter-communicator)中 local 组的大小,而不是 remote 组的大小。
  • 在 master 代码中,只有进程 0 调用了 MPI_Finalize()(更不用说还缺少 main() 和 MPI_Init())。

以下是您的代码修正后的版本:

master.c

#include <stdio.h>
#include <mpi.h>

/*
 * Parent ("master") program.
 *
 * All ranks of MPI_COMM_WORLD take part in spawning two "./worker"
 * processes and in merging the resulting intercommunicator into a single
 * intracommunicator. World rank 0 then prints the size of the merged
 * communicator, of the spawned (remote) group, and of the parent world.
 *
 * Returns 0.
 */
int main( int argc, char *argv[] ) {

    MPI_Init( &argc, &argv );

    int world_rank;
    MPI_Comm_rank( MPI_COMM_WORLD, &world_rank );

    /* Spawn is rooted at rank 0 of MPI_COMM_WORLD; `workers` receives the
       intercommunicator linking the parents to the new processes. */
    int worker_count = 2;
    MPI_Comm workers;
    MPI_Comm_spawn( "./worker", MPI_ARGV_NULL, worker_count, MPI_INFO_NULL,
                    0, MPI_COMM_WORLD, &workers, MPI_ERRCODES_IGNORE );

    /* Parent side merges with high = 0. */
    MPI_Comm merged;
    MPI_Intercomm_merge( workers, 0, &merged );

    /* Only world rank 0 reports; every other rank is done here. */
    if ( world_rank != 0 ) {
        MPI_Finalize();
        return 0;
    }

    int count;

    MPI_Comm_size( merged, &count );
    printf( "size of intra comm world = %d\n", count );

    /* `workers` is an intercommunicator, so the spawned group's size is
       the remote size. */
    MPI_Comm_remote_size( workers, &count );
    printf( "size of child comm world = %d\n", count );

    MPI_Comm_size( MPI_COMM_WORLD, &count );
    printf( "size of parent comm world = %d\n", count );

    MPI_Finalize();

    return 0;
}

worker.c

#include <stdio.h> 
#include <mpi.h>

/*
 * Spawned ("worker") program.
 *
 * Retrieves the intercommunicator to the parent processes, merges it into
 * a single intracommunicator, and has local rank 0 print the size of the
 * parent (remote) group, of the workers' own world, and of the merged
 * communicator.
 *
 * Returns 0.
 */
int main( int argc, char *argv[] ) {

    MPI_Init( &argc, &argv );

    int local_rank;
    MPI_Comm_rank( MPI_COMM_WORLD, &local_rank );

    MPI_Comm to_parent;
    MPI_Comm_get_parent( &to_parent );

    /* Worker side merges with high = 1. */
    MPI_Comm merged;
    MPI_Intercomm_merge( to_parent, 1, &merged );

    const int is_reporter = ( local_rank == 0 );
    if ( is_reporter ) {
        int count;

        /* `to_parent` is an intercommunicator, so the parent group's size
           is the remote size. */
        MPI_Comm_remote_size( to_parent, &count );
        printf( "child size of parent comm world = %d\n", count );

        MPI_Comm_size( MPI_COMM_WORLD, &count );
        printf( "child size of child comm world = %d\n", count );

        MPI_Comm_size( merged, &count );
        printf( "child size of intra comm world = %d\n", count );
    }

    MPI_Finalize();

    return 0;
}

在我的笔记本电脑上显示:

~> mpirun -n 12 ./master
child size of parent comm world = 12
child size of child comm world = 2
child size of intra comm world = 14
size of intra comm world = 14
size of child comm world = 2
size of parent comm world = 12