获取正在使用的 MPI 通信器的数量
Get number of MPI Communicators in use
我有一个大代码,它因以下错误而崩溃:
Fatal error in PMPI_Comm_split: Other MPI error, error stack:
PMPI_Comm_split(532)................: MPI_Comm_split(comm=0xc4027cf0, color=0, key=0, new_comm=0x7ffdb50f2bd0) failed
PMPI_Comm_split(508)................: fail failed
MPIR_Comm_split_impl(260)...........: fail failed
MPIR_Get_contextid_sparse_group(676): Too many communicators (0/16384 free on this process; ignore_id=0)
Fatal error in PMPI_Comm_split: Other MPI error, error stack:
PMPI_Comm_split(532)................: MPI_Comm_split(comm=0xc401bcf1, color=1, key=0, new_comm=0x7ffed5aa4fd0) failed
PMPI_Comm_split(508)................: fail failed
MPIR_Comm_split_impl(260)...........: fail failed
MPIR_Get_contextid_sparse_group(676): Too many communicators (0/16384 free on this process; ignore_id=0)
Fatal error in PMPI_Comm_split: Other MPI error, error stack:
PMPI_Comm_split(532)................: MPI_Comm_split(comm=0xc4027ce9, color=0, key=0, new_comm=0x7ffe37e477d0) failed
PMPI_Comm_split(508)................: fail failed
MPIR_Comm_split_impl(260)...........: fail failed
MPIR_Get_contextid_sparse_group(676): Too many communicators (0/16384 free on this process; ignore_id=0)
Fatal error in PMPI_Comm_split: Other MPI error, error stack:
PMPI_Comm_split(532)................: MPI_Comm_split(comm=0xc401bcf1, color=1, key=0, new_comm=0x7ffd511ac4d0) failed
PMPI_Comm_split(508)................: fail failed
MPIR_Comm_split_impl(260)...........: fail failed
MPIR_Get_contextid_sparse_group(676): Too many communicators (0/16384 free on this process; ignore_id=0)
似乎存在某种 MPI 通信器泄漏。 MPI 似乎知道当前正在使用多少通信器:
Too many communicators (0/16384 free on this process; ignore_id=0)
有没有办法打印 MPI 当前正在使用的通信器数量?这样我就可以缩小通信器泄漏的排查范围。
您可以覆盖 MPI_Comm_split
和 MPI_Comm_free
的实现以手动计算 communicator 的创建和销毁。
这是一个简单的例子
覆盖 MPI_Comm_split
和 MPI_Comm_free
#include "mpi.h"
#include "stdio.h"
/* Net number of communicators seen by the wrappers in this file:
 * incremented by the MPI_Comm_split wrapper and decremented by the
 * MPI_Comm_free wrapper. */
static int comm_counter=0;
/*
 * Profiling-layer replacement for MPI_Comm_split that counts the
 * communicators created through it.
 *
 * The real work is forwarded to PMPI_Comm_split. The counter is bumped and
 * a trace line is printed only when a communicator was actually created:
 * a failed call, or a call that yields MPI_COMM_NULL (color ==
 * MPI_UNDEFINED), allocates nothing and therefore must not be counted.
 *
 * Parameters and return value are those of MPI_Comm_split; the return
 * code of PMPI_Comm_split is passed through unchanged.
 */
int MPI_Comm_split(MPI_Comm comm, int color, int key, MPI_Comm *newcomm)
{
    int world_rank;
    int rc;

    /* Split first so that only successful creations are counted. */
    rc = PMPI_Comm_split(comm, color, key, newcomm);
    if (rc != MPI_SUCCESS || *newcomm == MPI_COMM_NULL) {
        return rc;
    }

    /* Use the PMPI entry point so the wrapper never re-enters the
     * profiling layer. */
    PMPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
    comm_counter++;
    printf("%s %i %s %i\n", "MPI_Comm_split ", comm_counter, " from ", world_rank);
    return rc;
}
/*
 * Profiling-layer replacement for MPI_Comm_free that counts the
 * communicators released through it.
 *
 * The real work is forwarded to PMPI_Comm_free. The counter is decremented
 * and a trace line is printed only when the free succeeded, so a failed
 * call does not make the count drift.
 *
 * Note: every successful free decrements the counter, including frees of
 * communicators that were not created through the MPI_Comm_split wrapper
 * (those were never counted on creation).
 *
 * Parameters and return value are those of MPI_Comm_free; the return code
 * of PMPI_Comm_free is passed through unchanged.
 */
int MPI_Comm_free(MPI_Comm *comm)
{
    int world_rank;
    int rc;

    /* Free first so that only successful releases are counted. */
    rc = PMPI_Comm_free(comm);
    if (rc != MPI_SUCCESS) {
        return rc;
    }

    /* Use the PMPI entry point so the wrapper never re-enters the
     * profiling layer. */
    PMPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
    comm_counter--;
    printf("%s %i %s %i\n", "PMPI_Comm_free ", comm_counter, " from ", world_rank);
    return rc;
}
将此代码编译为目标文件,以便之后与您的程序链接。
就我而言,我做了 mpicc -c comm_split.c -o comm_split.o
您的代码保持不变。您可以在不进行其他修改的情况下使用它。
使用 MPI_Comm_split
和 MPI_Comm_free
的主程序简单示例
C++ 案例
#include "mpi.h"
int main()
{
MPI_Init(NULL, NULL);
// Get the rank and size in the original communicator
int world_rank, world_size;
MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
MPI_Comm_size(MPI_COMM_WORLD, &world_size);
int color = world_rank / 4; // Determine color based on row
// Split the communicator based on the color and use the
// original rank for ordering
MPI_Comm row_comm, row_comm2;
MPI_Comm_split(MPI_COMM_WORLD, color, world_rank, &row_comm);
MPI_Comm_split(MPI_COMM_WORLD, color, world_rank, &row_comm2);
int row_rank, row_size;
MPI_Comm_rank(row_comm, &row_rank);
MPI_Comm_size(row_comm, &row_size);
printf("WORLD RANK/SIZE: %d/%d \t ROW RANK/SIZE: %d/%d\n",
world_rank, world_size, row_rank, row_size);
MPI_Comm_free(&row_comm);
MPI_Finalize();
}
Fortran 案例
! Example driver: splits the world communicator twice with the same
! color/key and frees only the first resulting communicator.
program test
include "mpif.h"
! group_world and world_size are declared but not used below.
integer comm_world, group_world, new_comm, new_comm2, ierr
integer world_rank, world_size;
integer color
call MPI_INIT(ierr)
comm_world = MPI_COMM_WORLD
call MPI_Comm_rank(comm_world, world_rank, ierr);
! Integer division: every four consecutive ranks share one color.
color = world_rank / 4
call MPI_Comm_split(comm_world, color, world_rank, new_comm, ierr)
call MPI_Comm_split(comm_world, color, world_rank,
& new_comm2, ierr)
! Only new_comm is freed; new_comm2 is not freed before MPI_Finalize.
call MPI_Comm_free(new_comm, ierr)
call MPI_Finalize(ierr)
end program
编译并链接重新定义的 MPI_Comm_split
和 MPI_Comm_free
mpif77 test.f comm_split.o
mpiCC test.cpp comm_split.o
对于 Fortran 案例 你会得到类似于
MPI_Comm_split 1 from 3
MPI_Comm_split 1 from 0
MPI_Comm_split 1 from 1
MPI_Comm_split 1 from 2
MPI_Comm_split 2 from 0
PMPI_Comm_free 1 from 0
MPI_Comm_split 2 from 1
PMPI_Comm_free 1 from 1
MPI_Comm_split 2 from 2
PMPI_Comm_free 1 from 2
MPI_Comm_split 2 from 3
PMPI_Comm_free 1 from 3
它会显示每个进程中当前存在的通信器数量。
我有一个大代码,它因以下错误而崩溃:
Fatal error in PMPI_Comm_split: Other MPI error, error stack:
PMPI_Comm_split(532)................: MPI_Comm_split(comm=0xc4027cf0, color=0, key=0, new_comm=0x7ffdb50f2bd0) failed
PMPI_Comm_split(508)................: fail failed
MPIR_Comm_split_impl(260)...........: fail failed
MPIR_Get_contextid_sparse_group(676): Too many communicators (0/16384 free on this process; ignore_id=0)
Fatal error in PMPI_Comm_split: Other MPI error, error stack:
PMPI_Comm_split(532)................: MPI_Comm_split(comm=0xc401bcf1, color=1, key=0, new_comm=0x7ffed5aa4fd0) failed
PMPI_Comm_split(508)................: fail failed
MPIR_Comm_split_impl(260)...........: fail failed
MPIR_Get_contextid_sparse_group(676): Too many communicators (0/16384 free on this process; ignore_id=0)
Fatal error in PMPI_Comm_split: Other MPI error, error stack:
PMPI_Comm_split(532)................: MPI_Comm_split(comm=0xc4027ce9, color=0, key=0, new_comm=0x7ffe37e477d0) failed
PMPI_Comm_split(508)................: fail failed
MPIR_Comm_split_impl(260)...........: fail failed
MPIR_Get_contextid_sparse_group(676): Too many communicators (0/16384 free on this process; ignore_id=0)
Fatal error in PMPI_Comm_split: Other MPI error, error stack:
PMPI_Comm_split(532)................: MPI_Comm_split(comm=0xc401bcf1, color=1, key=0, new_comm=0x7ffd511ac4d0) failed
PMPI_Comm_split(508)................: fail failed
MPIR_Comm_split_impl(260)...........: fail failed
MPIR_Get_contextid_sparse_group(676): Too many communicators (0/16384 free on this process; ignore_id=0)
似乎存在某种 MPI 通信器泄漏。 MPI 似乎知道当前正在使用多少通信器:
Too many communicators (0/16384 free on this process; ignore_id=0)
有没有办法打印 MPI 当前正在使用的通信器数量?这样我就可以缩小通信器泄漏的排查范围。
您可以覆盖 MPI_Comm_split
和 MPI_Comm_free
的实现以手动计算 communicator 的创建和销毁。
这是一个简单的例子
覆盖 MPI_Comm_split
和 MPI_Comm_free
#include "mpi.h"
#include "stdio.h"
/* Net number of communicators seen by the wrappers in this file:
 * incremented by the MPI_Comm_split wrapper and decremented by the
 * MPI_Comm_free wrapper. */
static int comm_counter=0;
/*
 * Profiling-layer replacement for MPI_Comm_split that counts the
 * communicators created through it.
 *
 * The real work is forwarded to PMPI_Comm_split. The counter is bumped and
 * a trace line is printed only when a communicator was actually created:
 * a failed call, or a call that yields MPI_COMM_NULL (color ==
 * MPI_UNDEFINED), allocates nothing and therefore must not be counted.
 *
 * Parameters and return value are those of MPI_Comm_split; the return
 * code of PMPI_Comm_split is passed through unchanged.
 */
int MPI_Comm_split(MPI_Comm comm, int color, int key, MPI_Comm *newcomm)
{
    int world_rank;
    int rc;

    /* Split first so that only successful creations are counted. */
    rc = PMPI_Comm_split(comm, color, key, newcomm);
    if (rc != MPI_SUCCESS || *newcomm == MPI_COMM_NULL) {
        return rc;
    }

    /* Use the PMPI entry point so the wrapper never re-enters the
     * profiling layer. */
    PMPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
    comm_counter++;
    printf("%s %i %s %i\n", "MPI_Comm_split ", comm_counter, " from ", world_rank);
    return rc;
}
/*
 * Profiling-layer replacement for MPI_Comm_free that counts the
 * communicators released through it.
 *
 * The real work is forwarded to PMPI_Comm_free. The counter is decremented
 * and a trace line is printed only when the free succeeded, so a failed
 * call does not make the count drift.
 *
 * Note: every successful free decrements the counter, including frees of
 * communicators that were not created through the MPI_Comm_split wrapper
 * (those were never counted on creation).
 *
 * Parameters and return value are those of MPI_Comm_free; the return code
 * of PMPI_Comm_free is passed through unchanged.
 */
int MPI_Comm_free(MPI_Comm *comm)
{
    int world_rank;
    int rc;

    /* Free first so that only successful releases are counted. */
    rc = PMPI_Comm_free(comm);
    if (rc != MPI_SUCCESS) {
        return rc;
    }

    /* Use the PMPI entry point so the wrapper never re-enters the
     * profiling layer. */
    PMPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
    comm_counter--;
    printf("%s %i %s %i\n", "PMPI_Comm_free ", comm_counter, " from ", world_rank);
    return rc;
}
将此代码编译为目标文件,以便之后与您的程序链接。
就我而言,我做了 mpicc -c comm_split.c -o comm_split.o
您的代码保持不变。您可以在不进行其他修改的情况下使用它。
使用 MPI_Comm_split
和 MPI_Comm_free
C++ 案例
#include "mpi.h"
int main()
{
MPI_Init(NULL, NULL);
// Get the rank and size in the original communicator
int world_rank, world_size;
MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
MPI_Comm_size(MPI_COMM_WORLD, &world_size);
int color = world_rank / 4; // Determine color based on row
// Split the communicator based on the color and use the
// original rank for ordering
MPI_Comm row_comm, row_comm2;
MPI_Comm_split(MPI_COMM_WORLD, color, world_rank, &row_comm);
MPI_Comm_split(MPI_COMM_WORLD, color, world_rank, &row_comm2);
int row_rank, row_size;
MPI_Comm_rank(row_comm, &row_rank);
MPI_Comm_size(row_comm, &row_size);
printf("WORLD RANK/SIZE: %d/%d \t ROW RANK/SIZE: %d/%d\n",
world_rank, world_size, row_rank, row_size);
MPI_Comm_free(&row_comm);
MPI_Finalize();
}
Fortran 案例
! Example driver: splits the world communicator twice with the same
! color/key and frees only the first resulting communicator.
program test
include "mpif.h"
! group_world and world_size are declared but not used below.
integer comm_world, group_world, new_comm, new_comm2, ierr
integer world_rank, world_size;
integer color
call MPI_INIT(ierr)
comm_world = MPI_COMM_WORLD
call MPI_Comm_rank(comm_world, world_rank, ierr);
! Integer division: every four consecutive ranks share one color.
color = world_rank / 4
call MPI_Comm_split(comm_world, color, world_rank, new_comm, ierr)
call MPI_Comm_split(comm_world, color, world_rank,
& new_comm2, ierr)
! Only new_comm is freed; new_comm2 is not freed before MPI_Finalize.
call MPI_Comm_free(new_comm, ierr)
call MPI_Finalize(ierr)
end program
编译并链接重新定义的 MPI_Comm_split
和 MPI_Comm_free
mpif77 test.f comm_split.o
mpiCC test.cpp comm_split.o
对于 Fortran 案例 你会得到类似于
MPI_Comm_split 1 from 3
MPI_Comm_split 1 from 0
MPI_Comm_split 1 from 1
MPI_Comm_split 1 from 2
MPI_Comm_split 2 from 0
PMPI_Comm_free 1 from 0
MPI_Comm_split 2 from 1
PMPI_Comm_free 1 from 1
MPI_Comm_split 2 from 2
PMPI_Comm_free 1 from 2
MPI_Comm_split 2 from 3
PMPI_Comm_free 1 from 3
它会显示每个进程中当前存在的通信器数量。