Send an array over a custom communicator
1. Goal
I have to distribute an array called A_loc over a custom communicator (not MPI_COMM_WORLD). Suppose we want to distribute an array over the mesh_r communicator:
P0-P1
| |
P2-P3
where - represents the mesh_r (mesh_rows) communicator and | represents the mesh_c (mesh_columns) communicator, built by the build_mesh routine.
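To make the layout concrete, here is a small standalone sketch (separate from the program in section 2, and assuming exactly 4 processes) that builds the same 2 x 2 mesh and prints, for each world rank, its Cartesian coordinates and its rank inside mesh_r and mesh_c:

#include <stdio.h>
#include <mpi.h>

int main(int argc, char *argv[])
{
    int world_rank, coords[2];
    int dims[2] = {2, 2};        /* 2 x 2 mesh: needs exactly 4 processes */
    int periods[2] = {0, 0};     /* no wrap-around in either dimension */
    int keep_cols[2] = {0, 1};   /* drop the row dimension -> row communicator mesh_r */
    int keep_rows[2] = {1, 0};   /* drop the column dimension -> column communicator mesh_c */
    MPI_Comm mesh, mesh_r, mesh_c;
    int rank_r, rank_c;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
    MPI_Cart_create(MPI_COMM_WORLD, 2, dims, periods, 0, &mesh);
    MPI_Cart_coords(mesh, world_rank, 2, coords);
    MPI_Cart_sub(mesh, keep_cols, &mesh_r);   /* groups P0-P1 and P2-P3 */
    MPI_Cart_sub(mesh, keep_rows, &mesh_c);   /* groups P0|P2 and P1|P3 */
    MPI_Comm_rank(mesh_r, &rank_r);
    MPI_Comm_rank(mesh_c, &rank_c);
    printf("world %d -> coords (%d,%d), mesh_r rank %d, mesh_c rank %d\n",
           world_rank, coords[0], coords[1], rank_r, rank_c);
    MPI_Finalize();
    return 0;
}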
2. Code
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
#include <mpi.h>
bool is_divisible(int, int);
void build_mesh(MPI_Comm*, MPI_Comm*, MPI_Comm*, int, int, int, int, int*);
int *fill_matrix(int*, int, int);
void print_matrix(int*, int, int, int, int);
void handle_errors(int, int, int, int);
void distribute(int*, int*, int, int, int, int, int, int, int);
void debug(int*, int*, int, int, int, int, int, int, int);
int main(int argc, char *argv[])
{
    int process_rank, world_size;
    int mesh_rows, mesh_columns;
    int mesh_dimension = 2;
    int *process_coordinates;
    MPI_Comm mesh, mesh_r, mesh_c;
    int process_rank_mesh;
    int *A, *A_loc;
    int *B, *B_loc;
    int m, n, mloc, nloc;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &process_rank);
    MPI_Comm_size(MPI_COMM_WORLD, &world_size);
    if (process_rank == 0) {
        m = n = world_size * 1; // multiple of world_size = 4
    }
    MPI_Bcast(&m, 1, MPI_INT, 0, MPI_COMM_WORLD);
    MPI_Bcast(&n, 1, MPI_INT, 0, MPI_COMM_WORLD);
    A = fill_matrix(A, m, n);
    B = fill_matrix(B, m, n);
    // choose the mesh shape on rank 0 only, then broadcast it
    if (process_rank == 0) {
        mesh_rows = 2;
        if (is_divisible(world_size, mesh_rows))
            mesh_columns = world_size / mesh_rows;
        else {
            mesh_rows = 1;
            mesh_columns = world_size / mesh_rows;
        }
    }
    MPI_Bcast(&mesh_rows, 1, MPI_INT, 0, MPI_COMM_WORLD);
    MPI_Bcast(&mesh_columns, 1, MPI_INT, 0, MPI_COMM_WORLD);
    process_coordinates = (int*) calloc(mesh_dimension, sizeof(int));
    build_mesh(&mesh, &mesh_r, &mesh_c, process_rank, world_size, mesh_rows, mesh_columns, process_coordinates);
    MPI_Comm_rank(mesh, &process_rank_mesh);
    mloc = m / mesh_rows;
    nloc = n / mesh_columns;
    handle_errors(m, n, world_size, process_rank);
    A_loc = (int*) calloc(mloc * nloc, sizeof(int));
    distribute(A, A_loc, m, n, mloc, nloc, world_size, mesh_rows, mesh_columns);
    B_loc = (int*) calloc(mloc * nloc, sizeof(int));
    distribute(B, B_loc, m, n, mloc, nloc, world_size, mesh_rows, mesh_columns);
    // I want to re-write this part so I can exploit mesh_r communicator instead of MPI_COMM_WORLD...
    int *A_loc_add = (int*) calloc(mloc * nloc, sizeof(int));
    if (process_rank == 0) {
        MPI_Send(A_loc, mloc * nloc, MPI_INT, 1, 10, MPI_COMM_WORLD);
    } else if (process_rank == 3) {
        MPI_Send(A_loc, mloc * nloc, MPI_INT, 2, 20, MPI_COMM_WORLD);
    }
    MPI_Status status;
    if (process_rank == 1) {
        MPI_Recv(A_loc_add, mloc * nloc, MPI_INT, 0, 10, MPI_COMM_WORLD, &status);
    } else if (process_rank == 2) {
        MPI_Recv(A_loc_add, mloc * nloc, MPI_INT, 3, 20, MPI_COMM_WORLD, &status);
    }
    MPI_Finalize();
    return 0;
}
void distribute(int *Mat, int *Mat_loc, int m, int n, int mloc, int nloc, int world_size, int mesh_rows, int mesh_columns)
{
    // Datatype describing one mloc x nloc block inside the m x n matrix,
    // resized to one int so the Scatterv displacements can be given in elements.
    MPI_Datatype square_block;
    int stride = n;
    int count = mloc;
    int block_length = nloc;
    MPI_Type_vector(count, block_length, stride, MPI_INT, &square_block);
    MPI_Datatype square_block_resized;
    MPI_Type_create_resized(square_block, 0, sizeof(int), &square_block_resized);
    MPI_Type_commit(&square_block_resized);
    int *send_counts = (int*) calloc(world_size, sizeof(int));
    int *displs = (int*) calloc(world_size, sizeof(int));
    for (int i = 0; i < mesh_rows; i++) {
        for (int j = 0; j < mesh_columns; j++) {
            send_counts[i * mesh_columns + j] = 1;
            displs[i * mesh_columns + j] = i * n * block_length + j * block_length;
        }
    }
    MPI_Scatterv(Mat, send_counts, displs, square_block_resized, Mat_loc, mloc * nloc, MPI_INT, 0, MPI_COMM_WORLD);
}
bool is_divisible(int dividend, int divisor)
{
    return dividend % divisor == 0;
}
void build_mesh(MPI_Comm *mesh, MPI_Comm *mesh_r, MPI_Comm *mesh_c, int process_rank, int world_size,
                int mesh_rows, int mesh_columns, int *process_coordinates)
{
    int mesh_dimension = 2;
    int *mesh_n_dimension;
    int mesh_reorder = 0;
    int *mesh_period;
    int *remain_dims = (int*) calloc(mesh_dimension, sizeof(int));
    mesh_n_dimension = (int*) calloc(mesh_dimension, sizeof(int));
    mesh_n_dimension[0] = mesh_rows;
    mesh_n_dimension[1] = mesh_columns;
    mesh_period = (int*) calloc(mesh_dimension, sizeof(int));
    mesh_period[0] = mesh_period[1] = 0;
    MPI_Cart_create(MPI_COMM_WORLD, mesh_dimension, mesh_n_dimension, mesh_period, mesh_reorder, mesh);
    MPI_Cart_coords(*mesh, process_rank, mesh_dimension, process_coordinates);
    // keep only the column dimension: one mesh_r communicator per mesh row
    remain_dims[0] = 0;
    remain_dims[1] = 1;
    MPI_Cart_sub(*mesh, remain_dims, mesh_r);
    // keep only the row dimension: one mesh_c communicator per mesh column
    remain_dims[0] = 1;
    remain_dims[1] = 0;
    MPI_Cart_sub(*mesh, remain_dims, mesh_c);
}
int *fill_matrix(int *Mat, int m, int n)
{
    int k = 0;
    Mat = (int*) calloc(m * n, sizeof(int));
    for (int i = 0; i < m; i++)
        for (int j = 0; j < n; j++)
            Mat[i * n + j] = ++k;
    return Mat;
}
As you can see this works fine, but I would like to rewrite the commented part so that I can exploit the mesh_r communicator and distribute A_loc to every processor on mesh_r, instead of hard-coding send with dest = 1 and dest = 2 over MPI_COMM_WORLD.

Any help?
You should use Bcast, as in the earlier version of your code, rather than sends and receives. Your problem is that you are not thinking in a distributed way but are trying to keep a global view. What I mean is: after you create the sub-communicator mesh_r, every process seems to be in that communicator, but here it comes: there are multiple mesh_r communicators, and each process is part of exactly one of them. Each MPI process sees exactly the one mesh_r communicator it belongs to. Therefore the single line of code MPI_Bcast( ...buffer stuff..., mesh_r ) performs multiple broadcasts, one per mesh row.
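For instance, the commented block in main could become something like the sketch below. This is just one possible layout under my own assumptions: it reuses the variables already in scope at that point (mesh_r, mloc, nloc, A_loc, process_coordinates; <string.h> is already included), and it picks the broadcast root per row from the Cartesian row coordinate, so that on this 2 x 2 mesh P0 feeds P1 in the first row and P3 feeds P2 in the second, matching your hard-coded sends:

    int *A_loc_add = (int*) calloc(mloc * nloc, sizeof(int));

    /* Source of the broadcast inside each row: column 0 (P0) in row 0,
       column 1 (P3) in row 1. All members of a row compute the same root,
       because process_coordinates[0] is the row index they share. */
    int row_root = process_coordinates[0];

    int mesh_r_rank;
    MPI_Comm_rank(mesh_r, &mesh_r_rank);
    if (mesh_r_rank == row_root)
        memcpy(A_loc_add, A_loc, mloc * nloc * sizeof(int)); /* root sends its own block */

    /* A single call executed by every process, but each mesh_r communicator
       runs its own independent broadcast: one inside P0-P1, one inside P2-P3. */
    MPI_Bcast(A_loc_add, mloc * nloc, MPI_INT, row_root, mesh_r);

If instead you simply want the first process of every row to be the source, hard-code row_root = 0.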