解决处理笛卡尔网格的 MPI 程序中的死锁问题
Trouble Resolving Deadlock in MPI Program dealing with a Cartesian mesh
我正在实现大炮的算法。我 运行 它使用 4 个处理器。当我进入循环时遇到死锁:
for (i=0; i<dims[0]; i++) {
Multiply(nlocal, a, b, c);
MPI_Sendrecv_replace(a, nlocal*nlocal, MPI_DOUBLE,leftrank, 1, rightrank, 1, comm_2d, &status);
MPI_Sendrecv_replace(b, nlocal*nlocal, MPI_DOUBLE,uprank, 1, downrank, 1, comm_2d, &status);
}
完整代码在这里:
#include <math.h>
#include <mpi.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
void Multiply(int n, double *a, double *b, double *c);
double* readMatrix(char* filename, int* size);
void writeMatrix(double* matrix, char* filename, int size);
int main(int argc, char* argv[])
{
MPI_Init(&argc, &argv);
double* a,*b,*c;
int i, t, n;
int nlocal;
int npes, dims[2], periods[2];
int myrank, my2drank, mycoords[2];
int uprank, downrank, leftrank, rightrank, coords[2];
int shiftsource, shiftdest;
MPI_Status status;
MPI_Comm comm_2d;
MPI_Comm_size(MPI_COMM_WORLD, &npes);
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
MPI_Barrier(MPI_COMM_WORLD);
t = -MPI_Wtime();
if (myrank == 0) {
int sizeA,sizeB;
printf("Reading %s\n", argv[1]);
a = readMatrix(argv[1], &sizeA);
b = readMatrix(argv[2], &sizeB);
printf("Reading %s\n", argv[2]);
c = calloc(sizeA*sizeB, sizeof(double));
n = sizeA;
MPI_Bcast(&n, 1, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Bcast(a, n*n, MPI_DOUBLE, 0, MPI_COMM_WORLD);
MPI_Bcast(b, n*n, MPI_DOUBLE, 0, MPI_COMM_WORLD);
MPI_Bcast(c, n*n, MPI_DOUBLE, 0, MPI_COMM_WORLD);
if (sizeA != sizeB) {
printf("Matrix not sized n^2\n");
MPI_Abort(MPI_COMM_WORLD, 0);
}
}
else {
a = calloc(n*n, sizeof(double));
b = calloc(n*n, sizeof(double));
c = calloc(n*n, sizeof(double));
MPI_Bcast(&n, 1, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Bcast(a, n*n, MPI_DOUBLE, 0, MPI_COMM_WORLD);
MPI_Bcast(b, n*n, MPI_DOUBLE, 0, MPI_COMM_WORLD);
MPI_Bcast(c, n*n, MPI_DOUBLE, 0, MPI_COMM_WORLD);
}
dims[0] = dims[1] = sqrt(npes);
periods[0] = periods[1] = 1;
MPI_Cart_create(MPI_COMM_WORLD, 2, dims, periods, 1, &comm_2d);
MPI_Comm_rank(comm_2d, &my2drank);
MPI_Cart_coords(comm_2d, my2drank, 2, mycoords);
MPI_Cart_shift(comm_2d, 0, -1, &rightrank, &leftrank);
MPI_Cart_shift(comm_2d, 1, -1, &downrank, &uprank);
nlocal = n/dims[0];
MPI_Cart_shift(comm_2d, 0, -mycoords[0], &shiftsource, &shiftdest);
MPI_Sendrecv_replace(a, nlocal*nlocal, MPI_DOUBLE, shiftdest,1, shiftsource, 1, comm_2d, &status);
MPI_Cart_shift(comm_2d, 1, -mycoords[1], &shiftsource, &shiftdest);
MPI_Sendrecv_replace(b, nlocal*nlocal, MPI_DOUBLE,shiftdest, 1, shiftsource, 1, comm_2d, &status);
printf("rank[%d] has entered loop\n", myrank);
for (i=0; i<dims[0]; i++) {
Multiply(nlocal, a, b, c);
MPI_Sendrecv_replace(a, nlocal*nlocal, MPI_DOUBLE,leftrank, 1, rightrank, 1, comm_2d, &status);
MPI_Sendrecv_replace(b, nlocal*nlocal, MPI_DOUBLE,uprank, 1, downrank, 1, comm_2d, &status);
}
printf("rank[%d] has left loop\n", myrank);
MPI_Cart_shift(comm_2d, 0, +mycoords[0], &shiftsource, &shiftdest);
MPI_Sendrecv_replace(a, nlocal*nlocal, MPI_DOUBLE,shiftdest, 1, shiftsource, 1, comm_2d, &status);
MPI_Cart_shift(comm_2d, 1, +mycoords[1], &shiftsource, &shiftdest);
MPI_Sendrecv_replace(b, nlocal*nlocal, MPI_DOUBLE,shiftdest, 1, shiftsource, 1, comm_2d, &status);
printf("rank[%d] has reached the barrier...\n", myrank);
MPI_Barrier(MPI_COMM_WORLD);
if (myrank == 0) {
t += MPI_Wtime();
writeMatrix(c, argv[3], n);
printf("%s %d second(s)\n", "Finshed in", t);
}
free(a); free(b); free(c);
MPI_Comm_free(&comm_2d);
MPI_Finalize();
}
double* readMatrix(char* filename, int* size) {
FILE* file_handle = fopen(filename, "r");
int row;
int col;
fread(&row, sizeof(int), 1, file_handle);
fread(&col, sizeof(int), 1, file_handle);
if (row == col) {
*size = row;
}
else {
*size = -1;
return NULL;
}
double* buffer = calloc(row*col, sizeof(double));
for(int i = 0; i < row; i++) {
for(int j = 0; j < col; j++) {
double x;
fread(&x, sizeof(double), 1, file_handle);
buffer[row * i + j] = x;
}
}
fclose(file_handle);
printf("Buffer has size %d\n", row*col);
return buffer;
}
void writeMatrix(double* matrix, char* filename, int size) {
FILE* file_handle = fopen(filename, "w");
fwrite(&size, sizeof(int), 1, file_handle);
fwrite(&size, sizeof(int), 1, file_handle);
for(int i = 0; i < size; i++) {
for(int j = 0; j < size; j++) {
double x = matrix[size * i + j];
fwrite(&x, sizeof(double), 1, file_handle);
}
}
fclose(file_handle);
}
void Multiply(int n, double *a, double *b, double *c)
{
int i, j, k;
for (i=0; i<n; i++)
for (j=0; j<n; j++)
for (k=0; k<n; k++)
c[i*n+j] += a[i*n+k]*b[k*n+j];
}
如果代码太多,我可以轻松删除某些部分。我只是想知道是什么导致了僵局以及如何解决它。提前感谢您的宝贵时间。
重要信息:
等级0总是撞到障碍。但是由于其他 3 个陷入僵局,第 0 级被卡住,直到所有人都遇到障碍。
输出
Reading 10
Buffer has size 100
Buffer has size 100
Reading 10
rank[0] has entered loop
rank[0] has left loop
rank[0] has reached the barrier...
rank[1] has entered loop
rank[2] has entered loop
rank[3] has entered loop
有两个小问题可以使某些东西正常工作:
行数:
a = calloc(n*n, sizeof(double));
b = calloc(n*n, sizeof(double));
c = calloc(n*n, sizeof(double));
MPI_Bcast(&n, 1, MPI_INT, 0, MPI_COMM_WORLD);
n
应在分配 a
之前广播。否则,n
未初始化且输出未定义。它可以触发分段错误。
在函数MPI_Cart_shift
, the third argument is the displacement : negative for downward and positive for upward. I changed it to set the same displacement for everyone and it worked fine. Even if MPI_Sendrecv_replace()
中,一个进程收到的消息数必须与发送给该进程的消息数相匹配。您致电 MPI_Sendrecv_replace()
时可能并非如此:
MPI_Cart_shift(comm_2d, 0, -mycoords[0], &shiftsource, &shiftdest);
MPI_Sendrecv_replace(a, nlocal*nlocal, MPI_DOUBLE, shiftdest,1, shiftsource, 1, comm_2d, &status);
在"skew" example of open-mpi中略有不同:
C compute shift source and destination
CALL MPI_CART_SHIFT(comm, 0, coords(2), source,
dest, ierr)
C skew array
CALL MPI_SENDRECV_REPLACE(A, 1, MPI_REAL, dest, 0,
source, 0, comm, status,
ierr)
在这种情况下,每行中的所有进程都获得相同的位移。因此,每个进程都应该发送一条消息,每个进程都应该接收一条消息。然而,位移取决于线和矩阵是倾斜的。
这是生成的代码。它由 mpicc main.c -o main -lm -Wall
和 运行 由 mpirun -np 4 main
编译:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "mpi.h"
int main(int argc, char **argv)
{
MPI_Init(&argc, &argv);
double* a,*b,*c;
int i, t, n;
int nlocal;
int npes, dims[2], periods[2];
int myrank, my2drank, mycoords[2];
int uprank, downrank, leftrank, rightrank;
int shiftsource, shiftdest;
MPI_Status status;
MPI_Comm comm_2d;
MPI_Comm_size(MPI_COMM_WORLD, &npes);
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
MPI_Barrier(MPI_COMM_WORLD);
t = -MPI_Wtime();
if (myrank == 0) {
int sizeA,sizeB;
printf("Reading \n");
// a = readMatrix(argv[1], &sizeA);
sizeA=16;
a=malloc(sizeA*sizeA*sizeof(double));
// b = readMatrix(argv[2], &sizeB);
sizeB=16;
b=malloc(sizeB*sizeB*sizeof(double));
printf("Reading \n");
c = calloc(sizeA*sizeB, sizeof(double));
n = sizeA;
MPI_Bcast(&n, 1, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Bcast(a, n*n, MPI_DOUBLE, 0, MPI_COMM_WORLD);
MPI_Bcast(b, n*n, MPI_DOUBLE, 0, MPI_COMM_WORLD);
MPI_Bcast(c, n*n, MPI_DOUBLE, 0, MPI_COMM_WORLD);
if (sizeA != sizeB) {
printf("Matrix not sized n^2\n");
MPI_Abort(MPI_COMM_WORLD, 0);
}
}
else {
MPI_Bcast(&n, 1, MPI_INT, 0, MPI_COMM_WORLD);//n should be broadcast before allocation
a = calloc(n*n, sizeof(double));
b = calloc(n*n, sizeof(double));
c = calloc(n*n, sizeof(double));
MPI_Bcast(a, n*n, MPI_DOUBLE, 0, MPI_COMM_WORLD);
MPI_Bcast(b, n*n, MPI_DOUBLE, 0, MPI_COMM_WORLD);
MPI_Bcast(c, n*n, MPI_DOUBLE, 0, MPI_COMM_WORLD);
}
dims[0] = dims[1] = sqrt(npes);
periods[0] = periods[1] = 1;
MPI_Cart_create(MPI_COMM_WORLD, 2, dims, periods, 1, &comm_2d);
MPI_Comm_rank(comm_2d, &my2drank);
MPI_Cart_coords(comm_2d, my2drank, 2, mycoords);
MPI_Cart_shift(comm_2d, 0, -1, &rightrank, &leftrank);
MPI_Cart_shift(comm_2d, 1, -1, &downrank, &uprank);
nlocal = n/dims[0];
MPI_Cart_shift(comm_2d, 0, -1, &shiftsource, &shiftdest);
// MPI_Cart_shift(comm_2d, 0, -mycoords[0], &shiftsource, &shiftdest);
MPI_Sendrecv_replace(a, nlocal*nlocal, MPI_DOUBLE, shiftdest,5, shiftsource, 5, comm_2d, &status);
// MPI_Cart_shift(comm_2d, 1, -mycoords[1], &shiftsource, &shiftdest);
MPI_Cart_shift(comm_2d, 1, -1, &shiftsource, &shiftdest);
MPI_Sendrecv_replace(b, nlocal*nlocal, MPI_DOUBLE,shiftdest, 6, shiftsource, 6, comm_2d, &status);
printf("rank[%d] has entered loop dim %d\n", myrank,dims[0]);fflush(stdout);
for (i=0; i<dims[0]; i++) {
// Multiply(nlocal, a, b, c);
MPI_Sendrecv_replace(a, nlocal*nlocal, MPI_DOUBLE,leftrank, 1, rightrank, 1, comm_2d, &status);
MPI_Sendrecv_replace(b, nlocal*nlocal, MPI_DOUBLE,uprank, 2, downrank, 2, comm_2d, &status);
}
printf("rank[%d] has left loop\n", myrank);fflush(stdout);
MPI_Barrier(MPI_COMM_WORLD);
// MPI_Cart_shift(comm_2d, 0, +mycoords[0], &shiftsource, &shiftdest);
MPI_Cart_shift(comm_2d, 0, 1, &shiftsource, &shiftdest);
MPI_Sendrecv_replace(a, nlocal*nlocal, MPI_DOUBLE,shiftdest, 3, shiftsource, 3, comm_2d, &status);
MPI_Cart_shift(comm_2d, 1, 1, &shiftsource, &shiftdest);
//MPI_Cart_shift(comm_2d, 1, +mycoords[1], &shiftsource, &shiftdest);
MPI_Sendrecv_replace(b, nlocal*nlocal, MPI_DOUBLE,shiftdest, 4, shiftsource, 4, comm_2d, &status);
printf("rank[%d] has reached the barrier...\n", myrank);fflush(stdout);
MPI_Barrier(MPI_COMM_WORLD);
if (myrank == 0) {
t += MPI_Wtime();
// writeMatrix(c, argv[3], n);
printf("Finshed in %d second(s)\n",t);
}
free(a); free(b); free(c);
MPI_Comm_free(&comm_2d);
MPI_Finalize();
return 0;
}
我正在实现大炮的算法。我 运行 它使用 4 个处理器。当我进入循环时遇到死锁:
for (i=0; i<dims[0]; i++) {
Multiply(nlocal, a, b, c);
MPI_Sendrecv_replace(a, nlocal*nlocal, MPI_DOUBLE,leftrank, 1, rightrank, 1, comm_2d, &status);
MPI_Sendrecv_replace(b, nlocal*nlocal, MPI_DOUBLE,uprank, 1, downrank, 1, comm_2d, &status);
}
完整代码在这里:
#include <math.h>
#include <mpi.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
void Multiply(int n, double *a, double *b, double *c);
double* readMatrix(char* filename, int* size);
void writeMatrix(double* matrix, char* filename, int size);
int main(int argc, char* argv[])
{
MPI_Init(&argc, &argv);
double* a,*b,*c;
int i, t, n;
int nlocal;
int npes, dims[2], periods[2];
int myrank, my2drank, mycoords[2];
int uprank, downrank, leftrank, rightrank, coords[2];
int shiftsource, shiftdest;
MPI_Status status;
MPI_Comm comm_2d;
MPI_Comm_size(MPI_COMM_WORLD, &npes);
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
MPI_Barrier(MPI_COMM_WORLD);
t = -MPI_Wtime();
if (myrank == 0) {
int sizeA,sizeB;
printf("Reading %s\n", argv[1]);
a = readMatrix(argv[1], &sizeA);
b = readMatrix(argv[2], &sizeB);
printf("Reading %s\n", argv[2]);
c = calloc(sizeA*sizeB, sizeof(double));
n = sizeA;
MPI_Bcast(&n, 1, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Bcast(a, n*n, MPI_DOUBLE, 0, MPI_COMM_WORLD);
MPI_Bcast(b, n*n, MPI_DOUBLE, 0, MPI_COMM_WORLD);
MPI_Bcast(c, n*n, MPI_DOUBLE, 0, MPI_COMM_WORLD);
if (sizeA != sizeB) {
printf("Matrix not sized n^2\n");
MPI_Abort(MPI_COMM_WORLD, 0);
}
}
else {
a = calloc(n*n, sizeof(double));
b = calloc(n*n, sizeof(double));
c = calloc(n*n, sizeof(double));
MPI_Bcast(&n, 1, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Bcast(a, n*n, MPI_DOUBLE, 0, MPI_COMM_WORLD);
MPI_Bcast(b, n*n, MPI_DOUBLE, 0, MPI_COMM_WORLD);
MPI_Bcast(c, n*n, MPI_DOUBLE, 0, MPI_COMM_WORLD);
}
dims[0] = dims[1] = sqrt(npes);
periods[0] = periods[1] = 1;
MPI_Cart_create(MPI_COMM_WORLD, 2, dims, periods, 1, &comm_2d);
MPI_Comm_rank(comm_2d, &my2drank);
MPI_Cart_coords(comm_2d, my2drank, 2, mycoords);
MPI_Cart_shift(comm_2d, 0, -1, &rightrank, &leftrank);
MPI_Cart_shift(comm_2d, 1, -1, &downrank, &uprank);
nlocal = n/dims[0];
MPI_Cart_shift(comm_2d, 0, -mycoords[0], &shiftsource, &shiftdest);
MPI_Sendrecv_replace(a, nlocal*nlocal, MPI_DOUBLE, shiftdest,1, shiftsource, 1, comm_2d, &status);
MPI_Cart_shift(comm_2d, 1, -mycoords[1], &shiftsource, &shiftdest);
MPI_Sendrecv_replace(b, nlocal*nlocal, MPI_DOUBLE,shiftdest, 1, shiftsource, 1, comm_2d, &status);
printf("rank[%d] has entered loop\n", myrank);
for (i=0; i<dims[0]; i++) {
Multiply(nlocal, a, b, c);
MPI_Sendrecv_replace(a, nlocal*nlocal, MPI_DOUBLE,leftrank, 1, rightrank, 1, comm_2d, &status);
MPI_Sendrecv_replace(b, nlocal*nlocal, MPI_DOUBLE,uprank, 1, downrank, 1, comm_2d, &status);
}
printf("rank[%d] has left loop\n", myrank);
MPI_Cart_shift(comm_2d, 0, +mycoords[0], &shiftsource, &shiftdest);
MPI_Sendrecv_replace(a, nlocal*nlocal, MPI_DOUBLE,shiftdest, 1, shiftsource, 1, comm_2d, &status);
MPI_Cart_shift(comm_2d, 1, +mycoords[1], &shiftsource, &shiftdest);
MPI_Sendrecv_replace(b, nlocal*nlocal, MPI_DOUBLE,shiftdest, 1, shiftsource, 1, comm_2d, &status);
printf("rank[%d] has reached the barrier...\n", myrank);
MPI_Barrier(MPI_COMM_WORLD);
if (myrank == 0) {
t += MPI_Wtime();
writeMatrix(c, argv[3], n);
printf("%s %d second(s)\n", "Finshed in", t);
}
free(a); free(b); free(c);
MPI_Comm_free(&comm_2d);
MPI_Finalize();
}
double* readMatrix(char* filename, int* size) {
FILE* file_handle = fopen(filename, "r");
int row;
int col;
fread(&row, sizeof(int), 1, file_handle);
fread(&col, sizeof(int), 1, file_handle);
if (row == col) {
*size = row;
}
else {
*size = -1;
return NULL;
}
double* buffer = calloc(row*col, sizeof(double));
for(int i = 0; i < row; i++) {
for(int j = 0; j < col; j++) {
double x;
fread(&x, sizeof(double), 1, file_handle);
buffer[row * i + j] = x;
}
}
fclose(file_handle);
printf("Buffer has size %d\n", row*col);
return buffer;
}
void writeMatrix(double* matrix, char* filename, int size) {
FILE* file_handle = fopen(filename, "w");
fwrite(&size, sizeof(int), 1, file_handle);
fwrite(&size, sizeof(int), 1, file_handle);
for(int i = 0; i < size; i++) {
for(int j = 0; j < size; j++) {
double x = matrix[size * i + j];
fwrite(&x, sizeof(double), 1, file_handle);
}
}
fclose(file_handle);
}
void Multiply(int n, double *a, double *b, double *c)
{
int i, j, k;
for (i=0; i<n; i++)
for (j=0; j<n; j++)
for (k=0; k<n; k++)
c[i*n+j] += a[i*n+k]*b[k*n+j];
}
如果代码太多,我可以轻松删除某些部分。我只是想知道是什么导致了僵局以及如何解决它。提前感谢您的宝贵时间。
重要信息:
等级0总是撞到障碍。但是由于其他 3 个陷入僵局,第 0 级被卡住,直到所有人都遇到障碍。
输出
Reading 10
Buffer has size 100
Buffer has size 100
Reading 10
rank[0] has entered loop
rank[0] has left loop
rank[0] has reached the barrier...
rank[1] has entered loop
rank[2] has entered loop
rank[3] has entered loop
有两个小问题可以使某些东西正常工作:
行数:
a = calloc(n*n, sizeof(double)); b = calloc(n*n, sizeof(double)); c = calloc(n*n, sizeof(double)); MPI_Bcast(&n, 1, MPI_INT, 0, MPI_COMM_WORLD);
n
应在分配 a
之前广播。否则,n
未初始化且输出未定义。它可以触发分段错误。
在函数
MPI_Cart_shift
, the third argument is the displacement : negative for downward and positive for upward. I changed it to set the same displacement for everyone and it worked fine. Even ifMPI_Sendrecv_replace()
中,一个进程收到的消息数必须与发送给该进程的消息数相匹配。您致电MPI_Sendrecv_replace()
时可能并非如此:MPI_Cart_shift(comm_2d, 0, -mycoords[0], &shiftsource, &shiftdest); MPI_Sendrecv_replace(a, nlocal*nlocal, MPI_DOUBLE, shiftdest,1, shiftsource, 1, comm_2d, &status);
在"skew" example of open-mpi中略有不同:
C compute shift source and destination
CALL MPI_CART_SHIFT(comm, 0, coords(2), source,
dest, ierr)
C skew array
CALL MPI_SENDRECV_REPLACE(A, 1, MPI_REAL, dest, 0,
source, 0, comm, status,
ierr)
在这种情况下,每行中的所有进程都获得相同的位移。因此,每个进程都应该发送一条消息,每个进程都应该接收一条消息。然而,位移取决于线和矩阵是倾斜的。
这是生成的代码。它由 mpicc main.c -o main -lm -Wall
和 运行 由 mpirun -np 4 main
编译:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "mpi.h"
int main(int argc, char **argv)
{
MPI_Init(&argc, &argv);
double* a,*b,*c;
int i, t, n;
int nlocal;
int npes, dims[2], periods[2];
int myrank, my2drank, mycoords[2];
int uprank, downrank, leftrank, rightrank;
int shiftsource, shiftdest;
MPI_Status status;
MPI_Comm comm_2d;
MPI_Comm_size(MPI_COMM_WORLD, &npes);
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
MPI_Barrier(MPI_COMM_WORLD);
t = -MPI_Wtime();
if (myrank == 0) {
int sizeA,sizeB;
printf("Reading \n");
// a = readMatrix(argv[1], &sizeA);
sizeA=16;
a=malloc(sizeA*sizeA*sizeof(double));
// b = readMatrix(argv[2], &sizeB);
sizeB=16;
b=malloc(sizeB*sizeB*sizeof(double));
printf("Reading \n");
c = calloc(sizeA*sizeB, sizeof(double));
n = sizeA;
MPI_Bcast(&n, 1, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Bcast(a, n*n, MPI_DOUBLE, 0, MPI_COMM_WORLD);
MPI_Bcast(b, n*n, MPI_DOUBLE, 0, MPI_COMM_WORLD);
MPI_Bcast(c, n*n, MPI_DOUBLE, 0, MPI_COMM_WORLD);
if (sizeA != sizeB) {
printf("Matrix not sized n^2\n");
MPI_Abort(MPI_COMM_WORLD, 0);
}
}
else {
MPI_Bcast(&n, 1, MPI_INT, 0, MPI_COMM_WORLD);//n should be broadcast before allocation
a = calloc(n*n, sizeof(double));
b = calloc(n*n, sizeof(double));
c = calloc(n*n, sizeof(double));
MPI_Bcast(a, n*n, MPI_DOUBLE, 0, MPI_COMM_WORLD);
MPI_Bcast(b, n*n, MPI_DOUBLE, 0, MPI_COMM_WORLD);
MPI_Bcast(c, n*n, MPI_DOUBLE, 0, MPI_COMM_WORLD);
}
dims[0] = dims[1] = sqrt(npes);
periods[0] = periods[1] = 1;
MPI_Cart_create(MPI_COMM_WORLD, 2, dims, periods, 1, &comm_2d);
MPI_Comm_rank(comm_2d, &my2drank);
MPI_Cart_coords(comm_2d, my2drank, 2, mycoords);
MPI_Cart_shift(comm_2d, 0, -1, &rightrank, &leftrank);
MPI_Cart_shift(comm_2d, 1, -1, &downrank, &uprank);
nlocal = n/dims[0];
MPI_Cart_shift(comm_2d, 0, -1, &shiftsource, &shiftdest);
// MPI_Cart_shift(comm_2d, 0, -mycoords[0], &shiftsource, &shiftdest);
MPI_Sendrecv_replace(a, nlocal*nlocal, MPI_DOUBLE, shiftdest,5, shiftsource, 5, comm_2d, &status);
// MPI_Cart_shift(comm_2d, 1, -mycoords[1], &shiftsource, &shiftdest);
MPI_Cart_shift(comm_2d, 1, -1, &shiftsource, &shiftdest);
MPI_Sendrecv_replace(b, nlocal*nlocal, MPI_DOUBLE,shiftdest, 6, shiftsource, 6, comm_2d, &status);
printf("rank[%d] has entered loop dim %d\n", myrank,dims[0]);fflush(stdout);
for (i=0; i<dims[0]; i++) {
// Multiply(nlocal, a, b, c);
MPI_Sendrecv_replace(a, nlocal*nlocal, MPI_DOUBLE,leftrank, 1, rightrank, 1, comm_2d, &status);
MPI_Sendrecv_replace(b, nlocal*nlocal, MPI_DOUBLE,uprank, 2, downrank, 2, comm_2d, &status);
}
printf("rank[%d] has left loop\n", myrank);fflush(stdout);
MPI_Barrier(MPI_COMM_WORLD);
// MPI_Cart_shift(comm_2d, 0, +mycoords[0], &shiftsource, &shiftdest);
MPI_Cart_shift(comm_2d, 0, 1, &shiftsource, &shiftdest);
MPI_Sendrecv_replace(a, nlocal*nlocal, MPI_DOUBLE,shiftdest, 3, shiftsource, 3, comm_2d, &status);
MPI_Cart_shift(comm_2d, 1, 1, &shiftsource, &shiftdest);
//MPI_Cart_shift(comm_2d, 1, +mycoords[1], &shiftsource, &shiftdest);
MPI_Sendrecv_replace(b, nlocal*nlocal, MPI_DOUBLE,shiftdest, 4, shiftsource, 4, comm_2d, &status);
printf("rank[%d] has reached the barrier...\n", myrank);fflush(stdout);
MPI_Barrier(MPI_COMM_WORLD);
if (myrank == 0) {
t += MPI_Wtime();
// writeMatrix(c, argv[3], n);
printf("Finshed in %d second(s)\n",t);
}
free(a); free(b); free(c);
MPI_Comm_free(&comm_2d);
MPI_Finalize();
return 0;
}