MPI_Bcast Matrix Multiplication Setup
I am multiplying two matrices (2D arrays) in parallel with MPI by dividing the rows evenly and scattering them among the child processes; the master also works on a chunk of rows. I understand how to do this and have done it successfully with MPI_Send/MPI_Recv, but now I am trying to do it with MPI_Bcast and cannot figure out when to Bcast and what exactly to send. When I print the finished matrix (C) at various points, it seems that not all of the rows are being calculated/updated, and I know it is probably because I am not specifying the buffers correctly.
Code:
#include <iostream>
#include <stdlib.h>
#include <mpi.h>
#include <stdio.h>
#include <time.h>

using namespace std;

int main(int argc, char *argv[])
{
    int myid, nproc;
    int Ibuffer[200];        // Integer buffer, use proper size and type
    double Dbuffer[2000];    // Double buffer, use proper size and type
    char Sbuffer[200];       // String Buffer
    int msg_len;
    int i, j, k;

    // initialize the MPI Environment and get the needed Data
    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &nproc);
    MPI_Comm_rank(MPI_COMM_WORLD, &myid);

    // Get the name of processor
    MPI_Get_processor_name(Sbuffer, &msg_len);

    int RowA = 5,
        ColA = 2,
        RowB = ColA,
        ColB = 3,
        RowC = RowA,
        ColC = ColB;

    // Start clock
    double start_time = MPI_Wtime();

    // Initialize matrices
    double **matA = new double*[RowA];
    for (int i = 0; i < RowA; ++i)
        matA[i] = new double[ColA];

    double **matB = new double*[RowB];
    for (int i = 0; i < RowB; ++i)
        matB[i] = new double[ColB];

    double **matC = new double*[RowC];
    for (int i = 0; i < RowC; ++i)
        matC[i] = new double[ColC];

    for (int i = 0; i < RowA; i++)        // MatA
    {
        for (int j = 0; j < ColA; j++)
        {
            matA[i][j] = 2;
        }
    }

    for (int i = 0; i < RowB; i++)        // MatB
    {
        for (int j = 0; j < ColB; j++)
        {
            matB[i][j] = 2;
        }
    }

    for (int i = 0; i < RowC; i++)        // MatC
    {
        for (int j = 0; j < ColC; j++)
        {
            matC[i][j] = 0;
        }
    }

    // All procs compute the chunk size, no need to send separately
    int chunk = RowA / nproc;
    int rest  = RowA % nproc;
    int my_start_row = myid * chunk;       // find my start row
    int my_end_row   = (myid + 1) * chunk; // find my end row

    // assign rest to last worker
    if (myid == nproc-1) my_end_row += rest;

    int Dcount = ColA * chunk;             // Data count for A to send to worker

    MPI_Status status;                     // Status variable needed for the receive

    if (myid == 0)
    {
        // Send the rows needed for workers (Don't know if I need this or not)
        //MPI_Bcast(matA, Dcount, MPI_DOUBLE, 0, MPI_COMM_WORLD);

        // Then work on your own part
        for (int i = my_start_row; i < my_end_row; i++)
        {
            for (int j = 0; j < ColB; j++)
            {
                for (int k = 0; k < RowB; k++)
                {
                    matC[i][j] = matC[i][j] + (matA[i][k] * matB[k][j]);
                }
            }
        }

        for (int n = 1; n < nproc; n++)
        {
            MPI_Bcast(matC, Dcount, MPI_DOUBLE, n, MPI_COMM_WORLD);
            printf("\n ==++ Master Receive Result by Worker[%d], \n", n);
        }
    }
    else
    {
        // This is worker, receive the needed info and start working
        //MPI_Bcast(matA, Dcount, MPI_DOUBLE, 0, MPI_COMM_WORLD);
        //printf("\n +++ Worker[%d], received %d rows from Master \n", myid, myid*chunk);
        cout << "\n === Master sent rows " << myid * chunk << " through " << (myid+1) * chunk << " to process #" << myid << endl;

        // Do the work first
        for (int i = my_start_row; i < my_end_row; i++)
        {
            for (int j = 0; j < ColB; j++)
            {
                for (int k = 0; k < RowB; k++)
                {
                    matC[i][j] = matC[i][j] + (matA[i][k] * matB[k][j]);
                }
            }
        }

        // Send the result to the Master
        MPI_Bcast(matC, Dcount, MPI_DOUBLE, myid, MPI_COMM_WORLD);
        printf("\n --- Worker[%d], Sent Result to Master \n", myid);
    }

    // End clock
    double end_time = MPI_Wtime();

    if (myid == 0) {
        cout << "\nParallel Exec time: " << end_time - start_time << endl;
    }

    MPI_Finalize();

    // Clean up and release the storage
    for (int i = 0; i < RowA; i++)
    {
        delete [] matA[i];
        matA[i] = NULL;
    }
    delete [] matA;
    matA = NULL;

    for (int i = 0; i < RowA; i++)
    {
        delete [] matC[i];
        matC[i] = NULL;
    }
    delete [] matC;
    matC = NULL;
}
If this question is too vague or too much trouble, I understand; I mainly want to know whether I am misunderstanding how and when to use Bcast.
If I read it correctly, this code first creates the same three matrices A, B, and C on every processor, then computes the product of A and B, but only for a certain range of row indices. This way, for a processor of a given rank, the result of the multiplication is

C(rank) = A(begin;end) * B

on the rows it was assigned, and

C(rank) = 0

everywhere else.
So the problem is with MPI_Bcast: it does not add matrices, and it does not concatenate them either. It is a broadcast function, and it sends a buffer (here the matrix C) from the root processor to all the others. So each processor, by doing its own Bcast, overwrites the previous Bcasts.
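To make that concrete, here is a minimal sketch of MPI_Bcast's semantics (a toy contiguous buffer, not the matrices from the question; note that a double** array of row pointers is not a contiguous buffer, so it cannot be passed to MPI_Bcast directly):

#include <mpi.h>
#include <cstdio>

int main(int argc, char *argv[])
{
    MPI_Init(&argc, &argv);
    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    double buf[4] = {0, 0, 0, 0};
    if (rank == 0)                  // only the root's values matter
        for (int i = 0; i < 4; i++) buf[i] = 1.0;

    // Every rank calls the same MPI_Bcast; afterwards every rank holds
    // rank 0's data. The call copies, it does not add or concatenate,
    // and a later broadcast from another root would simply overwrite buf.
    MPI_Bcast(buf, 4, MPI_DOUBLE, 0, MPI_COMM_WORLD);

    printf("rank %d: buf[0] = %g\n", rank, buf[0]);

    MPI_Finalize();
    return 0;
}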
To concatenate buffers, the function to use is MPI_Gather. But here, since the matrices are already allocated at their full size at the beginning, concatenation is not really a good idea.
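For reference, a tiny sketch of what that concatenation means (toy one-value buffers, not the question's matrices): each rank contributes its piece and rank 0 receives them laid end to end, in rank order.

#include <mpi.h>
#include <cstdio>
#include <vector>

int main(int argc, char *argv[])
{
    MPI_Init(&argc, &argv);
    int rank, nproc;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nproc);

    double mine = 10.0 * rank;            // each rank's local contribution
    std::vector<double> all;
    if (rank == 0) all.resize(nproc);     // receive buffer only needed on the root

    // Rank 0 ends up with {0, 10, 20, ...}: the per-rank buffers concatenated.
    MPI_Gather(&mine, 1, MPI_DOUBLE,
               all.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);

    if (rank == 0)
        for (int p = 0; p < nproc; p++)
            printf("from rank %d: %g\n", p, all[p]);

    MPI_Finalize();
    return 0;
}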
Two options:
- Use a function that performs the addition and gathers the data at the same time: look at MPI_Reduce and MPI_Allreduce (but the operation to perform here is x + (nbprocs-1)*0, so calling such a function is not really useful).
- Split A and C into smaller matrices, and then use MPI_Gather to put the results back together (a sketch of this approach is given right after this list).
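Here is a minimal sketch of that second option, assuming the sizes from the question (RowA = 5, ColA = 2, ColB = 3): the matrices are stored as contiguous 1-D arrays, rank 0 broadcasts A and B, every rank computes only its own block of rows, and the blocks of C are collected on rank 0 with MPI_Gatherv (which also handles the uneven remainder). The variable names are illustrative, not taken from the post.

#include <mpi.h>
#include <cstdio>
#include <vector>

int main(int argc, char *argv[])
{
    MPI_Init(&argc, &argv);
    int rank, nproc;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nproc);

    const int RowA = 5, ColA = 2, ColB = 3;       // RowB == ColA, RowC == RowA, ColC == ColB
    std::vector<double> A(RowA * ColA, 2.0);      // row-major, contiguous storage
    std::vector<double> B(ColA * ColB, 2.0);
    std::vector<double> C(RowA * ColB, 0.0);      // full result, only meaningful on rank 0

    // Everyone gets the same A and B from rank 0 (here they are already
    // identical everywhere, but this is where a real broadcast belongs).
    MPI_Bcast(A.data(), RowA * ColA, MPI_DOUBLE, 0, MPI_COMM_WORLD);
    MPI_Bcast(B.data(), ColA * ColB, MPI_DOUBLE, 0, MPI_COMM_WORLD);

    // Same row partition as in the question: the last rank takes the remainder.
    int chunk = RowA / nproc, rest = RowA % nproc;
    int my_start = rank * chunk;
    int my_rows  = chunk + (rank == nproc - 1 ? rest : 0);

    // Compute only my block of C, stored in a local buffer.
    std::vector<double> Cblock(my_rows * ColB, 0.0);
    for (int i = 0; i < my_rows; i++)
        for (int j = 0; j < ColB; j++)
            for (int k = 0; k < ColA; k++)
                Cblock[i * ColB + j] += A[(my_start + i) * ColA + k] * B[k * ColB + j];

    // Per-rank element counts and displacements for the gather.
    std::vector<int> counts(nproc), displs(nproc);
    for (int p = 0; p < nproc; p++) {
        int rows = chunk + (p == nproc - 1 ? rest : 0);
        counts[p] = rows * ColB;
        displs[p] = p * chunk * ColB;
    }

    // Concatenate the row blocks back into the full C on rank 0.
    MPI_Gatherv(Cblock.data(), my_rows * ColB, MPI_DOUBLE,
                C.data(), counts.data(), displs.data(), MPI_DOUBLE,
                0, MPI_COMM_WORLD);

    if (rank == 0)
        for (int i = 0; i < RowA; i++) {
            for (int j = 0; j < ColB; j++) printf("%6.1f", C[i * ColB + j]);
            printf("\n");
        }

    MPI_Finalize();
    return 0;
}

(As noted in the first option, since every rank's full-size C is zero outside its own rows, a single MPI_Reduce with MPI_SUM onto rank 0 would also assemble the product, but it moves the whole matrix from every rank instead of just the computed blocks.)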
Hope this helps!
Good luck