Matrix Multiplication in MPI
I am trying to create a simple matrix multiplication program with MPI, using 1, 2, 4, or 8 processors. My code works with 1 (in that case it just does a normal matrix multiplication; I mean, there is no work to split up if you only run on one rank). It also works with 2 and 4 processors. However, when I try to use 8 processors (passing -n 8 on the command line when running the program), I do not get the correct values in every position of matrix c.
Here are examples. If SIZE = 8 (i.e. a, b, and c are all 8x8 matrices), the resulting matrix is:
8.00 8.00 8.00 8.00 8.00 8.00 8.00 8.00
8.00 8.00 8.00 8.00 8.00 8.00 8.00 8.00
8.00 8.00 8.00 8.00 8.00 8.00 8.00 8.00
8.00 8.00 8.00 8.00 8.00 8.00 8.00 8.00
16.00 16.00 16.00 16.00 16.00 16.00 16.00 16.00
16.00 16.00 16.00 16.00 16.00 16.00 16.00 16.00
16.00 16.00 16.00 16.00 16.00 16.00 16.00 16.00
0.00 0.00 16.00 16.00 16.00 16.00 16.00 16.00
If SIZE = 16:
16.00 16.00 16.00 16.00 16.00 16.00 16.00 16.00 16.00 16.00 16.00 16.00 16.00 16.00 16.00 16.00
16.00 16.00 16.00 16.00 16.00 16.00 16.00 16.00 16.00 16.00 16.00 16.00 16.00 16.00 16.00 16.00
16.00 16.00 16.00 16.00 16.00 16.00 16.00 16.00 16.00 16.00 16.00 16.00 16.00 16.00 16.00 16.00
16.00 16.00 16.00 16.00 16.00 16.00 16.00 16.00 16.00 16.00 16.00 16.00 16.00 16.00 16.00 16.00
16.00 16.00 16.00 16.00 16.00 16.00 16.00 16.00 16.00 16.00 16.00 16.00 16.00 16.00 16.00 16.00
16.00 16.00 16.00 16.00 16.00 16.00 16.00 16.00 16.00 16.00 16.00 16.00 16.00 16.00 16.00 16.00
16.00 16.00 16.00 16.00 16.00 16.00 16.00 16.00 16.00 16.00 16.00 16.00 16.00 16.00 16.00 16.00
16.00 16.00 16.00 16.00 16.00 16.00 16.00 16.00 16.00 16.00 16.00 16.00 16.00 16.00 16.00 16.00
32.00 32.00 32.00 32.00 32.00 32.00 32.00 32.00 32.00 32.00 32.00 32.00 32.00 32.00 32.00 32.00
32.00 32.00 32.00 32.00 32.00 32.00 32.00 32.00 32.00 32.00 32.00 32.00 32.00 32.00 32.00 32.00
32.00 32.00 32.00 32.00 32.00 32.00 32.00 32.00 32.00 32.00 32.00 32.00 32.00 32.00 32.00 32.00
32.00 32.00 32.00 32.00 32.00 32.00 32.00 32.00 32.00 32.00 32.00 32.00 32.00 32.00 32.00 32.00
32.00 32.00 32.00 32.00 32.00 32.00 32.00 32.00 32.00 32.00 32.00 32.00 32.00 32.00 32.00 32.00
32.00 32.00 32.00 32.00 32.00 32.00 32.00 32.00 32.00 32.00 32.00 32.00 32.00 32.00 32.00 32.00
0.00 0.00 0.00 0.00 32.00 32.00 32.00 32.00 32.00 32.00 32.00 32.00 32.00 32.00 32.00 32.00
32.00 32.00 32.00 32.00 32.00 32.00 32.00 32.00 32.00 32.00 32.00 32.00 32.00 32.00 32.00 32.00
As you can see, zeros pop up in the lower left corner. Something rank 7 is doing causes those coordinates to become 0.
I have been staring at my code for a while now and I feel like I just need another pair of eyes on it. As far as I can tell, all the sends and receives work correctly, and every task gets the values it should. From the testing I have done, no task actually assigns a 0 to any position in the c matrix. I do not know why it happens, how it happens, or what I can do to fix it.
Here is the code:
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>

#define SIZE 16       /* assumption: SIZE is a multiple of the number of nodes */
#define FROM_MASTER 1 /* setting a message type */
#define FROM_WORKER 2 /* setting a message type */
#define DEBUG 1       /* 1 = debug on, 0 = debug off */

MPI_Status status;

static double a[SIZE][SIZE];
static double b[SIZE][SIZE];
static double c[SIZE][SIZE];
static double b_to_trans[SIZE][SIZE];

static void init_matrix(void)
{
    int i, j;
    for (i = 0; i < SIZE; i++)
    {
        for (j = 0; j < SIZE; j++) {
            a[i][j] = 1.0;
            if(i >= SIZE/2) a[i][j] = 2.0;
            b_to_trans[i][j] = 1.0;
            if(j >= SIZE/2) b[i][j] = 2.0;
            // c[i][j] = 1.0;
        }
    }
}

static void print_matrix(void)
{
    int i, j;
    for(i = 0; i < SIZE; i++) {
        for(j = 0; j < SIZE; j++) {
            printf("%7.2f", c[i][j]);
        }
        printf("\n");
    }
}

static void transpose_matrix()
{
    int i, j;
    for(i = 0; i < SIZE; i++)
        for(j = 0; j < SIZE; j++)
            b[i][j] = b_to_trans[j][i];
}

int main(int argc, char **argv)
{
    int myrank, nproc;
    int rows;  /* amount of work per node (rows per worker) */
    int mtype; /* message type: send/recv between master and workers */
    int dest, src, offseta, offsetb;
    int runthrough, runmod;
    double start_time, end_time;
    int i, j, k, l;

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &nproc);
    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
    rows = SIZE/nproc;
    mtype = FROM_MASTER;

    if (myrank == 0) {
        /*Initialization*/
        printf("SIZE = %d, number of nodes = %d\n", SIZE, nproc);
        init_matrix();
        transpose_matrix();
        start_time = MPI_Wtime();

        if(nproc == 1) { /* In case we only run on one processor, the master will simply do a regular matrix-matrix multiplication. */
            for(i = 0; i < SIZE; i++) {
                for(j = 0; j < SIZE; j++) {
                    for(k = 0; k < SIZE; k++)
                        c[i][j] = c[i][j] + a[i][k]*b[j][k];
                }
            }
            end_time = MPI_Wtime();
            if(DEBUG) /* Prints the resulting matrix c */
                print_matrix();
            printf("Execution time on %2d nodes: %f\n", nproc, end_time-start_time);
        }
        else {
            for(l = 0; l < nproc; l++){
                offsetb = rows*l;
                offseta = rows;
                mtype = FROM_MASTER;
                for(dest = 1; dest < nproc; dest++){
                    MPI_Send(&offseta, 1, MPI_INT, dest, mtype, MPI_COMM_WORLD);
                    MPI_Send(&offsetb, 1, MPI_INT, dest, mtype, MPI_COMM_WORLD);
                    MPI_Send(&rows, 1, MPI_INT, dest, mtype, MPI_COMM_WORLD);
                    MPI_Send(&a[offseta][0], rows*SIZE, MPI_DOUBLE, dest, mtype, MPI_COMM_WORLD);
                    MPI_Send(&b[offsetb][0], rows*SIZE, MPI_DOUBLE, dest, mtype, MPI_COMM_WORLD);
                    offseta += rows;
                    offsetb = (offsetb+rows)%SIZE;
                }
                offseta = rows;
                offsetb = rows*l;
                //printf("Rank: %d, offseta: %d, offsetb: %d\n", myrank, offseta, offsetb);
                //printf("Offseta: %d\n", offseta);
                //printf("Offsetb: %d\n", offsetb);
                for(i = 0; i < offseta; i++) {
                    for(j = offsetb; j < offsetb+rows; j++) {
                        for(k = 0; k < SIZE; k++){
                            c[i][j] = c[i][j] + a[i][k]*b[j][k];
                        }
                    }
                }
                mtype = FROM_WORKER;
                for(src = 1; src < nproc; src++){
                    MPI_Recv(&offseta, 1, MPI_INT, src, mtype, MPI_COMM_WORLD, &status);
                    MPI_Recv(&offsetb, 1, MPI_INT, src, mtype, MPI_COMM_WORLD, &status);
                    MPI_Recv(&rows, 1, MPI_INT, src, mtype, MPI_COMM_WORLD, &status);
                    for(i = 0; i < rows; i++) {
                        MPI_Recv(&c[offseta+i][offsetb], offseta, MPI_DOUBLE, src, mtype, MPI_COMM_WORLD, &status); /* returns answer c(1,1) */
                    }
                }
            }
            end_time = MPI_Wtime();
            if(DEBUG) /* Prints the resulting matrix c */
                print_matrix();
            printf("Execution time on %2d nodes: %f\n", nproc, end_time-start_time);
        }
    }
    else {
        if(nproc > 1) {
            for(l = 0; l < nproc; l++){
                mtype = FROM_MASTER;
                MPI_Recv(&offseta, 1, MPI_INT, 0, mtype, MPI_COMM_WORLD, &status);
                MPI_Recv(&offsetb, 1, MPI_INT, 0, mtype, MPI_COMM_WORLD, &status);
                MPI_Recv(&rows, 1, MPI_INT, 0, mtype, MPI_COMM_WORLD, &status);
                MPI_Recv(&a[offseta][0], rows*SIZE, MPI_DOUBLE, 0, mtype, MPI_COMM_WORLD, &status);
                MPI_Recv(&b[offsetb][0], rows*SIZE, MPI_DOUBLE, 0, mtype, MPI_COMM_WORLD, &status);
                for(i = offseta; i < offseta+rows; i++) {
                    for(j = offsetb; j < offsetb+rows; j++) {
                        for(k = 0; k < SIZE; k++){
                            c[i][j] = c[i][j] + a[i][k]*b[j][k];
                        }
                    }
                }
                mtype = FROM_WORKER;
                MPI_Send(&offseta, 1, MPI_INT, 0, mtype, MPI_COMM_WORLD);
                MPI_Send(&offsetb, 1, MPI_INT, 0, mtype, MPI_COMM_WORLD);
                MPI_Send(&rows, 1, MPI_INT, 0, mtype, MPI_COMM_WORLD);
                for(i = 0; i < rows; i++){
                    MPI_Send(&c[offseta+i][offsetb], offseta, MPI_DOUBLE, 0, mtype, MPI_COMM_WORLD);
                }
            }
        }
    }
    MPI_Finalize();
    return 0;
}
Any suggestions would be helpful; thanks in advance.
This is not a definitive answer, but it should certainly help you debug.
I ran a test, adding the following code at the point where the master receives the final data from the workers. Out of a pile of output I am only showing the important part. Note that j + count never exceeds SIZE, except in the 8-processor case. That matters, because it means you are writing to memory you have not allocated.
for(i = 0; i < rows; i++) {
    MPI_Recv(&c[offseta+i][offsetb], offseta, MPI_DOUBLE, src, mtype, MPI_COMM_WORLD, &status);
    // I added the following for debugging.
    if (src == nproc-1)
    {
        printf("src = %i\n", src);
        printf("i = %i\n", offseta+i);
        printf("j = %i\n", offsetb);
        printf("count = %i\n", offseta);
    }
}
np=2
src = 1
i = 15
j = 8
count = 8
np=4
src = 3
i = 15
j = 4
count = 12
np=8
src = 7
i = 15
j = 10
count = 14
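Looking at those numbers, the count argument in that receive (and in the worker's matching send) is offseta, even though each worker only computed a rows-wide slice of each of its rows, so with 8 processors the transfer runs past the end of the row. I have not verified this as the complete fix, but a minimal sketch of what I would try is to pass rows as the count on both sides, so only the computed rows x rows block is transferred:

/* Hypothetical sketch, untested: transfer exactly the rows x rows block each
   worker computed, instead of offseta doubles per row. */

/* Worker side: send back each partial row, rows doubles at a time. */
for (i = 0; i < rows; i++) {
    MPI_Send(&c[offseta+i][offsetb], rows, MPI_DOUBLE, 0, mtype, MPI_COMM_WORLD);
}

/* Master side: receive into the same positions, again rows doubles per row. */
for (i = 0; i < rows; i++) {
    MPI_Recv(&c[offseta+i][offsetb], rows, MPI_DOUBLE, src, mtype, MPI_COMM_WORLD, &status);
}

With that change the count no longer depends on which rank sent the data, so offsetb + rows never goes past SIZE.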