Matrix Multiplication MPI + OMP

My matrix multiplication code that uses both MPI and OMP has a problem. The code compiles fine, but it gives me wrong results: the values in matrix c (inside the matmul function) are too large, and matrix C (in main) never even gets the result back from matmul. If anyone knows how to fix it, please help.

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <sys/time.h>
#include <omp.h>
#include <mpi.h>

int offset,rows,br_elemenata,cvor_id,cvor,ukupno;
MPI_Status status;

double gettime(void) {
   struct timeval tv;
   gettimeofday(&tv, NULL);
   return tv.tv_sec + 1e-6 * tv.tv_usec;
}

void matfill(long N, double *mat, double val) {
   long i, j;

   for(i = 0; i < N; i ++)
      for(j = 0; j < N; j ++)
         mat[i * N + j] = val;
}

void matmul(long N, double *a, double *b, double *c) {
   long i, j, k;

  br_elemenata = N / ukupno;            // number of rows per node
  if (N % ukupno != 0) br_elemenata++;  // round up so no rows are left out

  if (cvor == 0){
    for (cvor_id=1;cvor_id<ukupno;cvor_id++){
      offset = cvor_id * br_elemenata;
      rows = N - offset;
      if (rows > br_elemenata)
        rows = br_elemenata;
      // send the data from node 0 to the other nodes
      MPI_Send(&offset, 1, MPI_INT, cvor_id, 0, MPI_COMM_WORLD);
      MPI_Send(&rows, 1, MPI_INT, cvor_id, 0, MPI_COMM_WORLD);
      MPI_Send(a+offset, rows*N, MPI_DOUBLE, cvor_id, 0, MPI_COMM_WORLD);
      MPI_Send(b, N*N, MPI_DOUBLE, cvor_id, 0, MPI_COMM_WORLD);
    }
    offset = 0;
    rows = br_elemenata;
  } else {
    // receive the data from node 0
    MPI_Recv(&offset, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, &status);
    MPI_Recv(&rows, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, &status);
    MPI_Recv(a+offset, rows*N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD, &status);
    MPI_Recv(b, N*N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD, &status);
  }

MPI_Barrier(MPI_COMM_WORLD);

#pragma omp parallel for shared(a,b,c) private(i,j,k)
   for (i = offset; i < offset + rows; i ++)
      for (j = 0; j < N; j ++)
         for (k = 0; k < N; k ++)
            c[i + j] += a[i + k] * b[k * N + j];
  printf("Clan: %5.2f\n",c[i]);
  if (cvor == 0) {
    for (cvor_id = 1; cvor_id < ukupno; cvor_id++) {
      MPI_Recv(&offset, 1, MPI_INT, cvor_id, 1, MPI_COMM_WORLD, &status);
      MPI_Recv(&rows, 1, MPI_INT, cvor_id, 1, MPI_COMM_WORLD, &status);
      MPI_Recv(c+offset, rows*N, MPI_DOUBLE, cvor_id, 1, MPI_COMM_WORLD, &status);
    }
  } else {
    MPI_Send(&offset, 1, MPI_INT, 0, 1, MPI_COMM_WORLD);
    MPI_Send(&rows, 1, MPI_INT, 0, 1, MPI_COMM_WORLD);
    MPI_Send(c+offset, rows*N, MPI_DOUBLE, 0, 1, MPI_COMM_WORLD);
  }  
}

int main(int argc, char **argv) {
   long N;
   double *A, *B, *C, t;


   MPI_Init(&argc,&argv);       // initialize MPI

   MPI_Comm_size(MPI_COMM_WORLD,&ukupno);   // total number of nodes
   MPI_Comm_rank(MPI_COMM_WORLD,&cvor);     // this node's rank, used to identify it in the communication


   if (argc!=2) {
     if (cvor==0) printf("Morate unijeti dimenziju matrice!");
     MPI_Finalize();                        // if no dimension was given on the command line, end the program
     return 1;
   }

   N = atoi(argv[1]);
   A = (double *) malloc(N * N * sizeof(double));
   B = (double *) malloc(N * N * sizeof(double));
   C = (double *) malloc(N * N * sizeof(double));
   matfill(N, A, 1.0);
   matfill(N, B, 2.0);
   matfill(N, C, 0.0);




   t = gettime();
   matmul(N, A, B, C);
   t = gettime() - t;

   // if (cvor == 0){
      fprintf(stdout, "%ld\t%le\t%le\n", N, t, (2 * N - 1) * N * N / t);
      fflush(stdout);

      printf("Clan: %f\n",C[6]);
  //  }

   free(A);
   free(B);
   free(C);

   return EXIT_SUCCESS;
}

The main problem is the offset used in the communication calls: the matrices are stored row-major in one-dimensional arrays, so the block that starts at row offset begins at element offset*N, not offset. The same row-major indexing also has to be used inside the multiplication loop (c[i*N + j] and a[i*N + k] instead of c[i + j] and a[i + k]).
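
Taken out of the full listing below, the change amounts to:

// wrong: treats the row numbers offset and i as element indices
MPI_Send(a+offset, rows*N, MPI_DOUBLE, cvor_id, 0, MPI_COMM_WORLD);
c[i + j] += a[i + k] * b[k * N + j];

// right: scale the row index by N
MPI_Send(a+(offset*N), rows*N, MPI_DOUBLE, cvor_id, 0, MPI_COMM_WORLD);
c[i*N + j] += a[i*N + k] * b[k * N + j];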

Corrected code:

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <sys/time.h>
#include <omp.h>
#include <mpi.h>

int offset,rows,br_elemenata,cvor_id,cvor,ukupno;
MPI_Status status;

double gettime(void) {
    struct timeval tv;
    gettimeofday(&tv, NULL);
    return tv.tv_sec + 1e-6 * tv.tv_usec;
}

void matfill(long N, double *mat, double val) {
    long i, j;

    for(i = 0; i < N; i ++)
        for(j = 0; j < N; j ++)
            mat[i * N + j] = val;
}

void matprint(long N, double *mat) {
    long i, j;

    for(i = 0; i < N; i ++){
        for(j = 0; j < N; j ++){
            printf("%g ",mat[i*N+j]);
        }
        printf("\n");
    }
}

void matdiag(long N, double *mat, double val) {
    long i, j;

    for(i = 0; i < N; i ++)
        for(j = 0; j < N; j ++)
            if(i==j){
                mat[i * N + j] = (double)i;
            }else{
                mat[i * N + j] =0;
            }
}

void matmul(long N, double *a, double *b, double *c) {
    long i, j, k;

    br_elemenata = N / ukupno;            // number of rows per node

    if (N % ukupno != 0) br_elemenata++;      // round up so no rows are left out

    if (cvor == 0){
        for (cvor_id=1;cvor_id<ukupno;cvor_id++){
            offset = cvor_id * br_elemenata;
            rows = N - offset;
            if (rows > br_elemenata)
                rows = br_elemenata;
            // send the data from node 0 to the other nodes
            MPI_Send(&offset, 1, MPI_INT, cvor_id, 0, MPI_COMM_WORLD);
            MPI_Send(&rows, 1, MPI_INT, cvor_id, 1, MPI_COMM_WORLD);
            MPI_Send(a+(offset*N), rows*N, MPI_DOUBLE, cvor_id, 2, MPI_COMM_WORLD);
            MPI_Send(b, N*N, MPI_DOUBLE, cvor_id, 3, MPI_COMM_WORLD);
        }
        offset = 0;
        rows = br_elemenata;
    } else {
        // receive the data from node 0
        MPI_Recv(&offset, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, &status);
        MPI_Recv(&rows, 1, MPI_INT, 0, 1, MPI_COMM_WORLD, &status);
        MPI_Recv(a+(offset*N), rows*N, MPI_DOUBLE, 0, 2, MPI_COMM_WORLD, &status);
        MPI_Recv(b, N*N, MPI_DOUBLE, 0, 3, MPI_COMM_WORLD, &status);
    }

    MPI_Barrier(MPI_COMM_WORLD);

#pragma omp parallel for shared(a,b,c) private(i,j,k)
    for (i = offset; i < offset + rows; i ++)
        for (j = 0; j < N; j ++)
            for (k = 0; k < N; k ++)
                c[i*N + j] += a[i*N + k] * b[k * N + j];
    printf("Clan: %5.2f\n",c[i]);
    if (cvor == 0) {
        for (cvor_id = 1; cvor_id < ukupno; cvor_id++) {
            MPI_Recv(&offset, 1, MPI_INT, cvor_id, 4, MPI_COMM_WORLD, &status);
            MPI_Recv(&rows, 1, MPI_INT, cvor_id, 5, MPI_COMM_WORLD, &status);
            MPI_Recv(c+(N*offset), rows*N, MPI_DOUBLE, cvor_id, 6, MPI_COMM_WORLD, &status);
        }
    } else {
        MPI_Send(&offset, 1, MPI_INT, 0, 4, MPI_COMM_WORLD);
        MPI_Send(&rows, 1, MPI_INT, 0, 5, MPI_COMM_WORLD);
        MPI_Send(c+(N*offset), rows*N, MPI_DOUBLE, 0, 6, MPI_COMM_WORLD);
    }  
}

int main(int argc, char **argv) {
    long N;
    double *A, *B, *C, t;


    MPI_Init(&argc,&argv);       // initialize MPI

    MPI_Comm_size(MPI_COMM_WORLD,&ukupno);   // total number of nodes
    MPI_Comm_rank(MPI_COMM_WORLD,&cvor);     // this node's rank, used to identify it in the communication


    if (argc!=2) {
        if (cvor==0) printf("Morate unijeti dimenziju matrice!");
        MPI_Finalize();                        // if no dimension was given on the command line, end the program
        return 1;
    }

    N = atoi(argv[1]);
    A = (double *) malloc(N * N * sizeof(double));
    B = (double *) malloc(N * N * sizeof(double));
    C = (double *) malloc(N * N * sizeof(double));
    matfill(N, A, 1.0);
    matfill(N, B, 2.0);
    matfill(N, C, 0.0);
    matdiag(N,A, 1) ;




    t = gettime();
    matmul(N, A, B, C);
    t = gettime() - t;

    if (cvor == 0){
        fprintf(stdout, "%ld\t%le\t%le\n", N, t, (2 * N - 1) * N * N / t);
        fflush(stdout);
        printf("Clan: %f\n",C[6]);
        printf("A\n");
        matprint(N, A) ;
        printf("B\n");
        matprint(N, B) ;
        printf("C\n");
        matprint(N, C) ;

    }


    free(A);
    free(B);
    free(C);
    MPI_Finalize();
    return EXIT_SUCCESS;
}

Compile with mpicc main.c -o main and run with mpirun -np 4 main (with a GCC-based mpicc, add -fopenmp so that the OpenMP pragma is actually enabled).

If you want to go further, you may be interested in the MPI_Bcast() function, which sends the same data to every process, and in MPI_Scatter() and MPI_Gather(), which help distribute the matrix and collect it back on a given process.
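
As a rough sketch (not part of the corrected code above), assuming N % ukupno == 0 so every rank gets the same number of rows, and using hypothetical per-rank buffers a_local and c_local, the manual Send/Recv loops could be replaced with collectives along these lines:

// Sketch only: collective data movement, assuming N is divisible by ukupno.
// a_local and c_local are hypothetical buffers of rows_local*N doubles per rank.
int rows_local = N / ukupno;
double *a_local = (double *) malloc(rows_local * N * sizeof(double));
double *c_local = (double *) calloc(rows_local * N, sizeof(double));

MPI_Bcast(b, N*N, MPI_DOUBLE, 0, MPI_COMM_WORLD);     // every rank needs all of B
MPI_Scatter(a, rows_local*N, MPI_DOUBLE,              // rank r receives rows r*rows_local
            a_local, rows_local*N, MPI_DOUBLE,        // up to (r+1)*rows_local of A
            0, MPI_COMM_WORLD);

// ... compute c_local = a_local * b here (a rows_local x N block) ...

MPI_Gather(c_local, rows_local*N, MPI_DOUBLE,         // collect the row blocks back
           c, rows_local*N, MPI_DOUBLE,               // into C on rank 0
           0, MPI_COMM_WORLD);

free(a_local);
free(c_local);

When N is not evenly divisible, MPI_Scatterv() and MPI_Gatherv() take per-rank counts and displacements instead.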

In addition, the dgemm() function from BLAS can be used to speed up the computation on each process.
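
For example, through the CBLAS interface (the header name and link flags depend on the BLAS implementation you install; this is only a sketch), the triple loop over this rank's row block becomes a single call:

#include <cblas.h>   // link against a BLAS, e.g. -lopenblas

// Sketch: c[offset .. offset+rows) += a[offset .. offset+rows) * b,
// with all matrices stored row-major and leading dimension N.
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
            rows, N, N,               // M, N, K
            1.0, a + offset*N, N,     // alpha, A block, lda
                 b,            N,     // B, ldb
            1.0, c + offset*N, N);    // beta, C block, ldc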

To reduce the memory footprint, the allocations of A and C could be shrunk to br_elemenata rows (except on process 0)... and the offsets would then have to change yet again...!
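
A hypothetical sketch of that allocation (br_elemenata, the number of rows per rank, would have to be known before allocating, and the compute and communication code would then index the local block from row 0 instead of row offset):

// Sketch only: ranks other than 0 touch just br_elemenata rows of A and C,
// so they can allocate only that block; B is still needed in full everywhere.
if (cvor == 0) {
    A = (double *) malloc(N * N * sizeof(double));
    C = (double *) malloc(N * N * sizeof(double));
} else {
    A = (double *) malloc(br_elemenata * N * sizeof(double));
    C = (double *) malloc(br_elemenata * N * sizeof(double));
}
B = (double *) malloc(N * N * sizeof(double));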