主要作业正常终止,但 1 个进程返回非零退出代码。根据用户指示,作业已中止

Primary job terminated normally, but 1 process returned a non-zero exit code. Per user-direction, the job has been aborted

#include <stdio.h>
#include <iostream>
using namespace std;

// Splits a flat element number into a pair of coordinates for rows of n entries.
// The number is first shifted to a 1-based position; i and j are written through
// the references. For elem_num >= 0 and n > 0 the result equals
// i = elem_num / n and j = elem_num % n.
void findCoords(int elem_num, int n, int& i, int& j){
    const int position = elem_num + 1;
    const int quotient = position / n;
    const int remainder = position % n;

    // A position that is an exact multiple of n closes row quotient-1;
    // any other position already lies inside row quotient.
    if (remainder != 0) {
        i = quotient;
    } else {
        i = quotient - 1;
    }

    // Shift back to a 0-based column; adding n keeps the operand of % non-negative
    // when remainder is 0.
    j = (remainder - 1 + n) % n;
}

/**
 * Serial rehearsal of the distributed matrix product C = A * B.
 *
 * The outer loop plays the role of `size` ranks. The m*p cells of C are
 * numbered row by row and dealt out cyclically: simulated rank r computes
 * cells r, r+size, r+2*size, ...  Each rank first packs its results as
 * (value, row, column) triples and then unpacks them into C. Finally C is
 * printed row by row.
 *
 * Returns 0 in every case; prints a message and stops early when the inner
 * dimensions of A and B differ.
 */
int main(int argc, char const *argv[])
{
    const int m=10,n=10,o=10,p=10;
    double A[m][n]={
        {1,2,3,4,5,6,7,8,9,10},
        {1,2,3,4,5,6,7,8,9,10},
        {1,2,3,4,5,6,7,8,9,10},
        {1,2,3,4,5,6,7,8,9,10},
        {1,2,3,4,5,6,7,8,9,10},
        {1,2,3,4,5,6,7,8,9,10},
        {1,2,3,4,5,6,7,8,9,10},
        {1,2,3,4,5,6,7,8,9,10},
        {1,2,3,4,5,6,7,8,9,10},
        {1,2,3,4,5,6,7,8,9,10}
    };
    double B[o][p]={
        {1,2,3,4,5,6,7,8,9,10},
        {1,2,3,4,5,6,7,8,9,10},
        {1,2,3,4,5,6,7,8,9,10},
        {1,2,3,4,5,6,7,8,9,10},
        {1,2,3,4,5,6,7,8,9,10},
        {1,2,3,4,5,6,7,8,9,10},
        {1,2,3,4,5,6,7,8,9,10},
        {1,2,3,4,5,6,7,8,9,10},
        {1,2,3,4,5,6,7,8,9,10},
        {1,2,3,4,5,6,7,8,9,10}
    };

    // The dot product below walks k over the columns of A and the rows of B,
    // so both counts must agree.
    if (n!=o)
    {
        printf("Can not multiply because of the wrong shape!");
        return 0;
    }

    const int total_cells=m*p;
    double C[m][p];
    int size=10;

    // One rank owns at most total_cells cells (the size == 1 case), so a
    // compile-time bound replaces the variable-length array, which is a
    // compiler extension rather than standard C++.
    double values_coords[total_cells][3];

    for (int rank=0;rank<size;rank++){
        // A rank numbered at or beyond total_cells has nothing to compute.
        int elements_number=0;
        if (rank<total_cells){
            elements_number=((total_cells-1)-rank)/size+1;
        }

        // Compute phase: fill one (value, row, column) triple per owned cell.
        int oneD_idx=rank;
        for (int a=0;a<elements_number;a++){
            int i, j;
            findCoords(oneD_idx, p, i, j);
            oneD_idx+=size;

            double s=0;
            for (int k=0;k<n;k++){
                s+=A[i][k]*B[k][j];
            }

            values_coords[a][0]=s;
            values_coords[a][1]=i;
            values_coords[a][2]=j;
        }

        // Gather phase: the indices were stored as doubles, convert them back.
        for (int x=0;x<elements_number;x++){
            const int row=static_cast<int>(values_coords[x][1]);
            const int col=static_cast<int>(values_coords[x][2]);
            C[row][col]=values_coords[x][0];
        }
    }

    for (int i=0;i<m;i++){
        for (int j=0;j<p;j++){
            std::cout << C[i][j]<<"  ";
        }
        std::cout<<std::endl;
    }

    return 0;
}

以上代码通过循环来模拟 MPI,用于矩阵乘法。思路是:每个进程(rank)负责计算输出矩阵中一组固定坐标上的元素;它适用于 2 到 m*p 个进程,其中 m 和 p 是输出矩阵的维度。这段代码运行正常。

然而,当我用 MPI 运行下面的代码时,总是得到列在第二段代码之后的错误。

#include <stdio.h>
#include <mpi.h>
#include <iostream>

using namespace std;

// Converts a flat index into a pair of coordinates for rows of n entries.
// Works on the 1-based position oneD_idx + 1 and writes the results to i and j.
// For oneD_idx >= 0 and n > 0 this gives i = oneD_idx / n and j = oneD_idx % n.
void findCoords(int oneD_idx, int n, int& i, int& j){
    const int one_based = oneD_idx + 1;
    const int whole_rows = one_based / n;
    const int leftover = one_based % n;

    // With no leftover the position is the last entry of row whole_rows-1;
    // otherwise it falls inside row whole_rows.
    i = (leftover != 0) ? whole_rows : whole_rows - 1;

    // Back to a 0-based column; + n keeps the left operand of % non-negative
    // when leftover is 0.
    j = (leftover - 1 + n) % n;
}

int main( int argc, char *argv[])
{
    int rank, size;
    /*const int m=10,n=10,o=10,p=10;
    double A[m][n]={
        {1,2,3,4,5,6,7,8,9,10},
        {1,2,3,4,5,6,7,8,9,10},
        {1,2,3,4,5,6,7,8,9,10},
        {1,2,3,4,5,6,7,8,9,10},
        {1,2,3,4,5,6,7,8,9,10},
        {1,2,3,4,5,6,7,8,9,10},
        {1,2,3,4,5,6,7,8,9,10},
        {1,2,3,4,5,6,7,8,9,10},
        {1,2,3,4,5,6,7,8,9,10},
        {1,2,3,4,5,6,7,8,9,10}
    };
    double B[o][p]={
        {1,2,3,4,5,6,7,8,9,10},
        {1,2,3,4,5,6,7,8,9,10},
        {1,2,3,4,5,6,7,8,9,10},
        {1,2,3,4,5,6,7,8,9,10},
        {1,2,3,4,5,6,7,8,9,10},
        {1,2,3,4,5,6,7,8,9,10},
        {1,2,3,4,5,6,7,8,9,10},
        {1,2,3,4,5,6,7,8,9,10},
        {1,2,3,4,5,6,7,8,9,10},
        {1,2,3,4,5,6,7,8,9,10}
    };*/
    const int m=3,n=3,o=3,p=3;
    double A[m][n]={
        {1,2,3},
        {1,2,3},
        {1,2,3}
    };
    double B[o][p]={
        {1,2,3},
        {1,2,3},
        {1,2,3}
    };
    if (n!=o)
    {
        printf("Can not multiply because of the wrong shape!");
        return 0;
    }

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    int oneD_idx=rank;
    int i, j;
    int elements_number=((m*p-1)-rank)/size+1;
    double values_coords[elements_number][3];

    for (int a=0;a<elements_number;a++){
        findCoords(oneD_idx, p, i, j);
        oneD_idx+=size;
        double s=0;
        for (int k=0;k<n;k++){
            s+=A[i][k]*B[k][j];
        }

        values_coords[a][0]=s;
        values_coords[a][1]=i;
        values_coords[a][2]=j;
    }
    
    MPI_Status status1, status2;
    if (rank==0)
    {
        double C[m][p];
        for (int x=0;x<elements_number;x++){
            i=values_coords[x][1];
            j=values_coords[x][2];
            double value=values_coords[x][0];
            C[i][j]=value;
        }

        for (int r=1;r<size;r++){
            int recv_elements_number;
            MPI_Recv( &recv_elements_number , 1 , MPI_INT , r , 403 , MPI_COMM_WORLD , &status1);
            int recv_values_coords[recv_elements_number][3];
            MPI_Recv( &recv_values_coords , recv_elements_number*3 , MPI_DOUBLE , r , 404 , MPI_COMM_WORLD , &status2);
            for (int x=0;x<recv_elements_number;x++){
                i=recv_values_coords[x][1];
                j=recv_values_coords[x][2];
                double value=recv_values_coords[x][0];
                C[i][j]=value;
            }
        }

        for (int i=0;i<m;i++){
            for (int j=0;j<p;j++){
                cout << C[i][j]<<"  ";
            }
            cout<<endl;
        }
    }
    else
    {
        MPI_Send( &elements_number , 1 , MPI_INT , 0 , 403 , MPI_COMM_WORLD);
        MPI_Send( &values_coords , elements_number*3 , MPI_DOUBLE , 0 , 404 , MPI_COMM_WORLD);
    }
        
    MPI_Finalize();
        
    
    return 0;
}
Primary job  terminated normally, but 1 process returned
a non-zero exit code. Per user-direction, the job has been aborted.

我使用 MPI_Send 和 MPI_Recv 来完成这项任务。看起来是通信出了问题,我卡住了。

正如 gilles-gouaillardet 在评论中所说,问题出在 int 与 double 的混用上:接收缓冲区 recv_values_coords 被声明为 int,却按 MPI_DOUBLE 接收数据。我应该改用 struct。