MPI Segmentation Fault On Multiple Nodes Only

So I am currently building the foundation of a control program to run on multiple Raspberry Pis, using all of the available cores on each Pi. When I test my code on one of the nodes using all of its cores it works fine, but using multiple nodes gives me a segmentation fault.

I have looked at all the similar questions asked in the past, but they all involved problems that would break my code on a single node as well.

Full code:

#include "mpi.h"
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <stdbool.h>
#include <time.h> 
int main(int argc, char *argv[])
{
        FILE *input;
        char batLine[86];   //may need to be made larger if bat commands get longer
        char sentbatch[86];
        int currentTask;
        int numTasks, rank, rc, i;
        MPI_Status stat;
        bool exitFlag = false;

        //mpi stuff
        MPI_Init(&argc,&argv);  //initilize mpi enviroment
        MPI_Comm_size(MPI_COMM_WORLD, &numTasks);
        MPI_Comm_rank(MPI_COMM_WORLD,&rank);
        //printf("Number of tasks: %d \n", numTasks);
        //printf ("MPI task %d has started...\n", rank);
        if(argc != 2)
        {
            printf("Usage: batallocation *.bat");
            exit(1); //exit with 1 indicates a failure
        }
        //contains file name: argv[1]
        input = fopen(argv[1],"r");

        currentTask = 0;
        if (rank ==0)
        {
            while(1)
            {
                if(exitFlag)
                    break; //allows to break out of while and for when no more lines exist
                char command[89] = "./";
                for(i=0; i < 16; i++) //will need to be 16 for full testing
                {

                    //fgets needs to be character count of longest line + 2 or it fails
                    if(fgets(batLine,86,input) != NULL)
                    {
                        printf("preview:%s\n",batLine);
                        if(i==0)
                        {
                            strcat(command,batLine);
                            printf("rank0 gets: %s\n", command);
                            //system(command);
                        }
                        else
                        {
                            //MPI_Send(buffer,count,type,dest,tag,comm)
                            MPI_Send(batLine,85,MPI_CHAR,i,i,MPI_COMM_WORLD); 
                            printf("sent rank%d: %s\n",i,batLine);
                        }
                    }
                    else
                    {
                        exitFlag = true; //flag to break out of while loop
                        break;
                    }


                }   
                //need to recieve data from other nodes here
                //put the data together in proper order
                //and only after that can the next sets be sent out

            }
        }
        else
        {
            char command[89] = "./";
            //MPI_Recv(buffer,count,type,source,tag,comm,status)
            MPI_Recv(sentbatch,86,MPI_CHAR,0,rank,MPI_COMM_WORLD,&stat);
            //using rank as flag makes it so only the wanted rank gets sent the data
            strcat(command,sentbatch); //adds needed ./ before batch data
            printf("rank=%d recieved data:%s",rank,sentbatch);
            //system(command); //should run batch line
        }
        fclose(input);
        MPI_Finalize();
        return(0);
}

Contents of the file being passed in:


LAMOSTv108 spec-56321-GAC099N59V1_sp01-001.flx spec-56321-GAC099N59V1_sp01-001.nor f
LAMOSTv108 spec-56321-GAC099N59V1_sp01-003.flx spec-56321-GAC099N59V1_sp01-003.nor f
LAMOSTv108 spec-56321-GAC099N59V1_sp01-004.flx spec-56321-GAC099N59V1_sp01-004.nor f
LAMOSTv108 spec-56321-GAC099N59V1_sp01-005.flx spec-56321-GAC099N59V1_sp01-005.nor f
LAMOSTv108 spec-56321-GAC099N59V1_sp01-006.flx spec-56321-GAC099N59V1_sp01-006.nor f
LAMOSTv108 spec-56321-GAC099N59V1_sp01-008.flx spec-56321-GAC099N59V1_sp01-008.nor f
LAMOSTv108 spec-56321-GAC099N59V1_sp01-010.flx spec-56321-GAC099N59V1_sp01-010.nor f
LAMOSTv108 spec-56321-GAC099N59V1_sp01-013.flx spec-56321-GAC099N59V1_sp01-013.nor f
LAMOSTv108 spec-56321-GAC099N59V1_sp01-015.flx spec-56321-GAC099N59V1_sp01-015.nor f
LAMOSTv108 spec-56321-GAC099N59V1_sp01-018.flx spec-56321-GAC099N59V1_sp01-018.nor f
LAMOSTv108 spec-56321-GAC099N59V1_sp01-022.flx spec-56321-GAC099N59V1_sp01-022.nor f
LAMOSTv108 spec-56321-GAC099N59V1_sp01-023.flx spec-56321-GAC099N59V1_sp01-023.nor f
LAMOSTv108 spec-56321-GAC099N59V1_sp01-024.flx spec-56321-GAC099N59V1_sp01-024.nor f
LAMOSTv108 spec-56321-GAC099N59V1_sp01-025.flx spec-56321-GAC099N59V1_sp01-025.nor f
LAMOSTv108 spec-56321-GAC099N59V1_sp01-028.flx spec-56321-GAC099N59V1_sp01-028.nor f
LAMOSTv108 spec-56321-GAC099N59V1_sp01-029.flx spec-56321-GAC099N59V1_sp01-029.nor f

You will notice there are a few things I have not done yet that will be in the final version; they are left in comments to make troubleshooting easier, mainly because the LAMOST code is not fast and I do not want to wait for it to finish.

Command prompt and output that works:

 $mpiexec -N 4 --host 10.0.0.3 -oversubscribe batTest2 shortpass2.bat
preview:LAMOSTv108 spec-56321-GAC099N59V1_sp01-001.flx spec-56321-GAC099N59V1_sp01-001.nor f

rank0 gets: ./LAMOSTv108 spec-56321-GAC099N59V1_sp01-001.flx spec-56321-GAC099N59V1_sp01-001.nor f

preview:LAMOSTv108 spec-56321-GAC099N59V1_sp01-003.flx spec-56321-GAC099N59V1_sp01-003.nor f

sent rank1: LAMOSTv108 spec-56321-GAC099N59V1_sp01-003.flx spec-56321-GAC099N59V1_sp01-003.nor f

preview:LAMOSTv108 spec-56321-GAC099N59V1_sp01-004.flx spec-56321-GAC099N59V1_sp01-004.nor f

sent rank2: LAMOSTv108 spec-56321-GAC099N59V1_sp01-004.flx spec-56321-GAC099N59V1_sp01-004.nor f

preview:LAMOSTv108 spec-56321-GAC099N59V1_sp01-005.flx spec-56321-GAC099N59V1_sp01-005.nor f

sent rank3: LAMOSTv108 spec-56321-GAC099N59V1_sp01-005.flx spec-56321-GAC099N59V1_sp01-005.nor f

rank=1 recieved data:LAMOSTv108 spec-56321-GAC099N59V1_sp01-003.flx spec-56321-GAC099N59V1_sp01-003.nor f
rank=3 recieved data:LAMOSTv108 spec-56321-GAC099N59V1_sp01-005.flx spec-56321-GAC099N59V1_sp01-005.nor f
rank=2 recieved data:LAMOSTv108 spec-56321-GAC099N59V1_sp01-004.flx spec-56321-GAC099N59V1_sp01-004.nor f

Shortpass2 is just the same file but with only the first 4 lines. My code should in theory work with all 16 lines, but I will test that with the full file once the current problem is solved.

Command and output when running on multiple nodes:

$mpiexec -N 4 --host 10.0.0.3,10.0.0.4,10.0.0.5,10.0.0.6 -oversubscribe batTest2 shortpass.bat

preview:LAMOSTv108 spec-56321-GAC099N59V1_sp01-001.flx spec-56321-GAC099N59V1_sp01-001.nor f

rank0 gets: ./LAMOSTv108 spec-56321-GAC099N59V1_sp01-001.flx spec-56321-GAC099N59V1_sp01-001.nor f

preview:LAMOSTv108 spec-56321-GAC099N59V1_sp01-003.flx spec-56321-GAC099N59V1_sp01-003.nor f

sent rank1: LAMOSTv108 spec-56321-GAC099N59V1_sp01-003.flx spec-56321-GAC099N59V1_sp01-003.nor f

preview:LAMOSTv108 spec-56321-GAC099N59V1_sp01-004.flx spec-56321-GAC099N59V1_sp01-004.nor f

rank=1 recieved data:LAMOSTv108 spec-56321-GAC099N59V1_sp01-003.flx spec-56321-GAC099N59V1_sp01-003.nor f
sent rank2: LAMOSTv108 spec-56321-GAC099N59V1_sp01-004.flx spec-56321-GAC099N59V1_sp01-004.nor f

preview:LAMOSTv108 spec-56321-GAC099N59V1_sp01-005.flx spec-56321-GAC099N59V1_sp01-005.nor f

rank=2 recieved data:LAMOSTv108 spec-56321-GAC099N59V1_sp01-004.flx spec-56321-GAC099N59V1_sp01-004.nor f
sent rank3: LAMOSTv108 spec-56321-GAC099N59V1_sp01-005.flx spec-56321-GAC099N59V1_sp01-005.nor f

preview:LAMOSTv108 spec-56321-GAC099N59V1_sp01-006.flx spec-56321-GAC099N59V1_sp01-006.nor f

rank=3 recieved data:LAMOSTv108 spec-56321-GAC099N59V1_sp01-005.flx spec-56321-GAC099N59V1_sp01-005.nor f
sent rank4: LAMOSTv108 spec-56321-GAC099N59V1_sp01-006.flx spec-56321-GAC099N59V1_sp01-006.nor f

preview:LAMOSTv108 spec-56321-GAC099N59V1_sp01-008.flx spec-56321-GAC099N59V1_sp01-008.nor f

rank=4 recieved data:LAMOSTv108 spec-56321-GAC099N59V1_sp01-006.flx spec-56321-GAC099N59V1_sp01-006.nor f
[node2:27622] *** Process received signal ***
[node2:27622] Signal: Segmentation fault (11)
[node2:27622] Signal code: Address not mapped (1)
[node2:27622] Failing at address: (nil)
[node2:27622] *** End of error message ***
--------------------------------------------------------------------------
Primary job  terminated normally, but 1 process returned
a non-zero exit code. Per user-direction, the job has been aborted.
--------------------------------------------------------------------------
corrupted double-linked list
Aborted

It sometimes makes it to rank 5 before aborting completely, and there can be multiple instances of the same error message. Also, the installed Open MPI supports multithreading, so that is not the issue. This is my first time using MPI, but it is not the first part of the overall project, and I have done a lot of research on MPI just to get this far.

I know it is not caused by my arrays, because then it would break on node1 as well. All the Pis are identical, so it makes no sense for an array to be causing the segmentation fault. (Although I admit I have run into that problem several times while working on different parts of this project, since I am more used to the way Java and C# handle arrays.)

Edit: I checked whether I could run it on 4 cores of one of the other nodes, and it works fine and produces the same output as it did on node1. So that confirms it is not an array problem that only occurs on the other nodes. Also added a line for the preview printout that was missing from the code.

Edit2: Per Gilles' suggestion: the code also works when running 16 tasks all on a single node. Here is the output:

$ mpiexec -N 16 --host 10.0.0.3 -oversubscribe batTest4 shortpass.bat
preview:LAMOSTv108 spec-56321-GAC099N59V1_sp01-001.flx spec-56321-GAC099N59V1_sp01-001.nor f

rank0 gets: ./LAMOSTv108 spec-56321-GAC099N59V1_sp01-001.flx spec-56321-GAC099N59V1_sp01-001.nor f

preview:LAMOSTv108 spec-56321-GAC099N59V1_sp01-003.flx spec-56321-GAC099N59V1_sp01-003.nor f

sent rank1: LAMOSTv108 spec-56321-GAC099N59V1_sp01-003.flx spec-56321-GAC099N59V1_sp01-003.nor f

preview:LAMOSTv108 spec-56321-GAC099N59V1_sp01-004.flx spec-56321-GAC099N59V1_sp01-004.nor f

sent rank2: LAMOSTv108 spec-56321-GAC099N59V1_sp01-004.flx spec-56321-GAC099N59V1_sp01-004.nor f
preview:LAMOSTv108 spec-56321-GAC099N59V1_sp01-005.flx spec-56321-GAC099N59V1_sp01-005.nor f

sent rank3: LAMOSTv108 spec-56321-GAC099N59V1_sp01-005.flx spec-56321-GAC099N59V1_sp01-005.nor f

preview:LAMOSTv108 spec-56321-GAC099N59V1_sp01-006.flx spec-56321-GAC099N59V1_sp01-006.nor f

sent rank4: LAMOSTv108 spec-56321-GAC099N59V1_sp01-006.flx spec-56321-GAC099N59V1_sp01-006.nor f

preview:LAMOSTv108 spec-56321-GAC099N59V1_sp01-008.flx spec-56321-GAC099N59V1_sp01-008.nor f

sent rank5: LAMOSTv108 spec-56321-GAC099N59V1_sp01-008.flx spec-56321-GAC099N59V1_sp01-008.nor f

preview:LAMOSTv108 spec-56321-GAC099N59V1_sp01-010.flx spec-56321-GAC099N59V1_sp01-010.nor f

sent rank6: LAMOSTv108 spec-56321-GAC099N59V1_sp01-010.flx spec-56321-GAC099N59V1_sp01-010.nor f

preview:LAMOSTv108 spec-56321-GAC099N59V1_sp01-013.flx spec-56321-GAC099N59V1_sp01-013.nor f

sent rank7: LAMOSTv108 spec-56321-GAC099N59V1_sp01-013.flx spec-56321-GAC099N59V1_sp01-013.nor f

preview:LAMOSTv108 spec-56321-GAC099N59V1_sp01-015.flx spec-56321-GAC099N59V1_sp01-015.nor f

sent rank8: LAMOSTv108 spec-56321-GAC099N59V1_sp01-015.flx spec-56321-GAC099N59V1_sp01-015.nor f

preview:LAMOSTv108 spec-56321-GAC099N59V1_sp01-018.flx spec-56321-GAC099N59V1_sp01-018.nor f

sent rank9: LAMOSTv108 spec-56321-GAC099N59V1_sp01-018.flx spec-56321-GAC099N59V1_sp01-018.nor f

preview:LAMOSTv108 spec-56321-GAC099N59V1_sp01-022.flx spec-56321-GAC099N59V1_sp01-022.nor f

sent rank10: LAMOSTv108 spec-56321-GAC099N59V1_sp01-022.flx spec-56321-GAC099N59V1_sp01-022.nor f

preview:LAMOSTv108 spec-56321-GAC099N59V1_sp01-023.flx spec-56321-GAC099N59V1_sp01-023.nor f

sent rank11: LAMOSTv108 spec-56321-GAC099N59V1_sp01-023.flx spec-56321-GAC099N59V1_sp01-023.nor f

preview:LAMOSTv108 spec-56321-GAC099N59V1_sp01-024.flx spec-56321-GAC099N59V1_sp01-024.nor f

sent rank12: LAMOSTv108 spec-56321-GAC099N59V1_sp01-024.flx spec-56321-GAC099N59V1_sp01-024.nor f

preview:LAMOSTv108 spec-56321-GAC099N59V1_sp01-025.flx spec-56321-GAC099N59V1_sp01-025.nor f

sent rank13: LAMOSTv108 spec-56321-GAC099N59V1_sp01-025.flx spec-56321-GAC099N59V1_sp01-025.nor f

preview:LAMOSTv108 spec-56321-GAC099N59V1_sp01-028.flx spec-56321-GAC099N59V1_sp01-028.nor f

sent rank14: LAMOSTv108 spec-56321-GAC099N59V1_sp01-028.flx spec-56321-GAC099N59V1_sp01-028.nor f

preview:LAMOSTv108 spec-56321-GAC099N59V1_sp01-029.flx spec-56321-GAC099N59V1_sp01-029.nor f

sent rank15: LAMOSTv108 spec-56321-GAC099N59V1_sp01-029.flx spec-56321-GAC099N59V1_sp01-029.nor f

rank=3 recieved data:LAMOSTv108 spec-56321-GAC099N59V1_sp01-005.flx spec-56321-GAC099N59V1_sp01-005.nor f
rank=5 recieved data:LAMOSTv108 spec-56321-GAC099N59V1_sp01-008.flx spec-56321-GAC099N59V1_sp01-008.nor f
rank=6 recieved data:LAMOSTv108 spec-56321-GAC099N59V1_sp01-010.flx spec-56321-GAC099N59V1_sp01-010.nor f
rank=7 recieved data:LAMOSTv108 spec-56321-GAC099N59V1_sp01-013.flx spec-56321-GAC099N59V1_sp01-013.nor f
rank=11 recieved data:LAMOSTv108 spec-56321-GAC099N59V1_sp01-023.flx spec-56321-GAC099N59V1_sp01-023.nor f
rank=12 recieved data:LAMOSTv108 spec-56321-GAC099N59V1_sp01-024.flx spec-56321-GAC099N59V1_sp01-024.nor f
rank=9 recieved data:LAMOSTv108 spec-56321-GAC099N59V1_sp01-018.flx spec-56321-GAC099N59V1_sp01-018.nor f
rank=2 recieved data:LAMOSTv108 spec-56321-GAC099N59V1_sp01-004.flx spec-56321-GAC099N59V1_sp01-004.nor f
rank=4 recieved data:LAMOSTv108 spec-56321-GAC099N59V1_sp01-006.flx spec-56321-GAC099N59V1_sp01-006.nor f
rank=8 recieved data:LAMOSTv108 spec-56321-GAC099N59V1_sp01-015.flx spec-56321-GAC099N59V1_sp01-015.nor f
rank=10 recieved data:LAMOSTv108 spec-56321-GAC099N59V1_sp01-022.flx spec-56321-GAC099N59V1_sp01-022.nor f
rank=15 recieved data:LAMOSTv108 spec-56321-GAC099N59V1_sp01-029.flx spec-56321-GAC099N59V1_sp01-029.nor f
rank=1 recieved data:LAMOSTv108 spec-56321-GAC099N59V1_sp01-003.flx spec-56321-GAC099N59V1_sp01-003.nor f
rank=13 recieved data:LAMOSTv108 spec-56321-GAC099N59V1_sp01-025.flx spec-56321-GAC099N59V1_sp01-025.nor f
rank=14 recieved data:LAMOSTv108 spec-56321-GAC099N59V1_sp01-028.flx spec-56321-GAC099N59V1_sp01-028.nor f

Not sure if that is the problem, but it is definitely a problem:

You are reading and then sending 85 characters from batLine here:

char batLine[86];

//fgets needs to be character count of longest line + 2 or it fails
if(fgets(batLine,86,input) != NULL)
{
    // ...
    MPI_Send(batLine,85,MPI_CHAR,i,i,MPI_COMM_WORLD);
    // ...
}

Given that batLine[] has 86 elements and LAMOSTv108 spec-56321-GAC099N59V1_sp01-001.flx spec-56321-GAC099N59V1_sp01-001.nor f\n is 85 characters long, the string you send does not include the \0 terminator, which sits in the 86th array element.
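As a minimal sketch of one way to keep the terminator with the message (same variable names as in the question; the receive count is only an upper bound in MPI, so the receiving side does not have to change), you could send strlen(batLine) + 1 characters instead of a fixed 85:

//sketch only: length varies per line, the +1 carries the '\0' along
int msgLen = (int)strlen(batLine) + 1;
MPI_Send(batLine, msgLen, MPI_CHAR, i, i, MPI_COMM_WORLD);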

On the receiving side you have:

char sentbatch[86];

{
    char command[89] = "./";
    // ...
    MPI_Recv(sentbatch,86,MPI_CHAR,0,rank,MPI_COMM_WORLD,&stat);
    strcat(command,sentbatch);
    // ...
}

sentbatch is never initialized, so initially it contains garbage. Since all incoming messages are 85 characters long, the 86th character is never overwritten and keeps whatever garbage was there to begin with. Therefore, if that is not \0, strcat() will keep reading garbage from sentbatch past the 85th character and appending it to command. Since both command and sentbatch are on the stack, the read will continue until it hits a 0x00 somewhere on the stack, at which point writing past the end of command will have clobbered other local variables or even the stack frame, leading to a potential segfault later, or until it reaches the end of the stack, which will certainly segfault. That it sometimes works for some ranks is pure luck.

Either change the MPI_Send to send 86 characters, or explicitly zero the 86th element of sentbatch. Or, better yet, use strncat(command, sentbatch, 85) to append no more than 85 characters, or receive directly into command:

MPI_Recv(&command[2],86,MPI_CHAR,0,rank,MPI_COMM_WORLD,&stat);

char command[89] = "./"; fills the remaining 87 elements of command[] with \0, so there is no problem with the terminator in that case.
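To illustrate the receiver-side options above, here is a minimal sketch using the same buffer sizes as the question (either the zero-initialization or the forced terminator alone would be enough; both are shown for clarity):

char sentbatch[86] = {0};   //zero-initialized, so a missing '\0' cannot leak garbage
char command[89] = "./";
MPI_Status stat;

//86 is only an upper bound: the actual message may be shorter
MPI_Recv(sentbatch, 86, MPI_CHAR, 0, rank, MPI_COMM_WORLD, &stat);
sentbatch[85] = '\0';                 //force termination just in case
strncat(command, sentbatch, 85);      //never appends more than 85 characters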

After a lot of searching through similar questions, I finally found the answer to what was wrong with my code. It just took searching for the error message with many different possible inputs.

The lines:

input = fopen(argv[1],"r");
fclose(input);

only need to be inside rank 0. (Presumably the input file only exists on node1, so on the other nodes fopen() returns NULL and the later fclose(NULL) crashes.) That means the correct code, set up to run on multiple nodes, is:

//has file open and closed moved to hopefully work on multiple nodes
//now only occurs for task0 which is on node1
#include "mpi.h"
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <stdbool.h>
#include <time.h> 
int main(int argc, char *argv[])
{
        FILE *input;
        char batLine[86];   //may need to be made larger if bat commands get longer
        char sentbatch[86];
        int currentTask;
        int numTasks, rank, rc, i;
        MPI_Status stat;
        bool exitFlag = false;

        //mpi stuff
        MPI_Init(&argc,&argv);  //initilize mpi enviroment
        MPI_Comm_size(MPI_COMM_WORLD, &numTasks);
        MPI_Comm_rank(MPI_COMM_WORLD,&rank);
        //printf("Number of tasks: %d \n", numTasks);
        //printf ("MPI task %d has started...\n", rank);
        if(argc != 2)
        {
            printf("Usage: batallocation *.bat");
            exit(1); //exit with 1 indicates a failure
        }
        if (rank ==0)
        {
            //contains file name: argv[1]
            input = fopen(argv[1],"r");
            while(1)
            {
                if(exitFlag)
                    break; //allows to break out of while and for when no more lines exist
                char command[89] = "./";
                for(i=0; i < 16; i++) //will need to be 16 for full testing
                {

                    //fgets needs to be character count of longest line + 2 or it fails
                    if(fgets(batLine,86,input) != NULL)
                    {
                        if(i==0)
                        {
                            strcat(command,batLine);
                            printf("rank0 gets: %s\n", command);
                            //system(command);
                        }
                        else
                        {
                            //MPI_Send(buffer,count,type,dest,tag,comm)
                            MPI_Send(batLine,85,MPI_CHAR,i,i,MPI_COMM_WORLD); 
                            printf("sent rank%d: %s\n",i,batLine);
                        }
                    }
                    else
                    {
                        exitFlag = true; //flag to break out of while loop
                        break;
                    }


                }   
                //need to recieve data from other nodes here
                //put the data together in proper order
                //and only after that can the next sets be sent out

            }
            fclose(input);
        }
        else
        {
            char command[89] = "./";
            //MPI_Recv(buffer,count,type,source,tag,comm,status)
            MPI_Recv(sentbatch,86,MPI_CHAR,0,rank,MPI_COMM_WORLD,&stat);
            //using rank as flag makes it so only the wanted rank gets sent the data
            strcat(command,sentbatch); //adds needed ./ before batch data
            printf("rank=%d recieved data:%s",rank,sentbatch);
            //system(command); //should run batch line
        }

        MPI_Finalize();
        return(0);
}

I do not know whether answering your own question is acceptable, but I wanted to make sure that if anyone runs into the same problem, they know how to fix it. I know I hate it when I find a similar question and only see the asker edit in that they solved it, without explaining how.