mpich-3.3 中的死锁，但在其他版本中没有

Question

我有以下程序，它在 openmpi 和 mpich-3.2.1 下可以正常编译和运行，但在使用 mpich-3.3 时会在调用 MPI_Waitany 处死锁。程序必须以偶数个进程（rank）运行，其中一半进程通过组间通信器（intercommunicator）向另一半发送数据。这是一个大得多的自定义模拟程序的精简版本，我已尽量把它缩减为一个最小示例。

一个奇怪的部分是将 nwork 变量增加到 2 会使死锁消失。

#include <cstdio>
#include <mpi.h>
#include <vector>

// Payload of the termination message that tells the receiving side to stop.
const int endMsg = -1;
// Tag reserved for the termination message. MPI only guarantees that tags up
// to 32767 are valid (MPI_TAG_UB >= 32767), so the tag stays within that bound
// to remain portable across implementations.
const int endTag = 32767;

/// One stream of messages exchanged over the inter-communicator.
///
/// The same class is used on both sides: the sending side calls
/// workAndSend(), the receiving side alternates wait() and recv().
/// Every message consists of two parts, a size (tag 2*tag+0) followed by the
/// payload bytes (tag 2*tag+1). The peer is addressed in `interComm` by the
/// same rank number this process has in `comm`.
class Work
{
public:
    /// @param comm       communicator used to determine this process' rank
    /// @param interComm  inter-communicator over which messages are exchanged
    /// @param tag        identifier of this stream; also determines MPI tags
    Work(const MPI_Comm& comm, const MPI_Comm& interComm, int tag) :
        comm(comm), interComm(interComm), tag(tag)
    {
        MPI_Comm_rank(comm, &rank);
    }

    /// Block until both parts of the previously posted send have completed.
    /// On the very first call both requests are MPI_REQUEST_NULL, for which
    /// MPI_Wait returns immediately.
    void waitPrevSend()
    {
        printf("[work %d] waiting for previous message\n", tag);
        MPI_Wait(&sizeReq, MPI_STATUS_IGNORE);
        MPI_Wait(&dataReq, MPI_STATUS_IGNORE);
        sizeReq = MPI_REQUEST_NULL;
        dataReq = MPI_REQUEST_NULL;
    }

    /// Sender side: wait for the previous message, then post synchronous
    /// non-blocking sends of the size and of the payload.
    void workAndSend()
    {
        // The send buffers (sizeInBytes, data) must not be modified while the
        // previous sends are still in flight.
        waitPrevSend();
        printf("[work %d] creating data\n", tag);
        data.resize(tag + 42, tag);
        // MPI counts are int; convert once, explicitly, and reuse the value.
        sizeInBytes = static_cast<int>(data.size());
        MPI_Issend(&sizeInBytes, 1, MPI_INT, rank, 2*tag+0, interComm, &sizeReq);
        MPI_Issend(data.data(), sizeInBytes, MPI_BYTE, rank, 2*tag+1, interComm, &dataReq);
        printf("[work %d] has sent %d bytes of data\n", tag, sizeInBytes);
    }


    /// Receiver side: post a non-blocking receive for the size part.
    /// @return the request of the posted receive; sizeInBytes is only valid
    ///         once that request has completed.
    MPI_Request wait()
    {
        MPI_Request req;
        printf("[work %d] posted recv of size\n", tag);
        MPI_Irecv(&sizeInBytes, 1, MPI_INT, rank, 2*tag+0, interComm, &req);
        return req;
    }

    /// Receiver side: blocking receive of the payload. Must only be called
    /// after the request returned by wait() has completed, because it uses
    /// sizeInBytes as the payload length.
    void recv()
    {
        data.resize(sizeInBytes);
        MPI_Recv(data.data(), sizeInBytes, MPI_BYTE, rank, 2*tag+1, interComm, MPI_STATUS_IGNORE);
        printf("[work %d] has recved %d bytes of data\n", tag, sizeInBytes);
    }


    MPI_Comm comm, interComm;
    int rank;
    int tag;


    // Pending send requests for the size part and the payload part.
    MPI_Request sizeReq {MPI_REQUEST_NULL}, dataReq {MPI_REQUEST_NULL};
    std::vector<char> data;
    // Payload length in bytes; initialized so it never holds an indeterminate value.
    int sizeInBytes {0};
};

class Master
{
public:
    Master(const MPI_Comm& comm, const MPI_Comm& interComm) :
        comm(comm), interComm(interComm)
    {
        MPI_Comm_rank(comm, &rank);
    }

    void run(std::vector<Work>& work, int niter)
    {
        for (int i = 0; i < niter; ++i)
            for (auto& w : work)
                w.workAndSend();
        sendEndMsg();
    }

    void sendEndMsg()
    {
        MPI_Ssend(&endMsg, 1, MPI_INT, rank, endTag, interComm);
    }

    MPI_Comm comm, interComm;
    int rank;
};

class Slave
{
public:
    Slave(const MPI_Comm& comm, const MPI_Comm& interComm) :
        comm(comm), interComm(interComm)
    {
        MPI_Comm_rank(comm, &rank);
    }

    void run(std::vector<Work>& work)
    {
        std::vector<MPI_Request> reqs;
        for (auto& w : work)
            reqs.push_back(w.wait());
        reqs.push_back(recvEndMsg());

        while (true)
        {
            int id;
            MPI_Status status;
            printf("waiting for one of %d requests to complete\n", (int) reqs.size());
            MPI_Waitany(reqs.size(), reqs.data(), &id, &status);

            if (id == (int) reqs.size() - 1)
            {
                for (auto& req : reqs)
                {
                    if (req != MPI_REQUEST_NULL)
                    {
                        MPI_Cancel(&req);
                        MPI_Request_free(&req);
                    }
                }
                return;
            }
            else
            {
                work[id].recv();
                reqs[id] = work[id].wait();
            }
        }
    }

    MPI_Request recvEndMsg()
    {
        MPI_Request req;
        int msg;
        MPI_Irecv(&msg, 1, MPI_INT, rank, endTag, interComm, &req);
        return req;
    }

    MPI_Comm comm, interComm;
    int rank;
};


int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);

    int rank;
    int size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    if ((size%2) != 0)
        MPI_Abort(MPI_COMM_WORLD, 1);

    MPI_Comm teamComm, interComm;
    int team = rank % 2;

    MPI_Comm_split(MPI_COMM_WORLD, team, rank, &teamComm);

    const int localLeader  = 0;
    const int remoteLeader = team ? 0 : 1;
    const int tag = 42;
    const int nwork = 1;

    MPI_Intercomm_create(teamComm, localLeader, MPI_COMM_WORLD, remoteLeader, tag, &interComm);

    std::vector<Work> work;
    for (int i = 0; i < nwork; ++i)
        work.emplace_back(Work(teamComm, interComm, i));

    if (team == 0)
    {
        Master master(teamComm, interComm);
        master.run(work, 10);
    }
    else
    {
        Slave slave(teamComm, interComm);
        slave.run(work);
    }

    MPI_Comm_free(&interComm);
    MPI_Comm_free(&teamComm);

    MPI_Finalize();
    return 0;
}

运行

mpirun -n 2 -l ./test_intercomm

仅在 mpich-3.3 中导致死锁。有什么想法吗？

编辑：我还按照建议把结束消息的标签（endTag）减小到了更小的值，行为仍然相同。上述命令的输出为：

[0] [work 0] waiting for previous message
[0] [work 0] creating data
[0] [work 0] has sent 42 bytes of data
[1] [work 0] posted recv of size
[1] waiting for one of 2 requests to complete
[0] [work 0] waiting for previous message

因此 rank 1 卡在 MPI_Waitany 上，而 rank 0 卡在等待发送请求完成上（即第二个请求——实际数据；与之对应的接收只有在 rank 1 通过 MPI_Waitany 之后才会被发布）。在我看来，MPI_Waitany 似乎阻塞了一切。

Answer 1

这是由 MPICH v3.3 中的错误引起的。它已在提交 0f7be7196cc05bf0c908761e148628e88d635190 中修复。将修复应用到 v3.3 解决了死锁。

此修复包含在版本 3.3.1 中，因此您应该升级到该版本。

为了提供更多上下文，提交消息说：

Both testany and waitany functions skip over inactive or NULL requests before handing down to the device layer. However, the method for discovering the first non-NULL request could erroneously skip the first request in the array. To fix, we initialize the first non-NULL request to an invalid index in the array (count) and set to valid index later if one is found.

mpich-3.3 中的死锁，但在其他版本中没有

Deadlock in mpich-3.3 but not in other versions

c++

mpi