进程共享条件变量：一个进程死后如何恢复？

Question

我正在开发一个简单的 FIFO 队列来同步服务器进程的多个实例。

这与 Linux synchronization with FIFO waiting queue, except dealing with multiple processes instead of threads. I adapted caf's ticket lock 使用共享内存段中的进程共享互斥量和条件变量。它还处理超时，以防一个进程在处理请求时死亡：

#include <assert.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <pthread.h>
#include <signal.h>
#include <errno.h>

static inline void fail(char *str)
{
    perror(str);
    exit(1);
}

/***************************************************************************************************/

/* Simple ticket lock queue with pthreads
 * 
 */

typedef struct ticket_lock {
    pthread_mutex_t mutex;
    pthread_cond_t  cond;
    int queue_head, queue_tail;
} ticket_lock_t;

static void
ticket_init(ticket_lock_t *t)
{
    pthread_mutexattr_t mattr;
    pthread_mutexattr_init(&mattr);
    pthread_mutexattr_setpshared(&mattr, PTHREAD_PROCESS_SHARED);
    pthread_mutexattr_setrobust(&mattr, PTHREAD_MUTEX_ROBUST);
    pthread_mutex_init(&t->mutex, &mattr);
    pthread_mutexattr_destroy(&mattr);

    pthread_condattr_t cattr;
    pthread_condattr_init(&cattr);
    pthread_condattr_setpshared(&cattr, PTHREAD_PROCESS_SHARED);
    pthread_cond_init(&t->cond, &cattr);
    pthread_condattr_destroy(&cattr);

    t->queue_head = t->queue_tail = 0;
}

static void
ticket_broadcast(ticket_lock_t *ticket)
{
    pthread_cond_broadcast(&ticket->cond);
}

static int
ticket_lock(ticket_lock_t *ticket)
{
    pthread_mutex_lock(&ticket->mutex);
    int queue_me = ticket->queue_tail++;

    while (queue_me > ticket->queue_head) {
        time_t sec = time(NULL) + 5;  /* 5s timeout */
        struct timespec ts = { .tv_sec = sec, .tv_nsec = 0 };

        fprintf(stderr, "%i: waiting, current: %i  me: %i\n", getpid(), ticket->queue_head, queue_me);

        if (pthread_cond_timedwait(&ticket->cond, &ticket->mutex, &ts) == 0)
            continue;

        if (errno != ETIMEDOUT) fail("pthread_cond_timedwait");

        /* Timeout, kick current user... */
        fprintf(stderr, "kicking stale ticket %i\n", ticket->queue_head);
        ticket->queue_head++;
        ticket_broadcast(ticket);
    }

    pthread_mutex_unlock(&ticket->mutex);
    return queue_me;
}

static void
ticket_unlock(ticket_lock_t *ticket, int me)
{
    pthread_mutex_lock(&ticket->mutex);
    if (ticket->queue_head == me) {  /* Normal case: we haven't timed out. */
        ticket->queue_head++;
        ticket_broadcast(ticket);
    }
    pthread_mutex_unlock(&ticket->mutex);
}


/***************************************************************************************************/
/* Shared memory */

#define SHM_NAME    "fifo_sched"
#define SHM_MAGIC   0xdeadbeef

struct sched_shm {
    int  size;
    int  magic;
    int  ready;

    /* sched stuff */
    ticket_lock_t queue;
};

static unsigned int shm_size = 256;
static struct sched_shm *shm = 0;

/* Create new shared memory segment */
static void
create_shm()
{
       int fd = shm_open(SHM_NAME, O_RDWR | O_CREAT | O_TRUNC, 0644);
       assert(fd != -1);
       int r = ftruncate(fd, shm_size);  assert(r == 0);
       void *pt = mmap(0, shm_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
       assert(pt != MAP_FAILED);
       fprintf(stderr, "Created shared memory.\n");

       shm = pt;
       memset(shm, 0, sizeof(*shm));
       shm->size = shm_size;
       shm->magic = SHM_MAGIC;
       shm->ready = 0;

       ticket_init(&shm->queue);

       shm->ready = 1;
}

/* Attach existing shared memory segment */
static int
attach_shm()
{
    int fd = shm_open(SHM_NAME, O_RDWR, 0);
    if (fd == -1)  return 0;  /* Doesn't exist yet... */

    shm = mmap(0, shm_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    if (shm == MAP_FAILED)  fail("mmap");
    fprintf(stderr, "Mapped shared memory.\n");

    assert(shm->magic == SHM_MAGIC);
    assert(shm->ready);
    return 1;
}

static void
shm_init()
{  
    fprintf(stderr, "shm_init()\n");
    assert(shm_size >= sizeof(struct sched_shm));
    if (!attach_shm())
        create_shm();
}


/***************************************************************************************************/

int main()
{
    shm_init();

    while (1) {
        int ticket = ticket_lock(&shm->queue);
        printf("%i: start %i\n", getpid(), ticket);
        printf("%i: done  %i\n", getpid(), ticket);
        ticket_unlock(&shm->queue, ticket);
    }
    return 0;
}

这在独立运行和添加额外进程时运行良好：

$ gcc -g -Wall -std=gnu99 -o foo foo.c -lpthread -lrt
$ ./foo
$ ./foo     # (in other term)
...
26370: waiting, current: 134803  me: 134804
26370: start 134804
26370: done  134804
26370: waiting, current: 134805  me: 134806
26370: start 134806
26370: done  134806
26370: waiting, current: 134807  me: 134808

然而，杀死第二个实例会破坏第一个实例中的 pthread_cond_timedwait()：

pthread_cond_timedwait: No such file or directory

这在某种程度上是有道理的，条件变量一直在跟踪这个过程，但它不再存在了。

肯定有办法从中恢复吗？

Answer 1

[评论太长]

 pthread_cond_timedwait: No such file or directory

呼！ :-)

pthread_*() 系列函数不会设置 errno 任何错误代码，但 returns它。

因此，要获得任何可用的结果，请更改此

    if (pthread_cond_timedwait(&ticket->cond, &ticket->mutex, &ts) == 0)
        continue;

    if (errno != ETIMEDOUT) fail("pthread_cond_timedwait");

成为

    if ((errno = pthread_cond_timedwait(&ticket->cond, &ticket->mutex, &ts)) == 0)
        continue;

    if (errno != ETIMEDOUT) fail("pthread_cond_timedwait");

Answer 2

好的，引用 posix pthread_mutex_lock() 参考：

If mutex is a robust mutex and the process containing the owning thread terminated while holding the mutex lock, a call to pthread_mutex_lock() shall return the error value [EOWNERDEAD]. [...] In these cases, the mutex is locked by the thread but the state it protects is marked as inconsistent. The application should ensure that the state is made consistent for reuse and when that is complete call pthread_mutex_consistent(). If the application is unable to recover the state, it should unlock the mutex without a prior call to pthread_mutex_consistent(), after which the mutex is marked permanently unusable.

因此，除了 alk 的评论以稳健地处理因互斥锁锁定而死亡的进程外，我们还需要在调用 pthread_mutex_lock() 和 pthread_cond_timedwait() 时注意 EOWNERDEAD，并调用 pthread_mutex_consistent（）在上面。

类似于：

if ((errno = pthread_cond_timedwait(&ticket->cond, &ticket->mutex, &ts)) == 0)
    continue;
if (errno == EOWNERDEAD)  /* Recover mutex owned by dead process */
    pthread_mutex_consistent(&ticket->mutex);
else if (errno != ETIMEDOUT)
    fail("pthread_cond_timedwait");

进程共享条件变量：一个进程死后如何恢复？

Process-shared condition variable : how to recover after one process dies?

c

linux

pthreads

condition-variable