c fork() 和 kill() 同时不起作用?

c fork() and kill() at the same time not working?

主程序:启动一定数量的child进程,然后立即发送SIGINT

/*
 * Parent test driver: forks CHILDS children, each exec'ing the helper
 * binary with the argument "2", then immediately sends SIGINT to every
 * child that was actually started and reaps them all.
 *
 * Nothing synchronises fork() with kill(), so a child may receive the
 * signal before it has installed its handler (or before execv() has even
 * run); SIGINT's default action then terminates it.
 *
 * Always returns 0.
 */
int     main()
{
    pid_t   childs[CHILDS];
    char    *execv_argv[3];
    int     n = CHILDS;

    execv_argv[0] = "./debugging_procs/wait_time_at_interrupt";
    execv_argv[1] = "2";
    execv_argv[2] = NULL;

    for (int i = 0; i < n; i++)
    {
        childs[i] = fork();
        if (childs[i] == -1)
        {
            /* errno is only meaningful right after a call reported failure. */
            perror("fork");
            /*
             * Stop here and only signal the children that exist: a -1 must
             * never reach kill(), because kill(-1, sig) targets every
             * process this one is allowed to signal.
             */
            n = i;
            break;
        }
        if (childs[i] == 0)
        {
            execv(execv_argv[0], execv_argv);
            /* execv() only returns when it failed. */
            perror(execv_argv[0]);
            _exit(1);
        }
    }

    // sleep(1);  /* enabling this delay hides the race instead of fixing it */

    for (int i = 0; i < n; i++)
    {
        if (kill(childs[i], SIGINT) == -1)
            perror("kill");
    }

    // Wait for all children.
    while (wait(NULL) > 0);

    return 0;
}

分叉程序:等待任何信号,如果发送 SIGINT,则打开某个文件并将 SIGINT 和当前 pid 写入其中并等待指定的秒数(在这种情况下,我从主程序发送 2 ).

#include <signal.h>
#include <stdlib.h>
#include <unistd.h>
#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <string.h>

/*
 * SIGINT handler: appends the line "SIGINT <pid>\n" to ./aux/log1.
 *
 * The line is assembled by hand because sprintf() is not on the POSIX list
 * of async-signal-safe functions; only open(), getpid(), write() and
 * close() are called from here.
 *
 * The log file is opened without O_CREAT, so it must already exist; if it
 * cannot be opened the handler returns without writing anything.
 */
void    sigint_handler(int signum)
{
    static const char   prefix[] = "SIGINT ";
    char                buf[124];
    char                digits[32];
    const char          *out = buf;
    size_t              len = 0;
    size_t              ndigits = 0;
    long                pid = (long)getpid();
    int                 fd;

    (void)signum;

    fd = open("./aux/log1", O_WRONLY | O_APPEND);
    if (fd == -1)
        return;

    for (size_t i = 0; i < sizeof prefix - 1; i++)
        buf[len++] = prefix[i];

    /* Collect the decimal digits least-significant first, then reverse. */
    do
    {
        digits[ndigits++] = (char)('0' + pid % 10);
        pid /= 10;
    } while (pid > 0);
    while (ndigits > 0)
        buf[len++] = digits[--ndigits];
    buf[len++] = '\n';

    /* Best effort: retry after a short write, give up on an error. */
    while (len > 0)
    {
        ssize_t written = write(fd, out, len);

        if (written <= 0)
            break;
        out += written;
        len -= (size_t)written;
    }
    close(fd);
}

/*
 * Installs the SIGINT handler, blocks in pause() until a handled signal
 * arrives, then lingers for a number of seconds before exiting.
 *
 * argv[1] (optional): seconds to linger after the signal; defaults to 5.
 *
 * Returns 0 on success, EXIT_FAILURE when argv[1] is not a valid
 * non-negative number of seconds or the handler cannot be installed.
 */
int     main(int argc, char **argv)
{
    unsigned int    wait_time = 5;

    if (argc > 1)
    {
        char    *end;
        long    value = strtol(argv[1], &end, 10);

        /*
         * atoi() would silently turn garbage into 0 and a negative number
         * into an enormous unsigned sleep, so validate explicitly.
         */
        if (end == argv[1] || *end != '\0' || value < 0
            || (unsigned long)value > (unsigned int)-1)
        {
            fprintf(stderr, "%s: Invalid wait time.\n", argv[1]);
            return EXIT_FAILURE;
        }
        wait_time = (unsigned int)value;
    }

    if (signal(SIGINT, &sigint_handler) == SIG_ERR)
    {
        perror("signal");
        return EXIT_FAILURE;
    }

    // Wait for any signal.
    pause();

    /*
     * sleep() returns the unslept remainder when a handled signal cuts it
     * short; keep sleeping until the whole delay has elapsed.
     */
    while (wait_time > 0)
        wait_time = sleep(wait_time);
    return 0;
}

问题是,children 应该写入的日志文件没有 n 行,这意味着并非所有 children 都写入了它。有时没有人写任何东西,主程序根本没有 wait (意味着在这种情况下 sleep() 没有被调用)。

但是如果我在主程序中取消注释 sleep(1),一切都会如我所料。

我怀疑 child 进程没有足够的时间来收听 SIGINT

我正在处理的程序是一个任务控件,当我 运行 一个命令时: restart my_program; restart my_program 我的行为不稳定。当我调用重新启动时,发送一个 SIGINT,然后调用一个新的 fork(),然后发送另一个 SIGINT,就像上面的示例一样。

如何确保所有 children 都能在没有 sleep(1) 行的情况下解析 SIGINT?我正在测试我的程序是否可以处理在发送 SIGINT 后不会立即退出的程序。

例如,如果我在 child 程序的顶部添加 printf("child process started\n");,它不会被打印并且主程序不会等待任何东西,除非我 sleep 一秒钟。即使只有 1 个 child 进程也会发生这种情况。

尝试在 for 循环中使用 waitpid() 命令。这样下一个 child 只会在第一个 child 完成后写入

一切正常。你的一些 child 进程在设置信号处理程序之前,甚至在它们开始执行 child 二进制文件之前就被信号杀死了。

在您的 parent 进程中,您可以检查每个进程的标识和退出状态,而不是仅仅 wait() 直到没有更多的 child 进程。将 while (wait(NULL) > 0); 替换为

{
    /* Reap every remaining child and report how each one ended. */
    pid_t  pid;
    int    wstatus;

    for (;;) {
        pid = wait(&wstatus);
        if (pid <= 0)
            break;

        if (WIFEXITED(wstatus)) {
            /* Normal termination: show the exit status. */
            printf("Child %ld exit status was %d.\n", (long)pid, WEXITSTATUS(wstatus));
        } else if (WIFSIGNALED(wstatus)) {
            /* Terminated by a signal: show which one. */
            printf("Child %ld was killed by signal %d.\n", (long)pid, WTERMSIG(wstatus));
        } else {
            printf("Child %ld was lost.\n", (long)pid);
        }
        fflush(stdout);
    }
}

你会看到 "missing" child 进程被信号终止了。这意味着 child 进程在准备好捕捉信号之前就被终止了。


我自己编写了一对示例程序,带有完整的错误检查。我决定使用 sigprocmask() 和 sigwaitinfo() 而不是信号处理程序,只是为了展示做同一件事的另一种方法(这样也不必局限于信号处理程序中允许使用的 async-signal 安全函数)。

parent.c:

#define  _POSIX_C_SOURCE 200809L
#include <stdlib.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <signal.h>
#include <string.h>
#include <stdio.h>
#include <errno.h>

/*
 * Returns a short name for a signal number: "INT", "HUP" or "TERM" for
 * SIGINT, SIGHUP and SIGTERM; any other number is rendered in decimal.
 *
 * The decimal form lives in a static buffer, so the next call that formats
 * a number overwrites it.
 */
const char *signal_name(const int signum)
{
    static char  buffer[32];

    if (signum == SIGINT)
        return "INT";
    if (signum == SIGHUP)
        return "HUP";
    if (signum == SIGTERM)
        return "TERM";

    snprintf(buffer, sizeof buffer, "%d", signum);
    return buffer;
}

/*
 * qsort() comparator ordering pid_t elements ascending.
 * Returns -1, 0 or +1 when the first pid is smaller than, equal to or
 * larger than the second.
 */
static int compare_pids(const void *p1, const void *p2)
{
    const pid_t *lhs = p1;
    const pid_t *rhs = p2;

    if (*lhs < *rhs)
        return -1;
    if (*lhs > *rhs)
        return +1;
    return 0;
}

/*
 * Forks COUNT child processes that each exec PATH-TO-BINARY, sends every
 * child SIGINT right away, then reaps them and reports how each one ended.
 *
 * Usage: parent COUNT PATH-TO-BINARY [ ARGS ... ]
 *
 * If fork() starts failing after at least one child exists, the test simply
 * continues with the children created so far.
 *
 * Returns EXIT_FAILURE on a usage error, an invalid count, an allocation
 * failure, failure to fork the first child, or a wait() error other than
 * ECHILD; EXIT_SUCCESS otherwise.
 */
int main(int argc, char *argv[])
{
    size_t  count, r, i;
    int     status;
    pid_t  *child, *reaped, p;
    char    dummy;

    if (argc < 3 || !strcmp(argv[1], "-h") || !strcmp(argv[1], "--help")) {
        fprintf(stderr, "\n");
        fprintf(stderr, "Usage: %s [ -h | --help ]\n", argv[0]);
        fprintf(stderr, "       %s COUNT PATH-TO-BINARY [ ARGS ... ]\n", argv[0]);
        fprintf(stderr, "\n");
        fprintf(stderr, "This program will fork COUNT child processes,\n");
        fprintf(stderr, "each child process executing PATH-TO-BINARY.\n");
        fprintf(stderr, "Immediately after all child processes have been forked,\n");
        fprintf(stderr, "they are sent a SIGINT signal.\n");
        fprintf(stderr, "\n");
        return EXIT_FAILURE;
    }

    /* %zu silently wraps a negative number, so refuse a minus sign up front. */
    if (strchr(argv[1], '-') ||
        sscanf(argv[1], " %zu %c", &count, &dummy) != 1 || count < 1) {
        fprintf(stderr, "%s: Invalid count.\n", argv[1]);
        return EXIT_FAILURE;
    }

    /* calloc() rejects a count whose byte size would overflow size_t. */
    child = calloc(count, sizeof child[0]);
    reaped = calloc(count, sizeof reaped[0]);
    if (!child || !reaped) {
        fprintf(stderr, "%s: Count is too large; out of memory.\n", argv[1]);
        free(child);
        free(reaped);
        return EXIT_FAILURE;
    }

    for (i = 0; i < count; i++) {
        p = fork();
        if (p == -1) {
            if (i == 0) {
                fprintf(stderr, "Cannot fork child processes: %s.\n", strerror(errno));
                free(child);
                free(reaped);
                return EXIT_FAILURE;
            } else {
                fprintf(stderr, "Cannot fork child %zu: %s.\n", i + 1, strerror(errno));
                /* Carry on with the children that were created. */
                count = i;
                break;
            }
        } else
        if (!p) {
            /* Child process */
            execvp(argv[2], argv + 2);
            {
                /* execvp() only returns on failure. */
                const char *errmsg = strerror(errno);
                fprintf(stderr, "Child process %ld: Cannot execute %s: %s.\n",
                                (long)getpid(), argv[2], errmsg);
                /* _exit(): do not run the parent's atexit handlers or
                   flush its inherited stdio buffers in the forked child. */
                _exit(EXIT_FAILURE);
            }
        } else {
            /* Parent process. */
            child[i] = p;
        }
    }

    /* Send all children the INT signal. */
    for (i = 0; i < count; i++)
        if (kill(child[i], SIGINT) == -1)
            fprintf(stderr, "Cannot send SIGINT to child process %ld: %s.\n",
                            (long)child[i], strerror(errno));

    /* Reap and report each child. */
    r = 0;
    while (1) {
        p = wait(&status);

        if (p == -1) {
            /* ECHILD: no children left to wait for. */
            if (errno == ECHILD)
                break;
            fprintf(stderr, "Error waiting for child processes: %s.\n", strerror(errno));
            free(child);
            free(reaped);
            return EXIT_FAILURE;
        }

        if (r < count)
            reaped[r++] = p;
        else
            fprintf(stderr, "Reaped an extra child process!\n");

        if (WIFEXITED(status)) {
            switch (WEXITSTATUS(status)) {
            case EXIT_SUCCESS:
                printf("Parent: Reaped child process %ld: EXIT_SUCCESS.\n", (long)p);
                break;
            case EXIT_FAILURE:
                printf("Parent: Reaped child process %ld: EXIT_FAILURE.\n", (long)p);
                break;
            default:
                printf("Parent: Reaped child process %ld: Exit status %d.\n", (long)p, WEXITSTATUS(status));
                break;
            }
            fflush(stdout);

        } else
        if (WIFSIGNALED(status)) {
            printf("Parent: Reaped child process %ld: Terminated by %s.\n", (long)p, signal_name(WTERMSIG(status)));
            fflush(stdout);

        } else {
            printf("Parent: Reaped child process %ld: Lost.\n", (long)p);
            fflush(stdout);
        }
    }

    if (r == count) {
        /* Sort both pid arrays so they can be compared element by element. */
        qsort(child, count, sizeof child[0], compare_pids);
        qsort(reaped, count, sizeof reaped[0], compare_pids);
        for (i = 0; i < count; i++)
            if (child[i] != reaped[i])
                break;
        if (i == count)
            printf("Parent: All %zu child processes were reaped successfully.\n", count);
        else
            fprintf(stderr, "Parent: Reaped process IDs do not match the forked child processes.\n");
    } else {
        fprintf(stderr, "Parent: Reaped only %zu of %zu child processes.\n", r, count);
    }

    free(child);
    free(reaped);
    return EXIT_SUCCESS;
}

child.c:

#define  _POSIX_C_SOURCE 200809L
#include <stdlib.h>
#include <unistd.h>
#include <sys/types.h>
#include <signal.h>
#include <string.h>
#include <stdio.h>
#include <errno.h>

/*
 * Maps a signal number to a short printable name.
 *
 * SIGINT, SIGHUP and SIGTERM yield "INT", "HUP" and "TERM"; every other
 * number is formatted in decimal into a static buffer, which a later call
 * that formats a number will overwrite.
 */
const char *signal_name(const int signum)
{
    static char  buffer[32];
    const char  *name = NULL;

    switch (signum) {
    case SIGINT:
        name = "INT";
        break;
    case SIGHUP:
        name = "HUP";
        break;
    case SIGTERM:
        name = "TERM";
        break;
    default:
        break;
    }

    if (name == NULL) {
        snprintf(buffer, sizeof buffer, "%d", signum);
        name = buffer;
    }
    return name;
}

/*
 * Announces itself on standard output, then blocks SIGINT, SIGHUP and
 * SIGTERM and waits synchronously for one of them with sigwaitinfo(),
 * finally reporting which signal arrived and which process sent it.
 *
 * Returns EXIT_SUCCESS once a signal has been received, EXIT_FAILURE when
 * sigwaitinfo() fails.
 */
int main(void)
{
    static const int  wanted[] = { SIGINT, SIGHUP, SIGTERM };
    const long        mypid = getpid();
    sigset_t          mask;
    siginfo_t         si;
    int               signum;
    size_t            k;

    printf("Child: Child process %ld started!\n", mypid);
    fflush(stdout);

    /* The signals are blocked only from this point on; one delivered
       earlier still gets its default disposition. */
    sigemptyset(&mask);
    for (k = 0; k < sizeof wanted / sizeof wanted[0]; k++)
        sigaddset(&mask, wanted[k]);
    sigprocmask(SIG_BLOCK, &mask, NULL);

    signum = sigwaitinfo(&mask, &si);
    if (signum == -1) {
        printf("Child: Child process %ld failed: %s.\n", mypid, strerror(errno));
        return EXIT_FAILURE;
    }

    if (si.si_pid == 0) {
        /* A sender pid of 0 is reported as coming from the terminal. */
        printf("Child: Child process %ld terminated by signal %s via terminal.\n", mypid, signal_name(signum));
    } else if (si.si_pid == getppid()) {
        printf("Child: Child process %ld terminated by signal %s sent by the parent process %ld.\n",
               mypid, signal_name(signum), (long)si.si_pid);
    } else {
        printf("Child: Child process %ld terminated by signal %s sent by process %ld.\n",
               mypid, signal_name(signum), (long)si.si_pid);
    }
    return EXIT_SUCCESS;
}

使用例如以下命令编译两者:
gcc -Wall -O2 parent.c -o parent
gcc -Wall -O2 child.c -o child

并使用例如以下命令运行它们:

./parent 100 ./child

其中 100 是要分叉的 child 进程的数量,每个 child 进程都运行 ./child。

错误输出到标准错误。parent 写到标准输出的每一行都以 Parent: 开头,任何 child 写到标准输出的每一行都以 Child: 开头。

在我的机器上,输出的最后一行始终是 Parent: All # child processes were reaped successfully.,这意味着每个由 fork() 创建的 child 进程都已通过 wait() 回收并报告。没有任何进程丢失,fork() 和 kill() 本身也没有问题。

(请注意,如果您指定的 child 进程数量超过了允许 fork 的数量,parent 程序不会将其视为错误,而只会使用实际创建成功的那些 child 进程进行测试。)

在我的机器上,fork 并回收 100 个 child 进程对 parent 进程来说工作量已经足够大,因此每个 child 进程都来得及运行到准备好捕获信号的那一步。

另一方面,parent 处理 10 个 child 进程(运行 ./parent 10 ./child)的速度非常快,以至于每个 child 进程都在准备好处理信号之前就被 INT 信号杀死了。

下面是运行 ./parent 20 ./child 时一个非常典型的输出:

Child: Child process 19982 started!
Child: Child process 19983 started!
Child: Child process 19984 started!
Child: Child process 19982 terminated by signal INT sent by the parent process 19981.
Child: Child process 19992 started!
Child: Child process 19983 terminated by signal INT sent by the parent process 19981.
Child: Child process 19984 terminated by signal INT sent by the parent process 19981.
Parent: Reaped child process 19982: EXIT_SUCCESS.
Parent: Reaped child process 19985: Terminated by INT.
Parent: Reaped child process 19986: Terminated by INT.
Parent: Reaped child process 19984: EXIT_SUCCESS.
Parent: Reaped child process 19987: Terminated by INT.
Parent: Reaped child process 19988: Terminated by INT.
Parent: Reaped child process 19989: Terminated by INT.
Parent: Reaped child process 19990: Terminated by INT.
Parent: Reaped child process 19991: Terminated by INT.
Parent: Reaped child process 19992: Terminated by INT.
Parent: Reaped child process 19993: Terminated by INT.
Parent: Reaped child process 19994: Terminated by INT.
Parent: Reaped child process 19995: Terminated by INT.
Parent: Reaped child process 19996: Terminated by INT.
Parent: Reaped child process 19983: EXIT_SUCCESS.
Parent: Reaped child process 19997: Terminated by INT.
Parent: Reaped child process 19998: Terminated by INT.
Parent: Reaped child process 19999: Terminated by INT.
Parent: Reaped child process 20000: Terminated by INT.
Parent: Reaped child process 20001: Terminated by INT.
Parent: All 20 child processes were reaped successfully.

在 20 个 child 进程中,有 16 个在执行第一个 printf()(或 fflush(stdout))行之前被 INT 信号杀死。 (我们可以在 execvp() 行之前向 parent.c 添加一个 printf("Child: Child process %ld executing %s\n", (long)getpid(), argv[2]); fflush(stdout);,以查看是否有任何 child 进程在执行之前被杀死。)

在其余四个 child 进程(19982、19983、19984 和 19992)中,有一个(19992)在第一个 printf() 或 fflush() 之后、但在它来得及运行 sigprocmask()(阻塞信号,使 child 准备好捕获它)之前就被终止了。

只有剩下的三个 child 进程(19982、19983 和 19984)捕获了 parent 进程发送的 INT 信号。

如您所见,只需添加完整的错误检查,并添加足够的输出(和 fflush(stdout); 在有用的地方,因为默认情况下标准输出是缓冲的),让您 运行 几个测试用例,并对正在发生的事情构建一个更好的整体图景。


The program I'm working on is a task control and when I run a command like: restart my_program; restart my_program I get an unstable behaviour. When I call restart, a SIGINT is sent, then a new fork() is called then another SIGINT is sent, just like the example above.

在那种情况下,您将在新分叉准备就绪之前发送信号,因此 default disposition of the signal(终止,对于 INT)定义了发生的情况。

这个潜在问题的解决方案各不相同。请注意,它是许多 init system 问题的核心。如果child(这里是my_program)co-operates很容易解决,但在所有其他情况下都很难。

一种简单的协作方法是让 child 在准备好执行操作时立即向其 parent 进程发送一个信号。为避免杀死尚未准备好接收此类通知的 parent 进程,可以使用默认被忽略的信号(例如 SIGWINCH)。

睡眠一段时间的选项,以便新的 child 进程有足够的时间准备好采取行动,这是一种常见但非常不可靠的缓解此问题的方法。 (具体而言,所需的持续时间取决于 child 进程优先级和机器的总体负载。)