如何修复分段错误?
How to fix the segmentation fault?
(编辑:我刚刚修复了 getpid
缓存问题并重新运行 gdb
和 valgrind
。)
(编辑:我只是将子堆栈的大小从 200
字节增加到 2000
字节。)
我写了下面的程序来学习如何在 linux
x86-64
机器上使用 clone
和 CLONE_VM | CLONE_VFORK | CLONE_PARENT
:
// test.c
#define _GNU_SOURCE
#include <stdio.h>
#include <assert.h>
#include <syscall.h> // For syscall to call getpid
#include <signal.h> // For SIGCHILD
#include <sys/types.h>// For getppid
#include <unistd.h> // For getppid and sleep
#include <sched.h> // For clone
#include <stdlib.h> // For calloc and free
#define STACK_SIZE 2000
void Puts(const char *str)
{
assert(fputs(str, stderr) != EOF);
}
void Sleep(unsigned int sec)
{
do {
sec = sleep(sec);
} while(sec > 0);
}
int child(void *useless)
{
Puts("The new process is created.\n");
assert(fprintf(stderr, "pid = %d, ppid = %d\n", (pid_t) syscall(SYS_getpid), getppid()) > 0);
Puts("sleep for 120 secs\n");
Sleep(120);
return 0;
}
int main(int argc, char* argv[])
{
Puts("Allocate stack for new process\n");
void *stack = calloc(STACK_SIZE, sizeof(char));
void *stack_top = (void*) ((char*) stack + STACK_SIZE - 1);
assert(fprintf(stderr, "stack = %p, stack top = %p\n", stack, stack_top) > 0);
Puts("clone\n");
int ret = clone(child, stack_top, CLONE_VM | CLONE_VFORK | CLONE_PARENT | SIGCHLD, NULL);
Puts("clone returns\n");
Puts("Free the stack\n");
free(stack);
if (ret == -1)
perror("clone(child, stack, CLONE_VM | CLONE_VFORK, NULL)");
else {
ret = 0;
Puts("Child dies...\n");
}
return ret;
}
我在 bash
中使用 clang-7 test.c
和 运行 编译程序 ./a.out
。它立即返回以下输出:
Allocate stack for new process
stack = 0x492260, stack top = 0x492a2f
clone
The new process is created.
Segmentation fault
它 returns 139
意味着信号 SIGSEGV
被发送到我的进程。
然后我用-g
重新编译它并用valgrind --trace-children=yes ./a.out
调试它:
|| ==14494== Memcheck, a memory error detector
|| ==14494== Copyright (C) 2002-2015, and GNU GPL'd, by Julian Seward et al.
|| ==14494== Using Valgrind-3.12.0.SVN and LibVEX; rerun with -h for copyright info
|| ==14494== Command: ./a.out
|| ==14494==
|| Allocate stack for new process
|| stack = 0x51f3040, stack top = 0x51f380f
|| clone
|| clone returns
|| Free the stack
|| Child dies...
|| ==14495== Invalid write of size 4
|| ==14495== at 0x201322: ??? (in /home/nobodyxu/a.out)
|| ==14495== by 0x4F2FCBE: clone (clone.S:95)
|| ==14495== Address 0xffffffffffffffdc is not stack'd, malloc'd or (recently) free'd
|| ==14495==
|| ==14495==
|| ==14495== Process terminating with default action of signal 11 (SIGSEGV)
|| ==14495== Access not within mapped region at address 0xFFFFFFFFFFFFFFDC
|| ==14495== at 0x201322: ??? (in /home/nobodyxu/a.out)
|| ==14495== by 0x4F2FCBE: clone (clone.S:95)
|| ==14495== If you believe this happened as a result of a stack
|| ==14495== overflow in your program's main thread (unlikely but
|| ==14495== possible), you can try to increase the size of the
|| ==14495== main thread stack using the --main-stacksize= flag.
|| ==14495== The main thread stack size used in this run was 8388608.
|| ==14495==
|| ==14495== HEAP SUMMARY:
|| ==14495== in use at exit: 2,000 bytes in 1 blocks
|| ==14495== total heap usage: 1 allocs, 0 frees, 2,000 bytes allocated
|| ==14495==
|| ==14495== LEAK SUMMARY:
|| ==14495== definitely lost: 0 bytes in 0 blocks
|| ==14495== indirectly lost: 0 bytes in 0 blocks
|| ==14495== possibly lost: 0 bytes in 0 blocks
|| ==14495== still reachable: 2,000 bytes in 1 blocks
|| ==14495== suppressed: 0 bytes in 0 blocks
|| ==14495== Rerun with --leak-check=full to see details of leaked memory
|| ==14495==
|| ==14495== For counts of detected and suppressed errors, rerun with: -v
|| ==14495== ERROR SUMMARY: 1 errors from 1 contexts (suppressed: 0 from 0)
|| ==14494==
|| ==14494== HEAP SUMMARY:
|| ==14494== in use at exit: 0 bytes in 0 blocks
|| ==14494== total heap usage: 1 allocs, 1 frees, 2,000 bytes allocated
|| ==14494==
|| ==14494== All heap blocks were freed -- no leaks are possible
|| ==14494==
|| ==14494== For counts of detected and suppressed errors, rerun with: -v
|| ==14494== ERROR SUMMARY: 0 errors from 0 contexts (suppressed: 0 from 0)
它也立即返回并打印了这些。
我检查了 0x201322
的生成程序集,发现它属于 int main(int argc, char* argv[])
:
|| 20131d: e8 8e 01 00 00 callq 2014b0 <clone@plt>
|| 201322: 89 45 dc mov %eax,-0x24(%rbp)
|| 201325: 48 bf 54 09 20 00 00 movabs [=13=]x200954,%rdi
|| 20132c: 00 00 00
|| 20132f: e8 dc fd ff ff callq 201110 <Puts>
|| 201334: 48 bf ad 08 20 00 00 movabs [=13=]x2008ad,%rdi
|| 20133b: 00 00 00
我也试过在gdb
中使用set follow-fork-mode child
来调试它,但是这不起作用。
如何修复分段错误?
函数 printf 和 fprintf 在没有各种保护的情况下似乎不是线程安全的 rails。这在 .
中有详细说明
我通过记下最后一次打印发生的位置的蛮力方法发现了问题,然后注释掉之后的行,直到错误消失。
我用gdb调试你的程序。报错信息如下。
你为子函数申请的stack可能在子函数真正执行fprintf之前就已经释放了
在子函数中,在assert后面加上fflush(stdout);
可能会解决你的问题
Continuing.
Allocate stack for new process
stack = 0x602010, stack top = 0x6027df
clone
The new process is created.
sleep for 20 secs
clone returns
Free the stack
*** Error in `test': double free or corruption (out): 0x0000000000602010 ***
======= Backtrace: =========
/lib/x86_64-linux-gnu/libc.so.6(+0x777e5)[0x7ffff7a847e5]
/lib/x86_64-linux-gnu/libc.so.6(+0x8037a)[0x7ffff7a8d37a]
/lib/x86_64-linux-gnu/libc.so.6(cfree+0x4c)[0x7ffff7a9153c]
/***/***/tmp/test[0x400969]
/lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xf0)[0x7ffff7a2d830]
/***/***/tmp/test[0x400729]
======= Memory map: ========
00400000-00401000 r-xp 00000000 08:21 12848672 /***/***/tmp/test
00600000-00601000 r--p 00000000 08:21 12848672 /***/***/tmp/test
00601000-00602000 rw-p 00001000 08:21 12848672 /***/***/tmp/test
00602000-00623000 rw-p 00000000 00:00 0 [heap]
7ffff0000000-7ffff0021000 rw-p 00000000 00:00 0
7ffff0021000-7ffff4000000 ---p 00000000 00:00 0
7ffff77f7000-7ffff780d000 r-xp 00000000 08:01 786957 /lib/x86_64-linux-gnu/libgcc_s.so.1
7ffff780d000-7ffff7a0c000 ---p 00016000 08:01 786957 /lib/x86_64-linux-gnu/libgcc_s.so.1
7ffff7a0c000-7ffff7a0d000 rw-p 00015000 08:01 786957 /lib/x86_64-linux-gnu/libgcc_s.so.1
7ffff7a0d000-7ffff7bcd000 r-xp 00000000 08:01 791529 /lib/x86_64-linux-gnu/libc-2.23.so
7ffff7bcd000-7ffff7dcd000 ---p 001c0000 08:01 791529 /lib/x86_64-linux-gnu/libc-2.23.so
7ffff7dcd000-7ffff7dd1000 r--p 001c0000 08:01 791529 /lib/x86_64-linux-gnu/libc-2.23.so
7ffff7dd1000-7ffff7dd3000 rw-p 001c4000 08:01 791529 /lib/x86_64-linux-gnu/libc-2.23.so
7ffff7dd3000-7ffff7dd7000 rw-p 00000000 00:00 0
7ffff7dd7000-7ffff7dfd000 r-xp 00000000 08:01 791311 /lib/x86_64-linux-gnu/ld-2.23.so
7ffff7fd3000-7ffff7fd6000 rw-p 00000000 00:00 0
7ffff7ff7000-7ffff7ff8000 rw-p 00000000 00:00 0
7ffff7ff8000-7ffff7ffa000 r--p 00000000 00:00 0 [vvar]
7ffff7ffa000-7ffff7ffc000 r-xp 00000000 00:00 0 [vdso]
7ffff7ffc000-7ffff7ffd000 r--p 00025000 08:01 791311 /lib/x86_64-linux-gnu/ld-2.23.so
7ffff7ffd000-7ffff7ffe000 rw-p 00026000 08:01 791311 /lib/x86_64-linux-gnu/ld-2.23.so
7ffff7ffe000-7ffff7fff000 rw-p 00000000 00:00 0
7ffffffde000-7ffffffff000 rw-p 00000000 00:00 0 [stack]
ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]
Program received signal SIGSEGV, Segmentation fault.
__GI_abort () at abort.c:125
125 abort.c: No such file or directory.
这个段错误可能是 glibc 特有的。我用 musl libc 构建了这个代码片段,它工作正常。这似乎与 fprintf
的线程安全无关,因为 clone
是通过 CLONE_VFORK
传递的,它会暂停父进程。
(编辑:我刚刚修复了 getpid
缓存问题并重新运行 gdb
和 valgrind
。)
(编辑:我只是将子堆栈的大小从 200
字节增加到 2000
字节。)
我写了下面的程序来学习如何在 linux
x86-64
机器上使用 clone
和 CLONE_VM | CLONE_VFORK | CLONE_PARENT
:
// test.c
#define _GNU_SOURCE
#include <stdio.h>
#include <assert.h>
#include <syscall.h> // For syscall to call getpid
#include <signal.h> // For SIGCHILD
#include <sys/types.h>// For getppid
#include <unistd.h> // For getppid and sleep
#include <sched.h> // For clone
#include <stdlib.h> // For calloc and free
#define STACK_SIZE 2000
void Puts(const char *str)
{
assert(fputs(str, stderr) != EOF);
}
void Sleep(unsigned int sec)
{
do {
sec = sleep(sec);
} while(sec > 0);
}
int child(void *useless)
{
Puts("The new process is created.\n");
assert(fprintf(stderr, "pid = %d, ppid = %d\n", (pid_t) syscall(SYS_getpid), getppid()) > 0);
Puts("sleep for 120 secs\n");
Sleep(120);
return 0;
}
int main(int argc, char* argv[])
{
Puts("Allocate stack for new process\n");
void *stack = calloc(STACK_SIZE, sizeof(char));
void *stack_top = (void*) ((char*) stack + STACK_SIZE - 1);
assert(fprintf(stderr, "stack = %p, stack top = %p\n", stack, stack_top) > 0);
Puts("clone\n");
int ret = clone(child, stack_top, CLONE_VM | CLONE_VFORK | CLONE_PARENT | SIGCHLD, NULL);
Puts("clone returns\n");
Puts("Free the stack\n");
free(stack);
if (ret == -1)
perror("clone(child, stack, CLONE_VM | CLONE_VFORK, NULL)");
else {
ret = 0;
Puts("Child dies...\n");
}
return ret;
}
我在 bash
中使用 clang-7 test.c
和 运行 编译程序 ./a.out
。它立即返回以下输出:
Allocate stack for new process
stack = 0x492260, stack top = 0x492a2f
clone
The new process is created.
Segmentation fault
它 returns 139
意味着信号 SIGSEGV
被发送到我的进程。
然后我用-g
重新编译它并用valgrind --trace-children=yes ./a.out
调试它:
|| ==14494== Memcheck, a memory error detector
|| ==14494== Copyright (C) 2002-2015, and GNU GPL'd, by Julian Seward et al.
|| ==14494== Using Valgrind-3.12.0.SVN and LibVEX; rerun with -h for copyright info
|| ==14494== Command: ./a.out
|| ==14494==
|| Allocate stack for new process
|| stack = 0x51f3040, stack top = 0x51f380f
|| clone
|| clone returns
|| Free the stack
|| Child dies...
|| ==14495== Invalid write of size 4
|| ==14495== at 0x201322: ??? (in /home/nobodyxu/a.out)
|| ==14495== by 0x4F2FCBE: clone (clone.S:95)
|| ==14495== Address 0xffffffffffffffdc is not stack'd, malloc'd or (recently) free'd
|| ==14495==
|| ==14495==
|| ==14495== Process terminating with default action of signal 11 (SIGSEGV)
|| ==14495== Access not within mapped region at address 0xFFFFFFFFFFFFFFDC
|| ==14495== at 0x201322: ??? (in /home/nobodyxu/a.out)
|| ==14495== by 0x4F2FCBE: clone (clone.S:95)
|| ==14495== If you believe this happened as a result of a stack
|| ==14495== overflow in your program's main thread (unlikely but
|| ==14495== possible), you can try to increase the size of the
|| ==14495== main thread stack using the --main-stacksize= flag.
|| ==14495== The main thread stack size used in this run was 8388608.
|| ==14495==
|| ==14495== HEAP SUMMARY:
|| ==14495== in use at exit: 2,000 bytes in 1 blocks
|| ==14495== total heap usage: 1 allocs, 0 frees, 2,000 bytes allocated
|| ==14495==
|| ==14495== LEAK SUMMARY:
|| ==14495== definitely lost: 0 bytes in 0 blocks
|| ==14495== indirectly lost: 0 bytes in 0 blocks
|| ==14495== possibly lost: 0 bytes in 0 blocks
|| ==14495== still reachable: 2,000 bytes in 1 blocks
|| ==14495== suppressed: 0 bytes in 0 blocks
|| ==14495== Rerun with --leak-check=full to see details of leaked memory
|| ==14495==
|| ==14495== For counts of detected and suppressed errors, rerun with: -v
|| ==14495== ERROR SUMMARY: 1 errors from 1 contexts (suppressed: 0 from 0)
|| ==14494==
|| ==14494== HEAP SUMMARY:
|| ==14494== in use at exit: 0 bytes in 0 blocks
|| ==14494== total heap usage: 1 allocs, 1 frees, 2,000 bytes allocated
|| ==14494==
|| ==14494== All heap blocks were freed -- no leaks are possible
|| ==14494==
|| ==14494== For counts of detected and suppressed errors, rerun with: -v
|| ==14494== ERROR SUMMARY: 0 errors from 0 contexts (suppressed: 0 from 0)
它也立即返回并打印了这些。
我检查了 0x201322
的生成程序集,发现它属于 int main(int argc, char* argv[])
:
|| 20131d: e8 8e 01 00 00 callq 2014b0 <clone@plt>
|| 201322: 89 45 dc mov %eax,-0x24(%rbp)
|| 201325: 48 bf 54 09 20 00 00 movabs [=13=]x200954,%rdi
|| 20132c: 00 00 00
|| 20132f: e8 dc fd ff ff callq 201110 <Puts>
|| 201334: 48 bf ad 08 20 00 00 movabs [=13=]x2008ad,%rdi
|| 20133b: 00 00 00
我也试过在gdb
中使用set follow-fork-mode child
来调试它,但是这不起作用。
如何修复分段错误?
函数 printf 和 fprintf 在没有各种保护的情况下似乎不是线程安全的 rails。这在
我通过记下最后一次打印发生的位置的蛮力方法发现了问题,然后注释掉之后的行,直到错误消失。
我用gdb调试你的程序。报错信息如下。
你为子函数申请的stack可能在子函数真正执行fprintf之前就已经释放了
在子函数中,在assert后面加上fflush(stdout);
可能会解决你的问题
Continuing.
Allocate stack for new process
stack = 0x602010, stack top = 0x6027df
clone
The new process is created.
sleep for 20 secs
clone returns
Free the stack
*** Error in `test': double free or corruption (out): 0x0000000000602010 ***
======= Backtrace: =========
/lib/x86_64-linux-gnu/libc.so.6(+0x777e5)[0x7ffff7a847e5]
/lib/x86_64-linux-gnu/libc.so.6(+0x8037a)[0x7ffff7a8d37a]
/lib/x86_64-linux-gnu/libc.so.6(cfree+0x4c)[0x7ffff7a9153c]
/***/***/tmp/test[0x400969]
/lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xf0)[0x7ffff7a2d830]
/***/***/tmp/test[0x400729]
======= Memory map: ========
00400000-00401000 r-xp 00000000 08:21 12848672 /***/***/tmp/test
00600000-00601000 r--p 00000000 08:21 12848672 /***/***/tmp/test
00601000-00602000 rw-p 00001000 08:21 12848672 /***/***/tmp/test
00602000-00623000 rw-p 00000000 00:00 0 [heap]
7ffff0000000-7ffff0021000 rw-p 00000000 00:00 0
7ffff0021000-7ffff4000000 ---p 00000000 00:00 0
7ffff77f7000-7ffff780d000 r-xp 00000000 08:01 786957 /lib/x86_64-linux-gnu/libgcc_s.so.1
7ffff780d000-7ffff7a0c000 ---p 00016000 08:01 786957 /lib/x86_64-linux-gnu/libgcc_s.so.1
7ffff7a0c000-7ffff7a0d000 rw-p 00015000 08:01 786957 /lib/x86_64-linux-gnu/libgcc_s.so.1
7ffff7a0d000-7ffff7bcd000 r-xp 00000000 08:01 791529 /lib/x86_64-linux-gnu/libc-2.23.so
7ffff7bcd000-7ffff7dcd000 ---p 001c0000 08:01 791529 /lib/x86_64-linux-gnu/libc-2.23.so
7ffff7dcd000-7ffff7dd1000 r--p 001c0000 08:01 791529 /lib/x86_64-linux-gnu/libc-2.23.so
7ffff7dd1000-7ffff7dd3000 rw-p 001c4000 08:01 791529 /lib/x86_64-linux-gnu/libc-2.23.so
7ffff7dd3000-7ffff7dd7000 rw-p 00000000 00:00 0
7ffff7dd7000-7ffff7dfd000 r-xp 00000000 08:01 791311 /lib/x86_64-linux-gnu/ld-2.23.so
7ffff7fd3000-7ffff7fd6000 rw-p 00000000 00:00 0
7ffff7ff7000-7ffff7ff8000 rw-p 00000000 00:00 0
7ffff7ff8000-7ffff7ffa000 r--p 00000000 00:00 0 [vvar]
7ffff7ffa000-7ffff7ffc000 r-xp 00000000 00:00 0 [vdso]
7ffff7ffc000-7ffff7ffd000 r--p 00025000 08:01 791311 /lib/x86_64-linux-gnu/ld-2.23.so
7ffff7ffd000-7ffff7ffe000 rw-p 00026000 08:01 791311 /lib/x86_64-linux-gnu/ld-2.23.so
7ffff7ffe000-7ffff7fff000 rw-p 00000000 00:00 0
7ffffffde000-7ffffffff000 rw-p 00000000 00:00 0 [stack]
ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]
Program received signal SIGSEGV, Segmentation fault.
__GI_abort () at abort.c:125
125 abort.c: No such file or directory.
这个段错误可能是 glibc 特有的。我用 musl libc 构建了这个代码片段,它工作正常。这似乎与 fprintf
的线程安全无关,因为 clone
是通过 CLONE_VFORK
传递的,它会暂停父进程。