madvise(DODUMP) 在与成功的 madvise(DONTDUMP) 相同的 ptr/size 上失败并返回 EINVAL
madvise(DODUMP) on the same ptr/size as a successful madvise(DONTDUMP) fails with EINVAL
在测试 MariaDB(10.3 分支)的 mysqld 时发现一个异常行为,以下是它在启动时所做的事情:
内存分配返回 ptr=0x7fffe1a00000,
请求大小为 bytes=2097152
在调用 madvise 系统调用之前,/proc/{pid}/smaps 中的条目是:
7fffe1a00000-7fffe1c00000 rw-s 00000000 00:0f 18481215 /SYSV00000000 (deleted)
Size: 2048 kB
KernelPageSize: 2048 kB
MMUPageSize: 2048 kB
Rss: 0 kB
Pss: 0 kB
Shared_Clean: 0 kB
Shared_Dirty: 0 kB
Private_Clean: 0 kB
Private_Dirty: 0 kB
Referenced: 0 kB
Anonymous: 0 kB
LazyFree: 0 kB
AnonHugePages: 0 kB
ShmemPmdMapped: 0 kB
Shared_Hugetlb: 0 kB
Private_Hugetlb: 0 kB
Swap: 0 kB
SwapPss: 0 kB
Locked: 0 kB
VmFlags: rd wr sh mr mw me ms de ht sd
调用之后:
madvise(ptr, bytes, MADV_DONTDUMP)
页面如预期获得了 dd
(“don't dump”,即不转储)标志:
7fffe1a00000-7fffe1c00000 rw-s 00000000 00:0f 18481215 /SYSV00000000 (deleted)
Size: 2048 kB
KernelPageSize: 2048 kB
MMUPageSize: 2048 kB
Rss: 0 kB
Pss: 0 kB
Shared_Clean: 0 kB
Shared_Dirty: 0 kB
Private_Clean: 0 kB
Private_Dirty: 0 kB
Referenced: 0 kB
Anonymous: 0 kB
LazyFree: 0 kB
AnonHugePages: 0 kB
ShmemPmdMapped: 0 kB
Shared_Hugetlb: 0 kB
Private_Hugetlb: 0 kB
Swap: 0 kB
SwapPss: 0 kB
Locked: 0 kB
VmFlags: rd wr sh mr mw me ms de ht dd sd
一段时间后,就在调用 madvise(ptr, m_size, MADV_DODUMP)
之前,映射仍然相同:
7fffe1a00000-7fffe1c00000 rw-s 00000000 00:0f 18481215 /SYSV00000000 (deleted)
Size: 2048 kB
KernelPageSize: 2048 kB
MMUPageSize: 2048 kB
Rss: 0 kB
Pss: 0 kB
Shared_Clean: 0 kB
Shared_Dirty: 0 kB
Private_Clean: 0 kB
Private_Dirty: 0 kB
Referenced: 0 kB
Anonymous: 0 kB
LazyFree: 0 kB
AnonHugePages: 0 kB
ShmemPmdMapped: 0 kB
Shared_Hugetlb: 0 kB
Private_Hugetlb: 0 kB
Swap: 0 kB
SwapPss: 0 kB
Locked: 0 kB
VmFlags: rd wr sh mr mw me ms de ht dd sd
接下来执行的代码是:
madvise(ptr, m_size, MADV_DODUMP)
GDB 显示使用了相同的值:
(gdb) p size
= 2097152
(gdb) p ptr
= (void *) 0x7fffe1a00000
madvise(ptr,size,MADV_DODUMP)
返回 -1,errno=EINVAL
,页面映射保持不变。
内核版本:
$ uname -a
Linux 4.18.9-300.fc29.x86_64 #1 SMP Thu Sep 20 02:32:53 UTC 2018 x86_64 x86_64 x86_64 GNU/Linux
为完整起见,以下是 strace -fe trace=%memory ... 的输出,
摘自同一程序(另一次执行)中从内存分配到出现 EINVAL
的部分:
[pid 6036] shmat(18874431, NULL, 0) = 0x7f6ebda00000
[pid 6036] madvise(0x7f6ebda00000, 2097152, MADV_DONTDUMP) = 0
[pid 6036] mmap(NULL, 2215936, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f6ebd7e3000
[pid 6036] brk(NULL) = 0x55caa0d76000
[pid 6036] brk(0x55caa0de7000) = 0x55caa0de7000
[pid 6036] brk(NULL) = 0x55caa0de7000
[pid 6036] brk(0x55caa0e38000) = 0x55caa0e38000
[pid 6036] brk(NULL) = 0x55caa0e38000
[pid 6036] brk(0x55caa0e8a000) = 0x55caa0e8a000
[pid 6036] mmap(NULL, 8392704, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_STACK, -1, 0) = 0x7f6ebcfe2000
[pid 6036] mprotect(0x7f6ebcfe3000, 8388608, PROT_READ|PROT_WRITE) = 0
strace: Process 6039 attached
[pid 6036] mmap(NULL, 8392704, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_STACK, -1, 0) = 0x7f6ebc7e1000
[pid 6036] mprotect(0x7f6ebc7e2000, 8388608, PROT_READ|PROT_WRITE) = 0
strace: Process 6040 attached
[pid 6036] mmap(NULL, 8392704, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_STACK, -1, 0) = 0x7f6ead3ff000
[pid 6036] mprotect(0x7f6ead400000, 8388608, PROT_READ|PROT_WRITE) = 0
strace: Process 6041 attached
[pid 6036] mmap(NULL, 8392704, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_STACK, -1, 0) = 0x7f6eacbfe000
[pid 6036] mprotect(0x7f6eacbff000, 8388608, PROT_READ|PROT_WRITE) = 0
strace: Process 6042 attached
[pid 6036] mmap(NULL, 8392704, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_STACK, -1, 0) = 0x7f6eac3fd000
[pid 6036] mprotect(0x7f6eac3fe000, 8388608, PROT_READ|PROT_WRITE) = 0
strace: Process 6043 attached
[pid 6036] mmap(NULL, 8392704, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_STACK, -1, 0) = 0x7f6eabbfc000
[pid 6036] mprotect(0x7f6eabbfd000, 8388608, PROT_READ|PROT_WRITE) = 0
strace: Process 6044 attached
[pid 6036] mmap(NULL, 8392704, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_STACK, -1, 0) = 0x7f6eab3fb000
[pid 6036] mprotect(0x7f6eab3fc000, 8388608, PROT_READ|PROT_WRITE) = 0
strace: Process 6045 attached
[pid 6036] mmap(NULL, 8392704, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_STACK, -1, 0) = 0x7f6eaabfa000
[pid 6036] mprotect(0x7f6eaabfb000, 8388608, PROT_READ|PROT_WRITE) = 0
strace: Process 6046 attached
[pid 6036] mmap(NULL, 8392704, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_STACK, -1, 0) = 0x7f6eaa3f9000
[pid 6036] mprotect(0x7f6eaa3fa000, 8388608, PROT_READ|PROT_WRITE) = 0
strace: Process 6047 attached
[pid 6036] mmap(NULL, 8392704, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_STACK, -1, 0) = 0x7f6ea9bf8000
[pid 6036] mprotect(0x7f6ea9bf9000, 8388608, PROT_READ|PROT_WRITE) = 0
strace: Process 6048 attached
[pid 6036] mmap(NULL, 8392704, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_STACK, -1, 0) = 0x7f6ea93f7000
[pid 6036] mprotect(0x7f6ea93f8000, 8388608, PROT_READ|PROT_WRITE) = 0
strace: Process 6049 attached
[pid 6049] mmap(NULL, 134217728, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_NORESERVE, -1, 0) = 0x7f6ea13f7000
[pid 6049] munmap(0x7f6ea13f7000, 46174208) = 0
[pid 6049] munmap(0x7f6ea8000000, 20934656) = 0
[pid 6049] mprotect(0x7f6ea4000000, 135168, PROT_READ|PROT_WRITE) = 0
[pid 6036] brk(NULL) = 0x55caa0e8a000
[pid 6036] brk(0x55caa0eab000) = 0x55caa0eab000
[pid 6036] mmap(NULL, 2117632, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f6ebc5dc000
[pid 6036] munmap(0x7f6ebd7e3000, 2215936) = 0
[pid 6036] brk(NULL) = 0x55caa0eab000
[pid 6036] brk(0x55caa10d5000) = 0x55caa10d5000
[pid 6036] brk(NULL) = 0x55caa10d5000
[pid 6036] brk(0x55caa1118000) = 0x55caa1118000
[pid 6036] brk(NULL) = 0x55caa1118000
[pid 6036] brk(0x55caa115c000) = 0x55caa115c000
[pid 6036] madvise(0x7f6ebda00000, 2097152, MADV_DODUMP) = -1 EINVAL (Invalid argument)
madvise(MADV_DODUMP)
为什么会返回 EINVAL?有什么线索吗?
代码为:mariadb-10.3 分支
VmFlags 中的 de
指的是 VM_DONTEXPAND
,对于带有该标志的映射,内核会明确拒绝 MADV_DODUMP
(VM_DONTEXPAND 属于 VM_SPECIAL 标志集):
#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP)
…
case MADV_DODUMP:
if (new_flags & VM_SPECIAL) {
error = -EINVAL;
goto out;
}
new_flags &= ~VM_DONTDUMP;
break;
此检查自 2012 年的提交 0103bd16fb90bc741c7a03fd1ea4e8a505abad23(“mm: prepare VM_DONTDUMP for using in drivers”,
即“mm:为在驱动程序中使用 VM_DONTDUMP 做准备”)以来一直存在。
此映射可能来自 hugetlbfs(fs/hugetlbfs/inode.c
中的 hugetlbfs_file_mmap
),因为还设置了 ht
位。
在测试 MariaDB(10.3 分支)的 mysqld 时发现一个异常行为,以下是它在启动时所做的事情:
内存分配返回 ptr=0x7fffe1a00000,
请求大小为 bytes=2097152
在调用 madvise 系统调用之前,/proc/{pid}/smaps 中的条目是:
7fffe1a00000-7fffe1c00000 rw-s 00000000 00:0f 18481215 /SYSV00000000 (deleted)
Size: 2048 kB
KernelPageSize: 2048 kB
MMUPageSize: 2048 kB
Rss: 0 kB
Pss: 0 kB
Shared_Clean: 0 kB
Shared_Dirty: 0 kB
Private_Clean: 0 kB
Private_Dirty: 0 kB
Referenced: 0 kB
Anonymous: 0 kB
LazyFree: 0 kB
AnonHugePages: 0 kB
ShmemPmdMapped: 0 kB
Shared_Hugetlb: 0 kB
Private_Hugetlb: 0 kB
Swap: 0 kB
SwapPss: 0 kB
Locked: 0 kB
VmFlags: rd wr sh mr mw me ms de ht sd
调用之后:
madvise(ptr, bytes, MADV_DONTDUMP)
页面如预期获得了 dd
(“don't dump”,即不转储)标志:
7fffe1a00000-7fffe1c00000 rw-s 00000000 00:0f 18481215 /SYSV00000000 (deleted)
Size: 2048 kB
KernelPageSize: 2048 kB
MMUPageSize: 2048 kB
Rss: 0 kB
Pss: 0 kB
Shared_Clean: 0 kB
Shared_Dirty: 0 kB
Private_Clean: 0 kB
Private_Dirty: 0 kB
Referenced: 0 kB
Anonymous: 0 kB
LazyFree: 0 kB
AnonHugePages: 0 kB
ShmemPmdMapped: 0 kB
Shared_Hugetlb: 0 kB
Private_Hugetlb: 0 kB
Swap: 0 kB
SwapPss: 0 kB
Locked: 0 kB
VmFlags: rd wr sh mr mw me ms de ht dd sd
一段时间后,就在调用 madvise(ptr, m_size, MADV_DODUMP)
之前,映射仍然相同:
7fffe1a00000-7fffe1c00000 rw-s 00000000 00:0f 18481215 /SYSV00000000 (deleted)
Size: 2048 kB
KernelPageSize: 2048 kB
MMUPageSize: 2048 kB
Rss: 0 kB
Pss: 0 kB
Shared_Clean: 0 kB
Shared_Dirty: 0 kB
Private_Clean: 0 kB
Private_Dirty: 0 kB
Referenced: 0 kB
Anonymous: 0 kB
LazyFree: 0 kB
AnonHugePages: 0 kB
ShmemPmdMapped: 0 kB
Shared_Hugetlb: 0 kB
Private_Hugetlb: 0 kB
Swap: 0 kB
SwapPss: 0 kB
Locked: 0 kB
VmFlags: rd wr sh mr mw me ms de ht dd sd
接下来执行的代码是:
madvise(ptr, m_size, MADV_DODUMP)
GDB 显示使用了相同的值:
(gdb) p size
= 2097152
(gdb) p ptr
= (void *) 0x7fffe1a00000
madvise(ptr,size,MADV_DODUMP)
返回 -1,errno=EINVAL
,页面映射保持不变。
内核版本:
$ uname -a
Linux 4.18.9-300.fc29.x86_64 #1 SMP Thu Sep 20 02:32:53 UTC 2018 x86_64 x86_64 x86_64 GNU/Linux
为完整起见,以下是 strace -fe trace=%memory ... 的输出,
摘自同一程序(另一次执行)中从内存分配到出现 EINVAL
的部分:
[pid 6036] shmat(18874431, NULL, 0) = 0x7f6ebda00000
[pid 6036] madvise(0x7f6ebda00000, 2097152, MADV_DONTDUMP) = 0
[pid 6036] mmap(NULL, 2215936, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f6ebd7e3000
[pid 6036] brk(NULL) = 0x55caa0d76000
[pid 6036] brk(0x55caa0de7000) = 0x55caa0de7000
[pid 6036] brk(NULL) = 0x55caa0de7000
[pid 6036] brk(0x55caa0e38000) = 0x55caa0e38000
[pid 6036] brk(NULL) = 0x55caa0e38000
[pid 6036] brk(0x55caa0e8a000) = 0x55caa0e8a000
[pid 6036] mmap(NULL, 8392704, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_STACK, -1, 0) = 0x7f6ebcfe2000
[pid 6036] mprotect(0x7f6ebcfe3000, 8388608, PROT_READ|PROT_WRITE) = 0
strace: Process 6039 attached
[pid 6036] mmap(NULL, 8392704, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_STACK, -1, 0) = 0x7f6ebc7e1000
[pid 6036] mprotect(0x7f6ebc7e2000, 8388608, PROT_READ|PROT_WRITE) = 0
strace: Process 6040 attached
[pid 6036] mmap(NULL, 8392704, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_STACK, -1, 0) = 0x7f6ead3ff000
[pid 6036] mprotect(0x7f6ead400000, 8388608, PROT_READ|PROT_WRITE) = 0
strace: Process 6041 attached
[pid 6036] mmap(NULL, 8392704, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_STACK, -1, 0) = 0x7f6eacbfe000
[pid 6036] mprotect(0x7f6eacbff000, 8388608, PROT_READ|PROT_WRITE) = 0
strace: Process 6042 attached
[pid 6036] mmap(NULL, 8392704, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_STACK, -1, 0) = 0x7f6eac3fd000
[pid 6036] mprotect(0x7f6eac3fe000, 8388608, PROT_READ|PROT_WRITE) = 0
strace: Process 6043 attached
[pid 6036] mmap(NULL, 8392704, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_STACK, -1, 0) = 0x7f6eabbfc000
[pid 6036] mprotect(0x7f6eabbfd000, 8388608, PROT_READ|PROT_WRITE) = 0
strace: Process 6044 attached
[pid 6036] mmap(NULL, 8392704, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_STACK, -1, 0) = 0x7f6eab3fb000
[pid 6036] mprotect(0x7f6eab3fc000, 8388608, PROT_READ|PROT_WRITE) = 0
strace: Process 6045 attached
[pid 6036] mmap(NULL, 8392704, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_STACK, -1, 0) = 0x7f6eaabfa000
[pid 6036] mprotect(0x7f6eaabfb000, 8388608, PROT_READ|PROT_WRITE) = 0
strace: Process 6046 attached
[pid 6036] mmap(NULL, 8392704, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_STACK, -1, 0) = 0x7f6eaa3f9000
[pid 6036] mprotect(0x7f6eaa3fa000, 8388608, PROT_READ|PROT_WRITE) = 0
strace: Process 6047 attached
[pid 6036] mmap(NULL, 8392704, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_STACK, -1, 0) = 0x7f6ea9bf8000
[pid 6036] mprotect(0x7f6ea9bf9000, 8388608, PROT_READ|PROT_WRITE) = 0
strace: Process 6048 attached
[pid 6036] mmap(NULL, 8392704, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_STACK, -1, 0) = 0x7f6ea93f7000
[pid 6036] mprotect(0x7f6ea93f8000, 8388608, PROT_READ|PROT_WRITE) = 0
strace: Process 6049 attached
[pid 6049] mmap(NULL, 134217728, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_NORESERVE, -1, 0) = 0x7f6ea13f7000
[pid 6049] munmap(0x7f6ea13f7000, 46174208) = 0
[pid 6049] munmap(0x7f6ea8000000, 20934656) = 0
[pid 6049] mprotect(0x7f6ea4000000, 135168, PROT_READ|PROT_WRITE) = 0
[pid 6036] brk(NULL) = 0x55caa0e8a000
[pid 6036] brk(0x55caa0eab000) = 0x55caa0eab000
[pid 6036] mmap(NULL, 2117632, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f6ebc5dc000
[pid 6036] munmap(0x7f6ebd7e3000, 2215936) = 0
[pid 6036] brk(NULL) = 0x55caa0eab000
[pid 6036] brk(0x55caa10d5000) = 0x55caa10d5000
[pid 6036] brk(NULL) = 0x55caa10d5000
[pid 6036] brk(0x55caa1118000) = 0x55caa1118000
[pid 6036] brk(NULL) = 0x55caa1118000
[pid 6036] brk(0x55caa115c000) = 0x55caa115c000
[pid 6036] madvise(0x7f6ebda00000, 2097152, MADV_DODUMP) = -1 EINVAL (Invalid argument)
madvise(MADV_DODUMP)
为什么会返回 EINVAL?有什么线索吗?
代码为:mariadb-10.3 分支
VmFlags 中的 de
指的是 VM_DONTEXPAND
,对于带有该标志的映射,内核会明确拒绝 MADV_DODUMP
(VM_DONTEXPAND 属于 VM_SPECIAL 标志集):
#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP)
…
case MADV_DODUMP:
if (new_flags & VM_SPECIAL) {
error = -EINVAL;
goto out;
}
new_flags &= ~VM_DONTDUMP;
break;
此检查自 2012 年的提交 0103bd16fb90bc741c7a03fd1ea4e8a505abad23(“mm: prepare VM_DONTDUMP for using in drivers”,
即“mm:为在驱动程序中使用 VM_DONTDUMP 做准备”)以来一直存在。
此映射可能来自 hugetlbfs(fs/hugetlbfs/inode.c
中的 hugetlbfs_file_mmap
),因为还设置了 ht
位。