如何在内核空间中调用系统调用?
How to call system call in kernel space?
我正在尝试在内核空间中触发系统调用。如果系统调用不带任何参数(例如 getpid()),它可以正常工作。
我的做法如下:
- 获取系统调用表(sys_call_table)的地址:
static void **syscall_table;
- 将它与你想要的系统调用号一起用作函数指针:
typedef long (*sys_call_ptr_t)(const struct __user pt_regs *);
// call system call
((sys_call_ptr_t *)syscall_table)[system_call_number](reg);
- 如果系统调用有参数,在调用之前将它们存储到 regs 中:
struct __user pt_regs *reg = kmalloc....;
reg->di = ...
reg->si = ...
目前,我正在尝试调用 write,但它失败了。
write(int fd, const void *buf, size_t count);
对于 buf,我已经尝试过用户空间地址和内核空间地址。count 应该不是问题。所以,我猜问题可能出在文件描述符上(也许 fd 在内核底层和用户空间中的含义不同)。作为基本测试,我只想把文本写到终端,所以 fd 应该是 1(至少在用户空间中是这样)。
这里有两个问题:
1. 由于某些原因,我必须坚持使用上述方式来调用系统调用。这种做法合理吗?还是我遗漏了某些步骤,才导致调用 write 失败?
2. 我调用 write 的方式是否有误?问题是否出在 fd 上?如果是,如何在内核中得到与用户空间的 fd 1 相对应的文件描述符?
前言
根据定义,系统调用是系统向用户空间应用程序提供的服务。在内核内部运行的代码不应该调用面向用户空间的服务,因此不建议这样做。
第一次尝试:使用内核空间缓冲区
write() 系统调用定义在 fs/read_write.c 中。它调用 ksys_write(),后者再调用 vfs_write():
/*
 * vfs_write() as quoted from the kernel's fs/read_write.c.
 *
 * Validates @file and the buffer @buf/@count, then hands the write to
 * __vfs_write(). Returns the value produced by __vfs_write(), or a negative
 * errno from one of the checks below.
 */
ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
{
ssize_t ret;
/* FMODE_WRITE not set in f_mode -> -EBADF. */
if (!(file->f_mode & FMODE_WRITE))
return -EBADF;
/* FMODE_CAN_WRITE not set in f_mode -> -EINVAL. */
if (!(file->f_mode & FMODE_CAN_WRITE))
return -EINVAL;
/*
 * This is the check that rejects a kernel-space buffer: when access_ok()
 * fails for @buf/@count the call ends here with -EFAULT.
 */
if (unlikely(!access_ok(buf, count)))
return -EFAULT;
/* Any non-zero result from rw_verify_area() is returned to the caller. */
ret = rw_verify_area(WRITE, file, pos, count);
if (!ret) {
/* Requests larger than MAX_RW_COUNT are clamped, not rejected. */
if (count > MAX_RW_COUNT)
count = MAX_RW_COUNT;
/* The actual write is bracketed by file_start_write()/file_end_write(). */
file_start_write(file);
ret = __vfs_write(file, buf, count, pos);
/* Only when a positive result is returned: notify and account the bytes. */
if (ret > 0) {
fsnotify_modify(file);
add_wchar(current, ret);
}
/* Called for every attempted write, whatever __vfs_write() returned. */
inc_syscw(current);
file_end_write(file);
}
return ret;
}
[...]
/*
 * ksys_write() as quoted from the kernel's fs/read_write.c.
 *
 * Resolves the descriptor @fd with fdget_pos() and forwards @buf/@count to
 * vfs_write(). Returns vfs_write()'s result, or -EBADF when fdget_pos()
 * yields no file for @fd.
 */
ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count)
{
struct fd f = fdget_pos(fd);
/* Default result, kept when no file is attached to @fd. */
ssize_t ret = -EBADF;
if (f.file) {
loff_t pos, *ppos = file_ppos(f.file);
/*
 * When the file has a position, work on the local copy 'pos' so that
 * f_pos is only updated after a successful write (see below).
 */
if (ppos) {
pos = *ppos;
ppos = &pos;
}
ret = vfs_write(f.file, buf, count, ppos);
/* Commit the position written by vfs_write() only when it did not fail. */
if (ret >= 0 && ppos)
f.file->f_pos = pos;
/* Pairs with fdget_pos() above. */
fdput_pos(f);
}
return ret;
}
/*
 * Definition of the three-argument write() system call (fd, buf, count) as
 * quoted from fs/read_write.c; it only forwards its arguments to ksys_write().
 */
SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
size_t, count)
{
return ksys_write(fd, buf, count);
}
作为第一个参数传递的文件描述符没有问题:传入的值在 ksys_write() 中被用来查找输出文件对应的 file 结构。但是第二个参数必须指向一块用户空间的内存区域。
vfs_write() 会对第二个参数进行检查:
if (unlikely(!access_ok(buf, count)))
return -EFAULT;
access_ok() 检查缓冲区是否位于用户空间。因此,如果你传入一个指向内核空间的地址,write() 的返回值将是 -EFAULT (-14)。
下面的示例是一个简单的模块,它使用内核空间缓冲区调用 write() 系统调用。在 x86_64 上,系统调用参数与寄存器的对应约定为:
RDI = arg#0
RSI = arg#1
RDX = arg#2
R10 = arg#3
R8 = arg#4
R9 = arg#5
#include <linux/version.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <asm/ptrace.h>
#include <linux/socket.h>
#include <linux/kallsyms.h>
MODULE_LICENSE("GPL");
/* Type used for the entries fetched from sys_call_table: each takes its
 * arguments through a struct pt_regs. */
typedef int (* syscall_wrapper)(struct pt_regs *);
/* Address returned by kallsyms_lookup_name("sys_call_table") in device2_init(). */
unsigned long sys_call_table_addr;
/* Prefix prepended to every message this module logs. */
#define DEV_NAME "[DEVICE2]"
/* Text that device2_init() formats into buf and passes to write(). */
#define DEV_STR DEV_NAME "String from driver"
/* Kernel-space buffer handed to the write() system call. */
static char buf[1024];
/*
 * Module entry point: shows that the write() system call rejects a
 * kernel-space buffer.
 *
 * Resolves sys_call_table, fetches its __NR_write entry and calls it with a
 * hand-built struct pt_regs (x86_64 convention: di = fd, si = buf,
 * dx = count). The result is only logged; the article shows -EFAULT (-14)
 * for the kernel buffer used here.
 *
 * Return: 0 once the call has been attempted, -ENOENT if sys_call_table
 * cannot be resolved.
 */
static int __init device2_init(void)
{
	syscall_wrapper write_syscall;
	/* Zeroed so that registers we do not set never carry stack garbage. */
	struct pt_regs param = { 0 };
	int rc;

	printk(KERN_INFO DEV_NAME "module has been loaded\n");

	sys_call_table_addr = kallsyms_lookup_name("sys_call_table");
	printk(KERN_INFO DEV_NAME "sys_call_table@%lx\n", sys_call_table_addr);
	/* kallsyms_lookup_name() returns 0 for an unknown symbol; indexing the
	 * table in that case would read from address 0. */
	if (!sys_call_table_addr)
		return -ENOENT;

	write_syscall = ((syscall_wrapper *)sys_call_table_addr)[__NR_write];

	/* Call to write() system call with a kernel space buffer */
	snprintf(buf, sizeof(buf), "%s\n", DEV_STR);
	param.di = 1;                   /* arg#0: fd (stdout of the loading process) */
	param.si = (unsigned long)buf;  /* arg#1: buffer, deliberately in kernel space */
	param.dx = strlen(buf);         /* arg#2: byte count */

	rc = (*write_syscall)(&param);
	printk(KERN_INFO DEV_NAME "write() with a kernel space buffer = %d\n", rc);

	return 0;
}
/* Module exit point: nothing to release, only logs that the module is gone. */
static void __exit device2_exit(void)
{
	printk(KERN_INFO DEV_NAME "module has been unloaded\n");
}
/* Register device2_init()/device2_exit() as the module's load and unload hooks. */
module_init(device2_init);
module_exit(device2_exit);
插入模块时,可以验证该系统调用返回 -EFAULT:
$ sudo insmod ./device2.ko
$ dmesg
[15716.262977] [DEVICE2]module has been loaded
[15716.270566] [DEVICE2]sys_call_table@ffffffff926013a0
[15716.270568] [DEVICE2]write() with a kernel space buffer = -14
但是,在同一个模块中调用像 dup() 这样只涉及文件描述符、不涉及用户空间缓冲区的系统调用是可行的。让我们把之前的代码改为:
/*
 * Module entry point: compares a system call that takes a buffer (write())
 * with ones that only take a file descriptor (dup(), close()).
 *
 * Every call goes through the sys_call_table entry with a hand-built
 * struct pt_regs (x86_64 convention: di = arg#0, si = arg#1, dx = arg#2).
 * Results are only logged. Note that the calls act on the file descriptors
 * of the process loading the module, so descriptor 0 of that process is
 * closed and then re-created by the second dup().
 *
 * Return: 0 once the calls have been attempted, -ENOENT if sys_call_table
 * cannot be resolved.
 */
static int __init device2_init(void)
{
	syscall_wrapper *table;
	syscall_wrapper write_syscall;
	syscall_wrapper dup_syscall;
	syscall_wrapper close_syscall;
	/* Zeroed so that registers we do not set never carry stack garbage. */
	struct pt_regs param = { 0 };
	int rc;

	printk(KERN_INFO DEV_NAME "module has been loaded\n");

	sys_call_table_addr = kallsyms_lookup_name("sys_call_table");
	printk(KERN_INFO DEV_NAME "sys_call_table@%lx\n", sys_call_table_addr);
	/* kallsyms_lookup_name() returns 0 for an unknown symbol; indexing the
	 * table in that case would read from address 0. */
	if (!sys_call_table_addr)
		return -ENOENT;

	table = (syscall_wrapper *)sys_call_table_addr;
	write_syscall = table[__NR_write];
	dup_syscall = table[__NR_dup];
	close_syscall = table[__NR_close];

	/* Call to write() system call with a kernel space buffer */
	snprintf(buf, sizeof(buf), "%s\n", DEV_STR);
	param.di = 1;
	param.si = (unsigned long)buf;
	param.dx = strlen(buf);
	rc = (*write_syscall)(&param);
	printk(KERN_INFO DEV_NAME "write() with a kernel space buffer = %d\n", rc);

	/* Call to dup() system call */
	param.di = 1;
	rc = (*dup_syscall)(&param);
	printk(KERN_INFO DEV_NAME "dup() = %d\n", rc);

	/* Call to close() system call */
	param.di = 0;
	rc = (*close_syscall)(&param);
	printk(KERN_INFO DEV_NAME "close() = %d\n", rc);

	/* Call to dup() system call ==> Must return 0 as it is available */
	param.di = 1;
	rc = (*dup_syscall)(&param);
	printk(KERN_INFO DEV_NAME "dup() = %d\n", rc);

	return 0;
}
dup()的结果正常:
$ sudo insmod ./device2.ko
$ dmesg
[17444.098469] [DEVICE2]module has been loaded
[17444.106935] [DEVICE2]sys_call_table@ffffffff926013a0
[17444.106937] [DEVICE2]write() with a kernel space buffer = -14
[17444.106939] [DEVICE2]dup() = 4
[17444.106940] [DEVICE2]close() = 0
[17444.106940] [DEVICE2]dup() = 0
第一次调用 dup() 返回 4,因为当前进程是 insmod:它打开了模块文件并得到文件描述符 3,所以第一个可用的文件描述符是 4。第二次调用 dup() 返回 0,因为我们刚刚关闭了文件描述符 0。
第二次尝试:使用用户空间缓冲区
要使用用户空间缓冲区,让我们给内核模块添加一些文件操作(open()、release() 和 write())。在 write() 入口点中,我们直接把用户空间传给该入口点的缓冲区交给 write() 系统调用,从而把写入的内容回显到 stderr(文件描述符 2):
#include <linux/version.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <asm/ptrace.h>
#include <linux/socket.h>
#include <linux/kallsyms.h>
#include <linux/cdev.h>
MODULE_LICENSE("GPL");
/* Type used for the entries fetched from sys_call_table: each takes its
 * arguments through a struct pt_regs. */
typedef int (* syscall_wrapper)(struct pt_regs *);
/* Address returned by kallsyms_lookup_name("sys_call_table") in device2_init(). */
static unsigned long sys_call_table_addr;
/* Prefix prepended to every message this module logs. */
#define DEV_NAME "[DEVICE2]"
/* sys_call_table[__NR_write], resolved in device2_init() and used by device2_write(). */
static syscall_wrapper write_syscall;
static ssize_t device2_write(struct file *filp, const char *buff, size_t len, loff_t * off)
{
struct pt_regs param;
int rc;
printk(KERN_INFO DEV_NAME "write %p, %zu\n", buff, len);
/*
Call to write() system call to echo the write to stderr
*/
param.di = 2;
param.si = (unsigned long)buff;
param.dx = len;
rc = (* write_syscall)(¶m);
printk(KERN_INFO DEV_NAME "write() = %d\n", rc);
return len; // <-------------- To stop the write
}
/* open() entry point: logs the call and always reports success. */
static int device2_open(struct inode *inode, struct file *file)
{
	printk(KERN_INFO DEV_NAME "open\n");

	return 0;
}
/* release() entry point: logs the call and always reports success. */
static int device2_release(struct inode *inode, struct file *file)
{
	printk(KERN_INFO DEV_NAME "released\n");

	return 0;
}
/* File operations of the character device: open, write and release only. */
static const struct file_operations fops = {
	.owner   = THIS_MODULE,
	.open    = device2_open,
	.write   = device2_write,
	.release = device2_release,
};
/* Character device object obtained from cdev_alloc() in device2_init(). */
struct cdev *device_cdev;
/* Device number range filled in by alloc_chrdev_region() in device2_init(). */
dev_t deviceNumbers;
/*
 * Module entry point: registers the character device and resolves the
 * write() system call entry used by device2_write().
 *
 * The system call entry is resolved BEFORE cdev_add() makes the device
 * reachable, so device2_write() can never run with write_syscall unset.
 * Everything acquired so far is released again on failure.
 *
 * Return: 0 on success, or a negative errno (the one reported by
 * alloc_chrdev_region()/cdev_add(), -ENOENT if sys_call_table cannot be
 * resolved, -ENOMEM if the cdev cannot be allocated).
 */
static int __init device2_init(void)
{
	int rc;

	printk(KERN_INFO DEV_NAME "module has been loaded\n");

	// This returns the major number chosen dynamically in deviceNumbers
	rc = alloc_chrdev_region(&deviceNumbers, 0, 1, DEV_NAME);
	if (rc < 0) {
		printk(KERN_ALERT DEV_NAME "Error registering: %d\n", rc);
		return rc;
	}

	/* kallsyms_lookup_name() returns 0 for an unknown symbol. */
	sys_call_table_addr = kallsyms_lookup_name("sys_call_table");
	if (!sys_call_table_addr) {
		printk(KERN_ALERT DEV_NAME "sys_call_table not found\n");
		rc = -ENOENT;
		goto err_region;
	}
	write_syscall = ((syscall_wrapper *)sys_call_table_addr)[__NR_write];

	device_cdev = cdev_alloc();
	if (!device_cdev) {
		rc = -ENOMEM;
		goto err_region;
	}
	/*
	 * Set the fields directly instead of calling cdev_init(): cdev_init()
	 * re-initialises the embedded kobject and would drop the release method
	 * installed by cdev_alloc(), so the structure would never be freed.
	 */
	device_cdev->owner = THIS_MODULE;
	device_cdev->ops = &fops;

	/* The device is live as soon as cdev_add() succeeds; keep it last. */
	rc = cdev_add(device_cdev, deviceNumbers, 1);
	if (rc < 0) {
		printk(KERN_ALERT DEV_NAME "Error adding cdev: %d\n", rc);
		goto err_cdev;
	}

	printk(KERN_INFO DEV_NAME "initialized (major number is %d)\n", MAJOR(deviceNumbers));
	printk(KERN_INFO DEV_NAME "sys_call_table@%lx\n", sys_call_table_addr);
	printk(KERN_INFO DEV_NAME "write_syscall@%p\n", write_syscall);

	return 0;

err_cdev:
	/* Drops the reference taken by cdev_alloc() and frees the structure. */
	kobject_put(&device_cdev->kobj);
	device_cdev = NULL;
err_region:
	unregister_chrdev_region(deviceNumbers, 1);
	return rc;
}
/*
 * Module exit point: undoes the registrations made by device2_init().
 *
 * Without this the cdev would stay registered with file operations that
 * point into the unloaded module, and the device number range would never
 * be given back.
 */
static void __exit device2_exit(void)
{
	/* device_cdev is NULL when cdev_alloc() failed during init. */
	if (device_cdev)
		cdev_del(device_cdev);
	unregister_chrdev_region(deviceNumbers, 1);

	printk(KERN_INFO DEV_NAME "module has been unloaded\n");
}
/* Register device2_init()/device2_exit() as the module's load and unload hooks. */
module_init(device2_init);
module_exit(device2_exit);
模块的加载:
$ sudo insmod device2.ko
$ dmesg
[ 2255.183196] [DEVICE2]module has been loaded
[ 2255.183202] [DEVICE2]initialized (major number is 508)
[ 2255.193255] [DEVICE2]sys_call_table@ffffffffbcc013a0
[ 2255.193256] [DEVICE2]write_syscall@0000000030394929
在文件系统中创建设备节点,以便向其写入:
$ sudo mknod /dev/device2 c 508 0
$ sudo chmod 666 /dev/device2
$ sudo ls -l /dev/device2
crw-rw-rw- 1 root root 508, 0 janv. 24 16:55 /dev/device2
写入设备会触发 stderr 上的预期回显:
$ echo "qwerty for test purposes" > /dev/device2
qwerty for test purposes
$ echo "another string" > /dev/device2
another string
$ dmesg
[ 2255.183196] [DEVICE2]module has been loaded
[ 2255.183202] [DEVICE2]initialized (major number is 508)
[ 2255.193255] [DEVICE2]sys_call_table@ffffffffbcc013a0
[ 2255.193256] [DEVICE2]write_syscall@0000000030394929
[ 2441.674250] [DEVICE2]open
[ 2441.674268] [DEVICE2]write 0000000032fb5249, 25
[ 2441.674281] [DEVICE2]write() = 25
[ 2441.674286] [DEVICE2]released
[ 2475.538140] [DEVICE2]open
[ 2475.538159] [DEVICE2]write 0000000032fb5249, 15
[ 2475.538171] [DEVICE2]write() = 15
[ 2475.538175] [DEVICE2]released
我正在尝试在内核空间中触发系统调用。如果系统调用不带任何参数(例如 getpid()),它可以正常工作。
我的做法如下:
- 获取系统调用表(sys_call_table)的地址:
static void **syscall_table;
- 将它与你想要的系统调用号一起用作函数指针:
typedef long (*sys_call_ptr_t)(const struct __user pt_regs *);
// call system call
((sys_call_ptr_t *)syscall_table)[system_call_number](reg);
- 如果系统调用有参数,在调用之前将它们存储到 regs 中:
struct __user pt_regs *reg = kmalloc....;
reg->di = ...
reg->si = ...
目前,我正在尝试调用 write,但它失败了。
write(int fd, const void *buf, size_t count);
对于 buf,我已经尝试过用户空间地址和内核空间地址。count 应该不是问题。所以,我猜问题可能出在文件描述符上(也许 fd 在内核底层和用户空间中的含义不同)。作为基本测试,我只想把文本写到终端,所以 fd 应该是 1(至少在用户空间中是这样)。
这里有两个问题:
1. 由于某些原因,我必须坚持使用上述方式来调用系统调用。这种做法合理吗?还是我遗漏了某些步骤,才导致调用 write 失败?
2. 我调用 write 的方式是否有误?问题是否出在 fd 上?如果是,如何在内核中得到与用户空间的 fd 1 相对应的文件描述符?
前言
根据定义,系统调用是系统向用户空间应用程序提供的服务。在内核内部运行的代码不应该调用面向用户空间的服务,因此不建议这样做。
第一次尝试:使用内核空间缓冲区
write() 系统调用定义在 fs/read_write.c 中。它调用 ksys_write(),后者再调用 vfs_write():
/*
 * vfs_write() as quoted from the kernel's fs/read_write.c.
 *
 * Validates @file and the buffer @buf/@count, then hands the write to
 * __vfs_write(). Returns the value produced by __vfs_write(), or a negative
 * errno from one of the checks below.
 */
ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
{
ssize_t ret;
/* FMODE_WRITE not set in f_mode -> -EBADF. */
if (!(file->f_mode & FMODE_WRITE))
return -EBADF;
/* FMODE_CAN_WRITE not set in f_mode -> -EINVAL. */
if (!(file->f_mode & FMODE_CAN_WRITE))
return -EINVAL;
/*
 * This is the check that rejects a kernel-space buffer: when access_ok()
 * fails for @buf/@count the call ends here with -EFAULT.
 */
if (unlikely(!access_ok(buf, count)))
return -EFAULT;
/* Any non-zero result from rw_verify_area() is returned to the caller. */
ret = rw_verify_area(WRITE, file, pos, count);
if (!ret) {
/* Requests larger than MAX_RW_COUNT are clamped, not rejected. */
if (count > MAX_RW_COUNT)
count = MAX_RW_COUNT;
/* The actual write is bracketed by file_start_write()/file_end_write(). */
file_start_write(file);
ret = __vfs_write(file, buf, count, pos);
/* Only when a positive result is returned: notify and account the bytes. */
if (ret > 0) {
fsnotify_modify(file);
add_wchar(current, ret);
}
/* Called for every attempted write, whatever __vfs_write() returned. */
inc_syscw(current);
file_end_write(file);
}
return ret;
}
[...]
/*
 * ksys_write() as quoted from the kernel's fs/read_write.c.
 *
 * Resolves the descriptor @fd with fdget_pos() and forwards @buf/@count to
 * vfs_write(). Returns vfs_write()'s result, or -EBADF when fdget_pos()
 * yields no file for @fd.
 */
ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count)
{
struct fd f = fdget_pos(fd);
/* Default result, kept when no file is attached to @fd. */
ssize_t ret = -EBADF;
if (f.file) {
loff_t pos, *ppos = file_ppos(f.file);
/*
 * When the file has a position, work on the local copy 'pos' so that
 * f_pos is only updated after a successful write (see below).
 */
if (ppos) {
pos = *ppos;
ppos = &pos;
}
ret = vfs_write(f.file, buf, count, ppos);
/* Commit the position written by vfs_write() only when it did not fail. */
if (ret >= 0 && ppos)
f.file->f_pos = pos;
/* Pairs with fdget_pos() above. */
fdput_pos(f);
}
return ret;
}
/*
 * Definition of the three-argument write() system call (fd, buf, count) as
 * quoted from fs/read_write.c; it only forwards its arguments to ksys_write().
 */
SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
size_t, count)
{
return ksys_write(fd, buf, count);
}
作为第一个参数传递的文件描述符没有问题:传入的值在 ksys_write() 中被用来查找输出文件对应的 file 结构。但是第二个参数必须指向一块用户空间的内存区域。vfs_write() 会对第二个参数进行检查:
if (unlikely(!access_ok(buf, count)))
return -EFAULT;
access_ok() 检查缓冲区是否位于用户空间。因此,如果你传入一个指向内核空间的地址,write() 的返回值将是 -EFAULT (-14)。
下面的示例是一个简单的模块,它使用内核空间缓冲区调用 write() 系统调用。在 x86_64 上,系统调用参数与寄存器的对应约定为:
RDI = arg#0
RSI = arg#1
RDX = arg#2
R10 = arg#3
R8 = arg#4
R9 = arg#5
#include <linux/version.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <asm/ptrace.h>
#include <linux/socket.h>
#include <linux/kallsyms.h>
MODULE_LICENSE("GPL");
/* Type used for the entries fetched from sys_call_table: each takes its
 * arguments through a struct pt_regs. */
typedef int (* syscall_wrapper)(struct pt_regs *);
/* Address returned by kallsyms_lookup_name("sys_call_table") in device2_init(). */
unsigned long sys_call_table_addr;
/* Prefix prepended to every message this module logs. */
#define DEV_NAME "[DEVICE2]"
/* Text that device2_init() formats into buf and passes to write(). */
#define DEV_STR DEV_NAME "String from driver"
/* Kernel-space buffer handed to the write() system call. */
static char buf[1024];
/*
 * Module entry point: shows that the write() system call rejects a
 * kernel-space buffer.
 *
 * Resolves sys_call_table, fetches its __NR_write entry and calls it with a
 * hand-built struct pt_regs (x86_64 convention: di = fd, si = buf,
 * dx = count). The result is only logged; the article shows -EFAULT (-14)
 * for the kernel buffer used here.
 *
 * Return: 0 once the call has been attempted, -ENOENT if sys_call_table
 * cannot be resolved.
 */
static int __init device2_init(void)
{
	syscall_wrapper write_syscall;
	/* Zeroed so that registers we do not set never carry stack garbage. */
	struct pt_regs param = { 0 };
	int rc;

	printk(KERN_INFO DEV_NAME "module has been loaded\n");

	sys_call_table_addr = kallsyms_lookup_name("sys_call_table");
	printk(KERN_INFO DEV_NAME "sys_call_table@%lx\n", sys_call_table_addr);
	/* kallsyms_lookup_name() returns 0 for an unknown symbol; indexing the
	 * table in that case would read from address 0. */
	if (!sys_call_table_addr)
		return -ENOENT;

	write_syscall = ((syscall_wrapper *)sys_call_table_addr)[__NR_write];

	/* Call to write() system call with a kernel space buffer */
	snprintf(buf, sizeof(buf), "%s\n", DEV_STR);
	param.di = 1;                   /* arg#0: fd (stdout of the loading process) */
	param.si = (unsigned long)buf;  /* arg#1: buffer, deliberately in kernel space */
	param.dx = strlen(buf);         /* arg#2: byte count */

	rc = (*write_syscall)(&param);
	printk(KERN_INFO DEV_NAME "write() with a kernel space buffer = %d\n", rc);

	return 0;
}
/* Module exit point: nothing to release, only logs that the module is gone. */
static void __exit device2_exit(void)
{
	printk(KERN_INFO DEV_NAME "module has been unloaded\n");
}
/* Register device2_init()/device2_exit() as the module's load and unload hooks. */
module_init(device2_init);
module_exit(device2_exit);
插入模块时,可以验证该系统调用返回 -EFAULT:
$ sudo insmod ./device2.ko
$ dmesg
[15716.262977] [DEVICE2]module has been loaded
[15716.270566] [DEVICE2]sys_call_table@ffffffff926013a0
[15716.270568] [DEVICE2]write() with a kernel space buffer = -14
但是,在同一个模块中调用像 dup() 这样只涉及文件描述符、不涉及用户空间缓冲区的系统调用是可行的。让我们把之前的代码改为:
/*
 * Module entry point: compares a system call that takes a buffer (write())
 * with ones that only take a file descriptor (dup(), close()).
 *
 * Every call goes through the sys_call_table entry with a hand-built
 * struct pt_regs (x86_64 convention: di = arg#0, si = arg#1, dx = arg#2).
 * Results are only logged. Note that the calls act on the file descriptors
 * of the process loading the module, so descriptor 0 of that process is
 * closed and then re-created by the second dup().
 *
 * Return: 0 once the calls have been attempted, -ENOENT if sys_call_table
 * cannot be resolved.
 */
static int __init device2_init(void)
{
	syscall_wrapper *table;
	syscall_wrapper write_syscall;
	syscall_wrapper dup_syscall;
	syscall_wrapper close_syscall;
	/* Zeroed so that registers we do not set never carry stack garbage. */
	struct pt_regs param = { 0 };
	int rc;

	printk(KERN_INFO DEV_NAME "module has been loaded\n");

	sys_call_table_addr = kallsyms_lookup_name("sys_call_table");
	printk(KERN_INFO DEV_NAME "sys_call_table@%lx\n", sys_call_table_addr);
	/* kallsyms_lookup_name() returns 0 for an unknown symbol; indexing the
	 * table in that case would read from address 0. */
	if (!sys_call_table_addr)
		return -ENOENT;

	table = (syscall_wrapper *)sys_call_table_addr;
	write_syscall = table[__NR_write];
	dup_syscall = table[__NR_dup];
	close_syscall = table[__NR_close];

	/* Call to write() system call with a kernel space buffer */
	snprintf(buf, sizeof(buf), "%s\n", DEV_STR);
	param.di = 1;
	param.si = (unsigned long)buf;
	param.dx = strlen(buf);
	rc = (*write_syscall)(&param);
	printk(KERN_INFO DEV_NAME "write() with a kernel space buffer = %d\n", rc);

	/* Call to dup() system call */
	param.di = 1;
	rc = (*dup_syscall)(&param);
	printk(KERN_INFO DEV_NAME "dup() = %d\n", rc);

	/* Call to close() system call */
	param.di = 0;
	rc = (*close_syscall)(&param);
	printk(KERN_INFO DEV_NAME "close() = %d\n", rc);

	/* Call to dup() system call ==> Must return 0 as it is available */
	param.di = 1;
	rc = (*dup_syscall)(&param);
	printk(KERN_INFO DEV_NAME "dup() = %d\n", rc);

	return 0;
}
dup()的结果正常:
$ sudo insmod ./device2.ko
$ dmesg
[17444.098469] [DEVICE2]module has been loaded
[17444.106935] [DEVICE2]sys_call_table@ffffffff926013a0
[17444.106937] [DEVICE2]write() with a kernel space buffer = -14
[17444.106939] [DEVICE2]dup() = 4
[17444.106940] [DEVICE2]close() = 0
[17444.106940] [DEVICE2]dup() = 0
第一次调用 dup() 返回 4,因为当前进程是 insmod:它打开了模块文件并得到文件描述符 3,所以第一个可用的文件描述符是 4。第二次调用 dup() 返回 0,因为我们刚刚关闭了文件描述符 0。
第二次尝试:使用用户空间缓冲区
要使用用户空间缓冲区,让我们给内核模块添加一些文件操作(open()、release() 和 write())。在 write() 入口点中,我们直接把用户空间传给该入口点的缓冲区交给 write() 系统调用,从而把写入的内容回显到 stderr(文件描述符 2):
#include <linux/version.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <asm/ptrace.h>
#include <linux/socket.h>
#include <linux/kallsyms.h>
#include <linux/cdev.h>
MODULE_LICENSE("GPL");
/* Type used for the entries fetched from sys_call_table: each takes its
 * arguments through a struct pt_regs. */
typedef int (* syscall_wrapper)(struct pt_regs *);
/* Address returned by kallsyms_lookup_name("sys_call_table") in device2_init(). */
static unsigned long sys_call_table_addr;
/* Prefix prepended to every message this module logs. */
#define DEV_NAME "[DEVICE2]"
/* sys_call_table[__NR_write], resolved in device2_init() and used by device2_write(). */
static syscall_wrapper write_syscall;
static ssize_t device2_write(struct file *filp, const char *buff, size_t len, loff_t * off)
{
struct pt_regs param;
int rc;
printk(KERN_INFO DEV_NAME "write %p, %zu\n", buff, len);
/*
Call to write() system call to echo the write to stderr
*/
param.di = 2;
param.si = (unsigned long)buff;
param.dx = len;
rc = (* write_syscall)(¶m);
printk(KERN_INFO DEV_NAME "write() = %d\n", rc);
return len; // <-------------- To stop the write
}
/* open() entry point: logs the call and always reports success. */
static int device2_open(struct inode *inode, struct file *file)
{
	printk(KERN_INFO DEV_NAME "open\n");

	return 0;
}
/* release() entry point: logs the call and always reports success. */
static int device2_release(struct inode *inode, struct file *file)
{
	printk(KERN_INFO DEV_NAME "released\n");

	return 0;
}
/* File operations of the character device: open, write and release only. */
static const struct file_operations fops = {
	.owner   = THIS_MODULE,
	.open    = device2_open,
	.write   = device2_write,
	.release = device2_release,
};
/* Character device object obtained from cdev_alloc() in device2_init(). */
struct cdev *device_cdev;
/* Device number range filled in by alloc_chrdev_region() in device2_init(). */
dev_t deviceNumbers;
/*
 * Module entry point: registers the character device and resolves the
 * write() system call entry used by device2_write().
 *
 * The system call entry is resolved BEFORE cdev_add() makes the device
 * reachable, so device2_write() can never run with write_syscall unset.
 * Everything acquired so far is released again on failure.
 *
 * Return: 0 on success, or a negative errno (the one reported by
 * alloc_chrdev_region()/cdev_add(), -ENOENT if sys_call_table cannot be
 * resolved, -ENOMEM if the cdev cannot be allocated).
 */
static int __init device2_init(void)
{
	int rc;

	printk(KERN_INFO DEV_NAME "module has been loaded\n");

	// This returns the major number chosen dynamically in deviceNumbers
	rc = alloc_chrdev_region(&deviceNumbers, 0, 1, DEV_NAME);
	if (rc < 0) {
		printk(KERN_ALERT DEV_NAME "Error registering: %d\n", rc);
		return rc;
	}

	/* kallsyms_lookup_name() returns 0 for an unknown symbol. */
	sys_call_table_addr = kallsyms_lookup_name("sys_call_table");
	if (!sys_call_table_addr) {
		printk(KERN_ALERT DEV_NAME "sys_call_table not found\n");
		rc = -ENOENT;
		goto err_region;
	}
	write_syscall = ((syscall_wrapper *)sys_call_table_addr)[__NR_write];

	device_cdev = cdev_alloc();
	if (!device_cdev) {
		rc = -ENOMEM;
		goto err_region;
	}
	/*
	 * Set the fields directly instead of calling cdev_init(): cdev_init()
	 * re-initialises the embedded kobject and would drop the release method
	 * installed by cdev_alloc(), so the structure would never be freed.
	 */
	device_cdev->owner = THIS_MODULE;
	device_cdev->ops = &fops;

	/* The device is live as soon as cdev_add() succeeds; keep it last. */
	rc = cdev_add(device_cdev, deviceNumbers, 1);
	if (rc < 0) {
		printk(KERN_ALERT DEV_NAME "Error adding cdev: %d\n", rc);
		goto err_cdev;
	}

	printk(KERN_INFO DEV_NAME "initialized (major number is %d)\n", MAJOR(deviceNumbers));
	printk(KERN_INFO DEV_NAME "sys_call_table@%lx\n", sys_call_table_addr);
	printk(KERN_INFO DEV_NAME "write_syscall@%p\n", write_syscall);

	return 0;

err_cdev:
	/* Drops the reference taken by cdev_alloc() and frees the structure. */
	kobject_put(&device_cdev->kobj);
	device_cdev = NULL;
err_region:
	unregister_chrdev_region(deviceNumbers, 1);
	return rc;
}
/*
 * Module exit point: undoes the registrations made by device2_init().
 *
 * Without this the cdev would stay registered with file operations that
 * point into the unloaded module, and the device number range would never
 * be given back.
 */
static void __exit device2_exit(void)
{
	/* device_cdev is NULL when cdev_alloc() failed during init. */
	if (device_cdev)
		cdev_del(device_cdev);
	unregister_chrdev_region(deviceNumbers, 1);

	printk(KERN_INFO DEV_NAME "module has been unloaded\n");
}
/* Register device2_init()/device2_exit() as the module's load and unload hooks. */
module_init(device2_init);
module_exit(device2_exit);
模块的加载:
$ sudo insmod device2.ko
$ dmesg
[ 2255.183196] [DEVICE2]module has been loaded
[ 2255.183202] [DEVICE2]initialized (major number is 508)
[ 2255.193255] [DEVICE2]sys_call_table@ffffffffbcc013a0
[ 2255.193256] [DEVICE2]write_syscall@0000000030394929
在文件系统中创建设备节点,以便向其写入:
$ sudo mknod /dev/device2 c 508 0
$ sudo chmod 666 /dev/device2
$ sudo ls -l /dev/device2
crw-rw-rw- 1 root root 508, 0 janv. 24 16:55 /dev/device2
写入设备会触发 stderr 上的预期回显:
$ echo "qwerty for test purposes" > /dev/device2
qwerty for test purposes
$ echo "another string" > /dev/device2
another string
$ dmesg
[ 2255.183196] [DEVICE2]module has been loaded
[ 2255.183202] [DEVICE2]initialized (major number is 508)
[ 2255.193255] [DEVICE2]sys_call_table@ffffffffbcc013a0
[ 2255.193256] [DEVICE2]write_syscall@0000000030394929
[ 2441.674250] [DEVICE2]open
[ 2441.674268] [DEVICE2]write 0000000032fb5249, 25
[ 2441.674281] [DEVICE2]write() = 25
[ 2441.674286] [DEVICE2]released
[ 2475.538140] [DEVICE2]open
[ 2475.538159] [DEVICE2]write 0000000032fb5249, 15
[ 2475.538171] [DEVICE2]write() = 15
[ 2475.538175] [DEVICE2]released