如何在内核space中调用系统调用?

How to call system call in kernel space?

我正在尝试在内核空间中触发系统调用。如果系统调用不带参数(例如 getpid()),它工作正常。

方法我是怎么做的:

  1. 获取系统地址table
static void **syscall_table;
  2. 将它与你想要的系统调用号一起用作函数指针:
typedef long (*sys_call_ptr_t)(const struct __user pt_regs *);

// call system call
((sys_call_ptr_t *)syscall_table)[system_call_number](reg);
  3. 如果系统调用有参数,在调用之前将它们存储到 regs 中:
struct __user pt_regs *reg = kmalloc....;
reg->di = ...
reg->si = ...

目前,我正在尝试使用 write 但它失败了。

write(int fd, const void *buf, size_t count);

对于buf,我已经尝试了用户space地址和内核space地址。 count 可能不是问题。所以,我想问题可能出现在文件描述符中(可能 fd 在较低级别和用户 space 之间是不同的)。对于基本测试,我只想将文本写入终端,所以 fd 应该是 1(至少在用户 space 中)。

这里有两个问题:

  1. 由于某些原因,我必须坚持使用上述方式来调用系统调用。这种做法合理吗?还是我遗漏了某个步骤,才导致调用 write 失败?

  2. 如果确实是我调用 write 的方式有误,问题是否出在 fd 上?如果是,我该如何得到与用户空间中 fd 1 相对应的 fd?

前言

根据定义,系统调用是系统向用户空间应用程序提供的服务。在内核内部运行的代码不应该调用面向用户空间的服务,因此不建议这样做。

首先尝试使用内核 space 缓冲区

write() 系统调用定义在 fs/read_write.c 中。它调用 ksys_write(),后者再调用 vfs_write():

/*
 * Write up to @count bytes from the user buffer @buf to @file at *@pos.
 *
 * Returns -EBADF if FMODE_WRITE is not set on the file, -EINVAL if
 * FMODE_CAN_WRITE is not set, -EFAULT if access_ok() rejects the buffer
 * range, the non-zero result of rw_verify_area() if that check fails, and
 * otherwise whatever __vfs_write() returns.
 */
ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
{
    ssize_t ret;

    if (!(file->f_mode & FMODE_WRITE))
        return -EBADF;
    if (!(file->f_mode & FMODE_CAN_WRITE))
        return -EINVAL;
    /* This is the check that turns a kernel-space buffer into -EFAULT. */
    if (unlikely(!access_ok(buf, count)))
        return -EFAULT;

    ret = rw_verify_area(WRITE, file, pos, count);
    if (!ret) {
        /* Clamp oversized requests instead of failing them. */
        if (count > MAX_RW_COUNT)
            count =  MAX_RW_COUNT;
        file_start_write(file);
        ret = __vfs_write(file, buf, count, pos);
        /* Notification and per-task accounting only when bytes were written. */
        if (ret > 0) {
            fsnotify_modify(file);
            add_wchar(current, ret);
        }
        /* Counted for every attempt that reached this point, successful or not. */
        inc_syscw(current);
        file_end_write(file);
    }

    return ret;
}
[...]
/*
 * Resolve @fd to its file via fdget_pos() and hand @buf/@count to vfs_write().
 *
 * Returns -EBADF when no file is attached to @fd, otherwise the result of
 * vfs_write().
 */
ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count)
{
    struct fd f = fdget_pos(fd);
    ssize_t ret = -EBADF;

    if (f.file) {
        loff_t pos, *ppos = file_ppos(f.file);
        /* Work on a local copy of the position when the file has one. */
        if (ppos) {
            pos = *ppos;
            ppos = &pos;
        }
        ret = vfs_write(f.file, buf, count, ppos);
        /* Publish the updated position only when the write did not fail. */
        if (ret >= 0 && ppos)
            f.file->f_pos = pos;
        fdput_pos(f);
    }

    return ret;
}

/* write() system call entry: forwards its three arguments unchanged to ksys_write(). */
SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
        size_t, count)
{
    return ksys_write(fd, buf, count);
}

作为第一个参数传递的文件描述符没有问题:从用户空间传入的值(在 ksys_write() 中)用于检索输出文件的 file 结构。但是第二个参数必须引用用户空间的内存区域。在 vfs_write() 中会检查第二个参数:

    if (unlikely(!access_ok(buf, count)))
        return -EFAULT;

access_ok() 检查缓冲区是否位于用户空间。因此,如果你传递一个指向内核空间的地址,write() 返回的代码将是 -EFAULT (-14)。

下面的示例是一个简单的模块,它使用内核 space 缓冲区调用 write() 系统调用。在 x86_64 上,系统调用的参数约定为:

   RDI = arg#0
   RSI = arg#1
   RDX = arg#2
   R10 = arg#3
   R8  = arg#4
   R9  = arg#5
#include <linux/version.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <asm/ptrace.h>
#include <linux/socket.h>
#include <linux/kallsyms.h>


MODULE_LICENSE("GPL");

/*
 * Signature of an entry of the system call table: the arguments are read
 * from a register snapshot and the result is returned as a long.
 */
typedef long (* syscall_wrapper)(const struct pt_regs *);

/* Address of sys_call_table as resolved through kallsyms (0 if not found). */
static unsigned long sys_call_table_addr;

/* Prefix put in front of every log line emitted by this module. */
#define DEV_NAME "[DEVICE2]"


/* Payload handed to write() from kernel space. */
#define DEV_STR  DEV_NAME "String from driver"

/* Kernel-space buffer used as the (invalid) source of the write() call. */
static char buf[1024];


/*
 * Module entry point: resolves sys_call_table, fetches its write() entry and
 * calls it with a kernel-space buffer, logging the value it returns.
 *
 * Returns 0 on success or -ENOENT when the sys_call_table symbol cannot be
 * resolved.
 */
static int __init device2_init(void) {

  syscall_wrapper write_syscall;
  int rc;
  /* Zero every register so the callee never reads stack garbage. */
  struct pt_regs param = { 0 };

  printk(KERN_INFO DEV_NAME "module has been loaded\n");

  sys_call_table_addr = kallsyms_lookup_name("sys_call_table");
  if (!sys_call_table_addr) {
    /* A failed lookup yields 0; indexing it would dereference NULL. */
    printk(KERN_ERR DEV_NAME "sys_call_table not found\n");
    return -ENOENT;
  }

  printk(KERN_INFO DEV_NAME "sys_call_table@%lx\n", sys_call_table_addr);

  write_syscall = ((syscall_wrapper *)sys_call_table_addr)[__NR_write];

  /*
    Call to write() system call with a kernel space buffer.
    Arguments follow the register convention: di = fd, si = buf, dx = count.
  */
  snprintf(buf, sizeof(buf), "%s\n", DEV_STR);
  param.di = 1;
  param.si = (unsigned long)buf;
  param.dx = strlen(buf);
  rc = (* write_syscall)(&param);

  printk(KERN_INFO DEV_NAME "write() with a kernel space buffer = %d\n", rc);

  return 0;
}

/* Module exit point: nothing to release, only logs the unload. */
static void __exit device2_exit(void)
{
  printk(KERN_INFO DEV_NAME "module has been unloaded\n");
}

module_init(device2_init);
module_exit(device2_exit);

插入模块时,我们可以验证系统调用返回 -EFAULT:

$ sudo insmod ./device2.ko
$ dmesg
[15716.262977] [DEVICE2]module has been loaded
[15716.270566] [DEVICE2]sys_call_table@ffffffff926013a0
[15716.270568] [DEVICE2]write() with a kernel space buffer = -14

但是,同一个模块如果调用 dup() 这类只涉及文件描述符、不涉及用户空间缓冲区的系统调用,则可以正常工作。让我们将之前的代码更改为:

/*
 * Module entry point: resolves sys_call_table and calls, in order, write()
 * with a kernel-space buffer, dup(1), close(0) and dup(1) again, logging the
 * value returned by each call.
 *
 * Returns 0 on success or -ENOENT when the sys_call_table symbol cannot be
 * resolved.
 */
static int __init device2_init(void) {

  syscall_wrapper write_syscall;
  syscall_wrapper dup_syscall;
  syscall_wrapper close_syscall;
  int rc;
  /* Zero every register so the callees never read stack garbage. */
  struct pt_regs param = { 0 };

  printk(KERN_INFO DEV_NAME "module has been loaded\n");

  sys_call_table_addr = kallsyms_lookup_name("sys_call_table");
  if (!sys_call_table_addr) {
    /* A failed lookup yields 0; indexing it would dereference NULL. */
    printk(KERN_ERR DEV_NAME "sys_call_table not found\n");
    return -ENOENT;
  }

  printk(KERN_INFO DEV_NAME "sys_call_table@%lx\n", sys_call_table_addr);

  write_syscall = ((syscall_wrapper *)sys_call_table_addr)[__NR_write];
  dup_syscall = ((syscall_wrapper *)sys_call_table_addr)[__NR_dup];
  close_syscall = ((syscall_wrapper *)sys_call_table_addr)[__NR_close];

  /*
    Call to write() system call with a kernel space buffer.
    Arguments follow the register convention: di = fd, si = buf, dx = count.
  */
  snprintf(buf, sizeof(buf), "%s\n", DEV_STR);
  param.di = 1;
  param.si = (unsigned long)buf;
  param.dx = strlen(buf);
  rc = (* write_syscall)(&param);

  printk(KERN_INFO DEV_NAME "write() with a kernel space buffer = %d\n", rc);

  /*
    Call to dup() system call (only di is set; si/dx still hold the
    values written for write() above)
  */
  param.di = 1;
  rc = (* dup_syscall)(&param);

  printk(KERN_INFO DEV_NAME "dup() = %d\n", rc);

  /*
    Call to close() system call on file descriptor 0
  */
  param.di = 0;
  rc = (* close_syscall)(&param);

  printk(KERN_INFO DEV_NAME "close() = %d\n", rc);

  /*
    Call to dup() system call ==> Must return 0 as it is available
  */
  param.di = 1;
  rc = (* dup_syscall)(&param);

  printk(KERN_INFO DEV_NAME "dup() = %d\n", rc);

  return 0;
}

dup()的结果正常:

$ sudo insmod ./device2.ko
$ dmesg
[17444.098469] [DEVICE2]module has been loaded
[17444.106935] [DEVICE2]sys_call_table@ffffffff926013a0
[17444.106937] [DEVICE2]write() with a kernel space buffer = -14
[17444.106939] [DEVICE2]dup() = 4
[17444.106940] [DEVICE2]close() = 0
[17444.106940] [DEVICE2]dup() = 0

第一次调用 dup() 返回 4,因为当前进程是 insmod:它打开了模块文件并得到文件描述符 3,因此第一个可用的文件描述符是 4。第二次调用 dup() 返回 0,因为我们刚刚关闭了文件描述符 0。

第二次尝试使用用户 space 缓冲区

要使用用户空间缓冲区,让我们为内核模块添加一些文件操作(open()、release() 和 write())。在 write() 入口点中,我们直接使用传入该入口点的用户空间缓冲区,把用户空间写入的内容回显到 stderr(文件描述符 2):

#include <linux/version.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <asm/ptrace.h>
#include <linux/socket.h>
#include <linux/kallsyms.h>
#include <linux/cdev.h>


MODULE_LICENSE("GPL");

/*
 * Signature of an entry of the system call table: the arguments are read
 * from a register snapshot and the result is returned as a long.
 */
typedef long (* syscall_wrapper)(const struct pt_regs *);

/* Address of sys_call_table as resolved through kallsyms (0 if not found). */
static unsigned long sys_call_table_addr;

/* Prefix put in front of every log line emitted by this module. */
#define DEV_NAME "[DEVICE2]"

/* write() entry of the system call table, set once by the module init. */
static syscall_wrapper write_syscall;

static ssize_t device2_write(struct file *filp, const char *buff, size_t len, loff_t * off)
{
  struct pt_regs param;
  int rc;

  printk(KERN_INFO DEV_NAME "write %p, %zu\n", buff, len);

  /*
    Call to write() system call to echo the write to stderr
  */
  param.di = 2;
  param.si = (unsigned long)buff;
  param.dx = len;
  rc = (* write_syscall)(&param);

  printk(KERN_INFO DEV_NAME "write() = %d\n", rc);

  return len;  // <-------------- To stop the write
}

/* open() file operation: logs the call and always succeeds (returns 0). */
static int device2_open(struct inode *inode, struct file *file) {
  printk(KERN_INFO DEV_NAME "open\n");

  return 0;
}

/* release() file operation: logs the call and always succeeds (returns 0). */
static int device2_release(struct inode *inode, struct file *file) {
  printk(KERN_INFO DEV_NAME "released\n");

  return 0;
}

/*
 * File operations of the character device. Members that are not listed are
 * left zero-initialized.
 */
static const struct file_operations fops = {
  .owner   = THIS_MODULE,
  .open    = device2_open,
  .release = device2_release,
  .write   = device2_write,
};

/* Character device object allocated by the module init. */
static struct cdev *device_cdev;
/* Device number region (major/minor) obtained from alloc_chrdev_region(). */
static dev_t deviceNumbers;

/*
 * Module entry point: resolves the write() entry of sys_call_table, then
 * registers a one-minor character device region and adds the cdev that
 * serves it.
 *
 * Returns 0 on success or a negative errno: -ENOENT when sys_call_table
 * cannot be resolved, -ENOMEM when the cdev cannot be allocated, or the error
 * reported by alloc_chrdev_region() / cdev_add(). Everything acquired before
 * a failure is released again.
 */
static int __init device2_init(void) {

  int rc;

  printk(KERN_INFO DEV_NAME "module has been loaded\n");

  /*
    Resolve the system call entry before the device becomes reachable, so
    that the write() file operation never runs with an unset pointer.
  */
  sys_call_table_addr = kallsyms_lookup_name("sys_call_table");
  if (!sys_call_table_addr) {
    printk(KERN_ALERT DEV_NAME "sys_call_table not found\n");
    return -ENOENT;
  }
  write_syscall = ((syscall_wrapper *)sys_call_table_addr)[__NR_write];

  // This returns the major number chosen dynamically in deviceNumbers
  rc = alloc_chrdev_region(&deviceNumbers, 0, 1, DEV_NAME);

  if (rc < 0) {
    printk(KERN_ALERT DEV_NAME "Error registering: %d\n", rc);
    return rc;
  }

  device_cdev = cdev_alloc();
  if (!device_cdev) {
    rc = -ENOMEM;
    printk(KERN_ALERT DEV_NAME "Error allocating cdev: %d\n", rc);
    goto err_unregister;
  }

  /*
    A cdev obtained from cdev_alloc() is already initialized; only the
    operations and owner are set here, as cdev_init() would reset the object.
  */
  device_cdev->owner = THIS_MODULE;
  device_cdev->ops = &fops;

  rc = cdev_add(device_cdev, deviceNumbers, 1);
  if (rc < 0) {
    printk(KERN_ALERT DEV_NAME "Error adding cdev: %d\n", rc);
    goto err_put_cdev;
  }

  printk(KERN_INFO DEV_NAME "initialized (major number is %d)\n", MAJOR(deviceNumbers));

  printk(KERN_INFO DEV_NAME "sys_call_table@%lx\n", sys_call_table_addr);

  printk(KERN_INFO DEV_NAME "write_syscall@%p\n", write_syscall);

  return 0;

err_put_cdev:
  /* Drop the reference taken by cdev_alloc(); this frees the object. */
  kobject_put(&device_cdev->kobj);
  device_cdev = NULL;
err_unregister:
  unregister_chrdev_region(deviceNumbers, 1);
  return rc;
}

/*
 * Module exit point: removes the character device and gives back its device
 * number region so that nothing keeps pointing at this module's file
 * operations once it is unloaded.
 */
static void __exit device2_exit(void) {
  if (device_cdev)
    cdev_del(device_cdev);
  unregister_chrdev_region(deviceNumbers, 1);

  printk(KERN_INFO DEV_NAME "module has been unloaded\n");
}

module_init(device2_init);
module_exit(device2_exit);

模块的加载:

$ sudo insmod device2.ko
$ dmesg
[ 2255.183196] [DEVICE2]module has been loaded
[ 2255.183202] [DEVICE2]initialized (major number is 508)
[ 2255.193255] [DEVICE2]sys_call_table@ffffffffbcc013a0
[ 2255.193256] [DEVICE2]write_syscall@0000000030394929

在文件系统中创建设备节点,以便能够向其写入:

$ sudo mknod /dev/device2 c 508 0
$ sudo chmod 666 /dev/device2
$ sudo ls -l /dev/device2
crw-rw-rw- 1 root root 508, 0 janv.  24 16:55 /dev/device2

写入设备会触发 stderr 上的预期回显:

$ echo "qwerty for test purposes" > /dev/device2
qwerty for test purposes
$ echo "another string" > /dev/device2
another string
$ dmesg
[ 2255.183196] [DEVICE2]module has been loaded
[ 2255.183202] [DEVICE2]initialized (major number is 508)
[ 2255.193255] [DEVICE2]sys_call_table@ffffffffbcc013a0
[ 2255.193256] [DEVICE2]write_syscall@0000000030394929
[ 2441.674250] [DEVICE2]open
[ 2441.674268] [DEVICE2]write 0000000032fb5249, 25
[ 2441.674281] [DEVICE2]write() = 25
[ 2441.674286] [DEVICE2]released
[ 2475.538140] [DEVICE2]open
[ 2475.538159] [DEVICE2]write 0000000032fb5249, 15
[ 2475.538171] [DEVICE2]write() = 15
[ 2475.538175] [DEVICE2]released