用于虚拟到物理地址转换的 FreeBSD 模块

FreeBSD module for virtual-to-physical address translation

我正在学习 FreeBSD 上的 LKM 编程。作为第一个项目,我正在尝试编写一个系统调用:它以进程地址空间中的一个虚拟内存地址作为参数,并返回 RAM 中对应的物理地址(前提是该虚拟地址已映射到内存;否则系统调用返回错误)。我使用的是 Intel x64 芯片,所以我通读了 Intel 开发人员手册第 3A 卷的第 4 章,其中详细介绍了系统如何处理虚拟地址到物理地址的转换。我相信我已经在内核模块代码中正确地实现了这个过程,如下所示:

#include <sys/types.h>
#include <sys/param.h>
#include <sys/proc.h>
#include <sys/module.h>
#include <sys/sysent.h>
#include <sys/kernel.h>
#include <sys/sysproto.h>
#include <sys/systm.h>

// Argument block for the vtp system call; the handler casts its opaque
// argument pointer to this layout.
struct vtp_args {
    unsigned long  vaddr;      // virtual address to translate
    unsigned long *to_fill;    // destination the handler copies the result to
};

/////////////////////////////////////////////////////
//virtual address index fields: each selects one 9-bit table index
#define PML5_MASK(x)    ((x) & (0x1ffL << 48))      //bits 56 to 48
#define PML4_MASK(x)    ((x) & (0x1ffL << 39))      //bits 47 to 39
#define PDPT_MASK(x)    ((x) & (0x1ffL << 30))      //bits 38 to 30
#define PD_MASK(x)      ((x) & (0x1ff << 21))       //bits 29 to 21
#define PT_MASK(x)      ((x) & (0x1ff << 12))       //bits 20 to 12
/////////////////////////////////////////////////////

/////////////////////////////////////////////////////
//paging-structure entry fields
#define PE_ADDR_MASK(x) ((x) & (0xffffffffffL << 12))   //bits 51 to 12
#define PE_PS_FLAG(x)   ((x) & 0x80L)                   //page size flag (bit 7)
#define PE_P_FLAG(x)    ((x) & 0x1)                     //present flag (bit 0)
/////////////////////////////////////////////////////

/////////////////////////////////////////////////////
//base used to turn a physical address into a dereferenceable kernel address
#define DMAP_MIN_ADDRESS    (0x1fffffUL << 43)      //0xfffff80000000000
#define PHYS_TO_VIRT(x)     (DMAP_MIN_ADDRESS | (x))
/////////////////////////////////////////////////////

// vtp: system call handler that walks the paging structures by hand to
// translate uap->vaddr into a physical address, which is copied out to
// uap->to_fill.
//
//   td   - calling thread (not used by the body)
//   args - opaque argument block, interpreted as struct vtp_args
//
// Returns EOPNOTSUPP when the asm probe leaves cr3 at 0, EFAULT when a
// paging-structure entry is not present, otherwise the result of copyout().
//
// NOTE(review): this is the listing as originally posted. The shift amounts
// used to build the table indices below (51/42/33/24/15) are the defect the
// post goes on to diagnose; the corrected listing further down uses
// 45/36/27/18/9. They are intentionally left as posted here so the debug
// output quoted after this listing still corresponds to the code.
static int
vtp(struct thread *td, void *args) {
    struct vtp_args *uap=args;
    unsigned long vaddr=uap->vaddr;
    unsigned long *to_fill=uap->to_fill;

    //asm block checks to see if 4 or 5-level paging is enabled
    //if so, moves the cr3 register into the cr3 variable
    //and sets la57_flag to assert whether 4-level or 5-level
    int la57_flag=0;
    unsigned long cr3=0;
    __asm__ __volatile__ (
        "mov %%cr0, %%rax;"         //check bit 31 of cr0 (PG flag)
        "test $0x80000000, %%eax;"  //deny request if 0
        "jz fail;"                  //(ie if paging is not enabled)

        "mov $0xc0000080, %%ecx;"   //check bit 8 of ia32_efer (LME flag)
        "rdmsr;"                    //deny request if 0
        "test $0x100, %%eax;"       //(module currently can't handle pae paging)
        "jz fail;"
        
    "success:\n"
        "mov %%cr3, %0;"
        "mov %%cr4, %%rax;"
        "shr $12, %%rax;"           //bit 12 of cr4 is the LA57 flag
        "and $1, %%rax;"
        "mov %%eax, %1;"
        "jmp break;"
    "fail:\n"
        "mov $0, %0;"
    "break:\n"
    
        : "=r"(cr3), "=r"(la57_flag)
        ::"rax", "ecx", "memory");
    if(!cr3) {
        return EOPNOTSUPP; }
    /////////////////////////////////////////////////////
    unsigned long psentry=0, paddr=0;
    
    //get pml5e (if applicable)
    if(la57_flag) {         //5-level paging
        psentry=*(unsigned long *)\
            PHYS_TO_VIRT( PE_ADDR_MASK(cr3)|(PML5_MASK(vaddr)>>51) );
            if(!PE_P_FLAG(psentry)) {
                return EFAULT; }}
    else {
        psentry=cr3; }      //4-level paging: cr3 itself locates the PML4 table
   
    //get pml4e
    uprintf("[debug]: cr3:    0x%lx\n", psentry);
    uprintf("[debug]: &pml4e: 0x%lx\n", PHYS_TO_VIRT( PE_ADDR_MASK(psentry)|(PML4_MASK(vaddr)>>42) ));
    psentry=*(unsigned long *)\
        PHYS_TO_VIRT( PE_ADDR_MASK(psentry)|(PML4_MASK(vaddr)>>42) );
    uprintf("[debug]: pml4e:  0x%lx\n", psentry);
    if(!PE_P_FLAG(psentry)) {
        return EFAULT; }
   
    //get pdpte
    psentry=*(unsigned long *)\
        PHYS_TO_VIRT( PE_ADDR_MASK(psentry)|(PDPT_MASK(vaddr)>>33) );
    uprintf("[debug]: pdpte:  0x%lx\n", psentry);
    if(PE_PS_FLAG(psentry)) {   //1GB page
        //bits (51 to 30) | bits (29 to 0)
        paddr=(psentry&0x0ffffc00000000)|(vaddr&0x3fffffff);
        return copyout(&paddr, to_fill, sizeof(unsigned long)); }
    if(!PE_P_FLAG(psentry)) {
        return EFAULT; }
   
    //get pde
    psentry=*(unsigned long *)\
        PHYS_TO_VIRT( PE_ADDR_MASK(psentry)|(PD_MASK(vaddr)>>24) );
    uprintf("[debug]: pde:    0x%lx\n", psentry);
    if(PE_PS_FLAG(psentry)) {   //2MB page
        //bits (51 to 21) | bits (20 to 0)
        paddr=(psentry&0x0ffffffffe0000)|(vaddr&0x1ffff);
        return copyout(&paddr, to_fill, sizeof(unsigned long)); }
    if(!PE_P_FLAG(psentry)) {
        return EFAULT; }
     
    //get pte
    psentry=*(unsigned long *)\
        PHYS_TO_VIRT( PE_ADDR_MASK(psentry)|(PT_MASK(vaddr)>>15) );
    uprintf("[debug]: pte:    0x%lx\n", psentry);
    //4KB page: frame address from the entry, low 12 bits from vaddr
    paddr=(psentry&0x0ffffffffff000)|(vaddr&0xfff);
    return copyout(&paddr, to_fill, sizeof(unsigned long)); }
     

static
struct sysent vtp_sysent = {
    2,
    vtp };

// System call slot number; its address is handed to SYSCALL_MODULE below.
// NOTE(review): starting at NO_SYSCALL presumably lets the kernel pick a free
// slot -- confirm against the SYSCALL_MODULE documentation.
static int offset = NO_SYSCALL;

// Module event handler: prints a message on load and on unload and reports
// every other event as unsupported.
//
//   module - module being acted on (not used by the body)
//   cmd    - event code
//   arg    - event argument (not used by the body)
//
// Returns 0 for MOD_LOAD and MOD_UNLOAD, EOPNOTSUPP otherwise.
static int
load(struct module *module, int cmd, void *arg)
{
    if (cmd == MOD_LOAD) {
        uprintf("loading syscall at offset %d\n", offset);
        return 0;
    }

    if (cmd == MOD_UNLOAD) {
        uprintf("unloading syscall from offset %d\n", offset);
        return 0;
    }

    return EOPNOTSUPP;
}

// Declares the module via SYSCALL_MODULE, passing the name vtp, the address
// of the slot variable `offset`, the table entry `vtp_sysent`, the event
// handler `load`, and NULL as the last argument.
SYSCALL_MODULE(vtp, &offset, &vtp_sysent, load, NULL);

我只是通过在 /sys/amd64 目录中执行 grep 找到了 DMAP_MIN_ADDRESS 常量。我相当有信心这个常量是正确的,因为调用该系统调用时代码不会引发内核恐慌(kernel panic)。加载模块后,我使用以下代码测试系统调用:

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sys/module.h>

int main() {
    int x=0;
    unsigned long vaddr=(unsigned long)&x;
    unsigned long paddr=0;

    int syscall_num;
    int modid;
    struct module_stat stat;
    stat.version=sizeof(stat);
    if((modid=modfind("sys/vtp"))==-1) {
        perror("fatal in modfind");
        exit(-1); }
    if(modstat(modid, &stat)==-1) {
        perror("fatal in modstat");
        exit(-1); }
    syscall_num=stat.data.intval;
    
    if(syscall(syscall_num, vaddr, &paddr)) {
        perror("fatal in syscall");
        exit(-1); }
    printf("virtual address:    %p\n"
           "physical address:   %p\n",
           (void *)vaddr, (void *)paddr);
    return 0; }

不幸的是,我得到以下奇怪的输出:

$ ./vtp_test
[debug]: cr3:    0x2d48663c
[debug]: &pml4e: 0xfffff8002d48601f
[debug]: pml4e:  0x0
fatal in syscall: Bad address

因此,出于某种原因,读出的“PML4 条目”为 0,这显然是不正确的。我怀疑问题一定出在我对英特尔开发人员手册中给出的地址转换算法的实现上,但我看不出错误在哪里。有人能提供一些见解吗?

P.S. 当然,我是在虚拟机上运行这段代码的,这会不会导致一些问题?

啊,这是一个愚蠢的错误:移位值给错了。以下是更正后的代码(另外添加了一些调试语句):

#include <sys/types.h>
#include <sys/param.h>
#include <sys/proc.h>
#include <sys/module.h>
#include <sys/sysent.h>
#include <sys/kernel.h>
#include <sys/sysproto.h>
#include <sys/systm.h>

// Argument block for the vtp system call; the handler casts its opaque
// argument pointer to this layout.
struct vtp_args {
    unsigned long  vaddr;      // virtual address to translate
    unsigned long *to_fill;    // destination the handler copies the result to
};

/////////////////////////////////////////////////////
//virtual address index fields: each selects one 9-bit table index
#define PML5_MASK(x)    ((x) & (0x1ffL << 48))      //bits 56 to 48
#define PML4_MASK(x)    ((x) & (0x1ffL << 39))      //bits 47 to 39
#define PDPT_MASK(x)    ((x) & (0x1ffL << 30))      //bits 38 to 30
#define PD_MASK(x)      ((x) & (0x1ff << 21))       //bits 29 to 21
#define PT_MASK(x)      ((x) & (0x1ff << 12))       //bits 20 to 12
/////////////////////////////////////////////////////

/////////////////////////////////////////////////////
//paging-structure entry fields
#define PE_ADDR_MASK(x) ((x) & (0xffffffffffL << 12))   //bits 51 to 12
#define PE_PS_FLAG(x)   ((x) & 0x80L)                   //page size flag (bit 7)
#define PE_P_FLAG(x)    ((x) & 0x1)                     //present flag (bit 0)
/////////////////////////////////////////////////////

/////////////////////////////////////////////////////
//base used to turn a physical address into a dereferenceable kernel address
#define DMAP_MIN_ADDRESS    (0x1fffffUL << 43)      //0xfffff80000000000
#define PHYS_TO_VIRT(x)     (DMAP_MIN_ADDRESS | (x))
/////////////////////////////////////////////////////

// vtp: system call handler that walks the paging structures by hand to
// translate uap->vaddr into a physical address, which is copied out to
// uap->to_fill.
//
//   td   - calling thread (not used by the body)
//   args - opaque argument block, interpreted as struct vtp_args
//
// Returns:
//   EOPNOTSUPP - paging or long mode is not enabled (cr3 is left at 0)
//   EFAULT     - an entry on the walk is not present (vaddr is not mapped)
//   otherwise  - the result of copyout() of the physical address
//
// NOTE(review): PHYS_TO_VIRT assumes every table is reachable through the
// direct map at DMAP_MIN_ADDRESS; whether that base is still valid when
// 5-level paging is active has not been verified here -- confirm before
// relying on the la57 branch.
static int
vtp(struct thread *td, void *args) {
    struct vtp_args *uap=args;
    unsigned long vaddr=uap->vaddr;
    unsigned long *to_fill=uap->to_fill;

    //asm block checks to see if 4 or 5-level paging is enabled
    //if so, moves the cr3 register into the cr3 variable
    //and sets la57_flag to assert whether 4-level or 5-level
    //(numeric local labels are used so the asm can be emitted more than once;
    // rdmsr writes edx:eax, hence rdx in the clobber list)
    int la57_flag=0;
    unsigned long cr3=0;
    __asm__ __volatile__ (
        "mov %%cr0, %%rax;"         //check bit 31 of cr0 (PG flag)
        "test $0x80000000, %%eax;"  //deny request if 0
        "jz 1f;"                    //(ie if paging is not enabled)

        "mov $0xc0000080, %%ecx;"   //check bit 8 of ia32_efer (LME flag)
        "rdmsr;"                    //deny request if 0
        "test $0x100, %%eax;"       //(module currently can't handle pae paging)
        "jz 1f;"

        "mov %%cr3, %0;"            //success path
        "mov %%cr4, %%rax;"
        "shr $12, %%rax;"           //bit 12 of cr4 is the LA57 flag
        "and $1, %%rax;"
        "mov %%eax, %1;"
        "jmp 2f;"
    "1:\n"                          //failure path: report cr3 as 0
        "mov $0, %0;"
        "mov $0, %1;"               //outputs are write-only, so set both
    "2:\n"

        : "=r"(cr3), "=r"(la57_flag)
        :
        : "rax", "rcx", "rdx", "cc", "memory");
    if(!cr3) {
        return EOPNOTSUPP; }
    ////////////////////////////////////////////////////////////////////
    unsigned long psentry=0, paddr=0;
    unsigned long *entryp;      //kernel-visible address of the current entry

    //Each 9-bit index sits at bit N of vaddr and must land at bit 3 of the
    //entry address (entries are 8 bytes), hence the shifts by N-3 below.

    //pml5e (if applicable)
    if(la57_flag) {         //5-level paging
        entryp=(unsigned long *)\
            PHYS_TO_VIRT( PE_ADDR_MASK(cr3)|(PML5_MASK(vaddr)>>45) );
        uprintf("[debug]: &pml5e:\t0x%lx\n", (unsigned long)entryp);
        psentry=*entryp;
        uprintf("[debug]: pml5e:\t\t0x%lx\n", psentry);
        if(!PE_P_FLAG(psentry)) {
            return EFAULT; }}
    else {
        psentry=cr3; }      //4-level paging: cr3 itself locates the PML4 table

    //pml4e
    entryp=(unsigned long *)\
        PHYS_TO_VIRT( PE_ADDR_MASK(psentry)|(PML4_MASK(vaddr)>>36) );
    uprintf("[debug]: &pml4e:\t0x%lx\n", (unsigned long)entryp);
    psentry=*entryp;
    uprintf("[debug]: pml4e:\t\t0x%lx\n", psentry);
    if(!PE_P_FLAG(psentry)) {
        return EFAULT; }

    //pdpte
    entryp=(unsigned long *)\
        PHYS_TO_VIRT( PE_ADDR_MASK(psentry)|(PDPT_MASK(vaddr)>>27) );
    uprintf("[debug]: &pdpte:\t0x%lx\n", (unsigned long)entryp);
    psentry=*entryp;
    uprintf("[debug]: pdpte:\t\t0x%lx\n", psentry);
    //the present flag is checked first: the other bits of a non-present
    //entry (including PS) carry no defined meaning
    if(!PE_P_FLAG(psentry)) {
        return EFAULT; }
    if(PE_PS_FLAG(psentry)) {   //1GB page
        //entry bits (51 to 30) | vaddr bits (29 to 0)
        paddr=(psentry&0x000fffffc0000000)|(vaddr&0x3fffffff);
        return copyout(&paddr, to_fill, sizeof(unsigned long)); }

    //pde
    entryp=(unsigned long *)\
        PHYS_TO_VIRT( PE_ADDR_MASK(psentry)|(PD_MASK(vaddr)>>18) );
    uprintf("[debug]: &pde:\t\t0x%lx\n", (unsigned long)entryp);
    psentry=*entryp;
    uprintf("[debug]: pde:\t\t0x%lx\n", psentry);
    if(!PE_P_FLAG(psentry)) {
        return EFAULT; }
    if(PE_PS_FLAG(psentry)) {   //2MB page
        //entry bits (51 to 21) | vaddr bits (20 to 0)
        paddr=(psentry&0x000fffffffe00000)|(vaddr&0x1fffff);
        return copyout(&paddr, to_fill, sizeof(unsigned long)); }

    //pte
    entryp=(unsigned long *)\
        PHYS_TO_VIRT( PE_ADDR_MASK(psentry)|(PT_MASK(vaddr)>>9) );
    uprintf("[debug]: &pte:\t\t0x%lx\n", (unsigned long)entryp);
    psentry=*entryp;
    uprintf("[debug]: pte:\t\t0x%lx\n", psentry);
    if(!PE_P_FLAG(psentry)) {   //page itself is not resident
        return EFAULT; }
    //4KB page: entry bits (51 to 12) | vaddr bits (11 to 0)
    paddr=PE_ADDR_MASK(psentry)|(vaddr&0xfff);
    return copyout(&paddr, to_fill, sizeof(unsigned long)); }

static
struct sysent vtp_sysent = {
    2,
    vtp };

// System call slot number; its address is handed to SYSCALL_MODULE below.
// NOTE(review): starting at NO_SYSCALL presumably lets the kernel pick a free
// slot -- confirm against the SYSCALL_MODULE documentation.
static int offset = NO_SYSCALL;

// Module event handler: prints a message on load and on unload and reports
// every other event as unsupported.
//
//   module - module being acted on (not used by the body)
//   cmd    - event code
//   arg    - event argument (not used by the body)
//
// Returns 0 for MOD_LOAD and MOD_UNLOAD, EOPNOTSUPP otherwise.
static int
load(struct module *module, int cmd, void *arg)
{
    if (cmd == MOD_LOAD) {
        uprintf("loading syscall at offset %d\n", offset);
        return 0;
    }

    if (cmd == MOD_UNLOAD) {
        uprintf("unloading syscall from offset %d\n", offset);
        return 0;
    }

    return EOPNOTSUPP;
}

// Declares the module via SYSCALL_MODULE, passing the name vtp, the address
// of the slot variable `offset`, the table entry `vtp_sysent`, the event
// handler `load`, and NULL as the last argument.
SYSCALL_MODULE(vtp, &offset, &vtp_sysent, load, NULL);

并产生更好看的输出:

$ ./vtp_test
[debug]: &pml4e:        0xfffff800341a27f8
[debug]: pml4e:         0x80000000341fc067
[debug]: &pdpte:        0xfffff800341fcff8
[debug]: pdpte:         0x341b7067
[debug]: &pde:          0xfffff800341b7ff8
[debug]: pde:           0x34174067
[debug]: &pte:          0xfffff80034174ff0
[debug]: pte:           0x8000000030de9467
virtual address:        0x7fffffffea9c
physical address:       0x30de9a9c