Enabling EPT in VMX causes failed entry due to guest state

I'm building a hypervisor at home, and I've run into a problem with VMX entry when enabling EPT. The code below sets up guest mode and enters VMX successfully. However, when I enable EPT, the VM entry fails with exit reason 33 (VM-entry failure due to invalid guest state). This happens when I add the following:

vmcs_write(SECONDARY_VM_EXEC_CONTROL, adjust_msr_control(
        MSR_IA32_VMX_PROCBASED_CTLS2, CPU_BASED_CTL2_RDTSCP | CPU_BASED_CTL2_ENABLE_INVPCID /* | CPU_BASED_CTL2_ENABLE_VPID | CPU_BASED_CTL2_ENABLE_XSAVE_XRSTORS */ | CPU_BASED_CTL2_ENABLE_EPT
    )); <--- I add CPU_BASED_CTL2_ENABLE_EPT
vmcs_write64(EPT_POINTER, vms->eptp.value);

I've gone through the Intel manual several times to make sure I'm following the guest-state checks, but I can't see why the entry fails only when EPT is enabled. Any suggestion/help would be appreciated, thanks :)

static noinline void vmwrite_error(unsigned long field, unsigned long value){
         printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n",
                field, value, (int)(vmcs_read(VM_INSTRUCTION_ERROR)));
         dump_stack();
         BUG_ON(1);
}

static void vmcs_write(unsigned long field, unsigned long value){
        uint8_t err;
        __asm__ __volatile__(
                "vmwrite %[value],%[field]; setna %[err]"
                : [err]"=rm"(err)
                : [field]"r"(field), [value]"r"(value)
               : "cc", "memory"
        );
        if(err)
                vmwrite_error(field, value);
        else
                printk(KERN_INFO "vmwrite log: reg %lx value %lx\n", field, value);
}

EPTP alloc_ept(int initial_pages_count){
    int i;
    EPTP eptp;
    EPT_PML4E *ept_pml4;
    EPT_PDPTE *ept_pdpt;
    EPT_PDE *ept_pd;
    EPT_PTE *ept_pt;
    eptp.value = 0;

    ept_pml4 = kzalloc(4096, GFP_KERNEL | GFP_NOWAIT);
    if(!ept_pml4)
        goto pml4err;
    ept_pdpt = kzalloc(4096, GFP_KERNEL | GFP_NOWAIT);
    if(!ept_pdpt)
        goto pdpterr;
    ept_pd = kzalloc(4096, GFP_KERNEL | GFP_NOWAIT);
    if(!ept_pd)
        goto pderr;
    ept_pt = kzalloc(4096, GFP_KERNEL | GFP_NOWAIT);
    if(!ept_pt)
        goto pterr; 

    for(i = 0; i < initial_pages_count; i++){
        ept_pt[i].fields.read_access = 1;
        ept_pt[i].fields.write_access = 1;
        ept_pt[i].fields.execute_access = 1;
        ept_pt[i].fields.ept_memtype = 6;
        ept_pt[i].fields.phys_addr = virt_to_phys(kzalloc(4096, GFP_KERNEL | GFP_NOWAIT));
    }

    ept_pd->fields.read_access = 1;
    ept_pd->fields.write_access = 1;
    ept_pd->fields.execute_access = 1;
    ept_pd->fields.phys_addr = virt_to_phys(ept_pt);    

    ept_pdpt->fields.read_access = 1;
    ept_pdpt->fields.write_access = 1;
    ept_pdpt->fields.execute_access = 1;
    ept_pdpt->fields.phys_addr = virt_to_phys(ept_pd);

    ept_pml4->fields.read_access = 1;
    ept_pml4->fields.write_access = 1;
    ept_pml4->fields.execute_access = 1;
    ept_pml4->fields.phys_addr = virt_to_phys(ept_pdpt);

    eptp.fields.memtype = 6;
    eptp.fields.page_walk = 3;
    eptp.fields.accessed_and_dirty_flags_enabled = 1;
    eptp.fields.pml4_phys_addr = virt_to_phys(ept_pml4);

    return eptp;

    pterr:
    kfree(ept_pd);
    pderr:
    kfree(ept_pdpt);
    pdpterr:
    kfree(ept_pml4);
    pml4err:
    panic("EPT ALLOC ERROR!");
}

static void setup_vm_code(vmstate *vms){
    int i;
        EPT_PML4E *pml = phys_to_virt(vms->eptp.fields.pml4_phys_addr);
        EPT_PDPTE *pdpt = phys_to_virt(pml->fields.phys_addr);
        EPT_PDE *pd = phys_to_virt(pdpt->fields.phys_addr);
        EPT_PTE *pt = phys_to_virt(pd->fields.phys_addr);

    vms->initial_rip = (unsigned long)phys_to_virt(pt[0].fields.phys_addr);
    for(i = 0; i < 4096; i++){  
        // hlt
        *(char*)(vms->initial_rip+i) = 0xf4;
    }
    printk(KERN_INFO "INITIAL_RIP: %lu", vms->initial_rip);
    // Stack grows down
    vms->initial_rsp = (unsigned long)phys_to_virt(pt[9].fields.phys_addr) + 4095;
}

static void prepare_vmx_cpu(void *info){
    uint32_t vmcs_revid = 0;
    uint32_t hi = 0;
    vmstate *vms = per_cpu(cpu_vms, smp_processor_id());

    // Populate VMCS revision id in vmxon region
    rdmsr_safe(MSR_IA32_VMX_BASIC, &vmcs_revid, &hi);
    memcpy(vms->vmxon_region, &vmcs_revid, 4);
    memcpy(vms->vmcs_region, &vmcs_revid, 4);

    vms->eptp = alloc_ept(10);
    setup_vm_code(vms);

    vmx_enable();   
}

//static void handle_vmexit(void) __attribute__((used));
static void handle_vmexit(void){
    int exit_reason = vmcs_read32(VM_EXIT_REASON);
    int basic_exit_code = exit_reason & 0xffff;
    int exit_qualification = vmcs_read32(EXIT_QUALIFICATION);
    int vm_entry_failure = exit_reason & 0x80000000;
    panic("VMEXIT WITH CODE %d, VM ENTRY FAILURE: %s, QUAL: %d", basic_exit_code, vm_entry_failure ? "true" : "false", exit_qualification);
    vmx_dump_cpu();
    panic("ERR");
    VMRESUME();
    //TODO: switch error reasons
}

static void vmx_setup_vm_controls(void){
    // VM Execution Controls
    vmcs_write(PIN_BASED_VM_EXEC_CONTROL, adjust_msr_control(MSR_IA32_VMX_PINBASED_CTLS, 0));
    vmcs_write(CPU_BASED_VM_EXEC_CONTROL, adjust_msr_control(
        MSR_IA32_VMX_PROCBASED_CTLS, CPU_BASED_HLT_EXITING | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS));
    vmcs_write(SECONDARY_VM_EXEC_CONTROL, adjust_msr_control(
        MSR_IA32_VMX_PROCBASED_CTLS2, CPU_BASED_CTL2_RDTSCP | CPU_BASED_CTL2_ENABLE_INVPCID /* | CPU_BASED_CTL2_ENABLE_VPID | CPU_BASED_CTL2_ENABLE_XSAVE_XRSTORS */ | CPU_BASED_CTL2_ENABLE_EPT
    ));

    //vmcs_write64(TSC_OFFSET, 0);  

    vmcs_write(CR0_READ_SHADOW, read_cr0());
    vmcs_write(CR4_READ_SHADOW, __read_cr4());
    vmcs_write(CR0_GUEST_HOST_MASK, ~0ul);
    vmcs_write(CR4_GUEST_HOST_MASK, ~0ul);

    // How many CR3_TARGET_VALUEs are considered without VM exit when MOV CR3, VAL
    vmcs_write(CR3_TARGET_COUNT, 0);

    // VM Entry & Exit Controls
    vmcs_write(VM_EXIT_CONTROLS, adjust_msr_control(MSR_IA32_VMX_EXIT_CTLS, VM_EXIT_IA32E_MODE | VM_EXIT_LOAD_IA32_EFER | VM_EXIT_HOST_ADDR_SPACE_SIZE));
    vmcs_write(VM_ENTRY_CONTROLS, adjust_msr_control(MSR_IA32_VMX_ENTRY_CTLS, VM_ENTRY_IA32E_MODE | VM_ENTRY_LOAD_IA32_EFER));
}

static void vmx_setup_initial_host_state(vmstate *vms){
    struct desc_ptr gdtptr, idt;

    vmcs_write(HOST_CR0, read_cr0());
    vmcs_write(HOST_CR3, __read_cr3());
    vmcs_write(HOST_CR4, __read_cr4());
    vmcs_write(HOST_RSP, (unsigned long)vms->vmm_handle_stack + vms->vmm_handle_stack_size - 1);
    vmcs_write(HOST_RIP, (unsigned long)handle_vmexit);

    /* An explanation of segment selectors: https://medium.com/hungys-blog/linux-kernel-memory-addressing-a0d304283af3 */
    // Segment Selectors
    vmcs_write(HOST_CS_SELECTOR, __KERNEL_CS);
    vmcs_write(HOST_DS_SELECTOR, __KERNEL_DS);
    vmcs_write(HOST_ES_SELECTOR, __KERNEL_DS);
    vmcs_write(HOST_SS_SELECTOR, __KERNEL_DS);
    vmcs_write(HOST_FS_SELECTOR, 0);
    vmcs_write(HOST_GS_SELECTOR, 0);
    vmcs_write(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8);

    // Segment Base Adresses
    vmcs_write(HOST_FS_BASE, native_read_msr(MSR_FS_BASE));
    vmcs_write(HOST_GS_BASE, native_read_msr(MSR_GS_BASE));
    vmcs_write(HOST_TR_BASE, read_tr_base());
    native_store_gdt(&gdtptr);
    vmcs_write(HOST_GDTR_BASE, gdtptr.address);
    store_idt(&idt);
    vmcs_write(HOST_IDTR_BASE, idt.address);

    // MSRs
    vmcs_write(HOST_IA32_SYSENTER_CS, native_read_msr(MSR_IA32_SYSENTER_CS));
    vmcs_write(HOST_IA32_SYSENTER_ESP, native_read_msr(MSR_IA32_SYSENTER_ESP));
    vmcs_write(HOST_IA32_SYSENTER_EIP, native_read_msr(MSR_IA32_SYSENTER_EIP));
    vmcs_write64(HOST_IA32_EFER, native_read_msr(MSR_EFER));
}

static void RIPTEST(void) __attribute__((used));
static void RIPTEST(void){
    __asm__ __volatile__("hlt; hlt; hlt; hlt; hlt; hlt");
}

static void vmx_setup_initial_guest_state(vmstate *vms){
    vmcs_write(GUEST_CR0, read_cr0());
    vmcs_write(GUEST_CR3, __read_cr3());
    vmcs_write(GUEST_CR4, __read_cr4());
    vmcs_write(GUEST_DR7, 0);

    vmcs_write(GUEST_RIP, vms->initial_rip);
    //vmcs_write(GUEST_RIP, (unsigned long)RIPTEST);
    vmcs_write(GUEST_RSP, vms->initial_rsp);
    vmcs_write(GUEST_RFLAGS, 0x2); // Reserved flag

    // Setup selectors
    vmcs_write(GUEST_CS_SELECTOR, 0);
    vmcs_write(GUEST_SS_SELECTOR, 0);
    vmcs_write(GUEST_DS_SELECTOR, 0);
    vmcs_write(GUEST_ES_SELECTOR, 0);
    vmcs_write(GUEST_FS_SELECTOR, 0);
    vmcs_write(GUEST_GS_SELECTOR, 0);
    vmcs_write(GUEST_LDTR_SELECTOR, 0);
    vmcs_write(GUEST_TR_SELECTOR, 0);

    // Setup base addresses
    vmcs_write(GUEST_CS_BASE, 0);
    vmcs_write(GUEST_SS_BASE, 0);
    vmcs_write(GUEST_DS_BASE, 0);
    vmcs_write(GUEST_ES_BASE, 0);
    vmcs_write(GUEST_FS_BASE, native_read_msr(MSR_FS_BASE));
    vmcs_write(GUEST_GS_BASE, native_read_msr(MSR_GS_BASE));
    vmcs_write(GUEST_LDTR_BASE, 0);
    vmcs_write(GUEST_TR_BASE, 0);

    // Setup guest segment limits   
    vmcs_write(GUEST_CS_LIMIT, 0xFFFFFFFF);
    vmcs_write(GUEST_SS_LIMIT, 0xFFFFFFFF);
    vmcs_write(GUEST_DS_LIMIT, 0xFFFFFFFF);
    vmcs_write(GUEST_ES_LIMIT, 0xFFFFFFFF);
    vmcs_write(GUEST_FS_LIMIT, 0xFFFFFFFF);
    vmcs_write(GUEST_GS_LIMIT, 0xFFFFFFFF);
    vmcs_write(GUEST_LDTR_LIMIT, 0);
    vmcs_write(GUEST_TR_LIMIT, 0xFF);

    // Setup guest segment access rights
    // https://www.amd.com/system/files/TechDocs/24593.pdf#G10.910849
    vmcs_write(GUEST_CS_AR_BYTES, 0xA09B);
    vmcs_write(GUEST_SS_AR_BYTES, 0xA093);
    vmcs_write(GUEST_DS_AR_BYTES, 0xA093);
    vmcs_write(GUEST_ES_AR_BYTES, 0xA093);
    vmcs_write(GUEST_FS_AR_BYTES, 0xA093);
    vmcs_write(GUEST_GS_AR_BYTES, 0xA093);
    vmcs_write(GUEST_LDTR_AR_BYTES, 0x0082);
    vmcs_write(GUEST_TR_AR_BYTES, 0x008B);

    // Setup GDTR & IDTR
    vmcs_write(GUEST_GDTR_BASE, 0);
    vmcs_write(GUEST_IDTR_BASE, 0);
    vmcs_write(GUEST_GDTR_LIMIT, 0);
    vmcs_write(GUEST_IDTR_LIMIT, 0);

    vmcs_write(GUEST_IA32_EFER, native_read_msr(MSR_EFER));
    vmcs_write64(GUEST_IA32_DEBUGCTL, 0);

    // Setup sysenter primitives
    vmcs_write(GUEST_SYSENTER_CS, 0);
    vmcs_write(GUEST_SYSENTER_ESP, 0);
    vmcs_write(GUEST_SYSENTER_EIP, 0);
}

static void init_vmcs(vmstate *vms){
    VMPTRLD(vms->vmcs_physical);
    vmx_setup_vm_controls();
    vmx_setup_initial_guest_state(vms);
    vmx_setup_initial_host_state(vms);

    vmcs_write64(VMCS_LINK_POINTER, -1ull);

    //vmcs_write(EXCEPTION_BITMAP, 8192);

    vmcs_write64(EPT_POINTER, vms->eptp.value);
    //vmcs_write(VIRTUAL_PROCESSOR_ID, vms->vpid);
}

int vmx_launch(void){
    int cpu = smp_processor_id();
    vmstate *vms = per_cpu(cpu_vms, smp_processor_id());

    printk(KERN_INFO "Launching VM on CPU %d\n", cpu);
    init_vmcs(vms);
    VMLAUNCH();

    put_cpu();
    return 0;
}

int vmx_setup(void){
    int i;
    vmstate* vms;
    printk(KERN_INFO "NUM CPUS: %d\n", num_online_cpus());

    for_each_online_cpu(i){
        vms = create_vmstate();
        vms->vmxon_region = kmalloc(4096, GFP_KERNEL);
        vms->vmxon_physical = virt_to_phys(vms->vmxon_region);
        vms->vmcs_region = kzalloc(4096, GFP_KERNEL);
        vms->vmcs_physical = virt_to_phys(vms->vmcs_region);
        vms->vmm_handle_stack_size = 4096;
        vms->vmm_handle_stack = kmalloc(vms->vmm_handle_stack_size, GFP_KERNEL);
        vms->vpid = get_free_vpid();
        per_cpu(cpu_vms, i) = vms;
    }

    on_each_cpu(prepare_vmx_cpu, NULL, 1);
    printk(KERN_INFO "CPUS prepared!");

    for_each_online_cpu(i){
        vms = per_cpu(cpu_vms, i);
        if(vms->vmx_enabled == false) {
            printk(KERN_ALERT "Tearing down after VMXON failed!");
            vmx_teardown();
            return -1;
        }
    }
    printk(KERN_INFO "VMX turned on for all CPUs!");
    return 0;
}

VMCS dump:

*** Guest State ***
[   72.414906] CR0: actual=0x0000000080050033, shadow=0x0000000080050033, gh_mask=ffffffffffffffff
[   72.416865] CR4: actual=0x00000000000626e0, shadow=0x00000000000626e0, gh_mask=ffffffffffffffff
[   72.419147] CR3 = 0x00000000307ce004
[   72.419950] PDPTR0 = 0x0000000000000000  PDPTR1 = 0x0000000000000000
[   72.421384] PDPTR2 = 0x0000000000000000  PDPTR3 = 0x0000000000000000
[   72.422753] RSP = 0xffff9c9cb31f8fff  RIP = 0xffff9c9cb5005000
[   72.424510] RFLAGS=0x00000002         DR7 = 0x0000000000000000
[   72.426501] Sysenter RSP=0000000000000000 CS:RIP=0000:0000000000000000
[   72.428141] CS:   sel=0x0000, attr=0x0a09b, limit=0xffffffff, base=0x0000000000000000
[   72.430162] DS:   sel=0x0000, attr=0x0a093, limit=0xffffffff, base=0x0000000000000000
[   72.432075] SS:   sel=0x0000, attr=0x0a093, limit=0xffffffff, base=0x0000000000000000
[   72.433982] ES:   sel=0x0000, attr=0x0a093, limit=0xffffffff, base=0x0000000000000000
[   72.436152] FS:   sel=0x0000, attr=0x0a093, limit=0xffffffff, base=0x00007f8e51f0c4c0
[   72.438437] GS:   sel=0x0000, attr=0x0a093, limit=0xffffffff, base=0xffff9c9cbeb00000
[   72.440579] GDTR:                           limit=0x00000000, base=0x0000000000000000
[   72.442241] LDTR: sel=0x0000, attr=0x00082, limit=0x00000000, base=0x0000000000000000
[   72.443414] IDTR:                           limit=0x00000000, base=0x0000000000000000
[   72.444591] TR:   sel=0x0000, attr=0x0008b, limit=0x000000ff, base=0x0000000000000000
[   72.447023] EFER =     0x0000000000000d01  PAT = 0x0000000000000000
[   72.448999] DebugCtl = 0x0000000000000000  DebugExceptions = 0x0000000000000000
[   72.451813] PerfGlobCtl = 0x0000000000000000
[   72.453316] BndCfgS = 0x0000000000000000
[   72.454528] Interruptibility = 00000000  ActivityState = 00000000
[   72.456302] InterruptStatus = 0000
[   72.456997] *** Host State ***
[   72.457622] RIP = 0xffffffffc0789b90  RSP = 0xffff9c9cb5019fff
[   72.458766] CS=0010 SS=0018 DS=0018 ES=0018 FS=0000 GS=0000 TR=0040
[   72.460007] FSBase=00007f8e51f0c4c0 GSBase=ffff9c9cbeb00000 TRBase=0000000000000000
[   72.461588] GDTBase=fffffe000002c000 IDTBase=fffffe0000000000
[   72.462711] CR0=0000000080050033 CR3=00000000307ce004 CR4=00000000000626e0
[   72.464083] Sysenter RSP=fffffe000002d200 CS:RIP=0010:ffffffff848015f0
[   72.465472] EFER = 0x0000000000000d01  PAT = 0x0000000000000000
[   72.467041] PerfGlobCtl = 0x0000000000000000
[   72.468110] *** Control State ***
[   72.469024] PinBased=00000016 CPUBased=8401e1f2 SecondaryExec=0000000a
[   72.470863] EntryControls=000093ff ExitControls=00236fff
[   72.472268] ExceptionBitmap=00000000 PFECmask=00000000 PFECmatch=00000000
[   72.474137] VMEntry: intr_info=00000000 errcode=00000000 ilen=00000000
[   72.475580] VMExit: intr_info=00000000 errcode=00000000 ilen=00000000
[   72.477230]         reason=80000021 qualification=0000000000000000
[   72.478806] IDTVectoring: info=00000000 errcode=00000000
[   72.480156] TSC Offset = 0x0000000000000000
[   72.481316] SVI|RVI = 00|00 TPR Threshold = 0x00
[   72.482305] APIC-access addr = 0x0000000000000000 virt-APIC addr = 0x0000000000000000
[   72.484216] PostedIntrVec = 0x00
[   72.484928] EPT pointer = 0x000003500200005e
[   72.485835] Virtual processor ID = 0x0000

The problem is that the EPTP has non-zero bits above the processor's physical-address width. (I believe the physical-address width of the i3-2130 is 36 bits.)
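
For reference, you can sanity-check this on the host before writing EPT_POINTER. A minimal sketch (check_eptp_width is a hypothetical helper, and the boot_cpu_data/GENMASK_ULL usage is my assumption about how you'd check it in your module, not code from the question):

    #include <linux/bits.h>
    #include <linux/printk.h>
    #include <asm/processor.h>

    /* Hypothetical helper: warn if an EPTP value sets bits above MAXPHYADDR.
     * Assumes MAXPHYADDR < 52 (true for the i3-2130's 36 bits). */
    static void check_eptp_width(u64 eptp)
    {
        unsigned int maxphyaddr = boot_cpu_data.x86_phys_bits;
        u64 illegal = eptp & GENMASK_ULL(51, maxphyaddr);

        if (illegal)
            pr_warn("EPTP 0x%llx has bits above MAXPHYADDR (%u): 0x%llx\n",
                    eptp, maxphyaddr, illegal);
    }

With the code in the question this would flag the dumped EPT pointer 0x000003500200005e, which has bits set well above bit 35.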

This shouldn't be reported as an invalid-guest-state error. It should instead be an invalid-control-field error (VM entry failure with error number 7), which is what I see when I test it on real hardware. I think KVM is virtualizing this error incorrectly.

The only way that enabling EPT can cause an invalid-guest-state error is if the PDPTEs are invalid, which can only happen when the guest paging mode is PAE rather than IA-32e. (Section 26.3.1.6.)

The problem in the code is that the address needs to be shifted right by 12 bits before being stored in the phys_addr field. See the definition of the EPTP in Section 24.6.11. The pml4_phys_addr field should contain bits 35:12 of the physical address; bits 11:0 are not represented (because they are all 0). You can use one of the following solutions:

Option A:

eptp.fields.memtype = 6;
eptp.fields.page_walk = 3;
eptp.fields.accessed_and_dirty_flags_enabled = 1;
eptp.fields.pml4_phys_addr = virt_to_phys(ept_pml4) >> 12;

Option B:

eptp.fields.memtype = 6;
eptp.fields.page_walk = 3;
eptp.fields.accessed_and_dirty_flags_enabled = 1;
eptp.value |= virt_to_phys(ept_pml4);

Option C:

eptp.value = virt_to_phys(ept_pml4);
eptp.fields.memtype = 6;
eptp.fields.page_walk = 3;
eptp.fields.accessed_and_dirty_flags_enabled = 1;

Make similar changes to all of the code that initializes EPT entries.
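
For example, applying the same fix to the page-table setup in alloc_ept would look roughly like this (a sketch reusing the question's structures, and assuming the phys_addr bitfields in EPT_PML4E/EPT_PDPTE/EPT_PDE/EPT_PTE start at bit 12, like pml4_phys_addr in the EPTP):

    for(i = 0; i < initial_pages_count; i++){
        ept_pt[i].fields.read_access = 1;
        ept_pt[i].fields.write_access = 1;
        ept_pt[i].fields.execute_access = 1;
        ept_pt[i].fields.ept_memtype = 6;
        // Store the page-frame number (physical address >> 12), not the raw address
        ept_pt[i].fields.phys_addr = virt_to_phys(kzalloc(4096, GFP_KERNEL | GFP_NOWAIT)) >> 12;
    }

    ept_pd->fields.phys_addr   = virt_to_phys(ept_pt) >> 12;
    ept_pdpt->fields.phys_addr = virt_to_phys(ept_pd) >> 12;
    ept_pml4->fields.phys_addr = virt_to_phys(ept_pdpt) >> 12;

The other access/memtype assignments stay as they are. Note that setup_vm_code would then have to shift back the other way, e.g. phys_to_virt((phys_addr_t)pt[0].fields.phys_addr << 12), when turning those fields back into virtual addresses.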