由于访客状态,在 VMX 中启用 EPT 导致条目失败
Enabling EPT in VMX causes failed entry due to guest state
我正在家里构建管理程序,但在启用 EPT 时我遇到了进入 VMX 的问题。
下面代码用于设置guest模式,成功进入VMX。
但是,当我启用 EPT 时,出现 VMX 条目失败,异常号为 no。 33(由于来宾状态导致 vm 条目失败)。这是我取消注释以下代码的时候:
vmcs_write(SECONDARY_VM_EXEC_CONTROL, adjust_msr_control(
MSR_IA32_VMX_PROCBASED_CTLS2, CPU_BASED_CTL2_RDTSCP | CPU_BASED_CTL2_ENABLE_INVPCID /* | CPU_BASED_CTL2_ENABLE_VPID | CPU_BASED_CTL2_ENABLE_XSAVE_XRSTORS */ | CPU_BASED_CTL2_ENABLE_EPT
)); <--- I add CPU_BASED_CTL2_ENABLE_EPT
vmcs_write64(EPT_POINTER, vms->eptp.value);
我多次查阅 Intel 手册以确保我遵循来宾状态检查,但我不明白为什么只有在启用 EPT 时我的输入才会失败。
任何 suggestion/help 都会有所帮助,谢谢 :)
static noinline void vmwrite_error(unsigned long field, unsigned long value){
printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n",
field, value, (int)(vmcs_read(VM_INSTRUCTION_ERROR)));
dump_stack();
BUG_ON(1);
}
static void vmcs_write(unsigned long field, unsigned long value){
uint8_t err;
__asm__ __volatile__(
"vmwrite %[value],%[field]; setna %[err]"
: [err]"=rm"(err)
: [field]"r"(field), [value]"r"(value)
: "cc", "memory"
);
if(err)
vmwrite_error(field, value);
else
printk(KERN_INFO "vmwrite log: reg %lx value %lx\n", field, value);
}
EPTP alloc_ept(int initial_pages_count){
int i;
EPTP eptp;
EPT_PML4E *ept_pml4;
EPT_PDPTE *ept_pdpt;
EPT_PDE *ept_pd;
EPT_PTE *ept_pt;
eptp.value = 0;
ept_pml4 = kzalloc(4096, GFP_KERNEL | GFP_NOWAIT);
if(!ept_pml4)
goto pml4err;
ept_pdpt = kzalloc(4096, GFP_KERNEL | GFP_NOWAIT);
if(!ept_pdpt)
goto pdpterr;
ept_pd = kzalloc(4096, GFP_KERNEL | GFP_NOWAIT);
if(!ept_pd)
goto pderr;
ept_pt = kzalloc(4096, GFP_KERNEL | GFP_NOWAIT);
if(!ept_pt)
goto pterr;
for(i = 0; i < initial_pages_count; i++){
ept_pt[i].fields.read_access = 1;
ept_pt[i].fields.write_access = 1;
ept_pt[i].fields.execute_access = 1;
ept_pt[i].fields.ept_memtype = 6;
ept_pt[i].fields.phys_addr = virt_to_phys(kzalloc(4096, GFP_KERNEL | GFP_NOWAIT));
}
ept_pd->fields.read_access = 1;
ept_pd->fields.write_access = 1;
ept_pd->fields.execute_access = 1;
ept_pd->fields.phys_addr = virt_to_phys(ept_pt);
ept_pdpt->fields.read_access = 1;
ept_pdpt->fields.write_access = 1;
ept_pdpt->fields.execute_access = 1;
ept_pdpt->fields.phys_addr = virt_to_phys(ept_pd);
ept_pml4->fields.read_access = 1;
ept_pml4->fields.write_access = 1;
ept_pml4->fields.execute_access = 1;
ept_pml4->fields.phys_addr = virt_to_phys(ept_pdpt);
eptp.fields.memtype = 6;
eptp.fields.page_walk = 3;
eptp.fields.accessed_and_dirty_flags_enabled = 1;
eptp.fields.pml4_phys_addr = virt_to_phys(ept_pml4);
return eptp;
pterr:
kfree(ept_pd);
pderr:
kfree(ept_pdpt);
pdpterr:
kfree(ept_pml4);
pml4err:
panic("EPT ALLOC ERROR!");
}
static void setup_vm_code(vmstate *vms){
int i;
EPT_PML4E *pml = phys_to_virt(vms->eptp.fields.pml4_phys_addr);
EPT_PDPTE *pdpt = phys_to_virt(pml->fields.phys_addr);
EPT_PDE *pd = phys_to_virt(pdpt->fields.phys_addr);
EPT_PTE *pt = phys_to_virt(pd->fields.phys_addr);
vms->initial_rip = (unsigned long)phys_to_virt(pt[0].fields.phys_addr);
for(i = 0; i < 4096; i++){
// hlt
*(char*)(vms->initial_rip+i) = 0xf4;
}
printk(KERN_INFO "INITIAL_RIP: %lu", vms->initial_rip);
// Stack grows down
vms->initial_rsp = (unsigned long)phys_to_virt(pt[9].fields.phys_addr) + 4095;
}
static void prepare_vmx_cpu(void *info){
uint32_t vmcs_revid = 0;
uint32_t hi = 0;
vmstate *vms = per_cpu(cpu_vms, smp_processor_id());
// Populate VMCS revision id in vmxon region
rdmsr_safe(MSR_IA32_VMX_BASIC, &vmcs_revid, &hi);
memcpy(vms->vmxon_region, &vmcs_revid, 4);
memcpy(vms->vmcs_region, &vmcs_revid, 4);
vms->eptp = alloc_ept(10);
setup_vm_code(vms);
vmx_enable();
}
//static void handle_vmexit(void) __attribute__((used));
static void handle_vmexit(void){
int exit_reason = vmcs_read32(VM_EXIT_REASON);
int basic_exit_code = exit_reason & 0xffff;
int exit_qualification = vmcs_read32(EXIT_QUALIFICATION);
int vm_entry_failure = exit_reason & 0x80000000;
panic("VMEXIT WITH CODE %d, VM ENTRY FAILURE: %s, QUAL: %d", basic_exit_code, vm_entry_failure ? "true" : "false", exit_qualification);
vmx_dump_cpu();
panic("ERR");
VMRESUME();
//TODO: switch error reasons
}
static void vmx_setup_vm_controls(void){
// VM Execution Controls
vmcs_write(PIN_BASED_VM_EXEC_CONTROL, adjust_msr_control(MSR_IA32_VMX_PINBASED_CTLS, 0));
vmcs_write(CPU_BASED_VM_EXEC_CONTROL, adjust_msr_control(
MSR_IA32_VMX_PROCBASED_CTLS, CPU_BASED_HLT_EXITING | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS));
vmcs_write(SECONDARY_VM_EXEC_CONTROL, adjust_msr_control(
MSR_IA32_VMX_PROCBASED_CTLS2, CPU_BASED_CTL2_RDTSCP | CPU_BASED_CTL2_ENABLE_INVPCID /* | CPU_BASED_CTL2_ENABLE_VPID | CPU_BASED_CTL2_ENABLE_XSAVE_XRSTORS */ | CPU_BASED_CTL2_ENABLE_EPT
));
//vmcs_write64(TSC_OFFSET, 0);
vmcs_write(CR0_READ_SHADOW, read_cr0());
vmcs_write(CR4_READ_SHADOW, __read_cr4());
vmcs_write(CR0_GUEST_HOST_MASK, ~0ul);
vmcs_write(CR4_GUEST_HOST_MASK, ~0ul);
// How many CR3_TARGET_VALUEs are considered without VM exit when MOV CR3, VAL
vmcs_write(CR3_TARGET_COUNT, 0);
// VM Entry & Exit Controls
vmcs_write(VM_EXIT_CONTROLS, adjust_msr_control(MSR_IA32_VMX_EXIT_CTLS, VM_EXIT_IA32E_MODE | VM_EXIT_LOAD_IA32_EFER | VM_EXIT_HOST_ADDR_SPACE_SIZE));
vmcs_write(VM_ENTRY_CONTROLS, adjust_msr_control(MSR_IA32_VMX_ENTRY_CTLS, VM_ENTRY_IA32E_MODE | VM_ENTRY_LOAD_IA32_EFER));
}
static void vmx_setup_initial_host_state(vmstate *vms){
struct desc_ptr gdtptr, idt;
vmcs_write(HOST_CR0, read_cr0());
vmcs_write(HOST_CR3, __read_cr3());
vmcs_write(HOST_CR4, __read_cr4());
vmcs_write(HOST_RSP, (unsigned long)vms->vmm_handle_stack + vms->vmm_handle_stack_size - 1);
vmcs_write(HOST_RIP, (unsigned long)handle_vmexit);
/* An explanation of segment selectors: https://medium.com/hungys-blog/linux-kernel-memory-addressing-a0d304283af3 */
// Segment Selectors
vmcs_write(HOST_CS_SELECTOR, __KERNEL_CS);
vmcs_write(HOST_DS_SELECTOR, __KERNEL_DS);
vmcs_write(HOST_ES_SELECTOR, __KERNEL_DS);
vmcs_write(HOST_SS_SELECTOR, __KERNEL_DS);
vmcs_write(HOST_FS_SELECTOR, 0);
vmcs_write(HOST_GS_SELECTOR, 0);
vmcs_write(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8);
// Segment Base Adresses
vmcs_write(HOST_FS_BASE, native_read_msr(MSR_FS_BASE));
vmcs_write(HOST_GS_BASE, native_read_msr(MSR_GS_BASE));
vmcs_write(HOST_TR_BASE, read_tr_base());
native_store_gdt(&gdtptr);
vmcs_write(HOST_GDTR_BASE, gdtptr.address);
store_idt(&idt);
vmcs_write(HOST_IDTR_BASE, idt.address);
// MSRs
vmcs_write(HOST_IA32_SYSENTER_CS, native_read_msr(MSR_IA32_SYSENTER_CS));
vmcs_write(HOST_IA32_SYSENTER_ESP, native_read_msr(MSR_IA32_SYSENTER_ESP));
vmcs_write(HOST_IA32_SYSENTER_EIP, native_read_msr(MSR_IA32_SYSENTER_EIP));
vmcs_write64(HOST_IA32_EFER, native_read_msr(MSR_EFER));
}
static void RIPTEST(void) __attribute__((used));
static void RIPTEST(void){
__asm__ __volatile__("hlt; hlt; hlt; hlt; hlt; hlt");
}
static void vmx_setup_initial_guest_state(vmstate *vms){
vmcs_write(GUEST_CR0, read_cr0());
vmcs_write(GUEST_CR3, __read_cr3());
vmcs_write(GUEST_CR4, __read_cr4());
vmcs_write(GUEST_DR7, 0);
vmcs_write(GUEST_RIP, vms->initial_rip);
//vmcs_write(GUEST_RIP, (unsigned long)RIPTEST);
vmcs_write(GUEST_RSP, vms->initial_rsp);
vmcs_write(GUEST_RFLAGS, 0x2); // Reserved flag
// Setup selectors
vmcs_write(GUEST_CS_SELECTOR, 0);
vmcs_write(GUEST_SS_SELECTOR, 0);
vmcs_write(GUEST_DS_SELECTOR, 0);
vmcs_write(GUEST_ES_SELECTOR, 0);
vmcs_write(GUEST_FS_SELECTOR, 0);
vmcs_write(GUEST_GS_SELECTOR, 0);
vmcs_write(GUEST_LDTR_SELECTOR, 0);
vmcs_write(GUEST_TR_SELECTOR, 0);
// Setup base addresses
vmcs_write(GUEST_CS_BASE, 0);
vmcs_write(GUEST_SS_BASE, 0);
vmcs_write(GUEST_DS_BASE, 0);
vmcs_write(GUEST_ES_BASE, 0);
vmcs_write(GUEST_FS_BASE, native_read_msr(MSR_FS_BASE));
vmcs_write(GUEST_GS_BASE, native_read_msr(MSR_GS_BASE));
vmcs_write(GUEST_LDTR_BASE, 0);
vmcs_write(GUEST_TR_BASE, 0);
// Setup guest segment limits
vmcs_write(GUEST_CS_LIMIT, 0xFFFFFFFF);
vmcs_write(GUEST_SS_LIMIT, 0xFFFFFFFF);
vmcs_write(GUEST_DS_LIMIT, 0xFFFFFFFF);
vmcs_write(GUEST_ES_LIMIT, 0xFFFFFFFF);
vmcs_write(GUEST_FS_LIMIT, 0xFFFFFFFF);
vmcs_write(GUEST_GS_LIMIT, 0xFFFFFFFF);
vmcs_write(GUEST_LDTR_LIMIT, 0);
vmcs_write(GUEST_TR_LIMIT, 0xFF);
// Setup guest segment access rights
// https://www.amd.com/system/files/TechDocs/24593.pdf#G10.910849
vmcs_write(GUEST_CS_AR_BYTES, 0xA09B);
vmcs_write(GUEST_SS_AR_BYTES, 0xA093);
vmcs_write(GUEST_DS_AR_BYTES, 0xA093);
vmcs_write(GUEST_ES_AR_BYTES, 0xA093);
vmcs_write(GUEST_FS_AR_BYTES, 0xA093);
vmcs_write(GUEST_GS_AR_BYTES, 0xA093);
vmcs_write(GUEST_LDTR_AR_BYTES, 0x0082);
vmcs_write(GUEST_TR_AR_BYTES, 0x008B);
// Setup GDTR & IDTR
vmcs_write(GUEST_GDTR_BASE, 0);
vmcs_write(GUEST_IDTR_BASE, 0);
vmcs_write(GUEST_GDTR_LIMIT, 0);
vmcs_write(GUEST_IDTR_LIMIT, 0);
vmcs_write(GUEST_IA32_EFER, native_read_msr(MSR_EFER));
vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
// Setup sysenter primitives
vmcs_write(GUEST_SYSENTER_CS, 0);
vmcs_write(GUEST_SYSENTER_ESP, 0);
vmcs_write(GUEST_SYSENTER_EIP, 0);
}
static void init_vmcs(vmstate *vms){
VMPTRLD(vms->vmcs_physical);
vmx_setup_vm_controls();
vmx_setup_initial_guest_state(vms);
vmx_setup_initial_host_state(vms);
vmcs_write64(VMCS_LINK_POINTER, -1ull);
//vmcs_write(EXCEPTION_BITMAP, 8192);
vmcs_write64(EPT_POINTER, vms->eptp.value);
//vmcs_write(VIRTUAL_PROCESSOR_ID, vms->vpid);
}
int vmx_launch(void){
int cpu = smp_processor_id();
vmstate *vms = per_cpu(cpu_vms, smp_processor_id());
printk(KERN_INFO "Launching VM on CPU %d\n", cpu);
init_vmcs(vms);
VMLAUNCH();
put_cpu();
return 0;
}
int vmx_setup(void){
int i;
vmstate* vms;
printk(KERN_INFO "NUM CPUS: %d\n", num_online_cpus());
for_each_online_cpu(i){
vms = create_vmstate();
vms->vmxon_region = kmalloc(4096, GFP_KERNEL);
vms->vmxon_physical = virt_to_phys(vms->vmxon_region);
vms->vmcs_region = kzalloc(4096, GFP_KERNEL);
vms->vmcs_physical = virt_to_phys(vms->vmcs_region);
vms->vmm_handle_stack_size = 4096;
vms->vmm_handle_stack = kmalloc(vms->vmm_handle_stack_size, GFP_KERNEL);
vms->vpid = get_free_vpid();
per_cpu(cpu_vms, i) = vms;
}
on_each_cpu(prepare_vmx_cpu, NULL, 1);
printk(KERN_INFO "CPUS prepared!");
for_each_online_cpu(i){
vms = per_cpu(cpu_vms, i);
if(vms->vmx_enabled == false) {
printk(KERN_ALERT "Tearing down after VMXON failed!");
vmx_teardown();
return -1;
}
}
printk(KERN_INFO "VMX turned on for all CPUs!");
return 0;
}
VMCS 转储:
***Guest State***
[ 72.414906] CR0: actual=0x0000000080050033, shadow=0x0000000080050033, gh_mask=ffffffffffffffff
[ 72.416865] CR4: actual=0x00000000000626e0, shadow=0x00000000000626e0, gh_mask=ffffffffffffffff
[ 72.419147] CR3 = 0x00000000307ce004
[ 72.419950] PDPTR0 = 0x0000000000000000 PDPTR1 = 0x0000000000000000
[ 72.421384] PDPTR2 = 0x0000000000000000 PDPTR3 = 0x0000000000000000
[ 72.422753] RSP = 0xffff9c9cb31f8fff RIP = 0xffff9c9cb5005000
[ 72.424510] RFLAGS=0x00000002 DR7 = 0x0000000000000000
[ 72.426501] Sysenter RSP=0000000000000000 CS:RIP=0000:0000000000000000
[ 72.428141] CS: sel=0x0000, attr=0x0a09b, limit=0xffffffff, base=0x0000000000000000
[ 72.430162] DS: sel=0x0000, attr=0x0a093, limit=0xffffffff, base=0x0000000000000000
[ 72.432075] SS: sel=0x0000, attr=0x0a093, limit=0xffffffff, base=0x0000000000000000
[ 72.433982] ES: sel=0x0000, attr=0x0a093, limit=0xffffffff, base=0x0000000000000000
[ 72.436152] FS: sel=0x0000, attr=0x0a093, limit=0xffffffff, base=0x00007f8e51f0c4c0
[ 72.438437] GS: sel=0x0000, attr=0x0a093, limit=0xffffffff, base=0xffff9c9cbeb00000
[ 72.440579] GDTR: limit=0x00000000, base=0x0000000000000000
[ 72.442241] LDTR: sel=0x0000, attr=0x00082, limit=0x00000000, base=0x0000000000000000
[ 72.443414] IDTR: limit=0x00000000, base=0x0000000000000000
[ 72.444591] TR: sel=0x0000, attr=0x0008b, limit=0x000000ff, base=0x0000000000000000
[ 72.447023] EFER = 0x0000000000000d01 PAT = 0x0000000000000000
[ 72.448999] DebugCtl = 0x0000000000000000 DebugExceptions = 0x0000000000000000
[ 72.451813] PerfGlobCtl = 0x0000000000000000
[ 72.453316] BndCfgS = 0x0000000000000000
[ 72.454528] Interruptibility = 00000000 ActivityState = 00000000
[ 72.456302] InterruptStatus = 0000
[ 72.456997] *** Host State ***
[ 72.457622] RIP = 0xffffffffc0789b90 RSP = 0xffff9c9cb5019fff
[ 72.458766] CS=0010 SS=0018 DS=0018 ES=0018 FS=0000 GS=0000 TR=0040
[ 72.460007] FSBase=00007f8e51f0c4c0 GSBase=ffff9c9cbeb00000 TRBase=0000000000000000
[ 72.461588] GDTBase=fffffe000002c000 IDTBase=fffffe0000000000
[ 72.462711] CR0=0000000080050033 CR3=00000000307ce004 CR4=00000000000626e0
[ 72.464083] Sysenter RSP=fffffe000002d200 CS:RIP=0010:ffffffff848015f0
[ 72.465472] EFER = 0x0000000000000d01 PAT = 0x0000000000000000
[ 72.467041] PerfGlobCtl = 0x0000000000000000
[ 72.468110] *** Control State ***
[ 72.469024] PinBased=00000016 CPUBased=8401e1f2 SecondaryExec=0000000a
[ 72.470863] EntryControls=000093ff ExitControls=00236fff
[ 72.472268] ExceptionBitmap=00000000 PFECmask=00000000 PFECmatch=00000000
[ 72.474137] VMEntry: intr_info=00000000 errcode=00000000 ilen=00000000
[ 72.475580] VMExit: intr_info=00000000 errcode=00000000 ilen=00000000
[ 72.477230] reason=80000021 qualification=0000000000000000
[ 72.478806] IDTVectoring: info=00000000 errcode=00000000
[ 72.480156] TSC Offset = 0x0000000000000000
[ 72.481316] SVI|RVI = 00|00 TPR Threshold = 0x00
[ 72.482305] APIC-access addr = 0x0000000000000000 virt-APIC addr = 0x0000000000000000
[ 72.484216] PostedIntrVec = 0x00
[ 72.484928] EPT pointer = 0x000003500200005e
[ 72.485835] Virtual processor ID = 0x0000
问题是 EPTP 在处理器物理地址宽度之上有非零位。 (我认为i3-2130的物理地址宽度是36位。)
不应将此报告为无效的访客状态错误。相反,它应该是一个无效的控制字段错误(错误代码为 7 的 VM 条目失败),这是我在真实硬件上测试它时看到的。我认为 KVM 错误地虚拟化了这个错误。
如果 PDPTE 无效,启用 EPT 会导致无效访客状态错误的唯一方法,只有当访客寻呼模式为 PAE 而不是 ia32e 时才会发生这种情况。 (第 26.3.1.6 节。)
代码中的问题是在将地址存储到phys_addr 字段之前需要将地址右移12 位。
请参阅第 24.6.11 节中 EPTP 的定义。 pml4_phys_addr 字段应包含物理地址的 35:12 位。位 11:0 未表示(因为它们都是 0)。
您可以使用以下解决方案之一:
选项A:
eptp.fields.memtype = 6;
eptp.fields.page_walk = 3;
eptp.fields.accessed_and_dirty_flags_enabled = 1;
eptp.fields.pml4_phys_addr = virt_to_phys(ept_pml4) >> 12;
选项 B:
eptp.fields.memtype = 6;
eptp.fields.page_walk = 3;
eptp.fields.accessed_and_dirty_flags_enabled = 1;
eptp.value |= virt_to_phys(ept_pml4);
选项 C:
eptp.value = virt_to_phys(ept_pml4);
eptp.fields.memtype = 6;
eptp.fields.page_walk = 3;
eptp.fields.accessed_and_dirty_flags_enabled = 1;
对初始化 EPT 条目的所有代码进行类似的更改。
我正在家里构建管理程序,但在启用 EPT 时我遇到了进入 VMX 的问题。 下面代码用于设置guest模式,成功进入VMX。 但是,当我启用 EPT 时,出现 VMX 条目失败,异常号为 no。 33(由于来宾状态导致 vm 条目失败)。这是我取消注释以下代码的时候:
vmcs_write(SECONDARY_VM_EXEC_CONTROL, adjust_msr_control(
MSR_IA32_VMX_PROCBASED_CTLS2, CPU_BASED_CTL2_RDTSCP | CPU_BASED_CTL2_ENABLE_INVPCID /* | CPU_BASED_CTL2_ENABLE_VPID | CPU_BASED_CTL2_ENABLE_XSAVE_XRSTORS */ | CPU_BASED_CTL2_ENABLE_EPT
)); <--- I add CPU_BASED_CTL2_ENABLE_EPT
vmcs_write64(EPT_POINTER, vms->eptp.value);
我多次查阅 Intel 手册以确保我遵循来宾状态检查,但我不明白为什么只有在启用 EPT 时我的输入才会失败。 任何 suggestion/help 都会有所帮助,谢谢 :)
static noinline void vmwrite_error(unsigned long field, unsigned long value){
printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n",
field, value, (int)(vmcs_read(VM_INSTRUCTION_ERROR)));
dump_stack();
BUG_ON(1);
}
static void vmcs_write(unsigned long field, unsigned long value){
uint8_t err;
__asm__ __volatile__(
"vmwrite %[value],%[field]; setna %[err]"
: [err]"=rm"(err)
: [field]"r"(field), [value]"r"(value)
: "cc", "memory"
);
if(err)
vmwrite_error(field, value);
else
printk(KERN_INFO "vmwrite log: reg %lx value %lx\n", field, value);
}
EPTP alloc_ept(int initial_pages_count){
int i;
EPTP eptp;
EPT_PML4E *ept_pml4;
EPT_PDPTE *ept_pdpt;
EPT_PDE *ept_pd;
EPT_PTE *ept_pt;
eptp.value = 0;
ept_pml4 = kzalloc(4096, GFP_KERNEL | GFP_NOWAIT);
if(!ept_pml4)
goto pml4err;
ept_pdpt = kzalloc(4096, GFP_KERNEL | GFP_NOWAIT);
if(!ept_pdpt)
goto pdpterr;
ept_pd = kzalloc(4096, GFP_KERNEL | GFP_NOWAIT);
if(!ept_pd)
goto pderr;
ept_pt = kzalloc(4096, GFP_KERNEL | GFP_NOWAIT);
if(!ept_pt)
goto pterr;
for(i = 0; i < initial_pages_count; i++){
ept_pt[i].fields.read_access = 1;
ept_pt[i].fields.write_access = 1;
ept_pt[i].fields.execute_access = 1;
ept_pt[i].fields.ept_memtype = 6;
ept_pt[i].fields.phys_addr = virt_to_phys(kzalloc(4096, GFP_KERNEL | GFP_NOWAIT));
}
ept_pd->fields.read_access = 1;
ept_pd->fields.write_access = 1;
ept_pd->fields.execute_access = 1;
ept_pd->fields.phys_addr = virt_to_phys(ept_pt);
ept_pdpt->fields.read_access = 1;
ept_pdpt->fields.write_access = 1;
ept_pdpt->fields.execute_access = 1;
ept_pdpt->fields.phys_addr = virt_to_phys(ept_pd);
ept_pml4->fields.read_access = 1;
ept_pml4->fields.write_access = 1;
ept_pml4->fields.execute_access = 1;
ept_pml4->fields.phys_addr = virt_to_phys(ept_pdpt);
eptp.fields.memtype = 6;
eptp.fields.page_walk = 3;
eptp.fields.accessed_and_dirty_flags_enabled = 1;
eptp.fields.pml4_phys_addr = virt_to_phys(ept_pml4);
return eptp;
pterr:
kfree(ept_pd);
pderr:
kfree(ept_pdpt);
pdpterr:
kfree(ept_pml4);
pml4err:
panic("EPT ALLOC ERROR!");
}
static void setup_vm_code(vmstate *vms){
int i;
EPT_PML4E *pml = phys_to_virt(vms->eptp.fields.pml4_phys_addr);
EPT_PDPTE *pdpt = phys_to_virt(pml->fields.phys_addr);
EPT_PDE *pd = phys_to_virt(pdpt->fields.phys_addr);
EPT_PTE *pt = phys_to_virt(pd->fields.phys_addr);
vms->initial_rip = (unsigned long)phys_to_virt(pt[0].fields.phys_addr);
for(i = 0; i < 4096; i++){
// hlt
*(char*)(vms->initial_rip+i) = 0xf4;
}
printk(KERN_INFO "INITIAL_RIP: %lu", vms->initial_rip);
// Stack grows down
vms->initial_rsp = (unsigned long)phys_to_virt(pt[9].fields.phys_addr) + 4095;
}
static void prepare_vmx_cpu(void *info){
uint32_t vmcs_revid = 0;
uint32_t hi = 0;
vmstate *vms = per_cpu(cpu_vms, smp_processor_id());
// Populate VMCS revision id in vmxon region
rdmsr_safe(MSR_IA32_VMX_BASIC, &vmcs_revid, &hi);
memcpy(vms->vmxon_region, &vmcs_revid, 4);
memcpy(vms->vmcs_region, &vmcs_revid, 4);
vms->eptp = alloc_ept(10);
setup_vm_code(vms);
vmx_enable();
}
//static void handle_vmexit(void) __attribute__((used));
static void handle_vmexit(void){
int exit_reason = vmcs_read32(VM_EXIT_REASON);
int basic_exit_code = exit_reason & 0xffff;
int exit_qualification = vmcs_read32(EXIT_QUALIFICATION);
int vm_entry_failure = exit_reason & 0x80000000;
panic("VMEXIT WITH CODE %d, VM ENTRY FAILURE: %s, QUAL: %d", basic_exit_code, vm_entry_failure ? "true" : "false", exit_qualification);
vmx_dump_cpu();
panic("ERR");
VMRESUME();
//TODO: switch error reasons
}
static void vmx_setup_vm_controls(void){
// VM Execution Controls
vmcs_write(PIN_BASED_VM_EXEC_CONTROL, adjust_msr_control(MSR_IA32_VMX_PINBASED_CTLS, 0));
vmcs_write(CPU_BASED_VM_EXEC_CONTROL, adjust_msr_control(
MSR_IA32_VMX_PROCBASED_CTLS, CPU_BASED_HLT_EXITING | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS));
vmcs_write(SECONDARY_VM_EXEC_CONTROL, adjust_msr_control(
MSR_IA32_VMX_PROCBASED_CTLS2, CPU_BASED_CTL2_RDTSCP | CPU_BASED_CTL2_ENABLE_INVPCID /* | CPU_BASED_CTL2_ENABLE_VPID | CPU_BASED_CTL2_ENABLE_XSAVE_XRSTORS */ | CPU_BASED_CTL2_ENABLE_EPT
));
//vmcs_write64(TSC_OFFSET, 0);
vmcs_write(CR0_READ_SHADOW, read_cr0());
vmcs_write(CR4_READ_SHADOW, __read_cr4());
vmcs_write(CR0_GUEST_HOST_MASK, ~0ul);
vmcs_write(CR4_GUEST_HOST_MASK, ~0ul);
// How many CR3_TARGET_VALUEs are considered without VM exit when MOV CR3, VAL
vmcs_write(CR3_TARGET_COUNT, 0);
// VM Entry & Exit Controls
vmcs_write(VM_EXIT_CONTROLS, adjust_msr_control(MSR_IA32_VMX_EXIT_CTLS, VM_EXIT_IA32E_MODE | VM_EXIT_LOAD_IA32_EFER | VM_EXIT_HOST_ADDR_SPACE_SIZE));
vmcs_write(VM_ENTRY_CONTROLS, adjust_msr_control(MSR_IA32_VMX_ENTRY_CTLS, VM_ENTRY_IA32E_MODE | VM_ENTRY_LOAD_IA32_EFER));
}
static void vmx_setup_initial_host_state(vmstate *vms){
struct desc_ptr gdtptr, idt;
vmcs_write(HOST_CR0, read_cr0());
vmcs_write(HOST_CR3, __read_cr3());
vmcs_write(HOST_CR4, __read_cr4());
vmcs_write(HOST_RSP, (unsigned long)vms->vmm_handle_stack + vms->vmm_handle_stack_size - 1);
vmcs_write(HOST_RIP, (unsigned long)handle_vmexit);
/* An explanation of segment selectors: https://medium.com/hungys-blog/linux-kernel-memory-addressing-a0d304283af3 */
// Segment Selectors
vmcs_write(HOST_CS_SELECTOR, __KERNEL_CS);
vmcs_write(HOST_DS_SELECTOR, __KERNEL_DS);
vmcs_write(HOST_ES_SELECTOR, __KERNEL_DS);
vmcs_write(HOST_SS_SELECTOR, __KERNEL_DS);
vmcs_write(HOST_FS_SELECTOR, 0);
vmcs_write(HOST_GS_SELECTOR, 0);
vmcs_write(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8);
// Segment Base Adresses
vmcs_write(HOST_FS_BASE, native_read_msr(MSR_FS_BASE));
vmcs_write(HOST_GS_BASE, native_read_msr(MSR_GS_BASE));
vmcs_write(HOST_TR_BASE, read_tr_base());
native_store_gdt(&gdtptr);
vmcs_write(HOST_GDTR_BASE, gdtptr.address);
store_idt(&idt);
vmcs_write(HOST_IDTR_BASE, idt.address);
// MSRs
vmcs_write(HOST_IA32_SYSENTER_CS, native_read_msr(MSR_IA32_SYSENTER_CS));
vmcs_write(HOST_IA32_SYSENTER_ESP, native_read_msr(MSR_IA32_SYSENTER_ESP));
vmcs_write(HOST_IA32_SYSENTER_EIP, native_read_msr(MSR_IA32_SYSENTER_EIP));
vmcs_write64(HOST_IA32_EFER, native_read_msr(MSR_EFER));
}
static void RIPTEST(void) __attribute__((used));
static void RIPTEST(void){
__asm__ __volatile__("hlt; hlt; hlt; hlt; hlt; hlt");
}
static void vmx_setup_initial_guest_state(vmstate *vms){
vmcs_write(GUEST_CR0, read_cr0());
vmcs_write(GUEST_CR3, __read_cr3());
vmcs_write(GUEST_CR4, __read_cr4());
vmcs_write(GUEST_DR7, 0);
vmcs_write(GUEST_RIP, vms->initial_rip);
//vmcs_write(GUEST_RIP, (unsigned long)RIPTEST);
vmcs_write(GUEST_RSP, vms->initial_rsp);
vmcs_write(GUEST_RFLAGS, 0x2); // Reserved flag
// Setup selectors
vmcs_write(GUEST_CS_SELECTOR, 0);
vmcs_write(GUEST_SS_SELECTOR, 0);
vmcs_write(GUEST_DS_SELECTOR, 0);
vmcs_write(GUEST_ES_SELECTOR, 0);
vmcs_write(GUEST_FS_SELECTOR, 0);
vmcs_write(GUEST_GS_SELECTOR, 0);
vmcs_write(GUEST_LDTR_SELECTOR, 0);
vmcs_write(GUEST_TR_SELECTOR, 0);
// Setup base addresses
vmcs_write(GUEST_CS_BASE, 0);
vmcs_write(GUEST_SS_BASE, 0);
vmcs_write(GUEST_DS_BASE, 0);
vmcs_write(GUEST_ES_BASE, 0);
vmcs_write(GUEST_FS_BASE, native_read_msr(MSR_FS_BASE));
vmcs_write(GUEST_GS_BASE, native_read_msr(MSR_GS_BASE));
vmcs_write(GUEST_LDTR_BASE, 0);
vmcs_write(GUEST_TR_BASE, 0);
// Setup guest segment limits
vmcs_write(GUEST_CS_LIMIT, 0xFFFFFFFF);
vmcs_write(GUEST_SS_LIMIT, 0xFFFFFFFF);
vmcs_write(GUEST_DS_LIMIT, 0xFFFFFFFF);
vmcs_write(GUEST_ES_LIMIT, 0xFFFFFFFF);
vmcs_write(GUEST_FS_LIMIT, 0xFFFFFFFF);
vmcs_write(GUEST_GS_LIMIT, 0xFFFFFFFF);
vmcs_write(GUEST_LDTR_LIMIT, 0);
vmcs_write(GUEST_TR_LIMIT, 0xFF);
// Setup guest segment access rights
// https://www.amd.com/system/files/TechDocs/24593.pdf#G10.910849
vmcs_write(GUEST_CS_AR_BYTES, 0xA09B);
vmcs_write(GUEST_SS_AR_BYTES, 0xA093);
vmcs_write(GUEST_DS_AR_BYTES, 0xA093);
vmcs_write(GUEST_ES_AR_BYTES, 0xA093);
vmcs_write(GUEST_FS_AR_BYTES, 0xA093);
vmcs_write(GUEST_GS_AR_BYTES, 0xA093);
vmcs_write(GUEST_LDTR_AR_BYTES, 0x0082);
vmcs_write(GUEST_TR_AR_BYTES, 0x008B);
// Setup GDTR & IDTR
vmcs_write(GUEST_GDTR_BASE, 0);
vmcs_write(GUEST_IDTR_BASE, 0);
vmcs_write(GUEST_GDTR_LIMIT, 0);
vmcs_write(GUEST_IDTR_LIMIT, 0);
vmcs_write(GUEST_IA32_EFER, native_read_msr(MSR_EFER));
vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
// Setup sysenter primitives
vmcs_write(GUEST_SYSENTER_CS, 0);
vmcs_write(GUEST_SYSENTER_ESP, 0);
vmcs_write(GUEST_SYSENTER_EIP, 0);
}
static void init_vmcs(vmstate *vms){
VMPTRLD(vms->vmcs_physical);
vmx_setup_vm_controls();
vmx_setup_initial_guest_state(vms);
vmx_setup_initial_host_state(vms);
vmcs_write64(VMCS_LINK_POINTER, -1ull);
//vmcs_write(EXCEPTION_BITMAP, 8192);
vmcs_write64(EPT_POINTER, vms->eptp.value);
//vmcs_write(VIRTUAL_PROCESSOR_ID, vms->vpid);
}
int vmx_launch(void){
int cpu = smp_processor_id();
vmstate *vms = per_cpu(cpu_vms, smp_processor_id());
printk(KERN_INFO "Launching VM on CPU %d\n", cpu);
init_vmcs(vms);
VMLAUNCH();
put_cpu();
return 0;
}
int vmx_setup(void){
int i;
vmstate* vms;
printk(KERN_INFO "NUM CPUS: %d\n", num_online_cpus());
for_each_online_cpu(i){
vms = create_vmstate();
vms->vmxon_region = kmalloc(4096, GFP_KERNEL);
vms->vmxon_physical = virt_to_phys(vms->vmxon_region);
vms->vmcs_region = kzalloc(4096, GFP_KERNEL);
vms->vmcs_physical = virt_to_phys(vms->vmcs_region);
vms->vmm_handle_stack_size = 4096;
vms->vmm_handle_stack = kmalloc(vms->vmm_handle_stack_size, GFP_KERNEL);
vms->vpid = get_free_vpid();
per_cpu(cpu_vms, i) = vms;
}
on_each_cpu(prepare_vmx_cpu, NULL, 1);
printk(KERN_INFO "CPUS prepared!");
for_each_online_cpu(i){
vms = per_cpu(cpu_vms, i);
if(vms->vmx_enabled == false) {
printk(KERN_ALERT "Tearing down after VMXON failed!");
vmx_teardown();
return -1;
}
}
printk(KERN_INFO "VMX turned on for all CPUs!");
return 0;
}
VMCS 转储:
***Guest State***
[ 72.414906] CR0: actual=0x0000000080050033, shadow=0x0000000080050033, gh_mask=ffffffffffffffff
[ 72.416865] CR4: actual=0x00000000000626e0, shadow=0x00000000000626e0, gh_mask=ffffffffffffffff
[ 72.419147] CR3 = 0x00000000307ce004
[ 72.419950] PDPTR0 = 0x0000000000000000 PDPTR1 = 0x0000000000000000
[ 72.421384] PDPTR2 = 0x0000000000000000 PDPTR3 = 0x0000000000000000
[ 72.422753] RSP = 0xffff9c9cb31f8fff RIP = 0xffff9c9cb5005000
[ 72.424510] RFLAGS=0x00000002 DR7 = 0x0000000000000000
[ 72.426501] Sysenter RSP=0000000000000000 CS:RIP=0000:0000000000000000
[ 72.428141] CS: sel=0x0000, attr=0x0a09b, limit=0xffffffff, base=0x0000000000000000
[ 72.430162] DS: sel=0x0000, attr=0x0a093, limit=0xffffffff, base=0x0000000000000000
[ 72.432075] SS: sel=0x0000, attr=0x0a093, limit=0xffffffff, base=0x0000000000000000
[ 72.433982] ES: sel=0x0000, attr=0x0a093, limit=0xffffffff, base=0x0000000000000000
[ 72.436152] FS: sel=0x0000, attr=0x0a093, limit=0xffffffff, base=0x00007f8e51f0c4c0
[ 72.438437] GS: sel=0x0000, attr=0x0a093, limit=0xffffffff, base=0xffff9c9cbeb00000
[ 72.440579] GDTR: limit=0x00000000, base=0x0000000000000000
[ 72.442241] LDTR: sel=0x0000, attr=0x00082, limit=0x00000000, base=0x0000000000000000
[ 72.443414] IDTR: limit=0x00000000, base=0x0000000000000000
[ 72.444591] TR: sel=0x0000, attr=0x0008b, limit=0x000000ff, base=0x0000000000000000
[ 72.447023] EFER = 0x0000000000000d01 PAT = 0x0000000000000000
[ 72.448999] DebugCtl = 0x0000000000000000 DebugExceptions = 0x0000000000000000
[ 72.451813] PerfGlobCtl = 0x0000000000000000
[ 72.453316] BndCfgS = 0x0000000000000000
[ 72.454528] Interruptibility = 00000000 ActivityState = 00000000
[ 72.456302] InterruptStatus = 0000
[ 72.456997] *** Host State ***
[ 72.457622] RIP = 0xffffffffc0789b90 RSP = 0xffff9c9cb5019fff
[ 72.458766] CS=0010 SS=0018 DS=0018 ES=0018 FS=0000 GS=0000 TR=0040
[ 72.460007] FSBase=00007f8e51f0c4c0 GSBase=ffff9c9cbeb00000 TRBase=0000000000000000
[ 72.461588] GDTBase=fffffe000002c000 IDTBase=fffffe0000000000
[ 72.462711] CR0=0000000080050033 CR3=00000000307ce004 CR4=00000000000626e0
[ 72.464083] Sysenter RSP=fffffe000002d200 CS:RIP=0010:ffffffff848015f0
[ 72.465472] EFER = 0x0000000000000d01 PAT = 0x0000000000000000
[ 72.467041] PerfGlobCtl = 0x0000000000000000
[ 72.468110] *** Control State ***
[ 72.469024] PinBased=00000016 CPUBased=8401e1f2 SecondaryExec=0000000a
[ 72.470863] EntryControls=000093ff ExitControls=00236fff
[ 72.472268] ExceptionBitmap=00000000 PFECmask=00000000 PFECmatch=00000000
[ 72.474137] VMEntry: intr_info=00000000 errcode=00000000 ilen=00000000
[ 72.475580] VMExit: intr_info=00000000 errcode=00000000 ilen=00000000
[ 72.477230] reason=80000021 qualification=0000000000000000
[ 72.478806] IDTVectoring: info=00000000 errcode=00000000
[ 72.480156] TSC Offset = 0x0000000000000000
[ 72.481316] SVI|RVI = 00|00 TPR Threshold = 0x00
[ 72.482305] APIC-access addr = 0x0000000000000000 virt-APIC addr = 0x0000000000000000
[ 72.484216] PostedIntrVec = 0x00
[ 72.484928] EPT pointer = 0x000003500200005e
[ 72.485835] Virtual processor ID = 0x0000
问题是 EPTP 在处理器物理地址宽度之上有非零位。 (我认为i3-2130的物理地址宽度是36位。)
不应将此报告为无效的访客状态错误。相反,它应该是一个无效的控制字段错误(错误代码为 7 的 VM 条目失败),这是我在真实硬件上测试它时看到的。我认为 KVM 错误地虚拟化了这个错误。
如果 PDPTE 无效,启用 EPT 会导致无效访客状态错误的唯一方法,只有当访客寻呼模式为 PAE 而不是 ia32e 时才会发生这种情况。 (第 26.3.1.6 节。)
代码中的问题是在将地址存储到phys_addr 字段之前需要将地址右移12 位。 请参阅第 24.6.11 节中 EPTP 的定义。 pml4_phys_addr 字段应包含物理地址的 35:12 位。位 11:0 未表示(因为它们都是 0)。 您可以使用以下解决方案之一:
选项A:
eptp.fields.memtype = 6;
eptp.fields.page_walk = 3;
eptp.fields.accessed_and_dirty_flags_enabled = 1;
eptp.fields.pml4_phys_addr = virt_to_phys(ept_pml4) >> 12;
选项 B:
eptp.fields.memtype = 6;
eptp.fields.page_walk = 3;
eptp.fields.accessed_and_dirty_flags_enabled = 1;
eptp.value |= virt_to_phys(ept_pml4);
选项 C:
eptp.value = virt_to_phys(ept_pml4);
eptp.fields.memtype = 6;
eptp.fields.page_walk = 3;
eptp.fields.accessed_and_dirty_flags_enabled = 1;
对初始化 EPT 条目的所有代码进行类似的更改。