/*
 * This file is part of the Palacios Virtual Machine Monitor developed
 * by the V3VEE Project with funding from the United States National
 * Science Foundation and the Department of Energy.
 *
 * The V3VEE Project is a joint project between Northwestern University
 * and the University of New Mexico.  You can find out more at
 * http://www.v3vee.org
 *
 * Copyright (c) 2008, Peter Dinda <pdinda@northwestern.edu>
 * Copyright (c) 2008, Jack Lange <jarusl@cs.northwestern.edu>
 * Copyright (c) 2008, The V3VEE Project <http://www.v3vee.org>
 * All rights reserved.
 *
 * Author: Peter Dinda <pdinda@northwestern.edu>
 *         Jack Lange <jarusl@cs.northwestern.edu>
 *
 * This is free software.  You are permitted to use,
 * redistribute, and modify it as specified in the file "V3VEE_LICENSE".
 */
#include <palacios/vmx.h>
#include <palacios/vmcs.h>
#include <palacios/vmm.h>
#include <palacios/vmx_lowlevel.h>
#include <palacios/vmm_lowlevel.h>
#include <palacios/vmm_config.h>
#include <palacios/vmm_ctrl_regs.h>
#include <palacios/vm_guest_mem.h>

static addr_t vmxon_ptr_phys;
extern int v3_vmx_exit_handler();
extern int v3_vmx_vmlaunch(struct v3_gprs * vm_regs);
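
/* Thin wrapper around vmcs_write() that reports the failing field and passes the error back. */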
static int inline check_vmcs_write(vmcs_field_t field, addr_t val)
{
    int ret = vmcs_write(field, val);

    if (ret != VMX_SUCCESS) {
        PrintError("VMWRITE error on %s!: %d\n", v3_vmcs_field_to_str(field), ret);
    }

    return ret;
}
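
/* Snapshot the host's control registers, segment state, and MSRs into the VMCS
 * host-state area so that a VM exit returns to a sane host context. */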
static int update_vmcs_host_state(struct guest_info * info) {
    int vmx_ret = 0;
    addr_t tmp;
    struct vmx_data * arch_data = (struct vmx_data *)(info->vmm_data);
    struct v3_msr tmp_msr;
    __asm__ __volatile__ ( "movq %%cr0, %0; "
                           : "=q"(tmp) : );
    vmx_ret |= check_vmcs_write(VMCS_HOST_CR0, tmp);

    __asm__ __volatile__ ( "movq %%cr3, %0; "
                           : "=q"(tmp) : );
    vmx_ret |= check_vmcs_write(VMCS_HOST_CR3, tmp);

    __asm__ __volatile__ ( "movq %%cr4, %0; "
                           : "=q"(tmp) : );
    vmx_ret |= check_vmcs_write(VMCS_HOST_CR4, tmp);
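
    /* The GDTR, IDTR, and TR bases cannot be read with a plain mov, so they come
     * from the values cached in host_state by init_vmcs_bios(). */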
    vmx_ret |= check_vmcs_write(VMCS_HOST_GDTR_BASE, arch_data->host_state.gdtr.base);
    vmx_ret |= check_vmcs_write(VMCS_HOST_IDTR_BASE, arch_data->host_state.idtr.base);
    vmx_ret |= check_vmcs_write(VMCS_HOST_TR_BASE, arch_data->host_state.tr.base);

#define FS_BASE_MSR 0xc0000100
#define GS_BASE_MSR 0xc0000101

    v3_get_msr(FS_BASE_MSR, &(tmp_msr.hi), &(tmp_msr.lo));
    vmx_ret |= check_vmcs_write(VMCS_HOST_FS_BASE, tmp_msr.value);

    v3_get_msr(GS_BASE_MSR, &(tmp_msr.hi), &(tmp_msr.lo));
    vmx_ret |= check_vmcs_write(VMCS_HOST_GS_BASE, tmp_msr.value);
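
    /* The host segment selectors are read straight out of the segment registers. */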
    __asm__ __volatile__ ( "movq %%cs, %0; "
                           : "=q"(tmp) : );
    vmx_ret |= check_vmcs_write(VMCS_HOST_CS_SELECTOR, tmp);

    __asm__ __volatile__ ( "movq %%ss, %0; "
                           : "=q"(tmp) : );
    vmx_ret |= check_vmcs_write(VMCS_HOST_SS_SELECTOR, tmp);

    __asm__ __volatile__ ( "movq %%ds, %0; "
                           : "=q"(tmp) : );
    vmx_ret |= check_vmcs_write(VMCS_HOST_DS_SELECTOR, tmp);

    __asm__ __volatile__ ( "movq %%es, %0; "
                           : "=q"(tmp) : );
    vmx_ret |= check_vmcs_write(VMCS_HOST_ES_SELECTOR, tmp);

    __asm__ __volatile__ ( "movq %%fs, %0; "
                           : "=q"(tmp) : );
    vmx_ret |= check_vmcs_write(VMCS_HOST_FS_SELECTOR, tmp);

    __asm__ __volatile__ ( "movq %%gs, %0; "
                           : "=q"(tmp) : );
    vmx_ret |= check_vmcs_write(VMCS_HOST_GS_SELECTOR, tmp);
    vmx_ret |= check_vmcs_write(VMCS_HOST_TR_SELECTOR, arch_data->host_state.tr.selector);

#define SYSENTER_CS_MSR  0x00000174
#define SYSENTER_ESP_MSR 0x00000175
#define SYSENTER_EIP_MSR 0x00000176
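
    /* SYSENTER_CS is a 32-bit value, so only the low half is written;
     * SYSENTER_ESP and SYSENTER_EIP are written at full width. */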
    v3_get_msr(SYSENTER_CS_MSR, &(tmp_msr.hi), &(tmp_msr.lo));
    vmx_ret |= check_vmcs_write(VMCS_HOST_SYSENTER_CS, tmp_msr.lo);

    v3_get_msr(SYSENTER_ESP_MSR, &(tmp_msr.hi), &(tmp_msr.lo));
    vmx_ret |= check_vmcs_write(VMCS_HOST_SYSENTER_ESP, tmp_msr.value);

    v3_get_msr(SYSENTER_EIP_MSR, &(tmp_msr.hi), &(tmp_msr.lo));
    vmx_ret |= check_vmcs_write(VMCS_HOST_SYSENTER_EIP, tmp_msr.value);

    return vmx_ret;
}
// For the 32-bit reserved control fields:
// MB1 (must-be-one) bits are in the low 32 bits of the MSR, MBZ (must-be-zero) bits are in the high 32 bits
static uint32_t sanitize_bits1(uint32_t msr_num, uint32_t val) {
    v3_msr_t mask_msr;

    PrintDebug("sanitize_bits1 (MSR:%x)\n", msr_num);

    v3_get_msr(msr_num, &mask_msr.hi, &mask_msr.lo);

    PrintDebug("MSR %x = %x : %x \n", msr_num, mask_msr.hi, mask_msr.lo);
static addr_t sanitize_bits2(uint32_t msr_num0, uint32_t msr_num1, addr_t val) {
    v3_msr_t msr0, msr1;
    addr_t msr0_val, msr1_val;

    PrintDebug("sanitize_bits2 (MSR0=%x, MSR1=%x)\n", msr_num0, msr_num1);

    v3_get_msr(msr_num0, &msr0.hi, &msr0.lo);
    v3_get_msr(msr_num1, &msr1.hi, &msr1.lo);

    // This generates a mask that is the natural bit width of the CPU
    msr0_val = msr0.value;
    msr1_val = msr1.value;

    PrintDebug("MSR %x = %p, %x = %p \n", msr_num0, (void*)msr0_val, msr_num1, (void*)msr1_val);
static int setup_base_host_state() {
    // vmwrite(HOST_IDTR_BASE,
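
/* Repack a v3_segment's attribute bits into the VMCS segment access-rights layout. */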
static void inline translate_segment_access(struct v3_segment * v3_seg,
                                            struct vmcs_segment_access * access)
{
    access->type = v3_seg->type;
    access->desc_type = v3_seg->system;
    access->dpl = v3_seg->dpl;
    access->present = v3_seg->present;
    access->avail = v3_seg->avail;
    access->long_mode = v3_seg->long_mode;
    access->db = v3_seg->db;
    access->granularity = v3_seg->granularity;
}
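
/* Write every guest segment (base, selector, limit, access rights) plus the GDTR
 * and IDTR into the VMCS guest-state area. */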
static int inline vmcs_write_guest_segments(struct guest_info * info)
{
    int ret = 0;
    struct vmcs_segment_access access;

    memset(&access, 0, sizeof(access));
    translate_segment_access(&(info->segments.cs), &access);

    ret |= check_vmcs_write(VMCS_GUEST_CS_BASE, info->segments.cs.base);
    ret |= check_vmcs_write(VMCS_GUEST_CS_SELECTOR, info->segments.cs.selector);
    ret |= check_vmcs_write(VMCS_GUEST_CS_LIMIT, info->segments.cs.limit);
    ret |= check_vmcs_write(VMCS_GUEST_CS_ACCESS, access.value);

    translate_segment_access(&(info->segments.ss), &access);

    ret |= check_vmcs_write(VMCS_GUEST_SS_BASE, info->segments.ss.base);
    ret |= check_vmcs_write(VMCS_GUEST_SS_SELECTOR, info->segments.ss.selector);
    ret |= check_vmcs_write(VMCS_GUEST_SS_LIMIT, info->segments.ss.limit);
    ret |= check_vmcs_write(VMCS_GUEST_SS_ACCESS, access.value);

    translate_segment_access(&(info->segments.ds), &access);

    ret |= check_vmcs_write(VMCS_GUEST_DS_BASE, info->segments.ds.base);
    ret |= check_vmcs_write(VMCS_GUEST_DS_SELECTOR, info->segments.ds.selector);
    ret |= check_vmcs_write(VMCS_GUEST_DS_LIMIT, info->segments.ds.limit);
    ret |= check_vmcs_write(VMCS_GUEST_DS_ACCESS, access.value);

    translate_segment_access(&(info->segments.es), &access);

    ret |= check_vmcs_write(VMCS_GUEST_ES_BASE, info->segments.es.base);
    ret |= check_vmcs_write(VMCS_GUEST_ES_SELECTOR, info->segments.es.selector);
    ret |= check_vmcs_write(VMCS_GUEST_ES_LIMIT, info->segments.es.limit);
    ret |= check_vmcs_write(VMCS_GUEST_ES_ACCESS, access.value);

    translate_segment_access(&(info->segments.fs), &access);

    ret |= check_vmcs_write(VMCS_GUEST_FS_BASE, info->segments.fs.base);
    ret |= check_vmcs_write(VMCS_GUEST_FS_SELECTOR, info->segments.fs.selector);
    ret |= check_vmcs_write(VMCS_GUEST_FS_LIMIT, info->segments.fs.limit);
    ret |= check_vmcs_write(VMCS_GUEST_FS_ACCESS, access.value);

    translate_segment_access(&(info->segments.gs), &access);

    ret |= check_vmcs_write(VMCS_GUEST_GS_BASE, info->segments.gs.base);
    ret |= check_vmcs_write(VMCS_GUEST_GS_SELECTOR, info->segments.gs.selector);
    ret |= check_vmcs_write(VMCS_GUEST_GS_LIMIT, info->segments.gs.limit);
    ret |= check_vmcs_write(VMCS_GUEST_GS_ACCESS, access.value);

    translate_segment_access(&(info->segments.ldtr), &access);

    ret |= check_vmcs_write(VMCS_GUEST_LDTR_BASE, info->segments.ldtr.base);
    ret |= check_vmcs_write(VMCS_GUEST_LDTR_SELECTOR, info->segments.ldtr.selector);
    ret |= check_vmcs_write(VMCS_GUEST_LDTR_LIMIT, info->segments.ldtr.limit);
    ret |= check_vmcs_write(VMCS_GUEST_LDTR_ACCESS, access.value);

    translate_segment_access(&(info->segments.tr), &access);

    ret |= check_vmcs_write(VMCS_GUEST_TR_BASE, info->segments.tr.base);
    ret |= check_vmcs_write(VMCS_GUEST_TR_SELECTOR, info->segments.tr.selector);
    ret |= check_vmcs_write(VMCS_GUEST_TR_LIMIT, info->segments.tr.limit);
    ret |= check_vmcs_write(VMCS_GUEST_TR_ACCESS, access.value);

    ret |= check_vmcs_write(VMCS_GUEST_GDTR_BASE, info->segments.gdtr.base);
    ret |= check_vmcs_write(VMCS_GUEST_GDTR_LIMIT, info->segments.gdtr.limit);

    ret |= check_vmcs_write(VMCS_GUEST_IDTR_BASE, info->segments.idtr.base);
    ret |= check_vmcs_write(VMCS_GUEST_IDTR_LIMIT, info->segments.idtr.limit);

    return ret;
}
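
/* Without unrestricted-guest support VMX cannot execute real-mode BIOS code directly,
 * so the guest is started in virtual-8086 mode under the VMXASSIST monitor. */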
static void setup_v8086_mode_for_boot(struct guest_info * vm_info)
{
    ((struct vmx_data *)vm_info->vmm_data)->state = VMXASSIST_V8086_BIOS;

    ((struct rflags *)&(vm_info->ctrl_regs.rflags))->vm = 1;
    ((struct rflags *)&(vm_info->ctrl_regs.rflags))->iopl = 3;

    vm_info->rip = 0xd0000;
    vm_info->vm_regs.rsp = 0x80000;
    vm_info->segments.cs.selector = 0xf000;
    vm_info->segments.cs.base = 0xf000 << 4;
    vm_info->segments.cs.limit = 0xffff;
    vm_info->segments.cs.type = 3;
    vm_info->segments.cs.system = 1;
    vm_info->segments.cs.dpl = 3;
    vm_info->segments.cs.present = 1;
    vm_info->segments.cs.granularity = 0;
    int i = 0;
    struct v3_segment * seg_ptr = (struct v3_segment *)&(vm_info->segments);

    /* Set values for selectors ds through ss */
    for (i = 1; i < 6; i++) {
        seg_ptr[i].selector = 0x0000;
        seg_ptr[i].base = 0x00000;
        seg_ptr[i].system = 1;
        seg_ptr[i].present = 1;
        seg_ptr[i].granularity = 0;
    }

    for (i = 6; i < 10; i++) {
        seg_ptr[i].base = 0x0;
        seg_ptr[i].limit = 0xffff;
    }
    vm_info->segments.ldtr.selector = 0x0;
    vm_info->segments.ldtr.type = 2;
    vm_info->segments.ldtr.system = 0;
    vm_info->segments.ldtr.present = 1;
    vm_info->segments.ldtr.granularity = 0;

    vm_info->segments.tr.selector = 0x0;
    vm_info->segments.tr.type = 3;
    vm_info->segments.tr.system = 0;
    vm_info->segments.tr.present = 1;
    vm_info->segments.tr.granularity = 0;
}
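
/* Allocate and zero one page usable as a VMCS (or as the VMXON region) and stamp it
 * with the revision id reported by the VMX_BASIC MSR. */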
static addr_t allocate_vmcs()
{
    reg_ex_t msr;

    PrintDebug("Allocating page\n");
    struct vmcs_data * vmcs_page = (struct vmcs_data *)V3_VAddr(V3_AllocPages(1));

    memset(vmcs_page, 0, 4096);

    v3_get_msr(VMX_BASIC_MSR, &(msr.e_reg.high), &(msr.e_reg.low));

    vmcs_page->revision = ((struct vmx_basic_msr *)&msr)->revision;
    PrintDebug("VMX Revision: 0x%x\n", vmcs_page->revision);

    return (addr_t)V3_PAddr((void *)vmcs_page);
}
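
/* Build the initial VMCS for BIOS boot: clear and load the VMCS, program the VMX
 * control fields from their capability MSRs, capture host state, write the guest's
 * virtual-8086 boot state, and copy the VMXASSIST image into guest memory. */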
static int init_vmcs_bios(struct guest_info * vm_info)
{
    setup_v8086_mode_for_boot(vm_info);

    // TODO: Fix vmcs fields so they're 32-bit
    struct vmx_data * vmx_data = (struct vmx_data *)vm_info->vmm_data;
    int vmx_ret = 0;

    PrintDebug("Clearing VMCS: %p\n", (void*)vmx_data->vmcs_ptr_phys);
    vmx_ret = vmcs_clear(vmx_data->vmcs_ptr_phys);
    if (vmx_ret != VMX_SUCCESS) {
        PrintError("VMCLEAR failed\n");
        return -1;
    }
    PrintDebug("Loading VMCS\n");
    vmx_ret = vmcs_load(vmx_data->vmcs_ptr_phys);

    if (vmx_ret != VMX_SUCCESS) {
        PrintError("VMPTRLD failed\n");
        return -1;
    }
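
    /* Each control field is seeded with the must-be-one bits reported in the low
     * half of its capability MSR, then OR'd with the extra exits we want. */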
    struct v3_msr tmp_msr;

    /* Write VMX Control Fields */
    v3_get_msr(VMX_PINBASED_CTLS_MSR, &(tmp_msr.hi), &(tmp_msr.lo));
    /* Add NMI exiting */
    tmp_msr.lo |= NMI_EXIT;
    check_vmcs_write(VMCS_PIN_CTRLS, tmp_msr.lo);

    v3_get_msr(VMX_PROCBASED_CTLS_MSR, &(tmp_msr.hi), &(tmp_msr.lo));
    /* Add unconditional I/O */
    tmp_msr.lo |= UNCOND_IO_EXIT;
    check_vmcs_write(VMCS_PROC_CTRLS, tmp_msr.lo);

    v3_get_msr(VMX_EXIT_CTLS_MSR, &(tmp_msr.hi), &(tmp_msr.lo));
    tmp_msr.lo |= HOST_ADDR_SPACE_SIZE;
    check_vmcs_write(VMCS_EXIT_CTRLS, tmp_msr.lo);

    v3_get_msr(VMX_ENTRY_CTLS_MSR, &(tmp_msr.hi), &(tmp_msr.lo));
    check_vmcs_write(VMCS_ENTRY_CTRLS, tmp_msr.lo);

    check_vmcs_write(VMCS_EXCP_BITMAP, 0xffffffff);
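
    /* (An all-ones exception bitmap makes every guest exception cause a VM exit.) */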
    /* Cache GDTR, IDTR, and TR in host struct */
    struct {
        uint16_t selector;   /* holds the limit for sgdt/sidt and the selector for str */
        addr_t   base;
    } __attribute__((packed)) tmp_seg;

    addr_t gdtr_base;
    __asm__ __volatile__( "sgdt (%0); "
                          : : "q"(&tmp_seg) : "memory" );
    gdtr_base = tmp_seg.base;
    vmx_data->host_state.gdtr.base = gdtr_base;

    __asm__ __volatile__( "sidt (%0); "
                          : : "q"(&tmp_seg) : "memory" );
    vmx_data->host_state.idtr.base = tmp_seg.base;

    __asm__ __volatile__( "str (%0); "
                          : : "q"(&tmp_seg) : "memory" );
    vmx_data->host_state.tr.selector = tmp_seg.selector;
    /* The GDTR *index* is bits 3-15 of the selector. */
    struct tss_descriptor * desc = (struct tss_descriptor *)
        (gdtr_base + 8 * (tmp_seg.selector >> 3));

        (desc->base2 << 16) |
        (desc->base3 << 24) |
        ((uint64_t)desc->base4 << 32)

    vmx_data->host_state.tr.base = tmp_seg.base;
    if (update_vmcs_host_state(vm_info)) {
        PrintError("Could not write host state\n");
        return -1;
    }
    // TODO: This is not 32-bit safe!
    vmx_ret |= check_vmcs_write(VMCS_GUEST_RIP, vm_info->rip);
    vmx_ret |= check_vmcs_write(VMCS_GUEST_RSP, vm_info->vm_regs.rsp);
    vmx_ret |= check_vmcs_write(VMCS_GUEST_CR0, 0x80000021);

    vmx_ret |= vmcs_write_guest_segments(vm_info);

    vmx_ret |= check_vmcs_write(VMCS_GUEST_RFLAGS, vm_info->ctrl_regs.rflags);
    vmx_ret |= check_vmcs_write(VMCS_LINK_PTR, 0xffffffffffffffff);
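
    /* (A VMCS link pointer of all 1s means no linked/shadow VMCS is in use.) */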
    if (vmx_ret != VMX_SUCCESS) {
        PrintError("Could not initialize VMCS segments\n");
        return -1;
    }
#define VMXASSIST_START 0x000d0000
    extern uint8_t vmxassist_start[];
    extern uint8_t vmxassist_end[];
    addr_t vmxassist_dst = 0;
    if (guest_pa_to_host_va(vm_info, VMXASSIST_START, &vmxassist_dst) == -1) {
        PrintError("Could not find VMXASSIST destination\n");
        return -1;
    }

    memcpy((void*)vmxassist_dst, vmxassist_start, vmxassist_end - vmxassist_start);
    v3_print_vmcs_host_state();
    v3_print_vmcs_guest_state();

    return 0;
}
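
/* v3_ctrl_ops init hook: allocate the per-guest VMX state and its VMCS, then
 * populate the VMCS for BIOS boot. */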
static int init_vmx_guest(struct guest_info * info, struct v3_vm_config * config_ptr) {
    v3_pre_config_guest(info, config_ptr);

    struct vmx_data * data = NULL;

    data = (struct vmx_data *)V3_Malloc(sizeof(struct vmx_data));

    PrintDebug("vmx_data pointer: %p\n", (void *)data);

    PrintDebug("Allocating VMCS\n");
    data->vmcs_ptr_phys = allocate_vmcs();

    PrintDebug("VMCS pointer: %p\n", (void *)(data->vmcs_ptr_phys));

    info->vmm_data = data;

    PrintDebug("Initializing VMCS (addr=%p)\n", info->vmm_data);
    if (init_vmcs_bios(info) != 0) {
        PrintError("Could not initialize VMCS BIOS\n");
        return -1;
    }

    //v3_post_config_guest(info, config_ptr);

    return 0;
}
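
/* v3_ctrl_ops start hook: hand the guest GPRs to the low-level VMLAUNCH stub and
 * dump the VMCS state if the launch fails. */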
static int start_vmx_guest(struct guest_info * info) {
    uint32_t error = 0;
    int ret = 0;

    PrintDebug("Attempting VMLAUNCH\n");

    ret = v3_vmx_vmlaunch(&(info->vm_regs));

    if (ret != VMX_SUCCESS) {
        vmcs_read(VMCS_INSTR_ERR, &error, 4);
        PrintError("VMLAUNCH failed: %d\n", error);

        v3_print_vmcs_guest_state();
        v3_print_vmcs_host_state();
    }

    PrintDebug("Returned from VMLAUNCH ret=%d(0x%x)\n", ret, ret);
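
/* Check CPUID.1:ECX for VT-x support and IA32_FEATURE_CONTROL for a BIOS lock-out. */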
int v3_is_vmx_capable() {
    v3_msr_t feature_msr;
    addr_t eax = 0, ebx = 0, ecx = 0, edx = 0;

    v3_cpuid(0x1, &eax, &ebx, &ecx, &edx);

    PrintDebug("ECX: %p\n", (void*)ecx);

    if (ecx & CPUID_1_ECX_VTXFLAG) {
        v3_get_msr(VMX_FEATURE_CONTROL_MSR, &(feature_msr.hi), &(feature_msr.lo));

        PrintTrace("MSRREGlow: 0x%.8x\n", feature_msr.lo);

        if ((feature_msr.lo & FEATURE_CONTROL_VALID) != FEATURE_CONTROL_VALID) {
            PrintDebug("VMX is locked -- enable in the BIOS\n");
            return 0;
        }
    } else {
        PrintDebug("VMX not supported on this cpu\n");
        return 0;
    }

    return 1;
}
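
/* Reports whether EPT (nested paging) is available; v3_init_vmx() uses this to pick
 * between V3_VMX_EPT_CPU and V3_VMX_CPU. */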
static int has_vmx_nested_paging() {
    return 0;
}
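
/* Global VMX bring-up: set CR4.VMXE (honoring the CR4_FIXED0 constraints) and CR0.NE,
 * execute VMXON on a freshly allocated region, and register the VMX guest ops. */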
void v3_init_vmx(struct v3_ctrl_ops * vm_ops) {
    extern v3_cpu_arch_t v3_cpu_type;

    struct v3_msr tmp_msr;
    uint64_t ret = 0;

    /* CR4 bits that must be set while VMX is enabled */
    v3_get_msr(VMX_CR4_FIXED0_MSR, &(tmp_msr.hi), &(tmp_msr.lo));
    /* Compute CR4 with the VMXE bit (bit 13) set. */
    __asm__ __volatile__ (
                          "movq %%cr4, %%rbx; "
                          "orq  $0x00002000, %%rbx; "
                          "movq %%rbx, %0; "
                          : "=m"(ret)
                          :
                          : "%rbx" );

    if ((~ret & tmp_msr.value) == 0) {
        /* All FIXED0 bits are present, so the new CR4 value can be installed. */
        __asm__ __volatile__ (
                              "movq %0, %%cr4; "
                              :
                              : "q"(ret) );
    } else {
        PrintError("Invalid CR4 Settings!\n");
        return;
    }
    /* Set CR0.NE (bit 5), which VMX operation requires. */
    __asm__ __volatile__ (
                          "movq %%cr0, %%rbx; "
                          "orq  $0x00000020, %%rbx; "
                          "movq %%rbx, %%cr0; "
                          :
                          :
                          : "%rbx" );
    // Should check and return Error here....

    // Setup VMXON Region
    vmxon_ptr_phys = allocate_vmcs();
    PrintDebug("VMXON pointer: 0x%p\n", (void*)vmxon_ptr_phys);
    if (v3_enable_vmx(vmxon_ptr_phys) == VMX_SUCCESS) {
        PrintDebug("VMX Enabled\n");
    } else {
        PrintError("VMX initialization failure\n");
        return;
    }
    if (has_vmx_nested_paging() == 1) {
        v3_cpu_type = V3_VMX_EPT_CPU;
    } else {
        v3_cpu_type = V3_VMX_CPU;
    }
    // Setup the VMX specific vmm operations
    vm_ops->init_guest = &init_vmx_guest;
    vm_ops->start_guest = &start_vmx_guest;
    vm_ops->has_nested_paging = &has_vmx_nested_paging;
}