/*
 * This file is part of the Palacios Virtual Machine Monitor developed
 * by the V3VEE Project with funding from the United States National
 * Science Foundation and the Department of Energy.
 *
 * The V3VEE Project is a joint project between Northwestern University
 * and the University of New Mexico.  You can find out more at
 * http://www.v3vee.org
 *
 * Copyright (c) 2008, Peter Dinda <pdinda@northwestern.edu>
 * Copyright (c) 2008, Jack Lange <jarusl@cs.northwestern.edu>
 * Copyright (c) 2008, The V3VEE Project <http://www.v3vee.org>
 * All rights reserved.
 *
 * Author: Peter Dinda <pdinda@northwestern.edu>
 *         Jack Lange <jarusl@cs.northwestern.edu>
 *
 * This is free software.  You are permitted to use,
 * redistribute, and modify it as specified in the file "V3VEE_LICENSE".
 */
#include <palacios/vmx.h>
#include <palacios/vmcs.h>
#include <palacios/vmm.h>
#include <palacios/vmx_lowlevel.h>
#include <palacios/vmm_lowlevel.h>
#include <palacios/vmm_config.h>
#include <palacios/vmm_util.h>
#include <palacios/vmm_string.h>
#include <palacios/vmm_ctrl_regs.h>
extern int Launch_VM(ullong_t vmcsPtr, uint_t eip);
#define NUMPORTS 65536

#define VMXASSIST_INFO_PORT 0x0e9
#define ROMBIOS_PANIC_PORT  0x400
#define ROMBIOS_PANIC_PORT2 0x401
#define ROMBIOS_INFO_PORT   0x402
#define ROMBIOS_DEBUG_PORT  0x403
static uint_t GetLinearIP(struct VM * vm) {
    if (vm->state == VM_VMXASSIST_V8086_BIOS || vm->state == VM_VMXASSIST_V8086) {
        // In v8086/real mode the linear IP is the segment base plus the offset
        return vm->vmcs.guestStateArea.cs.baseAddr + vm->vmcs.guestStateArea.rip;
    } else {
        return vm->vmcs.guestStateArea.rip;
    }
}
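
// Worked example of the arithmetic above: while running the BIOS under
// VMXAssist (VM_VMXASSIST_V8086_BIOS), with cs.baseAddr = 0xf0000 and
// rip = 0xfff0, the linear IP is 0xf0000 + 0xfff0 = 0xffff0, the x86
// reset vector.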
#define INSTR_OFFSET_START 17
#define NOP_SEQ_LEN        10
#define INSTR_OFFSET_END   (INSTR_OFFSET_START + NOP_SEQ_LEN - 1)
#define TEMPLATE_CODE_LEN  35
extern uint_t VMCS_LAUNCH();
extern uint_t Init_VMCS_HostState();
extern uint_t Init_VMCS_GuestState();
void DecodeCurrentInstruction(struct VM * vm, struct Instruction * inst)
{
    // this is a gruesome hack: read the guest's instruction bytes directly
    // through their linear address
    uint_t address = GetLinearIP(vm);
    uint_t length = vm->vmcs.exitInfoFields.instrLength;
    unsigned char * t = (unsigned char *)address;

    PrintTrace("DecodeCurrentInstruction: instruction is\n");
    PrintTraceMemDump(t, length);

    if (length == 3 && t[0] == 0x0f && t[1] == 0x22 && t[2] == 0xc0) {
        // mov from eax to cr0 (0f 22 c0),
        // usually used to signal a mode transition
        inst->type = VM_MOV_TO_CR0;
        inst->address = address;
        inst->input1 = vm->registers.eax;
        inst->input2 = vm->vmcs.guestStateArea.cr0;
        inst->output = vm->registers.eax;
        PrintTrace("MOV FROM EAX TO CR0\n");
    } else {
        inst->type = VM_UNKNOWN_INST;
    }
}
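
// A hypothetical call site for the decoder above (sketch only; the exit
// handler flow shown here is illustrative and not part of this file):
//
//   struct Instruction inst;
//   DecodeCurrentInstruction(&theVM, &inst);
//   if (inst.type == VM_MOV_TO_CR0) {
//       // emulate the CR0 write using inst.input1 / inst.input2
//   }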
static void setup_v8086_mode_for_boot(struct guest_info * vm_info)
{
    ((struct vmx_data *)vm_info->vmm_data)->state = VMXASSIST_V8086_BIOS;

    // Enter virtual-8086 mode with IOPL 3 so the BIOS can touch I/O ports
    ((struct rflags *)&(vm_info->ctrl_regs.rflags))->vm = 1;
    ((struct rflags *)&(vm_info->ctrl_regs.rflags))->iopl = 3;

    // Start at the real-mode reset vector: CS:IP = f000:fff0
    vm_info->rip = 0xfff0;

    vm_info->segments.cs.selector = 0xf000;
    vm_info->segments.cs.base = 0xf000 << 4;
    vm_info->segments.cs.limit = 0xffff;
    vm_info->segments.cs.type = 3;
    vm_info->segments.cs.system = 1;
    vm_info->segments.cs.dpl = 3;
    vm_info->segments.cs.present = 1;
    vm_info->segments.cs.granularity = 0;

    vm_info->segments.ss.selector = 0x0000;
    vm_info->segments.ss.base = 0x0000 << 4;
    vm_info->segments.ss.limit = 0xffff;
    vm_info->segments.ss.type = 3;
    vm_info->segments.ss.system = 1;
    vm_info->segments.ss.dpl = 3;
    vm_info->segments.ss.present = 1;
    vm_info->segments.ss.granularity = 0;

    vm_info->segments.es.selector = 0x0000;
    vm_info->segments.es.base = 0x0000 << 4;
    vm_info->segments.es.limit = 0xffff;
    vm_info->segments.es.type = 3;
    vm_info->segments.es.system = 1;
    vm_info->segments.es.dpl = 3;
    vm_info->segments.es.present = 1;
    vm_info->segments.es.granularity = 0;

    vm_info->segments.fs.selector = 0x0000;
    vm_info->segments.fs.base = 0x0000 << 4;
    vm_info->segments.fs.limit = 0xffff;
    vm_info->segments.fs.type = 3;
    vm_info->segments.fs.system = 1;
    vm_info->segments.fs.dpl = 3;
    vm_info->segments.fs.present = 1;
    vm_info->segments.fs.granularity = 0;

    vm_info->segments.gs.selector = 0x0000;
    vm_info->segments.gs.base = 0x0000 << 4;
    vm_info->segments.gs.limit = 0xffff;
    vm_info->segments.gs.type = 3;
    vm_info->segments.gs.system = 1;
    vm_info->segments.gs.dpl = 3;
    vm_info->segments.gs.present = 1;
    vm_info->segments.gs.granularity = 0;
}
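
// The five segment setups above differ only in their selectors; a helper
// capturing the shared pattern might look like this (illustrative sketch,
// not part of the original file; assumes the v3_segment type from the
// Palacios headers):
static void setup_real_mode_segment(struct v3_segment * seg, uint16_t selector) {
    seg->selector = selector;
    seg->base = selector << 4;   // real-mode rule: base = selector * 16
    seg->limit = 0xffff;         // 64 KB segment
    seg->type = 3;               // read/write data, accessed
    seg->system = 1;             // non-system (code/data) descriptor
    seg->dpl = 3;                // v8086 code runs at CPL 3
    seg->present = 1;
    seg->granularity = 0;        // byte granularity
}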
static void ConfigureExits(struct VM * vm)
{
    CopyOutVMCSExecCtrlFields(&(vm->vmcs.execCtrlFields));

    vm->vmcs.execCtrlFields.pinCtrls |= 0
        // EXTERNAL_INTERRUPT_EXITING
        ;

    vm->vmcs.execCtrlFields.procCtrls |= 0
        // INTERRUPT_WINDOWS_EXIT
        | UNCONDITION_IO_EXITING
        ;

    CopyInVMCSExecCtrlFields(&(vm->vmcs.execCtrlFields));

    CopyOutVMCSExitCtrlFields(&(vm->vmcs.exitCtrlFields));

    vm->vmcs.exitCtrlFields.exitCtrls |= ACK_IRQ_ON_EXIT;

    CopyInVMCSExitCtrlFields(&(vm->vmcs.exitCtrlFields));

    /* VMCS_READ(VM_EXIT_CTRLS, &flags); */
    /* flags |= ACK_IRQ_ON_EXIT; */
    /* VMCS_WRITE(VM_EXIT_CTRLS, &flags); */
}
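
// The commented-out lines above are the raw read-modify-write form of the
// ACK_IRQ_ON_EXIT update. A generic helper for that pattern might look like
// this (hypothetical sketch, not part of the original file; relies on the
// VMCS_READ/VMCS_WRITE macros already used in this file):
static void VMCSSetCtrlBits(uint_t field, uint_t bits) {
    uint_t flags = 0;
    VMCS_READ(field, &flags);    // fetch the current control field
    flags |= bits;               // turn on the requested bits
    VMCS_WRITE(field, &flags);   // write the field back to the VMCS
}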
extern int SAFE_VM_LAUNCH();
int MyLaunch(struct VM * vm)
{
    ullong_t vmcs = (ullong_t)((uint_t)(vm->vmcsregion));
    uint_t entry_eip = vm->descriptor.entry_ip;
    uint_t exit_eip = vm->descriptor.exit_eip;
    uint_t guest_esp = vm->descriptor.guest_esp;
    uint_t f = 0xffffffff;
    int ret = 0;
    int vmm_ret = 0;

    PrintTrace("Guest ESP: 0x%x (%u)\n", guest_esp, guest_esp);

    exit_eip = (uint_t)RunVMM;

    PrintTrace("Clear\n");
    VMCS_CLEAR(vmcs);
    PrintTrace("Load\n");
    VMCS_LOAD(vmcs);

    // With no shadow VMCS, the VMCS link pointer must be set to all 1s
    PrintTrace("VMCS_LINK_PTR\n");
    VMCS_WRITE(VMCS_LINK_PTR, &f);
    PrintTrace("VMCS_LINK_PTR_HIGH\n");
    VMCS_WRITE(VMCS_LINK_PTR_HIGH, &f);
    SetCtrlBitsCorrectly(IA32_VMX_PINBASED_CTLS_MSR, PIN_VM_EXEC_CTRLS);
    SetCtrlBitsCorrectly(IA32_VMX_PROCBASED_CTLS_MSR, PROC_VM_EXEC_CTRLS);
    SetCtrlBitsCorrectly(IA32_VMX_EXIT_CTLS_MSR, VM_EXIT_CTRLS);
    SetCtrlBitsCorrectly(IA32_VMX_ENTRY_CTLS_MSR, VM_ENTRY_CTRLS);

    //SetCtrlBitsCorrectly(IA32_something, GUEST_IA32_DEBUGCTL);
    //SetCtrlBitsCorrectly(IA32_something, GUEST_IA32_DEBUGCTL_HIGH);
    PrintTrace("Setting up host state\n");
    SetCRBitsCorrectly(IA32_VMX_CR0_FIXED0_MSR, IA32_VMX_CR0_FIXED1_MSR, HOST_CR0);
    SetCRBitsCorrectly(IA32_VMX_CR4_FIXED0_MSR, IA32_VMX_CR4_FIXED1_MSR, HOST_CR4);
    ret = Init_VMCS_HostState();

    if (ret != VMX_SUCCESS) {
        if (ret == VMX_FAIL_VALID) {
            PrintTrace("Init Host state: VMCS FAILED WITH ERROR\n");
        } else {
            PrintTrace("Init Host state: Invalid VMCS\n");
        }
        return ret;
    }

    // PrintTrace("HOST_RIP: %x (%u)\n", exit_eip, exit_eip);
    VMCS_WRITE(HOST_RIP, &exit_eip);
    PrintTrace("Setting up guest state\n");
    PrintTrace("GUEST_RIP: %x (%u)\n", entry_eip, entry_eip);
    VMCS_WRITE(GUEST_RIP, &entry_eip);

    SetCRBitsCorrectly(IA32_VMX_CR0_FIXED0_MSR, IA32_VMX_CR0_FIXED1_MSR, GUEST_CR0);
    SetCRBitsCorrectly(IA32_VMX_CR4_FIXED0_MSR, IA32_VMX_CR4_FIXED1_MSR, GUEST_CR4);
    ret = Init_VMCS_GuestState();

    PrintTrace("InitGuestState returned\n");

    if (ret != VMX_SUCCESS) {
        if (ret == VMX_FAIL_VALID) {
            PrintTrace("Init Guest state: VMCS FAILED WITH ERROR\n");
        } else {
            PrintTrace("Init Guest state: Invalid VMCS\n");
        }
        return ret;
    }

    PrintTrace("GUEST_RSP: %x (%u)\n", guest_esp, (uint_t)guest_esp);
    VMCS_WRITE(GUEST_RSP, &guest_esp);
    uint_t tmpReg = 0;
    if (VMCS_WRITE(EXCEPTION_BITMAP, &tmpReg) != VMX_SUCCESS) {
        PrintInfo("Bitmap error\n");
    }

    PrintTrace("VMCS_LAUNCH\n");

    vm->state = VM_VMXASSIST_STARTUP;

    vmm_ret = SAFE_VM_LAUNCH();

    PrintTrace("VMM error %d\n", vmm_ret);

    return vmm_ret;
}
int VMLaunch(struct VMDescriptor * vm)
{
    VMCS * vmcs = CreateVMCS();
    int rc;

    ullong_t vmcs_ptr = (ullong_t)((uint_t)vmcs);
    uint_t top = (vmcs_ptr >> 32) & 0xffffffff;
    uint_t bottom = (vmcs_ptr) & 0xffffffff;

    theVM.vmcsregion = vmcs;
    theVM.descriptor = *vm;

    PrintTrace("vmcs_ptr_top=%x vmcs_ptr_bottom=%x, eip=%x\n", top, bottom, vm->entry_ip);
    rc = MyLaunch(&theVM); // vmcs_ptr, vm->entry_ip, vm->exit_eip, vm->guest_esp);
    PrintTrace("Returned from MyLaunch();\n");

    return rc;
}
static int update_vmcs_host_state(struct guest_info * info) {
    addr_t tmp;

    // sgdt/sidt store a 2-byte limit followed by the base address
    struct {
        uint16_t limit;
        addr_t   base;
    } __attribute__((packed)) tmp_seg;

    struct v3_msr tmp_msr;

    __asm__ __volatile__ ( "movq %%cr0, %0; "
                           : "=q"(tmp)
                           :
    );
    vmcs_write(VMCS_HOST_CR0, tmp);

    __asm__ __volatile__ ( "movq %%cr3, %0; "
                           : "=q"(tmp)
                           :
    );
    vmcs_write(VMCS_HOST_CR3, tmp);

    __asm__ __volatile__ ( "movq %%cr4, %0; "
                           : "=q"(tmp)
                           :
    );
    vmcs_write(VMCS_HOST_CR4, tmp);

    __asm__ __volatile__ ("sgdt (%0); "
                          :
                          : "q"(&tmp_seg)
                          : "memory"
    );
    vmcs_write(VMCS_HOST_GDTR_BASE, tmp_seg.base);

    __asm__ __volatile__ ("sidt (%0); "
                          :
                          : "q"(&tmp_seg)
                          : "memory"
    );
    vmcs_write(VMCS_HOST_IDTR_BASE, tmp_seg.base);

    /* How do we handle this...?
    __asm__ __volatile__ ("str (%0); "
                          :
                          : "q"(&tmp_seg)
                          : "memory"
    );
    vmcs_write(VMCS_HOST_TR_BASE, tmp_seg.base);
    */
#define FS_BASE_MSR 0xc0000100
#define GS_BASE_MSR 0xc0000101

    v3_get_msr(FS_BASE_MSR, &(tmp_msr.hi), &(tmp_msr.lo));
    vmcs_write(VMCS_HOST_FS_BASE, tmp_msr.value);

    v3_get_msr(GS_BASE_MSR, &(tmp_msr.hi), &(tmp_msr.lo));
    vmcs_write(VMCS_HOST_GS_BASE, tmp_msr.value);
    __asm__ __volatile__ ( "movq %%cs, %0; "
                           : "=q"(tmp)
                           :
    );
    vmcs_write(VMCS_HOST_CS_SELECTOR, tmp);

    __asm__ __volatile__ ( "movq %%ss, %0; "
                           : "=q"(tmp)
                           :
    );
    vmcs_write(VMCS_HOST_SS_SELECTOR, tmp);

    __asm__ __volatile__ ( "movq %%ds, %0; "
                           : "=q"(tmp)
                           :
    );
    vmcs_write(VMCS_HOST_DS_SELECTOR, tmp);

    __asm__ __volatile__ ( "movq %%es, %0; "
                           : "=q"(tmp)
                           :
    );
    vmcs_write(VMCS_HOST_ES_SELECTOR, tmp);

    __asm__ __volatile__ ( "movq %%fs, %0; "
                           : "=q"(tmp)
                           :
    );
    vmcs_write(VMCS_HOST_FS_SELECTOR, tmp);

    __asm__ __volatile__ ( "movq %%gs, %0; "
                           : "=q"(tmp)
                           :
    );
    vmcs_write(VMCS_HOST_GS_SELECTOR, tmp);

    __asm__ __volatile__ ( "str %0; "
                           : "=q"(tmp)
                           :
    );
    vmcs_write(VMCS_HOST_TR_SELECTOR, tmp);
#define SYSENTER_CS_MSR  0x00000174
#define SYSENTER_ESP_MSR 0x00000175
#define SYSENTER_EIP_MSR 0x00000176

    // The SYSENTER MSRs are mirrored into the host-state area because the
    // processor reloads them from there on every VM exit
    v3_get_msr(SYSENTER_CS_MSR, &(tmp_msr.hi), &(tmp_msr.lo));
    vmcs_write(VMCS_HOST_SYSENTER_CS, tmp_msr.value);

    v3_get_msr(SYSENTER_ESP_MSR, &(tmp_msr.hi), &(tmp_msr.lo));
    vmcs_write(VMCS_HOST_SYSENTER_ESP, tmp_msr.value);

    v3_get_msr(SYSENTER_EIP_MSR, &(tmp_msr.hi), &(tmp_msr.lo));
    vmcs_write(VMCS_HOST_SYSENTER_EIP, tmp_msr.value);

    return 0;
}
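
// The v3_get_msr() / tmp_msr.value idiom above depends on the hi and lo
// words overlaying a single 64-bit value. A sketch of that layout
// (illustrative only; the real struct v3_msr lives in the Palacios headers):
struct v3_msr_sketch {
    union {
        uint64_t value;          // full 64-bit MSR contents
        struct {
            uint32_t lo;         // bits 31:0  (EAX half of rdmsr)
            uint32_t hi;         // bits 63:32 (EDX half of rdmsr)
        } __attribute__((packed));
    } __attribute__((packed));
};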
static struct vmcs_data * vmxon_ptr;

// For the 32 bit reserved bit fields:
// MB1s are in the low 32 bits, MBZs are in the high 32 bits of the MSR
// (a 0 bit in the high word means that bit must be zero)
static uint32_t sanitize_bits1(uint32_t msr_num, uint32_t val) {
    v3_msr_t mask_msr;

    PrintDebug("sanitize_bits1 (MSR:%x)\n", msr_num);

    v3_get_msr(msr_num, &mask_msr.hi, &mask_msr.lo);

    PrintDebug("MSR %x = %x : %x \n", msr_num, mask_msr.hi, mask_msr.lo);

    val |= mask_msr.lo;    // force the must-be-1 bits on
    val &= mask_msr.hi;    // force the must-be-0 bits off

    return val;
}
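
// Worked example with illustrative mask values: if the control MSR reads
// lo = 0x00000016 (bits 1, 2, and 4 must be 1) and hi = 0xfffffff7 (bit 3
// must be 0), then an input val of 0x00000001 becomes
// (0x00000001 | 0x00000016) & 0xfffffff7 = 0x00000017.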
static addr_t sanitize_bits2(uint32_t msr_num0, uint32_t msr_num1, addr_t val) {
    v3_msr_t msr0, msr1;
    addr_t msr0_val, msr1_val;

    PrintDebug("sanitize_bits2 (MSR0=%x, MSR1=%x)\n", msr_num0, msr_num1);

    v3_get_msr(msr_num0, &msr0.hi, &msr0.lo);
    v3_get_msr(msr_num1, &msr1.hi, &msr1.lo);

    // This generates a mask that is the natural bit width of the CPU
    msr0_val = msr0.value;
    msr1_val = msr1.value;

    PrintDebug("MSR %x = %p, %x = %p \n", msr_num0, (void *)msr0_val, msr_num1, (void *)msr1_val);

    val |= msr0_val;    // force the must-be-1 bits on
    val &= msr1_val;    // force the must-be-0 bits off

    return val;
}
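
// Example pairing for the function above (MSR names as used elsewhere in
// this file; desired_cr0 is a hypothetical input):
//
//   addr_t cr0 = sanitize_bits2(IA32_VMX_CR0_FIXED0_MSR,
//                               IA32_VMX_CR0_FIXED1_MSR,
//                               desired_cr0);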
static int setup_base_host_state() {

    // vmwrite(HOST_IDTR_BASE, ...);

    return 0;
}
static struct vmcs_data * allocate_vmcs() {
    reg_ex_t msr;
    struct vmcs_data * vmcs_page = (struct vmcs_data *)V3_VAddr(V3_AllocPages(1));

    memset(vmcs_page, 0, 4096);

    v3_get_msr(VMX_BASIC_MSR, &(msr.e_reg.high), &(msr.e_reg.low));

    // Every VMCS region must be stamped with the revision id from VMX_BASIC
    vmcs_page->revision = ((struct vmx_basic_msr *)&msr)->revision;

    return vmcs_page;
}
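
// Sketch of the IA32_VMX_BASIC layout consumed above (illustrative; the
// real struct vmx_basic_msr is defined in the Palacios headers). Per the
// Intel SDM, the revision identifier sits in the low 31 bits:
struct vmx_basic_msr_sketch {
    uint32_t revision   : 31;    // VMCS revision identifier
    uint32_t rsvd1      : 1;     // bit 31 is reserved
    uint32_t regionSize : 13;    // bits 44:32 -- VMXON/VMCS region size
    uint32_t rsvd2      : 19;    // remaining feature-reporting bits
} __attribute__((packed));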
static void init_vmcs_bios(struct guest_info * vm_info)
static int init_vmx_guest(struct guest_info * info, struct v3_vm_config * config_ptr) {
    struct vmx_data * data;

    v3_pre_config_guest(info, config_ptr);

    PrintDebug("Allocating vmx_data\n");
    data = (struct vmx_data *)V3_Malloc(sizeof(struct vmx_data));
    PrintDebug("Allocating VMCS\n");
    data->vmcs = allocate_vmcs();

    info->vmm_data = (void *)data;

    PrintDebug("Initializing VMCS (addr=%p)\n", info->vmm_data);
    init_vmcs_bios(info);

    v3_post_config_guest(info, config_ptr);

    return 0;
}
static int start_vmx_guest(struct guest_info * info) {
    struct vmx_data * vmx_data = (struct vmx_data *)info->vmm_data;
    int vmx_ret;

    // Have to do a whole lot of flag setting here
    vmx_ret = vmcs_clear(vmx_data->vmcs);
    if (vmx_ret != VMX_SUCCESS) {
        PrintDebug("VMCS Clear failed\n");
        return -1;
    }

    vmx_ret = vmcs_load(vmx_data->vmcs);
    if (vmx_ret != VMX_SUCCESS) {
        PrintDebug("VMCS Load (VMPTRLD) failed\n");
        return -1;
    }

    update_vmcs_host_state(info);

    return 0;
}
int v3_is_vmx_capable() {
    v3_msr_t feature_msr;
    addr_t eax = 0, ebx = 0, ecx = 0, edx = 0;

    v3_cpuid(0x1, &eax, &ebx, &ecx, &edx);

    PrintDebug("ECX: %p\n", (void *)ecx);

    if (ecx & CPUID_1_ECX_VTXFLAG) {
        v3_get_msr(VMX_FEATURE_CONTROL_MSR, &(feature_msr.hi), &(feature_msr.lo));

        PrintTrace("MSRREGlow: 0x%.8x\n", feature_msr.lo);

        if ((feature_msr.lo & FEATURE_CONTROL_VALID) != FEATURE_CONTROL_VALID) {
            PrintDebug("VMX is locked -- enable in the BIOS\n");
            return 0;
        }
    } else {
        PrintDebug("VMX not supported on this cpu\n");
        return 0;
    }

    return 1;
}
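
// The probe above keys off CPUID.1:ECX bit 5, the architectural VMX flag.
// A sketch of the constant it uses (illustrative; the real
// CPUID_1_ECX_VTXFLAG is defined in the Palacios headers):
//
//   #define CPUID_1_ECX_VTXFLAG (1 << 5)   // ECX bit 5 => VMX supported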
static int has_vmx_nested_paging() {
    // EPT detection is not implemented yet
    return 0;
}
// We set up the global host state that is unlikely to change across
// processes here -- Segment Descriptors mainly

struct seg_descriptor {
};
void v3_init_vmx(struct v3_ctrl_ops * vmm_ops) {
    extern v3_cpu_arch_t v3_cpu_type;

    // Set CR4.VMXE (bit 13) to allow VMXON
    __asm__ __volatile__ (
        "movq %%cr4, %%rbx; "
        "orq  $0x00002000, %%rbx; "
        "movq %%rbx, %%cr4; "
        :
        :
        : "%rbx"
    );

    // Should check and return Error here....
    // Set CR0.NE (bit 5), which is required for VMX operation
    __asm__ __volatile__ (
        "movq %%cr0, %%rbx; "
        "orq  $0x00000020, %%rbx; "
        "movq %%rbx, %%cr0; "
        :
        :
        : "%rbx"
    );

    // Setup VMXON Region
    vmxon_ptr = allocate_vmcs();
    PrintDebug("VMXON region allocated at %p\n", (void *)vmxon_ptr);

    if (v3_enable_vmx(vmxon_ptr) == 0) {
        PrintDebug("VMX Enabled\n");
    } else {
        PrintError("VMX initialization failure\n");
        return;
    }

    if (has_vmx_nested_paging() == 1) {
        v3_cpu_type = V3_VMX_EPT_CPU;
    } else {
        v3_cpu_type = V3_VMX_CPU;
    }

    // Setup the VMX specific vmm operations
    vmm_ops->init_guest = &init_vmx_guest;
    vmm_ops->start_guest = &start_vmx_guest;
    vmm_ops->has_nested_paging = &has_vmx_nested_paging;
}