2 * This file is part of the Palacios Virtual Machine Monitor developed
3 * by the V3VEE Project with funding from the United States National
4 * Science Foundation and the Department of Energy.
6 * The V3VEE Project is a joint project between Northwestern University
7 * and the University of New Mexico. You can find out more at http://www.v3vee.org
10 * Copyright (c) 2008, Peter Dinda <pdinda@northwestern.edu>
11 * Copyright (c) 2008, Jack Lange <jarusl@cs.northwestern.edu>
12 * Copyright (c) 2008, The V3VEE Project <http://www.v3vee.org>
13 * All rights reserved.
15 * Author: Peter Dinda <pdinda@northwestern.edu>
16 * Jack Lange <jarusl@cs.northwestern.edu>
18 * This is free software. You are permitted to use,
19 * redistribute, and modify it as specified in the file "V3VEE_LICENSE".
23 #include <palacios/vmx.h>
24 #include <palacios/vmcs.h>
25 #include <palacios/vmm.h>
26 #include <palacios/vmx_lowlevel.h>
37 #include <palacios/vmm_util.h>
38 #include <palacios/vmm_string.h>
39 #include <palacios/vmm_ctrl_regs.h>
// Launch entry point declared here but defined outside the visible part of this file.
43 extern int Launch_VM(ullong_t vmcsPtr, uint_t eip);
// Number of I/O ports (65536 == 2^16); presumably the size of the x86 port space -- TODO confirm usage.
45 #define NUMPORTS 65536
// I/O port numbers named after VMXAssist and the ROMBIOS.  None of them is
// referenced in the visible code; presumably they are ports the guest firmware
// writes diagnostics to -- verify against the exit handlers.
48 #define VMXASSIST_INFO_PORT 0x0e9
49 #define ROMBIOS_PANIC_PORT 0x400
50 #define ROMBIOS_PANIC_PORT2 0x401
51 #define ROMBIOS_INFO_PORT 0x402
52 #define ROMBIOS_DEBUG_PORT 0x403
56 static uint_t GetLinearIP(struct VM * vm) {
57 if (vm->state == VM_VMXASSIST_V8086_BIOS || vm->state == VM_VMXASSIST_V8086) {
58 return vm->vmcs.guestStateArea.cs.baseAddr + vm->vmcs.guestStateArea.rip;
60 return vm->vmcs.guestStateArea.rip;
// Layout constants for a code template that is not part of the visible code.
// INSTR_OFFSET_END is the index of the last byte of the NOP sequence:
// INSTR_OFFSET_START + NOP_SEQ_LEN - 1 == 17 + 10 - 1 == 26.
68 #define INSTR_OFFSET_START 17
69 #define NOP_SEQ_LEN 10
70 #define INSTR_OFFSET_END (INSTR_OFFSET_START + NOP_SEQ_LEN - 1)
71 #define TEMPLATE_CODE_LEN 35
// Externally defined helpers.  MyLaunch() compares the results of
// Init_VMCS_HostState() and Init_VMCS_GuestState() against VMX_SUCCESS and
// VMX_FAIL_VALID.  VMCS_LAUNCH() is not called in the visible code.
81 extern uint_t VMCS_LAUNCH();
82 extern uint_t Init_VMCS_HostState();
83 extern uint_t Init_VMCS_GuestState();
/*
 * Classify the instruction at the guest's current instruction pointer and
 * describe it in *inst.
 *
 * vm:   VM whose exit information (instruction length) and registers are read.
 * inst: output; type is set to VM_MOV_TO_CR0 for the one recognised encoding,
 *       VM_UNKNOWN_INST otherwise.
 *
 * NOTE(review): several original lines (braces, the else, possibly further
 * assignments to *inst) are missing from this extract; confirm against the
 * complete file before relying on the exact field set written here.
 */
95 void DecodeCurrentInstruction(struct VM *vm, struct Instruction *inst)
97 // this is a gruesome hack
98 uint_t address = GetLinearIP(vm);
99 uint_t length = vm->vmcs.exitInfoFields.instrLength;
// The guest address is used directly as a host pointer (the "hack" above).
100 unsigned char *t = (unsigned char *) address;
104 PrintTrace("DecodeCurrentInstruction: instruction is\n");
105 PrintTraceMemDump(t,length);
// Only the exact three-byte sequence 0f 22 c0 is recognised.
107 if (length==3 && t[0]==0x0f && t[1]==0x22 && t[2]==0xc0) {
108 // mov from eax to cr0
109 // usually used to signal
110 inst->type=VM_MOV_TO_CR0;
111 inst->address=address;
// input1/output carry the guest EAX value, input2 the current guest CR0.
113 inst->input1=vm->registers.eax;
114 inst->input2=vm->vmcs.guestStateArea.cr0;
115 inst->output=vm->registers.eax;
116 PrintTrace("MOV FROM EAX TO CR0\n");
// NOTE(review): presumably the else branch -- the `} else {` line is not visible here.
118 inst->type=VM_UNKNOWN_INST;
123 static void setup_v8086_mode_for_boot(struct guest_info* vm_info)
126 ((struct vmx_data*)vm_info->vmm_data)->state = VMXASSIST_V8086_BIOS;
127 ((struct rflags)info->ctrl_regs.rflags).vm = 1;
128 ((struct rflags)info->ctrl_regs.rflags).iopl = 3;
131 vm_info->rip = 0xfff0;
133 vm_info->segments.cs.selector = 0xf000;
134 vm_info->segments.cs.base = 0xf000<<4;
135 vm_info->segments.cs.limit = 0xffff;
136 vm_info->segments.cs.type = 3;
137 vm_info->segments.cs.system = 1;
138 vm_info->segments.cs.dpl = 3;
139 vm_info->segments.cs.present = 1;
140 vm_info->segments.cs.granularity = 0;
142 vm_info->segments.ss.selector = 0x0000;
143 vm_info->segments.ss.base = 0x0000<<4;
144 vm_info->segments.ss.limit = 0xffff;
145 vm_info->segments.ss.type = 3;
146 vm_info->segments.ss.system = 1;
147 vm_info->segments.ss.dpl = 3;
148 vm_info->segments.ss.present = 1;
149 vm_info->segments.ss.granularity = 0;
151 vm_info->segments.es.selector = 0x0000;
152 vm_info->segments.es.base = 0x0000<<4;
153 vm_info->segments.es.limit = 0xffff;
154 vm_info->segments.es.type = 3;
155 vm_info->segments.es.system = 1;
156 vm_info->segments.es.dpl = 3;
157 vm_info->segments.es.present = 1;
158 vm_info->segments.es.granularity = 0;
160 vm_info->segments.fs.selector = 0x0000;
161 vm_info->segments.fs.base = 0x0000<<4;
162 vm_info->segments.fs.limit = 0xffff;
163 vm_info->segments.fs.type = 3;
164 vm_info->segments.fs.system = 1;
165 vm_info->segments.fs.dpl = 3;
166 vm_info->segments.fs.present = 1;
167 vm_info->segments.fs.granularity = 0;
169 vm_info->segments.gs.selector = 0x0000;
170 vm_info->segments.gs.base = 0x0000<<4;
171 vm_info->segments.gs.limit = 0xffff;
172 vm_info->segments.gs.type = 3;
173 vm_info->segments.gs.system = 1;
174 vm_info->segments.gs.dpl = 3;
175 vm_info->segments.gs.present = 1;
176 vm_info->segments.gs.granularity = 0;
/*
 * Update the VMCS execution and exit control fields cached in vm->vmcs.
 *
 * Each group is handled the same way: a CopyOut* call, an OR of extra flag
 * bits into the cached field, then the matching CopyIn* call (presumably
 * fetching from and storing back to the active VMCS -- confirm in vmcs.h).
 *
 * NOTE(review): the flag lists below are truncated in this extract (most
 * entries and the terminating ';' lines are not visible).  The only flags
 * that can be confirmed from here are UNCONDITION_IO_EXITING for the
 * processor controls and ACK_IRQ_ON_EXIT for the exit controls.
 */
179 static void ConfigureExits(struct VM *vm)
181 CopyOutVMCSExecCtrlFields(&(vm->vmcs.execCtrlFields));
// Pin-based controls: the only visible entry is commented out, so nothing is added here.
183 vm->vmcs.execCtrlFields.pinCtrls |= 0
184 // EXTERNAL_INTERRUPT_EXITING
// Processor-based controls.
186 vm->vmcs.execCtrlFields.procCtrls |= 0
187 // INTERRUPT_WINDOWS_EXIT
195 | UNCONDITION_IO_EXITING
199 CopyInVMCSExecCtrlFields(&(vm->vmcs.execCtrlFields));
201 CopyOutVMCSExitCtrlFields(&(vm->vmcs.exitCtrlFields));
203 vm->vmcs.exitCtrlFields.exitCtrls |= ACK_IRQ_ON_EXIT;
205 CopyInVMCSExitCtrlFields(&(vm->vmcs.exitCtrlFields));
// Older direct read/modify/write of the exit controls, kept commented out.
208 /* VMCS_READ(VM_EXIT_CTRLS, &flags); */
209 /* flags |= ACK_IRQ_ON_EXIT; */
210 /* VMCS_WRITE(VM_EXIT_CTRLS, &flags); */
// Externally defined launch routine; its int result is stored in vmm_ret below.
215 extern int SAFE_VM_LAUNCH();
/*
 * Program the VMCS for `vm` and launch it.
 *
 * vm: VM whose vmcsregion and descriptor (entry_ip, exit_eip, guest_esp)
 *     supply the VMCS address and the initial guest RIP/RSP.
 *
 * Visible sequence: write the VMCS link pointer, fix up the control fields
 * and CR0/CR4 against the capability MSRs, initialise host and guest state,
 * write HOST_RIP / GUEST_RIP / GUEST_RSP and the exception bitmap, set
 * vm->state to VM_VMXASSIST_STARTUP and call SAFE_VM_LAUNCH().
 *
 * NOTE(review): this extract is missing lines.  `ret`, `vmm_ret` and `tmpReg`
 * are used but their declarations are not visible, the "Clear"/"Load" traces
 * have no visible operation next to them, and neither the error-path bodies
 * nor the function's return statement are shown.  Confirm against the
 * complete file.
 */
217 int MyLaunch(struct VM *vm)
// The VMCS address is narrowed to uint_t before being widened to ullong_t.
219 ullong_t vmcs = (ullong_t)((uint_t) (vm->vmcsregion));
220 uint_t entry_eip = vm->descriptor.entry_ip;
221 uint_t exit_eip = vm->descriptor.exit_eip;
222 uint_t guest_esp = vm->descriptor.guest_esp;
// All-ones pattern written to both halves of the VMCS link pointer below.
223 uint_t f = 0xffffffff;
228 PrintTrace("Guest ESP: 0x%x (%u)\n", guest_esp, guest_esp);
// The descriptor's exit_eip read above is overridden: exits always land in RunVMM.
230 exit_eip = (uint_t)RunVMM;
232 PrintTrace("Clear\n");
234 PrintTrace("Load\n");
238 PrintTrace("VMCS_LINK_PTR\n");
239 VMCS_WRITE(VMCS_LINK_PTR, &f);
240 PrintTrace("VMCS_LINK_PTR_HIGH\n");
241 VMCS_WRITE(VMCS_LINK_PTR_HIGH, &f);
// Reconcile each control field with its capability MSR.
244 SetCtrlBitsCorrectly(IA32_VMX_PINBASED_CTLS_MSR, PIN_VM_EXEC_CTRLS);
245 SetCtrlBitsCorrectly(IA32_VMX_PROCBASED_CTLS_MSR, PROC_VM_EXEC_CTRLS);
246 SetCtrlBitsCorrectly(IA32_VMX_EXIT_CTLS_MSR, VM_EXIT_CTRLS);
247 SetCtrlBitsCorrectly(IA32_VMX_ENTRY_CTLS_MSR, VM_ENTRY_CTRLS);
251 //SetCtrlBitsCorrectly(IA32_something,GUEST_IA32_DEBUGCTL);
252 //SetCtrlBitsCorrectly(IA32_something,GUEST_IA32_DEBUGCTL_HIGH);
256 PrintTrace("Setting up host state\n");
// Host CR0/CR4 are adjusted using the FIXED0/FIXED1 MSR pairs.
257 SetCRBitsCorrectly(IA32_VMX_CR0_FIXED0_MSR, IA32_VMX_CR0_FIXED1_MSR, HOST_CR0);
258 SetCRBitsCorrectly(IA32_VMX_CR4_FIXED0_MSR, IA32_VMX_CR4_FIXED1_MSR, HOST_CR4);
259 ret = Init_VMCS_HostState();
// Two failure flavours are distinguished: VMX_FAIL_VALID and everything else.
261 if (ret != VMX_SUCCESS) {
262 if (ret == VMX_FAIL_VALID) {
263 PrintTrace("Init Host state: VMCS FAILED WITH ERROR\n");
265 PrintTrace("Init Host state: Invalid VMCS\n");
270 // PrintTrace("HOST_RIP: %x (%u)\n", exit_eip, exit_eip);
271 VMCS_WRITE(HOST_RIP, &exit_eip);
274 PrintTrace("Setting up guest state\n");
275 PrintTrace("GUEST_RIP: %x (%u)\n", entry_eip, entry_eip);
276 VMCS_WRITE(GUEST_RIP, &entry_eip);
// Same FIXED0/FIXED1 adjustment for the guest's CR0/CR4.
278 SetCRBitsCorrectly(IA32_VMX_CR0_FIXED0_MSR, IA32_VMX_CR0_FIXED1_MSR, GUEST_CR0);
279 SetCRBitsCorrectly(IA32_VMX_CR4_FIXED0_MSR, IA32_VMX_CR4_FIXED1_MSR, GUEST_CR4);
280 ret = Init_VMCS_GuestState();
282 PrintTrace("InitGuestState returned\n");
284 if (ret != VMX_SUCCESS) {
285 if (ret == VMX_FAIL_VALID) {
286 PrintTrace("Init Guest state: VMCS FAILED WITH ERROR\n");
288 PrintTrace("Init Guest state: Invalid VMCS\n");
292 PrintTrace("GUEST_RSP: %x (%u)\n", guest_esp, (uint_t)guest_esp);
293 VMCS_WRITE(GUEST_RSP, &guest_esp);
// NOTE(review): the value assigned to tmpReg before this write is not visible in this extract.
297 if (VMCS_WRITE(EXCEPTION_BITMAP, &tmpReg) != VMX_SUCCESS) {
298 PrintInfo("Bitmap error\n");
303 PrintTrace("VMCS_LAUNCH\n");
305 vm->state=VM_VMXASSIST_STARTUP;
307 vmm_ret = SAFE_VM_LAUNCH();
// Printed after SAFE_VM_LAUNCH() returns; any surrounding condition is not visible here.
309 PrintTrace("VMM error %d\n", vmm_ret);
/*
 * Create a VMCS, record it together with a copy of the descriptor in the
 * file-level `theVM`, and hand off to MyLaunch().
 *
 * vm: launch descriptor; copied by value into theVM.descriptor.
 *
 * NOTE(review): the declarations of `rc` and `theVM` and the return statement
 * are not visible in this extract; the result of CreateVMCS() is used without
 * any visible check.
 */
317 int VMLaunch(struct VMDescriptor *vm)
319 VMCS * vmcs = CreateVMCS();
// The pointer is narrowed to uint_t first, so `top` can only be non-zero if
// uint_t is wider than 32 bits -- TODO confirm the width of uint_t.
322 ullong_t vmcs_ptr = (ullong_t)((uint_t)vmcs);
323 uint_t top = (vmcs_ptr >> 32) & 0xffffffff;
324 uint_t bottom = (vmcs_ptr) & 0xffffffff;
326 theVM.vmcsregion = vmcs;
327 theVM.descriptor = *vm;
329 PrintTrace("vmcs_ptr_top=%x vmcs_ptr_bottom=%x, eip=%x\n", top, bottom, vm->entry_ip);
330 rc = MyLaunch(&theVM); // vmcs_ptr, vm->entry_ip, vm->exit_eip, vm->guest_esp);
331 PrintTrace("Returned from MyLaunch();\n");
// Adjust a 32-bit control value using the capability MSR `msr_num`.
// Only the MSR read and the debug output are visible in this extract; the
// lines that combine the MSR halves with `val` and return it are missing.
345 // For the 32 bit reserved bit fields
346 // MB1s are in the low 32 bits, MBZs are in the high 32 bits of the MSR
347 static uint32_t sanitize_bits1(uint32_t msr_num, uint32_t val) {
350 PrintDebug("sanitize_bits1 (MSR:%x)\n", msr_num);
352 v3_get_msr(msr_num, &mask_msr.hi, &mask_msr.lo);
// NOTE(review): this prints `msr.hi`/`msr.lo`, but the MSR was read into
// `mask_msr` on the previous line and no declaration of `msr` is visible;
// this probably should be mask_msr -- confirm against the full file.
354 PrintDebug("MSR %x = %x : %x \n", msr_num, msr.hi, msr.lo);
// Adjust `val` using a pair of MSRs (msr_num0 / msr_num1).
// Visible part: both MSRs are read and their `value` members are copied into
// addr_t locals.  The declarations of msr0/msr1 and the lines that apply the
// masks to `val` and return it are missing from this extract.
363 static addr_t sanitize_bits2(uint32_t msr_num0, uint32_t msr_num1, addr_t val) {
365 addr_t msr0_val, msr1_val;
367 PrintDebug("sanitize_bits2 (MSR0=%x, MSR1=%x)\n", msr_num0, msr_num1);
369 v3_get_msr(msr_num0, &msr0.hi, &msr0.lo);
370 v3_get_msr(msr_num1, &msr1.hi, &msr1.lo);
372 // This generates a mask that is the natural bit width of the CPU
373 msr0_val = msr0.value;
374 msr1_val = msr1.value;
// NOTE(review): msr0_val/msr1_val are addr_t but are printed with %p -- confirm PrintDebug accepts that.
376 PrintDebug("MSR %x = %p, %x = %p \n", msr_num0, msr0_val, msr_num1, msr1_val);
// Allocate one page for a VMCS, zero 4096 bytes of it and store the revision
// read from VMX_BASIC_MSR in its `revision` member.
// NOTE(review): the declaration of `msr` and the return statement are not
// visible in this extract.
386 static vmcs_data* allocate_vmcs() {
// The page from V3_AllocPages() is translated with V3_VAddr() before use.
// NOTE(review): the result is not checked before the memset below.
388 vmcs_data* vmcs_page = (vmcs_data*)V3_VAddr(V3_AllocPages(1));
390 memset(vmcs_page, 0, 4096);
392 v3_get_msr(VMX_BASIC_MSR, &(msr.e_reg.high), &(msr.e_reg.low));
// NOTE(review): C does not allow a cast to a struct type; this likely needs
// to go through a pointer, e.g. ((struct vmx_basic_msr *)&msr)->revision.
394 vmcs_page->revision = ((struct vmx_basic_msr)msr).revision;
// Initialise a VMCS for a guest; called from init_vmx_guest().
// NOTE(review): the body of this function is not visible in this extract.
401 static void init_vmcs_bios(vmcs_t * vmcs, struct guest_info * vm_info) {
/*
 * Guest initialisation callback for the VMX back end (installed as
 * init_guest by v3_init_vmx()).
 *
 * info:       guest being set up; info->vmm_data is pointed at a freshly
 *             allocated struct vmx_data.
 * config_ptr: configuration passed through to v3_pre_config_guest() and
 *             v3_post_config_guest().
 *
 * NOTE(review): the return statement is not visible in this extract.
 */
407 static int init_vmx_guest(struct guest_info * info, struct v3_vm_config * config_ptr) {
408 v3_pre_config_guest(info, config_ptr);
410 struct vmx_data* data;
412 PrintDebug("Allocating vmx_data\n");
// NOTE(review): sizeof(vmx_data) names a type without `struct`, while the
// variable is declared as `struct vmx_data*` -- confirm a typedef exists.
// The V3_Malloc() result is dereferenced below without a NULL check.
413 data = (struct vmx_data*)V3_Malloc(sizeof(vmx_data));
414 PrintDebug("Allocating VMCS\n");
415 data->vmcs = allocate_vmcs();
417 info->vmm_data = (void*)data;
// NOTE(review): both the trace and the call below use info->vmm_data (the
// struct vmx_data) where a vmcs_t pointer is expected; presumably data->vmcs
// was intended -- verify.
419 PrintDebug("Initializing VMCS (addr=%p)\n", (void *)info->vmm_data);
420 init_vmcs_bios((vmcs_t *)(info->vmm_data), info);
422 v3_post_config_guest(info, config_ptr);
// Guest start routine.
// NOTE(review): v3_init_vmx() installs `&start_vmx_guest`, but the only start
// function visible in this file is this one, named start_svm_guest; the name
// looks like a leftover from the SVM back end.  The body is not visible in
// this extract.
430 static int start_svm_guest(struct guest_info *info) {
/*
 * Report whether VMX can be used on this CPU.
 *
 * Checks the VT-x flag (CPUID_1_ECX_VTXFLAG) in ECX of the CPUID feature
 * leaf and, when it is set, the FEATURE_CONTROL_VALID bits in the low half
 * of IA32_FEATURE_CONTROL_MSR.
 *
 * NOTE(review): the return statements are not visible in this extract, so
 * the exact return values for each path must be confirmed in the full file.
 */
439 int v3_is_vmx_capable() {
441 v3_msr_t feature_msr;
442 addr_t eax = 0, ebx = 0, ecx = 0, edx = 0;
444 v3_cpuid(CPUID_FEATURE_IDS, &eax, &ebx, &ecx, &edx);
446 if (ecx & CPUID_1_ECX_VTXFLAG) {
447 v3_get_msr(IA32_FEATURE_CONTROL_MSR, &(feature_msr.hi), &(feature_msr.lo));
449 PrintTrace("MSRREGlow: 0x%.8x\n", feature_msr.lo);
// All bits of FEATURE_CONTROL_VALID must be set, not just any one of them.
451 if ((feature_msr.lo & FEATURE_CONTROL_VALID) != FEATURE_CONTROL_VALID) {
452 PrintDebug("VMX is locked -- enable in the BIOS\n");
// Reached when the CPUID flag is clear (the `else` line is not visible here).
457 PrintDebug("VMX not supported on this cpu\n");
// Nested-paging query; v3_init_vmx() treats a result of 1 as "EPT available"
// when choosing v3_cpu_type and also installs this function as the
// has_nested_paging callback.
// NOTE(review): the body is not visible in this extract.
464 static int has_vmx_nested_paging() {
470 // We set up the global host state that is unlikely to change across processes here
471 // Segment Descriptors mainly
// NOTE(review): the members of this struct are not visible in this extract.
473 struct seg_descriptor {
// Set up the base host state (see the comment above struct seg_descriptor).
// NOTE(review): only a commented-out, unfinished vmwrite of HOST_IDTR_BASE is
// visible in this extract; the rest of the body and the return are missing.
478 static int setup_base_host_state() {
483 // vmwrite(HOST_IDTR_BASE,
490 void v3_init_vmx(struct v3_ctrl_ops * vm_ops) {
493 // Setup the host state save area
494 void * host_state = V3_AllocPages(1);
496 v3_get_msr(VMX_BASIC_MSR, &(basic_msr.hi), &(basic_msr.lo));
498 *(uint32_t *)host_state = ((struct vmx_basic_msr *)basic_msr.value)->revision;
500 PrintDebug("VMX revision: 0x%p\n", host_state);
502 __asm__ __volatile__ (
503 "movl %%cr4, %%ebx; "
504 "orl %%ebx, 0x00002000; "
510 // Should check and return Error here....
511 __asm__ __volatile__ (
512 "movl %%cr0, %%ebx; "
513 "orl %%ebx, 0x00000020; "
518 if (v3_enable_vmx(host_state) == 0) {
519 PrintDebug("VMX Enabled\n");
521 PrintError("VMX initialization failure\n");
526 if (has_vmx_nested_paging() == 1) {
527 v3_cpu_type = V3_VMX_EPT_CPU;
529 v3_cpu_type = V3_VMX_CPU;
532 // Setup the VMX specific vmm operations
533 vmm_ops->init_guest = &init_vmx_guest;
534 vmm_ops->start_guest = &start_vmx_guest;
535 vmm_ops->has_nested_paging = &has_vmx_nested_paging;