Palacios Public Git Repository

To check out Palacios, execute

  git clone http://v3vee.org/palacios/palacios.web/palacios.git

This gives you the master branch. You probably want the devel branch or one of the release branches instead. To switch to the devel branch, execute

  cd palacios
  git checkout --track -b devel origin/devel

The other branches work the same way.
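For example, to track a release branch (the branch name below is illustrative; run "git branch -r" inside the clone to list the branches that actually exist):

  git checkout --track -b release-1.2 origin/release-1.2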


Commit: moved VM runstate settings to avoid race condition
File:   palacios/src/palacios/vmx.c (from palacios-OLD.git)
/* 
 * This file is part of the Palacios Virtual Machine Monitor developed
 * by the V3VEE Project with funding from the United States National 
 * Science Foundation and the Department of Energy.  
 *
 * The V3VEE Project is a joint project between Northwestern University
 * and the University of New Mexico.  You can find out more at 
 * http://www.v3vee.org
 *
 * Copyright (c) 2011, Jack Lange <jarusl@cs.northwestern.edu>
 * Copyright (c) 2011, The V3VEE Project <http://www.v3vee.org> 
 * All rights reserved.
 *
 * Author: Jack Lange <jarusl@cs.northwestern.edu>
 *
 * This is free software.  You are permitted to use,
 * redistribute, and modify it as specified in the file "V3VEE_LICENSE".
 */


#include <palacios/vmx.h>
#include <palacios/vmm.h>
#include <palacios/vmx_handler.h>
#include <palacios/vmcs.h>
#include <palacios/vmx_lowlevel.h>
#include <palacios/vmm_lowlevel.h>
#include <palacios/vmm_ctrl_regs.h>
#include <palacios/vmm_config.h>
#include <palacios/vmm_time.h>
#include <palacios/vm_guest_mem.h>
#include <palacios/vmm_direct_paging.h>
#include <palacios/vmx_io.h>
#include <palacios/vmx_msr.h>
#include <palacios/vmm_decoder.h>
#include <palacios/vmm_barrier.h>

#ifdef V3_CONFIG_CHECKPOINT
#include <palacios/vmm_checkpoint.h>
#endif

#include <palacios/vmx_ept.h>
#include <palacios/vmx_assist.h>
#include <palacios/vmx_hw_info.h>

#ifndef V3_CONFIG_DEBUG_VMX
#undef PrintDebug
#define PrintDebug(fmt, args...)
#endif


/* These fields contain the hardware feature sets supported by the local CPU */
static struct vmx_hw_info hw_info;

extern v3_cpu_arch_t v3_cpu_types[];

static addr_t host_vmcs_ptrs[V3_CONFIG_MAX_CPUS] = { [0 ... V3_CONFIG_MAX_CPUS - 1] = 0};

extern int v3_vmx_launch(struct v3_gprs * vm_regs, struct guest_info * info, struct v3_ctrl_regs * ctrl_regs);
extern int v3_vmx_resume(struct v3_gprs * vm_regs, struct guest_info * info, struct v3_ctrl_regs * ctrl_regs);
static inline int check_vmcs_write(vmcs_field_t field, addr_t val) {
    int ret = 0;

    ret = vmcs_write(field, val);

    if (ret != VMX_SUCCESS) {
        PrintError("VMWRITE error on %s!: %d\n", v3_vmcs_field_to_str(field), ret);
        return 1;
    }

    return 0;
}

static inline int check_vmcs_read(vmcs_field_t field, void * val) {
    int ret = 0;

    ret = vmcs_read(field, val);

    if (ret != VMX_SUCCESS) {
        PrintError("VMREAD error on %s!: %d\n", v3_vmcs_field_to_str(field), ret);
    }

    return ret;
}


static addr_t allocate_vmcs() {
    struct vmcs_data * vmcs_page = NULL;

    PrintDebug("Allocating page\n");

    vmcs_page = (struct vmcs_data *)V3_VAddr(V3_AllocPages(1));
    memset(vmcs_page, 0, 4096);

    vmcs_page->revision = hw_info.basic_info.revision;
    PrintDebug("VMX Revision: 0x%x\n", vmcs_page->revision);

    return (addr_t)V3_PAddr((void *)vmcs_page);
}


static int init_vmcs_bios(struct guest_info * core, struct vmx_data * vmx_state) {
    int vmx_ret = 0;

    // disable global interrupts for vm state initialization
    v3_disable_ints();

    PrintDebug("Loading VMCS\n");
    vmx_ret = vmcs_load(vmx_state->vmcs_ptr_phys);
    vmx_state->state = VMX_UNLAUNCHED;

    if (vmx_ret != VMX_SUCCESS) {
        PrintError("VMPTRLD failed\n");
        return -1;
    }


    /*** Setup default state from HW ***/

    vmx_state->pin_ctrls.value = hw_info.pin_ctrls.def_val;
    vmx_state->pri_proc_ctrls.value = hw_info.proc_ctrls.def_val;
    vmx_state->exit_ctrls.value = hw_info.exit_ctrls.def_val;
    vmx_state->entry_ctrls.value = hw_info.entry_ctrls.def_val;
    vmx_state->sec_proc_ctrls.value = hw_info.sec_proc_ctrls.def_val;

    /* Print Control MSRs */
    PrintDebug("CR0 MSR: %p\n", (void *)(addr_t)hw_info.cr0.value);
    PrintDebug("CR4 MSR: %p\n", (void *)(addr_t)hw_info.cr4.value);



    /******* Setup Host State **********/

    /* Cache GDTR, IDTR, and TR in host struct */


    /********** Setup VMX Control Fields ***********/

    /* Add external interrupts, NMI exiting, and virtual NMI */
    vmx_state->pin_ctrls.nmi_exit = 1;
    vmx_state->pin_ctrls.ext_int_exit = 1;


    vmx_state->pri_proc_ctrls.hlt_exit = 1;


    vmx_state->pri_proc_ctrls.pause_exit = 0;
    vmx_state->pri_proc_ctrls.tsc_offset = 1;
#ifdef V3_CONFIG_TIME_VIRTUALIZE_TSC
    vmx_state->pri_proc_ctrls.rdtsc_exit = 1;
#endif

    /* Setup IO map */
    vmx_state->pri_proc_ctrls.use_io_bitmap = 1;
    vmx_ret |= check_vmcs_write(VMCS_IO_BITMAP_A_ADDR, (addr_t)V3_PAddr(core->vm_info->io_map.arch_data));
    vmx_ret |= check_vmcs_write(VMCS_IO_BITMAP_B_ADDR, 
            (addr_t)V3_PAddr(core->vm_info->io_map.arch_data) + PAGE_SIZE_4KB);


    vmx_state->pri_proc_ctrls.use_msr_bitmap = 1;
    vmx_ret |= check_vmcs_write(VMCS_MSR_BITMAP, (addr_t)V3_PAddr(core->vm_info->msr_map.arch_data));



#ifdef __V3_64BIT__
    // Ensure host runs in 64-bit mode at each VM EXIT
    vmx_state->exit_ctrls.host_64_on = 1;
#endif

    // Hook all accesses to EFER register
    v3_hook_msr(core->vm_info, EFER_MSR, 
                &v3_handle_efer_read,
                &v3_handle_efer_write, 
                core);

    // Restore host's EFER register on each VM EXIT
    vmx_state->exit_ctrls.ld_efer = 1;

    // Save/restore guest's EFER register to/from VMCS on VM EXIT/ENTRY
    vmx_state->exit_ctrls.save_efer = 1;
    vmx_state->entry_ctrls.ld_efer  = 1;

    // Cause VM_EXIT whenever CR4.VMXE or CR4.PAE bits are written
    vmx_ret |= check_vmcs_write(VMCS_CR4_MASK, CR4_VMXE | CR4_PAE);


    /* Setup paging */
    if (core->shdw_pg_mode == SHADOW_PAGING) {
        PrintDebug("Creating initial shadow page table\n");

        if (v3_init_passthrough_pts(core) == -1) {
            PrintError("Could not initialize passthrough page tables\n");
            return -1;
        }
        
#define CR0_PE 0x00000001
#define CR0_PG 0x80000000
#define CR0_WP 0x00010000 // To ensure mem hooks work
        vmx_ret |= check_vmcs_write(VMCS_CR0_MASK, (CR0_PE | CR0_PG | CR0_WP));

        core->ctrl_regs.cr3 = core->direct_map_pt;

        // vmx_state->pinbased_ctrls |= NMI_EXIT;

        /* Add CR exits */
        vmx_state->pri_proc_ctrls.cr3_ld_exit = 1;
        vmx_state->pri_proc_ctrls.cr3_str_exit = 1;
        
        vmx_state->pri_proc_ctrls.invlpg_exit = 1;
        
        /* Add page fault exits */
        vmx_state->excp_bmap.pf = 1;

        // Setup VMX Assist
        v3_vmxassist_init(core, vmx_state);

    } else if ((core->shdw_pg_mode == NESTED_PAGING) && 
               (v3_cpu_types[core->pcpu_id] == V3_VMX_EPT_CPU)) {

#define CR0_PE 0x00000001
#define CR0_PG 0x80000000
#define CR0_WP 0x00010000 // To ensure mem hooks work
        vmx_ret |= check_vmcs_write(VMCS_CR0_MASK, (CR0_PE | CR0_PG | CR0_WP));

        // vmx_state->pinbased_ctrls |= NMI_EXIT;

        /* Disable CR exits */
        vmx_state->pri_proc_ctrls.cr3_ld_exit = 0;
        vmx_state->pri_proc_ctrls.cr3_str_exit = 0;

        vmx_state->pri_proc_ctrls.invlpg_exit = 0;

        /* Add page fault exits */
        //      vmx_state->excp_bmap.pf = 1; // This should never happen..., enabled to catch bugs
        
        // Setup VMX Assist
        v3_vmxassist_init(core, vmx_state);

        /* Enable EPT */
        vmx_state->pri_proc_ctrls.sec_ctrls = 1; // Enable secondary proc controls
        vmx_state->sec_proc_ctrls.enable_ept = 1; // enable EPT paging



        if (v3_init_ept(core, &hw_info) == -1) {
            PrintError("Error initializing EPT\n");
            return -1;
        }

    } else if ((core->shdw_pg_mode == NESTED_PAGING) && 
               (v3_cpu_types[core->pcpu_id] == V3_VMX_EPT_UG_CPU)) {
        int i = 0;
        // For now we will assume that unrestricted guest mode is assured w/ EPT

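        /* Architectural power-on state: the guest begins at the BIOS reset
         * vector f000:fff0, with the processor signature in (E)DX and only
         * the always-one reserved bit set in RFLAGS. */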
        core->vm_regs.rsp = 0x00;
        core->rip = 0xfff0;
        core->vm_regs.rdx = 0x00000f00;
        core->ctrl_regs.rflags = 0x00000002; // The reserved bit is always 1
        core->ctrl_regs.cr0 = 0x60010010; // Set the WP flag so the memory hooks work in real-mode


        core->segments.cs.selector = 0xf000;
        core->segments.cs.limit = 0xffff;
        core->segments.cs.base = 0x0000000f0000LL;

        // (raw attributes = 0xf3)
        core->segments.cs.type = 0xb;
        core->segments.cs.system = 0x1;
        core->segments.cs.dpl = 0x0;
        core->segments.cs.present = 1;



        struct v3_segment * segregs [] = {&(core->segments.ss), &(core->segments.ds), 
                                          &(core->segments.es), &(core->segments.fs), 
                                          &(core->segments.gs), NULL};

        for ( i = 0; segregs[i] != NULL; i++) {
            struct v3_segment * seg = segregs[i];
        
            seg->selector = 0x0000;
            //    seg->base = seg->selector << 4;
            seg->base = 0x00000000;
            seg->limit = 0xffff;


            seg->type = 0x3;
            seg->system = 0x1;
            seg->dpl = 0x0;
            seg->present = 1;
            //    seg->granularity = 1;

        }


        core->segments.gdtr.limit = 0x0000ffff;
        core->segments.gdtr.base = 0x0000000000000000LL;

        core->segments.idtr.limit = 0x0000ffff;
        core->segments.idtr.base = 0x0000000000000000LL;

        core->segments.ldtr.selector = 0x0000;
        core->segments.ldtr.limit = 0x0000ffff;
        core->segments.ldtr.base = 0x0000000000000000LL;
        core->segments.ldtr.type = 2;
        core->segments.ldtr.present = 1;

        core->segments.tr.selector = 0x0000;
        core->segments.tr.limit = 0x0000ffff;
        core->segments.tr.base = 0x0000000000000000LL;
        core->segments.tr.type = 0xb;
        core->segments.tr.present = 1;

        //      core->dbg_regs.dr6 = 0x00000000ffff0ff0LL;
        core->dbg_regs.dr7 = 0x0000000000000400LL;

        /* Enable EPT */
        vmx_state->pri_proc_ctrls.sec_ctrls = 1; // Enable secondary proc controls
        vmx_state->sec_proc_ctrls.enable_ept = 1; // enable EPT paging
        vmx_state->sec_proc_ctrls.unrstrct_guest = 1; // enable unrestricted guest operation


        /* Disable shadow paging stuff */
        vmx_state->pri_proc_ctrls.cr3_ld_exit = 0;
        vmx_state->pri_proc_ctrls.cr3_str_exit = 0;

        vmx_state->pri_proc_ctrls.invlpg_exit = 0;


        if (v3_init_ept(core, &hw_info) == -1) {
            PrintError("Error initializing EPT\n");
            return -1;
        }

    } else {
        PrintError("Invalid Virtual paging mode\n");
        return -1;
    }


    // hook vmx msrs

    // Setup SYSCALL/SYSENTER MSRs in load/store area
    
    // save STAR, LSTAR, FMASK, KERNEL_GS_BASE MSRs in MSR load/store area
    {
#define IA32_STAR 0xc0000081
#define IA32_LSTAR 0xc0000082
#define IA32_FMASK 0xc0000084
#define IA32_KERN_GS_BASE 0xc0000102

#define IA32_CSTAR 0xc0000083 // Compatibility mode STAR (ignored for now... hopefully it's not that important...)

        int msr_ret = 0;

        struct vmcs_msr_entry * exit_store_msrs = NULL;
        struct vmcs_msr_entry * exit_load_msrs = NULL;
        struct vmcs_msr_entry * entry_load_msrs = NULL;
        int max_msrs = (hw_info.misc_info.max_msr_cache_size + 1) * 4;

        V3_Print("Setting up MSR load/store areas (max_msr_count=%d)\n", max_msrs);

        if (max_msrs < 4) {
            PrintError("Max MSR cache size is too small (%d)\n", max_msrs);
            return -1;
        }

        vmx_state->msr_area = V3_VAddr(V3_AllocPages(1));

        if (vmx_state->msr_area == NULL) {
            PrintError("could not allocate msr load/store area\n");
            return -1;
        }

        msr_ret |= check_vmcs_write(VMCS_EXIT_MSR_STORE_CNT, 4);
        msr_ret |= check_vmcs_write(VMCS_EXIT_MSR_LOAD_CNT, 4);
        msr_ret |= check_vmcs_write(VMCS_ENTRY_MSR_LOAD_CNT, 4);

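        /* The single allocated page is carved into three 4-entry arrays:
         *   [0..3]  VM-exit  store area -- guest MSR values saved on each exit
         *   [4..7]  VM-exit  load  area -- host  MSR values restored on each exit
         *   [8..11] VM-entry load  area -- guest MSR values loaded on each entry
         */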
        exit_store_msrs = (struct vmcs_msr_entry *)(vmx_state->msr_area);
        exit_load_msrs = (struct vmcs_msr_entry *)(vmx_state->msr_area + (sizeof(struct vmcs_msr_entry) * 4));
        entry_load_msrs = (struct vmcs_msr_entry *)(vmx_state->msr_area + (sizeof(struct vmcs_msr_entry) * 8));


        exit_store_msrs[0].index = IA32_STAR;
        exit_store_msrs[1].index = IA32_LSTAR;
        exit_store_msrs[2].index = IA32_FMASK;
        exit_store_msrs[3].index = IA32_KERN_GS_BASE;

        // Give the exit-load and entry-load areas the same MSR indices
        memcpy(exit_load_msrs, exit_store_msrs, sizeof(struct vmcs_msr_entry) * 4);
        memcpy(entry_load_msrs, exit_store_msrs, sizeof(struct vmcs_msr_entry) * 4);


        v3_get_msr(IA32_STAR, &(exit_load_msrs[0].hi), &(exit_load_msrs[0].lo));
        v3_get_msr(IA32_LSTAR, &(exit_load_msrs[1].hi), &(exit_load_msrs[1].lo));
        v3_get_msr(IA32_FMASK, &(exit_load_msrs[2].hi), &(exit_load_msrs[2].lo));
        v3_get_msr(IA32_KERN_GS_BASE, &(exit_load_msrs[3].hi), &(exit_load_msrs[3].lo));

        msr_ret |= check_vmcs_write(VMCS_EXIT_MSR_STORE_ADDR, (addr_t)V3_PAddr(exit_store_msrs));
        msr_ret |= check_vmcs_write(VMCS_EXIT_MSR_LOAD_ADDR, (addr_t)V3_PAddr(exit_load_msrs));
        msr_ret |= check_vmcs_write(VMCS_ENTRY_MSR_LOAD_ADDR, (addr_t)V3_PAddr(entry_load_msrs));

    }    

    /* Sanity check ctrl/reg fields against hw_defaults */




    /*** Write all the info to the VMCS ***/

    /*
    {
        // IS THIS NECESSARY???
#define DEBUGCTL_MSR 0x1d9
        struct v3_msr tmp_msr;
        v3_get_msr(DEBUGCTL_MSR, &(tmp_msr.hi), &(tmp_msr.lo));
        vmx_ret |= check_vmcs_write(VMCS_GUEST_DBG_CTL, tmp_msr.value);
        core->dbg_regs.dr7 = 0x400;
    }
    */

#ifdef __V3_64BIT__
    vmx_ret |= check_vmcs_write(VMCS_LINK_PTR, (addr_t)0xffffffffffffffffULL);
#else
    vmx_ret |= check_vmcs_write(VMCS_LINK_PTR, (addr_t)0xffffffffUL);
    vmx_ret |= check_vmcs_write(VMCS_LINK_PTR_HIGH, (addr_t)0xffffffffUL);
#endif



    if (v3_update_vmcs_ctrl_fields(core)) {
        PrintError("Could not write control fields!\n");
        return -1;
    }
    
    if (v3_update_vmcs_host_state(core)) {
        PrintError("Could not write host state\n");
        return -1;
    }

    // re-enable global interrupts now that the vm state is initialized.
    // If another VM kicks us off, it'll update our vmx state so that we
    // know to reload ourselves
    v3_enable_ints();

    return 0;
}

int v3_init_vmx_vmcs(struct guest_info * core, v3_vm_class_t vm_class) {
    struct vmx_data * vmx_state = NULL;
    int vmx_ret = 0;
    
    vmx_state = (struct vmx_data *)V3_Malloc(sizeof(struct vmx_data));
    memset(vmx_state, 0, sizeof(struct vmx_data));

    PrintDebug("vmx_data pointer: %p\n", (void *)vmx_state);

    PrintDebug("Allocating VMCS\n");
    vmx_state->vmcs_ptr_phys = allocate_vmcs();

    PrintDebug("VMCS pointer: %p\n", (void *)(vmx_state->vmcs_ptr_phys));

    core->vmm_data = vmx_state;
    vmx_state->state = VMX_UNLAUNCHED;

    PrintDebug("Initializing VMCS (addr=%p)\n", core->vmm_data);
    
    // TODO: Fix vmcs fields so they're 32-bit

    PrintDebug("Clearing VMCS: %p\n", (void *)vmx_state->vmcs_ptr_phys);
    vmx_ret = vmcs_clear(vmx_state->vmcs_ptr_phys);

    if (vmx_ret != VMX_SUCCESS) {
        PrintError("VMCLEAR failed\n");
        return -1; 
    }

    if (vm_class == V3_PC_VM) {
        PrintDebug("Initializing VMCS\n");
        if (init_vmcs_bios(core, vmx_state) == -1) {
            PrintError("Error initializing VMCS to BIOS state\n");
            return -1;
        }
    } else {
        PrintError("Invalid VM Class\n");
        return -1;
    }

    PrintDebug("Serializing VMCS: %p\n", (void *)vmx_state->vmcs_ptr_phys);
    vmx_ret = vmcs_clear(vmx_state->vmcs_ptr_phys);

    return 0;
}


int v3_deinit_vmx_vmcs(struct guest_info * core) {
    struct vmx_data * vmx_state = core->vmm_data;

    V3_FreePages((void *)(vmx_state->vmcs_ptr_phys), 1);
    V3_FreePages(V3_PAddr(vmx_state->msr_area), 1);

    V3_Free(vmx_state);

    return 0;
}


#ifdef V3_CONFIG_CHECKPOINT
/* 
 * JRL: This is broken
 */
int v3_vmx_save_core(struct guest_info * core, void * ctx){
    uint64_t vmcs_ptr = vmcs_store();

    v3_chkpt_save(ctx, "vmcs_data", PAGE_SIZE, (void *)vmcs_ptr);

    return 0;
}

int v3_vmx_load_core(struct guest_info * core, void * ctx){
    struct vmx_data * vmx_info = (struct vmx_data *)(core->vmm_data);
    struct cr0_32 * shadow_cr0;
    char vmcs[PAGE_SIZE_4KB];

    v3_chkpt_load(ctx, "vmcs_data", PAGE_SIZE_4KB, vmcs);

    vmcs_clear(vmx_info->vmcs_ptr_phys);
    vmcs_load((addr_t)vmcs);

    v3_vmx_save_vmcs(core);

    shadow_cr0 = (struct cr0_32 *)&(core->ctrl_regs.cr0);


    /* Get the CPU mode to set the guest_ia32e entry ctrl */

    if (core->shdw_pg_mode == SHADOW_PAGING) {
        if (v3_get_vm_mem_mode(core) == VIRTUAL_MEM) {
            if (v3_activate_shadow_pt(core) == -1) {
                PrintError("Failed to activate shadow page tables\n");
                return -1;
            }
        } else {
            if (v3_activate_passthrough_pt(core) == -1) {
                PrintError("Failed to activate passthrough page tables\n");
                return -1;
            }
        }
    }

    return 0;
}
#endif


void v3_flush_vmx_vm_core(struct guest_info * core) {
    struct vmx_data * vmx_info = (struct vmx_data *)(core->vmm_data);
    vmcs_clear(vmx_info->vmcs_ptr_phys);
    vmx_info->state = VMX_UNLAUNCHED;
}



static int update_irq_exit_state(struct guest_info * info) {
    struct vmx_exit_idt_vec_info idt_vec_info;

    check_vmcs_read(VMCS_IDT_VECTOR_INFO, &(idt_vec_info.value));

    if ((info->intr_core_state.irq_started == 1) && (idt_vec_info.valid == 0)) {
#ifdef V3_CONFIG_DEBUG_INTERRUPTS
        V3_Print("Calling v3_injecting_intr\n");
#endif
        info->intr_core_state.irq_started = 0;
        v3_injecting_intr(info, info->intr_core_state.irq_vector, V3_EXTERNAL_IRQ);
    }

    return 0;
}

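/* Decide what, if anything, to inject on the next VM entry, in priority order:
 * (1) a pending exception; (2) a pending interrupt, if the guest is
 * interruptible (first re-injecting any event whose delivery was interrupted
 * by the exit, per the IDT vectoring info); (3) otherwise, if an interrupt is
 * pending but cannot be delivered yet, enable interrupt-window exiting so we
 * get an exit as soon as IF=1. */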
static int update_irq_entry_state(struct guest_info * info) {
    struct vmx_exit_idt_vec_info idt_vec_info;
    struct vmcs_interrupt_state intr_core_state;
    struct vmx_data * vmx_info = (struct vmx_data *)(info->vmm_data);

    check_vmcs_read(VMCS_IDT_VECTOR_INFO, &(idt_vec_info.value));
    check_vmcs_read(VMCS_GUEST_INT_STATE, &(intr_core_state));

    /* Check for pending exceptions to inject */
    if (v3_excp_pending(info)) {
        struct vmx_entry_int_info int_info;
        int_info.value = 0;

        // In VMX, almost every exception is hardware
        // Software exceptions are pretty much only for breakpoint or overflow
        int_info.type = 3;
        int_info.vector = v3_get_excp_number(info);

        if (info->excp_state.excp_error_code_valid) {
            check_vmcs_write(VMCS_ENTRY_EXCP_ERR, info->excp_state.excp_error_code);
            int_info.error_code = 1;

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
            V3_Print("Injecting exception %d with error code %x\n", 
                    int_info.vector, info->excp_state.excp_error_code);
#endif
        }

        int_info.valid = 1;
#ifdef V3_CONFIG_DEBUG_INTERRUPTS
        V3_Print("Injecting exception %d (EIP=%p)\n", int_info.vector, (void *)(addr_t)info->rip);
#endif
        check_vmcs_write(VMCS_ENTRY_INT_INFO, int_info.value);

        v3_injecting_excp(info, int_info.vector);

    } else if ((((struct rflags *)&(info->ctrl_regs.rflags))->intr == 1) && 
               (intr_core_state.val == 0)) {
       
        if ((info->intr_core_state.irq_started == 1) && (idt_vec_info.valid == 1)) {

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
            V3_Print("IRQ pending from previous injection\n");
#endif

            // Copy the IDT vectoring info over to reinject the old interrupt
            if (idt_vec_info.error_code == 1) {
                uint32_t err_code = 0;

                check_vmcs_read(VMCS_IDT_VECTOR_ERR, &err_code);
                check_vmcs_write(VMCS_ENTRY_EXCP_ERR, err_code);
            }

            idt_vec_info.undef = 0;
            check_vmcs_write(VMCS_ENTRY_INT_INFO, idt_vec_info.value);

        } else {
            struct vmx_entry_int_info ent_int;
            ent_int.value = 0;

            switch (v3_intr_pending(info)) {
                case V3_EXTERNAL_IRQ: {
                    info->intr_core_state.irq_vector = v3_get_intr(info); 
                    ent_int.vector = info->intr_core_state.irq_vector;
                    ent_int.type = 0;
                    ent_int.error_code = 0;
                    ent_int.valid = 1;

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
                    V3_Print("Injecting Interrupt %d at exit %u(EIP=%p)\n", 
                               info->intr_core_state.irq_vector, 
                               (uint32_t)info->num_exits, 
                               (void *)(addr_t)info->rip);
#endif

                    check_vmcs_write(VMCS_ENTRY_INT_INFO, ent_int.value);
                    info->intr_core_state.irq_started = 1;

                    break;
                }
                case V3_NMI:
                    PrintDebug("Injecting NMI\n");

                    ent_int.type = 2;
                    ent_int.vector = 2;
                    ent_int.valid = 1;
                    check_vmcs_write(VMCS_ENTRY_INT_INFO, ent_int.value);

                    break;
                case V3_SOFTWARE_INTR:
                    PrintDebug("Injecting software interrupt\n");
                    ent_int.type = 4;

                    ent_int.valid = 1;
                    check_vmcs_write(VMCS_ENTRY_INT_INFO, ent_int.value);

                    break;
                case V3_VIRTUAL_IRQ:
                    // Not sure what to do here, Intel doesn't have virtual IRQs
                    // May be the same as external interrupts/IRQs

                    break;
                case V3_INVALID_INTR:
                default:
                    break;
            }
        }
    } else if ((v3_intr_pending(info)) && (vmx_info->pri_proc_ctrls.int_wndw_exit == 0)) {
        // Enable INTR window exiting so we know when IF=1
        uint32_t instr_len;

        check_vmcs_read(VMCS_EXIT_INSTR_LEN, &instr_len);

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
        V3_Print("Enabling Interrupt-Window exiting: %d\n", instr_len);
#endif

        vmx_info->pri_proc_ctrls.int_wndw_exit = 1;
        check_vmcs_write(VMCS_PROC_CTRLS, vmx_info->pri_proc_ctrls.value);
    }


    return 0;
}


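/* Ring buffer of the last 10 exits: v3_vmx_enter() stores each exit's info at
 * exit_log[num_exits % 10], and print_exit_log() walks it newest to oldest. */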
static struct vmx_exit_info exit_log[10];

static void print_exit_log(struct guest_info * info) {
    int cnt = info->num_exits % 10;
    int i = 0;


    V3_Print("\nExit Log (%d total exits):\n", (uint32_t)info->num_exits);

    for (i = 0; i < 10; i++) {
        struct vmx_exit_info * tmp = &exit_log[cnt];

        V3_Print("%d:\texit_reason = %p\n", i, (void *)(addr_t)tmp->exit_reason);
        V3_Print("\texit_qual = %p\n", (void *)tmp->exit_qual);
        V3_Print("\tint_info = %p\n", (void *)(addr_t)tmp->int_info);
        V3_Print("\tint_err = %p\n", (void *)(addr_t)tmp->int_err);
        V3_Print("\tinstr_info = %p\n", (void *)(addr_t)tmp->instr_info);

        cnt--;

        if (cnt == -1) {
            cnt = 9;
        }

    }

}
/* 
 * CAUTION and DANGER!!! 
 * 
 * The VMCS CANNOT(!!) be accessed outside of the cli/sti calls inside this function.
 * When executing a symbiotic call, the VMCS WILL be overwritten, so any dependencies 
 * on its contents will cause things to break. The contents at the time of the exit WILL 
 * change before the exit handler is executed.
 */
int v3_vmx_enter(struct guest_info * info) {
    int ret = 0;
    uint32_t tsc_offset_low, tsc_offset_high;
    struct vmx_exit_info exit_info;
    struct vmx_data * vmx_info = (struct vmx_data *)(info->vmm_data);

    // Conditionally yield the CPU if the timeslice has expired
    v3_yield_cond(info);

    // Perform any additional yielding needed for time adjustment
    v3_adjust_time(info);

    // disable global interrupts for vm state transition
    v3_disable_ints();

    // Update timer devices late after being in the VM so that as much 
    // of the time in the VM is accounted for as possible. Also do it before
    // updating IRQ entry state so that any interrupts the timers raise get 
    // handled on the next VM entry. Must be done with interrupts disabled.
    v3_update_timers(info);

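    /* If this physical CPU last ran a different VMCS, reclaim it here.
     * VMCLEAR deactivates our VMCS, so the next entry must use VMLAUNCH
     * rather than VMRESUME -- hence the reset to VMX_UNLAUNCHED. */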
    if (vmcs_store() != vmx_info->vmcs_ptr_phys) {
        vmcs_clear(vmx_info->vmcs_ptr_phys);
        vmcs_load(vmx_info->vmcs_ptr_phys);
        vmx_info->state = VMX_UNLAUNCHED;
    }

    v3_vmx_restore_vmcs(info);


#ifdef V3_CONFIG_SYMCALL
    if (info->sym_core_state.symcall_state.sym_call_active == 0) {
        update_irq_entry_state(info);
    }
#else 
    update_irq_entry_state(info);
#endif

    {
        addr_t guest_cr3;
        vmcs_read(VMCS_GUEST_CR3, &guest_cr3);
        vmcs_write(VMCS_GUEST_CR3, guest_cr3);
    }

    // Perform last-minute time bookkeeping prior to entering the VM
    v3_time_enter_vm(info);

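    /* The TSC offset is a 64-bit VMCS control; it is written here as two
     * 32-bit halves (the "_HIGH" field holds bits 63:32). */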
    tsc_offset_high = (uint32_t)((v3_tsc_host_offset(&info->time_state) >> 32) & 0xffffffff);
    tsc_offset_low = (uint32_t)(v3_tsc_host_offset(&info->time_state) & 0xffffffff);
    check_vmcs_write(VMCS_TSC_OFFSET_HIGH, tsc_offset_high);
    check_vmcs_write(VMCS_TSC_OFFSET, tsc_offset_low);


    if (v3_update_vmcs_host_state(info)) {
        v3_enable_ints();
        PrintError("Could not write host state\n");
        return -1;
    }


    if (vmx_info->state == VMX_UNLAUNCHED) {
        vmx_info->state = VMX_LAUNCHED;
        ret = v3_vmx_launch(&(info->vm_regs), info, &(info->ctrl_regs));
    } else {
        V3_ASSERT(vmx_info->state != VMX_UNLAUNCHED);
        ret = v3_vmx_resume(&(info->vm_regs), info, &(info->ctrl_regs));
    }



    //  PrintDebug("VMX Exit: ret=%d\n", ret);

    if (ret != VMX_SUCCESS) {
        uint32_t error = 0;
        vmcs_read(VMCS_INSTR_ERR, &error);

        v3_enable_ints();

        PrintError("VMENTRY Error: %d (launch_ret = %d)\n", error, ret);
        return -1;
    }



    // Immediate exit from VM time bookkeeping
    v3_time_exit_vm(info);

    info->num_exits++;

    /* Update guest state */
    v3_vmx_save_vmcs(info);

    // info->cpl = info->segments.cs.selector & 0x3;

    info->mem_mode = v3_get_vm_mem_mode(info);
    info->cpu_mode = v3_get_vm_cpu_mode(info);


    check_vmcs_read(VMCS_EXIT_INSTR_LEN, &(exit_info.instr_len));
    check_vmcs_read(VMCS_EXIT_INSTR_INFO, &(exit_info.instr_info));
    check_vmcs_read(VMCS_EXIT_REASON, &(exit_info.exit_reason));
    check_vmcs_read(VMCS_EXIT_QUAL, &(exit_info.exit_qual));
    check_vmcs_read(VMCS_EXIT_INT_INFO, &(exit_info.int_info));
    check_vmcs_read(VMCS_EXIT_INT_ERR, &(exit_info.int_err));
    check_vmcs_read(VMCS_GUEST_LINEAR_ADDR, &(exit_info.guest_linear_addr));

    if (info->shdw_pg_mode == NESTED_PAGING) {
        check_vmcs_read(VMCS_GUEST_PHYS_ADDR, &(exit_info.ept_fault_addr));
    }

    //PrintDebug("VMX Exit taken, id-qual: %u-%lu\n", exit_info.exit_reason, exit_info.exit_qual);

    exit_log[info->num_exits % 10] = exit_info;

#ifdef V3_CONFIG_SYMCALL
    if (info->sym_core_state.symcall_state.sym_call_active == 0) {
        update_irq_exit_state(info);
    }
#else
    update_irq_exit_state(info);
#endif

    if (exit_info.exit_reason == VMEXIT_INTR_WINDOW) {
        // This is a special case whose only job is to inject an interrupt
        vmcs_read(VMCS_PROC_CTRLS, &(vmx_info->pri_proc_ctrls.value));
        vmx_info->pri_proc_ctrls.int_wndw_exit = 0;
        vmcs_write(VMCS_PROC_CTRLS, vmx_info->pri_proc_ctrls.value);

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
        V3_Print("Interrupts available again! (RIP=%llx)\n", info->rip);
#endif
    }

    // reenable global interrupts after vm exit
    v3_enable_ints();

    // Conditionally yield the CPU if the timeslice has expired
    v3_yield_cond(info);

    if (v3_handle_vmx_exit(info, &exit_info) == -1) {
        PrintError("Error in VMX exit handler (Exit reason=%x)\n", exit_info.exit_reason);
        return -1;
    }

    return 0;
}


int v3_start_vmx_guest(struct guest_info * info) {

    PrintDebug("Starting VMX core %u\n", info->vcpu_id);

    if (info->vcpu_id == 0) {
        info->core_run_state = CORE_RUNNING;
    } else {

        PrintDebug("VMX core %u: Waiting for core initialization\n", info->vcpu_id);

        while (info->core_run_state == CORE_STOPPED) {

            if (info->vm_info->run_state == VM_STOPPED) {
                // The VM was stopped before this core was initialized. 
                return 0;
            }

            v3_yield(info);
            //PrintDebug("VMX core %u: still waiting for INIT\n",info->vcpu_id);
        }
        
        PrintDebug("VMX core %u initialized\n", info->vcpu_id);

        // We'll be paranoid about race conditions here
        v3_wait_at_barrier(info);
    }


    PrintDebug("VMX core %u: I am starting at CS=0x%x (base=0x%p, limit=0x%x),  RIP=0x%p\n",
               info->vcpu_id, info->segments.cs.selector, (void *)(info->segments.cs.base),
               info->segments.cs.limit, (void *)(info->rip));


    PrintDebug("VMX core %u: Launching VMX VM on logical core %u\n", info->vcpu_id, info->pcpu_id);

    v3_start_time(info);

    while (1) {

        if (info->vm_info->run_state == VM_STOPPED) {
            info->core_run_state = CORE_STOPPED;
            break;
        }

        if (v3_vmx_enter(info) == -1) {

            addr_t host_addr;
            addr_t linear_addr = 0;
            
            info->vm_info->run_state = VM_ERROR;
            
            V3_Print("VMX core %u: VMX ERROR!!\n", info->vcpu_id); 
            
            v3_print_guest_state(info);
            
            V3_Print("VMX core %u\n", info->vcpu_id); 

            linear_addr = get_addr_linear(info, info->rip, &(info->segments.cs));
            
            if (info->mem_mode == PHYSICAL_MEM) {
                v3_gpa_to_hva(info, linear_addr, &host_addr);
            } else if (info->mem_mode == VIRTUAL_MEM) {
                v3_gva_to_hva(info, linear_addr, &host_addr);
            }
            
            V3_Print("VMX core %u: Host Address of rip = 0x%p\n", info->vcpu_id, (void *)host_addr);
            
            V3_Print("VMX core %u: Instr (15 bytes) at %p:\n", info->vcpu_id, (void *)host_addr);
            v3_dump_mem((uint8_t *)host_addr, 15);
            
            v3_print_stack(info);


            v3_print_vmcs();
            print_exit_log(info);
            return -1;
        }

        v3_wait_at_barrier(info);


        if (info->vm_info->run_state == VM_STOPPED) {
            info->core_run_state = CORE_STOPPED;
            break;
        }
/*
        if ((info->num_exits % 5000) == 0) {
            V3_Print("VMX Exit number %d\n", (uint32_t)info->num_exits);
        }
*/

    }

    return 0;
}




#define VMX_FEATURE_CONTROL_MSR     0x0000003a
#define CPUID_VMX_FEATURES 0x00000005  /* LOCK and VMXON */
#define CPUID_1_ECX_VTXFLAG 0x00000020

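/* CPUID.1:ECX bit 5 advertises VMX support. Even then, VMXON only works if
 * the IA32_FEATURE_CONTROL MSR has both the lock bit (bit 0) and the
 * enable-VMXON bit (bit 2) set -- normally done by the BIOS. */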
int v3_is_vmx_capable() {
    v3_msr_t feature_msr;
    uint32_t eax = 0, ebx = 0, ecx = 0, edx = 0;

    v3_cpuid(0x1, &eax, &ebx, &ecx, &edx);

    PrintDebug("ECX: 0x%x\n", ecx);

    if (ecx & CPUID_1_ECX_VTXFLAG) {
        v3_get_msr(VMX_FEATURE_CONTROL_MSR, &(feature_msr.hi), &(feature_msr.lo));
        
        PrintDebug("MSRREGlow: 0x%.8x\n", feature_msr.lo);

        if ((feature_msr.lo & CPUID_VMX_FEATURES) != CPUID_VMX_FEATURES) {
            PrintDebug("VMX is locked -- enable in the BIOS\n");
            return 0;
        }

    } else {
        PrintDebug("VMX not supported on this cpu\n");
        return 0;
    }

    return 1;
}


int v3_reset_vmx_vm_core(struct guest_info * core, addr_t rip) {
    // init vmcs bios

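    /* With EPT + unrestricted guest the reset vector is delivered SIPI-style:
     * vector v resumes execution at physical address (v << 12), encoded as
     * CS.selector = v << 8, CS.base = v << 12, RIP = 0. Otherwise the vector
     * is passed in RBX (with the vcpu id in RDX), presumably for the
     * guest-side startup path to consume. */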
    if ((core->shdw_pg_mode == NESTED_PAGING) && 
        (v3_cpu_types[core->pcpu_id] == V3_VMX_EPT_UG_CPU)) {
        // easy 
        core->rip = 0;
        core->segments.cs.selector = rip << 8;
        core->segments.cs.limit = 0xffff;
        core->segments.cs.base = rip << 12;
    } else {
        core->vm_regs.rdx = core->vcpu_id;
        core->vm_regs.rbx = rip;
    }

    return 0;
}


void v3_init_vmx_cpu(int cpu_id) {
    addr_t vmx_on_region = 0;

    if (cpu_id == 0) {
        if (v3_init_vmx_hw(&hw_info) == -1) {
            PrintError("Could not initialize VMX hardware features on cpu %d\n", cpu_id);
            return;
        }
    }

    enable_vmx();


    // Setup VMXON Region
    vmx_on_region = allocate_vmcs();


    if (vmx_on(vmx_on_region) == VMX_SUCCESS) {
        V3_Print("VMX Enabled\n");
        host_vmcs_ptrs[cpu_id] = vmx_on_region;
    } else {
        V3_Print("VMX already enabled\n");
        V3_FreePages((void *)vmx_on_region, 1);
    }

    PrintDebug("VMXON pointer: 0x%p\n", (void *)host_vmcs_ptrs[cpu_id]);    

    {
        struct vmx_sec_proc_ctrls sec_proc_ctrls;
        sec_proc_ctrls.value = v3_vmx_get_ctrl_features(&(hw_info.sec_proc_ctrls));
        
        if (sec_proc_ctrls.enable_ept == 0) {
            V3_Print("VMX EPT (Nested) Paging not supported\n");
            v3_cpu_types[cpu_id] = V3_VMX_CPU;
        } else if (sec_proc_ctrls.unrstrct_guest == 0) {
            V3_Print("VMX EPT (Nested) Paging supported\n");
            v3_cpu_types[cpu_id] = V3_VMX_EPT_CPU;
        } else {
            V3_Print("VMX EPT (Nested) Paging + Unrestricted guest supported\n");
            v3_cpu_types[cpu_id] = V3_VMX_EPT_UG_CPU;
        }
    }
}


void v3_deinit_vmx_cpu(int cpu_id) {
    extern v3_cpu_arch_t v3_cpu_types[];
    v3_cpu_types[cpu_id] = V3_INVALID_CPU;

    if (host_vmcs_ptrs[cpu_id] != 0) {
        V3_Print("Disabling VMX\n");

        if (vmx_off() != VMX_SUCCESS) {
            PrintError("Error executing VMXOFF\n");
        }

        V3_FreePages((void *)host_vmcs_ptrs[cpu_id], 1);

        host_vmcs_ptrs[cpu_id] = 0;
    }
}