Palacios Public Git Repository

To check out Palacios, execute

  git clone http://v3vee.org/palacios/palacios.web/palacios.git

This will give you the master branch. You probably want the devel branch or one of the release branches. To switch to the devel branch, execute

  cd palacios
  git checkout --track -b devel origin/devel

The other branches work the same way; see the example below.
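For example, to track a release branch instead (the branch name here is illustrative; run git branch -r inside the clone to list the real ones):

  git checkout --track -b Release-1.2 origin/Release-1.2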


revert timer modifications to fix guest deadlocks
palacios/src/palacios/vmx.c
/*
 * This file is part of the Palacios Virtual Machine Monitor developed
 * by the V3VEE Project with funding from the United States National
 * Science Foundation and the Department of Energy.
 *
 * The V3VEE Project is a joint project between Northwestern University
 * and the University of New Mexico.  You can find out more at
 * http://www.v3vee.org
 *
 * Copyright (c) 2011, Jack Lange <jarusl@cs.northwestern.edu>
 * Copyright (c) 2011, The V3VEE Project <http://www.v3vee.org>
 * All rights reserved.
 *
 * Author: Jack Lange <jarusl@cs.northwestern.edu>
 *
 * This is free software.  You are permitted to use,
 * redistribute, and modify it as specified in the file "V3VEE_LICENSE".
 */


#include <palacios/vmx.h>
#include <palacios/vmm.h>
#include <palacios/vmx_handler.h>
#include <palacios/vmcs.h>
#include <palacios/vmx_lowlevel.h>
#include <palacios/vmm_lowlevel.h>
#include <palacios/vmm_ctrl_regs.h>
#include <palacios/vmm_config.h>
#include <palacios/vmm_time.h>
#include <palacios/vm_guest_mem.h>
#include <palacios/vmm_direct_paging.h>
#include <palacios/vmx_io.h>
#include <palacios/vmx_msr.h>
#include <palacios/vmm_decoder.h>
#include <palacios/vmm_barrier.h>

#ifdef V3_CONFIG_CHECKPOINT
#include <palacios/vmm_checkpoint.h>
#endif

#include <palacios/vmx_ept.h>
#include <palacios/vmx_assist.h>
#include <palacios/vmx_hw_info.h>

#ifndef V3_CONFIG_DEBUG_VMX
#undef PrintDebug
#define PrintDebug(fmt, args...)
#endif


/* These fields contain the hardware feature sets supported by the local CPU */
static struct vmx_hw_info hw_info;

extern v3_cpu_arch_t v3_cpu_types[];

static addr_t host_vmcs_ptrs[V3_CONFIG_MAX_CPUS] = { [0 ... V3_CONFIG_MAX_CPUS - 1] = 0};

extern int v3_vmx_launch(struct v3_gprs * vm_regs, struct guest_info * info, struct v3_ctrl_regs * ctrl_regs);
extern int v3_vmx_resume(struct v3_gprs * vm_regs, struct guest_info * info, struct v3_ctrl_regs * ctrl_regs);

static inline int check_vmcs_write(vmcs_field_t field, addr_t val) {
    int ret = 0;

    ret = vmcs_write(field, val);

    if (ret != VMX_SUCCESS) {
        PrintError("VMWRITE error on %s!: %d\n", v3_vmcs_field_to_str(field), ret);
        return 1;
    }

    return 0;
}

static inline int check_vmcs_read(vmcs_field_t field, void * val) {
    int ret = 0;

    ret = vmcs_read(field, val);

    if (ret != VMX_SUCCESS) {
        PrintError("VMREAD error on %s!: %d\n", v3_vmcs_field_to_str(field), ret);
    }

    return ret;
}
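
/* Usage idiom seen throughout this file (a sketch of existing callers, not
 * new API): OR the helpers' return values together and test the accumulated
 * result once, e.g.:
 *
 *     int vmx_ret = 0;
 *     vmx_ret |= check_vmcs_write(VMCS_CR4_MASK, CR4_VMXE | CR4_PAE);
 *     if (vmx_ret != 0) { ... bail out ... }
 */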



static addr_t allocate_vmcs() {
    struct vmcs_data * vmcs_page = NULL;

    PrintDebug("Allocating page\n");

    vmcs_page = (struct vmcs_data *)V3_VAddr(V3_AllocPages(1));
    memset(vmcs_page, 0, 4096);

    vmcs_page->revision = hw_info.basic_info.revision;
    PrintDebug("VMX Revision: 0x%x\n", vmcs_page->revision);

    return (addr_t)V3_PAddr((void *)vmcs_page);
}
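
/* Note: the zeroed page is stamped with the hardware VMCS revision ID and
 * returned by physical address, since both VMPTRLD (vmcs_load() below) and
 * VMXON (vmx_on() in v3_init_vmx_cpu()) take physical addresses.
 */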



static int init_vmcs_bios(struct guest_info * core, struct vmx_data * vmx_state) {
    int vmx_ret = 0;

    // disable global interrupts for vm state initialization
    v3_disable_ints();

    PrintDebug("Loading VMCS\n");
    vmx_ret = vmcs_load(vmx_state->vmcs_ptr_phys);
    vmx_state->state = VMX_UNLAUNCHED;

    if (vmx_ret != VMX_SUCCESS) {
        PrintError("VMPTRLD failed\n");
        return -1;
    }


    /*** Setup default state from HW ***/

    vmx_state->pin_ctrls.value = hw_info.pin_ctrls.def_val;
    vmx_state->pri_proc_ctrls.value = hw_info.proc_ctrls.def_val;
    vmx_state->exit_ctrls.value = hw_info.exit_ctrls.def_val;
    vmx_state->entry_ctrls.value = hw_info.entry_ctrls.def_val;
    vmx_state->sec_proc_ctrls.value = hw_info.sec_proc_ctrls.def_val;

    /* Print Control MSRs */
    PrintDebug("CR0 MSR: %p\n", (void *)(addr_t)hw_info.cr0.value);
    PrintDebug("CR4 MSR: %p\n", (void *)(addr_t)hw_info.cr4.value);



    /******* Setup Host State **********/

    /* Cache GDTR, IDTR, and TR in host struct */


    /********** Setup VMX Control Fields ***********/

    /* Add external interrupts, NMI exiting, and virtual NMI */
    vmx_state->pin_ctrls.nmi_exit = 1;
    vmx_state->pin_ctrls.ext_int_exit = 1;


    vmx_state->pri_proc_ctrls.hlt_exit = 1;


    vmx_state->pri_proc_ctrls.pause_exit = 0;
    vmx_state->pri_proc_ctrls.tsc_offset = 1;
#ifdef V3_CONFIG_TIME_VIRTUALIZE_TSC
    vmx_state->pri_proc_ctrls.rdtsc_exit = 1;
#endif

    /* Setup IO map */
    vmx_state->pri_proc_ctrls.use_io_bitmap = 1;
    vmx_ret |= check_vmcs_write(VMCS_IO_BITMAP_A_ADDR, (addr_t)V3_PAddr(core->vm_info->io_map.arch_data));
    vmx_ret |= check_vmcs_write(VMCS_IO_BITMAP_B_ADDR,
            (addr_t)V3_PAddr(core->vm_info->io_map.arch_data) + PAGE_SIZE_4KB);


    vmx_state->pri_proc_ctrls.use_msr_bitmap = 1;
    vmx_ret |= check_vmcs_write(VMCS_MSR_BITMAP, (addr_t)V3_PAddr(core->vm_info->msr_map.arch_data));



#ifdef __V3_64BIT__
    // Ensure host runs in 64-bit mode at each VM EXIT
    vmx_state->exit_ctrls.host_64_on = 1;
#endif

    // Hook all accesses to EFER register
    v3_hook_msr(core->vm_info, EFER_MSR,
                &v3_handle_efer_read,
                &v3_handle_efer_write,
                core);

    // Restore host's EFER register on each VM EXIT
    vmx_state->exit_ctrls.ld_efer = 1;

    // Save/restore guest's EFER register to/from VMCS on VM EXIT/ENTRY
    vmx_state->exit_ctrls.save_efer = 1;
    vmx_state->entry_ctrls.ld_efer  = 1;

    // Cause VM_EXIT whenever CR4.VMXE or CR4.PAE bits are written
    vmx_ret |= check_vmcs_write(VMCS_CR4_MASK, CR4_VMXE | CR4_PAE);


    /* Setup paging */
    if (core->shdw_pg_mode == SHADOW_PAGING) {
        PrintDebug("Creating initial shadow page table\n");

        if (v3_init_passthrough_pts(core) == -1) {
            PrintError("Could not initialize passthrough page tables\n");
            return -1;
        }

#define CR0_PE 0x00000001
#define CR0_PG 0x80000000
#define CR0_WP 0x00010000 // To ensure mem hooks work
        vmx_ret |= check_vmcs_write(VMCS_CR0_MASK, (CR0_PE | CR0_PG | CR0_WP));

        core->ctrl_regs.cr3 = core->direct_map_pt;

        // vmx_state->pinbased_ctrls |= NMI_EXIT;

        /* Add CR exits */
        vmx_state->pri_proc_ctrls.cr3_ld_exit = 1;
        vmx_state->pri_proc_ctrls.cr3_str_exit = 1;

        vmx_state->pri_proc_ctrls.invlpg_exit = 1;

        /* Add page fault exits */
        vmx_state->excp_bmap.pf = 1;

        // Setup VMX Assist
        v3_vmxassist_init(core, vmx_state);

    } else if ((core->shdw_pg_mode == NESTED_PAGING) &&
               (v3_cpu_types[core->pcpu_id] == V3_VMX_EPT_CPU)) {

        /* CR0_PE, CR0_PG, and CR0_WP are defined in the shadow paging case above */
        vmx_ret |= check_vmcs_write(VMCS_CR0_MASK, (CR0_PE | CR0_PG | CR0_WP));

        // vmx_state->pinbased_ctrls |= NMI_EXIT;

        /* Disable CR exits */
        vmx_state->pri_proc_ctrls.cr3_ld_exit = 0;
        vmx_state->pri_proc_ctrls.cr3_str_exit = 0;

        vmx_state->pri_proc_ctrls.invlpg_exit = 0;

        /* Add page fault exits */
        //      vmx_state->excp_bmap.pf = 1; // This should never happen..., enabled to catch bugs

        // Setup VMX Assist
        v3_vmxassist_init(core, vmx_state);

        /* Enable EPT */
        vmx_state->pri_proc_ctrls.sec_ctrls = 1; // Enable secondary proc controls
        vmx_state->sec_proc_ctrls.enable_ept = 1; // enable EPT paging



        if (v3_init_ept(core, &hw_info) == -1) {
            PrintError("Error initializing EPT\n");
            return -1;
        }

    } else if ((core->shdw_pg_mode == NESTED_PAGING) &&
               (v3_cpu_types[core->pcpu_id] == V3_VMX_EPT_UG_CPU)) {
        int i = 0;
        // For now we will assume that unrestricted guest mode is assured w/ EPT


        core->vm_regs.rsp = 0x00;
        core->rip = 0xfff0;
        core->vm_regs.rdx = 0x00000f00;
        core->ctrl_regs.rflags = 0x00000002; // The reserved bit is always 1
        core->ctrl_regs.cr0 = 0x60010010; // Set the WP flag so the memory hooks work in real-mode


        core->segments.cs.selector = 0xf000;
        core->segments.cs.limit = 0xffff;
        core->segments.cs.base = 0x0000000f0000LL;

        // (raw attributes = 0xf3)
        core->segments.cs.type = 0xb;
        core->segments.cs.system = 0x1;
        core->segments.cs.dpl = 0x0;
        core->segments.cs.present = 1;



        struct v3_segment * segregs [] = {&(core->segments.ss), &(core->segments.ds),
                                          &(core->segments.es), &(core->segments.fs),
                                          &(core->segments.gs), NULL};

        for ( i = 0; segregs[i] != NULL; i++) {
            struct v3_segment * seg = segregs[i];

            seg->selector = 0x0000;
            //    seg->base = seg->selector << 4;
            seg->base = 0x00000000;
            seg->limit = 0xffff;


            seg->type = 0x3;
            seg->system = 0x1;
            seg->dpl = 0x0;
            seg->present = 1;
            //    seg->granularity = 1;

        }


        core->segments.gdtr.limit = 0x0000ffff;
        core->segments.gdtr.base = 0x0000000000000000LL;

        core->segments.idtr.limit = 0x0000ffff;
        core->segments.idtr.base = 0x0000000000000000LL;

        core->segments.ldtr.selector = 0x0000;
        core->segments.ldtr.limit = 0x0000ffff;
        core->segments.ldtr.base = 0x0000000000000000LL;
        core->segments.ldtr.type = 2;
        core->segments.ldtr.present = 1;

        core->segments.tr.selector = 0x0000;
        core->segments.tr.limit = 0x0000ffff;
        core->segments.tr.base = 0x0000000000000000LL;
        core->segments.tr.type = 0xb;
        core->segments.tr.present = 1;

        //      core->dbg_regs.dr6 = 0x00000000ffff0ff0LL;
        core->dbg_regs.dr7 = 0x0000000000000400LL;

        /* Enable EPT */
        vmx_state->pri_proc_ctrls.sec_ctrls = 1; // Enable secondary proc controls
        vmx_state->sec_proc_ctrls.enable_ept = 1; // enable EPT paging
        vmx_state->sec_proc_ctrls.unrstrct_guest = 1; // enable unrestricted guest operation


        /* Disable shadow paging stuff */
        vmx_state->pri_proc_ctrls.cr3_ld_exit = 0;
        vmx_state->pri_proc_ctrls.cr3_str_exit = 0;

        vmx_state->pri_proc_ctrls.invlpg_exit = 0;


        if (v3_init_ept(core, &hw_info) == -1) {
            PrintError("Error initializing EPT\n");
            return -1;
        }

    } else {
        PrintError("Invalid Virtual paging mode\n");
        return -1;
    }


    // hook vmx msrs

    // Setup SYSCALL/SYSENTER MSRs in load/store area

    // save STAR, LSTAR, FMASK, KERNEL_GS_BASE MSRs in MSR load/store area
    {
#define IA32_STAR 0xc0000081
#define IA32_LSTAR 0xc0000082
#define IA32_FMASK 0xc0000084
#define IA32_KERN_GS_BASE 0xc0000102

#define IA32_CSTAR 0xc0000083 // Compatibility mode STAR (ignored for now... hopefully it's not that important...)

        int msr_ret = 0;

        struct vmcs_msr_entry * exit_store_msrs = NULL;
        struct vmcs_msr_entry * exit_load_msrs = NULL;
        struct vmcs_msr_entry * entry_load_msrs = NULL;
        int max_msrs = (hw_info.misc_info.max_msr_cache_size + 1) * 4;

        V3_Print("Setting up MSR load/store areas (max_msr_count=%d)\n", max_msrs);

        if (max_msrs < 4) {
            PrintError("Max MSR cache size is too small (%d)\n", max_msrs);
            return -1;
        }

        vmx_state->msr_area = V3_VAddr(V3_AllocPages(1));

        if (vmx_state->msr_area == NULL) {
            PrintError("could not allocate msr load/store area\n");
            return -1;
        }

        // start from a zeroed page so unused entry values/indices are well-defined
        memset(vmx_state->msr_area, 0, PAGE_SIZE_4KB);

        msr_ret |= check_vmcs_write(VMCS_EXIT_MSR_STORE_CNT, 4);
        msr_ret |= check_vmcs_write(VMCS_EXIT_MSR_LOAD_CNT, 4);
        msr_ret |= check_vmcs_write(VMCS_ENTRY_MSR_LOAD_CNT, 4);

        /* The msr_area page is carved into three 4-entry lists:
         *   [0..3]  exit-store (guest MSR values saved here on VM exit)
         *   [4..7]  exit-load  (host MSR values restored from here on VM exit)
         *   [8..11] entry-load (guest MSR values loaded from here on VM entry)
         */
        exit_store_msrs = (struct vmcs_msr_entry *)(vmx_state->msr_area);
        exit_load_msrs = (struct vmcs_msr_entry *)(vmx_state->msr_area + (sizeof(struct vmcs_msr_entry) * 4));
        entry_load_msrs = (struct vmcs_msr_entry *)(vmx_state->msr_area + (sizeof(struct vmcs_msr_entry) * 8));


        exit_store_msrs[0].index = IA32_STAR;
        exit_store_msrs[1].index = IA32_LSTAR;
        exit_store_msrs[2].index = IA32_FMASK;
        exit_store_msrs[3].index = IA32_KERN_GS_BASE;

        // Propagate the MSR indices to both load lists (memcpy: destination first)
        memcpy(exit_load_msrs, exit_store_msrs, sizeof(struct vmcs_msr_entry) * 4);
        memcpy(entry_load_msrs, exit_store_msrs, sizeof(struct vmcs_msr_entry) * 4);


        v3_get_msr(IA32_STAR, &(exit_load_msrs[0].hi), &(exit_load_msrs[0].lo));
        v3_get_msr(IA32_LSTAR, &(exit_load_msrs[1].hi), &(exit_load_msrs[1].lo));
        v3_get_msr(IA32_FMASK, &(exit_load_msrs[2].hi), &(exit_load_msrs[2].lo));
        v3_get_msr(IA32_KERN_GS_BASE, &(exit_load_msrs[3].hi), &(exit_load_msrs[3].lo));

        msr_ret |= check_vmcs_write(VMCS_EXIT_MSR_STORE_ADDR, (addr_t)V3_PAddr(exit_store_msrs));
        msr_ret |= check_vmcs_write(VMCS_EXIT_MSR_LOAD_ADDR, (addr_t)V3_PAddr(exit_load_msrs));
        msr_ret |= check_vmcs_write(VMCS_ENTRY_MSR_LOAD_ADDR, (addr_t)V3_PAddr(entry_load_msrs));

        if (msr_ret != 0) {
            PrintError("Error configuring MSR load/store areas\n");
            return -1;
        }
    }

    /* Sanity check ctrl/reg fields against hw_defaults */




    /*** Write all the info to the VMCS ***/

    /*
    {
        // IS THIS NECESSARY???
#define DEBUGCTL_MSR 0x1d9
        struct v3_msr tmp_msr;
        v3_get_msr(DEBUGCTL_MSR, &(tmp_msr.hi), &(tmp_msr.lo));
        vmx_ret |= check_vmcs_write(VMCS_GUEST_DBG_CTL, tmp_msr.value);
        core->dbg_regs.dr7 = 0x400;
    }
    */

#ifdef __V3_64BIT__
    vmx_ret |= check_vmcs_write(VMCS_LINK_PTR, (addr_t)0xffffffffffffffffULL);
#else
    vmx_ret |= check_vmcs_write(VMCS_LINK_PTR, (addr_t)0xffffffffUL);
    vmx_ret |= check_vmcs_write(VMCS_LINK_PTR_HIGH, (addr_t)0xffffffffUL);
#endif

    // Any check_vmcs_write() failure accumulated above is fatal
    if (vmx_ret != 0) {
        PrintError("Error writing VMCS control/mask fields\n");
        return -1;
    }

    if (v3_update_vmcs_ctrl_fields(core)) {
        PrintError("Could not write control fields!\n");
        return -1;
    }

    if (v3_update_vmcs_host_state(core)) {
        PrintError("Could not write host state\n");
        return -1;
    }

    // reenable global interrupts for vm state initialization now
    // that the vm state is initialized. If another VM kicks us off,
    // it'll update our vmx state so that we know to reload ourselves
    v3_enable_ints();

    return 0;
}

int v3_init_vmx_vmcs(struct guest_info * core, v3_vm_class_t vm_class) {
    struct vmx_data * vmx_state = NULL;
    int vmx_ret = 0;

    vmx_state = (struct vmx_data *)V3_Malloc(sizeof(struct vmx_data));

    if (vmx_state == NULL) {
        PrintError("Could not allocate vmx_data\n");
        return -1;
    }

    memset(vmx_state, 0, sizeof(struct vmx_data));

    PrintDebug("vmx_data pointer: %p\n", (void *)vmx_state);

    PrintDebug("Allocating VMCS\n");
    vmx_state->vmcs_ptr_phys = allocate_vmcs();

    PrintDebug("VMCS pointer: %p\n", (void *)(vmx_state->vmcs_ptr_phys));

    core->vmm_data = vmx_state;
    vmx_state->state = VMX_UNLAUNCHED;

    PrintDebug("Initializing VMCS (addr=%p)\n", core->vmm_data);

    // TODO: Fix vmcs fields so they're 32-bit

    PrintDebug("Clearing VMCS: %p\n", (void *)vmx_state->vmcs_ptr_phys);
    vmx_ret = vmcs_clear(vmx_state->vmcs_ptr_phys);

    if (vmx_ret != VMX_SUCCESS) {
        PrintError("VMCLEAR failed\n");
        return -1;
    }

    if (vm_class == V3_PC_VM) {
        PrintDebug("Initializing VMCS\n");
        if (init_vmcs_bios(core, vmx_state) == -1) {
            PrintError("Error initializing VMCS to BIOS state\n");
            return -1;
        }
    } else {
        PrintError("Invalid VM Class\n");
        return -1;
    }

    PrintDebug("Serializing VMCS: %p\n", (void *)vmx_state->vmcs_ptr_phys);
    vmx_ret = vmcs_clear(vmx_state->vmcs_ptr_phys);

    return 0;
}


int v3_deinit_vmx_vmcs(struct guest_info * core) {
    struct vmx_data * vmx_state = core->vmm_data;

    V3_FreePages((void *)(vmx_state->vmcs_ptr_phys), 1);
    V3_FreePages(V3_PAddr(vmx_state->msr_area), 1);

    V3_Free(vmx_state);

    return 0;
}



#ifdef V3_CONFIG_CHECKPOINT
/*
 * JRL: This is broken
 */
int v3_vmx_save_core(struct guest_info * core, void * ctx){
    uint64_t vmcs_ptr = vmcs_store();

    v3_chkpt_save(ctx, "vmcs_data", PAGE_SIZE, (void *)vmcs_ptr);

    return 0;
}

int v3_vmx_load_core(struct guest_info * core, void * ctx){
    struct vmx_data * vmx_info = (struct vmx_data *)(core->vmm_data);
    struct cr0_32 * shadow_cr0;
    char vmcs[PAGE_SIZE_4KB];

    v3_chkpt_load(ctx, "vmcs_data", PAGE_SIZE_4KB, vmcs);

    vmcs_clear(vmx_info->vmcs_ptr_phys);
    vmcs_load((addr_t)vmcs);

    v3_vmx_save_vmcs(core);

    shadow_cr0 = (struct cr0_32 *)&(core->ctrl_regs.cr0);


    /* Get the CPU mode to set the guest_ia32e entry ctrl */

    if (core->shdw_pg_mode == SHADOW_PAGING) {
        if (v3_get_vm_mem_mode(core) == VIRTUAL_MEM) {
            if (v3_activate_shadow_pt(core) == -1) {
                PrintError("Failed to activate shadow page tables\n");
                return -1;
            }
        } else {
            if (v3_activate_passthrough_pt(core) == -1) {
                PrintError("Failed to activate passthrough page tables\n");
                return -1;
            }
        }
    }

    return 0;
}
#endif


void v3_flush_vmx_vm_core(struct guest_info * core) {
    struct vmx_data * vmx_info = (struct vmx_data *)(core->vmm_data);
    // VMCLEAR makes the VMCS inactive and "not launched", so the next entry
    // on this core must use VMLAUNCH rather than VMRESUME
    vmcs_clear(vmx_info->vmcs_ptr_phys);
    vmx_info->state = VMX_UNLAUNCHED;
}



static int update_irq_exit_state(struct guest_info * info) {
    struct vmx_exit_idt_vec_info idt_vec_info;

    check_vmcs_read(VMCS_IDT_VECTOR_INFO, &(idt_vec_info.value));

    if ((info->intr_core_state.irq_started == 1) && (idt_vec_info.valid == 0)) {
#ifdef V3_CONFIG_DEBUG_INTERRUPTS
        V3_Print("Calling v3_injecting_intr\n");
#endif
        info->intr_core_state.irq_started = 0;
        v3_injecting_intr(info, info->intr_core_state.irq_vector, V3_EXTERNAL_IRQ);
    }

    return 0;
}

static int update_irq_entry_state(struct guest_info * info) {
    struct vmx_exit_idt_vec_info idt_vec_info;
    struct vmcs_interrupt_state intr_core_state;
    struct vmx_data * vmx_info = (struct vmx_data *)(info->vmm_data);

    check_vmcs_read(VMCS_IDT_VECTOR_INFO, &(idt_vec_info.value));
    check_vmcs_read(VMCS_GUEST_INT_STATE, &(intr_core_state));

    /* Check for pending exceptions to inject */
    if (v3_excp_pending(info)) {
        struct vmx_entry_int_info int_info;
        int_info.value = 0;

        // In VMX, almost every exception is hardware
        // Software exceptions are pretty much only for breakpoint or overflow
        int_info.type = 3;
        int_info.vector = v3_get_excp_number(info);

        if (info->excp_state.excp_error_code_valid) {
            check_vmcs_write(VMCS_ENTRY_EXCP_ERR, info->excp_state.excp_error_code);
            int_info.error_code = 1;

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
            V3_Print("Injecting exception %d with error code %x\n",
                    int_info.vector, info->excp_state.excp_error_code);
#endif
        }

        int_info.valid = 1;
#ifdef V3_CONFIG_DEBUG_INTERRUPTS
        V3_Print("Injecting exception %d (EIP=%p)\n", int_info.vector, (void *)(addr_t)info->rip);
#endif
        check_vmcs_write(VMCS_ENTRY_INT_INFO, int_info.value);

        v3_injecting_excp(info, int_info.vector);

    } else if ((((struct rflags *)&(info->ctrl_regs.rflags))->intr == 1) &&
               (intr_core_state.val == 0)) {

        if ((info->intr_core_state.irq_started == 1) && (idt_vec_info.valid == 1)) {

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
            V3_Print("IRQ pending from previous injection\n");
#endif

            // Copy the IDT vectoring info over to reinject the old interrupt
            if (idt_vec_info.error_code == 1) {
                uint32_t err_code = 0;

                check_vmcs_read(VMCS_IDT_VECTOR_ERR, &err_code);
                check_vmcs_write(VMCS_ENTRY_EXCP_ERR, err_code);
            }

            idt_vec_info.undef = 0;
            check_vmcs_write(VMCS_ENTRY_INT_INFO, idt_vec_info.value);

        } else {
            struct vmx_entry_int_info ent_int;
            ent_int.value = 0;

            switch (v3_intr_pending(info)) {
                case V3_EXTERNAL_IRQ: {
                    info->intr_core_state.irq_vector = v3_get_intr(info);
                    ent_int.vector = info->intr_core_state.irq_vector;
                    ent_int.type = 0;
                    ent_int.error_code = 0;
                    ent_int.valid = 1;

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
                    V3_Print("Injecting Interrupt %d at exit %u (EIP=%p)\n",
                               info->intr_core_state.irq_vector,
                               (uint32_t)info->num_exits,
                               (void *)(addr_t)info->rip);
#endif

                    check_vmcs_write(VMCS_ENTRY_INT_INFO, ent_int.value);
                    info->intr_core_state.irq_started = 1;

                    break;
                }
                case V3_NMI:
                    PrintDebug("Injecting NMI\n");

                    ent_int.type = 2;
                    ent_int.vector = 2;
                    ent_int.valid = 1;
                    check_vmcs_write(VMCS_ENTRY_INT_INFO, ent_int.value);

                    break;
                case V3_SOFTWARE_INTR:
                    PrintDebug("Injecting software interrupt\n");
                    ent_int.type = 4; // software interrupt (note: the vector is left unset here)

                    ent_int.valid = 1;
                    check_vmcs_write(VMCS_ENTRY_INT_INFO, ent_int.value);

                    break;
                case V3_VIRTUAL_IRQ:
                    // Not sure what to do here, Intel doesn't have virtual IRQs
                    // May be the same as external interrupts/IRQs

                    break;
                case V3_INVALID_INTR:
                default:
                    break;
            }
        }
    } else if ((v3_intr_pending(info)) && (vmx_info->pri_proc_ctrls.int_wndw_exit == 0)) {
        // Enable INTR window exiting so we know when IF=1
        uint32_t instr_len;

        check_vmcs_read(VMCS_EXIT_INSTR_LEN, &instr_len);

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
        V3_Print("Enabling Interrupt-Window exiting: %d\n", instr_len);
#endif

        vmx_info->pri_proc_ctrls.int_wndw_exit = 1;
        check_vmcs_write(VMCS_PROC_CTRLS, vmx_info->pri_proc_ctrls.value);
    }


    return 0;
}



static struct vmx_exit_info exit_log[10];

static void print_exit_log(struct guest_info * info) {
    int cnt = info->num_exits % 10;
    int i = 0;


    V3_Print("\nExit Log (%d total exits):\n", (uint32_t)info->num_exits);

    for (i = 0; i < 10; i++) {
        struct vmx_exit_info * tmp = &exit_log[cnt];

        V3_Print("%d:\texit_reason = %p\n", i, (void *)(addr_t)tmp->exit_reason);
        V3_Print("\texit_qual = %p\n", (void *)tmp->exit_qual);
        V3_Print("\tint_info = %p\n", (void *)(addr_t)tmp->int_info);
        V3_Print("\tint_err = %p\n", (void *)(addr_t)tmp->int_err);
        V3_Print("\tinstr_info = %p\n", (void *)(addr_t)tmp->instr_info);

        cnt--;

        if (cnt == -1) {
            cnt = 9;
        }

    }

}

/*
 * CAUTION and DANGER!!!
 *
 * The VMCS CANNOT(!!) be accessed outside of the cli/sti calls inside this function
 * When executing a symbiotic call, the VMCS WILL be overwritten, so any dependencies
 * on its contents will cause things to break. The contents at the time of the exit WILL
 * change before the exit handler is executed.
 */
int v3_vmx_enter(struct guest_info * info) {
    int ret = 0;
    uint32_t tsc_offset_low, tsc_offset_high;
    struct vmx_exit_info exit_info;
    struct vmx_data * vmx_info = (struct vmx_data *)(info->vmm_data);

    // Conditionally yield the CPU if the timeslice has expired
    v3_yield_cond(info);

    // Perform any additional yielding needed for time adjustment
    v3_adjust_time(info);

    // disable global interrupts for vm state transition
    v3_disable_ints();

    // Update timer devices late after being in the VM so that as much
    // of the time in the VM is accounted for as possible. Also do it before
    // updating IRQ entry state so that any interrupts the timers raise get
    // handled on the next VM entry. Must be done with interrupts disabled.
    v3_update_timers(info);

    if (vmcs_store() != vmx_info->vmcs_ptr_phys) {
        vmcs_clear(vmx_info->vmcs_ptr_phys);
        vmcs_load(vmx_info->vmcs_ptr_phys);
        vmx_info->state = VMX_UNLAUNCHED;
    }

    v3_vmx_restore_vmcs(info);


#ifdef V3_CONFIG_SYMCALL
    if (info->sym_core_state.symcall_state.sym_call_active == 0) {
        update_irq_entry_state(info);
    }
#else
    update_irq_entry_state(info);
#endif

    {
        addr_t guest_cr3;
        vmcs_read(VMCS_GUEST_CR3, &guest_cr3);
        vmcs_write(VMCS_GUEST_CR3, guest_cr3);
    }

    // Perform last-minute time bookkeeping prior to entering the VM
    v3_time_enter_vm(info);

    // The TSC offset is a 64-bit VMCS field, written as two 32-bit halves
    tsc_offset_high = (uint32_t)((v3_tsc_host_offset(&info->time_state) >> 32) & 0xffffffff);
    tsc_offset_low = (uint32_t)(v3_tsc_host_offset(&info->time_state) & 0xffffffff);
    check_vmcs_write(VMCS_TSC_OFFSET_HIGH, tsc_offset_high);
    check_vmcs_write(VMCS_TSC_OFFSET, tsc_offset_low);


    if (v3_update_vmcs_host_state(info)) {
        v3_enable_ints();
        PrintError("Could not write host state\n");
        return -1;
    }


    if (vmx_info->state == VMX_UNLAUNCHED) {
        vmx_info->state = VMX_LAUNCHED;

        info->vm_info->run_state = VM_RUNNING;
        ret = v3_vmx_launch(&(info->vm_regs), info, &(info->ctrl_regs));
    } else {
        V3_ASSERT(vmx_info->state != VMX_UNLAUNCHED);
        ret = v3_vmx_resume(&(info->vm_regs), info, &(info->ctrl_regs));
    }



    //  PrintDebug("VMX Exit: ret=%d\n", ret);

    if (ret != VMX_SUCCESS) {
        uint32_t error = 0;
        vmcs_read(VMCS_INSTR_ERR, &error);

        v3_enable_ints();

        PrintError("VMENTRY Error: %d (launch_ret = %d)\n", error, ret);
        return -1;
    }



    // Immediate exit from VM time bookkeeping
    v3_time_exit_vm(info);

    info->num_exits++;

    /* Update guest state */
    v3_vmx_save_vmcs(info);

    // info->cpl = info->segments.cs.selector & 0x3;

    info->mem_mode = v3_get_vm_mem_mode(info);
    info->cpu_mode = v3_get_vm_cpu_mode(info);


    check_vmcs_read(VMCS_EXIT_INSTR_LEN, &(exit_info.instr_len));
    check_vmcs_read(VMCS_EXIT_INSTR_INFO, &(exit_info.instr_info));
    check_vmcs_read(VMCS_EXIT_REASON, &(exit_info.exit_reason));
    check_vmcs_read(VMCS_EXIT_QUAL, &(exit_info.exit_qual));
    check_vmcs_read(VMCS_EXIT_INT_INFO, &(exit_info.int_info));
    check_vmcs_read(VMCS_EXIT_INT_ERR, &(exit_info.int_err));
    check_vmcs_read(VMCS_GUEST_LINEAR_ADDR, &(exit_info.guest_linear_addr));

    if (info->shdw_pg_mode == NESTED_PAGING) {
        check_vmcs_read(VMCS_GUEST_PHYS_ADDR, &(exit_info.ept_fault_addr));
    }

    //PrintDebug("VMX Exit taken, id-qual: %u-%lu\n", exit_info.exit_reason, exit_info.exit_qual);

    exit_log[info->num_exits % 10] = exit_info;

#ifdef V3_CONFIG_SYMCALL
    if (info->sym_core_state.symcall_state.sym_call_active == 0) {
        update_irq_exit_state(info);
    }
#else
    update_irq_exit_state(info);
#endif

    if (exit_info.exit_reason == VMEXIT_INTR_WINDOW) {
        // This is a special case whose only job is to inject an interrupt
        vmcs_read(VMCS_PROC_CTRLS, &(vmx_info->pri_proc_ctrls.value));
        vmx_info->pri_proc_ctrls.int_wndw_exit = 0;
        vmcs_write(VMCS_PROC_CTRLS, vmx_info->pri_proc_ctrls.value);

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
        V3_Print("Interrupts available again! (RIP=%llx)\n", info->rip);
#endif
    }

    // reenable global interrupts after vm exit
    v3_enable_ints();

    // Conditionally yield the CPU if the timeslice has expired
    v3_yield_cond(info);

    if (v3_handle_vmx_exit(info, &exit_info) == -1) {
        PrintError("Error in VMX exit handler (Exit reason=%x)\n", exit_info.exit_reason);
        return -1;
    }

    return 0;
}


int v3_start_vmx_guest(struct guest_info * info) {

    PrintDebug("Starting VMX core %u\n", info->vcpu_id);

    if (info->vcpu_id == 0) {
        info->core_run_state = CORE_RUNNING;
        info->vm_info->run_state = VM_RUNNING;
    } else {

        PrintDebug("VMX core %u: Waiting for core initialization\n", info->vcpu_id);

        while (info->core_run_state == CORE_STOPPED) {
            v3_yield(info);
            //PrintDebug("VMX core %u: still waiting for INIT\n", info->vcpu_id);
        }

        PrintDebug("VMX core %u initialized\n", info->vcpu_id);

        // We'll be paranoid about race conditions here
        v3_wait_at_barrier(info);
    }


    PrintDebug("VMX core %u: I am starting at CS=0x%x (base=0x%p, limit=0x%x), RIP=0x%p\n",
               info->vcpu_id, info->segments.cs.selector, (void *)(info->segments.cs.base),
               info->segments.cs.limit, (void *)(info->rip));


    PrintDebug("VMX core %u: Launching VMX VM on logical core %u\n", info->vcpu_id, info->pcpu_id);

    v3_start_time(info);

    while (1) {

        if (info->vm_info->run_state == VM_STOPPED) {
            info->core_run_state = CORE_STOPPED;
            break;
        }

        if (v3_vmx_enter(info) == -1) {

            addr_t host_addr;
            addr_t linear_addr = 0;

            info->vm_info->run_state = VM_ERROR;

            V3_Print("VMX core %u: VMX ERROR!!\n", info->vcpu_id);

            v3_print_guest_state(info);

            V3_Print("VMX core %u\n", info->vcpu_id);

            linear_addr = get_addr_linear(info, info->rip, &(info->segments.cs));

            if (info->mem_mode == PHYSICAL_MEM) {
                v3_gpa_to_hva(info, linear_addr, &host_addr);
            } else if (info->mem_mode == VIRTUAL_MEM) {
                v3_gva_to_hva(info, linear_addr, &host_addr);
            }

            V3_Print("VMX core %u: Host Address of rip = 0x%p\n", info->vcpu_id, (void *)host_addr);

            V3_Print("VMX core %u: Instr (15 bytes) at %p:\n", info->vcpu_id, (void *)host_addr);
            v3_dump_mem((uint8_t *)host_addr, 15);

            v3_print_stack(info);


            v3_print_vmcs();
            print_exit_log(info);
            return -1;
        }

        v3_wait_at_barrier(info);


        if (info->vm_info->run_state == VM_STOPPED) {
            info->core_run_state = CORE_STOPPED;
            break;
        }
/*
        if ((info->num_exits % 5000) == 0) {
            V3_Print("VMX Exit number %d\n", (uint32_t)info->num_exits);
        }
*/

    }

    return 0;
}




#define VMX_FEATURE_CONTROL_MSR     0x0000003a
#define CPUID_VMX_FEATURES 0x00000005  /* LOCK and VMXON */
#define CPUID_1_ECX_VTXFLAG 0x00000020

int v3_is_vmx_capable() {
    v3_msr_t feature_msr;
    uint32_t eax = 0, ebx = 0, ecx = 0, edx = 0;

    v3_cpuid(0x1, &eax, &ebx, &ecx, &edx);

    PrintDebug("ECX: 0x%x\n", ecx);

    if (ecx & CPUID_1_ECX_VTXFLAG) {
        v3_get_msr(VMX_FEATURE_CONTROL_MSR, &(feature_msr.hi), &(feature_msr.lo));

        PrintDebug("MSRREGlow: 0x%.8x\n", feature_msr.lo);

        if ((feature_msr.lo & CPUID_VMX_FEATURES) != CPUID_VMX_FEATURES) {
            PrintDebug("VMX is locked -- enable in the BIOS\n");
            return 0;
        }

    } else {
        PrintDebug("VMX not supported on this CPU\n");
        return 0;
    }

    return 1;
}
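
/* For reference (Intel SDM semantics, not Palacios-specific behavior): in
 * IA32_FEATURE_CONTROL (MSR 0x3a) bit 0 is the lock bit and bit 2 enables
 * VMXON outside SMX operation, so the 0x5 mask above requires both bits.
 */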


int v3_reset_vmx_vm_core(struct guest_info * core, addr_t rip) {
    // init vmcs bios

    if ((core->shdw_pg_mode == NESTED_PAGING) &&
        (v3_cpu_types[core->pcpu_id] == V3_VMX_EPT_UG_CPU)) {
        // easy
        // Treat rip as a SIPI vector: start in real mode at
        // CS base = vector << 12 with IP = 0
        core->rip = 0;
        core->segments.cs.selector = rip << 8;
        core->segments.cs.limit = 0xffff;
        core->segments.cs.base = rip << 12;
    } else {
        core->vm_regs.rdx = core->vcpu_id;
        core->vm_regs.rbx = rip;
    }

    return 0;
}



void v3_init_vmx_cpu(int cpu_id) {
    addr_t vmx_on_region = 0;

    if (cpu_id == 0) {
        if (v3_init_vmx_hw(&hw_info) == -1) {
            PrintError("Could not initialize VMX hardware features on cpu %d\n", cpu_id);
            return;
        }
    }

    enable_vmx();


    // Setup VMXON Region
    vmx_on_region = allocate_vmcs();


    if (vmx_on(vmx_on_region) == VMX_SUCCESS) {
        V3_Print("VMX Enabled\n");
        host_vmcs_ptrs[cpu_id] = vmx_on_region;
    } else {
        V3_Print("VMX already enabled\n");
        V3_FreePages((void *)vmx_on_region, 1);
    }

    PrintDebug("VMXON pointer: 0x%p\n", (void *)host_vmcs_ptrs[cpu_id]);

    {
        struct vmx_sec_proc_ctrls sec_proc_ctrls;
        sec_proc_ctrls.value = v3_vmx_get_ctrl_features(&(hw_info.sec_proc_ctrls));

        if (sec_proc_ctrls.enable_ept == 0) {
            V3_Print("VMX EPT (Nested) Paging not supported\n");
            v3_cpu_types[cpu_id] = V3_VMX_CPU;
        } else if (sec_proc_ctrls.unrstrct_guest == 0) {
            V3_Print("VMX EPT (Nested) Paging supported\n");
            v3_cpu_types[cpu_id] = V3_VMX_EPT_CPU;
        } else {
            V3_Print("VMX EPT (Nested) Paging + Unrestricted guest supported\n");
            v3_cpu_types[cpu_id] = V3_VMX_EPT_UG_CPU;
        }
    }
}


void v3_deinit_vmx_cpu(int cpu_id) {
    extern v3_cpu_arch_t v3_cpu_types[];
    v3_cpu_types[cpu_id] = V3_INVALID_CPU;

    if (host_vmcs_ptrs[cpu_id] != 0) {
        V3_Print("Disabling VMX\n");

        if (vmx_off() != VMX_SUCCESS) {
            PrintError("Error executing VMXOFF\n");
        }

        V3_FreePages((void *)host_vmcs_ptrs[cpu_id], 1);

        host_vmcs_ptrs[cpu_id] = 0;
    }
}