Palacios Public Git Repository

To check out Palacios, execute

  git clone http://v3vee.org/palacios/palacios.web/palacios.git

This will give you the master branch. You probably want the devel branch or one of the release branches instead. To switch to the devel branch, execute
  cd palacios
  git checkout --track -b devel origin/devel
The other branches can be checked out the same way, for example:
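  git checkout --track -b release-1.3 origin/release-1.3

(release-1.3 is a hypothetical branch name here; substitute whichever branch you want.)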


Below is palacios/src/palacios/vmx.c at commit bdfc8c04cb19fa8ac923a71cfd8d35679fec1bc4.
/*
 * This file is part of the Palacios Virtual Machine Monitor developed
 * by the V3VEE Project with funding from the United States National
 * Science Foundation and the Department of Energy.
 *
 * The V3VEE Project is a joint project between Northwestern University
 * and the University of New Mexico.  You can find out more at
 * http://www.v3vee.org
 *
 * Copyright (c) 2011, Jack Lange <jarusl@cs.northwestern.edu>
 * Copyright (c) 2011, The V3VEE Project <http://www.v3vee.org>
 * All rights reserved.
 *
 * Author: Jack Lange <jarusl@cs.northwestern.edu>
 *
 * This is free software.  You are permitted to use,
 * redistribute, and modify it as specified in the file "V3VEE_LICENSE".
 */


#include <palacios/vmx.h>
#include <palacios/vmm.h>
#include <palacios/vmx_handler.h>
#include <palacios/vmcs.h>
#include <palacios/vmx_lowlevel.h>
#include <palacios/vmm_lowlevel.h>
#include <palacios/vmm_ctrl_regs.h>
#include <palacios/vmm_config.h>
#include <palacios/vmm_time.h>
#include <palacios/vm_guest_mem.h>
#include <palacios/vmm_direct_paging.h>
#include <palacios/vmx_io.h>
#include <palacios/vmx_msr.h>
#include <palacios/vmm_decoder.h>
#include <palacios/vmm_barrier.h>

#ifdef V3_CONFIG_CHECKPOINT
#include <palacios/vmm_checkpoint.h>
#endif

#include <palacios/vmx_ept.h>
#include <palacios/vmx_assist.h>
#include <palacios/vmx_hw_info.h>

#ifndef V3_CONFIG_DEBUG_VMX
#undef PrintDebug
#define PrintDebug(fmt, args...)
#endif


/* These fields contain the hardware feature sets supported by the local CPU */
static struct vmx_hw_info hw_info;

extern v3_cpu_arch_t v3_cpu_types[];

static addr_t host_vmcs_ptrs[V3_CONFIG_MAX_CPUS] = { [0 ... V3_CONFIG_MAX_CPUS - 1] = 0};

extern int v3_vmx_launch(struct v3_gprs * vm_regs, struct guest_info * info, struct v3_ctrl_regs * ctrl_regs);
extern int v3_vmx_resume(struct v3_gprs * vm_regs, struct guest_info * info, struct v3_ctrl_regs * ctrl_regs);

static inline int check_vmcs_write(vmcs_field_t field, addr_t val) {
    int ret = 0;

    ret = vmcs_write(field, val);

    if (ret != VMX_SUCCESS) {
        PrintError("VMWRITE error on %s!: %d\n", v3_vmcs_field_to_str(field), ret);
        return 1;
    }

    return 0;
}

static inline int check_vmcs_read(vmcs_field_t field, void * val) {
    int ret = 0;

    ret = vmcs_read(field, val);

    if (ret != VMX_SUCCESS) {
        PrintError("VMREAD error on %s!: %d\n", v3_vmcs_field_to_str(field), ret);
    }

    return ret;
}



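/* Allocate a zeroed VMCS page, stamp it with the hardware revision ID,
 * and return its physical address. */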
static addr_t allocate_vmcs() {
    struct vmcs_data * vmcs_page = NULL;

    PrintDebug("Allocating page\n");

    vmcs_page = (struct vmcs_data *)V3_VAddr(V3_AllocPages(1));
    memset(vmcs_page, 0, 4096);

    vmcs_page->revision = hw_info.basic_info.revision;
    PrintDebug("VMX Revision: 0x%x\n", vmcs_page->revision);

    return (addr_t)V3_PAddr((void *)vmcs_page);
}



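/* Configure a freshly allocated VMCS so the core can boot a BIOS:
 * load the VMCS, apply the hardware default control values, set up
 * exit/entry controls and paging (shadow, EPT, or EPT + unrestricted
 * guest), and populate the MSR load/store areas. */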
static int init_vmcs_bios(struct guest_info * core, struct vmx_data * vmx_state) {
    int vmx_ret = 0;

    // disable global interrupts for vm state initialization
    v3_disable_ints();

    PrintDebug("Loading VMCS\n");
    vmx_ret = vmcs_load(vmx_state->vmcs_ptr_phys);
    vmx_state->state = VMX_UNLAUNCHED;

    if (vmx_ret != VMX_SUCCESS) {
        PrintError("VMPTRLD failed\n");
        return -1;
    }


    /*** Setup default state from HW ***/

    vmx_state->pin_ctrls.value = hw_info.pin_ctrls.def_val;
    vmx_state->pri_proc_ctrls.value = hw_info.proc_ctrls.def_val;
    vmx_state->exit_ctrls.value = hw_info.exit_ctrls.def_val;
    vmx_state->entry_ctrls.value = hw_info.entry_ctrls.def_val;
    vmx_state->sec_proc_ctrls.value = hw_info.sec_proc_ctrls.def_val;

    /* Print Control MSRs */
    PrintDebug("CR0 MSR: %p\n", (void *)(addr_t)hw_info.cr0.value);
    PrintDebug("CR4 MSR: %p\n", (void *)(addr_t)hw_info.cr4.value);



    /******* Setup Host State **********/

    /* Cache GDTR, IDTR, and TR in host struct */


    /********** Setup VMX Control Fields ***********/

    /* Add external interrupts, NMI exiting, and virtual NMI */
    vmx_state->pin_ctrls.nmi_exit = 1;
    vmx_state->pin_ctrls.ext_int_exit = 1;


    vmx_state->pri_proc_ctrls.hlt_exit = 1;


    vmx_state->pri_proc_ctrls.pause_exit = 0;
    vmx_state->pri_proc_ctrls.tsc_offset = 1;
#ifdef V3_CONFIG_TIME_VIRTUALIZE_TSC
    vmx_state->pri_proc_ctrls.rdtsc_exit = 1;
#endif

    /* Setup IO map */
    vmx_state->pri_proc_ctrls.use_io_bitmap = 1;
    vmx_ret |= check_vmcs_write(VMCS_IO_BITMAP_A_ADDR, (addr_t)V3_PAddr(core->vm_info->io_map.arch_data));
    vmx_ret |= check_vmcs_write(VMCS_IO_BITMAP_B_ADDR,
            (addr_t)V3_PAddr(core->vm_info->io_map.arch_data) + PAGE_SIZE_4KB);


    vmx_state->pri_proc_ctrls.use_msr_bitmap = 1;
    vmx_ret |= check_vmcs_write(VMCS_MSR_BITMAP, (addr_t)V3_PAddr(core->vm_info->msr_map.arch_data));



#ifdef __V3_64BIT__
    // Ensure host runs in 64-bit mode at each VM EXIT
    vmx_state->exit_ctrls.host_64_on = 1;
#endif

    // Hook all accesses to EFER register
    v3_hook_msr(core->vm_info, EFER_MSR,
                &v3_handle_efer_read,
                &v3_handle_efer_write,
                core);

    // Restore host's EFER register on each VM EXIT
    vmx_state->exit_ctrls.ld_efer = 1;

    // Save/restore guest's EFER register to/from VMCS on VM EXIT/ENTRY
    vmx_state->exit_ctrls.save_efer = 1;
    vmx_state->entry_ctrls.ld_efer  = 1;

    // Cause VM_EXIT whenever CR4.VMXE or CR4.PAE bits are written
    vmx_ret |= check_vmcs_write(VMCS_CR4_MASK, CR4_VMXE | CR4_PAE);


    /* Setup paging */
    if (core->shdw_pg_mode == SHADOW_PAGING) {
        PrintDebug("Creating initial shadow page table\n");

        if (v3_init_passthrough_pts(core) == -1) {
            PrintError("Could not initialize passthrough page tables\n");
            return -1;
        }

#define CR0_PE 0x00000001
#define CR0_PG 0x80000000
#define CR0_WP 0x00010000 // To ensure mem hooks work
        vmx_ret |= check_vmcs_write(VMCS_CR0_MASK, (CR0_PE | CR0_PG | CR0_WP));

        core->ctrl_regs.cr3 = core->direct_map_pt;

        // vmx_state->pinbased_ctrls |= NMI_EXIT;

        /* Add CR exits */
        vmx_state->pri_proc_ctrls.cr3_ld_exit = 1;
        vmx_state->pri_proc_ctrls.cr3_str_exit = 1;

        vmx_state->pri_proc_ctrls.invlpg_exit = 1;

        /* Add page fault exits */
        vmx_state->excp_bmap.pf = 1;

        // Setup VMX Assist
        v3_vmxassist_init(core, vmx_state);

    } else if ((core->shdw_pg_mode == NESTED_PAGING) &&
               (v3_cpu_types[core->pcpu_id] == V3_VMX_EPT_CPU)) {

        vmx_ret |= check_vmcs_write(VMCS_CR0_MASK, (CR0_PE | CR0_PG | CR0_WP));

        // vmx_state->pinbased_ctrls |= NMI_EXIT;

        /* Disable CR exits */
        vmx_state->pri_proc_ctrls.cr3_ld_exit = 0;
        vmx_state->pri_proc_ctrls.cr3_str_exit = 0;

        vmx_state->pri_proc_ctrls.invlpg_exit = 0;

        /* Add page fault exits */
        //      vmx_state->excp_bmap.pf = 1; // This should never happen..., enabled to catch bugs

        // Setup VMX Assist
        v3_vmxassist_init(core, vmx_state);

        /* Enable EPT */
        vmx_state->pri_proc_ctrls.sec_ctrls = 1; // Enable secondary proc controls
        vmx_state->sec_proc_ctrls.enable_ept = 1; // enable EPT paging



        if (v3_init_ept(core, &hw_info) == -1) {
            PrintError("Error initializing EPT\n");
            return -1;
        }

    } else if ((core->shdw_pg_mode == NESTED_PAGING) &&
               (v3_cpu_types[core->pcpu_id] == V3_VMX_EPT_UG_CPU)) {
        int i = 0;
        // For now we will assume that unrestricted guest mode is assured w/ EPT


        core->vm_regs.rsp = 0x00;
        core->rip = 0xfff0;
        core->vm_regs.rdx = 0x00000f00;
        core->ctrl_regs.rflags = 0x00000002; // The reserved bit is always 1
        core->ctrl_regs.cr0 = 0x60010010; // Set the WP flag so the memory hooks work in real-mode


        core->segments.cs.selector = 0xf000;
        core->segments.cs.limit = 0xffff;
        core->segments.cs.base = 0x0000000f0000LL;

        // (raw attributes = 0xf3)
        core->segments.cs.type = 0xb;
        core->segments.cs.system = 0x1;
        core->segments.cs.dpl = 0x0;
        core->segments.cs.present = 1;



        struct v3_segment * segregs [] = {&(core->segments.ss), &(core->segments.ds),
                                          &(core->segments.es), &(core->segments.fs),
                                          &(core->segments.gs), NULL};

        for (i = 0; segregs[i] != NULL; i++) {
            struct v3_segment * seg = segregs[i];

            seg->selector = 0x0000;
            //    seg->base = seg->selector << 4;
            seg->base = 0x00000000;
            seg->limit = 0xffff;


            seg->type = 0x3;
            seg->system = 0x1;
            seg->dpl = 0x0;
            seg->present = 1;
            //    seg->granularity = 1;

        }


        core->segments.gdtr.limit = 0x0000ffff;
        core->segments.gdtr.base = 0x0000000000000000LL;

        core->segments.idtr.limit = 0x0000ffff;
        core->segments.idtr.base = 0x0000000000000000LL;

        core->segments.ldtr.selector = 0x0000;
        core->segments.ldtr.limit = 0x0000ffff;
        core->segments.ldtr.base = 0x0000000000000000LL;
        core->segments.ldtr.type = 2;
        core->segments.ldtr.present = 1;

        core->segments.tr.selector = 0x0000;
        core->segments.tr.limit = 0x0000ffff;
        core->segments.tr.base = 0x0000000000000000LL;
        core->segments.tr.type = 0xb;
        core->segments.tr.present = 1;

        //      core->dbg_regs.dr6 = 0x00000000ffff0ff0LL;
        core->dbg_regs.dr7 = 0x0000000000000400LL;

        /* Enable EPT */
        vmx_state->pri_proc_ctrls.sec_ctrls = 1; // Enable secondary proc controls
        vmx_state->sec_proc_ctrls.enable_ept = 1; // enable EPT paging
        vmx_state->sec_proc_ctrls.unrstrct_guest = 1; // enable unrestricted guest operation


        /* Disable shadow paging stuff */
        vmx_state->pri_proc_ctrls.cr3_ld_exit = 0;
        vmx_state->pri_proc_ctrls.cr3_str_exit = 0;

        vmx_state->pri_proc_ctrls.invlpg_exit = 0;


        if (v3_init_ept(core, &hw_info) == -1) {
            PrintError("Error initializing EPT\n");
            return -1;
        }

    } else {
        PrintError("Invalid virtual paging mode\n");
        return -1;
    }


    // hook vmx msrs

    // Setup SYSCALL/SYSENTER MSRs in load/store area

    // save STAR, LSTAR, FMASK, KERNEL_GS_BASE MSRs in MSR load/store area
    {
#define IA32_STAR 0xc0000081
#define IA32_LSTAR 0xc0000082
#define IA32_FMASK 0xc0000084
#define IA32_KERN_GS_BASE 0xc0000102

#define IA32_CSTAR 0xc0000083 // Compatibility mode STAR (ignored for now... hopefully it's not that important...)

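        /* Layout of the single msr_area page: entries [0-3] are the
         * exit-time store area, [4-7] the exit-time load area, and
         * [8-11] the entry-time load area. */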
        int msr_ret = 0;

        struct vmcs_msr_entry * exit_store_msrs = NULL;
        struct vmcs_msr_entry * exit_load_msrs = NULL;
        struct vmcs_msr_entry * entry_load_msrs = NULL;
        int max_msrs = (hw_info.misc_info.max_msr_cache_size + 1) * 4;

        V3_Print("Setting up MSR load/store areas (max_msr_count=%d)\n", max_msrs);

        if (max_msrs < 4) {
            PrintError("Max MSR cache size is too small (%d)\n", max_msrs);
            return -1;
        }

        vmx_state->msr_area = V3_VAddr(V3_AllocPages(1));

        if (vmx_state->msr_area == NULL) {
            PrintError("could not allocate msr load/store area\n");
            return -1;
        }

        msr_ret |= check_vmcs_write(VMCS_EXIT_MSR_STORE_CNT, 4);
        msr_ret |= check_vmcs_write(VMCS_EXIT_MSR_LOAD_CNT, 4);
        msr_ret |= check_vmcs_write(VMCS_ENTRY_MSR_LOAD_CNT, 4);


        exit_store_msrs = (struct vmcs_msr_entry *)(vmx_state->msr_area);
        exit_load_msrs = (struct vmcs_msr_entry *)(vmx_state->msr_area + (sizeof(struct vmcs_msr_entry) * 4));
        entry_load_msrs = (struct vmcs_msr_entry *)(vmx_state->msr_area + (sizeof(struct vmcs_msr_entry) * 8));


        exit_store_msrs[0].index = IA32_STAR;
        exit_store_msrs[1].index = IA32_LSTAR;
        exit_store_msrs[2].index = IA32_FMASK;
        exit_store_msrs[3].index = IA32_KERN_GS_BASE;

        // Replicate the MSR index list into the exit-load and entry-load areas
        memcpy(exit_load_msrs, exit_store_msrs, sizeof(struct vmcs_msr_entry) * 4);
        memcpy(entry_load_msrs, exit_store_msrs, sizeof(struct vmcs_msr_entry) * 4);


        v3_get_msr(IA32_STAR, &(exit_load_msrs[0].hi), &(exit_load_msrs[0].lo));
        v3_get_msr(IA32_LSTAR, &(exit_load_msrs[1].hi), &(exit_load_msrs[1].lo));
        v3_get_msr(IA32_FMASK, &(exit_load_msrs[2].hi), &(exit_load_msrs[2].lo));
        v3_get_msr(IA32_KERN_GS_BASE, &(exit_load_msrs[3].hi), &(exit_load_msrs[3].lo));

        msr_ret |= check_vmcs_write(VMCS_EXIT_MSR_STORE_ADDR, (addr_t)V3_PAddr(exit_store_msrs));
        msr_ret |= check_vmcs_write(VMCS_EXIT_MSR_LOAD_ADDR, (addr_t)V3_PAddr(exit_load_msrs));
        msr_ret |= check_vmcs_write(VMCS_ENTRY_MSR_LOAD_ADDR, (addr_t)V3_PAddr(entry_load_msrs));

    }

    /* Sanity check ctrl/reg fields against hw_defaults */




    /*** Write all the info to the VMCS ***/

    /*
    {
        // IS THIS NECESSARY???
#define DEBUGCTL_MSR 0x1d9
        struct v3_msr tmp_msr;
        v3_get_msr(DEBUGCTL_MSR, &(tmp_msr.hi), &(tmp_msr.lo));
        vmx_ret |= check_vmcs_write(VMCS_GUEST_DBG_CTL, tmp_msr.value);
        core->dbg_regs.dr7 = 0x400;
    }
    */

#ifdef __V3_64BIT__
    vmx_ret |= check_vmcs_write(VMCS_LINK_PTR, (addr_t)0xffffffffffffffffULL);
#else
    vmx_ret |= check_vmcs_write(VMCS_LINK_PTR, (addr_t)0xffffffffUL);
    vmx_ret |= check_vmcs_write(VMCS_LINK_PTR_HIGH, (addr_t)0xffffffffUL);
#endif




    if (v3_update_vmcs_ctrl_fields(core)) {
        PrintError("Could not write control fields!\n");
        return -1;
    }

    if (v3_update_vmcs_host_state(core)) {
        PrintError("Could not write host state\n");
        return -1;
    }

    // Reenable global interrupts now that the VM state is initialized.
    // If another VM kicks us off this CPU, it will update our vmx state
    // so that we know to reload ourselves
    v3_enable_ints();

    return 0;
}

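/* Allocate a VMCS for this core and initialize it to BIOS boot state.
 * The vm_class argument selects the initialization profile; only
 * V3_PC_VM is currently supported. */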
int v3_init_vmx_vmcs(struct guest_info * core, v3_vm_class_t vm_class) {
    struct vmx_data * vmx_state = NULL;
    int vmx_ret = 0;

    vmx_state = (struct vmx_data *)V3_Malloc(sizeof(struct vmx_data));
    memset(vmx_state, 0, sizeof(struct vmx_data));

    PrintDebug("vmx_data pointer: %p\n", (void *)vmx_state);

    PrintDebug("Allocating VMCS\n");
    vmx_state->vmcs_ptr_phys = allocate_vmcs();

    PrintDebug("VMCS pointer: %p\n", (void *)(vmx_state->vmcs_ptr_phys));

    core->vmm_data = vmx_state;
    vmx_state->state = VMX_UNLAUNCHED;

    PrintDebug("Initializing VMCS (addr=%p)\n", core->vmm_data);

    // TODO: Fix vmcs fields so they're 32-bit

    PrintDebug("Clearing VMCS: %p\n", (void *)vmx_state->vmcs_ptr_phys);
    vmx_ret = vmcs_clear(vmx_state->vmcs_ptr_phys);

    if (vmx_ret != VMX_SUCCESS) {
        PrintError("VMCLEAR failed\n");
        return -1;
    }

    if (vm_class == V3_PC_VM) {
        PrintDebug("Initializing VMCS\n");
        if (init_vmcs_bios(core, vmx_state) == -1) {
            PrintError("Error initializing VMCS to BIOS state\n");
            return -1;
        }
    } else {
        PrintError("Invalid VM Class\n");
        return -1;
    }

    PrintDebug("Serializing VMCS: %p\n", (void *)vmx_state->vmcs_ptr_phys);
    vmx_ret = vmcs_clear(vmx_state->vmcs_ptr_phys);

    return 0;
}

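/* Free the VMCS page, the MSR load/store area, and the vmx_data struct
 * for a core that is being torn down. */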
int v3_deinit_vmx_vmcs(struct guest_info * core) {
    struct vmx_data * vmx_state = core->vmm_data;

    V3_FreePages((void *)(vmx_state->vmcs_ptr_phys), 1);
    V3_FreePages(V3_PAddr(vmx_state->msr_area), 1);

    V3_Free(vmx_state);

    return 0;
}



#ifdef V3_CONFIG_CHECKPOINT
/*
 * JRL: This is broken
 */
int v3_vmx_save_core(struct guest_info * core, void * ctx){
    uint64_t vmcs_ptr = vmcs_store();

    v3_chkpt_save(ctx, "vmcs_data", PAGE_SIZE, (void *)vmcs_ptr);

    return 0;
}

int v3_vmx_load_core(struct guest_info * core, void * ctx){
    struct vmx_data * vmx_info = (struct vmx_data *)(core->vmm_data);
    struct cr0_32 * shadow_cr0;
    char vmcs[PAGE_SIZE_4KB];

    v3_chkpt_load(ctx, "vmcs_data", PAGE_SIZE_4KB, vmcs);

    vmcs_clear(vmx_info->vmcs_ptr_phys);
    vmcs_load((addr_t)vmcs);

    v3_vmx_save_vmcs(core);

    shadow_cr0 = (struct cr0_32 *)&(core->ctrl_regs.cr0);


    /* Get the CPU mode to set the guest_ia32e entry ctrl */

    if (core->shdw_pg_mode == SHADOW_PAGING) {
        if (v3_get_vm_mem_mode(core) == VIRTUAL_MEM) {
            if (v3_activate_shadow_pt(core) == -1) {
                PrintError("Failed to activate shadow page tables\n");
                return -1;
            }
        } else {
            if (v3_activate_passthrough_pt(core) == -1) {
                PrintError("Failed to activate passthrough page tables\n");
                return -1;
            }
        }
    }

    return 0;
}
#endif


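/* Force the core's VMCS to be reloaded (and re-launched rather than
 * resumed) on its next VM entry. */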
void v3_flush_vmx_vm_core(struct guest_info * core) {
    struct vmx_data * vmx_info = (struct vmx_data *)(core->vmm_data);
    vmcs_clear(vmx_info->vmcs_ptr_phys);
    vmx_info->state = VMX_UNLAUNCHED;
}


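/* After a VM exit, check whether an interrupt we injected on entry was
 * actually delivered; if so, notify the interrupt subsystem. */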
static int update_irq_exit_state(struct guest_info * info) {
    struct vmx_exit_idt_vec_info idt_vec_info;

    check_vmcs_read(VMCS_IDT_VECTOR_INFO, &(idt_vec_info.value));

    if ((info->intr_core_state.irq_started == 1) && (idt_vec_info.valid == 0)) {
#ifdef V3_CONFIG_DEBUG_INTERRUPTS
        V3_Print("Calling v3_injecting_intr\n");
#endif
        info->intr_core_state.irq_started = 0;
        v3_injecting_intr(info, info->intr_core_state.irq_vector, V3_EXTERNAL_IRQ);
    }

    return 0;
}

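/* Before a VM entry, inject any pending exception or interrupt into the
 * guest via the VMCS entry-interruption fields, re-injecting an interrupt
 * whose delivery was preempted by the exit. If the guest cannot take an
 * interrupt yet, arm interrupt-window exiting instead. */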
static int update_irq_entry_state(struct guest_info * info) {
    struct vmx_exit_idt_vec_info idt_vec_info;
    struct vmcs_interrupt_state intr_core_state;
    struct vmx_data * vmx_info = (struct vmx_data *)(info->vmm_data);

    check_vmcs_read(VMCS_IDT_VECTOR_INFO, &(idt_vec_info.value));
    check_vmcs_read(VMCS_GUEST_INT_STATE, &(intr_core_state));

    /* Check for pending exceptions to inject */
    if (v3_excp_pending(info)) {
        struct vmx_entry_int_info int_info;
        int_info.value = 0;

        // In VMX, almost every exception is hardware
        // Software exceptions are pretty much only for breakpoint or overflow
        int_info.type = 3;
        int_info.vector = v3_get_excp_number(info);

        if (info->excp_state.excp_error_code_valid) {
            check_vmcs_write(VMCS_ENTRY_EXCP_ERR, info->excp_state.excp_error_code);
            int_info.error_code = 1;

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
            V3_Print("Injecting exception %d with error code %x\n",
                    int_info.vector, info->excp_state.excp_error_code);
#endif
        }

        int_info.valid = 1;
#ifdef V3_CONFIG_DEBUG_INTERRUPTS
        V3_Print("Injecting exception %d (EIP=%p)\n", int_info.vector, (void *)(addr_t)info->rip);
#endif
        check_vmcs_write(VMCS_ENTRY_INT_INFO, int_info.value);

        v3_injecting_excp(info, int_info.vector);

    } else if ((((struct rflags *)&(info->ctrl_regs.rflags))->intr == 1) &&
               (intr_core_state.val == 0)) {

        if ((info->intr_core_state.irq_started == 1) && (idt_vec_info.valid == 1)) {

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
            V3_Print("IRQ pending from previous injection\n");
#endif

            // Copy the IDT vectoring info over to reinject the old interrupt
            if (idt_vec_info.error_code == 1) {
                uint32_t err_code = 0;

                check_vmcs_read(VMCS_IDT_VECTOR_ERR, &err_code);
                check_vmcs_write(VMCS_ENTRY_EXCP_ERR, err_code);
            }

            idt_vec_info.undef = 0;
            check_vmcs_write(VMCS_ENTRY_INT_INFO, idt_vec_info.value);

        } else {
            struct vmx_entry_int_info ent_int;
            ent_int.value = 0;

            switch (v3_intr_pending(info)) {
                case V3_EXTERNAL_IRQ: {
                    info->intr_core_state.irq_vector = v3_get_intr(info);
                    ent_int.vector = info->intr_core_state.irq_vector;
                    ent_int.type = 0;
                    ent_int.error_code = 0;
                    ent_int.valid = 1;

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
                    V3_Print("Injecting Interrupt %d at exit %u(EIP=%p)\n",
                               info->intr_core_state.irq_vector,
                               (uint32_t)info->num_exits,
                               (void *)(addr_t)info->rip);
#endif

                    check_vmcs_write(VMCS_ENTRY_INT_INFO, ent_int.value);
                    info->intr_core_state.irq_started = 1;

                    break;
                }
                case V3_NMI:
                    PrintDebug("Injecting NMI\n");

                    ent_int.type = 2;
                    ent_int.vector = 2;
                    ent_int.valid = 1;
                    check_vmcs_write(VMCS_ENTRY_INT_INFO, ent_int.value);

                    break;
                case V3_SOFTWARE_INTR:
                    PrintDebug("Injecting software interrupt\n");
                    ent_int.type = 4;

                    ent_int.valid = 1;
                    check_vmcs_write(VMCS_ENTRY_INT_INFO, ent_int.value);

                    break;
                case V3_VIRTUAL_IRQ:
                    // Not sure what to do here, Intel doesn't have virtual IRQs
                    // May be the same as external interrupts/IRQs

                    break;
                case V3_INVALID_INTR:
                default:
                    break;
            }
        }
    } else if ((v3_intr_pending(info)) && (vmx_info->pri_proc_ctrls.int_wndw_exit == 0)) {
        // Enable INTR window exiting so we know when IF=1
        uint32_t instr_len;

        check_vmcs_read(VMCS_EXIT_INSTR_LEN, &instr_len);

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
        V3_Print("Enabling Interrupt-Window exiting: %d\n", instr_len);
#endif

        vmx_info->pri_proc_ctrls.int_wndw_exit = 1;
        check_vmcs_write(VMCS_PROC_CTRLS, vmx_info->pri_proc_ctrls.value);
    }


    return 0;
}


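/* Ring buffer of the last 10 exits, dumped when a fatal error occurs */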
static struct vmx_exit_info exit_log[10];

static void print_exit_log(struct guest_info * info) {
    int cnt = info->num_exits % 10;
    int i = 0;


    V3_Print("\nExit Log (%d total exits):\n", (uint32_t)info->num_exits);

    for (i = 0; i < 10; i++) {
        struct vmx_exit_info * tmp = &exit_log[cnt];

        V3_Print("%d:\texit_reason = %p\n", i, (void *)(addr_t)tmp->exit_reason);
        V3_Print("\texit_qual = %p\n", (void *)tmp->exit_qual);
        V3_Print("\tint_info = %p\n", (void *)(addr_t)tmp->int_info);
        V3_Print("\tint_err = %p\n", (void *)(addr_t)tmp->int_err);
        V3_Print("\tinstr_info = %p\n", (void *)(addr_t)tmp->instr_info);

        cnt--;

        if (cnt == -1) {
            cnt = 9;
        }

    }

}

/*
 * CAUTION and DANGER!!!
 *
 * The VMCS CANNOT(!!) be accessed outside of the cli/sti calls inside this function
 * When executing a symbiotic call, the VMCS WILL be overwritten, so any dependencies
 * on its contents will cause things to break. The contents at the time of the exit WILL
 * change before the exit handler is executed.
 */
int v3_vmx_enter(struct guest_info * info) {
    int ret = 0;
    //uint32_t tsc_offset_low, tsc_offset_high;
    struct vmx_exit_info exit_info;
    struct vmx_data * vmx_info = (struct vmx_data *)(info->vmm_data);

    // Conditionally yield the CPU if the timeslice has expired
    v3_yield_cond(info);

    // Perform any additional yielding needed for time adjustment
    v3_adjust_time(info);

    // disable global interrupts for vm state transition
    v3_disable_ints();

    // Update timer devices late, after coming out of the VM, so that as
    // much of the time in the VM as possible is accounted for. Also do it
    // before updating IRQ entry state so that any interrupts the timers
    // raise get handled on the next VM entry. Must be done with interrupts
    // disabled.
    v3_update_timers(info);

    if (vmcs_store() != vmx_info->vmcs_ptr_phys) {
        vmcs_clear(vmx_info->vmcs_ptr_phys);
        vmcs_load(vmx_info->vmcs_ptr_phys);
        vmx_info->state = VMX_UNLAUNCHED;
    }

    v3_vmx_restore_vmcs(info);


#ifdef V3_CONFIG_SYMCALL
    if (info->sym_core_state.symcall_state.sym_call_active == 0) {
        update_irq_entry_state(info);
    }
#else
    update_irq_entry_state(info);
#endif

    {
        addr_t guest_cr3;
        vmcs_read(VMCS_GUEST_CR3, &guest_cr3);
        vmcs_write(VMCS_GUEST_CR3, guest_cr3);
    }

    // Perform last-minute time bookkeeping prior to entering the VM
    v3_time_enter_vm(info);

    // tsc_offset_high = (uint32_t)((v3_tsc_host_offset(&info->time_state) >> 32) & 0xffffffff);
    // tsc_offset_low = (uint32_t)(v3_tsc_host_offset(&info->time_state) & 0xffffffff);
    // check_vmcs_write(VMCS_TSC_OFFSET_HIGH, tsc_offset_high);
    // check_vmcs_write(VMCS_TSC_OFFSET, tsc_offset_low);

    if (v3_update_vmcs_host_state(info)) {
        v3_enable_ints();
        PrintError("Could not write host state\n");
        return -1;
    }


    if (vmx_info->state == VMX_UNLAUNCHED) {
        vmx_info->state = VMX_LAUNCHED;

        info->vm_info->run_state = VM_RUNNING;
        ret = v3_vmx_launch(&(info->vm_regs), info, &(info->ctrl_regs));
    } else {
        V3_ASSERT(vmx_info->state != VMX_UNLAUNCHED);
        ret = v3_vmx_resume(&(info->vm_regs), info, &(info->ctrl_regs));
    }



    //  PrintDebug("VMX Exit: ret=%d\n", ret);

    if (ret != VMX_SUCCESS) {
        uint32_t error = 0;
        vmcs_read(VMCS_INSTR_ERR, &error);

        v3_enable_ints();

        PrintError("VMENTRY Error: %d (launch_ret = %d)\n", error, ret);
        return -1;
    }



    // Immediate exit from VM time bookkeeping
    v3_time_exit_vm(info);

    info->num_exits++;

    /* Update guest state */
    v3_vmx_save_vmcs(info);

    // info->cpl = info->segments.cs.selector & 0x3;

    info->mem_mode = v3_get_vm_mem_mode(info);
    info->cpu_mode = v3_get_vm_cpu_mode(info);


    check_vmcs_read(VMCS_EXIT_INSTR_LEN, &(exit_info.instr_len));
    check_vmcs_read(VMCS_EXIT_INSTR_INFO, &(exit_info.instr_info));
    check_vmcs_read(VMCS_EXIT_REASON, &(exit_info.exit_reason));
    check_vmcs_read(VMCS_EXIT_QUAL, &(exit_info.exit_qual));
    check_vmcs_read(VMCS_EXIT_INT_INFO, &(exit_info.int_info));
    check_vmcs_read(VMCS_EXIT_INT_ERR, &(exit_info.int_err));
    check_vmcs_read(VMCS_GUEST_LINEAR_ADDR, &(exit_info.guest_linear_addr));

    if (info->shdw_pg_mode == NESTED_PAGING) {
        check_vmcs_read(VMCS_GUEST_PHYS_ADDR, &(exit_info.ept_fault_addr));
    }

    //PrintDebug("VMX Exit taken, id-qual: %u-%lu\n", exit_info.exit_reason, exit_info.exit_qual);

    exit_log[info->num_exits % 10] = exit_info;

#ifdef V3_CONFIG_SYMCALL
    if (info->sym_core_state.symcall_state.sym_call_active == 0) {
        update_irq_exit_state(info);
    }
#else
    update_irq_exit_state(info);
#endif

    if (exit_info.exit_reason == VMEXIT_INTR_WINDOW) {
        // This is a special case whose only job is to inject an interrupt
        vmcs_read(VMCS_PROC_CTRLS, &(vmx_info->pri_proc_ctrls.value));
        vmx_info->pri_proc_ctrls.int_wndw_exit = 0;
        vmcs_write(VMCS_PROC_CTRLS, vmx_info->pri_proc_ctrls.value);

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
        V3_Print("Interrupts available again! (RIP=%llx)\n", info->rip);
#endif
    }

    // reenable global interrupts after vm exit
    v3_enable_ints();

    // Conditionally yield the CPU if the timeslice has expired
    v3_yield_cond(info);

    if (v3_handle_vmx_exit(info, &exit_info) == -1) {
        PrintError("Error in VMX exit handler (Exit reason=%x)\n", exit_info.exit_reason);
        return -1;
    }

    return 0;
}


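/* Core execution loop: wait for core 0 to start the VM, then repeatedly
 * enter the guest until the VM is stopped or an entry fails. */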
int v3_start_vmx_guest(struct guest_info * info) {

    PrintDebug("Starting VMX core %u\n", info->vcpu_id);

    if (info->vcpu_id == 0) {
        info->core_run_state = CORE_RUNNING;
        info->vm_info->run_state = VM_RUNNING;
    } else {

        PrintDebug("VMX core %u: Waiting for core initialization\n", info->vcpu_id);

        while (info->core_run_state == CORE_STOPPED) {
            v3_yield(info);
            //PrintDebug("VMX core %u: still waiting for INIT\n", info->vcpu_id);
        }

        PrintDebug("VMX core %u initialized\n", info->vcpu_id);

        // We'll be paranoid about race conditions here
        v3_wait_at_barrier(info);
    }


    PrintDebug("VMX core %u: I am starting at CS=0x%x (base=0x%p, limit=0x%x), RIP=0x%p\n",
               info->vcpu_id, info->segments.cs.selector, (void *)(info->segments.cs.base),
               info->segments.cs.limit, (void *)(info->rip));


    PrintDebug("VMX core %u: Launching VMX VM on logical core %u\n", info->vcpu_id, info->pcpu_id);

    v3_start_time(info);

    while (1) {

        if (info->vm_info->run_state == VM_STOPPED) {
            info->core_run_state = CORE_STOPPED;
            break;
        }

        if (v3_vmx_enter(info) == -1) {

            addr_t host_addr;
            addr_t linear_addr = 0;

            info->vm_info->run_state = VM_ERROR;

            V3_Print("VMX core %u: VMX ERROR!!\n", info->vcpu_id);

            v3_print_guest_state(info);

            V3_Print("VMX core %u\n", info->vcpu_id);

            linear_addr = get_addr_linear(info, info->rip, &(info->segments.cs));

            if (info->mem_mode == PHYSICAL_MEM) {
                v3_gpa_to_hva(info, linear_addr, &host_addr);
            } else if (info->mem_mode == VIRTUAL_MEM) {
                v3_gva_to_hva(info, linear_addr, &host_addr);
            }

            V3_Print("VMX core %u: Host Address of rip = 0x%p\n", info->vcpu_id, (void *)host_addr);

            V3_Print("VMX core %u: Instr (15 bytes) at %p:\n", info->vcpu_id, (void *)host_addr);
            v3_dump_mem((uint8_t *)host_addr, 15);

            v3_print_stack(info);


            v3_print_vmcs();
            print_exit_log(info);
            return -1;
        }

        v3_wait_at_barrier(info);


        if (info->vm_info->run_state == VM_STOPPED) {
            info->core_run_state = CORE_STOPPED;
            break;
        }
/*
        if ((info->num_exits % 5000) == 0) {
            V3_Print("VMX Exit number %d\n", (uint32_t)info->num_exits);
        }
*/

    }

    return 0;
}




#define VMX_FEATURE_CONTROL_MSR     0x0000003a
#define CPUID_VMX_FEATURES 0x00000005  /* LOCK and VMXON */
#define CPUID_1_ECX_VTXFLAG 0x00000020

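/* Check CPUID.1:ECX bit 5 for VMX support, then verify that the
 * IA32_FEATURE_CONTROL MSR has the lock and VMXON-enable bits set
 * (i.e., that VMX has not been disabled by the BIOS). */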
int v3_is_vmx_capable() {
    v3_msr_t feature_msr;
    uint32_t eax = 0, ebx = 0, ecx = 0, edx = 0;

    v3_cpuid(0x1, &eax, &ebx, &ecx, &edx);

    PrintDebug("ECX: 0x%x\n", ecx);

    if (ecx & CPUID_1_ECX_VTXFLAG) {
        v3_get_msr(VMX_FEATURE_CONTROL_MSR, &(feature_msr.hi), &(feature_msr.lo));

        PrintDebug("MSRREGlow: 0x%.8x\n", feature_msr.lo);

        if ((feature_msr.lo & CPUID_VMX_FEATURES) != CPUID_VMX_FEATURES) {
            PrintDebug("VMX is locked -- enable in the BIOS\n");
            return 0;
        }

    } else {
        PrintDebug("VMX not supported on this cpu\n");
        return 0;
    }

    return 1;
}


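/* Reset a core for restart. On EPT + unrestricted-guest CPUs the rip
 * argument appears to be treated as a SIPI-style start vector
 * (CS selector = rip << 8, CS base = rip << 12); otherwise rip is
 * passed to the guest in RBX and the vcpu id in RDX. */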
int v3_reset_vmx_vm_core(struct guest_info * core, addr_t rip) {
    // init vmcs bios

    if ((core->shdw_pg_mode == NESTED_PAGING) &&
        (v3_cpu_types[core->pcpu_id] == V3_VMX_EPT_UG_CPU)) {
        // easy
        core->rip = 0;
        core->segments.cs.selector = rip << 8;
        core->segments.cs.limit = 0xffff;
        core->segments.cs.base = rip << 12;
    } else {
        core->vm_regs.rdx = core->vcpu_id;
        core->vm_regs.rbx = rip;
    }

    return 0;
}



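/* Per-CPU VMX bring-up: initialize hardware feature info (once, on cpu 0),
 * enable VMX operation with VMXON, and record what paging support
 * (EPT, unrestricted guest) this CPU provides. */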
void v3_init_vmx_cpu(int cpu_id) {
    addr_t vmx_on_region = 0;

    if (cpu_id == 0) {
        if (v3_init_vmx_hw(&hw_info) == -1) {
            PrintError("Could not initialize VMX hardware features on cpu %d\n", cpu_id);
            return;
        }
    }

    enable_vmx();


    // Setup VMXON Region
    vmx_on_region = allocate_vmcs();


    if (vmx_on(vmx_on_region) == VMX_SUCCESS) {
        V3_Print("VMX Enabled\n");
        host_vmcs_ptrs[cpu_id] = vmx_on_region;
    } else {
        V3_Print("VMX already enabled\n");
        V3_FreePages((void *)vmx_on_region, 1);
    }

    PrintDebug("VMXON pointer: 0x%p\n", (void *)host_vmcs_ptrs[cpu_id]);

    {
        struct vmx_sec_proc_ctrls sec_proc_ctrls;
        sec_proc_ctrls.value = v3_vmx_get_ctrl_features(&(hw_info.sec_proc_ctrls));

        if (sec_proc_ctrls.enable_ept == 0) {
            V3_Print("VMX EPT (Nested) Paging not supported\n");
            v3_cpu_types[cpu_id] = V3_VMX_CPU;
        } else if (sec_proc_ctrls.unrstrct_guest == 0) {
            V3_Print("VMX EPT (Nested) Paging supported\n");
            v3_cpu_types[cpu_id] = V3_VMX_EPT_CPU;
        } else {
            V3_Print("VMX EPT (Nested) Paging + Unrestricted guest supported\n");
            v3_cpu_types[cpu_id] = V3_VMX_EPT_UG_CPU;
        }
    }
}


void v3_deinit_vmx_cpu(int cpu_id) {
    extern v3_cpu_arch_t v3_cpu_types[];
    v3_cpu_types[cpu_id] = V3_INVALID_CPU;

    if (host_vmcs_ptrs[cpu_id] != 0) {
        V3_Print("Disabling VMX\n");

        if (vmx_off() != VMX_SUCCESS) {
            PrintError("Error executing VMXOFF\n");
        }

        V3_FreePages((void *)host_vmcs_ptrs[cpu_id], 1);

        host_vmcs_ptrs[cpu_id] = 0;
    }
}