Palacios Public Git Repository

To check out Palacios, execute

  git clone http://v3vee.org/palacios/palacios.web/palacios.git

This gives you the master branch. You probably want the devel branch or one of the release branches instead. To switch to the devel branch, execute

  cd palacios
  git checkout --track -b devel origin/devel

The other branches are checked out the same way, as shown below.
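For example, to track a release branch (the branch name here is illustrative; run "git branch -r" inside the clone to list the branches that actually exist):

  cd palacios
  git checkout --track -b release-1.2 origin/release-1.2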


palacios/src/palacios/vmx.c
/* 
 * This file is part of the Palacios Virtual Machine Monitor developed
 * by the V3VEE Project with funding from the United States National 
 * Science Foundation and the Department of Energy.  
 *
 * The V3VEE Project is a joint project between Northwestern University
 * and the University of New Mexico.  You can find out more at 
 * http://www.v3vee.org
 *
 * Copyright (c) 2011, Jack Lange <jarusl@cs.northwestern.edu>
 * Copyright (c) 2011, The V3VEE Project <http://www.v3vee.org> 
 * All rights reserved.
 *
 * Author: Jack Lange <jarusl@cs.northwestern.edu>
 *
 * This is free software.  You are permitted to use,
 * redistribute, and modify it as specified in the file "V3VEE_LICENSE".
 */


#include <palacios/vmx.h>
#include <palacios/vmm.h>
#include <palacios/vmx_handler.h>
#include <palacios/vmcs.h>
#include <palacios/vmx_lowlevel.h>
#include <palacios/vmm_lowlevel.h>
#include <palacios/vmm_ctrl_regs.h>
#include <palacios/vmm_config.h>
#include <palacios/vmm_time.h>
#include <palacios/vm_guest_mem.h>
#include <palacios/vmm_direct_paging.h>
#include <palacios/vmx_io.h>
#include <palacios/vmx_msr.h>
#include <palacios/vmm_decoder.h>

#include <palacios/vmx_ept.h>
#include <palacios/vmx_assist.h>
#include <palacios/vmx_hw_info.h>

#ifndef V3_CONFIG_DEBUG_VMX
#undef PrintDebug
#define PrintDebug(fmt, args...)
#endif


/* These fields contain the hardware feature sets supported by the local CPU */
static struct vmx_hw_info hw_info;

extern v3_cpu_arch_t v3_cpu_types[];

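/* One VMXON region pointer per host CPU; the GCC range designator below
 * simply marks every slot as unallocated at load time. */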
static addr_t host_vmcs_ptrs[V3_CONFIG_MAX_CPUS] = { [0 ... V3_CONFIG_MAX_CPUS - 1] = 0 };

extern int v3_vmx_launch(struct v3_gprs * vm_regs, struct guest_info * info, struct v3_ctrl_regs * ctrl_regs);
extern int v3_vmx_resume(struct v3_gprs * vm_regs, struct guest_info * info, struct v3_ctrl_regs * ctrl_regs);

static inline int check_vmcs_write(vmcs_field_t field, addr_t val) {
    int ret = 0;

    ret = vmcs_write(field, val);

    if (ret != VMX_SUCCESS) {
        PrintError("VMWRITE error on %s!: %d\n", v3_vmcs_field_to_str(field), ret);
        return 1;
    }

    return 0;
}

static inline int check_vmcs_read(vmcs_field_t field, void * val) {
    int ret = 0;

    ret = vmcs_read(field, val);

    if (ret != VMX_SUCCESS) {
        PrintError("VMREAD error on %s!: %d\n", v3_vmcs_field_to_str(field), ret);
    }

    return ret;
}

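/* Callers typically OR several results together and test once at the end,
 * along these lines (an illustrative sketch, not a call site in this file;
 * bitmap_pa and msr_bitmap_pa stand in for real physical addresses):
 *
 *     int vmx_ret = 0;
 *     vmx_ret |= check_vmcs_write(VMCS_IO_BITMAP_A_ADDR, bitmap_pa);
 *     vmx_ret |= check_vmcs_write(VMCS_MSR_BITMAP, msr_bitmap_pa);
 *     if (vmx_ret != 0) { return -1; }
 */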
static addr_t allocate_vmcs() {
    struct vmcs_data * vmcs_page = NULL;

    PrintDebug("Allocating page\n");

    vmcs_page = (struct vmcs_data *)V3_VAddr(V3_AllocPages(1));
    memset(vmcs_page, 0, 4096);

    vmcs_page->revision = hw_info.basic_info.revision;
    PrintDebug("VMX Revision: 0x%x\n", vmcs_page->revision);

    return (addr_t)V3_PAddr((void *)vmcs_page);
}
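
/* Note: the Intel SDM requires both VMCS and VMXON regions to be page-aligned
 * and to begin with the VMCS revision identifier, which is why allocate_vmcs()
 * is also reused for the per-CPU VMXON region in v3_init_vmx_cpu() below. */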


static int init_vmcs_bios(struct guest_info * core, struct vmx_data * vmx_state) {
    int vmx_ret = 0;

    // disable global interrupts for vm state initialization
    v3_disable_ints();

    PrintDebug("Loading VMCS\n");
    vmx_ret = vmcs_load(vmx_state->vmcs_ptr_phys);
    vmx_state->state = VMX_UNLAUNCHED;

    if (vmx_ret != VMX_SUCCESS) {
        PrintError("VMPTRLD failed\n");
        return -1;
    }


    /*** Setup default state from HW ***/

    vmx_state->pin_ctrls.value = hw_info.pin_ctrls.def_val;
    vmx_state->pri_proc_ctrls.value = hw_info.proc_ctrls.def_val;
    vmx_state->exit_ctrls.value = hw_info.exit_ctrls.def_val;
    vmx_state->entry_ctrls.value = hw_info.entry_ctrls.def_val;
    vmx_state->sec_proc_ctrls.value = hw_info.sec_proc_ctrls.def_val;

    /* Print Control MSRs */
    PrintDebug("CR0 MSR: %p\n", (void *)(addr_t)hw_info.cr0.value);
    PrintDebug("CR4 MSR: %p\n", (void *)(addr_t)hw_info.cr4.value);



    /******* Setup Host State **********/

    /* Cache GDTR, IDTR, and TR in host struct */
    addr_t gdtr_base;
    struct {
        uint16_t selector;
        addr_t   base;
    } __attribute__((packed)) tmp_seg;


    __asm__ __volatile__(
                         "sgdt (%0);"
                         :
                         : "q"(&tmp_seg)
                         : "memory"
                         );
    gdtr_base = tmp_seg.base;
    vmx_state->host_state.gdtr.base = gdtr_base;

    __asm__ __volatile__(
                         "sidt (%0);"
                         :
                         : "q"(&tmp_seg)
                         : "memory"
                         );
    vmx_state->host_state.idtr.base = tmp_seg.base;

    __asm__ __volatile__(
                         "str (%0);"
                         :
                         : "q"(&tmp_seg)
                         : "memory"
                         );
    vmx_state->host_state.tr.selector = tmp_seg.selector;

    /* The descriptor index is in bits 3-15 of the selector, hence the
     * (selector >> 3) below, scaled by the 8-byte descriptor size. */
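    /* A TSS descriptor scatters its base address across four fields
     * (bits 0-15, 16-23, 24-31, and on x86_64 an upper 32-bit dword),
     * so the TR base has to be reassembled piecewise. */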
    struct tss_descriptor * desc = NULL;
    desc = (struct tss_descriptor *)(gdtr_base + (8 * (tmp_seg.selector >> 3)));

    tmp_seg.base = ((desc->base1) |
                    (desc->base2 << 16) |
                    (desc->base3 << 24) |
#ifdef __V3_64BIT__
                    ((uint64_t)desc->base4 << 32)
#else 
                    (0)
#endif
                    );

    vmx_state->host_state.tr.base = tmp_seg.base;


    /********** Setup VMX Control Fields ***********/

    /* Exit on external interrupts and NMIs */
    vmx_state->pin_ctrls.nmi_exit = 1;
    vmx_state->pin_ctrls.ext_int_exit = 1;


    vmx_state->pri_proc_ctrls.hlt_exit = 1;


    vmx_state->pri_proc_ctrls.pause_exit = 0;
    vmx_state->pri_proc_ctrls.tsc_offset = 1;
#ifdef V3_CONFIG_TIME_VIRTUALIZE_TSC
    vmx_state->pri_proc_ctrls.rdtsc_exit = 1;
#endif

    /* Setup IO map */
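    /* VMX uses two contiguous 4KB I/O bitmaps: bitmap A covers ports
     * 0x0000-0x7fff and bitmap B covers ports 0x8000-0xffff, which is why
     * the second VMCS write below points one page past the first. */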
    vmx_state->pri_proc_ctrls.use_io_bitmap = 1;
    vmx_ret |= check_vmcs_write(VMCS_IO_BITMAP_A_ADDR, (addr_t)V3_PAddr(core->vm_info->io_map.arch_data));
    vmx_ret |= check_vmcs_write(VMCS_IO_BITMAP_B_ADDR, 
            (addr_t)V3_PAddr(core->vm_info->io_map.arch_data) + PAGE_SIZE_4KB);


    vmx_state->pri_proc_ctrls.use_msr_bitmap = 1;
    vmx_ret |= check_vmcs_write(VMCS_MSR_BITMAP, (addr_t)V3_PAddr(core->vm_info->msr_map.arch_data));


#ifdef __V3_64BIT__
    // Ensure host runs in 64-bit mode at each VM EXIT
    vmx_state->exit_ctrls.host_64_on = 1;
#endif

    // Hook all accesses to EFER register
    v3_hook_msr(core->vm_info, EFER_MSR, 
                &v3_handle_efer_read,
                &v3_handle_efer_write, 
                core);

    // Restore host's EFER register on each VM EXIT
    vmx_state->exit_ctrls.ld_efer = 1;

    // Save/restore guest's EFER register to/from VMCS on VM EXIT/ENTRY
    vmx_state->exit_ctrls.save_efer = 1;
    vmx_state->entry_ctrls.ld_efer  = 1;

    // Cause VM_EXIT whenever CR4.VMXE or CR4.PAE bits are written
    vmx_ret |= check_vmcs_write(VMCS_CR4_MASK, CR4_VMXE | CR4_PAE);


    /* Setup paging */
    if (core->shdw_pg_mode == SHADOW_PAGING) {
        PrintDebug("Creating initial shadow page table\n");

        if (v3_init_passthrough_pts(core) == -1) {
            PrintError("Could not initialize passthrough page tables\n");
            return -1;
        }

#define CR0_PE 0x00000001
#define CR0_PG 0x80000000
#define CR0_WP 0x00010000 // To ensure mem hooks work
        vmx_ret |= check_vmcs_write(VMCS_CR0_MASK, (CR0_PE | CR0_PG | CR0_WP));

        core->ctrl_regs.cr3 = core->direct_map_pt;

        // vmx_state->pinbased_ctrls |= NMI_EXIT;

        /* Add CR exits */
        vmx_state->pri_proc_ctrls.cr3_ld_exit = 1;
        vmx_state->pri_proc_ctrls.cr3_str_exit = 1;

        vmx_state->pri_proc_ctrls.invlpg_exit = 1;

        /* Add page fault exits */
        vmx_state->excp_bmap.pf = 1;

        // Setup VMX Assist
        v3_vmxassist_init(core, vmx_state);

    } else if ((core->shdw_pg_mode == NESTED_PAGING) && 
               (v3_cpu_types[core->pcpu_id] == V3_VMX_EPT_CPU)) {

        vmx_ret |= check_vmcs_write(VMCS_CR0_MASK, (CR0_PE | CR0_PG | CR0_WP));

        // vmx_state->pinbased_ctrls |= NMI_EXIT;

        /* Disable CR exits */
        vmx_state->pri_proc_ctrls.cr3_ld_exit = 0;
        vmx_state->pri_proc_ctrls.cr3_str_exit = 0;

        vmx_state->pri_proc_ctrls.invlpg_exit = 0;

        /* Add page fault exits */
        //      vmx_state->excp_bmap.pf = 1; // This should never happen..., enabled to catch bugs

        // Setup VMX Assist
        v3_vmxassist_init(core, vmx_state);

        /* Enable EPT */
        vmx_state->pri_proc_ctrls.sec_ctrls = 1; // Enable secondary proc controls
        vmx_state->sec_proc_ctrls.enable_ept = 1; // enable EPT paging

        if (v3_init_ept(core, &hw_info) == -1) {
            PrintError("Error initializing EPT\n");
            return -1;
        }

    } else if ((core->shdw_pg_mode == NESTED_PAGING) && 
               (v3_cpu_types[core->pcpu_id] == V3_VMX_EPT_UG_CPU)) {
        int i = 0;
        // For now we assume that unrestricted guest mode always comes paired with EPT


        core->vm_regs.rsp = 0x00;
        core->rip = 0xfff0;
        core->vm_regs.rdx = 0x00000f00;
        core->ctrl_regs.rflags = 0x00000002; // The reserved bit is always 1
        core->ctrl_regs.cr0 = 0x60010010; // Set the WP flag so the memory hooks work in real-mode


        core->segments.cs.selector = 0xf000;
        core->segments.cs.limit = 0xffff;
        core->segments.cs.base = 0x0000000f0000LL;

        // (raw attributes = 0xf3)
        core->segments.cs.type = 0xb;
        core->segments.cs.system = 0x1;
        core->segments.cs.dpl = 0x0;
        core->segments.cs.present = 1;


        struct v3_segment * segregs [] = {&(core->segments.ss), &(core->segments.ds), 
                                          &(core->segments.es), &(core->segments.fs), 
                                          &(core->segments.gs), NULL};

        for ( i = 0; segregs[i] != NULL; i++) {
            struct v3_segment * seg = segregs[i];

            seg->selector = 0x0000;
            //    seg->base = seg->selector << 4;
            seg->base = 0x00000000;
            seg->limit = 0xffff;

            seg->type = 0x3;
            seg->system = 0x1;
            seg->dpl = 0x0;
            seg->present = 1;
            //    seg->granularity = 1;
        }


        core->segments.gdtr.limit = 0x0000ffff;
        core->segments.gdtr.base = 0x0000000000000000LL;

        core->segments.idtr.limit = 0x0000ffff;
        core->segments.idtr.base = 0x0000000000000000LL;

        core->segments.ldtr.selector = 0x0000;
        core->segments.ldtr.limit = 0x0000ffff;
        core->segments.ldtr.base = 0x0000000000000000LL;
        core->segments.ldtr.type = 2;
        core->segments.ldtr.present = 1;

        core->segments.tr.selector = 0x0000;
        core->segments.tr.limit = 0x0000ffff;
        core->segments.tr.base = 0x0000000000000000LL;
        core->segments.tr.type = 0xb;
        core->segments.tr.present = 1;

        //      core->dbg_regs.dr6 = 0x00000000ffff0ff0LL;
        core->dbg_regs.dr7 = 0x0000000000000400LL;

        /* Enable EPT */
        vmx_state->pri_proc_ctrls.sec_ctrls = 1; // Enable secondary proc controls
        vmx_state->sec_proc_ctrls.enable_ept = 1; // enable EPT paging
        vmx_state->sec_proc_ctrls.unrstrct_guest = 1; // enable unrestricted guest operation


        /* Disable shadow paging stuff */
        vmx_state->pri_proc_ctrls.cr3_ld_exit = 0;
        vmx_state->pri_proc_ctrls.cr3_str_exit = 0;

        vmx_state->pri_proc_ctrls.invlpg_exit = 0;


        if (v3_init_ept(core, &hw_info) == -1) {
            PrintError("Error initializing EPT\n");
            return -1;
        }

    } else {
        PrintError("Invalid virtual paging mode\n");
        return -1;
    }

    // hook vmx msrs

    // Setup SYSCALL/SYSENTER MSRs in load/store area

    // save STAR, LSTAR, FMASK, KERNEL_GS_BASE MSRs in MSR load/store area
    {
#define IA32_STAR 0xc0000081
#define IA32_LSTAR 0xc0000082
#define IA32_FMASK 0xc0000084
#define IA32_KERN_GS_BASE 0xc0000102

#define IA32_CSTAR 0xc0000083 // Compatibility mode STAR (ignored for now... hopefully it's not that important...)

        int msr_ret = 0;

        struct vmcs_msr_entry * exit_store_msrs = NULL;
        struct vmcs_msr_entry * exit_load_msrs = NULL;
        struct vmcs_msr_entry * entry_load_msrs = NULL;
        int max_msrs = (hw_info.misc_info.max_msr_cache_size + 1) * 4;

        V3_Print("Setting up MSR load/store areas (max_msr_count=%d)\n", max_msrs);

        if (max_msrs < 4) {
            PrintError("Max MSR cache size is too small (%d)\n", max_msrs);
            return -1;
        }

        vmx_state->msr_area = V3_VAddr(V3_AllocPages(1));

        if (vmx_state->msr_area == NULL) {
            PrintError("could not allocate msr load/store area\n");
            return -1;
        }

        msr_ret |= check_vmcs_write(VMCS_EXIT_MSR_STORE_CNT, 4);
        msr_ret |= check_vmcs_write(VMCS_EXIT_MSR_LOAD_CNT, 4);
        msr_ret |= check_vmcs_write(VMCS_ENTRY_MSR_LOAD_CNT, 4);

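        /* The single msr_area page is carved into three 4-entry arrays:
         * the exit-store area at offset 0, the exit-load area one array in,
         * and the entry-load area two arrays in. */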
        exit_store_msrs = (struct vmcs_msr_entry *)(vmx_state->msr_area);
        exit_load_msrs = (struct vmcs_msr_entry *)(vmx_state->msr_area + (sizeof(struct vmcs_msr_entry) * 4));
        entry_load_msrs = (struct vmcs_msr_entry *)(vmx_state->msr_area + (sizeof(struct vmcs_msr_entry) * 8));


        exit_store_msrs[0].index = IA32_STAR;
        exit_store_msrs[1].index = IA32_LSTAR;
        exit_store_msrs[2].index = IA32_FMASK;
        exit_store_msrs[3].index = IA32_KERN_GS_BASE;

        // Replicate the index list so all three areas name the same four MSRs
        memcpy(exit_load_msrs, exit_store_msrs, sizeof(struct vmcs_msr_entry) * 4);
        memcpy(entry_load_msrs, exit_store_msrs, sizeof(struct vmcs_msr_entry) * 4);


        v3_get_msr(IA32_STAR, &(exit_load_msrs[0].hi), &(exit_load_msrs[0].lo));
        v3_get_msr(IA32_LSTAR, &(exit_load_msrs[1].hi), &(exit_load_msrs[1].lo));
        v3_get_msr(IA32_FMASK, &(exit_load_msrs[2].hi), &(exit_load_msrs[2].lo));
        v3_get_msr(IA32_KERN_GS_BASE, &(exit_load_msrs[3].hi), &(exit_load_msrs[3].lo));

        msr_ret |= check_vmcs_write(VMCS_EXIT_MSR_STORE_ADDR, (addr_t)V3_PAddr(exit_store_msrs));
        msr_ret |= check_vmcs_write(VMCS_EXIT_MSR_LOAD_ADDR, (addr_t)V3_PAddr(exit_load_msrs));
        msr_ret |= check_vmcs_write(VMCS_ENTRY_MSR_LOAD_ADDR, (addr_t)V3_PAddr(entry_load_msrs));

    }

    /* Sanity check ctrl/reg fields against hw_defaults */


    /*** Write all the info to the VMCS ***/

    /*
    {
        // IS THIS NECESSARY???
#define DEBUGCTL_MSR 0x1d9
        struct v3_msr tmp_msr;
        v3_get_msr(DEBUGCTL_MSR, &(tmp_msr.hi), &(tmp_msr.lo));
        vmx_ret |= check_vmcs_write(VMCS_GUEST_DBG_CTL, tmp_msr.value);
        core->dbg_regs.dr7 = 0x400;
    }
    */

#ifdef __V3_64BIT__
    vmx_ret |= check_vmcs_write(VMCS_LINK_PTR, (addr_t)0xffffffffffffffffULL);
#else
    vmx_ret |= check_vmcs_write(VMCS_LINK_PTR, (addr_t)0xffffffffUL);
    vmx_ret |= check_vmcs_write(VMCS_LINK_PTR_HIGH, (addr_t)0xffffffffUL);
#endif


    if (v3_update_vmcs_ctrl_fields(core)) {
        PrintError("Could not write control fields!\n");
        return -1;
    }

    if (v3_update_vmcs_host_state(core)) {
        PrintError("Could not write host state\n");
        return -1;
    }

    // Re-enable global interrupts now that the vm state is initialized.
    // If another VM kicks us off this CPU, it will update our vmx state
    // so that we know to reload ourselves.
    v3_enable_ints();

    return 0;
}

int v3_init_vmx_vmcs(struct guest_info * core, v3_vm_class_t vm_class) {
    struct vmx_data * vmx_state = NULL;
    int vmx_ret = 0;

    vmx_state = (struct vmx_data *)V3_Malloc(sizeof(struct vmx_data));
    memset(vmx_state, 0, sizeof(struct vmx_data));

    PrintDebug("vmx_data pointer: %p\n", (void *)vmx_state);

    PrintDebug("Allocating VMCS\n");
    vmx_state->vmcs_ptr_phys = allocate_vmcs();

    PrintDebug("VMCS pointer: %p\n", (void *)(vmx_state->vmcs_ptr_phys));

    core->vmm_data = vmx_state;
    vmx_state->state = VMX_UNLAUNCHED;

    PrintDebug("Initializing VMCS (addr=%p)\n", core->vmm_data);

    // TODO: Fix vmcs fields so they're 32-bit

    PrintDebug("Clearing VMCS: %p\n", (void *)vmx_state->vmcs_ptr_phys);
    vmx_ret = vmcs_clear(vmx_state->vmcs_ptr_phys);

    if (vmx_ret != VMX_SUCCESS) {
        PrintError("VMCLEAR failed\n");
        return -1; 
    }

    if (vm_class == V3_PC_VM) {
        PrintDebug("Initializing VMCS\n");
        if (init_vmcs_bios(core, vmx_state) == -1) {
            PrintError("Error initializing VMCS to BIOS state\n");
            return -1;
        }
    } else {
        PrintError("Invalid VM Class\n");
        return -1;
    }

    PrintDebug("Serializing VMCS: %p\n", (void *)vmx_state->vmcs_ptr_phys);
    vmx_ret = vmcs_clear(vmx_state->vmcs_ptr_phys);

    return 0;
}


int v3_deinit_vmx_vmcs(struct guest_info * core) {
    struct vmx_data * vmx_state = core->vmm_data;

    V3_FreePages((void *)(vmx_state->vmcs_ptr_phys), 1);
    V3_FreePages(V3_PAddr(vmx_state->msr_area), 1);

    V3_Free(vmx_state);

    return 0;
}


static int update_irq_exit_state(struct guest_info * info) {
    struct vmx_exit_idt_vec_info idt_vec_info;

    check_vmcs_read(VMCS_IDT_VECTOR_INFO, &(idt_vec_info.value));

    if ((info->intr_core_state.irq_started == 1) && (idt_vec_info.valid == 0)) {
#ifdef V3_CONFIG_DEBUG_INTERRUPTS
        V3_Print("Calling v3_injecting_intr\n");
#endif
        info->intr_core_state.irq_started = 0;
        v3_injecting_intr(info, info->intr_core_state.irq_vector, V3_EXTERNAL_IRQ);
    }

    return 0;
}

static int update_irq_entry_state(struct guest_info * info) {
    struct vmx_exit_idt_vec_info idt_vec_info;
    struct vmcs_interrupt_state intr_core_state;
    struct vmx_data * vmx_info = (struct vmx_data *)(info->vmm_data);

    check_vmcs_read(VMCS_IDT_VECTOR_INFO, &(idt_vec_info.value));
    check_vmcs_read(VMCS_GUEST_INT_STATE, &(intr_core_state));

    /* Check for pending exceptions to inject */
    if (v3_excp_pending(info)) {
        struct vmx_entry_int_info int_info;
        int_info.value = 0;

        // In VMX, almost every exception is hardware
        // Software exceptions are pretty much only for breakpoint or overflow
        int_info.type = 3;
        int_info.vector = v3_get_excp_number(info);

        if (info->excp_state.excp_error_code_valid) {
            check_vmcs_write(VMCS_ENTRY_EXCP_ERR, info->excp_state.excp_error_code);
            int_info.error_code = 1;

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
            V3_Print("Injecting exception %d with error code %x\n", 
                    int_info.vector, info->excp_state.excp_error_code);
#endif
        }

        int_info.valid = 1;
#ifdef V3_CONFIG_DEBUG_INTERRUPTS
        V3_Print("Injecting exception %d (EIP=%p)\n", int_info.vector, (void *)(addr_t)info->rip);
#endif
        check_vmcs_write(VMCS_ENTRY_INT_INFO, int_info.value);

        v3_injecting_excp(info, int_info.vector);

    } else if ((((struct rflags *)&(info->ctrl_regs.rflags))->intr == 1) && 
               (intr_core_state.val == 0)) {

        if ((info->intr_core_state.irq_started == 1) && (idt_vec_info.valid == 1)) {

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
            V3_Print("IRQ pending from previous injection\n");
#endif

            // Copy the IDT vectoring info over to reinject the old interrupt
            if (idt_vec_info.error_code == 1) {
                uint32_t err_code = 0;

                check_vmcs_read(VMCS_IDT_VECTOR_ERR, &err_code);
                check_vmcs_write(VMCS_ENTRY_EXCP_ERR, err_code);
            }

            idt_vec_info.undef = 0;
            check_vmcs_write(VMCS_ENTRY_INT_INFO, idt_vec_info.value);

        } else {
            struct vmx_entry_int_info ent_int;
            ent_int.value = 0;

            switch (v3_intr_pending(info)) {
                case V3_EXTERNAL_IRQ: {
                    info->intr_core_state.irq_vector = v3_get_intr(info); 
                    ent_int.vector = info->intr_core_state.irq_vector;
                    ent_int.type = 0;
                    ent_int.error_code = 0;
                    ent_int.valid = 1;

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
                    V3_Print("Injecting Interrupt %d at exit %u (EIP=%p)\n", 
                               info->intr_core_state.irq_vector, 
                               (uint32_t)info->num_exits, 
                               (void *)(addr_t)info->rip);
#endif

                    check_vmcs_write(VMCS_ENTRY_INT_INFO, ent_int.value);
                    info->intr_core_state.irq_started = 1;

                    break;
                }
                case V3_NMI:
                    PrintDebug("Injecting NMI\n");

                    ent_int.type = 2;
                    ent_int.vector = 2;
                    ent_int.valid = 1;
                    check_vmcs_write(VMCS_ENTRY_INT_INFO, ent_int.value);

                    break;
                case V3_SOFTWARE_INTR:
                    PrintDebug("Injecting software interrupt\n");
                    ent_int.type = 4;

                    ent_int.valid = 1;
                    check_vmcs_write(VMCS_ENTRY_INT_INFO, ent_int.value);

                    break;
                case V3_VIRTUAL_IRQ:
                    // Not sure what to do here, Intel doesn't have virtual IRQs
                    // May be the same as external interrupts/IRQs

                    break;
                case V3_INVALID_INTR:
                default:
                    break;
            }
        }
    } else if ((v3_intr_pending(info)) && (vmx_info->pri_proc_ctrls.int_wndw_exit == 0)) {
        // Enable INTR window exiting so we know when IF=1
        uint32_t instr_len;

        check_vmcs_read(VMCS_EXIT_INSTR_LEN, &instr_len);

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
        V3_Print("Enabling Interrupt-Window exiting: %d\n", instr_len);
#endif

        vmx_info->pri_proc_ctrls.int_wndw_exit = 1;
        check_vmcs_write(VMCS_PROC_CTRLS, vmx_info->pri_proc_ctrls.value);
    }


    return 0;
}


static struct vmx_exit_info exit_log[10];

static void print_exit_log(struct guest_info * info) {
    int cnt = info->num_exits % 10;
    int i = 0;


    V3_Print("\nExit Log (%d total exits):\n", (uint32_t)info->num_exits);

    for (i = 0; i < 10; i++) {
        struct vmx_exit_info * tmp = &exit_log[cnt];

        V3_Print("%d:\texit_reason = %p\n", i, (void *)(addr_t)tmp->exit_reason);
        V3_Print("\texit_qual = %p\n", (void *)tmp->exit_qual);
        V3_Print("\tint_info = %p\n", (void *)(addr_t)tmp->int_info);
        V3_Print("\tint_err = %p\n", (void *)(addr_t)tmp->int_err);
        V3_Print("\tinstr_info = %p\n", (void *)(addr_t)tmp->instr_info);

        cnt--;

        if (cnt == -1) {
            cnt = 9;
        }
    }
}

/* 
 * CAUTION and DANGER!!! 
 * 
 * The VMCS CANNOT(!!) be accessed outside of the cli/sti calls inside this function
 * When executing a symbiotic call, the VMCS WILL be overwritten, so any dependencies 
 * on its contents will cause things to break. The contents at the time of the exit WILL 
 * change before the exit handler is executed.
 */
int v3_vmx_enter(struct guest_info * info) {
    int ret = 0;
    uint32_t tsc_offset_low, tsc_offset_high;
    struct vmx_exit_info exit_info;
    struct vmx_data * vmx_info = (struct vmx_data *)(info->vmm_data);

    // Conditionally yield the CPU if the timeslice has expired
    v3_yield_cond(info);

    // Perform any additional yielding needed for time adjustment
    v3_adjust_time(info);

    // disable global interrupts for vm state transition
    v3_disable_ints();

    // Update the timer devices late (just before re-entry) so that as much
    // of the time spent in the VM as possible is accounted for. Also do it
    // before updating the IRQ entry state so that any interrupts the timers
    // raise get handled on the next VM entry. Must be done with interrupts
    // disabled.
    v3_update_timers(info);

    if (vmcs_store() != vmx_info->vmcs_ptr_phys) {
        vmcs_load(vmx_info->vmcs_ptr_phys);
    }

    v3_vmx_restore_vmcs(info);


#ifdef V3_CONFIG_SYMCALL
    if (info->sym_core_state.symcall_state.sym_call_active == 0) {
        update_irq_entry_state(info);
    }
#else 
    update_irq_entry_state(info);
#endif

    {
        addr_t guest_cr3;
        vmcs_read(VMCS_GUEST_CR3, &guest_cr3);
        vmcs_write(VMCS_GUEST_CR3, guest_cr3);
    }

    // Perform last-minute time bookkeeping prior to entering the VM
    v3_time_enter_vm(info);

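    // The VMCS TSC offset is a single 64-bit field, but it is written below
    // as two 32-bit halves (the low word via VMCS_TSC_OFFSET, the high word
    // via VMCS_TSC_OFFSET_HIGH).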
    tsc_offset_high = (uint32_t)((v3_tsc_host_offset(&info->time_state) >> 32) & 0xffffffff);
    tsc_offset_low = (uint32_t)(v3_tsc_host_offset(&info->time_state) & 0xffffffff);
    check_vmcs_write(VMCS_TSC_OFFSET_HIGH, tsc_offset_high);
    check_vmcs_write(VMCS_TSC_OFFSET, tsc_offset_low);

    if (v3_update_vmcs_host_state(info)) {
        v3_enable_ints();
        PrintError("Could not write host state\n");
        return -1;
    }


    if (vmx_info->state == VMX_UNLAUNCHED) {
        vmx_info->state = VMX_LAUNCHED;
        info->vm_info->run_state = VM_RUNNING;
        ret = v3_vmx_launch(&(info->vm_regs), info, &(info->ctrl_regs));
    } else {
        V3_ASSERT(vmx_info->state != VMX_UNLAUNCHED);
        ret = v3_vmx_resume(&(info->vm_regs), info, &(info->ctrl_regs));
    }

    //  PrintDebug("VMX Exit: ret=%d\n", ret);

    if (ret != VMX_SUCCESS) {
        uint32_t error = 0;

        vmcs_read(VMCS_INSTR_ERR, &error);

        v3_enable_ints();

        PrintError("VMENTRY Error: %d\n", error);
        return -1;
    }

    // Immediate exit from VM time bookkeeping
    v3_time_exit_vm(info);

    info->num_exits++;

    /* Update guest state */
    v3_vmx_save_vmcs(info);

    // info->cpl = info->segments.cs.selector & 0x3;

    info->mem_mode = v3_get_vm_mem_mode(info);
    info->cpu_mode = v3_get_vm_cpu_mode(info);


    check_vmcs_read(VMCS_EXIT_INSTR_LEN, &(exit_info.instr_len));
    check_vmcs_read(VMCS_EXIT_INSTR_INFO, &(exit_info.instr_info));
    check_vmcs_read(VMCS_EXIT_REASON, &(exit_info.exit_reason));
    check_vmcs_read(VMCS_EXIT_QUAL, &(exit_info.exit_qual));
    check_vmcs_read(VMCS_EXIT_INT_INFO, &(exit_info.int_info));
    check_vmcs_read(VMCS_EXIT_INT_ERR, &(exit_info.int_err));
    check_vmcs_read(VMCS_GUEST_LINEAR_ADDR, &(exit_info.guest_linear_addr));

    if (info->shdw_pg_mode == NESTED_PAGING) {
        check_vmcs_read(VMCS_GUEST_PHYS_ADDR, &(exit_info.ept_fault_addr));
    }

    //PrintDebug("VMX Exit taken, id-qual: %u-%lu\n", exit_info.exit_reason, exit_info.exit_qual);

    exit_log[info->num_exits % 10] = exit_info;

#ifdef V3_CONFIG_SYMCALL
    if (info->sym_core_state.symcall_state.sym_call_active == 0) {
        update_irq_exit_state(info);
    }
#else
    update_irq_exit_state(info);
#endif

    if (exit_info.exit_reason == VMEXIT_INTR_WINDOW) {
        // This is a special case whose only job is to inject an interrupt
        vmcs_read(VMCS_PROC_CTRLS, &(vmx_info->pri_proc_ctrls.value));
        vmx_info->pri_proc_ctrls.int_wndw_exit = 0;
        vmcs_write(VMCS_PROC_CTRLS, vmx_info->pri_proc_ctrls.value);

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
        V3_Print("Interrupts available again! (RIP=%llx)\n", info->rip);
#endif
    }

    // reenable global interrupts after vm exit
    v3_enable_ints();

    // Conditionally yield the CPU if the timeslice has expired
    v3_yield_cond(info);

    if (v3_handle_vmx_exit(info, &exit_info) == -1) {
        PrintError("Error in VMX exit handler (Exit reason=%x)\n", exit_info.exit_reason);
        return -1;
    }

    return 0;
}


int v3_start_vmx_guest(struct guest_info * info) {

    PrintDebug("Starting VMX core %u\n", info->vcpu_id);

    if (info->vcpu_id == 0) {
        info->core_run_state = CORE_RUNNING;
        info->vm_info->run_state = VM_RUNNING;
    } else {

        PrintDebug("VMX core %u: Waiting for core initialization\n", info->vcpu_id);

        while (info->core_run_state == CORE_STOPPED) {
            v3_yield(info);
            //PrintDebug("VMX core %u: still waiting for INIT\n", info->vcpu_id);
        }

        PrintDebug("VMX core %u initialized\n", info->vcpu_id);
    }


    PrintDebug("VMX core %u: I am starting at CS=0x%x (base=0x%p, limit=0x%x), RIP=0x%p\n",
               info->vcpu_id, info->segments.cs.selector, (void *)(info->segments.cs.base),
               info->segments.cs.limit, (void *)(info->rip));


    PrintDebug("VMX core %u: Launching VMX VM on logical core %u\n", info->vcpu_id, info->pcpu_id);

    v3_start_time(info);

    while (1) {

        if (info->vm_info->run_state == VM_STOPPED) {
            info->core_run_state = CORE_STOPPED;
            break;
        }

        if (v3_vmx_enter(info) == -1) {

            addr_t host_addr;
            addr_t linear_addr = 0;

            info->vm_info->run_state = VM_ERROR;

            V3_Print("VMX core %u: VMX ERROR!!\n", info->vcpu_id); 

            v3_print_guest_state(info);

            V3_Print("VMX core %u\n", info->vcpu_id); 


            linear_addr = get_addr_linear(info, info->rip, &(info->segments.cs));

            if (info->mem_mode == PHYSICAL_MEM) {
                v3_gpa_to_hva(info, linear_addr, &host_addr);
            } else if (info->mem_mode == VIRTUAL_MEM) {
                v3_gva_to_hva(info, linear_addr, &host_addr);
            }

            V3_Print("VMX core %u: Host Address of rip = 0x%p\n", info->vcpu_id, (void *)host_addr);

            V3_Print("VMX core %u: Instr (15 bytes) at %p:\n", info->vcpu_id, (void *)host_addr);
            v3_dump_mem((uint8_t *)host_addr, 15);

            v3_print_stack(info);


            v3_print_vmcs();
            print_exit_log(info);
            return -1;
        }



        if (info->vm_info->run_state == VM_STOPPED) {
            info->core_run_state = CORE_STOPPED;
            break;
        }
/*
        if ((info->num_exits % 5000) == 0) {
            V3_Print("VMX Exit number %d\n", (uint32_t)info->num_exits);
        }
*/

    }

    return 0;
}



#define VMX_FEATURE_CONTROL_MSR     0x0000003a
#define CPUID_VMX_FEATURES 0x00000005  /* LOCK and VMXON */
#define CPUID_1_ECX_VTXFLAG 0x00000020
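/* In IA32_FEATURE_CONTROL, bit 0 is the lock bit and bit 2 enables VMXON
 * outside SMX operation; both must be set (mask 0x5), otherwise the BIOS has
 * disabled VMX. CPUID.1:ECX bit 5 (0x20) advertises VMX support. */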

int v3_is_vmx_capable() {
    v3_msr_t feature_msr;
    uint32_t eax = 0, ebx = 0, ecx = 0, edx = 0;

    v3_cpuid(0x1, &eax, &ebx, &ecx, &edx);

    PrintDebug("ECX: 0x%x\n", ecx);

    if (ecx & CPUID_1_ECX_VTXFLAG) {
        v3_get_msr(VMX_FEATURE_CONTROL_MSR, &(feature_msr.hi), &(feature_msr.lo));

        PrintDebug("MSRREGlow: 0x%.8x\n", feature_msr.lo);

        if ((feature_msr.lo & CPUID_VMX_FEATURES) != CPUID_VMX_FEATURES) {
            PrintDebug("VMX is locked -- enable in the BIOS\n");
            return 0;
        }

    } else {
        PrintDebug("VMX not supported on this cpu\n");
        return 0;
    }

    return 1;
}

int v3_reset_vmx_vm_core(struct guest_info * core, addr_t rip) {
    // init vmcs bios

    if ((core->shdw_pg_mode == NESTED_PAGING) && 
        (v3_cpu_types[core->pcpu_id] == V3_VMX_EPT_UG_CPU)) {
        // easy 
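        // In unrestricted-guest mode the incoming rip value appears to act as
        // a SIPI-style startup vector: CS.selector = vector << 8 and
        // CS.base = vector << 12, so the core starts at physical address
        // (vector << 12) with RIP = 0.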
        core->rip = 0;
        core->segments.cs.selector = rip << 8;
        core->segments.cs.limit = 0xffff;
        core->segments.cs.base = rip << 12;
    } else {
        core->vm_regs.rdx = core->vcpu_id;
        core->vm_regs.rbx = rip;
    }

    return 0;
}


void v3_init_vmx_cpu(int cpu_id) {
    addr_t vmx_on_region = 0;

    if (cpu_id == 0) {
        if (v3_init_vmx_hw(&hw_info) == -1) {
            PrintError("Could not initialize VMX hardware features on cpu %d\n", cpu_id);
            return;
        }
    }

    enable_vmx();


    // Setup VMXON Region
    vmx_on_region = allocate_vmcs();


    if (vmx_on(vmx_on_region) == VMX_SUCCESS) {
        V3_Print("VMX Enabled\n");
        host_vmcs_ptrs[cpu_id] = vmx_on_region;
    } else {
        V3_Print("VMX already enabled\n");
        V3_FreePages((void *)vmx_on_region, 1);
    }

    PrintDebug("VMXON pointer: 0x%p\n", (void *)host_vmcs_ptrs[cpu_id]);

    {
        struct vmx_sec_proc_ctrls sec_proc_ctrls;
        sec_proc_ctrls.value = v3_vmx_get_ctrl_features(&(hw_info.sec_proc_ctrls));

        if (sec_proc_ctrls.enable_ept == 0) {
            V3_Print("VMX EPT (Nested) Paging not supported\n");
            v3_cpu_types[cpu_id] = V3_VMX_CPU;
        } else if (sec_proc_ctrls.unrstrct_guest == 0) {
            V3_Print("VMX EPT (Nested) Paging supported\n");
            v3_cpu_types[cpu_id] = V3_VMX_EPT_CPU;
        } else {
            V3_Print("VMX EPT (Nested) Paging + Unrestricted guest supported\n");
            v3_cpu_types[cpu_id] = V3_VMX_EPT_UG_CPU;
        }
    }
}


void v3_deinit_vmx_cpu(int cpu_id) {
    extern v3_cpu_arch_t v3_cpu_types[];
    v3_cpu_types[cpu_id] = V3_INVALID_CPU;

    if (host_vmcs_ptrs[cpu_id] != 0) {
        V3_Print("Disabling VMX\n");

        if (vmx_off() != VMX_SUCCESS) {
            PrintError("Error executing VMXOFF\n");
        }

        V3_FreePages((void *)host_vmcs_ptrs[cpu_id], 1);

        host_vmcs_ptrs[cpu_id] = 0;
    }
}