Palacios Public Git Repository

To check out Palacios, execute

  git clone http://v3vee.org/palacios/palacios.web/palacios.git

This will give you the master branch. You probably want the devel branch or one of the release branches. To switch to the devel branch, execute

  cd palacios
  git checkout --track -b devel origin/devel

The other branches work the same way.
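For example, to track one of the release branches (the branch name below is illustrative; run "git branch -r" after cloning to list the branches that actually exist):

  git checkout --track -b Release-1.2 origin/Release-1.2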


Commit: More timing cleanup
File: palacios/src/palacios/vmx.c

/*
 * This file is part of the Palacios Virtual Machine Monitor developed
 * by the V3VEE Project with funding from the United States National
 * Science Foundation and the Department of Energy.
 *
 * The V3VEE Project is a joint project between Northwestern University
 * and the University of New Mexico.  You can find out more at
 * http://www.v3vee.org
 *
 * Copyright (c) 2011, Jack Lange <jarusl@cs.northwestern.edu>
 * Copyright (c) 2011, The V3VEE Project <http://www.v3vee.org>
 * All rights reserved.
 *
 * Author: Jack Lange <jarusl@cs.northwestern.edu>
 *
 * This is free software.  You are permitted to use,
 * redistribute, and modify it as specified in the file "V3VEE_LICENSE".
 */


#include <palacios/vmx.h>
#include <palacios/vmm.h>
#include <palacios/vmx_handler.h>
#include <palacios/vmcs.h>
#include <palacios/vmx_lowlevel.h>
#include <palacios/vmm_lowlevel.h>
#include <palacios/vmm_ctrl_regs.h>
#include <palacios/vmm_config.h>
#include <palacios/vmm_time.h>
#include <palacios/vm_guest_mem.h>
#include <palacios/vmm_direct_paging.h>
#include <palacios/vmx_io.h>
#include <palacios/vmx_msr.h>

#include <palacios/vmx_ept.h>
#include <palacios/vmx_assist.h>
#include <palacios/vmx_hw_info.h>

#ifndef V3_CONFIG_DEBUG_VMX
#undef PrintDebug
#define PrintDebug(fmt, args...)
#endif

/* These fields contain the hardware feature sets supported by the local CPU */
static struct vmx_hw_info hw_info;

extern v3_cpu_arch_t v3_cpu_types[];

static addr_t host_vmcs_ptrs[V3_CONFIG_MAX_CPUS] = { [0 ... V3_CONFIG_MAX_CPUS - 1] = 0};

extern int v3_vmx_launch(struct v3_gprs * vm_regs, struct guest_info * info, struct v3_ctrl_regs * ctrl_regs);
extern int v3_vmx_resume(struct v3_gprs * vm_regs, struct guest_info * info, struct v3_ctrl_regs * ctrl_regs);

static inline int check_vmcs_write(vmcs_field_t field, addr_t val) {
    int ret = 0;

    ret = vmcs_write(field, val);

    if (ret != VMX_SUCCESS) {
        PrintError("VMWRITE error on %s!: %d\n", v3_vmcs_field_to_str(field), ret);
        return 1;
    }

    return 0;
}

static inline int check_vmcs_read(vmcs_field_t field, void * val) {
    int ret = 0;

    ret = vmcs_read(field, val);

    if (ret != VMX_SUCCESS) {
        PrintError("VMREAD error on %s!: %d\n", v3_vmcs_field_to_str(field), ret);
    }

    return ret;
}


static addr_t allocate_vmcs() {
    struct vmcs_data * vmcs_page = NULL;

    PrintDebug("Allocating page\n");

    vmcs_page = (struct vmcs_data *)V3_VAddr(V3_AllocPages(1));
    memset(vmcs_page, 0, 4096);

    vmcs_page->revision = hw_info.basic_info.revision;
    PrintDebug("VMX Revision: 0x%x\n", vmcs_page->revision);

    return (addr_t)V3_PAddr((void *)vmcs_page);
}


static int init_vmcs_bios(struct guest_info * core, struct vmx_data * vmx_state) {
    int vmx_ret = 0;

    // disable global interrupts for vm state initialization
    v3_disable_ints();

    PrintDebug("Loading VMCS\n");
    vmx_ret = vmcs_load(vmx_state->vmcs_ptr_phys);
    vmx_state->state = VMX_UNLAUNCHED;

    if (vmx_ret != VMX_SUCCESS) {
        PrintError("VMPTRLD failed\n");
        return -1;
    }


    /*** Setup default state from HW ***/

    vmx_state->pin_ctrls.value = hw_info.pin_ctrls.def_val;
    vmx_state->pri_proc_ctrls.value = hw_info.proc_ctrls.def_val;
    vmx_state->exit_ctrls.value = hw_info.exit_ctrls.def_val;
    vmx_state->entry_ctrls.value = hw_info.entry_ctrls.def_val;
    vmx_state->sec_proc_ctrls.value = hw_info.sec_proc_ctrls.def_val;

    /* Print Control MSRs */
    PrintDebug("CR0 MSR: %p\n", (void *)(addr_t)hw_info.cr0.value);
    PrintDebug("CR4 MSR: %p\n", (void *)(addr_t)hw_info.cr4.value);



    /******* Setup Host State **********/

    /* Cache GDTR, IDTR, and TR in host struct */
    addr_t gdtr_base;
    struct {
        uint16_t selector;
        addr_t   base;
    } __attribute__((packed)) tmp_seg;


    __asm__ __volatile__(
                         "sgdt (%0);"
                         :
                         : "q"(&tmp_seg)
                         : "memory"
                         );
    gdtr_base = tmp_seg.base;
    vmx_state->host_state.gdtr.base = gdtr_base;

    __asm__ __volatile__(
                         "sidt (%0);"
                         :
                         : "q"(&tmp_seg)
                         : "memory"
                         );
    vmx_state->host_state.idtr.base = tmp_seg.base;

    __asm__ __volatile__(
                         "str (%0);"
                         :
                         : "q"(&tmp_seg)
                         : "memory"
                         );
    vmx_state->host_state.tr.selector = tmp_seg.selector;

    /* The GDT *index* is bits 3-15 of the selector. */
    struct tss_descriptor * desc = NULL;
    desc = (struct tss_descriptor *)(gdtr_base + (8 * (tmp_seg.selector >> 3)));

    tmp_seg.base = ((desc->base1) |
                    (desc->base2 << 16) |
                    (desc->base3 << 24) |
#ifdef __V3_64BIT__
                    ((uint64_t)desc->base4 << 32)
#else
                    (0)
#endif
                    );

    vmx_state->host_state.tr.base = tmp_seg.base;


    /********** Setup VMX Control Fields ***********/

    /* Add external interrupts, NMI exiting, and virtual NMI */
    vmx_state->pin_ctrls.nmi_exit = 1;
    vmx_state->pin_ctrls.ext_int_exit = 1;


    vmx_state->pri_proc_ctrls.hlt_exit = 1;


    vmx_state->pri_proc_ctrls.pause_exit = 0;
    vmx_state->pri_proc_ctrls.tsc_offset = 1;
#ifdef V3_CONFIG_TIME_VIRTUALIZE_TSC
    vmx_state->pri_proc_ctrls.rdtsc_exit = 1;
#endif

    /* Setup IO map */
    vmx_state->pri_proc_ctrls.use_io_bitmap = 1;
    vmx_ret |= check_vmcs_write(VMCS_IO_BITMAP_A_ADDR, (addr_t)V3_PAddr(core->vm_info->io_map.arch_data));
    vmx_ret |= check_vmcs_write(VMCS_IO_BITMAP_B_ADDR,
            (addr_t)V3_PAddr(core->vm_info->io_map.arch_data) + PAGE_SIZE_4KB);
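    /* Per the VMX spec, I/O bitmap A covers ports 0x0000-0x7fff and bitmap B
     * covers ports 0x8000-0xffff, hence the PAGE_SIZE_4KB offset into the
     * same two-page I/O map. */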

    vmx_state->pri_proc_ctrls.use_msr_bitmap = 1;
    vmx_ret |= check_vmcs_write(VMCS_MSR_BITMAP, (addr_t)V3_PAddr(core->vm_info->msr_map.arch_data));




#ifdef __V3_64BIT__
    vmx_state->exit_ctrls.host_64_on = 1;
#endif


    /* Not sure how exactly to handle this... */
    v3_hook_msr(core->vm_info, EFER_MSR,
                &v3_handle_efer_read,
                &v3_handle_efer_write,
                core);

    // Or is it this???
    vmx_state->entry_ctrls.ld_efer = 1;
    vmx_state->exit_ctrls.ld_efer = 1;
    vmx_state->exit_ctrls.save_efer = 1;
    /*   ***   */

    vmx_ret |= check_vmcs_write(VMCS_CR4_MASK, CR4_VMXE);


    /* Setup paging */
    if (core->shdw_pg_mode == SHADOW_PAGING) {
        PrintDebug("Creating initial shadow page table\n");

        if (v3_init_passthrough_pts(core) == -1) {
            PrintError("Could not initialize passthrough page tables\n");
            return -1;
        }

#define CR0_PE 0x00000001
#define CR0_PG 0x80000000
#define CR0_WP 0x00010000 // To ensure mem hooks work
        vmx_ret |= check_vmcs_write(VMCS_CR0_MASK, (CR0_PE | CR0_PG | CR0_WP));

        core->ctrl_regs.cr3 = core->direct_map_pt;

        // vmx_state->pinbased_ctrls |= NMI_EXIT;

        /* Add CR exits */
        vmx_state->pri_proc_ctrls.cr3_ld_exit = 1;
        vmx_state->pri_proc_ctrls.cr3_str_exit = 1;

        vmx_state->pri_proc_ctrls.invlpg_exit = 1;

        /* Add page fault exits */
        vmx_state->excp_bmap.pf = 1;

        // Setup VMX Assist
        v3_vmxassist_init(core, vmx_state);

    } else if ((core->shdw_pg_mode == NESTED_PAGING) &&
               (v3_cpu_types[core->pcpu_id] == V3_VMX_EPT_CPU)) {

#define CR0_PE 0x00000001
#define CR0_PG 0x80000000
#define CR0_WP 0x00010000 // To ensure mem hooks work
        vmx_ret |= check_vmcs_write(VMCS_CR0_MASK, (CR0_PE | CR0_PG | CR0_WP));

        // vmx_state->pinbased_ctrls |= NMI_EXIT;

        /* Disable CR exits */
        vmx_state->pri_proc_ctrls.cr3_ld_exit = 0;
        vmx_state->pri_proc_ctrls.cr3_str_exit = 0;

        vmx_state->pri_proc_ctrls.invlpg_exit = 0;

        /* Add page fault exits */
        //      vmx_state->excp_bmap.pf = 1; // This should never happen..., enabled to catch bugs

        // Setup VMX Assist
        v3_vmxassist_init(core, vmx_state);

        /* Enable EPT */
        vmx_state->pri_proc_ctrls.sec_ctrls = 1; // Enable secondary proc controls
        vmx_state->sec_proc_ctrls.enable_ept = 1; // enable EPT paging



        if (v3_init_ept(core, &hw_info) == -1) {
            PrintError("Error initializing EPT\n");
            return -1;
        }

    } else if ((core->shdw_pg_mode == NESTED_PAGING) &&
               (v3_cpu_types[core->pcpu_id] == V3_VMX_EPT_UG_CPU)) {
        int i = 0;
        // For now we will assume that unrestricted guest mode is available w/ EPT

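        /* The following values mirror the architectural x86 reset state:
         * execution begins at f000:fff0 (linear address 0xffff0 with the CS
         * base used here), EDX holds a processor signature, and only the
         * always-set reserved bit of RFLAGS is 1. */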
        core->vm_regs.rsp = 0x00;
        core->rip = 0xfff0;
        core->vm_regs.rdx = 0x00000f00;
        core->ctrl_regs.rflags = 0x00000002; // The reserved bit is always 1
        core->ctrl_regs.cr0 = 0x60010010; // Set the WP flag so the memory hooks work in real-mode


        core->segments.cs.selector = 0xf000;
        core->segments.cs.limit = 0xffff;
        core->segments.cs.base = 0x0000000f0000LL;

        // (raw attributes = 0xf3)
        core->segments.cs.type = 0xb;
        core->segments.cs.system = 0x1;
        core->segments.cs.dpl = 0x0;
        core->segments.cs.present = 1;



        struct v3_segment * segregs [] = {&(core->segments.ss), &(core->segments.ds),
                                          &(core->segments.es), &(core->segments.fs),
                                          &(core->segments.gs), NULL};

        for (i = 0; segregs[i] != NULL; i++) {
            struct v3_segment * seg = segregs[i];

            seg->selector = 0x0000;
            //    seg->base = seg->selector << 4;
            seg->base = 0x00000000;
            seg->limit = 0xffff;


            seg->type = 0x3;
            seg->system = 0x1;
            seg->dpl = 0x0;
            seg->present = 1;
            //    seg->granularity = 1;

        }


        core->segments.gdtr.limit = 0x0000ffff;
        core->segments.gdtr.base = 0x0000000000000000LL;

        core->segments.idtr.limit = 0x0000ffff;
        core->segments.idtr.base = 0x0000000000000000LL;

        core->segments.ldtr.selector = 0x0000;
        core->segments.ldtr.limit = 0x0000ffff;
        core->segments.ldtr.base = 0x0000000000000000LL;
        core->segments.ldtr.type = 2;
        core->segments.ldtr.present = 1;

        core->segments.tr.selector = 0x0000;
        core->segments.tr.limit = 0x0000ffff;
        core->segments.tr.base = 0x0000000000000000LL;
        core->segments.tr.type = 0xb;
        core->segments.tr.present = 1;

        //      core->dbg_regs.dr6 = 0x00000000ffff0ff0LL;
        core->dbg_regs.dr7 = 0x0000000000000400LL;

        /* Enable EPT */
        vmx_state->pri_proc_ctrls.sec_ctrls = 1; // Enable secondary proc controls
        vmx_state->sec_proc_ctrls.enable_ept = 1; // enable EPT paging
        vmx_state->sec_proc_ctrls.unrstrct_guest = 1; // enable unrestricted guest operation


        /* Disable shadow paging stuff */
        vmx_state->pri_proc_ctrls.cr3_ld_exit = 0;
        vmx_state->pri_proc_ctrls.cr3_str_exit = 0;

        vmx_state->pri_proc_ctrls.invlpg_exit = 0;


        if (v3_init_ept(core, &hw_info) == -1) {
            PrintError("Error initializing EPT\n");
            return -1;
        }

    } else {
        PrintError("Invalid virtual paging mode\n");
        return -1;
    }


    // hook vmx msrs

    // Setup SYSCALL/SYSENTER MSRs in load/store area

    // save STAR, LSTAR, FMASK, KERNEL_GS_BASE MSRs in MSR load/store area
    {
#define IA32_STAR 0xc0000081
#define IA32_LSTAR 0xc0000082
#define IA32_FMASK 0xc0000084
#define IA32_KERN_GS_BASE 0xc0000102

#define IA32_CSTAR 0xc0000083 // Compatibility mode STAR (ignored for now... hopefully it's not that important...)

        int msr_ret = 0;

        struct vmcs_msr_entry * exit_store_msrs = NULL;
        struct vmcs_msr_entry * exit_load_msrs = NULL;
        struct vmcs_msr_entry * entry_load_msrs = NULL;
        int max_msrs = (hw_info.misc_info.max_msr_cache_size + 1) * 4;

        V3_Print("Setting up MSR load/store areas (max_msr_count=%d)\n", max_msrs);

        if (max_msrs < 4) {
            PrintError("Max MSR cache size is too small (%d)\n", max_msrs);
            return -1;
        }

        vmx_state->msr_area = V3_VAddr(V3_AllocPages(1));

        if (vmx_state->msr_area == NULL) {
            PrintError("could not allocate msr load/store area\n");
            return -1;
        }

        msr_ret |= check_vmcs_write(VMCS_EXIT_MSR_STORE_CNT, 4);
        msr_ret |= check_vmcs_write(VMCS_EXIT_MSR_LOAD_CNT, 4);
        msr_ret |= check_vmcs_write(VMCS_ENTRY_MSR_LOAD_CNT, 4);


        exit_store_msrs = (struct vmcs_msr_entry *)(vmx_state->msr_area);
        exit_load_msrs = (struct vmcs_msr_entry *)(vmx_state->msr_area + (sizeof(struct vmcs_msr_entry) * 4));
        entry_load_msrs = (struct vmcs_msr_entry *)(vmx_state->msr_area + (sizeof(struct vmcs_msr_entry) * 8));
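        /* The single msr_area page is carved into three consecutive
         * four-entry arrays: exit-store, exit-load, and entry-load. */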

        exit_store_msrs[0].index = IA32_STAR;
        exit_store_msrs[1].index = IA32_LSTAR;
        exit_store_msrs[2].index = IA32_FMASK;
        exit_store_msrs[3].index = IA32_KERN_GS_BASE;

        // Replicate the MSR index table into the exit-load and entry-load
        // areas so that all three areas operate on the same four MSRs
        memcpy(exit_load_msrs, exit_store_msrs, sizeof(struct vmcs_msr_entry) * 4);
        memcpy(entry_load_msrs, exit_store_msrs, sizeof(struct vmcs_msr_entry) * 4);


        v3_get_msr(IA32_STAR, &(exit_load_msrs[0].hi), &(exit_load_msrs[0].lo));
        v3_get_msr(IA32_LSTAR, &(exit_load_msrs[1].hi), &(exit_load_msrs[1].lo));
        v3_get_msr(IA32_FMASK, &(exit_load_msrs[2].hi), &(exit_load_msrs[2].lo));
        v3_get_msr(IA32_KERN_GS_BASE, &(exit_load_msrs[3].hi), &(exit_load_msrs[3].lo));

        msr_ret |= check_vmcs_write(VMCS_EXIT_MSR_STORE_ADDR, (addr_t)V3_PAddr(exit_store_msrs));
        msr_ret |= check_vmcs_write(VMCS_EXIT_MSR_LOAD_ADDR, (addr_t)V3_PAddr(exit_load_msrs));
        msr_ret |= check_vmcs_write(VMCS_ENTRY_MSR_LOAD_ADDR, (addr_t)V3_PAddr(entry_load_msrs));

    }

    /* Sanity check ctrl/reg fields against hw_defaults */




    /*** Write all the info to the VMCS ***/

    /*
    {
        // IS THIS NECESSARY???
#define DEBUGCTL_MSR 0x1d9
        struct v3_msr tmp_msr;
        v3_get_msr(DEBUGCTL_MSR, &(tmp_msr.hi), &(tmp_msr.lo));
        vmx_ret |= check_vmcs_write(VMCS_GUEST_DBG_CTL, tmp_msr.value);
        core->dbg_regs.dr7 = 0x400;
    }
    */

#ifdef __V3_64BIT__
    vmx_ret |= check_vmcs_write(VMCS_LINK_PTR, (addr_t)0xffffffffffffffffULL);
#else
    vmx_ret |= check_vmcs_write(VMCS_LINK_PTR, (addr_t)0xffffffffUL);
    vmx_ret |= check_vmcs_write(VMCS_LINK_PTR_HIGH, (addr_t)0xffffffffUL);
#endif



    if (v3_update_vmcs_ctrl_fields(core)) {
        PrintError("Could not write control fields!\n");
        return -1;
    }

    if (v3_update_vmcs_host_state(core)) {
        PrintError("Could not write host state\n");
        return -1;
    }

    // re-enable global interrupts now that the vm state is initialized.
    // If another VM kicks us off, it'll update our vmx state so that we
    // know to reload ourselves
    v3_enable_ints();

    return 0;
}

int v3_init_vmx_vmcs(struct guest_info * core, v3_vm_class_t vm_class) {
    struct vmx_data * vmx_state = NULL;
    int vmx_ret = 0;

    vmx_state = (struct vmx_data *)V3_Malloc(sizeof(struct vmx_data));
    memset(vmx_state, 0, sizeof(struct vmx_data));

    PrintDebug("vmx_data pointer: %p\n", (void *)vmx_state);

    PrintDebug("Allocating VMCS\n");
    vmx_state->vmcs_ptr_phys = allocate_vmcs();

    PrintDebug("VMCS pointer: %p\n", (void *)(vmx_state->vmcs_ptr_phys));

    core->vmm_data = vmx_state;
    vmx_state->state = VMX_UNLAUNCHED;

    PrintDebug("Initializing VMCS (addr=%p)\n", core->vmm_data);

    // TODO: Fix vmcs fields so they're 32-bit

    PrintDebug("Clearing VMCS: %p\n", (void *)vmx_state->vmcs_ptr_phys);
    vmx_ret = vmcs_clear(vmx_state->vmcs_ptr_phys);

    if (vmx_ret != VMX_SUCCESS) {
        PrintError("VMCLEAR failed\n");
        return -1;
    }

    if (vm_class == V3_PC_VM) {
        PrintDebug("Initializing VMCS\n");
        if (init_vmcs_bios(core, vmx_state) == -1) {
            PrintError("Error initializing VMCS to BIOS state\n");
            return -1;
        }
    } else {
        PrintError("Invalid VM Class\n");
        return -1;
    }

    PrintDebug("Serializing VMCS: %p\n", (void *)vmx_state->vmcs_ptr_phys);
    vmx_ret = vmcs_clear(vmx_state->vmcs_ptr_phys);

    return 0;
}


int v3_deinit_vmx_vmcs(struct guest_info * core) {
    struct vmx_data * vmx_state = core->vmm_data;

    V3_FreePages((void *)(vmx_state->vmcs_ptr_phys), 1);
    V3_FreePages(vmx_state->msr_area, 1);

    V3_Free(vmx_state);

    return 0;
}


static int update_irq_exit_state(struct guest_info * info) {
    struct vmx_exit_idt_vec_info idt_vec_info;

    check_vmcs_read(VMCS_IDT_VECTOR_INFO, &(idt_vec_info.value));

    if ((info->intr_core_state.irq_started == 1) && (idt_vec_info.valid == 0)) {
#ifdef V3_CONFIG_DEBUG_INTERRUPTS
        V3_Print("Calling v3_injecting_intr\n");
#endif
        info->intr_core_state.irq_started = 0;
        v3_injecting_intr(info, info->intr_core_state.irq_vector, V3_EXTERNAL_IRQ);
    }

    return 0;
}

static int update_irq_entry_state(struct guest_info * info) {
    struct vmx_exit_idt_vec_info idt_vec_info;
    struct vmcs_interrupt_state intr_core_state;
    struct vmx_data * vmx_info = (struct vmx_data *)(info->vmm_data);

    check_vmcs_read(VMCS_IDT_VECTOR_INFO, &(idt_vec_info.value));
    check_vmcs_read(VMCS_GUEST_INT_STATE, &(intr_core_state));

    /* Check for pending exceptions to inject */
    if (v3_excp_pending(info)) {
        struct vmx_entry_int_info int_info;
        int_info.value = 0;

        // In VMX, almost every exception is hardware
        // Software exceptions are pretty much only for breakpoint or overflow
        int_info.type = 3;
        int_info.vector = v3_get_excp_number(info);

        if (info->excp_state.excp_error_code_valid) {
            check_vmcs_write(VMCS_ENTRY_EXCP_ERR, info->excp_state.excp_error_code);
            int_info.error_code = 1;

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
            V3_Print("Injecting exception %d with error code %x\n",
                    int_info.vector, info->excp_state.excp_error_code);
#endif
        }

        int_info.valid = 1;
#ifdef V3_CONFIG_DEBUG_INTERRUPTS
        V3_Print("Injecting exception %d (EIP=%p)\n", int_info.vector, (void *)(addr_t)info->rip);
#endif
        check_vmcs_write(VMCS_ENTRY_INT_INFO, int_info.value);

        v3_injecting_excp(info, int_info.vector);

    } else if ((((struct rflags *)&(info->ctrl_regs.rflags))->intr == 1) &&
               (intr_core_state.val == 0)) {

        if ((info->intr_core_state.irq_started == 1) && (idt_vec_info.valid == 1)) {

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
            V3_Print("IRQ pending from previous injection\n");
#endif

            // Copy the IDT vectoring info over to reinject the old interrupt
            if (idt_vec_info.error_code == 1) {
                uint32_t err_code = 0;

                check_vmcs_read(VMCS_IDT_VECTOR_ERR, &err_code);
                check_vmcs_write(VMCS_ENTRY_EXCP_ERR, err_code);
            }

            idt_vec_info.undef = 0;
            check_vmcs_write(VMCS_ENTRY_INT_INFO, idt_vec_info.value);

        } else {
            struct vmx_entry_int_info ent_int;
            ent_int.value = 0;

            switch (v3_intr_pending(info)) {
                case V3_EXTERNAL_IRQ: {
                    info->intr_core_state.irq_vector = v3_get_intr(info);
                    ent_int.vector = info->intr_core_state.irq_vector;
                    ent_int.type = 0;
                    ent_int.error_code = 0;
                    ent_int.valid = 1;

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
                    V3_Print("Injecting Interrupt %d at exit %u(EIP=%p)\n",
                               info->intr_core_state.irq_vector,
                               (uint32_t)info->num_exits,
                               (void *)(addr_t)info->rip);
#endif

                    check_vmcs_write(VMCS_ENTRY_INT_INFO, ent_int.value);
                    info->intr_core_state.irq_started = 1;

                    break;
                }
                case V3_NMI:
                    PrintDebug("Injecting NMI\n");

                    ent_int.type = 2;
                    ent_int.vector = 2;
                    ent_int.valid = 1;
                    check_vmcs_write(VMCS_ENTRY_INT_INFO, ent_int.value);

                    break;
                case V3_SOFTWARE_INTR:
                    PrintDebug("Injecting software interrupt\n");
                    ent_int.type = 4;

                    ent_int.valid = 1;
                    check_vmcs_write(VMCS_ENTRY_INT_INFO, ent_int.value);

                    break;
                case V3_VIRTUAL_IRQ:
                    // Not sure what to do here, Intel doesn't have virtual IRQs
                    // May be the same as external interrupts/IRQs

                    break;
                case V3_INVALID_INTR:
                default:
                    break;
            }
        }
    } else if ((v3_intr_pending(info)) && (vmx_info->pri_proc_ctrls.int_wndw_exit == 0)) {
        // Enable INTR window exiting so we know when IF=1
        uint32_t instr_len;

        check_vmcs_read(VMCS_EXIT_INSTR_LEN, &instr_len);

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
        V3_Print("Enabling Interrupt-Window exiting: %d\n", instr_len);
#endif

        vmx_info->pri_proc_ctrls.int_wndw_exit = 1;
        check_vmcs_write(VMCS_PROC_CTRLS, vmx_info->pri_proc_ctrls.value);
    }


    return 0;
}



static struct vmx_exit_info exit_log[10];
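/* Circular log of the 10 most recent exits, indexed by num_exits % 10 */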

static void print_exit_log(struct guest_info * info) {
    int cnt = info->num_exits % 10;
    int i = 0;


    V3_Print("\nExit Log (%d total exits):\n", (uint32_t)info->num_exits);

    for (i = 0; i < 10; i++) {
        struct vmx_exit_info * tmp = &exit_log[cnt];

        V3_Print("%d:\texit_reason = %p\n", i, (void *)(addr_t)tmp->exit_reason);
        V3_Print("\texit_qual = %p\n", (void *)tmp->exit_qual);
        V3_Print("\tint_info = %p\n", (void *)(addr_t)tmp->int_info);
        V3_Print("\tint_err = %p\n", (void *)(addr_t)tmp->int_err);
        V3_Print("\tinstr_info = %p\n", (void *)(addr_t)tmp->instr_info);

        cnt--;

        if (cnt == -1) {
            cnt = 9;
        }

    }

}

/*
 * CAUTION and DANGER!!!
 *
 * The VMCS CANNOT(!!) be accessed outside of the cli/sti calls inside this function.
 * When executing a symbiotic call, the VMCS WILL be overwritten, so any dependencies
 * on its contents will cause things to break. The contents at the time of the exit WILL
 * change before the exit handler is executed.
 */
int v3_vmx_enter(struct guest_info * info) {
    int ret = 0;
    uint32_t tsc_offset_low, tsc_offset_high;
    struct vmx_exit_info exit_info;
    struct vmx_data * vmx_info = (struct vmx_data *)(info->vmm_data);

    // Conditionally yield the CPU if the timeslice has expired
    v3_yield_cond(info);

    // Perform any additional yielding needed for time adjustment
    v3_adjust_time(info);

    // disable global interrupts for vm state transition
    v3_disable_ints();

    // Update timer devices late after being in the VM so that as much
    // of the time in the VM is accounted for as possible. Also do it before
    // updating IRQ entry state so that any interrupts the timers raise get
    // handled on the next VM entry. Must be done with interrupts disabled.
    v3_update_timers(info);

    if (vmcs_store() != vmx_info->vmcs_ptr_phys) {
        vmcs_load(vmx_info->vmcs_ptr_phys);
    }

    v3_vmx_restore_vmcs(info);


#ifdef V3_CONFIG_SYMCALL
    if (info->sym_core_state.symcall_state.sym_call_active == 0) {
        update_irq_entry_state(info);
    }
#else
    update_irq_entry_state(info);
#endif

    {
        addr_t guest_cr3;
        vmcs_read(VMCS_GUEST_CR3, &guest_cr3);
        vmcs_write(VMCS_GUEST_CR3, guest_cr3);
    }

    // Perform last-minute time bookkeeping prior to entering the VM
    v3_time_enter_vm(info);
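    /* The TSC offset is a 64-bit VMCS field, so it is written as two 32-bit
     * halves, the upper half through the field's high-access variant. */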
    tsc_offset_high = (uint32_t)((v3_tsc_host_offset(&info->time_state) >> 32) & 0xffffffff);
    tsc_offset_low = (uint32_t)(v3_tsc_host_offset(&info->time_state) & 0xffffffff);
    check_vmcs_write(VMCS_TSC_OFFSET_HIGH, tsc_offset_high);
    check_vmcs_write(VMCS_TSC_OFFSET, tsc_offset_low);

    if (v3_update_vmcs_host_state(info)) {
        v3_enable_ints();
        PrintError("Could not write host state\n");
        return -1;
    }


    if (vmx_info->state == VMX_UNLAUNCHED) {
        vmx_info->state = VMX_LAUNCHED;
        info->vm_info->run_state = VM_RUNNING;
        ret = v3_vmx_launch(&(info->vm_regs), info, &(info->ctrl_regs));
    } else {
        V3_ASSERT(vmx_info->state != VMX_UNLAUNCHED);
        ret = v3_vmx_resume(&(info->vm_regs), info, &(info->ctrl_regs));
    }

    //  PrintDebug("VMX Exit: ret=%d\n", ret);

    if (ret != VMX_SUCCESS) {
        uint32_t error = 0;

        vmcs_read(VMCS_INSTR_ERR, &error);

        v3_enable_ints();

        PrintError("VMENTRY Error: %d\n", error);
        return -1;
    }

    // Immediate exit from VM time bookkeeping
    v3_time_exit_vm(info);

    info->num_exits++;

    /* Update guest state */
    v3_vmx_save_vmcs(info);

    // info->cpl = info->segments.cs.selector & 0x3;

    info->mem_mode = v3_get_vm_mem_mode(info);
    info->cpu_mode = v3_get_vm_cpu_mode(info);


    check_vmcs_read(VMCS_EXIT_INSTR_LEN, &(exit_info.instr_len));
    check_vmcs_read(VMCS_EXIT_INSTR_INFO, &(exit_info.instr_info));
    check_vmcs_read(VMCS_EXIT_REASON, &(exit_info.exit_reason));
    check_vmcs_read(VMCS_EXIT_QUAL, &(exit_info.exit_qual));
    check_vmcs_read(VMCS_EXIT_INT_INFO, &(exit_info.int_info));
    check_vmcs_read(VMCS_EXIT_INT_ERR, &(exit_info.int_err));
    check_vmcs_read(VMCS_GUEST_LINEAR_ADDR, &(exit_info.guest_linear_addr));

    if (info->shdw_pg_mode == NESTED_PAGING) {
        check_vmcs_read(VMCS_GUEST_PHYS_ADDR, &(exit_info.ept_fault_addr));
    }

    //PrintDebug("VMX Exit taken, id-qual: %u-%lu\n", exit_info.exit_reason, exit_info.exit_qual);

    exit_log[info->num_exits % 10] = exit_info;

#ifdef V3_CONFIG_SYMCALL
    if (info->sym_core_state.symcall_state.sym_call_active == 0) {
        update_irq_exit_state(info);
    }
#else
    update_irq_exit_state(info);
#endif

    if (exit_info.exit_reason == VMEXIT_INTR_WINDOW) {
        // This is a special case whose only job is to inject an interrupt
        vmcs_read(VMCS_PROC_CTRLS, &(vmx_info->pri_proc_ctrls.value));
        vmx_info->pri_proc_ctrls.int_wndw_exit = 0;
        vmcs_write(VMCS_PROC_CTRLS, vmx_info->pri_proc_ctrls.value);

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
        V3_Print("Interrupts available again! (RIP=%llx)\n", info->rip);
#endif
    }

    // re-enable global interrupts after vm exit
    v3_enable_ints();

    // Conditionally yield the CPU if the timeslice has expired
    v3_yield_cond(info);

    if (v3_handle_vmx_exit(info, &exit_info) == -1) {
        PrintError("Error in VMX exit handler\n");
        return -1;
    }

    return 0;
}


int v3_start_vmx_guest(struct guest_info * info) {

    PrintDebug("Starting VMX core %u\n", info->vcpu_id);

    if (info->vcpu_id == 0) {
        info->core_run_state = CORE_RUNNING;
        info->vm_info->run_state = VM_RUNNING;
    } else {

        PrintDebug("VMX core %u: Waiting for core initialization\n", info->vcpu_id);

        while (info->core_run_state == CORE_STOPPED) {
            v3_yield(info);
            //PrintDebug("VMX core %u: still waiting for INIT\n",info->vcpu_id);
        }

        PrintDebug("VMX core %u initialized\n", info->vcpu_id);
    }


    PrintDebug("VMX core %u: I am starting at CS=0x%x (base=0x%p, limit=0x%x),  RIP=0x%p\n",
               info->vcpu_id, info->segments.cs.selector, (void *)(info->segments.cs.base),
               info->segments.cs.limit, (void *)(info->rip));


    PrintDebug("VMX core %u: Launching VMX VM on logical core %u\n", info->vcpu_id, info->pcpu_id);

    v3_start_time(info);

    while (1) {

        if (info->vm_info->run_state == VM_STOPPED) {
            info->core_run_state = CORE_STOPPED;
            break;
        }

        if (v3_vmx_enter(info) == -1) {
            v3_print_vmcs();
            print_exit_log(info);
            return -1;
        }



        if (info->vm_info->run_state == VM_STOPPED) {
            info->core_run_state = CORE_STOPPED;
            break;
        }
/*
        if ((info->num_exits % 5000) == 0) {
            V3_Print("VMX Exit number %d\n", (uint32_t)info->num_exits);
        }
*/

    }

    return 0;
}




#define VMX_FEATURE_CONTROL_MSR     0x0000003a
#define CPUID_VMX_FEATURES 0x00000005  /* LOCK and VMXON */
#define CPUID_1_ECX_VTXFLAG 0x00000020

int v3_is_vmx_capable() {
    v3_msr_t feature_msr;
    uint32_t eax = 0, ebx = 0, ecx = 0, edx = 0;

    v3_cpuid(0x1, &eax, &ebx, &ecx, &edx);

    PrintDebug("ECX: 0x%x\n", ecx);

    if (ecx & CPUID_1_ECX_VTXFLAG) {
        v3_get_msr(VMX_FEATURE_CONTROL_MSR, &(feature_msr.hi), &(feature_msr.lo));

        PrintDebug("MSRREGlow: 0x%.8x\n", feature_msr.lo);

        if ((feature_msr.lo & CPUID_VMX_FEATURES) != CPUID_VMX_FEATURES) {
            PrintDebug("VMX is locked -- enable in the BIOS\n");
            return 0;
        }

    } else {
        PrintDebug("VMX not supported on this cpu\n");
        return 0;
    }

    return 1;
}


int v3_reset_vmx_vm_core(struct guest_info * core, addr_t rip) {
    // init vmcs bios

    if ((core->shdw_pg_mode == NESTED_PAGING) &&
        (v3_cpu_types[core->pcpu_id] == V3_VMX_EPT_UG_CPU)) {
        // easy
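        /* The rip argument carries the SIPI vector here: per the x86
         * INIT/SIPI startup convention, the core begins with
         * CS.selector = vector << 8, CS.base = vector << 12, and IP = 0. */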
        core->rip = 0;
        core->segments.cs.selector = rip << 8;
        core->segments.cs.limit = 0xffff;
        core->segments.cs.base = rip << 12;
    } else {
        core->vm_regs.rdx = core->vcpu_id;
        core->vm_regs.rbx = rip;
    }

    return 0;
}



void v3_init_vmx_cpu(int cpu_id) {

    if (cpu_id == 0) {
        if (v3_init_vmx_hw(&hw_info) == -1) {
            PrintError("Could not initialize VMX hardware features on cpu %d\n", cpu_id);
            return;
        }
    }

    enable_vmx();


    // Setup VMXON Region
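    // A VMXON region carries the same revision-id header as a VMCS,
    // so allocate_vmcs() is reused to allocate it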
    host_vmcs_ptrs[cpu_id] = allocate_vmcs();

    PrintDebug("VMXON pointer: 0x%p\n", (void *)host_vmcs_ptrs[cpu_id]);

    if (vmx_on(host_vmcs_ptrs[cpu_id]) == VMX_SUCCESS) {
        V3_Print("VMX Enabled\n");
    } else {
        PrintError("VMX initialization failure\n");
        return;
    }


    {
        struct vmx_sec_proc_ctrls sec_proc_ctrls;
        sec_proc_ctrls.value = v3_vmx_get_ctrl_features(&(hw_info.sec_proc_ctrls));

        if (sec_proc_ctrls.enable_ept == 0) {
            V3_Print("VMX EPT (Nested) Paging not supported\n");
            v3_cpu_types[cpu_id] = V3_VMX_CPU;
        } else if (sec_proc_ctrls.unrstrct_guest == 0) {
            V3_Print("VMX EPT (Nested) Paging supported\n");
            v3_cpu_types[cpu_id] = V3_VMX_EPT_CPU;
        } else {
            V3_Print("VMX EPT (Nested) Paging + Unrestricted guest supported\n");
            v3_cpu_types[cpu_id] = V3_VMX_EPT_UG_CPU;
        }
    }
}


void v3_deinit_vmx_cpu(int cpu_id) {
    extern v3_cpu_arch_t v3_cpu_types[];
    v3_cpu_types[cpu_id] = V3_INVALID_CPU;
    V3_FreePages((void *)host_vmcs_ptrs[cpu_id], 1);
}