Palacios Public Git Repository

To check out Palacios, execute

  git clone http://v3vee.org/palacios/palacios.web/palacios.git

This will give you the master branch. You probably want the devel branch or one of the release branches. To switch to the devel branch, execute

  cd palacios
  git checkout --track -b devel origin/devel

The other branches work the same way.
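
For example, assuming a hypothetical release branch named Release-1.2 exists (list the actual remote branch names with "git branch -r"), you would run

  cd palacios
  git checkout --track -b Release-1.2 origin/Release-1.2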


Commit: added barrier sync point to vmx
File: palacios/src/palacios/vmx.c

/* 
 * This file is part of the Palacios Virtual Machine Monitor developed
 * by the V3VEE Project with funding from the United States National 
 * Science Foundation and the Department of Energy.  
 *
 * The V3VEE Project is a joint project between Northwestern University
 * and the University of New Mexico.  You can find out more at 
 * http://www.v3vee.org
 *
 * Copyright (c) 2011, Jack Lange <jarusl@cs.northwestern.edu>
 * Copyright (c) 2011, The V3VEE Project <http://www.v3vee.org> 
 * All rights reserved.
 *
 * Author: Jack Lange <jarusl@cs.northwestern.edu>
 *
 * This is free software.  You are permitted to use,
 * redistribute, and modify it as specified in the file "V3VEE_LICENSE".
 */


#include <palacios/vmx.h>
#include <palacios/vmm.h>
#include <palacios/vmx_handler.h>
#include <palacios/vmcs.h>
#include <palacios/vmx_lowlevel.h>
#include <palacios/vmm_lowlevel.h>
#include <palacios/vmm_ctrl_regs.h>
#include <palacios/vmm_config.h>
#include <palacios/vmm_time.h>
#include <palacios/vm_guest_mem.h>
#include <palacios/vmm_direct_paging.h>
#include <palacios/vmx_io.h>
#include <palacios/vmx_msr.h>
#include <palacios/vmm_decoder.h>
#include <palacios/vmm_barrier.h>

#include <palacios/vmx_ept.h>
#include <palacios/vmx_assist.h>
#include <palacios/vmx_hw_info.h>

#ifndef V3_CONFIG_DEBUG_VMX
#undef PrintDebug
#define PrintDebug(fmt, args...)
#endif


/* These fields contain the hardware feature sets supported by the local CPU */
static struct vmx_hw_info hw_info;

extern v3_cpu_arch_t v3_cpu_types[];

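/* Per-CPU physical addresses of the VMXON regions; a zero entry means VMX has
 * not yet been enabled on that CPU (see v3_init_vmx_cpu()). */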
static addr_t host_vmcs_ptrs[V3_CONFIG_MAX_CPUS] = { [0 ... V3_CONFIG_MAX_CPUS - 1] = 0};

extern int v3_vmx_launch(struct v3_gprs * vm_regs, struct guest_info * info, struct v3_ctrl_regs * ctrl_regs);
extern int v3_vmx_resume(struct v3_gprs * vm_regs, struct guest_info * info, struct v3_ctrl_regs * ctrl_regs);

static inline int check_vmcs_write(vmcs_field_t field, addr_t val) {
    int ret = 0;

    ret = vmcs_write(field, val);

    if (ret != VMX_SUCCESS) {
        PrintError("VMWRITE error on %s!: %d\n", v3_vmcs_field_to_str(field), ret);
        return 1;
    }

    return 0;
}

static inline int check_vmcs_read(vmcs_field_t field, void * val) {
    int ret = 0;

    ret = vmcs_read(field, val);

    if (ret != VMX_SUCCESS) {
        PrintError("VMREAD error on %s!: %d\n", v3_vmcs_field_to_str(field), ret);
    }

    return ret;
}




static addr_t allocate_vmcs() {
    struct vmcs_data * vmcs_page = NULL;

    PrintDebug("Allocating page\n");

    vmcs_page = (struct vmcs_data *)V3_VAddr(V3_AllocPages(1));
    memset(vmcs_page, 0, 4096);

    vmcs_page->revision = hw_info.basic_info.revision;
    PrintDebug("VMX Revision: 0x%x\n", vmcs_page->revision);

    return (addr_t)V3_PAddr((void *)vmcs_page);
}




static int init_vmcs_bios(struct guest_info * core, struct vmx_data * vmx_state) {
    int vmx_ret = 0;

    // disable global interrupts for vm state initialization
    v3_disable_ints();

    PrintDebug("Loading VMCS\n");
    vmx_ret = vmcs_load(vmx_state->vmcs_ptr_phys);
    vmx_state->state = VMX_UNLAUNCHED;

    if (vmx_ret != VMX_SUCCESS) {
        PrintError("VMPTRLD failed\n");
        return -1;
    }


    /*** Setup default state from HW ***/

    vmx_state->pin_ctrls.value = hw_info.pin_ctrls.def_val;
    vmx_state->pri_proc_ctrls.value = hw_info.proc_ctrls.def_val;
    vmx_state->exit_ctrls.value = hw_info.exit_ctrls.def_val;
    vmx_state->entry_ctrls.value = hw_info.entry_ctrls.def_val;
    vmx_state->sec_proc_ctrls.value = hw_info.sec_proc_ctrls.def_val;

    /* Print Control MSRs */
    PrintDebug("CR0 MSR: %p\n", (void *)(addr_t)hw_info.cr0.value);
    PrintDebug("CR4 MSR: %p\n", (void *)(addr_t)hw_info.cr4.value);



    /******* Setup Host State **********/

    /* Cache GDTR, IDTR, and TR in host struct */
    addr_t gdtr_base;
    struct {
        uint16_t selector;
        addr_t   base;
    } __attribute__((packed)) tmp_seg;
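    /* Note: tmp_seg is reused for all three instructions below. SGDT and SIDT
     * store a pseudo-descriptor (16-bit limit followed by the base), so the
     * 'selector' field receives the table limit; STR stores only the 16-bit
     * task register selector. */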


    __asm__ __volatile__(
                         "sgdt (%0);"
                         :
                         : "q"(&tmp_seg)
                         : "memory"
                         );
    gdtr_base = tmp_seg.base;
    vmx_state->host_state.gdtr.base = gdtr_base;

    __asm__ __volatile__(
                         "sidt (%0);"
                         :
                         : "q"(&tmp_seg)
                         : "memory"
                         );
    vmx_state->host_state.idtr.base = tmp_seg.base;

    __asm__ __volatile__(
                         "str (%0);"
                         :
                         : "q"(&tmp_seg)
                         : "memory"
                         );
    vmx_state->host_state.tr.selector = tmp_seg.selector;

    /* The GDT index is bits 3-15 of the selector. */
    struct tss_descriptor * desc = NULL;
    desc = (struct tss_descriptor *)(gdtr_base + (8 * (tmp_seg.selector >> 3)));

    tmp_seg.base = ((desc->base1) |
                    (desc->base2 << 16) |
                    (desc->base3 << 24) |
#ifdef __V3_64BIT__
                    ((uint64_t)desc->base4 << 32)
#else 
                    (0)
#endif
                    );

    vmx_state->host_state.tr.base = tmp_seg.base;


    /********** Setup VMX Control Fields ***********/

    /* Add external interrupts, NMI exiting, and virtual NMI */
    vmx_state->pin_ctrls.nmi_exit = 1;
    vmx_state->pin_ctrls.ext_int_exit = 1;


    vmx_state->pri_proc_ctrls.hlt_exit = 1;


    vmx_state->pri_proc_ctrls.pause_exit = 0;
    vmx_state->pri_proc_ctrls.tsc_offset = 1;
#ifdef V3_CONFIG_TIME_VIRTUALIZE_TSC
    vmx_state->pri_proc_ctrls.rdtsc_exit = 1;
#endif

    /* Setup IO map */
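    /* The I/O map is two contiguous 4KB bitmap pages: page A covers ports
     * 0x0000-0x7fff and page B covers 0x8000-0xffff; a set bit triggers a
     * VM exit on access to that port. */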
    vmx_state->pri_proc_ctrls.use_io_bitmap = 1;
    vmx_ret |= check_vmcs_write(VMCS_IO_BITMAP_A_ADDR, (addr_t)V3_PAddr(core->vm_info->io_map.arch_data));
    vmx_ret |= check_vmcs_write(VMCS_IO_BITMAP_B_ADDR, 
            (addr_t)V3_PAddr(core->vm_info->io_map.arch_data) + PAGE_SIZE_4KB);


    vmx_state->pri_proc_ctrls.use_msr_bitmap = 1;
    vmx_ret |= check_vmcs_write(VMCS_MSR_BITMAP, (addr_t)V3_PAddr(core->vm_info->msr_map.arch_data));





#ifdef __V3_64BIT__
    // Ensure host runs in 64-bit mode at each VM EXIT
    vmx_state->exit_ctrls.host_64_on = 1;
#endif

    // Hook all accesses to EFER register
    v3_hook_msr(core->vm_info, EFER_MSR, 
                &v3_handle_efer_read,
                &v3_handle_efer_write, 
                core);

    // Restore host's EFER register on each VM EXIT
    vmx_state->exit_ctrls.ld_efer = 1;

    // Save/restore guest's EFER register to/from VMCS on VM EXIT/ENTRY
    vmx_state->exit_ctrls.save_efer = 1;
    vmx_state->entry_ctrls.ld_efer  = 1;

    // Cause VM_EXIT whenever CR4.VMXE or CR4.PAE bits are written
    vmx_ret |= check_vmcs_write(VMCS_CR4_MASK, CR4_VMXE | CR4_PAE);


    /* Setup paging */
    if (core->shdw_pg_mode == SHADOW_PAGING) {
        PrintDebug("Creating initial shadow page table\n");

        if (v3_init_passthrough_pts(core) == -1) {
            PrintError("Could not initialize passthrough page tables\n");
            return -1;
        }
        
#define CR0_PE 0x00000001
#define CR0_PG 0x80000000
#define CR0_WP 0x00010000 // To ensure mem hooks work
        vmx_ret |= check_vmcs_write(VMCS_CR0_MASK, (CR0_PE | CR0_PG | CR0_WP));

        core->ctrl_regs.cr3 = core->direct_map_pt;

        // vmx_state->pinbased_ctrls |= NMI_EXIT;

        /* Add CR exits */
        vmx_state->pri_proc_ctrls.cr3_ld_exit = 1;
        vmx_state->pri_proc_ctrls.cr3_str_exit = 1;
        
        vmx_state->pri_proc_ctrls.invlpg_exit = 1;
        
        /* Add page fault exits */
        vmx_state->excp_bmap.pf = 1;

        // Setup VMX Assist
        v3_vmxassist_init(core, vmx_state);

    } else if ((core->shdw_pg_mode == NESTED_PAGING) && 
               (v3_cpu_types[core->pcpu_id] == V3_VMX_EPT_CPU)) {

#define CR0_PE 0x00000001
#define CR0_PG 0x80000000
#define CR0_WP 0x00010000 // To ensure mem hooks work
        vmx_ret |= check_vmcs_write(VMCS_CR0_MASK, (CR0_PE | CR0_PG | CR0_WP));

        // vmx_state->pinbased_ctrls |= NMI_EXIT;

        /* Disable CR exits */
        vmx_state->pri_proc_ctrls.cr3_ld_exit = 0;
        vmx_state->pri_proc_ctrls.cr3_str_exit = 0;

        vmx_state->pri_proc_ctrls.invlpg_exit = 0;

        /* Add page fault exits */
        //      vmx_state->excp_bmap.pf = 1; // This should never happen..., enabled to catch bugs
        
        // Setup VMX Assist
        v3_vmxassist_init(core, vmx_state);

        /* Enable EPT */
        vmx_state->pri_proc_ctrls.sec_ctrls = 1; // Enable secondary proc controls
        vmx_state->sec_proc_ctrls.enable_ept = 1; // enable EPT paging



        if (v3_init_ept(core, &hw_info) == -1) {
            PrintError("Error initializing EPT\n");
            return -1;
        }

    } else if ((core->shdw_pg_mode == NESTED_PAGING) && 
               (v3_cpu_types[core->pcpu_id] == V3_VMX_EPT_UG_CPU)) {
        int i = 0;
        // For now we will assume that unrestricted guest mode is assured w/ EPT


        core->vm_regs.rsp = 0x00;
        core->rip = 0xfff0;
        core->vm_regs.rdx = 0x00000f00;
        core->ctrl_regs.rflags = 0x00000002; // The reserved bit is always 1
        core->ctrl_regs.cr0 = 0x60010010; // Set the WP flag so the memory hooks work in real-mode
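        // (0x60010010 = CD | NW | WP | ET: caches disabled, write protection
        // enforced, and the FPU extension-type bit set)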


        core->segments.cs.selector = 0xf000;
        core->segments.cs.limit = 0xffff;
        core->segments.cs.base = 0x0000000f0000LL;

        // (raw attributes = 0xf3)
        core->segments.cs.type = 0xb;
        core->segments.cs.system = 0x1;
        core->segments.cs.dpl = 0x0;
        core->segments.cs.present = 1;



        struct v3_segment * segregs [] = {&(core->segments.ss), &(core->segments.ds), 
                                          &(core->segments.es), &(core->segments.fs), 
                                          &(core->segments.gs), NULL};

        for ( i = 0; segregs[i] != NULL; i++) {
            struct v3_segment * seg = segregs[i];
        
            seg->selector = 0x0000;
            //    seg->base = seg->selector << 4;
            seg->base = 0x00000000;
            seg->limit = 0xffff;


            seg->type = 0x3;
            seg->system = 0x1;
            seg->dpl = 0x0;
            seg->present = 1;
            //    seg->granularity = 1;

        }


        core->segments.gdtr.limit = 0x0000ffff;
        core->segments.gdtr.base = 0x0000000000000000LL;

        core->segments.idtr.limit = 0x0000ffff;
        core->segments.idtr.base = 0x0000000000000000LL;

        core->segments.ldtr.selector = 0x0000;
        core->segments.ldtr.limit = 0x0000ffff;
        core->segments.ldtr.base = 0x0000000000000000LL;
        core->segments.ldtr.type = 2;
        core->segments.ldtr.present = 1;

        core->segments.tr.selector = 0x0000;
        core->segments.tr.limit = 0x0000ffff;
        core->segments.tr.base = 0x0000000000000000LL;
        core->segments.tr.type = 0xb;
        core->segments.tr.present = 1;

        //      core->dbg_regs.dr6 = 0x00000000ffff0ff0LL;
        core->dbg_regs.dr7 = 0x0000000000000400LL;

        /* Enable EPT */
        vmx_state->pri_proc_ctrls.sec_ctrls = 1; // Enable secondary proc controls
        vmx_state->sec_proc_ctrls.enable_ept = 1; // enable EPT paging
        vmx_state->sec_proc_ctrls.unrstrct_guest = 1; // enable unrestricted guest operation


        /* Disable shadow paging stuff */
        vmx_state->pri_proc_ctrls.cr3_ld_exit = 0;
        vmx_state->pri_proc_ctrls.cr3_str_exit = 0;

        vmx_state->pri_proc_ctrls.invlpg_exit = 0;


        if (v3_init_ept(core, &hw_info) == -1) {
            PrintError("Error initializing EPT\n");
            return -1;
        }

    } else {
        PrintError("Invalid Virtual paging mode\n");
        return -1;
    }


    // hook vmx msrs

    // Setup SYSCALL/SYSENTER MSRs in load/store area
    
    // save STAR, LSTAR, FMASK, KERNEL_GS_BASE MSRs in MSR load/store area
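    // These MSRs hold the host's SYSCALL entry points and kernel GS base.
    // VMX does not context-switch them automatically: the guest's values are
    // stored to the exit-store area on each VM exit, the host's values are
    // reloaded from the exit-load area, and the guest's values are restored
    // from the entry-load area on each VM entry.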
    {
#define IA32_STAR 0xc0000081
#define IA32_LSTAR 0xc0000082
#define IA32_FMASK 0xc0000084
#define IA32_KERN_GS_BASE 0xc0000102

#define IA32_CSTAR 0xc0000083 // Compatibility mode STAR (ignored for now... hopefully it's not that important...)

        int msr_ret = 0;

        struct vmcs_msr_entry * exit_store_msrs = NULL;
        struct vmcs_msr_entry * exit_load_msrs = NULL;
        struct vmcs_msr_entry * entry_load_msrs = NULL;
        int max_msrs = (hw_info.misc_info.max_msr_cache_size + 1) * 4;

        V3_Print("Setting up MSR load/store areas (max_msr_count=%d)\n", max_msrs);

        if (max_msrs < 4) {
            PrintError("Max MSR cache size is too small (%d)\n", max_msrs);
            return -1;
        }

        vmx_state->msr_area = V3_VAddr(V3_AllocPages(1));

        if (vmx_state->msr_area == NULL) {
            PrintError("could not allocate msr load/store area\n");
            return -1;
        }

        msr_ret |= check_vmcs_write(VMCS_EXIT_MSR_STORE_CNT, 4);
        msr_ret |= check_vmcs_write(VMCS_EXIT_MSR_LOAD_CNT, 4);
        msr_ret |= check_vmcs_write(VMCS_ENTRY_MSR_LOAD_CNT, 4);
        
        
        exit_store_msrs = (struct vmcs_msr_entry *)(vmx_state->msr_area);
        exit_load_msrs = (struct vmcs_msr_entry *)(vmx_state->msr_area + (sizeof(struct vmcs_msr_entry) * 4));
        entry_load_msrs = (struct vmcs_msr_entry *)(vmx_state->msr_area + (sizeof(struct vmcs_msr_entry) * 8));


        exit_store_msrs[0].index = IA32_STAR;
        exit_store_msrs[1].index = IA32_LSTAR;
        exit_store_msrs[2].index = IA32_FMASK;
        exit_store_msrs[3].index = IA32_KERN_GS_BASE;

        // Propagate the MSR indices to the exit-load and entry-load areas
        // (the destination is the first memcpy argument)
        memcpy(exit_load_msrs, exit_store_msrs, sizeof(struct vmcs_msr_entry) * 4);
        memcpy(entry_load_msrs, exit_store_msrs, sizeof(struct vmcs_msr_entry) * 4);

        
        v3_get_msr(IA32_STAR, &(exit_load_msrs[0].hi), &(exit_load_msrs[0].lo));
        v3_get_msr(IA32_LSTAR, &(exit_load_msrs[1].hi), &(exit_load_msrs[1].lo));
        v3_get_msr(IA32_FMASK, &(exit_load_msrs[2].hi), &(exit_load_msrs[2].lo));
        v3_get_msr(IA32_KERN_GS_BASE, &(exit_load_msrs[3].hi), &(exit_load_msrs[3].lo));

        msr_ret |= check_vmcs_write(VMCS_EXIT_MSR_STORE_ADDR, (addr_t)V3_PAddr(exit_store_msrs));
        msr_ret |= check_vmcs_write(VMCS_EXIT_MSR_LOAD_ADDR, (addr_t)V3_PAddr(exit_load_msrs));
        msr_ret |= check_vmcs_write(VMCS_ENTRY_MSR_LOAD_ADDR, (addr_t)V3_PAddr(entry_load_msrs));

    }    

    /* Sanity check ctrl/reg fields against hw_defaults */




    /*** Write all the info to the VMCS ***/
  
    /*
    {
        // IS THIS NECESSARY???
#define DEBUGCTL_MSR 0x1d9
        struct v3_msr tmp_msr;
        v3_get_msr(DEBUGCTL_MSR, &(tmp_msr.hi), &(tmp_msr.lo));
        vmx_ret |= check_vmcs_write(VMCS_GUEST_DBG_CTL, tmp_msr.value);
        core->dbg_regs.dr7 = 0x400;
    }
    */

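    // The VMCS link pointer must be set to all 1s when VMCS shadowing is not in use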
#ifdef __V3_64BIT__
    vmx_ret |= check_vmcs_write(VMCS_LINK_PTR, (addr_t)0xffffffffffffffffULL);
#else
    vmx_ret |= check_vmcs_write(VMCS_LINK_PTR, (addr_t)0xffffffffUL);
    vmx_ret |= check_vmcs_write(VMCS_LINK_PTR_HIGH, (addr_t)0xffffffffUL);
#endif




    if (v3_update_vmcs_ctrl_fields(core)) {
        PrintError("Could not write control fields!\n");
        return -1;
    }
    
    if (v3_update_vmcs_host_state(core)) {
        PrintError("Could not write host state\n");
        return -1;
    }

    // reenable global interrupts for vm state initialization now
    // that the vm state is initialized. If another VM kicks us off, 
    // it'll update our vmx state so that we know to reload ourself
    v3_enable_ints();

    return 0;
}

int v3_init_vmx_vmcs(struct guest_info * core, v3_vm_class_t vm_class) {
    struct vmx_data * vmx_state = NULL;
    int vmx_ret = 0;
    
    vmx_state = (struct vmx_data *)V3_Malloc(sizeof(struct vmx_data));
    memset(vmx_state, 0, sizeof(struct vmx_data));

    PrintDebug("vmx_data pointer: %p\n", (void *)vmx_state);

    PrintDebug("Allocating VMCS\n");
    vmx_state->vmcs_ptr_phys = allocate_vmcs();

    PrintDebug("VMCS pointer: %p\n", (void *)(vmx_state->vmcs_ptr_phys));

    core->vmm_data = vmx_state;
    vmx_state->state = VMX_UNLAUNCHED;

    PrintDebug("Initializing VMCS (addr=%p)\n", core->vmm_data);
    
    // TODO: Fix vmcs fields so they're 32-bit

    PrintDebug("Clearing VMCS: %p\n", (void *)vmx_state->vmcs_ptr_phys);
    vmx_ret = vmcs_clear(vmx_state->vmcs_ptr_phys);

    if (vmx_ret != VMX_SUCCESS) {
        PrintError("VMCLEAR failed\n");
        return -1; 
    }

    if (vm_class == V3_PC_VM) {
        PrintDebug("Initializing VMCS\n");
        if (init_vmcs_bios(core, vmx_state) == -1) {
            PrintError("Error initializing VMCS to BIOS state\n");
            return -1;
        }
    } else {
        PrintError("Invalid VM Class\n");
        return -1;
    }

    PrintDebug("Serializing VMCS: %p\n", (void *)vmx_state->vmcs_ptr_phys);
    vmx_ret = vmcs_clear(vmx_state->vmcs_ptr_phys);

    return 0;
}


int v3_deinit_vmx_vmcs(struct guest_info * core) {
    struct vmx_data * vmx_state = core->vmm_data;

    V3_FreePages((void *)(vmx_state->vmcs_ptr_phys), 1);
    V3_FreePages(V3_PAddr(vmx_state->msr_area), 1);

    V3_Free(vmx_state);

    return 0;
}


static int update_irq_exit_state(struct guest_info * info) {
    struct vmx_exit_idt_vec_info idt_vec_info;

    check_vmcs_read(VMCS_IDT_VECTOR_INFO, &(idt_vec_info.value));

    if ((info->intr_core_state.irq_started == 1) && (idt_vec_info.valid == 0)) {
#ifdef V3_CONFIG_DEBUG_INTERRUPTS
        V3_Print("Calling v3_injecting_intr\n");
#endif
        info->intr_core_state.irq_started = 0;
        v3_injecting_intr(info, info->intr_core_state.irq_vector, V3_EXTERNAL_IRQ);
    }

    return 0;
}

static int update_irq_entry_state(struct guest_info * info) {
    struct vmx_exit_idt_vec_info idt_vec_info;
    struct vmcs_interrupt_state intr_core_state;
    struct vmx_data * vmx_info = (struct vmx_data *)(info->vmm_data);

    check_vmcs_read(VMCS_IDT_VECTOR_INFO, &(idt_vec_info.value));
    check_vmcs_read(VMCS_GUEST_INT_STATE, &(intr_core_state));

    /* Check for pending exceptions to inject */
    if (v3_excp_pending(info)) {
        struct vmx_entry_int_info int_info;
        int_info.value = 0;

        // In VMX, almost every exception is hardware
        // Software exceptions are pretty much only for breakpoint or overflow
        int_info.type = 3;
        int_info.vector = v3_get_excp_number(info);

        if (info->excp_state.excp_error_code_valid) {
            check_vmcs_write(VMCS_ENTRY_EXCP_ERR, info->excp_state.excp_error_code);
            int_info.error_code = 1;

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
            V3_Print("Injecting exception %d with error code %x\n", 
                    int_info.vector, info->excp_state.excp_error_code);
#endif
        }

        int_info.valid = 1;
#ifdef V3_CONFIG_DEBUG_INTERRUPTS
        V3_Print("Injecting exception %d (EIP=%p)\n", int_info.vector, (void *)(addr_t)info->rip);
#endif
        check_vmcs_write(VMCS_ENTRY_INT_INFO, int_info.value);

        v3_injecting_excp(info, int_info.vector);

    } else if ((((struct rflags *)&(info->ctrl_regs.rflags))->intr == 1) && 
               (intr_core_state.val == 0)) {
       
        if ((info->intr_core_state.irq_started == 1) && (idt_vec_info.valid == 1)) {

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
            V3_Print("IRQ pending from previous injection\n");
#endif

            // Copy the IDT vectoring info over to reinject the old interrupt
            if (idt_vec_info.error_code == 1) {
                uint32_t err_code = 0;

                check_vmcs_read(VMCS_IDT_VECTOR_ERR, &err_code);
                check_vmcs_write(VMCS_ENTRY_EXCP_ERR, err_code);
            }

            idt_vec_info.undef = 0;
            check_vmcs_write(VMCS_ENTRY_INT_INFO, idt_vec_info.value);

        } else {
            struct vmx_entry_int_info ent_int;
            ent_int.value = 0;

            switch (v3_intr_pending(info)) {
                case V3_EXTERNAL_IRQ: {
                    info->intr_core_state.irq_vector = v3_get_intr(info); 
                    ent_int.vector = info->intr_core_state.irq_vector;
                    ent_int.type = 0;
                    ent_int.error_code = 0;
                    ent_int.valid = 1;

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
                    V3_Print("Injecting Interrupt %d at exit %u(EIP=%p)\n", 
                               info->intr_core_state.irq_vector, 
                               (uint32_t)info->num_exits, 
                               (void *)(addr_t)info->rip);
#endif

                    check_vmcs_write(VMCS_ENTRY_INT_INFO, ent_int.value);
                    info->intr_core_state.irq_started = 1;

                    break;
                }
                case V3_NMI:
                    PrintDebug("Injecting NMI\n");

                    ent_int.type = 2;
                    ent_int.vector = 2;
                    ent_int.valid = 1;
                    check_vmcs_write(VMCS_ENTRY_INT_INFO, ent_int.value);

                    break;
                case V3_SOFTWARE_INTR:
                    PrintDebug("Injecting software interrupt\n");
                    ent_int.type = 4;

                    ent_int.valid = 1;
                    check_vmcs_write(VMCS_ENTRY_INT_INFO, ent_int.value);

                    break;
                case V3_VIRTUAL_IRQ:
                    // Not sure what to do here, Intel doesn't have virtual IRQs
                    // May be the same as external interrupts/IRQs

                    break;
                case V3_INVALID_INTR:
                default:
                    break;
            }
        }
    } else if ((v3_intr_pending(info)) && (vmx_info->pri_proc_ctrls.int_wndw_exit == 0)) {
        // Enable INTR window exiting so we know when IF=1
        uint32_t instr_len;

        check_vmcs_read(VMCS_EXIT_INSTR_LEN, &instr_len);

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
        V3_Print("Enabling Interrupt-Window exiting: %d\n", instr_len);
#endif

        vmx_info->pri_proc_ctrls.int_wndw_exit = 1;
        check_vmcs_write(VMCS_PROC_CTRLS, vmx_info->pri_proc_ctrls.value);
    }


    return 0;
}



static struct vmx_exit_info exit_log[10];

static void print_exit_log(struct guest_info * info) {
    int cnt = info->num_exits % 10;
    int i = 0;
    

    V3_Print("\nExit Log (%d total exits):\n", (uint32_t)info->num_exits);

    for (i = 0; i < 10; i++) {
        struct vmx_exit_info * tmp = &exit_log[cnt];

        V3_Print("%d:\texit_reason = %p\n", i, (void *)(addr_t)tmp->exit_reason);
        V3_Print("\texit_qual = %p\n", (void *)tmp->exit_qual);
        V3_Print("\tint_info = %p\n", (void *)(addr_t)tmp->int_info);
        V3_Print("\tint_err = %p\n", (void *)(addr_t)tmp->int_err);
        V3_Print("\tinstr_info = %p\n", (void *)(addr_t)tmp->instr_info);

        cnt--;

        if (cnt == -1) {
            cnt = 9;
        }

    }

}

/* 
 * CAUTION and DANGER!!! 
 * 
 * The VMCS CANNOT(!!) be accessed outside of the cli/sti calls inside this function
 * When executing a symbiotic call, the VMCS WILL be overwritten, so any dependencies 
 * on its contents will cause things to break. The contents at the time of the exit WILL 
 * change before the exit handler is executed.
 */
int v3_vmx_enter(struct guest_info * info) {
    int ret = 0;
    uint32_t tsc_offset_low, tsc_offset_high;
    struct vmx_exit_info exit_info;
    struct vmx_data * vmx_info = (struct vmx_data *)(info->vmm_data);

    // Conditionally yield the CPU if the timeslice has expired
    v3_yield_cond(info);

    // Perform any additional yielding needed for time adjustment
    v3_adjust_time(info);

    // disable global interrupts for vm state transition
    v3_disable_ints();

    // Update timer devices late after being in the VM so that as much 
    // of the time in the VM is accounted for as possible. Also do it before
    // updating IRQ entry state so that any interrupts the timers raise get 
    // handled on the next VM entry. Must be done with interrupts disabled.
    v3_update_timers(info);

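    // If this physical CPU's current-VMCS pointer no longer refers to our
    // VMCS (e.g. another VM ran here, or this core migrated), reload it and
    // force a fresh VMLAUNCH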
    if (vmcs_store() != vmx_info->vmcs_ptr_phys) {
        vmcs_clear(vmx_info->vmcs_ptr_phys);
        vmcs_load(vmx_info->vmcs_ptr_phys);
        vmx_info->state = VMX_UNLAUNCHED;
    }

    v3_vmx_restore_vmcs(info);


#ifdef V3_CONFIG_SYMCALL
    if (info->sym_core_state.symcall_state.sym_call_active == 0) {
        update_irq_entry_state(info);
    }
#else 
    update_irq_entry_state(info);
#endif

    {
        addr_t guest_cr3;
        vmcs_read(VMCS_GUEST_CR3, &guest_cr3);
        vmcs_write(VMCS_GUEST_CR3, guest_cr3);
    }

    // Perform last-minute time bookkeeping prior to entering the VM
    v3_time_enter_vm(info);

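    // The 64-bit TSC offset is written as two 32-bit VMCS fields (low and high words)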
    tsc_offset_high = (uint32_t)((v3_tsc_host_offset(&info->time_state) >> 32) & 0xffffffff);
    tsc_offset_low = (uint32_t)(v3_tsc_host_offset(&info->time_state) & 0xffffffff);
    check_vmcs_write(VMCS_TSC_OFFSET_HIGH, tsc_offset_high);
    check_vmcs_write(VMCS_TSC_OFFSET, tsc_offset_low);


    if (v3_update_vmcs_host_state(info)) {
        v3_enable_ints();
        PrintError("Could not write host state\n");
        return -1;
    }


    if (vmx_info->state == VMX_UNLAUNCHED) {
        vmx_info->state = VMX_LAUNCHED;

        info->vm_info->run_state = VM_RUNNING;
        ret = v3_vmx_launch(&(info->vm_regs), info, &(info->ctrl_regs));
    } else {
        V3_ASSERT(vmx_info->state != VMX_UNLAUNCHED);
        ret = v3_vmx_resume(&(info->vm_regs), info, &(info->ctrl_regs));
    }
    
    //  PrintDebug("VMX Exit: ret=%d\n", ret);

    if (ret != VMX_SUCCESS) {
        uint32_t error = 0;

        vmcs_read(VMCS_INSTR_ERR, &error);

        v3_enable_ints();

        PrintError("VMENTRY Error: %d\n", error);
        return -1;
    }

    // Immediate exit from VM time bookkeeping
    v3_time_exit_vm(info);

    info->num_exits++;

    /* Update guest state */
    v3_vmx_save_vmcs(info);

    // info->cpl = info->segments.cs.selector & 0x3;

    info->mem_mode = v3_get_vm_mem_mode(info);
    info->cpu_mode = v3_get_vm_cpu_mode(info);


    check_vmcs_read(VMCS_EXIT_INSTR_LEN, &(exit_info.instr_len));
    check_vmcs_read(VMCS_EXIT_INSTR_INFO, &(exit_info.instr_info));
    check_vmcs_read(VMCS_EXIT_REASON, &(exit_info.exit_reason));
    check_vmcs_read(VMCS_EXIT_QUAL, &(exit_info.exit_qual));
    check_vmcs_read(VMCS_EXIT_INT_INFO, &(exit_info.int_info));
    check_vmcs_read(VMCS_EXIT_INT_ERR, &(exit_info.int_err));
    check_vmcs_read(VMCS_GUEST_LINEAR_ADDR, &(exit_info.guest_linear_addr));

    if (info->shdw_pg_mode == NESTED_PAGING) {
        check_vmcs_read(VMCS_GUEST_PHYS_ADDR, &(exit_info.ept_fault_addr));
    }

    //PrintDebug("VMX Exit taken, id-qual: %u-%lu\n", exit_info.exit_reason, exit_info.exit_qual);

    exit_log[info->num_exits % 10] = exit_info;

#ifdef V3_CONFIG_SYMCALL
    if (info->sym_core_state.symcall_state.sym_call_active == 0) {
        update_irq_exit_state(info);
    }
#else
    update_irq_exit_state(info);
#endif

    if (exit_info.exit_reason == VMEXIT_INTR_WINDOW) {
        // This is a special case whose only job is to inject an interrupt
        vmcs_read(VMCS_PROC_CTRLS, &(vmx_info->pri_proc_ctrls.value));
        vmx_info->pri_proc_ctrls.int_wndw_exit = 0;
        vmcs_write(VMCS_PROC_CTRLS, vmx_info->pri_proc_ctrls.value);

#ifdef V3_CONFIG_DEBUG_INTERRUPTS
        V3_Print("Interrupts available again! (RIP=%llx)\n", info->rip);
#endif
    }

    // reenable global interrupts after vm exit
    v3_enable_ints();

    // Conditionally yield the CPU if the timeslice has expired
    v3_yield_cond(info);

    if (v3_handle_vmx_exit(info, &exit_info) == -1) {
        PrintError("Error in VMX exit handler (Exit reason=%x)\n", exit_info.exit_reason);
        return -1;
    }

    return 0;
}


int v3_start_vmx_guest(struct guest_info * info) {

    PrintDebug("Starting VMX core %u\n", info->vcpu_id);

    if (info->vcpu_id == 0) {
        info->core_run_state = CORE_RUNNING;
        info->vm_info->run_state = VM_RUNNING;
    } else {

        PrintDebug("VMX core %u: Waiting for core initialization\n", info->vcpu_id);

        while (info->core_run_state == CORE_STOPPED) {
            v3_yield(info);
            //PrintDebug("VMX core %u: still waiting for INIT\n",info->vcpu_id);
        }
        
        PrintDebug("VMX core %u initialized\n", info->vcpu_id);
    }


    PrintDebug("VMX core %u: I am starting at CS=0x%x (base=0x%p, limit=0x%x),  RIP=0x%p\n",
               info->vcpu_id, info->segments.cs.selector, (void *)(info->segments.cs.base),
               info->segments.cs.limit, (void *)(info->rip));


    PrintDebug("VMX core %u: Launching VMX VM on logical core %u\n", info->vcpu_id, info->pcpu_id);

    v3_start_time(info);

    while (1) {

        if (info->vm_info->run_state == VM_STOPPED) {
            info->core_run_state = CORE_STOPPED;
            break;
        }

        if (v3_vmx_enter(info) == -1) {

            addr_t host_addr;
            addr_t linear_addr = 0;
            
            info->vm_info->run_state = VM_ERROR;
            
            V3_Print("VMX core %u: VMX ERROR!!\n", info->vcpu_id); 
            
            v3_print_guest_state(info);
            
            V3_Print("VMX core %u\n", info->vcpu_id); 

            linear_addr = get_addr_linear(info, info->rip, &(info->segments.cs));
            
            if (info->mem_mode == PHYSICAL_MEM) {
                v3_gpa_to_hva(info, linear_addr, &host_addr);
            } else if (info->mem_mode == VIRTUAL_MEM) {
                v3_gva_to_hva(info, linear_addr, &host_addr);
            }
            
            V3_Print("VMX core %u: Host Address of rip = 0x%p\n", info->vcpu_id, (void *)host_addr);
            
            V3_Print("VMX core %u: Instr (15 bytes) at %p:\n", info->vcpu_id, (void *)host_addr);
            v3_dump_mem((uint8_t *)host_addr, 15);
            
            v3_print_stack(info);


            v3_print_vmcs();
            print_exit_log(info);
            return -1;
        }

        v3_wait_at_barrier(info);


        if (info->vm_info->run_state == VM_STOPPED) {
            info->core_run_state = CORE_STOPPED;
            break;
        }
/*
        if ((info->num_exits % 5000) == 0) {
            V3_Print("VMX Exit number %d\n", (uint32_t)info->num_exits);
        }
*/

    }

    return 0;
}




#define VMX_FEATURE_CONTROL_MSR     0x0000003a
#define CPUID_VMX_FEATURES 0x00000005  /* LOCK and VMXON */
#define CPUID_1_ECX_VTXFLAG 0x00000020
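/* In the IA32_FEATURE_CONTROL MSR (0x3a), bit 0 is the lock bit and bit 2
 * enables VMXON outside SMX operation; both bits (0x5) must be set for VMX
 * to be usable here. */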

int v3_is_vmx_capable() {
    v3_msr_t feature_msr;
    uint32_t eax = 0, ebx = 0, ecx = 0, edx = 0;

    v3_cpuid(0x1, &eax, &ebx, &ecx, &edx);

    PrintDebug("ECX: 0x%x\n", ecx);

    if (ecx & CPUID_1_ECX_VTXFLAG) {
        v3_get_msr(VMX_FEATURE_CONTROL_MSR, &(feature_msr.hi), &(feature_msr.lo));
        
        PrintDebug("MSRREGlow: 0x%.8x\n", feature_msr.lo);

        if ((feature_msr.lo & CPUID_VMX_FEATURES) != CPUID_VMX_FEATURES) {
            PrintDebug("VMX is locked -- enable in the BIOS\n");
            return 0;
        }

    } else {
        PrintDebug("VMX not supported on this cpu\n");
        return 0;
    }

    return 1;
}


int v3_reset_vmx_vm_core(struct guest_info * core, addr_t rip) {
    // init vmcs bios
    
    if ((core->shdw_pg_mode == NESTED_PAGING) && 
        (v3_cpu_types[core->pcpu_id] == V3_VMX_EPT_UG_CPU)) {
        // easy 
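        // SIPI-style startup: 'rip' carries the start vector, so execution
        // begins at vector << 12 with CS selector = vector << 8
        // (base = selector << 4)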
        core->rip = 0;
        core->segments.cs.selector = rip << 8;
        core->segments.cs.limit = 0xffff;
        core->segments.cs.base = rip << 12;
    } else {
        core->vm_regs.rdx = core->vcpu_id;
        core->vm_regs.rbx = rip;
    }

    return 0;
}



void v3_init_vmx_cpu(int cpu_id) {
    addr_t vmx_on_region = 0;

    if (cpu_id == 0) {
        if (v3_init_vmx_hw(&hw_info) == -1) {
            PrintError("Could not initialize VMX hardware features on cpu %d\n", cpu_id);
            return;
        }
    }

    enable_vmx();


    // Setup VMXON Region
    vmx_on_region = allocate_vmcs();


    if (vmx_on(vmx_on_region) == VMX_SUCCESS) {
        V3_Print("VMX Enabled\n");
        host_vmcs_ptrs[cpu_id] = vmx_on_region;
    } else {
        V3_Print("VMX already enabled\n");
        V3_FreePages((void *)vmx_on_region, 1);
    }

    PrintDebug("VMXON pointer: 0x%p\n", (void *)host_vmcs_ptrs[cpu_id]);    

    {
        struct vmx_sec_proc_ctrls sec_proc_ctrls;
        sec_proc_ctrls.value = v3_vmx_get_ctrl_features(&(hw_info.sec_proc_ctrls));
        
        if (sec_proc_ctrls.enable_ept == 0) {
            V3_Print("VMX EPT (Nested) Paging not supported\n");
            v3_cpu_types[cpu_id] = V3_VMX_CPU;
        } else if (sec_proc_ctrls.unrstrct_guest == 0) {
            V3_Print("VMX EPT (Nested) Paging supported\n");
            v3_cpu_types[cpu_id] = V3_VMX_EPT_CPU;
        } else {
            V3_Print("VMX EPT (Nested) Paging + Unrestricted guest supported\n");
            v3_cpu_types[cpu_id] = V3_VMX_EPT_UG_CPU;
        }
    }
}


void v3_deinit_vmx_cpu(int cpu_id) {
    extern v3_cpu_arch_t v3_cpu_types[];
    v3_cpu_types[cpu_id] = V3_INVALID_CPU;

    if (host_vmcs_ptrs[cpu_id] != 0) {
        V3_Print("Disabling VMX\n");

        if (vmx_off() != VMX_SUCCESS) {
            PrintError("Error executing VMXOFF\n");
        }

        V3_FreePages((void *)host_vmcs_ptrs[cpu_id], 1);

        host_vmcs_ptrs[cpu_id] = 0;
    }
}